1 #ifdef PERL_EXT_RE_BUILD
2 #include "re_top.h"
3 #endif
4
5 #include "EXTERN.h"
6 #define PERL_IN_REGEX_ENGINE
7 #define PERL_IN_REGCOMP_ANY
8 #define PERL_IN_REGCOMP_DEBUG_C
9 #include "perl.h"
10
11 #ifdef PERL_IN_XSUB_RE
12 # include "re_comp.h"
13 #else
14 # include "regcomp.h"
15 #endif
16
17 #include "invlist_inline.h"
18 #include "unicode_constants.h"
19 #include "regcomp_internal.h"
20
21 #ifdef PERL_RE_BUILD_DEBUG
22 int
Perl_re_printf(pTHX_ const char * fmt,...)23 Perl_re_printf(pTHX_ const char *fmt, ...)
24 {
25 va_list ap;
26 int result;
27 PerlIO *f= Perl_debug_log;
28 PERL_ARGS_ASSERT_RE_PRINTF;
29 va_start(ap, fmt);
30 result = PerlIO_vprintf(f, fmt, ap);
31 va_end(ap);
32 return result;
33 }
34
35 int
Perl_re_indentf(pTHX_ const char * fmt,U32 depth,...)36 Perl_re_indentf(pTHX_ const char *fmt, U32 depth, ...)
37 {
38 va_list ap;
39 int result;
40 PerlIO *f= Perl_debug_log;
41 PERL_ARGS_ASSERT_RE_INDENTF;
42 va_start(ap, depth);
43 PerlIO_printf(f, "%*s", ( (int)depth % 20 ) * 2, "");
44 result = PerlIO_vprintf(f, fmt, ap);
45 va_end(ap);
46 return result;
47 }
48
49 void
Perl_debug_show_study_flags(pTHX_ U32 flags,const char * open_str,const char * close_str)50 Perl_debug_show_study_flags(pTHX_ U32 flags, const char *open_str,
51 const char *close_str)
52 {
53 PERL_ARGS_ASSERT_DEBUG_SHOW_STUDY_FLAGS;
54 if (!flags)
55 return;
56
57 Perl_re_printf( aTHX_ "%s", open_str);
58 DEBUG_SHOW_STUDY_FLAG(flags, SF_BEFORE_SEOL);
59 DEBUG_SHOW_STUDY_FLAG(flags, SF_BEFORE_MEOL);
60 DEBUG_SHOW_STUDY_FLAG(flags, SF_IS_INF);
61 DEBUG_SHOW_STUDY_FLAG(flags, SF_HAS_PAR);
62 DEBUG_SHOW_STUDY_FLAG(flags, SF_IN_PAR);
63 DEBUG_SHOW_STUDY_FLAG(flags, SF_HAS_EVAL);
64 DEBUG_SHOW_STUDY_FLAG(flags, SCF_DO_SUBSTR);
65 DEBUG_SHOW_STUDY_FLAG(flags, SCF_DO_STCLASS_AND);
66 DEBUG_SHOW_STUDY_FLAG(flags, SCF_DO_STCLASS_OR);
67 DEBUG_SHOW_STUDY_FLAG(flags, SCF_DO_STCLASS);
68 DEBUG_SHOW_STUDY_FLAG(flags, SCF_WHILEM_VISITED_POS);
69 DEBUG_SHOW_STUDY_FLAG(flags, SCF_TRIE_RESTUDY);
70 DEBUG_SHOW_STUDY_FLAG(flags, SCF_SEEN_ACCEPT);
71 DEBUG_SHOW_STUDY_FLAG(flags, SCF_TRIE_DOING_RESTUDY);
72 DEBUG_SHOW_STUDY_FLAG(flags, SCF_IN_DEFINE);
73 Perl_re_printf( aTHX_ "%s", close_str);
74 }
75
76 void
Perl_debug_studydata(pTHX_ const char * where,scan_data_t * data,U32 depth,int is_inf,SSize_t min,SSize_t stopmin,SSize_t delta)77 Perl_debug_studydata(pTHX_ const char *where, scan_data_t *data,
78 U32 depth, int is_inf,
79 SSize_t min, SSize_t stopmin, SSize_t delta)
80 {
81 PERL_ARGS_ASSERT_DEBUG_STUDYDATA;
82 DECLARE_AND_GET_RE_DEBUG_FLAGS;
83
84 DEBUG_OPTIMISE_MORE_r({
85 if (!data) {
86 Perl_re_indentf(aTHX_ "%s: NO DATA",
87 depth,
88 where);
89 return;
90 }
91 Perl_re_indentf(aTHX_ "%s: M/S/D: %" IVdf "/%" IVdf "/%" IVdf " Pos:%" IVdf "/%" IVdf " Flags: 0x%" UVXf,
92 depth,
93 where,
94 min, stopmin, delta,
95 (IV)data->pos_min,
96 (IV)data->pos_delta,
97 (UV)data->flags
98 );
99
100 Perl_debug_show_study_flags(aTHX_ data->flags," [","]");
101
102 Perl_re_printf( aTHX_
103 " Whilem_c: %" IVdf " Lcp: %" IVdf " %s",
104 (IV)data->whilem_c,
105 (IV)(data->last_closep ? *((data)->last_closep) : -1),
106 is_inf ? "INF " : ""
107 );
108
109 if (data->last_found) {
110 int i;
111 Perl_re_printf(aTHX_
112 "Last:'%s' %" IVdf ":%" IVdf "/%" IVdf,
113 SvPVX_const(data->last_found),
114 (IV)data->last_end,
115 (IV)data->last_start_min,
116 (IV)data->last_start_max
117 );
118
119 for (i = 0; i < 2; i++) {
120 Perl_re_printf(aTHX_
121 " %s%s: '%s' @ %" IVdf "/%" IVdf,
122 data->cur_is_floating == i ? "*" : "",
123 i ? "Float" : "Fixed",
124 SvPVX_const(data->substrs[i].str),
125 (IV)data->substrs[i].min_offset,
126 (IV)data->substrs[i].max_offset
127 );
128 Perl_debug_show_study_flags(aTHX_ data->substrs[i].flags," [","]");
129 }
130 }
131
132 Perl_re_printf( aTHX_ "\n");
133 });
134 }
135
136
137 void
Perl_debug_peep(pTHX_ const char * str,const RExC_state_t * pRExC_state,regnode * scan,U32 depth,U32 flags)138 Perl_debug_peep(pTHX_ const char *str, const RExC_state_t *pRExC_state,
139 regnode *scan, U32 depth, U32 flags)
140 {
141 PERL_ARGS_ASSERT_DEBUG_PEEP;
142 DECLARE_AND_GET_RE_DEBUG_FLAGS;
143
144 DEBUG_OPTIMISE_r({
145 regnode *Next;
146
147 if (!scan)
148 return;
149 Next = regnext(scan);
150 regprop(RExC_rx, RExC_mysv, scan, NULL, pRExC_state);
151 Perl_re_indentf( aTHX_ "%s>%3d: %s (%d)",
152 depth,
153 str,
154 REG_NODE_NUM(scan), SvPV_nolen_const(RExC_mysv),
155 Next ? (REG_NODE_NUM(Next)) : 0 );
156 Perl_debug_show_study_flags(aTHX_ flags," [ ","]");
157 Perl_re_printf( aTHX_ "\n");
158 });
159 }
160
161 const regnode *
Perl_dumpuntil(pTHX_ const regexp * r,const regnode * start,const regnode * node,const regnode * last,const regnode * plast,SV * sv,I32 indent,U32 depth)162 Perl_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
163 const regnode *last, const regnode *plast,
164 SV* sv, I32 indent, U32 depth)
165 {
166 const regnode *next;
167 const regnode *optstart= NULL;
168
169 RXi_GET_DECL(r, ri);
170 DECLARE_AND_GET_RE_DEBUG_FLAGS;
171
172 PERL_ARGS_ASSERT_DUMPUNTIL;
173
174 #ifdef DEBUG_DUMPUNTIL
175 Perl_re_printf( aTHX_ "--- %d : %d - %d - %d\n", indent, node-start,
176 last ? last-start : 0, plast ? plast-start : 0);
177 #endif
178
179 if (plast && plast < last)
180 last= plast;
181
182 while (node && (!last || node < last)) {
183 const U8 op = OP(node);
184
185 if (op == CLOSE || op == SRCLOSE || op == WHILEM)
186 indent--;
187 next = regnext((regnode *)node);
188 const regnode *after = regnode_after((regnode *)node,0);
189
190 /* Where, what. */
191 if (op == OPTIMIZED) {
192 if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
193 optstart = node;
194 else
195 goto after_print;
196 } else
197 CLEAR_OPTSTART;
198
199 regprop(r, sv, node, NULL, NULL);
200 Perl_re_printf( aTHX_ "%4" IVdf ":%*s%s", (IV)(node - start),
201 (int)(2*indent + 1), "", SvPVX_const(sv));
202
203 if (op != OPTIMIZED) {
204 if (next == NULL) /* Next ptr. */
205 Perl_re_printf( aTHX_ " (0)");
206 else if (REGNODE_TYPE(op) == BRANCH
207 && REGNODE_TYPE(OP(next)) != BRANCH )
208 Perl_re_printf( aTHX_ " (FAIL)");
209 else
210 Perl_re_printf( aTHX_ " (%" IVdf ")", (IV)(next - start));
211 Perl_re_printf( aTHX_ "\n");
212 }
213
214 after_print:
215 if (REGNODE_TYPE(op) == BRANCHJ) {
216 assert(next);
217 const regnode *nnode = (OP(next) == LONGJMP
218 ? regnext((regnode *)next)
219 : next);
220 if (last && nnode > last)
221 nnode = last;
222 DUMPUNTIL(after, nnode);
223 }
224 else if (REGNODE_TYPE(op) == BRANCH) {
225 assert(next);
226 DUMPUNTIL(after, next);
227 }
228 else if ( REGNODE_TYPE(op) == TRIE ) {
229 const regnode *this_trie = node;
230 const U32 n = ARG1u(node);
231 const reg_ac_data * const ac = op>=AHOCORASICK ?
232 (reg_ac_data *)ri->data->data[n] :
233 NULL;
234 const reg_trie_data * const trie =
235 (reg_trie_data*)ri->data->data[op<AHOCORASICK ? n : ac->trie];
236 #ifdef DEBUGGING
237 AV *const trie_words
238 = MUTABLE_AV(ri->data->data[n + TRIE_WORDS_OFFSET]);
239 #endif
240 const regnode *nextbranch= NULL;
241 I32 word_idx;
242 SvPVCLEAR(sv);
243 for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
244 SV ** const elem_ptr = av_fetch_simple(trie_words, word_idx, 0);
245
246 Perl_re_indentf( aTHX_ "%s ",
247 indent+3,
248 elem_ptr
249 ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr),
250 SvCUR(*elem_ptr), PL_dump_re_max_len,
251 PL_colors[0], PL_colors[1],
252 (SvUTF8(*elem_ptr)
253 ? PERL_PV_ESCAPE_UNI
254 : 0)
255 | PERL_PV_PRETTY_ELLIPSES
256 | PERL_PV_PRETTY_LTGT
257 )
258 : "???"
259 );
260 if (trie->jump) {
261 U16 dist= trie->jump[word_idx+1];
262 Perl_re_printf( aTHX_ "(%" UVuf ")\n",
263 (UV)((dist ? this_trie + dist : next) - start));
264 if (dist) {
265 if (!nextbranch)
266 nextbranch= this_trie + trie->jump[0];
267 DUMPUNTIL(this_trie + dist, nextbranch);
268 }
269 if (nextbranch && REGNODE_TYPE(OP(nextbranch))==BRANCH)
270 nextbranch= regnext((regnode *)nextbranch);
271 } else {
272 Perl_re_printf( aTHX_ "\n");
273 }
274 }
275 if (last && next > last)
276 node= last;
277 else
278 node= next;
279 }
280 else if ( op == CURLY ) { /* "next" might be very big: optimizer */
281 DUMPUNTIL(after, after + 1); /* +1 is NOT a REGNODE_AFTER */
282 }
283 else if (REGNODE_TYPE(op) == CURLY && op != CURLYX) {
284 assert(next);
285 DUMPUNTIL(after, next);
286 }
287 else if ( op == PLUS || op == STAR) {
288 DUMPUNTIL(after, after + 1); /* +1 NOT a REGNODE_AFTER */
289 }
290 else if (REGNODE_TYPE(op) == EXACT || op == ANYOFHs) {
291 /* Literal string, where present. */
292 node = (const regnode *)REGNODE_AFTER_varies(node);
293 }
294 else {
295 node = REGNODE_AFTER_opcode(node,op);
296 }
297 if (op == CURLYX || op == OPEN || op == SROPEN)
298 indent++;
299 if (REGNODE_TYPE(op) == END)
300 break;
301 }
302 CLEAR_OPTSTART;
303 #ifdef DEBUG_DUMPUNTIL
304 Perl_re_printf( aTHX_ "--- %d\n", (int)indent);
305 #endif
306 return node;
307 }
308
309 #endif /* PERL_RE_BUILD_DEBUG */
310
311 /*
312 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
313 */
314 #ifdef DEBUGGING
315 static void
S_regdump_intflags(pTHX_ const char * lead,const U32 flags)316 S_regdump_intflags(pTHX_ const char *lead, const U32 flags)
317 {
318 int bit;
319 int set=0;
320
321 STATIC_ASSERT_STMT(REG_INTFLAGS_NAME_SIZE <= sizeof(flags) * CHARBITS);
322
323 for (bit=0; bit < REG_INTFLAGS_NAME_SIZE; bit++) {
324 if (flags & (1<<bit)) {
325 if (!set++ && lead)
326 Perl_re_printf( aTHX_ "%s", lead);
327 Perl_re_printf( aTHX_ "%s ", PL_reg_intflags_name[bit]);
328 }
329 }
330 if (lead) {
331 if (set)
332 Perl_re_printf( aTHX_ "\n");
333 else
334 Perl_re_printf( aTHX_ "%s[none-set]\n", lead);
335 }
336 }
337
338 static void
S_regdump_extflags(pTHX_ const char * lead,const U32 flags)339 S_regdump_extflags(pTHX_ const char *lead, const U32 flags)
340 {
341 int bit;
342 int set=0;
343 regex_charset cs;
344
345 STATIC_ASSERT_STMT(REG_EXTFLAGS_NAME_SIZE <= sizeof(flags) * CHARBITS);
346
347 for (bit=0; bit<REG_EXTFLAGS_NAME_SIZE; bit++) {
348 if (flags & (1U<<bit)) {
349 if ((1U<<bit) & RXf_PMf_CHARSET) { /* Output separately, below */
350 continue;
351 }
352 if (!set++ && lead)
353 Perl_re_printf( aTHX_ "%s", lead);
354 Perl_re_printf( aTHX_ "%s ", PL_reg_extflags_name[bit]);
355 }
356 }
357 if ((cs = get_regex_charset(flags)) != REGEX_DEPENDS_CHARSET) {
358 if (!set++ && lead) {
359 Perl_re_printf( aTHX_ "%s", lead);
360 }
361 switch (cs) {
362 case REGEX_UNICODE_CHARSET:
363 Perl_re_printf( aTHX_ "UNICODE");
364 break;
365 case REGEX_LOCALE_CHARSET:
366 Perl_re_printf( aTHX_ "LOCALE");
367 break;
368 case REGEX_ASCII_RESTRICTED_CHARSET:
369 Perl_re_printf( aTHX_ "ASCII-RESTRICTED");
370 break;
371 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
372 Perl_re_printf( aTHX_ "ASCII-MORE_RESTRICTED");
373 break;
374 default:
375 Perl_re_printf( aTHX_ "UNKNOWN CHARACTER SET");
376 break;
377 }
378 }
379 if (lead) {
380 if (set)
381 Perl_re_printf( aTHX_ "\n");
382 else
383 Perl_re_printf( aTHX_ "%s[none-set]\n", lead);
384 }
385 }
386 #endif
387
388 void
Perl_regdump(pTHX_ const regexp * r)389 Perl_regdump(pTHX_ const regexp *r)
390 {
391 #ifdef DEBUGGING
392 int i;
393 SV * const sv = sv_newmortal();
394 SV *dsv= sv_newmortal();
395 RXi_GET_DECL(r, ri);
396 DECLARE_AND_GET_RE_DEBUG_FLAGS;
397
398 PERL_ARGS_ASSERT_REGDUMP;
399
400 (void)dumpuntil(r, ri->program, ri->program + 1, NULL, NULL, sv, 0, 0);
401
402 /* Header fields of interest. */
403 for (i = 0; i < 2; i++) {
404 if (r->substrs->data[i].substr) {
405 RE_PV_QUOTED_DECL(s, 0, dsv,
406 SvPVX_const(r->substrs->data[i].substr),
407 RE_SV_DUMPLEN(r->substrs->data[i].substr),
408 PL_dump_re_max_len);
409 Perl_re_printf( aTHX_
410 "%s %s%s at %" IVdf "..%" UVuf " ",
411 i ? "floating" : "anchored",
412 s,
413 RE_SV_TAIL(r->substrs->data[i].substr),
414 (IV)r->substrs->data[i].min_offset,
415 (UV)r->substrs->data[i].max_offset);
416 }
417 else if (r->substrs->data[i].utf8_substr) {
418 RE_PV_QUOTED_DECL(s, 1, dsv,
419 SvPVX_const(r->substrs->data[i].utf8_substr),
420 RE_SV_DUMPLEN(r->substrs->data[i].utf8_substr),
421 30);
422 Perl_re_printf( aTHX_
423 "%s utf8 %s%s at %" IVdf "..%" UVuf " ",
424 i ? "floating" : "anchored",
425 s,
426 RE_SV_TAIL(r->substrs->data[i].utf8_substr),
427 (IV)r->substrs->data[i].min_offset,
428 (UV)r->substrs->data[i].max_offset);
429 }
430 }
431
432 if (r->check_substr || r->check_utf8)
433 Perl_re_printf( aTHX_
434 (const char *)
435 ( r->check_substr == r->substrs->data[1].substr
436 && r->check_utf8 == r->substrs->data[1].utf8_substr
437 ? "(checking floating" : "(checking anchored"));
438 if (r->intflags & PREGf_NOSCAN)
439 Perl_re_printf( aTHX_ " noscan");
440 if (r->extflags & RXf_CHECK_ALL)
441 Perl_re_printf( aTHX_ " isall");
442 if (r->check_substr || r->check_utf8)
443 Perl_re_printf( aTHX_ ") ");
444
445 if (ri->regstclass) {
446 regprop(r, sv, ri->regstclass, NULL, NULL);
447 Perl_re_printf( aTHX_ "stclass %s ", SvPVX_const(sv));
448 }
449 if (r->intflags & PREGf_ANCH) {
450 Perl_re_printf( aTHX_ "anchored");
451 if (r->intflags & PREGf_ANCH_MBOL)
452 Perl_re_printf( aTHX_ "(MBOL)");
453 if (r->intflags & PREGf_ANCH_SBOL)
454 Perl_re_printf( aTHX_ "(SBOL)");
455 if (r->intflags & PREGf_ANCH_GPOS)
456 Perl_re_printf( aTHX_ "(GPOS)");
457 Perl_re_printf( aTHX_ " ");
458 }
459 if (r->intflags & PREGf_GPOS_SEEN)
460 Perl_re_printf( aTHX_ "GPOS:%" UVuf " ", (UV)r->gofs);
461 if (r->intflags & PREGf_SKIP)
462 Perl_re_printf( aTHX_ "plus ");
463 if (r->intflags & PREGf_IMPLICIT)
464 Perl_re_printf( aTHX_ "implicit ");
465 Perl_re_printf( aTHX_ "minlen %" IVdf " ", (IV)r->minlen);
466 if (r->extflags & RXf_EVAL_SEEN)
467 Perl_re_printf( aTHX_ "with eval ");
468 Perl_re_printf( aTHX_ "\n");
469 DEBUG_FLAGS_r({
470 regdump_extflags("r->extflags: ", r->extflags);
471 regdump_intflags("r->intflags: ", r->intflags);
472 });
473 #else
474 PERL_ARGS_ASSERT_REGDUMP;
475 PERL_UNUSED_CONTEXT;
476 PERL_UNUSED_ARG(r);
477 #endif /* DEBUGGING */
478 }
479
480 /* Should be synchronized with ANYOF_ #defines in regcomp.h */
481 #ifdef DEBUGGING
482
483 # if CC_WORDCHAR_ != 0 || CC_DIGIT_ != 1 || CC_ALPHA_ != 2 \
484 || CC_LOWER_ != 3 || CC_UPPER_ != 4 || CC_PUNCT_ != 5 \
485 || CC_PRINT_ != 6 || CC_ALPHANUMERIC_ != 7 || CC_GRAPH_ != 8 \
486 || CC_CASED_ != 9 || CC_SPACE_ != 10 || CC_BLANK_ != 11 \
487 || CC_XDIGIT_ != 12 || CC_CNTRL_ != 13 || CC_ASCII_ != 14 \
488 || CC_VERTSPACE_ != 15
489 # error Need to adjust order of anyofs[]
490 # endif
491 static const char * const anyofs[] = {
492 "\\w",
493 "\\W",
494 "\\d",
495 "\\D",
496 "[:alpha:]",
497 "[:^alpha:]",
498 "[:lower:]",
499 "[:^lower:]",
500 "[:upper:]",
501 "[:^upper:]",
502 "[:punct:]",
503 "[:^punct:]",
504 "[:print:]",
505 "[:^print:]",
506 "[:alnum:]",
507 "[:^alnum:]",
508 "[:graph:]",
509 "[:^graph:]",
510 "[:cased:]",
511 "[:^cased:]",
512 "\\s",
513 "\\S",
514 "[:blank:]",
515 "[:^blank:]",
516 "[:xdigit:]",
517 "[:^xdigit:]",
518 "[:cntrl:]",
519 "[:^cntrl:]",
520 "[:ascii:]",
521 "[:^ascii:]",
522 "\\v",
523 "\\V"
524 };
525 #endif
526
527 /*
528 - regprop - printable representation of opcode, with run time support
529 */
530
531 void
Perl_regprop(pTHX_ const regexp * prog,SV * sv,const regnode * o,const regmatch_info * reginfo,const RExC_state_t * pRExC_state)532 Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_info *reginfo, const RExC_state_t *pRExC_state)
533 {
534 #ifdef DEBUGGING
535 U8 k;
536 const U8 op = OP(o);
537 RXi_GET_DECL(prog, progi);
538 DECLARE_AND_GET_RE_DEBUG_FLAGS;
539
540 PERL_ARGS_ASSERT_REGPROP;
541
542 SvPVCLEAR(sv);
543
544 if (op > REGNODE_MAX) { /* regnode.type is unsigned */
545 if (pRExC_state) { /* This gives more info, if we have it */
546 FAIL3("panic: corrupted regexp opcode %d > %d",
547 (int)op, (int)REGNODE_MAX);
548 }
549 else {
550 Perl_croak(aTHX_ "panic: corrupted regexp opcode %d > %d",
551 (int)op, (int)REGNODE_MAX);
552 }
553 }
554 sv_catpv(sv, REGNODE_NAME(op)); /* Take off const! */
555
556 k = REGNODE_TYPE(op);
557 if (op == BRANCH) {
558 Perl_sv_catpvf(aTHX_ sv, " (buf:%" IVdf "/%" IVdf ")", (IV)ARG1a(o),(IV)ARG1b(o));
559 }
560 else if (op == BRANCHJ) {
561 Perl_sv_catpvf(aTHX_ sv, " (buf:%" IVdf "/%" IVdf ")", (IV)ARG2a(o),(IV)ARG2b(o));
562 }
563 else if (k == EXACT) {
564 sv_catpvs(sv, " ");
565 /* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
566 * is a crude hack but it may be the best for now since
567 * we have no flag "this EXACTish node was UTF-8"
568 * --jhi */
569 pv_pretty(sv, STRING(o), STR_LEN(o), PL_dump_re_max_len,
570 PL_colors[0], PL_colors[1],
571 PERL_PV_ESCAPE_UNI_DETECT |
572 PERL_PV_ESCAPE_NONASCII |
573 PERL_PV_PRETTY_ELLIPSES |
574 PERL_PV_PRETTY_LTGT |
575 PERL_PV_PRETTY_NOCLEAR
576 );
577 } else if (k == TRIE) {
578 /* print the details of the trie in dumpuntil instead, as
579 * progi->data isn't available here */
580 const U32 n = ARG1u(o);
581 const reg_ac_data * const ac = IS_TRIE_AC(op) ?
582 (reg_ac_data *)progi->data->data[n] :
583 NULL;
584 const reg_trie_data * const trie
585 = (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie];
586
587 Perl_sv_catpvf(aTHX_ sv, "-%s", REGNODE_NAME(FLAGS(o)));
588 DEBUG_TRIE_COMPILE_r({
589 if (trie->jump)
590 sv_catpvs(sv, "(JUMP)");
591 Perl_sv_catpvf(aTHX_ sv,
592 "<S:%" UVuf "/%" IVdf " W:%" UVuf " L:%" UVuf "/%" UVuf " C:%" UVuf "/%" UVuf ">",
593 (UV)trie->startstate,
594 (IV)trie->statecount-1, /* -1 because of the unused 0 element */
595 (UV)trie->wordcount,
596 (UV)trie->minlen,
597 (UV)trie->maxlen,
598 (UV)TRIE_CHARCOUNT(trie),
599 (UV)trie->uniquecharcount
600 );
601 });
602 if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
603 sv_catpvs(sv, "[");
604 (void) put_charclass_bitmap_innards(sv,
605 ((IS_ANYOF_TRIE(op))
606 ? ANYOF_BITMAP(o)
607 : TRIE_BITMAP(trie)),
608 NULL,
609 NULL,
610 NULL,
611 0,
612 FALSE
613 );
614 sv_catpvs(sv, "]");
615 }
616 if (trie->before_paren || trie->after_paren)
617 Perl_sv_catpvf(aTHX_ sv, " (buf:%" IVdf "/%" IVdf ")",
618 (IV)trie->before_paren,(IV)trie->after_paren);
619 } else if (k == CURLY) {
620 U32 lo = ARG1i(o), hi = ARG2i(o);
621 if (ARG3u(o)) /* check both ARG3a and ARG3b at the same time */
622 Perl_sv_catpvf(aTHX_ sv, "<%d:%d>", ARG3a(o),ARG3b(o)); /* paren before, paren after */
623 if (op == CURLYM || op == CURLYN || op == CURLYX)
624 Perl_sv_catpvf(aTHX_ sv, "[%d]", FLAGS(o)); /* Parenth number */
625 Perl_sv_catpvf(aTHX_ sv, "{%u,", (unsigned) lo);
626 if (hi == REG_INFTY)
627 sv_catpvs(sv, "INFTY");
628 else
629 Perl_sv_catpvf(aTHX_ sv, "%u", (unsigned) hi);
630 sv_catpvs(sv, "}");
631 }
632 else if (k == WHILEM && FLAGS(o)) /* Ordinal/of */
633 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", FLAGS(o) & 0xf, FLAGS(o)>>4);
634 else if (k == REF || k == OPEN || k == CLOSE
635 || k == GROUPP || op == ACCEPT)
636 {
637 AV *name_list= NULL;
638 U32 parno= (op == ACCEPT) ? ARG2u(o) :
639 (op == OPEN || op == CLOSE) ? PARNO(o) :
640 ARG1u(o);
641 if ( RXp_PAREN_NAMES(prog) ) {
642 name_list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
643 } else if ( pRExC_state ) {
644 name_list= RExC_paren_name_list;
645 }
646 if ( name_list ) {
647 if ( k != REF || (op < REFN)) {
648 UV logical_parno = parno;
649 if (prog->parno_to_logical)
650 logical_parno = prog->parno_to_logical[parno];
651
652 Perl_sv_catpvf(aTHX_ sv, "%" UVuf, (UV)logical_parno); /* Parenth number */
653 if (parno != logical_parno)
654 Perl_sv_catpvf(aTHX_ sv, "/%" UVuf, (UV)parno); /* Parenth number */
655
656 SV **name= av_fetch_simple(name_list, parno, 0 );
657 if (name)
658 Perl_sv_catpvf(aTHX_ sv, " '%" SVf "'", SVfARG(*name));
659 }
660 else
661 if (parno > 0) {
662 /* parno must always be larger than 0 for this block
663 * as it represents a slot into the data array, which
664 * has the 0 slot reserved for a placeholder so any valid
665 * index into it is always true, eg non-zero
666 * see the '%' "what" type and the implementation of
667 * S_reg_add_data()
668 */
669 SV *sv_dat= MUTABLE_SV(progi->data->data[ parno ]);
670 I32 *nums=(I32*)SvPVX(sv_dat);
671 SV **name= av_fetch_simple(name_list, nums[0], 0 );
672 I32 n;
673 if (name) {
674 for ( n=0; n<SvIVX(sv_dat); n++ ) {
675 Perl_sv_catpvf(aTHX_ sv, "%s%" IVdf,
676 (n ? "," : ""), (IV)nums[n]);
677 }
678 Perl_sv_catpvf(aTHX_ sv, " '%" SVf "'", SVfARG(*name));
679 }
680 }
681 } else if (parno>0) {
682 UV logical_parno = parno;
683 if (prog->parno_to_logical)
684 logical_parno = prog->parno_to_logical[parno];
685
686 Perl_sv_catpvf(aTHX_ sv, "%" UVuf, (UV)logical_parno); /* Parenth number */
687 if (logical_parno != parno)
688 Perl_sv_catpvf(aTHX_ sv, "/%" UVuf, (UV)parno); /* Parenth number */
689
690 }
691 if ( k == REF ) {
692 Perl_sv_catpvf(aTHX_ sv, " <%" IVdf ">", (IV)ARG2i(o));
693 }
694 if ( k == REF && reginfo) {
695 U32 n = ARG1u(o); /* which paren pair */
696 I32 ln = RXp_OFFS_START(prog,n);
697 if (RXp_LASTPAREN(prog) < n || ln == -1 || RXp_OFFS_END(prog,n) == -1)
698 Perl_sv_catpvf(aTHX_ sv, ": FAIL");
699 else if (ln == RXp_OFFS_END(prog,n))
700 Perl_sv_catpvf(aTHX_ sv, ": ACCEPT - EMPTY STRING");
701 else {
702 const char *s = reginfo->strbeg + ln;
703 Perl_sv_catpvf(aTHX_ sv, ": ");
704 Perl_pv_pretty( aTHX_ sv, s, RXp_OFFS_END(prog,n) - RXp_OFFS_START(prog,n), 32, 0, 0,
705 PERL_PV_ESCAPE_UNI_DETECT|PERL_PV_PRETTY_NOCLEAR|PERL_PV_PRETTY_ELLIPSES|PERL_PV_PRETTY_QUOTE );
706 }
707 }
708 } else if (k == GOSUB) {
709 AV *name_list= NULL;
710 IV parno = ARG1u(o);
711 IV logical_parno = (parno && prog->parno_to_logical)
712 ? prog->parno_to_logical[parno]
713 : parno;
714 if ( RXp_PAREN_NAMES(prog) ) {
715 name_list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
716 } else if ( pRExC_state ) {
717 name_list= RExC_paren_name_list;
718 }
719
720 /* Paren and offset */
721 Perl_sv_catpvf(aTHX_ sv, "%" IVdf, logical_parno);
722 if (logical_parno != parno)
723 Perl_sv_catpvf(aTHX_ sv, "/%" IVdf, parno);
724
725 Perl_sv_catpvf(aTHX_ sv, "[%+d:%d]", (int)ARG2i(o),
726 (int)((o + (int)ARG2i(o)) - progi->program) );
727 if (name_list) {
728 SV **name= av_fetch_simple(name_list, ARG1u(o), 0 );
729 if (name)
730 Perl_sv_catpvf(aTHX_ sv, " '%" SVf "'", SVfARG(*name));
731 }
732 }
733 else if (k == LOGICAL)
734 /* 2: embedded, otherwise 1 */
735 Perl_sv_catpvf(aTHX_ sv, "[%d]", FLAGS(o));
736 else if (k == ANYOF || k == ANYOFH || k == ANYOFR) {
737 U8 flags;
738 char * bitmap;
739 U8 do_sep = 0; /* Do we need to separate various components of the
740 output? */
741 /* Set if there is still an unresolved user-defined property */
742 SV *unresolved = NULL;
743
744 /* Things that are ignored except when the runtime locale is UTF-8 */
745 SV *only_utf8_locale_invlist = NULL;
746
747 /* Code points that don't fit in the bitmap */
748 SV *nonbitmap_invlist = NULL;
749
750 /* And things that aren't in the bitmap, but are small enough to be */
751 SV* bitmap_range_not_in_bitmap = NULL;
752
753 bool inverted;
754
755 if (k != ANYOF) {
756 flags = 0;
757 bitmap = NULL;
758 }
759 else {
760 flags = ANYOF_FLAGS(o);
761 bitmap = ANYOF_BITMAP(o);
762 }
763
764 if (op == ANYOFL || op == ANYOFPOSIXL) {
765 if ((flags & ANYOFL_UTF8_LOCALE_REQD)) {
766 sv_catpvs(sv, "{utf8-locale-reqd}");
767 }
768 if (flags & ANYOFL_FOLD) {
769 sv_catpvs(sv, "{i}");
770 }
771 }
772
773 inverted = flags & ANYOF_INVERT;
774
775 /* If there is stuff outside the bitmap, get it */
776 if (k == ANYOFR) {
777
778 /* For a single range, split into the parts inside vs outside the
779 * bitmap. */
780 UV start = ANYOFRbase(o);
781 UV end = ANYOFRbase(o) + ANYOFRdelta(o);
782
783 if (start < NUM_ANYOF_CODE_POINTS) {
784 if (end < NUM_ANYOF_CODE_POINTS) {
785 bitmap_range_not_in_bitmap
786 = _add_range_to_invlist(bitmap_range_not_in_bitmap,
787 start, end);
788 }
789 else {
790 bitmap_range_not_in_bitmap
791 = _add_range_to_invlist(bitmap_range_not_in_bitmap,
792 start, NUM_ANYOF_CODE_POINTS);
793 start = NUM_ANYOF_CODE_POINTS;
794 }
795 }
796
797 if (start >= NUM_ANYOF_CODE_POINTS) {
798 nonbitmap_invlist = _add_range_to_invlist(nonbitmap_invlist,
799 ANYOFRbase(o),
800 ANYOFRbase(o) + ANYOFRdelta(o));
801 }
802 }
803 else if (ANYOF_MATCHES_ALL_OUTSIDE_BITMAP(o)) {
804 nonbitmap_invlist = _add_range_to_invlist(nonbitmap_invlist,
805 NUM_ANYOF_CODE_POINTS,
806 UV_MAX);
807 }
808 else if (ANYOF_HAS_AUX(o)) {
809 (void) GET_REGCLASS_AUX_DATA(prog, o, FALSE,
810 &unresolved,
811 &only_utf8_locale_invlist,
812 &nonbitmap_invlist);
813
814 /* The aux data may contain stuff that could fit in the bitmap.
815 * This could come from a user-defined property being finally
816 * resolved when this call was done; or much more likely because
817 * there are matches that require UTF-8 to be valid, and so aren't
818 * in the bitmap (or ANYOFR). This is teased apart later */
819 _invlist_intersection(nonbitmap_invlist,
820 PL_InBitmap,
821 &bitmap_range_not_in_bitmap);
822 /* Leave just the things that don't fit into the bitmap */
823 _invlist_subtract(nonbitmap_invlist,
824 PL_InBitmap,
825 &nonbitmap_invlist);
826 }
827
828 /* Ready to start outputting. First, the initial left bracket */
829 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
830
831 if ( bitmap
832 || bitmap_range_not_in_bitmap
833 || only_utf8_locale_invlist
834 || unresolved)
835 {
836 /* Then all the things that could fit in the bitmap */
837 do_sep = put_charclass_bitmap_innards(
838 sv,
839 bitmap,
840 bitmap_range_not_in_bitmap,
841 only_utf8_locale_invlist,
842 o,
843 flags,
844
845 /* Can't try inverting for a
846 * better display if there
847 * are things that haven't
848 * been resolved */
849 (unresolved != NULL || k == ANYOFR));
850 SvREFCNT_dec(bitmap_range_not_in_bitmap);
851
852 /* If there are user-defined properties which haven't been defined
853 * yet, output them. If the result is not to be inverted, it is
854 * clearest to output them in a separate [] from the bitmap range
855 * stuff. If the result is to be complemented, we have to show
856 * everything in one [], as the inversion applies to the whole
857 * thing. Use {braces} to separate them from anything in the
858 * bitmap and anything above the bitmap. */
859 if (unresolved) {
860 if (inverted) {
861 if (! do_sep) { /* If didn't output anything in the bitmap
862 */
863 sv_catpvs(sv, "^");
864 }
865 sv_catpvs(sv, "{");
866 }
867 else if (do_sep) {
868 Perl_sv_catpvf(aTHX_ sv,"%s][%s", PL_colors[1],
869 PL_colors[0]);
870 }
871 sv_catsv(sv, unresolved);
872 if (inverted) {
873 sv_catpvs(sv, "}");
874 }
875 do_sep = ! inverted;
876 }
877 else if ( do_sep == 2
878 && ! nonbitmap_invlist
879 && ANYOF_MATCHES_NONE_OUTSIDE_BITMAP(o))
880 {
881 /* Here, the display shows the class as inverted, and
882 * everything above the lower display should also match, but
883 * there is no indication of that. Add this range so the code
884 * below will add it to the display */
885 _invlist_union_complement_2nd(nonbitmap_invlist,
886 PL_InBitmap,
887 &nonbitmap_invlist);
888 }
889 }
890
891 /* And, finally, add the above-the-bitmap stuff */
892 if (nonbitmap_invlist && _invlist_len(nonbitmap_invlist)) {
893 SV* contents;
894
895 /* See if truncation size is overridden */
896 const STRLEN dump_len = (PL_dump_re_max_len > 256)
897 ? PL_dump_re_max_len
898 : 256;
899
900 /* This is output in a separate [] */
901 if (do_sep) {
902 Perl_sv_catpvf(aTHX_ sv,"%s][%s", PL_colors[1], PL_colors[0]);
903 }
904
905 /* And, for easy of understanding, it is shown in the
906 * uncomplemented form if possible. The one exception being if
907 * there are unresolved items, where the inversion has to be
908 * delayed until runtime */
909 if (inverted && ! unresolved) {
910 _invlist_invert(nonbitmap_invlist);
911 _invlist_subtract(nonbitmap_invlist, PL_InBitmap, &nonbitmap_invlist);
912 }
913
914 contents = invlist_contents(nonbitmap_invlist,
915 FALSE /* output suitable for catsv */
916 );
917
918 /* If the output is shorter than the permissible maximum, just do it. */
919 if (SvCUR(contents) <= dump_len) {
920 sv_catsv(sv, contents);
921 }
922 else {
923 const char * contents_string = SvPVX(contents);
924 STRLEN i = dump_len;
925
926 /* Otherwise, start at the permissible max and work back to the
927 * first break possibility */
928 while (i > 0 && contents_string[i] != ' ') {
929 i--;
930 }
931 if (i == 0) { /* Fail-safe. Use the max if we couldn't
932 find a legal break */
933 i = dump_len;
934 }
935
936 sv_catpvn(sv, contents_string, i);
937 sv_catpvs(sv, "...");
938 }
939
940 SvREFCNT_dec_NN(contents);
941 SvREFCNT_dec_NN(nonbitmap_invlist);
942 }
943
944 /* And finally the matching, closing ']' */
945 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
946
947 if (op == ANYOFHs) {
948 Perl_sv_catpvf(aTHX_ sv, " (Leading UTF-8 bytes=%s", _byte_dump_string((U8 *) ((struct regnode_anyofhs *) o)->string, FLAGS(o), 1));
949 }
950 else if (REGNODE_TYPE(op) != ANYOF) {
951 U8 lowest = (op != ANYOFHr)
952 ? FLAGS(o)
953 : LOWEST_ANYOF_HRx_BYTE(FLAGS(o));
954 U8 highest = (op == ANYOFHr)
955 ? HIGHEST_ANYOF_HRx_BYTE(FLAGS(o))
956 : (op == ANYOFH || op == ANYOFR)
957 ? 0xFF
958 : lowest;
959 #ifndef EBCDIC
960 if (op != ANYOFR || ! isASCII(ANYOFRbase(o) + ANYOFRdelta(o)))
961 #endif
962 {
963 Perl_sv_catpvf(aTHX_ sv, " (First UTF-8 byte=%02X", lowest);
964 if (lowest != highest) {
965 Perl_sv_catpvf(aTHX_ sv, "-%02X", highest);
966 }
967 Perl_sv_catpvf(aTHX_ sv, ")");
968 }
969 }
970
971 SvREFCNT_dec(unresolved);
972 }
973 else if (k == ANYOFM) {
974 SV * cp_list = get_ANYOFM_contents(o);
975
976 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
977 if (op == NANYOFM) {
978 _invlist_invert(cp_list);
979 }
980
981 put_charclass_bitmap_innards(sv, NULL, cp_list, NULL, NULL, 0, TRUE);
982 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
983
984 SvREFCNT_dec(cp_list);
985 }
986 else if (k == ANYOFHbbm) {
987 SV * cp_list = get_ANYOFHbbm_contents(o);
988 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
989
990 sv_catsv(sv, invlist_contents(cp_list,
991 FALSE /* output suitable for catsv */
992 ));
993 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
994
995 SvREFCNT_dec(cp_list);
996 }
997 else if (k == POSIXD || k == NPOSIXD) {
998 U8 index = FLAGS(o) * 2;
999 if (index < C_ARRAY_LENGTH(anyofs)) {
1000 if (*anyofs[index] != '[') {
1001 sv_catpvs(sv, "[");
1002 }
1003 sv_catpv(sv, anyofs[index]);
1004 if (*anyofs[index] != '[') {
1005 sv_catpvs(sv, "]");
1006 }
1007 }
1008 else {
1009 Perl_sv_catpvf(aTHX_ sv, "[illegal type=%d])", index);
1010 }
1011 }
1012 else if (k == BOUND || k == NBOUND) {
1013 /* Must be synced with order of 'bound_type' in regcomp.h */
1014 const char * const bounds[] = {
1015 "", /* Traditional */
1016 "{gcb}",
1017 "{lb}",
1018 "{sb}",
1019 "{wb}"
1020 };
1021 assert(FLAGS(o) < C_ARRAY_LENGTH(bounds));
1022 sv_catpv(sv, bounds[FLAGS(o)]);
1023 }
1024 else if (k == BRANCHJ && (op == UNLESSM || op == IFMATCH)) {
1025 Perl_sv_catpvf(aTHX_ sv, "[%d", -(FLAGS(o)));
1026 if (NEXT_OFF(o)) {
1027 Perl_sv_catpvf(aTHX_ sv, "..-%d", FLAGS(o) - NEXT_OFF(o));
1028 }
1029 Perl_sv_catpvf(aTHX_ sv, "]");
1030 }
1031 else if (op == SBOL)
1032 Perl_sv_catpvf(aTHX_ sv, " /%s/", FLAGS(o) ? "\\A" : "^");
1033 else if (op == EVAL) {
1034 if (FLAGS(o) & EVAL_OPTIMISTIC_FLAG)
1035 Perl_sv_catpvf(aTHX_ sv, " optimistic");
1036 }
1037
1038 /* add on the verb argument if there is one */
1039 if ( ( k == VERB || op == ACCEPT || op == OPFAIL ) && FLAGS(o)) {
1040 if ( ARG1u(o) )
1041 Perl_sv_catpvf(aTHX_ sv, ":%" SVf,
1042 SVfARG((MUTABLE_SV(progi->data->data[ ARG1u( o ) ]))));
1043 else
1044 sv_catpvs(sv, ":NULL");
1045 }
1046 #else
1047 PERL_UNUSED_CONTEXT;
1048 PERL_UNUSED_ARG(sv);
1049 PERL_UNUSED_ARG(o);
1050 PERL_UNUSED_ARG(prog);
1051 PERL_UNUSED_ARG(reginfo);
1052 PERL_UNUSED_ARG(pRExC_state);
1053 #endif /* DEBUGGING */
1054 }
1055
1056
1057 #ifdef DEBUGGING
1058 STATIC void
S_put_code_point(pTHX_ SV * sv,UV c)1059 S_put_code_point(pTHX_ SV *sv, UV c)
1060 {
1061 PERL_ARGS_ASSERT_PUT_CODE_POINT;
1062
1063 if (c > 255) {
1064 Perl_sv_catpvf(aTHX_ sv, "\\x{%04" UVXf "}", c);
1065 }
1066 else if (isPRINT(c)) {
1067 const char string = (char) c;
1068
1069 /* We use {phrase} as metanotation in the class, so also escape literal
1070 * braces */
1071 if (isBACKSLASHED_PUNCT(c) || c == '{' || c == '}')
1072 sv_catpvs(sv, "\\");
1073 sv_catpvn(sv, &string, 1);
1074 }
1075 else if (isMNEMONIC_CNTRL(c)) {
1076 Perl_sv_catpvf(aTHX_ sv, "%s", cntrl_to_mnemonic((U8) c));
1077 }
1078 else {
1079 Perl_sv_catpvf(aTHX_ sv, "\\x%02X", (U8) c);
1080 }
1081 }
1082
1083 STATIC void
S_put_range(pTHX_ SV * sv,UV start,const UV end,const bool allow_literals)1084 S_put_range(pTHX_ SV *sv, UV start, const UV end, const bool allow_literals)
1085 {
1086 /* Appends to 'sv' a displayable version of the range of code points from
1087 * 'start' to 'end'. Mnemonics (like '\r') are used for the few controls
1088 * that have them, when they occur at the beginning or end of the range.
1089 * It uses hex to output the remaining code points, unless 'allow_literals'
1090 * is true, in which case the printable ASCII ones are output as-is (though
1091 * some of these will be escaped by put_code_point()).
1092 *
1093 * NOTE: This is designed only for printing ranges of code points that fit
1094 * inside an ANYOF bitmap. Higher code points are simply suppressed
1095 */
1096
1097 const unsigned int min_range_count = 3;
1098
1099 assert(start <= end);
1100
1101 PERL_ARGS_ASSERT_PUT_RANGE;
1102
1103 while (start <= end) {
1104 UV this_end;
1105 const char * format;
1106
1107 if ( end - start < min_range_count
1108 && (end - start <= 2 || (isPRINT_A(start) && isPRINT_A(end))))
1109 {
1110 /* Output a range of 1 or 2 chars individually, or longer ranges
1111 * when printable */
1112 for (; start <= end; start++) {
1113 put_code_point(sv, start);
1114 }
1115 break;
1116 }
1117
1118 /* If permitted by the input options, and there is a possibility that
1119 * this range contains a printable literal, look to see if there is
1120 * one. */
1121 if (allow_literals && start <= MAX_PRINT_A) {
1122
1123 /* If the character at the beginning of the range isn't an ASCII
1124 * printable, effectively split the range into two parts:
1125 * 1) the portion before the first such printable,
1126 * 2) the rest
1127 * and output them separately. */
1128 if (! isPRINT_A(start)) {
1129 UV temp_end = start + 1;
1130
1131 /* There is no point looking beyond the final possible
1132 * printable, in MAX_PRINT_A */
1133 UV max = MIN(end, MAX_PRINT_A);
1134
1135 while (temp_end <= max && ! isPRINT_A(temp_end)) {
1136 temp_end++;
1137 }
1138
1139 /* Here, temp_end points to one beyond the first printable if
1140 * found, or to one beyond 'max' if not. If none found, make
1141 * sure that we use the entire range */
1142 if (temp_end > MAX_PRINT_A) {
1143 temp_end = end + 1;
1144 }
1145
1146 /* Output the first part of the split range: the part that
1147 * doesn't have printables, with the parameter set to not look
1148 * for literals (otherwise we would infinitely recurse) */
1149 put_range(sv, start, temp_end - 1, FALSE);
1150
1151 /* The 2nd part of the range (if any) starts here. */
1152 start = temp_end;
1153
1154 /* We do a continue, instead of dropping down, because even if
1155 * the 2nd part is non-empty, it could be so short that we want
1156 * to output it as individual characters, as tested for at the
1157 * top of this loop. */
1158 continue;
1159 }
1160
1161 /* Here, 'start' is a printable ASCII. If it is an alphanumeric,
1162 * output a sub-range of just the digits or letters, then process
1163 * the remaining portion as usual. */
1164 if (isALPHANUMERIC_A(start)) {
1165 UV mask = (isDIGIT_A(start))
1166 ? CC_DIGIT_
1167 : isUPPER_A(start)
1168 ? CC_UPPER_
1169 : CC_LOWER_;
1170 UV temp_end = start + 1;
1171
1172 /* Find the end of the sub-range that includes just the
1173 * characters in the same class as the first character in it */
1174 while (temp_end <= end && generic_isCC_A_(temp_end, mask)) {
1175 temp_end++;
1176 }
1177 temp_end--;
1178
1179 /* For short ranges, don't duplicate the code above to output
1180 * them; just call recursively */
1181 if (temp_end - start < min_range_count) {
1182 put_range(sv, start, temp_end, FALSE);
1183 }
1184 else { /* Output as a range */
1185 put_code_point(sv, start);
1186 sv_catpvs(sv, "-");
1187 put_code_point(sv, temp_end);
1188 }
1189 start = temp_end + 1;
1190 continue;
1191 }
1192
1193 /* We output any other printables as individual characters */
1194 if (isPUNCT_A(start) || isSPACE_A(start)) {
1195 while (start <= end && (isPUNCT_A(start)
1196 || isSPACE_A(start)))
1197 {
1198 put_code_point(sv, start);
1199 start++;
1200 }
1201 continue;
1202 }
1203 } /* End of looking for literals */
1204
1205 /* Here is not to output as a literal. Some control characters have
1206 * mnemonic names. Split off any of those at the beginning and end of
1207 * the range to print mnemonically. It isn't possible for many of
1208 * these to be in a row, so this won't overwhelm with output */
1209 if ( start <= end
1210 && (isMNEMONIC_CNTRL(start) || isMNEMONIC_CNTRL(end)))
1211 {
1212 while (isMNEMONIC_CNTRL(start) && start <= end) {
1213 put_code_point(sv, start);
1214 start++;
1215 }
1216
1217 /* If this didn't take care of the whole range ... */
1218 if (start <= end) {
1219
1220 /* Look backwards from the end to find the final non-mnemonic
1221 * */
1222 UV temp_end = end;
1223 while (isMNEMONIC_CNTRL(temp_end)) {
1224 temp_end--;
1225 }
1226
1227 /* And separately output the interior range that doesn't start
1228 * or end with mnemonics */
1229 put_range(sv, start, temp_end, FALSE);
1230
1231 /* Then output the mnemonic trailing controls */
1232 start = temp_end + 1;
1233 while (start <= end) {
1234 put_code_point(sv, start);
1235 start++;
1236 }
1237 break;
1238 }
1239 }
1240
1241 /* As a final resort, output the range or subrange as hex. */
1242
1243 if (start >= NUM_ANYOF_CODE_POINTS) {
1244 this_end = end;
1245 }
1246 else { /* Have to split range at the bitmap boundary */
1247 this_end = (end < NUM_ANYOF_CODE_POINTS)
1248 ? end
1249 : NUM_ANYOF_CODE_POINTS - 1;
1250 }
1251 #if NUM_ANYOF_CODE_POINTS > 256
1252 format = (this_end < 256)
1253 ? "\\x%02" UVXf "-\\x%02" UVXf
1254 : "\\x{%04" UVXf "}-\\x{%04" UVXf "}";
1255 #else
1256 format = "\\x%02" UVXf "-\\x%02" UVXf;
1257 #endif
1258 GCC_DIAG_IGNORE_STMT(-Wformat-nonliteral);
1259 Perl_sv_catpvf(aTHX_ sv, format, start, this_end);
1260 GCC_DIAG_RESTORE_STMT;
1261 break;
1262 }
1263 }
1264
1265 STATIC void
S_put_charclass_bitmap_innards_invlist(pTHX_ SV * sv,SV * invlist)1266 S_put_charclass_bitmap_innards_invlist(pTHX_ SV *sv, SV* invlist)
1267 {
1268 /* Concatenate onto the PV in 'sv' a displayable form of the inversion list
1269 * 'invlist' */
1270
1271 UV start, end;
1272 bool allow_literals = TRUE;
1273
1274 PERL_ARGS_ASSERT_PUT_CHARCLASS_BITMAP_INNARDS_INVLIST;
1275
1276 /* Generally, it is more readable if printable characters are output as
1277 * literals, but if a range (nearly) spans all of them, it's best to output
1278 * it as a single range. This code will use a single range if all but 2
1279 * ASCII printables are in it */
1280 invlist_iterinit(invlist);
1281 while (invlist_iternext(invlist, &start, &end)) {
1282
1283 /* If the range starts beyond the final printable, it doesn't have any
1284 * in it */
1285 if (start > MAX_PRINT_A) {
1286 break;
1287 }
1288
1289 /* In both ASCII and EBCDIC, a SPACE is the lowest printable. To span
1290 * all but two, the range must start and end no later than 2 from
1291 * either end */
1292 if (start < ' ' + 2 && end > MAX_PRINT_A - 2) {
1293 if (end > MAX_PRINT_A) {
1294 end = MAX_PRINT_A;
1295 }
1296 if (start < ' ') {
1297 start = ' ';
1298 }
1299 if (end - start >= MAX_PRINT_A - ' ' - 2) {
1300 allow_literals = FALSE;
1301 }
1302 break;
1303 }
1304 }
1305 invlist_iterfinish(invlist);
1306
1307 /* Here we have figured things out. Output each range */
1308 invlist_iterinit(invlist);
1309 while (invlist_iternext(invlist, &start, &end)) {
1310 if (start >= NUM_ANYOF_CODE_POINTS) {
1311 break;
1312 }
1313 put_range(sv, start, end, allow_literals);
1314 }
1315 invlist_iterfinish(invlist);
1316
1317 return;
1318 }
1319
1320 STATIC SV*
S_put_charclass_bitmap_innards_common(pTHX_ SV * invlist,SV * posixes,SV * only_utf8,SV * not_utf8,SV * only_utf8_locale,const bool invert)1321 S_put_charclass_bitmap_innards_common(pTHX_
1322 SV* invlist, /* The bitmap */
1323 SV* posixes, /* Under /l, things like [:word:], \S */
1324 SV* only_utf8, /* Under /d, matches iff the target is UTF-8 */
1325 SV* not_utf8, /* /d, matches iff the target isn't UTF-8 */
1326 SV* only_utf8_locale, /* Under /l, matches if the locale is UTF-8 */
1327 const bool invert /* Is the result to be inverted? */
1328 )
1329 {
1330 /* Create and return an SV containing a displayable version of the bitmap
1331 * and associated information determined by the input parameters. If the
1332 * output would have been only the inversion indicator '^', NULL is instead
1333 * returned. */
1334
1335 SV * output;
1336
1337 PERL_ARGS_ASSERT_PUT_CHARCLASS_BITMAP_INNARDS_COMMON;
1338
1339 if (invert) {
1340 output = newSVpvs("^");
1341 }
1342 else {
1343 output = newSVpvs("");
1344 }
1345
1346 /* First, the code points in the bitmap that are unconditionally there */
1347 put_charclass_bitmap_innards_invlist(output, invlist);
1348
1349 /* Traditionally, these have been placed after the main code points */
1350 if (posixes) {
1351 sv_catsv(output, posixes);
1352 }
1353
1354 if (only_utf8 && _invlist_len(only_utf8)) {
1355 Perl_sv_catpvf(aTHX_ output, "%s{utf8}%s", PL_colors[1], PL_colors[0]);
1356 put_charclass_bitmap_innards_invlist(output, only_utf8);
1357 }
1358
1359 if (not_utf8 && _invlist_len(not_utf8)) {
1360 Perl_sv_catpvf(aTHX_ output, "%s{not utf8}%s", PL_colors[1], PL_colors[0]);
1361 put_charclass_bitmap_innards_invlist(output, not_utf8);
1362 }
1363
1364 if (only_utf8_locale && _invlist_len(only_utf8_locale)) {
1365 Perl_sv_catpvf(aTHX_ output, "%s{utf8 locale}%s", PL_colors[1], PL_colors[0]);
1366 put_charclass_bitmap_innards_invlist(output, only_utf8_locale);
1367
1368 /* This is the only list in this routine that can legally contain code
1369 * points outside the bitmap range. The call just above to
1370 * 'put_charclass_bitmap_innards_invlist' will simply suppress them, so
1371 * output them here. There's about a half-dozen possible, and none in
1372 * contiguous ranges longer than 2 */
1373 if (invlist_highest(only_utf8_locale) >= NUM_ANYOF_CODE_POINTS) {
1374 UV start, end;
1375 SV* above_bitmap = NULL;
1376
1377 _invlist_subtract(only_utf8_locale, PL_InBitmap, &above_bitmap);
1378
1379 invlist_iterinit(above_bitmap);
1380 while (invlist_iternext(above_bitmap, &start, &end)) {
1381 UV i;
1382
1383 for (i = start; i <= end; i++) {
1384 put_code_point(output, i);
1385 }
1386 }
1387 invlist_iterfinish(above_bitmap);
1388 SvREFCNT_dec_NN(above_bitmap);
1389 }
1390 }
1391
1392 if (invert && SvCUR(output) == 1) {
1393 return NULL;
1394 }
1395
1396 return output;
1397 }
1398
1399 STATIC U8
S_put_charclass_bitmap_innards(pTHX_ SV * sv,char * bitmap,SV * nonbitmap_invlist,SV * only_utf8_locale_invlist,const regnode * const node,const U8 flags,const bool force_as_is_display)1400 S_put_charclass_bitmap_innards(pTHX_ SV *sv,
1401 char *bitmap,
1402 SV *nonbitmap_invlist,
1403 SV *only_utf8_locale_invlist,
1404 const regnode * const node,
1405 const U8 flags,
1406 const bool force_as_is_display)
1407 {
1408 /* Appends to 'sv' a displayable version of the innards of the bracketed
1409 * character class defined by the other arguments:
1410 * 'bitmap' points to the bitmap, or NULL if to ignore that.
1411 * 'nonbitmap_invlist' is an inversion list of the code points that are in
1412 * the bitmap range, but for some reason aren't in the bitmap; NULL if
1413 * none. The reasons for this could be that they require some
1414 * condition such as the target string being or not being in UTF-8
1415 * (under /d), or because they came from a user-defined property that
1416 * was not resolved at the time of the regex compilation (under /u)
1417 * 'only_utf8_locale_invlist' is an inversion list of the code points that
1418 * are valid only if the runtime locale is a UTF-8 one; NULL if none
1419 * 'node' is the regex pattern ANYOF node. It is needed only when the
1420 * above two parameters are not null, and is passed so that this
1421 * routine can tease apart the various reasons for them.
1422 * 'flags' is the flags field of 'node'
1423 * 'force_as_is_display' is TRUE if this routine should definitely NOT try
1424 * to invert things to see if that leads to a cleaner display. If
1425 * FALSE, this routine is free to use its judgment about doing this.
1426 *
1427 * It returns 0 if nothing was actually output. (It may be that
1428 * the bitmap, etc is empty.)
1429 * 1 if the output wasn't inverted (didn't begin with a '^')
1430 * 2 if the output was inverted (did begin with a '^')
1431 *
1432 * When called for outputting the bitmap of a non-ANYOF node, just pass the
1433 * bitmap, with the succeeding parameters set to NULL, and the final one to
1434 * FALSE.
1435 */
1436
1437 /* In general, it tries to display the 'cleanest' representation of the
1438 * innards, choosing whether to display them inverted or not, regardless of
1439 * whether the class itself is to be inverted. However, there are some
1440 * cases where it can't try inverting, as what actually matches isn't known
1441 * until runtime, and hence the inversion isn't either. */
1442
1443 bool inverting_allowed = ! force_as_is_display;
1444
1445 int i;
1446 STRLEN orig_sv_cur = SvCUR(sv);
1447
1448 SV* invlist; /* Inversion list we accumulate of code points that
1449 are unconditionally matched */
1450 SV* only_utf8 = NULL; /* Under /d, list of matches iff the target is
1451 UTF-8 */
1452 SV* not_utf8 = NULL; /* /d, list of matches iff the target isn't UTF-8
1453 */
1454 SV* posixes = NULL; /* Under /l, string of things like [:word:], \D */
1455 SV* only_utf8_locale = NULL; /* Under /l, list of matches if the locale
1456 is UTF-8 */
1457
1458 SV* as_is_display; /* The output string when we take the inputs
1459 literally */
1460 SV* inverted_display; /* The output string when we invert the inputs */
1461
1462 bool invert = cBOOL(flags & ANYOF_INVERT); /* Is the input to be inverted
1463 to match? */
1464 /* We are biased in favor of displaying things without them being inverted,
1465 * as that is generally easier to understand */
1466 const int bias = 5;
1467
1468 PERL_ARGS_ASSERT_PUT_CHARCLASS_BITMAP_INNARDS;
1469
1470 /* Start off with whatever code points are passed in. (We clone, so we
1471 * don't change the caller's list) */
1472 if (nonbitmap_invlist) {
1473 assert(invlist_highest(nonbitmap_invlist) < NUM_ANYOF_CODE_POINTS);
1474 invlist = invlist_clone(nonbitmap_invlist, NULL);
1475 }
1476 else { /* Worst case size is every other code point is matched */
1477 invlist = _new_invlist(NUM_ANYOF_CODE_POINTS / 2);
1478 }
1479
1480 if (flags) {
1481 if (OP(node) == ANYOFD) {
1482
1483 /* This flag indicates that the code points below 0x100 in the
1484 * nonbitmap list are precisely the ones that match only when the
1485 * target is UTF-8 (they should all be non-ASCII). */
1486 if (flags & ANYOF_HAS_EXTRA_RUNTIME_MATCHES) {
1487 _invlist_intersection(invlist, PL_UpperLatin1, &only_utf8);
1488 _invlist_subtract(invlist, only_utf8, &invlist);
1489 }
1490
1491 /* And this flag for matching all non-ASCII 0xFF and below */
1492 if (flags & ANYOFD_NON_UTF8_MATCHES_ALL_NON_ASCII__shared) {
1493 not_utf8 = invlist_clone(PL_UpperLatin1, NULL);
1494 }
1495 }
1496 else if (OP(node) == ANYOFL || OP(node) == ANYOFPOSIXL) {
1497
1498 /* If either of these flags are set, what matches isn't
1499 * determinable except during execution, so don't know enough here
1500 * to invert */
1501 if (flags & (ANYOFL_FOLD|ANYOF_MATCHES_POSIXL)) {
1502 inverting_allowed = FALSE;
1503 }
1504
1505 /* What the posix classes match also varies at runtime, so these
1506 * will be output symbolically. */
1507 if (ANYOF_POSIXL_TEST_ANY_SET(node)) {
1508 int i;
1509
1510 posixes = newSVpvs("");
1511 for (i = 0; i < ANYOF_POSIXL_MAX; i++) {
1512 if (ANYOF_POSIXL_TEST(node, i)) {
1513 sv_catpv(posixes, anyofs[i]);
1514 }
1515 }
1516 }
1517 }
1518 }
1519
1520 /* Accumulate the bit map into the unconditional match list */
1521 if (bitmap) {
1522 for (i = 0; i < NUM_ANYOF_CODE_POINTS; i++) {
1523 if (BITMAP_TEST(bitmap, i)) {
1524 int start = i++;
1525 for (;
1526 i < NUM_ANYOF_CODE_POINTS && BITMAP_TEST(bitmap, i);
1527 i++)
1528 { /* empty */ }
1529 invlist = _add_range_to_invlist(invlist, start, i-1);
1530 }
1531 }
1532 }
1533
1534 /* Make sure that the conditional match lists don't have anything in them
1535 * that match unconditionally; otherwise the output is quite confusing.
1536 * This could happen if the code that populates these misses some
1537 * duplication. */
1538 if (only_utf8) {
1539 _invlist_subtract(only_utf8, invlist, &only_utf8);
1540 }
1541 if (not_utf8) {
1542 _invlist_subtract(not_utf8, invlist, ¬_utf8);
1543 }
1544
1545 if (only_utf8_locale_invlist) {
1546
1547 /* Since this list is passed in, we have to make a copy before
1548 * modifying it */
1549 only_utf8_locale = invlist_clone(only_utf8_locale_invlist, NULL);
1550
1551 _invlist_subtract(only_utf8_locale, invlist, &only_utf8_locale);
1552
1553 /* And, it can get really weird for us to try outputting an inverted
1554 * form of this list when it has things above the bitmap, so don't even
1555 * try */
1556 if (invlist_highest(only_utf8_locale) >= NUM_ANYOF_CODE_POINTS) {
1557 inverting_allowed = FALSE;
1558 }
1559 }
1560
1561 /* Calculate what the output would be if we take the input as-is */
1562 as_is_display = put_charclass_bitmap_innards_common(invlist,
1563 posixes,
1564 only_utf8,
1565 not_utf8,
1566 only_utf8_locale,
1567 invert);
1568
1569 /* If have to take the output as-is, just do that */
1570 if (! inverting_allowed) {
1571 if (as_is_display) {
1572 sv_catsv(sv, as_is_display);
1573 SvREFCNT_dec_NN(as_is_display);
1574 }
1575 }
1576 else { /* But otherwise, create the output again on the inverted input, and
1577 use whichever version is shorter */
1578
1579 int inverted_bias, as_is_bias;
1580
1581 /* We will apply our bias to whichever of the results doesn't have
1582 * the '^' */
1583 bool trial_invert;
1584 if (invert) {
1585 trial_invert = FALSE;
1586 as_is_bias = bias;
1587 inverted_bias = 0;
1588 }
1589 else {
1590 trial_invert = TRUE;
1591 as_is_bias = 0;
1592 inverted_bias = bias;
1593 }
1594
1595 /* Now invert each of the lists that contribute to the output,
1596 * excluding from the result things outside the possible range */
1597
1598 /* For the unconditional inversion list, we have to add in all the
1599 * conditional code points, so that when inverted, they will be gone
1600 * from it */
1601 _invlist_union(only_utf8, invlist, &invlist);
1602 _invlist_union(not_utf8, invlist, &invlist);
1603 _invlist_union(only_utf8_locale, invlist, &invlist);
1604 _invlist_invert(invlist);
1605 _invlist_intersection(invlist, PL_InBitmap, &invlist);
1606
1607 if (only_utf8) {
1608 _invlist_invert(only_utf8);
1609 _invlist_intersection(only_utf8, PL_UpperLatin1, &only_utf8);
1610 }
1611 else if (not_utf8) {
1612
1613 /* If a code point matches iff the target string is not in UTF-8,
1614 * then complementing the result has it not match iff not in UTF-8,
1615 * which is the same thing as matching iff it is UTF-8. */
1616 only_utf8 = not_utf8;
1617 not_utf8 = NULL;
1618 }
1619
1620 if (only_utf8_locale) {
1621 _invlist_invert(only_utf8_locale);
1622 _invlist_intersection(only_utf8_locale,
1623 PL_InBitmap,
1624 &only_utf8_locale);
1625 }
1626
1627 inverted_display = put_charclass_bitmap_innards_common(
1628 invlist,
1629 posixes,
1630 only_utf8,
1631 not_utf8,
1632 only_utf8_locale, trial_invert);
1633
1634 /* Use the shortest representation, taking into account our bias
1635 * against showing it inverted */
1636 if ( inverted_display
1637 && ( ! as_is_display
1638 || ( SvCUR(inverted_display) + inverted_bias
1639 < SvCUR(as_is_display) + as_is_bias)))
1640 {
1641 sv_catsv(sv, inverted_display);
1642 invert = ! invert;
1643 }
1644 else if (as_is_display) {
1645 sv_catsv(sv, as_is_display);
1646 }
1647
1648 SvREFCNT_dec(as_is_display);
1649 SvREFCNT_dec(inverted_display);
1650 }
1651
1652 SvREFCNT_dec_NN(invlist);
1653 SvREFCNT_dec(only_utf8);
1654 SvREFCNT_dec(not_utf8);
1655 SvREFCNT_dec(posixes);
1656 SvREFCNT_dec(only_utf8_locale);
1657
1658 U8 did_output_something = (bool) (SvCUR(sv) > orig_sv_cur);
1659 if (did_output_something) {
1660 /* Distinguish between non and inverted cases */
1661 did_output_something += invert;
1662 }
1663
1664 return did_output_something;
1665 }
1666 #endif /* DEBUGGING */
1667