1 #ifdef PERL_EXT_RE_BUILD
2 #include "re_top.h"
3 #endif
4
5 #include "EXTERN.h"
6 #define PERL_IN_REGEX_ENGINE
7 #define PERL_IN_REGCOMP_ANY
8 #define PERL_IN_REGCOMP_DEBUG_C
9 #include "perl.h"
10
11 #ifdef PERL_IN_XSUB_RE
12 # include "re_comp.h"
13 #else
14 # include "regcomp.h"
15 #endif
16
17 #include "invlist_inline.h"
18 #include "unicode_constants.h"
19 #include "regcomp_internal.h"
20
21 #ifdef DEBUGGING
22
23 int
Perl_re_printf(pTHX_ const char * fmt,...)24 Perl_re_printf(pTHX_ const char *fmt, ...)
25 {
26 va_list ap;
27 int result;
28 PerlIO *f= Perl_debug_log;
29 PERL_ARGS_ASSERT_RE_PRINTF;
30 va_start(ap, fmt);
31 result = PerlIO_vprintf(f, fmt, ap);
32 va_end(ap);
33 return result;
34 }
35
36 int
Perl_re_indentf(pTHX_ const char * fmt,U32 depth,...)37 Perl_re_indentf(pTHX_ const char *fmt, U32 depth, ...)
38 {
39 va_list ap;
40 int result;
41 PerlIO *f= Perl_debug_log;
42 PERL_ARGS_ASSERT_RE_INDENTF;
43 va_start(ap, depth);
44 PerlIO_printf(f, "%*s", ( (int)depth % 20 ) * 2, "");
45 result = PerlIO_vprintf(f, fmt, ap);
46 va_end(ap);
47 return result;
48 }
49
50 void
Perl_debug_show_study_flags(pTHX_ U32 flags,const char * open_str,const char * close_str)51 Perl_debug_show_study_flags(pTHX_ U32 flags, const char *open_str,
52 const char *close_str)
53 {
54 PERL_ARGS_ASSERT_DEBUG_SHOW_STUDY_FLAGS;
55 if (!flags)
56 return;
57
58 Perl_re_printf( aTHX_ "%s", open_str);
59 DEBUG_SHOW_STUDY_FLAG(flags, SF_BEFORE_SEOL);
60 DEBUG_SHOW_STUDY_FLAG(flags, SF_BEFORE_MEOL);
61 DEBUG_SHOW_STUDY_FLAG(flags, SF_IS_INF);
62 DEBUG_SHOW_STUDY_FLAG(flags, SF_HAS_PAR);
63 DEBUG_SHOW_STUDY_FLAG(flags, SF_IN_PAR);
64 DEBUG_SHOW_STUDY_FLAG(flags, SF_HAS_EVAL);
65 DEBUG_SHOW_STUDY_FLAG(flags, SCF_DO_SUBSTR);
66 DEBUG_SHOW_STUDY_FLAG(flags, SCF_DO_STCLASS_AND);
67 DEBUG_SHOW_STUDY_FLAG(flags, SCF_DO_STCLASS_OR);
68 DEBUG_SHOW_STUDY_FLAG(flags, SCF_DO_STCLASS);
69 DEBUG_SHOW_STUDY_FLAG(flags, SCF_WHILEM_VISITED_POS);
70 DEBUG_SHOW_STUDY_FLAG(flags, SCF_TRIE_RESTUDY);
71 DEBUG_SHOW_STUDY_FLAG(flags, SCF_SEEN_ACCEPT);
72 DEBUG_SHOW_STUDY_FLAG(flags, SCF_TRIE_DOING_RESTUDY);
73 DEBUG_SHOW_STUDY_FLAG(flags, SCF_IN_DEFINE);
74 Perl_re_printf( aTHX_ "%s", close_str);
75 }
76
77 void
Perl_debug_studydata(pTHX_ const char * where,scan_data_t * data,U32 depth,int is_inf,SSize_t min,SSize_t stopmin,SSize_t delta)78 Perl_debug_studydata(pTHX_ const char *where, scan_data_t *data,
79 U32 depth, int is_inf,
80 SSize_t min, SSize_t stopmin, SSize_t delta)
81 {
82 PERL_ARGS_ASSERT_DEBUG_STUDYDATA;
83 DECLARE_AND_GET_RE_DEBUG_FLAGS;
84
85 DEBUG_OPTIMISE_MORE_r({
86 if (!data) {
87 Perl_re_indentf(aTHX_ "%s: NO DATA",
88 depth,
89 where);
90 return;
91 }
92 Perl_re_indentf(aTHX_ "%s: M/S/D: %" IVdf "/%" IVdf "/%" IVdf " Pos:%" IVdf "/%" IVdf " Flags: 0x%" UVXf,
93 depth,
94 where,
95 min, stopmin, delta,
96 (IV)data->pos_min,
97 (IV)data->pos_delta,
98 (UV)data->flags
99 );
100
101 Perl_debug_show_study_flags(aTHX_ data->flags," [","]");
102
103 Perl_re_printf( aTHX_
104 " Whilem_c: %" IVdf " Lcp: %" IVdf " %s",
105 (IV)data->whilem_c,
106 (IV)(data->last_closep ? *((data)->last_closep) : -1),
107 is_inf ? "INF " : ""
108 );
109
110 if (data->last_found) {
111 int i;
112 Perl_re_printf(aTHX_
113 "Last:'%s' %" IVdf ":%" IVdf "/%" IVdf,
114 SvPVX_const(data->last_found),
115 (IV)data->last_end,
116 (IV)data->last_start_min,
117 (IV)data->last_start_max
118 );
119
120 for (i = 0; i < 2; i++) {
121 Perl_re_printf(aTHX_
122 " %s%s: '%s' @ %" IVdf "/%" IVdf,
123 data->cur_is_floating == i ? "*" : "",
124 i ? "Float" : "Fixed",
125 SvPVX_const(data->substrs[i].str),
126 (IV)data->substrs[i].min_offset,
127 (IV)data->substrs[i].max_offset
128 );
129 Perl_debug_show_study_flags(aTHX_ data->substrs[i].flags," [","]");
130 }
131 }
132
133 Perl_re_printf( aTHX_ "\n");
134 });
135 }
136
137
138 void
Perl_debug_peep(pTHX_ const char * str,const RExC_state_t * pRExC_state,regnode * scan,U32 depth,U32 flags)139 Perl_debug_peep(pTHX_ const char *str, const RExC_state_t *pRExC_state,
140 regnode *scan, U32 depth, U32 flags)
141 {
142 PERL_ARGS_ASSERT_DEBUG_PEEP;
143 DECLARE_AND_GET_RE_DEBUG_FLAGS;
144
145 DEBUG_OPTIMISE_r({
146 regnode *Next;
147
148 if (!scan)
149 return;
150 Next = regnext(scan);
151 regprop(RExC_rx, RExC_mysv, scan, NULL, pRExC_state);
152 Perl_re_indentf( aTHX_ "%s>%3d: %s (%d)",
153 depth,
154 str,
155 REG_NODE_NUM(scan), SvPV_nolen_const(RExC_mysv),
156 Next ? (REG_NODE_NUM(Next)) : 0 );
157 Perl_debug_show_study_flags(aTHX_ flags," [ ","]");
158 Perl_re_printf( aTHX_ "\n");
159 });
160 }
161
162 #endif /* DEBUGGING */
163
164 /*
165 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
166 */
167 #ifdef DEBUGGING
168
169 static void
S_regdump_intflags(pTHX_ const char * lead,const U32 flags)170 S_regdump_intflags(pTHX_ const char *lead, const U32 flags)
171 {
172 int bit;
173 int set=0;
174
175 ASSUME(REG_INTFLAGS_NAME_SIZE <= sizeof(flags)*8);
176
177 for (bit=0; bit<=REG_INTFLAGS_NAME_SIZE; bit++) {
178 if (flags & (1<<bit)) {
179 if (!set++ && lead)
180 Perl_re_printf( aTHX_ "%s", lead);
181 Perl_re_printf( aTHX_ "%s ", PL_reg_intflags_name[bit]);
182 }
183 }
184 if (lead) {
185 if (set)
186 Perl_re_printf( aTHX_ "\n");
187 else
188 Perl_re_printf( aTHX_ "%s[none-set]\n", lead);
189 }
190 }
191
192 static void
S_regdump_extflags(pTHX_ const char * lead,const U32 flags)193 S_regdump_extflags(pTHX_ const char *lead, const U32 flags)
194 {
195 int bit;
196 int set=0;
197 regex_charset cs;
198
199 ASSUME(REG_EXTFLAGS_NAME_SIZE <= sizeof(flags)*8);
200
201 for (bit=0; bit<REG_EXTFLAGS_NAME_SIZE; bit++) {
202 if (flags & (1U<<bit)) {
203 if ((1U<<bit) & RXf_PMf_CHARSET) { /* Output separately, below */
204 continue;
205 }
206 if (!set++ && lead)
207 Perl_re_printf( aTHX_ "%s", lead);
208 Perl_re_printf( aTHX_ "%s ", PL_reg_extflags_name[bit]);
209 }
210 }
211 if ((cs = get_regex_charset(flags)) != REGEX_DEPENDS_CHARSET) {
212 if (!set++ && lead) {
213 Perl_re_printf( aTHX_ "%s", lead);
214 }
215 switch (cs) {
216 case REGEX_UNICODE_CHARSET:
217 Perl_re_printf( aTHX_ "UNICODE");
218 break;
219 case REGEX_LOCALE_CHARSET:
220 Perl_re_printf( aTHX_ "LOCALE");
221 break;
222 case REGEX_ASCII_RESTRICTED_CHARSET:
223 Perl_re_printf( aTHX_ "ASCII-RESTRICTED");
224 break;
225 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
226 Perl_re_printf( aTHX_ "ASCII-MORE_RESTRICTED");
227 break;
228 default:
229 Perl_re_printf( aTHX_ "UNKNOWN CHARACTER SET");
230 break;
231 }
232 }
233 if (lead) {
234 if (set)
235 Perl_re_printf( aTHX_ "\n");
236 else
237 Perl_re_printf( aTHX_ "%s[none-set]\n", lead);
238 }
239 }
240 #endif
241
242 void
Perl_regdump(pTHX_ const regexp * r)243 Perl_regdump(pTHX_ const regexp *r)
244 {
245 #ifdef DEBUGGING
246 int i;
247 SV * const sv = sv_newmortal();
248 SV *dsv= sv_newmortal();
249 RXi_GET_DECL(r, ri);
250 DECLARE_AND_GET_RE_DEBUG_FLAGS;
251
252 PERL_ARGS_ASSERT_REGDUMP;
253
254 (void)dumpuntil(r, ri->program, ri->program + 1, NULL, NULL, sv, 0, 0);
255
256 /* Header fields of interest. */
257 for (i = 0; i < 2; i++) {
258 if (r->substrs->data[i].substr) {
259 RE_PV_QUOTED_DECL(s, 0, dsv,
260 SvPVX_const(r->substrs->data[i].substr),
261 RE_SV_DUMPLEN(r->substrs->data[i].substr),
262 PL_dump_re_max_len);
263 Perl_re_printf( aTHX_
264 "%s %s%s at %" IVdf "..%" UVuf " ",
265 i ? "floating" : "anchored",
266 s,
267 RE_SV_TAIL(r->substrs->data[i].substr),
268 (IV)r->substrs->data[i].min_offset,
269 (UV)r->substrs->data[i].max_offset);
270 }
271 else if (r->substrs->data[i].utf8_substr) {
272 RE_PV_QUOTED_DECL(s, 1, dsv,
273 SvPVX_const(r->substrs->data[i].utf8_substr),
274 RE_SV_DUMPLEN(r->substrs->data[i].utf8_substr),
275 30);
276 Perl_re_printf( aTHX_
277 "%s utf8 %s%s at %" IVdf "..%" UVuf " ",
278 i ? "floating" : "anchored",
279 s,
280 RE_SV_TAIL(r->substrs->data[i].utf8_substr),
281 (IV)r->substrs->data[i].min_offset,
282 (UV)r->substrs->data[i].max_offset);
283 }
284 }
285
286 if (r->check_substr || r->check_utf8)
287 Perl_re_printf( aTHX_
288 (const char *)
289 ( r->check_substr == r->substrs->data[1].substr
290 && r->check_utf8 == r->substrs->data[1].utf8_substr
291 ? "(checking floating" : "(checking anchored"));
292 if (r->intflags & PREGf_NOSCAN)
293 Perl_re_printf( aTHX_ " noscan");
294 if (r->extflags & RXf_CHECK_ALL)
295 Perl_re_printf( aTHX_ " isall");
296 if (r->check_substr || r->check_utf8)
297 Perl_re_printf( aTHX_ ") ");
298
299 if (ri->regstclass) {
300 regprop(r, sv, ri->regstclass, NULL, NULL);
301 Perl_re_printf( aTHX_ "stclass %s ", SvPVX_const(sv));
302 }
303 if (r->intflags & PREGf_ANCH) {
304 Perl_re_printf( aTHX_ "anchored");
305 if (r->intflags & PREGf_ANCH_MBOL)
306 Perl_re_printf( aTHX_ "(MBOL)");
307 if (r->intflags & PREGf_ANCH_SBOL)
308 Perl_re_printf( aTHX_ "(SBOL)");
309 if (r->intflags & PREGf_ANCH_GPOS)
310 Perl_re_printf( aTHX_ "(GPOS)");
311 Perl_re_printf( aTHX_ " ");
312 }
313 if (r->intflags & PREGf_GPOS_SEEN)
314 Perl_re_printf( aTHX_ "GPOS:%" UVuf " ", (UV)r->gofs);
315 if (r->intflags & PREGf_SKIP)
316 Perl_re_printf( aTHX_ "plus ");
317 if (r->intflags & PREGf_IMPLICIT)
318 Perl_re_printf( aTHX_ "implicit ");
319 Perl_re_printf( aTHX_ "minlen %" IVdf " ", (IV)r->minlen);
320 if (r->extflags & RXf_EVAL_SEEN)
321 Perl_re_printf( aTHX_ "with eval ");
322 Perl_re_printf( aTHX_ "\n");
323 DEBUG_FLAGS_r({
324 regdump_extflags("r->extflags: ", r->extflags);
325 regdump_intflags("r->intflags: ", r->intflags);
326 });
327 #else
328 PERL_ARGS_ASSERT_REGDUMP;
329 PERL_UNUSED_CONTEXT;
330 PERL_UNUSED_ARG(r);
331 #endif /* DEBUGGING */
332 }
333
334 /* Should be synchronized with ANYOF_ #defines in regcomp.h */
335 #ifdef DEBUGGING
336
337 # if CC_WORDCHAR_ != 0 || CC_DIGIT_ != 1 || CC_ALPHA_ != 2 \
338 || CC_LOWER_ != 3 || CC_UPPER_ != 4 || CC_PUNCT_ != 5 \
339 || CC_PRINT_ != 6 || CC_ALPHANUMERIC_ != 7 || CC_GRAPH_ != 8 \
340 || CC_CASED_ != 9 || CC_SPACE_ != 10 || CC_BLANK_ != 11 \
341 || CC_XDIGIT_ != 12 || CC_CNTRL_ != 13 || CC_ASCII_ != 14 \
342 || CC_VERTSPACE_ != 15
343 # error Need to adjust order of anyofs[]
344 # endif
345 static const char * const anyofs[] = {
346 "\\w",
347 "\\W",
348 "\\d",
349 "\\D",
350 "[:alpha:]",
351 "[:^alpha:]",
352 "[:lower:]",
353 "[:^lower:]",
354 "[:upper:]",
355 "[:^upper:]",
356 "[:punct:]",
357 "[:^punct:]",
358 "[:print:]",
359 "[:^print:]",
360 "[:alnum:]",
361 "[:^alnum:]",
362 "[:graph:]",
363 "[:^graph:]",
364 "[:cased:]",
365 "[:^cased:]",
366 "\\s",
367 "\\S",
368 "[:blank:]",
369 "[:^blank:]",
370 "[:xdigit:]",
371 "[:^xdigit:]",
372 "[:cntrl:]",
373 "[:^cntrl:]",
374 "[:ascii:]",
375 "[:^ascii:]",
376 "\\v",
377 "\\V"
378 };
379 #endif
380
381 /*
382 - regprop - printable representation of opcode, with run time support
383 */
384
385 void
Perl_regprop(pTHX_ const regexp * prog,SV * sv,const regnode * o,const regmatch_info * reginfo,const RExC_state_t * pRExC_state)386 Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_info *reginfo, const RExC_state_t *pRExC_state)
387 {
388 #ifdef DEBUGGING
389 U8 k;
390 const U8 op = OP(o);
391 RXi_GET_DECL(prog, progi);
392 DECLARE_AND_GET_RE_DEBUG_FLAGS;
393
394 PERL_ARGS_ASSERT_REGPROP;
395
396 SvPVCLEAR(sv);
397
398 if (op > REGNODE_MAX) { /* regnode.type is unsigned */
399 if (pRExC_state) { /* This gives more info, if we have it */
400 FAIL3("panic: corrupted regexp opcode %d > %d",
401 (int)op, (int)REGNODE_MAX);
402 }
403 else {
404 Perl_croak(aTHX_ "panic: corrupted regexp opcode %d > %d",
405 (int)op, (int)REGNODE_MAX);
406 }
407 }
408 sv_catpv(sv, REGNODE_NAME(op)); /* Take off const! */
409
410 k = REGNODE_TYPE(op);
411 if (op == BRANCH) {
412 Perl_sv_catpvf(aTHX_ sv, " (buf:%" IVdf "/%" IVdf ")", (IV)ARG1a(o),(IV)ARG1b(o));
413 }
414 else if (op == BRANCHJ) {
415 Perl_sv_catpvf(aTHX_ sv, " (buf:%" IVdf "/%" IVdf ")", (IV)ARG2a(o),(IV)ARG2b(o));
416 }
417 else if (k == EXACT) {
418 sv_catpvs(sv, " ");
419 /* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
420 * is a crude hack but it may be the best for now since
421 * we have no flag "this EXACTish node was UTF-8"
422 * --jhi */
423 pv_pretty(sv, STRING(o), STR_LEN(o), PL_dump_re_max_len,
424 PL_colors[0], PL_colors[1],
425 PERL_PV_ESCAPE_UNI_DETECT |
426 PERL_PV_ESCAPE_NONASCII |
427 PERL_PV_PRETTY_ELLIPSES |
428 PERL_PV_PRETTY_LTGT |
429 PERL_PV_PRETTY_NOCLEAR
430 );
431 } else if (k == TRIE) {
432 /* print the details of the trie in dumpuntil instead, as
433 * progi->data isn't available here */
434 const U32 n = ARG1u(o);
435 const reg_ac_data * const ac = IS_TRIE_AC(op) ?
436 (reg_ac_data *)progi->data->data[n] :
437 NULL;
438 const reg_trie_data * const trie
439 = (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie];
440
441 Perl_sv_catpvf(aTHX_ sv, "-%s", REGNODE_NAME(FLAGS(o)));
442 DEBUG_TRIE_COMPILE_r({
443 if (trie->jump)
444 sv_catpvs(sv, "(JUMP)");
445 Perl_sv_catpvf(aTHX_ sv,
446 "<S:%" UVuf "/%" IVdf " W:%" UVuf " L:%" UVuf "/%" UVuf " C:%" UVuf "/%" UVuf ">",
447 (UV)trie->startstate,
448 (IV)trie->statecount-1, /* -1 because of the unused 0 element */
449 (UV)trie->wordcount,
450 (UV)trie->minlen,
451 (UV)trie->maxlen,
452 (UV)TRIE_CHARCOUNT(trie),
453 (UV)trie->uniquecharcount
454 );
455 });
456 if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
457 sv_catpvs(sv, "[");
458 (void) put_charclass_bitmap_innards(sv,
459 ((IS_ANYOF_TRIE(op))
460 ? ANYOF_BITMAP(o)
461 : TRIE_BITMAP(trie)),
462 NULL,
463 NULL,
464 NULL,
465 0,
466 FALSE
467 );
468 sv_catpvs(sv, "]");
469 }
470 if (trie->before_paren || trie->after_paren)
471 Perl_sv_catpvf(aTHX_ sv, " (buf:%" IVdf "/%" IVdf ")",
472 (IV)trie->before_paren,(IV)trie->after_paren);
473 } else if (k == CURLY) {
474 U32 lo = ARG1i(o), hi = ARG2i(o);
475 if (ARG3u(o)) /* check both ARG3a and ARG3b at the same time */
476 Perl_sv_catpvf(aTHX_ sv, "<%d:%d>", ARG3a(o),ARG3b(o)); /* paren before, paren after */
477 if (op == CURLYM || op == CURLYN || op == CURLYX)
478 Perl_sv_catpvf(aTHX_ sv, "[%d]", FLAGS(o)); /* Parenth number */
479 Perl_sv_catpvf(aTHX_ sv, "{%u,", (unsigned) lo);
480 if (hi == REG_INFTY)
481 sv_catpvs(sv, "INFTY");
482 else
483 Perl_sv_catpvf(aTHX_ sv, "%u", (unsigned) hi);
484 sv_catpvs(sv, "}");
485 }
486 else if (k == WHILEM && FLAGS(o)) /* Ordinal/of */
487 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", FLAGS(o) & 0xf, FLAGS(o)>>4);
488 else if (k == REF || k == OPEN || k == CLOSE
489 || k == GROUPP || op == ACCEPT)
490 {
491 AV *name_list= NULL;
492 U32 parno= (op == ACCEPT) ? ARG2u(o) :
493 (op == OPEN || op == CLOSE) ? PARNO(o) :
494 ARG1u(o);
495 if ( RXp_PAREN_NAMES(prog) ) {
496 name_list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
497 } else if ( pRExC_state ) {
498 name_list= RExC_paren_name_list;
499 }
500 if ( name_list ) {
501 if ( k != REF || (op < REFN)) {
502 UV logical_parno = parno;
503 if (prog->parno_to_logical)
504 logical_parno = prog->parno_to_logical[parno];
505
506 Perl_sv_catpvf(aTHX_ sv, "%" UVuf, (UV)logical_parno); /* Parenth number */
507 if (parno != logical_parno)
508 Perl_sv_catpvf(aTHX_ sv, "/%" UVuf, (UV)parno); /* Parenth number */
509
510 SV **name= av_fetch_simple(name_list, parno, 0 );
511 if (name)
512 Perl_sv_catpvf(aTHX_ sv, " '%" SVf "'", SVfARG(*name));
513 }
514 else
515 if (parno > 0) {
516 /* parno must always be larger than 0 for this block
517 * as it represents a slot into the data array, which
518 * has the 0 slot reserved for a placeholder so any valid
519 * index into it is always true, eg non-zero
520 * see the '%' "what" type and the implementation of
521 * S_reg_add_data()
522 */
523 SV *sv_dat= MUTABLE_SV(progi->data->data[ parno ]);
524 I32 *nums=(I32*)SvPVX(sv_dat);
525 SV **name= av_fetch_simple(name_list, nums[0], 0 );
526 I32 n;
527 if (name) {
528 for ( n=0; n<SvIVX(sv_dat); n++ ) {
529 Perl_sv_catpvf(aTHX_ sv, "%s%" IVdf,
530 (n ? "," : ""), (IV)nums[n]);
531 }
532 Perl_sv_catpvf(aTHX_ sv, " '%" SVf "'", SVfARG(*name));
533 }
534 }
535 } else if (parno>0) {
536 UV logical_parno = parno;
537 if (prog->parno_to_logical)
538 logical_parno = prog->parno_to_logical[parno];
539
540 Perl_sv_catpvf(aTHX_ sv, "%" UVuf, (UV)logical_parno); /* Parenth number */
541 if (logical_parno != parno)
542 Perl_sv_catpvf(aTHX_ sv, "/%" UVuf, (UV)parno); /* Parenth number */
543
544 }
545 if ( k == REF ) {
546 Perl_sv_catpvf(aTHX_ sv, " <%" IVdf ">", (IV)ARG2i(o));
547 }
548 if ( k == REF && reginfo) {
549 U32 n = ARG1u(o); /* which paren pair */
550 I32 ln = RXp_OFFS_START(prog,n);
551 if (RXp_LASTPAREN(prog) < n || ln == -1 || RXp_OFFS_END(prog,n) == -1)
552 Perl_sv_catpvf(aTHX_ sv, ": FAIL");
553 else if (ln == RXp_OFFS_END(prog,n))
554 Perl_sv_catpvf(aTHX_ sv, ": ACCEPT - EMPTY STRING");
555 else {
556 const char *s = reginfo->strbeg + ln;
557 Perl_sv_catpvf(aTHX_ sv, ": ");
558 Perl_pv_pretty( aTHX_ sv, s, RXp_OFFS_END(prog,n) - RXp_OFFS_START(prog,n), 32, 0, 0,
559 PERL_PV_ESCAPE_UNI_DETECT|PERL_PV_PRETTY_NOCLEAR|PERL_PV_PRETTY_ELLIPSES|PERL_PV_PRETTY_QUOTE );
560 }
561 }
562 } else if (k == GOSUB) {
563 AV *name_list= NULL;
564 IV parno = ARG1u(o);
565 IV logical_parno = (parno && prog->parno_to_logical)
566 ? prog->parno_to_logical[parno]
567 : parno;
568 if ( RXp_PAREN_NAMES(prog) ) {
569 name_list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
570 } else if ( pRExC_state ) {
571 name_list= RExC_paren_name_list;
572 }
573
574 /* Paren and offset */
575 Perl_sv_catpvf(aTHX_ sv, "%" IVdf, logical_parno);
576 if (logical_parno != parno)
577 Perl_sv_catpvf(aTHX_ sv, "/%" IVdf, parno);
578
579 Perl_sv_catpvf(aTHX_ sv, "[%+d:%d]", (int)ARG2i(o),
580 (int)((o + (int)ARG2i(o)) - progi->program) );
581 if (name_list) {
582 SV **name= av_fetch_simple(name_list, ARG1u(o), 0 );
583 if (name)
584 Perl_sv_catpvf(aTHX_ sv, " '%" SVf "'", SVfARG(*name));
585 }
586 }
587 else if (k == LOGICAL)
588 /* 2: embedded, otherwise 1 */
589 Perl_sv_catpvf(aTHX_ sv, "[%d]", FLAGS(o));
590 else if (k == ANYOF || k == ANYOFH || k == ANYOFR) {
591 U8 flags;
592 char * bitmap;
593 U8 do_sep = 0; /* Do we need to separate various components of the
594 output? */
595 /* Set if there is still an unresolved user-defined property */
596 SV *unresolved = NULL;
597
598 /* Things that are ignored except when the runtime locale is UTF-8 */
599 SV *only_utf8_locale_invlist = NULL;
600
601 /* Code points that don't fit in the bitmap */
602 SV *nonbitmap_invlist = NULL;
603
604 /* And things that aren't in the bitmap, but are small enough to be */
605 SV* bitmap_range_not_in_bitmap = NULL;
606
607 bool inverted;
608
609 if (k != ANYOF) {
610 flags = 0;
611 bitmap = NULL;
612 }
613 else {
614 flags = ANYOF_FLAGS(o);
615 bitmap = ANYOF_BITMAP(o);
616 }
617
618 if (op == ANYOFL || op == ANYOFPOSIXL) {
619 if ((flags & ANYOFL_UTF8_LOCALE_REQD)) {
620 sv_catpvs(sv, "{utf8-locale-reqd}");
621 }
622 if (flags & ANYOFL_FOLD) {
623 sv_catpvs(sv, "{i}");
624 }
625 }
626
627 inverted = flags & ANYOF_INVERT;
628
629 /* If there is stuff outside the bitmap, get it */
630 if (k == ANYOFR) {
631
632 /* For a single range, split into the parts inside vs outside the
633 * bitmap. */
634 UV start = ANYOFRbase(o);
635 UV end = ANYOFRbase(o) + ANYOFRdelta(o);
636
637 if (start < NUM_ANYOF_CODE_POINTS) {
638 if (end < NUM_ANYOF_CODE_POINTS) {
639 bitmap_range_not_in_bitmap
640 = _add_range_to_invlist(bitmap_range_not_in_bitmap,
641 start, end);
642 }
643 else {
644 bitmap_range_not_in_bitmap
645 = _add_range_to_invlist(bitmap_range_not_in_bitmap,
646 start, NUM_ANYOF_CODE_POINTS);
647 start = NUM_ANYOF_CODE_POINTS;
648 }
649 }
650
651 if (start >= NUM_ANYOF_CODE_POINTS) {
652 nonbitmap_invlist = _add_range_to_invlist(nonbitmap_invlist,
653 ANYOFRbase(o),
654 ANYOFRbase(o) + ANYOFRdelta(o));
655 }
656 }
657 else if (ANYOF_MATCHES_ALL_OUTSIDE_BITMAP(o)) {
658 nonbitmap_invlist = _add_range_to_invlist(nonbitmap_invlist,
659 NUM_ANYOF_CODE_POINTS,
660 UV_MAX);
661 }
662 else if (ANYOF_HAS_AUX(o)) {
663 (void) GET_REGCLASS_AUX_DATA(prog, o, FALSE,
664 &unresolved,
665 &only_utf8_locale_invlist,
666 &nonbitmap_invlist);
667
668 /* The aux data may contain stuff that could fit in the bitmap.
669 * This could come from a user-defined property being finally
670 * resolved when this call was done; or much more likely because
671 * there are matches that require UTF-8 to be valid, and so aren't
672 * in the bitmap (or ANYOFR). This is teased apart later */
673 _invlist_intersection(nonbitmap_invlist,
674 PL_InBitmap,
675 &bitmap_range_not_in_bitmap);
676 /* Leave just the things that don't fit into the bitmap */
677 _invlist_subtract(nonbitmap_invlist,
678 PL_InBitmap,
679 &nonbitmap_invlist);
680 }
681
682 /* Ready to start outputting. First, the initial left bracket */
683 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
684
685 if ( bitmap
686 || bitmap_range_not_in_bitmap
687 || only_utf8_locale_invlist
688 || unresolved)
689 {
690 /* Then all the things that could fit in the bitmap */
691 do_sep = put_charclass_bitmap_innards(
692 sv,
693 bitmap,
694 bitmap_range_not_in_bitmap,
695 only_utf8_locale_invlist,
696 o,
697 flags,
698
699 /* Can't try inverting for a
700 * better display if there
701 * are things that haven't
702 * been resolved */
703 (unresolved != NULL || k == ANYOFR));
704 SvREFCNT_dec(bitmap_range_not_in_bitmap);
705
706 /* If there are user-defined properties which haven't been defined
707 * yet, output them. If the result is not to be inverted, it is
708 * clearest to output them in a separate [] from the bitmap range
709 * stuff. If the result is to be complemented, we have to show
710 * everything in one [], as the inversion applies to the whole
711 * thing. Use {braces} to separate them from anything in the
712 * bitmap and anything above the bitmap. */
713 if (unresolved) {
714 if (inverted) {
715 if (! do_sep) { /* If didn't output anything in the bitmap
716 */
717 sv_catpvs(sv, "^");
718 }
719 sv_catpvs(sv, "{");
720 }
721 else if (do_sep) {
722 Perl_sv_catpvf(aTHX_ sv,"%s][%s", PL_colors[1],
723 PL_colors[0]);
724 }
725 sv_catsv(sv, unresolved);
726 if (inverted) {
727 sv_catpvs(sv, "}");
728 }
729 do_sep = ! inverted;
730 }
731 else if ( do_sep == 2
732 && ! nonbitmap_invlist
733 && ANYOF_MATCHES_NONE_OUTSIDE_BITMAP(o))
734 {
735 /* Here, the display shows the class as inverted, and
736 * everything above the lower display should also match, but
737 * there is no indication of that. Add this range so the code
738 * below will add it to the display */
739 _invlist_union_complement_2nd(nonbitmap_invlist,
740 PL_InBitmap,
741 &nonbitmap_invlist);
742 }
743 }
744
745 /* And, finally, add the above-the-bitmap stuff */
746 if (nonbitmap_invlist && _invlist_len(nonbitmap_invlist)) {
747 SV* contents;
748
749 /* See if truncation size is overridden */
750 const STRLEN dump_len = (PL_dump_re_max_len > 256)
751 ? PL_dump_re_max_len
752 : 256;
753
754 /* This is output in a separate [] */
755 if (do_sep) {
756 Perl_sv_catpvf(aTHX_ sv,"%s][%s", PL_colors[1], PL_colors[0]);
757 }
758
759 /* And, for easy of understanding, it is shown in the
760 * uncomplemented form if possible. The one exception being if
761 * there are unresolved items, where the inversion has to be
762 * delayed until runtime */
763 if (inverted && ! unresolved) {
764 _invlist_invert(nonbitmap_invlist);
765 _invlist_subtract(nonbitmap_invlist, PL_InBitmap, &nonbitmap_invlist);
766 }
767
768 contents = invlist_contents(nonbitmap_invlist,
769 FALSE /* output suitable for catsv */
770 );
771
772 /* If the output is shorter than the permissible maximum, just do it. */
773 if (SvCUR(contents) <= dump_len) {
774 sv_catsv(sv, contents);
775 }
776 else {
777 const char * contents_string = SvPVX(contents);
778 STRLEN i = dump_len;
779
780 /* Otherwise, start at the permissible max and work back to the
781 * first break possibility */
782 while (i > 0 && contents_string[i] != ' ') {
783 i--;
784 }
785 if (i == 0) { /* Fail-safe. Use the max if we couldn't
786 find a legal break */
787 i = dump_len;
788 }
789
790 sv_catpvn(sv, contents_string, i);
791 sv_catpvs(sv, "...");
792 }
793
794 SvREFCNT_dec_NN(contents);
795 SvREFCNT_dec_NN(nonbitmap_invlist);
796 }
797
798 /* And finally the matching, closing ']' */
799 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
800
801 if (op == ANYOFHs) {
802 Perl_sv_catpvf(aTHX_ sv, " (Leading UTF-8 bytes=%s", _byte_dump_string((U8 *) ((struct regnode_anyofhs *) o)->string, FLAGS(o), 1));
803 }
804 else if (REGNODE_TYPE(op) != ANYOF) {
805 U8 lowest = (op != ANYOFHr)
806 ? FLAGS(o)
807 : LOWEST_ANYOF_HRx_BYTE(FLAGS(o));
808 U8 highest = (op == ANYOFHr)
809 ? HIGHEST_ANYOF_HRx_BYTE(FLAGS(o))
810 : (op == ANYOFH || op == ANYOFR)
811 ? 0xFF
812 : lowest;
813 #ifndef EBCDIC
814 if (op != ANYOFR || ! isASCII(ANYOFRbase(o) + ANYOFRdelta(o)))
815 #endif
816 {
817 Perl_sv_catpvf(aTHX_ sv, " (First UTF-8 byte=%02X", lowest);
818 if (lowest != highest) {
819 Perl_sv_catpvf(aTHX_ sv, "-%02X", highest);
820 }
821 Perl_sv_catpvf(aTHX_ sv, ")");
822 }
823 }
824
825 SvREFCNT_dec(unresolved);
826 }
827 else if (k == ANYOFM) {
828 SV * cp_list = get_ANYOFM_contents(o);
829
830 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
831 if (op == NANYOFM) {
832 _invlist_invert(cp_list);
833 }
834
835 put_charclass_bitmap_innards(sv, NULL, cp_list, NULL, NULL, 0, TRUE);
836 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
837
838 SvREFCNT_dec(cp_list);
839 }
840 else if (k == ANYOFHbbm) {
841 SV * cp_list = get_ANYOFHbbm_contents(o);
842 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
843
844 sv_catsv(sv, invlist_contents(cp_list,
845 FALSE /* output suitable for catsv */
846 ));
847 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
848
849 SvREFCNT_dec(cp_list);
850 }
851 else if (k == POSIXD || k == NPOSIXD) {
852 U8 index = FLAGS(o) * 2;
853 if (index < C_ARRAY_LENGTH(anyofs)) {
854 if (*anyofs[index] != '[') {
855 sv_catpvs(sv, "[");
856 }
857 sv_catpv(sv, anyofs[index]);
858 if (*anyofs[index] != '[') {
859 sv_catpvs(sv, "]");
860 }
861 }
862 else {
863 Perl_sv_catpvf(aTHX_ sv, "[illegal type=%d])", index);
864 }
865 }
866 else if (k == BOUND || k == NBOUND) {
867 /* Must be synced with order of 'bound_type' in regcomp.h */
868 const char * const bounds[] = {
869 "", /* Traditional */
870 "{gcb}",
871 "{lb}",
872 "{sb}",
873 "{wb}"
874 };
875 assert(FLAGS(o) < C_ARRAY_LENGTH(bounds));
876 sv_catpv(sv, bounds[FLAGS(o)]);
877 }
878 else if (k == BRANCHJ && (op == UNLESSM || op == IFMATCH)) {
879 Perl_sv_catpvf(aTHX_ sv, "[%d", -(FLAGS(o)));
880 if (NEXT_OFF(o)) {
881 Perl_sv_catpvf(aTHX_ sv, "..-%d", FLAGS(o) - NEXT_OFF(o));
882 }
883 Perl_sv_catpvf(aTHX_ sv, "]");
884 }
885 else if (op == SBOL)
886 Perl_sv_catpvf(aTHX_ sv, " /%s/", FLAGS(o) ? "\\A" : "^");
887 else if (op == EVAL) {
888 if (FLAGS(o) & EVAL_OPTIMISTIC_FLAG)
889 Perl_sv_catpvf(aTHX_ sv, " optimistic");
890 }
891
892 /* add on the verb argument if there is one */
893 if ( ( k == VERB || op == ACCEPT || op == OPFAIL ) && FLAGS(o)) {
894 if ( ARG1u(o) )
895 Perl_sv_catpvf(aTHX_ sv, ":%" SVf,
896 SVfARG((MUTABLE_SV(progi->data->data[ ARG1u( o ) ]))));
897 else
898 sv_catpvs(sv, ":NULL");
899 }
900 #else
901 PERL_UNUSED_CONTEXT;
902 PERL_UNUSED_ARG(sv);
903 PERL_UNUSED_ARG(o);
904 PERL_UNUSED_ARG(prog);
905 PERL_UNUSED_ARG(reginfo);
906 PERL_UNUSED_ARG(pRExC_state);
907 #endif /* DEBUGGING */
908 }
909
910 #ifdef DEBUGGING
911
912 STATIC void
S_put_code_point(pTHX_ SV * sv,UV c)913 S_put_code_point(pTHX_ SV *sv, UV c)
914 {
915 PERL_ARGS_ASSERT_PUT_CODE_POINT;
916
917 if (c > 255) {
918 Perl_sv_catpvf(aTHX_ sv, "\\x{%04" UVXf "}", c);
919 }
920 else if (isPRINT(c)) {
921 const char string = (char) c;
922
923 /* We use {phrase} as metanotation in the class, so also escape literal
924 * braces */
925 if (isBACKSLASHED_PUNCT(c) || c == '{' || c == '}')
926 sv_catpvs(sv, "\\");
927 sv_catpvn(sv, &string, 1);
928 }
929 else if (isMNEMONIC_CNTRL(c)) {
930 Perl_sv_catpvf(aTHX_ sv, "%s", cntrl_to_mnemonic((U8) c));
931 }
932 else {
933 Perl_sv_catpvf(aTHX_ sv, "\\x%02X", (U8) c);
934 }
935 }
936
937 STATIC void
S_put_range(pTHX_ SV * sv,UV start,const UV end,const bool allow_literals)938 S_put_range(pTHX_ SV *sv, UV start, const UV end, const bool allow_literals)
939 {
940 /* Appends to 'sv' a displayable version of the range of code points from
941 * 'start' to 'end'. Mnemonics (like '\r') are used for the few controls
942 * that have them, when they occur at the beginning or end of the range.
943 * It uses hex to output the remaining code points, unless 'allow_literals'
944 * is true, in which case the printable ASCII ones are output as-is (though
945 * some of these will be escaped by put_code_point()).
946 *
947 * NOTE: This is designed only for printing ranges of code points that fit
948 * inside an ANYOF bitmap. Higher code points are simply suppressed
949 */
950
951 const unsigned int min_range_count = 3;
952
953 assert(start <= end);
954
955 PERL_ARGS_ASSERT_PUT_RANGE;
956
957 while (start <= end) {
958 UV this_end;
959 const char * format;
960
961 if ( end - start < min_range_count
962 && (end - start <= 2 || (isPRINT_A(start) && isPRINT_A(end))))
963 {
964 /* Output a range of 1 or 2 chars individually, or longer ranges
965 * when printable */
966 for (; start <= end; start++) {
967 put_code_point(sv, start);
968 }
969 break;
970 }
971
972 /* If permitted by the input options, and there is a possibility that
973 * this range contains a printable literal, look to see if there is
974 * one. */
975 if (allow_literals && start <= MAX_PRINT_A) {
976
977 /* If the character at the beginning of the range isn't an ASCII
978 * printable, effectively split the range into two parts:
979 * 1) the portion before the first such printable,
980 * 2) the rest
981 * and output them separately. */
982 if (! isPRINT_A(start)) {
983 UV temp_end = start + 1;
984
985 /* There is no point looking beyond the final possible
986 * printable, in MAX_PRINT_A */
987 UV max = MIN(end, MAX_PRINT_A);
988
989 while (temp_end <= max && ! isPRINT_A(temp_end)) {
990 temp_end++;
991 }
992
993 /* Here, temp_end points to one beyond the first printable if
994 * found, or to one beyond 'max' if not. If none found, make
995 * sure that we use the entire range */
996 if (temp_end > MAX_PRINT_A) {
997 temp_end = end + 1;
998 }
999
1000 /* Output the first part of the split range: the part that
1001 * doesn't have printables, with the parameter set to not look
1002 * for literals (otherwise we would infinitely recurse) */
1003 put_range(sv, start, temp_end - 1, FALSE);
1004
1005 /* The 2nd part of the range (if any) starts here. */
1006 start = temp_end;
1007
1008 /* We do a continue, instead of dropping down, because even if
1009 * the 2nd part is non-empty, it could be so short that we want
1010 * to output it as individual characters, as tested for at the
1011 * top of this loop. */
1012 continue;
1013 }
1014
1015 /* Here, 'start' is a printable ASCII. If it is an alphanumeric,
1016 * output a sub-range of just the digits or letters, then process
1017 * the remaining portion as usual. */
1018 if (isALPHANUMERIC_A(start)) {
1019 UV mask = (isDIGIT_A(start))
1020 ? CC_DIGIT_
1021 : isUPPER_A(start)
1022 ? CC_UPPER_
1023 : CC_LOWER_;
1024 UV temp_end = start + 1;
1025
1026 /* Find the end of the sub-range that includes just the
1027 * characters in the same class as the first character in it */
1028 while (temp_end <= end && generic_isCC_A_(temp_end, mask)) {
1029 temp_end++;
1030 }
1031 temp_end--;
1032
1033 /* For short ranges, don't duplicate the code above to output
1034 * them; just call recursively */
1035 if (temp_end - start < min_range_count) {
1036 put_range(sv, start, temp_end, FALSE);
1037 }
1038 else { /* Output as a range */
1039 put_code_point(sv, start);
1040 sv_catpvs(sv, "-");
1041 put_code_point(sv, temp_end);
1042 }
1043 start = temp_end + 1;
1044 continue;
1045 }
1046
1047 /* We output any other printables as individual characters */
1048 if (isPUNCT_A(start) || isSPACE_A(start)) {
1049 while (start <= end && (isPUNCT_A(start)
1050 || isSPACE_A(start)))
1051 {
1052 put_code_point(sv, start);
1053 start++;
1054 }
1055 continue;
1056 }
1057 } /* End of looking for literals */
1058
1059 /* Here is not to output as a literal. Some control characters have
1060 * mnemonic names. Split off any of those at the beginning and end of
1061 * the range to print mnemonically. It isn't possible for many of
1062 * these to be in a row, so this won't overwhelm with output */
1063 if ( start <= end
1064 && (isMNEMONIC_CNTRL(start) || isMNEMONIC_CNTRL(end)))
1065 {
1066 while (isMNEMONIC_CNTRL(start) && start <= end) {
1067 put_code_point(sv, start);
1068 start++;
1069 }
1070
1071 /* If this didn't take care of the whole range ... */
1072 if (start <= end) {
1073
1074 /* Look backwards from the end to find the final non-mnemonic
1075 * */
1076 UV temp_end = end;
1077 while (isMNEMONIC_CNTRL(temp_end)) {
1078 temp_end--;
1079 }
1080
1081 /* And separately output the interior range that doesn't start
1082 * or end with mnemonics */
1083 put_range(sv, start, temp_end, FALSE);
1084
1085 /* Then output the mnemonic trailing controls */
1086 start = temp_end + 1;
1087 while (start <= end) {
1088 put_code_point(sv, start);
1089 start++;
1090 }
1091 break;
1092 }
1093 }
1094
1095 /* As a final resort, output the range or subrange as hex. */
1096
1097 if (start >= NUM_ANYOF_CODE_POINTS) {
1098 this_end = end;
1099 }
1100 else { /* Have to split range at the bitmap boundary */
1101 this_end = (end < NUM_ANYOF_CODE_POINTS)
1102 ? end
1103 : NUM_ANYOF_CODE_POINTS - 1;
1104 }
1105 #if NUM_ANYOF_CODE_POINTS > 256
1106 format = (this_end < 256)
1107 ? "\\x%02" UVXf "-\\x%02" UVXf
1108 : "\\x{%04" UVXf "}-\\x{%04" UVXf "}";
1109 #else
1110 format = "\\x%02" UVXf "-\\x%02" UVXf;
1111 #endif
1112 GCC_DIAG_IGNORE_STMT(-Wformat-nonliteral);
1113 Perl_sv_catpvf(aTHX_ sv, format, start, this_end);
1114 GCC_DIAG_RESTORE_STMT;
1115 break;
1116 }
1117 }
1118
1119 STATIC void
S_put_charclass_bitmap_innards_invlist(pTHX_ SV * sv,SV * invlist)1120 S_put_charclass_bitmap_innards_invlist(pTHX_ SV *sv, SV* invlist)
1121 {
1122 /* Concatenate onto the PV in 'sv' a displayable form of the inversion list
1123 * 'invlist' */
1124
1125 UV start, end;
1126 bool allow_literals = TRUE;
1127
1128 PERL_ARGS_ASSERT_PUT_CHARCLASS_BITMAP_INNARDS_INVLIST;
1129
1130 /* Generally, it is more readable if printable characters are output as
1131 * literals, but if a range (nearly) spans all of them, it's best to output
1132 * it as a single range. This code will use a single range if all but 2
1133 * ASCII printables are in it */
1134 invlist_iterinit(invlist);
1135 while (invlist_iternext(invlist, &start, &end)) {
1136
1137 /* If the range starts beyond the final printable, it doesn't have any
1138 * in it */
1139 if (start > MAX_PRINT_A) {
1140 break;
1141 }
1142
1143 /* In both ASCII and EBCDIC, a SPACE is the lowest printable. To span
1144 * all but two, the range must start and end no later than 2 from
1145 * either end */
1146 if (start < ' ' + 2 && end > MAX_PRINT_A - 2) {
1147 if (end > MAX_PRINT_A) {
1148 end = MAX_PRINT_A;
1149 }
1150 if (start < ' ') {
1151 start = ' ';
1152 }
1153 if (end - start >= MAX_PRINT_A - ' ' - 2) {
1154 allow_literals = FALSE;
1155 }
1156 break;
1157 }
1158 }
1159 invlist_iterfinish(invlist);
1160
1161 /* Here we have figured things out. Output each range */
1162 invlist_iterinit(invlist);
1163 while (invlist_iternext(invlist, &start, &end)) {
1164 if (start >= NUM_ANYOF_CODE_POINTS) {
1165 break;
1166 }
1167 put_range(sv, start, end, allow_literals);
1168 }
1169 invlist_iterfinish(invlist);
1170
1171 return;
1172 }
1173
1174 STATIC SV*
S_put_charclass_bitmap_innards_common(pTHX_ SV * invlist,SV * posixes,SV * only_utf8,SV * not_utf8,SV * only_utf8_locale,const bool invert)1175 S_put_charclass_bitmap_innards_common(pTHX_
1176 SV* invlist, /* The bitmap */
1177 SV* posixes, /* Under /l, things like [:word:], \S */
1178 SV* only_utf8, /* Under /d, matches iff the target is UTF-8 */
1179 SV* not_utf8, /* /d, matches iff the target isn't UTF-8 */
1180 SV* only_utf8_locale, /* Under /l, matches if the locale is UTF-8 */
1181 const bool invert /* Is the result to be inverted? */
1182 )
1183 {
1184 /* Create and return an SV containing a displayable version of the bitmap
1185 * and associated information determined by the input parameters. If the
1186 * output would have been only the inversion indicator '^', NULL is instead
1187 * returned. */
1188
1189 SV * output;
1190
1191 PERL_ARGS_ASSERT_PUT_CHARCLASS_BITMAP_INNARDS_COMMON;
1192
1193 if (invert) {
1194 output = newSVpvs("^");
1195 }
1196 else {
1197 output = newSVpvs("");
1198 }
1199
1200 /* First, the code points in the bitmap that are unconditionally there */
1201 put_charclass_bitmap_innards_invlist(output, invlist);
1202
1203 /* Traditionally, these have been placed after the main code points */
1204 if (posixes) {
1205 sv_catsv(output, posixes);
1206 }
1207
1208 if (only_utf8 && _invlist_len(only_utf8)) {
1209 Perl_sv_catpvf(aTHX_ output, "%s{utf8}%s", PL_colors[1], PL_colors[0]);
1210 put_charclass_bitmap_innards_invlist(output, only_utf8);
1211 }
1212
1213 if (not_utf8 && _invlist_len(not_utf8)) {
1214 Perl_sv_catpvf(aTHX_ output, "%s{not utf8}%s", PL_colors[1], PL_colors[0]);
1215 put_charclass_bitmap_innards_invlist(output, not_utf8);
1216 }
1217
1218 if (only_utf8_locale && _invlist_len(only_utf8_locale)) {
1219 Perl_sv_catpvf(aTHX_ output, "%s{utf8 locale}%s", PL_colors[1], PL_colors[0]);
1220 put_charclass_bitmap_innards_invlist(output, only_utf8_locale);
1221
1222 /* This is the only list in this routine that can legally contain code
1223 * points outside the bitmap range. The call just above to
1224 * 'put_charclass_bitmap_innards_invlist' will simply suppress them, so
1225 * output them here. There's about a half-dozen possible, and none in
1226 * contiguous ranges longer than 2 */
1227 if (invlist_highest(only_utf8_locale) >= NUM_ANYOF_CODE_POINTS) {
1228 UV start, end;
1229 SV* above_bitmap = NULL;
1230
1231 _invlist_subtract(only_utf8_locale, PL_InBitmap, &above_bitmap);
1232
1233 invlist_iterinit(above_bitmap);
1234 while (invlist_iternext(above_bitmap, &start, &end)) {
1235 UV i;
1236
1237 for (i = start; i <= end; i++) {
1238 put_code_point(output, i);
1239 }
1240 }
1241 invlist_iterfinish(above_bitmap);
1242 SvREFCNT_dec_NN(above_bitmap);
1243 }
1244 }
1245
1246 if (invert && SvCUR(output) == 1) {
1247 return NULL;
1248 }
1249
1250 return output;
1251 }
1252
1253 STATIC U8
S_put_charclass_bitmap_innards(pTHX_ SV * sv,char * bitmap,SV * nonbitmap_invlist,SV * only_utf8_locale_invlist,const regnode * const node,const U8 flags,const bool force_as_is_display)1254 S_put_charclass_bitmap_innards(pTHX_ SV *sv,
1255 char *bitmap,
1256 SV *nonbitmap_invlist,
1257 SV *only_utf8_locale_invlist,
1258 const regnode * const node,
1259 const U8 flags,
1260 const bool force_as_is_display)
1261 {
1262 /* Appends to 'sv' a displayable version of the innards of the bracketed
1263 * character class defined by the other arguments:
1264 * 'bitmap' points to the bitmap, or NULL if to ignore that.
1265 * 'nonbitmap_invlist' is an inversion list of the code points that are in
1266 * the bitmap range, but for some reason aren't in the bitmap; NULL if
1267 * none. The reasons for this could be that they require some
1268 * condition such as the target string being or not being in UTF-8
1269 * (under /d), or because they came from a user-defined property that
1270 * was not resolved at the time of the regex compilation (under /u)
1271 * 'only_utf8_locale_invlist' is an inversion list of the code points that
1272 * are valid only if the runtime locale is a UTF-8 one; NULL if none
1273 * 'node' is the regex pattern ANYOF node. It is needed only when the
1274 * above two parameters are not null, and is passed so that this
1275 * routine can tease apart the various reasons for them.
1276 * 'flags' is the flags field of 'node'
1277 * 'force_as_is_display' is TRUE if this routine should definitely NOT try
1278 * to invert things to see if that leads to a cleaner display. If
1279 * FALSE, this routine is free to use its judgment about doing this.
1280 *
1281 * It returns 0 if nothing was actually output. (It may be that
1282 * the bitmap, etc is empty.)
1283 * 1 if the output wasn't inverted (didn't begin with a '^')
1284 * 2 if the output was inverted (did begin with a '^')
1285 *
1286 * When called for outputting the bitmap of a non-ANYOF node, just pass the
1287 * bitmap, with the succeeding parameters set to NULL, and the final one to
1288 * FALSE.
1289 */
1290
1291 /* In general, it tries to display the 'cleanest' representation of the
1292 * innards, choosing whether to display them inverted or not, regardless of
1293 * whether the class itself is to be inverted. However, there are some
1294 * cases where it can't try inverting, as what actually matches isn't known
1295 * until runtime, and hence the inversion isn't either. */
1296
1297 bool inverting_allowed = ! force_as_is_display;
1298
1299 int i;
1300 STRLEN orig_sv_cur = SvCUR(sv);
1301
1302 SV* invlist; /* Inversion list we accumulate of code points that
1303 are unconditionally matched */
1304 SV* only_utf8 = NULL; /* Under /d, list of matches iff the target is
1305 UTF-8 */
1306 SV* not_utf8 = NULL; /* /d, list of matches iff the target isn't UTF-8
1307 */
1308 SV* posixes = NULL; /* Under /l, string of things like [:word:], \D */
1309 SV* only_utf8_locale = NULL; /* Under /l, list of matches if the locale
1310 is UTF-8 */
1311
1312 SV* as_is_display; /* The output string when we take the inputs
1313 literally */
1314 SV* inverted_display; /* The output string when we invert the inputs */
1315
1316 bool invert = cBOOL(flags & ANYOF_INVERT); /* Is the input to be inverted
1317 to match? */
1318 /* We are biased in favor of displaying things without them being inverted,
1319 * as that is generally easier to understand */
1320 const int bias = 5;
1321
1322 PERL_ARGS_ASSERT_PUT_CHARCLASS_BITMAP_INNARDS;
1323
1324 /* Start off with whatever code points are passed in. (We clone, so we
1325 * don't change the caller's list) */
1326 if (nonbitmap_invlist) {
1327 assert(invlist_highest(nonbitmap_invlist) < NUM_ANYOF_CODE_POINTS);
1328 invlist = invlist_clone(nonbitmap_invlist, NULL);
1329 }
1330 else { /* Worst case size is every other code point is matched */
1331 invlist = _new_invlist(NUM_ANYOF_CODE_POINTS / 2);
1332 }
1333
1334 if (flags) {
1335 if (OP(node) == ANYOFD) {
1336
1337 /* This flag indicates that the code points below 0x100 in the
1338 * nonbitmap list are precisely the ones that match only when the
1339 * target is UTF-8 (they should all be non-ASCII). */
1340 if (flags & ANYOF_HAS_EXTRA_RUNTIME_MATCHES) {
1341 _invlist_intersection(invlist, PL_UpperLatin1, &only_utf8);
1342 _invlist_subtract(invlist, only_utf8, &invlist);
1343 }
1344
1345 /* And this flag for matching all non-ASCII 0xFF and below */
1346 if (flags & ANYOFD_NON_UTF8_MATCHES_ALL_NON_ASCII__shared) {
1347 not_utf8 = invlist_clone(PL_UpperLatin1, NULL);
1348 }
1349 }
1350 else if (OP(node) == ANYOFL || OP(node) == ANYOFPOSIXL) {
1351
1352 /* If either of these flags are set, what matches isn't
1353 * determinable except during execution, so don't know enough here
1354 * to invert */
1355 if (flags & (ANYOFL_FOLD|ANYOF_MATCHES_POSIXL)) {
1356 inverting_allowed = FALSE;
1357 }
1358
1359 /* What the posix classes match also varies at runtime, so these
1360 * will be output symbolically. */
1361 if (ANYOF_POSIXL_TEST_ANY_SET(node)) {
1362 int i;
1363
1364 posixes = newSVpvs("");
1365 for (i = 0; i < ANYOF_POSIXL_MAX; i++) {
1366 if (ANYOF_POSIXL_TEST(node, i)) {
1367 sv_catpv(posixes, anyofs[i]);
1368 }
1369 }
1370 }
1371 }
1372 }
1373
1374 /* Accumulate the bit map into the unconditional match list */
1375 if (bitmap) {
1376 for (i = 0; i < NUM_ANYOF_CODE_POINTS; i++) {
1377 if (BITMAP_TEST(bitmap, i)) {
1378 int start = i++;
1379 for (;
1380 i < NUM_ANYOF_CODE_POINTS && BITMAP_TEST(bitmap, i);
1381 i++)
1382 { /* empty */ }
1383 invlist = _add_range_to_invlist(invlist, start, i-1);
1384 }
1385 }
1386 }
1387
1388 /* Make sure that the conditional match lists don't have anything in them
1389 * that match unconditionally; otherwise the output is quite confusing.
1390 * This could happen if the code that populates these misses some
1391 * duplication. */
1392 if (only_utf8) {
1393 _invlist_subtract(only_utf8, invlist, &only_utf8);
1394 }
1395 if (not_utf8) {
1396 _invlist_subtract(not_utf8, invlist, ¬_utf8);
1397 }
1398
1399 if (only_utf8_locale_invlist) {
1400
1401 /* Since this list is passed in, we have to make a copy before
1402 * modifying it */
1403 only_utf8_locale = invlist_clone(only_utf8_locale_invlist, NULL);
1404
1405 _invlist_subtract(only_utf8_locale, invlist, &only_utf8_locale);
1406
1407 /* And, it can get really weird for us to try outputting an inverted
1408 * form of this list when it has things above the bitmap, so don't even
1409 * try */
1410 if (invlist_highest(only_utf8_locale) >= NUM_ANYOF_CODE_POINTS) {
1411 inverting_allowed = FALSE;
1412 }
1413 }
1414
1415 /* Calculate what the output would be if we take the input as-is */
1416 as_is_display = put_charclass_bitmap_innards_common(invlist,
1417 posixes,
1418 only_utf8,
1419 not_utf8,
1420 only_utf8_locale,
1421 invert);
1422
1423 /* If have to take the output as-is, just do that */
1424 if (! inverting_allowed) {
1425 if (as_is_display) {
1426 sv_catsv(sv, as_is_display);
1427 SvREFCNT_dec_NN(as_is_display);
1428 }
1429 }
1430 else { /* But otherwise, create the output again on the inverted input, and
1431 use whichever version is shorter */
1432
1433 int inverted_bias, as_is_bias;
1434
1435 /* We will apply our bias to whichever of the results doesn't have
1436 * the '^' */
1437 bool trial_invert;
1438 if (invert) {
1439 trial_invert = FALSE;
1440 as_is_bias = bias;
1441 inverted_bias = 0;
1442 }
1443 else {
1444 trial_invert = TRUE;
1445 as_is_bias = 0;
1446 inverted_bias = bias;
1447 }
1448
1449 /* Now invert each of the lists that contribute to the output,
1450 * excluding from the result things outside the possible range */
1451
1452 /* For the unconditional inversion list, we have to add in all the
1453 * conditional code points, so that when inverted, they will be gone
1454 * from it */
1455 _invlist_union(only_utf8, invlist, &invlist);
1456 _invlist_union(not_utf8, invlist, &invlist);
1457 _invlist_union(only_utf8_locale, invlist, &invlist);
1458 _invlist_invert(invlist);
1459 _invlist_intersection(invlist, PL_InBitmap, &invlist);
1460
1461 if (only_utf8) {
1462 _invlist_invert(only_utf8);
1463 _invlist_intersection(only_utf8, PL_UpperLatin1, &only_utf8);
1464 }
1465 else if (not_utf8) {
1466
1467 /* If a code point matches iff the target string is not in UTF-8,
1468 * then complementing the result has it not match iff not in UTF-8,
1469 * which is the same thing as matching iff it is UTF-8. */
1470 only_utf8 = not_utf8;
1471 not_utf8 = NULL;
1472 }
1473
1474 if (only_utf8_locale) {
1475 _invlist_invert(only_utf8_locale);
1476 _invlist_intersection(only_utf8_locale,
1477 PL_InBitmap,
1478 &only_utf8_locale);
1479 }
1480
1481 inverted_display = put_charclass_bitmap_innards_common(
1482 invlist,
1483 posixes,
1484 only_utf8,
1485 not_utf8,
1486 only_utf8_locale, trial_invert);
1487
1488 /* Use the shortest representation, taking into account our bias
1489 * against showing it inverted */
1490 if ( inverted_display
1491 && ( ! as_is_display
1492 || ( SvCUR(inverted_display) + inverted_bias
1493 < SvCUR(as_is_display) + as_is_bias)))
1494 {
1495 sv_catsv(sv, inverted_display);
1496 invert = ! invert;
1497 }
1498 else if (as_is_display) {
1499 sv_catsv(sv, as_is_display);
1500 }
1501
1502 SvREFCNT_dec(as_is_display);
1503 SvREFCNT_dec(inverted_display);
1504 }
1505
1506 SvREFCNT_dec_NN(invlist);
1507 SvREFCNT_dec(only_utf8);
1508 SvREFCNT_dec(not_utf8);
1509 SvREFCNT_dec(posixes);
1510 SvREFCNT_dec(only_utf8_locale);
1511
1512 U8 did_output_something = (bool) (SvCUR(sv) > orig_sv_cur);
1513 if (did_output_something) {
1514 /* Distinguish between non and inverted cases */
1515 did_output_something += invert;
1516 }
1517
1518 return did_output_something;
1519 }
1520
1521
1522 const regnode *
Perl_dumpuntil(pTHX_ const regexp * r,const regnode * start,const regnode * node,const regnode * last,const regnode * plast,SV * sv,I32 indent,U32 depth)1523 Perl_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
1524 const regnode *last, const regnode *plast,
1525 SV* sv, I32 indent, U32 depth)
1526 {
1527 const regnode *next;
1528 const regnode *optstart= NULL;
1529
1530 RXi_GET_DECL(r, ri);
1531 DECLARE_AND_GET_RE_DEBUG_FLAGS;
1532
1533 PERL_ARGS_ASSERT_DUMPUNTIL;
1534
1535 #ifdef DEBUG_DUMPUNTIL
1536 Perl_re_printf( aTHX_ "--- %d : %d - %d - %d\n", indent, node-start,
1537 last ? last-start : 0, plast ? plast-start : 0);
1538 #endif
1539
1540 if (plast && plast < last)
1541 last= plast;
1542
1543 while (node && (!last || node < last)) {
1544 const U8 op = OP(node);
1545
1546 if (op == CLOSE || op == SRCLOSE || op == WHILEM)
1547 indent--;
1548 next = regnext((regnode *)node);
1549 const regnode *after = regnode_after((regnode *)node,0);
1550
1551 /* Where, what. */
1552 if (op == OPTIMIZED) {
1553 if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
1554 optstart = node;
1555 else
1556 goto after_print;
1557 } else
1558 CLEAR_OPTSTART;
1559
1560 regprop(r, sv, node, NULL, NULL);
1561 Perl_re_printf( aTHX_ "%4" IVdf ":%*s%s", (IV)(node - start),
1562 (int)(2*indent + 1), "", SvPVX_const(sv));
1563
1564 if (op != OPTIMIZED) {
1565 if (next == NULL) /* Next ptr. */
1566 Perl_re_printf( aTHX_ " (0)");
1567 else if (REGNODE_TYPE(op) == BRANCH
1568 && REGNODE_TYPE(OP(next)) != BRANCH )
1569 Perl_re_printf( aTHX_ " (FAIL)");
1570 else
1571 Perl_re_printf( aTHX_ " (%" IVdf ")", (IV)(next - start));
1572 Perl_re_printf( aTHX_ "\n");
1573 }
1574
1575 after_print:
1576 if (REGNODE_TYPE(op) == BRANCHJ) {
1577 assert(next);
1578 const regnode *nnode = (OP(next) == LONGJMP
1579 ? regnext((regnode *)next)
1580 : next);
1581 if (last && nnode > last)
1582 nnode = last;
1583 DUMPUNTIL(after, nnode);
1584 }
1585 else if (REGNODE_TYPE(op) == BRANCH) {
1586 assert(next);
1587 DUMPUNTIL(after, next);
1588 }
1589 else if ( REGNODE_TYPE(op) == TRIE ) {
1590 const regnode *this_trie = node;
1591 const U32 n = ARG1u(node);
1592 const reg_ac_data * const ac = op>=AHOCORASICK ?
1593 (reg_ac_data *)ri->data->data[n] :
1594 NULL;
1595 const reg_trie_data * const trie =
1596 (reg_trie_data*)ri->data->data[op<AHOCORASICK ? n : ac->trie];
1597 #ifdef DEBUGGING
1598 AV *const trie_words
1599 = MUTABLE_AV(ri->data->data[n + TRIE_WORDS_OFFSET]);
1600 #endif
1601 const regnode *nextbranch= NULL;
1602 I32 word_idx;
1603 SvPVCLEAR(sv);
1604 for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
1605 SV ** const elem_ptr = av_fetch_simple(trie_words, word_idx, 0);
1606
1607 Perl_re_indentf( aTHX_ "%s ",
1608 indent+3,
1609 elem_ptr
1610 ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr),
1611 SvCUR(*elem_ptr), PL_dump_re_max_len,
1612 PL_colors[0], PL_colors[1],
1613 (SvUTF8(*elem_ptr)
1614 ? PERL_PV_ESCAPE_UNI
1615 : 0)
1616 | PERL_PV_PRETTY_ELLIPSES
1617 | PERL_PV_PRETTY_LTGT
1618 )
1619 : "???"
1620 );
1621 if (trie->jump) {
1622 U16 dist= trie->jump[word_idx+1];
1623 Perl_re_printf( aTHX_ "(%" UVuf ")\n",
1624 (UV)((dist ? this_trie + dist : next) - start));
1625 if (dist) {
1626 if (!nextbranch)
1627 nextbranch= this_trie + trie->jump[0];
1628 DUMPUNTIL(this_trie + dist, nextbranch);
1629 }
1630 if (nextbranch && REGNODE_TYPE(OP(nextbranch))==BRANCH)
1631 nextbranch= regnext((regnode *)nextbranch);
1632 } else {
1633 Perl_re_printf( aTHX_ "\n");
1634 }
1635 }
1636 if (last && next > last)
1637 node= last;
1638 else
1639 node= next;
1640 }
1641 else if ( op == CURLY ) { /* "next" might be very big: optimizer */
1642 DUMPUNTIL(after, after + 1); /* +1 is NOT a REGNODE_AFTER */
1643 }
1644 else if (REGNODE_TYPE(op) == CURLY && op != CURLYX) {
1645 assert(next);
1646 DUMPUNTIL(after, next);
1647 }
1648 else if ( op == PLUS || op == STAR) {
1649 DUMPUNTIL(after, after + 1); /* +1 NOT a REGNODE_AFTER */
1650 }
1651 else if (REGNODE_TYPE(op) == EXACT || op == ANYOFHs) {
1652 /* Literal string, where present. */
1653 node = (const regnode *)REGNODE_AFTER_varies(node);
1654 }
1655 else {
1656 node = REGNODE_AFTER_opcode(node,op);
1657 }
1658 if (op == CURLYX || op == OPEN || op == SROPEN)
1659 indent++;
1660 if (REGNODE_TYPE(op) == END)
1661 break;
1662 }
1663 CLEAR_OPTSTART;
1664 #ifdef DEBUG_DUMPUNTIL
1665 Perl_re_printf( aTHX_ "--- %d\n", (int)indent);
1666 #endif
1667 return node;
1668 }
1669
1670 #endif /* DEBUGGING */
1671