xref: /openbsd/gnu/usr.bin/perl/regcomp_debug.c (revision f2a19305)
1 #ifdef PERL_EXT_RE_BUILD
2 #include "re_top.h"
3 #endif
4 
5 #include "EXTERN.h"
6 #define PERL_IN_REGEX_ENGINE
7 #define PERL_IN_REGCOMP_ANY
8 #define PERL_IN_REGCOMP_DEBUG_C
9 #include "perl.h"
10 
11 #ifdef PERL_IN_XSUB_RE
12 #  include "re_comp.h"
13 #else
14 #  include "regcomp.h"
15 #endif
16 
17 #include "invlist_inline.h"
18 #include "unicode_constants.h"
19 #include "regcomp_internal.h"
20 
21 #ifdef DEBUGGING
22 
23 int
Perl_re_printf(pTHX_ const char * fmt,...)24 Perl_re_printf(pTHX_ const char *fmt, ...)
25 {
26     va_list ap;
27     int result;
28     PerlIO *f= Perl_debug_log;
29     PERL_ARGS_ASSERT_RE_PRINTF;
30     va_start(ap, fmt);
31     result = PerlIO_vprintf(f, fmt, ap);
32     va_end(ap);
33     return result;
34 }
35 
36 int
Perl_re_indentf(pTHX_ const char * fmt,U32 depth,...)37 Perl_re_indentf(pTHX_ const char *fmt, U32 depth, ...)
38 {
39     va_list ap;
40     int result;
41     PerlIO *f= Perl_debug_log;
42     PERL_ARGS_ASSERT_RE_INDENTF;
43     va_start(ap, depth);
44     PerlIO_printf(f, "%*s", ( (int)depth % 20 ) * 2, "");
45     result = PerlIO_vprintf(f, fmt, ap);
46     va_end(ap);
47     return result;
48 }
49 
50 void
Perl_debug_show_study_flags(pTHX_ U32 flags,const char * open_str,const char * close_str)51 Perl_debug_show_study_flags(pTHX_ U32 flags, const char *open_str,
52                                     const char *close_str)
53 {
54     PERL_ARGS_ASSERT_DEBUG_SHOW_STUDY_FLAGS;
55     if (!flags)
56         return;
57 
58     Perl_re_printf( aTHX_  "%s", open_str);
59     DEBUG_SHOW_STUDY_FLAG(flags, SF_BEFORE_SEOL);
60     DEBUG_SHOW_STUDY_FLAG(flags, SF_BEFORE_MEOL);
61     DEBUG_SHOW_STUDY_FLAG(flags, SF_IS_INF);
62     DEBUG_SHOW_STUDY_FLAG(flags, SF_HAS_PAR);
63     DEBUG_SHOW_STUDY_FLAG(flags, SF_IN_PAR);
64     DEBUG_SHOW_STUDY_FLAG(flags, SF_HAS_EVAL);
65     DEBUG_SHOW_STUDY_FLAG(flags, SCF_DO_SUBSTR);
66     DEBUG_SHOW_STUDY_FLAG(flags, SCF_DO_STCLASS_AND);
67     DEBUG_SHOW_STUDY_FLAG(flags, SCF_DO_STCLASS_OR);
68     DEBUG_SHOW_STUDY_FLAG(flags, SCF_DO_STCLASS);
69     DEBUG_SHOW_STUDY_FLAG(flags, SCF_WHILEM_VISITED_POS);
70     DEBUG_SHOW_STUDY_FLAG(flags, SCF_TRIE_RESTUDY);
71     DEBUG_SHOW_STUDY_FLAG(flags, SCF_SEEN_ACCEPT);
72     DEBUG_SHOW_STUDY_FLAG(flags, SCF_TRIE_DOING_RESTUDY);
73     DEBUG_SHOW_STUDY_FLAG(flags, SCF_IN_DEFINE);
74     Perl_re_printf( aTHX_  "%s", close_str);
75 }
76 
77 void
Perl_debug_studydata(pTHX_ const char * where,scan_data_t * data,U32 depth,int is_inf,SSize_t min,SSize_t stopmin,SSize_t delta)78 Perl_debug_studydata(pTHX_ const char *where, scan_data_t *data,
79                     U32 depth, int is_inf,
80                     SSize_t min, SSize_t stopmin, SSize_t delta)
81 {
82     PERL_ARGS_ASSERT_DEBUG_STUDYDATA;
83     DECLARE_AND_GET_RE_DEBUG_FLAGS;
84 
85     DEBUG_OPTIMISE_MORE_r({
86         if (!data) {
87             Perl_re_indentf(aTHX_  "%s: NO DATA",
88                 depth,
89                 where);
90             return;
91         }
92         Perl_re_indentf(aTHX_  "%s: M/S/D: %" IVdf "/%" IVdf "/%" IVdf " Pos:%" IVdf "/%" IVdf " Flags: 0x%" UVXf,
93             depth,
94             where,
95             min, stopmin, delta,
96             (IV)data->pos_min,
97             (IV)data->pos_delta,
98             (UV)data->flags
99         );
100 
101         Perl_debug_show_study_flags(aTHX_ data->flags," [","]");
102 
103         Perl_re_printf( aTHX_
104             " Whilem_c: %" IVdf " Lcp: %" IVdf " %s",
105             (IV)data->whilem_c,
106             (IV)(data->last_closep ? *((data)->last_closep) : -1),
107             is_inf ? "INF " : ""
108         );
109 
110         if (data->last_found) {
111             int i;
112             Perl_re_printf(aTHX_
113                 "Last:'%s' %" IVdf ":%" IVdf "/%" IVdf,
114                     SvPVX_const(data->last_found),
115                     (IV)data->last_end,
116                     (IV)data->last_start_min,
117                     (IV)data->last_start_max
118             );
119 
120             for (i = 0; i < 2; i++) {
121                 Perl_re_printf(aTHX_
122                     " %s%s: '%s' @ %" IVdf "/%" IVdf,
123                     data->cur_is_floating == i ? "*" : "",
124                     i ? "Float" : "Fixed",
125                     SvPVX_const(data->substrs[i].str),
126                     (IV)data->substrs[i].min_offset,
127                     (IV)data->substrs[i].max_offset
128                 );
129                 Perl_debug_show_study_flags(aTHX_ data->substrs[i].flags," [","]");
130             }
131         }
132 
133         Perl_re_printf( aTHX_ "\n");
134     });
135 }
136 
137 
138 void
Perl_debug_peep(pTHX_ const char * str,const RExC_state_t * pRExC_state,regnode * scan,U32 depth,U32 flags)139 Perl_debug_peep(pTHX_ const char *str, const RExC_state_t *pRExC_state,
140                 regnode *scan, U32 depth, U32 flags)
141 {
142     PERL_ARGS_ASSERT_DEBUG_PEEP;
143     DECLARE_AND_GET_RE_DEBUG_FLAGS;
144 
145     DEBUG_OPTIMISE_r({
146         regnode *Next;
147 
148         if (!scan)
149             return;
150         Next = regnext(scan);
151         regprop(RExC_rx, RExC_mysv, scan, NULL, pRExC_state);
152         Perl_re_indentf( aTHX_   "%s>%3d: %s (%d)",
153             depth,
154             str,
155             REG_NODE_NUM(scan), SvPV_nolen_const(RExC_mysv),
156             Next ? (REG_NODE_NUM(Next)) : 0 );
157         Perl_debug_show_study_flags(aTHX_ flags," [ ","]");
158         Perl_re_printf( aTHX_  "\n");
159    });
160 }
161 
162 #endif /* DEBUGGING */
163 
164 /*
165  - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
166  */
167 #ifdef DEBUGGING
168 
169 static void
S_regdump_intflags(pTHX_ const char * lead,const U32 flags)170 S_regdump_intflags(pTHX_ const char *lead, const U32 flags)
171 {
172     int bit;
173     int set=0;
174 
175     ASSUME(REG_INTFLAGS_NAME_SIZE <= sizeof(flags)*8);
176 
177     for (bit=0; bit<=REG_INTFLAGS_NAME_SIZE; bit++) {
178         if (flags & (1<<bit)) {
179             if (!set++ && lead)
180                 Perl_re_printf( aTHX_  "%s", lead);
181             Perl_re_printf( aTHX_  "%s ", PL_reg_intflags_name[bit]);
182         }
183     }
184     if (lead)  {
185         if (set)
186             Perl_re_printf( aTHX_  "\n");
187         else
188             Perl_re_printf( aTHX_  "%s[none-set]\n", lead);
189     }
190 }
191 
192 static void
S_regdump_extflags(pTHX_ const char * lead,const U32 flags)193 S_regdump_extflags(pTHX_ const char *lead, const U32 flags)
194 {
195     int bit;
196     int set=0;
197     regex_charset cs;
198 
199     ASSUME(REG_EXTFLAGS_NAME_SIZE <= sizeof(flags)*8);
200 
201     for (bit=0; bit<REG_EXTFLAGS_NAME_SIZE; bit++) {
202         if (flags & (1U<<bit)) {
203             if ((1U<<bit) & RXf_PMf_CHARSET) {  /* Output separately, below */
204                 continue;
205             }
206             if (!set++ && lead)
207                 Perl_re_printf( aTHX_  "%s", lead);
208             Perl_re_printf( aTHX_  "%s ", PL_reg_extflags_name[bit]);
209         }
210     }
211     if ((cs = get_regex_charset(flags)) != REGEX_DEPENDS_CHARSET) {
212             if (!set++ && lead) {
213                 Perl_re_printf( aTHX_  "%s", lead);
214             }
215             switch (cs) {
216                 case REGEX_UNICODE_CHARSET:
217                     Perl_re_printf( aTHX_  "UNICODE");
218                     break;
219                 case REGEX_LOCALE_CHARSET:
220                     Perl_re_printf( aTHX_  "LOCALE");
221                     break;
222                 case REGEX_ASCII_RESTRICTED_CHARSET:
223                     Perl_re_printf( aTHX_  "ASCII-RESTRICTED");
224                     break;
225                 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
226                     Perl_re_printf( aTHX_  "ASCII-MORE_RESTRICTED");
227                     break;
228                 default:
229                     Perl_re_printf( aTHX_  "UNKNOWN CHARACTER SET");
230                     break;
231             }
232     }
233     if (lead)  {
234         if (set)
235             Perl_re_printf( aTHX_  "\n");
236         else
237             Perl_re_printf( aTHX_  "%s[none-set]\n", lead);
238     }
239 }
240 #endif
241 
242 void
Perl_regdump(pTHX_ const regexp * r)243 Perl_regdump(pTHX_ const regexp *r)
244 {
245 #ifdef DEBUGGING
246     int i;
247     SV * const sv = sv_newmortal();
248     SV *dsv= sv_newmortal();
249     RXi_GET_DECL(r, ri);
250     DECLARE_AND_GET_RE_DEBUG_FLAGS;
251 
252     PERL_ARGS_ASSERT_REGDUMP;
253 
254     (void)dumpuntil(r, ri->program, ri->program + 1, NULL, NULL, sv, 0, 0);
255 
256     /* Header fields of interest. */
257     for (i = 0; i < 2; i++) {
258         if (r->substrs->data[i].substr) {
259             RE_PV_QUOTED_DECL(s, 0, dsv,
260                             SvPVX_const(r->substrs->data[i].substr),
261                             RE_SV_DUMPLEN(r->substrs->data[i].substr),
262                             PL_dump_re_max_len);
263             Perl_re_printf( aTHX_
264                           "%s %s%s at %" IVdf "..%" UVuf " ",
265                           i ? "floating" : "anchored",
266                           s,
267                           RE_SV_TAIL(r->substrs->data[i].substr),
268                           (IV)r->substrs->data[i].min_offset,
269                           (UV)r->substrs->data[i].max_offset);
270         }
271         else if (r->substrs->data[i].utf8_substr) {
272             RE_PV_QUOTED_DECL(s, 1, dsv,
273                             SvPVX_const(r->substrs->data[i].utf8_substr),
274                             RE_SV_DUMPLEN(r->substrs->data[i].utf8_substr),
275                             30);
276             Perl_re_printf( aTHX_
277                           "%s utf8 %s%s at %" IVdf "..%" UVuf " ",
278                           i ? "floating" : "anchored",
279                           s,
280                           RE_SV_TAIL(r->substrs->data[i].utf8_substr),
281                           (IV)r->substrs->data[i].min_offset,
282                           (UV)r->substrs->data[i].max_offset);
283         }
284     }
285 
286     if (r->check_substr || r->check_utf8)
287         Perl_re_printf( aTHX_
288                       (const char *)
289                       (   r->check_substr == r->substrs->data[1].substr
290                        && r->check_utf8   == r->substrs->data[1].utf8_substr
291                        ? "(checking floating" : "(checking anchored"));
292     if (r->intflags & PREGf_NOSCAN)
293         Perl_re_printf( aTHX_  " noscan");
294     if (r->extflags & RXf_CHECK_ALL)
295         Perl_re_printf( aTHX_  " isall");
296     if (r->check_substr || r->check_utf8)
297         Perl_re_printf( aTHX_  ") ");
298 
299     if (ri->regstclass) {
300         regprop(r, sv, ri->regstclass, NULL, NULL);
301         Perl_re_printf( aTHX_  "stclass %s ", SvPVX_const(sv));
302     }
303     if (r->intflags & PREGf_ANCH) {
304         Perl_re_printf( aTHX_  "anchored");
305         if (r->intflags & PREGf_ANCH_MBOL)
306             Perl_re_printf( aTHX_  "(MBOL)");
307         if (r->intflags & PREGf_ANCH_SBOL)
308             Perl_re_printf( aTHX_  "(SBOL)");
309         if (r->intflags & PREGf_ANCH_GPOS)
310             Perl_re_printf( aTHX_  "(GPOS)");
311         Perl_re_printf( aTHX_ " ");
312     }
313     if (r->intflags & PREGf_GPOS_SEEN)
314         Perl_re_printf( aTHX_  "GPOS:%" UVuf " ", (UV)r->gofs);
315     if (r->intflags & PREGf_SKIP)
316         Perl_re_printf( aTHX_  "plus ");
317     if (r->intflags & PREGf_IMPLICIT)
318         Perl_re_printf( aTHX_  "implicit ");
319     Perl_re_printf( aTHX_  "minlen %" IVdf " ", (IV)r->minlen);
320     if (r->extflags & RXf_EVAL_SEEN)
321         Perl_re_printf( aTHX_  "with eval ");
322     Perl_re_printf( aTHX_  "\n");
323     DEBUG_FLAGS_r({
324         regdump_extflags("r->extflags: ", r->extflags);
325         regdump_intflags("r->intflags: ", r->intflags);
326     });
327 #else
328     PERL_ARGS_ASSERT_REGDUMP;
329     PERL_UNUSED_CONTEXT;
330     PERL_UNUSED_ARG(r);
331 #endif  /* DEBUGGING */
332 }
333 
334 /* Should be synchronized with ANYOF_ #defines in regcomp.h */
335 #ifdef DEBUGGING
336 
337 #  if   CC_WORDCHAR_ != 0 || CC_DIGIT_ != 1        || CC_ALPHA_ != 2    \
338      || CC_LOWER_ != 3    || CC_UPPER_ != 4        || CC_PUNCT_ != 5    \
339      || CC_PRINT_ != 6    || CC_ALPHANUMERIC_ != 7 || CC_GRAPH_ != 8    \
340      || CC_CASED_ != 9    || CC_SPACE_ != 10       || CC_BLANK_ != 11   \
341      || CC_XDIGIT_ != 12  || CC_CNTRL_ != 13       || CC_ASCII_ != 14   \
342      || CC_VERTSPACE_ != 15
343 #   error Need to adjust order of anyofs[]
344 #  endif
345 static const char * const anyofs[] = {
346     "\\w",
347     "\\W",
348     "\\d",
349     "\\D",
350     "[:alpha:]",
351     "[:^alpha:]",
352     "[:lower:]",
353     "[:^lower:]",
354     "[:upper:]",
355     "[:^upper:]",
356     "[:punct:]",
357     "[:^punct:]",
358     "[:print:]",
359     "[:^print:]",
360     "[:alnum:]",
361     "[:^alnum:]",
362     "[:graph:]",
363     "[:^graph:]",
364     "[:cased:]",
365     "[:^cased:]",
366     "\\s",
367     "\\S",
368     "[:blank:]",
369     "[:^blank:]",
370     "[:xdigit:]",
371     "[:^xdigit:]",
372     "[:cntrl:]",
373     "[:^cntrl:]",
374     "[:ascii:]",
375     "[:^ascii:]",
376     "\\v",
377     "\\V"
378 };
379 #endif
380 
381 /*
382 - regprop - printable representation of opcode, with run time support
383 */
384 
385 void
Perl_regprop(pTHX_ const regexp * prog,SV * sv,const regnode * o,const regmatch_info * reginfo,const RExC_state_t * pRExC_state)386 Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_info *reginfo, const RExC_state_t *pRExC_state)
387 {
388 #ifdef DEBUGGING
389     U8 k;
390     const U8 op = OP(o);
391     RXi_GET_DECL(prog, progi);
392     DECLARE_AND_GET_RE_DEBUG_FLAGS;
393 
394     PERL_ARGS_ASSERT_REGPROP;
395 
396     SvPVCLEAR(sv);
397 
398     if (op > REGNODE_MAX) {          /* regnode.type is unsigned */
399         if (pRExC_state) {  /* This gives more info, if we have it */
400             FAIL3("panic: corrupted regexp opcode %d > %d",
401                   (int)op, (int)REGNODE_MAX);
402         }
403         else {
404             Perl_croak(aTHX_ "panic: corrupted regexp opcode %d > %d",
405                              (int)op, (int)REGNODE_MAX);
406         }
407     }
408     sv_catpv(sv, REGNODE_NAME(op)); /* Take off const! */
409 
410     k = REGNODE_TYPE(op);
411     if (op == BRANCH) {
412         Perl_sv_catpvf(aTHX_ sv, " (buf:%" IVdf "/%" IVdf ")", (IV)ARG1a(o),(IV)ARG1b(o));
413     }
414     else if (op == BRANCHJ) {
415         Perl_sv_catpvf(aTHX_ sv, " (buf:%" IVdf "/%" IVdf ")", (IV)ARG2a(o),(IV)ARG2b(o));
416     }
417     else if (k == EXACT) {
418         sv_catpvs(sv, " ");
419         /* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
420          * is a crude hack but it may be the best for now since
421          * we have no flag "this EXACTish node was UTF-8"
422          * --jhi */
423         pv_pretty(sv, STRING(o), STR_LEN(o), PL_dump_re_max_len,
424                   PL_colors[0], PL_colors[1],
425                   PERL_PV_ESCAPE_UNI_DETECT |
426                   PERL_PV_ESCAPE_NONASCII   |
427                   PERL_PV_PRETTY_ELLIPSES   |
428                   PERL_PV_PRETTY_LTGT       |
429                   PERL_PV_PRETTY_NOCLEAR
430                   );
431     } else if (k == TRIE) {
432         /* print the details of the trie in dumpuntil instead, as
433          * progi->data isn't available here */
434         const U32 n = ARG1u(o);
435         const reg_ac_data * const ac = IS_TRIE_AC(op) ?
436                (reg_ac_data *)progi->data->data[n] :
437                NULL;
438         const reg_trie_data * const trie
439             = (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie];
440 
441         Perl_sv_catpvf(aTHX_ sv, "-%s", REGNODE_NAME(FLAGS(o)));
442         DEBUG_TRIE_COMPILE_r({
443           if (trie->jump)
444             sv_catpvs(sv, "(JUMP)");
445           Perl_sv_catpvf(aTHX_ sv,
446             "<S:%" UVuf "/%" IVdf " W:%" UVuf " L:%" UVuf "/%" UVuf " C:%" UVuf "/%" UVuf ">",
447             (UV)trie->startstate,
448             (IV)trie->statecount-1, /* -1 because of the unused 0 element */
449             (UV)trie->wordcount,
450             (UV)trie->minlen,
451             (UV)trie->maxlen,
452             (UV)TRIE_CHARCOUNT(trie),
453             (UV)trie->uniquecharcount
454           );
455         });
456         if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
457             sv_catpvs(sv, "[");
458             (void) put_charclass_bitmap_innards(sv,
459                                                 ((IS_ANYOF_TRIE(op))
460                                                  ? ANYOF_BITMAP(o)
461                                                  : TRIE_BITMAP(trie)),
462                                                 NULL,
463                                                 NULL,
464                                                 NULL,
465                                                 0,
466                                                 FALSE
467                                                );
468             sv_catpvs(sv, "]");
469         }
470         if (trie->before_paren || trie->after_paren)
471             Perl_sv_catpvf(aTHX_ sv, " (buf:%" IVdf "/%" IVdf ")",
472                     (IV)trie->before_paren,(IV)trie->after_paren);
473     } else if (k == CURLY) {
474         U32 lo = ARG1i(o), hi = ARG2i(o);
475         if (ARG3u(o)) /* check both ARG3a and ARG3b at the same time */
476             Perl_sv_catpvf(aTHX_ sv, "<%d:%d>", ARG3a(o),ARG3b(o)); /* paren before, paren after */
477         if (op == CURLYM || op == CURLYN || op == CURLYX)
478             Perl_sv_catpvf(aTHX_ sv, "[%d]", FLAGS(o)); /* Parenth number */
479         Perl_sv_catpvf(aTHX_ sv, "{%u,", (unsigned) lo);
480         if (hi == REG_INFTY)
481             sv_catpvs(sv, "INFTY");
482         else
483             Perl_sv_catpvf(aTHX_ sv, "%u", (unsigned) hi);
484         sv_catpvs(sv, "}");
485     }
486     else if (k == WHILEM && FLAGS(o))                   /* Ordinal/of */
487         Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", FLAGS(o) & 0xf, FLAGS(o)>>4);
488     else if (k == REF || k == OPEN || k == CLOSE
489              || k == GROUPP || op == ACCEPT)
490     {
491         AV *name_list= NULL;
492         U32 parno= (op == ACCEPT)              ? ARG2u(o) :
493                    (op == OPEN || op == CLOSE) ? PARNO(o) :
494                                                  ARG1u(o);
495         if ( RXp_PAREN_NAMES(prog) ) {
496             name_list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
497         } else if ( pRExC_state ) {
498             name_list= RExC_paren_name_list;
499         }
500         if ( name_list ) {
501             if ( k != REF || (op < REFN)) {
502                 UV logical_parno = parno;
503                 if (prog->parno_to_logical)
504                     logical_parno = prog->parno_to_logical[parno];
505 
506                 Perl_sv_catpvf(aTHX_ sv, "%" UVuf, (UV)logical_parno);     /* Parenth number */
507                 if (parno != logical_parno)
508                     Perl_sv_catpvf(aTHX_ sv, "/%" UVuf, (UV)parno);        /* Parenth number */
509 
510                 SV **name= av_fetch_simple(name_list, parno, 0 );
511                 if (name)
512                     Perl_sv_catpvf(aTHX_ sv, " '%" SVf "'", SVfARG(*name));
513             }
514             else
515             if (parno > 0) {
516                 /* parno must always be larger than 0 for this block
517                  * as it represents a slot into the data array, which
518                  * has the 0 slot reserved for a placeholder so any valid
519                  * index into it is always true, eg non-zero
520                  * see the '%' "what" type and the implementation of
521                  * S_reg_add_data()
522                  */
523                 SV *sv_dat= MUTABLE_SV(progi->data->data[ parno ]);
524                 I32 *nums=(I32*)SvPVX(sv_dat);
525                 SV **name= av_fetch_simple(name_list, nums[0], 0 );
526                 I32 n;
527                 if (name) {
528                     for ( n=0; n<SvIVX(sv_dat); n++ ) {
529                         Perl_sv_catpvf(aTHX_ sv, "%s%" IVdf,
530                                     (n ? "," : ""), (IV)nums[n]);
531                     }
532                     Perl_sv_catpvf(aTHX_ sv, " '%" SVf "'", SVfARG(*name));
533                 }
534             }
535         } else if (parno>0) {
536             UV logical_parno = parno;
537             if (prog->parno_to_logical)
538                 logical_parno = prog->parno_to_logical[parno];
539 
540             Perl_sv_catpvf(aTHX_ sv, "%" UVuf, (UV)logical_parno);     /* Parenth number */
541             if (logical_parno != parno)
542                 Perl_sv_catpvf(aTHX_ sv, "/%" UVuf, (UV)parno);     /* Parenth number */
543 
544         }
545         if ( k == REF ) {
546             Perl_sv_catpvf(aTHX_ sv, " <%" IVdf ">", (IV)ARG2i(o));
547         }
548         if ( k == REF && reginfo) {
549             U32 n = ARG1u(o);  /* which paren pair */
550             I32 ln = RXp_OFFS_START(prog,n);
551             if (RXp_LASTPAREN(prog) < n || ln == -1 || RXp_OFFS_END(prog,n) == -1)
552                 Perl_sv_catpvf(aTHX_ sv, ": FAIL");
553             else if (ln == RXp_OFFS_END(prog,n))
554                 Perl_sv_catpvf(aTHX_ sv, ": ACCEPT - EMPTY STRING");
555             else {
556                 const char *s = reginfo->strbeg + ln;
557                 Perl_sv_catpvf(aTHX_ sv, ": ");
558                 Perl_pv_pretty( aTHX_ sv, s, RXp_OFFS_END(prog,n) - RXp_OFFS_START(prog,n), 32, 0, 0,
559                     PERL_PV_ESCAPE_UNI_DETECT|PERL_PV_PRETTY_NOCLEAR|PERL_PV_PRETTY_ELLIPSES|PERL_PV_PRETTY_QUOTE );
560             }
561         }
562     } else if (k == GOSUB) {
563         AV *name_list= NULL;
564         IV parno = ARG1u(o);
565         IV logical_parno = (parno && prog->parno_to_logical)
566                          ? prog->parno_to_logical[parno]
567                          : parno;
568         if ( RXp_PAREN_NAMES(prog) ) {
569             name_list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
570         } else if ( pRExC_state ) {
571             name_list= RExC_paren_name_list;
572         }
573 
574         /* Paren and offset */
575         Perl_sv_catpvf(aTHX_ sv, "%" IVdf, logical_parno);
576         if (logical_parno != parno)
577             Perl_sv_catpvf(aTHX_ sv, "/%" IVdf, parno);
578 
579         Perl_sv_catpvf(aTHX_ sv, "[%+d:%d]", (int)ARG2i(o),
580                 (int)((o + (int)ARG2i(o)) - progi->program) );
581         if (name_list) {
582             SV **name= av_fetch_simple(name_list, ARG1u(o), 0 );
583             if (name)
584                 Perl_sv_catpvf(aTHX_ sv, " '%" SVf "'", SVfARG(*name));
585         }
586     }
587     else if (k == LOGICAL)
588         /* 2: embedded, otherwise 1 */
589         Perl_sv_catpvf(aTHX_ sv, "[%d]", FLAGS(o));
590     else if (k == ANYOF || k == ANYOFH || k == ANYOFR) {
591         U8 flags;
592         char * bitmap;
593         U8 do_sep = 0;    /* Do we need to separate various components of the
594                              output? */
595         /* Set if there is still an unresolved user-defined property */
596         SV *unresolved                = NULL;
597 
598         /* Things that are ignored except when the runtime locale is UTF-8 */
599         SV *only_utf8_locale_invlist = NULL;
600 
601         /* Code points that don't fit in the bitmap */
602         SV *nonbitmap_invlist = NULL;
603 
604         /* And things that aren't in the bitmap, but are small enough to be */
605         SV* bitmap_range_not_in_bitmap = NULL;
606 
607         bool inverted;
608 
609         if (k != ANYOF) {
610             flags = 0;
611             bitmap = NULL;
612         }
613         else {
614             flags = ANYOF_FLAGS(o);
615             bitmap = ANYOF_BITMAP(o);
616         }
617 
618         if (op == ANYOFL || op == ANYOFPOSIXL) {
619             if ((flags & ANYOFL_UTF8_LOCALE_REQD)) {
620                 sv_catpvs(sv, "{utf8-locale-reqd}");
621             }
622             if (flags & ANYOFL_FOLD) {
623                 sv_catpvs(sv, "{i}");
624             }
625         }
626 
627         inverted = flags & ANYOF_INVERT;
628 
629         /* If there is stuff outside the bitmap, get it */
630         if (k == ANYOFR) {
631 
632             /* For a single range, split into the parts inside vs outside the
633              * bitmap. */
634             UV start = ANYOFRbase(o);
635             UV end   = ANYOFRbase(o) + ANYOFRdelta(o);
636 
637             if (start < NUM_ANYOF_CODE_POINTS) {
638                 if (end < NUM_ANYOF_CODE_POINTS) {
639                     bitmap_range_not_in_bitmap
640                           = _add_range_to_invlist(bitmap_range_not_in_bitmap,
641                                                   start, end);
642                 }
643                 else {
644                     bitmap_range_not_in_bitmap
645                           = _add_range_to_invlist(bitmap_range_not_in_bitmap,
646                                                   start, NUM_ANYOF_CODE_POINTS);
647                     start = NUM_ANYOF_CODE_POINTS;
648                 }
649             }
650 
651             if (start >= NUM_ANYOF_CODE_POINTS) {
652                 nonbitmap_invlist = _add_range_to_invlist(nonbitmap_invlist,
653                                                 ANYOFRbase(o),
654                                                 ANYOFRbase(o) + ANYOFRdelta(o));
655             }
656         }
657         else if (ANYOF_MATCHES_ALL_OUTSIDE_BITMAP(o)) {
658             nonbitmap_invlist = _add_range_to_invlist(nonbitmap_invlist,
659                                                       NUM_ANYOF_CODE_POINTS,
660                                                       UV_MAX);
661         }
662         else if (ANYOF_HAS_AUX(o)) {
663                 (void) GET_REGCLASS_AUX_DATA(prog, o, FALSE,
664                                                 &unresolved,
665                                                 &only_utf8_locale_invlist,
666                                                 &nonbitmap_invlist);
667 
668             /* The aux data may contain stuff that could fit in the bitmap.
669              * This could come from a user-defined property being finally
670              * resolved when this call was done; or much more likely because
671              * there are matches that require UTF-8 to be valid, and so aren't
672              * in the bitmap (or ANYOFR).  This is teased apart later */
673             _invlist_intersection(nonbitmap_invlist,
674                                   PL_InBitmap,
675                                   &bitmap_range_not_in_bitmap);
676             /* Leave just the things that don't fit into the bitmap */
677             _invlist_subtract(nonbitmap_invlist,
678                               PL_InBitmap,
679                               &nonbitmap_invlist);
680         }
681 
682         /* Ready to start outputting.  First, the initial left bracket */
683         Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
684 
685         if (   bitmap
686             || bitmap_range_not_in_bitmap
687             || only_utf8_locale_invlist
688             || unresolved)
689         {
690             /* Then all the things that could fit in the bitmap */
691             do_sep = put_charclass_bitmap_innards(
692                                     sv,
693                                     bitmap,
694                                     bitmap_range_not_in_bitmap,
695                                     only_utf8_locale_invlist,
696                                     o,
697                                     flags,
698 
699                                     /* Can't try inverting for a
700                                                    * better display if there
701                                                    * are things that haven't
702                                                    * been resolved */
703                                     (unresolved != NULL || k == ANYOFR));
704             SvREFCNT_dec(bitmap_range_not_in_bitmap);
705 
706             /* If there are user-defined properties which haven't been defined
707              * yet, output them.  If the result is not to be inverted, it is
708              * clearest to output them in a separate [] from the bitmap range
709              * stuff.  If the result is to be complemented, we have to show
710              * everything in one [], as the inversion applies to the whole
711              * thing.  Use {braces} to separate them from anything in the
712              * bitmap and anything above the bitmap. */
713             if (unresolved) {
714                 if (inverted) {
715                     if (! do_sep) { /* If didn't output anything in the bitmap
716                                      */
717                         sv_catpvs(sv, "^");
718                     }
719                     sv_catpvs(sv, "{");
720                 }
721                 else if (do_sep) {
722                     Perl_sv_catpvf(aTHX_ sv,"%s][%s", PL_colors[1],
723                                                       PL_colors[0]);
724                 }
725                 sv_catsv(sv, unresolved);
726                 if (inverted) {
727                     sv_catpvs(sv, "}");
728                 }
729                 do_sep = ! inverted;
730             }
731             else if (     do_sep == 2
732                      && ! nonbitmap_invlist
733                      &&   ANYOF_MATCHES_NONE_OUTSIDE_BITMAP(o))
734             {
735                 /* Here, the display shows the class as inverted, and
736                  * everything above the lower display should also match, but
737                  * there is no indication of that.  Add this range so the code
738                  * below will add it to the display */
739                 _invlist_union_complement_2nd(nonbitmap_invlist,
740                                               PL_InBitmap,
741                                               &nonbitmap_invlist);
742             }
743         }
744 
745         /* And, finally, add the above-the-bitmap stuff */
746         if (nonbitmap_invlist && _invlist_len(nonbitmap_invlist)) {
747             SV* contents;
748 
749             /* See if truncation size is overridden */
750             const STRLEN dump_len = (PL_dump_re_max_len > 256)
751                                     ? PL_dump_re_max_len
752                                     : 256;
753 
754             /* This is output in a separate [] */
755             if (do_sep) {
756                 Perl_sv_catpvf(aTHX_ sv,"%s][%s", PL_colors[1], PL_colors[0]);
757             }
758 
759             /* And, for easy of understanding, it is shown in the
760              * uncomplemented form if possible.  The one exception being if
761              * there are unresolved items, where the inversion has to be
762              * delayed until runtime */
763             if (inverted && ! unresolved) {
764                 _invlist_invert(nonbitmap_invlist);
765                 _invlist_subtract(nonbitmap_invlist, PL_InBitmap, &nonbitmap_invlist);
766             }
767 
768             contents = invlist_contents(nonbitmap_invlist,
769                                         FALSE /* output suitable for catsv */
770                                        );
771 
772             /* If the output is shorter than the permissible maximum, just do it. */
773             if (SvCUR(contents) <= dump_len) {
774                 sv_catsv(sv, contents);
775             }
776             else {
777                 const char * contents_string = SvPVX(contents);
778                 STRLEN i = dump_len;
779 
780                 /* Otherwise, start at the permissible max and work back to the
781                  * first break possibility */
782                 while (i > 0 && contents_string[i] != ' ') {
783                     i--;
784                 }
785                 if (i == 0) {       /* Fail-safe.  Use the max if we couldn't
786                                        find a legal break */
787                     i = dump_len;
788                 }
789 
790                 sv_catpvn(sv, contents_string, i);
791                 sv_catpvs(sv, "...");
792             }
793 
794             SvREFCNT_dec_NN(contents);
795             SvREFCNT_dec_NN(nonbitmap_invlist);
796         }
797 
798         /* And finally the matching, closing ']' */
799         Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
800 
801         if (op == ANYOFHs) {
802             Perl_sv_catpvf(aTHX_ sv, " (Leading UTF-8 bytes=%s", _byte_dump_string((U8 *) ((struct regnode_anyofhs *) o)->string, FLAGS(o), 1));
803         }
804         else if (REGNODE_TYPE(op) != ANYOF) {
805             U8 lowest = (op != ANYOFHr)
806                          ? FLAGS(o)
807                          : LOWEST_ANYOF_HRx_BYTE(FLAGS(o));
808             U8 highest = (op == ANYOFHr)
809                          ? HIGHEST_ANYOF_HRx_BYTE(FLAGS(o))
810                          : (op == ANYOFH || op == ANYOFR)
811                            ? 0xFF
812                            : lowest;
813 #ifndef EBCDIC
814             if (op != ANYOFR || ! isASCII(ANYOFRbase(o) + ANYOFRdelta(o)))
815 #endif
816             {
817                 Perl_sv_catpvf(aTHX_ sv, " (First UTF-8 byte=%02X", lowest);
818                 if (lowest != highest) {
819                     Perl_sv_catpvf(aTHX_ sv, "-%02X", highest);
820                 }
821                 Perl_sv_catpvf(aTHX_ sv, ")");
822             }
823         }
824 
825         SvREFCNT_dec(unresolved);
826     }
827     else if (k == ANYOFM) {
828         SV * cp_list = get_ANYOFM_contents(o);
829 
830         Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
831         if (op == NANYOFM) {
832             _invlist_invert(cp_list);
833         }
834 
835         put_charclass_bitmap_innards(sv, NULL, cp_list, NULL, NULL, 0, TRUE);
836         Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
837 
838         SvREFCNT_dec(cp_list);
839     }
840     else if (k == ANYOFHbbm) {
841         SV * cp_list = get_ANYOFHbbm_contents(o);
842         Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
843 
844         sv_catsv(sv, invlist_contents(cp_list,
845                                       FALSE /* output suitable for catsv */
846                                      ));
847         Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
848 
849         SvREFCNT_dec(cp_list);
850     }
851     else if (k == POSIXD || k == NPOSIXD) {
852         U8 index = FLAGS(o) * 2;
853         if (index < C_ARRAY_LENGTH(anyofs)) {
854             if (*anyofs[index] != '[')  {
855                 sv_catpvs(sv, "[");
856             }
857             sv_catpv(sv, anyofs[index]);
858             if (*anyofs[index] != '[')  {
859                 sv_catpvs(sv, "]");
860             }
861         }
862         else {
863             Perl_sv_catpvf(aTHX_ sv, "[illegal type=%d])", index);
864         }
865     }
866     else if (k == BOUND || k == NBOUND) {
867         /* Must be synced with order of 'bound_type' in regcomp.h */
868         const char * const bounds[] = {
869             "",      /* Traditional */
870             "{gcb}",
871             "{lb}",
872             "{sb}",
873             "{wb}"
874         };
875         assert(FLAGS(o) < C_ARRAY_LENGTH(bounds));
876         sv_catpv(sv, bounds[FLAGS(o)]);
877     }
878     else if (k == BRANCHJ && (op == UNLESSM || op == IFMATCH)) {
879         Perl_sv_catpvf(aTHX_ sv, "[%d", -(FLAGS(o)));
880         if (NEXT_OFF(o)) {
881             Perl_sv_catpvf(aTHX_ sv, "..-%d", FLAGS(o) - NEXT_OFF(o));
882         }
883         Perl_sv_catpvf(aTHX_ sv, "]");
884     }
885     else if (op == SBOL)
886         Perl_sv_catpvf(aTHX_ sv, " /%s/", FLAGS(o) ? "\\A" : "^");
887     else if (op == EVAL) {
888         if (FLAGS(o) & EVAL_OPTIMISTIC_FLAG)
889             Perl_sv_catpvf(aTHX_ sv, " optimistic");
890     }
891 
892     /* add on the verb argument if there is one */
893     if ( ( k == VERB || op == ACCEPT || op == OPFAIL ) && FLAGS(o)) {
894         if ( ARG1u(o) )
895             Perl_sv_catpvf(aTHX_ sv, ":%" SVf,
896                        SVfARG((MUTABLE_SV(progi->data->data[ ARG1u( o ) ]))));
897         else
898             sv_catpvs(sv, ":NULL");
899     }
900 #else
901     PERL_UNUSED_CONTEXT;
902     PERL_UNUSED_ARG(sv);
903     PERL_UNUSED_ARG(o);
904     PERL_UNUSED_ARG(prog);
905     PERL_UNUSED_ARG(reginfo);
906     PERL_UNUSED_ARG(pRExC_state);
907 #endif  /* DEBUGGING */
908 }
909 
910 #ifdef DEBUGGING
911 
912 STATIC void
S_put_code_point(pTHX_ SV * sv,UV c)913 S_put_code_point(pTHX_ SV *sv, UV c)
914 {
915     PERL_ARGS_ASSERT_PUT_CODE_POINT;
916 
917     if (c > 255) {
918         Perl_sv_catpvf(aTHX_ sv, "\\x{%04" UVXf "}", c);
919     }
920     else if (isPRINT(c)) {
921         const char string = (char) c;
922 
923         /* We use {phrase} as metanotation in the class, so also escape literal
924          * braces */
925         if (isBACKSLASHED_PUNCT(c) || c == '{' || c == '}')
926             sv_catpvs(sv, "\\");
927         sv_catpvn(sv, &string, 1);
928     }
929     else if (isMNEMONIC_CNTRL(c)) {
930         Perl_sv_catpvf(aTHX_ sv, "%s", cntrl_to_mnemonic((U8) c));
931     }
932     else {
933         Perl_sv_catpvf(aTHX_ sv, "\\x%02X", (U8) c);
934     }
935 }
936 
937 STATIC void
S_put_range(pTHX_ SV * sv,UV start,const UV end,const bool allow_literals)938 S_put_range(pTHX_ SV *sv, UV start, const UV end, const bool allow_literals)
939 {
940     /* Appends to 'sv' a displayable version of the range of code points from
941      * 'start' to 'end'.  Mnemonics (like '\r') are used for the few controls
942      * that have them, when they occur at the beginning or end of the range.
943      * It uses hex to output the remaining code points, unless 'allow_literals'
944      * is true, in which case the printable ASCII ones are output as-is (though
945      * some of these will be escaped by put_code_point()).
946      *
947      * NOTE:  This is designed only for printing ranges of code points that fit
948      *        inside an ANYOF bitmap.  Higher code points are simply suppressed
949      */
950 
951     const unsigned int min_range_count = 3;
952 
953     assert(start <= end);
954 
955     PERL_ARGS_ASSERT_PUT_RANGE;
956 
957     while (start <= end) {
958         UV this_end;
959         const char * format;
960 
961         if (    end - start < min_range_count
962             && (end - start <= 2 || (isPRINT_A(start) && isPRINT_A(end))))
963         {
964             /* Output a range of 1 or 2 chars individually, or longer ranges
965              * when printable */
966             for (; start <= end; start++) {
967                 put_code_point(sv, start);
968             }
969             break;
970         }
971 
972         /* If permitted by the input options, and there is a possibility that
973          * this range contains a printable literal, look to see if there is
974          * one. */
975         if (allow_literals && start <= MAX_PRINT_A) {
976 
977             /* If the character at the beginning of the range isn't an ASCII
978              * printable, effectively split the range into two parts:
979              *  1) the portion before the first such printable,
980              *  2) the rest
981              * and output them separately. */
982             if (! isPRINT_A(start)) {
983                 UV temp_end = start + 1;
984 
985                 /* There is no point looking beyond the final possible
986                  * printable, in MAX_PRINT_A */
987                 UV max = MIN(end, MAX_PRINT_A);
988 
989                 while (temp_end <= max && ! isPRINT_A(temp_end)) {
990                     temp_end++;
991                 }
992 
993                 /* Here, temp_end points to one beyond the first printable if
994                  * found, or to one beyond 'max' if not.  If none found, make
995                  * sure that we use the entire range */
996                 if (temp_end > MAX_PRINT_A) {
997                     temp_end = end + 1;
998                 }
999 
1000                 /* Output the first part of the split range: the part that
1001                  * doesn't have printables, with the parameter set to not look
1002                  * for literals (otherwise we would infinitely recurse) */
1003                 put_range(sv, start, temp_end - 1, FALSE);
1004 
1005                 /* The 2nd part of the range (if any) starts here. */
1006                 start = temp_end;
1007 
1008                 /* We do a continue, instead of dropping down, because even if
1009                  * the 2nd part is non-empty, it could be so short that we want
1010                  * to output it as individual characters, as tested for at the
1011                  * top of this loop.  */
1012                 continue;
1013             }
1014 
1015             /* Here, 'start' is a printable ASCII.  If it is an alphanumeric,
1016              * output a sub-range of just the digits or letters, then process
1017              * the remaining portion as usual. */
1018             if (isALPHANUMERIC_A(start)) {
1019                 UV mask = (isDIGIT_A(start))
1020                            ? CC_DIGIT_
1021                              : isUPPER_A(start)
1022                                ? CC_UPPER_
1023                                : CC_LOWER_;
1024                 UV temp_end = start + 1;
1025 
1026                 /* Find the end of the sub-range that includes just the
1027                  * characters in the same class as the first character in it */
1028                 while (temp_end <= end && generic_isCC_A_(temp_end, mask)) {
1029                     temp_end++;
1030                 }
1031                 temp_end--;
1032 
1033                 /* For short ranges, don't duplicate the code above to output
1034                  * them; just call recursively */
1035                 if (temp_end - start < min_range_count) {
1036                     put_range(sv, start, temp_end, FALSE);
1037                 }
1038                 else {  /* Output as a range */
1039                     put_code_point(sv, start);
1040                     sv_catpvs(sv, "-");
1041                     put_code_point(sv, temp_end);
1042                 }
1043                 start = temp_end + 1;
1044                 continue;
1045             }
1046 
1047             /* We output any other printables as individual characters */
1048             if (isPUNCT_A(start) || isSPACE_A(start)) {
1049                 while (start <= end && (isPUNCT_A(start)
1050                                         || isSPACE_A(start)))
1051                 {
1052                     put_code_point(sv, start);
1053                     start++;
1054                 }
1055                 continue;
1056             }
1057         } /* End of looking for literals */
1058 
1059         /* Here is not to output as a literal.  Some control characters have
1060          * mnemonic names.  Split off any of those at the beginning and end of
1061          * the range to print mnemonically.  It isn't possible for many of
1062          * these to be in a row, so this won't overwhelm with output */
1063         if (   start <= end
1064             && (isMNEMONIC_CNTRL(start) || isMNEMONIC_CNTRL(end)))
1065         {
1066             while (isMNEMONIC_CNTRL(start) && start <= end) {
1067                 put_code_point(sv, start);
1068                 start++;
1069             }
1070 
1071             /* If this didn't take care of the whole range ... */
1072             if (start <= end) {
1073 
1074                 /* Look backwards from the end to find the final non-mnemonic
1075                  * */
1076                 UV temp_end = end;
1077                 while (isMNEMONIC_CNTRL(temp_end)) {
1078                     temp_end--;
1079                 }
1080 
1081                 /* And separately output the interior range that doesn't start
1082                  * or end with mnemonics */
1083                 put_range(sv, start, temp_end, FALSE);
1084 
1085                 /* Then output the mnemonic trailing controls */
1086                 start = temp_end + 1;
1087                 while (start <= end) {
1088                     put_code_point(sv, start);
1089                     start++;
1090                 }
1091                 break;
1092             }
1093         }
1094 
1095         /* As a final resort, output the range or subrange as hex. */
1096 
1097         if (start >= NUM_ANYOF_CODE_POINTS) {
1098             this_end = end;
1099         }
1100         else {  /* Have to split range at the bitmap boundary */
1101             this_end = (end < NUM_ANYOF_CODE_POINTS)
1102                         ? end
1103                         : NUM_ANYOF_CODE_POINTS - 1;
1104         }
1105 #if NUM_ANYOF_CODE_POINTS > 256
1106         format = (this_end < 256)
1107                  ? "\\x%02" UVXf "-\\x%02" UVXf
1108                  : "\\x{%04" UVXf "}-\\x{%04" UVXf "}";
1109 #else
1110         format = "\\x%02" UVXf "-\\x%02" UVXf;
1111 #endif
1112         GCC_DIAG_IGNORE_STMT(-Wformat-nonliteral);
1113         Perl_sv_catpvf(aTHX_ sv, format, start, this_end);
1114         GCC_DIAG_RESTORE_STMT;
1115         break;
1116     }
1117 }
1118 
1119 STATIC void
S_put_charclass_bitmap_innards_invlist(pTHX_ SV * sv,SV * invlist)1120 S_put_charclass_bitmap_innards_invlist(pTHX_ SV *sv, SV* invlist)
1121 {
1122     /* Concatenate onto the PV in 'sv' a displayable form of the inversion list
1123      * 'invlist' */
1124 
1125     UV start, end;
1126     bool allow_literals = TRUE;
1127 
1128     PERL_ARGS_ASSERT_PUT_CHARCLASS_BITMAP_INNARDS_INVLIST;
1129 
1130     /* Generally, it is more readable if printable characters are output as
1131      * literals, but if a range (nearly) spans all of them, it's best to output
1132      * it as a single range.  This code will use a single range if all but 2
1133      * ASCII printables are in it */
1134     invlist_iterinit(invlist);
1135     while (invlist_iternext(invlist, &start, &end)) {
1136 
1137         /* If the range starts beyond the final printable, it doesn't have any
1138          * in it */
1139         if (start > MAX_PRINT_A) {
1140             break;
1141         }
1142 
1143         /* In both ASCII and EBCDIC, a SPACE is the lowest printable.  To span
1144          * all but two, the range must start and end no later than 2 from
1145          * either end */
1146         if (start < ' ' + 2 && end > MAX_PRINT_A - 2) {
1147             if (end > MAX_PRINT_A) {
1148                 end = MAX_PRINT_A;
1149             }
1150             if (start < ' ') {
1151                 start = ' ';
1152             }
1153             if (end - start >= MAX_PRINT_A - ' ' - 2) {
1154                 allow_literals = FALSE;
1155             }
1156             break;
1157         }
1158     }
1159     invlist_iterfinish(invlist);
1160 
1161     /* Here we have figured things out.  Output each range */
1162     invlist_iterinit(invlist);
1163     while (invlist_iternext(invlist, &start, &end)) {
1164         if (start >= NUM_ANYOF_CODE_POINTS) {
1165             break;
1166         }
1167         put_range(sv, start, end, allow_literals);
1168     }
1169     invlist_iterfinish(invlist);
1170 
1171     return;
1172 }
1173 
1174 STATIC SV*
S_put_charclass_bitmap_innards_common(pTHX_ SV * invlist,SV * posixes,SV * only_utf8,SV * not_utf8,SV * only_utf8_locale,const bool invert)1175 S_put_charclass_bitmap_innards_common(pTHX_
1176         SV* invlist,            /* The bitmap */
1177         SV* posixes,            /* Under /l, things like [:word:], \S */
1178         SV* only_utf8,          /* Under /d, matches iff the target is UTF-8 */
1179         SV* not_utf8,           /* /d, matches iff the target isn't UTF-8 */
1180         SV* only_utf8_locale,   /* Under /l, matches if the locale is UTF-8 */
1181         const bool invert       /* Is the result to be inverted? */
1182 )
1183 {
1184     /* Create and return an SV containing a displayable version of the bitmap
1185      * and associated information determined by the input parameters.  If the
1186      * output would have been only the inversion indicator '^', NULL is instead
1187      * returned. */
1188 
1189     SV * output;
1190 
1191     PERL_ARGS_ASSERT_PUT_CHARCLASS_BITMAP_INNARDS_COMMON;
1192 
1193     if (invert) {
1194         output = newSVpvs("^");
1195     }
1196     else {
1197         output = newSVpvs("");
1198     }
1199 
1200     /* First, the code points in the bitmap that are unconditionally there */
1201     put_charclass_bitmap_innards_invlist(output, invlist);
1202 
1203     /* Traditionally, these have been placed after the main code points */
1204     if (posixes) {
1205         sv_catsv(output, posixes);
1206     }
1207 
1208     if (only_utf8 && _invlist_len(only_utf8)) {
1209         Perl_sv_catpvf(aTHX_ output, "%s{utf8}%s", PL_colors[1], PL_colors[0]);
1210         put_charclass_bitmap_innards_invlist(output, only_utf8);
1211     }
1212 
1213     if (not_utf8 && _invlist_len(not_utf8)) {
1214         Perl_sv_catpvf(aTHX_ output, "%s{not utf8}%s", PL_colors[1], PL_colors[0]);
1215         put_charclass_bitmap_innards_invlist(output, not_utf8);
1216     }
1217 
1218     if (only_utf8_locale && _invlist_len(only_utf8_locale)) {
1219         Perl_sv_catpvf(aTHX_ output, "%s{utf8 locale}%s", PL_colors[1], PL_colors[0]);
1220         put_charclass_bitmap_innards_invlist(output, only_utf8_locale);
1221 
1222         /* This is the only list in this routine that can legally contain code
1223          * points outside the bitmap range.  The call just above to
1224          * 'put_charclass_bitmap_innards_invlist' will simply suppress them, so
1225          * output them here.  There's about a half-dozen possible, and none in
1226          * contiguous ranges longer than 2 */
1227         if (invlist_highest(only_utf8_locale) >= NUM_ANYOF_CODE_POINTS) {
1228             UV start, end;
1229             SV* above_bitmap = NULL;
1230 
1231             _invlist_subtract(only_utf8_locale, PL_InBitmap, &above_bitmap);
1232 
1233             invlist_iterinit(above_bitmap);
1234             while (invlist_iternext(above_bitmap, &start, &end)) {
1235                 UV i;
1236 
1237                 for (i = start; i <= end; i++) {
1238                     put_code_point(output, i);
1239                 }
1240             }
1241             invlist_iterfinish(above_bitmap);
1242             SvREFCNT_dec_NN(above_bitmap);
1243         }
1244     }
1245 
1246     if (invert && SvCUR(output) == 1) {
1247         return NULL;
1248     }
1249 
1250     return output;
1251 }
1252 
1253 STATIC U8
S_put_charclass_bitmap_innards(pTHX_ SV * sv,char * bitmap,SV * nonbitmap_invlist,SV * only_utf8_locale_invlist,const regnode * const node,const U8 flags,const bool force_as_is_display)1254 S_put_charclass_bitmap_innards(pTHX_ SV *sv,
1255                                      char *bitmap,
1256                                      SV *nonbitmap_invlist,
1257                                      SV *only_utf8_locale_invlist,
1258                                      const regnode * const node,
1259                                      const U8 flags,
1260                                      const bool force_as_is_display)
1261 {
1262     /* Appends to 'sv' a displayable version of the innards of the bracketed
1263      * character class defined by the other arguments:
1264      *  'bitmap' points to the bitmap, or NULL if to ignore that.
1265      *  'nonbitmap_invlist' is an inversion list of the code points that are in
1266      *      the bitmap range, but for some reason aren't in the bitmap; NULL if
1267      *      none.  The reasons for this could be that they require some
1268      *      condition such as the target string being or not being in UTF-8
1269      *      (under /d), or because they came from a user-defined property that
1270      *      was not resolved at the time of the regex compilation (under /u)
1271      *  'only_utf8_locale_invlist' is an inversion list of the code points that
1272      *      are valid only if the runtime locale is a UTF-8 one; NULL if none
1273      *  'node' is the regex pattern ANYOF node.  It is needed only when the
1274      *      above two parameters are not null, and is passed so that this
1275      *      routine can tease apart the various reasons for them.
1276      *  'flags' is the flags field of 'node'
1277      *  'force_as_is_display' is TRUE if this routine should definitely NOT try
1278      *      to invert things to see if that leads to a cleaner display.  If
1279      *      FALSE, this routine is free to use its judgment about doing this.
1280      *
1281      * It returns 0 if nothing was actually output.  (It may be that
1282      *              the bitmap, etc is empty.)
1283      *            1 if the output wasn't inverted (didn't begin with a '^')
1284      *            2 if the output was inverted (did begin with a '^')
1285      *
1286      * When called for outputting the bitmap of a non-ANYOF node, just pass the
1287      * bitmap, with the succeeding parameters set to NULL, and the final one to
1288      * FALSE.
1289      */
1290 
1291     /* In general, it tries to display the 'cleanest' representation of the
1292      * innards, choosing whether to display them inverted or not, regardless of
1293      * whether the class itself is to be inverted.  However,  there are some
1294      * cases where it can't try inverting, as what actually matches isn't known
1295      * until runtime, and hence the inversion isn't either. */
1296 
1297     bool inverting_allowed = ! force_as_is_display;
1298 
1299     int i;
1300     STRLEN orig_sv_cur = SvCUR(sv);
1301 
1302     SV* invlist;            /* Inversion list we accumulate of code points that
1303                                are unconditionally matched */
1304     SV* only_utf8 = NULL;   /* Under /d, list of matches iff the target is
1305                                UTF-8 */
1306     SV* not_utf8 =  NULL;   /* /d, list of matches iff the target isn't UTF-8
1307                              */
1308     SV* posixes = NULL;     /* Under /l, string of things like [:word:], \D */
1309     SV* only_utf8_locale = NULL;    /* Under /l, list of matches if the locale
1310                                        is UTF-8 */
1311 
1312     SV* as_is_display;      /* The output string when we take the inputs
1313                                literally */
1314     SV* inverted_display;   /* The output string when we invert the inputs */
1315 
1316     bool invert = cBOOL(flags & ANYOF_INVERT);  /* Is the input to be inverted
1317                                                    to match? */
1318     /* We are biased in favor of displaying things without them being inverted,
1319      * as that is generally easier to understand */
1320     const int bias = 5;
1321 
1322     PERL_ARGS_ASSERT_PUT_CHARCLASS_BITMAP_INNARDS;
1323 
1324     /* Start off with whatever code points are passed in.  (We clone, so we
1325      * don't change the caller's list) */
1326     if (nonbitmap_invlist) {
1327         assert(invlist_highest(nonbitmap_invlist) < NUM_ANYOF_CODE_POINTS);
1328         invlist = invlist_clone(nonbitmap_invlist, NULL);
1329     }
1330     else {  /* Worst case size is every other code point is matched */
1331         invlist = _new_invlist(NUM_ANYOF_CODE_POINTS / 2);
1332     }
1333 
1334     if (flags) {
1335         if (OP(node) == ANYOFD) {
1336 
1337             /* This flag indicates that the code points below 0x100 in the
1338              * nonbitmap list are precisely the ones that match only when the
1339              * target is UTF-8 (they should all be non-ASCII). */
1340             if (flags & ANYOF_HAS_EXTRA_RUNTIME_MATCHES) {
1341                 _invlist_intersection(invlist, PL_UpperLatin1, &only_utf8);
1342                 _invlist_subtract(invlist, only_utf8, &invlist);
1343             }
1344 
1345             /* And this flag for matching all non-ASCII 0xFF and below */
1346             if (flags & ANYOFD_NON_UTF8_MATCHES_ALL_NON_ASCII__shared) {
1347                 not_utf8 = invlist_clone(PL_UpperLatin1, NULL);
1348             }
1349         }
1350         else if (OP(node) == ANYOFL || OP(node) == ANYOFPOSIXL) {
1351 
1352             /* If either of these flags are set, what matches isn't
1353              * determinable except during execution, so don't know enough here
1354              * to invert */
1355             if (flags & (ANYOFL_FOLD|ANYOF_MATCHES_POSIXL)) {
1356                 inverting_allowed = FALSE;
1357             }
1358 
1359             /* What the posix classes match also varies at runtime, so these
1360              * will be output symbolically. */
1361             if (ANYOF_POSIXL_TEST_ANY_SET(node)) {
1362                 int i;
1363 
1364                 posixes = newSVpvs("");
1365                 for (i = 0; i < ANYOF_POSIXL_MAX; i++) {
1366                     if (ANYOF_POSIXL_TEST(node, i)) {
1367                         sv_catpv(posixes, anyofs[i]);
1368                     }
1369                 }
1370             }
1371         }
1372     }
1373 
1374     /* Accumulate the bit map into the unconditional match list */
1375     if (bitmap) {
1376         for (i = 0; i < NUM_ANYOF_CODE_POINTS; i++) {
1377             if (BITMAP_TEST(bitmap, i)) {
1378                 int start = i++;
1379                 for (;
1380                      i < NUM_ANYOF_CODE_POINTS && BITMAP_TEST(bitmap, i);
1381                      i++)
1382                 { /* empty */ }
1383                 invlist = _add_range_to_invlist(invlist, start, i-1);
1384             }
1385         }
1386     }
1387 
1388     /* Make sure that the conditional match lists don't have anything in them
1389      * that match unconditionally; otherwise the output is quite confusing.
1390      * This could happen if the code that populates these misses some
1391      * duplication. */
1392     if (only_utf8) {
1393         _invlist_subtract(only_utf8, invlist, &only_utf8);
1394     }
1395     if (not_utf8) {
1396         _invlist_subtract(not_utf8, invlist, &not_utf8);
1397     }
1398 
1399     if (only_utf8_locale_invlist) {
1400 
1401         /* Since this list is passed in, we have to make a copy before
1402          * modifying it */
1403         only_utf8_locale = invlist_clone(only_utf8_locale_invlist, NULL);
1404 
1405         _invlist_subtract(only_utf8_locale, invlist, &only_utf8_locale);
1406 
1407         /* And, it can get really weird for us to try outputting an inverted
1408          * form of this list when it has things above the bitmap, so don't even
1409          * try */
1410         if (invlist_highest(only_utf8_locale) >= NUM_ANYOF_CODE_POINTS) {
1411             inverting_allowed = FALSE;
1412         }
1413     }
1414 
1415     /* Calculate what the output would be if we take the input as-is */
1416     as_is_display = put_charclass_bitmap_innards_common(invlist,
1417                                                     posixes,
1418                                                     only_utf8,
1419                                                     not_utf8,
1420                                                     only_utf8_locale,
1421                                                     invert);
1422 
1423     /* If have to take the output as-is, just do that */
1424     if (! inverting_allowed) {
1425         if (as_is_display) {
1426             sv_catsv(sv, as_is_display);
1427             SvREFCNT_dec_NN(as_is_display);
1428         }
1429     }
1430     else { /* But otherwise, create the output again on the inverted input, and
1431               use whichever version is shorter */
1432 
1433         int inverted_bias, as_is_bias;
1434 
1435         /* We will apply our bias to whichever of the results doesn't have
1436          * the '^' */
1437         bool trial_invert;
1438         if (invert) {
1439             trial_invert = FALSE;
1440             as_is_bias = bias;
1441             inverted_bias = 0;
1442         }
1443         else {
1444             trial_invert = TRUE;
1445             as_is_bias = 0;
1446             inverted_bias = bias;
1447         }
1448 
1449         /* Now invert each of the lists that contribute to the output,
1450          * excluding from the result things outside the possible range */
1451 
1452         /* For the unconditional inversion list, we have to add in all the
1453          * conditional code points, so that when inverted, they will be gone
1454          * from it */
1455         _invlist_union(only_utf8, invlist, &invlist);
1456         _invlist_union(not_utf8, invlist, &invlist);
1457         _invlist_union(only_utf8_locale, invlist, &invlist);
1458         _invlist_invert(invlist);
1459         _invlist_intersection(invlist, PL_InBitmap, &invlist);
1460 
1461         if (only_utf8) {
1462             _invlist_invert(only_utf8);
1463             _invlist_intersection(only_utf8, PL_UpperLatin1, &only_utf8);
1464         }
1465         else if (not_utf8) {
1466 
1467             /* If a code point matches iff the target string is not in UTF-8,
1468              * then complementing the result has it not match iff not in UTF-8,
1469              * which is the same thing as matching iff it is UTF-8. */
1470             only_utf8 = not_utf8;
1471             not_utf8 = NULL;
1472         }
1473 
1474         if (only_utf8_locale) {
1475             _invlist_invert(only_utf8_locale);
1476             _invlist_intersection(only_utf8_locale,
1477                                   PL_InBitmap,
1478                                   &only_utf8_locale);
1479         }
1480 
1481         inverted_display = put_charclass_bitmap_innards_common(
1482                                             invlist,
1483                                             posixes,
1484                                             only_utf8,
1485                                             not_utf8,
1486                                             only_utf8_locale, trial_invert);
1487 
1488         /* Use the shortest representation, taking into account our bias
1489          * against showing it inverted */
1490         if (   inverted_display
1491             && (   ! as_is_display
1492                 || (  SvCUR(inverted_display) + inverted_bias
1493                     < SvCUR(as_is_display)    + as_is_bias)))
1494         {
1495             sv_catsv(sv, inverted_display);
1496             invert = ! invert;
1497         }
1498         else if (as_is_display) {
1499             sv_catsv(sv, as_is_display);
1500         }
1501 
1502         SvREFCNT_dec(as_is_display);
1503         SvREFCNT_dec(inverted_display);
1504     }
1505 
1506     SvREFCNT_dec_NN(invlist);
1507     SvREFCNT_dec(only_utf8);
1508     SvREFCNT_dec(not_utf8);
1509     SvREFCNT_dec(posixes);
1510     SvREFCNT_dec(only_utf8_locale);
1511 
1512     U8 did_output_something = (bool) (SvCUR(sv) > orig_sv_cur);
1513     if (did_output_something) {
1514         /* Distinguish between non and inverted cases */
1515         did_output_something += invert;
1516     }
1517 
1518     return did_output_something;
1519 }
1520 
1521 
1522 const regnode *
Perl_dumpuntil(pTHX_ const regexp * r,const regnode * start,const regnode * node,const regnode * last,const regnode * plast,SV * sv,I32 indent,U32 depth)1523 Perl_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
1524             const regnode *last, const regnode *plast,
1525             SV* sv, I32 indent, U32 depth)
1526 {
1527     const regnode *next;
1528     const regnode *optstart= NULL;
1529 
1530     RXi_GET_DECL(r, ri);
1531     DECLARE_AND_GET_RE_DEBUG_FLAGS;
1532 
1533     PERL_ARGS_ASSERT_DUMPUNTIL;
1534 
1535 #ifdef DEBUG_DUMPUNTIL
1536     Perl_re_printf( aTHX_  "--- %d : %d - %d - %d\n", indent, node-start,
1537         last ? last-start : 0, plast ? plast-start : 0);
1538 #endif
1539 
1540     if (plast && plast < last)
1541         last= plast;
1542 
1543     while (node && (!last || node < last)) {
1544         const U8 op = OP(node);
1545 
1546         if (op == CLOSE || op == SRCLOSE || op == WHILEM)
1547             indent--;
1548         next = regnext((regnode *)node);
1549         const regnode *after = regnode_after((regnode *)node,0);
1550 
1551         /* Where, what. */
1552         if (op == OPTIMIZED) {
1553             if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
1554                 optstart = node;
1555             else
1556                 goto after_print;
1557         } else
1558             CLEAR_OPTSTART;
1559 
1560         regprop(r, sv, node, NULL, NULL);
1561         Perl_re_printf( aTHX_  "%4" IVdf ":%*s%s", (IV)(node - start),
1562                       (int)(2*indent + 1), "", SvPVX_const(sv));
1563 
1564         if (op != OPTIMIZED) {
1565             if (next == NULL)           /* Next ptr. */
1566                 Perl_re_printf( aTHX_  " (0)");
1567             else if (REGNODE_TYPE(op) == BRANCH
1568                      && REGNODE_TYPE(OP(next)) != BRANCH )
1569                 Perl_re_printf( aTHX_  " (FAIL)");
1570             else
1571                 Perl_re_printf( aTHX_  " (%" IVdf ")", (IV)(next - start));
1572             Perl_re_printf( aTHX_ "\n");
1573         }
1574 
1575       after_print:
1576         if (REGNODE_TYPE(op) == BRANCHJ) {
1577             assert(next);
1578             const regnode *nnode = (OP(next) == LONGJMP
1579                                    ? regnext((regnode *)next)
1580                                    : next);
1581             if (last && nnode > last)
1582                 nnode = last;
1583             DUMPUNTIL(after, nnode);
1584         }
1585         else if (REGNODE_TYPE(op) == BRANCH) {
1586             assert(next);
1587             DUMPUNTIL(after, next);
1588         }
1589         else if ( REGNODE_TYPE(op)  == TRIE ) {
1590             const regnode *this_trie = node;
1591             const U32 n = ARG1u(node);
1592             const reg_ac_data * const ac = op>=AHOCORASICK ?
1593                (reg_ac_data *)ri->data->data[n] :
1594                NULL;
1595             const reg_trie_data * const trie =
1596                 (reg_trie_data*)ri->data->data[op<AHOCORASICK ? n : ac->trie];
1597 #ifdef DEBUGGING
1598             AV *const trie_words
1599                            = MUTABLE_AV(ri->data->data[n + TRIE_WORDS_OFFSET]);
1600 #endif
1601             const regnode *nextbranch= NULL;
1602             I32 word_idx;
1603             SvPVCLEAR(sv);
1604             for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
1605                 SV ** const elem_ptr = av_fetch_simple(trie_words, word_idx, 0);
1606 
1607                 Perl_re_indentf( aTHX_  "%s ",
1608                     indent+3,
1609                     elem_ptr
1610                     ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr),
1611                                 SvCUR(*elem_ptr), PL_dump_re_max_len,
1612                                 PL_colors[0], PL_colors[1],
1613                                 (SvUTF8(*elem_ptr)
1614                                  ? PERL_PV_ESCAPE_UNI
1615                                  : 0)
1616                                 | PERL_PV_PRETTY_ELLIPSES
1617                                 | PERL_PV_PRETTY_LTGT
1618                             )
1619                     : "???"
1620                 );
1621                 if (trie->jump) {
1622                     U16 dist= trie->jump[word_idx+1];
1623                     Perl_re_printf( aTHX_  "(%" UVuf ")\n",
1624                                (UV)((dist ? this_trie + dist : next) - start));
1625                     if (dist) {
1626                         if (!nextbranch)
1627                             nextbranch= this_trie + trie->jump[0];
1628                         DUMPUNTIL(this_trie + dist, nextbranch);
1629                     }
1630                     if (nextbranch && REGNODE_TYPE(OP(nextbranch))==BRANCH)
1631                         nextbranch= regnext((regnode *)nextbranch);
1632                 } else {
1633                     Perl_re_printf( aTHX_  "\n");
1634                 }
1635             }
1636             if (last && next > last)
1637                 node= last;
1638             else
1639                 node= next;
1640         }
1641         else if ( op == CURLY ) {   /* "next" might be very big: optimizer */
1642             DUMPUNTIL(after, after + 1); /* +1 is NOT a REGNODE_AFTER */
1643         }
1644         else if (REGNODE_TYPE(op) == CURLY && op != CURLYX) {
1645             assert(next);
1646             DUMPUNTIL(after, next);
1647         }
1648         else if ( op == PLUS || op == STAR) {
1649             DUMPUNTIL(after, after + 1); /* +1 NOT a REGNODE_AFTER */
1650         }
1651         else if (REGNODE_TYPE(op) == EXACT || op == ANYOFHs) {
1652             /* Literal string, where present. */
1653             node = (const regnode *)REGNODE_AFTER_varies(node);
1654         }
1655         else {
1656             node = REGNODE_AFTER_opcode(node,op);
1657         }
1658         if (op == CURLYX || op == OPEN || op == SROPEN)
1659             indent++;
1660         if (REGNODE_TYPE(op) == END)
1661             break;
1662     }
1663     CLEAR_OPTSTART;
1664 #ifdef DEBUG_DUMPUNTIL
1665     Perl_re_printf( aTHX_  "--- %d\n", (int)indent);
1666 #endif
1667     return node;
1668 }
1669 
1670 #endif  /* DEBUGGING */
1671