1package re; 2 3# pragma for controlling the regexp engine 4use strict; 5use warnings; 6 7our $VERSION = "0.37"; 8our @ISA = qw(Exporter); 9our @EXPORT_OK = ('regmust', 10 qw(is_regexp regexp_pattern 11 regname regnames regnames_count)); 12our %EXPORT_OK = map { $_ => 1 } @EXPORT_OK; 13 14my %bitmask = ( 15 taint => 0x00100000, # HINT_RE_TAINT 16 eval => 0x00200000, # HINT_RE_EVAL 17); 18 19my $flags_hint = 0x02000000; # HINT_RE_FLAGS 20my $PMMOD_SHIFT = 0; 21my %reflags = ( 22 m => 1 << ($PMMOD_SHIFT + 0), 23 s => 1 << ($PMMOD_SHIFT + 1), 24 i => 1 << ($PMMOD_SHIFT + 2), 25 x => 1 << ($PMMOD_SHIFT + 3), 26 xx => 1 << ($PMMOD_SHIFT + 4), 27 n => 1 << ($PMMOD_SHIFT + 5), 28 p => 1 << ($PMMOD_SHIFT + 6), 29 strict => 1 << ($PMMOD_SHIFT + 10), 30# special cases: 31 d => 0, 32 l => 1, 33 u => 2, 34 a => 3, 35 aa => 4, 36); 37 38sub setcolor { 39 eval { # Ignore errors 40 require Term::Cap; 41 42 my $terminal = Tgetent Term::Cap ({OSPEED => 9600}); # Avoid warning. 43 my $props = $ENV{PERL_RE_TC} || 'md,me,so,se,us,ue'; 44 my @props = split /,/, $props; 45 my $colors = join "\t", map {$terminal->Tputs($_,1)} @props; 46 47 $colors =~ s/\0//g; 48 $ENV{PERL_RE_COLORS} = $colors; 49 }; 50 if ($@) { 51 $ENV{PERL_RE_COLORS} ||= qq'\t\t> <\t> <\t\t'; 52 } 53 54} 55 56my %flags = ( 57 COMPILE => 0x0000FF, 58 PARSE => 0x000001, 59 OPTIMISE => 0x000002, 60 TRIEC => 0x000004, 61 DUMP => 0x000008, 62 FLAGS => 0x000010, 63 TEST => 0x000020, 64 65 EXECUTE => 0x00FF00, 66 INTUIT => 0x000100, 67 MATCH => 0x000200, 68 TRIEE => 0x000400, 69 70 EXTRA => 0xFF0000, 71 TRIEM => 0x010000, 72 OFFSETS => 0x020000, 73 OFFSETSDBG => 0x040000, 74 STATE => 0x080000, 75 OPTIMISEM => 0x100000, 76 STACK => 0x280000, 77 BUFFERS => 0x400000, 78 GPOS => 0x800000, 79); 80$flags{ALL} = -1 & ~($flags{OFFSETS}|$flags{OFFSETSDBG}|$flags{BUFFERS}); 81$flags{All} = $flags{all} = $flags{DUMP} | $flags{EXECUTE}; 82$flags{Extra} = $flags{EXECUTE} | $flags{COMPILE} | $flags{GPOS}; 83$flags{More} = $flags{MORE} = $flags{All} | $flags{TRIEC} | $flags{TRIEM} | $flags{STATE}; 84$flags{State} = $flags{DUMP} | $flags{EXECUTE} | $flags{STATE}; 85$flags{TRIE} = $flags{DUMP} | $flags{EXECUTE} | $flags{TRIEC}; 86 87if (defined &DynaLoader::boot_DynaLoader) { 88 require XSLoader; 89 XSLoader::load(); 90} 91# else we're miniperl 92# We need to work for miniperl, because the XS toolchain uses Text::Wrap, which 93# uses re 'taint'. 94 95sub _load_unload { 96 my ($on)= @_; 97 if ($on) { 98 # We call install() every time, as if we didn't, we wouldn't 99 # "see" any changes to the color environment var since 100 # the last time it was called. 101 102 # install() returns an integer, which if casted properly 103 # in C resolves to a structure containing the regexp 104 # hooks. Setting it to a random integer will guarantee 105 # segfaults. 106 $^H{regcomp} = install(); 107 } else { 108 delete $^H{regcomp}; 109 } 110} 111 112sub bits { 113 my $on = shift; 114 my $bits = 0; 115 my $turning_all_off = ! @_ && ! $on; 116 if ($turning_all_off) { 117 118 # Pretend were called with certain parameters, which are best dealt 119 # with that way. 120 push @_, keys %bitmask; # taint and eval 121 push @_, 'strict'; 122 } 123 124 # Process each subpragma parameter 125 ARG: 126 foreach my $idx (0..$#_){ 127 my $s=$_[$idx]; 128 if ($s eq 'Debug' or $s eq 'Debugcolor') { 129 setcolor() if $s =~/color/i; 130 ${^RE_DEBUG_FLAGS} = 0 unless defined ${^RE_DEBUG_FLAGS}; 131 for my $idx ($idx+1..$#_) { 132 if ($flags{$_[$idx]}) { 133 if ($on) { 134 ${^RE_DEBUG_FLAGS} |= $flags{$_[$idx]}; 135 } else { 136 ${^RE_DEBUG_FLAGS} &= ~ $flags{$_[$idx]}; 137 } 138 } else { 139 require Carp; 140 Carp::carp("Unknown \"re\" Debug flag '$_[$idx]', possible flags: ", 141 join(", ",sort keys %flags ) ); 142 } 143 } 144 _load_unload($on ? 1 : ${^RE_DEBUG_FLAGS}); 145 last; 146 } elsif ($s eq 'debug' or $s eq 'debugcolor') { 147 setcolor() if $s =~/color/i; 148 _load_unload($on); 149 last; 150 } elsif (exists $bitmask{$s}) { 151 $bits |= $bitmask{$s}; 152 } elsif ($EXPORT_OK{$s}) { 153 require Exporter; 154 re->export_to_level(2, 're', $s); 155 } elsif ($s eq 'strict') { 156 if ($on) { 157 $^H{reflags} |= $reflags{$s}; 158 warnings::warnif('experimental::re_strict', 159 "\"use re 'strict'\" is experimental"); 160 161 # Turn on warnings if not already done. 162 if (! warnings::enabled('regexp')) { 163 require warnings; 164 warnings->import('regexp'); 165 $^H{re_strict} = 1; 166 } 167 } 168 else { 169 $^H{reflags} &= ~$reflags{$s} if $^H{reflags}; 170 171 # Turn off warnings if we turned them on. 172 warnings->unimport('regexp') if $^H{re_strict}; 173 } 174 if ($^H{reflags}) { 175 $^H |= $flags_hint; 176 } 177 else { 178 $^H &= ~$flags_hint; 179 } 180 } elsif ($s =~ s/^\///) { 181 my $reflags = $^H{reflags} || 0; 182 my $seen_charset; 183 my $x_count = 0; 184 while ($s =~ m/( . )/gx) { 185 local $_ = $1; 186 if (/[adul]/) { 187 # The 'a' may be repeated; hide this from the rest of the 188 # code by counting and getting rid of all of them, then 189 # changing to 'aa' if there is a repeat. 190 if ($_ eq 'a') { 191 my $sav_pos = pos $s; 192 my $a_count = $s =~ s/a//g; 193 pos $s = $sav_pos - 1; # -1 because got rid of the 'a' 194 if ($a_count > 2) { 195 require Carp; 196 Carp::carp( 197 qq 'The "a" flag may only appear a maximum of twice' 198 ); 199 } 200 elsif ($a_count == 2) { 201 $_ = 'aa'; 202 } 203 } 204 if ($on) { 205 if ($seen_charset) { 206 require Carp; 207 if ($seen_charset ne $_) { 208 Carp::carp( 209 qq 'The "$seen_charset" and "$_" flags ' 210 .qq 'are exclusive' 211 ); 212 } 213 else { 214 Carp::carp( 215 qq 'The "$seen_charset" flag may not appear ' 216 .qq 'twice' 217 ); 218 } 219 } 220 $^H{reflags_charset} = $reflags{$_}; 221 $seen_charset = $_; 222 } 223 else { 224 delete $^H{reflags_charset} 225 if defined $^H{reflags_charset} 226 && $^H{reflags_charset} == $reflags{$_}; 227 } 228 } elsif (exists $reflags{$_}) { 229 if ($_ eq 'x') { 230 $x_count++; 231 if ($x_count > 2) { 232 require Carp; 233 Carp::carp( 234 qq 'The "x" flag may only appear a maximum of twice' 235 ); 236 } 237 elsif ($x_count == 2) { 238 $_ = 'xx'; # First time through got the /x 239 } 240 } 241 242 $on 243 ? $reflags |= $reflags{$_} 244 : ($reflags &= ~$reflags{$_}); 245 } else { 246 require Carp; 247 Carp::carp( 248 qq'Unknown regular expression flag "$_"' 249 ); 250 next ARG; 251 } 252 } 253 ($^H{reflags} = $reflags or defined $^H{reflags_charset}) 254 ? $^H |= $flags_hint 255 : ($^H &= ~$flags_hint); 256 } else { 257 require Carp; 258 Carp::carp("Unknown \"re\" subpragma '$s' (known ones are: ", 259 join(', ', map {qq('$_')} 'debug', 'debugcolor', sort keys %bitmask), 260 ")"); 261 } 262 } 263 264 if ($turning_all_off) { 265 _load_unload(0); 266 $^H{reflags} = 0; 267 $^H{reflags_charset} = 0; 268 $^H &= ~$flags_hint; 269 } 270 271 $bits; 272} 273 274sub import { 275 shift; 276 $^H |= bits(1, @_); 277} 278 279sub unimport { 280 shift; 281 $^H &= ~ bits(0, @_); 282} 283 2841; 285 286__END__ 287 288=head1 NAME 289 290re - Perl pragma to alter regular expression behaviour 291 292=head1 SYNOPSIS 293 294 use re 'taint'; 295 ($x) = ($^X =~ /^(.*)$/s); # $x is tainted here 296 297 $pat = '(?{ $foo = 1 })'; 298 use re 'eval'; 299 /foo${pat}bar/; # won't fail (when not under -T 300 # switch) 301 302 { 303 no re 'taint'; # the default 304 ($x) = ($^X =~ /^(.*)$/s); # $x is not tainted here 305 306 no re 'eval'; # the default 307 /foo${pat}bar/; # disallowed (with or without -T 308 # switch) 309 } 310 311 use re 'strict'; # Raise warnings for more conditions 312 313 use re '/ix'; 314 "FOO" =~ / foo /; # /ix implied 315 no re '/x'; 316 "FOO" =~ /foo/; # just /i implied 317 318 use re 'debug'; # output debugging info during 319 /^(.*)$/s; # compile and run time 320 321 322 use re 'debugcolor'; # same as 'debug', but with colored 323 # output 324 ... 325 326 use re qw(Debug All); # Same as "use re 'debug'", but you 327 # can use "Debug" with things other 328 # than 'All' 329 use re qw(Debug More); # 'All' plus output more details 330 no re qw(Debug ALL); # Turn on (almost) all re debugging 331 # in this scope 332 333 use re qw(is_regexp regexp_pattern); # import utility functions 334 my ($pat,$mods)=regexp_pattern(qr/foo/i); 335 if (is_regexp($obj)) { 336 print "Got regexp: ", 337 scalar regexp_pattern($obj); # just as perl would stringify 338 } # it but no hassle with blessed 339 # re's. 340 341(We use $^X in these examples because it's tainted by default.) 342 343=head1 DESCRIPTION 344 345=head2 'taint' mode 346 347When C<use re 'taint'> is in effect, and a tainted string is the target 348of a regexp, the regexp memories (or values returned by the m// operator 349in list context) are tainted. This feature is useful when regexp operations 350on tainted data aren't meant to extract safe substrings, but to perform 351other transformations. 352 353=head2 'eval' mode 354 355When C<use re 'eval'> is in effect, a regexp is allowed to contain 356C<(?{ ... })> zero-width assertions and C<(??{ ... })> postponed 357subexpressions that are derived from variable interpolation, rather than 358appearing literally within the regexp. That is normally disallowed, since 359it is a 360potential security risk. Note that this pragma is ignored when the regular 361expression is obtained from tainted data, i.e. evaluation is always 362disallowed with tainted regular expressions. See L<perlre/(?{ code })> 363and L<perlre/(??{ code })>. 364 365For the purpose of this pragma, interpolation of precompiled regular 366expressions (i.e., the result of C<qr//>) is I<not> considered variable 367interpolation. Thus: 368 369 /foo${pat}bar/ 370 371I<is> allowed if $pat is a precompiled regular expression, even 372if $pat contains C<(?{ ... })> assertions or C<(??{ ... })> subexpressions. 373 374=head2 'strict' mode 375 376Note that this is an experimental feature which may be changed or removed in a 377future Perl release. 378 379When C<use re 'strict'> is in effect, stricter checks are applied than 380otherwise when compiling regular expressions patterns. These may cause more 381warnings to be raised than otherwise, and more things to be fatal instead of 382just warnings. The purpose of this is to find and report at compile time some 383things, which may be legal, but have a reasonable possibility of not being the 384programmer's actual intent. This automatically turns on the C<"regexp"> 385warnings category (if not already on) within its scope. 386 387As an example of something that is caught under C<"strict'>, but not 388otherwise, is the pattern 389 390 qr/\xABC/ 391 392The C<"\x"> construct without curly braces should be followed by exactly two 393hex digits; this one is followed by three. This currently evaluates as 394equivalent to 395 396 qr/\x{AB}C/ 397 398that is, the character whose code point value is C<0xAB>, followed by the 399letter C<C>. But since C<C> is a a hex digit, there is a reasonable chance 400that the intent was 401 402 qr/\x{ABC}/ 403 404that is the single character at C<0xABC>. Under C<'strict'> it is an error to 405not follow C<\x> with exactly two hex digits. When not under C<'strict'> a 406warning is generated if there is only one hex digit, and no warning is raised 407if there are more than two. 408 409It is expected that what exactly C<'strict'> does will evolve over time as we 410gain experience with it. This means that programs that compile under it in 411today's Perl may not compile, or may have more or fewer warnings, in future 412Perls. There is no backwards compatibility promises with regards to it. Also 413there are already proposals for an alternate syntax for enabling it. For 414these reasons, using it will raise a C<experimental::re_strict> class warning, 415unless that category is turned off. 416 417Note that if a pattern compiled within C<'strict'> is recompiled, say by 418interpolating into another pattern, outside of C<'strict'>, it is not checked 419again for strictness. This is because if it works under strict it must work 420under non-strict. 421 422=head2 '/flags' mode 423 424When C<use re '/I<flags>'> is specified, the given I<flags> are automatically 425added to every regular expression till the end of the lexical scope. 426I<flags> can be any combination of 427C<'a'>, 428C<'aa'>, 429C<'d'>, 430C<'i'>, 431C<'l'>, 432C<'m'>, 433C<'n'>, 434C<'p'>, 435C<'s'>, 436C<'u'>, 437C<'x'>, 438and/or 439C<'xx'>. 440 441C<no re '/I<flags>'> will turn off the effect of C<use re '/I<flags>'> for the 442given flags. 443 444For example, if you want all your regular expressions to have /msxx on by 445default, simply put 446 447 use re '/msxx'; 448 449at the top of your code. 450 451The character set C</adul> flags cancel each other out. So, in this example, 452 453 use re "/u"; 454 "ss" =~ /\xdf/; 455 use re "/d"; 456 "ss" =~ /\xdf/; 457 458the second C<use re> does an implicit C<no re '/u'>. 459 460Similarly, 461 462 use re "/xx"; # Doubled-x 463 ... 464 use re "/x"; # Single x from here on 465 ... 466 467Turning on one of the character set flags with C<use re> takes precedence over the 468C<locale> pragma and the 'unicode_strings' C<feature>, for regular 469expressions. Turning off one of these flags when it is active reverts to 470the behaviour specified by whatever other pragmata are in scope. For 471example: 472 473 use feature "unicode_strings"; 474 no re "/u"; # does nothing 475 use re "/l"; 476 no re "/l"; # reverts to unicode_strings behaviour 477 478=head2 'debug' mode 479 480When C<use re 'debug'> is in effect, perl emits debugging messages when 481compiling and using regular expressions. The output is the same as that 482obtained by running a C<-DDEBUGGING>-enabled perl interpreter with the 483B<-Dr> switch. It may be quite voluminous depending on the complexity 484of the match. Using C<debugcolor> instead of C<debug> enables a 485form of output that can be used to get a colorful display on terminals 486that understand termcap color sequences. Set C<$ENV{PERL_RE_TC}> to a 487comma-separated list of C<termcap> properties to use for highlighting 488strings on/off, pre-point part on/off. 489See L<perldebug/"Debugging Regular Expressions"> for additional info. 490 491As of 5.9.5 the directive C<use re 'debug'> and its equivalents are 492lexically scoped, as the other directives are. However they have both 493compile-time and run-time effects. 494 495See L<perlmodlib/Pragmatic Modules>. 496 497=head2 'Debug' mode 498 499Similarly C<use re 'Debug'> produces debugging output, the difference 500being that it allows the fine tuning of what debugging output will be 501emitted. Options are divided into three groups, those related to 502compilation, those related to execution and those related to special 503purposes. The options are as follows: 504 505=over 4 506 507=item Compile related options 508 509=over 4 510 511=item COMPILE 512 513Turns on all compile related debug options. 514 515=item PARSE 516 517Turns on debug output related to the process of parsing the pattern. 518 519=item OPTIMISE 520 521Enables output related to the optimisation phase of compilation. 522 523=item TRIEC 524 525Detailed info about trie compilation. 526 527=item DUMP 528 529Dump the final program out after it is compiled and optimised. 530 531=item FLAGS 532 533Dump the flags associated with the program 534 535=item TEST 536 537Print output intended for testing the internals of the compile process 538 539=back 540 541=item Execute related options 542 543=over 4 544 545=item EXECUTE 546 547Turns on all execute related debug options. 548 549=item MATCH 550 551Turns on debugging of the main matching loop. 552 553=item TRIEE 554 555Extra debugging of how tries execute. 556 557=item INTUIT 558 559Enable debugging of start-point optimisations. 560 561=back 562 563=item Extra debugging options 564 565=over 4 566 567=item EXTRA 568 569Turns on all "extra" debugging options. 570 571=item BUFFERS 572 573Enable debugging the capture group storage during match. Warning, 574this can potentially produce extremely large output. 575 576=item TRIEM 577 578Enable enhanced TRIE debugging. Enhances both TRIEE 579and TRIEC. 580 581=item STATE 582 583Enable debugging of states in the engine. 584 585=item STACK 586 587Enable debugging of the recursion stack in the engine. Enabling 588or disabling this option automatically does the same for debugging 589states as well. This output from this can be quite large. 590 591=item GPOS 592 593Enable debugging of the \G modifier. 594 595=item OPTIMISEM 596 597Enable enhanced optimisation debugging and start-point optimisations. 598Probably not useful except when debugging the regexp engine itself. 599 600=item OFFSETS 601 602Dump offset information. This can be used to see how regops correlate 603to the pattern. Output format is 604 605 NODENUM:POSITION[LENGTH] 606 607Where 1 is the position of the first char in the string. Note that position 608can be 0, or larger than the actual length of the pattern, likewise length 609can be zero. 610 611=item OFFSETSDBG 612 613Enable debugging of offsets information. This emits copious 614amounts of trace information and doesn't mesh well with other 615debug options. 616 617Almost definitely only useful to people hacking 618on the offsets part of the debug engine. 619 620 621=back 622 623=item Other useful flags 624 625These are useful shortcuts to save on the typing. 626 627=over 4 628 629=item ALL 630 631Enable all options at once except OFFSETS, OFFSETSDBG and BUFFERS. 632(To get every single option without exception, use both ALL and EXTRA, or 633starting in 5.30 on a C<-DDEBUGGING>-enabled perl interpreter, use 634the B<-Drv> command-line switches.) 635 636=item All 637 638Enable DUMP and all execute options. Equivalent to: 639 640 use re 'debug'; 641 642=item MORE 643 644=item More 645 646Enable the options enabled by "All", plus STATE, TRIEC, and TRIEM. 647 648=back 649 650=back 651 652As of 5.9.5 the directive C<use re 'debug'> and its equivalents are 653lexically scoped, as are the other directives. However they have both 654compile-time and run-time effects. 655 656=head2 Exportable Functions 657 658As of perl 5.9.5 're' debug contains a number of utility functions that 659may be optionally exported into the caller's namespace. They are listed 660below. 661 662=over 4 663 664=item is_regexp($ref) 665 666Returns true if the argument is a compiled regular expression as returned 667by C<qr//>, false if it is not. 668 669This function will not be confused by overloading or blessing. In 670internals terms, this extracts the regexp pointer out of the 671PERL_MAGIC_qr structure so it cannot be fooled. 672 673=item regexp_pattern($ref) 674 675If the argument is a compiled regular expression as returned by C<qr//>, 676then this function returns the pattern. 677 678In list context it returns a two element list, the first element 679containing the pattern and the second containing the modifiers used when 680the pattern was compiled. 681 682 my ($pat, $mods) = regexp_pattern($ref); 683 684In scalar context it returns the same as perl would when stringifying a raw 685C<qr//> with the same pattern inside. If the argument is not a compiled 686reference then this routine returns false but defined in scalar context, 687and the empty list in list context. Thus the following 688 689 if (regexp_pattern($ref) eq '(?^i:foo)') 690 691will be warning free regardless of what $ref actually is. 692 693Like C<is_regexp> this function will not be confused by overloading 694or blessing of the object. 695 696=item regmust($ref) 697 698If the argument is a compiled regular expression as returned by C<qr//>, 699then this function returns what the optimiser considers to be the longest 700anchored fixed string and longest floating fixed string in the pattern. 701 702A I<fixed string> is defined as being a substring that must appear for the 703pattern to match. An I<anchored fixed string> is a fixed string that must 704appear at a particular offset from the beginning of the match. A I<floating 705fixed string> is defined as a fixed string that can appear at any point in 706a range of positions relative to the start of the match. For example, 707 708 my $qr = qr/here .* there/x; 709 my ($anchored, $floating) = regmust($qr); 710 print "anchored:'$anchored'\nfloating:'$floating'\n"; 711 712results in 713 714 anchored:'here' 715 floating:'there' 716 717Because the C<here> is before the C<.*> in the pattern, its position 718can be determined exactly. That's not true, however, for the C<there>; 719it could appear at any point after where the anchored string appeared. 720Perl uses both for its optimisations, preferring the longer, or, if they are 721equal, the floating. 722 723B<NOTE:> This may not necessarily be the definitive longest anchored and 724floating string. This will be what the optimiser of the Perl that you 725are using thinks is the longest. If you believe that the result is wrong 726please report it via the L<perlbug> utility. 727 728=item regname($name,$all) 729 730Returns the contents of a named buffer of the last successful match. If 731$all is true, then returns an array ref containing one entry per buffer, 732otherwise returns the first defined buffer. 733 734=item regnames($all) 735 736Returns a list of all of the named buffers defined in the last successful 737match. If $all is true, then it returns all names defined, if not it returns 738only names which were involved in the match. 739 740=item regnames_count() 741 742Returns the number of distinct names defined in the pattern used 743for the last successful match. 744 745B<Note:> this result is always the actual number of distinct 746named buffers defined, it may not actually match that which is 747returned by C<regnames()> and related routines when those routines 748have not been called with the $all parameter set. 749 750=back 751 752=head1 SEE ALSO 753 754L<perlmodlib/Pragmatic Modules>. 755 756=cut 757