package re;

# pragma for controlling the regexp engine
use strict;
use warnings;

our $VERSION     = "0.43";
our @ISA         = qw(Exporter);
our @EXPORT_OK   = qw{
    is_regexp regexp_pattern
    regname regnames regnames_count
    regmust optimization
};
our %EXPORT_OK = map { $_ => 1 } @EXPORT_OK;

# Hint bits returned by bits() for the simple on/off subpragmas; import()
# ORs them into $^H and unimport() clears them.
my %bitmask = (
    taint   => 0x00100000, # HINT_RE_TAINT
    eval    => 0x00200000, # HINT_RE_EVAL
);

my $flags_hint = 0x02000000; # HINT_RE_FLAGS
my $PMMOD_SHIFT = 0;

# Values stored in $^H{reflags} (the single-bit entries) and in
# $^H{reflags_charset} (the small integers under "special cases") by the
# '/flags' and 'strict' subpragmas.
my %reflags = (
    m => 1 << ($PMMOD_SHIFT + 0),
    s => 1 << ($PMMOD_SHIFT + 1),
    i => 1 << ($PMMOD_SHIFT + 2),
    x => 1 << ($PMMOD_SHIFT + 3),
   xx => 1 << ($PMMOD_SHIFT + 4),
    n => 1 << ($PMMOD_SHIFT + 5),
    p => 1 << ($PMMOD_SHIFT + 6),
    strict => 1 << ($PMMOD_SHIFT + 10),
# special cases:
    d => 0,
    l => 1,
    u => 2,
    a => 3,
    aa => 4,
);

# setcolor()
#
# Fills $ENV{PERL_RE_COLORS} with a tab-separated list of terminal control
# strings, one per termcap property named in $ENV{PERL_RE_TC} (default
# 'md,me,so,se,us,ue'), with NUL characters stripped.  Takes no arguments.
# Any failure inside the eval (for example Term::Cap being unavailable) is
# deliberately swallowed; in that case a plain-text fallback is installed
# unless PERL_RE_COLORS already holds a true value.
sub setcolor {
    eval {                              # Ignore errors
        require Term::Cap;

        # Explicit class-method call.  The indirect object form
        # ("Tgetent Term::Cap (...)") is ambiguous to the parser and is
        # disabled altogether under "use v5.36".
        my $terminal = Term::Cap->Tgetent({OSPEED => 9600}); # Avoid warning.
        my $props = $ENV{PERL_RE_TC} || 'md,me,so,se,us,ue';
        my @props = split /,/, $props;
        my $colors = join "\t", map {$terminal->Tputs($_,1)} @props;

        $colors =~ s/\0//g;
        $ENV{PERL_RE_COLORS} = $colors;
    };
    if ($@) {
        $ENV{PERL_RE_COLORS} ||= qq'\t\t> <\t> <\t\t';
    }

    return;
}

# Names accepted after "Debug"/"Debugcolor", mapped to the bits they set or
# clear in ${^RE_DEBUG_FLAGS}.
my %flags = (
    COMPILE           => 0x0000FF,
    PARSE             => 0x000001,
    OPTIMISE          => 0x000002,
    TRIEC             => 0x000004,
    DUMP              => 0x000008,
    FLAGS             => 0x000010,
    TEST              => 0x000020,

    EXECUTE           => 0x00FF00,
    INTUIT            => 0x000100,
    MATCH             => 0x000200,
    TRIEE             => 0x000400,

    EXTRA             => 0x3FF0000,
    TRIEM             => 0x0010000,
    STATE             => 0x0080000,
    OPTIMISEM         => 0x0100000,
    STACK             => 0x0280000,
    BUFFERS           => 0x0400000,
    GPOS              => 0x0800000,
    DUMP_PRE_OPTIMIZE => 0x1000000,
    WILDCARD          => 0x2000000,
);

# Convenience combinations.  ALL is every bit except the three listed here.
$flags{ALL} = -1 & ~($flags{BUFFERS}
                    |$flags{DUMP_PRE_OPTIMIZE}
                    |$flags{WILDCARD}
                    );
$flags{All} = $flags{all} = $flags{DUMP} | $flags{EXECUTE};
$flags{Extra} = $flags{EXECUTE} | $flags{COMPILE} | $flags{GPOS};
$flags{More} = $flags{MORE} =
                    $flags{All} | $flags{TRIEC} | $flags{TRIEM} | $flags{STATE};
$flags{State} = $flags{DUMP} | $flags{EXECUTE} | $flags{STATE};
$flags{TRIE} = $flags{DUMP} | $flags{EXECUTE} | $flags{TRIEC};

if (defined &DynaLoader::boot_DynaLoader) {
    require XSLoader;
    XSLoader::load();
}
# else we're miniperl
# We need to work for miniperl, because the XS toolchain uses Text::Wrap, which
# uses re 'taint'.

# _load_unload($on)
#
# With a true $on, stores the value returned by install() in $^H{regcomp};
# otherwise removes that key.  install() is not defined in this file --
# presumably it is supplied by the XS code loaded above (TODO confirm).
sub _load_unload {
    my ($on)= @_;
    if ($on) {
        # We call install() every time, as if we didn't, we wouldn't
        # "see" any changes to the color environment var since
        # the last time it was called.

        # install() returns an integer, which if casted properly
        # in C resolves to a structure containing the regexp
        # hooks. Setting it to a random integer will guarantee
        # segfaults.
        $^H{regcomp} = install();
    } else {
        delete $^H{regcomp};
    }
    return;
}

# bits($on, @subpragmas)
#
# Worker shared by import() ($on true) and unimport() ($on false).
#
# Returns the OR of the %bitmask entries for every 'taint'/'eval' argument.
# All other subpragmas act through side effects:
#   Debug/Debugcolor  sets or clears bits in ${^RE_DEBUG_FLAGS} for every
#                     remaining argument, then stops processing arguments
#   debug/debugcolor  sets the default debug flags and (un)installs the
#                     debugging hooks through _load_unload()
#   exported names    exported two levels up via Exporter
#   strict            toggles the strict bit in $^H{reflags}, and the
#                     'regexp' warnings category where needed
#   /flags            updates $^H{reflags} and $^H{reflags_charset}
# The flags-hint bit in $^H is kept in step with $^H{reflags}.  Unknown
# arguments produce a Carp::carp warning and are otherwise ignored.
#
# Called with no subpragmas and a false $on ("no re;"), it behaves as if
# 'taint', 'eval' and 'strict' had been given, and additionally unloads the
# debugging hooks and resets the regexp flag hints.
sub bits {
    my $on = shift;
    my $bits = 0;
    my $turning_all_off = ! @_ && ! $on;
    my $seen_Debug = 0;
    my $seen_debug = 0;
    if ($turning_all_off) {

        # Pretend we were called with certain parameters, which are best
        # dealt with that way.
        push @_, keys %bitmask; # taint and eval
        push @_, 'strict';
    }

    # Process each subpragma parameter
  ARG:
    foreach my $idx (0..$#_){
        my $s=$_[$idx];
        if ($s eq 'Debug' or $s eq 'Debugcolor') {
            if (! $seen_Debug) {
                $seen_Debug = 1;

                # Reset to nothing, and then add what follows.  $seen_Debug
                # allows, though unlikely someone would do it, more than one
                # Debug and flags in the arguments
                ${^RE_DEBUG_FLAGS} = 0;
            }
            setcolor() if $s =~/color/i;

            # Everything after Debug is a debug flag name.  The loop variable
            # has its own name so it no longer shadows the outer $idx.
            for my $flag_idx ($idx+1..$#_) {
                if ($flags{$_[$flag_idx]}) {
                    if ($on) {
                        ${^RE_DEBUG_FLAGS} |= $flags{$_[$flag_idx]};
                    } else {
                        ${^RE_DEBUG_FLAGS} &= ~ $flags{$_[$flag_idx]};
                    }
                } else {
                    require Carp;
                    Carp::carp("Unknown \"re\" Debug flag '$_[$flag_idx]', possible flags: ",
                               join(", ",sort keys %flags ) );
                }
            }

            # When turning flags off, keep the hooks installed as long as
            # any debug flag is still set.
            _load_unload($on ? 1 : ${^RE_DEBUG_FLAGS});
            last;
        } elsif ($s eq 'debug' or $s eq 'debugcolor') {

            # These default flags should be kept in sync with the same values
            # in regcomp.h
            ${^RE_DEBUG_FLAGS} = $flags{'EXECUTE'} | $flags{'DUMP'};
            setcolor() if $s =~/color/i;
            _load_unload($on);
            $seen_debug = 1;
        } elsif (exists $bitmask{$s}) {
            $bits |= $bitmask{$s};
        } elsif ($EXPORT_OK{$s}) {
            require Exporter;
            re->export_to_level(2, 're', $s);
        } elsif ($s eq 'strict') {
            if ($on) {
                $^H{reflags} |= $reflags{$s};
                warnings::warnif('experimental::re_strict',
                                 "\"use re 'strict'\" is experimental");

                # Turn on warnings if not already done.
                if (! warnings::enabled('regexp')) {
                    require warnings;
                    warnings->import('regexp');
                    $^H{re_strict} = 1;
                }
            }
            else {
                $^H{reflags} &= ~$reflags{$s} if $^H{reflags};

                # Turn off warnings if we turned them on.
                warnings->unimport('regexp') if $^H{re_strict};
            }
            if ($^H{reflags}) {
                $^H |= $flags_hint;
            }
            else {
                $^H &= ~$flags_hint;
            }
        } elsif ($s =~ s/^\///) {
            my $reflags = $^H{reflags} || 0;
            my $seen_charset;
            my $x_count = 0;
            while ($s =~ m/( . )/gx) {
                local $_ = $1;
                if (/[adul]/) {
                    # The 'a' may be repeated; hide this from the rest of the
                    # code by counting and getting rid of all of them, then
                    # changing to 'aa' if there is a repeat.
                    if ($_ eq 'a') {
                        my $sav_pos = pos $s;
                        my $a_count = $s =~ s/a//g;
                        pos $s = $sav_pos - 1;  # -1 because got rid of the 'a'
                        if ($a_count > 2) {
                            require Carp;
                            Carp::carp(
                            qq 'The "a" flag may only appear a maximum of twice'
                            );
                        }
                        elsif ($a_count == 2) {
                            $_ = 'aa';
                        }
                    }
                    if ($on) {
                        if ($seen_charset) {
                            require Carp;
                            if ($seen_charset ne $_) {
                                Carp::carp(
                                qq 'The "$seen_charset" and "$_" flags '
                                .qq 'are exclusive'
                                );
                            }
                            else {
                                Carp::carp(
                                qq 'The "$seen_charset" flag may not appear '
                                .qq 'twice'
                                );
                            }
                        }
                        $^H{reflags_charset} = $reflags{$_};
                        $seen_charset = $_;
                    }
                    else {
                        delete $^H{reflags_charset}
                            if defined $^H{reflags_charset}
                               && $^H{reflags_charset} == $reflags{$_};
                    }
                } elsif (exists $reflags{$_}) {
                    if ($_ eq 'x') {
                        $x_count++;
                        if ($x_count > 2) {
                            require Carp;
                            Carp::carp(
                            qq 'The "x" flag may only appear a maximum of twice'
                            );
                        }
                        elsif ($x_count == 2) {
                            $_ = 'xx';  # First time through got the /x
                        }
                    }

                    if ($on) {
                        $reflags |= $reflags{$_};
                    }
                    else {
                        $reflags &= ~$reflags{$_};
                    }
                } else {
                    require Carp;
                    Carp::carp(
                     qq'Unknown regular expression flag "$_"'
                    );

                    # Abandon this argument without storing $reflags.
                    next ARG;
                }
            }

            # Record the accumulated flags; the hint bit is on whenever any
            # flag or a character set is in force.
            $^H{reflags} = $reflags;
            if ($reflags or defined $^H{reflags_charset}) {
                $^H |= $flags_hint;
            }
            else {
                $^H &= ~$flags_hint;
            }
        } else {
            require Carp;
            if ($seen_debug && defined $flags{$s}) {
                Carp::carp("Use \"Debug\" not \"debug\", to list debug types"
                         . " in \"re\".  \"$s\" ignored");
            }
            else {
                Carp::carp("Unknown \"re\" subpragma '$s' (known ones are: ",
                       join(', ', map {qq('$_')} 'debug', 'debugcolor', sort keys %bitmask),
                       ")");
            }
        }
    }

    if ($turning_all_off) {
        _load_unload(0);
        $^H{reflags} = 0;
        $^H{reflags_charset} = 0;
        $^H &= ~$flags_hint;
    }

    return $bits;
}

# "use re LIST": apply each subpragma and switch on the returned hint bits.
sub import {
    shift;
    $^H |= bits(1, @_);
}

# "no re LIST": undo each subpragma and switch off the returned hint bits.
sub unimport {
    shift;
    $^H &= ~ bits(0, @_);
}

1;

__END__

=head1 NAME

re - Perl pragma to alter regular expression behaviour

=head1 SYNOPSIS

    use re 'taint';
    ($x) = ($^X =~ /^(.*)$/s);     # $x is tainted here

    $pat = '(?{ $foo = 1 })';
    use re 'eval';
    /foo${pat}bar/;                # won't fail (when not under -T
                                   # switch)

    {
        no re 'taint';             # the default
        ($x) = ($^X =~ /^(.*)$/s); # $x is not tainted here

        no re 'eval';              # the default
        /foo${pat}bar/;            # disallowed (with or without -T
                                   # switch)
    }

    use re 'strict';               # Raise warnings for more conditions

    use re '/ix';
    "FOO" =~ / foo /; # /ix implied
    no re '/x';
    "FOO" =~ /foo/; # just /i implied

    use re 'debug';                # output debugging info during
    /^(.*)$/s;                     # compile and run time


    use re 'debugcolor';           # same as 'debug', but with colored
                                   # output
    ...

    use re qw(Debug All);          # Same as "use re 'debug'", but you
                                   # can use "Debug" with things other
                                   # than 'All'
    use re qw(Debug More);         # 'All' plus output more details
    no re qw(Debug ALL);           # Turn off (almost) all re debugging
                                   # in this scope

    use re qw(is_regexp regexp_pattern); # import utility functions
    my ($pat,$mods)=regexp_pattern(qr/foo/i);
    if (is_regexp($obj)) {
        print "Got regexp: ",
            scalar regexp_pattern($obj); # just as perl would stringify
    }                                    # it but no hassle with blessed
                                         # re's.

(We use $^X in these examples because it's tainted by default.)

=head1 DESCRIPTION

=head2 'taint' mode

When C<use re 'taint'> is in effect, and a tainted string is the target
of a regexp, the regexp memories (or values returned by the m// operator
in list context) are tainted. This feature is useful when regexp operations
on tainted data aren't meant to extract safe substrings, but to perform
other transformations.

=head2 'eval' mode

When C<use re 'eval'> is in effect, a regexp is allowed to contain
C<(?{ ... })> zero-width assertions and C<(??{ ... })> postponed
subexpressions that are derived from variable interpolation, rather than
appearing literally within the regexp. That is normally disallowed, since
it is a potential security risk. Note that this pragma is ignored when the
regular expression is obtained from tainted data, i.e. evaluation is always
disallowed with tainted regular expressions. See L<perlre/(?{ code })>
and L<perlre/(??{ code })>.

For the purpose of this pragma, interpolation of precompiled regular
expressions (i.e., the result of C<qr//>) is I<not> considered variable
interpolation. Thus:

    /foo${pat}bar/

I<is> allowed if $pat is a precompiled regular expression, even
if $pat contains C<(?{ ... })> assertions or C<(??{ ... })> subexpressions.

=head2 'strict' mode

Note that this is an experimental feature which may be changed or removed in a
future Perl release.

When C<use re 'strict'> is in effect, stricter checks are applied than
otherwise when compiling regular expression patterns. These may cause more
warnings to be raised than otherwise, and more things to be fatal instead of
just warnings. The purpose of this is to find and report at compile time some
things, which may be legal, but have a reasonable possibility of not being the
programmer's actual intent. This automatically turns on the C<"regexp">
warnings category (if not already on) within its scope.

An example of something that is caught under C<'strict'>, but not
otherwise, is the pattern

    qr/\xABC/

The C<"\x"> construct without curly braces should be followed by exactly two
hex digits; this one is followed by three. This currently evaluates as
equivalent to

    qr/\x{AB}C/

that is, the character whose code point value is C<0xAB>, followed by the
letter C<C>. But since C<C> is a hex digit, there is a reasonable chance
that the intent was

    qr/\x{ABC}/

that is, the single character at C<0xABC>. Under C<'strict'> it is an error to
not follow C<\x> with exactly two hex digits. When not under C<'strict'> a
warning is generated if there is only one hex digit, and no warning is raised
if there are more than two.

It is expected that what exactly C<'strict'> does will evolve over time as we
gain experience with it. This means that programs that compile under it in
today's Perl may not compile, or may have more or fewer warnings, in future
Perls. There are no backwards compatibility promises with regard to it. Also
there are already proposals for an alternate syntax for enabling it. For
these reasons, using it will raise a C<experimental::re_strict> class warning,
unless that category is turned off.

Note that if a pattern compiled within C<'strict'> is recompiled, say by
interpolating into another pattern, outside of C<'strict'>, it is not checked
again for strictness. This is because if it works under strict it must work
under non-strict.

=head2 '/flags' mode

When C<use re '/I<flags>'> is specified, the given I<flags> are automatically
added to every regular expression till the end of the lexical scope.
I<flags> can be any combination of
C<'a'>,
C<'aa'>,
C<'d'>,
C<'i'>,
C<'l'>,
C<'m'>,
C<'n'>,
C<'p'>,
C<'s'>,
C<'u'>,
C<'x'>,
and/or
C<'xx'>.
465 466C<no re '/I<flags>'> will turn off the effect of C<use re '/I<flags>'> for the 467given flags. 468 469For example, if you want all your regular expressions to have /msxx on by 470default, simply put 471 472 use re '/msxx'; 473 474at the top of your code. 475 476The character set C</adul> flags cancel each other out. So, in this example, 477 478 use re "/u"; 479 "ss" =~ /\xdf/; 480 use re "/d"; 481 "ss" =~ /\xdf/; 482 483the second C<use re> does an implicit C<no re '/u'>. 484 485Similarly, 486 487 use re "/xx"; # Doubled-x 488 ... 489 use re "/x"; # Single x from here on 490 ... 491 492Turning on one of the character set flags with C<use re> takes precedence over the 493C<locale> pragma and the 'unicode_strings' C<feature>, for regular 494expressions. Turning off one of these flags when it is active reverts to 495the behaviour specified by whatever other pragmata are in scope. For 496example: 497 498 use feature "unicode_strings"; 499 no re "/u"; # does nothing 500 use re "/l"; 501 no re "/l"; # reverts to unicode_strings behaviour 502 503=head2 'debug' mode 504 505When C<use re 'debug'> is in effect, perl emits debugging messages when 506compiling and using regular expressions. The output is the same as that 507obtained by running a C<-DDEBUGGING>-enabled perl interpreter with the 508B<-Dr> switch. It may be quite voluminous depending on the complexity 509of the match. Using C<debugcolor> instead of C<debug> enables a 510form of output that can be used to get a colorful display on terminals 511that understand termcap color sequences. Set C<$ENV{PERL_RE_TC}> to a 512comma-separated list of C<termcap> properties to use for highlighting 513strings on/off, pre-point part on/off. 514See L<perldebug/"Debugging Regular Expressions"> for additional info. 515 516B<NOTE> that the exact format of the C<debug> mode is B<NOT> considered 517to be an officially supported API of Perl. 
It is intended for debugging 518only and may change as the core development team deems appropriate 519without notice or deprecation in any release of Perl, major or minor. 520Any documentation of the output is purely advisory. 521 522As of 5.9.5 the directive C<use re 'debug'> and its equivalents are 523lexically scoped, as the other directives are. However they have both 524compile-time and run-time effects. 525 526See L<perlmodlib/Pragmatic Modules>. 527 528=head2 'Debug' mode 529 530Similarly C<use re 'Debug'> produces debugging output, the difference 531being that it allows the fine tuning of what debugging output will be 532emitted. Options are divided into three groups, those related to 533compilation, those related to execution and those related to special 534purposes. 535 536B<NOTE> that the options provided under the C<Debug> mode and the exact 537format of the output they create is B<NOT> considered to be an 538officially supported API of Perl. It is intended for debugging only and 539may change as the core development team deems appropriate without notice 540or deprecation in any release of Perl, major or minor. Any documentation 541of the format or options available is advisory only and is subject to 542change without notice. 543 544The options are as follows: 545 546=over 4 547 548=item Compile related options 549 550=over 4 551 552=item COMPILE 553 554Turns on all non-extra compile related debug options. 555 556=item PARSE 557 558Turns on debug output related to the process of parsing the pattern. 559 560=item OPTIMISE 561 562Enables output related to the optimisation phase of compilation. 563 564=item TRIEC 565 566Detailed info about trie compilation. 567 568=item DUMP 569 570Dump the final program out after it is compiled and optimised. 

=item FLAGS

Dump the flags associated with the program.

=item TEST

Print output intended for testing the internals of the compile process.

=back

=item Execute related options

=over 4

=item EXECUTE

Turns on all non-extra execute related debug options.

=item MATCH

Turns on debugging of the main matching loop.

=item TRIEE

Extra debugging of how tries execute.

=item INTUIT

Enable debugging of start-point optimisations.

=back

=item Extra debugging options

=over 4

=item EXTRA

Turns on all "extra" debugging options.

=item BUFFERS

Enable debugging the capture group storage during match. Warning,
this can potentially produce extremely large output.

=item TRIEM

Enable enhanced TRIE debugging. Enhances both TRIEE
and TRIEC.

=item STATE

Enable debugging of states in the engine.

=item STACK

Enable debugging of the recursion stack in the engine. Enabling
or disabling this option automatically does the same for debugging
states as well. The output from this can be quite large.

=item GPOS

Enable debugging of the \G modifier.

=item OPTIMISEM

Enable enhanced optimisation debugging and start-point optimisations.
Probably not useful except when debugging the regexp engine itself.

=item DUMP_PRE_OPTIMIZE

Enable the dumping of the compiled pattern before the optimization phase.

=item WILDCARD

When Perl encounters a wildcard subpattern, (see L<perlunicode/Wildcards in
Property Values>), it suspends compilation of the main pattern, compiles the
subpattern, and then matches that against all legal possibilities to determine
the actual code points the subpattern matches. After that it adds these to
the main pattern, and continues its compilation.
652 653You may very well want to see how your subpattern gets compiled, but it is 654likely of less use to you to see how Perl matches that against all the legal 655possibilities, as that is under control of Perl, not you. Therefore, the 656debugging information of the compilation portion is as specified by the other 657options, but the debugging output of the matching portion is normally 658suppressed. 659 660You can use the WILDCARD option to enable the debugging output of this 661subpattern matching. Careful! This can lead to voluminous outputs, and it 662may not make much sense to you what and why Perl is doing what it is. 663But it may be helpful to you to see why things aren't going the way you 664expect. 665 666Note that this option alone doesn't cause any debugging information to be 667output. What it does is stop the normal suppression of execution-related 668debugging information during the matching portion of the compilation of 669wildcards. You also have to specify which execution debugging information you 670want, such as by also including the EXECUTE option. 671 672=back 673 674=item Other useful flags 675 676These are useful shortcuts to save on the typing. 677 678=over 4 679 680=item ALL 681 682Enable all options at once except BUFFERS, WILDCARD, and DUMP_PRE_OPTIMIZE. 683(To get every single option without exception, use both ALL and EXTRA, or 684starting in 5.30 on a C<-DDEBUGGING>-enabled perl interpreter, use 685the B<-Drv> command-line switches.) 686 687=item All 688 689Enable DUMP and all non-extra execute options. Equivalent to: 690 691 use re 'debug'; 692 693=item MORE 694 695=item More 696 697Enable the options enabled by "All", plus STATE, TRIEC, and TRIEM. 698 699=back 700 701=back 702 703As of 5.9.5 the directive C<use re 'debug'> and its equivalents are 704lexically scoped, as are the other directives. However they have both 705compile-time and run-time effects. 
706 707=head2 Exportable Functions 708 709As of perl 5.9.5 're' debug contains a number of utility functions that 710may be optionally exported into the caller's namespace. They are listed 711below. 712 713=over 4 714 715=item is_regexp($ref) 716 717Returns true if the argument is a compiled regular expression as returned 718by C<qr//>, false if it is not. 719 720This function will not be confused by overloading or blessing. In 721internals terms, this extracts the regexp pointer out of the 722PERL_MAGIC_qr structure so it cannot be fooled. 723 724=item regexp_pattern($ref) 725 726If the argument is a compiled regular expression as returned by C<qr//>, 727then this function returns the pattern. 728 729In list context it returns a two element list, the first element 730containing the pattern and the second containing the modifiers used when 731the pattern was compiled. 732 733 my ($pat, $mods) = regexp_pattern($ref); 734 735In scalar context it returns the same as perl would when stringifying a raw 736C<qr//> with the same pattern inside. If the argument is not a compiled 737reference then this routine returns false but defined in scalar context, 738and the empty list in list context. Thus the following 739 740 if (regexp_pattern($ref) eq '(?^i:foo)') 741 742will be warning free regardless of what $ref actually is. 743 744Like C<is_regexp> this function will not be confused by overloading 745or blessing of the object. 746 747=item regname($name,$all) 748 749Returns the contents of a named buffer of the last successful match. If 750$all is true, then returns an array ref containing one entry per buffer, 751otherwise returns the first defined buffer. 752 753=item regnames($all) 754 755Returns a list of all of the named buffers defined in the last successful 756match. If $all is true, then it returns all names defined, if not it returns 757only names which were involved in the match. 
758 759=item regnames_count() 760 761Returns the number of distinct names defined in the pattern used 762for the last successful match. 763 764B<Note:> this result is always the actual number of distinct 765named buffers defined, it may not actually match that which is 766returned by C<regnames()> and related routines when those routines 767have not been called with the $all parameter set. 768 769=item regmust($ref) 770 771If the argument is a compiled regular expression as returned by C<qr//>, 772then this function returns what the optimiser considers to be the longest 773anchored fixed string and longest floating fixed string in the pattern. 774 775A I<fixed string> is defined as being a substring that must appear for the 776pattern to match. An I<anchored fixed string> is a fixed string that must 777appear at a particular offset from the beginning of the match. A I<floating 778fixed string> is defined as a fixed string that can appear at any point in 779a range of positions relative to the start of the match. For example, 780 781 my $qr = qr/here .* there/x; 782 my ($anchored, $floating) = regmust($qr); 783 print "anchored:'$anchored'\nfloating:'$floating'\n"; 784 785results in 786 787 anchored:'here' 788 floating:'there' 789 790Because the C<here> is before the C<.*> in the pattern, its position 791can be determined exactly. That's not true, however, for the C<there>; 792it could appear at any point after where the anchored string appeared. 793Perl uses both for its optimisations, preferring the longer, or, if they are 794equal, the floating. 795 796B<NOTE:> This may not necessarily be the definitive longest anchored and 797floating string. This will be what the optimiser of the Perl that you 798are using thinks is the longest. If you believe that the result is wrong 799please report it via the L<perlbug> utility. 
800 801=item optimization($ref) 802 803If the argument is a compiled regular expression as returned by C<qr//>, 804then this function returns a hashref of the optimization information 805discovered at compile time, so we can write tests around it. If any 806other argument is given, returns C<undef>. 807 808The hash contents are expected to change from time to time as we develop 809new ways to optimize - no assumption of stability should be made, not 810even between minor versions of perl. 811 812For the current version, the hash will have the following contents: 813 814=over 4 815 816=item minlen 817 818An integer, the least number of characters in any string that can match. 819 820=item minlenret 821 822An integer, the least number of characters that can be in C<$&> after a 823match. (Consider eg C< /ns(?=\d)/ >.) 824 825=item gofs 826 827An integer, the number of characters before C<pos()> to start match at. 828 829=item noscan 830 831A boolean, C<TRUE> to indicate that any anchored/floating substrings 832found should not be used. (CHECKME: apparently this is set for an 833anchored pattern with no floating substring, but never used.) 834 835=item isall 836 837A boolean, C<TRUE> to indicate that the optimizer information is all 838that the regular expression contains, and thus one does not need to 839enter the regexp runtime engine at all. 840 841=item anchor SBOL 842 843A boolean, C<TRUE> if the pattern is anchored to start of string. 844 845=item anchor MBOL 846 847A boolean, C<TRUE> if the pattern is anchored to any start of line 848within the string. 849 850=item anchor GPOS 851 852A boolean, C<TRUE> if the pattern is anchored to the end of the previous 853match. 854 855=item skip 856 857A boolean, C<TRUE> if the start class can match only the first of a run. 858 859=item implicit 860 861A boolean, C<TRUE> if a C</.*/> has been turned implicitly into a C</^.*/>. 
862 863=item anchored/floating 864 865A byte string representing an anchored or floating substring respectively 866that any match must contain, or undef if no such substring was found, or 867if the substring would require utf8 to represent. 868 869=item anchored utf8/floating utf8 870 871A utf8 string representing an anchored or floating substring respectively 872that any match must contain, or undef if no such substring was found, or 873if the substring contains only 7-bit ASCII characters. 874 875=item anchored min offset/floating min offset 876 877An integer, the first offset in characters from a match location at which 878we should look for the corresponding substring. 879 880=item anchored max offset/floating max offset 881 882An integer, the last offset in characters from a match location at which 883we should look for the corresponding substring. 884 885Ignored for anchored, so may be 0 or same as min. 886 887=item anchored end shift/floating end shift 888 889FIXME: not sure what this is, something to do with lookbehind. regcomp.c 890says: 891 When the final pattern is compiled and the data is moved from the 892 scan_data_t structure into the regexp structure the information 893 about lookbehind is factored in, with the information that would 894 have been lost precalculated in the end_shift field for the 895 associated string. 896 897=item checking 898 899A constant string, one of "anchored", "floating" or "none" to indicate 900which substring (if any) should be checked for first. 901 902=item stclass 903 904A string representation of a character class ("start class") that must 905be the first character of any match. 906 907TODO: explain the representations. 908 909=back 910 911=back 912 913=head1 SEE ALSO 914 915L<perlmodlib/Pragmatic Modules>. 916 917=cut 918