1#!perl 2package CharClass::Matcher; 3use strict; 4use 5.008; 5use warnings; 6use warnings FATAL => 'all'; 7no warnings 'experimental::autoderef'; 8use Data::Dumper; 9$Data::Dumper::Useqq= 1; 10our $hex_fmt= "0x%02X"; 11 12sub DEBUG () { 0 } 13$|=1 if DEBUG; 14 15sub ASCII_PLATFORM { (ord('A') == 65) } 16 17require 'regen/regen_lib.pl'; 18 19=head1 NAME 20 21CharClass::Matcher -- Generate C macros that match character classes efficiently 22 23=head1 SYNOPSIS 24 25 perl Porting/regcharclass.pl 26 27=head1 DESCRIPTION 28 29Dynamically generates macros for detecting special charclasses 30in latin-1, utf8, and codepoint forms. Macros can be set to return 31the length (in bytes) of the matched codepoint, and/or the codepoint itself. 32 33To regenerate F<regcharclass.h>, run this script from perl-root. No arguments 34are necessary. 35 36Using WHATEVER as an example the following macros can be produced, depending 37on the input parameters (how to get each is described by internal comments at 38the C<__DATA__> line): 39 40=over 4 41 42=item C<is_WHATEVER(s,is_utf8)> 43 44=item C<is_WHATEVER_safe(s,e,is_utf8)> 45 46Do a lookup as appropriate based on the C<is_utf8> flag. When possible 47comparisons involving octect<128 are done before checking the C<is_utf8> 48flag, hopefully saving time. 49 50The version without the C<_safe> suffix should be used only when the input is 51known to be well-formed. 52 53=item C<is_WHATEVER_utf8(s)> 54 55=item C<is_WHATEVER_utf8_safe(s,e)> 56 57Do a lookup assuming the string is encoded in (normalized) UTF8. 58 59The version without the C<_safe> suffix should be used only when the input is 60known to be well-formed. 61 62=item C<is_WHATEVER_latin1(s)> 63 64=item C<is_WHATEVER_latin1_safe(s,e)> 65 66Do a lookup assuming the string is encoded in latin-1 (aka plan octets). 67 68The version without the C<_safe> suffix should be used only when it is known 69that C<s> contains at least one character. 70 71=item C<is_WHATEVER_cp(cp)> 72 73Check to see if the string matches a given codepoint (hypothetically a 74U32). The condition is constructed as to "break out" as early as 75possible if the codepoint is out of range of the condition. 76 77IOW: 78 79 (cp==X || (cp>X && (cp==Y || (cp>Y && ...)))) 80 81Thus if the character is X+1 only two comparisons will be done. Making 82matching lookups slower, but non-matching faster. 83 84=item C<what_len_WHATEVER_FOO(arg1, ..., len)> 85 86A variant form of each of the macro types described above can be generated, in 87which the code point is returned by the macro, and an extra parameter (in the 88final position) is added, which is a pointer for the macro to set the byte 89length of the returned code point. 90 91These forms all have a C<what_len> prefix instead of the C<is_>, for example 92C<what_len_WHATEVER_safe(s,e,is_utf8,len)> and 93C<what_len_WHATEVER_utf8(s,len)>. 94 95These forms should not be used I<except> on small sets of mostly widely 96separated code points; otherwise the code generated is inefficient. For these 97cases, it is best to use the C<is_> forms, and then find the code point with 98C<utf8_to_uvchr_buf>(). This program can fail with a "deep recursion" 99message on the worst of the inappropriate sets. Examine the generated macro 100to see if it is acceptable. 101 102=item C<what_WHATEVER_FOO(arg1, ...)> 103 104A variant form of each of the C<is_> macro types described above can be generated, in 105which the code point and not the length is returned by the macro. These have 106the same caveat as L</what_len_WHATEVER_FOO(arg1, ..., len)>, plus they should 107not be used where the set contains a NULL, as 0 is returned for two different 108cases: a) the set doesn't include the input code point; b) the set does 109include it, and it is a NULL. 110 111=back 112 113=head2 CODE FORMAT 114 115perltidy -st -bt=1 -bbt=0 -pt=0 -sbt=1 -ce -nwls== "%f" 116 117 118=head1 AUTHOR 119 120Author: Yves Orton (demerphq) 2007. Maintained by Perl5 Porters. 121 122=head1 BUGS 123 124No tests directly here (although the regex engine will fail tests 125if this code is broken). Insufficient documentation and no Getopts 126handler for using the module as a script. 127 128=head1 LICENSE 129 130You may distribute under the terms of either the GNU General Public 131License or the Artistic License, as specified in the README file. 132 133=cut 134 135# Sub naming convention: 136# __func : private subroutine, can not be called as a method 137# _func : private method, not meant for external use 138# func : public method. 139 140# private subs 141#------------------------------------------------------------------------------- 142# 143# ($cp,$n,$l,$u)=__uni_latin($str); 144# 145# Return a list of arrays, each of which when interpreted correctly 146# represent the string in some given encoding with specific conditions. 147# 148# $cp - list of codepoints that make up the string. 149# $n - list of octets that make up the string if all codepoints are invariant 150# regardless of if the string is in UTF-8 or not. 151# $l - list of octets that make up the string in latin1 encoding if all 152# codepoints < 256, and at least one codepoint is UTF-8 variant. 153# $u - list of octets that make up the string in utf8 if any codepoint is 154# UTF-8 variant 155# 156# High CP | Defined 157#-----------+---------- 158# 0 - 127 : $n (127/128 are the values for ASCII platforms) 159# 128 - 255 : $l, $u 160# 256 - ... : $u 161# 162 163sub __uni_latin1 { 164 my $str= shift; 165 my $max= 0; 166 my @cp; 167 my @cp_high; 168 my $only_has_invariants = 1; 169 for my $ch ( split //, $str ) { 170 my $cp= ord $ch; 171 push @cp, $cp; 172 push @cp_high, $cp if $cp > 255; 173 $max= $cp if $max < $cp; 174 if (! ASCII_PLATFORM && $only_has_invariants) { 175 if ($cp > 255) { 176 $only_has_invariants = 0; 177 } 178 else { 179 my $temp = chr($cp); 180 utf8::upgrade($temp); 181 my @utf8 = unpack "U0C*", $temp; 182 $only_has_invariants = (@utf8 == 1 && $utf8[0] == $cp); 183 } 184 } 185 } 186 my ( $n, $l, $u ); 187 $only_has_invariants = $max < 128 if ASCII_PLATFORM; 188 if ($only_has_invariants) { 189 $n= [@cp]; 190 } else { 191 $l= [@cp] if $max && $max < 256; 192 193 $u= $str; 194 utf8::upgrade($u); 195 $u= [ unpack "U0C*", $u ] if defined $u; 196 } 197 return ( \@cp, \@cp_high, $n, $l, $u ); 198} 199 200# 201# $clean= __clean($expr); 202# 203# Cleanup a ternary expression, removing unnecessary parens and apply some 204# simplifications using regexes. 205# 206 207sub __clean { 208 my ( $expr )= @_; 209 210 #return $expr; 211 212 our $parens; 213 $parens= qr/ (?> \( (?> (?: (?> [^()]+ ) | (??{ $parens }) )* ) \) ) /x; 214 215 ## remove redundant parens 216 1 while $expr =~ s/ \( \s* ( $parens ) \s* \) /$1/gx; 217 218 219 # repeatedly simplify conditions like 220 # ( (cond1) ? ( (cond2) ? X : Y ) : Y ) 221 # into 222 # ( ( (cond1) && (cond2) ) ? X : Y ) 223 # Also similarly handles expressions like: 224 # : (cond1) ? ( (cond2) ? X : Y ) : Y ) 225 # Note the inclusion of the close paren in ([:()]) and the open paren in ([()]) is 226 # purely to ensure we have a balanced set of parens in the expression which makes 227 # it easier to understand the pattern in an editor that understands paren's, we do 228 # not expect either of these cases to actually fire. - Yves 229 1 while $expr =~ s/ 230 ([:()]) \s* 231 ($parens) \s* 232 \? \s* 233 \( \s* ($parens) \s* 234 \? \s* ($parens|[^()?:\s]+?) \s* 235 : \s* ($parens|[^()?:\s]+?) \s* 236 \) \s* 237 : \s* \5 \s* 238 ([()]) 239 /$1 ( $2 && $3 ) ? $4 : $5 $6/gx; 240 #$expr=~s/\(\(U8\*\)s\)\[(\d+)\]/S$1/g if length $expr > 8000; 241 #$expr=~s/\s+//g if length $expr > 8000; 242 243 die "Expression too long" if length $expr > 8000; 244 245 return $expr; 246} 247 248# 249# $text= __macro(@args); 250# Join args together by newlines, and then neatly add backslashes to the end 251# of every line as expected by the C pre-processor for #define's. 252# 253 254sub __macro { 255 my $str= join "\n", @_; 256 $str =~ s/\s*$//; 257 my @lines= map { s/\s+$//; s/\t/ /g; $_ } split /\n/, $str; 258 my $last= pop @lines; 259 $str= join "\n", ( map { sprintf "%-76s\\", $_ } @lines ), $last; 260 1 while $str =~ s/^(\t*) {8}/$1\t/gm; 261 return $str . "\n"; 262} 263 264# 265# my $op=__incrdepth($op); 266# 267# take an 'op' hashref and add one to it and all its childrens depths. 268# 269 270sub __incrdepth { 271 my $op= shift; 272 return unless ref $op; 273 $op->{depth} += 1; 274 __incrdepth( $op->{yes} ); 275 __incrdepth( $op->{no} ); 276 return $op; 277} 278 279# join two branches of an opcode together with a condition, incrementing 280# the depth on the yes branch when we do so. 281# returns the new root opcode of the tree. 282sub __cond_join { 283 my ( $cond, $yes, $no )= @_; 284 if (ref $yes) { 285 return { 286 test => $cond, 287 yes => __incrdepth( $yes ), 288 no => $no, 289 depth => 0, 290 }; 291 } 292 else { 293 return { 294 test => $cond, 295 yes => $yes, 296 no => __incrdepth($no), 297 depth => 0, 298 }; 299 } 300} 301 302# Methods 303 304# constructor 305# 306# my $obj=CLASS->new(op=>'SOMENAME',title=>'blah',txt=>[..]); 307# 308# Create a new CharClass::Matcher object by parsing the text in 309# the txt array. Currently applies the following rules: 310# 311# Element starts with C<0x>, line is evaled the result treated as 312# a number which is passed to chr(). 313# 314# Element starts with C<">, line is evaled and the result treated 315# as a string. 316# 317# Each string is then stored in the 'strs' subhash as a hash record 318# made up of the results of __uni_latin1, using the keynames 319# 'low','latin1','utf8', as well as the synthesized 'LATIN1', 'high', and 320# 'UTF8' which hold a merge of 'low' and their lowercase equivalents. 321# 322# Size data is tracked per type in the 'size' subhash. 323# 324# Return an object 325# 326sub new { 327 my $class= shift; 328 my %opt= @_; 329 for ( qw(op txt) ) { 330 die "in " . __PACKAGE__ . " constructor '$_;' is a mandatory field" 331 if !exists $opt{$_}; 332 } 333 334 my $self= bless { 335 op => $opt{op}, 336 title => $opt{title} || '', 337 }, $class; 338 foreach my $txt ( @{ $opt{txt} } ) { 339 my $str= $txt; 340 if ( $str =~ /^[""]/ ) { 341 $str= eval $str; 342 } elsif ($str =~ / - /x ) { # A range: Replace this element on the 343 # list with its expansion 344 my ($lower, $upper) = $str =~ / 0x (.+?) \s* - \s* 0x (.+) /x; 345 die "Format must be like '0xDEAD - 0xBEAF'; instead was '$str'" if ! defined $lower || ! defined $upper; 346 foreach my $cp (hex $lower .. hex $upper) { 347 push @{$opt{txt}}, sprintf "0x%X", $cp; 348 } 349 next; 350 } elsif ($str =~ s/ ^ N (?= 0x ) //x ) { 351 # Otherwise undocumented, a leading N means is already in the 352 # native character set; don't convert. 353 $str= chr eval $str; 354 } elsif ( $str =~ /^0x/ ) { 355 $str= eval $str; 356 357 # Convert from Unicode/ASCII to native, if necessary 358 $str = utf8::unicode_to_native($str) if ! ASCII_PLATFORM 359 && $str <= 0xFF; 360 $str = chr $str; 361 } elsif ( $str =~ / \s* \\p \{ ( .*? ) \} /x) { 362 my $property = $1; 363 use Unicode::UCD qw(prop_invlist); 364 365 my @invlist = prop_invlist($property, '_perl_core_internal_ok'); 366 if (! @invlist) { 367 368 # An empty return could mean an unknown property, or merely 369 # that it is empty. Call in scalar context to differentiate 370 my $count = prop_invlist($property, '_perl_core_internal_ok'); 371 die "$property not found" unless defined $count; 372 } 373 374 # Replace this element on the list with the property's expansion 375 for (my $i = 0; $i < @invlist; $i += 2) { 376 foreach my $cp ($invlist[$i] .. $invlist[$i+1] - 1) { 377 378 # prop_invlist() returns native values; add leading 'N' 379 # to indicate that. 380 push @{$opt{txt}}, sprintf "N0x%X", $cp; 381 } 382 } 383 next; 384 } elsif ($str =~ / ^ do \s+ ( .* ) /x) { 385 die "do '$1' failed: $!$@" if ! do $1 or $@; 386 next; 387 } elsif ($str =~ / ^ & \s* ( .* ) /x) { # user-furnished sub() call 388 my @results = eval "$1"; 389 die "eval '$1' failed: $@" if $@; 390 push @{$opt{txt}}, @results; 391 next; 392 } else { 393 die "Unparsable line: $txt\n"; 394 } 395 my ( $cp, $cp_high, $low, $latin1, $utf8 )= __uni_latin1( $str ); 396 my $UTF8= $low || $utf8; 397 my $LATIN1= $low || $latin1; 398 my $high = (scalar grep { $_ < 256 } @$cp) ? 0 : $utf8; 399 #die Dumper($txt,$cp,$low,$latin1,$utf8) 400 # if $txt=~/NEL/ or $utf8 and @$utf8>3; 401 402 @{ $self->{strs}{$str} }{qw( str txt low utf8 latin1 high cp cp_high UTF8 LATIN1 )}= 403 ( $str, $txt, $low, $utf8, $latin1, $high, $cp, $cp_high, $UTF8, $LATIN1 ); 404 my $rec= $self->{strs}{$str}; 405 foreach my $key ( qw(low utf8 latin1 high cp cp_high UTF8 LATIN1) ) { 406 $self->{size}{$key}{ 0 + @{ $self->{strs}{$str}{$key} } }++ 407 if $self->{strs}{$str}{$key}; 408 } 409 $self->{has_multi} ||= @$cp > 1; 410 $self->{has_ascii} ||= $latin1 && @$latin1; 411 $self->{has_low} ||= $low && @$low; 412 $self->{has_high} ||= !$low && !$latin1; 413 } 414 $self->{val_fmt}= $hex_fmt; 415 $self->{count}= 0 + keys %{ $self->{strs} }; 416 return $self; 417} 418 419# my $trie = make_trie($type,$maxlen); 420# 421# using the data stored in the object build a trie of a specific type, 422# and with specific maximum depth. The trie is made up the elements of 423# the given types array for each string in the object (assuming it is 424# not too long.) 425# 426# returns the trie, or undef if there was no relevant data in the object. 427# 428 429sub make_trie { 430 my ( $self, $type, $maxlen )= @_; 431 432 my $strs= $self->{strs}; 433 my %trie; 434 foreach my $rec ( values %$strs ) { 435 die "panic: unknown type '$type'" 436 if !exists $rec->{$type}; 437 my $dat= $rec->{$type}; 438 next unless $dat; 439 next if $maxlen && @$dat > $maxlen; 440 my $node= \%trie; 441 foreach my $elem ( @$dat ) { 442 $node->{$elem} ||= {}; 443 $node= $node->{$elem}; 444 } 445 $node->{''}= $rec->{str}; 446 } 447 return 0 + keys( %trie ) ? \%trie : undef; 448} 449 450sub pop_count ($) { 451 my $word = shift; 452 453 # This returns a list of the positions of the bits in the input word that 454 # are 1. 455 456 my @positions; 457 my $position = 0; 458 while ($word) { 459 push @positions, $position if $word & 1; 460 $position++; 461 $word >>= 1; 462 } 463 return @positions; 464} 465 466# my $optree= _optree() 467# 468# recursively convert a trie to an optree where every node represents 469# an if else branch. 470# 471# 472 473sub _optree { 474 my ( $self, $trie, $test_type, $ret_type, $else, $depth )= @_; 475 return unless defined $trie; 476 if ( $self->{has_multi} and $ret_type =~ /cp|both/ ) { 477 die "Can't do 'cp' optree from multi-codepoint strings"; 478 } 479 $ret_type ||= 'len'; 480 $else= 0 unless defined $else; 481 $depth= 0 unless defined $depth; 482 483 # if we have an empty string as a key it means we are in an 484 # accepting state and unless we can match further on should 485 # return the value of the '' key. 486 if (exists $trie->{''} ) { 487 # we can now update the "else" value, anything failing to match 488 # after this point should return the value from this. 489 if ( $ret_type eq 'cp' ) { 490 $else= $self->{strs}{ $trie->{''} }{cp}[0]; 491 $else= sprintf "$self->{val_fmt}", $else if $else > 9; 492 } elsif ( $ret_type eq 'len' ) { 493 $else= $depth; 494 } elsif ( $ret_type eq 'both') { 495 $else= $self->{strs}{ $trie->{''} }{cp}[0]; 496 $else= sprintf "$self->{val_fmt}", $else if $else > 9; 497 $else= "len=$depth, $else"; 498 } 499 } 500 # extract the meaningful keys from the trie, filter out '' as 501 # it means we are an accepting state (end of sequence). 502 my @conds= sort { $a <=> $b } grep { length $_ } keys %$trie; 503 504 # if we haven't any keys there is no further we can match and we 505 # can return the "else" value. 506 return $else if !@conds; 507 508 # Assuming Perl is being released from an ASCII platform, the below makes 509 # it work for non-UTF-8 out-of-the box when porting to non-ASCII, by 510 # adding a translation back to ASCII. This is the wrong thing to do for 511 # UTF-EBCDIC, as that is different from UTF-8. But the intent here is 512 # that this regen should be run on the target system, which will omit the 513 # translation, and generate the correct UTF-EBCDIC. On ASCII systems, the 514 # translation macros expand to just their argument, so there is no harm 515 # done nor performance penalty by including them. 516 my $test; 517 if ($test_type =~ /^cp/) { 518 $test = "cp"; 519 $test = "NATIVE_TO_UNI($test)" if ASCII_PLATFORM; 520 } 521 else { 522 $test = "((U8*)s)[$depth]"; 523 $test = "NATIVE_TO_LATIN1($test)" if ASCII_PLATFORM; 524 } 525 526 # first we loop over the possible keys/conditions and find out what they 527 # look like; we group conditions with the same optree together. 528 my %dmp_res; 529 my @res_order; 530 local $Data::Dumper::Sortkeys=1; 531 foreach my $cond ( @conds ) { 532 533 # get the optree for this child/condition 534 my $res= $self->_optree( $trie->{$cond}, $test_type, $ret_type, $else, $depth + 1 ); 535 # convert it to a string with Dumper 536 my $res_code= Dumper( $res ); 537 538 push @{$dmp_res{$res_code}{vals}}, $cond; 539 if (!$dmp_res{$res_code}{optree}) { 540 $dmp_res{$res_code}{optree}= $res; 541 push @res_order, $res_code; 542 } 543 } 544 545 # now that we have deduped the optrees we construct a new optree containing the merged 546 # results. 547 my %root; 548 my $node= \%root; 549 foreach my $res_code_idx (0 .. $#res_order) { 550 my $res_code= $res_order[$res_code_idx]; 551 $node->{vals}= $dmp_res{$res_code}{vals}; 552 $node->{test}= $test; 553 $node->{yes}= $dmp_res{$res_code}{optree}; 554 $node->{depth}= $depth; 555 if ($res_code_idx < $#res_order) { 556 $node= $node->{no}= {}; 557 } else { 558 $node->{no}= $else; 559 } 560 } 561 562 # return the optree. 563 return \%root; 564} 565 566# my $optree= optree(%opts); 567# 568# Convert a trie to an optree, wrapper for _optree 569 570sub optree { 571 my $self= shift; 572 my %opt= @_; 573 my $trie= $self->make_trie( $opt{type}, $opt{max_depth} ); 574 $opt{ret_type} ||= 'len'; 575 my $test_type= $opt{type} =~ /^cp/ ? 'cp' : 'depth'; 576 return $self->_optree( $trie, $test_type, $opt{ret_type}, $opt{else}, 0 ); 577} 578 579# my $optree= generic_optree(%opts); 580# 581# build a "generic" optree out of the three 'low', 'latin1', 'utf8' 582# sets of strings, including a branch for handling the string type check. 583# 584 585sub generic_optree { 586 my $self= shift; 587 my %opt= @_; 588 589 $opt{ret_type} ||= 'len'; 590 my $test_type= 'depth'; 591 my $else= $opt{else} || 0; 592 593 my $latin1= $self->make_trie( 'latin1', $opt{max_depth} ); 594 my $utf8= $self->make_trie( 'utf8', $opt{max_depth} ); 595 596 $_= $self->_optree( $_, $test_type, $opt{ret_type}, $else, 0 ) 597 for $latin1, $utf8; 598 599 if ( $utf8 ) { 600 $else= __cond_join( "( is_utf8 )", $utf8, $latin1 || $else ); 601 } elsif ( $latin1 ) { 602 $else= __cond_join( "!( is_utf8 )", $latin1, $else ); 603 } 604 if ($opt{type} eq 'generic') { 605 my $low= $self->make_trie( 'low', $opt{max_depth} ); 606 if ( $low ) { 607 $else= $self->_optree( $low, $test_type, $opt{ret_type}, $else, 0 ); 608 } 609 } 610 611 return $else; 612} 613 614# length_optree() 615# 616# create a string length guarded optree. 617# 618 619sub length_optree { 620 my $self= shift; 621 my %opt= @_; 622 my $type= $opt{type}; 623 624 die "Can't do a length_optree on type 'cp', makes no sense." 625 if $type =~ /^cp/; 626 627 my $else= ( $opt{else} ||= 0 ); 628 629 my $method = $type =~ /generic/ ? 'generic_optree' : 'optree'; 630 if ($method eq 'optree' && scalar keys %{$self->{size}{$type}} == 1) { 631 632 # Here is non-generic output (meaning that we are only generating one 633 # type), and all things that match have the same number ('size') of 634 # bytes. The length guard is simply that we have that number of 635 # bytes. 636 my @size = keys %{$self->{size}{$type}}; 637 my $cond= "((e) - (s)) >= $size[0]"; 638 my $optree = $self->$method(%opt); 639 $else= __cond_join( $cond, $optree, $else ); 640 } 641 elsif ($self->{has_multi}) { 642 my @size; 643 644 # Here, there can be a match of a multiple character string. We use 645 # the traditional method which is to have a branch for each possible 646 # size (longest first) and test for the legal values for that size. 647 my %sizes= ( 648 %{ $self->{size}{low} || {} }, 649 %{ $self->{size}{latin1} || {} }, 650 %{ $self->{size}{utf8} || {} } 651 ); 652 if ($method eq 'generic_optree') { 653 @size= sort { $a <=> $b } keys %sizes; 654 } else { 655 @size= sort { $a <=> $b } keys %{ $self->{size}{$type} }; 656 } 657 for my $size ( @size ) { 658 my $optree= $self->$method( %opt, type => $type, max_depth => $size ); 659 my $cond= "((e)-(s) > " . ( $size - 1 ).")"; 660 $else= __cond_join( $cond, $optree, $else ); 661 } 662 } 663 else { 664 my $utf8; 665 666 # Here, has more than one possible size, and only matches a single 667 # character. For non-utf8, the needed length is 1; for utf8, it is 668 # found by array lookup 'UTF8SKIP'. 669 670 # If want just the code points above 255, set up to look for those; 671 # otherwise assume will be looking for all non-UTF-8-invariant code 672 # poiints. 673 my $trie_type = ($type eq 'high') ? 'high' : 'utf8'; 674 675 # If we do want more than the 0-255 range, find those, and if they 676 # exist... 677 if ($opt{type} !~ /latin1/i && ($utf8 = $self->make_trie($trie_type, 0))) { 678 679 # ... get them into an optree, and set them up as the 'else' clause 680 $utf8 = $self->_optree( $utf8, 'depth', $opt{ret_type}, 0, 0 ); 681 682 # We could make this 683 # UTF8_IS_START(*s) && ((e) - (s)) >= UTF8SKIP(s))"; 684 # to avoid doing the UTF8SKIP and subsequent branches for invariants 685 # that don't match. But the current macros that get generated 686 # have only a few things that can match past this, so I (khw) 687 # don't think it is worth it. (Even better would be to use 688 # calculate_mask(keys %$utf8) instead of UTF8_IS_START, and use it 689 # if it saves a bunch. 690 my $cond = "(((e) - (s)) >= UTF8SKIP(s))"; 691 $else = __cond_join($cond, $utf8, $else); 692 693 # For 'generic', we also will want the latin1 UTF-8 variants for 694 # the case where the input isn't UTF-8. 695 my $latin1; 696 if ($method eq 'generic_optree') { 697 $latin1 = $self->make_trie( 'latin1', 1); 698 $latin1= $self->_optree( $latin1, 'depth', $opt{ret_type}, 0, 0 ); 699 } 700 701 # If we want the UTF-8 invariants, get those. 702 my $low; 703 if ($opt{type} !~ /non_low|high/ 704 && ($low= $self->make_trie( 'low', 1))) 705 { 706 $low= $self->_optree( $low, 'depth', $opt{ret_type}, 0, 0 ); 707 708 # Expand out the UTF-8 invariants as a string so that we 709 # can use them as the conditional 710 $low = $self->_cond_as_str( $low, 0, \%opt); 711 712 # If there are Latin1 variants, add a test for them. 713 if ($latin1) { 714 $else = __cond_join("(! is_utf8 )", $latin1, $else); 715 } 716 elsif ($method eq 'generic_optree') { 717 718 # Otherwise for 'generic' only we know that what 719 # follows must be valid for just UTF-8 strings, 720 $else->{test} = "( is_utf8 && $else->{test} )"; 721 } 722 723 # If the invariants match, we are done; otherwise we have 724 # to go to the 'else' clause. 725 $else = __cond_join($low, 1, $else); 726 } 727 elsif ($latin1) { # Here, didn't want or didn't have invariants, 728 # but we do have latin variants 729 $else = __cond_join("(! is_utf8)", $latin1, $else); 730 } 731 732 # We need at least one byte available to start off the tests 733 $else = __cond_join("((e) > (s))", $else, 0); 734 } 735 else { # Here, we don't want or there aren't any variants. A single 736 # byte available is enough. 737 my $cond= "((e) > (s))"; 738 my $optree = $self->$method(%opt); 739 $else= __cond_join( $cond, $optree, $else ); 740 } 741 } 742 743 return $else; 744} 745 746sub calculate_mask(@) { 747 # Look at the input list of byte values. This routine returns an array of 748 # mask/base pairs to generate that list. 749 750 my @list = @_; 751 my $list_count = @list; 752 753 # Consider a set of byte values, A, B, C .... If we want to determine if 754 # <c> is one of them, we can write c==A || c==B || c==C .... If the 755 # values are consecutive, we can shorten that to A<=c && c<=Z, which uses 756 # far fewer branches. If only some of them are consecutive we can still 757 # save some branches by creating range tests for just those that are 758 # consecutive. _cond_as_str() does this work for looking for ranges. 759 # 760 # Another approach is to look at the bit patterns for A, B, C .... and see 761 # if they have some commonalities. That's what this function does. For 762 # example, consider a set consisting of the bytes 763 # 0xF0, 0xF1, 0xF2, and 0xF3. We could write: 764 # 0xF0 <= c && c <= 0xF4 765 # But the following mask/compare also works, and has just one test: 766 # (c & 0xFC) == 0xF0 767 # The reason it works is that the set consists of exactly those bytes 768 # whose first 4 bits are 1, and the next two are 0. (The value of the 769 # other 2 bits is immaterial in determining if a byte is in the set or 770 # not.) The mask masks out those 2 irrelevant bits, and the comparison 771 # makes sure that the result matches all bytes which match those 6 772 # material bits exactly. In other words, the set of bytes contains 773 # exactly those whose bottom two bit positions are either 0 or 1. The 774 # same principle applies to bit positions that are not necessarily 775 # adjacent. And it can be applied to bytes that differ in 1 through all 8 776 # bit positions. In order to be a candidate for this optimization, the 777 # number of bytes in the set must be a power of 2. 778 # 779 # Consider a different example, the set 0x53, 0x54, 0x73, and 0x74. That 780 # requires 4 tests using either ranges or individual values, and even 781 # though the number in the set is a power of 2, it doesn't qualify for the 782 # mask optimization described above because the number of bits that are 783 # different is too large for that. However, the set can be expressed as 784 # two branches with masks thusly: 785 # (c & 0xDF) == 0x53 || (c & 0xDF) == 0x54 786 # a branch savings of 50%. This is done by splitting the set into two 787 # subsets each of which has 2 elements, and within each set the values 788 # differ by 1 byte. 789 # 790 # This function attempts to find some way to save some branches using the 791 # mask technique. If not, it returns an empty list; if so, it 792 # returns a list consisting of 793 # [ [compare1, mask1], [compare2, mask2], ... 794 # [compare_n, undef], [compare_m, undef], ... 795 # ] 796 # The <mask> is undef in the above for those bytes that must be tested 797 # for individually. 798 # 799 # This function does not attempt to find the optimal set. To do so would 800 # probably require testing all possible combinations, and keeping track of 801 # the current best one. 802 # 803 # There are probably much better algorithms, but this is the one I (khw) 804 # came up with. We start with doing a bit-wise compare of every byte in 805 # the set with every other byte. The results are sorted into arrays of 806 # all those that differ by the same bit positions. These are stored in a 807 # hash with the each key being the bits they differ in. Here is the hash 808 # for the 0x53, 0x54, 0x73, 0x74 set: 809 # { 810 # 4 => { 811 # "0,1,2,5" => [ 812 # 83, 813 # 116, 814 # 84, 815 # 115 816 # ] 817 # }, 818 # 3 => { 819 # "0,1,2" => [ 820 # 83, 821 # 84, 822 # 115, 823 # 116 824 # ] 825 # } 826 # 1 => { 827 # 5 => [ 828 # 83, 829 # 115, 830 # 84, 831 # 116 832 # ] 833 # }, 834 # } 835 # 836 # The set consisting of values which differ in the 4 bit positions 0, 1, 837 # 2, and 5 from some other value in the set consists of all 4 values. 838 # Likewise all 4 values differ from some other value in the 3 bit 839 # positions 0, 1, and 2; and all 4 values differ from some other value in 840 # the single bit position 5. The keys at the uppermost level in the above 841 # hash, 1, 3, and 4, give the number of bit positions that each sub-key 842 # below it has. For example, the 4 key could have as its value an array 843 # consisting of "0,1,2,5", "0,1,2,6", and "3,4,6,7", if the inputs were 844 # such. The best optimization will group the most values into a single 845 # mask. The most values will be the ones that differ in the most 846 # positions, the ones with the largest value for the topmost key. These 847 # keys, are thus just for convenience of sorting by that number, and do 848 # not have any bearing on the core of the algorithm. 849 # 850 # We start with an element from largest number of differing bits. The 851 # largest in this case is 4 bits, and there is only one situation in this 852 # set which has 4 differing bits, "0,1,2,5". We look for any subset of 853 # this set which has 16 values that differ in these 4 bits. There aren't 854 # any, because there are only 4 values in the entire set. We then look at 855 # the next possible thing, which is 3 bits differing in positions "0,1,2". 856 # We look for a subset that has 8 values that differ in these 3 bits. 857 # Again there are none. So we go to look for the next possible thing, 858 # which is a subset of 2**1 values that differ only in bit position 5. 83 859 # and 115 do, so we calculate a mask and base for those and remove them 860 # from every set. Since there is only the one set remaining, we remove 861 # them from just this one. We then look to see if there is another set of 862 # 2 values that differ in bit position 5. 84 and 116 do, so we calculate 863 # a mask and base for those and remove them from every set (again only 864 # this set remains in this example). The set is now empty, and there are 865 # no more sets to look at, so we are done. 866 867 if ($list_count == 256) { # All 256 is trivially masked 868 return (0, 0); 869 } 870 871 my %hash; 872 873 # Generate bits-differing lists for each element compared against each 874 # other element 875 for my $i (0 .. $list_count - 2) { 876 for my $j ($i + 1 .. $list_count - 1) { 877 my @bits_that_differ = pop_count($list[$i] ^ $list[$j]); 878 my $differ_count = @bits_that_differ; 879 my $key = join ",", @bits_that_differ; 880 push @{$hash{$differ_count}{$key}}, $list[$i] unless grep { $_ == $list[$i] } @{$hash{$differ_count}{$key}}; 881 push @{$hash{$differ_count}{$key}}, $list[$j]; 882 } 883 } 884 885 print STDERR __LINE__, ": calculate_mask() called: List of values grouped by differing bits: ", Dumper \%hash if DEBUG; 886 887 my @final_results; 888 foreach my $count (reverse sort { $a <=> $b } keys %hash) { 889 my $need = 2 ** $count; # Need 8 values for 3 differing bits, etc 890 foreach my $bits (sort keys $hash{$count}) { 891 892 print STDERR __LINE__, ": For $count bit(s) difference ($bits), need $need; have ", scalar @{$hash{$count}{$bits}}, "\n" if DEBUG; 893 894 # Look only as long as there are at least as many elements in the 895 # subset as are needed 896 while ((my $cur_count = @{$hash{$count}{$bits}}) >= $need) { 897 898 print STDERR __LINE__, ": Looking at bit positions ($bits): ", Dumper $hash{$count}{$bits} if DEBUG; 899 900 # Start with the first element in it 901 my $try_base = $hash{$count}{$bits}[0]; 902 my @subset = $try_base; 903 904 # If it succeeds, we return a mask and a base to compare 905 # against the masked value. That base will be the AND of 906 # every element in the subset. Initialize to the one element 907 # we have so far. 908 my $compare = $try_base; 909 910 # We are trying to find a subset of this that has <need> 911 # elements that differ in the bit positions given by the 912 # string $bits, which is comma separated. 913 my @bits = split ",", $bits; 914 915 TRY: # Look through the remainder of the list for other 916 # elements that differ only by these bit positions. 917 918 for (my $i = 1; $i < $cur_count; $i++) { 919 my $try_this = $hash{$count}{$bits}[$i]; 920 my @positions = pop_count($try_base ^ $try_this); 921 922 print STDERR __LINE__, ": $try_base vs $try_this: is (", join(',', @positions), ") a subset of ($bits)?" if DEBUG;; 923 924 foreach my $pos (@positions) { 925 unless (grep { $pos == $_ } @bits) { 926 print STDERR " No\n" if DEBUG; 927 my $remaining = $cur_count - $i - 1; 928 if ($remaining && @subset + $remaining < $need) { 929 print STDERR __LINE__, ": Can stop trying $try_base, because even if all the remaining $remaining values work, they wouldn't add up to the needed $need when combined with the existing ", scalar @subset, " ones\n" if DEBUG; 930 last TRY; 931 } 932 next TRY; 933 } 934 } 935 936 print STDERR " Yes\n" if DEBUG; 937 push @subset, $try_this; 938 939 # Add this to the mask base, in case it ultimately 940 # succeeds, 941 $compare &= $try_this; 942 } 943 944 print STDERR __LINE__, ": subset (", join(", ", @subset), ") has ", scalar @subset, " elements; needs $need\n" if DEBUG; 945 946 if (@subset < $need) { 947 shift @{$hash{$count}{$bits}}; 948 next; # Try with next value 949 } 950 951 # Create the mask 952 my $mask = 0; 953 foreach my $position (@bits) { 954 $mask |= 1 << $position; 955 } 956 $mask = ~$mask & 0xFF; 957 push @final_results, [$compare, $mask]; 958 959 printf STDERR "%d: Got it: compare=%d=0x%X; mask=%X\n", __LINE__, $compare, $compare, $mask if DEBUG; 960 961 # These values are now spoken for. Remove them from future 962 # consideration 963 foreach my $remove_count (sort keys %hash) { 964 foreach my $bits (sort keys %{$hash{$remove_count}}) { 965 foreach my $to_remove (@subset) { 966 @{$hash{$remove_count}{$bits}} = grep { $_ != $to_remove } @{$hash{$remove_count}{$bits}}; 967 } 968 } 969 } 970 } 971 } 972 } 973 974 # Any values that remain in the list are ones that have to be tested for 975 # individually. 976 my @individuals; 977 foreach my $count (reverse sort { $a <=> $b } keys %hash) { 978 foreach my $bits (sort keys $hash{$count}) { 979 foreach my $remaining (@{$hash{$count}{$bits}}) { 980 981 # If we already know about this value, just ignore it. 982 next if grep { $remaining == $_ } @individuals; 983 984 # Otherwise it needs to be returned as something to match 985 # individually 986 push @final_results, [$remaining, undef]; 987 push @individuals, $remaining; 988 } 989 } 990 } 991 992 # Sort by increasing numeric value 993 @final_results = sort { $a->[0] <=> $b->[0] } @final_results; 994 995 print STDERR __LINE__, ": Final return: ", Dumper \@final_results if DEBUG; 996 997 return @final_results; 998} 999 1000# _cond_as_str 1001# turn a list of conditions into a text expression 1002# - merges ranges of conditions, and joins the result with || 1003sub _cond_as_str { 1004 my ( $self, $op, $combine, $opts_ref )= @_; 1005 my $cond= $op->{vals}; 1006 my $test= $op->{test}; 1007 my $is_cp_ret = $opts_ref->{ret_type} eq "cp"; 1008 return "( $test )" if !defined $cond; 1009 1010 # rangify the list. 1011 my @ranges; 1012 my $Update= sub { 1013 # We skip this if there are optimizations that 1014 # we can apply (below) to the individual ranges 1015 if ( ($is_cp_ret || $combine) && @ranges && ref $ranges[-1]) { 1016 if ( $ranges[-1][0] == $ranges[-1][1] ) { 1017 $ranges[-1]= $ranges[-1][0]; 1018 } elsif ( $ranges[-1][0] + 1 == $ranges[-1][1] ) { 1019 $ranges[-1]= $ranges[-1][0]; 1020 push @ranges, $ranges[-1] + 1; 1021 } 1022 } 1023 }; 1024 for my $condition ( @$cond ) { 1025 if ( !@ranges || $condition != $ranges[-1][1] + 1 ) { 1026 $Update->(); 1027 push @ranges, [ $condition, $condition ]; 1028 } else { 1029 $ranges[-1][1]++; 1030 } 1031 } 1032 $Update->(); 1033 1034 return $self->_combine( $test, @ranges ) 1035 if $combine; 1036 1037 if ($is_cp_ret) { 1038 @ranges= map { 1039 ref $_ 1040 ? sprintf( 1041 "( $self->{val_fmt} <= $test && $test <= $self->{val_fmt} )", 1042 @$_ ) 1043 : sprintf( "$self->{val_fmt} == $test", $_ ); 1044 } @ranges; 1045 1046 return "( " . join( " || ", @ranges ) . " )"; 1047 } 1048 1049 # If the input set has certain characteristics, we can optimize tests 1050 # for it. This doesn't apply if returning the code point, as we want 1051 # each element of the set individually. The code above is for this 1052 # simpler case. 1053 1054 return 1 if @$cond == 256; # If all bytes match, is trivially true 1055 1056 my @masks; 1057 if (@ranges > 1) { 1058 1059 # See if the entire set shares optimizable characteristics, and if so, 1060 # return the optimization. We delay checking for this on sets with 1061 # just a single range, as there may be better optimizations available 1062 # in that case. 1063 @masks = calculate_mask(@$cond); 1064 1065 # Stringify the output of calculate_mask() 1066 if (@masks) { 1067 my @return; 1068 foreach my $mask_ref (@masks) { 1069 if (defined $mask_ref->[1]) { 1070 push @return, sprintf "( ( $test & $self->{val_fmt} ) == $self->{val_fmt} )", $mask_ref->[1], $mask_ref->[0]; 1071 } 1072 else { # An undefined mask means to use the value as-is 1073 push @return, sprintf "$test == $self->{val_fmt}", $mask_ref->[0]; 1074 } 1075 } 1076 1077 # The best possible case below for specifying this set of values via 1078 # ranges is 1 branch per range. If our mask method yielded better 1079 # results, there is no sense trying something that is bound to be 1080 # worse. 1081 if (@return < @ranges) { 1082 return "( " . join( " || ", @return ) . " )"; 1083 } 1084 1085 @masks = @return; 1086 } 1087 } 1088 1089 # Here, there was no entire-class optimization that was clearly better 1090 # than doing things by ranges. Look at each range. 1091 my $range_count_extra = 0; 1092 for (my $i = 0; $i < @ranges; $i++) { 1093 if (! ref $ranges[$i]) { # Trivial case: no range 1094 $ranges[$i] = sprintf "$self->{val_fmt} == $test", $ranges[$i]; 1095 } 1096 elsif ($ranges[$i]->[0] == $ranges[$i]->[1]) { 1097 $ranges[$i] = # Trivial case: single element range 1098 sprintf "$self->{val_fmt} == $test", $ranges[$i]->[0]; 1099 } 1100 else { 1101 my $output = ""; 1102 1103 # Well-formed UTF-8 continuation bytes on ascii platforms must be 1104 # in the range 0x80 .. 0xBF. If we know that the input is 1105 # well-formed (indicated by not trying to be 'safe'), we can omit 1106 # tests that verify that the input is within either of these 1107 # bounds. (No legal UTF-8 character can begin with anything in 1108 # this range, so we don't have to worry about this being a 1109 # continuation byte or not.) 1110 if (ASCII_PLATFORM 1111 && ! $opts_ref->{safe} 1112 && $opts_ref->{type} =~ / ^ (?: utf8 | high ) $ /xi) 1113 { 1114 my $lower_limit_is_80 = ($ranges[$i]->[0] == 0x80); 1115 my $upper_limit_is_BF = ($ranges[$i]->[1] == 0xBF); 1116 1117 # If the range is the entire legal range, it matches any legal 1118 # byte, so we can omit both tests. (This should happen only 1119 # if the number of ranges is 1.) 1120 if ($lower_limit_is_80 && $upper_limit_is_BF) { 1121 return 1; 1122 } 1123 elsif ($lower_limit_is_80) { # Just use the upper limit test 1124 $output = sprintf("( $test <= $self->{val_fmt} )", 1125 $ranges[$i]->[1]); 1126 } 1127 elsif ($upper_limit_is_BF) { # Just use the lower limit test 1128 $output = sprintf("( $test >= $self->{val_fmt} )", 1129 $ranges[$i]->[0]); 1130 } 1131 } 1132 1133 # If we didn't change to omit a test above, see if the number of 1134 # elements is a power of 2 (only a single bit in the 1135 # representation of its count will be set) and if so, it may be 1136 # that a mask/compare optimization is possible. 1137 if ($output eq "" 1138 && pop_count($ranges[$i]->[1] - $ranges[$i]->[0] + 1) == 1) 1139 { 1140 my @list; 1141 push @list, $_ for ($ranges[$i]->[0] .. $ranges[$i]->[1]); 1142 my @this_masks = calculate_mask(@list); 1143 1144 # Use the mask if there is just one for the whole range. 1145 # Otherwise there is no savings over the two branches that can 1146 # define the range. 1147 if (@this_masks == 1 && defined $this_masks[0][1]) { 1148 $output = sprintf "( $test & $self->{val_fmt} ) == $self->{val_fmt}", $this_masks[0][1], $this_masks[0][0]; 1149 } 1150 } 1151 1152 if ($output ne "") { # Prefer any optimization 1153 $ranges[$i] = $output; 1154 } 1155 else { 1156 # No optimization happened. We need a test that the code 1157 # point is within both bounds. But, if the bounds are 1158 # adjacent code points, it is cleaner to say 1159 # 'first == test || second == test' 1160 # than it is to say 1161 # 'first <= test && test <= second' 1162 1163 $range_count_extra++; # This range requires 2 branches to 1164 # represent 1165 if ($ranges[$i]->[0] + 1 == $ranges[$i]->[1]) { 1166 $ranges[$i] = "( " 1167 . join( " || ", ( map 1168 { sprintf "$self->{val_fmt} == $test", $_ } 1169 @{$ranges[$i]} ) ) 1170 . " )"; 1171 } 1172 else { # Full bounds checking 1173 $ranges[$i] = sprintf("( $self->{val_fmt} <= $test && $test <= $self->{val_fmt} )", $ranges[$i]->[0], $ranges[$i]->[1]); 1174 } 1175 } 1176 } 1177 } 1178 1179 # We have generated the list of bytes in two ways; one trying to use masks 1180 # to cut the number of branches down, and the other to look at individual 1181 # ranges (some of which could be cut down by using a mask for just it). 1182 # We return whichever method uses the fewest branches. 1183 return "( " 1184 . join( " || ", (@masks && @masks < @ranges + $range_count_extra) 1185 ? @masks 1186 : @ranges) 1187 . " )"; 1188} 1189 1190# _combine 1191# recursively turn a list of conditions into a fast break-out condition 1192# used by _cond_as_str() for 'cp' type macros. 1193sub _combine { 1194 my ( $self, $test, @cond )= @_; 1195 return if !@cond; 1196 my $item= shift @cond; 1197 my ( $cstr, $gtv ); 1198 if ( ref $item ) { # @item should be a 2-element array giving range start 1199 # and end 1200 if ($item->[0] == 0) { # UV's are never negative, so skip "0 <= " 1201 # test which could generate a compiler warning 1202 # that test is always true 1203 $cstr= sprintf( "$test <= $self->{val_fmt}", $item->[1] ); 1204 } 1205 else { 1206 $cstr= 1207 sprintf( "( $self->{val_fmt} <= $test && $test <= $self->{val_fmt} )", 1208 @$item ); 1209 } 1210 $gtv= sprintf "$self->{val_fmt}", $item->[1]; 1211 } else { 1212 $cstr= sprintf( "$self->{val_fmt} == $test", $item ); 1213 $gtv= sprintf "$self->{val_fmt}", $item; 1214 } 1215 if ( @cond ) { 1216 my $combine= $self->_combine( $test, @cond ); 1217 if (@cond >1) { 1218 return "( $cstr || ( $gtv < $test &&\n" 1219 . $combine . " ) )"; 1220 } else { 1221 return "( $cstr || $combine )"; 1222 } 1223 } else { 1224 return $cstr; 1225 } 1226} 1227 1228# _render() 1229# recursively convert an optree to text with reasonably neat formatting 1230sub _render { 1231 my ( $self, $op, $combine, $brace, $opts_ref, $def, $submacros )= @_; 1232 return 0 if ! defined $op; # The set is empty 1233 if ( !ref $op ) { 1234 return $op; 1235 } 1236 my $cond= $self->_cond_as_str( $op, $combine, $opts_ref ); 1237 #no warnings 'recursion'; # This would allow really really inefficient 1238 # code to be generated. See pod 1239 my $yes= $self->_render( $op->{yes}, $combine, 1, $opts_ref, $def, $submacros ); 1240 return $yes if $cond eq '1'; 1241 1242 my $no= $self->_render( $op->{no}, $combine, 0, $opts_ref, $def, $submacros ); 1243 return "( $cond )" if $yes eq '1' and $no eq '0'; 1244 my ( $lb, $rb )= $brace ? ( "( ", " )" ) : ( "", "" ); 1245 return "$lb$cond ? $yes : $no$rb" 1246 if !ref( $op->{yes} ) && !ref( $op->{no} ); 1247 my $ind1= " " x 4; 1248 my $ind= "\n" . ( $ind1 x $op->{depth} ); 1249 1250 if ( ref $op->{yes} ) { 1251 $yes= $ind . $ind1 . $yes; 1252 } else { 1253 $yes= " " . $yes; 1254 } 1255 1256 my $str= "$lb$cond ?$yes$ind: $no$rb"; 1257 if (length $str > 6000) { 1258 push @$submacros, sprintf "#define $def\n( %s )", "_part" . (my $yes_idx= 0+@$submacros), $yes; 1259 push @$submacros, sprintf "#define $def\n( %s )", "_part" . (my $no_idx= 0+@$submacros), $no; 1260 return sprintf "%s%s ? $def : $def%s", $lb, $cond, "_part$yes_idx", "_part$no_idx", $rb; 1261 } 1262 return $str; 1263} 1264 1265# $expr=render($op,$combine) 1266# 1267# convert an optree to text with reasonably neat formatting. If $combine 1268# is true then the condition is created using "fast breakouts" which 1269# produce uglier expressions that are more efficient for common case, 1270# longer lists such as that resulting from type 'cp' output. 1271# Currently only used for type 'cp' macros. 1272sub render { 1273 my ( $self, $op, $combine, $opts_ref, $def_fmt )= @_; 1274 1275 my @submacros; 1276 my $macro= sprintf "#define $def_fmt\n( %s )", "", $self->_render( $op, $combine, 0, $opts_ref, $def_fmt, \@submacros ); 1277 1278 return join "\n\n", map { "/*** GENERATED CODE ***/\n" . __macro( __clean( $_ ) ) } @submacros, $macro; 1279} 1280 1281# make_macro 1282# make a macro of a given type. 1283# calls into make_trie and (generic_|length_)optree as needed 1284# Opts are: 1285# type : 'cp','cp_high', 'generic','high','low','latin1','utf8','LATIN1','UTF8' 1286# ret_type : 'cp' or 'len' 1287# safe : add length guards to macro 1288# 1289# type defaults to 'generic', and ret_type to 'len' unless type is 'cp' 1290# in which case it defaults to 'cp' as well. 1291# 1292# It is illegal to do a type 'cp' macro on a pattern with multi-codepoint 1293# sequences in it, as the generated macro will accept only a single codepoint 1294# as an argument. 1295# 1296# It is also illegal to do a non-safe macro on a pattern with multi-codepoint 1297# sequences in it, as even if it is known to be well-formed, we need to not 1298# run off the end of the buffer when say the buffer ends with the first two 1299# characters, but three are looked at by the macro. 1300# 1301# returns the macro. 1302 1303 1304sub make_macro { 1305 my $self= shift; 1306 my %opts= @_; 1307 my $type= $opts{type} || 'generic'; 1308 if ($self->{has_multi}) { 1309 if ($type =~ /^cp/) { 1310 die "Can't do a 'cp' on multi-codepoint character class '$self->{op}'" 1311 } 1312 elsif (! $opts{safe}) { 1313 die "'safe' is required on multi-codepoint character class '$self->{op}'" 1314 } 1315 } 1316 my $ret_type= $opts{ret_type} || ( $opts{type} =~ /^cp/ ? 'cp' : 'len' ); 1317 my $method; 1318 if ( $opts{safe} ) { 1319 $method= 'length_optree'; 1320 } elsif ( $type =~ /generic/ ) { 1321 $method= 'generic_optree'; 1322 } else { 1323 $method= 'optree'; 1324 } 1325 my @args= $type =~ /^cp/ ? 'cp' : 's'; 1326 push @args, "e" if $opts{safe}; 1327 push @args, "is_utf8" if $type =~ /generic/; 1328 push @args, "len" if $ret_type eq 'both'; 1329 my $pfx= $ret_type eq 'both' ? 'what_len_' : 1330 $ret_type eq 'cp' ? 'what_' : 'is_'; 1331 my $ext= $type =~ /generic/ ? '' : '_' . lc( $type ); 1332 $ext .= '_non_low' if $type eq 'generic_non_low'; 1333 $ext .= "_safe" if $opts{safe}; 1334 my $argstr= join ",", @args; 1335 my $def_fmt="$pfx$self->{op}$ext%s($argstr)"; 1336 my $optree= $self->$method( %opts, type => $type, ret_type => $ret_type ); 1337 return $self->render( $optree, ($type =~ /^cp/) ? 1 : 0, \%opts, $def_fmt ); 1338} 1339 1340# if we aren't being used as a module (highly likely) then process 1341# the __DATA__ below and produce macros in regcharclass.h 1342# if an argument is provided to the script then it is assumed to 1343# be the path of the file to output to, if the arg is '-' outputs 1344# to STDOUT. 1345if ( !caller ) { 1346 $|++; 1347 my $path= shift @ARGV || "regcharclass.h"; 1348 my $out_fh; 1349 if ( $path eq '-' ) { 1350 $out_fh= \*STDOUT; 1351 } else { 1352 $out_fh = open_new( $path ); 1353 } 1354 print $out_fh read_only_top( lang => 'C', by => $0, 1355 file => 'regcharclass.h', style => '*', 1356 copyright => [2007, 2011], 1357 final => <<EOF, 1358WARNING: These macros are for internal Perl core use only, and may be 1359changed or removed without notice. 1360EOF 1361 ); 1362 print $out_fh "\n#ifndef H_REGCHARCLASS /* Guard against nested #includes */\n#define H_REGCHARCLASS 1\n\n"; 1363 1364 my ( $op, $title, @txt, @types, %mods ); 1365 my $doit= sub { 1366 return unless $op; 1367 1368 # Skip if to compile on a different platform. 1369 return if delete $mods{only_ascii_platform} && ! ASCII_PLATFORM; 1370 return if delete $mods{only_ebcdic_platform} && ord 'A' != 193; 1371 1372 print $out_fh "/*\n\t$op: $title\n\n"; 1373 print $out_fh join "\n", ( map { "\t$_" } @txt ), "*/", ""; 1374 my $obj= __PACKAGE__->new( op => $op, title => $title, txt => \@txt ); 1375 1376 #die Dumper(\@types,\%mods); 1377 1378 my @mods; 1379 push @mods, 'safe' if delete $mods{safe}; 1380 unshift @mods, 'fast' if delete $mods{fast} || ! @mods; # Default to 'fast' 1381 # do this one 1382 # first, as 1383 # traditional 1384 if (%mods) { 1385 die "Unknown modifiers: ", join ", ", map { "'$_'" } sort keys %mods; 1386 } 1387 1388 foreach my $type_spec ( @types ) { 1389 my ( $type, $ret )= split /-/, $type_spec; 1390 $ret ||= 'len'; 1391 foreach my $mod ( @mods ) { 1392 delete $mods{$mod}; 1393 my $macro= $obj->make_macro( 1394 type => $type, 1395 ret_type => $ret, 1396 safe => $mod eq 'safe' && $type !~ /^cp/, 1397 ); 1398 print $out_fh $macro, "\n"; 1399 } 1400 } 1401 }; 1402 1403 while ( <DATA> ) { 1404 s/^ \s* (?: \# .* ) ? $ //x; # squeeze out comment and blanks 1405 next unless /\S/; 1406 chomp; 1407 if ( /^[A-Z]/ ) { 1408 $doit->(); # This starts a new definition; do the previous one 1409 ( $op, $title )= split /\s*:\s*/, $_, 2; 1410 @txt= (); 1411 } elsif ( s/^=>// ) { 1412 my ( $type, $modifier )= split /:/, $_; 1413 @types= split ' ', $type; 1414 undef %mods; 1415 map { $mods{$_} = 1 } split ' ', $modifier; 1416 } else { 1417 push @txt, "$_"; 1418 } 1419 } 1420 $doit->(); 1421 1422 print $out_fh "\n#endif /* H_REGCHARCLASS */\n"; 1423 1424 if($path eq '-') { 1425 print $out_fh "/* ex: set ro: */\n"; 1426 } else { 1427 read_only_bottom_close_and_rename($out_fh) 1428 } 1429} 1430 1431# The form of the input is a series of definitions to make macros for. 1432# The first line gives the base name of the macro, followed by a colon, and 1433# then text to be used in comments associated with the macro that are its 1434# title or description. In all cases the first (perhaps only) parameter to 1435# the macro is a pointer to the first byte of the code point it is to test to 1436# see if it is in the class determined by the macro. In the case of non-UTF8, 1437# the code point consists only of a single byte. 1438# 1439# The second line must begin with a '=>' and be followed by the types of 1440# macro(s) to be generated; these are specified below. A colon follows the 1441# types, followed by the modifiers, also specified below. At least one 1442# modifier is required. 1443# 1444# The subsequent lines give what code points go into the class defined by the 1445# macro. Multiple characters may be specified via a string like "\x0D\x0A", 1446# enclosed in quotes. Otherwise the lines consist of one of: 1447# 1) a single Unicode code point, prefaced by 0x 1448# 2) a single range of Unicode code points separated by a minus (and 1449# optional space) 1450# 3) a single Unicode property specified in the standard Perl form 1451# "\p{...}" 1452# 4) a line like 'do path'. This will do a 'do' on the file given by 1453# 'path'. It is assumed that this does nothing but load subroutines 1454# (See item 5 below). The reason 'require path' is not used instead is 1455# because 'do' doesn't assume that path is in @INC. 1456# 5) a subroutine call 1457# &pkg::foo(arg1, ...) 1458# where pkg::foo was loaded by a 'do' line (item 4). The subroutine 1459# returns an array of entries of forms like items 1-3 above. This 1460# allows more complex inputs than achievable from the other input types. 1461# 1462# A blank line or one whose first non-blank character is '#' is a comment. 1463# The definition of the macro is terminated by a line unlike those described. 1464# 1465# Valid types: 1466# low generate a macro whose name is 'is_BASE_low' and defines a 1467# class that includes only ASCII-range chars. (BASE is the 1468# input macro base name.) 1469# latin1 generate a macro whose name is 'is_BASE_latin1' and defines a 1470# class that includes only upper-Latin1-range chars. It is not 1471# designed to take a UTF-8 input parameter. 1472# high generate a macro whose name is 'is_BASE_high' and defines a 1473# class that includes all relevant code points that are above 1474# the Latin1 range. This is for very specialized uses only. 1475# It is designed to take only an input UTF-8 parameter. 1476# utf8 generate a macro whose name is 'is_BASE_utf8' and defines a 1477# class that includes all relevant characters that aren't ASCII. 1478# It is designed to take only an input UTF-8 parameter. 1479# LATIN1 generate a macro whose name is 'is_BASE_latin1' and defines a 1480# class that includes both ASCII and upper-Latin1-range chars. 1481# It is not designed to take a UTF-8 input parameter. 1482# UTF8 generate a macro whose name is 'is_BASE_utf8' and defines a 1483# class that can include any code point, adding the 'low' ones 1484# to what 'utf8' works on. It is designed to take only an input 1485# UTF-8 parameter. 1486# generic generate a macro whose name is 'is_BASE". It has a 2nd, 1487# boolean, parameter which indicates if the first one points to 1488# a UTF-8 string or not. Thus it works in all circumstances. 1489# generic_non_low generate a macro whose name is 'is_BASE_non_low". It has 1490# a 2nd, boolean, parameter which indicates if the first one 1491# points to a UTF-8 string or not. It excludes any ASCII-range 1492# matches, but otherwise it works in all circumstances. 1493# cp generate a macro whose name is 'is_BASE_cp' and defines a 1494# class that returns true if the UV parameter is a member of the 1495# class; false if not. 1496# cp_high like cp, but it is assumed that it is known that the UV 1497# parameter is above Latin1. The name of the generated macro is 1498# 'is_BASE_cp_high'. This is different from high-cp, derived 1499# below. 1500# A macro of the given type is generated for each type listed in the input. 1501# The default return value is the number of octets read to generate the match. 1502# Append "-cp" to the type to have it instead return the matched codepoint. 1503# The macro name is changed to 'what_BASE...'. See pod for 1504# caveats 1505# Appending '-both" instead adds an extra parameter to the end of the argument 1506# list, which is a pointer as to where to store the number of 1507# bytes matched, while also returning the code point. The macro 1508# name is changed to 'what_len_BASE...'. See pod for caveats 1509# 1510# Valid modifiers: 1511# safe The input string is not necessarily valid UTF-8. In 1512# particular an extra parameter (always the 2nd) to the macro is 1513# required, which points to one beyond the end of the string. 1514# The macro will make sure not to read off the end of the 1515# string. In the case of non-UTF8, it makes sure that the 1516# string has at least one byte in it. The macro name has 1517# '_safe' appended to it. 1518# fast The input string is valid UTF-8. No bounds checking is done, 1519# and the macro can make assumptions that lead to faster 1520# execution. 1521# only_ascii_platform Skip this definition if this program is being run on 1522# a non-ASCII platform. 1523# only_ebcdic_platform Skip this definition if this program is being run on 1524# a non-EBCDIC platform. 1525# No modifier need be specified; fast is assumed for this case. If both 1526# 'fast', and 'safe' are specified, two macros will be created for each 1527# 'type'. 1528# 1529# If run on a non-ASCII platform will automatically convert the Unicode input 1530# to native. The documentation above is slightly wrong in this case. 'low' 1531# actually refers to code points whose UTF-8 representation is the same as the 1532# non-UTF-8 version (invariants); and 'latin1' refers to all the rest of the 1533# code points less than 256. 1534 15351; # in the unlikely case we are being used as a module 1536 1537__DATA__ 1538# This is no longer used, but retained in case it is needed some day. 1539# TRICKYFOLD: Problematic fold case letters. When adding to this list, also should add them to regcomp.c and fold_grind.t 1540# => generic cp generic-cp generic-both :fast safe 1541# 0x00DF # LATIN SMALL LETTER SHARP S 1542# 0x0390 # GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS 1543# 0x03B0 # GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS 1544# 0x1E9E # LATIN CAPITAL LETTER SHARP S, because maps to same as 00DF 1545# 0x1FD3 # GREEK SMALL LETTER IOTA WITH DIALYTIKA AND OXIA; maps same as 0390 1546# 0x1FE3 # GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND OXIA; maps same as 03B0 1547 1548LNBREAK: Line Break: \R 1549=> generic UTF8 LATIN1 : safe 1550"\x0D\x0A" # CRLF - Network (Windows) line ending 1551\p{VertSpace} 1552 1553HORIZWS: Horizontal Whitespace: \h \H 1554=> high cp_high : fast 1555\p{HorizSpace} 1556 1557VERTWS: Vertical Whitespace: \v \V 1558=> high cp_high : fast 1559\p{VertSpace} 1560 1561XDIGIT: Hexadecimal digits 1562=> high cp_high : fast 1563\p{XDigit} 1564 1565XPERLSPACE: \p{XPerlSpace} 1566=> high cp_high : fast 1567\p{XPerlSpace} 1568 1569REPLACEMENT: Unicode REPLACEMENT CHARACTER 1570=> UTF8 :safe 15710xFFFD 1572 1573NONCHAR: Non character code points 1574=> UTF8 :fast 1575\p{Nchar} 1576 1577SURROGATE: Surrogate characters 1578=> UTF8 :fast 1579\p{Gc=Cs} 1580 1581GCB_L: Grapheme_Cluster_Break=L 1582=> UTF8 :fast 1583\p{_X_GCB_L} 1584 1585GCB_LV_LVT_V: Grapheme_Cluster_Break=(LV or LVT or V) 1586=> UTF8 :fast 1587\p{_X_LV_LVT_V} 1588 1589GCB_Prepend: Grapheme_Cluster_Break=Prepend 1590=> UTF8 :fast 1591\p{_X_GCB_Prepend} 1592 1593GCB_RI: Grapheme_Cluster_Break=RI 1594=> UTF8 :fast 1595\p{_X_RI} 1596 1597GCB_SPECIAL_BEGIN_START: Grapheme_Cluster_Break=special_begin_starts 1598=> UTF8 :fast 1599\p{_X_Special_Begin_Start} 1600 1601GCB_T: Grapheme_Cluster_Break=T 1602=> UTF8 :fast 1603\p{_X_GCB_T} 1604 1605GCB_V: Grapheme_Cluster_Break=V 1606=> UTF8 :fast 1607\p{_X_GCB_V} 1608 1609# This program was run with this enabled, and the results copied to utf8.h; 1610# then this was commented out because it takes so long to figure out these 2 1611# million code points. The results would not change unless utf8.h decides it 1612# wants a maximum other than 4 bytes, or this program creates better 1613# optimizations 1614#UTF8_CHAR: Matches utf8 from 1 to 4 bytes 1615#=> UTF8 :safe only_ascii_platform 1616#0x0 - 0x1FFFFF 1617 1618# This hasn't been commented out, because we haven't an EBCDIC platform to run 1619# it on, and the 3 types of EBCDIC allegedly supported by Perl would have 1620# different results 1621UTF8_CHAR: Matches utf8 from 1 to 5 bytes 1622=> UTF8 :safe only_ebcdic_platform 16230x0 - 0x3FFFFF: 1624 1625QUOTEMETA: Meta-characters that \Q should quote 1626=> high :fast 1627\p{_Perl_Quotemeta} 1628 1629MULTI_CHAR_FOLD: multi-char strings that are folded to by a single character 1630=> UTF8 :safe 1631do regen/regcharclass_multi_char_folds.pl 1632 1633# 1 => All folds 1634®charclass_multi_char_folds::multi_char_folds(1) 1635 1636MULTI_CHAR_FOLD: multi-char strings that are folded to by a single character 1637=> LATIN1 : safe 1638 1639®charclass_multi_char_folds::multi_char_folds(0) 1640# 0 => Latin1-only 1641 1642FOLDS_TO_MULTI: characters that fold to multi-char strings 1643=> UTF8 :fast 1644\p{_Perl_Folds_To_Multi_Char} 1645 1646PROBLEMATIC_LOCALE_FOLD : characters whose fold is problematic under locale 1647=> UTF8 cp :fast 1648\p{_Perl_Problematic_Locale_Folds} 1649 1650PROBLEMATIC_LOCALE_FOLDEDS_START : The first folded character of folds which are problematic under locale 1651=> UTF8 cp :fast 1652\p{_Perl_Problematic_Locale_Foldeds_Start} 1653 1654PATWS: pattern white space 1655=> generic generic_non_low cp : safe 1656\p{PatWS} 1657