1# Grind out a lot of combinatoric tests for folding. 2# It uses various charset modifiers, passed in via $::TEST_CHUNK. The caller 3# will also have set the locale to use if /l is the modifier. 4# L is a pseudo-modifier that indicates to use the modifier /l instead, and 5# the locale set by the caller is known to be UTF-8, 6# T is a pseudo-modifier that indicates to use the pseudo modifier /L 7# instead, and the locale set by the caller is known to be Turkic UTF-8, 8 9binmode STDOUT, ":utf8"; 10 11BEGIN { 12 chdir 't' if -d 't'; 13 require './test.pl'; 14 set_up_inc('../lib'); 15 require Config; Config->import; 16 skip_all_if_miniperl("no dynamic loading on miniperl, no Encode nor POSIX"); 17 if ($^O eq 'dec_osf') { 18 skip_all("$^O cannot handle this test"); 19 } 20 21 watchdog(5 * 60); 22 require './loc_tools.pl'; 23} 24 25use charnames ":full"; 26 27my $DEBUG = 0; # Outputs extra information for debugging this .t 28 29use strict; 30use warnings; 31no warnings 'locale'; # Plenty of these would otherwise get generated 32use Encode; 33use POSIX; 34 35my $charset = $::TEST_CHUNK; 36my $use_turkic_rules = 0; 37 38if ($charset eq 'T') { 39 $charset = 'L'; 40 $use_turkic_rules = 1; 41} 42 43my $has_LC_CTYPE = is_category_valid('LC_CTYPE'); 44 45# Special-cased characters in the .c's that we want to make sure get tested. 46my %be_sure_to_test = ( 47 chr utf8::unicode_to_native(0xDF) => 1, # LATIN_SMALL_LETTER_SHARP_S 48 49 # This is included because the uppercase occupies more bytes, but the 50 # first two bytes of their representations differ only in one bit, 51 # that could lead the code looking for shortcuts astray; you can't do 52 # certain shortcuts if the lengths differ 53 "\x{29E}" => 1, # LATIN SMALL LETTER TURNED K 54 55 "\x{390}" => 1, # GREEK_SMALL_LETTER_IOTA_WITH_DIALYTIKA_AND_TONOS 56 "\x{3B0}" => 1, # GREEK_SMALL_LETTER_UPSILON_WITH_DIALYTIKA_AND_TONOS 57 58 # This is included because the uppercase and lowercase differ by only 59 # a single bit and it is in the first of the two byte representations. 60 # This showed that a previous way was erroneous of calculating if 61 # initial substrings were closely-related bit-wise. 62 "\x{3CC}" => 1, # GREEK SMALL LETTER OMICRON WITH TONOS 63 64 "\x{1E9E}" => 1, # LATIN_CAPITAL_LETTER_SHARP_S 65 "\x{1FD3}" => 1, # GREEK SMALL LETTER IOTA WITH DIALYTIKA AND OXIA 66 "\x{1FE3}" => 1, # GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND OXIA 67 68 # These are included because they are adjacent and fold to the same 69 # result, U+01C6. This has tripped up the code in the past that 70 # wrongly thought that sequential code points must fold to sequential 71 # code points 72 "\x{01C4}" => 1, # LATIN CAPITAL LETTER DZ WITH CARON 73 "\x{01C5}" => 1, # LATIN CAPITAL LETTER D WITH SMALL LETTER Z WITH CARON 74 75 "I" => 1, 76); 77 78# Tests both unicode and not, so make sure not implicitly testing unicode 79no feature 'unicode_strings'; 80 81# Case-insensitive matching is a large and complicated issue. Perl does not 82# implement it fully, properly. For example, it doesn't include normalization 83# as part of the equation. To test every conceivable combination is clearly 84# impossible; these tests are mostly drawn from visual inspection of the code 85# and experience, trying to exercise all areas. 86 87# There are three basic ranges of characters that Perl may treat differently: 88# 1) Invariants under utf8 which on ASCII-ish machines are ASCII, and are 89# referred to here as ASCII. On EBCDIC machines, the non-ASCII invariants 90# are all controls that fold to themselves. 91my $ASCII = 1; 92 93# 2) Other characters that fit into a byte but are different in utf8 than not; 94# here referred to, taking some liberties, as Latin1. 95my $Latin1 = 2; 96 97# 3) Characters that won't fit in a byte; here referred to as Unicode 98my $Unicode = 3; 99 100# Within these basic groups are equivalence classes that testing any character 101# in is likely to lead to the same results as any other character. This is 102# used to cut down the number of tests needed, unless PERL_RUN_SLOW_TESTS is 103# set. 104my $skip_apparently_redundant = ! $ENV{PERL_RUN_SLOW_TESTS}; 105 106# Additionally parts of this test run a lot of subtests, outputting the 107# resulting TAP can be expensive so the tests are summarised internally. The 108# PERL_DEBUG_FULL_TEST environment variable can be set to produce the full 109# output for debugging purposes. 110 111sub range_type { 112 my $ord = ord shift; 113 114 return $ASCII if utf8::native_to_unicode($ord) < 128; 115 return $Latin1 if $ord < 256; 116 return $Unicode; 117} 118 119sub numerically { 120 return $a <=> $b 121} 122 123my $list_all_tests = $ENV{PERL_DEBUG_FULL_TEST} || $DEBUG; 124$| = 1 if $list_all_tests; 125 126# Significant time is saved by not outputting each test but grouping the 127# output into subtests 128my $okays; # Number of ok's in current subtest 129my $this_iteration; # Number of possible tests in current subtest 130my $count = 0; # Number of subtests = number of total tests 131 132sub run_test($$$$) { 133 my ($test, $todo, $do_we_output_locale_name, $debug) = @_; 134 135 $debug = "" unless $DEBUG; 136 my $res = eval $test; 137 138 if ($do_we_output_locale_name) { 139 $do_we_output_locale_name = 'setlocale(LC_CTYPE, "' 140 . POSIX::setlocale(&POSIX::LC_CTYPE) 141 . '"); '; 142 } 143 if (!$res || $list_all_tests) { 144 # Failed or debug; output the result 145 $count++; 146 ok($res, "$do_we_output_locale_name$test; $debug"); 147 } else { 148 # Just count the test as passed 149 $okays++; 150 } 151 $this_iteration++; 152} 153 154my %has_test_by_participants; # Makes sure has tests for each range and each 155 # number of characters that fold to the same 156 # thing 157my %has_test_by_byte_count; # Makes sure has tests for each combination of 158 # n bytes folds to m bytes 159 160my %tests; # The set of tests we expect to pass. 161# Each key is a code point that folds to something else. 162# Each value is a list of things that the key folds to. If the 'thing' is a 163# single code point, it is that ordinal. If it is a multi-char fold, it is an 164# ordered list of the code points in that fold. Here's an example for 'S': 165# '83' => [ 115, 383 ] 166# 167# And one for a multi-char fold: \xDF 168# 223 => [ 169# [ # 'ss' 170# 83, 171# 83 172# ], 173# [ # 'SS' 174# 115, 175# 115 176# ], 177# [ # LATIN SMALL LETTER LONG S 178# 383, 179# 383 180# ], 181# 7838 # LATIN_CAPITAL_LETTER_SHARP_S 182# ], 183 184my %neg_tests; # Same format, but we expect these tests to fail 185 186my %folds; # keys are code points that fold; values are either 0 or 1 which 187 # in turn are keys with their values each a list of code points the 188 # code point key folds to. The folds under 1 are the ones that are 189 # valid in this run; the ones under 0 are ones valid under other 190 # circumstances. 191 192my %inverse_folds; # keys are strings of the folded-to; then come a layer of 193 # 0 or 1, like %folds. The lowest values are lists of 194 # characters that fold to them 195 196# Here's a portion of an %inverse_folds in a run where Turkic folds are not 197# legal, so \x{130} doesn't fold to 'i' in this run. 198# 'h' => { 199# '1' => [ 200# 'H' 201# ] 202# }, 203# "h\x{331}" => { 204# '1' => [ 205# "\x{1e96}" 206# ] 207# }, 208# 'i' => { 209# '0' => [ 210# "\x{130}" 211# ], 212# '1' => [ 213# 'I' 214# ] 215# }, 216# "i\x{307}" => { 217# '1' => [ 218# "\x{130}" 219# ] 220# }, 221# 'j' => { 222# '1' => [ 223# 'J' 224# ] 225# }, 226 227sub add_test($$@) { 228 my ($tests_ref, $to, @from) = @_; 229 230 # Called to cause the input to be tested by adding to $%tests_ref. @from 231 # is the list of characters that fold to the string $to. @from should be 232 # sorted so the lowest code point is first.... 233 # The input is in string form; %tests uses code points, so have to 234 # convert. 235 236 my $to_chars = length $to; 237 my @test_to; # List of tests for $to 238 239 if ($to_chars == 1) { 240 @test_to = ord $to; 241 } 242 else { 243 push @test_to, [ map { ord $_ } split "", $to ]; 244 245 # For multi-char folds, we also test that things that can fold to each 246 # individual character in the fold also work. If we were testing 247 # comprehensively, we would try every combination of upper and lower 248 # case in the fold, but it will have to suffice to avoid running 249 # forever to make sure that each thing that folds to these is tested 250 # at least once. Because of complement matching ([^...]), we need to 251 # do both the folded, and the folded-from. 252 # We first look at each character in the multi-char fold, and save how 253 # many characters fold to it; and also the maximum number of such 254 # folds 255 my @folds_to_count; # 0th char in fold is index 0 ... 256 my $max_folds_to = 0; 257 258 for (my $i = 0; $i < $to_chars; $i++) { 259 my $to_char = substr($to, $i, 1); 260 if (exists $inverse_folds{$to_char}{1}) { 261 $folds_to_count[$i] = scalar @{$inverse_folds{$to_char}{1}}; 262 $max_folds_to = $folds_to_count[$i] if $max_folds_to < $folds_to_count[$i]; 263 } 264 else { 265 $folds_to_count[$i] = 0; 266 } 267 } 268 269 # We will need to generate as many tests as the maximum number of 270 # folds, so that each fold will have at least one test. 271 # For example, consider character X which folds to the three character 272 # string 'xyz'. If 2 things fold to x (X and x), 4 to y (Y, Y' 273 # (Y-prime), Y'' (Y-prime-prime), and y), and 1 thing to z (itself), 4 274 # tests will be generated: 275 # xyz 276 # XYz 277 # xY'z 278 # xY''z 279 for (my $i = 0; $i < $max_folds_to; $i++) { 280 my @this_test_to; # Assemble a single test 281 282 # For each character in the multi-char fold ... 283 for (my $j = 0; $j < $to_chars; $j++) { 284 my $this_char = substr($to, $j, 1); 285 286 # Use its corresponding inverse fold, if available. 287 if ( $i < $folds_to_count[$j] 288 && exists $inverse_folds{$this_char}{1}) 289 { 290 push @this_test_to, ord $inverse_folds{$this_char}{1}[$i]; 291 } 292 else { # Or else itself. 293 push @this_test_to, ord $this_char; 294 } 295 } 296 297 # Add this test to the list 298 push @test_to, [ @this_test_to ]; 299 } 300 301 # Here, have assembled all the tests for the multi-char fold. Sort so 302 # lowest code points are first for consistency and aesthetics in 303 # output. We know there are at least two characters in the fold, but 304 # I haven't bothered to worry about sorting on an optional third 305 # character if the first two are identical. 306 @test_to = sort { ($a->[0] == $b->[0]) 307 ? $a->[1] <=> $b->[1] 308 : $a->[0] <=> $b->[0] 309 } @test_to; 310 } 311 312 313 # This test is from n bytes to m bytes. Record that so won't try to add 314 # another test that does the same. 315 use bytes; 316 my $to_bytes = length $to; 317 foreach my $from_map (@from) { 318 $has_test_by_byte_count{length $from_map}{$to_bytes} = $to; 319 } 320 no bytes; 321 322 my $ord_smallest_from = ord shift @from; 323 if (exists $tests_ref->{$ord_smallest_from}) { 324 die "There are already tests for $ord_smallest_from" 325 }; 326 327 # Add in the fold tests, 328 push @{$tests_ref->{$ord_smallest_from}}, @test_to; 329 330 # Then any remaining froms in the equivalence class. 331 push @{$tests_ref->{$ord_smallest_from}}, map { ord $_ } @from; 332} 333 334# Get the Unicode rules and construct inverse mappings from them 335 336use Unicode::UCD; 337my $file="../lib/unicore/CaseFolding.txt"; 338 339# Use the Unicode data file if we are on an ASCII platform (which its data is 340# for), and it is in the modern format (starting in Unicode 3.1.0) and it is 341# available. This avoids being affected by potential bugs introduced by other 342# layers of Perl 343if ($::IS_ASCII 344 && pack("C*", split /\./, Unicode::UCD::UnicodeVersion()) ge v3.1.0 345 && open my $fh, "<", $file) 346{ 347 # We process the file in reverse order because its easier to see the T 348 # entry first and then know that the next line we process is the 349 # corresponding one for non-T. 350 my @rules = <$fh>; 351 my $prev_was_turkic = 0; 352 while (defined ($_ = pop @rules)) { 353 chomp; 354 355 # Lines look like (though without the initial '#') 356 #0130; F; 0069 0307; # LATIN CAPITAL LETTER I WITH DOT ABOVE 357 358 # Get rid of comments, ignore blank or comment-only lines 359 my $line = $_ =~ s/ (?: \s* \# .* )? $ //rx; 360 next unless length $line; 361 my ($hex_from, $fold_type, @hex_folded) = split /[\s;]+/, $line; 362 363 next if $fold_type eq 'S'; # If Unicode's tables are correct, the F 364 # should be a superset of S 365 next if $fold_type eq 'I'; # Perl doesn't do old Turkish folding 366 367 my $test_type; 368 if ($fold_type eq 'T') { 369 $test_type = 0 + $use_turkic_rules; 370 $prev_was_turkic = 1; 371 } 372 elsif ($prev_was_turkic) { 373 $test_type = 0 + ! $use_turkic_rules; 374 $prev_was_turkic = 0; 375 } 376 else { 377 $test_type = 1; 378 $prev_was_turkic = 0; 379 } 380 381 my $from = hex $hex_from; 382 my @to = map { hex $_ } @hex_folded; 383 push @{$folds{$from}{$test_type}}, @to; 384 385 my $folded_str = pack ("U0U*", @to); 386 push @{$inverse_folds{$folded_str}{$test_type}}, chr $from; 387 } 388} 389else { # Here, can't use the .txt file: read the Unicode rules file and 390 # construct inverse mappings from it 391 392 skip_all "Don't know how to generate turkic rules on this platform" 393 if $use_turkic_rules; 394 my ($invlist_ref, $invmap_ref, undef, $default) 395 = Unicode::UCD::prop_invmap('Case_Folding'); 396 for my $i (0 .. @$invlist_ref - 1 - 1) { 397 next if $invmap_ref->[$i] == $default; 398 399 # Make into an array if not so already, so can treat uniformly below 400 $invmap_ref->[$i] = [ $invmap_ref->[$i] ] if ! ref $invmap_ref->[$i]; 401 402 # Each subsequent element of the range requires adjustment of +1 from 403 # the previous element 404 my $adjust = -1; 405 for my $j ($invlist_ref->[$i] .. $invlist_ref->[$i+1] -1) { 406 $adjust++; 407 my @to = map { $_ + $adjust } @{$invmap_ref->[$i]}; 408 push @{$folds{$j}{1}}, @to; 409 my $folded_str = join "", map { chr } @to; 410 utf8::upgrade($folded_str); 411 #note (sprintf "%d: %04X: %s", __LINE__, $j, join " ", 412 # map { sprintf "%04X", $_ + $adjust } @{$invmap_ref->[$i]}); 413 push @{$inverse_folds{$folded_str}{1}}, chr $j; 414 } 415 } 416} 417 418# Analyze the data and generate tests to get adequate test coverage. We sort 419# things so that smallest code points are done first. 420foreach my $to (sort { $a cmp $b } keys %inverse_folds) 421{ 422TO: 423 foreach my $tests_ref (\%tests, \%neg_tests) { 424 my $test_type = ($tests_ref == \%tests) ? 1 : 0; 425 426 next unless exists $inverse_folds{$to}{$test_type}; 427 428 # Within each fold, sort so that the smallest code points are done first 429 @{$inverse_folds{$to}{$test_type}} = sort { $a cmp $b } @{$inverse_folds{$to}{$test_type}}; 430 my @from = @{$inverse_folds{$to}{$test_type}}; 431 432 # Just add it to the tests if doing complete coverage 433 if (! $skip_apparently_redundant) { 434 add_test($tests_ref, $to, @from); 435 next TO; 436 } 437 438 my $to_chars = length $to; 439 my $to_range_type = range_type(substr($to, 0, 1)); 440 441 # If this is required to be tested, do so. We check for these first, as 442 # they will take up slots of byte-to-byte combinations that we otherwise 443 # would have to have other tests to get. 444 foreach my $from_map (@from) { 445 if (exists $be_sure_to_test{$from_map}) { 446 add_test($tests_ref, $to, @from); 447 next TO; 448 } 449 } 450 451 # If the fold contains heterogeneous range types, is suspect and should be 452 # tested. 453 if ($to_chars > 1) { 454 foreach my $char (split "", $to) { 455 if (range_type($char) != $to_range_type) { 456 add_test($tests_ref, $to, @from); 457 next TO; 458 } 459 } 460 } 461 462 # If the mapping crosses range types, is suspect and should be tested 463 foreach my $from_map (@from) { 464 if (range_type($from_map) != $to_range_type) { 465 add_test($tests_ref, $to, @from); 466 next TO; 467 } 468 } 469 470 # Here, all components of the mapping are in the same range type. For 471 # single character folds, we test one case in each range type that has 2 472 # particpants, 3 particpants, etc. 473 if ($to_chars == 1) { 474 if (! exists $has_test_by_participants{scalar @from}{$to_range_type}) { 475 add_test($tests_ref, $to, @from); 476 $has_test_by_participants{scalar @from}{$to_range_type} = $to; 477 next TO; 478 } 479 } 480 481 # We also test all combinations of mappings from m to n bytes. This is 482 # because the regex optimizer cares. (Don't bother worrying about that 483 # Latin1 chars will occupy a different number of bytes under utf8, as 484 # there are plenty of other cases that catch these byte numbers.) 485 use bytes; 486 my $to_bytes = length $to; 487 foreach my $from_map (@from) { 488 if (! exists $has_test_by_byte_count{length $from_map}{$to_bytes}) { 489 add_test($tests_ref, $to, @from); 490 next TO; 491 } 492 } 493 } 494} 495 496# For each range type, test additionally a character that folds to itself 497add_test(\%tests, ":", ":"); 498add_test(\%tests, chr utf8::unicode_to_native(0xF7), chr utf8::unicode_to_native(0xF7)); 499add_test(\%tests, chr 0x2C7, chr 0x2C7); 500 501# To cut down on the number of tests 502my $has_tested_aa_above_latin1; 503my $has_tested_latin1_aa; 504my $has_tested_ascii_aa; 505my $has_tested_l_above_latin1; 506my $has_tested_above_latin1_l; 507my $has_tested_ascii_l; 508my $has_tested_above_latin1_d; 509my $has_tested_ascii_d; 510my $has_tested_non_latin1_d; 511my $has_tested_above_latin1_a; 512my $has_tested_ascii_a; 513my $has_tested_non_latin1_a; 514 515# For use by pairs() in generating combinations 516sub prefix { 517 my $p = shift; 518 map [ $p, $_ ], @_ 519} 520 521# Returns all ordered combinations of pairs of elements from the input array. 522# It doesn't return pairs like (a, a), (b, b). Change the slice to an array 523# to do that. This was just to have fewer tests. 524sub pairs (@) { 525 #print STDERR __LINE__, ": ", join(" XXX ", map { sprintf "%04X", $_ } @_), "\n"; 526 map { prefix $_[$_], @_[0..$_-1, $_+1..$#_] } 0..$#_ 527} 528 529# Finally ready to do the tests 530foreach my $tests_ref (\%neg_tests, \%tests) { 531foreach my $test (sort { numerically } keys %{$tests_ref}) { 532 533 my $previous_target; 534 my $previous_pattern; 535 my @pairs = pairs(sort numerically $test, @{$tests_ref->{$test}}); 536 537 # Each fold can be viewed as a closure of all the characters that 538 # participate in it. Look at each possible pairing from a closure, with the 539 # first member of the pair the target string to match against, and the 540 # second member forming the pattern. Thus each fold member gets tested as 541 # the string, and the pattern with every other member in the opposite role. 542 while (my $pair = shift @pairs) { 543 my ($target, $pattern) = @$pair; 544 545 # When testing a char that doesn't fold, we can get the same 546 # permutation twice; so skip all but the first. 547 next if $previous_target 548 && $previous_target == $target 549 && $previous_pattern == $pattern; 550 ($previous_target, $previous_pattern) = ($target, $pattern); 551 552 # Each side may be either a single char or a string. Extract each into an 553 # array (perhaps of length 1) 554 my @target, my @pattern; 555 @target = (ref $target) ? @$target : $target; 556 @pattern = (ref $pattern) ? @$pattern : $pattern; 557 558 # We are testing just folds to/from a single character. If our pairs 559 # happens to generate multi/multi, skip. 560 next if @target > 1 && @pattern > 1; 561 562 # Get in hex form. 563 my @x_target = map { sprintf "\\x{%04X}", $_ } @target; 564 my @x_pattern = map { sprintf "\\x{%04X}", $_ } @pattern; 565 566 my $target_above_latin1 = grep { $_ > 255 } @target; 567 my $pattern_above_latin1 = grep { $_ > 255 } @pattern; 568 my $target_has_ascii = grep { utf8::native_to_unicode($_) < 128 } @target; 569 my $pattern_has_ascii = grep { utf8::native_to_unicode($_) < 128 } @pattern; 570 my $target_only_ascii = ! grep { utf8::native_to_unicode($_) > 127 } @target; 571 my $pattern_only_ascii = ! grep { utf8::native_to_unicode($_) > 127 } @pattern; 572 my $target_has_latin1 = grep { $_ < 256 } @target; 573 my $target_has_upper_latin1 574 = grep { $_ < 256 && utf8::native_to_unicode($_) > 127 } @target; 575 my $pattern_has_upper_latin1 576 = grep { $_ < 256 && utf8::native_to_unicode($_) > 127 } @pattern; 577 my $pattern_has_latin1 = grep { $_ < 256 } @pattern; 578 my $is_self = @target == 1 && @pattern == 1 && $target[0] == $pattern[0]; 579 580 # We don't test multi-char folding into other multi-chars. We are testing 581 # a code point that folds to or from other characters. Find the single 582 # code point for diagnostic purposes. (If both are single, choose the 583 # target string) 584 my $ord = @target == 1 ? $target[0] : $pattern[0]; 585 my $progress = sprintf "%04X: \"%s\" and /%s/", 586 $test, 587 join("", @x_target), 588 join("", @x_pattern); 589 #note $progress; 590 591 # Now grind out tests, using various combinations. 592 { 593 my $charset_mod = lc $charset; 594 my $current_locale = ($has_LC_CTYPE) 595 ? setlocale(&POSIX::LC_CTYPE) 596 : 'C'; 597 $current_locale = 'C locale' if $current_locale eq 'C'; 598 $okays = 0; 599 $this_iteration = 0; 600 601 # To cut down somewhat on the enormous quantity of tests this currently 602 # runs, skip some for some of the character sets whose results aren't 603 # likely to differ from others. But run all tests on the code points 604 # that don't fold, plus one other set in each range group. 605 if (! $is_self) { 606 607 # /aa should only affect things with folds in the ASCII range. But, try 608 # it on one set in the other ranges just to make sure it doesn't break 609 # them. 610 if ($charset eq 'aa') { 611 612 # It may be that this $pair of code points to test are both 613 # non-ascii, but if either of them actually fold to ascii, that is 614 # suspect and should be tested. So for /aa, use whether their folds 615 # are ascii or not 616 my $target_has_ascii = $target_has_ascii; 617 my $pattern_has_ascii = $pattern_has_ascii; 618 if (! $target_has_ascii) { 619 foreach my $cp (@target) { 620 if (exists $folds{$cp}{1} 621 && grep { utf8::native_to_unicode($_) < 128 } @{$folds{$cp}{1}} ) 622 { 623 $target_has_ascii = 1; 624 last; 625 } 626 } 627 } 628 if (! $pattern_has_ascii) { 629 foreach my $cp (@pattern) { 630 if (exists $folds{$cp}{1} 631 && grep { utf8::native_to_unicode($_) < 128 } @{$folds{$cp}}{1} ) 632 { 633 $pattern_has_ascii = 1; 634 last; 635 } 636 } 637 } 638 639 if (! $target_has_ascii && ! $pattern_has_ascii) { 640 if ($target_above_latin1 || $pattern_above_latin1) { 641 next if defined $has_tested_aa_above_latin1 642 && $has_tested_aa_above_latin1 != $test; 643 $has_tested_aa_above_latin1 = $test; 644 } 645 next if defined $has_tested_latin1_aa 646 && $has_tested_latin1_aa != $test; 647 $has_tested_latin1_aa = $test; 648 } 649 elsif ($target_only_ascii && $pattern_only_ascii) { 650 651 # And, except for one set just to make sure, skip tests 652 # where both elements in the pair are ASCII. If one works for 653 # aa, the others are likely too. This skips tests where the 654 # fold is from non-ASCII to ASCII, but this part of the test 655 # is just about the ASCII components. 656 next if defined $has_tested_ascii_l 657 && $has_tested_ascii_l != $test; 658 $has_tested_ascii_l = $test; 659 } 660 } 661 elsif ($charset eq 'l') { 662 663 # For l, don't need to test beyond one set those things that are 664 # all above latin1, because unlikely to have different successes 665 # than /u. But, for the same reason as described in the /aa above, 666 # it is suspect and should be tested, if either of the folds are to 667 # latin1. 668 my $target_has_latin1 = $target_has_latin1; 669 my $pattern_has_latin1 = $pattern_has_latin1; 670 if (! $target_has_latin1) { 671 foreach my $cp (@target) { 672 if (exists $folds{$cp}{1} 673 && grep { $_ < 256 } @{$folds{$cp}{1}} ) 674 { 675 $target_has_latin1 = 1; 676 last; 677 } 678 } 679 } 680 if (! $pattern_has_latin1) { 681 foreach my $cp (@pattern) { 682 if (exists $folds{$cp}{1} 683 && grep { $_ < 256 } @{$folds{$cp}{1}} ) 684 { 685 $pattern_has_latin1 = 1; 686 last; 687 } 688 } 689 } 690 if (! $target_has_latin1 && ! $pattern_has_latin1) { 691 next if defined $has_tested_above_latin1_l 692 && $has_tested_above_latin1_l != $test; 693 $has_tested_above_latin1_l = $test; 694 } 695 elsif ($target_only_ascii && $pattern_only_ascii) { 696 697 # And, except for one set just to make sure, skip tests 698 # where both elements in the pair are ASCII. This is 699 # essentially the same reasoning as above for /aa. 700 next if defined $has_tested_ascii_l 701 && $has_tested_ascii_l != $test; 702 $has_tested_ascii_l = $test; 703 } 704 } 705 elsif ($charset eq 'd') { 706 # Similarly for d. Beyond one test (besides self) each, we don't 707 # test pairs that are both ascii; or both above latin1, or are 708 # combinations of ascii and above latin1. 709 if (! $target_has_upper_latin1 && ! $pattern_has_upper_latin1) { 710 if ($target_has_ascii && $pattern_has_ascii) { 711 next if defined $has_tested_ascii_d 712 && $has_tested_ascii_d != $test; 713 $has_tested_ascii_d = $test 714 } 715 elsif (! $target_has_latin1 && ! $pattern_has_latin1) { 716 next if defined $has_tested_above_latin1_d 717 && $has_tested_above_latin1_d != $test; 718 $has_tested_above_latin1_d = $test; 719 } 720 else { 721 next if defined $has_tested_non_latin1_d 722 && $has_tested_non_latin1_d != $test; 723 $has_tested_non_latin1_d = $test; 724 } 725 } 726 } 727 elsif ($charset eq 'a') { 728 # Similarly for a. This should match identically to /u, so wasn't 729 # tested at all until a bug was found that was thereby missed. 730 # As a compromise, beyond one test (besides self) each, we don't 731 # test pairs that are both ascii; or both above latin1, or are 732 # combinations of ascii and above latin1. 733 if (! $target_has_upper_latin1 && ! $pattern_has_upper_latin1) { 734 if ($target_has_ascii && $pattern_has_ascii) { 735 next if defined $has_tested_ascii_a 736 && $has_tested_ascii_a != $test; 737 $has_tested_ascii_a = $test 738 } 739 elsif (! $target_has_latin1 && ! $pattern_has_latin1) { 740 next if defined $has_tested_above_latin1_a 741 && $has_tested_above_latin1_a != $test; 742 $has_tested_above_latin1_a = $test; 743 } 744 else { 745 next if defined $has_tested_non_latin1_a 746 && $has_tested_non_latin1_a != $test; 747 $has_tested_non_latin1_a = $test; 748 } 749 } 750 } 751 } 752 753 foreach my $utf8_target (0, 1) { # Both utf8 and not, for 754 # code points < 256 755 my $upgrade_target = ""; 756 757 # These must already be in utf8 because the string to match has 758 # something above latin1. So impossible to test if to not to be in 759 # utf8; and otherwise, no upgrade is needed. 760 next if $target_above_latin1 && ! $utf8_target; 761 $upgrade_target = ' utf8::upgrade($c);' if ! $target_above_latin1 && $utf8_target; 762 763 foreach my $utf8_pattern (0, 1) { 764 next if $pattern_above_latin1 && ! $utf8_pattern; 765 766 # Our testing of 'l' uses the POSIX locale, which is ASCII-only 767 my $uni_semantics = $charset ne 'l' && ( $utf8_target 768 || $charset eq 'u' 769 || $charset eq 'L' 770 || ($charset eq 'd' && $utf8_pattern) 771 || $charset =~ /a/); 772 my $upgrade_pattern = ""; 773 $upgrade_pattern = ' utf8::upgrade($rhs);' 774 if ! $pattern_above_latin1 && $utf8_pattern; 775 776 my $lhs = join "", @x_target; 777 my $lhs_str = eval qq{"$lhs"}; fail($@) if $@; 778 my @rhs = @x_pattern; 779 my $rhs = join "", @rhs; 780 781 # Unicode created a folding rule that partially emulates what 782 # happens in a Turkish locale, by using combining characters. The 783 # result is close enough to what really should happen, that it can 784 # pass many of the tests, but not all. So, if we have a rule that 785 # is expecting failure, it may pass instead. The code in the block 786 # below is good enough for skipping the tests, and khw tried to make 787 # it general, but should the rules be revised (unlikely at this 788 # point), this might need to be tweaked. 789 if ($tests_ref == \%neg_tests) { 790 my ($shorter_ref, $longer_ref); 791 792 # Convert the $rhs to a string, like we already did for the lhs 793 my $rhs_str = eval qq{"$rhs"}; fail($@) if $@; 794 795 # If the lengths of the two sides are equal, we don't want to do 796 # this; this is only to bypass the combining characters affecting 797 # things 798 if (length $lhs_str != length $rhs_str) { 799 800 # Find the shorter and longer of the pair 801 if (length $lhs_str < length $rhs_str) { 802 $shorter_ref = \$lhs_str; 803 $longer_ref = \$rhs_str; 804 } 805 else { 806 $shorter_ref = \$rhs_str; 807 $longer_ref = \$lhs_str; 808 } 809 810 # If the shorter string is entirely contained in the longer, we 811 # have generated a test that is likely to succeed, and the 812 # reasons it would fail have nothing to do with folding. But we 813 # are expecting it to fail, and so our test is invalid. Skip 814 # it. 815 next if index($$longer_ref, $$shorter_ref) >= 0; 816 817 818 # The above eliminates about half the failure cases. This gets 819 # the rest. If the shorter string is a single character and has 820 # a fold legal in this run to a character that is in the longer 821 # string, it is also likely to succeed under /i. So again our 822 # computed test is bogus. 823 if ( length $$shorter_ref == 1 824 && exists $folds{ord $$shorter_ref}{1}) 825 { 826 my @folded_to = @{$folds{ord $$shorter_ref}{1}}; 827 next if @folded_to == 1 828 && index($$longer_ref, chr $folded_to[0]) >= 0; 829 } 830 } 831 } 832 833 my $should_fail = (! $uni_semantics && $ord < 256 && ! $is_self && utf8::native_to_unicode($ord) >= 128) 834 || ($charset eq 'aa' && $target_has_ascii != $pattern_has_ascii) 835 || ($charset eq 'l' && $target_has_latin1 != $pattern_has_latin1) 836 || $tests_ref == \%neg_tests; 837 838 # Do simple tests of referencing capture buffers, named and 839 # numbered. 840 my $op = '=~'; 841 $op = '!~' if $should_fail; 842 843 my $todo = 0; # No longer any todo's 844 my $eval = "my \$c = \"$lhs$rhs\"; my \$rhs = \"$rhs\"; " 845 . $upgrade_pattern 846 . " my \$p = qr/(?$charset_mod:^(\$rhs)\\1\$)/i;" 847 . "$upgrade_target \$c $op \$p"; 848 run_test($eval, $todo, ($charset_mod eq 'l'), ""); 849 850 $eval = "my \$c = \"$lhs$rhs\"; my \$rhs = \"$rhs\"; " 851 . $upgrade_pattern 852 . " my \$p = qr/(?$charset_mod:^(?<grind>\$rhs)\\k<grind>\$)/i;" 853 . "$upgrade_target \$c $op \$p"; 854 run_test($eval, $todo, ($charset_mod eq 'l'), ""); 855 856 if ($lhs ne $rhs) { 857 $eval = "my \$c = \"$rhs$lhs\"; my \$rhs = \"$rhs\"; " 858 . $upgrade_pattern 859 . " my \$p = qr/(?$charset_mod:^(\$rhs)\\1\$)/i;" 860 . "$upgrade_target \$c $op \$p"; 861 run_test($eval, "", ($charset_mod eq 'l'), ""); 862 863 $eval = "my \$c = \"$rhs$lhs\"; my \$rhs = \"$rhs\"; " 864 . $upgrade_pattern 865 . " my \$p = qr/(?$charset_mod:^(?<grind>\$rhs)\\k<grind>\$)/i;" 866 . "$upgrade_target \$c $op \$p"; 867 run_test($eval, "", ($charset_mod eq 'l'), ""); 868 } 869 870 # See if works on what could be a simple trie. 871 my $alternate; 872 { 873 # Keep the alternate | branch the same length as the tested one so 874 # that it's length doesn't influence things 875 my $evaled = eval "\"$rhs\""; # Convert e.g. \x{foo} into its 876 # chr equivalent 877 use bytes; 878 $alternate = 'q' x length $evaled; 879 } 880 $eval = "my \$c = \"$lhs\"; my \$rhs = \"$rhs\"; " 881 . $upgrade_pattern 882 . " my \$p = qr/\$rhs|$alternate/i$charset_mod;" 883 . "$upgrade_target \$c $op \$p"; 884 run_test($eval, "", ($charset_mod eq 'l'), ""); 885 886 # Check that works when the folded character follows something that 887 # is quantified. This test knows the regex code internals to the 888 # extent that it knows this is a potential problem, and that there 889 # are three different types of quantifiers generated: 1) The thing 890 # being quantified matches a single character; 2) it matches more 891 # than one character, but is fixed width; 3) it can match a variable 892 # number of characters. (It doesn't know that case 3 shouldn't 893 # matter, since it doesn't do anything special for the character 894 # following the quantifier; nor that some of the different 895 # quantifiers execute the same underlying code, as these tests are 896 # quick, and this insulates these tests from changes in the 897 # implementation.) 898 for my $quantifier ('?', '??', '*', '*?', '+', '+?', '{1,2}', '{1,2}?') { 899 $eval = "my \$c = \"_$lhs\"; my \$rhs = \"$rhs\"; $upgrade_pattern " 900 . "my \$p = qr/(?$charset_mod:.$quantifier\$rhs)/i;" 901 . "$upgrade_target \$c $op \$p"; 902 run_test($eval, "", ($charset_mod eq 'l'), ""); 903 $eval = "my \$c = \"__$lhs\"; my \$rhs = \"$rhs\"; $upgrade_pattern " 904 . "my \$p = qr/(?$charset_mod:(?:..)$quantifier\$rhs)/i;" 905 . "$upgrade_target \$c $op \$p"; 906 run_test($eval, "", ($charset_mod eq 'l'), ""); 907 $eval = "my \$c = \"__$lhs\"; my \$rhs = \"$rhs\"; $upgrade_pattern " 908 . "my \$p = qr/(?$charset_mod:(?:.|\\R)$quantifier\$rhs)/i;" 909 . "$upgrade_target \$c $op \$p"; 910 run_test($eval, "", ($charset_mod eq 'l'), ""); 911 } 912 913 foreach my $bracketed (0, 1) { # Put rhs in [...], or not 914 next if $bracketed && @pattern != 1; # bracketed makes these 915 # or's instead of a sequence 916 foreach my $optimize_bracketed (0, 1) { 917 next if $optimize_bracketed && ! $bracketed; 918 foreach my $inverted (0,1) { 919 next if $inverted && ! $bracketed; # inversion only valid 920 # in [^...] 921 next if $inverted && @target != 1; # [perl #89750] multi-char 922 # not valid in [^...] 923 924 # In some cases, add an extra character that doesn't fold, and 925 # looks ok in the output. 926 my $extra_char = "_"; 927 foreach my $prepend ("", $extra_char) { 928 foreach my $append ("", $extra_char) { 929 930 # Assemble the rhs. Put each character in a separate 931 # bracketed if using charclasses. This creates a stress on 932 # the code to span a match across multiple elements 933 my $rhs = ""; 934 foreach my $rhs_char (@rhs) { 935 $rhs .= '[' if $bracketed; 936 $rhs .= '^' if $inverted; 937 $rhs .= $rhs_char; 938 939 # Add a character to the class, so class doesn't get 940 # optimized out, unless we are testing that optimization 941 $rhs .= '_' if $optimize_bracketed; 942 $rhs .= ']' if $bracketed; 943 } 944 945 # Add one of: no capturing parens 946 # a single set 947 # a nested set 948 # Use quantifiers and extra variable width matches inside 949 # them to keep some optimizations from happening 950 foreach my $parend (0, 1, 2) { 951 my $interior = (! $parend) 952 ? $rhs 953 : ($parend == 1) 954 ? "(${rhs},?)" 955 : "((${rhs})+,?)"; 956 foreach my $quantifier ("", '?', '*', '+', '{1,3}') { 957 958 # Perhaps should be TODOs, as are unimplemented, but 959 # maybe will never be implemented 960 next if @pattern != 1 && $quantifier; 961 962 # A ? or * quantifier normally causes the thing to be 963 # able to match a null string 964 my $quantifier_can_match_null = $quantifier eq '?' 965 || $quantifier eq '*'; 966 967 # But since we only quantify the last character in a 968 # multiple fold, the other characters will have width, 969 # except if we are quantifying the whole rhs 970 my $can_match_null = $quantifier_can_match_null 971 && (@rhs == 1 || $parend); 972 973 foreach my $l_anchor ("", '^') { # '\A' didn't change 974 # result) 975 foreach my $r_anchor ("", '$') { # '\Z', '\z' didn't 976 # change result) 977 # The folded part can match the null string if it 978 # isn't required to have width, and there's not 979 # something on one or both sides that force it to. 980 my $both_sides = ($l_anchor && $r_anchor) 981 || ($l_anchor && $append) 982 || ($r_anchor && $prepend) 983 || ($prepend && $append); 984 my $must_match = ! $can_match_null || $both_sides; 985 # for performance, but doing this missed many failures 986 #next unless $must_match; 987 my $quantified = "(?$charset_mod:$l_anchor$prepend" 988 . "$interior${quantifier}$append$r_anchor)"; 989 my $op; 990 if ($must_match && $should_fail) { 991 $op = 0; 992 } else { 993 $op = 1; 994 } 995 $op = ! $op if $must_match && $inverted; 996 997 if ($inverted && @target > 1) { 998 # When doing an inverted match against a 999 # multi-char target, and there is not something on 1000 # the left to anchor the match, if it shouldn't 1001 # succeed, skip, as what will happen (when working 1002 # correctly) is that it will match the first 1003 # position correctly, and then be inverted to not 1004 # match; then it will go to the second position 1005 # where it won't match, but get inverted to match, 1006 # and hence succeeding. 1007 next if ! ($l_anchor || $prepend) && ! $op; 1008 1009 # Can't ever match for latin1 code points non-uni 1010 # semantics that have a inverted multi-char fold 1011 # when there is something on both sides and the 1012 # quantifier isn't such as to span the required 1013 # width, which is 2 or 3. 1014 $op = 0 if $ord < 255 1015 && ! $uni_semantics 1016 && $both_sides 1017 && ( ! $quantifier || $quantifier eq '?') 1018 && $parend < 2; 1019 1020 # Similarly can't ever match when inverting a 1021 # multi-char fold for /aa and the quantifier 1022 # isn't sufficient to allow it to span to both 1023 # sides. 1024 $op = 0 if $target_has_ascii 1025 && $charset eq 'aa' 1026 && $both_sides 1027 && ( ! $quantifier || $quantifier eq '?') 1028 && $parend < 2; 1029 1030 # Or for /l 1031 $op = 0 if $target_has_latin1 && $charset eq 'l' 1032 && $both_sides 1033 && ( ! $quantifier || $quantifier eq '?') 1034 && $parend < 2; 1035 } 1036 1037 1038 my $desc = ""; 1039 if ($charset_mod eq 'l') { 1040 $desc .= 'setlocale(LC_CTYPE, "' 1041 . POSIX::setlocale(&POSIX::LC_CTYPE) 1042 . '"); ' 1043 } 1044 $desc .= "my \$c = \"$prepend$lhs$append\"; " 1045 . "my \$rhs = \"\"; $upgrade_pattern" 1046 . "my \$p = qr/$quantified\$rhs/i;" 1047 . "$upgrade_target " 1048 . "\$c " . ($op ? "=~" : "!~") . " \$p; "; 1049 if ($DEBUG) { 1050 $desc .= ( 1051 "; uni_semantics=$uni_semantics, " 1052 . "should_fail=$should_fail, " 1053 . "bracketed=$bracketed, " 1054 . "prepend=$prepend, " 1055 . "append=$append, " 1056 . "parend=$parend, " 1057 . "quantifier=$quantifier, " 1058 . "l_anchor=$l_anchor, " 1059 . "r_anchor=$r_anchor; " 1060 . "pattern_above_latin1=$pattern_above_latin1; " 1061 . "utf8_pattern=$utf8_pattern" 1062 ); 1063 } 1064 1065 my $c = "$prepend$lhs_str$append"; 1066 my $p = "$quantified"; # string copy deliberate 1067 utf8::upgrade($c) if length($upgrade_target); 1068 utf8::upgrade($p) if length($upgrade_pattern); 1069 $p = qr/$p/i; 1070 my $res = $op ? ($c =~ $p): ($c !~ $p); 1071 1072 if (!$res || $list_all_tests) { 1073 # Failed or debug; output the result 1074 $count++; 1075 ok($res, "test $count - $desc"); 1076 } else { 1077 # Just count the test as passed 1078 $okays++; 1079 } 1080 $this_iteration++; 1081 } 1082 } 1083 } 1084 } 1085 } 1086 } 1087 } 1088 } 1089 } 1090 } 1091 } 1092 unless($list_all_tests) { 1093 $count++; 1094 is $okays, $this_iteration, "$okays subtests ok for" 1095 . " /$charset_mod" 1096 . (($charset_mod eq 'l') ? " ($current_locale)" : "") 1097 . ', target="' . join("", @x_target) . '",' 1098 . ' pat="' . join("", @x_pattern) . '"'; 1099 } 1100 } 1101 } 1102} 1103} 1104 1105plan($count); 1106 11071 1108