1use strict; 2use warnings; 3 4# re/fold_grind.t has more complex tests, but doesn't test every fold 5# This file also tests the fc() keyword. 6 7BEGIN { 8 chdir 't' if -d 't'; 9 @INC = '../lib'; 10 require Config; import Config; 11 require './test.pl'; 12 require './loc_tools.pl'; # Contains find_utf8_ctype_locale() 13} 14 15use feature 'unicode_strings'; 16use Unicode::UCD qw(all_casefolds); 17 18binmode *STDOUT, ":utf8"; 19 20our $TODO; 21 22 23plan("no_plan"); 24# Read in the official case folding definitions. 25my $casefolds = all_casefolds(); 26my @folds; 27my @CF; 28my @simple_folds; 29my %reverse_fold; 30use Unicode::UCD; 31use charnames(); 32 33foreach my $decimal_code_point (sort { $a <=> $b } keys %$casefolds) { 34 # We only use simple folds in fc(), since the regex engine uses full case 35 # folding. 36 37 my $name = charnames::viacode($decimal_code_point); 38 my $type = $casefolds->{$decimal_code_point}{'status'}; 39 my $code = $casefolds->{$decimal_code_point}{'code'}; 40 my $simple = $casefolds->{$decimal_code_point}{'simple'}; 41 my $full = $casefolds->{$decimal_code_point}{'full'}; 42 43 if ($simple && $simple ne $full) { # If there is a distinction 44 push @simple_folds, [ $code, $simple, $type, $name ]; 45 } 46 47 push @CF, [ $code, $full, $type, $name ]; 48 49 # Get the inverse fold for single-char mappings. 50 $reverse_fold{pack "U0U*", hex $simple} = pack "U0U*", $decimal_code_point if $simple; 51} 52 53foreach my $test_ref ( @simple_folds ) { 54 use feature 'fc'; 55 my ($code, $mapping, $type, $name) = @$test_ref; 56 my $c = pack("U0U*", hex $code); 57 my $f = pack("U0U*", map { hex } split " ", $mapping); 58 59 my $against = join "", "qq{", map("\\x{$_}", split " ", $mapping), "}"; 60 { 61 isnt(fc($c), $f, "$code - $name - $mapping - $type - Full casefolding, fc(\\x{$code}) ne $against"); 62 isnt("\F$c", $f, "$code - $name - $mapping - $type - Full casefolding, qq{\\F\\x{$code}} ne $against"); 63 } 64} 65 66foreach my $test_ref (@CF) { 67 my ($code, $mapping, $type, $name) = @$test_ref; 68 my $c = pack("U0U*", hex $code); 69 my $f = pack("U0U*", map { hex } split " ", $mapping); 70 my $f_length = length $f; 71 foreach my $test ( 72 qq[":$c:" =~ /:$c:/], 73 qq[":$c:" =~ /:$c:/i], 74 qq[":$c:" =~ /:[_$c]:/], # Place two chars in [] so doesn't get 75 # optimized to a non-charclass 76 qq[":$c:" =~ /:[_$c]:/i], 77 qq[":$c:" =~ /:$f:/i], 78 qq[":$f:" =~ /:$c:/i], 79 ) { 80 ok eval $test, "$code - $name - $mapping - $type - $test"; 81 } 82 83 { 84 # fc() tests 85 my $against = join "", "qq{", map("\\x{$_}", split " ", $mapping), "}"; 86 is(CORE::fc($c), $f, 87 "$code - $name - $mapping - $type - fc(\\x{$code}) eq $against"); 88 is("\F$c", $f, "$code - $name - $mapping - $type - qq{\\F\\x{$code}} eq $against"); 89 90 # And here we test bytes. For [A-Za-z0-9], the fold is the same as lc under 91 # bytes. For everything else, it's the bytes that formed the original string. 92 if ( $c =~ /[A-Za-z0-9]/ ) { 93 use bytes; 94 is(CORE::fc($c), lc($c), "$code - $name - fc and use bytes, ascii"); 95 } else { 96 my $copy = "" . $c; 97 utf8::encode($copy); 98 is($copy, do { use bytes; CORE::fc($c) }, "$code - $name - fc and use bytes"); 99 } 100 } 101 # Certain tests weren't convenient to put in the list above since they are 102 # TODO's in multi-character folds. 103 if ($f_length == 1) { 104 105 # The qq loses the utf8ness of ":$f:". These tests are not about 106 # finding bugs in utf8ness, so make sure it's utf8. 107 my $test = qq[my \$s = ":$f:"; utf8::upgrade(\$s); \$s =~ /:[_$c]:/i]; 108 ok eval $test, "$code - $name - $mapping - $type - $test"; 109 $test = qq[":$c:" =~ /:[_$f]:/i]; 110 ok eval $test, "$code - $name - $mapping - $type - $test"; 111 } 112 else { 113 114 # There are two classes of multi-char folds that need more work. For 115 # example, 116 # ":ß:" =~ /:[_s]{2}:/i 117 # ":ss:" =~ /:[_ß]:/i 118 # 119 # Some of the old tests for the second case happened to pass somewhat 120 # coincidentally. But none would pass if changed to this. 121 # ":SS:" =~ /:[_ß]:/i 122 # 123 # As the capital SS doesn't get folded. When those pass, it means 124 # that the code has been changed to take into account folding in the 125 # string, and all should pass, capitalized or not (this wouldn't be 126 # true for [^complemented character classes], for which the fold case 127 # is better, but these aren't used in this .t currently. So, what is 128 # done is to essentially upper-case the string for this class (but use 129 # the reverse fold not uc(), as that is more correct) 130 my $u; 131 for my $i (0 .. $f_length - 1) { 132 my $cur_char = substr($f, $i, 1); 133 $u .= $reverse_fold{$cur_char} || $cur_char; 134 } 135 my $test; 136 137 # A multi-char fold should not match just one char; 138 # e.g., ":ß:" !~ /:[_s]:/i 139 $test = qq[":$c:" !~ /:[_$f]:/i]; 140 ok eval $test, "$code - $name - $mapping - $type - $test"; 141 142 TODO: { # e.g., ":ß:" =~ /:[_s]{2}:/i 143 local $TODO = 'Multi-char fold in [character class]'; 144 145 $test = qq[":$c:" =~ /:[_$f]{$f_length}:/i]; 146 ok eval $test, "$code - $name - $mapping - $type - $test"; 147 } 148 149 # e.g., ":SS:" =~ /:[_ß]:/i now pass, so TODO has been removed, but 150 # since they use '$u', they are left out of the main loop 151 $test = qq[ my \$s = ":$u:"; utf8::upgrade(\$s); \$s =~ /:[_$c]:/i]; 152 ok eval $test, "$code - $name - $mapping - $type - $test"; 153 } 154} 155 156{ 157 use utf8; 158 use feature qw(fc); 159 # These three come from the ICU project's test suite, more especifically 160 # http://icu.sourcearchive.com/documentation/4.4~rc1-1/strcase_8cpp-source.html 161 162 my $s = "A\N{U+00df}\N{U+00b5}\N{U+fb03}\N{U+1040C}\N{U+0130}\N{U+0131}"; 163 #\N{LATIN CAPITAL LETTER A}\N{LATIN SMALL LETTER SHARP S}\N{MICRO SIGN}\N{LATIN SMALL LIGATURE FFI}\N{DESERET CAPITAL LETTER AY}\N{LATIN CAPITAL LETTER I WITH DOT ABOVE}\N{LATIN SMALL LETTER DOTLESS I} 164 165 my $f = "ass\N{U+03bc}ffi\N{U+10434}i\N{U+0307}\N{U+0131}"; 166 #\N{LATIN SMALL LETTER A}\N{LATIN SMALL LETTER S}\N{LATIN SMALL LETTER S}\N{GREEK SMALL LETTER MU}\N{LATIN SMALL LETTER F}\N{LATIN SMALL LETTER F}\N{LATIN SMALL LETTER I}\N{DESERET SMALL LETTER AY}\N{LATIN SMALL LETTER I}\N{COMBINING DOT ABOVE}\N{LATIN SMALL LETTER DOTLESS I} 167 168 is(fc($s), $f, "ICU's casefold test passes"); 169 is("\F$s", $f, "ICU's casefold test passes"); 170 171 is( fc("aBİIıϐßffi"), "abi̇iıβssffi" ); 172 is( "\FaBİIıϐßffi", "abi̇iıβssffi" ); 173# TODO: { 174# local $::TODO = "turkic special cases"; 175# is( fc "aBİIıϐßffi", "abiııβssffi" ); 176# } 177 178 # The next batch come from http://www.devdaily.com/java/jwarehouse/lucene/contrib/icu/src/test/org/apache/lucene/analysis/icu/TestICUFoldingFilter.java.shtml 179 # Except the article got most casings wrong. Or maybe Lucene does. 180 181 is( fc("This is a test"), "this is a test" ); 182 is( fc("Ruß"), "russ" ); 183 is( fc("ΜΆΪΟΣ"), "μάϊοσ" ); 184 is( fc("Μάϊος"), "μάϊοσ" ); 185 is( fc(""), "" ); 186 is( fc("r\xe9sum\xe9"), "r\xe9sum\xe9" ); 187 is( fc("re\x{0301}sume\x{0301}"), "re\x{301}sume\x{301}" ); 188 is( fc("ELİF"), "eli\x{307}f" ); 189 is( fc("eli\x{307}f"), "eli\x{307}f"); 190 191 # This batch comes from 192 # http://www.java2s.com/Open-Source/Java-Document/Internationalization-Localization/icu4j/com/ibm/icu/dev/test/lang/UCharacterCaseTest.java.htm 193 # Which uses ICU as the backend. 194 195 my @folding_mixed = ( 196 "\x{61}\x{42}\x{130}\x{49}\x{131}\x{3d0}\x{df}\x{fb03}", 197 "A\x{df}\x{b5}\x{fb03}\x{1040C}\x{130}\x{131}", 198 ); 199 200 my @folding_default = ( 201 "\x{61}\x{62}\x{69}\x{307}\x{69}\x{131}\x{3b2}\x{73}\x{73}\x{66}\x{66}\x{69}", 202 "ass\x{3bc}ffi\x{10434}i\x{307}\x{131}", 203 ); 204 205 my @folding_exclude_turkic = ( 206 "\x{61}\x{62}\x{69}\x{131}\x{131}\x{3b2}\x{73}\x{73}\x{66}\x{66}\x{69}", 207 "ass\x{3bc}ffi\x{10434}i\x{131}", 208 ); 209 210 is( fc($folding_mixed[1]), $folding_default[1] ); 211 212 is( fc($folding_mixed[0]), $folding_default[0] ); 213 214} 215 216{ 217 use utf8; 218 # Table stolen from tchrist's mail in 219 # http://bugs.python.org/file23051/casing-tests.py 220 # and http://98.245.80.27/tcpc/OSCON2011/case-test.python3 221 # For reference, it's a longer version of what he posted here: 222 # http://stackoverflow.com/questions/6991038/case-insensitive-storage-and-unicode-compatibility 223 224 #Couple of repeats because I'm lazy, not tchrist's fault. 225 226 #This should probably go in t/op/lc.t 227 228 my @test_table = ( 229# ORIG LC_SIMPLE TC_SIMPLE UC_SIMPLE LC_FULL TC_FULL UC_FULL FC_SIMPLE FC_TURKIC FC_FULL 230[ 'þǽr rihtes', 'þǽr rihtes', 'Þǽr Rihtes', 'ÞǼR RIHTES', 'þǽr rihtes', 'Þǽr Rihtes', 'ÞǼR RIHTES', 'þǽr rihtes', 'þǽr rihtes', 'þǽr rihtes', ], 231[ 'duȝeðlice', 'duȝeðlice', 'Duȝeðlice', 'DUȜEÐLICE', 'duȝeðlice', 'Duȝeðlice', 'DUȜEÐLICE', 'duȝeðlice', 'duȝeðlice', 'duȝeðlice', ], 232[ 'Ævar Arnfjörð Bjarmason', 'ævar arnfjörð bjarmason', 'Ævar Arnfjörð Bjarmason', 'ÆVAR ARNFJÖRÐ BJARMASON', 'ævar arnfjörð bjarmason', 'Ævar Arnfjörð Bjarmason', 'ÆVAR ARNFJÖRÐ BJARMASON', 'ævar arnfjörð bjarmason', 'ævar arnfjörð bjarmason', 'ævar arnfjörð bjarmason', ], 233[ 'Кириллица', 'кириллица', 'Кириллица', 'КИРИЛЛИЦА', 'кириллица', 'Кириллица', 'КИРИЛЛИЦА', 'кириллица', 'кириллица', 'кириллица', ], 234[ 'ij', 'ij', 'IJ', 'IJ', 'ij', 'IJ', 'IJ', 'ij', 'ij', 'ij', ], 235[ 'Van Dijke', 'van dijke', 'Van Dijke', 'VAN DIJKE', 'van dijke', 'Van Dijke', 'VAN DIJKE', 'van dijke', 'van dijke', 'van dijke', ], 236[ 'VAN DIJKE', 'van dijke', 'Van Dijke', 'VAN DIJKE', 'van dijke', 'Van Dijke', 'VAN DIJKE', 'van dijke', 'van dijke', 'van dijke', ], 237[ 'efficient', 'efficient', 'Efficient', 'EffiCIENT', 'efficient', 'Efficient', 'EFFICIENT', 'efficient', 'efficient', 'efficient', ], 238[ 'flour', 'flour', 'flour', 'flOUR', 'flour', 'Flour', 'FLOUR', 'flour', 'flour', 'flour', ], 239[ 'flour and water', 'flour and water', 'flour And Water', 'flOUR AND WATER', 'flour and water', 'Flour And Water', 'FLOUR AND WATER', 'flour and water', 'flour and water', 'flour and water', ], 240[ 'dzur', 'dzur', 'Dzur', 'DZUR', 'dzur', 'Dzur', 'DZUR', 'dzur', 'dzur', 'dzur', ], 241[ 'Dzur', 'dzur', 'Dzur', 'DZUR', 'dzur', 'Dzur', 'DZUR', 'dzur', 'dzur', 'dzur', ], 242[ 'DZUR', 'dzur', 'Dzur', 'DZUR', 'dzur', 'Dzur', 'DZUR', 'dzur', 'dzur', 'dzur', ], 243[ 'dzur mountain', 'dzur mountain', 'Dzur Mountain', 'DZUR MOUNTAIN', 'dzur mountain', 'Dzur Mountain', 'DZUR MOUNTAIN', 'dzur mountain', 'dzur mountain', 'dzur mountain', ], 244[ 'Dzur Mountain', 'dzur mountain', 'Dzur Mountain', 'DZUR MOUNTAIN', 'dzur mountain', 'Dzur Mountain', 'DZUR MOUNTAIN', 'dzur mountain', 'dzur mountain', 'dzur mountain', ], 245[ 'DZUR MOUNTAIN', 'dzur mountain', 'Dzur Mountain', 'DZUR MOUNTAIN', 'dzur mountain', 'Dzur Mountain', 'DZUR MOUNTAIN', 'dzur mountain', 'dzur mountaın', 'dzur mountain', ], 246[ 'poſt', 'poſt', 'Poſt', 'POST', 'poſt', 'Poſt', 'POST', 'post', 'post', 'post', ], 247[ 'poſt', 'poſt', 'Poſt', 'POſt', 'poſt', 'Poſt', 'POST', 'poſt', 'post', 'post', ], 248[ 'ſtop', 'ſtop', 'ſtop', 'ſtOP', 'ſtop', 'Stop', 'STOP', 'ſtop', 'stop', 'stop', ], 249[ 'tschüß', 'tschüß', 'Tschüß', 'TSCHÜß', 'tschüß', 'Tschüß', 'TSCHÜSS', 'tschüß', 'tschüss', 'tschüss', ], 250[ 'TSCHÜẞ', 'tschüß', 'Tschüß', 'TSCHÜẞ', 'tschüß', 'Tschüß', 'TSCHÜẞ', 'tschüß', 'tschüss', 'tschüss', ], 251[ 'weiß', 'weiß', 'Weiß', 'WEIß', 'weiß', 'Weiß', 'WEISS', 'weiß', 'weiss', 'weiss', ], 252[ 'WEIẞ', 'weiß', 'Weiß', 'WEIẞ', 'weiß', 'Weiß', 'WEIẞ', 'weiß', 'weıss', 'weiss', ], 253[ 'ẞIEW', 'ßiew', 'ẞiew', 'ẞIEW', 'ßiew', 'ẞiew', 'ẞIEW', 'ßiew', 'ssıew', 'ssiew', ], 254[ 'ᾲ', 'ᾲ', 'Ὰͅ', 'ᾺΙ', 'ᾲ', 'Ὰͅ', 'ᾺΙ', 'ὰι', 'ὰι', 'ὰι', ], 255[ 'Ὰι', 'ὰι', 'Ὰι', 'ᾺΙ', 'ὰι', 'Ὰι', 'ᾺΙ', 'ὰι', 'ὰι', 'ὰι', ], 256[ 'ᾺΙ', 'ὰι', 'Ὰι', 'ᾺΙ', 'ὰι', 'Ὰι', 'ᾺΙ', 'ὰι', 'ὰι', 'ὰι', ], 257[ 'ᾲ', 'ᾲ', 'ᾲ', 'ᾲ', 'ᾲ', 'Ὰͅ', 'ᾺΙ', 'ᾲ', 'ὰι', 'ὰι', ], 258[ 'Ὰͅ', 'ᾲ', 'Ὰͅ', 'ᾺΙ', 'ᾲ', 'Ὰͅ', 'ᾺΙ', 'ὰι', 'ὰι', 'ὰι', ], 259[ 'ᾺΙ', 'ὰι', 'Ὰι', 'ᾺΙ', 'ὰι', 'Ὰι', 'ᾺΙ', 'ὰι', 'ὰι', 'ὰι', ], 260[ 'ᾲ στο διάολο', 'ᾲ στο διάολο', 'ᾲ Στο Διάολο', 'ᾲ ΣΤΟ ΔΙΆΟΛΟ', 'ᾲ στο διάολο', 'Ὰͅ Στο Διάολο', 'ᾺΙ ΣΤΟ ΔΙΆΟΛΟ', 'ᾲ στο διάολο', 'ὰι στο διάολο', 'ὰι στο διάολο', ], 261[ 'ᾲ στο διάολο', 'ᾲ στο διάολο', 'Ὰͅ Στο Διάολο', 'ᾺΙ ΣΤΟ ΔΙΆΟΛΟ', 'ᾲ στο διάολο', 'Ὰͅ Στο Διάολο', 'ᾺΙ ΣΤΟ ΔΙΆΟΛΟ', 'ὰι στο διάολο', 'ὰι στο διάολο', 'ὰι στο διάολο', ], 262[ '', '', '', '', '', '', '', '', '', '', ], 263[ '', '', '', '', '', '', '', '', '', '', ], 264[ '', '', '', '', '', '', '', '', '', '', ], 265[ 'henry ⅷ', 'henry ⅷ', 'Henry Ⅷ', 'HENRY Ⅷ', 'henry ⅷ', 'Henry Ⅷ', 'HENRY Ⅷ', 'henry ⅷ', 'henry ⅷ', 'henry ⅷ', ], 266[ 'Henry Ⅷ', 'henry ⅷ', 'Henry Ⅷ', 'HENRY Ⅷ', 'henry ⅷ', 'Henry Ⅷ', 'HENRY Ⅷ', 'henry ⅷ', 'henry ⅷ', 'henry ⅷ', ], 267[ 'HENRY Ⅷ', 'henry ⅷ', 'Henry Ⅷ', 'HENRY Ⅷ', 'henry ⅷ', 'Henry Ⅷ', 'HENRY Ⅷ', 'henry ⅷ', 'henry ⅷ', 'henry ⅷ', ], 268[ 'i work at ⓚ', 'i work at ⓚ', 'I Work At Ⓚ', 'I WORK AT Ⓚ', 'i work at ⓚ', 'I Work At Ⓚ', 'I WORK AT Ⓚ', 'i work at ⓚ', 'i work at ⓚ', 'i work at ⓚ', ], 269[ 'I Work At Ⓚ', 'i work at ⓚ', 'I Work At Ⓚ', 'I WORK AT Ⓚ', 'i work at ⓚ', 'I Work At Ⓚ', 'I WORK AT Ⓚ', 'i work at ⓚ', 'ı work at ⓚ', 'i work at ⓚ', ], 270[ 'I WORK AT Ⓚ', 'i work at ⓚ', 'I Work At Ⓚ', 'I WORK AT Ⓚ', 'i work at ⓚ', 'I Work At Ⓚ', 'I WORK AT Ⓚ', 'i work at ⓚ', 'ı work at ⓚ', 'i work at ⓚ', ], 271[ 'istambul', 'istambul', 'Istambul', 'ISTAMBUL', 'istambul', 'Istambul', 'ISTAMBUL', 'istambul', 'istambul', 'istambul', ], 272[ 'i̇stanbul', 'i̇stanbul', 'İstanbul', 'İSTANBUL', 'i̇stanbul', 'İstanbul', 'İSTANBUL', 'i̇stanbul', 'i̇stanbul', 'i̇stanbul', ], 273[ 'İstanbul', 'i̇stanbul', 'İstanbul', 'İSTANBUL', 'i̇stanbul', 'İstanbul', 'İSTANBUL', 'i̇stanbul', 'ı̇stanbul', 'i̇stanbul', ], 274[ 'İSTANBUL', 'istanbul', 'İstanbul', 'İSTANBUL', 'i̇stanbul', 'İstanbul', 'İSTANBUL', 'İstanbul', 'istanbul', 'i̇stanbul', ], 275[ 'στιγμας', 'στιγμας', 'Στιγμας', 'ΣΤΙΓΜΑΣ', 'στιγμας', 'Στιγμας', 'ΣΤΙΓΜΑΣ', 'στιγμασ', 'στιγμασ', 'στιγμασ', ], 276[ 'στιγμασ', 'στιγμασ', 'Στιγμασ', 'ΣΤΙΓΜΑΣ', 'στιγμασ', 'Στιγμασ', 'ΣΤΙΓΜΑΣ', 'στιγμασ', 'στιγμασ', 'στιγμασ', ], 277[ 'ΣΤΙΓΜΑΣ', 'στιγμασ', 'Στιγμασ', 'ΣΤΙΓΜΑΣ', 'στιγμασ', 'Στιγμασ', 'ΣΤΙΓΜΑΣ', 'στιγμασ', 'στιγμασ', 'στιγμασ', ], 278[ 'ʀᴀʀᴇ', 'ʀᴀʀᴇ', 'Ʀᴀʀᴇ', 'ƦᴀƦᴇ', 'ʀᴀʀᴇ', 'Ʀᴀʀᴇ', 'ƦᴀƦᴇ', 'ʀᴀʀᴇ', 'ʀᴀʀᴇ', 'ʀᴀʀᴇ', ], 279[ 'Ʀᴀʀᴇ', 'ʀᴀʀᴇ', 'Ʀᴀʀᴇ', 'ƦᴀƦᴇ', 'ʀᴀʀᴇ', 'Ʀᴀʀᴇ', 'ƦᴀƦᴇ', 'ʀᴀʀᴇ', 'ʀᴀʀᴇ', 'ʀᴀʀᴇ', ], 280[ 'ƦᴀƦᴇ', 'ʀᴀʀᴇ', 'Ʀᴀʀᴇ', 'ƦᴀƦᴇ', 'ʀᴀʀᴇ', 'Ʀᴀʀᴇ', 'ƦᴀƦᴇ', 'ʀᴀʀᴇ', 'ʀᴀʀᴇ', 'ʀᴀʀᴇ', ], 281[ 'Ԧԧ', 'ԧԧ', 'Ԧԧ', 'ԦԦ', 'ԧԧ', 'Ԧԧ', 'ԦԦ', 'ԧԧ', 'ԧԧ', 'ԧԧ', ], 282[ 'ԧԧ', 'ԧԧ', 'Ԧԧ', 'ԦԦ', 'ԧԧ', 'Ԧԧ', 'ԦԦ', 'ԧԧ', 'ԧԧ', 'ԧԧ', ], 283[ 'Ԧԧ', 'ԧԧ', 'Ԧԧ', 'ԦԦ', 'ԧԧ', 'Ԧԧ', 'ԦԦ', 'ԧԧ', 'ԧԧ', 'ԧԧ', ], 284[ 'ԦԦ', 'ԧԧ', 'Ԧԧ', 'ԦԦ', 'ԧԧ', 'Ԧԧ', 'ԦԦ', 'ԧԧ', 'ԧԧ', 'ԧԧ', ], 285[ "þǽr rihtes", "þǽr rihtes", "Þǽr Rihtes", "ÞǼR RIHTES", "þǽr rihtes", "Þǽr Rihtes", "ÞǼR RIHTES", "þǽr rihtes", "þǽr rihtes", "þǽr rihtes", ], 286[ "duȝeðlice", "duȝeðlice", "Duȝeðlice", "DUȜEÐLICE", "duȝeðlice", "Duȝeðlice", "DUȜEÐLICE", "duȝeðlice", "duȝeðlice", "duȝeðlice", ], 287[ "Van Dijke", "van dijke", "Van Dijke", "VAN DIJKE", "van dijke", "Van Dijke", "VAN DIJKE", "van dijke", "van dijke", "van dijke", ], 288[ "fi", "fi", "fi", "fi", "fi", "Fi", "FI", "fi", "fi", "fi", ], 289[ "filesystem", "filesystem", "filesystem", "fiLESYSTEM", "filesystem", "Filesystem", "FILESYSTEM", "filesystem", "filesystem", "filesystem", ], 290[ "efficient", "efficient", "Efficient", "EffiCIENT", "efficient", "Efficient", "EFFICIENT", "efficient", "efficient", "efficient", ], 291[ "flour and water", "flour and water", "flour And Water", "flOUR AND WATER", "flour and water", "Flour And Water", "FLOUR AND WATER", "flour and water", "flour and water", "flour and water", ], 292[ "dz", "dz", "Dz", "DZ", "dz", "Dz", "DZ", "dz", "dz", "dz", ], 293[ "dzur mountain", "dzur mountain", "Dzur Mountain", "DZUR MOUNTAIN", "dzur mountain", "Dzur Mountain", "DZUR MOUNTAIN", "dzur mountain", "dzur mountain", "dzur mountain", ], 294[ "poſt", "poſt", "Poſt", "POST", "poſt", "Poſt", "POST", "post", "post", "post", ], 295[ "poſt", "poſt", "Poſt", "POſt", "poſt", "Poſt", "POST", "poſt", "post", "post", ], 296[ "ſtop", "ſtop", "ſtop", "ſtOP", "ſtop", "Stop", "STOP", "ſtop", "stop", "stop", ], 297[ "tschüß", "tschüß", "Tschüß", "TSCHÜß", "tschüß", "Tschüß", "TSCHÜSS", "tschüß", "tschüss", "tschüss", ], 298[ "TSCHÜẞ", "tschüß", "Tschüß", "TSCHÜẞ", "tschüß", "Tschüß", "TSCHÜẞ", "tschüß", "tschüss", "tschüss", ], 299[ "rußland", "rußland", "Rußland", "RUßLAND", "rußland", "Rußland", "RUSSLAND", "rußland", "russland", "russland", ], 300[ "RUẞLAND", "rußland", "Rußland", "RUẞLAND", "rußland", "Rußland", "RUẞLAND", "rußland", "russland", "russland", ], 301[ "weiß", "weiß", "Weiß", "WEIß", "weiß", "Weiß", "WEISS", "weiß", "weiss", "weiss", ], 302[ "WEIẞ", "weiß", "Weiß", "WEIẞ", "weiß", "Weiß", "WEIẞ", "weiß", "weıss", "weiss", ], 303[ "ẞIEW", "ßiew", "ẞiew", "ẞIEW", "ßiew", "ẞiew", "ẞIEW", "ßiew", "ssıew", "ssiew", ], 304[ "ͅ", "ͅ", "Ι", "Ι", "ͅ", "Ι", "Ι", "ι", "ι", "ι", ], 305[ "ᾲ", "ᾲ", "Ὰͅ", "ᾺΙ", "ᾲ", "Ὰͅ", "ᾺΙ", "ὰι", "ὰι", "ὰι", ], 306[ "Ὰι", "ὰι", "Ὰι", "ᾺΙ", "ὰι", "Ὰι", "ᾺΙ", "ὰι", "ὰι", "ὰι", ], 307[ "ᾺΙ", "ὰι", "Ὰι", "ᾺΙ", "ὰι", "Ὰι", "ᾺΙ", "ὰι", "ὰι", "ὰι", ], 308[ "ᾲ", "ᾲ", "ᾲ", "ᾲ", "ᾲ", "Ὰͅ", "ᾺΙ", "ᾲ", "ὰι", "ὰι", ], 309[ "Ὰͅ", "ᾲ", "Ὰͅ", "ᾺΙ", "ᾲ", "Ὰͅ", "ᾺΙ", "ὰι", "ὰι", "ὰι", ], 310[ "ᾺΙ", "ὰι", "Ὰι", "ᾺΙ", "ὰι", "Ὰι", "ᾺΙ", "ὰι", "ὰι", "ὰι", ], 311[ "ᾲ στο διάολο", "ᾲ στο διάολο", "ᾲ Στο Διάολο", "ᾲ ΣΤΟ ΔΙΆΟΛΟ", "ᾲ στο διάολο", "Ὰͅ Στο Διάολο", "ᾺΙ ΣΤΟ ΔΙΆΟΛΟ", "ᾲ στο διάολο", "ὰι στο διάολο", "ὰι στο διάολο", ], 312[ "ᾲ στο διάολο", "ᾲ στο διάολο", "Ὰͅ Στο Διάολο", "ᾺΙ ΣΤΟ ΔΙΆΟΛΟ", "ᾲ στο διάολο", "Ὰͅ Στο Διάολο", "ᾺΙ ΣΤΟ ΔΙΆΟΛΟ", "ὰι στο διάολο", "ὰι στο διάολο", "ὰι στο διάολο", ], 313[ "ⅷ", "ⅷ", "Ⅷ", "Ⅷ", "ⅷ", "Ⅷ", "Ⅷ", "ⅷ", "ⅷ", "ⅷ", ], 314[ "henry ⅷ", "henry ⅷ", "Henry Ⅷ", "HENRY Ⅷ", "henry ⅷ", "Henry Ⅷ", "HENRY Ⅷ", "henry ⅷ", "henry ⅷ", "henry ⅷ", ], 315[ "ⓚ", "ⓚ", "Ⓚ", "Ⓚ", "ⓚ", "Ⓚ", "Ⓚ", "ⓚ", "ⓚ", "ⓚ", ], 316[ "i work at ⓚ", "i work at ⓚ", "I Work At Ⓚ", "I WORK AT Ⓚ", "i work at ⓚ", "I Work At Ⓚ", "I WORK AT Ⓚ", "i work at ⓚ", "i work at ⓚ", "i work at ⓚ", ], 317[ "istambul", "istambul", "Istambul", "ISTAMBUL", "istambul", "Istambul", "ISTAMBUL", "istambul", "istambul", "istambul", ], 318[ "i̇stanbul", "i̇stanbul", "İstanbul", "İSTANBUL", "i̇stanbul", "İstanbul", "İSTANBUL", "i̇stanbul", "i̇stanbul", "i̇stanbul", ], 319[ "İstanbul", "i̇stanbul", "İstanbul", "İSTANBUL", "i̇stanbul", "İstanbul", "İSTANBUL", "i̇stanbul", "ı̇stanbul", "i̇stanbul", ], 320[ "İSTANBUL", "istanbul", "İstanbul", "İSTANBUL", "i̇stanbul", "İstanbul", "İSTANBUL", "İstanbul", "istanbul", "i̇stanbul", ], 321[ "στιγμας", "στιγμας", "Στιγμας", "ΣΤΙΓΜΑΣ", "στιγμας", "Στιγμας", "ΣΤΙΓΜΑΣ", "στιγμασ", "στιγμασ", "στιγμασ", ], 322[ "στιγμασ", "στιγμασ", "Στιγμασ", "ΣΤΙΓΜΑΣ", "στιγμασ", "Στιγμασ", "ΣΤΙΓΜΑΣ", "στιγμασ", "στιγμασ", "στιγμασ", ], 323[ "ΣΤΙΓΜΑΣ", "στιγμασ", "Στιγμασ", "ΣΤΙΓΜΑΣ", "στιγμασ", "Στιγμασ", "ΣΤΙΓΜΑΣ", "στιγμασ", "στιγμασ", "στιγμασ", ], 324[ "ʀᴀʀᴇ", "ʀᴀʀᴇ", "Ʀᴀʀᴇ", "ƦᴀƦᴇ", "ʀᴀʀᴇ", "Ʀᴀʀᴇ", "ƦᴀƦᴇ", "ʀᴀʀᴇ", "ʀᴀʀᴇ", "ʀᴀʀᴇ", ], 325[ "", "", "", "", "", "", "", "", "", "", ], 326[ "Ԧԧ", "ԧԧ", "Ԧԧ", "ԦԦ", "ԧԧ", "Ԧԧ", "ԦԦ", "ԧԧ", "ԧԧ", "ԧԧ", ], 327[ "ﬓﬔﬕﬖﬗ", "ﬓﬔﬕﬖﬗ", "ﬓﬔﬕﬖﬗ", "ﬓﬔﬕﬖﬗ", "ﬓﬔﬕﬖﬗ", "Մնﬔﬕﬖﬗ", "ՄՆՄԵՄԻՎՆՄԽ", "ﬓﬔﬕﬖﬗ", "մնմեմիվնմխ", "մնմեմիվնմխ", ], 328[ "ʼn groot", "ʼn groot", "ʼn Groot", "ʼn GROOT", "ʼn groot", "ʼN Groot", "ʼN GROOT", "ʼn groot", "ʼn groot", "ʼn groot", ], 329[ "ẚ", "ẚ", "ẚ", "ẚ", "ẚ", "Aʾ", "Aʾ", "ẚ", "aʾ", "aʾ", ], 330[ "ff", "ff", "ff", "ff", "ff", "Ff", "FF", "ff", "ff", "ff", ], 331[ "ǰ", "ǰ", "ǰ", "ǰ", "ǰ", "J̌", "J̌", "ǰ", "ǰ", "ǰ", ], 332[ "550 nm or Å", "550 nm or å", "550 Nm Or Å", "550 NM OR Å", "550 nm or å", "550 Nm Or Å", "550 NM OR Å", "550 nm or å", "550 nm or å", "550 nm or å", ], 333); 334 335 use feature qw(fc); 336 337 for (@test_table) { 338 my ($simple_lc, $simple_tc, $simple_uc, $simple_fc) = @{$_}[1, 2, 3, 7]; 339 my ($orig, $lower, $titlecase, $upper, $fc_turkic, $fc_full) = @{$_}[0,4,5,6,8,9]; 340 341 if ($orig =~ /(\P{Assigned})/) { # So can fail gracefully in earlier 342 # Unicode versions 343 fail(sprintf "because U+%04X is unassigned", ord($1)); 344 next; 345 } 346 is( fc($orig), $fc_full, "fc('$orig') returns '$fc_full'" ); 347 is( "\F$orig", $fc_full, '\F works' ); 348 is( lc($orig), $lower, "lc('$orig') returns '$lower'" ); 349 is( "\L$orig", $lower, '\L works' ); 350 is( uc($orig), $upper, "uc('$orig') returns '$upper'" ); 351 is( "\U$orig", $upper, '\U works' ); 352 } 353} 354 355{ 356 use feature qw(fc); 357 package Eeyup { use overload q{""} => sub { "\x{df}" }, fallback => 1 } 358 package Uunope { use overload q{""} => sub { "\x{30cb}" }, fallback => 1 } 359 package Undef { use overload q{""} => sub { undef }, fallback => 1 } 360 361 my $obj = bless {}, "Eeyup"; 362 is(fc($obj), "ss", "fc() works on overloaded objects returning latin-1"); 363 $obj = bless {}, "Eeyup"; 364 is("\F$obj", "ss", '\F works on overloaded objects returning latin-1'); 365 366 $obj = bless {}, "Uunope"; 367 is(fc($obj), "\x{30cb}", "fc() works on overloaded objects returning UTF-8"); 368 $obj = bless {}, "Uunope"; 369 is("\F$obj", "\x{30cb}", '\F works on overloaded objects returning UTF-8'); 370 371 $obj = bless {}, "Undef"; 372 my $warnings; 373 { 374 no warnings; 375 use warnings "uninitialized"; 376 local $SIG{__WARN__} = sub { $warnings++; like(shift, qr/Use of uninitialized value (?:\$obj )?in fc/) }; 377 fc(undef); 378 fc($obj); 379 } 380 is( $warnings, 2, "correct number of warnings" ); 381 382 my $fetched = 0; 383 package Derpy { sub TIESCALAR { bless {}, shift } sub FETCH { $fetched++; "\x{df}" } } 384 385 tie my $x, "Derpy"; 386 387 is( fc($x), "ss", "fc() works on tied values" ); 388 is( $fetched, 1, "and only calls the magic once" ); 389 390} 391 392{ 393 use feature qw( fc ); 394 my $troublesome1 = "\xdf" x 11; #SvLEN should be 12, SvCUR should be 11 395 #So this should force fc() to grow the string. 396 397 is( fc($troublesome1), "ss" x 11, "fc() grows the string" ); 398 399 my $troublesome2 = "abcdef:\x{df}:fjksjs"; #SvLEN should be 16, SvCUR should be 15 400 is( fc($troublesome2), "abcdef:ss:fjksjs", "fc() expands \\x{DF} in the middle of a string that needs to grow" ); 401 402 my $troublesome3 = ":\x{df}:"; 403 is( fc($troublesome3), ":ss:", "fc() expands \\x{DF} in the middle of a string" ); 404 405 406 my $troublesome4 = "\x{B5}"; #\N{MICRON SIGN} is latin-1, but its foldcase is in UTF-8 407 408 is( fc($troublesome4), "\x{3BC}", "fc() for a latin-1 \x{B5} returns UTF-8" ); 409 ok( !utf8::is_utf8($troublesome4), "fc() doesn't upgrade the original string" ); 410 411 412 my $troublesome5 = "\x{C9}abda\x{B5}aaf\x{C8}"; # Up until foldcasing \x{B5}, the string 413 # was in Latin-1. This tests that the 414 # results don't have illegal UTF-8 415 # (i.e. leftover latin-1) in them 416 417 is( fc($troublesome5), "\x{E9}abda\x{3BC}aaf\x{E8}" ); 418} 419 420 421{ 422 use feature qw( fc unicode_strings ); 423 use if $Config{d_setlocale}, qw(POSIX locale_h); 424 setlocale(&POSIX::LC_ALL, "C") if $Config{d_setlocale}; 425 426 # This tests both code paths in pp_fc 427 428 for (0..0xff) { 429 my $latin1 = chr; 430 my $utf8 = $latin1; 431 utf8::downgrade($latin1); #No-op, but doesn't hurt 432 utf8::upgrade($utf8); 433 is(fc($latin1), fc($utf8), "fc() gives the same results for \\x{$_} in Latin-1 and UTF-8 under unicode_strings"); 434 SKIP: { 435 skip 'No locale testing without d_setlocale', 2 if(!$Config{d_setlocale}) || $Config::Config{ccflags} =~ /\bD?NO_LOCALE(_|\b)/; 436 BEGIN { 437 if($Config{d_setlocale}) { 438 require locale; import locale; 439 } 440 } 441 is(fc($latin1), lc($latin1), "use locale; fc(qq{\\x{$_}}), lc(qq{\\x{$_}}) when qq{\\x{$_}} is in latin-1"); 442 is(fc($utf8), lc($utf8), "use locale; fc(qq{\\x{$_}}), lc(qq{\\x{$_}}) when qq{\\x{$_}} is in latin-1"); 443 } 444 SKIP: { 445 if ( 446 !$Config::Config{d_setlocale} 447 || $Config::Config{ccflags} =~ /\bD?NO_LOCALE(_|\b)/ 448 ) { 449 skip "no locale support", 2 450 } 451 no feature 'unicode_strings'; 452 is(fc($latin1), lc($latin1), "under nothing, fc() for <256 is the same as lc"); 453 } 454 } 455} 456 457my $utf8_locale = find_utf8_ctype_locale(); 458 459{ 460 use feature qw( fc ); 461 use locale; 462 is(fc("\x{1E9E}"), fc("\x{17F}\x{17F}"), 'fc("\x{1E9E}") eq fc("\x{17F}\x{17F}")'); 463 SKIP: { 464 skip 'Can\'t find a UTF-8 locale', 1 unless defined $utf8_locale; 465 setlocale(&LC_CTYPE, $utf8_locale); 466 is(fc("\x{1E9E}"), "ss", 'fc("\x{1E9E}") eq "ss" in a UTF-8 locale)'); 467 } 468} 469 470SKIP: { 471 skip 'Can\'t find a UTF-8 locale', 256 unless defined $utf8_locale; 472 473 use feature qw( fc unicode_strings ); 474 475 # Get the official fc values outside locale. 476 no locale; 477 my @unicode_fc; 478 for (0..0xff) { 479 push @unicode_fc, fc(chr); 480 } 481 482 # These should match the UTF-8 locale values 483 setlocale(&LC_CTYPE, $utf8_locale); 484 use locale; 485 for (0..0xff) { 486 is(fc(chr), $unicode_fc[$_], "In a UTF-8 locale, fc(chr $_) is the same as official Unicode"); 487 } 488} 489 490 491my $num_tests = curr_test() - 1; 492 493plan($num_tests); 494