xref: /openbsd/gnu/usr.bin/perl/t/uni/fold.t (revision cecf84d4)
1use strict;
2use warnings;
3
4# re/fold_grind.t has more complex tests, but doesn't test every fold
5# This file also tests the fc() keyword.
6
7BEGIN {
8    chdir 't' if -d 't';
9    @INC = '../lib';
10    require Config; import Config;
11    require './test.pl';
12    require './loc_tools.pl';   # Contains find_utf8_ctype_locale()
13}
14
15use feature 'unicode_strings';
16use Unicode::UCD qw(all_casefolds);
17
18binmode *STDOUT, ":utf8";
19
20our $TODO;
21
22
23plan("no_plan");
24# Read in the official case folding definitions.
25my $casefolds = all_casefolds();
26my @folds;
27my @CF;
28my @simple_folds;
29my %reverse_fold;
30use Unicode::UCD;
31use charnames();
32
33foreach my $decimal_code_point (sort { $a <=> $b } keys %$casefolds) {
34    # We only use simple folds in fc(), since the regex engine uses full case
35    # folding.
36
37    my $name = charnames::viacode($decimal_code_point);
38    my $type = $casefolds->{$decimal_code_point}{'status'};
39    my $code = $casefolds->{$decimal_code_point}{'code'};
40    my $simple = $casefolds->{$decimal_code_point}{'simple'};
41    my $full = $casefolds->{$decimal_code_point}{'full'};
42
43    if ($simple && $simple ne $full) { # If there is a distinction
44        push @simple_folds, [ $code, $simple, $type, $name ];
45    }
46
47    push @CF, [ $code, $full, $type, $name ];
48
49    # Get the inverse fold for single-char mappings.
50    $reverse_fold{pack "U0U*", hex $simple} = pack "U0U*", $decimal_code_point if $simple;
51}
52
53foreach my $test_ref ( @simple_folds ) {
54    use feature 'fc';
55    my ($code, $mapping, $type, $name) = @$test_ref;
56    my $c = pack("U0U*", hex $code);
57    my $f = pack("U0U*", map { hex } split " ", $mapping);
58
59    my $against = join "", "qq{", map("\\x{$_}", split " ", $mapping), "}";
60    {
61        isnt(fc($c), $f, "$code - $name - $mapping - $type - Full casefolding, fc(\\x{$code}) ne $against");
62        isnt("\F$c", $f, "$code - $name - $mapping - $type - Full casefolding, qq{\\F\\x{$code}} ne $against");
63    }
64}
65
66foreach my $test_ref (@CF) {
67    my ($code, $mapping, $type, $name) = @$test_ref;
68    my $c = pack("U0U*", hex $code);
69    my $f = pack("U0U*", map { hex } split " ", $mapping);
70    my $f_length = length $f;
71    foreach my $test (
72            qq[":$c:" =~ /:$c:/],
73            qq[":$c:" =~ /:$c:/i],
74            qq[":$c:" =~ /:[_$c]:/], # Place two chars in [] so doesn't get
75                                     # optimized to a non-charclass
76            qq[":$c:" =~ /:[_$c]:/i],
77            qq[":$c:" =~ /:$f:/i],
78            qq[":$f:" =~ /:$c:/i],
79    ) {
80        ok eval $test, "$code - $name - $mapping - $type - $test";
81    }
82
83    {
84        # fc() tests
85        my $against = join "", "qq{", map("\\x{$_}", split " ", $mapping), "}";
86        is(CORE::fc($c), $f,
87            "$code - $name - $mapping - $type - fc(\\x{$code}) eq $against");
88        is("\F$c", $f, "$code - $name - $mapping - $type - qq{\\F\\x{$code}} eq $against");
89
90        # And here we test bytes. For [A-Za-z0-9], the fold is the same as lc under
91        # bytes. For everything else, it's the bytes that formed the original string.
92        if ( $c =~ /[A-Za-z0-9]/ ) {
93            use bytes;
94            is(CORE::fc($c), lc($c), "$code - $name - fc and use bytes, ascii");
95        } else {
96            my $copy = "" . $c;
97            utf8::encode($copy);
98            is($copy, do { use bytes; CORE::fc($c) }, "$code - $name - fc and use bytes");
99        }
100    }
101    # Certain tests weren't convenient to put in the list above since they are
102    # TODO's in multi-character folds.
103    if ($f_length == 1) {
104
105        # The qq loses the utf8ness of ":$f:".  These tests are not about
106        # finding bugs in utf8ness, so make sure it's utf8.
107        my $test = qq[my \$s = ":$f:"; utf8::upgrade(\$s); \$s =~ /:[_$c]:/i];
108        ok eval $test, "$code - $name - $mapping - $type - $test";
109        $test = qq[":$c:" =~ /:[_$f]:/i];
110        ok eval $test, "$code - $name - $mapping - $type - $test";
111    }
112    else {
113
114        # There are two classes of multi-char folds that need more work.  For
115        # example,
116        #   ":ß:" =~ /:[_s]{2}:/i
117        #   ":ss:" =~ /:[_ß]:/i
118        #
119        # Some of the old tests for the second case happened to pass somewhat
120        # coincidentally.  But none would pass if changed to this.
121        #   ":SS:" =~ /:[_ß]:/i
122        #
123        # As the capital SS doesn't get folded.  When those pass, it means
124        # that the code has been changed to take into account folding in the
125        # string, and all should pass, capitalized or not (this wouldn't be
126        # true for [^complemented character classes], for which the fold case
127        # is better, but these aren't used in this .t currently.  So, what is
128        # done is to essentially upper-case the string for this class (but use
129        # the reverse fold not uc(), as that is more correct)
130        my $u;
131        for my $i (0 .. $f_length - 1) {
132            my $cur_char = substr($f, $i, 1);
133            $u .= $reverse_fold{$cur_char} || $cur_char;
134        }
135        my $test;
136
137        # A multi-char fold should not match just one char;
138        # e.g., ":ß:" !~ /:[_s]:/i
139        $test = qq[":$c:" !~ /:[_$f]:/i];
140        ok eval $test, "$code - $name - $mapping - $type - $test";
141
142        TODO: { # e.g., ":ß:" =~ /:[_s]{2}:/i
143            local $TODO = 'Multi-char fold in [character class]';
144
145            $test = qq[":$c:" =~ /:[_$f]{$f_length}:/i];
146            ok eval $test, "$code - $name - $mapping - $type - $test";
147        }
148
149        # e.g., ":SS:" =~ /:[_ß]:/i now pass, so TODO has been removed, but
150        # since they use '$u', they are left out of the main loop
151        $test = qq[ my \$s = ":$u:"; utf8::upgrade(\$s); \$s =~ /:[_$c]:/i];
152        ok eval $test, "$code - $name - $mapping - $type - $test";
153    }
154}
155
156{
157    use utf8;
158    use feature qw(fc);
159    # These three come from the ICU project's test suite, more especifically
160    # http://icu.sourcearchive.com/documentation/4.4~rc1-1/strcase_8cpp-source.html
161
162    my $s = "A\N{U+00df}\N{U+00b5}\N{U+fb03}\N{U+1040C}\N{U+0130}\N{U+0131}";
163    #\N{LATIN CAPITAL LETTER A}\N{LATIN SMALL LETTER SHARP S}\N{MICRO SIGN}\N{LATIN SMALL LIGATURE FFI}\N{DESERET CAPITAL LETTER AY}\N{LATIN CAPITAL LETTER I WITH DOT ABOVE}\N{LATIN SMALL LETTER DOTLESS I}
164
165    my $f = "ass\N{U+03bc}ffi\N{U+10434}i\N{U+0307}\N{U+0131}";
166    #\N{LATIN SMALL LETTER A}\N{LATIN SMALL LETTER S}\N{LATIN SMALL LETTER S}\N{GREEK SMALL LETTER MU}\N{LATIN SMALL LETTER F}\N{LATIN SMALL LETTER F}\N{LATIN SMALL LETTER I}\N{DESERET SMALL LETTER AY}\N{LATIN SMALL LETTER I}\N{COMBINING DOT ABOVE}\N{LATIN SMALL LETTER DOTLESS I}
167
168    is(fc($s), $f, "ICU's casefold test passes");
169    is("\F$s", $f, "ICU's casefold test passes");
170
171    is( fc("aBİIıϐßffi��"), "abi̇iıβssffi��" );
172    is( "\FaBİIıϐßffi��", "abi̇iıβssffi��" );
173#    TODO: {
174#        local $::TODO = "turkic special cases";
175#        is( fc "aBİIıϐßffi��", "abiııβssffi��" );
176#    }
177
178    # The next batch come from http://www.devdaily.com/java/jwarehouse/lucene/contrib/icu/src/test/org/apache/lucene/analysis/icu/TestICUFoldingFilter.java.shtml
179    # Except the article got most casings wrong. Or maybe Lucene does.
180
181    is( fc("This is a test"), "this is a test" );
182    is( fc("Ruß"), "russ"    );
183    is( fc("ΜΆΪΟΣ"), "μάϊοσ" );
184    is( fc("Μάϊος"), "μάϊοσ" );
185    is( fc("��"), "��"       );
186    is( fc("r\xe9sum\xe9"), "r\xe9sum\xe9" );
187    is( fc("re\x{0301}sume\x{0301}"), "re\x{301}sume\x{301}" );
188    is( fc("ELİF"), "eli\x{307}f" );
189    is( fc("eli\x{307}f"), "eli\x{307}f");
190
191    # This batch comes from
192    # http://www.java2s.com/Open-Source/Java-Document/Internationalization-Localization/icu4j/com/ibm/icu/dev/test/lang/UCharacterCaseTest.java.htm
193    # Which uses ICU as the backend.
194
195    my @folding_mixed = (
196        "\x{61}\x{42}\x{130}\x{49}\x{131}\x{3d0}\x{df}\x{fb03}",
197        "A\x{df}\x{b5}\x{fb03}\x{1040C}\x{130}\x{131}",
198    );
199
200    my @folding_default = (
201        "\x{61}\x{62}\x{69}\x{307}\x{69}\x{131}\x{3b2}\x{73}\x{73}\x{66}\x{66}\x{69}",
202        "ass\x{3bc}ffi\x{10434}i\x{307}\x{131}",
203    );
204
205    my @folding_exclude_turkic = (
206        "\x{61}\x{62}\x{69}\x{131}\x{131}\x{3b2}\x{73}\x{73}\x{66}\x{66}\x{69}",
207        "ass\x{3bc}ffi\x{10434}i\x{131}",
208    );
209
210    is( fc($folding_mixed[1]), $folding_default[1] );
211
212    is( fc($folding_mixed[0]), $folding_default[0] );
213
214}
215
216{
217    use utf8;
218    # Table stolen from tchrist's mail in
219    # http://bugs.python.org/file23051/casing-tests.py
220    # and http://98.245.80.27/tcpc/OSCON2011/case-test.python3
221    # For reference, it's a longer version of what he posted here:
222    # http://stackoverflow.com/questions/6991038/case-insensitive-storage-and-unicode-compatibility
223
224    #Couple of repeats because I'm lazy, not tchrist's fault.
225
226    #This should probably go in t/op/lc.t
227
228    my @test_table = (
229# ORIG LC_SIMPLE TC_SIMPLE UC_SIMPLE LC_FULL TC_FULL UC_FULL FC_SIMPLE FC_TURKIC FC_FULL
230[ 'þǽr rihtes', 'þǽr rihtes', 'Þǽr Rihtes', 'ÞǼR RIHTES', 'þǽr rihtes', 'Þǽr Rihtes', 'ÞǼR RIHTES', 'þǽr rihtes', 'þǽr rihtes', 'þǽr rihtes',  ],
231[ 'duȝeðlice', 'duȝeðlice', 'Duȝeðlice', 'DUȜEÐLICE', 'duȝeðlice', 'Duȝeðlice', 'DUȜEÐLICE', 'duȝeðlice', 'duȝeðlice', 'duȝeðlice',  ],
232[ 'Ævar Arnfjörð Bjarmason', 'ævar arnfjörð bjarmason', 'Ævar Arnfjörð Bjarmason', 'ÆVAR ARNFJÖRÐ BJARMASON', 'ævar arnfjörð bjarmason', 'Ævar Arnfjörð Bjarmason', 'ÆVAR ARNFJÖRÐ BJARMASON', 'ævar arnfjörð bjarmason', 'ævar arnfjörð bjarmason', 'ævar arnfjörð bjarmason',  ],
233[ 'Кириллица', 'кириллица', 'Кириллица', 'КИРИЛЛИЦА', 'кириллица', 'Кириллица', 'КИРИЛЛИЦА', 'кириллица', 'кириллица', 'кириллица',  ],
234[ 'ij', 'ij', 'IJ', 'IJ', 'ij', 'IJ', 'IJ', 'ij', 'ij', 'ij',  ],
235[ 'Van Dijke', 'van dijke', 'Van Dijke', 'VAN DIJKE', 'van dijke', 'Van Dijke', 'VAN DIJKE', 'van dijke', 'van dijke', 'van dijke',  ],
236[ 'VAN DIJKE', 'van dijke', 'Van Dijke', 'VAN DIJKE', 'van dijke', 'Van Dijke', 'VAN DIJKE', 'van dijke', 'van dijke', 'van dijke',  ],
237[ 'efficient', 'efficient', 'Efficient', 'EffiCIENT', 'efficient', 'Efficient', 'EFFICIENT', 'efficient', 'efficient', 'efficient',  ],
238[ 'flour', 'flour', 'flour', 'flOUR', 'flour', 'Flour', 'FLOUR', 'flour', 'flour', 'flour',  ],
239[ 'flour and water', 'flour and water', 'flour And Water', 'flOUR AND WATER', 'flour and water', 'Flour And Water', 'FLOUR AND WATER', 'flour and water', 'flour and water', 'flour and water',  ],
240[ 'dzur', 'dzur', 'Dzur', 'DZUR', 'dzur', 'Dzur', 'DZUR', 'dzur', 'dzur', 'dzur',  ],
241[ 'Dzur', 'dzur', 'Dzur', 'DZUR', 'dzur', 'Dzur', 'DZUR', 'dzur', 'dzur', 'dzur',  ],
242[ 'DZUR', 'dzur', 'Dzur', 'DZUR', 'dzur', 'Dzur', 'DZUR', 'dzur', 'dzur', 'dzur',  ],
243[ 'dzur mountain', 'dzur mountain', 'Dzur Mountain', 'DZUR MOUNTAIN', 'dzur mountain', 'Dzur Mountain', 'DZUR MOUNTAIN', 'dzur mountain', 'dzur mountain', 'dzur mountain',  ],
244[ 'Dzur Mountain', 'dzur mountain', 'Dzur Mountain', 'DZUR MOUNTAIN', 'dzur mountain', 'Dzur Mountain', 'DZUR MOUNTAIN', 'dzur mountain', 'dzur mountain', 'dzur mountain',  ],
245[ 'DZUR MOUNTAIN', 'dzur mountain', 'Dzur Mountain', 'DZUR MOUNTAIN', 'dzur mountain', 'Dzur Mountain', 'DZUR MOUNTAIN', 'dzur mountain', 'dzur mountaın', 'dzur mountain',  ],
246[ 'poſt', 'poſt', 'Poſt', 'POST', 'poſt', 'Poſt', 'POST', 'post', 'post', 'post',  ],
247[ 'poſt', 'poſt', 'Poſt', 'POſt', 'poſt', 'Poſt', 'POST', 'poſt', 'post', 'post',  ],
248[ 'ſtop', 'ſtop', 'ſtop', 'ſtOP', 'ſtop', 'Stop', 'STOP', 'ſtop', 'stop', 'stop',  ],
249[ 'tschüß', 'tschüß', 'Tschüß', 'TSCHÜß', 'tschüß', 'Tschüß', 'TSCHÜSS', 'tschüß', 'tschüss', 'tschüss',  ],
250[ 'TSCHÜẞ', 'tschüß', 'Tschüß', 'TSCHÜẞ', 'tschüß', 'Tschüß', 'TSCHÜẞ', 'tschüß', 'tschüss', 'tschüss',  ],
251[ 'weiß', 'weiß', 'Weiß', 'WEIß', 'weiß', 'Weiß', 'WEISS', 'weiß', 'weiss', 'weiss',  ],
252[ 'WEIẞ', 'weiß', 'Weiß', 'WEIẞ', 'weiß', 'Weiß', 'WEIẞ', 'weiß', 'weıss', 'weiss',  ],
253[ 'ẞIEW', 'ßiew', 'ẞiew', 'ẞIEW', 'ßiew', 'ẞiew', 'ẞIEW', 'ßiew', 'ssıew', 'ssiew',  ],
254[ 'ᾲ', 'ᾲ', 'Ὰͅ', 'ᾺΙ', 'ᾲ', 'Ὰͅ', 'ᾺΙ', 'ὰι', 'ὰι', 'ὰι',  ],
255[ 'Ὰι', 'ὰι', 'Ὰι', 'ᾺΙ', 'ὰι', 'Ὰι', 'ᾺΙ', 'ὰι', 'ὰι', 'ὰι',  ],
256[ 'ᾺΙ', 'ὰι', 'Ὰι', 'ᾺΙ', 'ὰι', 'Ὰι', 'ᾺΙ', 'ὰι', 'ὰι', 'ὰι',  ],
257[ 'ᾲ', 'ᾲ', 'ᾲ', 'ᾲ', 'ᾲ', 'Ὰͅ', 'ᾺΙ', 'ᾲ', 'ὰι', 'ὰι',  ],
258[ 'Ὰͅ', 'ᾲ', 'Ὰͅ', 'ᾺΙ', 'ᾲ', 'Ὰͅ', 'ᾺΙ', 'ὰι', 'ὰι', 'ὰι',  ],
259[ 'ᾺΙ', 'ὰι', 'Ὰι', 'ᾺΙ', 'ὰι', 'Ὰι', 'ᾺΙ', 'ὰι', 'ὰι', 'ὰι',  ],
260[ 'ᾲ στο διάολο', 'ᾲ στο διάολο', 'ᾲ Στο Διάολο', 'ᾲ ΣΤΟ ΔΙΆΟΛΟ', 'ᾲ στο διάολο', 'Ὰͅ Στο Διάολο', 'ᾺΙ ΣΤΟ ΔΙΆΟΛΟ', 'ᾲ στο διάολο', 'ὰι στο διάολο', 'ὰι στο διάολο',  ],
261[ 'ᾲ στο διάολο', 'ᾲ στο διάολο', 'Ὰͅ Στο Διάολο', 'ᾺΙ ΣΤΟ ΔΙΆΟΛΟ', 'ᾲ στο διάολο', 'Ὰͅ Στο Διάολο', 'ᾺΙ ΣΤΟ ΔΙΆΟΛΟ', 'ὰι στο διάολο', 'ὰι στο διάολο', 'ὰι στο διάολο',  ],
262[ '��������������', '��������������', '��������������', '��������������', '��������������', '��������������', '��������������', '��������������', '��������������', '��������������',  ],
263[ '��������������', '��������������', '��������������', '��������������', '��������������', '��������������', '��������������', '��������������', '��������������', '��������������',  ],
264[ '��������������', '��������������', '��������������', '��������������', '��������������', '��������������', '��������������', '��������������', '��������������', '��������������',  ],
265[ 'henry ⅷ', 'henry ⅷ', 'Henry Ⅷ', 'HENRY Ⅷ', 'henry ⅷ', 'Henry Ⅷ', 'HENRY Ⅷ', 'henry ⅷ', 'henry ⅷ', 'henry ⅷ',  ],
266[ 'Henry Ⅷ', 'henry ⅷ', 'Henry Ⅷ', 'HENRY Ⅷ', 'henry ⅷ', 'Henry Ⅷ', 'HENRY Ⅷ', 'henry ⅷ', 'henry ⅷ', 'henry ⅷ',  ],
267[ 'HENRY Ⅷ', 'henry ⅷ', 'Henry Ⅷ', 'HENRY Ⅷ', 'henry ⅷ', 'Henry Ⅷ', 'HENRY Ⅷ', 'henry ⅷ', 'henry ⅷ', 'henry ⅷ',  ],
268[ 'i work at ⓚ', 'i work at ⓚ', 'I Work At Ⓚ', 'I WORK AT Ⓚ', 'i work at ⓚ', 'I Work At Ⓚ', 'I WORK AT Ⓚ', 'i work at ⓚ', 'i work at ⓚ', 'i work at ⓚ',  ],
269[ 'I Work At Ⓚ', 'i work at ⓚ', 'I Work At Ⓚ', 'I WORK AT Ⓚ', 'i work at ⓚ', 'I Work At Ⓚ', 'I WORK AT Ⓚ', 'i work at ⓚ', 'ı work at ⓚ', 'i work at ⓚ',  ],
270[ 'I WORK AT Ⓚ', 'i work at ⓚ', 'I Work At Ⓚ', 'I WORK AT Ⓚ', 'i work at ⓚ', 'I Work At Ⓚ', 'I WORK AT Ⓚ', 'i work at ⓚ', 'ı work at ⓚ', 'i work at ⓚ',  ],
271[ 'istambul', 'istambul', 'Istambul', 'ISTAMBUL', 'istambul', 'Istambul', 'ISTAMBUL', 'istambul', 'istambul', 'istambul',  ],
272[ 'i̇stanbul', 'i̇stanbul', 'İstanbul', 'İSTANBUL', 'i̇stanbul', 'İstanbul', 'İSTANBUL', 'i̇stanbul', 'i̇stanbul', 'i̇stanbul',  ],
273[ 'İstanbul', 'i̇stanbul', 'İstanbul', 'İSTANBUL', 'i̇stanbul', 'İstanbul', 'İSTANBUL', 'i̇stanbul', 'ı̇stanbul', 'i̇stanbul',  ],
274[ 'İSTANBUL', 'istanbul', 'İstanbul', 'İSTANBUL', 'i̇stanbul', 'İstanbul', 'İSTANBUL', 'İstanbul', 'istanbul', 'i̇stanbul',  ],
275[ 'στιγμας', 'στιγμας', 'Στιγμας', 'ΣΤΙΓΜΑΣ', 'στιγμας', 'Στιγμας', 'ΣΤΙΓΜΑΣ', 'στιγμασ', 'στιγμασ', 'στιγμασ',  ],
276[ 'στιγμασ', 'στιγμασ', 'Στιγμασ', 'ΣΤΙΓΜΑΣ', 'στιγμασ', 'Στιγμασ', 'ΣΤΙΓΜΑΣ', 'στιγμασ', 'στιγμασ', 'στιγμασ',  ],
277[ 'ΣΤΙΓΜΑΣ', 'στιγμασ', 'Στιγμασ', 'ΣΤΙΓΜΑΣ', 'στιγμασ', 'Στιγμασ', 'ΣΤΙΓΜΑΣ', 'στιγμασ', 'στιγμασ', 'στιγμασ',  ],
278[ 'ʀᴀʀᴇ', 'ʀᴀʀᴇ', 'Ʀᴀʀᴇ', 'ƦᴀƦᴇ', 'ʀᴀʀᴇ', 'Ʀᴀʀᴇ', 'ƦᴀƦᴇ', 'ʀᴀʀᴇ', 'ʀᴀʀᴇ', 'ʀᴀʀᴇ',  ],
279[ 'Ʀᴀʀᴇ', 'ʀᴀʀᴇ', 'Ʀᴀʀᴇ', 'ƦᴀƦᴇ', 'ʀᴀʀᴇ', 'Ʀᴀʀᴇ', 'ƦᴀƦᴇ', 'ʀᴀʀᴇ', 'ʀᴀʀᴇ', 'ʀᴀʀᴇ',  ],
280[ 'ƦᴀƦᴇ', 'ʀᴀʀᴇ', 'Ʀᴀʀᴇ', 'ƦᴀƦᴇ', 'ʀᴀʀᴇ', 'Ʀᴀʀᴇ', 'ƦᴀƦᴇ', 'ʀᴀʀᴇ', 'ʀᴀʀᴇ', 'ʀᴀʀᴇ',  ],
281[ 'Ԧԧ', 'ԧԧ', 'Ԧԧ', 'ԦԦ', 'ԧԧ', 'Ԧԧ', 'ԦԦ', 'ԧԧ', 'ԧԧ', 'ԧԧ',  ],
282[ 'ԧԧ', 'ԧԧ', 'Ԧԧ', 'ԦԦ', 'ԧԧ', 'Ԧԧ', 'ԦԦ', 'ԧԧ', 'ԧԧ', 'ԧԧ',  ],
283[ 'Ԧԧ', 'ԧԧ', 'Ԧԧ', 'ԦԦ', 'ԧԧ', 'Ԧԧ', 'ԦԦ', 'ԧԧ', 'ԧԧ', 'ԧԧ',  ],
284[ 'ԦԦ', 'ԧԧ', 'Ԧԧ', 'ԦԦ', 'ԧԧ', 'Ԧԧ', 'ԦԦ', 'ԧԧ', 'ԧԧ', 'ԧԧ',  ],
285[ "þǽr rihtes", "þǽr rihtes", "Þǽr Rihtes", "ÞǼR RIHTES", "þǽr rihtes", "Þǽr Rihtes", "ÞǼR RIHTES", "þǽr rihtes", "þǽr rihtes", "þǽr rihtes",  ],
286[ "duȝeðlice", "duȝeðlice", "Duȝeðlice", "DUȜEÐLICE", "duȝeðlice", "Duȝeðlice", "DUȜEÐLICE", "duȝeðlice", "duȝeðlice", "duȝeðlice",  ],
287[ "Van Dijke", "van dijke", "Van Dijke", "VAN DIJKE", "van dijke", "Van Dijke", "VAN DIJKE", "van dijke", "van dijke", "van dijke",  ],
288[ "fi", "fi", "fi", "fi", "fi", "Fi", "FI", "fi", "fi", "fi",  ],
289[ "filesystem", "filesystem", "filesystem", "fiLESYSTEM", "filesystem", "Filesystem", "FILESYSTEM", "filesystem", "filesystem", "filesystem",  ],
290[ "efficient", "efficient", "Efficient", "EffiCIENT", "efficient", "Efficient", "EFFICIENT", "efficient", "efficient", "efficient",  ],
291[ "flour and water", "flour and water", "flour And Water", "flOUR AND WATER", "flour and water", "Flour And Water", "FLOUR AND WATER", "flour and water", "flour and water", "flour and water",  ],
292[ "dz", "dz", "Dz", "DZ", "dz", "Dz", "DZ", "dz", "dz", "dz",  ],
293[ "dzur mountain", "dzur mountain", "Dzur Mountain", "DZUR MOUNTAIN", "dzur mountain", "Dzur Mountain", "DZUR MOUNTAIN", "dzur mountain", "dzur mountain", "dzur mountain",  ],
294[ "poſt", "poſt", "Poſt", "POST", "poſt", "Poſt", "POST", "post", "post", "post",  ],
295[ "poſt", "poſt", "Poſt", "POſt", "poſt", "Poſt", "POST", "poſt", "post", "post",  ],
296[ "ſtop", "ſtop", "ſtop", "ſtOP", "ſtop", "Stop", "STOP", "ſtop", "stop", "stop",  ],
297[ "tschüß", "tschüß", "Tschüß", "TSCHÜß", "tschüß", "Tschüß", "TSCHÜSS", "tschüß", "tschüss", "tschüss",  ],
298[ "TSCHÜẞ", "tschüß", "Tschüß", "TSCHÜẞ", "tschüß", "Tschüß", "TSCHÜẞ", "tschüß", "tschüss", "tschüss",  ],
299[ "rußland", "rußland", "Rußland", "RUßLAND", "rußland", "Rußland", "RUSSLAND", "rußland", "russland", "russland",  ],
300[ "RUẞLAND", "rußland", "Rußland", "RUẞLAND", "rußland", "Rußland", "RUẞLAND", "rußland", "russland", "russland",  ],
301[ "weiß", "weiß", "Weiß", "WEIß", "weiß", "Weiß", "WEISS", "weiß", "weiss", "weiss",  ],
302[ "WEIẞ", "weiß", "Weiß", "WEIẞ", "weiß", "Weiß", "WEIẞ", "weiß", "weıss", "weiss",  ],
303[ "ẞIEW", "ßiew", "ẞiew", "ẞIEW", "ßiew", "ẞiew", "ẞIEW", "ßiew", "ssıew", "ssiew",  ],
304[ "ͅ", "ͅ", "Ι", "Ι", "ͅ", "Ι", "Ι", "ι", "ι", "ι",  ],
305[ "ᾲ", "ᾲ", "Ὰͅ", "ᾺΙ", "ᾲ", "Ὰͅ", "ᾺΙ", "ὰι", "ὰι", "ὰι",  ],
306[ "Ὰι", "ὰι", "Ὰι", "ᾺΙ", "ὰι", "Ὰι", "ᾺΙ", "ὰι", "ὰι", "ὰι",  ],
307[ "ᾺΙ", "ὰι", "Ὰι", "ᾺΙ", "ὰι", "Ὰι", "ᾺΙ", "ὰι", "ὰι", "ὰι",  ],
308[ "ᾲ", "ᾲ", "ᾲ", "ᾲ", "ᾲ", "Ὰͅ", "ᾺΙ", "ᾲ", "ὰι", "ὰι",  ],
309[ "Ὰͅ", "ᾲ", "Ὰͅ", "ᾺΙ", "ᾲ", "Ὰͅ", "ᾺΙ", "ὰι", "ὰι", "ὰι",  ],
310[ "ᾺΙ", "ὰι", "Ὰι", "ᾺΙ", "ὰι", "Ὰι", "ᾺΙ", "ὰι", "ὰι", "ὰι",  ],
311[ "ᾲ στο διάολο", "ᾲ στο διάολο", "ᾲ Στο Διάολο", "ᾲ ΣΤΟ ΔΙΆΟΛΟ", "ᾲ στο διάολο", "Ὰͅ Στο Διάολο", "ᾺΙ ΣΤΟ ΔΙΆΟΛΟ", "ᾲ στο διάολο", "ὰι στο διάολο", "ὰι στο διάολο",  ],
312[ "ᾲ στο διάολο", "ᾲ στο διάολο", "Ὰͅ Στο Διάολο", "ᾺΙ ΣΤΟ ΔΙΆΟΛΟ", "ᾲ στο διάολο", "Ὰͅ Στο Διάολο", "ᾺΙ ΣΤΟ ΔΙΆΟΛΟ", "ὰι στο διάολο", "ὰι στο διάολο", "ὰι στο διάολο",  ],
313[ "ⅷ", "ⅷ", "Ⅷ", "Ⅷ", "ⅷ", "Ⅷ", "Ⅷ", "ⅷ", "ⅷ", "ⅷ",  ],
314[ "henry ⅷ", "henry ⅷ", "Henry Ⅷ", "HENRY Ⅷ", "henry ⅷ", "Henry Ⅷ", "HENRY Ⅷ", "henry ⅷ", "henry ⅷ", "henry ⅷ",  ],
315[ "ⓚ", "ⓚ", "Ⓚ", "Ⓚ", "ⓚ", "Ⓚ", "Ⓚ", "ⓚ", "ⓚ", "ⓚ",  ],
316[ "i work at ⓚ", "i work at ⓚ", "I Work At Ⓚ", "I WORK AT Ⓚ", "i work at ⓚ", "I Work At Ⓚ", "I WORK AT Ⓚ", "i work at ⓚ", "i work at ⓚ", "i work at ⓚ",  ],
317[ "istambul", "istambul", "Istambul", "ISTAMBUL", "istambul", "Istambul", "ISTAMBUL", "istambul", "istambul", "istambul",  ],
318[ "i̇stanbul", "i̇stanbul", "İstanbul", "İSTANBUL", "i̇stanbul", "İstanbul", "İSTANBUL", "i̇stanbul", "i̇stanbul", "i̇stanbul",  ],
319[ "İstanbul", "i̇stanbul", "İstanbul", "İSTANBUL", "i̇stanbul", "İstanbul", "İSTANBUL", "i̇stanbul", "ı̇stanbul", "i̇stanbul",  ],
320[ "İSTANBUL", "istanbul", "İstanbul", "İSTANBUL", "i̇stanbul", "İstanbul", "İSTANBUL", "İstanbul", "istanbul", "i̇stanbul",  ],
321[ "στιγμας", "στιγμας", "Στιγμας", "ΣΤΙΓΜΑΣ", "στιγμας", "Στιγμας", "ΣΤΙΓΜΑΣ", "στιγμασ", "στιγμασ", "στιγμασ",  ],
322[ "στιγμασ", "στιγμασ", "Στιγμασ", "ΣΤΙΓΜΑΣ", "στιγμασ", "Στιγμασ", "ΣΤΙΓΜΑΣ", "στιγμασ", "στιγμασ", "στιγμασ",  ],
323[ "ΣΤΙΓΜΑΣ", "στιγμασ", "Στιγμασ", "ΣΤΙΓΜΑΣ", "στιγμασ", "Στιγμασ", "ΣΤΙΓΜΑΣ", "στιγμασ", "στιγμασ", "στιγμασ",  ],
324[ "ʀᴀʀᴇ", "ʀᴀʀᴇ", "Ʀᴀʀᴇ", "ƦᴀƦᴇ", "ʀᴀʀᴇ", "Ʀᴀʀᴇ", "ƦᴀƦᴇ", "ʀᴀʀᴇ", "ʀᴀʀᴇ", "ʀᴀʀᴇ",  ],
325[ "��������������", "��������������", "��������������", "��������������", "��������������", "��������������", "��������������", "��������������", "��������������", "��������������",  ],
326[ "Ԧԧ", "ԧԧ", "Ԧԧ", "ԦԦ", "ԧԧ", "Ԧԧ", "ԦԦ", "ԧԧ", "ԧԧ", "ԧԧ",  ],
327[ "ﬓﬔﬕﬖﬗ", "ﬓﬔﬕﬖﬗ", "ﬓﬔﬕﬖﬗ", "ﬓﬔﬕﬖﬗ", "ﬓﬔﬕﬖﬗ", "Մնﬔﬕﬖﬗ", "ՄՆՄԵՄԻՎՆՄԽ", "ﬓﬔﬕﬖﬗ", "մնմեմիվնմխ", "մնմեմիվնմխ",  ],
328[ "ʼn groot", "ʼn groot", "ʼn Groot", "ʼn GROOT", "ʼn groot", "ʼN Groot", "ʼN GROOT", "ʼn groot", "ʼn groot", "ʼn groot",  ],
329[ "ẚ", "ẚ", "ẚ", "ẚ", "ẚ", "Aʾ", "Aʾ", "ẚ", "aʾ", "aʾ",  ],
330[ "ff", "ff", "ff", "ff", "ff", "Ff", "FF", "ff", "ff", "ff",  ],
331[ "ǰ", "ǰ", "ǰ", "ǰ", "ǰ", "J̌", "J̌", "ǰ", "ǰ", "ǰ",  ],
332[ "550 nm or Å", "550 nm or å", "550 Nm Or Å", "550 NM OR Å", "550 nm or å", "550 Nm Or Å", "550 NM OR Å", "550 nm or å", "550 nm or å", "550 nm or å",  ],
333);
334
335    use feature qw(fc);
336
337    for (@test_table) {
338        my ($simple_lc, $simple_tc, $simple_uc, $simple_fc) = @{$_}[1, 2, 3, 7];
339        my ($orig, $lower, $titlecase, $upper, $fc_turkic, $fc_full) = @{$_}[0,4,5,6,8,9];
340
341        if ($orig =~ /(\P{Assigned})/) {   # So can fail gracefully in earlier
342                                           # Unicode versions
343            fail(sprintf "because U+%04X is unassigned", ord($1));
344            next;
345        }
346        is( fc($orig), $fc_full, "fc('$orig') returns '$fc_full'" );
347        is( "\F$orig", $fc_full, '\F works' );
348        is( lc($orig), $lower,   "lc('$orig') returns '$lower'" );
349        is( "\L$orig", $lower,   '\L works' );
350        is( uc($orig), $upper,   "uc('$orig') returns '$upper'" );
351        is( "\U$orig", $upper,   '\U works' );
352    }
353}
354
355{
356    use feature qw(fc);
357    package Eeyup  { use overload q{""} => sub { "\x{df}"   }, fallback => 1 }
358    package Uunope { use overload q{""} => sub { "\x{30cb}" }, fallback => 1 }
359    package Undef  { use overload q{""} => sub {   undef    }, fallback => 1 }
360
361    my $obj = bless {}, "Eeyup";
362    is(fc($obj), "ss", "fc() works on overloaded objects returning latin-1");
363    $obj = bless {}, "Eeyup";
364    is("\F$obj", "ss", '\F works on overloaded objects returning latin-1');
365
366    $obj = bless {}, "Uunope";
367    is(fc($obj), "\x{30cb}", "fc() works on overloaded objects returning UTF-8");
368    $obj = bless {}, "Uunope";
369    is("\F$obj", "\x{30cb}", '\F works on overloaded objects returning UTF-8');
370
371    $obj = bless {}, "Undef";
372    my $warnings;
373    {
374        no warnings;
375        use warnings "uninitialized";
376        local $SIG{__WARN__} = sub { $warnings++; like(shift, qr/Use of uninitialized value (?:\$obj )?in fc/) };
377        fc(undef);
378        fc($obj);
379    }
380    is( $warnings, 2, "correct number of warnings" );
381
382    my $fetched = 0;
383    package Derpy { sub TIESCALAR { bless {}, shift } sub FETCH { $fetched++; "\x{df}" } }
384
385    tie my $x, "Derpy";
386
387    is( fc($x), "ss", "fc() works on tied values" );
388    is( $fetched, 1, "and only calls the magic once" );
389
390}
391
392{
393    use feature qw( fc );
394    my $troublesome1 = "\xdf" x 11; #SvLEN should be 12, SvCUR should be 11
395                                    #So this should force fc() to grow the string.
396
397    is( fc($troublesome1), "ss" x 11, "fc() grows the string" );
398
399    my $troublesome2 = "abcdef:\x{df}:fjksjs"; #SvLEN should be 16, SvCUR should be 15
400    is( fc($troublesome2), "abcdef:ss:fjksjs", "fc() expands \\x{DF} in the middle of a string that needs to grow" );
401
402    my $troublesome3 = ":\x{df}:";
403    is( fc($troublesome3), ":ss:", "fc() expands \\x{DF} in the middle of a string" );
404
405
406    my $troublesome4 = "\x{B5}"; #\N{MICRON SIGN} is latin-1, but its foldcase is in UTF-8
407
408    is( fc($troublesome4), "\x{3BC}", "fc() for a latin-1 \x{B5} returns UTF-8" );
409    ok( !utf8::is_utf8($troublesome4), "fc() doesn't upgrade the original string" );
410
411
412    my $troublesome5 = "\x{C9}abda\x{B5}aaf\x{C8}"; # Up until foldcasing \x{B5}, the string
413                                                    # was in Latin-1. This tests that the
414                                                    # results don't have illegal UTF-8
415                                                    # (i.e. leftover latin-1) in them
416
417    is( fc($troublesome5), "\x{E9}abda\x{3BC}aaf\x{E8}" );
418}
419
420
421{
422    use feature qw( fc unicode_strings );
423    use if $Config{d_setlocale}, qw(POSIX locale_h);
424    setlocale(&POSIX::LC_ALL, "C") if $Config{d_setlocale};
425
426    # This tests both code paths in pp_fc
427
428    for (0..0xff) {
429        my $latin1 = chr;
430        my $utf8   = $latin1;
431        utf8::downgrade($latin1); #No-op, but doesn't hurt
432        utf8::upgrade($utf8);
433        is(fc($latin1), fc($utf8), "fc() gives the same results for \\x{$_} in Latin-1 and UTF-8 under unicode_strings");
434        SKIP: {
435              skip 'No locale testing without d_setlocale', 2 if(!$Config{d_setlocale}) || $Config::Config{ccflags} =~ /\bD?NO_LOCALE(_|\b)/;
436              BEGIN {
437                  if($Config{d_setlocale}) {
438                      require locale; import locale;
439                  }
440              }
441            is(fc($latin1), lc($latin1), "use locale; fc(qq{\\x{$_}}), lc(qq{\\x{$_}}) when qq{\\x{$_}} is in latin-1");
442            is(fc($utf8), lc($utf8), "use locale; fc(qq{\\x{$_}}), lc(qq{\\x{$_}}) when qq{\\x{$_}} is in latin-1");
443        }
444        SKIP: {
445            if (
446                !$Config::Config{d_setlocale}
447            || $Config::Config{ccflags} =~ /\bD?NO_LOCALE(_|\b)/
448            ) {
449                skip "no locale support", 2
450            }
451            no feature 'unicode_strings';
452            is(fc($latin1), lc($latin1), "under nothing, fc() for <256 is the same as lc");
453        }
454    }
455}
456
457my $utf8_locale = find_utf8_ctype_locale();
458
459{
460    use feature qw( fc );
461    use locale;
462    is(fc("\x{1E9E}"), fc("\x{17F}\x{17F}"), 'fc("\x{1E9E}") eq fc("\x{17F}\x{17F}")');
463    SKIP: {
464        skip 'Can\'t find a UTF-8 locale', 1 unless defined $utf8_locale;
465        setlocale(&LC_CTYPE, $utf8_locale);
466        is(fc("\x{1E9E}"), "ss", 'fc("\x{1E9E}") eq "ss" in a UTF-8 locale)');
467    }
468}
469
470SKIP: {
471    skip 'Can\'t find a UTF-8 locale', 256 unless defined $utf8_locale;
472
473    use feature qw( fc unicode_strings );
474
475    # Get the official fc values outside locale.
476    no locale;
477    my @unicode_fc;
478    for (0..0xff) {
479        push @unicode_fc, fc(chr);
480    }
481
482    # These should match the UTF-8 locale values
483    setlocale(&LC_CTYPE, $utf8_locale);
484    use locale;
485    for (0..0xff) {
486        is(fc(chr), $unicode_fc[$_], "In a UTF-8 locale, fc(chr $_) is the same as official Unicode");
487    }
488}
489
490
491my $num_tests = curr_test() - 1;
492
493plan($num_tests);
494