1#!perl 2 3BEGIN { 4 chdir 't' if -d 't'; 5 require './test.pl'; 6 set_up_inc('../lib'); 7 skip_all_if_miniperl("no dynamic loading on miniperl, no File::Spec"); 8} 9 10use strict; 11use warnings; 12my @tests; 13 14my $file="../lib/unicore/CaseFolding.txt"; 15my @folds; 16use Unicode::UCD; 17 18# Use the Unicode data file if we are on an ASCII platform (which its data is 19# for), and it is in the modern format (starting in Unicode 3.1.0) and it is 20# available. This avoids being affected by potential bugs introduced by other 21# layers of Perl 22if (ord('A') == 65 23 && pack("C*", split /\./, Unicode::UCD::UnicodeVersion()) ge v3.1.0 24 && open my $fh, "<", $file) 25{ 26 @folds = <$fh>; 27} 28else { 29 my ($invlist_ref, $invmap_ref, undef, $default) 30 = Unicode::UCD::prop_invmap('Case_Folding'); 31 for my $i (0 .. @$invlist_ref - 1 - 1) { 32 next if $invmap_ref->[$i] == $default; 33 my $adjust = -1; 34 for my $j ($invlist_ref->[$i] .. $invlist_ref->[$i+1] -1) { 35 $adjust++; 36 37 # Single-code point maps go to a 'C' type 38 if (! ref $invmap_ref->[$i]) { 39 push @folds, sprintf("%04X; C; %04X\n", 40 $j, 41 $invmap_ref->[$i] + $adjust); 42 } 43 else { # Multi-code point maps go to 'F'. prop_invmap() 44 # guarantees that no adjustment is needed for these, 45 # as the range will contain just one element 46 push @folds, sprintf("%04X; F; %s\n", 47 $j, 48 join " ", map { sprintf "%04X", $_ } 49 @{$invmap_ref->[$i]}); 50 } 51 } 52 } 53} 54 55for (@folds) { 56 chomp; 57 my ($line,$comment)= split/\s+#\s+/, $_; 58 $comment = "" unless defined $comment; 59 my ($cp,$type,@folded)=split/[\s;]+/,$line||''; 60 next unless $type and ($type eq 'F' or $type eq 'C'); 61 my $fold_above_latin1 = grep { hex("0x$_") > 255 } @folded; 62 $_="\\x{$_}" for @folded; 63 my $cpv=hex("0x$cp"); 64 my $chr="\\x{$cp}"; 65 my @str; 66 foreach my $swap (0, 1) { # swap lhs and rhs, or not. 67 foreach my $charclass (0, 1) { # Put rhs in [...], or not 68 my $lhs; 69 my $rhs; 70 if ($swap) { 71 $lhs = join "", @folded; 72 $rhs = $chr; 73 $rhs = "[$rhs]" if $charclass; 74 } else { 75 #next if $charclass && @folded > 1; 76 $lhs = $chr; 77 $rhs = ""; 78 foreach my $rhs_char (@folded) { 79 80 # The colon is an unrelated character to the rest of the 81 # class, and makes sure no optimization into an EXACTish 82 # node occurs. 83 $rhs .= '[:' if $charclass; 84 $rhs .= $rhs_char; 85 $rhs .= ']' if $charclass; 86 } 87 } 88 $lhs = "\"$lhs\""; 89 $rhs = "/^$rhs\$/iu"; 90 91 # Try both Latin1 and Unicode for code points below 256 92 foreach my $upgrade ("", 'utf8::upgrade($c); ') { 93 if ($upgrade) { # No need to upgrade if already must be in 94 # utf8 95 next if $swap && $fold_above_latin1; 96 next if !$swap && $cpv > 255; 97 } 98 my $eval = "my \$c = $lhs; $upgrade\$c =~ $rhs"; 99 #print __LINE__, ": $eval\n"; 100 push @tests, qq[ok(eval '$eval', '$eval - $comment')]; 101 if (! $swap && $charclass && @folded > 1) 102 { 103 $tests[-1]="TODO: { local \$::TODO='A multi-char fold \"foo\", doesnt work for /[f][o][o]/i';\n$tests[-1] }" 104 } 105 } 106 } 107 } 108} 109 110# Now verify the case folding tables. First compute the mappings without 111# resorting to the functions we're testing. 112 113# Initialize the array so each $i maps to itself. 114my @fold_ascii; 115for my $i (0 .. 255) { 116 $fold_ascii[$i] = $i; 117} 118my @fold_latin1 = @fold_ascii; 119 120# Override the uppercase elements to fold to their lower case equivalents, 121# using the fact that 'A' in ASCII is 0x41, 'a' is 0x41+32, 'B' is 0x42, and 122# so on. The same paradigm applies for most of the Latin1 range cased 123# characters, but in posix anything outside ASCII maps to itself, as we've 124# already set up. 125for my $i (0x41 .. 0x5A, 0xC0 .. 0xD6, 0xD8 .. 0xDE) { 126 my $upper_ord = utf8::unicode_to_native($i); 127 my $lower_ord = utf8::unicode_to_native($i + 32); 128 129 $fold_latin1[$upper_ord] = $lower_ord; 130 131 next if $i > 127; 132 $fold_ascii[$upper_ord] = $lower_ord; 133} 134 135# Same for folding lower to the upper equivalents 136for my $i (0x61 .. 0x7A, 0xE0 .. 0xF6, 0xF8 .. 0xFE) { 137 my $lower_ord = utf8::unicode_to_native($i); 138 my $upper_ord = utf8::unicode_to_native($i - 32); 139 140 $fold_latin1[$lower_ord] = $upper_ord; 141 142 next if $i > 127; 143 $fold_ascii[$lower_ord] = $upper_ord; 144} 145 146# Test every latin1 character for the correct values in both /u and /d 147for my $i (0 .. 255) { 148 my $chr = sprintf "\\x%02X", $i; 149 my $hex_fold_ascii = sprintf "0x%02X", $fold_ascii[$i]; 150 my $hex_fold_latin1 = sprintf "0x%02X", $fold_latin1[$i]; 151 push @tests, qq[like chr($hex_fold_ascii), qr/(?d:$chr)/i, 'chr($hex_fold_ascii) =~ qr/(?d:$chr)/i']; 152 push @tests, qq[like chr($hex_fold_latin1), qr/(?u:$chr)/i, 'chr($hex_fold_latin1) =~ qr/(?u:$chr)/i']; 153} 154 155 156push @tests, qq[like chr(0x0430), qr/[=\x{0410}-\x{0411}]/i, 'Bug #71752 Unicode /i char in a range']; 157push @tests, qq[like 'a', qr/\\p{Upper}/i, "'a' =~ /\\\\p{Upper}/i"]; 158push @tests, q[my $c = "\x{212A}"; my $p = qr/(?:^[K_]+$)/i; utf8::upgrade($p); like $c, qr/$p/, 'Bug #78994: my $c = "\x{212A}"; my $p = qr/(?:^[K_]+$)/i; utf8::upgrade($p); $c =~ $p']; 159 160use charnames ":full"; 161my $e_grave = chr utf8::unicode_to_native(0xE8); 162push @tests, q[my $re1 = "\N{WHITE SMILING FACE}";like $e_grave, qr/[\w$re1]/, 'my $re = "\N{WHITE SMILING FACE}"; $e_grave =~ qr/[\w$re]/']; 163push @tests, q[my $re2 = "\N{WHITE SMILING FACE}";like $e_grave, qr/\w|$re2/, 'my $re = "\N{WHITE SMILING FACE}"; $e_grave =~ qr/\w|$re/']; 164 165eval join ";\n","plan tests=>". (scalar @tests), @tests, "1" 166 or die $@; 167__DATA__ 168