1#!perl -w 2 3use strict; 4use Test::More; 5 6use XS::APItest; 7 8foreach ([0, '', '', 'empty'], 9 [0, 'N', 'N', '1 char'], 10 [1, 'NN', 'N', '1 char substring'], 11 [-2, 'Perl', 'Rules', 'different'], 12 [0, chr 163, chr 163, 'pound sign'], 13 [1, chr (163) . 10, chr (163) . 1, '10 pounds is more than 1 pound'], 14 [1, chr(163) . chr(163), chr 163, '2 pound signs are more than 1'], 15 [-2, ' $!', " \x{1F42B}!", 'Camels are worth more than 1 dollar'], 16 [-1, '!', "!\x{1F42A}", 'Initial substrings match'], 17 ) { 18 my ($expect, $left, $right, $desc) = @$_; 19 my $copy = $right; 20 utf8::encode($copy); 21 is(bytes_cmp_utf8($left, $copy), $expect, $desc); 22 next if $right =~ tr/\0-\377//c; 23 utf8::encode($left); 24 is(bytes_cmp_utf8($right, $left), -$expect, "$desc reversed"); 25} 26 27# Test uft8n_to_uvchr(). These provide essentially complete code coverage. 28 29# Copied from utf8.h 30my $UTF8_ALLOW_EMPTY = 0x0001; 31my $UTF8_ALLOW_CONTINUATION = 0x0002; 32my $UTF8_ALLOW_NON_CONTINUATION = 0x0004; 33my $UTF8_ALLOW_SHORT = 0x0008; 34my $UTF8_ALLOW_LONG = 0x0010; 35my $UTF8_DISALLOW_SURROGATE = 0x0020; 36my $UTF8_WARN_SURROGATE = 0x0040; 37my $UTF8_DISALLOW_NONCHAR = 0x0080; 38my $UTF8_WARN_NONCHAR = 0x0100; 39my $UTF8_DISALLOW_SUPER = 0x0200; 40my $UTF8_WARN_SUPER = 0x0400; 41my $UTF8_DISALLOW_FE_FF = 0x0800; 42my $UTF8_WARN_FE_FF = 0x1000; 43my $UTF8_CHECK_ONLY = 0x2000; 44 45my $REPLACEMENT = 0xFFFD; 46 47my @warnings; 48 49use warnings 'utf8'; 50local $SIG{__WARN__} = sub { push @warnings, @_ }; 51 52# First test the malformations. All these raise category utf8 warnings. 53foreach my $test ( 54 [ "zero length string malformation", "", 0, 55 $UTF8_ALLOW_EMPTY, 0, 0, 56 qr/empty string/ 57 ], 58 [ "orphan continuation byte malformation", "\x80a", 2, 59 $UTF8_ALLOW_CONTINUATION, $REPLACEMENT, 1, 60 qr/unexpected continuation byte/ 61 ], 62 [ "premature next character malformation (immediate)", "\xc2a", 2, 63 $UTF8_ALLOW_NON_CONTINUATION, $REPLACEMENT, 1, 64 qr/unexpected non-continuation byte.*immediately after start byte/ 65 ], 66 [ "premature next character malformation (non-immediate)", "\xf0\x80a", 3, 67 $UTF8_ALLOW_NON_CONTINUATION, $REPLACEMENT, 2, 68 qr/unexpected non-continuation byte .* 2 bytes after start byte/ 69 ], 70 [ "too short malformation", "\xf0\x80a", 2, 71 # Having the 'a' after this, but saying there are only 2 bytes also 72 # tests that we pay attention to the passed in length 73 $UTF8_ALLOW_SHORT, $REPLACEMENT, 2, 74 qr/2 bytes, need 4/ 75 ], 76 [ "overlong malformation", "\xc1\xaf", 2, 77 $UTF8_ALLOW_LONG, ord('o'), 2, 78 qr/2 bytes, need 1/ 79 ], 80 [ "overflow malformation", "\xff\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf", 13, 81 0, # There is no way to allow this malformation 82 $REPLACEMENT, 13, 83 qr/overflow/ 84 ], 85) { 86 my ($testname, $bytes, $length, $allow_flags, $allowed_uv, $expected_len, $message ) = @$test; 87 88 next if ! ok(length($bytes) >= $length, "$testname: Make sure won't read beyond buffer: " . length($bytes) . " >= $length"); 89 90 # Test what happens when this malformation is not allowed 91 undef @warnings; 92 my $ret_ref = test_utf8n_to_uvchr($bytes, $length, 0); 93 is($ret_ref->[0], 0, "$testname: disallowed: Returns 0"); 94 is($ret_ref->[1], $expected_len, "$testname: disallowed: Returns expected length"); 95 if (is(scalar @warnings, 1, "$testname: disallowed: Got a single warning ")) { 96 like($warnings[0], $message, "$testname: disallowed: Got expected warning"); 97 } 98 else { 99 if (scalar @warnings) { 100 note "The warnings were: " . join(", ", @warnings); 101 } 102 } 103 104 { # Next test when disallowed, and warnings are off. 105 undef @warnings; 106 no warnings 'utf8'; 107 my $ret_ref = test_utf8n_to_uvchr($bytes, $length, 0); 108 is($ret_ref->[0], 0, "$testname: disallowed: no warnings 'utf8': Returns 0"); 109 is($ret_ref->[1], $expected_len, "$testname: disallowed: no warnings 'utf8': Returns expected length"); 110 if (!is(scalar @warnings, 0, "$testname: disallowed: no warnings 'utf8': no warnings generated")) { 111 note "The warnings were: " . join(", ", @warnings); 112 } 113 } 114 115 # Test with CHECK_ONLY 116 undef @warnings; 117 $ret_ref = test_utf8n_to_uvchr($bytes, $length, $UTF8_CHECK_ONLY); 118 is($ret_ref->[0], 0, "$testname: CHECK_ONLY: Returns 0"); 119 is($ret_ref->[1], -1, "$testname: CHECK_ONLY: returns expected length"); 120 if (! is(scalar @warnings, 0, "$testname: CHECK_ONLY: no warnings generated")) { 121 note "The warnings were: " . join(", ", @warnings); 122 } 123 124 next if $allow_flags == 0; # Skip if can't allow this malformation 125 126 # Test when the malformation is allowed 127 undef @warnings; 128 $ret_ref = test_utf8n_to_uvchr($bytes, $length, $allow_flags); 129 is($ret_ref->[0], $allowed_uv, "$testname: allowed: Returns expected uv"); 130 is($ret_ref->[1], $expected_len, "$testname: allowed: Returns expected length"); 131 if (!is(scalar @warnings, 0, "$testname: allowed: no warnings generated")) 132 { 133 note "The warnings were: " . join(", ", @warnings); 134 } 135} 136 137my $FF_ret; 138 139use Unicode::UCD; 140my $has_quad = ($Unicode::UCD::MAX_CP > 0xFFFF_FFFF); 141if ($has_quad) { 142 no warnings qw{portable overflow}; 143 $FF_ret = 0x1000000000; 144} 145else { # The above overflows unless a quad platform 146 $FF_ret = 0; 147} 148 149# Now test the cases where a legal code point is generated, but may or may not 150# be allowed/warned on. 151my @tests = ( 152 [ "surrogate", "\xed\xa4\x8d", 153 $UTF8_WARN_SURROGATE, $UTF8_DISALLOW_SURROGATE, 'surrogate', 0xD90D, 3, 154 qr/surrogate/ 155 ], 156 [ "non_unicode", "\xf4\x90\x80\x80", 157 $UTF8_WARN_SUPER, $UTF8_DISALLOW_SUPER, 'non_unicode', 0x110000, 4, 158 qr/not Unicode/ 159 ], 160 [ "non-character code point", "\xEF\xB7\x90", 161 $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR, 'nonchar', 0xFDD0, 3, 162 qr/Unicode non-character.*is illegal for open interchange/ 163 ], 164 [ "begins with FE", "\xfe\x82\x80\x80\x80\x80\x80", 165 166 # This code point is chosen so that it is representable in a UV on 167 # 32-bit machines 168 $UTF8_WARN_FE_FF, $UTF8_DISALLOW_FE_FF, 'utf8', 0x80000000, 7, 169 qr/Code point 0x80000000 is not Unicode, and not portable/ 170 ], 171 [ "overflow with FE/FF", 172 # This tests the interaction of WARN_FE_FF/DISALLOW_FE_FF with 173 # overflow. The overflow malformation is never allowed, so preventing 174 # it takes precedence if the FE_FF options would otherwise allow in an 175 # overflowing value. These two code points (1 for 32-bits; 1 for 64) 176 # were chosen because the old overflow detection algorithm did not 177 # catch them; this means this test also checks for that fix. 178 ($has_quad) 179 ? "\xff\x80\x90\x90\x90\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf" 180 : "\xfe\x86\x80\x80\x80\x80\x80", 181 182 # We include both warning categories to make sure the FE_FF one has 183 # precedence 184 "$UTF8_WARN_FE_FF|$UTF8_WARN_SUPER", "$UTF8_DISALLOW_FE_FF", 'utf8', 0, 185 ($has_quad) ? 13 : 7, 186 qr/overflow at byte .*, after start byte 0xf/ 187 ], 188); 189 190if ($has_quad) { # All FF's will overflow on 32 bit 191 push @tests, 192 [ "begins with FF", "\xff\x80\x80\x80\x80\x80\x81\x80\x80\x80\x80\x80\x80", 193 $UTF8_WARN_FE_FF, $UTF8_DISALLOW_FE_FF, 'utf8', $FF_ret, 13, 194 qr/Code point 0x.* is not Unicode, and not portable/ 195 ]; 196} 197 198foreach my $test (@tests) { 199 my ($testname, $bytes, $warn_flags, $disallow_flags, $category, $allowed_uv, $expected_len, $message ) = @$test; 200 201 my $length = length $bytes; 202 my $will_overflow = $testname =~ /overflow/; 203 204 # This is more complicated than the malformations tested earlier, as there 205 # are several orthogonal variables involved. We test all the subclasses 206 # of utf8 warnings to verify they work with and without the utf8 class, 207 # and don't have effects on other sublass warnings 208 foreach my $warning ('utf8', 'surrogate', 'nonchar', 'non_unicode') { 209 foreach my $warn_flag (0, $warn_flags) { 210 foreach my $disallow_flag (0, $disallow_flags) { 211 foreach my $do_warning (0, 1) { 212 213 my $eval_warn = $do_warning 214 ? "use warnings '$warning'" 215 : $warning eq "utf8" 216 ? "no warnings 'utf8'" 217 : "use warnings 'utf8'; no warnings '$warning'"; 218 219 # is effectively disallowed if will overflow, even if the 220 # flag indicates it is allowed, fix up test name to 221 # indicate this as well 222 my $disallowed = $disallow_flag || $will_overflow; 223 224 my $this_name = "$testname: " . (($disallow_flag) 225 ? 'disallowed' 226 : ($disallowed) 227 ? 'FE_FF allowed' 228 : 'allowed'); 229 $this_name .= ", $eval_warn"; 230 $this_name .= ", " . (($warn_flag) 231 ? 'with warning flag' 232 : 'no warning flag'); 233 234 undef @warnings; 235 my $ret_ref; 236 #note __LINE__ . ": $eval_warn; \$ret_ref = test_utf8n_to_uvchr('$bytes', $length, $warn_flag|$disallow_flag)"; 237 my $eval_text = "$eval_warn; \$ret_ref = test_utf8n_to_uvchr('$bytes', $length, $warn_flag|$disallow_flag)"; 238 eval "$eval_text"; 239 if (! ok ("$@ eq ''", "$this_name: eval succeeded")) { 240 note "\$!='$!'; eval'd=\"$eval_text\""; 241 next; 242 } 243 if ($disallowed) { 244 is($ret_ref->[0], 0, "$this_name: Returns 0"); 245 } 246 else { 247 is($ret_ref->[0], $allowed_uv, 248 "$this_name: Returns expected uv"); 249 } 250 is($ret_ref->[1], $expected_len, 251 "$this_name: Returns expected length"); 252 253 if (! $do_warning 254 && ($warning eq 'utf8' || $warning eq $category)) 255 { 256 if (!is(scalar @warnings, 0, 257 "$this_name: No warnings generated")) 258 { 259 note "The warnings were: " . join(", ", @warnings); 260 } 261 } 262 elsif ($will_overflow 263 && ! $disallow_flag 264 && $warning eq 'utf8') 265 { 266 267 # Will get the overflow message instead of the expected 268 # message under these circumstances, as they would 269 # otherwise accept an overflowed value, which the code 270 # should not allow, so falls back to overflow. 271 if (is(scalar @warnings, 1, 272 "$this_name: Got a single warning ")) 273 { 274 like($warnings[0], qr/overflow/, 275 "$this_name: Got overflow warning"); 276 } 277 else { 278 if (scalar @warnings) { 279 note "The warnings were: " 280 . join(", ", @warnings); 281 } 282 } 283 } 284 elsif ($warn_flag 285 && ($warning eq 'utf8' || $warning eq $category)) 286 { 287 if (is(scalar @warnings, 1, 288 "$this_name: Got a single warning ")) 289 { 290 like($warnings[0], $message, 291 "$this_name: Got expected warning"); 292 } 293 else { 294 if (scalar @warnings) { 295 note "The warnings were: " 296 . join(", ", @warnings); 297 } 298 } 299 } 300 301 # Check CHECK_ONLY results when the input is disallowed. Do 302 # this when actually disallowed, not just when the 303 # $disallow_flag is set 304 if ($disallowed) { 305 undef @warnings; 306 $ret_ref = test_utf8n_to_uvchr($bytes, $length, 307 $disallow_flag|$UTF8_CHECK_ONLY); 308 is($ret_ref->[0], 0, "$this_name, CHECK_ONLY: Returns 0"); 309 is($ret_ref->[1], -1, 310 "$this_name: CHECK_ONLY: returns expected length"); 311 if (! is(scalar @warnings, 0, 312 "$this_name, CHECK_ONLY: no warnings generated")) 313 { 314 note "The warnings were: " . join(", ", @warnings); 315 } 316 } 317 } 318 } 319 } 320 } 321} 322 323done_testing; 324