1#!perl -w 2 3# This is a base file to be used by various .t's in its directory 4# It tests various malformed UTF-8 sequences and some code points that are 5# "problematic", and verifies that the correct warnings/flags etc are 6# generated when using them. For the code points, it also takes the UTF-8 and 7# perturbs it to be malformed in various ways, and tests that this gets 8# appropriately detected. 9 10use strict; 11use Test::More; 12 13BEGIN { 14 use_ok('XS::APItest'); 15 require 'charset_tools.pl'; 16 require './t/utf8_setup.pl'; 17}; 18 19$|=1; 20 21use XS::APItest; 22 23my @warnings_gotten; 24 25use warnings 'utf8'; 26local $SIG{__WARN__} = sub { my @copy = @_; 27 push @warnings_gotten, map { chomp; $_ } @copy; 28 }; 29 30my $highest_non_extended_utf8_cp = (isASCII) ? 0x7FFFFFFF : 0x3FFFFFFF; 31my $native_lowest_continuation_chr = I8_to_native(chr $::lowest_continuation); 32 33# C5 is chosen as it is valid for both ASCII and EBCDIC platforms 34my $known_start_byte = I8_to_native("\xC5"); 35 36sub requires_extended_utf8($) { 37 38 # Returns a boolean as to whether or not the code point parameter fits 39 # into 31 bits, subject to the convention that a negative code point 40 # stands for one that overflows the word size, so won't fit in 31 bits. 41 42 return shift > $highest_non_extended_utf8_cp; 43} 44 45sub is_extended_utf8($) { 46 47 # Returns a boolean as to whether or not the input UTF-8 sequence uses 48 # Perl extended UTF-8. 49 50 my $byte = substr(shift, 0, 1); 51 return ord $byte >= 0xFE if isASCII; 52 return $byte == I8_to_native("\xFF"); 53} 54 55sub overflow_discern_len($) { 56 57 # Returns how many bytes are needed to tell if a non-overlong UTF-8 58 # sequence is for a code point that won't fit in the platform's word size. 59 # Only the length of the sequence representing a single code point is 60 # needed. 61 62 if (isASCII) { 63 return ($::is64bit) ? 3 : 1; 64 65 # Below is needed for code points above IV_MAX 66 #return ($::is64bit) ? 3 : ((shift == $::max_bytes) 67 # ? 1 68 # : 2); 69 } 70 71 return ($::is64bit) ? 2 : 8; 72} 73 74sub overlong_discern_len($) { 75 76 # Returns how many bytes are needed to tell if the input UTF-8 sequence 77 # for a code point is overlong 78 79 my $string = shift; 80 my $length = length $string; 81 my $byte = ord native_to_I8(substr($string, 0, 1)); 82 if (isASCII) { 83 return ($byte >= 0xFE) 84 ? ((! $::is64bit) 85 ? 1 86 : ($byte == 0xFF) ? 7 : 2) 87 : (($length == 2) ? 1 : 2); 88 # Below is needed for code points above IV_MAX 89 #return ($length == $::max_bytes) 90 # # This is constrained to 1 on 32-bit machines, as it 91 # # overflows there 92 # ? (($::is64bit) ? 7 : 1) 93 # : (($length == 2) ? 1 : 2); 94 } 95 96 return ($length == $::max_bytes) ? 8 : (($length <= 3) ? 1 : 2); 97} 98 99my @tests; 100{ 101 no warnings qw(portable overflow); 102 @tests = ( 103 # $testname, 104 # $bytes, UTF-8 string 105 # $allowed_uv, code point $bytes evaluates to; -1 if 106 # overflows 107 # $needed_to_discern_len optional, how long an initial substring do 108 # we need to tell that the string must be for 109 # a code point in the category it falls in, 110 # like being a surrogate; 0 indicates we need 111 # the whole string. Some categories have a 112 # default that is used if this is omitted. 113 [ "orphan continuation byte malformation", 114 I8_to_native("$::I8c"), 115 0xFFFD, 116 1, 117 ], 118 [ "overlong malformation, lowest 2-byte", 119 (isASCII) ? "\xc0\x80" : I8_to_native("\xc0\xa0"), 120 0, # NUL 121 ], 122 [ "overlong malformation, highest 2-byte", 123 (isASCII) ? "\xc1\xbf" : I8_to_native("\xc4\xbf"), 124 (isASCII) ? 0x7F : 0xFF, 125 ], 126 [ "overlong malformation, lowest 3-byte", 127 (isASCII) ? "\xe0\x80\x80" : I8_to_native("\xe0\xa0\xa0"), 128 0, # NUL 129 ], 130 [ "overlong malformation, highest 3-byte", 131 (isASCII) ? "\xe0\x9f\xbf" : I8_to_native("\xe0\xbf\xbf"), 132 (isASCII) ? 0x7FF : 0x3FF, 133 ], 134 [ "lowest surrogate", 135 (isASCII) ? "\xed\xa0\x80" : I8_to_native("\xf1\xb6\xa0\xa0"), 136 0xD800, 137 ], 138 [ "a middle surrogate", 139 (isASCII) ? "\xed\xa4\x8d" : I8_to_native("\xf1\xb6\xa8\xad"), 140 0xD90D, 141 ], 142 [ "highest surrogate", 143 (isASCII) ? "\xed\xbf\xbf" : I8_to_native("\xf1\xb7\xbf\xbf"), 144 0xDFFF, 145 ], 146 [ "first of 32 consecutive non-character code points", 147 (isASCII) ? "\xef\xb7\x90" : I8_to_native("\xf1\xbf\xae\xb0"), 148 0xFDD0, 149 ], 150 [ "a mid non-character code point of the 32 consecutive ones", 151 (isASCII) ? "\xef\xb7\xa0" : I8_to_native("\xf1\xbf\xaf\xa0"), 152 0xFDE0, 153 ], 154 [ "final of 32 consecutive non-character code points", 155 (isASCII) ? "\xef\xb7\xaf" : I8_to_native("\xf1\xbf\xaf\xaf"), 156 0xFDEF, 157 ], 158 [ "non-character code point U+FFFE", 159 (isASCII) ? "\xef\xbf\xbe" : I8_to_native("\xf1\xbf\xbf\xbe"), 160 0xFFFE, 161 ], 162 [ "non-character code point U+FFFF", 163 (isASCII) ? "\xef\xbf\xbf" : I8_to_native("\xf1\xbf\xbf\xbf"), 164 0xFFFF, 165 ], 166 [ "overlong malformation, lowest 4-byte", 167 (isASCII) ? "\xf0\x80\x80\x80" : I8_to_native("\xf0\xa0\xa0\xa0"), 168 0, # NUL 169 ], 170 [ "overlong malformation, highest 4-byte", 171 (isASCII) ? "\xf0\x8F\xbf\xbf" : I8_to_native("\xf0\xaf\xbf\xbf"), 172 (isASCII) ? 0xFFFF : 0x3FFF, 173 ], 174 [ "non-character code point U+1FFFE", 175 (isASCII) ? "\xf0\x9f\xbf\xbe" : I8_to_native("\xf3\xbf\xbf\xbe"), 176 0x1FFFE, 177 ], 178 [ "non-character code point U+1FFFF", 179 (isASCII) ? "\xf0\x9f\xbf\xbf" : I8_to_native("\xf3\xbf\xbf\xbf"), 180 0x1FFFF, 181 ], 182 [ "non-character code point U+2FFFE", 183 (isASCII) ? "\xf0\xaf\xbf\xbe" : I8_to_native("\xf5\xbf\xbf\xbe"), 184 0x2FFFE, 185 ], 186 [ "non-character code point U+2FFFF", 187 (isASCII) ? "\xf0\xaf\xbf\xbf" : I8_to_native("\xf5\xbf\xbf\xbf"), 188 0x2FFFF, 189 ], 190 [ "non-character code point U+3FFFE", 191 (isASCII) ? "\xf0\xbf\xbf\xbe" : I8_to_native("\xf7\xbf\xbf\xbe"), 192 0x3FFFE, 193 ], 194 [ "non-character code point U+3FFFF", 195 (isASCII) ? "\xf0\xbf\xbf\xbf" : I8_to_native("\xf7\xbf\xbf\xbf"), 196 0x3FFFF, 197 ], 198 [ "non-character code point U+4FFFE", 199 (isASCII) 200 ? "\xf1\x8f\xbf\xbe" 201 : I8_to_native("\xf8\xa9\xbf\xbf\xbe"), 202 0x4FFFE, 203 ], 204 [ "non-character code point U+4FFFF", 205 (isASCII) 206 ? "\xf1\x8f\xbf\xbf" 207 : I8_to_native("\xf8\xa9\xbf\xbf\xbf"), 208 0x4FFFF, 209 ], 210 [ "non-character code point U+5FFFE", 211 (isASCII) 212 ? "\xf1\x9f\xbf\xbe" 213 : I8_to_native("\xf8\xab\xbf\xbf\xbe"), 214 0x5FFFE, 215 ], 216 [ "non-character code point U+5FFFF", 217 (isASCII) 218 ? "\xf1\x9f\xbf\xbf" 219 : I8_to_native("\xf8\xab\xbf\xbf\xbf"), 220 0x5FFFF, 221 ], 222 [ "non-character code point U+6FFFE", 223 (isASCII) 224 ? "\xf1\xaf\xbf\xbe" 225 : I8_to_native("\xf8\xad\xbf\xbf\xbe"), 226 0x6FFFE, 227 ], 228 [ "non-character code point U+6FFFF", 229 (isASCII) 230 ? "\xf1\xaf\xbf\xbf" 231 : I8_to_native("\xf8\xad\xbf\xbf\xbf"), 232 0x6FFFF, 233 ], 234 [ "non-character code point U+7FFFE", 235 (isASCII) 236 ? "\xf1\xbf\xbf\xbe" 237 : I8_to_native("\xf8\xaf\xbf\xbf\xbe"), 238 0x7FFFE, 239 ], 240 [ "non-character code point U+7FFFF", 241 (isASCII) 242 ? "\xf1\xbf\xbf\xbf" 243 : I8_to_native("\xf8\xaf\xbf\xbf\xbf"), 244 0x7FFFF, 245 ], 246 [ "non-character code point U+8FFFE", 247 (isASCII) 248 ? "\xf2\x8f\xbf\xbe" 249 : I8_to_native("\xf8\xb1\xbf\xbf\xbe"), 250 0x8FFFE, 251 ], 252 [ "non-character code point U+8FFFF", 253 (isASCII) 254 ? "\xf2\x8f\xbf\xbf" 255 : I8_to_native("\xf8\xb1\xbf\xbf\xbf"), 256 0x8FFFF, 257 ], 258 [ "non-character code point U+9FFFE", 259 (isASCII) 260 ? "\xf2\x9f\xbf\xbe" 261 : I8_to_native("\xf8\xb3\xbf\xbf\xbe"), 262 0x9FFFE, 263 ], 264 [ "non-character code point U+9FFFF", 265 (isASCII) 266 ? "\xf2\x9f\xbf\xbf" 267 : I8_to_native("\xf8\xb3\xbf\xbf\xbf"), 268 0x9FFFF, 269 ], 270 [ "non-character code point U+AFFFE", 271 (isASCII) 272 ? "\xf2\xaf\xbf\xbe" 273 : I8_to_native("\xf8\xb5\xbf\xbf\xbe"), 274 0xAFFFE, 275 ], 276 [ "non-character code point U+AFFFF", 277 (isASCII) 278 ? "\xf2\xaf\xbf\xbf" 279 : I8_to_native("\xf8\xb5\xbf\xbf\xbf"), 280 0xAFFFF, 281 ], 282 [ "non-character code point U+BFFFE", 283 (isASCII) 284 ? "\xf2\xbf\xbf\xbe" 285 : I8_to_native("\xf8\xb7\xbf\xbf\xbe"), 286 0xBFFFE, 287 ], 288 [ "non-character code point U+BFFFF", 289 (isASCII) 290 ? "\xf2\xbf\xbf\xbf" 291 : I8_to_native("\xf8\xb7\xbf\xbf\xbf"), 292 0xBFFFF, 293 ], 294 [ "non-character code point U+CFFFE", 295 (isASCII) 296 ? "\xf3\x8f\xbf\xbe" 297 : I8_to_native("\xf8\xb9\xbf\xbf\xbe"), 298 0xCFFFE, 299 ], 300 [ "non-character code point U+CFFFF", 301 (isASCII) 302 ? "\xf3\x8f\xbf\xbf" 303 : I8_to_native("\xf8\xb9\xbf\xbf\xbf"), 304 0xCFFFF, 305 ], 306 [ "non-character code point U+DFFFE", 307 (isASCII) 308 ? "\xf3\x9f\xbf\xbe" 309 : I8_to_native("\xf8\xbb\xbf\xbf\xbe"), 310 0xDFFFE, 311 ], 312 [ "non-character code point U+DFFFF", 313 (isASCII) 314 ? "\xf3\x9f\xbf\xbf" 315 : I8_to_native("\xf8\xbb\xbf\xbf\xbf"), 316 0xDFFFF, 317 ], 318 [ "non-character code point U+EFFFE", 319 (isASCII) 320 ? "\xf3\xaf\xbf\xbe" 321 : I8_to_native("\xf8\xbd\xbf\xbf\xbe"), 322 0xEFFFE, 323 ], 324 [ "non-character code point U+EFFFF", 325 (isASCII) 326 ? "\xf3\xaf\xbf\xbf" 327 : I8_to_native("\xf8\xbd\xbf\xbf\xbf"), 328 0xEFFFF, 329 ], 330 [ "non-character code point U+FFFFE", 331 (isASCII) 332 ? "\xf3\xbf\xbf\xbe" 333 : I8_to_native("\xf8\xbf\xbf\xbf\xbe"), 334 0xFFFFE, 335 ], 336 [ "non-character code point U+FFFFF", 337 (isASCII) 338 ? "\xf3\xbf\xbf\xbf" 339 : I8_to_native("\xf8\xbf\xbf\xbf\xbf"), 340 0xFFFFF, 341 ], 342 [ "non-character code point U+10FFFE", 343 (isASCII) 344 ? "\xf4\x8f\xbf\xbe" 345 : I8_to_native("\xf9\xa1\xbf\xbf\xbe"), 346 0x10FFFE, 347 ], 348 [ "non-character code point U+10FFFF", 349 (isASCII) 350 ? "\xf4\x8f\xbf\xbf" 351 : I8_to_native("\xf9\xa1\xbf\xbf\xbf"), 352 0x10FFFF, 353 ], 354 [ "first non_unicode", 355 (isASCII) 356 ? "\xf4\x90\x80\x80" 357 : I8_to_native("\xf9\xa2\xa0\xa0\xa0"), 358 0x110000, 359 2, 360 ], 361 [ "non_unicode whose first byte tells that", 362 (isASCII) 363 ? "\xf5\x80\x80\x80" 364 : I8_to_native("\xfa\xa0\xa0\xa0\xa0"), 365 (isASCII) ? 0x140000 : 0x200000, 366 1, 367 ], 368 [ "overlong malformation, lowest 5-byte", 369 (isASCII) 370 ? "\xf8\x80\x80\x80\x80" 371 : I8_to_native("\xf8\xa0\xa0\xa0\xa0"), 372 0, # NUL 373 ], 374 [ "overlong malformation, highest 5-byte", 375 (isASCII) 376 ? "\xf8\x87\xbf\xbf\xbf" 377 : I8_to_native("\xf8\xa7\xbf\xbf\xbf"), 378 (isASCII) ? 0x1FFFFF : 0x3FFFF, 379 ], 380 [ "overlong malformation, lowest 6-byte", 381 (isASCII) 382 ? "\xfc\x80\x80\x80\x80\x80" 383 : I8_to_native("\xfc\xa0\xa0\xa0\xa0\xa0"), 384 0, # NUL 385 ], 386 [ "overlong malformation, highest 6-byte", 387 (isASCII) 388 ? "\xfc\x83\xbf\xbf\xbf\xbf" 389 : I8_to_native("\xfc\xa3\xbf\xbf\xbf\xbf"), 390 (isASCII) ? 0x3FFFFFF : 0x3FFFFF, 391 ], 392 [ "overlong malformation, lowest 7-byte", 393 (isASCII) 394 ? "\xfe\x80\x80\x80\x80\x80\x80" 395 : I8_to_native("\xfe\xa0\xa0\xa0\xa0\xa0\xa0"), 396 0, # NUL 397 ], 398 [ "overlong malformation, highest 7-byte", 399 (isASCII) 400 ? "\xfe\x81\xbf\xbf\xbf\xbf\xbf" 401 : I8_to_native("\xfe\xa1\xbf\xbf\xbf\xbf\xbf"), 402 (isASCII) ? 0x7FFFFFFF : 0x3FFFFFF, 403 ], 404 [ "highest 31 bit code point", 405 (isASCII) 406 ? "\xfd\xbf\xbf\xbf\xbf\xbf" 407 : I8_to_native( 408 "\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa1\xbf\xbf\xbf\xbf\xbf\xbf"), 409 0x7FFFFFFF, 410 1, 411 ], 412 [ "lowest 32 bit code point", 413 (isASCII) 414 ? "\xfe\x82\x80\x80\x80\x80\x80" 415 : I8_to_native( 416 "\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa2\xa0\xa0\xa0\xa0\xa0\xa0"), 417 ($::is64bit) ? 0x80000000 : -1, # Overflows on 32-bit systems 418 1, 419 ], 420 # Used when UV_MAX is allowed as a code point 421 #[ "highest 32 bit code point", 422 # (isASCII) 423 # ? "\xfe\x83\xbf\xbf\xbf\xbf\xbf" 424 # : I8_to_native( 425 # "\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa3\xbf\xbf\xbf\xbf\xbf\xbf"), 426 # 0xFFFFFFFF, 427 #], 428 #[ "Lowest 33 bit code point", 429 # (isASCII) 430 # ? "\xfe\x84\x80\x80\x80\x80\x80" 431 # : I8_to_native( 432 # "\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa4\xa0\xa0\xa0\xa0\xa0\xa0"), 433 # ($::is64bit) ? 0x100000000 : 0x0, # Overflows on 32-bit systems 434 #], 435 ); 436 437 if (! $::is64bit) { 438 if (isASCII) { 439 push @tests, 440 [ "overlong malformation, but naively looks like overflow", 441 "\xff\x80\x80\x80\x80\x80\x80\x81\xbf\xbf\xbf\xbf\xbf", 442 0x7FFFFFFF, 443 ], 444 # Used when above IV_MAX are allowed. 445 #[ "overlong malformation, but naively looks like overflow", 446 # "\xff\x80\x80\x80\x80\x80\x80\x83\xbf\xbf\xbf\xbf\xbf", 447 # 0xFFFFFFFF, 448 #], 449 [ "overflow that old algorithm failed to detect", 450 "\xfe\x86\x80\x80\x80\x80\x80", 451 -1, 452 ]; 453 } 454 } 455 456 push @tests, 457 [ "overlong malformation, lowest max-byte", 458 (isASCII) 459 ? "\xff\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80" 460 : I8_to_native( 461 "\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"), 462 0, # NUL 463 ], 464 [ "overlong malformation, highest max-byte", 465 (isASCII) # 2**36-1 on ASCII; 2**30-1 on EBCDIC 466 ? "\xff\x80\x80\x80\x80\x80\x80\xbf\xbf\xbf\xbf\xbf\xbf" 467 : I8_to_native( 468 "\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xbf\xbf\xbf\xbf\xbf\xbf"), 469 (isASCII) ? (($::is64bit) ? 0xFFFFFFFFF : -1) : 0x3FFFFFFF, 470 ]; 471 472 if (isASCII) { 473 push @tests, 474 [ "Lowest code point requiring 13 bytes to represent", # 2**36 475 "\xff\x80\x80\x80\x80\x80\x81\x80\x80\x80\x80\x80\x80", 476 ($::is64bit) ? 0x1000000000 : -1, # overflows on 32bit 477 ], 478 }; 479 480 if ($::is64bit) { 481 push @tests, 482 [ "highest 63 bit code point", 483 (isASCII) 484 ? "\xff\x80\x87\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf" 485 : I8_to_native( 486 "\xff\xa7\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf"), 487 0x7FFFFFFFFFFFFFFF, 488 (isASCII) ? 1 : 2, 489 ], 490 [ "first 64 bit code point", 491 (isASCII) 492 ? "\xff\x80\x88\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80" 493 : I8_to_native( 494 "\xff\xa8\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"), 495 -1, 496 ]; 497 # Used when UV_MAX is allowed as a code point 498 #[ "highest 64 bit code point", 499 # (isASCII) 500 # ? "\xff\x80\x8f\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf" 501 # : I8_to_native( 502 # "\xff\xaf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf"), 503 # 0xFFFFFFFFFFFFFFFF, 504 # (isASCII) ? 1 : 2, 505 #], 506 #[ "first 65 bit code point", 507 # (isASCII) 508 # ? "\xff\x80\x9f\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80" 509 # : I8_to_native( 510 # "\xff\xb0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"), 511 # 0, 512 #]; 513 if (isASCII) { 514 push @tests, 515 [ "overflow that old algorithm failed to detect", 516 "\xff\x80\x90\x90\x90\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf", 517 -1, 518 ]; 519 } 520 else { 521 push @tests, # These could falsely show wrongly in a naive 522 # implementation 523 [ "requires at least 32 bits", 524 I8_to_native( 525 "\xff\xa0\xa0\xa0\xa0\xa0\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0"), 526 0x800000000, 527 40000000 528 ], 529 [ "requires at least 32 bits", 530 I8_to_native( 531 "\xff\xa0\xa0\xa0\xa0\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"), 532 0x10000000000, 533 ], 534 [ "requires at least 32 bits", 535 I8_to_native( 536 "\xff\xa0\xa0\xa0\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"), 537 0x200000000000, 538 ], 539 [ "requires at least 32 bits", 540 I8_to_native( 541 "\xff\xa0\xa0\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"), 542 0x4000000000000, 543 ], 544 [ "requires at least 32 bits", 545 I8_to_native( 546 "\xff\xa0\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"), 547 0x80000000000000, 548 ], 549 [ "requires at least 32 bits", 550 I8_to_native( 551 "\xff\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"), 552 0x1000000000000000, 553 ]; 554 } 555 } 556} 557 558sub flags_to_text($$) 559{ 560 my ($flags, $flags_to_text_ref) = @_; 561 562 # Returns a string containing a mnemonic representation of the bits that 563 # are set in the $flags. These are assumed to be flag bits. The return 564 # looks like "FOO|BAR|BAZ". The second parameter is a reference to an 565 # array that gives the textual representation of all the possible flags. 566 # Element 0 is the text for the bit 0 flag; element 1 for bit 1; .... If 567 # no bits at all are set the string "0" is returned; 568 569 my @flag_text; 570 my $shift = 0; 571 572 return "0" if $flags == 0; 573 574 while ($flags) { 575 #diag sprintf "%x", $flags; 576 if ($flags & 1) { 577 push @flag_text, $flags_to_text_ref->[$shift]; 578 } 579 $shift++; 580 $flags >>= 1; 581 } 582 583 return join "|", @flag_text; 584} 585 586# Possible flag returns from utf8n_to_uvchr_error(). These should have G_, 587# instead of A_, D_, but the prefixes will be used in a a later commit, so 588# minimize churn by having them here. 589my @utf8n_flags_to_text = ( qw( 590 A_EMPTY 591 A_CONTINUATION 592 A_NON_CONTINUATION 593 A_SHORT 594 A_LONG 595 A_LONG_AND_ITS_VALUE 596 PLACEHOLDER 597 A_OVERFLOW 598 D_SURROGATE 599 W_SURROGATE 600 D_NONCHAR 601 W_NONCHAR 602 D_SUPER 603 W_SUPER 604 D_PERL_EXTENDED 605 W_PERL_EXTENDED 606 CHECK_ONLY 607 NO_CONFIDENCE_IN_CURLEN_ 608 ) ); 609 610sub utf8n_display_call($) 611{ 612 # Converts an eval string that calls test_utf8n_to_uvchr into a more human 613 # readable form, and returns it. Doesn't work if the byte string contains 614 # an apostrophe. The return will look something like: 615 # test_utf8n_to_uvchr_error('$bytes', $length, $flags) 616 #diag $_[0]; 617 618 $_[0] =~ / ^ ( [^(]* \( ) ' ( [^']*? ) ' ( .+ , \D* ) ( \d+ ) \) $ /x; 619 my $text1 = $1; # Everything before the byte string 620 my $bytes = $2; 621 my $text2 = $3; # Includes the length 622 my $flags = $4; 623 624 return $text1 625 . display_bytes($bytes) 626 . $text2 627 . flags_to_text($flags, \@utf8n_flags_to_text) 628 . ')'; 629} 630 631my @uvchr_flags_to_text = ( qw( 632 W_SURROGATE 633 W_NONCHAR 634 W_SUPER 635 W_PERL_EXTENDED 636 D_SURROGATE 637 D_NONCHAR 638 D_SUPER 639 D_PERL_EXTENDED 640) ); 641 642sub uvchr_display_call($) 643{ 644 # Converts an eval string that calls test_uvchr_to_utf8 into a more human 645 # readable form, and returns it. The return will look something like: 646 # test_uvchr_to_utf8n_flags($uv, $flags) 647 #diag $_[0]; 648 649 650 $_[0] =~ / ^ ( [^(]* \( ) ( \d+ ) , \s* ( \d+ ) \) $ /x; 651 my $text = $1; 652 my $cp = sprintf "%X", $2; 653 my $flags = $3; 654 655 return "${text}0x$cp, " . flags_to_text($flags, \@uvchr_flags_to_text) . ')'; 656} 657 658sub do_warnings_test(@) 659{ 660 my @expected_warnings = @_; 661 662 # Compares the input expected warnings array with @warnings_gotten, 663 # generating a pass for each found, removing it from @warnings_gotten. 664 # Any discrepancies generate test failures. Returns TRUE if no 665 # discrepcancies; otherwise FALSE. 666 667 my $succeeded = 1; 668 669 if (@expected_warnings == 0) { 670 if (! is(@warnings_gotten, 0, " Expected and got no warnings")) { 671 output_warnings(@warnings_gotten); 672 $succeeded = 0; 673 } 674 return $succeeded; 675 } 676 677 # Check that we got all the expected warnings, 678 # removing each one found 679 WARNING: 680 foreach my $expected (@expected_warnings) { 681 foreach (my $i = 0; $i < @warnings_gotten; $i++) { 682 if ($warnings_gotten[$i] =~ $expected) { 683 pass(" Expected and got warning: " 684 . " $warnings_gotten[$i]"); 685 splice @warnings_gotten, $i, 1; 686 next WARNING; 687 } 688 } 689 fail(" Expected a warning that matches " 690 . $expected . " but didn't get it"); 691 $succeeded = 0; 692 } 693 694 if (! is(@warnings_gotten, 0, " Got no unexpected warnings")) { 695 output_warnings(@warnings_gotten); 696 $succeeded = 0; 697 } 698 699 return $succeeded; 700} 701 702my $min_cont = (isASCII) ? 0x80 : 0xA0; 703my $continuation_shift = (isASCII) ? 6 : 5; 704my $continuation_mask = (1 << $continuation_shift) - 1; 705 706sub isUTF8_CHAR($$) { # Uses first principals to determine if this is legal 707 # (Doesn't work if overflows) 708 my ($string, $length) = @_; 709 710 # Uses first principals to calculate if $string is legal 711 712 return 0 if $length <= 0; 713 714 my $first = ord substr($string, 0, 1); 715 716 # Invariant 717 return 1 if $length == 1 && $first < $min_cont; 718 719 return 0 if $first < 0xC0; # Starts with continuation 720 721 # Calculate the number of leading 1 bits 722 my $utf8skip = 0; 723 my $bits = $first; 724 do { 725 $utf8skip++; 726 $bits = ($bits << 1) & 0xFF; 727 } while ($bits & 0x80); 728 729 return 0 if $utf8skip != $length; 730 731 # Acuumulate the $code point. The remaining bits in the start byte count 732 # towards it 733 my $cp = $bits >> $utf8skip; 734 735 for my $i (1 .. $length - 1) { 736 my $ord = ord substr($string, $i, 1); 737 738 # Wrong if not a continuation 739 return 0 if $ord < $min_cont || $ord >= 0xC0; 740 741 $cp = ($cp << $continuation_shift) 742 | ($ord & $continuation_mask); 743 } 744 745 # If the calculated value can be expressed in fewer bytes than were passed 746 # in, is an illegal overlong. XXX if 'chr' is not working properly, this 747 # may not be right 748 my $chr = chr $cp; 749 utf8::upgrade($chr); 750 751 use bytes; 752 return 0 if length $chr < $length; 753 754 return 1; 755} 756 757sub start_mark($) { 758 my $len = shift; 759 return 0xFF if $len > 7; 760 return (0xFF & (0xFE << (7 - $len))); 761} 762 763sub start_mask($) { 764 my $len = shift; 765 return 0 if $len > 7; 766 return 0x1F >> ($len - 2); 767} 768 769# This test is split into this number of files. 770my $num_test_files = $ENV{TEST_JOBS} || 1; 771$num_test_files = 10 if $num_test_files > 10; 772 773# We only really need to test utf8n_to_uvchr_msgs() once with this flag. 774my $tested_CHECK_ONLY = 0; 775 776my $test_count = -1; 777 778# By setting this environment variable to this particular value, we test 779# essentially all combinations of potential UTF-8, so that can get a 780# comprehensive test of the decoding routine. This test assumes the routine 781# that does the translation from code point to UTF-8 is working. An assert 782# can be used in the routine to make sure that the dfa is working precisely 783# correctly, and any flaws in it aren't being masked by the remainder of the 784# function. 785if ($::TEST_CHUNK == 0 786&& $ENV{PERL_DEBUG_FULL_TEST} 787&& $ENV{PERL_DEBUG_FULL_TEST} == 97) 788{ 789 my $min_cont_mask = $min_cont | 0xF; 790 my @bytes = ( 0, # Placeholder to signify to use an empty string "" 791 ord 'A',# We assume that all the invariant characters are 792 # properly in the same class, so this is an exemplar 793 # character 794 $min_cont .. 0xFF # But test every non-invariant individually 795 ); 796 my $shift = (isASCII) ? 6 : 5; 797 my $mark = $min_cont; 798 my $mask = (1 << $shift) - 1; 799 for my $byte1 (@bytes) { 800 for my $byte2 (@bytes) { 801 last if $byte2 && ! $byte1; # Don't test empty preceding byte 802 803 last if $byte2 && $byte1 < 0xC0; # No need to test more than a 804 # single byte unless start byte 805 # indicates those. 806 807 for my $byte3 (@bytes) { 808 last if $byte3 && ! $byte2; 809 last if $byte3 && $byte1 < 0xE0; # Only test 3 bytes for 810 # 3-byte start byte 811 812 # If the preceding byte is a start byte, it should fail, and 813 # there is no need to test illegal bytes that follow. 814 # Instead, limit ourselves to just a few legal bytes that 815 # could follow. This cuts down tremendously on the number of 816 # tests executed. 817 next if $byte2 >= 0xC0 818 && $byte3 >= $min_cont 819 && ($byte3 & $min_cont_mask) != $min_cont; 820 821 for my $byte4 (@bytes) { 822 last if $byte4 && ! $byte3; 823 last if $byte4 && $byte1 < 0xF0; # Only test 4 bytes for 824 # 4 byte strings 825 826 # Like for byte 3, we limit things that come after a 827 # mispositioned start-byte to just a few things that 828 # otherwise would be legal 829 next if ($byte2 >= 0xC0 || $byte3 >= 0xC0) 830 && $byte4 >= $min_cont 831 && ($byte4 & $min_cont_mask) != $min_cont; 832 833 for my $byte5 (@bytes) { 834 last if $byte5 && ! $byte4; 835 last if $byte5 && $byte1 < 0xF8; # Only test 5 bytes for 836 # 5 byte strings 837 838 # Like for byte 4, we limit things that come after a 839 # mispositioned start-byte to just a few things that 840 # otherwise would be legal 841 next if ( $byte2 >= 0xC0 842 || $byte3 >= 0xC0 843 || $byte4 >= 0xC0) 844 && $byte4 >= $min_cont 845 && ($byte4 & $min_cont_mask) != $min_cont; 846 847 my $string = ""; 848 $string .= chr $byte1 if $byte1; 849 $string .= chr $byte2 if $byte2; 850 $string .= chr $byte3 if $byte3; 851 $string .= chr $byte4 if $byte4; 852 $string .= chr $byte5 if $byte5; 853 854 my $length = length $string; 855 next unless $length; 856 last if $byte1 >= ((isASCII) ? 0xF6 : 0xFA); 857 858 my $native = I8_to_native($string); 859 my $is_valid = isUTF8_CHAR($native, $length); 860 my $got_valid = test_isUTF8_CHAR($native, $length); 861 my $got_strict 862 = test_isSTRICT_UTF8_CHAR($native, $length); 863 my $got_C9 864 = test_isC9_STRICT_UTF8_CHAR($native, $length); 865 my $ret = test_utf8n_to_uvchr_msgs($native, $length, 866 $::UTF8_WARN_ILLEGAL_INTERCHANGE); 867 my $is_strict = $is_valid; 868 my $is_C9 = $is_valid; 869 870 if ($is_valid) { 871 872 # Here, is legal UTF-8. Verify that it returned 873 # the correct code point, and if so, that it 874 # correctly classifies the result. 875 my $cp = $ret->[0]; 876 877 my $should_be_string; 878 if ($length == 1) { 879 $should_be_string = chr $cp; 880 } 881 else { 882 883 # Starting with the code point, use first 884 # principals to find the equivalen UTF-8 885 # string 886 my @bytes; 887 my $uv = $cp; 888 for (my $i = $length - 1; $i > 0; $i--) { 889 $bytes[$i] = chr I8_to_native(($uv & $mask) 890 | $mark); 891 $uv >>= $shift; 892 } 893 $bytes[0] = chr I8_to_native(( $uv 894 & start_mask($length)) 895 | start_mark($length)); 896 $should_be_string = join "", @bytes; 897 } 898 899 # If the original string and the inverse are the 900 # same, it worked. 901 if (is($native, $should_be_string, 902 "utf8n_to_uvchr_msgs(" 903 . display_bytes($native) 904 . ") returns correct uv=0x" 905 . sprintf ("%x", $cp))) 906 { 907 my $is_surrogate = $cp >= 0xD800 908 && $cp <= 0xDFFF; 909 my $got_surrogate 910 = ($ret->[2] & $::UTF8_GOT_SURROGATE) != 0; 911 $is_strict = 0 if $is_surrogate; 912 $is_C9 = 0 if $is_surrogate; 913 914 my $is_super = $cp > 0x10FFFF; 915 my $got_super 916 = ($ret->[2] & $::UTF8_GOT_SUPER) != 0; 917 $is_strict = 0 if $is_super; 918 $is_C9 = 0 if $is_super; 919 920 my $is_nonchar = ! $is_super 921 && ( ($cp & 0xFFFE) == 0xFFFE 922 || ($cp >= 0xFDD0 && $cp <= 0xFDEF)); 923 my $got_nonchar 924 = ($ret->[2] & $::UTF8_GOT_NONCHAR) != 0; 925 $is_strict = 0 if $is_nonchar; 926 927 is($got_surrogate, $is_surrogate, 928 " And correctly flagged it as" 929 . ((! $is_surrogate) ? " not" : "") 930 . " being a surrogate"); 931 is($got_super, $is_super, 932 " And correctly flagged it as" 933 . ((! $is_super) ? " not" : "") 934 . " being above Unicode"); 935 is($got_nonchar, $is_nonchar, 936 " And correctly flagged it as" 937 . ((! $is_nonchar) ? " not" : "") 938 . " being a non-char"); 939 } 940 941 # This is how we exit the loop normally if things 942 # are working. The fail-safe code above is used 943 # when they aren't. 944 goto done if $cp > 0x140001; 945 } 946 else { 947 is($ret->[0], 0, "utf8n_to_uvchr_msgs(" 948 . display_bytes($native) 949 . ") correctly returns error"); 950 if (! ($ret->[2] & ($::UTF8_GOT_SHORT 951 |$::UTF8_GOT_NON_CONTINUATION 952 |$::UTF8_GOT_LONG))) 953 { 954 is($ret->[2] & ( $::UTF8_GOT_NONCHAR 955 |$::UTF8_GOT_SURROGATE 956 |$::UTF8_GOT_SUPER), 0, 957 " And isn't a surrogate, non-char, nor" 958 . " above Unicode"); 959 } 960 } 961 962 is($got_valid == 0, $is_valid == 0, 963 " And isUTF8_CHAR() correctly returns " 964 . (($got_valid == 0) ? "0" : "non-zero")); 965 is($got_strict == 0, $is_strict == 0, 966 " And isSTRICT_UTF8_CHAR() correctly returns " 967 . (($got_strict == 0) ? "0" : "non-zero")); 968 is($got_C9 == 0, $is_C9 == 0, 969 " And isC9_UTF8_CHAR() correctly returns " 970 . (($got_C9 == 0) ? "0" : "non-zero")); 971 } 972 } 973 } 974 } 975 } 976 done: 977} 978 979foreach my $test (@tests) { 980 $test_count++; 981 next if $test_count % $num_test_files != $::TEST_CHUNK; 982 983 my ($testname, $bytes, $allowed_uv, $needed_to_discern_len) = @$test; 984 985 my $length = length $bytes; 986 my $initially_overlong = $testname =~ /overlong/; 987 my $initially_orphan = $testname =~ /orphan/; 988 my $will_overflow = $allowed_uv < 0; 989 990 my $uv_string = sprintf(($allowed_uv < 0x100) ? "%02X" : "%04X", $allowed_uv); 991 my $display_bytes = display_bytes($bytes); 992 993 my $controlling_warning_category; 994 my $utf8n_flag_to_warn; 995 my $utf8n_flag_to_disallow; 996 my $uvchr_flag_to_warn; 997 my $uvchr_flag_to_disallow; 998 999 # We want to test that the independent flags are actually independent. 1000 # For example, that a surrogate doesn't trigger a non-character warning, 1001 # and conversely, turning off an above-Unicode flag doesn't suppress a 1002 # surrogate warning. Earlier versions of this file used nested loops to 1003 # test all possible combinations. But that creates lots of tests, making 1004 # this run too long. What is now done instead is to use the complement of 1005 # the category we are testing to greatly reduce the combinatorial 1006 # explosion. For example, if we have a surrogate and we aren't expecting 1007 # a warning about it, we set all the flags for non-surrogates to raise 1008 # warnings. If one shows up, it indicates the flags aren't independent. 1009 my $utf8n_flag_to_warn_complement; 1010 my $utf8n_flag_to_disallow_complement; 1011 my $uvchr_flag_to_warn_complement; 1012 my $uvchr_flag_to_disallow_complement; 1013 1014 # Many of the code points being tested are middling in that if code point 1015 # edge cases work, these are very likely to as well. Because this test 1016 # file takes a while to execute, we skip testing the edge effects of code 1017 # points deemed middling, while testing their basics and continuing to 1018 # fully test the non-middling code points. 1019 my $skip_most_tests = 0; 1020 1021 my $cp_message_qr; # Pattern that matches the message raised when 1022 # that message contains the problematic code 1023 # point. The message is the same (currently) both 1024 # when going from/to utf8. 1025 my $non_cp_trailing_text; # The suffix text when the message doesn't 1026 # contain a code point. (This is a result of 1027 # some sort of malformation that means we 1028 # can't get an exact code poin 1029 my $extended_cp_message_qr = qr/\QCode point 0x$uv_string is not Unicode,\E 1030 \Q requires a Perl extension, and so is not\E 1031 \Q portable\E/x; 1032 my $extended_non_cp_trailing_text 1033 = "is a Perl extension, and so is not portable"; 1034 1035 # What bytes should have been used to specify a code point that has been 1036 # specified as an overlong. 1037 my $correct_bytes_for_overlong; 1038 1039 # Is this test malformed from the beginning? If so, we know to generally 1040 # expect that the tests will show it isn't valid. 1041 my $initially_malformed = 0; 1042 1043 if ($initially_overlong || $initially_orphan) { 1044 $non_cp_trailing_text = "if you see this, there is an error"; 1045 $cp_message_qr = qr/\Q$non_cp_trailing_text\E/; 1046 $initially_malformed = 1; 1047 $utf8n_flag_to_warn = 0; 1048 $utf8n_flag_to_disallow = 0; 1049 1050 $utf8n_flag_to_warn_complement = $::UTF8_WARN_SURROGATE; 1051 $utf8n_flag_to_disallow_complement = $::UTF8_DISALLOW_SURROGATE; 1052 if (! $will_overflow && $allowed_uv <= 0x10FFFF) { 1053 $utf8n_flag_to_warn_complement |= $::UTF8_WARN_SUPER; 1054 $utf8n_flag_to_disallow_complement |= $::UTF8_DISALLOW_SUPER; 1055 if (($allowed_uv & 0xFFFF) != 0xFFFF) { 1056 $utf8n_flag_to_warn_complement |= $::UTF8_WARN_NONCHAR; 1057 $utf8n_flag_to_disallow_complement |= $::UTF8_DISALLOW_NONCHAR; 1058 } 1059 } 1060 if (! is_extended_utf8($bytes)) { 1061 $utf8n_flag_to_warn_complement |= $::UTF8_WARN_PERL_EXTENDED; 1062 $utf8n_flag_to_disallow_complement |= $::UTF8_DISALLOW_PERL_EXTENDED; 1063 } 1064 1065 $controlling_warning_category = 'utf8'; 1066 1067 if ($initially_overlong) { 1068 if (! defined $needed_to_discern_len) { 1069 $needed_to_discern_len = overlong_discern_len($bytes); 1070 } 1071 $correct_bytes_for_overlong = display_bytes_no_quotes(chr $allowed_uv); 1072 } 1073 } 1074 elsif($will_overflow || $allowed_uv > 0x10FFFF) { 1075 1076 # Set the SUPER flags; later, we test for PERL_EXTENDED as well. 1077 $utf8n_flag_to_warn = $::UTF8_WARN_SUPER; 1078 $utf8n_flag_to_disallow = $::UTF8_DISALLOW_SUPER; 1079 $uvchr_flag_to_warn = $::UNICODE_WARN_SUPER; 1080 $uvchr_flag_to_disallow = $::UNICODE_DISALLOW_SUPER;; 1081 1082 # Below, we add the flags for non-perl_extended to the code points 1083 # that don't fit that category. Special tests are done for this 1084 # category in the inner loop. 1085 $utf8n_flag_to_warn_complement = $::UTF8_WARN_NONCHAR 1086 |$::UTF8_WARN_SURROGATE; 1087 $utf8n_flag_to_disallow_complement = $::UTF8_DISALLOW_NONCHAR 1088 |$::UTF8_DISALLOW_SURROGATE; 1089 $uvchr_flag_to_warn_complement = $::UNICODE_WARN_NONCHAR 1090 |$::UNICODE_WARN_SURROGATE; 1091 $uvchr_flag_to_disallow_complement = $::UNICODE_DISALLOW_NONCHAR 1092 |$::UNICODE_DISALLOW_SURROGATE; 1093 $controlling_warning_category = 'non_unicode'; 1094 1095 if ($will_overflow) { # This is realy a malformation 1096 $non_cp_trailing_text = "if you see this, there is an error"; 1097 $cp_message_qr = qr/\Q$non_cp_trailing_text\E/; 1098 $initially_malformed = 1; 1099 if (! defined $needed_to_discern_len) { 1100 $needed_to_discern_len = overflow_discern_len($length); 1101 } 1102 } 1103 elsif (requires_extended_utf8($allowed_uv)) { 1104 $cp_message_qr = $extended_cp_message_qr; 1105 $non_cp_trailing_text = $extended_non_cp_trailing_text; 1106 $needed_to_discern_len = 1 unless defined $needed_to_discern_len; 1107 } 1108 else { 1109 $cp_message_qr = qr/\QCode point 0x$uv_string is not Unicode,\E 1110 \Q may not be portable\E/x; 1111 $non_cp_trailing_text = "is for a non-Unicode code point, may not" 1112 . " be portable"; 1113 $utf8n_flag_to_warn_complement |= $::UTF8_WARN_PERL_EXTENDED; 1114 $utf8n_flag_to_disallow_complement 1115 |= $::UTF8_DISALLOW_PERL_EXTENDED; 1116 $uvchr_flag_to_warn_complement |= $::UNICODE_WARN_PERL_EXTENDED; 1117 $uvchr_flag_to_disallow_complement 1118 |= $::UNICODE_DISALLOW_PERL_EXTENDED; 1119 } 1120 } 1121 elsif ($allowed_uv >= 0xD800 && $allowed_uv <= 0xDFFF) { 1122 $cp_message_qr = qr/UTF-16 surrogate U\+$uv_string/; 1123 $non_cp_trailing_text = "is for a surrogate"; 1124 $needed_to_discern_len = 2 unless defined $needed_to_discern_len; 1125 $skip_most_tests = 1 if $allowed_uv > 0xD800 && $allowed_uv < 0xDFFF; 1126 1127 $utf8n_flag_to_warn = $::UTF8_WARN_SURROGATE; 1128 $utf8n_flag_to_disallow = $::UTF8_DISALLOW_SURROGATE; 1129 $uvchr_flag_to_warn = $::UNICODE_WARN_SURROGATE; 1130 $uvchr_flag_to_disallow = $::UNICODE_DISALLOW_SURROGATE;; 1131 1132 $utf8n_flag_to_warn_complement = $::UTF8_WARN_NONCHAR 1133 |$::UTF8_WARN_SUPER 1134 |$::UTF8_WARN_PERL_EXTENDED; 1135 $utf8n_flag_to_disallow_complement = $::UTF8_DISALLOW_NONCHAR 1136 |$::UTF8_DISALLOW_SUPER 1137 |$::UTF8_DISALLOW_PERL_EXTENDED; 1138 $uvchr_flag_to_warn_complement = $::UNICODE_WARN_NONCHAR 1139 |$::UNICODE_WARN_SUPER 1140 |$::UNICODE_WARN_PERL_EXTENDED; 1141 $uvchr_flag_to_disallow_complement = $::UNICODE_DISALLOW_NONCHAR 1142 |$::UNICODE_DISALLOW_SUPER 1143 |$::UNICODE_DISALLOW_PERL_EXTENDED; 1144 $controlling_warning_category = 'surrogate'; 1145 } 1146 elsif ( ($allowed_uv >= 0xFDD0 && $allowed_uv <= 0xFDEF) 1147 || ($allowed_uv & 0xFFFE) == 0xFFFE) 1148 { 1149 $cp_message_qr = qr/\QUnicode non-character U+$uv_string\E 1150 \Q is not recommended for open interchange\E/x; 1151 $non_cp_trailing_text = "if you see this, there is an error"; 1152 $needed_to_discern_len = $length unless defined $needed_to_discern_len; 1153 if ( ($allowed_uv > 0xFDD0 && $allowed_uv < 0xFDEF) 1154 || ($allowed_uv > 0xFFFF && $allowed_uv < 0x10FFFE)) 1155 { 1156 $skip_most_tests = 1; 1157 } 1158 1159 $utf8n_flag_to_warn = $::UTF8_WARN_NONCHAR; 1160 $utf8n_flag_to_disallow = $::UTF8_DISALLOW_NONCHAR; 1161 $uvchr_flag_to_warn = $::UNICODE_WARN_NONCHAR; 1162 $uvchr_flag_to_disallow = $::UNICODE_DISALLOW_NONCHAR;; 1163 1164 $utf8n_flag_to_warn_complement = $::UTF8_WARN_SURROGATE 1165 |$::UTF8_WARN_SUPER 1166 |$::UTF8_WARN_PERL_EXTENDED; 1167 $utf8n_flag_to_disallow_complement = $::UTF8_DISALLOW_SURROGATE 1168 |$::UTF8_DISALLOW_SUPER 1169 |$::UTF8_DISALLOW_PERL_EXTENDED; 1170 $uvchr_flag_to_warn_complement = $::UNICODE_WARN_SURROGATE 1171 |$::UNICODE_WARN_SUPER 1172 |$::UNICODE_WARN_PERL_EXTENDED; 1173 $uvchr_flag_to_disallow_complement = $::UNICODE_DISALLOW_SURROGATE 1174 |$::UNICODE_DISALLOW_SUPER 1175 |$::UNICODE_DISALLOW_PERL_EXTENDED; 1176 1177 $controlling_warning_category = 'nonchar'; 1178 } 1179 else { 1180 die "Can't figure out what type of warning to test for $testname" 1181 } 1182 1183 die 'Didn\'t set $needed_to_discern_len for ' . $testname 1184 unless defined $needed_to_discern_len; 1185 1186 # We try various combinations of malformations that can occur 1187 foreach my $short (0, 1) { 1188 next if $skip_most_tests && $short; 1189 foreach my $unexpected_noncont (0, 1) { 1190 next if $skip_most_tests && $unexpected_noncont; 1191 foreach my $overlong (0, 1) { 1192 next if $overlong && $skip_most_tests; 1193 next if $initially_overlong && ! $overlong; 1194 1195 # If we're creating an overlong, it can't be longer than the 1196 # maximum length, so skip if we're already at that length. 1197 next if (! $initially_overlong && $overlong) 1198 && $length >= $::max_bytes; 1199 1200 my $this_cp_message_qr = $cp_message_qr; 1201 my $this_non_cp_trailing_text = $non_cp_trailing_text; 1202 1203 foreach my $malformed_allow_type (0..2) { 1204 # 0 don't allow this malformation; ignored if no malformation 1205 # 1 allow, with REPLACEMENT CHARACTER returned 1206 # 2 allow, with intended code point returned. All malformations 1207 # other than overlong can't determine the intended code point, 1208 # so this isn't valid for them. 1209 next if $malformed_allow_type == 2 1210 && ($will_overflow || $short || $unexpected_noncont); 1211 next if $skip_most_tests && $malformed_allow_type; 1212 1213 # Here we are in the innermost loop for malformations. So we 1214 # know which ones are in effect. Can now change the input to be 1215 # appropriately malformed. We also can set up certain other 1216 # things now, like whether we expect a return flag from this 1217 # malformation, and which flag. 1218 1219 my $this_bytes = $bytes; 1220 my $this_length = $length; 1221 my $this_expected_len = $length; 1222 my $this_needed_to_discern_len = $needed_to_discern_len; 1223 1224 my @malformation_names; 1225 my @expected_malformation_warnings; 1226 my @expected_malformation_return_flags; 1227 1228 # Contains the flags for any allowed malformations. Currently no 1229 # combinations of on/off are tested for. It's either all are 1230 # allowed, or none are. 1231 my $allow_flags = 0; 1232 my $overlong_is_in_perl_extended_utf8 = 0; 1233 my $dont_use_overlong_cp = 0; 1234 1235 if ($initially_orphan) { 1236 next if $overlong || $short || $unexpected_noncont; 1237 } 1238 1239 if ($overlong) { 1240 if (! $initially_overlong) { 1241 my $new_expected_len; 1242 1243 # To force this malformation, we convert the original start 1244 # byte into a continuation byte with the same data bits as 1245 # originally. ... 1246 my $start_byte = substr($this_bytes, 0, 1); 1247 my $converted_to_continuation_byte 1248 = start_byte_to_cont($start_byte); 1249 1250 # ... Then we prepend it with a known overlong sequence. 1251 # This should evaluate to the exact same code point as the 1252 # original. We try to avoid an overlong using Perl 1253 # extended UTF-8. The code points are the highest 1254 # representable as overlongs on the respective platform 1255 # without using extended UTF-8. 1256 if (native_to_I8($start_byte) lt "\xFC") { 1257 $start_byte = I8_to_native("\xFC"); 1258 $new_expected_len = 6; 1259 } 1260 elsif (! isASCII && native_to_I8($start_byte) lt "\xFE") { 1261 1262 # FE is not extended UTF-8 on EBCDIC 1263 $start_byte = I8_to_native("\xFE"); 1264 $new_expected_len = 7; 1265 } 1266 else { # Must use extended UTF-8. On ASCII platforms, we 1267 # could express some overlongs here starting with 1268 # \xFE, but there's no real reason to do so. 1269 $overlong_is_in_perl_extended_utf8 = 1; 1270 $start_byte = I8_to_native("\xFF"); 1271 $new_expected_len = $::max_bytes; 1272 $this_cp_message_qr = $extended_cp_message_qr; 1273 1274 # The warning that gets raised doesn't include the 1275 # code point in the message if the code point can be 1276 # expressed without using extended UTF-8, but the 1277 # particular overlong sequence used is in extended 1278 # UTF-8. To do otherwise would be confusing to the 1279 # user, as it would claim the code point requires 1280 # extended, when it doesn't. 1281 $dont_use_overlong_cp = 1 1282 unless requires_extended_utf8($allowed_uv); 1283 $this_non_cp_trailing_text 1284 = $extended_non_cp_trailing_text; 1285 } 1286 1287 # Splice in the revise continuation byte, preceded by the 1288 # start byte and the proper number of the lowest 1289 # continuation bytes. 1290 $this_bytes = $start_byte 1291 . ($native_lowest_continuation_chr 1292 x ( $new_expected_len 1293 - 1 1294 - length($this_bytes))) 1295 . $converted_to_continuation_byte 1296 . substr($this_bytes, 1); 1297 $this_length = length($this_bytes); 1298 $this_needed_to_discern_len = $new_expected_len 1299 - ( $this_expected_len 1300 - $this_needed_to_discern_len); 1301 $this_expected_len = $new_expected_len; 1302 } 1303 } 1304 1305 if ($short) { 1306 1307 # To force this malformation, just tell the test to not look 1308 # as far as it should into the input. 1309 $this_length--; 1310 $this_expected_len--; 1311 1312 $allow_flags |= $::UTF8_ALLOW_SHORT if $malformed_allow_type; 1313 } 1314 1315 if ($unexpected_noncont) { 1316 1317 # To force this malformation, change the final continuation 1318 # byte into a start byte. 1319 my $pos = ($short) ? -2 : -1; 1320 substr($this_bytes, $pos, 1) = $known_start_byte; 1321 $this_expected_len--; 1322 } 1323 1324 # The whole point of a test that is malformed from the beginning 1325 # is to test for that malformation. If we've modified things so 1326 # much that we don't have enough information to detect that 1327 # malformation, there's no point in testing. 1328 next if $initially_malformed 1329 && $this_expected_len < $this_needed_to_discern_len; 1330 1331 # Here, we've transformed the input with all of the desired 1332 # non-overflow malformations. We are now in a position to 1333 # construct any potential warnings for those malformations. But 1334 # it's a pain to get the detailed messages exactly right, so for 1335 # now XXX, only do so for those that return an explicit code 1336 # point. 1337 1338 if ($initially_orphan) { 1339 push @malformation_names, "orphan continuation"; 1340 push @expected_malformation_return_flags, 1341 $::UTF8_GOT_CONTINUATION; 1342 $allow_flags |= $::UTF8_ALLOW_CONTINUATION 1343 if $malformed_allow_type; 1344 push @expected_malformation_warnings, qr/unexpected continuation/; 1345 } 1346 1347 if ($overlong) { 1348 push @malformation_names, 'overlong'; 1349 push @expected_malformation_return_flags, $::UTF8_GOT_LONG; 1350 1351 # If one of the other malformation types is also in effect, we 1352 # don't know what the intended code point was. 1353 if ($short || $unexpected_noncont || $will_overflow) { 1354 push @expected_malformation_warnings, qr/overlong/; 1355 } 1356 else { 1357 my $wrong_bytes = display_bytes_no_quotes( 1358 substr($this_bytes, 0, $this_length)); 1359 if (! defined $correct_bytes_for_overlong) { 1360 $correct_bytes_for_overlong 1361 = display_bytes_no_quotes($bytes); 1362 } 1363 my $prefix = ( $allowed_uv > 0x10FFFF 1364 || ! isASCII && $allowed_uv < 256) 1365 ? "0x" 1366 : "U+"; 1367 push @expected_malformation_warnings, 1368 qr/\QMalformed UTF-8 character: $wrong_bytes\E 1369 \Q (overlong; instead use\E 1370 \Q $correct_bytes_for_overlong to\E 1371 \Q represent $prefix$uv_string)/x; 1372 } 1373 1374 if ($malformed_allow_type == 2) { 1375 $allow_flags |= $::UTF8_ALLOW_LONG_AND_ITS_VALUE; 1376 } 1377 elsif ($malformed_allow_type) { 1378 $allow_flags |= $::UTF8_ALLOW_LONG; 1379 } 1380 } 1381 if ($short) { 1382 push @malformation_names, 'short'; 1383 push @expected_malformation_return_flags, $::UTF8_GOT_SHORT; 1384 push @expected_malformation_warnings, qr/too short/; 1385 } 1386 if ($unexpected_noncont) { 1387 push @malformation_names, 'unexpected non-continuation'; 1388 push @expected_malformation_return_flags, 1389 $::UTF8_GOT_NON_CONTINUATION; 1390 $allow_flags |= $::UTF8_ALLOW_NON_CONTINUATION 1391 if $malformed_allow_type; 1392 push @expected_malformation_warnings, 1393 qr/unexpected non-continuation byte/; 1394 } 1395 1396 # The overflow malformation is done differently than other 1397 # malformations. It comes from manually typed tests in the test 1398 # array. We now make it be treated like one of the other 1399 # malformations. But some has to be deferred until the inner loop 1400 my $overflow_msg_pattern; 1401 if ($will_overflow) { 1402 push @malformation_names, 'overflow'; 1403 1404 $overflow_msg_pattern = display_bytes_no_quotes( 1405 substr($this_bytes, 0, $this_expected_len)); 1406 $overflow_msg_pattern = qr/\QMalformed UTF-8 character:\E 1407 \Q $overflow_msg_pattern\E 1408 \Q (overflows)\E/x; 1409 push @expected_malformation_return_flags, $::UTF8_GOT_OVERFLOW; 1410 $allow_flags |= $::UTF8_ALLOW_OVERFLOW if $malformed_allow_type; 1411 } 1412 1413 # And we can create the malformation-related text for the the test 1414 # names we eventually will generate. 1415 my $malformations_name = ""; 1416 if (@malformation_names) { 1417 $malformations_name .= "dis" unless $malformed_allow_type; 1418 $malformations_name .= "allowed "; 1419 $malformations_name .= "malformation"; 1420 $malformations_name .= "s" if @malformation_names > 1; 1421 $malformations_name .= ": "; 1422 $malformations_name .= join "/", @malformation_names; 1423 $malformations_name = " ($malformations_name)"; 1424 } 1425 1426 # Done setting up the malformation related stuff 1427 1428 { # First test the isFOO calls 1429 use warnings; # XXX no warnings 'deprecated'; # Make sure these don't raise warnings 1430 undef @warnings_gotten; 1431 1432 my $ret = test_isUTF8_CHAR($this_bytes, $this_length); 1433 my $ret_flags 1434 = test_isUTF8_CHAR_flags($this_bytes, $this_length, 0); 1435 if ($malformations_name) { 1436 is($ret, 0, "For $testname$malformations_name: isUTF8_CHAR() returns 0"); 1437 is($ret_flags, 0, " And isUTF8_CHAR_flags() returns 0"); 1438 } 1439 else { 1440 is($ret, $this_length, "For $testname: isUTF8_CHAR() returns" 1441 . " expected length: $this_length"); 1442 is($ret_flags, $this_length, 1443 " And isUTF8_CHAR_flags(...,0) returns expected" 1444 . " length: $this_length"); 1445 } 1446 is(scalar @warnings_gotten, 0, 1447 " And neither isUTF8_CHAR() nor isUTF8_CHAR()_flags" 1448 . " generated any warnings") 1449 or output_warnings(@warnings_gotten); 1450 1451 undef @warnings_gotten; 1452 $ret = test_isSTRICT_UTF8_CHAR($this_bytes, $this_length); 1453 if ($malformations_name) { 1454 is($ret, 0, " And isSTRICT_UTF8_CHAR() returns 0"); 1455 } 1456 else { 1457 my $expected_ret 1458 = ( $testname =~ /surrogate|non-character/ 1459 || $allowed_uv > 0x10FFFF) 1460 ? 0 1461 : $this_length; 1462 is($ret, $expected_ret, 1463 " And isSTRICT_UTF8_CHAR() returns expected" 1464 . " length: $expected_ret"); 1465 $ret = test_isUTF8_CHAR_flags($this_bytes, $this_length, 1466 $::UTF8_DISALLOW_ILLEGAL_INTERCHANGE); 1467 is($ret, $expected_ret, 1468 " And isUTF8_CHAR_flags('" 1469 . "DISALLOW_ILLEGAL_INTERCHANGE') acts like" 1470 . " isSTRICT_UTF8_CHAR"); 1471 } 1472 is(scalar @warnings_gotten, 0, 1473 " And neither isSTRICT_UTF8_CHAR() nor" 1474 . " isUTF8_CHAR_flags generated any warnings") 1475 or output_warnings(@warnings_gotten); 1476 1477 undef @warnings_gotten; 1478 $ret = test_isC9_STRICT_UTF8_CHAR($this_bytes, $this_length); 1479 if ($malformations_name) { 1480 is($ret, 0, " And isC9_STRICT_UTF8_CHAR() returns 0"); 1481 } 1482 else { 1483 my $expected_ret = ( $testname =~ /surrogate/ 1484 || $allowed_uv > 0x10FFFF) 1485 ? 0 1486 : $this_expected_len; 1487 is($ret, $expected_ret, " And isC9_STRICT_UTF8_CHAR()" 1488 . " returns expected length:" 1489 . " $expected_ret"); 1490 $ret = test_isUTF8_CHAR_flags($this_bytes, $this_length, 1491 $::UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE); 1492 is($ret, $expected_ret, 1493 " And isUTF8_CHAR_flags('" 1494 . "DISALLOW_ILLEGAL_C9_INTERCHANGE') acts like" 1495 . " isC9_STRICT_UTF8_CHAR"); 1496 } 1497 is(scalar @warnings_gotten, 0, 1498 " And neither isC9_STRICT_UTF8_CHAR() nor" 1499 . " isUTF8_CHAR_flags generated any warnings") 1500 or output_warnings(@warnings_gotten); 1501 1502 foreach my $disallow_type (0..2) { 1503 # 0 is don't disallow this type of code point 1504 # 1 is do disallow 1505 # 2 is do disallow, but only code points requiring 1506 # perl-extended-UTF8 1507 1508 my $disallow_flags; 1509 my $expected_ret; 1510 1511 if ($malformations_name) { 1512 1513 # Malformations are by default disallowed, so testing 1514 # with $disallow_type equal to 0 is sufficicient. 1515 next if $disallow_type; 1516 1517 $disallow_flags = 0; 1518 $expected_ret = 0; 1519 } 1520 elsif ($disallow_type == 1) { 1521 $disallow_flags = $utf8n_flag_to_disallow; 1522 $expected_ret = 0; 1523 } 1524 elsif ($disallow_type == 2) { 1525 next if ! requires_extended_utf8($allowed_uv); 1526 $disallow_flags = $::UTF8_DISALLOW_PERL_EXTENDED; 1527 $expected_ret = 0; 1528 } 1529 else { # type is 0 1530 $disallow_flags = $utf8n_flag_to_disallow_complement; 1531 $expected_ret = $this_length; 1532 } 1533 1534 $ret = test_isUTF8_CHAR_flags($this_bytes, $this_length, 1535 $disallow_flags); 1536 is($ret, $expected_ret, 1537 " And isUTF8_CHAR_flags($display_bytes," 1538 . " $disallow_flags) returns $expected_ret") 1539 or diag "The flags mean " 1540 . flags_to_text($disallow_flags, 1541 \@utf8n_flags_to_text); 1542 is(scalar @warnings_gotten, 0, 1543 " And isUTF8_CHAR_flags(...) generated" 1544 . " no warnings") 1545 or output_warnings(@warnings_gotten); 1546 1547 # Test partial character handling, for each byte not a 1548 # full character 1549 my $did_test_partial = 0; 1550 for (my $j = 1; $j < $this_length - 1; $j++) { 1551 $did_test_partial = 1; 1552 my $partial = substr($this_bytes, 0, $j); 1553 my $ret_should_be; 1554 my $comment; 1555 if ($disallow_type || $malformations_name) { 1556 $ret_should_be = 0; 1557 $comment = "disallowed"; 1558 1559 # The number of bytes required to tell if a 1560 # sequence has something wrong is the smallest of 1561 # all the things wrong with it. We start with the 1562 # number for this type of code point, if that is 1563 # disallowed; or the whole length if not. The 1564 # latter is what a couple of the malformations 1565 # require. 1566 my $needed_to_tell = ($disallow_type) 1567 ? $this_needed_to_discern_len 1568 : $this_expected_len; 1569 1570 # Then we see if the malformations that are 1571 # detectable early in the string are present. 1572 if ($overlong) { 1573 my $dl = overlong_discern_len($this_bytes); 1574 $needed_to_tell = $dl if $dl < $needed_to_tell; 1575 } 1576 if ($will_overflow) { 1577 my $dl = overflow_discern_len($length); 1578 $needed_to_tell = $dl if $dl < $needed_to_tell; 1579 } 1580 1581 if ($j < $needed_to_tell) { 1582 $ret_should_be = 1; 1583 $comment .= ", but need $needed_to_tell" 1584 . " bytes to discern:"; 1585 } 1586 } 1587 else { 1588 $ret_should_be = 1; 1589 $comment = "allowed"; 1590 } 1591 1592 undef @warnings_gotten; 1593 1594 $ret = test_is_utf8_valid_partial_char_flags($partial, 1595 $j, $disallow_flags); 1596 is($ret, $ret_should_be, 1597 " And is_utf8_valid_partial_char_flags(" 1598 . display_bytes($partial) 1599 . ", $disallow_flags), $comment: returns" 1600 . " $ret_should_be") 1601 or diag "The flags mean " 1602 . flags_to_text($disallow_flags, \@utf8n_flags_to_text); 1603 } 1604 1605 if ($did_test_partial) { 1606 is(scalar @warnings_gotten, 0, 1607 " And is_utf8_valid_partial_char_flags()" 1608 . " generated no warnings for any of the lengths") 1609 or output_warnings(@warnings_gotten); 1610 } 1611 } 1612 } 1613 1614 # Now test the to/from UTF-8 calls. There are several orthogonal 1615 # variables involved. We test most possible combinations 1616 1617 foreach my $do_disallow (0, 1) { 1618 if ($do_disallow) { 1619 next if $initially_overlong || $initially_orphan; 1620 } 1621 else { 1622 next if $skip_most_tests; 1623 } 1624 1625 # This tests four functions: utf8n_to_uvchr_error, 1626 # utf8n_to_uvchr_msgs, uvchr_to_utf8_flags, and 1627 # uvchr_to_utf8_msgs. The first two are variants of each other, 1628 # and the final two also form a pair. We use a loop 'which_func' 1629 # to determine which of each pair is being tested. The main loop 1630 # tests either the first and third, or the 2nd and fourth. 1631 # which_func is sets whether we are expecting warnings or not in 1632 # certain places. The _msgs() version of the functions expects 1633 # warnings even if lexical ones are turned off, so by making its 1634 # which_func == 1, we can say we want warnings; whereas the other 1635 # one with the value 0, doesn't get them. 1636 for my $which_func (0, 1) { 1637 my $utf8_func = ($which_func) 1638 ? 'utf8n_to_uvchr_msgs' 1639 : 'utf8n_to_uvchr_error'; 1640 1641 # We classify the warnings into certain "interesting" types, 1642 # described later 1643 foreach my $warning_type (0..4) { 1644 next if $skip_most_tests && $warning_type != 1; 1645 foreach my $use_warn_flag (0, 1) { 1646 if ($use_warn_flag) { 1647 next if $initially_overlong || $initially_orphan; 1648 1649 # Since foo_msgs() expects warnings even when lexical 1650 # ones are turned off, we can skip testing it when 1651 # they are turned on, with little likelihood of 1652 # missing an error case. 1653 next if $which_func; 1654 } 1655 else { 1656 next if $skip_most_tests; 1657 } 1658 1659 # Finally, here is the inner loop 1660 1661 my $this_utf8n_flag_to_warn = $utf8n_flag_to_warn; 1662 my $this_utf8n_flag_to_disallow = $utf8n_flag_to_disallow; 1663 my $this_uvchr_flag_to_warn = $uvchr_flag_to_warn; 1664 my $this_uvchr_flag_to_disallow = $uvchr_flag_to_disallow; 1665 1666 my $eval_warn; 1667 my $expect_regular_warnings; 1668 my $expect_warnings_for_malformed; 1669 my $expect_warnings_for_overflow; 1670 1671 if ($warning_type == 0) { 1672 $eval_warn = "use warnings"; 1673 $expect_regular_warnings = $use_warn_flag; 1674 1675 # We ordinarily expect overflow warnings here. But it 1676 # is somewhat more complicated, and the final 1677 # determination is deferred to one place in the file 1678 # where we handle overflow. 1679 $expect_warnings_for_overflow = 1; 1680 1681 # We would ordinarily expect malformed warnings in 1682 # this case, but not if malformations are allowed. 1683 $expect_warnings_for_malformed 1684 = $malformed_allow_type == 0; 1685 } 1686 elsif ($warning_type == 1) { 1687 $eval_warn = "no warnings"; 1688 $expect_regular_warnings = $which_func; 1689 $expect_warnings_for_overflow = $which_func; 1690 $expect_warnings_for_malformed = $which_func; 1691 } 1692 elsif ($warning_type == 2) { 1693 $eval_warn = "no warnings; use warnings 'utf8'"; 1694 $expect_regular_warnings = $use_warn_flag; 1695 $expect_warnings_for_overflow = 1; 1696 $expect_warnings_for_malformed 1697 = $malformed_allow_type == 0; 1698 } 1699 elsif ($warning_type == 3) { 1700 $eval_warn = "no warnings; use warnings" 1701 . " '$controlling_warning_category'"; 1702 $expect_regular_warnings = $use_warn_flag; 1703 $expect_warnings_for_overflow 1704 = $controlling_warning_category eq 'non_unicode'; 1705 $expect_warnings_for_malformed = $which_func; 1706 } 1707 elsif ($warning_type == 4) { # Like type 3, but uses the 1708 # PERL_EXTENDED flags 1709 # The complement flags were set up so that the 1710 # PERL_EXTENDED flags have been tested that they don't 1711 # trigger wrongly for too small code points. And the 1712 # flags have been set up so that those small code 1713 # points are tested for being above Unicode. What's 1714 # left to test is that the large code points do 1715 # trigger the PERL_EXTENDED flags. 1716 next if ! requires_extended_utf8($allowed_uv); 1717 next if $controlling_warning_category ne 'non_unicode'; 1718 $eval_warn = "no warnings; use warnings 'non_unicode'"; 1719 $expect_regular_warnings = 1; 1720 $expect_warnings_for_overflow = 1; 1721 $expect_warnings_for_malformed = 0; 1722 $this_utf8n_flag_to_warn = $::UTF8_WARN_PERL_EXTENDED; 1723 $this_utf8n_flag_to_disallow 1724 = $::UTF8_DISALLOW_PERL_EXTENDED; 1725 $this_uvchr_flag_to_warn 1726 = $::UNICODE_WARN_PERL_EXTENDED; 1727 $this_uvchr_flag_to_disallow 1728 = $::UNICODE_DISALLOW_PERL_EXTENDED; 1729 } 1730 else { 1731 die "Unexpected warning type '$warning_type'"; 1732 } 1733 1734 # We only need to test the case where all warnings are 1735 # enabled (type 0) to see if turning off the warning flag 1736 # causes things to not be output. If those pass, then 1737 # turning on some sub-category of warnings, or turning off 1738 # warnings altogether are extremely likely to not output 1739 # warnings either, given how the warnings subsystem is 1740 # supposed to work, and this file assumes it does work. 1741 next if $warning_type != 0 && ! $use_warn_flag; 1742 1743 # The convention is that the 'got' flag is the same value 1744 # as the disallow one. If this were violated, the tests 1745 # here should start failing. 1746 my $return_flag = $this_utf8n_flag_to_disallow; 1747 1748 # If we aren't expecting warnings/disallow for this, turn 1749 # on all the other flags. That makes sure that they all 1750 # are independent of this flag, and so we don't need to 1751 # test them individually. 1752 my $this_warning_flags 1753 = ($use_warn_flag) 1754 ? $this_utf8n_flag_to_warn 1755 : ($overlong_is_in_perl_extended_utf8 1756 ? ($utf8n_flag_to_warn_complement 1757 & ~$::UTF8_WARN_PERL_EXTENDED) 1758 : $utf8n_flag_to_warn_complement); 1759 my $this_disallow_flags 1760 = ($do_disallow) 1761 ? $this_utf8n_flag_to_disallow 1762 : ($overlong_is_in_perl_extended_utf8 1763 ? ($utf8n_flag_to_disallow_complement 1764 & ~$::UTF8_DISALLOW_PERL_EXTENDED) 1765 : $utf8n_flag_to_disallow_complement); 1766 my $expected_uv = $allowed_uv; 1767 my $this_uv_string = $uv_string; 1768 1769 my @expected_return_flags 1770 = @expected_malformation_return_flags; 1771 my @expected_warnings; 1772 push @expected_warnings, @expected_malformation_warnings 1773 if $expect_warnings_for_malformed; 1774 1775 # The overflow malformation is done differently than other 1776 # malformations. It comes from manually typed tests in 1777 # the test array, but it also is above Unicode and uses 1778 # Perl extended UTF-8, so affects some of the flags being 1779 # tested. We now make it be treated like one of the other 1780 # generated malformations. 1781 if ($will_overflow) { 1782 1783 # An overflow is (way) above Unicode, and overrides 1784 # everything else. 1785 $expect_regular_warnings = 0; 1786 1787 # Earlier, we tentatively calculated whether this 1788 # should emit a message or not. It's tentative 1789 # because, even if we ordinarily would output it, we 1790 # don't if malformations are allowed -- except an 1791 # overflow is also a SUPER and PERL_EXTENDED, and if 1792 # warnings for those are enabled, the overflow 1793 # warning does get raised. 1794 if ( $expect_warnings_for_overflow 1795 && ( $malformed_allow_type == 0 1796 || ( $this_warning_flags 1797 & ($::UTF8_WARN_SUPER 1798 |$::UTF8_WARN_PERL_EXTENDED)))) 1799 { 1800 push @expected_warnings, $overflow_msg_pattern; 1801 } 1802 } 1803 1804 # It may be that the malformations have shortened the 1805 # amount of input we look at so much that we can't tell 1806 # what the category the code point was in. Otherwise, set 1807 # up the expected return flags based on the warnings and 1808 # disallowments. 1809 if ($this_expected_len < $this_needed_to_discern_len) { 1810 $expect_regular_warnings = 0; 1811 } 1812 elsif ( ($this_warning_flags & $this_utf8n_flag_to_warn) 1813 || ( $this_disallow_flags 1814 & $this_utf8n_flag_to_disallow)) 1815 { 1816 push @expected_return_flags, $return_flag; 1817 } 1818 1819 # Finish setting up the expected warning. 1820 if ($expect_regular_warnings) { 1821 1822 # So far the array contains warnings generated by 1823 # malformations. Add the expected regular one. 1824 unshift @expected_warnings, $this_cp_message_qr; 1825 1826 # But it may need to be modified, because either of 1827 # these malformations means we can't determine the 1828 # expected code point. 1829 if ( $short || $unexpected_noncont 1830 || $dont_use_overlong_cp) 1831 { 1832 my $first_byte = substr($this_bytes, 0, 1); 1833 $expected_warnings[0] = display_bytes( 1834 substr($this_bytes, 0, $this_expected_len)); 1835 $expected_warnings[0] 1836 = qr/[Aa]\Qny UTF-8 sequence that starts with\E 1837 \Q $expected_warnings[0]\E 1838 \Q $this_non_cp_trailing_text\E/x; 1839 } 1840 } 1841 1842 # Is effectively disallowed if we've set up a malformation 1843 # (unless malformations are allowed), even if the flag 1844 # indicates it is allowed. Fix up test name to indicate 1845 # this as well 1846 my $disallowed = 0; 1847 if ( $this_disallow_flags & $this_utf8n_flag_to_disallow 1848 && $this_expected_len >= $this_needed_to_discern_len) 1849 { 1850 $disallowed = 1; 1851 } 1852 if ($malformations_name) { 1853 if ($malformed_allow_type == 0) { 1854 $disallowed = 1; 1855 } 1856 elsif ($malformed_allow_type == 1) { 1857 1858 # Even if allowed, the malformation returns the 1859 # REPLACEMENT CHARACTER. 1860 $expected_uv = 0xFFFD; 1861 $this_uv_string = "0xFFFD" 1862 } 1863 } 1864 1865 my $this_name = "$utf8_func() $testname: "; 1866 my @scratch_expected_return_flags = @expected_return_flags; 1867 if (! $initially_malformed) { 1868 $this_name .= ($disallowed) 1869 ? 'disallowed, ' 1870 : 'allowed, '; 1871 } 1872 $this_name .= "$eval_warn"; 1873 $this_name .= ", " . (( $this_warning_flags 1874 & $this_utf8n_flag_to_warn) 1875 ? 'with flag for raising warnings' 1876 : 'no flag for raising warnings'); 1877 $this_name .= $malformations_name; 1878 1879 # Do the actual test using an eval 1880 undef @warnings_gotten; 1881 my $ret_ref; 1882 my $this_flags 1883 = $allow_flags|$this_warning_flags|$this_disallow_flags; 1884 my $eval_text = "$eval_warn; \$ret_ref" 1885 . " = test_$utf8_func(" 1886 . "'$this_bytes', $this_length, $this_flags)"; 1887 eval "$eval_text"; 1888 if (! ok ($@ eq "", "$this_name: eval succeeded")) 1889 { 1890 diag "\$@='$@'; call was: " 1891 . utf8n_display_call($eval_text); 1892 next; 1893 } 1894 1895 if ($disallowed) { 1896 is($ret_ref->[0], 0, " And returns 0") 1897 or diag "Call was: " . utf8n_display_call($eval_text); 1898 } 1899 else { 1900 is($ret_ref->[0], $expected_uv, 1901 " And returns expected uv: " 1902 . $this_uv_string) 1903 or diag "Call was: " . utf8n_display_call($eval_text); 1904 } 1905 is($ret_ref->[1], $this_expected_len, 1906 " And returns expected length:" 1907 . " $this_expected_len") 1908 or diag "Call was: " . utf8n_display_call($eval_text); 1909 1910 my $returned_flags = $ret_ref->[2]; 1911 1912 for (my $i = @scratch_expected_return_flags - 1; 1913 $i >= 0; 1914 $i--) 1915 { 1916 if ($scratch_expected_return_flags[$i] & $returned_flags) 1917 { 1918 if ($scratch_expected_return_flags[$i] 1919 == $::UTF8_GOT_PERL_EXTENDED) 1920 { 1921 pass(" Expected and got return flag for" 1922 . " PERL_EXTENDED"); 1923 } 1924 # The first entries in this are 1925 # malformations 1926 elsif ($i > @malformation_names - 1) { 1927 pass(" Expected and got return flag" 1928 . " for " . $controlling_warning_category); 1929 } 1930 else { 1931 pass(" Expected and got return flag for " 1932 . $malformation_names[$i] 1933 . " malformation"); 1934 } 1935 $returned_flags 1936 &= ~$scratch_expected_return_flags[$i]; 1937 splice @scratch_expected_return_flags, $i, 1; 1938 } 1939 } 1940 1941 if (! is($returned_flags, 0, 1942 " Got no unexpected return flags")) 1943 { 1944 diag "The unexpected flags gotten were: " 1945 . (flags_to_text($returned_flags, 1946 \@utf8n_flags_to_text) 1947 # We strip off any prefixes from the flag 1948 # names 1949 =~ s/ \b [A-Z] _ //xgr); 1950 diag "Call was: " . utf8n_display_call($eval_text); 1951 } 1952 1953 if (! is (scalar @scratch_expected_return_flags, 0, 1954 " Got all expected return flags")) 1955 { 1956 diag "The expected flags not gotten were: " 1957 . (flags_to_text(eval join("|", 1958 @scratch_expected_return_flags), 1959 \@utf8n_flags_to_text) 1960 # We strip off any prefixes from the flag 1961 # names 1962 =~ s/ \b [A-Z] _ //xgr); 1963 diag "Call was: " . utf8n_display_call($eval_text); 1964 } 1965 1966 if ($which_func) { 1967 my @returned_warnings; 1968 for my $element_ref (@{$ret_ref->[3]}) { 1969 push @returned_warnings, $element_ref->{'text'}; 1970 my $text = $element_ref->{'text'}; 1971 my $flag = $element_ref->{'flag_bit'}; 1972 my $category = $element_ref->{'warning_category'}; 1973 1974 if (! ok(($flag & ($flag-1)) == 0, 1975 "flag for returned msg is a single bit")) 1976 { 1977 diag sprintf("flags are %x; msg=%s", $flag, $text); 1978 } 1979 else { 1980 if (grep { $_ == $flag } @expected_return_flags) { 1981 pass("flag for returned msg is expected"); 1982 } 1983 else { 1984 fail("flag (" 1985 . flags_to_text($flag, \@utf8n_flags_to_text) 1986 . ") for returned msg is expected"); 1987 } 1988 } 1989 1990 # In perl space, don't know the category numbers 1991 isnt($category, 0, 1992 "returned category for msg isn't 0"); 1993 } 1994 1995 ok(@warnings_gotten == 0, "$utf8_func raised no warnings;" 1996 . " the next tests are for ones in the returned" 1997 . " variable") 1998 or diag join "\n", "The unexpected warnings were:", 1999 @warnings_gotten; 2000 @warnings_gotten = @returned_warnings; 2001 } 2002 2003 do_warnings_test(@expected_warnings) 2004 or diag "Call was: " . utf8n_display_call($eval_text); 2005 undef @warnings_gotten; 2006 2007 # Check CHECK_ONLY results when the input is 2008 # disallowed. Do this when actually disallowed, 2009 # not just when the $this_disallow_flags is set. We only 2010 # test once utf8n_to_uvchr_msgs() with this. 2011 if ( $disallowed 2012 && ($which_func == 0 || ! $tested_CHECK_ONLY)) 2013 { 2014 $tested_CHECK_ONLY = 1; 2015 my $this_flags = $this_disallow_flags|$::UTF8_CHECK_ONLY; 2016 my $eval_text = "use warnings; \$ret_ref =" 2017 . " test_$utf8_func('" 2018 . "$this_bytes', $this_length," 2019 . " $this_flags)"; 2020 eval $eval_text; 2021 if (! ok ($@ eq "", 2022 " And eval succeeded with CHECK_ONLY")) 2023 { 2024 diag "\$@='$@'; Call was: " 2025 . utf8n_display_call($eval_text); 2026 next; 2027 } 2028 is($ret_ref->[0], 0, " CHECK_ONLY: Returns 0") 2029 or diag "Call was: " . utf8n_display_call($eval_text); 2030 is($ret_ref->[1], -1, 2031 " CHECK_ONLY: returns -1 for length") 2032 or diag "Call was: " . utf8n_display_call($eval_text); 2033 if (! is(scalar @warnings_gotten, 0, 2034 " CHECK_ONLY: no warnings generated")) 2035 { 2036 diag "Call was: " . utf8n_display_call($eval_text); 2037 output_warnings(@warnings_gotten); 2038 } 2039 } 2040 2041 # Now repeat some of the above, but for 2042 # uvchr_to_utf8_flags(). Since this comes from an 2043 # existing code point, it hasn't overflowed, and isn't 2044 # malformed. 2045 next if @malformation_names; 2046 2047 my $uvchr_func = ($which_func) 2048 ? 'uvchr_to_utf8_flags_msgs' 2049 : 'uvchr_to_utf8_flags'; 2050 2051 $this_warning_flags = ($use_warn_flag) 2052 ? $this_uvchr_flag_to_warn 2053 : 0; 2054 $this_disallow_flags = ($do_disallow) 2055 ? $this_uvchr_flag_to_disallow 2056 : 0; 2057 2058 $disallowed = $this_disallow_flags 2059 & $this_uvchr_flag_to_disallow; 2060 $this_name .= ", " . (( $this_warning_flags 2061 & $this_utf8n_flag_to_warn) 2062 ? 'with flag for raising warnings' 2063 : 'no flag for raising warnings'); 2064 2065 $this_name = "$uvchr_func() $testname: " 2066 . (($disallowed) 2067 ? 'disallowed' 2068 : 'allowed'); 2069 $this_name .= ", $eval_warn"; 2070 $this_name .= ", " . (( $this_warning_flags 2071 & $this_uvchr_flag_to_warn) 2072 ? 'with warning flag' 2073 : 'no warning flag'); 2074 2075 undef @warnings_gotten; 2076 my $ret; 2077 $this_flags = $this_warning_flags|$this_disallow_flags; 2078 $eval_text = "$eval_warn; \$ret =" 2079 . " test_$uvchr_func(" 2080 . "$allowed_uv, $this_flags)"; 2081 eval "$eval_text"; 2082 if (! ok ($@ eq "", "$this_name: eval succeeded")) 2083 { 2084 diag "\$@='$@'; call was: " 2085 . uvchr_display_call($eval_text); 2086 next; 2087 } 2088 2089 if ($which_func) { 2090 if (defined $ret->[1]) { 2091 my @returned_warnings; 2092 push @returned_warnings, $ret->[1]{'text'}; 2093 my $text = $ret->[1]{'text'}; 2094 my $flag = $ret->[1]{'flag_bit'}; 2095 my $category = $ret->[1]{'warning_category'}; 2096 2097 if (! ok(($flag & ($flag-1)) == 0, 2098 "flag for returned msg is a single bit")) 2099 { 2100 diag sprintf("flags are %x; msg=%s", $flag, $text); 2101 } 2102 else { 2103 if ($flag & $this_uvchr_flag_to_disallow) { 2104 pass("flag for returned msg is expected"); 2105 } 2106 else { 2107 fail("flag (" 2108 . flags_to_text($flag, \@utf8n_flags_to_text) 2109 . ") for returned msg is expected"); 2110 } 2111 } 2112 2113 # In perl space, don't know the category numbers 2114 isnt($category, 0, 2115 "returned category for msg isn't 0"); 2116 2117 ok(@warnings_gotten == 0, "$uvchr_func raised no warnings;" 2118 . " the next tests are for ones in the returned" 2119 . " variable") 2120 or diag join "\n", "The unexpected warnings were:", 2121 @warnings_gotten; 2122 @warnings_gotten = @returned_warnings; 2123 } 2124 2125 $ret = $ret->[0]; 2126 } 2127 2128 if ($disallowed) { 2129 is($ret, undef, " And returns undef") 2130 or diag "Call was: " . uvchr_display_call($eval_text); 2131 } 2132 else { 2133 is($ret, $this_bytes, " And returns expected string") 2134 or diag "Call was: " . uvchr_display_call($eval_text); 2135 } 2136 2137 do_warnings_test(@expected_warnings) 2138 or diag "Call was: " . uvchr_display_call($eval_text); 2139 } 2140 } 2141 } 2142 } 2143 } 2144 } 2145 } 2146 } 2147} 2148 2149done_testing; 2150