1#!perl -w 2 3BEGIN { 4 require 'loc_tools.pl'; # Contains locales_enabled() and 5 # find_utf8_ctype_locale() 6} 7 8use strict; 9use Test::More; 10use Config; 11 12use XS::APItest; 13 14my $tab = " " x 4; # Indent subsidiary tests this much 15 16use Unicode::UCD qw(search_invlist prop_invmap prop_invlist); 17my ($charname_list, $charname_map, $format, $default) = prop_invmap("Name Alias"); 18 19sub get_charname($) { 20 my $cp = shift; 21 22 # If there is a an abbreviation for the code point name, use it 23 my $name_index = search_invlist(\@{$charname_list}, $cp); 24 if (defined $name_index) { 25 my $synonyms = $charname_map->[$name_index]; 26 if (ref $synonyms) { 27 my $pat = qr/: abbreviation/; 28 my @abbreviations = grep { $_ =~ $pat } @$synonyms; 29 if (@abbreviations) { 30 return $abbreviations[0] =~ s/$pat//r; 31 } 32 } 33 } 34 35 # Otherwise, use the full name 36 use charnames (); 37 return charnames::viacode($cp) // "No name"; 38} 39 40sub truth($) { # Converts values so is() works 41 return (shift) ? 1 : 0; 42} 43 44my $base_locale; 45my $utf8_locale; 46if(locales_enabled('LC_ALL')) { 47 require POSIX; 48 $base_locale = POSIX::setlocale( &POSIX::LC_ALL, "C"); 49 if (defined $base_locale && $base_locale eq 'C') { 50 use locale; # make \w work right in non-ASCII lands 51 52 # Some locale implementations don't have the 128-255 characters all 53 # mean nothing. Skip the locale tests in that situation 54 for my $u (128 .. 255) { 55 if (chr(utf8::unicode_to_native($u)) =~ /[[:print:]]/) { 56 undef $base_locale; 57 last; 58 } 59 } 60 61 $utf8_locale = find_utf8_ctype_locale() if $base_locale; 62 } 63} 64 65sub get_display_locale_or_skip($$) { 66 67 # Helper function intimately tied to its callers. It knows the loop 68 # iterates with a locale of "", meaning don't use locale; $base_locale 69 # meaning to use a non-UTF-8 locale; and $utf8_locale. 70 # 71 # It checks to see if the current test should be skipped or executed, 72 # returning an empty list for the former, and for the latter: 73 # ( 'locale display name', 74 # bool of is this a UTF-8 locale ) 75 # 76 # The display name is the empty string if not using locale. Functions 77 # with _LC in their name are skipped unless in locale, and functions 78 # without _LC are executed only outside locale. 79 80 my ($locale, $suffix) = @_; 81 82 # The test should be skipped if the input is for a non-existent locale 83 return unless defined $locale; 84 85 # Here the input is defined, either a locale name or "". If the test is 86 # for not using locales, we want to do the test for non-LC functions, 87 # and skip it for LC ones. 88 if ($locale eq "") { 89 return ("", 0) if $suffix !~ /LC/; 90 return; 91 } 92 93 # Here the input is for a real locale. We don't test the non-LC functions 94 # for locales. 95 return if $suffix !~ /LC/; 96 97 # Here is for a LC function and a real locale. The base locale is not 98 # UTF-8. 99 return (" ($locale locale)", 0) if $locale eq $base_locale; 100 101 # The only other possibility is that we have a UTF-8 locale 102 return (" ($locale)", 1); 103} 104 105sub try_malforming($$$) 106{ 107 # Determines if the tests for malformed UTF-8 should be done. When done, 108 # the .xs code creates malformations by pretending the length is shorter 109 # than it actually is. Some things can't be malformed, and sometimes this 110 # test knows that the current code doesn't look for a malformation under 111 # various circumstances. 112 113 my ($u, $function, $using_locale) = @_; 114 # $u is unicode code point; 115 116 # Single bytes can't be malformed 117 return 0 if $u < ((ord "A" == 65) ? 128 : 160); 118 119 # ASCII doesn't need to ever look beyond the first byte. 120 return 0 if $function eq "ASCII"; 121 122 # Nor, on EBCDIC systems, does CNTRL 123 return 0 if ord "A" != 65 && $function eq "CNTRL"; 124 125 # No controls above 255, so the code doesn't look at those 126 return 0 if $u > 255 && $function eq "CNTRL"; 127 128 # No non-ASCII digits below 256, except if using locales. 129 return 0 if $u < 256 && ! $using_locale && $function =~ /X?DIGIT/; 130 131 return 1; 132} 133 134my %properties = ( 135 # name => Lookup-property name 136 alnum => 'Word', 137 wordchar => 'Word', 138 alphanumeric => 'Alnum', 139 alpha => 'XPosixAlpha', 140 ascii => 'ASCII', 141 blank => 'Blank', 142 cntrl => 'Control', 143 digit => 'Digit', 144 graph => 'Graph', 145 idfirst => '_Perl_IDStart', 146 idcont => '_Perl_IDCont', 147 lower => 'XPosixLower', 148 print => 'Print', 149 psxspc => 'XPosixSpace', 150 punct => 'XPosixPunct', 151 quotemeta => '_Perl_Quotemeta', 152 space => 'XPerlSpace', 153 vertws => 'VertSpace', 154 upper => 'XPosixUpper', 155 xdigit => 'XDigit', 156 ); 157 158my %seen; 159my @warnings; 160local $SIG{__WARN__} = sub { push @warnings, @_ }; 161 162my %utf8_param_code = ( 163 "_safe" => 0, 164 "_safe, malformed" => 1, 165 "deprecated unsafe" => -1, 166 "deprecated mathoms" => -2, 167 ); 168 169# This test is split into this number of files. 170my $num_test_files = $ENV{TEST_JOBS} || 1; 171$::TEST_CHUNK = 0 if $num_test_files == 1 && ! defined $::TEST_CHUNK; 172$num_test_files = 10 if $num_test_files > 10; 173 174my $property_count = -1; 175foreach my $name (sort keys %properties, 'octal') { 176 177 # We test every nth property in this run so that this test is split into 178 # smaller chunks to minimize test suite elapsed time when run in parallel. 179 $property_count++; 180 next if $property_count % $num_test_files != $::TEST_CHUNK; 181 182 my @invlist; 183 if ($name eq 'octal') { 184 # Hand-roll an inversion list with 0-7 in it and nothing else. 185 push @invlist, ord "0", ord "8"; 186 } 187 else { 188 my $property = $properties{$name}; 189 @invlist = prop_invlist($property, '_perl_core_internal_ok'); 190 if (! @invlist) { 191 192 # An empty return could mean an unknown property, or merely that 193 # it is empty. Call in scalar context to differentiate 194 if (! prop_invlist($property, '_perl_core_internal_ok')) { 195 fail("No inversion list found for $property"); 196 next; 197 } 198 } 199 } 200 201 # Include all the Latin1 code points, plus 0x100. 202 my @code_points = (0 .. 256); 203 204 # Then include the next few boundaries above those from this property 205 my $above_latins = 0; 206 foreach my $range_start (@invlist) { 207 next if $range_start < 257; 208 push @code_points, $range_start - 1, $range_start; 209 $above_latins++; 210 last if $above_latins > 5; 211 } 212 213 # This makes sure we are using the Perl definition of idfirst and idcont, 214 # and not the Unicode. There are a few differences. 215 push @code_points, ord "\N{ESTIMATED SYMBOL}" if $name =~ /^id(first|cont)/; 216 if ($name eq "idcont") { # And some that are continuation but not start 217 push @code_points, ord("\N{GREEK ANO TELEIA}"), 218 ord("\N{COMBINING GRAVE ACCENT}"); 219 } 220 221 # And finally one non-Unicode code point. 222 push @code_points, 0x110000; # Above Unicode, no prop should match 223 no warnings 'non_unicode'; 224 225 for my $n (@code_points) { 226 my $u = utf8::native_to_unicode($n); 227 my $function = uc($name); 228 229 is (@warnings, 0, "Got no unexpected warnings in previous iteration") 230 or diag("@warnings"); 231 undef @warnings; 232 233 my $matches = search_invlist(\@invlist, $n); 234 if (! defined $matches) { 235 $matches = 0; 236 } 237 else { 238 $matches = truth(! ($matches % 2)); 239 } 240 241 my $ret; 242 my $char_name = get_charname($n); 243 my $display_name = sprintf "\\x{%02X, %s}", $n, $char_name; 244 my $display_call = "is${function}( $display_name )"; 245 246 foreach my $suffix ("", "_A", "_L1", "_LC", "_uni", "_uvchr", 247 "_LC_uvchr", "_utf8", "_LC_utf8") 248 { 249 250 # Not all possible macros have been defined 251 if ($name eq 'vertws') { 252 253 # vertws is always all of Unicode 254 next if $suffix !~ / ^ _ ( uni | uvchr | utf8 ) $ /x; 255 } 256 elsif ($name eq 'alnum') { 257 258 # ALNUM_A, ALNUM_L1, and ALNUM_uvchr are not defined as these 259 # suffixes were added later, after WORDCHAR was created to be 260 # a clearer synonym for ALNUM 261 next if $suffix eq '_A' 262 || $suffix eq '_L1' 263 || $suffix eq '_uvchr'; 264 } 265 elsif ($name eq 'octal') { 266 next if $suffix ne "" && $suffix ne '_A' && $suffix ne '_L1'; 267 } 268 elsif ($name eq 'quotemeta') { 269 # There is only one macro for this, and is defined only for 270 # Latin1 range 271 next if $suffix ne "" 272 } 273 274 foreach my $locale ("", $base_locale, $utf8_locale) { 275 276 my ($display_locale, $locale_is_utf8) 277 = get_display_locale_or_skip($locale, $suffix); 278 next unless defined $display_locale; 279 280 use if $locale, "locale"; 281 POSIX::setlocale( &POSIX::LC_ALL, $locale) if $locale; 282 283 if ($suffix !~ /utf8/) { # _utf8 has to handled specially 284 my $display_call 285 = "is${function}$suffix( $display_name )$display_locale"; 286 $ret = truth eval "test_is${function}$suffix($n)"; 287 if (is ($@, "", "$display_call didn't give error")) { 288 my $truth = $matches; 289 if ($truth) { 290 291 # The single byte functions are false for 292 # above-Latin1 293 if ($n >= 256) { 294 $truth = 0 295 if $suffix=~ / ^ ( _A | _L [1C] )? $ /x; 296 } 297 elsif ( $u >= 128 298 && $name ne 'quotemeta') 299 { 300 301 # The no-suffix and _A functions are false 302 # for non-ASCII. So are _LC functions on a 303 # non-UTF-8 locale 304 $truth = 0 if $suffix eq "_A" 305 || $suffix eq "" 306 || ( $suffix =~ /LC/ 307 && ! $locale_is_utf8); 308 } 309 } 310 311 is ($ret, $truth, "${tab}And correctly returns $truth"); 312 } 313 } 314 else { # _utf8 suffix 315 my $char = chr($n); 316 utf8::upgrade($char); 317 $char = quotemeta $char if $char eq '\\' || $char eq "'"; 318 my $truth; 319 if ( $suffix =~ /LC/ 320 && ! $locale_is_utf8 321 && $n < 256 322 && $u >= 128) 323 { # The C-locale _LC function returns FALSE for Latin1 324 # above ASCII 325 $truth = 0; 326 } 327 else { 328 $truth = $matches; 329 } 330 331 foreach my $utf8_param("_safe", 332 "_safe, malformed", 333 "deprecated unsafe" 334 ) 335 { 336 my $utf8_param_code = $utf8_param_code{$utf8_param}; 337 my $expect_error = $utf8_param_code > 0; 338 next if $expect_error 339 && ! try_malforming($u, $function, 340 $suffix =~ /LC/); 341 342 my $display_call = "is${function}$suffix( $display_name" 343 . ", $utf8_param )$display_locale"; 344 $ret = truth eval "test_is${function}$suffix('$char'," 345 . " $utf8_param_code)"; 346 if ($expect_error) { 347 isnt ($@, "", 348 "expected and got error in $display_call"); 349 like($@, qr/Malformed UTF-8 character/, 350 "${tab}And got expected message"); 351 if (is (@warnings, 1, 352 "${tab}Got a single warning besides")) 353 { 354 like($warnings[0], 355 qr/Malformed UTF-8 character.*short/, 356 "${tab}Got expected warning"); 357 } 358 else { 359 diag("@warnings"); 360 } 361 undef @warnings; 362 } 363 elsif (is ($@, "", "$display_call didn't give error")) { 364 is ($ret, $truth, 365 "${tab}And correctly returned $truth"); 366 if ($utf8_param_code < 0) { 367 my $warnings_ok; 368 my $unique_function = "is" . $function . $suffix; 369 if (! $seen{$unique_function}++) { 370 $warnings_ok = is(@warnings, 1, 371 "${tab}This is first call to" 372 . " $unique_function; Got a single" 373 . " warning"); 374 if ($warnings_ok) { 375 $warnings_ok = like($warnings[0], 376 qr/starting in Perl .* will require an additional parameter/, 377 "${tab}The warning was the expected" 378 . " deprecation one"); 379 } 380 } 381 else { 382 $warnings_ok = is(@warnings, 0, 383 "${tab}This subsequent call to" 384 . " $unique_function did not warn"); 385 } 386 $warnings_ok or diag("@warnings"); 387 undef @warnings; 388 } 389 } 390 } 391 } 392 } 393 } 394 } 395} 396 397my %to_properties = ( 398 FOLD => 'Case_Folding', 399 LOWER => 'Lowercase_Mapping', 400 TITLE => 'Titlecase_Mapping', 401 UPPER => 'Uppercase_Mapping', 402 ); 403 404$property_count = -1; 405foreach my $name (sort keys %to_properties) { 406 407 $property_count++; 408 next if $property_count % $num_test_files != $::TEST_CHUNK; 409 410 my $property = $to_properties{$name}; 411 my ($list_ref, $map_ref, $format, $missing) 412 = prop_invmap($property, ); 413 if (! $list_ref || ! $map_ref) { 414 fail("No inversion map found for $property"); 415 next; 416 } 417 if ($format !~ / ^ a l? $ /x) { 418 fail("Unexpected inversion map format ('$format') found for $property"); 419 next; 420 } 421 422 # Include all the Latin1 code points, plus 0x100. 423 my @code_points = (0 .. 256); 424 425 # Then include the next few multi-char folds above those from this 426 # property, and include the next few single folds as well 427 my $above_latins = 0; 428 my $multi_char = 0; 429 for my $i (0 .. @{$list_ref} - 1) { 430 my $range_start = $list_ref->[$i]; 431 next if $range_start < 257; 432 if (ref $map_ref->[$i] && $multi_char < 5) { 433 push @code_points, $range_start - 1 434 if $code_points[-1] != $range_start - 1; 435 push @code_points, $range_start; 436 $multi_char++; 437 } 438 elsif ($above_latins < 5) { 439 push @code_points, $range_start - 1 440 if $code_points[-1] != $range_start - 1; 441 push @code_points, $range_start; 442 $above_latins++; 443 } 444 last if $above_latins >= 5 && $multi_char >= 5; 445 } 446 447 # And finally one non-Unicode code point. 448 push @code_points, 0x110000; # Above Unicode, no prop should match 449 no warnings 'non_unicode'; 450 451 # $n is native; $u unicode. 452 for my $n (@code_points) { 453 my $u = utf8::native_to_unicode($n); 454 my $function = $name; 455 456 my $index = search_invlist(\@{$list_ref}, $n); 457 458 my $ret; 459 my $char_name = get_charname($n); 460 my $display_name = sprintf "\\N{U+%02X, %s}", $n, $char_name; 461 462 foreach my $suffix ("", "_L1", "_LC") { 463 464 # This is the only macro defined for L1 465 next if $suffix eq "_L1" && $function ne "LOWER"; 466 467 SKIP: 468 foreach my $locale ("", $base_locale, $utf8_locale) { 469 470 # titlecase is not defined in locales. 471 next if $name eq 'TITLE' && $suffix eq "_LC"; 472 473 my ($display_locale, $locale_is_utf8) 474 = get_display_locale_or_skip($locale, $suffix); 475 next unless defined $display_locale; 476 477 skip("to${name}_LC does not work for LATIN SMALL LETTER SHARP S" 478 . "$display_locale", 1) 479 if $u == 0xDF && $name =~ / FOLD | UPPER /x 480 && $suffix eq "_LC" && $locale_is_utf8; 481 482 use if $locale, "locale"; 483 POSIX::setlocale( &POSIX::LC_ALL, $locale) if $locale; 484 485 my $display_call = "to${function}$suffix(" 486 . " $display_name )$display_locale"; 487 $ret = eval "test_to${function}$suffix($n)"; 488 if (is ($@, "", "$display_call didn't give error")) { 489 my $should_be; 490 if ($n > 255) { 491 $should_be = $n; 492 } 493 elsif ( $u > 127 494 && ( $suffix eq "" 495 || ($suffix eq "_LC" && ! $locale_is_utf8))) 496 { 497 $should_be = $n; 498 } 499 elsif ($map_ref->[$index] != $missing) { 500 $should_be = $map_ref->[$index] + $n - $list_ref->[$index] 501 } 502 else { 503 $should_be = $n; 504 } 505 506 is ($ret, $should_be, 507 sprintf("${tab}And correctly returned 0x%02X", 508 $should_be)); 509 } 510 } 511 } 512 513 # The _uni, uvchr, and _utf8 functions return both the ordinal of the 514 # first code point of the result, and the result in utf8. The .xs 515 # tests return these in an array, in [0] and [1] respectively, with 516 # [2] the length of the utf8 in bytes. 517 my $utf8_should_be = ""; 518 my $first_ord_should_be; 519 if (ref $map_ref->[$index]) { # A multi-char result 520 for my $n (0 .. @{$map_ref->[$index]} - 1) { 521 $utf8_should_be .= chr $map_ref->[$index][$n]; 522 } 523 524 $first_ord_should_be = $map_ref->[$index][0]; 525 } 526 else { # A single-char result 527 $first_ord_should_be = ($map_ref->[$index] != $missing) 528 ? $map_ref->[$index] + $n 529 - $list_ref->[$index] 530 : $n; 531 $utf8_should_be = chr $first_ord_should_be; 532 } 533 utf8::upgrade($utf8_should_be); 534 535 # Test _uni, uvchr 536 foreach my $suffix ('_uni', '_uvchr') { 537 my $s; 538 my $len; 539 my $display_call = "to${function}$suffix( $display_name )"; 540 $ret = eval "test_to${function}$suffix($n)"; 541 if (is ($@, "", "$display_call didn't give error")) { 542 is ($ret->[0], $first_ord_should_be, 543 sprintf("${tab}And correctly returned 0x%02X", 544 $first_ord_should_be)); 545 is ($ret->[1], $utf8_should_be, "${tab}Got correct utf8"); 546 use bytes; 547 is ($ret->[2], length $utf8_should_be, 548 "${tab}Got correct number of bytes for utf8 length"); 549 } 550 } 551 552 # Test _utf8 553 my $char = chr($n); 554 utf8::upgrade($char); 555 $char = quotemeta $char if $char eq '\\' || $char eq "'"; 556 foreach my $utf8_param("_safe", 557 "_safe, malformed", 558 "deprecated unsafe", 559 "deprecated mathoms", 560 ) 561 { 562 use Config; 563 next if $utf8_param eq 'deprecated mathoms' 564 && $Config{'ccflags'} =~ /-DNO_MATHOMS/; 565 566 my $utf8_param_code = $utf8_param_code{$utf8_param}; 567 my $expect_error = $utf8_param_code > 0; 568 569 # Skip if can't malform (because is a UTF-8 invariant) 570 next if $expect_error && $u < ((ord "A" == 65) ? 128 : 160); 571 572 my $display_call = "to${function}_utf8($display_name, $utf8_param )"; 573 $ret = eval "test_to${function}_utf8('$char', $utf8_param_code)"; 574 if ($expect_error) { 575 isnt ($@, "", "expected and got error in $display_call"); 576 like($@, qr/Malformed UTF-8 character/, 577 "${tab}And got expected message"); 578 undef @warnings; 579 } 580 elsif (is ($@, "", "$display_call didn't give error")) { 581 is ($ret->[0], $first_ord_should_be, 582 sprintf("${tab}And correctly returned 0x%02X", 583 $first_ord_should_be)); 584 is ($ret->[1], $utf8_should_be, "${tab}Got correct utf8"); 585 use bytes; 586 is ($ret->[2], length $utf8_should_be, 587 "${tab}Got correct number of bytes for utf8 length"); 588 if ($utf8_param_code < 0) { 589 my $warnings_ok; 590 if (! $seen{"${function}_utf8$utf8_param"}++) { 591 $warnings_ok = is(@warnings, 1, 592 "${tab}Got a single warning"); 593 if ($warnings_ok) { 594 my $expected; 595 if ($utf8_param_code == -2) { 596 my $lc_func = lc $function; 597 $expected 598 = qr/starting in Perl .* to_utf8_$lc_func\(\) will be removed/; 599 } 600 else { 601 $expected 602 = qr/starting in Perl .* will require an additional parameter/; 603 } 604 $warnings_ok = like($warnings[0], $expected, 605 "${tab}Got expected deprecation warning"); 606 } 607 } 608 else { 609 $warnings_ok = is(@warnings, 0, 610 "${tab}Deprecation warned only the one time"); 611 } 612 $warnings_ok or diag("@warnings"); 613 undef @warnings; 614 } 615 } 616 } 617 } 618} 619 620# This is primarily to make sure that no non-Unicode warnings get generated 621is(scalar @warnings, 0, "No unexpected warnings were generated in the tests") 622 or diag @warnings; 623 624done_testing; 625