1#!./perl 2# 3# Tests that have to do with checking whether characters have (or not have) 4# certain Unicode properties; belong (or not belong) to blocks, scripts, etc. 5# including user-defined properties 6# 7 8use strict; 9use warnings; 10use v5.16; 11use utf8; 12 13# To verify that messages containing the expansions work on UTF-8 14my $utf8_comment; 15 16my @warnings; 17local $SIG {__WARN__} = sub {push @warnings, "@_"}; 18 19BEGIN { 20 chdir 't' if -d 't'; 21 require './test.pl'; 22 skip_all_if_miniperl("no dynamic loading on miniperl, no File::Spec (used by charnames)"); 23} 24 25sub run_tests; 26 27sub get_str_name($) { 28 my $char = shift; 29 30 my ($str, $name); 31 32 if ($char =~ /^\\/) { 33 $str = eval qq ["$char"]; 34 $name = qq ["$char"]; 35 } 36 elsif ($char =~ /^0x([0-9A-Fa-f]+)$/) { 37 $str = chr hex $1; 38 $name = "chr ($char)"; 39 } 40 else { 41 $str = $char; 42 $name = qq ["$char"]; 43 } 44 45 return ($str, $name); 46} 47 48# 49# This is the data to test. 50# 51# This is a hash; keys are the property to test. 52# Values are arrays containing characters to test. The characters can 53# have the following formats: 54# '\N{CHARACTER NAME}' - Use character with that name 55# '\x{1234}' - Use character with that hex escape 56# '0x1234' - Use chr() to get that character 57# "a" - Character to use 58# 59# If a character entry starts with ! the character does not belong to the class 60# 61# If the class is just single letter, we use both \pL and \p{L} 62# 63 64use charnames ':full'; 65 66my @CLASSES = ( 67 L => ["a", "A"], 68 Ll => ["b", "!B"], 69 Lu => ["!c", "C"], 70 IsLl => ["d", "!D"], 71 IsLu => ["!e", "E"], 72 LC => ["f", "!1"], 73 'L&' => ["g", "!2"], 74 'Lowercase Letter' => ["h", "!H"], 75 76 Common => ["!i", "3"], 77 Inherited => ["!j", '\x{300}'], 78 79 InBasicLatin => ['\N{LATIN CAPITAL LETTER A}'], 80 InLatin1Supplement => ['\N{LATIN CAPITAL LETTER A WITH GRAVE}'], 81 InLatinExtendedA => ['\N{LATIN CAPITAL LETTER A WITH MACRON}'], 82 InLatinExtendedB => ['\N{LATIN SMALL LETTER B WITH STROKE}'], 83 InKatakana => ['\N{KATAKANA LETTER SMALL A}'], 84 IsLatin => ["0x100", "0x212b"], 85 IsHebrew => ["0x5d0", "0xfb4f"], 86 IsGreek => ["0x37a", "0x386", "!0x387", "0x388", 87 "0x38a", "!0x38b", "0x38c"], 88 HangulSyllables => ['\x{AC00}'], 89 'Script=Latin' => ['\x{0100}'], 90 'Block=LatinExtendedA' => ['\x{0100}'], 91 'Category=UppercaseLetter' => ['\x{0100}'], 92 93 # 94 # It's ok to repeat class names. 95 # 96 InLatin1Supplement => 97 ['!\N{U+7f}', '\N{U+80}', '\N{U+ff}', '!\x{100}'], 98 InLatinExtendedA => 99 ['!\N{U+7f}', '!\N{U+80}', '!\N{U+ff}', '\x{100}'], 100 101 # 102 # Properties are case-insensitive, and may have whitespace, 103 # dashes and underscores. 104 # 105 'in-latin1_SUPPLEMENT' => ['\N{U+80}', 106 '\N{LATIN SMALL LETTER Y WITH DIAERESIS}'], 107 ' ^ In Latin 1 Supplement ' 108 => ['!\N{U+80}', '\N{COFFIN}'], 109 'latin-1 supplement' => ['\N{U+80}', "0xDF"], 110 111); 112 113my @USER_DEFINED_PROPERTIES; 114my @USER_CASELESS_PROPERTIES; 115my @USER_ERROR_PROPERTIES; 116my @DEFERRED; 117my $overflow; 118BEGIN { 119 $utf8_comment = "#\N{U+30CD}"; 120 121 use Config; 122 $overflow = $Config{uvsize} < 8 ? "80000000" : "80000000000000000"; 123 124 # We defined these at compile time, so that the subroutines that they 125 # refer to aren't known, so that we can test properties not known until 126 # runtime 127 128 @USER_DEFINED_PROPERTIES = ( 129 # 130 # User defined properties 131 # 132 InKana1 => ['\x{3040}', '!\x{303F}'], 133 InKana2 => ['\x{3040}', '!\x{303F}'], 134 InKana3 => ['\x{3041}', '!\x{3040}'], 135 InNotKana => ['\x{3040}', '!\x{3041}'], 136 InConsonant => ['d', '!e'], 137 IsSyriac1 => ['\x{0712}', '!\x{072F}'], 138 IsSyriac1KanaMark => ['\x{309A}', '!\x{3090}'], 139 IsSyriac1KanaMark => ['\x{0730}', '!\x{0712}'], 140 '# User-defined character properties may lack \n at the end', 141 InGreekSmall => ['\N{GREEK SMALL LETTER PI}', 142 '\N{GREEK SMALL LETTER FINAL SIGMA}'], 143 InGreekCapital => ['\N{GREEK CAPITAL LETTER PI}', '!\x{03A2}'], 144 Dash => ['-'], 145 ASCII_Hex_Digit => ['!-', 'A'], 146 IsAsciiHexAndDash => ['-', 'A'], 147 InLatin1 => ['\x{0100}', '!\x{00FF}'], 148 ); 149 150 @USER_CASELESS_PROPERTIES = ( 151 # 152 # User defined properties which differ depending on /i. Second entry 153 # is false normally, true under /i 154 # 155 'IsMyUpper' => ["M", "!m" ], 156 'pkg1::pkg2::IsMyLower' => ["a", "!A" ], 157 ); 158 159 @USER_ERROR_PROPERTIES = ( 160 'IsOverflow' => qr/Code point too large in (?# 161 )"0\t$overflow$utf8_comment" in expansion of (?# 162 )main::IsOverflow/, 163 'InRecursedA' => qr/Infinite recursion in user-defined property (?# 164 )"main::InRecursedA" in expansion of (?# 165 )main::InRecursedC in expansion of (?# 166 )main::InRecursedB in expansion of (?# 167 )main::InRecursedA/, 168 'IsRangeReversed' => qr/Illegal range in "200 100$utf8_comment" in (?# 169 )expansion of main::IsRangeReversed/, 170 'IsNonHex' => qr/Can't find Unicode property definition (?# 171 )"BEEF CAGED" in expansion of main::IsNonHex/, 172 173 # Could have \n, hence /s 174 'IsDeath' => qr/Died.* in expansion of main::IsDeath/s, 175 ); 176 177 # Now create a list of properties whose definitions won't be known at 178 # runtime. The qr// below thus will have forward references to them, and 179 # when matched at runtime will not know what's in the property definition 180 my @DEFERRABLE_USER_DEFINED_PROPERTIES; 181 push @DEFERRABLE_USER_DEFINED_PROPERTIES, @USER_DEFINED_PROPERTIES; 182 push @DEFERRABLE_USER_DEFINED_PROPERTIES, @USER_CASELESS_PROPERTIES; 183 unshift @DEFERRABLE_USER_DEFINED_PROPERTIES, @USER_ERROR_PROPERTIES; 184 for (my $i = 0; $i < @DEFERRABLE_USER_DEFINED_PROPERTIES; $i+=2) { 185 my $property = $DEFERRABLE_USER_DEFINED_PROPERTIES[$i]; 186 if ($property =~ / ^ \# /x) { 187 $i++; 188 redo; 189 } 190 191 # Only do this for the properties in the list that are user-defined 192 next if ($property !~ / ( ^ | :: ) I[ns] /x); 193 194 push @DEFERRED, qr/\p{$property}/, 195 $DEFERRABLE_USER_DEFINED_PROPERTIES[$i+1]; 196 } 197} 198 199# 200# From the short properties we populate POSIX-like classes. 201# 202my %SHORT_PROPERTIES = ( 203 'Ll' => ['m', '\N{CYRILLIC SMALL LETTER A}'], 204 'Lu' => ['M', '\N{GREEK CAPITAL LETTER ALPHA}'], 205 'Lo' => ['\N{HIRAGANA LETTER SMALL A}'], 206 # is also in other alphabetic 207 'Mn' => ['\N{HEBREW POINT RAFE}'], 208 'Nd' => ["0", '\N{ARABIC-INDIC DIGIT ZERO}'], 209 'Pc' => ["_"], 210 'Po' => ["!"], 211 'Zs' => [" "], 212 'Cc' => ['\x{00}'], 213); 214 215# 216# Illegal properties 217# 218my @ILLEGAL_PROPERTIES = 219 qw[q qrst f foo isfoo infoo ISfoo INfoo Is::foo In::foo]; 220 221my %d; 222 223while (my ($class, $chars) = each %SHORT_PROPERTIES) { 224 push @{$d {IsAlpha}} => map {$class =~ /^[LM]/ ? $_ : "!$_"} @$chars; 225 push @{$d {IsAlnum}} => map {$class =~ /^[LMN]./ ? $_ : "!$_"} @$chars; 226 push @{$d {IsASCII}} => map {length ($_) == 1 || $_ eq '\x{00}' 227 ? $_ : "!$_"} @$chars; 228 push @{$d {IsCntrl}} => map {$class =~ /^C/ ? $_ : "!$_"} @$chars; 229 push @{$d {IsBlank}} => map {$class =~ /^Z[lps]/ ? $_ : "!$_"} @$chars; 230 push @{$d {IsDigit}} => map {$class =~ /^Nd$/ ? $_ : "!$_"} @$chars; 231 push @{$d {IsGraph}} => map {$class =~ /^([LMNPS]|Co)/ 232 ? $_ : "!$_"} @$chars; 233 push @{$d {IsPrint}} => map {$class =~ /^([LMNPS]|Co|Zs)/ 234 ? $_ : "!$_"} @$chars; 235 push @{$d {IsLower}} => map {$class =~ /^Ll$/ ? $_ : "!$_"} @$chars; 236 push @{$d {IsUpper}} => map {$class =~ /^L[ut]/ ? $_ : "!$_"} @$chars; 237 push @{$d {IsPunct}} => map {$class =~ /^P/ ? $_ : "!$_"} @$chars; 238 push @{$d {IsWord}} => map {$class =~ /^[LMN]/ || $_ eq "_" 239 ? $_ : "!$_"} @$chars; 240 push @{$d {IsSpace}} => map {$class =~ /^Z/ || 241 length ($_) == 1 && utf8::native_to_unicode(ord ($_)) >= 0x09 242 && utf8::native_to_unicode(ord ($_)) <= 0x0D 243 ? $_ : "!$_"} @$chars; 244} 245 246push @CLASSES => "# Short properties" => %SHORT_PROPERTIES, 247 "# POSIX like properties" => %d, 248 "# User defined properties" => @USER_DEFINED_PROPERTIES; 249 250 251# 252# Calculate the number of tests. 253# 254my $count = 0; 255for (my $i = 0; $i < @CLASSES; $i += 2) { 256 $i ++, redo if $CLASSES [$i] =~ /^\h*#\h*(.*)/; 257 $count += 2 * (length $CLASSES [$i] == 1 ? 4 : 2) * @{$CLASSES [$i + 1]}; 258} 259$count += 4 * @ILLEGAL_PROPERTIES; 260$count += 4 * grep {length $_ == 1} @ILLEGAL_PROPERTIES; 261$count += 8 * @USER_CASELESS_PROPERTIES; 262$count += 1 * (@DEFERRED - @USER_ERROR_PROPERTIES) / 2; 263$count += 1 * @USER_ERROR_PROPERTIES; 264$count += 1; # one bad apple 265$count += 1; # No warnings generated 266 267plan(tests => $count); 268 269run_tests unless caller (); 270 271sub match { 272 my ($char, $match, $nomatch, $caseless) = @_; 273 $caseless = "" unless defined $caseless; 274 $caseless = 'i' if $caseless; 275 276 my ($str, $name) = get_str_name($char); 277 278 undef $@; 279 my $pat = "qr/$match/$caseless"; 280 my $match_pat = eval $pat; 281 if (is($@, '', "$pat compiled correctly to a regexp: $@")) { 282 like($str, $match_pat, "$name correctly matched"); 283 } 284 285 undef $@; 286 $pat = "qr/$nomatch/$caseless"; 287 my $nomatch_pat = eval $pat; 288 if (is($@, '', "$pat compiled correctly to a regexp: $@")) { 289 unlike($str, $nomatch_pat, "$name correctly did not match"); 290 } 291} 292 293sub run_tests { 294 295 for (my $i = 0; $i < @DEFERRED; $i+=2) { 296 if (ref $DEFERRED[$i+1] eq 'ARRAY') { 297 my ($str, $name) = get_str_name($DEFERRED[$i+1][0]); 298 like($str, $DEFERRED[$i], 299 "$name correctly matched $DEFERRED[$i] (defn. not known until runtime)"); 300 } 301 else { # Single entry rhs indicates a property that is an error 302 undef $@; 303 304 # Using block eval causes the pattern to not be recompiled, so it 305 # retains its deferred status until this is executed. 306 eval { 'A' =~ $DEFERRED[$i] }; 307 like($@, $DEFERRED[$i+1], 308 "$DEFERRED[$i] gave correct failure message (defn. not known until runtime)"); 309 } 310 } 311 312 while (@CLASSES) { 313 my $class = shift @CLASSES; 314 if ($class =~ /^\h*#\h*(.*)/) { 315 print "# $1\n"; 316 next; 317 } 318 last unless @CLASSES; 319 my $chars = shift @CLASSES; 320 my @in = grep {!/^!./} @$chars; 321 my @out = map {s/^!(?=.)//; $_} grep { /^!./} @$chars; 322 my $in_pat = eval qq ['\\p{$class}']; 323 my $out_pat = eval qq ['\\P{$class}']; 324 325 match $_, $in_pat, $out_pat for @in; 326 match $_, $out_pat, $in_pat for @out; 327 328 if (1 == length $class) { # Repeat without braces if name length 1 329 my $in_pat = eval qq ['\\p$class']; 330 my $out_pat = eval qq ['\\P$class']; 331 332 match $_, $in_pat, $out_pat for @in; 333 match $_, $out_pat, $in_pat for @out; 334 } 335 } 336 337 338 print "# Illegal properties\n"; 339 foreach my $p (@ILLEGAL_PROPERTIES) { 340 my $pat; 341 if ($p =~ /::/) { 342 $pat = qr /^Illegal user-defined property name/; 343 } 344 else { 345 $pat = qr /^Can't find Unicode property definition/; 346 } 347 348 undef $@; 349 my $r = eval "'a' =~ /\\p{$p}/; 1"; 350 is($r, undef, "Unknown Unicode property \\p{$p}"); 351 like($@, $pat, "Unknown Unicode property \\p{$p}"); 352 undef $@; 353 my $s = eval "'a' =~ /\\P{$p}/; 1"; 354 is($s, undef, "Unknown Unicode property \\p{$p}"); 355 like($@, $pat, "Unknown Unicode property \\p{$p}"); 356 if (length $p == 1) { 357 undef $@; 358 my $r = eval "'a' =~ /\\p$p/; 1"; 359 is($r, undef, "Unknown Unicode property \\p$p"); 360 like($@, $pat, "Unknown Unicode property \\p$p"); 361 undef $@; 362 my $s = eval "'a' =~ /\\P$p/; 1"; 363 is($r, undef, "Unknown Unicode property \\P$p"); 364 like($@, $pat, "Unknown Unicode property \\P$p"); 365 } 366 } 367 368 print "# User-defined properties with /i differences\n"; 369 while (my $class = shift @USER_CASELESS_PROPERTIES) { 370 my $chars_ref = shift @USER_CASELESS_PROPERTIES; 371 my @in = grep {!/^!./} @$chars_ref; 372 my @out = map {s/^!(?=.)//; $_} grep { /^!./} @$chars_ref; 373 my $in_pat = eval qq ['\\p{$class}']; 374 my $out_pat = eval qq ['\\P{$class}']; 375 376 # Verify that adding /i does change the out set to match. 377 match $_, $in_pat, $out_pat, 'i' for @out; 378 379 # Verify that adding /i doesn't change the in set. 380 match $_, $in_pat, $out_pat, 'i' for @in; 381 382 # Verify works as regularly for not /i 383 match $_, $in_pat, $out_pat for @in; 384 match $_, $out_pat, $in_pat for @out; 385 } 386 387 print "# User-defined properties with errors in their definition\n"; 388 while (my $error_property = shift @USER_ERROR_PROPERTIES) { 389 my $error_re = shift @USER_ERROR_PROPERTIES; 390 391 undef $@; 392 eval { 'A' =~ /\p{$error_property}/; }; 393 like($@, $error_re, "$error_property gave correct failure message"); 394 } 395} 396 397 398# 399# User defined properties 400# 401 402sub InKana1 {<<'--'} 4033040 309F # A comment; next line has trailing spaces 40430A0 30FF 405-- 406 407sub InKana2 {<<'--'} 408+utf8::InHiragana 409+utf8::InKatakana 410-- 411 412sub InKana3 {<<'--'} 413# First line comment 414+utf8::InHiragana 415# Full line comment 416+utf8::InKatakana 417-utf8::IsCn 418-- 419 420sub InNotKana {<<'--'} 421!utf8::InHiragana # A comment; next line has trailing spaces 422-utf8::InKatakana 423+utf8::IsCn 424# Final line comment 425-- 426 427sub InConsonant { 428 429 my $return = "+utf8::Lowercase\n&utf8::ASCII\n"; 430 $return .= sprintf("-%X\n", ord "a"); 431 $return .= sprintf("-%X\n", ord "e"); 432 $return .= sprintf("-%X\n", ord "i"); 433 $return .= sprintf("-%X\n", ord "o"); 434 $return .= sprintf("-%X\n", ord "u"); 435 return $return; 436} 437 438sub IsSyriac1 {<<'--'} 4390712 072C 4400730 074A 441-- 442 443sub InRecursedA { 444 return "+main::InRecursedB\n"; 445} 446 447sub InRecursedB { 448 return "+main::InRecursedC\n"; 449} 450 451sub InRecursedC { 452 return "+main::InRecursedA\n"; 453} 454 455sub InGreekSmall {return "03B1\t03C9"} 456sub InGreekCapital {return "0391\t03A9\n-03A2"} 457 458sub IsAsciiHexAndDash {<<'--'} 459+utf8::ASCII_Hex_Digit 460+utf8::Dash 461-- 462 463sub InLatin1 { 464 return "0100\t10FFFF"; 465} 466 467sub IsMyUpper { 468 use feature 'state'; 469 470 state $cased_count = 0; 471 state $caseless_count = 0; 472 my $ret= "+utf8::"; 473 474 my $caseless = shift; 475 if($caseless) { 476 die "Called twice" if $caseless_count; 477 $caseless_count++; 478 $ret .= 'Alphabetic' 479 } 480 else { 481 die "Called twice" if $cased_count; 482 $cased_count++; 483 $ret .= 'Uppercase'; 484 } 485 486 return $ret . "\n&utf8::ASCII"; 487} 488 489sub pkg1::pkg2::IsMyLower { 490 my $caseless = shift; 491 return "+utf8::" 492 . (($caseless) 493 ? 'Alphabetic' 494 : 'Lowercase') 495 . "\n&utf8::ASCII"; 496} 497 498sub IsRangeReversed { 499 return "200 100$utf8_comment"; 500} 501 502sub IsNonHex { 503 return "BEEF CAGED$utf8_comment"; 504} 505 506sub IsDeath { 507 die; 508} 509 510# Verify that can use user-defined properties inside another one 511sub IsSyriac1KanaMark {<<'--'} 512+main::IsSyriac1 513+main::InKana3 514&utf8::IsMark 515-- 516 517# fake user-defined properties; these subs shouldn't be called, because 518# their names don't start with In or Is 519 520sub f { die } 521sub foo { die } 522sub isfoo { die } 523sub infoo { die } 524sub ISfoo { die } 525sub INfoo { die } 526sub Is::foo { die } 527sub In::foo { die } 528 529sub IsOverflow { 530 return "0\t$overflow$utf8_comment"; 531} 532 533fresh_perl_like(<<'EOP', qr/Can't find Unicode property definition "F000\\tF010" in expansion of InOneBadApple/, {}, "Just one component bad"); 534# Extra backslash converts tab to backslash-t 535sub InOneBadApple { return "0100\t0110\n10000\t10010\nF000\\tF010\n0400\t0410" } 536qr/\p{InOneBadApple}/; 537EOP 538 539if (! is(@warnings, 0, "No warnings were generated")) { 540 diag join "\n", @warnings, "\n"; 541} 542 5431; 544__END__ 545