1 2BEGIN { 3 unless ('A' eq pack('U', 0x41)) { 4 print "1..0 # Unicode::Normalize cannot pack a Unicode code point\n"; 5 exit 0; 6 } 7 unless (0x41 == unpack('U', 'A')) { 8 print "1..0 # Unicode::Normalize cannot get a Unicode code point\n"; 9 exit 0; 10 } 11} 12 13BEGIN { 14 if ($ENV{PERL_CORE}) { 15 chdir('t') if -d 't'; 16 @INC = $^O eq 'MacOS' ? qw(::lib) : qw(../lib); 17 } 18} 19 20######################### 21 22use strict; 23use warnings; 24BEGIN { $| = 1; print "1..217\n"; } 25my $count = 0; 26sub ok ($;$) { 27 my $p = my $r = shift; 28 if (@_) { 29 my $x = shift; 30 $p = !defined $x ? !defined $r : !defined $r ? 0 : $r eq $x; 31 } 32 print $p ? "ok" : "not ok", ' ', ++$count, "\n"; 33} 34 35use Unicode::Normalize qw(:all); 36 37ok(1); 38 39sub _pack_U { Unicode::Normalize::pack_U(@_) } 40sub hexU { _pack_U map hex, split ' ', shift } 41 42# This won't work on EBCDIC platforms prior to v5.8.0, which is when this 43# translation function was defined 44*to_native = (defined &utf8::unicode_to_native) 45 ? \&utf8::unicode_to_native 46 : sub { return shift }; 47 48######################### 49 50ok(getCombinClass( to_native(0)), 0); 51ok(getCombinClass(to_native(41)), 0); 52ok(getCombinClass(to_native(65)), 0); 53ok(getCombinClass( 768), 230); 54ok(getCombinClass(1809), 36); 55 56ok(getCanon(to_native( 0)), undef); 57ok(getCanon(to_native(0x29)), undef); 58ok(getCanon(to_native(0x41)), undef); 59ok(getCanon(to_native(0x00C0)), _pack_U(0x0041, 0x0300)); 60ok(getCanon(to_native(0x00EF)), _pack_U(0x0069, 0x0308)); 61ok(getCanon(0x304C), _pack_U(0x304B, 0x3099)); 62ok(getCanon(0x1EA4), _pack_U(0x0041, 0x0302, 0x0301)); 63ok(getCanon(0x1F82), _pack_U(0x03B1, 0x0313, 0x0300, 0x0345)); 64ok(getCanon(0x1FAF), _pack_U(0x03A9, 0x0314, 0x0342, 0x0345)); 65ok(getCanon(0xAC00), _pack_U(0x1100, 0x1161)); 66ok(getCanon(0xAE00), _pack_U(0x1100, 0x1173, 0x11AF)); 67ok(getCanon(0x212C), undef); 68ok(getCanon(0x3243), undef); 69ok(getCanon(0xFA2D), _pack_U(0x9DB4)); 70 71# 20 72 73ok(getCompat(to_native( 0)), undef); 74ok(getCompat(to_native(0x29)), undef); 75ok(getCompat(to_native(0x41)), undef); 76ok(getCompat(to_native(0x00C0)), _pack_U(0x0041, 0x0300)); 77ok(getCompat(to_native(0x00EF)), _pack_U(0x0069, 0x0308)); 78ok(getCompat(0x304C), _pack_U(0x304B, 0x3099)); 79ok(getCompat(0x1EA4), _pack_U(0x0041, 0x0302, 0x0301)); 80ok(getCompat(0x1F82), _pack_U(0x03B1, 0x0313, 0x0300, 0x0345)); 81ok(getCompat(0x1FAF), _pack_U(0x03A9, 0x0314, 0x0342, 0x0345)); 82ok(getCompat(0x212C), _pack_U(0x0042)); 83ok(getCompat(0x3243), _pack_U(0x0028, 0x81F3, 0x0029)); 84ok(getCompat(0xAC00), _pack_U(0x1100, 0x1161)); 85ok(getCompat(0xAE00), _pack_U(0x1100, 0x1173, 0x11AF)); 86ok(getCompat(0xFA2D), _pack_U(0x9DB4)); 87 88# 34 89 90ok(getComposite(to_native( 0), to_native( 0)), undef); 91ok(getComposite(to_native( 0), to_native(0x29)), undef); 92ok(getComposite(to_native(0x29), to_native( 0)), undef); 93ok(getComposite(to_native(0x29), to_native(0x29)), undef); 94ok(getComposite(to_native( 0), to_native(0x41)), undef); 95ok(getComposite(to_native(0x41), to_native( 0)), undef); 96ok(getComposite(to_native(0x41), to_native(0x41)), undef); 97ok(getComposite(to_native(12), to_native(0x0300)), undef); 98ok(getComposite(to_native(0x0055), 0xFF00), undef); 99ok(getComposite(to_native(0x0041), 0x0300), to_native(0x00C0)); 100ok(getComposite(to_native(0x0055), 0x0300), to_native(0x00D9)); 101ok(getComposite(0x0112, 0x0300), 0x1E14); 102ok(getComposite(0x1100, 0x1161), 0xAC00); 103ok(getComposite(0x1100, 0x1173), 0xADF8); 104ok(getComposite(0x1100, 0x11AF), undef); 105ok(getComposite(0x1173, 0x11AF), undef); 106ok(getComposite(0xAC00, 0x11A7), undef); 107ok(getComposite(0xAC00, 0x11A8), 0xAC01); 108ok(getComposite(0xADF8, 0x11AF), 0xAE00); 109 110# 53 111 112sub uprops { 113 my $uv = shift; 114 my $r = ""; 115 $r .= isExclusion($uv) ? 'X' : 'x'; 116 $r .= isSingleton($uv) ? 'S' : 's'; 117 $r .= isNonStDecomp($uv) ? 'N' : 'n'; # Non-Starter Decomposition 118 $r .= isComp_Ex($uv) ? 'F' : 'f'; # Full exclusion (X + S + N) 119 $r .= isComp2nd($uv) ? 'B' : 'b'; # B = M = Y 120 $r .= isNFD_NO($uv) ? 'D' : 'd'; 121 $r .= isNFC_MAYBE($uv) ? 'M' : 'm'; # Maybe 122 $r .= isNFC_NO($uv) ? 'C' : 'c'; 123 $r .= isNFKD_NO($uv) ? 'K' : 'k'; 124 $r .= isNFKC_MAYBE($uv) ? 'Y' : 'y'; # maYbe 125 $r .= isNFKC_NO($uv) ? 'G' : 'g'; 126 return $r; 127} 128 129ok(uprops(to_native(0x0000)), 'xsnfbdmckyg'); # NULL 130ok(uprops(to_native(0x0029)), 'xsnfbdmckyg'); # RIGHT PARENTHESIS 131ok(uprops(to_native(0x0041)), 'xsnfbdmckyg'); # LATIN CAPITAL LETTER A 132ok(uprops(to_native(0x00A0)), 'xsnfbdmcKyG'); # NO-BREAK SPACE 133ok(uprops(to_native(0x00C0)), 'xsnfbDmcKyg'); # LATIN CAPITAL LETTER A WITH GRAVE 134ok(uprops(0x0300), 'xsnfBdMckYg'); # COMBINING GRAVE ACCENT 135ok(uprops(0x0344), 'xsNFbDmCKyG'); # COMBINING GREEK DIALYTIKA TONOS 136ok(uprops(0x0387), 'xSnFbDmCKyG'); # GREEK ANO TELEIA 137ok(uprops(0x0958), 'XsnFbDmCKyG'); # DEVANAGARI LETTER QA 138ok(uprops(0x0F43), 'XsnFbDmCKyG'); # TIBETAN LETTER GHA 139ok(uprops(0x1100), 'xsnfbdmckyg'); # HANGUL CHOSEONG KIYEOK 140ok(uprops(0x1161), 'xsnfBdMckYg'); # HANGUL JUNGSEONG A 141ok(uprops(0x11AF), 'xsnfBdMckYg'); # HANGUL JONGSEONG RIEUL 142ok(uprops(0x212B), 'xSnFbDmCKyG'); # ANGSTROM SIGN 143ok(uprops(0xAC00), 'xsnfbDmcKyg'); # HANGUL SYLLABLE GA 144ok(uprops(0xF900), 'xSnFbDmCKyG'); # CJK COMPATIBILITY IDEOGRAPH-F900 145ok(uprops(0xFB4E), 'XsnFbDmCKyG'); # HEBREW LETTER PE WITH RAFE 146ok(uprops(0xFF71), 'xsnfbdmcKyG'); # HALFWIDTH KATAKANA LETTER A 147 148# 71 149 150ok(decompose(""), ""); 151ok(decompose("A"), "A"); 152ok(decompose("", 1), ""); 153ok(decompose("A", 1), "A"); 154 155ok(decompose(hexU("1E14 AC01")), hexU("0045 0304 0300 1100 1161 11A8")); 156ok(decompose(hexU("AC00 AE00")), hexU("1100 1161 1100 1173 11AF")); 157ok(decompose(hexU("304C FF76")), hexU("304B 3099 FF76")); 158 159ok(decompose(hexU("1E14 AC01"), 1), hexU("0045 0304 0300 1100 1161 11A8")); 160ok(decompose(hexU("AC00 AE00"), 1), hexU("1100 1161 1100 1173 11AF")); 161ok(decompose(hexU("304C FF76"), 1), hexU("304B 3099 30AB")); 162 163# don't modify the source 164my $sDec = "\x{FA19}"; 165ok(decompose($sDec), "\x{795E}"); 166ok($sDec, "\x{FA19}"); 167 168# 83 169 170ok(reorder(""), ""); 171ok(reorder("A"), "A"); 172ok(reorder(hexU("0041 0300 0315 0313 031b 0061")), 173 hexU("0041 031b 0300 0313 0315 0061")); 174ok(reorder(hexU("00C1 0300 0315 0313 031b 0061 309A 3099")), 175 hexU("00C1 031b 0300 0313 0315 0061 309A 3099")); 176 177# don't modify the source 178my $sReord = "\x{3000}\x{300}\x{31b}"; 179ok(reorder($sReord), "\x{3000}\x{31b}\x{300}"); 180ok($sReord, "\x{3000}\x{300}\x{31b}"); 181 182# 89 183 184ok(compose(""), ""); 185ok(compose("A"), "A"); 186ok(compose(hexU("0061 0300")), hexU("00E0")); 187ok(compose(hexU("0061 0300 031B")), hexU("00E0 031B")); 188ok(compose(hexU("0061 0300 0315")), hexU("00E0 0315")); 189ok(compose(hexU("0061 0300 0313")), hexU("00E0 0313")); 190ok(compose(hexU("0061 031B 0300")), hexU("00E0 031B")); 191ok(compose(hexU("0061 0315 0300")), hexU("0061 0315 0300")); 192ok(compose(hexU("0061 0313 0300")), hexU("0061 0313 0300")); 193 194# don't modify the source 195my $sCom = "\x{304B}\x{3099}"; 196ok(compose($sCom), "\x{304C}"); 197ok($sCom, "\x{304B}\x{3099}"); 198 199# 100 200 201ok(composeContiguous(""), ""); 202ok(composeContiguous("A"), "A"); 203ok(composeContiguous(hexU("0061 0300")), hexU("00E0")); 204ok(composeContiguous(hexU("0061 0300 031B")), hexU("00E0 031B")); 205ok(composeContiguous(hexU("0061 0300 0315")), hexU("00E0 0315")); 206ok(composeContiguous(hexU("0061 0300 0313")), hexU("00E0 0313")); 207ok(composeContiguous(hexU("0061 031B 0300")), hexU("0061 031B 0300")); 208ok(composeContiguous(hexU("0061 0315 0300")), hexU("0061 0315 0300")); 209ok(composeContiguous(hexU("0061 0313 0300")), hexU("0061 0313 0300")); 210 211# don't modify the source 212my $sCtg = "\x{30DB}\x{309A}"; 213ok(composeContiguous($sCtg), "\x{30DD}"); 214ok($sCtg, "\x{30DB}\x{309A}"); 215 216# 111 217 218sub answer { defined $_[0] ? $_[0] ? "YES" : "NO" : "MAYBE" } 219 220ok(answer(checkNFD("")), "YES"); 221ok(answer(checkNFC("")), "YES"); 222ok(answer(checkNFKD("")), "YES"); 223ok(answer(checkNFKC("")), "YES"); 224ok(answer(check("NFD", "")), "YES"); 225ok(answer(check("NFC", "")), "YES"); 226ok(answer(check("NFKD","")), "YES"); 227ok(answer(check("NFKC","")), "YES"); 228 229# U+0000 to U+007F are prenormalized in all the normalization forms. 230ok(answer(checkNFD("AZaz\t12!#`")), "YES"); 231ok(answer(checkNFC("AZaz\t12!#`")), "YES"); 232ok(answer(checkNFKD("AZaz\t12!#`")), "YES"); 233ok(answer(checkNFKC("AZaz\t12!#`")), "YES"); 234ok(answer(check("D", "AZaz\t12!#`")), "YES"); 235ok(answer(check("C", "AZaz\t12!#`")), "YES"); 236ok(answer(check("KD","AZaz\t12!#`")), "YES"); 237ok(answer(check("KC","AZaz\t12!#`")), "YES"); 238 239ok(answer(checkNFD(NFD(_pack_U(0xC1, 0x1100, 0x1173, 0x11AF)))), "YES"); 240ok(answer(checkNFD(hexU("20 C1 1100 1173 11AF"))), "NO"); 241ok(answer(checkNFC(hexU("20 C1 1173 11AF"))), "MAYBE"); 242ok(answer(checkNFC(hexU("20 C1 AE00 1100"))), "YES"); 243ok(answer(checkNFC(hexU("20 C1 AE00 1100 0300"))), "MAYBE"); 244ok(answer(checkNFC(hexU("212B 1100 0300"))), "NO"); 245ok(answer(checkNFC(hexU("1100 0300 212B"))), "NO"); 246ok(answer(checkNFC(hexU("0041 0327 030A"))), "MAYBE"); # A+cedilla+ring 247ok(answer(checkNFC(hexU("0041 030A 0327"))), "NO"); # A+ring+cedilla 248ok(answer(checkNFC(hexU("20 C1 FF71 2025"))),"YES"); 249ok(answer(check("NFC", hexU("20 C1 212B 300"))), "NO"); 250ok(answer(checkNFKD(hexU("20 C1 FF71 2025"))), "NO"); 251ok(answer(checkNFKC(hexU("20 C1 AE00 2025"))), "NO"); 252ok(answer(checkNFKC(hexU("212B 1100 0300"))), "NO"); 253ok(answer(checkNFKC(hexU("1100 0300 212B"))), "NO"); 254ok(answer(checkNFKC(hexU("0041 0327 030A"))), "MAYBE"); # A+cedilla+ring 255ok(answer(checkNFKC(hexU("0041 030A 0327"))), "NO"); # A+ring+cedilla 256ok(answer(check("NFKC", hexU("20 C1 212B 300"))), "NO"); 257 258# 145 259 260"012ABC" =~ /(\d+)(\w+)/; 261ok("012" eq NFC $1 && "ABC" eq NFC $2); 262 263ok(normalize('C', $1), "012"); 264ok(normalize('C', $2), "ABC"); 265 266ok(normalize('NFC', $1), "012"); 267ok(normalize('NFC', $2), "ABC"); 268 # s/^NF// in normalize() must not prevent using $1, $&, etc. 269 270# 150 271 272# a string with initial zero should be treated like a number 273 274# LATIN CAPITAL LETTER A WITH GRAVE 275ok(getCombinClass(sprintf("0%d", to_native(192))), 0); 276ok(getCanon (sprintf("0%d", to_native(192))), _pack_U(0x41, 0x300)); 277ok(getCompat(sprintf("0%d", to_native(192))), _pack_U(0x41, 0x300)); 278my $lead_zero = sprintf "0%d", to_native(65); 279ok(getComposite($lead_zero, "0768"), to_native(192)); 280ok(isNFD_NO (sprintf("0%d", to_native(192)))); 281ok(isNFKD_NO(sprintf("0%d", to_native(192)))); 282 283# DEVANAGARI LETTER QA 284ok(isExclusion("02392")); 285ok(isComp_Ex ("02392")); 286ok(isNFC_NO ("02392")); 287ok(isNFKC_NO ("02392")); 288ok(isNFD_NO ("02392")); 289ok(isNFKD_NO ("02392")); 290 291# ANGSTROM SIGN 292ok(isSingleton("08491")); 293ok(isComp_Ex ("08491")); 294ok(isNFC_NO ("08491")); 295ok(isNFKC_NO ("08491")); 296ok(isNFD_NO ("08491")); 297ok(isNFKD_NO ("08491")); 298 299# COMBINING GREEK DIALYTIKA TONOS 300ok(isNonStDecomp("0836")); 301ok(isComp_Ex ("0836")); 302ok(isNFC_NO ("0836")); 303ok(isNFKC_NO ("0836")); 304ok(isNFD_NO ("0836")); 305ok(isNFKD_NO ("0836")); 306 307# COMBINING GRAVE ACCENT 308ok(getCombinClass("0768"), 230); 309ok(isComp2nd ("0768")); 310ok(isNFC_MAYBE ("0768")); 311ok(isNFKC_MAYBE("0768")); 312 313# HANGUL SYLLABLE GA 314ok(getCombinClass("044032"), 0); 315ok(getCanon("044032"), _pack_U(0x1100, 0x1161)); 316ok(getCompat("044032"), _pack_U(0x1100, 0x1161)); 317ok(getComposite("04352", "04449"), 0xAC00); 318 319# 182 320 321# string with 22 combining characters: (0x300..0x315) 322my $str_cc22 = _pack_U(0x3041, 0x300..0x315, 0x3042); 323ok(decompose($str_cc22), $str_cc22); 324ok(reorder($str_cc22), $str_cc22); 325ok(compose($str_cc22), $str_cc22); 326ok(composeContiguous($str_cc22), $str_cc22); 327ok(NFD($str_cc22), $str_cc22); 328ok(NFC($str_cc22), $str_cc22); 329ok(NFKD($str_cc22), $str_cc22); 330ok(NFKC($str_cc22), $str_cc22); 331ok(FCD($str_cc22), $str_cc22); 332ok(FCC($str_cc22), $str_cc22); 333 334# 192 335 336# string with 40 combining characters of the same class: (0x300..0x313)x2 337my $str_cc40 = _pack_U(0x3041, 0x300..0x313, 0x300..0x313, 0x3042); 338ok(decompose($str_cc40), $str_cc40); 339ok(reorder($str_cc40), $str_cc40); 340ok(compose($str_cc40), $str_cc40); 341ok(composeContiguous($str_cc40), $str_cc40); 342ok(NFD($str_cc40), $str_cc40); 343ok(NFC($str_cc40), $str_cc40); 344ok(NFKD($str_cc40), $str_cc40); 345ok(NFKC($str_cc40), $str_cc40); 346ok(FCD($str_cc40), $str_cc40); 347ok(FCC($str_cc40), $str_cc40); 348 349# 202 350 351my $precomp = hexU("304C 304E 3050 3052 3054"); 352my $combseq = hexU("304B 3099 304D 3099 304F 3099 3051 3099 3053 3099"); 353ok(decompose($precomp x 5), $combseq x 5); 354ok(decompose($precomp x 10), $combseq x 10); 355ok(decompose($precomp x 20), $combseq x 20); 356 357my $hangsyl = hexU("AC00 B098 B2E4 B77C B9C8"); 358my $jamoseq = hexU("1100 1161 1102 1161 1103 1161 1105 1161 1106 1161"); 359ok(decompose($hangsyl x 5), $jamoseq x 5); 360ok(decompose($hangsyl x 10), $jamoseq x 10); 361ok(decompose($hangsyl x 20), $jamoseq x 20); 362 363my $notcomp = hexU("304B 304D 304F 3051 3053"); 364ok(decompose($precomp . $notcomp), $combseq . $notcomp); 365ok(decompose($precomp . $notcomp x 5), $combseq . $notcomp x 5); 366ok(decompose($precomp . $notcomp x10), $combseq . $notcomp x10); 367 368# 211 369 370my $preUnicode3_1 = !defined getCanon(0x1D15E); 371my $preUnicode3_2 = !defined getCanon(0x2ADC); 372 373# HEBREW LETTER YOD WITH HIRIQ 374ok($preUnicode3_1 xor isExclusion(0xFB1D)); 375ok($preUnicode3_1 xor isComp_Ex (0xFB1D)); 376 377# MUSICAL SYMBOL HALF NOTE 378ok($preUnicode3_1 xor isExclusion(0x1D15E)); 379ok($preUnicode3_1 xor isComp_Ex (0x1D15E)); 380 381# FORKING 382ok($preUnicode3_2 xor isExclusion(0x2ADC)); 383ok($preUnicode3_2 xor isComp_Ex (0x2ADC)); 384 385# 217 386 387