package Unicode::Collate::Locale;

# Locale-aware front end to Unicode::Collate: resolves a locale name to one
# of the tailoring files under Unicode/Collate/Locale/, merges that tailoring
# with the caller's options and hands the result to Unicode::Collate->new.

use strict;
use warnings;
use Carp;
use base qw(Unicode::Collate);

our $VERSION = '1.31';

# Extension appended to a %LocaleFile value to get the tailoring file name.
my $PL_EXT = '.pl';

# Accepted locale name => basename (without extension) of its tailoring file.
# Locales in this first list use their own name as the file name.
my %LocaleFile = map { ($_, $_) } qw(
   af ar as az be bn ca cs cu cy da dsb ee eo es et fa fi fil fo gu
   ha haw he hi hr hu hy ig is ja kk kl kn ko kok lkt ln lt lv
   mk ml mr mt nb nn nso om or pa pl ro sa se si sk sl sq sr sv
   ta te th tn to tr uk ur vi vo wae wo yo zh
);
# 'default' has no tailoring file.  Its value is deliberately false so that
# the truth tests in _locale() and _fetchpl() treat it as "nothing to load".
$LocaleFile{'default'}         = '';
# aliases
$LocaleFile{'bs'}              = 'hr';
$LocaleFile{'bs_Cyrl'}         = 'sr';
$LocaleFile{'sr_Latn'}         = 'hr';
# short file names
$LocaleFile{'de__phonebook'}   = 'de_phone';
$LocaleFile{'de_AT_phonebook'} = 'de_at_ph';
$LocaleFile{'es__traditional'} = 'es_trad';
$LocaleFile{'fr_CA'}           = 'fr_ca';
$LocaleFile{'fi__phonebook'}   = 'fi_phone';
$LocaleFile{'si__dictionary'}  = 'si_dict';
$LocaleFile{'sv__reformed'}    = 'sv_refo';
$LocaleFile{'ug_Cyrl'}         = 'ug_cyrl';
$LocaleFile{'zh__big5han'}     = 'zh_big5';
$LocaleFile{'zh__gb2312han'}   = 'zh_gb';
$LocaleFile{'zh__pinyin'}      = 'zh_pin';
$LocaleFile{'zh__stroke'}      = 'zh_strk';
$LocaleFile{'zh__zhuyin'}      = 'zh_zhu';

# Short spelling of a variant code => canonical variant code.
my %TypeAlias = qw(
    phone     phonebook
    phonebk   phonebook
    dict      dictionary
    reform    reformed
    trad      traditional
    big5      big5han
    gb2312    gb2312han
);

# _locale($name)
#
# Resolves a user-supplied locale name to a key of %LocaleFile.
# The name is lower-cased, '-', ' ' and '.' are treated as '_', and a
# trailing variant alias (see %TypeAlias) is expanded.  If the normalized
# name is not itself a key with a tailoring file, the name is split into
# language / script (4 letters) / region (fewer than 4 letters) / variant
# and progressively less specific combinations are tried.
# Returns the matching key, or 'default' when nothing matches or when
# $name is false.
sub _locale {
    my $locale = shift;
    return 'default' if !$locale;

    $locale = lc $locale;
    $locale =~ tr/\-\ \./_/;
    # Expand an alias in the last component, e.g. "_trad" -> "_traditional".
    $locale =~ s/_([0-9a-z]+)\z/'_' . ($TypeAlias{$1} || $1)/e;
    return $locale if $LocaleFile{$locale};

    my @code = split /_/, $locale;
    my $lan  = shift @code;
    # A name made only of separators (e.g. "_") splits to nothing; bail out
    # here instead of interpolating undef into the candidates below.
    return 'default' if !defined $lan;

    my $scr = @code && length($code[0]) == 4 ? ucfirst(shift @code) : '';
    my $reg = @code && length($code[0]) <  4 ? uc(shift @code)      : '';
    my $var = @code                          ? shift @code          : '';

    my @candidates;
    push @candidates, (
        "${lan}_${scr}_${reg}_$var",
        "${lan}_${scr}__$var",  # empty $scr should not be ${lan}__$var.
        "${lan}_${reg}_$var",   # empty $reg may be ${lan}__$var.
        "${lan}__$var",
    ) if $var ne '';
    push @candidates, (
        "${lan}_${scr}_${reg}",
        "${lan}_${scr}",
        "${lan}_${reg}",
        $lan,
    );

    for my $loc (@candidates) {
        return $loc if $LocaleFile{$loc};
    }
    return 'default';
}

# Returns the locale name that was actually accepted by new()
# (a key of %LocaleFile, possibly 'default').
sub getlocale {
    my $self = shift;
    return $self->{accepted_locale};
}

# Returns the object's locale_version entry; new() copies it from the
# tailoring file when that file supplies one.
sub locale_version {
    my $self = shift;
    return $self->{locale_version};
}

# _fetchpl($accepted)
#
# Loads the tailoring file registered for $accepted and returns whatever
# that file evaluates to (new() uses it as a hash reference).
# Returns nothing when the locale has no file (e.g. 'default').
# Croaks when the file cannot be read or compiled, or yields a false value.
sub _fetchpl {
    my $accepted = shift;
    my $f = $LocaleFile{$accepted};
    return if !$f;
    $f .= $PL_EXT;

    # allow to search @INC: keep a plain '/'-separated relative path
    # instead of building one with File::Spec->catfile.
    my $path = "Unicode/Collate/Locale/$f";
    my $h = do $path;
    if (!$h) {
        # do FILE reports a compile error in $@ and a read error in $!;
        # keep the historical message and append the reason when known.
        my $reason = $@ ? "$@" : !defined $h ? "$!" : '';
        $reason =~ s/\s+\z//;
        croak "Unicode/Collate/Locale/$f can't be found"
            . (length $reason ? " ($reason)" : '');
    }
    return $h;
}

# new(%args)
#
# Constructor.  Resolves $args{locale}, merges the locale's tailoring into
# %args and delegates to the parent constructor.
# Croaks if 'table' is given, or if the caller passes a key that the locale
# tailoring also sets.  The one exception is 'entry': the caller's value is
# appended to the locale's.
sub new {
    my $class = shift;
    my %hash = @_;
    $hash{accepted_locale} = _locale($hash{locale});

    if (exists $hash{table}) {
        croak "your table can't be used with Unicode::Collate::Locale";
    }

    # No tailoring file (e.g. 'default') means nothing to merge.
    my $href = _fetchpl($hash{accepted_locale}) || {};

    # Sorted so that the key reported on conflict is deterministic.
    for my $k (sort keys %$href) {
        my $v = $href->{$k};
        if (!exists $hash{$k}) {
            $hash{$k} = $v;
        }
        elsif ($k eq 'entry') {
            $hash{$k} = $v . $hash{$k};
        }
        else {
            croak "$k is reserved by $hash{locale}, can't be overwritten";
        }
    }
    return $class->SUPER::new(%hash);
}

1;
__END__

=head1 NAME

Unicode::Collate::Locale - Linguistic tailoring for DUCET via Unicode::Collate

=head1 SYNOPSIS

  use Unicode::Collate::Locale;

  #construct
  $Collator = Unicode::Collate::Locale->
      new(locale => $locale_name, %tailoring);

  #sort
  @sorted = $Collator->sort(@not_sorted);

  #compare
  $result = $Collator->cmp($a, $b); # returns 1, 0, or -1.

B<Note:> Strings in C<@not_sorted>, C<$a> and C<$b> are interpreted
according to Perl's Unicode support. See L<perlunicode>,
L<perluniintro>, L<perlunitut>, L<perlunifaq>, L<utf8>.
Otherwise you can use C<preprocess> (cf.
C<Unicode::Collate>)
or should decode them before.

=head1 DESCRIPTION

This module provides linguistic tailoring for DUCET,
taking advantage of C<Unicode::Collate>.

=head2 Constructor

The C<new> method returns a collator object.

A parameter list for the constructor is a hash, which can include
a special key C<locale> and its value (case-insensitive) standing
for a Unicode base language code (two or three-letter).
For example, C<Unicode::Collate::Locale-E<gt>new(locale =E<gt> 'ES')>
returns a collator tailored for Spanish.

C<$locale_name> may be suffixed with a Unicode script code (four-letter),
a Unicode region (territory) code, a Unicode language variant code.
These codes are case-insensitive, and separated with C<'_'> or C<'-'>.
E.g. C<en_US> for English in USA,
C<az_Cyrl> for Azerbaijani in the Cyrillic script,
C<es_ES_traditional> for Spanish in Spain (Traditional).

If C<$locale_name> is not available,
fallback is selected in the following order:

    1. language with a variant code
    2. language with a script code
    3. language with a region code
    4. language
    5. default

Tailoring tags provided by C<Unicode::Collate> are allowed as long as
they are not used for C<locale> support. Esp. the C<table> tag
is always untailorable, since it is reserved for DUCET.

However C<entry> is allowed, even if it is used for C<locale> support,
to add or override mappings.

E.g. a collator for Spanish, which ignores diacritics and case difference
(i.e. level 1), with reversed case ordering and no normalization.

  Unicode::Collate::Locale->new(
      level => 1,
      locale => 'es',
      upper_before_lower => 1,
      normalization => undef
  )

Overriding a behavior already tailored by C<locale> is disallowed
if such a tailoring is passed to C<new()>.

  Unicode::Collate::Locale->new(
      locale => 'da',
      upper_before_lower => 0, # causes error as reserved by 'da'
  )

However C<change()> inherited from C<Unicode::Collate> allows
such a tailoring that is reserved by C<locale>. Examples:

  new(locale => 'fr_ca')->change(backwards => undef)
  new(locale => 'da')->change(upper_before_lower => 0)
  new(locale => 'ja')->change(overrideCJK => undef)

=head2 Methods

C<Unicode::Collate::Locale> is a subclass of C<Unicode::Collate>
and methods other than C<new> are inherited from C<Unicode::Collate>.

Here is a list of additional methods:

=over 4

=item C<$Collator-E<gt>getlocale>

Returns the language code actually accepted and used for collation.
If linguistic tailoring is not provided for a language code you passed
(intentionally for some languages, or due to the incomplete implementation),
this method returns a string C<'default'> meaning no special tailoring.

=item C<$Collator-E<gt>locale_version>

(Since Unicode::Collate::Locale 0.87)
Returns the version number (perhaps C</\d\.\d\d/>) of the locale, as that
of F<Locale/*.pl>.

B<Note:> F<Locale/*.pl> that a collator uses should be identified by
a combination of return values from C<getlocale> and C<locale_version>.

=back

=head2 A list of tailorable locales

      locale name       description
    --------------------------------------------------------------
      af                Afrikaans
      ar                Arabic
      as                Assamese
      az                Azerbaijani (Azeri)
      be                Belarusian
      bn                Bengali
      bs                Bosnian (tailored as Croatian)
      bs_Cyrl           Bosnian in Cyrillic (tailored as Serbian)
      ca                Catalan
      cs                Czech
      cu                Church Slavic
      cy                Welsh
      da                Danish
      de__phonebook     German (umlaut as 'ae', 'oe', 'ue')
      de_AT_phonebook   Austrian German (umlaut primary greater)
      dsb               Lower Sorbian
      ee                Ewe
      eo                Esperanto
      es                Spanish
      es__traditional   Spanish ('ch' and 'll' as a grapheme)
      et                Estonian
      fa                Persian
      fi                Finnish (v and w are primary equal)
      fi__phonebook     Finnish (v and w as separate characters)
      fil               Filipino
      fo                Faroese
      fr_CA             Canadian French
      gu                Gujarati
      ha                Hausa
      haw               Hawaiian
      he                Hebrew
      hi                Hindi
      hr                Croatian
      hu                Hungarian
      hy                Armenian
      ig                Igbo
      is                Icelandic
      ja                Japanese [1]
      kk                Kazakh
      kl                Kalaallisut
      kn                Kannada
      ko                Korean [2]
      kok               Konkani
      lkt               Lakota
      ln                Lingala
      lt                Lithuanian
      lv                Latvian
      mk                Macedonian
      ml                Malayalam
      mr                Marathi
      mt                Maltese
      nb                Norwegian Bokmal
      nn                Norwegian Nynorsk
      nso               Northern Sotho
      om                Oromo
      or                Oriya
      pa                Punjabi
      pl                Polish
      ro                Romanian
      sa                Sanskrit
      se                Northern Sami
      si                Sinhala
      si__dictionary    Sinhala (U+0DA5 = U+0DA2,0DCA,0DA4)
      sk                Slovak
      sl                Slovenian
      sq                Albanian
      sr                Serbian
      sr_Latn           Serbian in Latin (tailored as Croatian)
      sv                Swedish (v and w are primary equal)
      sv__reformed      Swedish (v and w as separate characters)
      ta                Tamil
      te                Telugu
      th                Thai
      tn                Tswana
      to                Tonga
      tr                Turkish
      ug_Cyrl           Uyghur in Cyrillic
      uk                Ukrainian
      ur                Urdu
      vi                Vietnamese
      vo                Volapu"k
      wae               Walser
      wo                Wolof
      yo                Yoruba
      zh                Chinese
      zh__big5han       Chinese (ideographs: big5 order)
      zh__gb2312han     Chinese (ideographs: GB-2312 order)
      zh__pinyin        Chinese (ideographs: pinyin order) [3]
      zh__stroke        Chinese (ideographs: stroke order) [3]
      zh__zhuyin        Chinese (ideographs: zhuyin order) [3]
    --------------------------------------------------------------

Locales according to the default UCA rules include
am (Amharic) without C<[reorder Ethi]>,
bg (Bulgarian) without C<[reorder Cyrl]>,
chr (Cherokee) without C<[reorder Cher]>,
de (German),
en (English),
fr (French),
ga (Irish),
id (Indonesian),
it (Italian),
ka (Georgian) without C<[reorder Geor]>,
mn (Mongolian) without C<[reorder Cyrl Mong]>,
ms (Malay),
nl (Dutch),
pt (Portuguese),
ru (Russian) without C<[reorder Cyrl]>,
sw (Swahili),
zu (Zulu).

B<Note>

[1] ja: Ideographs are sorted in JIS X 0208 order.
Fullwidth and halfwidth forms are identical to their regular form.
The difference between hiragana and katakana is at the 4th level,
the comparison also requires C<(variable =E<gt> 'Non-ignorable')>,
and then C<katakana_before_hiragana> has no effect.

[2] ko: Plenty of ideographs are sorted by their reading. Such
an ideograph is primary (level 1) equal to, and secondary (level 2)
greater than, the corresponding hangul syllable.

[3] zh__pinyin, zh__stroke and zh__zhuyin: implemented alt='short',
where a smaller number of ideographs are tailored.

=head2 A list of variant codes and their aliases

      variant code       alias
    ------------------------------------------
      dictionary         dict
      phonebook          phone     phonebk
      reformed           reform
      traditional        trad
    ------------------------------------------
      big5han            big5
      gb2312han          gb2312
      pinyin
      stroke
      zhuyin
    ------------------------------------------

Note: 'pinyin' is Han in Latin, 'zhuyin' is Han in Bopomofo.

=head1 INSTALL

Installation of C<Unicode::Collate::Locale> requires F<Collate/Locale.pm>,
F<Collate/Locale/*.pl>, F<Collate/CJK/*.pm> and F<Collate/allkeys.txt>.
On building, C<Unicode::Collate::Locale> doesn't require
any of F<data/*.txt>, F<gendata/*>, and F<mklocale>.
Tests for C<Unicode::Collate::Locale> are named F<t/loc_*.t>.

=head1 CAVEAT

=over 4

=item Tailoring is not maximum

Even if a certain letter is tailored, its equivalents would not always
be tailored as well. For example, even though W is tailored,
fullwidth W (C<U+FF37>), W with acute (C<U+1E82>), etc. are not
tailored. The result may depend on whether source strings are
normalized or not, and whether decomposed or composed.
Thus C<(normalization =E<gt> undef)> is less preferred.

=item Collation reordering is not supported

The order of any groups including scripts is not changed.

=back

=head2 Reference

      locale            based CLDR or other reference
    --------------------------------------------------------------------
      af                30 = 1.8.1
      ar                30 = 28 ("compat" wo [reorder Arab]) = 1.9.0
      as                30 = 28 (without [reorder Beng..]) = 23
      az                30 = 24 ("standard" wo [reorder Latn Cyrl])
      be                30 = 28 (without [reorder Cyrl])
      bn                30 = 28 ("standard" wo [reorder Beng..]) = 2.0.1
      bs                30 = 28 (type="standard": [import hr])
      bs_Cyrl           30 = 28 (type="standard": [import sr])
      ca                30 = 23 (alt="proposed" type="standard")
      cs                30 = 1.8.1 (type="standard")
      cu                34 = 30 (without [reorder Cyrl])
      cy                30 = 1.8.1
      da                22.1 = 1.8.1 (type="standard")
      de__phonebook     30 = 2.0 (type="phonebook")
      de_AT_phonebook   30 = 27 (type="phonebook")
      dsb               30 = 26
      ee                30 = 21
      eo                30 = 1.8.1
      es                30 = 1.9.0 (type="standard")
      es__traditional   30 = 1.8.1 (type="traditional")
      et                30 = 26
      fa                22.1 = 1.8.1
      fi                22.1 = 1.8.1 (type="standard" alt="proposed")
      fi__phonebook     22.1 = 1.8.1 (type="phonebook")
      fil               30 = 1.9.0 (type="standard") = 1.8.1
      fo                22.1 = 1.8.1 (alt="proposed" type="standard")
      fr_CA             30 = 1.9.0
      gu                30 = 28 ("standard" wo [reorder Gujr..]) = 1.9.0
      ha                30 = 1.9.0
      haw               30 = 24
      he                30 = 28 (without [reorder Hebr]) = 23
      hi                30 = 28 (without [reorder Deva..]) = 1.9.0
      hr                30 = 28 ("standard" wo [reorder Latn Cyrl]) = 1.9.0
      hu                22.1 = 1.8.1 (alt="proposed" type="standard")
      hy                30 = 28 (without [reorder Armn]) = 1.8.1
      ig                30 = 1.8.1
      is                22.1 = 1.8.1 (type="standard")
      ja                22.1 = 1.8.1 (type="standard")
      kk                30 = 28 (without [reorder Cyrl])
      kl                22.1 = 1.8.1 (type="standard")
      kn                30 = 28 ("standard" wo [reorder Knda..]) = 1.9.0
      ko                22.1 = 1.8.1 (type="standard")
      kok               30 = 28 (without [reorder Deva..]) = 1.8.1
      lkt               30 = 25
      ln                30 = 2.0 (type="standard") = 1.8.1
      lt                22.1 = 1.9.0
      lv                22.1 = 1.9.0 (type="standard") = 1.8.1
      mk                30 = 28 (without [reorder Cyrl])
      ml                22.1 = 1.9.0
      mr                30 = 28 (without [reorder Deva..]) = 1.8.1
      mt                22.1 = 1.9.0
      nb                22.1 = 2.0 (type="standard")
      nn                22.1 = 2.0 (type="standard")
      nso           [*] 26 = 1.8.1
      om                22.1 = 1.8.1
      or                30 = 28 (without [reorder Orya..]) = 1.9.0
      pa                22.1 = 1.8.1
      pl                30 = 1.8.1
      ro                30 = 1.9.0 (type="standard")
      sa            [*] 1.9.1 = 1.8.1 (type="standard" alt="proposed")
      se                22.1 = 1.8.1 (type="standard")
      si                30 = 28 ("standard" wo [reorder Sinh..]) = 1.9.0
      si__dictionary    30 = 28 ("dictionary" wo [reorder Sinh..]) = 1.9.0
      sk                22.1 = 1.9.0 (type="standard")
      sl                22.1 = 1.8.1 (type="standard" alt="proposed")
      sq                22.1 = 1.8.1 (alt="proposed" type="standard")
      sr                30 = 28 (without [reorder Cyrl])
      sr_Latn           30 = 28 (type="standard": [import hr])
      sv                22.1 = 1.9.0 (type="standard")
      sv__reformed      22.1 = 1.8.1 (type="reformed")
      ta                22.1 = 1.9.0
      te                30 = 28 (without [reorder Telu..]) = 1.9.0
      th                22.1 = 22
      tn            [*] 26 = 1.8.1
      to                22.1 = 22
      tr                22.1 = 1.8.1 (type="standard")
      uk                30 = 28 (without [reorder Cyrl])
      ug_Cyrl           https://en.wikipedia.org/wiki/Uyghur_Cyrillic_alphabet
      ur                22.1 = 1.9.0
      vi                22.1 = 1.8.1
      vo                30 = 25
      wae               30 = 2.0
      wo            [*] 1.9.1 = 1.8.1
      yo                30 = 1.8.1
      zh                22.1 = 1.8.1 (type="standard")
      zh__big5han       22.1 = 1.8.1 (type="big5han")
      zh__gb2312han     22.1 = 1.8.1 (type="gb2312han")
      zh__pinyin        22.1 = 2.0 (type='pinyin' alt='short')
      zh__stroke        22.1 = 1.9.1 (type='stroke' alt='short')
      zh__zhuyin        22.1 = 22 (type='zhuyin' alt='short')
    --------------------------------------------------------------------

[*] http://www.unicode.org/repos/cldr/tags/latest/seed/collation/

=head1 AUTHOR

The Unicode::Collate::Locale module for perl was written
by SADAHIRO Tomoyuki, <SADAHIRO@cpan.org>.
This module is Copyright(C) 2004-2020, SADAHIRO Tomoyuki. Japan.
All rights reserved.

This module is free software; you can redistribute it and/or
modify it under the same terms as Perl itself.

=head1 SEE ALSO

=over 4

=item Unicode Collation Algorithm - UTS #10

L<http://www.unicode.org/reports/tr10/>

=item The Default Unicode Collation Element Table (DUCET)

L<http://www.unicode.org/Public/UCA/latest/allkeys.txt>

=item Unicode Locale Data Markup Language (LDML) - UTS #35

L<http://www.unicode.org/reports/tr35/>

=item CLDR - Unicode Common Locale Data Repository

L<http://cldr.unicode.org/>

=item L<Unicode::Collate>

=item L<Unicode::Normalize>

=back

=cut