1package Unicode::Collate::Locale; 2 3use strict; 4use Carp; 5use base qw(Unicode::Collate); 6 7our $VERSION = '1.02'; 8 9my $PL_EXT = '.pl'; 10 11my %LocaleFile = map { ($_, $_) } qw( 12 af ar as az be bg bn ca cs cy da ee eo es et fa fi fil fo fr 13 gu ha haw hi hr hu hy ig is ja kk kl kn ko kok ln lt lv 14 mk ml mr mt nb nn nso om or pa pl ro ru sa se si sk sl sq 15 sr sv ta te th tn to tr uk ur vi wae wo yo zh 16); 17 $LocaleFile{'default'} = ''; 18# aliases 19 $LocaleFile{'bs'} = 'hr'; 20 $LocaleFile{'bs_Cyrl'} = 'sr'; 21 $LocaleFile{'sr_Latn'} = 'hr'; 22# short file names 23 $LocaleFile{'de__phonebook'} = 'de_phone'; 24 $LocaleFile{'es__traditional'} = 'es_trad'; 25 $LocaleFile{'fi__phonebook'} = 'fi_phone'; 26 $LocaleFile{'si__dictionary'} = 'si_dict'; 27 $LocaleFile{'sv__reformed'} = 'sv_refo'; 28 $LocaleFile{'zh__big5han'} = 'zh_big5'; 29 $LocaleFile{'zh__gb2312han'} = 'zh_gb'; 30 $LocaleFile{'zh__pinyin'} = 'zh_pin'; 31 $LocaleFile{'zh__stroke'} = 'zh_strk'; 32 $LocaleFile{'zh__zhuyin'} = 'zh_zhu'; 33 34my %TypeAlias = qw( 35 phone phonebook 36 phonebk phonebook 37 dict dictionary 38 reform reformed 39 trad traditional 40 big5 big5han 41 gb2312 gb2312han 42); 43 44sub _locale { 45 my $locale = shift; 46 if ($locale) { 47 $locale = lc $locale; 48 $locale =~ tr/\-\ \./_/; 49 $locale =~ s/_([0-9a-z]+)\z/$TypeAlias{$1} ? 50 "_$TypeAlias{$1}" : "_$1"/e; 51 $LocaleFile{$locale} and return $locale; 52 53 my @code = split /_/, $locale; 54 my $lan = shift @code; 55 my $scr = @code && length $code[0] == 4 ? ucfirst shift @code : ''; 56 my $reg = @code && length $code[0] < 4 ? uc shift @code : ''; 57 my $var = @code ? shift @code : ''; 58 59 my @list; 60 push @list, ( 61 "${lan}_${scr}_${reg}_$var", 62 "${lan}_${scr}__$var", # empty $scr should not be ${lan}__$var. 63 "${lan}_${reg}_$var", # empty $reg may be ${lan}__$var. 64 "${lan}__$var", 65 ) if $var ne ''; 66 push @list, ( 67 "${lan}_${scr}_${reg}", 68 "${lan}_${scr}", 69 "${lan}_${reg}", 70 ${lan}, 71 ); 72 for my $loc (@list) { 73 $LocaleFile{$loc} and return $loc; 74 } 75 } 76 return 'default'; 77} 78 79sub getlocale { 80 return shift->{accepted_locale}; 81} 82 83sub locale_version { 84 return shift->{locale_version}; 85} 86 87sub _fetchpl { 88 my $accepted = shift; 89 my $f = $LocaleFile{$accepted}; 90 return if !$f; 91 $f .= $PL_EXT; 92 93 # allow to search @INC 94# use File::Spec; 95# my $path = File::Spec->catfile('Unicode', 'Collate', 'Locale', $f); 96 my $path = "Unicode/Collate/Locale/$f"; 97 my $h = do $path; 98 croak "Unicode/Collate/Locale/$f can't be found" if !$h; 99 return $h; 100} 101 102sub new { 103 my $class = shift; 104 my %hash = @_; 105 $hash{accepted_locale} = _locale($hash{locale}); 106 107 if (exists $hash{table}) { 108 croak "your table can't be used with Unicode::Collate::Locale"; 109 } 110 111 my $href = _fetchpl($hash{accepted_locale}); 112 while (my($k,$v) = each %$href) { 113 if (!exists $hash{$k}) { 114 $hash{$k} = $v; 115 } elsif ($k eq 'entry') { 116 $hash{$k} = $v.$hash{$k}; 117 } else { 118 croak "$k is reserved by $hash{locale}, can't be overwritten"; 119 } 120 } 121 return $class->SUPER::new(%hash); 122} 123 1241; 125__END__ 126 127MEMORANDA for developing 128 129locale based CLDR 130---------------------------------------------------------------------------- 131af 22.1 = 1.8.1 132ar 22.1 = 1.9.0 133as 22.1 = 1.8.1 134az 22.1 = 1.8.1 (type="standard") 135be 22.1 = 1.9.0 136bg 22.1 = 1.9.0 137bn 22.1 = 2.0.1 (type="standard") 138bs 22.1 = 1.9.0 (alias source="hr") 139bs_Cyrl 22.1 = 22 (alias source="sr") 140ca 22.1 = 1.8.1 (alt="proposed" type="standard") 141cs 22.1 = 1.8.1 (type="standard") 142cy 22.1 = 1.8.1 143da 22.1 = 1.8.1 (type="standard") [mod aA to pass CLDR test] 144de__phonebook 22.1 = 2.0 (type="phonebook") 145ee 22.1 = 22 146eo 22.1 = 1.8.1 147es 22.1 = 1.9.0 (type="standard") 148es__traditional 22.1 = 1.8.1 (type="traditional") 149et 22.1 = 1.8.1 150fa 22.1 = 1.8.1 151fi 22.1 = 1.8.1 (type="standard" alt="proposed") 152fi__phonebook 22.1 = 1.8.1 (type="phonebook") 153fil 22.1 = 1.9.0 (type="standard") = 1.8.1 154fo 22.1 = 1.8.1 (alt="proposed" type="standard") 155fr 22.1 = 1.9.0 (fr_CA, backwards="on") 156gu 22.1 = 1.9.0 (type="standard") 157ha 22.1 = 1.9.0 158haw 22.1 = 1.8.1 159hi 22.1 = 1.9.0 (type="standard") 160hr 22.1 = 1.9.0 (type="standard") 161hu 22.1 = 1.8.1 (alt="proposed" type="standard") 162hy 22.1 = 1.8.1 163ig 22.1 = 1.8.1 164is 22.1 = 1.8.1 (type="standard") 165ja 22.1 = 1.8.1 (type="standard") 166kk 22.1 = 1.9.0 167kl 22.1 = 1.8.1 (type="standard") 168kn 22.1 = 1.9.0 (type="standard") 169ko 22.1 = 1.8.1 (type="standard") 170kok 22.1 = 1.8.1 171ln 22.1 = 2.0 (type="standard") = 1.8.1 172lt 22.1 = 1.9.0 173lv 22.1 = 1.9.0 (type="standard") = 1.8.1 174mk 22.1 = 1.9.0 175ml 22.1 = 1.9.0 176mr 22.1 = 1.8.1 177mt 22.1 = 1.9.0 178nb 22.1 = 2.0 (type="standard") 179nn 22.1 = 2.0 (type="standard") 180nso 22.1 = 1.8.1 181om 22.1 = 1.8.1 182or 22.1 = 1.9.0 183pa 22.1 = 1.8.1 184pl 22.1 = 1.8.1 185ro 22.1 = 1.9.0 (type="standard") 186ru 22.1 = 1.9.0 187sa 1.9.1 = 1.8.1 (type="standard" alt="proposed") [now /seed] 188se 22.1 = 1.8.1 (type="standard") 189si 22.1 = 1.9.0 (type="standard") 190si__dictionary 22.1 = 1.9.0 (type="dictionary") 191sk 22.1 = 1.9.0 (type="standard") 192sl 22.1 = 1.8.1 (type="standard" alt="proposed") 193sq 22.1 = 1.8.1 (alt="proposed" type="standard") 194sr 22.1 = 1.9.0 (type="standard") 195sr_Latn 22.1 = 1.8.1 (alias source="hr") 196sv 22.1 = 1.9.0 (type="standard") 197sv__reformed 22.1 = 1.8.1 (type="reformed") 198ta 22.1 = 1.9.0 199te 22.1 = 1.9.0 200th 22.1 = 22 201tn 22.1 = 1.8.1 202to 22.1 = 22 203tr 22.1 = 1.8.1 (type="standard") 204uk 22.1 = 21 205ur 22.1 = 1.9.0 206vi 22.1 = 1.8.1 207wae 22.1 = 2.0 208wo 1.9.1 = 1.8.1 [now /seed] 209yo 22.1 = 1.8.1 210zh 22.1 = 1.8.1 (type="standard") 211zh__big5han 22.1 = 1.8.1 (type="big5han") 212zh__gb2312han 22.1 = 1.8.1 (type="gb2312han") 213zh__pinyin 22.1 = 2.0 (type='pinyin' alt='short') 214zh__stroke 22.1 = 1.9.1 (type='stroke' alt='short') 215zh__zhuyin 22.1 = 22 (type='zhuyin' alt='short') 216---------------------------------------------------------------------------- 217 218=head1 NAME 219 220Unicode::Collate::Locale - Linguistic tailoring for DUCET via Unicode::Collate 221 222=head1 SYNOPSIS 223 224 use Unicode::Collate::Locale; 225 226 #construct 227 $Collator = Unicode::Collate::Locale-> 228 new(locale => $locale_name, %tailoring); 229 230 #sort 231 @sorted = $Collator->sort(@not_sorted); 232 233 #compare 234 $result = $Collator->cmp($a, $b); # returns 1, 0, or -1. 235 236B<Note:> Strings in C<@not_sorted>, C<$a> and C<$b> are interpreted 237according to Perl's Unicode support. See L<perlunicode>, 238L<perluniintro>, L<perlunitut>, L<perlunifaq>, L<utf8>. 239Otherwise you can use C<preprocess> (cf. C<Unicode::Collate>) 240or should decode them before. 241 242=head1 DESCRIPTION 243 244This module provides linguistic tailoring for it 245taking advantage of C<Unicode::Collate>. 246 247=head2 Constructor 248 249The C<new> method returns a collator object. 250 251A parameter list for the constructor is a hash, which can include 252a special key C<locale> and its value (case-insensitive) standing 253for a Unicode base language code (two or three-letter). 254For example, C<Unicode::Collate::Locale-E<gt>new(locale =E<gt> 'FR')> 255returns a collator tailored for French. 256 257C<$locale_name> may be suffixed with a Unicode script code (four-letter), 258a Unicode region code, a Unicode language variant code. These codes are 259case-insensitive, and separated with C<'_'> or C<'-'>. 260E.g. C<en_US> for English in USA, 261C<az_Cyrl> for Azerbaijani in the Cyrillic script, 262C<es_ES_traditional> for Spanish in Spain (Traditional). 263 264If C<$locale_name> is not available, 265fallback is selected in the following order: 266 267 1. language with a variant code 268 2. language with a script code 269 3. language with a region code 270 4. language 271 5. default 272 273Tailoring tags provided by C<Unicode::Collate> are allowed as long as 274they are not used for C<locale> support. Esp. the C<table> tag 275is always untailorable, since it is reserved for DUCET. 276 277However C<entry> is allowed, even if it is used for C<locale> support, 278to add or override mappings. 279 280E.g. a collator for French, which ignores diacritics and case difference 281(i.e. level 1), with reversed case ordering and no normalization. 282 283 Unicode::Collate::Locale->new( 284 level => 1, 285 locale => 'fr', 286 upper_before_lower => 1, 287 normalization => undef 288 ) 289 290Overriding a behavior already tailored by C<locale> is disallowed 291if such a tailoring is passed to C<new()>. 292 293 Unicode::Collate::Locale->new( 294 locale => 'da', 295 upper_before_lower => 0, # causes error as reserved by 'da' 296 ) 297 298However C<change()> inherited from C<Unicode::Collate> allows 299such a tailoring that is reserved by C<locale>. Examples: 300 301 new(locale => 'ca')->change(backwards => undef) 302 new(locale => 'da')->change(upper_before_lower => 0) 303 new(locale => 'ja')->change(overrideCJK => undef) 304 305=head2 Methods 306 307C<Unicode::Collate::Locale> is a subclass of C<Unicode::Collate> 308and methods other than C<new> are inherited from C<Unicode::Collate>. 309 310Here is a list of additional methods: 311 312=over 4 313 314=item C<$Collator-E<gt>getlocale> 315 316Returns a language code accepted and used actually on collation. 317If linguistic tailoring is not provided for a language code you passed 318(intensionally for some languages, or due to the incomplete implementation), 319this method returns a string C<'default'> meaning no special tailoring. 320 321=item C<$Collator-E<gt>locale_version> 322 323(Since Unicode::Collate::Locale 0.87) 324Returns the version number (perhaps C</\d\.\d\d/>) of the locale, as that 325of F<Locale/*.pl>. 326 327B<Note:> F<Locale/*.pl> that a collator uses should be identified by 328a combination of return values from C<getlocale> and C<locale_version>. 329 330=back 331 332=head2 A list of tailorable locales 333 334 locale name description 335 -------------------------------------------------------------- 336 af Afrikaans 337 ar Arabic 338 as Assamese 339 az Azerbaijani (Azeri) 340 be Belarusian 341 bg Bulgarian 342 bn Bengali 343 bs Bosnian 344 bs_Cyrl Bosnian in Cyrillic (tailored as Serbian) 345 ca Catalan 346 cs Czech 347 cy Welsh 348 da Danish 349 de__phonebook German (umlaut as 'ae', 'oe', 'ue') 350 ee Ewe 351 eo Esperanto 352 es Spanish 353 es__traditional Spanish ('ch' and 'll' as a grapheme) 354 et Estonian 355 fa Persian 356 fi Finnish (v and w are primary equal) 357 fi__phonebook Finnish (v and w as separate characters) 358 fil Filipino 359 fo Faroese 360 fr French 361 gu Gujarati 362 ha Hausa 363 haw Hawaiian 364 hi Hindi 365 hr Croatian 366 hu Hungarian 367 hy Armenian 368 ig Igbo 369 is Icelandic 370 ja Japanese [1] 371 kk Kazakh 372 kl Kalaallisut 373 kn Kannada 374 ko Korean [2] 375 kok Konkani 376 ln Lingala 377 lt Lithuanian 378 lv Latvian 379 mk Macedonian 380 ml Malayalam 381 mr Marathi 382 mt Maltese 383 nb Norwegian Bokmal 384 nn Norwegian Nynorsk 385 nso Northern Sotho 386 om Oromo 387 or Oriya 388 pa Punjabi 389 pl Polish 390 ro Romanian 391 ru Russian 392 sa Sanskrit 393 se Northern Sami 394 si Sinhala 395 si__dictionary Sinhala (U+0DA5 = U+0DA2,0DCA,0DA4) 396 sk Slovak 397 sl Slovenian 398 sq Albanian 399 sr Serbian 400 sr_Latn Serbian in Latin (tailored as Croatian) 401 sv Swedish (v and w are primary equal) 402 sv__reformed Swedish (v and w as separate characters) 403 ta Tamil 404 te Telugu 405 th Thai 406 tn Tswana 407 to Tonga 408 tr Turkish 409 uk Ukrainian 410 ur Urdu 411 vi Vietnamese 412 wae Walser 413 wo Wolof 414 yo Yoruba 415 zh Chinese 416 zh__big5han Chinese (ideographs: big5 order) 417 zh__gb2312han Chinese (ideographs: GB-2312 order) 418 zh__pinyin Chinese (ideographs: pinyin order) [3] 419 zh__stroke Chinese (ideographs: stroke order) [3] 420 zh__zhuyin Chinese (ideographs: zhuyin order) [3] 421 -------------------------------------------------------------- 422 423Locales according to the default UCA rules include 424chr (Cherokee), 425de (German), 426en (English), 427ga (Irish), 428id (Indonesian), 429it (Italian), 430ka (Georgian), 431ms (Malay), 432nl (Dutch), 433pt (Portuguese), 434st (Southern Sotho), 435sw (Swahili), 436xh (Xhosa), 437zu (Zulu). 438 439B<Note> 440 441[1] ja: Ideographs are sorted in JIS X 0208 order. 442Fullwidth and halfwidth forms are identical to their regular form. 443The difference between hiragana and katakana is at the 4th level, 444the comparison also requires C<(variable =E<gt> 'Non-ignorable')>, 445and then C<katakana_before_hiragana> has no effect. 446 447[2] ko: Plenty of ideographs are sorted by their reading. Such 448an ideograph is primary (level 1) equal to, and secondary (level 2) 449greater than, the corresponding hangul syllable. 450 451[3] zh__pinyin, zh__stroke and zh__zhuyin: implemented alt='short', 452where a smaller number of ideographs are tailored. 453 454Note: 'pinyin' is in latin, 'zhuyin' is in bopomofo. 455 456=head1 INSTALL 457 458Installation of C<Unicode::Collate::Locale> requires F<Collate/Locale.pm>, 459F<Collate/Locale/*.pm>, F<Collate/CJK/*.pm> and F<Collate/allkeys.txt>. 460On building, C<Unicode::Collate::Locale> doesn't require any of F<data/*.txt>, 461F<gendata/*>, and F<mklocale>. 462Tests for C<Unicode::Collate::Locale> are named F<t/loc_*.t>. 463 464=head1 CAVEAT 465 466=over 4 467 468=item tailoring is not maximum 469 470Even if a certain letter is tailored, its equivalent would not always 471tailored as well as it. For example, even though W is tailored, 472fullwidth W (C<U+FF37>), W with acute (C<U+1E82>), etc. are not 473tailored. The result may depend on whether source strings are 474normalized or not, and whether decomposed or composed. 475Thus C<(normalization =E<gt> undef)> is less preferred. 476 477=back 478 479=head1 AUTHOR 480 481The Unicode::Collate::Locale module for perl was written 482by SADAHIRO Tomoyuki, <SADAHIRO@cpan.org>. 483This module is Copyright(C) 2004-2013, SADAHIRO Tomoyuki. Japan. 484All rights reserved. 485 486This module is free software; you can redistribute it and/or 487modify it under the same terms as Perl itself. 488 489=head1 SEE ALSO 490 491=over 4 492 493=item Unicode Collation Algorithm - UTS #10 494 495L<http://www.unicode.org/reports/tr10/> 496 497=item The Default Unicode Collation Element Table (DUCET) 498 499L<http://www.unicode.org/Public/UCA/latest/allkeys.txt> 500 501=item Unicode Locale Data Markup Language (LDML) - UTS #35 502 503L<http://www.unicode.org/reports/tr35/> 504 505=item CLDR - Unicode Common Locale Data Repository 506 507L<http://cldr.unicode.org/> 508 509=item L<Unicode::Collate> 510 511=item L<Unicode::Normalize> 512 513=back 514 515=cut 516