1package Unicode::Collate::Locale;
2
3use strict;
4use warnings;
5use Carp;
6use base qw(Unicode::Collate);
7
8our $VERSION = '1.31';
9
# Extension appended to a %LocaleFile value to form a data file name.
my $PL_EXT  = '.pl';

# %LocaleFile maps every accepted locale name to the base name (without
# extension) of its data file.  A false (empty) base name means that the
# locale has no data file.
my %LocaleFile = (
    # locales whose data file is named after the locale itself
    (map { $_ => $_ } qw(
        af ar as az be bn ca cs cu cy da dsb ee eo es et fa fi fil fo gu
        ha haw he hi hr hu hy ig is ja kk kl kn ko kok lkt ln lt lv
        mk ml mr mt nb nn nso om or pa pl ro sa se si sk sl sq sr sv
        ta te th tn to tr uk ur vi vo wae wo yo zh
    )),

    # no tailoring, hence no data file
    'default'         => '',

    # aliases
    'bs'              => 'hr',
    'bs_Cyrl'         => 'sr',
    'sr_Latn'         => 'hr',

    # short file names
    'de__phonebook'   => 'de_phone',
    'de_AT_phonebook' => 'de_at_ph',
    'es__traditional' => 'es_trad',
    'fr_CA'           => 'fr_ca',
    'fi__phonebook'   => 'fi_phone',
    'si__dictionary'  => 'si_dict',
    'sv__reformed'    => 'sv_refo',
    'ug_Cyrl'         => 'ug_cyrl',
    'zh__big5han'     => 'zh_big5',
    'zh__gb2312han'   => 'zh_gb',
    'zh__pinyin'      => 'zh_pin',
    'zh__stroke'      => 'zh_strk',
    'zh__zhuyin'      => 'zh_zhu',
);

# %TypeAlias maps an abbreviated variant (collation type) code, written
# in lower case, to its full form.
my %TypeAlias = (
    'phone'   => 'phonebook',
    'phonebk' => 'phonebook',
    'dict'    => 'dictionary',
    'reform'  => 'reformed',
    'trad'    => 'traditional',
    'big5'    => 'big5han',
    'gb2312'  => 'gb2312han',
);
47
# Resolve a user-supplied locale name to a key of %LocaleFile.
#
# The name is lower-cased, the separators '-', ' ' and '.' are unified
# to '_', and an abbreviated variant code in the last component (see
# %TypeAlias) is expanded.  If the result is not directly usable, it is
# split into language, script (a component of exactly four characters),
# region (a component of fewer than four characters) and variant, and
# candidate keys are tried from the most to the least specific.
#
# Returns the first candidate whose %LocaleFile value is true, or the
# string 'default' when nothing matches or when the argument is false
# (undef, '' or '0').
sub _locale {
    my $locale = shift;
    return 'default' if !$locale;

    $locale = lc $locale;
    $locale =~ tr/\-\ \./_/;
    # expand a variant alias such as "phone" => "phonebook"
    $locale =~ s/_([0-9a-z]+)\z/'_' . ($TypeAlias{$1} || $1)/e;
    return $locale if $LocaleFile{$locale};

    my @code = split /_/, $locale;
    my $lan  = shift @code;

    # A name made only of separators (e.g. '-') splits into nothing at all,
    # which leaves no language code.  Stop here rather than interpolating
    # undef into every candidate below (which would only emit warnings).
    return 'default' if !defined $lan;

    my $scr = (@code && length($code[0]) == 4) ? ucfirst(shift @code) : '';
    my $reg = (@code && length($code[0]) <  4) ? uc(shift @code)      : '';
    my $var = @code                            ? shift @code          : '';

    my @candidates;
    if ($var ne '') {
        push @candidates,
            "${lan}_${scr}_${reg}_$var",
            "${lan}_${scr}__$var", # empty $scr should not be ${lan}__$var.
            "${lan}_${reg}_$var",  # empty $reg may be ${lan}__$var.
            "${lan}__$var";
    }
    push @candidates,
        "${lan}_${scr}_${reg}",
        "${lan}_${scr}",
        "${lan}_${reg}",
        $lan;

    for my $loc (@candidates) {
        return $loc if $LocaleFile{$loc};
    }
    return 'default';
}
82
# Returns the locale name stored in the object's accepted_locale slot.
sub getlocale {
    my $self = shift;
    return $self->{accepted_locale};
}
86
# Returns the value stored in the object's locale_version slot.
sub locale_version {
    my $self = shift;
    return $self->{locale_version};
}
90
# Load the data file of an accepted locale.
#
# $accepted is a key of %LocaleFile (such as a value returned by _locale).
# Returns whatever the data file evaluates to; new() dereferences it as a
# hash of tailoring parameters.  Returns nothing (undef in scalar context)
# when the locale has no data file, as is the case for 'default'.
# Croaks when the data file yields a false value; the reason reported by
# "do" (a compile or run-time error, or a read error) is appended.
sub _fetchpl {
    my $accepted = shift;
    my $f = $LocaleFile{$accepted};
    return if !$f;
    $f .= $PL_EXT;

    # allow to search @INC
#   use File::Spec;
#   my $path = File::Spec->catfile('Unicode', 'Collate', 'Locale', $f);
    my $path = "Unicode/Collate/Locale/$f";
    my $h = do $path;
    if (!$h) {
        # "do FILE" leaves a compile/run-time failure in $@ and a read
        # failure in $!; keep whichever applies so the cause is not lost.
        my $reason = $@          ? $@
                   : !defined $h ? "$!"
                   :               '';
        $reason =~ s/\s+\z//;
        croak "Unicode/Collate/Locale/$f can't be found"
            . ($reason ne '' ? ": $reason" : '');
    }
    return $h;
}
105
# Constructor: Unicode::Collate::Locale->new(locale => $name, %tailoring).
#
# Resolves the "locale" parameter with _locale() and records the result
# under "accepted_locale", then merges the parameters read from the
# locale's data file into the caller's parameters:
#   - a parameter the caller did not pass is taken from the data file;
#   - "entry" passed by the caller is appended to the locale's own entry;
#   - any other parameter present on both sides makes the constructor croak.
# Passing "table" always croaks.  The merged parameters are handed over to
# the constructor of the parent class, whose result is returned.
sub new {
    my ($class, %hash) = @_;
    $hash{accepted_locale} = _locale($hash{locale});

    croak "your table can't be used with Unicode::Collate::Locale"
        if exists $hash{table};

    # _fetchpl() returns nothing for a locale without a data file.
    my $tailoring = _fetchpl($hash{accepted_locale}) || {};
    for my $k (keys %$tailoring) {
        my $v = $tailoring->{$k};
        if (exists $hash{$k}) {
            croak "$k is reserved by $hash{locale}, can't be overwritten"
                if $k ne 'entry';
            # the locale's entries come first, the caller's follow
            $hash{$k} = $v.$hash{$k};
        }
        else {
            $hash{$k} = $v;
        }
    }
    return $class->SUPER::new(%hash);
}
127
1281;
129__END__
130
131=head1 NAME
132
133Unicode::Collate::Locale - Linguistic tailoring for DUCET via Unicode::Collate
134
135=head1 SYNOPSIS
136
137  use Unicode::Collate::Locale;
138
139  #construct
140  $Collator = Unicode::Collate::Locale->
141      new(locale => $locale_name, %tailoring);
142
143  #sort
144  @sorted = $Collator->sort(@not_sorted);
145
146  #compare
147  $result = $Collator->cmp($a, $b); # returns 1, 0, or -1.
148
149B<Note:> Strings in C<@not_sorted>, C<$a> and C<$b> are interpreted
150according to Perl's Unicode support. See L<perlunicode>,
151L<perluniintro>, L<perlunitut>, L<perlunifaq>, L<utf8>.
Otherwise you can use C<preprocess> (cf. C<Unicode::Collate>),
or you should decode them beforehand.
154
155=head1 DESCRIPTION
156
This module provides linguistic (locale-specific) tailoring of DUCET,
the Default Unicode Collation Element Table,
taking advantage of C<Unicode::Collate>.
159
160=head2 Constructor
161
162The C<new> method returns a collator object.
163
164A parameter list for the constructor is a hash, which can include
165a special key C<locale> and its value (case-insensitive) standing
166for a Unicode base language code (two or three-letter).
167For example, C<Unicode::Collate::Locale-E<gt>new(locale =E<gt> 'ES')>
168returns a collator tailored for Spanish.
169
170C<$locale_name> may be suffixed with a Unicode script code (four-letter),
171a Unicode region (territory) code, a Unicode language variant code.
172These codes are case-insensitive, and separated with C<'_'> or C<'-'>.
173E.g. C<en_US> for English in USA,
174C<az_Cyrl> for Azerbaijani in the Cyrillic script,
175C<es_ES_traditional> for Spanish in Spain (Traditional).
176
177If C<$locale_name> is not available,
178fallback is selected in the following order:
179
180    1. language with a variant code
181    2. language with a script code
182    3. language with a region code
183    4. language
184    5. default
185
186Tailoring tags provided by C<Unicode::Collate> are allowed as long as
187they are not used for C<locale> support.  Esp. the C<table> tag
188is always untailorable, since it is reserved for DUCET.
189
190However C<entry> is allowed, even if it is used for C<locale> support,
191to add or override mappings.
192
193E.g. a collator for Spanish, which ignores diacritics and case difference
194(i.e. level 1), with reversed case ordering and no normalization.
195
196    Unicode::Collate::Locale->new(
197        level => 1,
198        locale => 'es',
199        upper_before_lower => 1,
200        normalization => undef
201    )
202
203Overriding a behavior already tailored by C<locale> is disallowed
204if such a tailoring is passed to C<new()>.
205
206    Unicode::Collate::Locale->new(
207        locale => 'da',
208        upper_before_lower => 0, # causes error as reserved by 'da'
209    )
210
211However C<change()> inherited from C<Unicode::Collate> allows
212such a tailoring that is reserved by C<locale>. Examples:
213
214    new(locale => 'fr_ca')->change(backwards => undef)
215    new(locale => 'da')->change(upper_before_lower => 0)
216    new(locale => 'ja')->change(overrideCJK => undef)
217
218=head2 Methods
219
220C<Unicode::Collate::Locale> is a subclass of C<Unicode::Collate>
221and methods other than C<new> are inherited from C<Unicode::Collate>.
222
223Here is a list of additional methods:
224
225=over 4
226
227=item C<$Collator-E<gt>getlocale>
228
Returns the locale name that was actually accepted and is used for collation.
If linguistic tailoring is not provided for the locale you passed
(intentionally for some languages, or due to the incomplete implementation),
this method returns the string C<'default'>, meaning no special tailoring.
233
234=item C<$Collator-E<gt>locale_version>
235
236(Since Unicode::Collate::Locale 0.87)
237Returns the version number (perhaps C</\d\.\d\d/>) of the locale, as that
238of F<Locale/*.pl>.
239
240B<Note:> F<Locale/*.pl> that a collator uses should be identified by
241a combination of return values from C<getlocale> and C<locale_version>.
242
243=back
244
245=head2 A list of tailorable locales
246
247      locale name       description
248    --------------------------------------------------------------
249      af                Afrikaans
250      ar                Arabic
251      as                Assamese
252      az                Azerbaijani (Azeri)
253      be                Belarusian
254      bn                Bengali
255      bs                Bosnian (tailored as Croatian)
256      bs_Cyrl           Bosnian in Cyrillic (tailored as Serbian)
257      ca                Catalan
258      cs                Czech
259      cu                Church Slavic
260      cy                Welsh
261      da                Danish
262      de__phonebook     German (umlaut as 'ae', 'oe', 'ue')
263      de_AT_phonebook   Austrian German (umlaut primary greater)
264      dsb               Lower Sorbian
265      ee                Ewe
266      eo                Esperanto
267      es                Spanish
268      es__traditional   Spanish ('ch' and 'll' as a grapheme)
269      et                Estonian
270      fa                Persian
271      fi                Finnish (v and w are primary equal)
272      fi__phonebook     Finnish (v and w as separate characters)
273      fil               Filipino
274      fo                Faroese
275      fr_CA             Canadian French
276      gu                Gujarati
277      ha                Hausa
278      haw               Hawaiian
279      he                Hebrew
280      hi                Hindi
281      hr                Croatian
282      hu                Hungarian
283      hy                Armenian
284      ig                Igbo
285      is                Icelandic
286      ja                Japanese [1]
287      kk                Kazakh
288      kl                Kalaallisut
289      kn                Kannada
290      ko                Korean [2]
291      kok               Konkani
292      lkt               Lakota
293      ln                Lingala
294      lt                Lithuanian
295      lv                Latvian
296      mk                Macedonian
297      ml                Malayalam
298      mr                Marathi
299      mt                Maltese
300      nb                Norwegian Bokmal
301      nn                Norwegian Nynorsk
302      nso               Northern Sotho
303      om                Oromo
304      or                Oriya
305      pa                Punjabi
306      pl                Polish
307      ro                Romanian
308      sa                Sanskrit
309      se                Northern Sami
310      si                Sinhala
311      si__dictionary    Sinhala (U+0DA5 = U+0DA2,0DCA,0DA4)
312      sk                Slovak
313      sl                Slovenian
314      sq                Albanian
315      sr                Serbian
316      sr_Latn           Serbian in Latin (tailored as Croatian)
317      sv                Swedish (v and w are primary equal)
318      sv__reformed      Swedish (v and w as separate characters)
319      ta                Tamil
320      te                Telugu
321      th                Thai
322      tn                Tswana
323      to                Tonga
324      tr                Turkish
325      ug_Cyrl           Uyghur in Cyrillic
326      uk                Ukrainian
327      ur                Urdu
328      vi                Vietnamese
329      vo                Volapu"k
330      wae               Walser
331      wo                Wolof
332      yo                Yoruba
333      zh                Chinese
334      zh__big5han       Chinese (ideographs: big5 order)
335      zh__gb2312han     Chinese (ideographs: GB-2312 order)
336      zh__pinyin        Chinese (ideographs: pinyin order) [3]
337      zh__stroke        Chinese (ideographs: stroke order) [3]
338      zh__zhuyin        Chinese (ideographs: zhuyin order) [3]
339    --------------------------------------------------------------
340
341Locales according to the default UCA rules include
342am (Amharic) without C<[reorder Ethi]>,
343bg (Bulgarian) without C<[reorder Cyrl]>,
344chr (Cherokee) without C<[reorder Cher]>,
345de (German),
346en (English),
347fr (French),
348ga (Irish),
349id (Indonesian),
350it (Italian),
351ka (Georgian) without C<[reorder Geor]>,
352mn (Mongolian) without C<[reorder Cyrl Mong]>,
353ms (Malay),
354nl (Dutch),
355pt (Portuguese),
356ru (Russian) without C<[reorder Cyrl]>,
357sw (Swahili),
358zu (Zulu).
359
360B<Note>
361
362[1] ja: Ideographs are sorted in JIS X 0208 order.
363Fullwidth and halfwidth forms are identical to their regular form.
364The difference between hiragana and katakana is at the 4th level,
365the comparison also requires C<(variable =E<gt> 'Non-ignorable')>,
366and then C<katakana_before_hiragana> has no effect.
367
368[2] ko: Plenty of ideographs are sorted by their reading. Such
369an ideograph is primary (level 1) equal to, and secondary (level 2)
370greater than, the corresponding hangul syllable.
371
372[3] zh__pinyin, zh__stroke and zh__zhuyin: implemented alt='short',
373where a smaller number of ideographs are tailored.
374
375=head2 A list of variant codes and their aliases
376
377      variant code       alias
378    ------------------------------------------
379      dictionary         dict
380      phonebook          phone     phonebk
381      reformed           reform
382      traditional        trad
383    ------------------------------------------
384      big5han            big5
385      gb2312han          gb2312
386      pinyin
387      stroke
388      zhuyin
389    ------------------------------------------
390
391Note: 'pinyin' is Han in Latin, 'zhuyin' is Han in Bopomofo.
392
393=head1 INSTALL
394
Installation of C<Unicode::Collate::Locale> requires F<Collate/Locale.pm>,
F<Collate/Locale/*.pl>, F<Collate/CJK/*.pm> and F<Collate/allkeys.txt>.
397On building, C<Unicode::Collate::Locale> doesn't require
398any of F<data/*.txt>, F<gendata/*>, and F<mklocale>.
399Tests for C<Unicode::Collate::Locale> are named F<t/loc_*.t>.
400
401=head1 CAVEAT
402
403=over 4
404
405=item Tailoring is not maximum
406
Even if a certain letter is tailored, its equivalents are not always
tailored along with it. For example, even though W is tailored,
409fullwidth W (C<U+FF37>), W with acute (C<U+1E82>), etc. are not
410tailored. The result may depend on whether source strings are
411normalized or not, and whether decomposed or composed.
412Thus C<(normalization =E<gt> undef)> is less preferred.
413
414=item Collation reordering is not supported
415
416The order of any groups including scripts is not changed.
417
418=back
419
420=head2 Reference
421
422      locale            based CLDR or other reference
423    --------------------------------------------------------------------
424      af                30 = 1.8.1
425      ar                30 = 28 ("compat" wo [reorder Arab]) = 1.9.0
426      as                30 = 28 (without [reorder Beng..]) = 23
427      az                30 = 24 ("standard" wo [reorder Latn Cyrl])
428      be                30 = 28 (without [reorder Cyrl])
429      bn                30 = 28 ("standard" wo [reorder Beng..]) = 2.0.1
430      bs                30 = 28 (type="standard": [import hr])
431      bs_Cyrl           30 = 28 (type="standard": [import sr])
432      ca                30 = 23 (alt="proposed" type="standard")
433      cs                30 = 1.8.1 (type="standard")
434      cu                34 = 30 (without [reorder Cyrl])
435      cy                30 = 1.8.1
436      da                22.1 = 1.8.1 (type="standard")
437      de__phonebook     30 = 2.0 (type="phonebook")
438      de_AT_phonebook   30 = 27 (type="phonebook")
439      dsb               30 = 26
440      ee                30 = 21
441      eo                30 = 1.8.1
442      es                30 = 1.9.0 (type="standard")
443      es__traditional   30 = 1.8.1 (type="traditional")
444      et                30 = 26
445      fa                22.1 = 1.8.1
446      fi                22.1 = 1.8.1 (type="standard" alt="proposed")
447      fi__phonebook     22.1 = 1.8.1 (type="phonebook")
448      fil               30 = 1.9.0 (type="standard") = 1.8.1
449      fo                22.1 = 1.8.1 (alt="proposed" type="standard")
450      fr_CA             30 = 1.9.0
451      gu                30 = 28 ("standard" wo [reorder Gujr..]) = 1.9.0
452      ha                30 = 1.9.0
453      haw               30 = 24
454      he                30 = 28 (without [reorder Hebr]) = 23
455      hi                30 = 28 (without [reorder Deva..]) = 1.9.0
456      hr                30 = 28 ("standard" wo [reorder Latn Cyrl]) = 1.9.0
457      hu                22.1 = 1.8.1 (alt="proposed" type="standard")
458      hy                30 = 28 (without [reorder Armn]) = 1.8.1
459      ig                30 = 1.8.1
460      is                22.1 = 1.8.1 (type="standard")
461      ja                22.1 = 1.8.1 (type="standard")
462      kk                30 = 28 (without [reorder Cyrl])
463      kl                22.1 = 1.8.1 (type="standard")
464      kn                30 = 28 ("standard" wo [reorder Knda..]) = 1.9.0
465      ko                22.1 = 1.8.1 (type="standard")
466      kok               30 = 28 (without [reorder Deva..]) = 1.8.1
467      lkt               30 = 25
468      ln                30 = 2.0 (type="standard") = 1.8.1
469      lt                22.1 = 1.9.0
470      lv                22.1 = 1.9.0 (type="standard") = 1.8.1
471      mk                30 = 28 (without [reorder Cyrl])
472      ml                22.1 = 1.9.0
473      mr                30 = 28 (without [reorder Deva..]) = 1.8.1
474      mt                22.1 = 1.9.0
475      nb                22.1 = 2.0   (type="standard")
476      nn                22.1 = 2.0   (type="standard")
477      nso           [*] 26 = 1.8.1
478      om                22.1 = 1.8.1
479      or                30 = 28 (without [reorder Orya..]) = 1.9.0
480      pa                22.1 = 1.8.1
481      pl                30 = 1.8.1
482      ro                30 = 1.9.0 (type="standard")
483      sa            [*] 1.9.1 = 1.8.1 (type="standard" alt="proposed")
484      se                22.1 = 1.8.1 (type="standard")
485      si                30 = 28 ("standard" wo [reorder Sinh..]) = 1.9.0
486      si__dictionary    30 = 28 ("dictionary" wo [reorder Sinh..]) = 1.9.0
487      sk                22.1 = 1.9.0 (type="standard")
488      sl                22.1 = 1.8.1 (type="standard" alt="proposed")
489      sq                22.1 = 1.8.1 (alt="proposed" type="standard")
490      sr                30 = 28 (without [reorder Cyrl])
491      sr_Latn           30 = 28 (type="standard": [import hr])
492      sv                22.1 = 1.9.0 (type="standard")
493      sv__reformed      22.1 = 1.8.1 (type="reformed")
494      ta                22.1 = 1.9.0
495      te                30 = 28 (without [reorder Telu..]) = 1.9.0
496      th                22.1 = 22
497      tn            [*] 26 = 1.8.1
498      to                22.1 = 22
499      tr                22.1 = 1.8.1 (type="standard")
500      uk                30 = 28 (without [reorder Cyrl])
501      ug_Cyrl           https://en.wikipedia.org/wiki/Uyghur_Cyrillic_alphabet
502      ur                22.1 = 1.9.0
503      vi                22.1 = 1.8.1
504      vo                30 = 25
505      wae               30 = 2.0
506      wo            [*] 1.9.1 = 1.8.1
507      yo                30 = 1.8.1
508      zh                22.1 = 1.8.1 (type="standard")
509      zh__big5han       22.1 = 1.8.1 (type="big5han")
510      zh__gb2312han     22.1 = 1.8.1 (type="gb2312han")
511      zh__pinyin        22.1 = 2.0   (type='pinyin' alt='short')
512      zh__stroke        22.1 = 1.9.1 (type='stroke' alt='short')
513      zh__zhuyin        22.1 = 22    (type='zhuyin' alt='short')
514    --------------------------------------------------------------------
515
516[*] http://www.unicode.org/repos/cldr/tags/latest/seed/collation/
517
518=head1 AUTHOR
519
520The Unicode::Collate::Locale module for perl was written
521by SADAHIRO Tomoyuki, <SADAHIRO@cpan.org>.
522This module is Copyright(C) 2004-2020, SADAHIRO Tomoyuki. Japan.
523All rights reserved.
524
525This module is free software; you can redistribute it and/or
526modify it under the same terms as Perl itself.
527
528=head1 SEE ALSO
529
530=over 4
531
532=item Unicode Collation Algorithm - UTS #10
533
534L<http://www.unicode.org/reports/tr10/>
535
536=item The Default Unicode Collation Element Table (DUCET)
537
538L<http://www.unicode.org/Public/UCA/latest/allkeys.txt>
539
540=item Unicode Locale Data Markup Language (LDML) - UTS #35
541
542L<http://www.unicode.org/reports/tr35/>
543
544=item CLDR - Unicode Common Locale Data Repository
545
546L<http://cldr.unicode.org/>
547
548=item L<Unicode::Collate>
549
550=item L<Unicode::Normalize>
551
552=back
553
554=cut
555