1package Unicode::Collate::Locale;
2
3use strict;
4use Carp;
5use base qw(Unicode::Collate);
6
7our $VERSION = '1.02';
8
9my $PL_EXT  = '.pl';
10
11my %LocaleFile = map { ($_, $_) } qw(
12   af ar as az be bg bn ca cs cy da ee eo es et fa fi fil fo fr
13   gu ha haw hi hr hu hy ig is ja kk kl kn ko kok ln lt lv
14   mk ml mr mt nb nn nso om or pa pl ro ru sa se si sk sl sq
15   sr sv ta te th tn to tr uk ur vi wae wo yo zh
16);
17   $LocaleFile{'default'} = '';
18# aliases
19   $LocaleFile{'bs'}      = 'hr';
20   $LocaleFile{'bs_Cyrl'} = 'sr';
21   $LocaleFile{'sr_Latn'} = 'hr';
22# short file names
23   $LocaleFile{'de__phonebook'}   = 'de_phone';
24   $LocaleFile{'es__traditional'} = 'es_trad';
25   $LocaleFile{'fi__phonebook'}   = 'fi_phone';
26   $LocaleFile{'si__dictionary'}  = 'si_dict';
27   $LocaleFile{'sv__reformed'}    = 'sv_refo';
28   $LocaleFile{'zh__big5han'}     = 'zh_big5';
29   $LocaleFile{'zh__gb2312han'}   = 'zh_gb';
30   $LocaleFile{'zh__pinyin'}      = 'zh_pin';
31   $LocaleFile{'zh__stroke'}      = 'zh_strk';
32   $LocaleFile{'zh__zhuyin'}      = 'zh_zhu';
33
34my %TypeAlias = qw(
35    phone     phonebook
36    phonebk   phonebook
37    dict      dictionary
38    reform    reformed
39    trad      traditional
40    big5      big5han
41    gb2312    gb2312han
42);
43
44sub _locale {
45    my $locale = shift;
46    if ($locale) {
47	$locale = lc $locale;
48	$locale =~ tr/\-\ \./_/;
49	$locale =~ s/_([0-9a-z]+)\z/$TypeAlias{$1} ?
50				  "_$TypeAlias{$1}" : "_$1"/e;
51	$LocaleFile{$locale} and return $locale;
52
53	my @code = split /_/, $locale;
54	my $lan = shift @code;
55	my $scr = @code && length $code[0] == 4 ? ucfirst shift @code : '';
56	my $reg = @code && length $code[0] <  4 ? uc      shift @code : '';
57	my $var = @code                         ?         shift @code : '';
58
59	my @list;
60	push @list, (
61	    "${lan}_${scr}_${reg}_$var",
62	    "${lan}_${scr}__$var", # empty $scr should not be ${lan}__$var.
63	    "${lan}_${reg}_$var",  # empty $reg may be ${lan}__$var.
64	    "${lan}__$var",
65	) if $var ne '';
66	push @list, (
67	    "${lan}_${scr}_${reg}",
68	    "${lan}_${scr}",
69	    "${lan}_${reg}",
70	     ${lan},
71	);
72	for my $loc (@list) {
73	    $LocaleFile{$loc} and return $loc;
74	}
75    }
76    return 'default';
77}
78
79sub getlocale {
80    return shift->{accepted_locale};
81}
82
83sub locale_version {
84    return shift->{locale_version};
85}
86
87sub _fetchpl {
88    my $accepted = shift;
89    my $f = $LocaleFile{$accepted};
90    return if !$f;
91    $f .= $PL_EXT;
92
93    # allow to search @INC
94#   use File::Spec;
95#   my $path = File::Spec->catfile('Unicode', 'Collate', 'Locale', $f);
96    my $path = "Unicode/Collate/Locale/$f";
97    my $h = do $path;
98    croak "Unicode/Collate/Locale/$f can't be found" if !$h;
99    return $h;
100}
101
102sub new {
103    my $class = shift;
104    my %hash = @_;
105    $hash{accepted_locale} = _locale($hash{locale});
106
107    if (exists $hash{table}) {
108	croak "your table can't be used with Unicode::Collate::Locale";
109    }
110
111    my $href = _fetchpl($hash{accepted_locale});
112    while (my($k,$v) = each %$href) {
113	if (!exists $hash{$k}) {
114	    $hash{$k} = $v;
115	} elsif ($k eq 'entry') {
116	    $hash{$k} = $v.$hash{$k};
117	} else {
118	    croak "$k is reserved by $hash{locale}, can't be overwritten";
119	}
120    }
121    return $class->SUPER::new(%hash);
122}
123
1241;
125__END__
126
127MEMORANDA for developing
128
129locale            based CLDR
130----------------------------------------------------------------------------
131af                22.1 = 1.8.1
132ar                22.1 = 1.9.0
133as                22.1 = 1.8.1
134az                22.1 = 1.8.1 (type="standard")
135be                22.1 = 1.9.0
136bg                22.1 = 1.9.0
137bn                22.1 = 2.0.1 (type="standard")
138bs                22.1 = 1.9.0 (alias source="hr")
139bs_Cyrl           22.1 = 22    (alias source="sr")
140ca                22.1 = 1.8.1 (alt="proposed" type="standard")
141cs                22.1 = 1.8.1 (type="standard")
142cy                22.1 = 1.8.1
143da                22.1 = 1.8.1 (type="standard") [mod aA to pass CLDR test]
144de__phonebook     22.1 = 2.0   (type="phonebook")
145ee                22.1 = 22
146eo                22.1 = 1.8.1
147es                22.1 = 1.9.0 (type="standard")
148es__traditional   22.1 = 1.8.1 (type="traditional")
149et                22.1 = 1.8.1
150fa                22.1 = 1.8.1
151fi                22.1 = 1.8.1 (type="standard" alt="proposed")
152fi__phonebook     22.1 = 1.8.1 (type="phonebook")
153fil               22.1 = 1.9.0 (type="standard") = 1.8.1
154fo                22.1 = 1.8.1 (alt="proposed" type="standard")
155fr                22.1 = 1.9.0 (fr_CA, backwards="on")
156gu                22.1 = 1.9.0 (type="standard")
157ha                22.1 = 1.9.0
158haw               22.1 = 1.8.1
159hi                22.1 = 1.9.0 (type="standard")
160hr                22.1 = 1.9.0 (type="standard")
161hu                22.1 = 1.8.1 (alt="proposed" type="standard")
162hy                22.1 = 1.8.1
163ig                22.1 = 1.8.1
164is                22.1 = 1.8.1 (type="standard")
165ja                22.1 = 1.8.1 (type="standard")
166kk                22.1 = 1.9.0
167kl                22.1 = 1.8.1 (type="standard")
168kn                22.1 = 1.9.0 (type="standard")
169ko                22.1 = 1.8.1 (type="standard")
170kok               22.1 = 1.8.1
171ln                22.1 = 2.0   (type="standard") = 1.8.1
172lt                22.1 = 1.9.0
173lv                22.1 = 1.9.0 (type="standard") = 1.8.1
174mk                22.1 = 1.9.0
175ml                22.1 = 1.9.0
176mr                22.1 = 1.8.1
177mt                22.1 = 1.9.0
178nb                22.1 = 2.0   (type="standard")
179nn                22.1 = 2.0   (type="standard")
180nso               22.1 = 1.8.1
181om                22.1 = 1.8.1
182or                22.1 = 1.9.0
183pa                22.1 = 1.8.1
184pl                22.1 = 1.8.1
185ro                22.1 = 1.9.0 (type="standard")
186ru                22.1 = 1.9.0
187sa                1.9.1 = 1.8.1 (type="standard" alt="proposed") [now /seed]
188se                22.1 = 1.8.1 (type="standard")
189si                22.1 = 1.9.0 (type="standard")
190si__dictionary    22.1 = 1.9.0 (type="dictionary")
191sk                22.1 = 1.9.0 (type="standard")
192sl                22.1 = 1.8.1 (type="standard" alt="proposed")
193sq                22.1 = 1.8.1 (alt="proposed" type="standard")
194sr                22.1 = 1.9.0 (type="standard")
195sr_Latn           22.1 = 1.8.1 (alias source="hr")
196sv                22.1 = 1.9.0 (type="standard")
197sv__reformed      22.1 = 1.8.1 (type="reformed")
198ta                22.1 = 1.9.0
199te                22.1 = 1.9.0
200th                22.1 = 22
201tn                22.1 = 1.8.1
202to                22.1 = 22
203tr                22.1 = 1.8.1 (type="standard")
204uk                22.1 = 21
205ur                22.1 = 1.9.0
206vi                22.1 = 1.8.1
207wae               22.1 = 2.0
208wo                1.9.1 = 1.8.1 [now /seed]
209yo                22.1 = 1.8.1
210zh                22.1 = 1.8.1 (type="standard")
211zh__big5han       22.1 = 1.8.1 (type="big5han")
212zh__gb2312han     22.1 = 1.8.1 (type="gb2312han")
213zh__pinyin        22.1 = 2.0   (type='pinyin' alt='short')
214zh__stroke        22.1 = 1.9.1 (type='stroke' alt='short')
215zh__zhuyin        22.1 = 22    (type='zhuyin' alt='short')
216----------------------------------------------------------------------------
217
218=head1 NAME
219
220Unicode::Collate::Locale - Linguistic tailoring for DUCET via Unicode::Collate
221
222=head1 SYNOPSIS
223
224  use Unicode::Collate::Locale;
225
226  #construct
227  $Collator = Unicode::Collate::Locale->
228      new(locale => $locale_name, %tailoring);
229
230  #sort
231  @sorted = $Collator->sort(@not_sorted);
232
233  #compare
234  $result = $Collator->cmp($a, $b); # returns 1, 0, or -1.
235
236B<Note:> Strings in C<@not_sorted>, C<$a> and C<$b> are interpreted
237according to Perl's Unicode support. See L<perlunicode>,
238L<perluniintro>, L<perlunitut>, L<perlunifaq>, L<utf8>.
239Otherwise you can use C<preprocess> (cf. C<Unicode::Collate>)
240or should decode them before.
241
242=head1 DESCRIPTION
243
244This module provides linguistic tailoring for it
245taking advantage of C<Unicode::Collate>.
246
247=head2 Constructor
248
249The C<new> method returns a collator object.
250
251A parameter list for the constructor is a hash, which can include
252a special key C<locale> and its value (case-insensitive) standing
253for a Unicode base language code (two or three-letter).
254For example, C<Unicode::Collate::Locale-E<gt>new(locale =E<gt> 'FR')>
255returns a collator tailored for French.
256
257C<$locale_name> may be suffixed with a Unicode script code (four-letter),
258a Unicode region code, a Unicode language variant code. These codes are
259case-insensitive, and separated with C<'_'> or C<'-'>.
260E.g. C<en_US> for English in USA,
261C<az_Cyrl> for Azerbaijani in the Cyrillic script,
262C<es_ES_traditional> for Spanish in Spain (Traditional).
263
264If C<$locale_name> is not available,
265fallback is selected in the following order:
266
267    1. language with a variant code
268    2. language with a script code
269    3. language with a region code
270    4. language
271    5. default
272
273Tailoring tags provided by C<Unicode::Collate> are allowed as long as
274they are not used for C<locale> support.  Esp. the C<table> tag
275is always untailorable, since it is reserved for DUCET.
276
277However C<entry> is allowed, even if it is used for C<locale> support,
278to add or override mappings.
279
280E.g. a collator for French, which ignores diacritics and case difference
281(i.e. level 1), with reversed case ordering and no normalization.
282
283    Unicode::Collate::Locale->new(
284        level => 1,
285        locale => 'fr',
286        upper_before_lower => 1,
287        normalization => undef
288    )
289
290Overriding a behavior already tailored by C<locale> is disallowed
291if such a tailoring is passed to C<new()>.
292
293    Unicode::Collate::Locale->new(
294        locale => 'da',
295        upper_before_lower => 0, # causes error as reserved by 'da'
296    )
297
298However C<change()> inherited from C<Unicode::Collate> allows
299such a tailoring that is reserved by C<locale>. Examples:
300
301    new(locale => 'ca')->change(backwards => undef)
302    new(locale => 'da')->change(upper_before_lower => 0)
303    new(locale => 'ja')->change(overrideCJK => undef)
304
305=head2 Methods
306
307C<Unicode::Collate::Locale> is a subclass of C<Unicode::Collate>
308and methods other than C<new> are inherited from C<Unicode::Collate>.
309
310Here is a list of additional methods:
311
312=over 4
313
314=item C<$Collator-E<gt>getlocale>
315
316Returns a language code accepted and used actually on collation.
317If linguistic tailoring is not provided for a language code you passed
318(intensionally for some languages, or due to the incomplete implementation),
319this method returns a string C<'default'> meaning no special tailoring.
320
321=item C<$Collator-E<gt>locale_version>
322
323(Since Unicode::Collate::Locale 0.87)
324Returns the version number (perhaps C</\d\.\d\d/>) of the locale, as that
325of F<Locale/*.pl>.
326
327B<Note:> F<Locale/*.pl> that a collator uses should be identified by
328a combination of return values from C<getlocale> and C<locale_version>.
329
330=back
331
332=head2 A list of tailorable locales
333
334      locale name       description
335    --------------------------------------------------------------
336      af                Afrikaans
337      ar                Arabic
338      as                Assamese
339      az                Azerbaijani (Azeri)
340      be                Belarusian
341      bg                Bulgarian
342      bn                Bengali
343      bs                Bosnian
344      bs_Cyrl           Bosnian in Cyrillic (tailored as Serbian)
345      ca                Catalan
346      cs                Czech
347      cy                Welsh
348      da                Danish
349      de__phonebook     German (umlaut as 'ae', 'oe', 'ue')
350      ee                Ewe
351      eo                Esperanto
352      es                Spanish
353      es__traditional   Spanish ('ch' and 'll' as a grapheme)
354      et                Estonian
355      fa                Persian
356      fi                Finnish (v and w are primary equal)
357      fi__phonebook     Finnish (v and w as separate characters)
358      fil               Filipino
359      fo                Faroese
360      fr                French
361      gu                Gujarati
362      ha                Hausa
363      haw               Hawaiian
364      hi                Hindi
365      hr                Croatian
366      hu                Hungarian
367      hy                Armenian
368      ig                Igbo
369      is                Icelandic
370      ja                Japanese [1]
371      kk                Kazakh
372      kl                Kalaallisut
373      kn                Kannada
374      ko                Korean [2]
375      kok               Konkani
376      ln                Lingala
377      lt                Lithuanian
378      lv                Latvian
379      mk                Macedonian
380      ml                Malayalam
381      mr                Marathi
382      mt                Maltese
383      nb                Norwegian Bokmal
384      nn                Norwegian Nynorsk
385      nso               Northern Sotho
386      om                Oromo
387      or                Oriya
388      pa                Punjabi
389      pl                Polish
390      ro                Romanian
391      ru                Russian
392      sa                Sanskrit
393      se                Northern Sami
394      si                Sinhala
395      si__dictionary    Sinhala (U+0DA5 = U+0DA2,0DCA,0DA4)
396      sk                Slovak
397      sl                Slovenian
398      sq                Albanian
399      sr                Serbian
400      sr_Latn           Serbian in Latin (tailored as Croatian)
401      sv                Swedish (v and w are primary equal)
402      sv__reformed      Swedish (v and w as separate characters)
403      ta                Tamil
404      te                Telugu
405      th                Thai
406      tn                Tswana
407      to                Tonga
408      tr                Turkish
409      uk                Ukrainian
410      ur                Urdu
411      vi                Vietnamese
412      wae               Walser
413      wo                Wolof
414      yo                Yoruba
415      zh                Chinese
416      zh__big5han       Chinese (ideographs: big5 order)
417      zh__gb2312han     Chinese (ideographs: GB-2312 order)
418      zh__pinyin        Chinese (ideographs: pinyin order) [3]
419      zh__stroke        Chinese (ideographs: stroke order) [3]
420      zh__zhuyin        Chinese (ideographs: zhuyin order) [3]
421    --------------------------------------------------------------
422
423Locales according to the default UCA rules include
424chr (Cherokee),
425de (German),
426en (English),
427ga (Irish),
428id (Indonesian),
429it (Italian),
430ka (Georgian),
431ms (Malay),
432nl (Dutch),
433pt (Portuguese),
434st (Southern Sotho),
435sw (Swahili),
436xh (Xhosa),
437zu (Zulu).
438
439B<Note>
440
441[1] ja: Ideographs are sorted in JIS X 0208 order.
442Fullwidth and halfwidth forms are identical to their regular form.
443The difference between hiragana and katakana is at the 4th level,
444the comparison also requires C<(variable =E<gt> 'Non-ignorable')>,
445and then C<katakana_before_hiragana> has no effect.
446
447[2] ko: Plenty of ideographs are sorted by their reading. Such
448an ideograph is primary (level 1) equal to, and secondary (level 2)
449greater than, the corresponding hangul syllable.
450
451[3] zh__pinyin, zh__stroke and zh__zhuyin: implemented alt='short',
452where a smaller number of ideographs are tailored.
453
454Note: 'pinyin' is in latin, 'zhuyin' is in bopomofo.
455
456=head1 INSTALL
457
458Installation of C<Unicode::Collate::Locale> requires F<Collate/Locale.pm>,
459F<Collate/Locale/*.pm>, F<Collate/CJK/*.pm> and F<Collate/allkeys.txt>.
460On building, C<Unicode::Collate::Locale> doesn't require any of F<data/*.txt>,
461F<gendata/*>, and F<mklocale>.
462Tests for C<Unicode::Collate::Locale> are named F<t/loc_*.t>.
463
464=head1 CAVEAT
465
466=over 4
467
468=item tailoring is not maximum
469
470Even if a certain letter is tailored, its equivalent would not always
471tailored as well as it. For example, even though W is tailored,
472fullwidth W (C<U+FF37>), W with acute (C<U+1E82>), etc. are not
473tailored. The result may depend on whether source strings are
474normalized or not, and whether decomposed or composed.
475Thus C<(normalization =E<gt> undef)> is less preferred.
476
477=back
478
479=head1 AUTHOR
480
481The Unicode::Collate::Locale module for perl was written
482by SADAHIRO Tomoyuki, <SADAHIRO@cpan.org>.
483This module is Copyright(C) 2004-2013, SADAHIRO Tomoyuki. Japan.
484All rights reserved.
485
486This module is free software; you can redistribute it and/or
487modify it under the same terms as Perl itself.
488
489=head1 SEE ALSO
490
491=over 4
492
493=item Unicode Collation Algorithm - UTS #10
494
495L<http://www.unicode.org/reports/tr10/>
496
497=item The Default Unicode Collation Element Table (DUCET)
498
499L<http://www.unicode.org/Public/UCA/latest/allkeys.txt>
500
501=item Unicode Locale Data Markup Language (LDML) - UTS #35
502
503L<http://www.unicode.org/reports/tr35/>
504
505=item CLDR - Unicode Common Locale Data Repository
506
507L<http://cldr.unicode.org/>
508
509=item L<Unicode::Collate>
510
511=item L<Unicode::Normalize>
512
513=back
514
515=cut
516