xref: /openbsd/gnu/usr.bin/perl/lib/Unicode/UCD.pm (revision 17df1aa7)
1package Unicode::UCD;
2
3use strict;
4use warnings;
5
6our $VERSION = '0.27';
7
8use Storable qw(dclone);
9
10require Exporter;
11
12our @ISA = qw(Exporter);
13
14our @EXPORT_OK = qw(charinfo
15		    charblock charscript
16		    charblocks charscripts
17		    charinrange
18		    general_categories bidi_types
19		    compexcl
20		    casefold casespec
21		    namedseq);
22
23use Carp;
24
25=head1 NAME
26
27Unicode::UCD - Unicode character database
28
29=head1 SYNOPSIS
30
31    use Unicode::UCD 'charinfo';
32    my $charinfo   = charinfo($codepoint);
33
34    use Unicode::UCD 'casefold';
35    my $casefold = casefold(0xFB00);
36
37    use Unicode::UCD 'casespec';
38    my $casespec = casespec(0xFB00);
39
40    use Unicode::UCD 'charblock';
41    my $charblock  = charblock($codepoint);
42
43    use Unicode::UCD 'charscript';
44    my $charscript = charscript($codepoint);
45
46    use Unicode::UCD 'charblocks';
47    my $charblocks = charblocks();
48
49    use Unicode::UCD 'charscripts';
50    my $charscripts = charscripts();
51
52    use Unicode::UCD qw(charscript charinrange);
53    my $range = charscript($script);
54    print "looks like $script\n" if charinrange($range, $codepoint);
55
56    use Unicode::UCD qw(general_categories bidi_types);
57    my $categories = general_categories();
58    my $types = bidi_types();
59
60    use Unicode::UCD 'compexcl';
61    my $compexcl = compexcl($codepoint);
62
63    use Unicode::UCD 'namedseq';
64    my $namedseq = namedseq($named_sequence_name);
65
66    my $unicode_version = Unicode::UCD::UnicodeVersion();
67
68=head1 DESCRIPTION
69
70The Unicode::UCD module offers a series of functions that
71provide a simple interface to the Unicode
72Character Database.
73
74=head2 code point argument
75
76Some of the functions are called with a I<code point argument>, which is either
77a decimal or a hexadecimal scalar designating a Unicode code point, or C<U+>
78followed by hexadecimals designating a Unicode code point.  In other words, if
79you want a code point to be interpreted as a hexadecimal number, you must
80prefix it with either C<0x> or C<U+>, because a string like e.g. C<123> will be
81interpreted as a decimal code point.  Also note that Unicode is B<not> limited
82to 16 bits (the number of Unicode code points is open-ended, in theory
83unlimited): you may have more than 4 hexdigits.
84=cut
85
86my $UNICODEFH;
87my $BLOCKSFH;
88my $SCRIPTSFH;
89my $VERSIONFH;
90my $COMPEXCLFH;
91my $CASEFOLDFH;
92my $CASESPECFH;
93my $NAMEDSEQFH;
94
95sub openunicode {
96    my ($rfh, @path) = @_;
97    my $f;
98    unless (defined $$rfh) {
99	for my $d (@INC) {
100	    use File::Spec;
101	    $f = File::Spec->catfile($d, "unicore", @path);
102	    last if open($$rfh, $f);
103	    undef $f;
104	}
105	croak __PACKAGE__, ": failed to find ",
106              File::Spec->catfile(@path), " in @INC"
107	    unless defined $f;
108    }
109    return $f;
110}
111
112=head2 B<charinfo()>
113
114    use Unicode::UCD 'charinfo';
115
116    my $charinfo = charinfo(0x41);
117
118This returns information about the input L</code point argument>
119as a reference to a hash of fields as defined by the Unicode
120standard.  If the L</code point argument> is not assigned in the standard
121(i.e., has the general category C<Cn> meaning C<Unassigned>)
122or is a non-character (meaning it is guaranteed to never be assigned in
123the standard),
124B<undef> is returned.
125
126Fields that aren't applicable to the particular code point argument exist in the
127returned hash, and are empty.
128
129The keys in the hash with the meanings of their values are:
130
131=over
132
133=item B<code>
134
135the input L</code point argument> expressed in hexadecimal, with leading zeros
136added if necessary to make it contain at least four hexdigits
137
138=item B<name>
139
140name of I<code>, all IN UPPER CASE.
141Some control-type code points do not have names.
142This field will be empty for C<Surrogate> and C<Private Use> code points,
143and for the others without a name,
144it will contain a description enclosed in angle brackets, like
145C<E<lt>controlE<gt>>.
146
147
148=item B<category>
149
150The short name of the general category of I<code>.
151This will match one of the keys in the hash returned by L</general_categories()>.
152
153=item B<combining>
154
155the combining class number for I<code> used in the Canonical Ordering Algorithm.
156For Unicode 5.1, this is described in Section 3.11 C<Canonical Ordering Behavior>
157available at
158L<http://www.unicode.org/versions/Unicode5.1.0/>
159
160=item B<bidi>
161
162bidirectional type of I<code>.
163This will match one of the keys in the hash returned by L</bidi_types()>.
164
165=item B<decomposition>
166
167is empty if I<code> has no decomposition; or is one or more codes
168(separated by spaces) that taken in order represent a decomposition for
169I<code>.  Each has at least four hexdigits.
170The codes may be preceded by a word enclosed in angle brackets then a space,
171like C<E<lt>compatE<gt> >, giving the type of decomposition
172
173=item B<decimal>
174
175if I<code> is a decimal digit this is its integer numeric value
176
177=item B<digit>
178
179if I<code> represents a whole number, this is its integer numeric value
180
181=item B<numeric>
182
183if I<code> represents a whole or rational number, this is its numeric value.
184Rational values are expressed as a string like C<1/4>.
185
186=item B<mirrored>
187
188C<Y> or C<N> designating if I<code> is mirrored in bidirectional text
189
190=item B<unicode10>
191
192name of I<code> in the Unicode 1.0 standard if one
193existed for this code point and is different from the current name
194
195=item B<comment>
196
197ISO 10646 comment field.
198It appears in parentheses in the ISO 10646 names list,
199or contains an asterisk to indicate there is
200a note for this code point in Annex P of that standard.
201
202=item B<upper>
203
204is empty if there is no single code point uppercase mapping for I<code>;
205otherwise it is that mapping expressed as at least four hexdigits.
206(L</casespec()> should be used in addition to B<charinfo()>
207for case mappings when the calling program can cope with multiple code point
208mappings.)
209
210=item B<lower>
211
212is empty if there is no single code point lowercase mapping for I<code>;
213otherwise it is that mapping expressed as at least four hexdigits.
214(L</casespec()> should be used in addition to B<charinfo()>
215for case mappings when the calling program can cope with multiple code point
216mappings.)
217
218=item B<title>
219
220is empty if there is no single code point titlecase mapping for I<code>;
221otherwise it is that mapping expressed as at least four hexdigits.
222(L</casespec()> should be used in addition to B<charinfo()>
223for case mappings when the calling program can cope with multiple code point
224mappings.)
225
226=item B<block>
227
228block I<code> belongs to (used in \p{In...}).
229See L</Blocks versus Scripts>.
230
231
232=item B<script>
233
234script I<code> belongs to.
235See L</Blocks versus Scripts>.
236
237=back
238
239Note that you cannot do (de)composition and casing based solely on the
240I<decomposition>, I<combining>, I<lower>, I<upper>, and I<title> fields;
241you will need also the L</compexcl()>, and L</casespec()> functions.
242
243=cut
244
245# NB: This function is duplicated in charnames.pm
246sub _getcode {
247    my $arg = shift;
248
249    if ($arg =~ /^[1-9]\d*$/) {
250	return $arg;
251    } elsif ($arg =~ /^(?:[Uu]\+|0[xX])?([[:xdigit:]]+)$/) {
252	return hex($1);
253    }
254
255    return;
256}
257
258# Lingua::KO::Hangul::Util not part of the standard distribution
259# but it will be used if available.
260
261eval { require Lingua::KO::Hangul::Util };
262my $hasHangulUtil = ! $@;
263if ($hasHangulUtil) {
264    Lingua::KO::Hangul::Util->import();
265}
266
267sub hangul_decomp { # internal: called from charinfo
268    if ($hasHangulUtil) {
269	my @tmp = decomposeHangul(shift);
270	return sprintf("%04X %04X",      @tmp) if @tmp == 2;
271	return sprintf("%04X %04X %04X", @tmp) if @tmp == 3;
272    }
273    return;
274}
275
276sub hangul_charname { # internal: called from charinfo
277    return sprintf("HANGUL SYLLABLE-%04X", shift);
278}
279
280sub han_charname { # internal: called from charinfo
281    return sprintf("CJK UNIFIED IDEOGRAPH-%04X", shift);
282}
283
284# Overwritten by data in file
285my %first_last = (
286   'CJK Ideograph Extension A' => [ 0x3400,   0x4DB5   ],
287   'CJK Ideograph'             => [ 0x4E00,   0x9FA5   ],
288   'CJK Ideograph Extension B' => [ 0x20000,  0x2A6D6  ],
289);
290
291get_charinfo_ranges();
292
293sub get_charinfo_ranges {
294   my @blocks = keys %first_last;
295
296   my $fh;
297   openunicode( \$fh, 'UnicodeData.txt' );
298   if( defined $fh ){
299      while( my $line = <$fh> ){
300         next unless $line =~ /(?:First|Last)/;
301         if( grep{ $line =~ /[^;]+;<$_\s*,\s*(?:First|Last)>/ }@blocks ){
302            my ($number,$block,$type);
303            ($number,$block) = split /;/, $line;
304            $block =~ s/<|>//g;
305            ($block,$type) = split /, /, $block;
306            my $index = $type eq 'First' ? 0 : 1;
307            $first_last{ $block }->[$index] = hex $number;
308         }
309      }
310   }
311}
312
313my @CharinfoRanges = (
314# block name
315# [ first, last, coderef to name, coderef to decompose ],
316# CJK Ideographs Extension A
317  [ @{ $first_last{'CJK Ideograph Extension A'} },        \&han_charname,   undef  ],
318# CJK Ideographs
319  [ @{ $first_last{'CJK Ideograph'} },                    \&han_charname,   undef  ],
320# Hangul Syllables
321  [ 0xAC00,   0xD7A3,   $hasHangulUtil ? \&getHangulName : \&hangul_charname,  \&hangul_decomp ],
322# Non-Private Use High Surrogates
323  [ 0xD800,   0xDB7F,   undef,   undef  ],
324# Private Use High Surrogates
325  [ 0xDB80,   0xDBFF,   undef,   undef  ],
326# Low Surrogates
327  [ 0xDC00,   0xDFFF,   undef,   undef  ],
328# The Private Use Area
329  [ 0xE000,   0xF8FF,   undef,   undef  ],
330# CJK Ideographs Extension B
331  [ @{ $first_last{'CJK Ideograph Extension B'} },        \&han_charname,   undef  ],
332# Plane 15 Private Use Area
333  [ 0xF0000,  0xFFFFD,  undef,   undef  ],
334# Plane 16 Private Use Area
335  [ 0x100000, 0x10FFFD, undef,   undef  ],
336);
337
338sub charinfo {
339    my $arg  = shift;
340    my $code = _getcode($arg);
341    croak __PACKAGE__, "::charinfo: unknown code '$arg'"
342	unless defined $code;
343    my $hexk = sprintf("%06X", $code);
344    my($rcode,$rname,$rdec);
345    foreach my $range (@CharinfoRanges){
346      if ($range->[0] <= $code && $code <= $range->[1]) {
347        $rcode = $hexk;
348	$rcode =~ s/^0+//;
349	$rcode =  sprintf("%04X", hex($rcode));
350        $rname = $range->[2] ? $range->[2]->($code) : '';
351        $rdec  = $range->[3] ? $range->[3]->($code) : '';
352        $hexk  = sprintf("%06X", $range->[0]); # replace by the first
353        last;
354      }
355    }
356    openunicode(\$UNICODEFH, "UnicodeData.txt");
357    if (defined $UNICODEFH) {
358	use Search::Dict 1.02;
359	if (look($UNICODEFH, "$hexk;", { xfrm => sub { $_[0] =~ /^([^;]+);(.+)/; sprintf "%06X;$2", hex($1) } } ) >= 0) {
360	    my $line = <$UNICODEFH>;
361	    return unless defined $line;
362	    chomp $line;
363	    my %prop;
364	    @prop{qw(
365		     code name category
366		     combining bidi decomposition
367		     decimal digit numeric
368		     mirrored unicode10 comment
369		     upper lower title
370		    )} = split(/;/, $line, -1);
371	    $hexk =~ s/^0+//;
372	    $hexk =  sprintf("%04X", hex($hexk));
373	    if ($prop{code} eq $hexk) {
374		$prop{block}  = charblock($code);
375		$prop{script} = charscript($code);
376		if(defined $rname){
377                    $prop{code} = $rcode;
378                    $prop{name} = $rname;
379                    $prop{decomposition} = $rdec;
380                }
381		return \%prop;
382	    }
383	}
384    }
385    return;
386}
387
388sub _search { # Binary search in a [[lo,hi,prop],[...],...] table.
389    my ($table, $lo, $hi, $code) = @_;
390
391    return if $lo > $hi;
392
393    my $mid = int(($lo+$hi) / 2);
394
395    if ($table->[$mid]->[0] < $code) {
396	if ($table->[$mid]->[1] >= $code) {
397	    return $table->[$mid]->[2];
398	} else {
399	    _search($table, $mid + 1, $hi, $code);
400	}
401    } elsif ($table->[$mid]->[0] > $code) {
402	_search($table, $lo, $mid - 1, $code);
403    } else {
404	return $table->[$mid]->[2];
405    }
406}
407
408sub charinrange {
409    my ($range, $arg) = @_;
410    my $code = _getcode($arg);
411    croak __PACKAGE__, "::charinrange: unknown code '$arg'"
412	unless defined $code;
413    _search($range, 0, $#$range, $code);
414}
415
416=head2 B<charblock()>
417
418    use Unicode::UCD 'charblock';
419
420    my $charblock = charblock(0x41);
421    my $charblock = charblock(1234);
422    my $charblock = charblock(0x263a);
423    my $charblock = charblock("U+263a");
424
425    my $range     = charblock('Armenian');
426
427With a L</code point argument> charblock() returns the I<block> the code point
428belongs to, e.g.  C<Basic Latin>.
429If the code point is unassigned, this returns the block it would belong to if
430it were assigned (which it may in future versions of the Unicode Standard).
431
432See also L</Blocks versus Scripts>.
433
434If supplied with an argument that can't be a code point, charblock() tries
435to do the opposite and interpret the argument as a code point block. The
436return value is a I<range>: an anonymous list of lists that contain
437I<start-of-range>, I<end-of-range> code point pairs. You can test whether
438a code point is in a range using the L</charinrange()> function. If the
439argument is not a known code point block, B<undef> is returned.
440
441=cut
442
443my @BLOCKS;
444my %BLOCKS;
445
446sub _charblocks {
447    unless (@BLOCKS) {
448	if (openunicode(\$BLOCKSFH, "Blocks.txt")) {
449	    local $_;
450	    while (<$BLOCKSFH>) {
451		if (/^([0-9A-F]+)\.\.([0-9A-F]+);\s+(.+)/) {
452		    my ($lo, $hi) = (hex($1), hex($2));
453		    my $subrange = [ $lo, $hi, $3 ];
454		    push @BLOCKS, $subrange;
455		    push @{$BLOCKS{$3}}, $subrange;
456		}
457	    }
458	    close($BLOCKSFH);
459	}
460    }
461}
462
463sub charblock {
464    my $arg = shift;
465
466    _charblocks() unless @BLOCKS;
467
468    my $code = _getcode($arg);
469
470    if (defined $code) {
471	_search(\@BLOCKS, 0, $#BLOCKS, $code);
472    } else {
473	if (exists $BLOCKS{$arg}) {
474	    return dclone $BLOCKS{$arg};
475	} else {
476	    return;
477	}
478    }
479}
480
481=head2 B<charscript()>
482
483    use Unicode::UCD 'charscript';
484
485    my $charscript = charscript(0x41);
486    my $charscript = charscript(1234);
487    my $charscript = charscript("U+263a");
488
489    my $range      = charscript('Thai');
490
491With a L</code point argument> charscript() returns the I<script> the
492code point belongs to, e.g.  C<Latin>, C<Greek>, C<Han>.
493If the code point is unassigned, it returns B<undef>
494
495If supplied with an argument that can't be a code point, charscript() tries
496to do the opposite and interpret the argument as a code point script. The
497return value is a I<range>: an anonymous list of lists that contain
498I<start-of-range>, I<end-of-range> code point pairs. You can test whether a
499code point is in a range using the L</charinrange()> function. If the
500argument is not a known code point script, B<undef> is returned.
501
502See also L</Blocks versus Scripts>.
503
504=cut
505
506my @SCRIPTS;
507my %SCRIPTS;
508
509sub _charscripts {
510    unless (@SCRIPTS) {
511	if (openunicode(\$SCRIPTSFH, "Scripts.txt")) {
512	    local $_;
513	    while (<$SCRIPTSFH>) {
514		if (/^([0-9A-F]+)(?:\.\.([0-9A-F]+))?\s+;\s+(\w+)/) {
515		    my ($lo, $hi) = (hex($1), $2 ? hex($2) : hex($1));
516		    my $script = lc($3);
517		    $script =~ s/\b(\w)/uc($1)/ge;
518		    my $subrange = [ $lo, $hi, $script ];
519		    push @SCRIPTS, $subrange;
520		    push @{$SCRIPTS{$script}}, $subrange;
521		}
522	    }
523	    close($SCRIPTSFH);
524	    @SCRIPTS = sort { $a->[0] <=> $b->[0] } @SCRIPTS;
525	}
526    }
527}
528
529sub charscript {
530    my $arg = shift;
531
532    _charscripts() unless @SCRIPTS;
533
534    my $code = _getcode($arg);
535
536    if (defined $code) {
537	_search(\@SCRIPTS, 0, $#SCRIPTS, $code);
538    } else {
539	if (exists $SCRIPTS{$arg}) {
540	    return dclone $SCRIPTS{$arg};
541	} else {
542	    return;
543	}
544    }
545}
546
547=head2 B<charblocks()>
548
549    use Unicode::UCD 'charblocks';
550
551    my $charblocks = charblocks();
552
553charblocks() returns a reference to a hash with the known block names
554as the keys, and the code point ranges (see L</charblock()>) as the values.
555
556See also L</Blocks versus Scripts>.
557
558=cut
559
560sub charblocks {
561    _charblocks() unless %BLOCKS;
562    return dclone \%BLOCKS;
563}
564
565=head2 B<charscripts()>
566
567    use Unicode::UCD 'charscripts';
568
569    my $charscripts = charscripts();
570
571charscripts() returns a reference to a hash with the known script
572names as the keys, and the code point ranges (see L</charscript()>) as
573the values.
574
575See also L</Blocks versus Scripts>.
576
577=cut
578
579sub charscripts {
580    _charscripts() unless %SCRIPTS;
581    return dclone \%SCRIPTS;
582}
583
584=head2 B<charinrange()>
585
586In addition to using the C<\p{In...}> and C<\P{In...}> constructs, you
587can also test whether a code point is in the I<range> as returned by
588L</charblock()> and L</charscript()> or as the values of the hash returned
589by L</charblocks()> and L</charscripts()> by using charinrange():
590
591    use Unicode::UCD qw(charscript charinrange);
592
593    $range = charscript('Hiragana');
594    print "looks like hiragana\n" if charinrange($range, $codepoint);
595
596=cut
597
598my %GENERAL_CATEGORIES =
599 (
600    'L'  =>         'Letter',
601    'LC' =>         'CasedLetter',
602    'Lu' =>         'UppercaseLetter',
603    'Ll' =>         'LowercaseLetter',
604    'Lt' =>         'TitlecaseLetter',
605    'Lm' =>         'ModifierLetter',
606    'Lo' =>         'OtherLetter',
607    'M'  =>         'Mark',
608    'Mn' =>         'NonspacingMark',
609    'Mc' =>         'SpacingMark',
610    'Me' =>         'EnclosingMark',
611    'N'  =>         'Number',
612    'Nd' =>         'DecimalNumber',
613    'Nl' =>         'LetterNumber',
614    'No' =>         'OtherNumber',
615    'P'  =>         'Punctuation',
616    'Pc' =>         'ConnectorPunctuation',
617    'Pd' =>         'DashPunctuation',
618    'Ps' =>         'OpenPunctuation',
619    'Pe' =>         'ClosePunctuation',
620    'Pi' =>         'InitialPunctuation',
621    'Pf' =>         'FinalPunctuation',
622    'Po' =>         'OtherPunctuation',
623    'S'  =>         'Symbol',
624    'Sm' =>         'MathSymbol',
625    'Sc' =>         'CurrencySymbol',
626    'Sk' =>         'ModifierSymbol',
627    'So' =>         'OtherSymbol',
628    'Z'  =>         'Separator',
629    'Zs' =>         'SpaceSeparator',
630    'Zl' =>         'LineSeparator',
631    'Zp' =>         'ParagraphSeparator',
632    'C'  =>         'Other',
633    'Cc' =>         'Control',
634    'Cf' =>         'Format',
635    'Cs' =>         'Surrogate',
636    'Co' =>         'PrivateUse',
637    'Cn' =>         'Unassigned',
638 );
639
640sub general_categories {
641    return dclone \%GENERAL_CATEGORIES;
642}
643
644=head2 B<general_categories()>
645
646    use Unicode::UCD 'general_categories';
647
648    my $categories = general_categories();
649
650This returns a reference to a hash which has short
651general category names (such as C<Lu>, C<Nd>, C<Zs>, C<S>) as keys and long
652names (such as C<UppercaseLetter>, C<DecimalNumber>, C<SpaceSeparator>,
653C<Symbol>) as values.  The hash is reversible in case you need to go
654from the long names to the short names.  The general category is the
655one returned from
656L</charinfo()> under the C<category> key.
657
658=cut
659
660my %BIDI_TYPES =
661 (
662   'L'   => 'Left-to-Right',
663   'LRE' => 'Left-to-Right Embedding',
664   'LRO' => 'Left-to-Right Override',
665   'R'   => 'Right-to-Left',
666   'AL'  => 'Right-to-Left Arabic',
667   'RLE' => 'Right-to-Left Embedding',
668   'RLO' => 'Right-to-Left Override',
669   'PDF' => 'Pop Directional Format',
670   'EN'  => 'European Number',
671   'ES'  => 'European Number Separator',
672   'ET'  => 'European Number Terminator',
673   'AN'  => 'Arabic Number',
674   'CS'  => 'Common Number Separator',
675   'NSM' => 'Non-Spacing Mark',
676   'BN'  => 'Boundary Neutral',
677   'B'   => 'Paragraph Separator',
678   'S'   => 'Segment Separator',
679   'WS'  => 'Whitespace',
680   'ON'  => 'Other Neutrals',
681 );
682
683=head2 B<bidi_types()>
684
685    use Unicode::UCD 'bidi_types';
686
687    my $categories = bidi_types();
688
689This returns a reference to a hash which has the short
690bidi (bidirectional) type names (such as C<L>, C<R>) as keys and long
691names (such as C<Left-to-Right>, C<Right-to-Left>) as values.  The
692hash is reversible in case you need to go from the long names to the
693short names.  The bidi type is the one returned from
694L</charinfo()>
695under the C<bidi> key.  For the exact meaning of the various bidi classes
696the Unicode TR9 is recommended reading:
697L<http://www.unicode.org/reports/tr9/>
698(as of Unicode 5.0.0)
699
700=cut
701
702sub bidi_types {
703    return dclone \%BIDI_TYPES;
704}
705
706=head2 B<compexcl()>
707
708    use Unicode::UCD 'compexcl';
709
710    my $compexcl = compexcl(0x09dc);
711
712This returns B<true> if the
713L</code point argument> should not be produced by composition normalization,
714B<AND> if that fact is not otherwise determinable from the Unicode data base.
715It currently does not return B<true> if the code point has a decomposition
716consisting of another single code point, nor if its decomposition starts
717with a code point whose combining class is non-zero.  Code points that meet
718either of these conditions should also not be produced by composition
719normalization.
720
721It returns B<false> otherwise.
722
723=cut
724
725my %COMPEXCL;
726
727sub _compexcl {
728    unless (%COMPEXCL) {
729	if (openunicode(\$COMPEXCLFH, "CompositionExclusions.txt")) {
730	    local $_;
731	    while (<$COMPEXCLFH>) {
732		if (/^([0-9A-F]+)\s+\#\s+/) {
733		    my $code = hex($1);
734		    $COMPEXCL{$code} = undef;
735		}
736	    }
737	    close($COMPEXCLFH);
738	}
739    }
740}
741
742sub compexcl {
743    my $arg  = shift;
744    my $code = _getcode($arg);
745    croak __PACKAGE__, "::compexcl: unknown code '$arg'"
746	unless defined $code;
747
748    _compexcl() unless %COMPEXCL;
749
750    return exists $COMPEXCL{$code};
751}
752
753=head2 B<casefold()>
754
755    use Unicode::UCD 'casefold';
756
757    my $casefold = casefold(0xDF);
758    if (defined $casefold) {
759        my @full_fold_hex = split / /, $casefold->{'full'};
760        my $full_fold_string =
761                    join "", map {chr(hex($_))} @full_fold_hex;
762        my @turkic_fold_hex =
763                        split / /, ($casefold->{'turkic'} ne "")
764                                        ? $casefold->{'turkic'}
765                                        : $casefold->{'full'};
766        my $turkic_fold_string =
767                        join "", map {chr(hex($_))} @turkic_fold_hex;
768    }
769    if (defined $casefold && $casefold->{'simple'} ne "") {
770        my $simple_fold_hex = $casefold->{'simple'};
771        my $simple_fold_string = chr(hex($simple_fold_hex));
772    }
773
774This returns the (almost) locale-independent case folding of the
775character specified by the L</code point argument>.
776
777If there is no case folding for that code point, B<undef> is returned.
778
779If there is a case folding for that code point, a reference to a hash
780with the following fields is returned:
781
782=over
783
784=item B<code>
785
786the input L</code point argument> expressed in hexadecimal, with leading zeros
787added if necessary to make it contain at least four hexdigits
788
789=item B<full>
790
791one or more codes (separated by spaces) that taken in order give the
792code points for the case folding for I<code>.
793Each has at least four hexdigits.
794
795=item B<simple>
796
797is empty, or is exactly one code with at least four hexdigits which can be used
798as an alternative case folding when the calling program cannot cope with the
799fold being a sequence of multiple code points.  If I<full> is just one code
800point, then I<simple> equals I<full>.  If there is no single code point folding
801defined for I<code>, then I<simple> is the empty string.  Otherwise, it is an
802inferior, but still better-than-nothing alternative folding to I<full>.
803
804=item B<mapping>
805
806is the same as I<simple> if I<simple> is not empty, and it is the same as I<full>
807otherwise.  It can be considered to be the simplest possible folding for
808I<code>.  It is defined primarily for backwards compatibility.
809
810=item B<status>
811
812is C<C> (for C<common>) if the best possible fold is a single code point
813(I<simple> equals I<full> equals I<mapping>).  It is C<S> if there are distinct
814folds, I<simple> and I<full> (I<mapping> equals I<simple>).  And it is C<F> if
815there only a I<full> fold (I<mapping> equals I<full>; I<simple> is empty).  Note
816that this
817describes the contents of I<mapping>.  It is defined primarily for backwards
818compatibility.
819
820On versions 3.1 and earlier of Unicode, I<status> can also be
821C<I> which is the same as C<C> but is a special case for dotted uppercase I and
822dotless lowercase i:
823
824=over
825
826=item B<*>
827
828If you use this C<I> mapping, the result is case-insensitive,
829but dotless and dotted I's are not distinguished
830
831=item B<*>
832
833If you exclude this C<I> mapping, the result is not fully case-insensitive, but
834dotless and dotted I's are distinguished
835
836=back
837
838=item B<turkic>
839
840contains any special folding for Turkic languages.  For versions of Unicode
841starting with 3.2, this field is empty unless I<code> has a different folding
842in Turkic languages, in which case it is one or more codes (separated by
843spaces) that taken in order give the code points for the case folding for
844I<code> in those languages.
845Each code has at least four hexdigits.
846Note that this folding does not maintain canonical equivalence without
847additional processing.
848
849For versions of Unicode 3.1 and earlier, this field is empty unless there is a
850special folding for Turkic languages, in which case I<status> is C<I>, and
851I<mapping>, I<full>, I<simple>, and I<turkic> are all equal.
852
853=back
854
855Programs that want complete generality and the best folding results should use
856the folding contained in the I<full> field.  But note that the fold for some
857code points will be a sequence of multiple code points.
858
859Programs that can't cope with the fold mapping being multiple code points can
860use the folding contained in the I<simple> field, with the loss of some
861generality.  In Unicode 5.1, about 7% of the defined foldings have no single
862code point folding.
863
864The I<mapping> and I<status> fields are provided for backwards compatibility for
865existing programs.  They contain the same values as in previous versions of
866this function.
867
868Locale is not completely independent.  The I<turkic> field contains results to
869use when the locale is a Turkic language.
870
871For more information about case mappings see
872L<http://www.unicode.org/unicode/reports/tr21>
873
874=cut
875
876my %CASEFOLD;
877
878sub _casefold {
879    unless (%CASEFOLD) {
880	if (openunicode(\$CASEFOLDFH, "CaseFolding.txt")) {
881	    local $_;
882	    while (<$CASEFOLDFH>) {
883		if (/^([0-9A-F]+); ([CFIST]); ([0-9A-F]+(?: [0-9A-F]+)*);/) {
884		    my $code = hex($1);
885		    $CASEFOLD{$code}{'code'} = $1;
886		    $CASEFOLD{$code}{'turkic'} = "" unless
887					    defined $CASEFOLD{$code}{'turkic'};
888		    if ($2 eq 'C' || $2 eq 'I') {	# 'I' is only on 3.1 and
889							# earlier Unicodes
890							# Both entries there (I
891							# only checked 3.1) are
892							# the same as C, and
893							# there are no other
894							# entries for those
895							# codepoints, so treat
896							# as if C, but override
897							# the turkic one for
898							# 'I'.
899			$CASEFOLD{$code}{'status'} = $2;
900			$CASEFOLD{$code}{'full'} = $CASEFOLD{$code}{'simple'} =
901			$CASEFOLD{$code}{'mapping'} = $3;
902			$CASEFOLD{$code}{'turkic'} = $3 if $2 eq 'I';
903		    } elsif ($2 eq 'F') {
904			$CASEFOLD{$code}{'full'} = $3;
905			unless (defined $CASEFOLD{$code}{'simple'}) {
906				$CASEFOLD{$code}{'simple'} = "";
907				$CASEFOLD{$code}{'mapping'} = $3;
908				$CASEFOLD{$code}{'status'} = $2;
909			}
910		    } elsif ($2 eq 'S') {
911
912
913			# There can't be a simple without a full, and simple
914			# overrides all but full
915
916			$CASEFOLD{$code}{'simple'} = $3;
917			$CASEFOLD{$code}{'mapping'} = $3;
918			$CASEFOLD{$code}{'status'} = $2;
919		    } elsif ($2 eq 'T') {
920			$CASEFOLD{$code}{'turkic'} = $3;
921		    } # else can't happen because only [CIFST] are possible
922		}
923	    }
924	    close($CASEFOLDFH);
925	}
926    }
927}
928
929sub casefold {
930    my $arg  = shift;
931    my $code = _getcode($arg);
932    croak __PACKAGE__, "::casefold: unknown code '$arg'"
933	unless defined $code;
934
935    _casefold() unless %CASEFOLD;
936
937    return $CASEFOLD{$code};
938}
939
940=head2 B<casespec()>
941
942    use Unicode::UCD 'casespec';
943
944    my $casespec = casespec(0xFB00);
945
946This returns the potentially locale-dependent case mappings of the L</code point
947argument>.  The mappings may be longer than a single code point (which the basic
948Unicode case mappings as returned by L</charinfo()> never are).
949
950If there are no case mappings for the L</code point argument>, or if all three
951possible mappings (I<lower>, I<title> and I<upper>) result in single code
952points and are locale independent and unconditional, B<undef> is returned
953(which means that the case mappings, if any, for the code point are those
954returned by L</charinfo()>).
955
956Otherwise, a reference to a hash giving the mappings (or a reference to a hash
957of such hashes, explained below) is returned with the following keys and their
958meanings:
959
960The keys in the bottom layer hash with the meanings of their values are:
961
962=over
963
964=item B<code>
965
966the input L</code point argument> expressed in hexadecimal, with leading zeros
967added if necessary to make it contain at least four hexdigits
968
969=item B<lower>
970
971one or more codes (separated by spaces) that taken in order give the
972code points for the lower case of I<code>.
973Each has at least four hexdigits.
974
975=item B<title>
976
977one or more codes (separated by spaces) that taken in order give the
978code points for the title case of I<code>.
979Each has at least four hexdigits.
980
981=item B<lower>
982
983one or more codes (separated by spaces) that taken in order give the
984code points for the upper case of I<code>.
985Each has at least four hexdigits.
986
987=item B<condition>
988
989the conditions for the mappings to be valid.
990If B<undef>, the mappings are always valid.
991When defined, this field is a list of conditions,
992all of which must be true for the mappings to be valid.
993The list consists of one or more
994I<locales> (see below)
995and/or I<contexts> (explained in the next paragraph),
996separated by spaces.
997(Other than as used to separate elements, spaces are to be ignored.)
998Case distinctions in the condition list are not significant.
999Conditions preceded by "NON_" represent the negation of the condition.
1000
1001A I<context> is one of those defined in the Unicode standard.
1002For Unicode 5.1, they are defined in Section 3.13 C<Default Case Operations>
1003available at
1004L<http://www.unicode.org/versions/Unicode5.1.0/>.
1005These are for context-sensitive casing.
1006
1007=back
1008
1009The hash described above is returned for locale-independent casing, where
1010at least one of the mappings has length longer than one.  If B<undef> is
1011returned, the code point may have mappings, but if so, all are length one,
1012and are returned by L</charinfo()>.
1013Note that when this function does return a value, it will be for the complete
1014set of mappings for a code point, even those whose length is one.
1015
1016If there are additional casing rules that apply only in certain locales,
1017an additional key for each will be defined in the returned hash.  Each such key
1018will be its locale name, defined as a 2-letter ISO 3166 country code, possibly
1019followed by a "_" and a 2-letter ISO language code (possibly followed by a "_"
1020and a variant code).  You can find the lists of all possible locales, see
1021L<Locale::Country> and L<Locale::Language>.
1022(In Unicode 5.1, the only locales returned by this function
1023are C<lt>, C<tr>, and C<az>.)
1024
1025Each locale key is a reference to a hash that has the form above, and gives
1026the casing rules for that particular locale, which take precedence over the
1027locale-independent ones when in that locale.
1028
1029If the only casing for a code point is locale-dependent, then the returned
1030hash will not have any of the base keys, like C<code>, C<upper>, etc., but
1031will contain only locale keys.
1032
1033For more information about case mappings see
1034L<http://www.unicode.org/unicode/reports/tr21/>
1035
1036=cut
1037
1038my %CASESPEC;
1039
1040sub _casespec {
1041    unless (%CASESPEC) {
1042	if (openunicode(\$CASESPECFH, "SpecialCasing.txt")) {
1043	    local $_;
1044	    while (<$CASESPECFH>) {
1045		if (/^([0-9A-F]+); ([0-9A-F]+(?: [0-9A-F]+)*)?; ([0-9A-F]+(?: [0-9A-F]+)*)?; ([0-9A-F]+(?: [0-9A-F]+)*)?; (\w+(?: \w+)*)?/) {
1046		    my ($hexcode, $lower, $title, $upper, $condition) =
1047			($1, $2, $3, $4, $5);
1048		    my $code = hex($hexcode);
1049		    if (exists $CASESPEC{$code}) {
1050			if (exists $CASESPEC{$code}->{code}) {
1051			    my ($oldlower,
1052				$oldtitle,
1053				$oldupper,
1054				$oldcondition) =
1055				    @{$CASESPEC{$code}}{qw(lower
1056							   title
1057							   upper
1058							   condition)};
1059			    if (defined $oldcondition) {
1060				my ($oldlocale) =
1061				($oldcondition =~ /^([a-z][a-z](?:_\S+)?)/);
1062				delete $CASESPEC{$code};
1063				$CASESPEC{$code}->{$oldlocale} =
1064				{ code      => $hexcode,
1065				  lower     => $oldlower,
1066				  title     => $oldtitle,
1067				  upper     => $oldupper,
1068				  condition => $oldcondition };
1069			    }
1070			}
1071			my ($locale) =
1072			    ($condition =~ /^([a-z][a-z](?:_\S+)?)/);
1073			$CASESPEC{$code}->{$locale} =
1074			{ code      => $hexcode,
1075			  lower     => $lower,
1076			  title     => $title,
1077			  upper     => $upper,
1078			  condition => $condition };
1079		    } else {
1080			$CASESPEC{$code} =
1081			{ code      => $hexcode,
1082			  lower     => $lower,
1083			  title     => $title,
1084			  upper     => $upper,
1085			  condition => $condition };
1086		    }
1087		}
1088	    }
1089	    close($CASESPECFH);
1090	}
1091    }
1092}
1093
1094sub casespec {
1095    my $arg  = shift;
1096    my $code = _getcode($arg);
1097    croak __PACKAGE__, "::casespec: unknown code '$arg'"
1098	unless defined $code;
1099
1100    _casespec() unless %CASESPEC;
1101
1102    return ref $CASESPEC{$code} ? dclone $CASESPEC{$code} : $CASESPEC{$code};
1103}
1104
1105=head2 B<namedseq()>
1106
1107    use Unicode::UCD 'namedseq';
1108
1109    my $namedseq = namedseq("KATAKANA LETTER AINU P");
1110    my @namedseq = namedseq("KATAKANA LETTER AINU P");
1111    my %namedseq = namedseq();
1112
1113If used with a single argument in a scalar context, returns the string
1114consisting of the code points of the named sequence, or B<undef> if no
1115named sequence by that name exists.  If used with a single argument in
1116a list context, it returns the list of the ordinals of the code points.  If used
1117with no
1118arguments in a list context, returns a hash with the names of the
1119named sequences as the keys and the named sequences as strings as
1120the values.  Otherwise, it returns B<undef> or an empty list depending
1121on the context.
1122
1123This function only operates on officially approved (not provisional) named
1124sequences.
1125
1126=cut
1127
1128my %NAMEDSEQ;
1129
1130sub _namedseq {
1131    unless (%NAMEDSEQ) {
1132	if (openunicode(\$NAMEDSEQFH, "NamedSequences.txt")) {
1133	    local $_;
1134	    while (<$NAMEDSEQFH>) {
1135		if (/^(.+)\s*;\s*([0-9A-F]+(?: [0-9A-F]+)*)$/) {
1136		    my ($n, $s) = ($1, $2);
1137		    my @s = map { chr(hex($_)) } split(' ', $s);
1138		    $NAMEDSEQ{$n} = join("", @s);
1139		}
1140	    }
1141	    close($NAMEDSEQFH);
1142	}
1143    }
1144}
1145
1146sub namedseq {
1147    _namedseq() unless %NAMEDSEQ;
1148    my $wantarray = wantarray();
1149    if (defined $wantarray) {
1150	if ($wantarray) {
1151	    if (@_ == 0) {
1152		return %NAMEDSEQ;
1153	    } elsif (@_ == 1) {
1154		my $s = $NAMEDSEQ{ $_[0] };
1155		return defined $s ? map { ord($_) } split('', $s) : ();
1156	    }
1157	} elsif (@_ == 1) {
1158	    return $NAMEDSEQ{ $_[0] };
1159	}
1160    }
1161    return;
1162}
1163
1164=head2 Unicode::UCD::UnicodeVersion
1165
1166This returns the version of the Unicode Character Database, in other words, the
1167version of the Unicode standard the database implements.  The version is a
1168string of numbers delimited by dots (C<'.'>).
1169
1170=cut
1171
1172my $UNICODEVERSION;
1173
1174sub UnicodeVersion {
1175    unless (defined $UNICODEVERSION) {
1176	openunicode(\$VERSIONFH, "version");
1177	chomp($UNICODEVERSION = <$VERSIONFH>);
1178	close($VERSIONFH);
1179	croak __PACKAGE__, "::VERSION: strange version '$UNICODEVERSION'"
1180	    unless $UNICODEVERSION =~ /^\d+(?:\.\d+)+$/;
1181    }
1182    return $UNICODEVERSION;
1183}
1184
1185=head2 B<Blocks versus Scripts>
1186
1187The difference between a block and a script is that scripts are closer
1188to the linguistic notion of a set of code points required to present
1189languages, while block is more of an artifact of the Unicode code point
1190numbering and separation into blocks of (mostly) 256 code points.
1191
1192For example the Latin B<script> is spread over several B<blocks>, such
1193as C<Basic Latin>, C<Latin 1 Supplement>, C<Latin Extended-A>, and
1194C<Latin Extended-B>.  On the other hand, the Latin script does not
1195contain all the characters of the C<Basic Latin> block (also known as
1196ASCII): it includes only the letters, and not, for example, the digits
1197or the punctuation.
1198
1199For blocks see L<http://www.unicode.org/Public/UNIDATA/Blocks.txt>
1200
1201For scripts see UTR #24: L<http://www.unicode.org/unicode/reports/tr24/>
1202
1203=head2 B<Matching Scripts and Blocks>
1204
1205Scripts are matched with the regular-expression construct
1206C<\p{...}> (e.g. C<\p{Tibetan}> matches characters of the Tibetan script),
1207while C<\p{In...}> is used for blocks (e.g. C<\p{InTibetan}> matches
1208any of the 256 code points in the Tibetan block).
1209
1210
1211=head2 Implementation Note
1212
1213The first use of charinfo() opens a read-only filehandle to the Unicode
1214Character Database (the database is included in the Perl distribution).
1215The filehandle is then kept open for further queries.  In other words,
1216if you are wondering where one of your filehandles went, that's where.
1217
1218=head1 BUGS
1219
1220Does not yet support EBCDIC platforms.
1221
1222L</compexcl()> should give a complete list of excluded code points.
1223
1224=head1 AUTHOR
1225
1226Jarkko Hietaniemi
1227
1228=cut
1229
12301;
1231