1package Unicode::UCD;
2
3use strict;
4use warnings;
5no warnings 'surrogate';    # surrogates can be inputs to this
6use charnames ();
7
8our $VERSION = '0.75';
9
10sub DEBUG () { 0 }
11$|=1 if DEBUG;
12
13require Exporter;
14
15our @ISA = qw(Exporter);
16
17our @EXPORT_OK = qw(charinfo
18		    charblock charscript
19		    charblocks charscripts
20		    charinrange
21		    charprop
22		    charprops_all
23		    general_categories bidi_types
24		    compexcl
25		    casefold all_casefolds casespec
26		    namedseq
27                    num
28                    prop_aliases
29                    prop_value_aliases
30                    prop_values
31                    prop_invlist
32                    prop_invmap
33                    search_invlist
34                    MAX_CP
35                );
36
37use Carp;
38
39sub IS_ASCII_PLATFORM { ord("A") == 65 }
40
41=head1 NAME
42
43Unicode::UCD - Unicode character database
44
45=head1 SYNOPSIS
46
47    use Unicode::UCD 'charinfo';
48    my $charinfo   = charinfo($codepoint);
49
50    use Unicode::UCD 'charprop';
51    my $value  = charprop($codepoint, $property);
52
53    use Unicode::UCD 'charprops_all';
54    my $all_values_hash_ref = charprops_all($codepoint);
55
56    use Unicode::UCD 'casefold';
57    my $casefold = casefold($codepoint);
58
59    use Unicode::UCD 'all_casefolds';
60    my $all_casefolds_ref = all_casefolds();
61
62    use Unicode::UCD 'casespec';
63    my $casespec = casespec($codepoint);
64
65    use Unicode::UCD 'charblock';
66    my $charblock  = charblock($codepoint);
67
68    use Unicode::UCD 'charscript';
69    my $charscript = charscript($codepoint);
70
71    use Unicode::UCD 'charblocks';
72    my $charblocks = charblocks();
73
74    use Unicode::UCD 'charscripts';
75    my $charscripts = charscripts();
76
77    use Unicode::UCD qw(charscript charinrange);
78    my $range = charscript($script);
79    print "looks like $script\n" if charinrange($range, $codepoint);
80
81    use Unicode::UCD qw(general_categories bidi_types);
82    my $categories = general_categories();
83    my $types = bidi_types();
84
85    use Unicode::UCD 'prop_aliases';
86    my @space_names = prop_aliases("space");
87
88    use Unicode::UCD 'prop_value_aliases';
89    my @gc_punct_names = prop_value_aliases("Gc", "Punct");
90
91    use Unicode::UCD 'prop_values';
92    my @all_EA_short_names = prop_values("East_Asian_Width");
93
94    use Unicode::UCD 'prop_invlist';
95    my @puncts = prop_invlist("gc=punctuation");
96
97    use Unicode::UCD 'prop_invmap';
98    my ($list_ref, $map_ref, $format, $missing)
99                                      = prop_invmap("General Category");
100
101    use Unicode::UCD 'search_invlist';
102    my $index = search_invlist(\@invlist, $code_point);
103
104    # The following function should be used only internally in
105    # implementations of the Unicode Normalization Algorithm, and there
106    # are better choices than it.
107    use Unicode::UCD 'compexcl';
108    my $compexcl = compexcl($codepoint);
109
110    use Unicode::UCD 'namedseq';
111    my $namedseq = namedseq($named_sequence_name);
112
113    my $unicode_version = Unicode::UCD::UnicodeVersion();
114
115    my $convert_to_numeric =
116              Unicode::UCD::num("\N{RUMI DIGIT ONE}\N{RUMI DIGIT TWO}");
117
118=head1 DESCRIPTION
119
120The Unicode::UCD module offers a series of functions that
121provide a simple interface to the Unicode
122Character Database.
123
124=head2 code point argument
125
126Some of the functions are called with a I<code point argument>, which is either
127a decimal or a hexadecimal scalar designating a code point in the platform's
128native character set (extended to Unicode), or a string containing C<U+>
129followed by hexadecimals
130designating a Unicode code point.  A leading 0 will force a hexadecimal
131interpretation, as will a hexadecimal digit that isn't a decimal digit.
132
133Examples:
134
135    223     # Decimal 223 in native character set
136    0223    # Hexadecimal 223, native (= 547 decimal)
137    0xDF    # Hexadecimal DF, native (= 223 decimal)
138    '0xDF'  # String form of hexadecimal (= 223 decimal)
139    'U+DF'  # Hexadecimal DF, in Unicode's character set
140                              (= LATIN SMALL LETTER SHARP S)
141
142Note that the largest code point in Unicode is U+10FFFF.
143
144=cut
145
146our %caseless_equivalent;
147our $e_precision;
148our %file_to_swash_name;
149our @inline_definitions;
150our %loose_property_name_of;
151our %loose_property_to_file_of;
152our %loose_to_file_of;
153our $MAX_CP;
154our %nv_floating_to_rational;
155our %prop_aliases;
156our %stricter_to_file_of;
157our %strict_property_to_file_of;
158our %SwashInfo;
159our %why_deprecated;
160
161my $v_unicode_version;  # v-string.
162
163sub openunicode {
164    my (@path) = @_;
165    my $rfh;
166    for my $d (@INC) {
167        use File::Spec;
168        my $f = File::Spec->catfile($d, "unicore", @path);
169        return $rfh if open($rfh, '<', $f);
170    }
171    croak __PACKAGE__, ": failed to find ",
172        File::Spec->catfile("unicore", @path), " in @INC";
173}
174
175sub _dclone ($) {   # Use Storable::dclone if available; otherwise emulate it.
176
177    use if defined &DynaLoader::boot_DynaLoader, Storable => qw(dclone);
178
179    return dclone(shift) if defined &dclone;
180
181    my $arg = shift;
182    my $type = ref $arg;
183    return $arg unless $type;   # No deep cloning needed for scalars
184
185    if ($type eq 'ARRAY') {
186        my @return;
187        foreach my $element (@$arg) {
188            push @return, &_dclone($element);
189        }
190        return \@return;
191    }
192    elsif ($type eq 'HASH') {
193        my %return;
194        foreach my $key (keys %$arg) {
195            $return{$key} = &_dclone($arg->{$key});
196        }
197        return \%return;
198    }
199    else {
200        croak "_dclone can't handle " . $type;
201    }
202}
203
204=head2 B<charinfo()>
205
206    use Unicode::UCD 'charinfo';
207
208    my $charinfo = charinfo(0x41);
209
210This returns information about the input L</code point argument>
211as a reference to a hash of fields as defined by the Unicode
212standard.  If the L</code point argument> is not assigned in the standard
213(i.e., has the general category C<Cn> meaning C<Unassigned>)
214or is a non-character (meaning it is guaranteed to never be assigned in
215the standard),
216C<undef> is returned.
217
218Fields that aren't applicable to the particular code point argument exist in the
219returned hash, and are empty.
220
221For results that are less "raw" than this function returns, or to get the values for
222any property, not just the few covered by this function, use the
223L</charprop()> function.
224
225The keys in the hash with the meanings of their values are:
226
227=over
228
229=item B<code>
230
231the input native L</code point argument> expressed in hexadecimal, with
232leading zeros
233added if necessary to make it contain at least four hexdigits
234
235=item B<name>
236
237name of I<code>, all IN UPPER CASE.
238Some control-type code points do not have names.
239This field will be empty for C<Surrogate> and C<Private Use> code points,
240and for the others without a name,
241it will contain a description enclosed in angle brackets, like
242C<E<lt>controlE<gt>>.
243
244
245=item B<category>
246
247The short name of the general category of I<code>.
248This will match one of the keys in the hash returned by L</general_categories()>.
249
250The L</prop_value_aliases()> function can be used to get all the synonyms
251of the category name.
252
253=item B<combining>
254
255the combining class number for I<code> used in the Canonical Ordering Algorithm.
256For Unicode 5.1, this is described in Section 3.11 C<Canonical Ordering Behavior>
257available at
258L<http://www.unicode.org/versions/Unicode5.1.0/>
259
260The L</prop_value_aliases()> function can be used to get all the synonyms
261of the combining class number.
262
263=item B<bidi>
264
265bidirectional type of I<code>.
266This will match one of the keys in the hash returned by L</bidi_types()>.
267
268The L</prop_value_aliases()> function can be used to get all the synonyms
269of the bidi type name.
270
271=item B<decomposition>
272
273is empty if I<code> has no decomposition; or is one or more codes
274(separated by spaces) that, taken in order, represent a decomposition for
275I<code>.  Each has at least four hexdigits.
276The codes may be preceded by a word enclosed in angle brackets, then a space,
277like C<E<lt>compatE<gt> >, giving the type of decomposition
278
279This decomposition may be an intermediate one whose components are also
280decomposable.  Use L<Unicode::Normalize> to get the final decomposition in one
281step.
282
283=item B<decimal>
284
285if I<code> represents a decimal digit this is its integer numeric value
286
287=item B<digit>
288
289if I<code> represents some other digit-like number, this is its integer
290numeric value
291
292=item B<numeric>
293
294if I<code> represents a whole or rational number, this is its numeric value.
295Rational values are expressed as a string like C<1/4>.
296
297=item B<mirrored>
298
299C<Y> or C<N> designating if I<code> is mirrored in bidirectional text
300
301=item B<unicode10>
302
303name of I<code> in the Unicode 1.0 standard if one
304existed for this code point and is different from the current name
305
306=item B<comment>
307
308As of Unicode 6.0, this is always empty.
309
310=item B<upper>
311
312is, if non-empty, the uppercase mapping for I<code> expressed as at least four
313hexdigits.  This indicates that the full uppercase mapping is a single
314character, and is identical to the simple (single-character only) mapping.
315When this field is empty, it means that the simple uppercase mapping is
316I<code> itself; you'll need some other means, (like L</charprop()> or
317L</casespec()> to get the full mapping.
318
319=item B<lower>
320
321is, if non-empty, the lowercase mapping for I<code> expressed as at least four
322hexdigits.  This indicates that the full lowercase mapping is a single
323character, and is identical to the simple (single-character only) mapping.
324When this field is empty, it means that the simple lowercase mapping is
325I<code> itself; you'll need some other means, (like L</charprop()> or
326L</casespec()> to get the full mapping.
327
328=item B<title>
329
330is, if non-empty, the titlecase mapping for I<code> expressed as at least four
331hexdigits.  This indicates that the full titlecase mapping is a single
332character, and is identical to the simple (single-character only) mapping.
333When this field is empty, it means that the simple titlecase mapping is
334I<code> itself; you'll need some other means, (like L</charprop()> or
335L</casespec()> to get the full mapping.
336
337=item B<block>
338
339the block I<code> belongs to (used in C<\p{Blk=...}>).
340The L</prop_value_aliases()> function can be used to get all the synonyms
341of the block name.
342
343See L</Blocks versus Scripts>.
344
345=item B<script>
346
347the script I<code> belongs to.
348The L</prop_value_aliases()> function can be used to get all the synonyms
349of the script name.  Note that this is the older "Script" property value, and
350not the improved "Script_Extensions" value.
351
352See L</Blocks versus Scripts>.
353
354=back
355
356Note that you cannot do (de)composition and casing based solely on the
357I<decomposition>, I<combining>, I<lower>, I<upper>, and I<title> fields; you
358will need also the L</casespec()> function and the C<Composition_Exclusion>
359property.  (Or you could just use the L<lc()|perlfunc/lc>,
360L<uc()|perlfunc/uc>, and L<ucfirst()|perlfunc/ucfirst> functions, and the
361L<Unicode::Normalize> module.)
362
363=cut
364
365my %Cache;
366
367# Digits may be separated by a single underscore
368my $digits = qr/ ( [0-9] _? )+ (?!:_) /x;
369
370# A sign can be surrounded by white space
371my $sign = qr/ \s* [+-]? \s* /x;
372
373my $f_float = qr/  $sign $digits+ \. $digits*    # e.g., 5.0, 5.
374                 | $sign $digits* \. $digits+/x; # 0.7, .7
375
376# A number may be an integer, a rational, or a float with an optional exponent
377# We (shudder) accept a signed denominator
378my $number = qr{  ^ $sign $digits+ $
379                | ^ $sign $digits+ \/ $sign $digits+ $
380                | ^ $f_float (?: [Ee] [+-]? $digits )? $}x;
381
382sub loose_name ($) {
383    # Given a lowercase property or property-value name, return its
384    # standardized version that is expected for look-up in the 'loose' hashes
385    # in UCD.pl (hence, this depends on what mktables does).  This squeezes
386    # out blanks, underscores and dashes.  The complication stems from the
387    # grandfathered-in 'L_', which retains a single trailing underscore.
388
389# integer or float (no exponent)
390my $integer_or_float_re = qr/ ^ -? \d+ (:? \. \d+ )? $ /x;
391
392# Also includes rationals
393my $numeric_re = qr! $integer_or_float_re | ^ -? \d+ / \d+ $ !x;
394    return $_[0] if $_[0] =~ $numeric_re;
395
396    (my $loose = $_[0]) =~ s/[-_ \t]//g;
397
398    return $loose if $loose !~ / ^ (?: is | to )? l $/x;
399    return 'l_' if $_[0] =~ / l .* _ /x;    # If original had a trailing '_'
400    return $loose;
401}
402
403##
404## "SWASH" == "SWATCH HASH". A "swatch" is a swatch of the Unicode landscape.
405## It's a data structure that encodes a set of Unicode characters.
406##
407
408{
409    use re "/aa";  # Nothing here uses above Latin1.
410
411    # If a floating point number is within this distance from the value of a
412    # fraction, it is considered to be that fraction, even if many more digits
413    # are specified that don't exactly match.
414    my $min_floating_slop;
415
416    # To guard against this program calling something that in turn ends up
417    # calling this program with the same inputs, and hence infinitely
418    # recursing, we keep a stack of the properties that are currently in
419    # progress, pushed upon entry, popped upon return.
420    my @recursed;
421
422    sub SWASHNEW {
423        my ($class, $type, $list, $minbits) = @_;
424        my $user_defined = 0;
425        local $^D = 0 if $^D;
426
427        $class = "" unless defined $class;
428        print STDERR __LINE__, ": class=$class, type=$type, list=",
429                                (defined $list) ? $list : ':undef:',
430                                ", minbits=$minbits\n" if DEBUG;
431
432        ##
433        ## Get the list of codepoints for the type.
434        ## Called from swash_init (see utf8.c) or SWASHNEW itself.
435        ##
436        ## Callers of swash_init:
437        ##     prop_invlist
438        ##     Unicode::UCD::prop_invmap
439        ##
440        ## Given a $type, our goal is to fill $list with the set of codepoint
441        ## ranges. If $type is false, $list passed is used.
442        ##
443        ## $minbits:
444        ##     For binary properties, $minbits must be 1.
445        ##     For character mappings (case and transliteration), $minbits must
446        ##     be a number except 1.
447        ##
448        ## $list (or that filled according to $type):
449        ##     Refer to perlunicode.pod, "User-Defined Character Properties."
450        ##
451        ##     For binary properties, only characters with the property value
452        ##     of True should be listed. The 3rd column, if any, will be ignored
453        ##
454        ## To make the parsing of $type clear, this code takes the a rather
455        ## unorthodox approach of last'ing out of the block once we have the
456        ## info we need. Were this to be a subroutine, the 'last' would just
457        ## be a 'return'.
458        ##
459        #   If a problem is found $type is returned;
460        #   Upon success, a new (or cached) blessed object is returned with
461        #   keys TYPE, BITS, EXTRAS, LIST, and with values having the
462        #   same meanings as the input parameters.
463        #   SPECIALS contains a reference to any special-treatment hash in the
464        #       property.
465        #   INVERT_IT is non-zero if the result should be inverted before use
466        #   USER_DEFINED is non-zero if the result came from a user-defined
467        my $file; ## file to load data from, and also part of the %Cache key.
468
469        # Change this to get a different set of Unicode tables
470        my $unicore_dir = 'unicore';
471        my $invert_it = 0;
472        my $list_is_from_mktables = 0;  # Is $list returned from a mktables
473                                        # generated file?  If so, we know it's
474                                        # well behaved.
475
476        if ($type)
477        {
478            # Verify that this isn't a recursive call for this property.
479            # Can't use croak, as it may try to recurse to here itself.
480            my $class_type = $class . "::$type";
481            if (grep { $_ eq $class_type } @recursed) {
482                CORE::die "panic: Infinite recursion in SWASHNEW for '$type'\n";
483            }
484            push @recursed, $class_type;
485
486            $type =~ s/^\s+//;
487            $type =~ s/\s+$//;
488
489            # regcomp.c surrounds the property name with '__" and '_i' if this
490            # is to be caseless matching.
491            my $caseless = $type =~ s/^(.*)__(.*)_i$/$1$2/;
492
493            print STDERR __LINE__, ": type=$type, caseless=$caseless\n" if DEBUG;
494
495        GETFILE:
496            {
497                ##
498                ## It could be a user-defined property.  Look in current
499                ## package if no package given
500                ##
501
502
503                my $caller0 = caller(0);
504                my $caller1 = $type =~ s/(.+):://
505                              ? $1
506                              : $caller0 eq 'main'
507                                ? 'main'
508                                : caller(1);
509
510                if (defined $caller1 && $type =~ /^I[ns]\w+$/) {
511                    my $prop = "${caller1}::$type";
512                    if (exists &{$prop}) {
513                        # stolen from Scalar::Util::PP::tainted()
514                        my $tainted;
515                        {
516                            local($@, $SIG{__DIE__}, $SIG{__WARN__});
517                            local $^W = 0;
518                            no warnings;
519                            eval { kill 0 * $prop };
520                            $tainted = 1 if $@ =~ /^Insecure/;
521                        }
522                        die "Insecure user-defined property \\p{$prop}\n"
523                            if $tainted;
524                        no strict 'refs';
525                        $list = &{$prop}($caseless);
526                        $user_defined = 1;
527                        last GETFILE;
528                    }
529                }
530
531                require "$unicore_dir/UCD.pl";
532
533                # All property names are matched caselessly
534                my $property_and_table = CORE::lc $type;
535                print STDERR __LINE__, ": $property_and_table\n" if DEBUG;
536
537                # See if is of the compound form 'property=value', where the
538                # value indicates the table we should use.
539                my ($property, $table, @remainder) =
540                                    split /\s*[:=]\s*/, $property_and_table, -1;
541                if (@remainder) {
542                    pop @recursed if @recursed;
543                    return $type;
544                }
545
546                my $prefix;
547                if (! defined $table) {
548
549                    # Here, is the single form.  The property becomes empty, and
550                    # the whole value is the table.
551                    $table = $property;
552                    $prefix = $property = "";
553                } else {
554                    print STDERR __LINE__, ": $property\n" if DEBUG;
555
556                    # Here it is the compound property=table form.  The property
557                    # name is always loosely matched, and always can have an
558                    # optional 'is' prefix (which isn't true in the single
559                    # form).
560                    $property = loose_name($property) =~ s/^is//r;
561
562                    # And convert to canonical form.  Quit if not valid.
563                    $property = $loose_property_name_of{$property};
564                    if (! defined $property) {
565                        pop @recursed if @recursed;
566                        return $type;
567                    }
568
569                    $prefix = "$property=";
570
571                    # If the rhs looks like it is a number...
572                    print STDERR __LINE__, ": table=$table\n" if DEBUG;
573
574                    if ($table =~ $number) {
575                        print STDERR __LINE__, ": table=$table\n" if DEBUG;
576
577                        # Split on slash, in case it is a rational, like \p{1/5}
578                        my @parts = split m{ \s* / \s* }x, $table, -1;
579                        print __LINE__, ": $type\n" if @parts > 2 && DEBUG;
580
581                        foreach my $part (@parts) {
582                            print __LINE__, ": part=$part\n" if DEBUG;
583
584                            $part =~ s/^\+\s*//;    # Remove leading plus
585                            $part =~ s/^-\s*/-/;    # Remove blanks after unary
586                                                    # minus
587
588                            # Remove underscores between digits.
589                            $part =~ s/(?<= [0-9] ) _ (?= [0-9] ) //xg;
590
591                            # No leading zeros (but don't make a single '0'
592                            # into a null string)
593                            $part =~ s/ ^ ( -? ) 0+ /$1/x;
594                            $part .= '0' if $part eq '-' || $part eq "";
595
596                            # No trailing zeros after a decimal point
597                            $part =~ s/ ( \. [0-9]*? ) 0+ $ /$1/x;
598
599                            # Begin with a 0 if a leading decimal point
600                            $part =~ s/ ^ ( -? ) \. /${1}0./x;
601
602                            # Ensure not a trailing decimal point: turn into an
603                            # integer
604                            $part =~ s/ \. $ //x;
605
606                            print STDERR __LINE__, ": part=$part\n" if DEBUG;
607                            #return $type if $part eq "";
608                        }
609
610                        #  If a rational...
611                        if (@parts == 2) {
612
613                            # If denominator is negative, get rid of it, and ...
614                            if ($parts[1] =~ s/^-//) {
615
616                                # If numerator is also negative, convert the
617                                # whole thing to positive, else move the minus
618                                # to the numerator
619                                if ($parts[0] !~ s/^-//) {
620                                    $parts[0] = '-' . $parts[0];
621                                }
622                            }
623                            $table = join '/', @parts;
624                        }
625                        elsif ($property ne 'nv' || $parts[0] !~ /\./) {
626
627                            # Here is not numeric value, or doesn't have a
628                            # decimal point.  No further manipulation is
629                            # necessary.  (Note the hard-coded property name.
630                            # This could fail if other properties eventually
631                            # had fractions as well; perhaps the cjk ones
632                            # could evolve to do that.  This hard-coding could
633                            # be fixed by mktables generating a list of
634                            # properties that could have fractions.)
635                            $table = $parts[0];
636                        } else {
637
638                            # Here is a floating point numeric_value.  Convert
639                            # to rational.  Get a normalized form, like
640                            # 5.00E-01, and look that up in the hash
641
642                            my $float = sprintf "%.*e",
643                                                $e_precision,
644                                                0 + $parts[0];
645
646                            if (exists $nv_floating_to_rational{$float}) {
647                                $table = $nv_floating_to_rational{$float};
648                            } else {
649                                pop @recursed if @recursed;
650                                return $type;
651                            }
652                        }
653                        print STDERR __LINE__, ": $property=$table\n" if DEBUG;
654                    }
655                }
656
657                # Combine lhs (if any) and rhs to get something that matches
658                # the syntax of the lookups.
659                $property_and_table = "$prefix$table";
660                print STDERR __LINE__, ": $property_and_table\n" if DEBUG;
661
662                # First try stricter matching.
663                $file = $stricter_to_file_of{$property_and_table};
664
665                # If didn't find it, try again with looser matching by editing
666                # out the applicable characters on the rhs and looking up
667                # again.
668                my $strict_property_and_table;
669                if (! defined $file) {
670
671                    # This isn't used unless the name begins with 'to'
672                    $strict_property_and_table = $property_and_table =~  s/^to//r;
673                    $table = loose_name($table);
674                    $property_and_table = "$prefix$table";
675                    print STDERR __LINE__, ": $property_and_table\n" if DEBUG;
676                    $file = $loose_to_file_of{$property_and_table};
677                    print STDERR __LINE__, ": $property_and_table\n" if DEBUG;
678                }
679
680                # Add the constant and go fetch it in.
681                if (defined $file) {
682
683                    # If the file name contains a !, it means to invert.  The
684                    # 0+ makes sure result is numeric
685                    $invert_it = 0 + $file =~ s/!//;
686
687                    if ($caseless
688                        && exists $caseless_equivalent{$property_and_table})
689                    {
690                        $file = $caseless_equivalent{$property_and_table};
691                    }
692
693                    # The pseudo-directory '#' means that there really isn't a
694                    # file to read, the data is in-line as part of the string;
695                    # we extract it below.
696                    $file = "$unicore_dir/lib/$file.pl" unless $file =~ m!^#/!;
697                    last GETFILE;
698                }
699                print STDERR __LINE__, ": didn't find $property_and_table\n" if DEBUG;
700
701                ##
702                ## Last attempt -- see if it's a standard "To" name
703                ## (e.g. "ToLower")  ToTitle is used by ucfirst().
704                ## The user-level way to access ToDigit() and ToFold()
705                ## is to use Unicode::UCD.
706                ##
707                # Only check if caller wants non-binary
708                if ($minbits != 1) {
709                    if ($property_and_table =~ s/^to//) {
710                    # Look input up in list of properties for which we have
711                    # mapping files.  First do it with the strict approach
712                        if (defined ($file = $strict_property_to_file_of{
713                                                    $strict_property_and_table}))
714                        {
715                            $type = $file_to_swash_name{$file};
716                            print STDERR __LINE__, ": type set to $type\n"
717                                                                        if DEBUG;
718                            $file = "$unicore_dir/$file.pl";
719                            last GETFILE;
720                        }
721                        elsif (defined ($file =
722                          $loose_property_to_file_of{$property_and_table}))
723                        {
724                            $type = $file_to_swash_name{$file};
725                            print STDERR __LINE__, ": type set to $type\n"
726                                                                        if DEBUG;
727                            $file = "$unicore_dir/$file.pl";
728                            last GETFILE;
729                        }   # If that fails see if there is a corresponding binary
730                            # property file
731                        elsif (defined ($file =
732                                    $loose_to_file_of{$property_and_table}))
733                        {
734
735                            # Here, there is no map file for the property we
736                            # are trying to get the map of, but this is a
737                            # binary property, and there is a file for it that
738                            # can easily be translated to a mapping, so use
739                            # that, treating this as a binary property.
740                            # Setting 'minbits' here causes it to be stored as
741                            # such in the cache, so if someone comes along
742                            # later looking for just a binary, they get it.
743                            $minbits = 1;
744
745                            # The 0+ makes sure is numeric
746                            $invert_it = 0 + $file =~ s/!//;
747                            $file = "$unicore_dir/lib/$file.pl"
748                                                         unless $file =~ m!^#/!;
749                            last GETFILE;
750                        }
751                    }
752                }
753
754                ##
755                ## If we reach this line, it's because we couldn't figure
756                ## out what to do with $type. Ouch.
757                ##
758
759                pop @recursed if @recursed;
760                return $type;
761            } # end of GETFILE block
762
763            if (defined $file) {
764                print STDERR __LINE__, ": found it (file='$file')\n" if DEBUG;
765
766                ##
767                ## If we reach here, it was due to a 'last GETFILE' above
768                ## (exception: user-defined properties and mappings), so we
769                ## have a filename, so now we load it if we haven't already.
770
771                # The pseudo-directory '#' means the result isn't really a
772                # file, but is in-line, with semi-colons to be turned into
773                # new-lines.  Since it is in-line there is no advantage to
774                # caching the result
775                if ($file =~ s!^#/!!) {
776                    $list = $inline_definitions[$file];
777                }
778                else {
779                    # Here, we have an actual file to read in and load, but it
780                    # may already have been read-in and cached.  The cache key
781                    # is the class and file to load, and whether the results
782                    # need to be inverted.
783                    my $found = $Cache{$class, $file, $invert_it};
784                    if ($found and ref($found) eq $class) {
785                        print STDERR __LINE__, ": Returning cached swash for '$class,$file,$invert_it' for \\p{$type}\n" if DEBUG;
786                        pop @recursed if @recursed;
787                        return $found;
788                    }
789
790                    local $@;
791                    local $!;
792                    $list = do $file; die $@ if $@;
793                }
794
795                $list_is_from_mktables = 1;
796            }
797        } # End of $type is non-null
798
799        # Here, either $type was null, or we found the requested property and
800        # read it into $list
801
802        my $extras = "";
803
804        my $bits = $minbits;
805
806        # mktables lists don't have extras, like '&prop', so don't need
807        # to separate them; also lists are already sorted, so don't need to do
808        # that.
809        if ($list && ! $list_is_from_mktables) {
810            my $taint = substr($list,0,0); # maintain taint
811
812            # Separate the extras from the code point list, and make sure
813            # user-defined properties are well-behaved for
814            # downstream code.
815            if ($user_defined) {
816                my @tmp = split(/^/m, $list);
817                my %seen;
818                no warnings;
819
820                # The extras are anything that doesn't begin with a hex digit.
821                $extras = join '', $taint, grep /^[^0-9a-fA-F]/, @tmp;
822
823                # Remove the extras, and sort the remaining entries by the
824                # numeric value of their beginning hex digits, removing any
825                # duplicates.
826                $list = join '', $taint,
827                        map  { $_->[1] }
828                        sort { $a->[0] <=> $b->[0] }
829                        map  { /^([0-9a-fA-F]+)/ && !$seen{$1}++ ? [ CORE::hex($1), $_ ] : () }
830                        @tmp; # XXX doesn't do ranges right
831            }
832            else {
833                # mktables has gone to some trouble to make non-user defined
834                # properties well-behaved, so we can skip the effort we do for
835                # user-defined ones.  Any extras are at the very beginning of
836                # the string.
837
838                # This regex splits out the first lines of $list into $1 and
839                # strips them off from $list, until we get one that begins
840                # with a hex number, alone on the line, or followed by a tab.
841                # Either portion may be empty.
842                $list =~ s/ \A ( .*? )
843                            (?: \z | (?= ^ [0-9a-fA-F]+ (?: \t | $) ) )
844                          //msx;
845
846                $extras = "$taint$1";
847            }
848        }
849
850        if ($minbits != 1 && $minbits < 32) { # not binary property
851            my $top = 0;
852            while ($list =~ /^([0-9a-fA-F]+)(?:[\t]([0-9a-fA-F]+)?)(?:[ \t]([0-9a-fA-F]+))?/mg) {
853                my $min = CORE::hex $1;
854                my $max = defined $2 ? CORE::hex $2 : $min;
855                my $val = defined $3 ? CORE::hex $3 : 0;
856                $val += $max - $min if defined $3;
857                $top = $val if $val > $top;
858            }
859            my $topbits =
860                $top > 0xffff ? 32 :
861                $top > 0xff ? 16 : 8;
862            $bits = $topbits if $bits < $topbits;
863        }
864
865        my @extras;
866        if ($extras) {
867            for my $x ($extras) {
868                my $taint = substr($x,0,0); # maintain taint
869                pos $x = 0;
870                while ($x =~ /^([^0-9a-fA-F\n])(.*)/mg) {
871                    my $char = "$1$taint";
872                    my $name = "$2$taint";
873                    print STDERR __LINE__, ": char [$char] => name [$name]\n"
874                        if DEBUG;
875                    if ($char =~ /[-+!&]/) {
876                        my ($c,$t) = split(/::/, $name, 2);	# bogus use of ::, really
877                        my $subobj;
878                        if ($c eq 'utf8') { # khw is unsure of this
879                            $subobj = SWASHNEW($t, "", $minbits, 0);
880                        }
881                        elsif (exists &$name) {
882                            $subobj = SWASHNEW($name, "", $minbits, 0);
883                        }
884                        elsif ($c =~ /^([0-9a-fA-F]+)/) {
885                            $subobj = SWASHNEW("", $c, $minbits, 0);
886                        }
887                        print STDERR __LINE__, ": returned from getting sub object for $name\n" if DEBUG;
888                        if (! ref $subobj) {
889                            pop @recursed if @recursed && $type;
890                            return $subobj;
891                        }
892                        push @extras, $name => $subobj;
893                        $bits = $subobj->{BITS} if $bits < $subobj->{BITS};
894                        $user_defined = $subobj->{USER_DEFINED}
895                                              if $subobj->{USER_DEFINED};
896                    }
897                }
898            }
899        }
900
901        if (DEBUG) {
902            print STDERR __LINE__, ": CLASS = $class, TYPE => $type, BITS => $bits, INVERT_IT => $invert_it, USER_DEFINED => $user_defined";
903            print STDERR "\nLIST =>\n$list" if defined $list;
904            print STDERR "\nEXTRAS =>\n$extras" if defined $extras;
905            print STDERR "\n";
906        }
907
908        my $SWASH = bless {
909            TYPE => $type,
910            BITS => $bits,
911            EXTRAS => $extras,
912            LIST => $list,
913            USER_DEFINED => $user_defined,
914            @extras,
915        } => $class;
916
917        if ($file) {
918            $Cache{$class, $file, $invert_it} = $SWASH;
919            if ($type
920                && exists $SwashInfo{$type}
921                && exists $SwashInfo{$type}{'specials_name'})
922            {
923                my $specials_name = $SwashInfo{$type}{'specials_name'};
924                no strict "refs";
925                print STDERR "\nspecials_name => $specials_name\n" if DEBUG;
926                $SWASH->{'SPECIALS'} = \%$specials_name;
927            }
928            $SWASH->{'INVERT_IT'} = $invert_it;
929        }
930
931        pop @recursed if @recursed && $type;
932
933        return $SWASH;
934    }
935}
936
937# NB: This function is nearly duplicated in charnames.pm
938sub _getcode {
939    my $arg = shift;
940
941    if ($arg =~ /^[1-9]\d*$/) {
942	return $arg;
943    }
944    elsif ($arg =~ /^(?:0[xX])?([[:xdigit:]]+)$/) {
945	return CORE::hex($1);
946    }
947    elsif ($arg =~ /^[Uu]\+([[:xdigit:]]+)$/) { # Is of form U+0000, means
948                                                # wants the Unicode code
949                                                # point, not the native one
950        my $decimal = CORE::hex($1);
951        return $decimal if IS_ASCII_PLATFORM;
952        return utf8::unicode_to_native($decimal);
953    }
954
955    return;
956}
957
958# Populated by _num.  Converts real number back to input rational
959my %real_to_rational;
960
961# To store the contents of files found on disk.
962my @BIDIS;
963my @CATEGORIES;
964my @DECOMPOSITIONS;
965my @NUMERIC_TYPES;
966my %SIMPLE_LOWER;
967my %SIMPLE_TITLE;
968my %SIMPLE_UPPER;
969my %UNICODE_1_NAMES;
970my %ISO_COMMENT;
971
972# Eval'd so can run on versions earlier than the property is available in
973my $Hangul_Syllables_re = eval 'qr/\p{Block=Hangul_Syllables}/';
974
975sub charinfo {
976
977    # This function has traditionally mimicked what is in UnicodeData.txt,
978    # warts and all.  This is a re-write that avoids UnicodeData.txt so that
979    # it can be removed to save disk space.  Instead, this assembles
980    # information gotten by other methods that get data from various other
981    # files.  It uses charnames to get the character name; and various
982    # mktables tables.
983
984    use feature 'unicode_strings';
985
986    # Will fail if called under minitest
987    use if defined &DynaLoader::boot_DynaLoader, "Unicode::Normalize" => qw(getCombinClass NFD);
988
989    my $arg  = shift;
990    my $code = _getcode($arg);
991    croak __PACKAGE__, "::charinfo: unknown code '$arg'" unless defined $code;
992
993    # Non-unicode implies undef.
994    return if $code > 0x10FFFF;
995
996    my %prop;
997    my $char = chr($code);
998
999    @CATEGORIES =_read_table("To/Gc.pl") unless @CATEGORIES;
1000    $prop{'category'} = _search(\@CATEGORIES, 0, $#CATEGORIES, $code)
1001                        // $SwashInfo{'ToGc'}{'missing'};
1002    # Return undef if category value is 'Unassigned' or one of its synonyms
1003    return if grep { lc $_ eq 'unassigned' }
1004                                    prop_value_aliases('Gc', $prop{'category'});
1005
1006    $prop{'code'} = sprintf "%04X", $code;
1007    $prop{'name'} = ($char =~ /\p{Cntrl}/) ? '<control>'
1008                                           : (charnames::viacode($code) // "");
1009
1010    $prop{'combining'} = getCombinClass($code);
1011
1012    @BIDIS =_read_table("To/Bc.pl") unless @BIDIS;
1013    $prop{'bidi'} = _search(\@BIDIS, 0, $#BIDIS, $code)
1014                    // $SwashInfo{'ToBc'}{'missing'};
1015
1016    # For most code points, we can just read in "unicore/Decomposition.pl", as
1017    # its contents are exactly what should be output.  But that file doesn't
1018    # contain the data for the Hangul syllable decompositions, which can be
1019    # algorithmically computed, and NFD() does that, so we call NFD() for
1020    # those.  We can't use NFD() for everything, as it does a complete
1021    # recursive decomposition, and what this function has always done is to
1022    # return what's in UnicodeData.txt which doesn't show that recursiveness.
1023    # Fortunately, the NFD() of the Hanguls doesn't have any recursion
1024    # issues.
1025    # Having no decomposition implies an empty field; otherwise, all but
1026    # "Canonical" imply a compatible decomposition, and the type is prefixed
1027    # to that, as it is in UnicodeData.txt
1028    UnicodeVersion() unless defined $v_unicode_version;
1029    if ($v_unicode_version ge v2.0.0 && $char =~ $Hangul_Syllables_re) {
1030        # The code points of the decomposition are output in standard Unicode
1031        # hex format, separated by blanks.
1032        $prop{'decomposition'} = join " ", map { sprintf("%04X", $_)}
1033                                           unpack "U*", NFD($char);
1034    }
1035    else {
1036        @DECOMPOSITIONS = _read_table("Decomposition.pl")
1037                          unless @DECOMPOSITIONS;
1038        $prop{'decomposition'} = _search(\@DECOMPOSITIONS, 0, $#DECOMPOSITIONS,
1039                                                                $code) // "";
1040    }
1041
1042    # Can use num() to get the numeric values, if any.
1043    if (! defined (my $value = num($char))) {
1044        $prop{'decimal'} = $prop{'digit'} = $prop{'numeric'} = "";
1045    }
1046    else {
1047        if ($char =~ /\d/) {
1048            $prop{'decimal'} = $prop{'digit'} = $prop{'numeric'} = $value;
1049        }
1050        else {
1051
1052            # For non-decimal-digits, we have to read in the Numeric type
1053            # to distinguish them.  It is not just a matter of integer vs.
1054            # rational, as some whole number values are not considered digits,
1055            # e.g., TAMIL NUMBER TEN.
1056            $prop{'decimal'} = "";
1057
1058            @NUMERIC_TYPES =_read_table("To/Nt.pl") unless @NUMERIC_TYPES;
1059            if ((_search(\@NUMERIC_TYPES, 0, $#NUMERIC_TYPES, $code) // "")
1060                eq 'Digit')
1061            {
1062                $prop{'digit'} = $prop{'numeric'} = $value;
1063            }
1064            else {
1065                $prop{'digit'} = "";
1066                $prop{'numeric'} = $real_to_rational{$value} // $value;
1067            }
1068        }
1069    }
1070
1071    $prop{'mirrored'} = ($char =~ /\p{Bidi_Mirrored}/) ? 'Y' : 'N';
1072
1073    %UNICODE_1_NAMES =_read_table("To/Na1.pl", "use_hash") unless %UNICODE_1_NAMES;
1074    $prop{'unicode10'} = $UNICODE_1_NAMES{$code} // "";
1075
1076    UnicodeVersion() unless defined $v_unicode_version;
1077    if ($v_unicode_version ge v6.0.0) {
1078        $prop{'comment'} = "";
1079    }
1080    else {
1081        %ISO_COMMENT = _read_table("To/Isc.pl", "use_hash") unless %ISO_COMMENT;
1082        $prop{'comment'} = (defined $ISO_COMMENT{$code})
1083                           ? $ISO_COMMENT{$code}
1084                           : "";
1085    }
1086
1087    %SIMPLE_UPPER = _read_table("To/Uc.pl", "use_hash") unless %SIMPLE_UPPER;
1088    $prop{'upper'} = (defined $SIMPLE_UPPER{$code})
1089                     ? sprintf("%04X", $SIMPLE_UPPER{$code})
1090                     : "";
1091
1092    %SIMPLE_LOWER = _read_table("To/Lc.pl", "use_hash") unless %SIMPLE_LOWER;
1093    $prop{'lower'} = (defined $SIMPLE_LOWER{$code})
1094                     ? sprintf("%04X", $SIMPLE_LOWER{$code})
1095                     : "";
1096
1097    %SIMPLE_TITLE = _read_table("To/Tc.pl", "use_hash") unless %SIMPLE_TITLE;
1098    $prop{'title'} = (defined $SIMPLE_TITLE{$code})
1099                     ? sprintf("%04X", $SIMPLE_TITLE{$code})
1100                     : "";
1101
1102    $prop{block}  = charblock($code);
1103    $prop{script} = charscript($code);
1104    return \%prop;
1105}
1106
1107sub _search { # Binary search in a [[lo,hi,prop],[...],...] table.
1108    my ($table, $lo, $hi, $code) = @_;
1109
1110    return if $lo > $hi;
1111
1112    my $mid = int(($lo+$hi) / 2);
1113
1114    if ($table->[$mid]->[0] < $code) {
1115	if ($table->[$mid]->[1] >= $code) {
1116	    return $table->[$mid]->[2];
1117	} else {
1118	    _search($table, $mid + 1, $hi, $code);
1119	}
1120    } elsif ($table->[$mid]->[0] > $code) {
1121	_search($table, $lo, $mid - 1, $code);
1122    } else {
1123	return $table->[$mid]->[2];
1124    }
1125}
1126
1127sub _read_table ($;$) {
1128
1129    # Returns the contents of the mktables generated table file located at $1
1130    # in the form of either an array of arrays or a hash, depending on if the
1131    # optional second parameter is true (for hash return) or not.  In the case
1132    # of a hash return, each key is a code point, and its corresponding value
1133    # is what the table gives as the code point's corresponding value.  In the
1134    # case of an array return, each outer array denotes a range with [0] the
1135    # start point of that range; [1] the end point; and [2] the value that
1136    # every code point in the range has.  The hash return is useful for fast
1137    # lookup when the table contains only single code point ranges.  The array
1138    # return takes much less memory when there are large ranges.
1139    #
1140    # This function has the side effect of setting
1141    # $SwashInfo{$property}{'format'} to be the mktables format of the
1142    #                                       table; and
1143    # $SwashInfo{$property}{'missing'} to be the value for all entries
1144    #                                        not listed in the table.
1145    # where $property is the Unicode property name, preceded by 'To' for map
1146    # properties., e.g., 'ToSc'.
1147    #
1148    # Table entries look like one of:
1149    # 0000	0040	Common	# [65]
1150    # 00AA		Latin
1151
1152    my $table = shift;
1153    my $return_hash = shift;
1154    $return_hash = 0 unless defined $return_hash;
1155    my @return;
1156    my %return;
1157    local $_;
1158    my $list = do "unicore/$table";
1159
1160    # Look up if this property requires adjustments, which we do below if it
1161    # does.
1162    require "unicore/UCD.pl";
1163    my $property = $table =~ s/\.pl//r;
1164    $property = $file_to_swash_name{$property};
1165    my $to_adjust = defined $property
1166                    && $SwashInfo{$property}{'format'} =~ / ^ a /x;
1167
1168    for (split /^/m, $list) {
1169        my ($start, $end, $value) = / ^ (.+?) \t (.*?) \t (.+?)
1170                                        \s* ( \# .* )?  # Optional comment
1171                                        $ /x;
1172        my $decimal_start = hex $start;
1173        my $decimal_end = ($end eq "") ? $decimal_start : hex $end;
1174        $value = hex $value if $to_adjust
1175                               && $SwashInfo{$property}{'format'} eq 'ax';
1176        if ($return_hash) {
1177            foreach my $i ($decimal_start .. $decimal_end) {
1178                $return{$i} = ($to_adjust)
1179                              ? $value + $i - $decimal_start
1180                              : $value;
1181            }
1182        }
1183        elsif (! $to_adjust
1184               && @return
1185               && $return[-1][1] == $decimal_start - 1
1186               && $return[-1][2] eq $value)
1187        {
1188            # If this is merely extending the previous range, do just that.
1189            $return[-1]->[1] = $decimal_end;
1190        }
1191        else {
1192            push @return, [ $decimal_start, $decimal_end, $value ];
1193        }
1194    }
1195    return ($return_hash) ? %return : @return;
1196}
1197
1198sub charinrange {
1199    my ($range, $arg) = @_;
1200    my $code = _getcode($arg);
1201    croak __PACKAGE__, "::charinrange: unknown code '$arg'"
1202	unless defined $code;
1203    _search($range, 0, $#$range, $code);
1204}
1205
1206=head2 B<charprop()>
1207
1208    use Unicode::UCD 'charprop';
1209
1210    print charprop(0x41, "Gc"), "\n";
1211    print charprop(0x61, "General_Category"), "\n";
1212
1213  prints
1214    Lu
1215    Ll
1216
1217This returns the value of the Unicode property given by the second parameter
1218for the  L</code point argument> given by the first.
1219
1220The passed-in property may be specified as any of the synonyms returned by
1221L</prop_aliases()>.
1222
1223The return value is always a scalar, either a string or a number.  For
1224properties where there are synonyms for the values, the synonym returned by
1225this function is the longest, most descriptive form, the one returned by
1226L</prop_value_aliases()> when called in a scalar context.  Of course, you can
1227call L</prop_value_aliases()> on the result to get other synonyms.
1228
1229The return values are more "cooked" than the L</charinfo()> ones.  For
1230example, the C<"uc"> property value is the actual string containing the full
1231uppercase mapping of the input code point.  You have to go to extra trouble
1232with C<charinfo> to get this value from its C<upper> hash element when the
1233full mapping differs from the simple one.
1234
1235Special note should be made of the return values for a few properties:
1236
1237=over
1238
1239=item Block
1240
1241The value returned is the new-style (see L</Old-style versus new-style block
1242names>).
1243
1244=item Decomposition_Mapping
1245
1246Like L</charinfo()>, the result may be an intermediate decomposition whose
1247components are also decomposable.  Use L<Unicode::Normalize> to get the final
1248decomposition in one step.
1249
1250Unlike L</charinfo()>, this does not include the decomposition type.  Use the
1251C<Decomposition_Type> property to get that.
1252
1253=item Name_Alias
1254
1255If the input code point's name has more than one synonym, they are returned
1256joined into a single comma-separated string.
1257
1258=item Numeric_Value
1259
1260If the result is a fraction, it is converted into a floating point number to
1261the accuracy of your platform.
1262
1263=item Script_Extensions
1264
1265If the result is multiple script names, they are returned joined into a single
1266comma-separated string.
1267
1268=back
1269
1270When called with a property that is a Perl extension that isn't expressible in
1271a compound form, this function currently returns C<undef>, as the only two
1272possible values are I<true> or I<false> (1 or 0 I suppose).  This behavior may
1273change in the future, so don't write code that relies on it.  C<Present_In> is
1274a Perl extension that is expressible in a bipartite or compound form (for
1275example, C<\p{Present_In=4.0}>), so C<charprop> accepts it.  But C<Any> is a
1276Perl extension that isn't expressible that way, so C<charprop> returns
1277C<undef> for it.  Also C<charprop> returns C<undef> for all Perl extensions
1278that are internal-only.
1279
1280=cut
1281
1282sub charprop ($$;$) {
1283    my ($input_cp, $prop, $internal_ok) = @_;
1284
1285    my $cp = _getcode($input_cp);
1286    croak __PACKAGE__, "::charprop: unknown code point '$input_cp'" unless defined $cp;
1287
1288    my ($list_ref, $map_ref, $format, $default)
1289                                      = prop_invmap($prop, $internal_ok);
1290    return undef unless defined $list_ref;
1291
1292    my $i = search_invlist($list_ref, $cp);
1293    croak __PACKAGE__, "::charprop: prop_invmap return is invalid for charprop('$input_cp', '$prop)" unless defined $i;
1294
1295    # $i is the index into both the inversion list and map of $cp.
1296    my $map = $map_ref->[$i];
1297
1298    # Convert enumeration values to their most complete form.
1299    if (! ref $map) {
1300        my $long_form = prop_value_aliases($prop, $map);
1301        $map = $long_form if defined $long_form;
1302    }
1303
1304    if ($format =~ / ^ s /x) {  # Scalars
1305        return join ",", @$map if ref $map; # Convert to scalar with comma
1306                                            # separated array elements
1307
1308        # Resolve ambiguity as to whether an all digit value is a code point
1309        # that should be converted to a character, or whether it is really
1310        # just a number.  To do this, look at the default.  If it is a
1311        # non-empty number, we can safely assume the result is also a number.
1312        if ($map =~ / ^ \d+ $ /ax && $default !~ / ^ \d+ $ /ax) {
1313            $map = chr $map;
1314        }
1315        elsif ($map =~ / ^ (?: Y | N ) $ /x) {
1316
1317            # prop_invmap() returns these values for properties that are Perl
1318            # extensions.  But this is misleading.  For now, return undef for
1319            # these, as currently documented.
1320            undef $map unless
1321                exists $prop_aliases{loose_name(lc $prop)};
1322        }
1323        return $map;
1324    }
1325    elsif ($format eq 'ar') {   # numbers, including rationals
1326        my $offset = $cp - $list_ref->[$i];
1327        return $map if $map =~ /nan/i;
1328        return $map + $offset if $offset != 0;  # If needs adjustment
1329        return eval $map;   # Convert e.g., 1/2 to 0.5
1330    }
1331    elsif ($format =~ /^a/) {   # Some entries need adjusting
1332
1333        # Linearize sequences into a string.
1334        return join "", map { chr $_ } @$map if ref $map; # XXX && $format =~ /^ a [dl] /x;
1335
1336        return "" if $map eq "" && $format =~ /^a.*e/;
1337
1338        # These are all character mappings.  Return the chr if no adjustment
1339        # is needed
1340        return chr $cp if $map eq "0";
1341
1342        # Convert special entry.
1343        if ($map eq '<hangul syllable>' && $format eq 'ad') {
1344            use Unicode::Normalize qw(NFD);
1345            return NFD(chr $cp);
1346        }
1347
1348        # The rest need adjustment from the first entry in the inversion list
1349        # corresponding to this map.
1350        my $offset = $cp - $list_ref->[$i];
1351        return chr($map + $cp - $list_ref->[$i]);
1352    }
1353    elsif ($format eq 'n') {    # The name property
1354
1355        # There are two special cases, handled here.
1356        if ($map =~ / ( .+ ) <code\ point> $ /x) {
1357            $map = sprintf("$1%04X", $cp);
1358        }
1359        elsif ($map eq '<hangul syllable>') {
1360            $map = charnames::viacode($cp);
1361        }
1362        return $map;
1363    }
1364    else {
1365        croak __PACKAGE__, "::charprop: Internal error: unknown format '$format'.  Please perlbug this";
1366    }
1367}
1368
1369=head2 B<charprops_all()>
1370
1371    use Unicode::UCD 'charprops_all';
1372
1373    my $%properties_of_A_hash_ref = charprops_all("U+41");
1374
1375This returns a reference to a hash whose keys are all the distinct Unicode (no
1376Perl extension) properties, and whose values are the respective values for
1377those properties for the input L</code point argument>.
1378
1379Each key is the property name in its longest, most descriptive form.  The
1380values are what L</charprop()> would return.
1381
1382This function is expensive in time and memory.
1383
1384=cut
1385
1386sub charprops_all($) {
1387    my $input_cp = shift;
1388
1389    my $cp = _getcode($input_cp);
1390    croak __PACKAGE__, "::charprops_all: unknown code point '$input_cp'" unless defined $cp;
1391
1392    my %return;
1393
1394    require "unicore/UCD.pl";
1395
1396    foreach my $prop (keys %prop_aliases) {
1397
1398        # Don't return a Perl extension.  (This is the only one that
1399        # %prop_aliases has in it.)
1400        next if $prop eq 'perldecimaldigit';
1401
1402        # Use long name for $prop in the hash
1403        $return{scalar prop_aliases($prop)} = charprop($cp, $prop);
1404    }
1405
1406    return \%return;
1407}
1408
1409=head2 B<charblock()>
1410
1411    use Unicode::UCD 'charblock';
1412
1413    my $charblock = charblock(0x41);
1414    my $charblock = charblock(1234);
1415    my $charblock = charblock(0x263a);
1416    my $charblock = charblock("U+263a");
1417
1418    my $range     = charblock('Armenian');
1419
1420With a L</code point argument> C<charblock()> returns the I<block> the code point
1421belongs to, e.g.  C<Basic Latin>.  The old-style block name is returned (see
1422L</Old-style versus new-style block names>).
1423The L</prop_value_aliases()> function can be used to get all the synonyms
1424of the block name.
1425
1426If the code point is unassigned, this returns the block it would belong to if
1427it were assigned.  (If the Unicode version being used is so early as to not
1428have blocks, all code points are considered to be in C<No_Block>.)
1429
1430See also L</Blocks versus Scripts>.
1431
1432If supplied with an argument that can't be a code point, C<charblock()> tries to
1433do the opposite and interpret the argument as an old-style block name.  On an
1434ASCII platform, the return value is a I<range set> with one range: an
1435anonymous array with a single element that consists of another anonymous array
1436whose first element is the first code point in the block, and whose second
1437element is the final code point in the block.  On an EBCDIC
1438platform, the first two Unicode blocks are not contiguous.  Their range sets
1439are lists containing I<start-of-range>, I<end-of-range> code point pairs.  You
1440can test whether a code point is in a range set using the L</charinrange()>
1441function.  (To be precise, each I<range set> contains a third array element,
1442after the range boundary ones: the old_style block name.)
1443
1444If the argument to C<charblock()> is not a known block, C<undef> is
1445returned.
1446
1447=cut
1448
1449my @BLOCKS;
1450my %BLOCKS;
1451
1452sub _charblocks {
1453
1454    # Can't read from the mktables table because it loses the hyphens in the
1455    # original.
1456    unless (@BLOCKS) {
1457        UnicodeVersion() unless defined $v_unicode_version;
1458        if ($v_unicode_version lt v2.0.0) {
1459            my $subrange = [ 0, 0x10FFFF, 'No_Block' ];
1460            push @BLOCKS, $subrange;
1461            push @{$BLOCKS{'No_Block'}}, $subrange;
1462        }
1463        else {
1464            my $blocksfh = openunicode("Blocks.txt");
1465	    local $_;
1466	    local $/ = "\n";
1467	    while (<$blocksfh>) {
1468
1469                # Old versions used a different syntax to mark the range.
1470                $_ =~ s/;\s+/../ if $v_unicode_version lt v3.1.0;
1471
1472		if (/^([0-9A-F]+)\.\.([0-9A-F]+);\s+(.+)/) {
1473		    my ($lo, $hi) = (hex($1), hex($2));
1474		    my $subrange = [ $lo, $hi, $3 ];
1475		    push @BLOCKS, $subrange;
1476		    push @{$BLOCKS{$3}}, $subrange;
1477		}
1478	    }
1479            if (! IS_ASCII_PLATFORM) {
1480                # The first two blocks, through 0xFF, are wrong on EBCDIC
1481                # platforms.
1482
1483                my @new_blocks = _read_table("To/Blk.pl");
1484
1485                # Get rid of the first two ranges in the Unicode version, and
1486                # replace them with the ones computed by mktables.
1487                shift @BLOCKS;
1488                shift @BLOCKS;
1489                delete $BLOCKS{'Basic Latin'};
1490                delete $BLOCKS{'Latin-1 Supplement'};
1491
1492                # But there are multiple entries in the computed versions, and
1493                # we change their names to (which we know) to be the old-style
1494                # ones.
1495                for my $i (0.. @new_blocks - 1) {
1496                    if ($new_blocks[$i][2] =~ s/Basic_Latin/Basic Latin/
1497                        or $new_blocks[$i][2] =~
1498                                    s/Latin_1_Supplement/Latin-1 Supplement/)
1499                    {
1500                        push @{$BLOCKS{$new_blocks[$i][2]}}, $new_blocks[$i];
1501                    }
1502                    else {
1503                        splice @new_blocks, $i;
1504                        last;
1505                    }
1506                }
1507                unshift @BLOCKS, @new_blocks;
1508            }
1509	}
1510    }
1511}
1512
1513sub charblock {
1514    my $arg = shift;
1515
1516    _charblocks() unless @BLOCKS;
1517
1518    my $code = _getcode($arg);
1519
1520    if (defined $code) {
1521	my $result = _search(\@BLOCKS, 0, $#BLOCKS, $code);
1522        return $result if defined $result;
1523        return 'No_Block';
1524    }
1525    elsif (exists $BLOCKS{$arg}) {
1526        return _dclone $BLOCKS{$arg};
1527    }
1528
1529    carp __PACKAGE__, "::charblock: unknown code '$arg'";
1530    return;
1531}
1532
1533=head2 B<charscript()>
1534
1535    use Unicode::UCD 'charscript';
1536
1537    my $charscript = charscript(0x41);
1538    my $charscript = charscript(1234);
1539    my $charscript = charscript("U+263a");
1540
1541    my $range      = charscript('Thai');
1542
1543With a L</code point argument>, C<charscript()> returns the I<script> the
1544code point belongs to, e.g., C<Latin>, C<Greek>, C<Han>.
1545If the code point is unassigned or the Unicode version being used is so early
1546that it doesn't have scripts, this function returns C<"Unknown">.
1547The L</prop_value_aliases()> function can be used to get all the synonyms
1548of the script name.
1549
1550Note that the Script_Extensions property is an improved version of the Script
1551property, and you should probably be using that instead, with the
1552L</charprop()> function.
1553
1554If supplied with an argument that can't be a code point, charscript() tries
1555to do the opposite and interpret the argument as a script name. The
1556return value is a I<range set>: an anonymous array of arrays that contain
1557I<start-of-range>, I<end-of-range> code point pairs. You can test whether a
1558code point is in a range set using the L</charinrange()> function.
1559(To be precise, each I<range set> contains a third array element,
1560after the range boundary ones: the script name.)
1561
1562If the C<charscript()> argument is not a known script, C<undef> is returned.
1563
1564See also L</Blocks versus Scripts>.
1565
1566=cut
1567
1568my @SCRIPTS;
1569my %SCRIPTS;
1570
1571sub _charscripts {
1572    unless (@SCRIPTS) {
1573        UnicodeVersion() unless defined $v_unicode_version;
1574        if ($v_unicode_version lt v3.1.0) {
1575            push @SCRIPTS, [ 0, 0x10FFFF, 'Unknown' ];
1576        }
1577        else {
1578            @SCRIPTS =_read_table("To/Sc.pl");
1579        }
1580    }
1581    foreach my $entry (@SCRIPTS) {
1582        $entry->[2] =~ s/(_\w)/\L$1/g;  # Preserve old-style casing
1583        push @{$SCRIPTS{$entry->[2]}}, $entry;
1584    }
1585}
1586
1587sub charscript {
1588    my $arg = shift;
1589
1590    _charscripts() unless @SCRIPTS;
1591
1592    my $code = _getcode($arg);
1593
1594    if (defined $code) {
1595	my $result = _search(\@SCRIPTS, 0, $#SCRIPTS, $code);
1596        return $result if defined $result;
1597        return $SwashInfo{'ToSc'}{'missing'};
1598    } elsif (exists $SCRIPTS{$arg}) {
1599        return _dclone $SCRIPTS{$arg};
1600    }
1601
1602    carp __PACKAGE__, "::charscript: unknown code '$arg'";
1603    return;
1604}
1605
1606=head2 B<charblocks()>
1607
1608    use Unicode::UCD 'charblocks';
1609
1610    my $charblocks = charblocks();
1611
1612C<charblocks()> returns a reference to a hash with the known block names
1613as the keys, and the code point ranges (see L</charblock()>) as the values.
1614
1615The names are in the old-style (see L</Old-style versus new-style block
1616names>).
1617
1618L<prop_invmap("block")|/prop_invmap()> can be used to get this same data in a
1619different type of data structure.
1620
1621L<prop_values("Block")|/prop_values()> can be used to get all
1622the known new-style block names as a list, without the code point ranges.
1623
1624See also L</Blocks versus Scripts>.
1625
1626=cut
1627
1628sub charblocks {
1629    _charblocks() unless %BLOCKS;
1630    return _dclone \%BLOCKS;
1631}
1632
1633=head2 B<charscripts()>
1634
1635    use Unicode::UCD 'charscripts';
1636
1637    my $charscripts = charscripts();
1638
1639C<charscripts()> returns a reference to a hash with the known script
1640names as the keys, and the code point ranges (see L</charscript()>) as
1641the values.
1642
1643L<prop_invmap("script")|/prop_invmap()> can be used to get this same data in a
1644different type of data structure.  Since the Script_Extensions property is an
1645improved version of the Script property, you should instead use
1646L<prop_invmap("scx")|/prop_invmap()>.
1647
1648L<C<prop_values("Script")>|/prop_values()> can be used to get all
1649the known script names as a list, without the code point ranges.
1650
1651See also L</Blocks versus Scripts>.
1652
1653=cut
1654
1655sub charscripts {
1656    _charscripts() unless %SCRIPTS;
1657    return _dclone \%SCRIPTS;
1658}
1659
1660=head2 B<charinrange()>
1661
1662In addition to using the C<\p{Blk=...}> and C<\P{Blk=...}> constructs, you
1663can also test whether a code point is in the I<range> as returned by
1664L</charblock()> and L</charscript()> or as the values of the hash returned
1665by L</charblocks()> and L</charscripts()> by using C<charinrange()>:
1666
1667    use Unicode::UCD qw(charscript charinrange);
1668
1669    $range = charscript('Hiragana');
1670    print "looks like hiragana\n" if charinrange($range, $codepoint);
1671
1672=cut
1673
1674my %GENERAL_CATEGORIES =
1675 (
1676    'L'  =>         'Letter',
1677    'LC' =>         'CasedLetter',
1678    'Lu' =>         'UppercaseLetter',
1679    'Ll' =>         'LowercaseLetter',
1680    'Lt' =>         'TitlecaseLetter',
1681    'Lm' =>         'ModifierLetter',
1682    'Lo' =>         'OtherLetter',
1683    'M'  =>         'Mark',
1684    'Mn' =>         'NonspacingMark',
1685    'Mc' =>         'SpacingMark',
1686    'Me' =>         'EnclosingMark',
1687    'N'  =>         'Number',
1688    'Nd' =>         'DecimalNumber',
1689    'Nl' =>         'LetterNumber',
1690    'No' =>         'OtherNumber',
1691    'P'  =>         'Punctuation',
1692    'Pc' =>         'ConnectorPunctuation',
1693    'Pd' =>         'DashPunctuation',
1694    'Ps' =>         'OpenPunctuation',
1695    'Pe' =>         'ClosePunctuation',
1696    'Pi' =>         'InitialPunctuation',
1697    'Pf' =>         'FinalPunctuation',
1698    'Po' =>         'OtherPunctuation',
1699    'S'  =>         'Symbol',
1700    'Sm' =>         'MathSymbol',
1701    'Sc' =>         'CurrencySymbol',
1702    'Sk' =>         'ModifierSymbol',
1703    'So' =>         'OtherSymbol',
1704    'Z'  =>         'Separator',
1705    'Zs' =>         'SpaceSeparator',
1706    'Zl' =>         'LineSeparator',
1707    'Zp' =>         'ParagraphSeparator',
1708    'C'  =>         'Other',
1709    'Cc' =>         'Control',
1710    'Cf' =>         'Format',
1711    'Cs' =>         'Surrogate',
1712    'Co' =>         'PrivateUse',
1713    'Cn' =>         'Unassigned',
1714 );
1715
1716sub general_categories {
1717    return _dclone \%GENERAL_CATEGORIES;
1718}
1719
1720=head2 B<general_categories()>
1721
1722    use Unicode::UCD 'general_categories';
1723
1724    my $categories = general_categories();
1725
1726This returns a reference to a hash which has short
1727general category names (such as C<Lu>, C<Nd>, C<Zs>, C<S>) as keys and long
1728names (such as C<UppercaseLetter>, C<DecimalNumber>, C<SpaceSeparator>,
1729C<Symbol>) as values.  The hash is reversible in case you need to go
1730from the long names to the short names.  The general category is the
1731one returned from
1732L</charinfo()> under the C<category> key.
1733
1734The L</prop_values()> and L</prop_value_aliases()> functions can be used as an
1735alternative to this function; the first returning a simple list of the short
1736category names; and the second gets all the synonyms of a given category name.
1737
1738=cut
1739
1740my %BIDI_TYPES =
1741 (
1742   'L'   => 'Left-to-Right',
1743   'LRE' => 'Left-to-Right Embedding',
1744   'LRO' => 'Left-to-Right Override',
1745   'R'   => 'Right-to-Left',
1746   'AL'  => 'Right-to-Left Arabic',
1747   'RLE' => 'Right-to-Left Embedding',
1748   'RLO' => 'Right-to-Left Override',
1749   'PDF' => 'Pop Directional Format',
1750   'EN'  => 'European Number',
1751   'ES'  => 'European Number Separator',
1752   'ET'  => 'European Number Terminator',
1753   'AN'  => 'Arabic Number',
1754   'CS'  => 'Common Number Separator',
1755   'NSM' => 'Non-Spacing Mark',
1756   'BN'  => 'Boundary Neutral',
1757   'B'   => 'Paragraph Separator',
1758   'S'   => 'Segment Separator',
1759   'WS'  => 'Whitespace',
1760   'ON'  => 'Other Neutrals',
1761 );
1762
1763=head2 B<bidi_types()>
1764
1765    use Unicode::UCD 'bidi_types';
1766
1767    my $categories = bidi_types();
1768
1769This returns a reference to a hash which has the short
1770bidi (bidirectional) type names (such as C<L>, C<R>) as keys and long
1771names (such as C<Left-to-Right>, C<Right-to-Left>) as values.  The
1772hash is reversible in case you need to go from the long names to the
1773short names.  The bidi type is the one returned from
1774L</charinfo()>
1775under the C<bidi> key.  For the exact meaning of the various bidi classes
1776the Unicode TR9 is recommended reading:
1777L<http://www.unicode.org/reports/tr9/>
1778(as of Unicode 5.0.0)
1779
1780The L</prop_values()> and L</prop_value_aliases()> functions can be used as an
1781alternative to this function; the first returning a simple list of the short
1782bidi type names; and the second gets all the synonyms of a given bidi type
1783name.
1784
1785=cut
1786
1787sub bidi_types {
1788    return _dclone \%BIDI_TYPES;
1789}
1790
1791=head2 B<compexcl()>
1792
1793WARNING: Unicode discourages the use of this function or any of the
1794alternative mechanisms listed in this section (the documentation of
1795C<compexcl()>), except internally in implementations of the Unicode
1796Normalization Algorithm.  You should be using L<Unicode::Normalize> directly
1797instead of these.  Using these will likely lead to half-baked results.
1798
1799    use Unicode::UCD 'compexcl';
1800
1801    my $compexcl = compexcl(0x09dc);
1802
1803This routine returns C<undef> if the Unicode version being used is so early
1804that it doesn't have this property.
1805
1806C<compexcl()> is included for backwards
1807compatibility, but as of Perl 5.12 and more modern Unicode versions, for
1808most purposes it is probably more convenient to use one of the following
1809instead:
1810
1811    my $compexcl = chr(0x09dc) =~ /\p{Comp_Ex};
1812    my $compexcl = chr(0x09dc) =~ /\p{Full_Composition_Exclusion};
1813
1814or even
1815
1816    my $compexcl = chr(0x09dc) =~ /\p{CE};
1817    my $compexcl = chr(0x09dc) =~ /\p{Composition_Exclusion};
1818
1819The first two forms return B<true> if the L</code point argument> should not
1820be produced by composition normalization.  For the final two forms to return
1821B<true>, it is additionally required that this fact not otherwise be
1822determinable from the Unicode data base.
1823
1824This routine behaves identically to the final two forms.  That is,
1825it does not return B<true> if the code point has a decomposition
1826consisting of another single code point, nor if its decomposition starts
1827with a code point whose combining class is non-zero.  Code points that meet
1828either of these conditions should also not be produced by composition
1829normalization, which is probably why you should use the
1830C<Full_Composition_Exclusion> property instead, as shown above.
1831
1832The routine returns B<false> otherwise.
1833
1834=cut
1835
1836# Eval'd so can run on versions earlier than the property is available in
1837my $Composition_Exclusion_re = eval 'qr/\p{Composition_Exclusion}/';
1838
1839sub compexcl {
1840    my $arg  = shift;
1841    my $code = _getcode($arg);
1842    croak __PACKAGE__, "::compexcl: unknown code '$arg'"
1843	unless defined $code;
1844
1845    UnicodeVersion() unless defined $v_unicode_version;
1846    return if $v_unicode_version lt v3.0.0;
1847
1848    no warnings "non_unicode";     # So works on non-Unicode code points
1849    return chr($code) =~ $Composition_Exclusion_re
1850}
1851
1852=head2 B<casefold()>
1853
1854    use Unicode::UCD 'casefold';
1855
1856    my $casefold = casefold(0xDF);
1857    if (defined $casefold) {
1858        my @full_fold_hex = split / /, $casefold->{'full'};
1859        my $full_fold_string =
1860                    join "", map {chr(hex($_))} @full_fold_hex;
1861        my @turkic_fold_hex =
1862                        split / /, ($casefold->{'turkic'} ne "")
1863                                        ? $casefold->{'turkic'}
1864                                        : $casefold->{'full'};
1865        my $turkic_fold_string =
1866                        join "", map {chr(hex($_))} @turkic_fold_hex;
1867    }
1868    if (defined $casefold && $casefold->{'simple'} ne "") {
1869        my $simple_fold_hex = $casefold->{'simple'};
1870        my $simple_fold_string = chr(hex($simple_fold_hex));
1871    }
1872
1873This returns the (almost) locale-independent case folding of the
1874character specified by the L</code point argument>.  (Starting in Perl v5.16,
1875the core function C<fc()> returns the C<full> mapping (described below)
1876faster than this does, and for entire strings.)
1877
1878If there is no case folding for the input code point, C<undef> is returned.
1879
1880If there is a case folding for that code point, a reference to a hash
1881with the following fields is returned:
1882
1883=over
1884
1885=item B<code>
1886
1887the input native L</code point argument> expressed in hexadecimal, with
1888leading zeros
1889added if necessary to make it contain at least four hexdigits
1890
1891=item B<full>
1892
1893one or more codes (separated by spaces) that, taken in order, give the
1894code points for the case folding for I<code>.
1895Each has at least four hexdigits.
1896
1897=item B<simple>
1898
1899is empty, or is exactly one code with at least four hexdigits which can be used
1900as an alternative case folding when the calling program cannot cope with the
1901fold being a sequence of multiple code points.  If I<full> is just one code
1902point, then I<simple> equals I<full>.  If there is no single code point folding
1903defined for I<code>, then I<simple> is the empty string.  Otherwise, it is an
1904inferior, but still better-than-nothing alternative folding to I<full>.
1905
1906=item B<mapping>
1907
1908is the same as I<simple> if I<simple> is not empty, and it is the same as I<full>
1909otherwise.  It can be considered to be the simplest possible folding for
1910I<code>.  It is defined primarily for backwards compatibility.
1911
1912=item B<status>
1913
1914is C<C> (for C<common>) if the best possible fold is a single code point
1915(I<simple> equals I<full> equals I<mapping>).  It is C<S> if there are distinct
1916folds, I<simple> and I<full> (I<mapping> equals I<simple>).  And it is C<F> if
1917there is only a I<full> fold (I<mapping> equals I<full>; I<simple> is empty).
1918Note that this
1919describes the contents of I<mapping>.  It is defined primarily for backwards
1920compatibility.
1921
1922For Unicode versions between 3.1 and 3.1.1 inclusive, I<status> can also be
1923C<I> which is the same as C<C> but is a special case for dotted uppercase I and
1924dotless lowercase i:
1925
1926=over
1927
1928=item Z<>B<*> If you use this C<I> mapping
1929
1930the result is case-insensitive,
1931but dotless and dotted I's are not distinguished
1932
1933=item Z<>B<*> If you exclude this C<I> mapping
1934
1935the result is not fully case-insensitive, but
1936dotless and dotted I's are distinguished
1937
1938=back
1939
1940=item B<turkic>
1941
1942contains any special folding for Turkic languages.  For versions of Unicode
1943starting with 3.2, this field is empty unless I<code> has a different folding
1944in Turkic languages, in which case it is one or more codes (separated by
1945spaces) that, taken in order, give the code points for the case folding for
1946I<code> in those languages.
1947Each code has at least four hexdigits.
1948Note that this folding does not maintain canonical equivalence without
1949additional processing.
1950
1951For Unicode versions between 3.1 and 3.1.1 inclusive, this field is empty unless
1952there is a
1953special folding for Turkic languages, in which case I<status> is C<I>, and
1954I<mapping>, I<full>, I<simple>, and I<turkic> are all equal.
1955
1956=back
1957
1958Programs that want complete generality and the best folding results should use
1959the folding contained in the I<full> field.  But note that the fold for some
1960code points will be a sequence of multiple code points.
1961
1962Programs that can't cope with the fold mapping being multiple code points can
1963use the folding contained in the I<simple> field, with the loss of some
1964generality.  In Unicode 5.1, about 7% of the defined foldings have no single
1965code point folding.
1966
1967The I<mapping> and I<status> fields are provided for backwards compatibility for
1968existing programs.  They contain the same values as in previous versions of
1969this function.
1970
1971Locale is not completely independent.  The I<turkic> field contains results to
1972use when the locale is a Turkic language.
1973
1974For more information about case mappings see
1975L<http://www.unicode.org/unicode/reports/tr21>
1976
1977=cut
1978
1979my %CASEFOLD;
1980
1981sub _casefold {
1982    unless (%CASEFOLD) {   # Populate the hash
1983        my ($full_invlist_ref, $full_invmap_ref, undef, $default)
1984                                                = prop_invmap('Case_Folding');
1985
1986        # Use the recipe given in the prop_invmap() pod to convert the
1987        # inversion map into the hash.
1988        for my $i (0 .. @$full_invlist_ref - 1 - 1) {
1989            next if $full_invmap_ref->[$i] == $default;
1990            my $adjust = -1;
1991            for my $j ($full_invlist_ref->[$i] .. $full_invlist_ref->[$i+1] -1) {
1992                $adjust++;
1993                if (! ref $full_invmap_ref->[$i]) {
1994
1995                    # This is a single character mapping
1996                    $CASEFOLD{$j}{'status'} = 'C';
1997                    $CASEFOLD{$j}{'simple'}
1998                        = $CASEFOLD{$j}{'full'}
1999                        = $CASEFOLD{$j}{'mapping'}
2000                        = sprintf("%04X", $full_invmap_ref->[$i] + $adjust);
2001                    $CASEFOLD{$j}{'code'} = sprintf("%04X", $j);
2002                    $CASEFOLD{$j}{'turkic'} = "";
2003                }
2004                else {  # prop_invmap ensures that $adjust is 0 for a ref
2005                    $CASEFOLD{$j}{'status'} = 'F';
2006                    $CASEFOLD{$j}{'full'}
2007                    = $CASEFOLD{$j}{'mapping'}
2008                    = join " ", map { sprintf "%04X", $_ }
2009                                                    @{$full_invmap_ref->[$i]};
2010                    $CASEFOLD{$j}{'simple'} = "";
2011                    $CASEFOLD{$j}{'code'} = sprintf("%04X", $j);
2012                    $CASEFOLD{$j}{'turkic'} = "";
2013                }
2014            }
2015        }
2016
2017        # We have filled in the full mappings above, assuming there were no
2018        # simple ones for the ones with multi-character maps.  Now, we find
2019        # and fix the cases where that assumption was false.
2020        (my ($simple_invlist_ref, $simple_invmap_ref, undef), $default)
2021                                        = prop_invmap('Simple_Case_Folding');
2022        for my $i (0 .. @$simple_invlist_ref - 1 - 1) {
2023            next if $simple_invmap_ref->[$i] == $default;
2024            my $adjust = -1;
2025            for my $j ($simple_invlist_ref->[$i]
2026                       .. $simple_invlist_ref->[$i+1] -1)
2027            {
2028                $adjust++;
2029                next if $CASEFOLD{$j}{'status'} eq 'C';
2030                $CASEFOLD{$j}{'status'} = 'S';
2031                $CASEFOLD{$j}{'simple'}
2032                    = $CASEFOLD{$j}{'mapping'}
2033                    = sprintf("%04X", $simple_invmap_ref->[$i] + $adjust);
2034                $CASEFOLD{$j}{'code'} = sprintf("%04X", $j);
2035                $CASEFOLD{$j}{'turkic'} = "";
2036            }
2037        }
2038
2039        # We hard-code in the turkish rules
2040        UnicodeVersion() unless defined $v_unicode_version;
2041        if ($v_unicode_version ge v3.2.0) {
2042
2043            # These two code points should already have regular entries, so
2044            # just fill in the turkish fields
2045            $CASEFOLD{ord('I')}{'turkic'} = '0131';
2046            $CASEFOLD{0x130}{'turkic'} = sprintf "%04X", ord('i');
2047        }
2048        elsif ($v_unicode_version ge v3.1.0) {
2049
2050            # These two code points don't have entries otherwise.
2051            $CASEFOLD{0x130}{'code'} = '0130';
2052            $CASEFOLD{0x131}{'code'} = '0131';
2053            $CASEFOLD{0x130}{'status'} = $CASEFOLD{0x131}{'status'} = 'I';
2054            $CASEFOLD{0x130}{'turkic'}
2055                = $CASEFOLD{0x130}{'mapping'}
2056                = $CASEFOLD{0x130}{'full'}
2057                = $CASEFOLD{0x130}{'simple'}
2058                = $CASEFOLD{0x131}{'turkic'}
2059                = $CASEFOLD{0x131}{'mapping'}
2060                = $CASEFOLD{0x131}{'full'}
2061                = $CASEFOLD{0x131}{'simple'}
2062                = sprintf "%04X", ord('i');
2063        }
2064    }
2065}
2066
2067sub casefold {
2068    my $arg  = shift;
2069    my $code = _getcode($arg);
2070    croak __PACKAGE__, "::casefold: unknown code '$arg'"
2071	unless defined $code;
2072
2073    _casefold() unless %CASEFOLD;
2074
2075    return $CASEFOLD{$code};
2076}
2077
2078=head2 B<all_casefolds()>
2079
2080
2081    use Unicode::UCD 'all_casefolds';
2082
2083    my $all_folds_ref = all_casefolds();
2084    foreach my $char_with_casefold (sort { $a <=> $b }
2085                                    keys %$all_folds_ref)
2086    {
2087        printf "%04X:", $char_with_casefold;
2088        my $casefold = $all_folds_ref->{$char_with_casefold};
2089
2090        # Get folds for $char_with_casefold
2091
2092        my @full_fold_hex = split / /, $casefold->{'full'};
2093        my $full_fold_string =
2094                    join "", map {chr(hex($_))} @full_fold_hex;
2095        print " full=", join " ", @full_fold_hex;
2096        my @turkic_fold_hex =
2097                        split / /, ($casefold->{'turkic'} ne "")
2098                                        ? $casefold->{'turkic'}
2099                                        : $casefold->{'full'};
2100        my $turkic_fold_string =
2101                        join "", map {chr(hex($_))} @turkic_fold_hex;
2102        print "; turkic=", join " ", @turkic_fold_hex;
2103        if (defined $casefold && $casefold->{'simple'} ne "") {
2104            my $simple_fold_hex = $casefold->{'simple'};
2105            my $simple_fold_string = chr(hex($simple_fold_hex));
2106            print "; simple=$simple_fold_hex";
2107        }
2108        print "\n";
2109    }
2110
2111This returns all the case foldings in the current version of Unicode in the
2112form of a reference to a hash.  Each key to the hash is the decimal
2113representation of a Unicode character that has a casefold to other than
2114itself.  The casefold of a semi-colon is itself, so it isn't in the hash;
2115likewise for a lowercase "a", but there is an entry for a capital "A".  The
2116hash value for each key is another hash, identical to what is returned by
2117L</casefold()> if called with that code point as its argument.  So the value
2118C<< all_casefolds()->{ord("A")}' >> is equivalent to C<casefold(ord("A"))>;
2119
2120=cut
2121
2122sub all_casefolds () {
2123    _casefold() unless %CASEFOLD;
2124    return _dclone \%CASEFOLD;
2125}
2126
2127=head2 B<casespec()>
2128
2129    use Unicode::UCD 'casespec';
2130
2131    my $casespec = casespec(0xFB00);
2132
2133This returns the potentially locale-dependent case mappings of the L</code point
2134argument>.  The mappings may be longer than a single code point (which the basic
2135Unicode case mappings as returned by L</charinfo()> never are).
2136
2137If there are no case mappings for the L</code point argument>, or if all three
2138possible mappings (I<lower>, I<title> and I<upper>) result in single code
2139points and are locale independent and unconditional, C<undef> is returned
2140(which means that the case mappings, if any, for the code point are those
2141returned by L</charinfo()>).
2142
2143Otherwise, a reference to a hash giving the mappings (or a reference to a hash
2144of such hashes, explained below) is returned with the following keys and their
2145meanings:
2146
2147The keys in the bottom layer hash with the meanings of their values are:
2148
2149=over
2150
2151=item B<code>
2152
2153the input native L</code point argument> expressed in hexadecimal, with
2154leading zeros
2155added if necessary to make it contain at least four hexdigits
2156
2157=item B<lower>
2158
2159one or more codes (separated by spaces) that, taken in order, give the
2160code points for the lower case of I<code>.
2161Each has at least four hexdigits.
2162
2163=item B<title>
2164
2165one or more codes (separated by spaces) that, taken in order, give the
2166code points for the title case of I<code>.
2167Each has at least four hexdigits.
2168
2169=item B<upper>
2170
2171one or more codes (separated by spaces) that, taken in order, give the
2172code points for the upper case of I<code>.
2173Each has at least four hexdigits.
2174
2175=item B<condition>
2176
2177the conditions for the mappings to be valid.
2178If C<undef>, the mappings are always valid.
2179When defined, this field is a list of conditions,
2180all of which must be true for the mappings to be valid.
2181The list consists of one or more
2182I<locales> (see below)
2183and/or I<contexts> (explained in the next paragraph),
2184separated by spaces.
2185(Other than as used to separate elements, spaces are to be ignored.)
2186Case distinctions in the condition list are not significant.
2187Conditions preceded by "NON_" represent the negation of the condition.
2188
2189A I<context> is one of those defined in the Unicode standard.
2190For Unicode 5.1, they are defined in Section 3.13 C<Default Case Operations>
2191available at
2192L<http://www.unicode.org/versions/Unicode5.1.0/>.
2193These are for context-sensitive casing.
2194
2195=back
2196
2197The hash described above is returned for locale-independent casing, where
2198at least one of the mappings has length longer than one.  If C<undef> is
2199returned, the code point may have mappings, but if so, all are length one,
2200and are returned by L</charinfo()>.
2201Note that when this function does return a value, it will be for the complete
2202set of mappings for a code point, even those whose length is one.
2203
2204If there are additional casing rules that apply only in certain locales,
2205an additional key for each will be defined in the returned hash.  Each such key
2206will be its locale name, defined as a 2-letter ISO 3166 country code, possibly
2207followed by a "_" and a 2-letter ISO language code (possibly followed by a "_"
2208and a variant code).  You can find the lists of all possible locales, see
2209L<Locale::Country> and L<Locale::Language>.
2210(In Unicode 6.0, the only locales returned by this function
2211are C<lt>, C<tr>, and C<az>.)
2212
2213Each locale key is a reference to a hash that has the form above, and gives
2214the casing rules for that particular locale, which take precedence over the
2215locale-independent ones when in that locale.
2216
2217If the only casing for a code point is locale-dependent, then the returned
2218hash will not have any of the base keys, like C<code>, C<upper>, etc., but
2219will contain only locale keys.
2220
2221For more information about case mappings see
2222L<http://www.unicode.org/unicode/reports/tr21/>
2223
2224=cut
2225
2226my %CASESPEC;
2227
2228sub _casespec {
2229    unless (%CASESPEC) {
2230        UnicodeVersion() unless defined $v_unicode_version;
2231        if ($v_unicode_version ge v2.1.8) {
2232            my $casespecfh = openunicode("SpecialCasing.txt");
2233	    local $_;
2234	    local $/ = "\n";
2235	    while (<$casespecfh>) {
2236		if (/^([0-9A-F]+); ([0-9A-F]+(?: [0-9A-F]+)*)?; ([0-9A-F]+(?: [0-9A-F]+)*)?; ([0-9A-F]+(?: [0-9A-F]+)*)?; (\w+(?: \w+)*)?/) {
2237
2238		    my ($hexcode, $lower, $title, $upper, $condition) =
2239			($1, $2, $3, $4, $5);
2240                    if (! IS_ASCII_PLATFORM) { # Remap entry to native
2241                        foreach my $var_ref (\$hexcode,
2242                                             \$lower,
2243                                             \$title,
2244                                             \$upper)
2245                        {
2246                            next unless defined $$var_ref;
2247                            $$var_ref = join " ",
2248                                        map { sprintf("%04X",
2249                                              utf8::unicode_to_native(hex $_)) }
2250                                        split " ", $$var_ref;
2251                        }
2252                    }
2253
2254		    my $code = hex($hexcode);
2255
2256                    # In 2.1.8, there were duplicate entries; ignore all but
2257                    # the first one -- there were no conditions in the file
2258                    # anyway.
2259		    if (exists $CASESPEC{$code} && $v_unicode_version ne v2.1.8)
2260                    {
2261			if (exists $CASESPEC{$code}->{code}) {
2262			    my ($oldlower,
2263				$oldtitle,
2264				$oldupper,
2265				$oldcondition) =
2266				    @{$CASESPEC{$code}}{qw(lower
2267							   title
2268							   upper
2269							   condition)};
2270			    if (defined $oldcondition) {
2271				my ($oldlocale) =
2272				($oldcondition =~ /^([a-z][a-z](?:_\S+)?)/);
2273				delete $CASESPEC{$code};
2274				$CASESPEC{$code}->{$oldlocale} =
2275				{ code      => $hexcode,
2276				  lower     => $oldlower,
2277				  title     => $oldtitle,
2278				  upper     => $oldupper,
2279				  condition => $oldcondition };
2280			    }
2281			}
2282			my ($locale) =
2283			    ($condition =~ /^([a-z][a-z](?:_\S+)?)/);
2284			$CASESPEC{$code}->{$locale} =
2285			{ code      => $hexcode,
2286			  lower     => $lower,
2287			  title     => $title,
2288			  upper     => $upper,
2289			  condition => $condition };
2290		    } else {
2291			$CASESPEC{$code} =
2292			{ code      => $hexcode,
2293			  lower     => $lower,
2294			  title     => $title,
2295			  upper     => $upper,
2296			  condition => $condition };
2297		    }
2298		}
2299	    }
2300	}
2301    }
2302}
2303
2304sub casespec {
2305    my $arg  = shift;
2306    my $code = _getcode($arg);
2307    croak __PACKAGE__, "::casespec: unknown code '$arg'"
2308	unless defined $code;
2309
2310    _casespec() unless %CASESPEC;
2311
2312    return ref $CASESPEC{$code} ? _dclone $CASESPEC{$code} : $CASESPEC{$code};
2313}
2314
2315=head2 B<namedseq()>
2316
2317    use Unicode::UCD 'namedseq';
2318
2319    my $namedseq = namedseq("KATAKANA LETTER AINU P");
2320    my @namedseq = namedseq("KATAKANA LETTER AINU P");
2321    my %namedseq = namedseq();
2322
2323If used with a single argument in a scalar context, returns the string
2324consisting of the code points of the named sequence, or C<undef> if no
2325named sequence by that name exists.  If used with a single argument in
2326a list context, it returns the list of the ordinals of the code points.
2327
2328If used with no
2329arguments in a list context, it returns a hash with the names of all the
2330named sequences as the keys and their sequences as strings as
2331the values.  Otherwise, it returns C<undef> or an empty list depending
2332on the context.
2333
2334This function only operates on officially approved (not provisional) named
2335sequences.
2336
2337Note that as of Perl 5.14, C<\N{KATAKANA LETTER AINU P}> will insert the named
2338sequence into double-quoted strings, and C<charnames::string_vianame("KATAKANA
2339LETTER AINU P")> will return the same string this function does, but will also
2340operate on character names that aren't named sequences, without you having to
2341know which are which.  See L<charnames>.
2342
2343=cut
2344
2345my %NAMEDSEQ;
2346
2347sub _namedseq {
2348    unless (%NAMEDSEQ) {
2349        my $namedseqfh = openunicode("Name.pl");
2350        local $_;
2351        local $/ = "\n";
2352        while (<$namedseqfh>) {
2353            next if m/ ^ \s* \# /x;
2354
2355            # Each entry is currently two lines.  The first contains the code
2356            # points in the sequence separated by spaces.  If this entry
2357            # doesn't have spaces, it isn't a named sequence.
2358            if (/^ [0-9A-F]{4,5} (?: \  [0-9A-F]{4,5} )+ $ /x) {
2359                my $sequence = $_;
2360                chomp $sequence;
2361
2362                # And the second is the name
2363                my $name = <$namedseqfh>;
2364                chomp $name;
2365                my @s = map { chr(hex($_)) } split(' ', $sequence);
2366                $NAMEDSEQ{$name} = join("", @s);
2367            }
2368        }
2369    }
2370}
2371
2372sub namedseq {
2373
2374    # Use charnames::string_vianame() which now returns this information,
2375    # unless the caller wants the hash returned, in which case we read it in,
2376    # and thereafter use it instead of calling charnames, as it is faster.
2377
2378    my $wantarray = wantarray();
2379    if (defined $wantarray) {
2380	if ($wantarray) {
2381	    if (@_ == 0) {
2382                _namedseq() unless %NAMEDSEQ;
2383		return %NAMEDSEQ;
2384	    } elsif (@_ == 1) {
2385		my $s;
2386                if (%NAMEDSEQ) {
2387                    $s = $NAMEDSEQ{ $_[0] };
2388                }
2389                else {
2390                    $s = charnames::string_vianame($_[0]);
2391                }
2392		return defined $s ? map { ord($_) } split('', $s) : ();
2393	    }
2394	} elsif (@_ == 1) {
2395            return $NAMEDSEQ{ $_[0] } if %NAMEDSEQ;
2396            return charnames::string_vianame($_[0]);
2397	}
2398    }
2399    return;
2400}
2401
2402my %NUMERIC;
2403
2404sub _numeric {
2405    my @numbers = _read_table("To/Nv.pl");
2406    foreach my $entry (@numbers) {
2407        my ($start, $end, $value) = @$entry;
2408
2409        # If value contains a slash, convert to decimal, add a reverse hash
2410        # used by charinfo.
2411        if ((my @rational = split /\//, $value) == 2) {
2412            my $real = $rational[0] / $rational[1];
2413            $real_to_rational{$real} = $value;
2414            $value = $real;
2415
2416            # Should only be single element, but just in case...
2417            for my $i ($start .. $end) {
2418                $NUMERIC{$i} = $value;
2419            }
2420        }
2421        else {
2422            # The values require adjusting, as is in 'a' format
2423            for my $i ($start .. $end) {
2424                $NUMERIC{$i} = $value + $i - $start;
2425            }
2426        }
2427    }
2428
2429    # Decided unsafe to use these that aren't officially part of the Unicode
2430    # standard.
2431    #use Math::Trig;
2432    #my $pi = acos(-1.0);
2433    #$NUMERIC{0x03C0} = $pi;
2434
2435    # Euler's constant, not to be confused with Euler's number
2436    #$NUMERIC{0x2107} = 0.57721566490153286060651209008240243104215933593992;
2437
2438    # Euler's number
2439    #$NUMERIC{0x212F} = 2.7182818284590452353602874713526624977572;
2440
2441    return;
2442}
2443
2444=pod
2445
2446=head2 B<num()>
2447
2448    use Unicode::UCD 'num';
2449
2450    my $val = num("123");
2451    my $one_quarter = num("\N{VULGAR FRACTION 1/4}");
2452    my $val = num("12a", \$valid_length);  # $valid_length contains 2
2453
2454C<num()> returns the numeric value of the input Unicode string; or C<undef> if it
2455doesn't think the entire string has a completely valid, safe numeric value.
2456If called with an optional second parameter, a reference to a scalar, C<num()>
2457will set the scalar to the length of any valid initial substring; or to 0 if none.
2458
2459If the string is just one character in length, the Unicode numeric value
2460is returned if it has one, or C<undef> otherwise.  If the optional scalar ref
2461is passed, it would be set to 1 if the return is valid; or 0 if the return is
2462C<undef>.  Note that the numeric value returned need not be a whole number.
2463C<num("\N{TIBETAN DIGIT HALF ZERO}")>, for example returns -0.5.
2464
2465=cut
2466
2467#A few characters to which Unicode doesn't officially
2468#assign a numeric value are considered numeric by C<num>.
2469#These are:
2470
2471# EULER CONSTANT             0.5772...  (this is NOT Euler's number)
2472# SCRIPT SMALL E             2.71828... (this IS Euler's number)
2473# GREEK SMALL LETTER PI      3.14159...
2474
2475=pod
2476
2477If the string is more than one character, C<undef> is returned unless
2478all its characters are decimal digits (that is, they would match C<\d+>),
2479from the same script.  For example if you have an ASCII '0' and a Bengali
2480'3', mixed together, they aren't considered a valid number, and C<undef>
2481is returned.  A further restriction is that the digits all have to be of
2482the same form.  A half-width digit mixed with a full-width one will
2483return C<undef>.  The Arabic script has two sets of digits;  C<num> will
2484return C<undef> unless all the digits in the string come from the same
2485set.  In all cases, the optional scalar ref parameter is set to how
2486long any valid initial substring of digits is; hence it will be set to the
2487entire string length if the main return value is not C<undef>.
2488
2489C<num> errs on the side of safety, and there may be valid strings of
2490decimal digits that it doesn't recognize.  Note that Unicode defines
2491a number of "digit" characters that aren't "decimal digit" characters.
2492"Decimal digits" have the property that they have a positional value, i.e.,
2493there is a units position, a 10's position, a 100's, etc, AND they are
2494arranged in Unicode in blocks of 10 contiguous code points.  The Chinese
2495digits, for example, are not in such a contiguous block, and so Unicode
2496doesn't view them as decimal digits, but merely digits, and so C<\d> will not
2497match them.  A single-character string containing one of these digits will
2498have its decimal value returned by C<num>, but any longer string containing
2499only these digits will return C<undef>.
2500
2501Strings of multiple sub- and superscripts are not recognized as numbers.  You
2502can use either of the compatibility decompositions in Unicode::Normalize to
2503change these into digits, and then call C<num> on the result.
2504
2505=cut
2506
2507# To handle sub, superscripts, this could if called in list context,
2508# consider those, and return the <decomposition> type in the second
2509# array element.
2510
2511sub num ($;$) {
2512    my ($string, $retlen_ref) = @_;
2513
2514    use feature 'unicode_strings';
2515
2516    _numeric unless %NUMERIC;
2517    $$retlen_ref = 0 if $retlen_ref;    # Assume will fail
2518
2519    my $length = length $string;
2520    return if $length == 0;
2521
2522    my $first_ord = ord(substr($string, 0, 1));
2523    return if ! exists  $NUMERIC{$first_ord}
2524           || ! defined $NUMERIC{$first_ord};
2525
2526    # Here, we know the first character is numeric
2527    my $value = $NUMERIC{$first_ord};
2528    $$retlen_ref = 1 if $retlen_ref;    # Assume only this one is numeric
2529
2530    return $value if $length == 1;
2531
2532    # Here, the input is longer than a single character.  To be valid, it must
2533    # be entirely decimal digits, which means it must start with one.
2534    return if $string =~ / ^ \D /x;
2535
2536    # To be a valid decimal number, it should be in a block of 10 consecutive
2537    # characters, whose values are 0, 1, 2, ... 9.  Therefore this digit's
2538    # value is its offset in that block from the character that means zero.
2539    my $zero_ord = $first_ord - $value;
2540
2541    # Unicode 6.0 instituted the rule that only digits in a consecutive
2542    # block of 10 would be considered decimal digits.  If this is an earlier
2543    # release, we verify that this first character is a member of such a
2544    # block.  That is, that the block of characters surrounding this one
2545    # consists of all \d characters whose numeric values are the expected
2546    # ones.  If not, then this single character is numeric, but the string as
2547    # a whole is not considered to be.
2548    UnicodeVersion() unless defined $v_unicode_version;
2549    if ($v_unicode_version lt v6.0.0) {
2550        for my $i (0 .. 9) {
2551            my $ord = $zero_ord + $i;
2552            return unless chr($ord) =~ /\d/;
2553            my $numeric = $NUMERIC{$ord};
2554            return unless defined $numeric;
2555            return unless $numeric == $i;
2556        }
2557    }
2558
2559    for my $i (1 .. $length -1) {
2560
2561        # Here we know either by verifying, or by fact of the first character
2562        # being a \d in Unicode 6.0 or later, that any character between the
2563        # character that means 0, and 9 positions above it must be \d, and
2564        # must have its value correspond to its offset from the zero.  Any
2565        # characters outside these 10 do not form a legal number for this
2566        # function.
2567        my $ord = ord(substr($string, $i, 1));
2568        my $digit = $ord - $zero_ord;
2569        if ($digit < 0 || $digit > 9) {
2570            $$retlen_ref = $i if $retlen_ref;
2571            return;
2572        }
2573        $value = $value * 10 + $digit;
2574    }
2575
2576    $$retlen_ref = $length if $retlen_ref;
2577    return $value;
2578}
2579
2580=pod
2581
2582=head2 B<prop_aliases()>
2583
2584    use Unicode::UCD 'prop_aliases';
2585
2586    my ($short_name, $full_name, @other_names) = prop_aliases("space");
2587    my $same_full_name = prop_aliases("Space");     # Scalar context
2588    my ($same_short_name) = prop_aliases("Space");  # gets 0th element
2589    print "The full name is $full_name\n";
2590    print "The short name is $short_name\n";
2591    print "The other aliases are: ", join(", ", @other_names), "\n";
2592
2593    prints:
2594    The full name is White_Space
2595    The short name is WSpace
2596    The other aliases are: Space
2597
2598Most Unicode properties have several synonymous names.  Typically, there is at
2599least a short name, convenient to type, and a long name that more fully
2600describes the property, and hence is more easily understood.
2601
2602If you know one name for a Unicode property, you can use C<prop_aliases> to find
2603either the long name (when called in scalar context), or a list of all of the
2604names, somewhat ordered so that the short name is in the 0th element, the long
2605name in the next element, and any other synonyms are in the remaining
2606elements, in no particular order.
2607
2608The long name is returned in a form nicely capitalized, suitable for printing.
2609
2610The input parameter name is loosely matched, which means that white space,
2611hyphens, and underscores are ignored (except for the trailing underscore in
2612the old_form grandfathered-in C<"L_">, which is better written as C<"LC">, and
2613both of which mean C<General_Category=Cased Letter>).
2614
2615If the name is unknown, C<undef> is returned (or an empty list in list
2616context).  Note that Perl typically recognizes property names in regular
2617expressions with an optional C<"Is_>" (with or without the underscore)
2618prefixed to them, such as C<\p{isgc=punct}>.  This function does not recognize
2619those in the input, returning C<undef>.  Nor are they included in the output
2620as possible synonyms.
2621
2622C<prop_aliases> does know about the Perl extensions to Unicode properties,
2623such as C<Any> and C<XPosixAlpha>, and the single form equivalents to Unicode
2624properties such as C<XDigit>, C<Greek>, C<In_Greek>, and C<Is_Greek>.  The
2625final example demonstrates that the C<"Is_"> prefix is recognized for these
2626extensions; it is needed to resolve ambiguities.  For example,
2627C<prop_aliases('lc')> returns the list C<(lc, Lowercase_Mapping)>, but
2628C<prop_aliases('islc')> returns C<(Is_LC, Cased_Letter)>.  This is
2629because C<islc> is a Perl extension which is short for
2630C<General_Category=Cased Letter>.  The lists returned for the Perl extensions
2631will not include the C<"Is_"> prefix (whether or not the input had it) unless
2632needed to resolve ambiguities, as shown in the C<"islc"> example, where the
2633returned list had one element containing C<"Is_">, and the other without.
2634
2635It is also possible for the reverse to happen:  C<prop_aliases('isc')> returns
2636the list C<(isc, ISO_Comment)>; whereas C<prop_aliases('c')> returns
2637C<(C, Other)> (the latter being a Perl extension meaning
2638C<General_Category=Other>.
2639L<perluniprops/Properties accessible through Unicode::UCD> lists the available
2640forms, including which ones are discouraged from use.
2641
2642Those discouraged forms are accepted as input to C<prop_aliases>, but are not
2643returned in the lists.  C<prop_aliases('isL&')> and C<prop_aliases('isL_')>,
2644which are old synonyms for C<"Is_LC"> and should not be used in new code, are
2645examples of this.  These both return C<(Is_LC, Cased_Letter)>.  Thus this
2646function allows you to take a discouraged form, and find its acceptable
2647alternatives.  The same goes with single-form Block property equivalences.
2648Only the forms that begin with C<"In_"> are not discouraged; if you pass
2649C<prop_aliases> a discouraged form, you will get back the equivalent ones that
2650begin with C<"In_">.  It will otherwise look like a new-style block name (see.
2651L</Old-style versus new-style block names>).
2652
2653C<prop_aliases> does not know about any user-defined properties, and will
2654return C<undef> if called with one of those.  Likewise for Perl internal
2655properties, with the exception of "Perl_Decimal_Digit" which it does know
2656about (and which is documented below in L</prop_invmap()>).
2657
2658=cut
2659
2660# It may be that there are use cases where the discouraged forms should be
2661# returned.  If that comes up, an optional boolean second parameter to the
2662# function could be created, for example.
2663
2664# These are created by mktables for this routine and stored in unicore/UCD.pl
2665# where their structures are described.
2666our %string_property_loose_to_name;
2667our %ambiguous_names;
2668our %loose_perlprop_to_name;
2669
2670sub prop_aliases ($) {
2671    my $prop = $_[0];
2672    return unless defined $prop;
2673
2674    require "unicore/UCD.pl";
2675
2676    # The property name may be loosely or strictly matched; we don't know yet.
2677    # But both types use lower-case.
2678    $prop = lc $prop;
2679
2680    # It is loosely matched if its lower case isn't known to be strict.
2681    my $list_ref;
2682    if (! exists $stricter_to_file_of{$prop}) {
2683        my $loose = loose_name($prop);
2684
2685        # There is a hash that converts from any loose name to its standard
2686        # form, mapping all synonyms for a  name to one name that can be used
2687        # as a key into another hash.  The whole concept is for memory
2688        # savings, as the second hash doesn't have to have all the
2689        # combinations.  Actually, there are two hashes that do the
2690        # conversion.  One is stored in UCD.pl) for looking up properties
2691        # matchable in regexes.  This function needs to access string
2692        # properties, which aren't available in regexes, so a second
2693        # conversion hash is made for them (stored in UCD.pl).  Look in the
2694        # string one now, as the rest can have an optional 'is' prefix, which
2695        # these don't.
2696        if (exists $string_property_loose_to_name{$loose}) {
2697
2698            # Convert to its standard loose name.
2699            $prop = $string_property_loose_to_name{$loose};
2700        }
2701        else {
2702            my $retrying = 0;   # bool.  ? Has an initial 'is' been stripped
2703        RETRY:
2704            if (exists $loose_property_name_of{$loose}
2705                && (! $retrying
2706                    || ! exists $ambiguous_names{$loose}))
2707            {
2708                # Found an entry giving the standard form.  We don't get here
2709                # (in the test above) when we've stripped off an
2710                # 'is' and the result is an ambiguous name.  That is because
2711                # these are official Unicode properties (though Perl can have
2712                # an optional 'is' prefix meaning the official property), and
2713                # all ambiguous cases involve a Perl single-form extension
2714                # for the gc, script, or block properties, and the stripped
2715                # 'is' means that they mean one of those, and not one of
2716                # these
2717                $prop = $loose_property_name_of{$loose};
2718            }
2719            elsif (exists $loose_perlprop_to_name{$loose}) {
2720
2721                # This hash is specifically for this function to list Perl
2722                # extensions that aren't in the earlier hashes.  If there is
2723                # only one element, the short and long names are identical.
2724                # Otherwise the form is already in the same form as
2725                # %prop_aliases, which is handled at the end of the function.
2726                $list_ref = $loose_perlprop_to_name{$loose};
2727                if (@$list_ref == 1) {
2728                    my @list = ($list_ref->[0], $list_ref->[0]);
2729                    $list_ref = \@list;
2730                }
2731            }
2732            elsif (! exists $loose_to_file_of{$loose}) {
2733
2734                # loose_to_file_of is a complete list of loose names.  If not
2735                # there, the input is unknown.
2736                return;
2737            }
2738            elsif ($loose =~ / [:=] /x) {
2739
2740                # Here we found the name but not its aliases, so it has to
2741                # exist.  Exclude property-value combinations.  (This shows up
2742                # for something like ccc=vr which matches loosely, but is a
2743                # synonym for ccc=9 which matches only strictly.
2744                return;
2745            }
2746            else {
2747
2748                # Here it has to exist, and isn't a property-value
2749                # combination.  This means it must be one of the Perl
2750                # single-form extensions.  First see if it is for a
2751                # property-value combination in one of the following
2752                # properties.
2753                my @list;
2754                foreach my $property ("gc", "script") {
2755                    @list = prop_value_aliases($property, $loose);
2756                    last if @list;
2757                }
2758                if (@list) {
2759
2760                    # Here, it is one of those property-value combination
2761                    # single-form synonyms.  There are ambiguities with some
2762                    # of these.  Check against the list for these, and adjust
2763                    # if necessary.
2764                    for my $i (0 .. @list -1) {
2765                        if (exists $ambiguous_names
2766                                   {loose_name(lc $list[$i])})
2767                        {
2768                            # The ambiguity is resolved by toggling whether or
2769                            # not it has an 'is' prefix
2770                            $list[$i] =~ s/^Is_// or $list[$i] =~ s/^/Is_/;
2771                        }
2772                    }
2773                    return @list;
2774                }
2775
2776                # Here, it wasn't one of the gc or script single-form
2777                # extensions.  It could be a block property single-form
2778                # extension.  An 'in' prefix definitely means that, and should
2779                # be looked up without the prefix.  However, starting in
2780                # Unicode 6.1, we have to special case 'indic...', as there
2781                # is a property that begins with that name.   We shouldn't
2782                # strip the 'in' from that.   I'm (khw) generalizing this to
2783                # 'indic' instead of the single property, because I suspect
2784                # that others of this class may come along in the future.
2785                # However, this could backfire and a block created whose name
2786                # begins with 'dic...', and we would want to strip the 'in'.
2787                # At which point this would have to be tweaked.
2788                my $began_with_in = $loose =~ s/^in(?!dic)//;
2789                @list = prop_value_aliases("block", $loose);
2790                if (@list) {
2791                    map { $_ =~ s/^/In_/ } @list;
2792                    return @list;
2793                }
2794
2795                # Here still haven't found it.  The last opportunity for it
2796                # being valid is only if it began with 'is'.  We retry without
2797                # the 'is', setting a flag to that effect so that we don't
2798                # accept things that begin with 'isis...'
2799                if (! $retrying && ! $began_with_in && $loose =~ s/^is//) {
2800                    $retrying = 1;
2801                    goto RETRY;
2802                }
2803
2804                # Here, didn't find it.  Since it was in %loose_to_file_of, we
2805                # should have been able to find it.
2806                carp __PACKAGE__, "::prop_aliases: Unexpectedly could not find '$prop'.  Send bug report to perlbug\@perl.org";
2807                return;
2808            }
2809        }
2810    }
2811
2812    if (! $list_ref) {
2813        # Here, we have set $prop to a standard form name of the input.  Look
2814        # it up in the structure created by mktables for this purpose, which
2815        # contains both strict and loosely matched properties.  Avoid
2816        # autovivifying.
2817        $list_ref = $prop_aliases{$prop} if exists $prop_aliases{$prop};
2818        return unless $list_ref;
2819    }
2820
2821    # The full name is in element 1.
2822    return $list_ref->[1] unless wantarray;
2823
2824    return @{_dclone $list_ref};
2825}
2826
2827=pod
2828
2829=head2 B<prop_values()>
2830
2831    use Unicode::UCD 'prop_values';
2832
2833    print "AHex values are: ", join(", ", prop_values("AHex")),
2834                               "\n";
2835  prints:
2836    AHex values are: N, Y
2837
2838Some Unicode properties have a restricted set of legal values.  For example,
2839all binary properties are restricted to just C<true> or C<false>; and there
2840are only a few dozen possible General Categories.  Use C<prop_values>
2841to find out if a given property is one such, and if so, to get a list of the
2842values:
2843
2844    print join ", ", prop_values("NFC_Quick_Check");
2845  prints:
2846    M, N, Y
2847
2848If the property doesn't have such a restricted set, C<undef> is returned.
2849
2850There are usually several synonyms for each possible value.  Use
2851L</prop_value_aliases()> to access those.
2852
2853Case, white space, hyphens, and underscores are ignored in the input property
2854name (except for the trailing underscore in the old-form grandfathered-in
2855general category property value C<"L_">, which is better written as C<"LC">).
2856
2857If the property name is unknown, C<undef> is returned.  Note that Perl typically
2858recognizes property names in regular expressions with an optional C<"Is_>"
2859(with or without the underscore) prefixed to them, such as C<\p{isgc=punct}>.
2860This function does not recognize those in the property parameter, returning
2861C<undef>.
2862
2863For the block property, new-style block names are returned (see
2864L</Old-style versus new-style block names>).
2865
2866C<prop_values> does not know about any user-defined properties, and
2867will return C<undef> if called with one of those.
2868
2869=cut
2870
2871# These are created by mktables for this module and stored in unicore/UCD.pl
2872# where their structures are described.
2873our %loose_to_standard_value;
2874our %prop_value_aliases;
2875
2876sub prop_values ($) {
2877    my $prop = shift;
2878    return undef unless defined $prop;
2879
2880    require "unicore/UCD.pl";
2881
2882    # Find the property name synonym that's used as the key in other hashes,
2883    # which is element 0 in the returned list.
2884    ($prop) = prop_aliases($prop);
2885    return undef if ! $prop;
2886    $prop = loose_name(lc $prop);
2887
2888    # Here is a legal property.
2889    return undef unless exists $prop_value_aliases{$prop};
2890    my @return;
2891    foreach my $value_key (sort { lc $a cmp lc $b }
2892                            keys %{$prop_value_aliases{$prop}})
2893    {
2894        push @return, $prop_value_aliases{$prop}{$value_key}[0];
2895    }
2896    return @return;
2897}
2898
2899=pod
2900
2901=head2 B<prop_value_aliases()>
2902
2903    use Unicode::UCD 'prop_value_aliases';
2904
2905    my ($short_name, $full_name, @other_names)
2906                                   = prop_value_aliases("Gc", "Punct");
2907    my $same_full_name = prop_value_aliases("Gc", "P");   # Scalar cntxt
2908    my ($same_short_name) = prop_value_aliases("Gc", "P"); # gets 0th
2909                                                           # element
2910    print "The full name is $full_name\n";
2911    print "The short name is $short_name\n";
2912    print "The other aliases are: ", join(", ", @other_names), "\n";
2913
2914  prints:
2915    The full name is Punctuation
2916    The short name is P
2917    The other aliases are: Punct
2918
2919Some Unicode properties have a restricted set of legal values.  For example,
2920all binary properties are restricted to just C<true> or C<false>; and there
2921are only a few dozen possible General Categories.
2922
2923You can use L</prop_values()> to find out if a given property is one which has
2924a restricted set of values, and if so, what those values are.  But usually
2925each value actually has several synonyms.  For example, in Unicode binary
2926properties, I<truth> can be represented by any of the strings "Y", "Yes", "T",
2927or "True"; and the General Category "Punctuation" by that string, or "Punct",
2928or simply "P".
2929
2930Like property names, there is typically at least a short name for each such
2931property-value, and a long name.  If you know any name of the property-value
2932(which you can get by L</prop_values()>, you can use C<prop_value_aliases>()
2933to get the long name (when called in scalar context), or a list of all the
2934names, with the short name in the 0th element, the long name in the next
2935element, and any other synonyms in the remaining elements, in no particular
2936order, except that any all-numeric synonyms will be last.
2937
2938The long name is returned in a form nicely capitalized, suitable for printing.
2939
2940Case, white space, hyphens, and underscores are ignored in the input parameters
2941(except for the trailing underscore in the old-form grandfathered-in general
2942category property value C<"L_">, which is better written as C<"LC">).
2943
2944If either name is unknown, C<undef> is returned.  Note that Perl typically
2945recognizes property names in regular expressions with an optional C<"Is_>"
2946(with or without the underscore) prefixed to them, such as C<\p{isgc=punct}>.
2947This function does not recognize those in the property parameter, returning
2948C<undef>.
2949
2950If called with a property that doesn't have synonyms for its values, it
2951returns the input value, possibly normalized with capitalization and
2952underscores, but not necessarily checking that the input value is valid.
2953
2954For the block property, new-style block names are returned (see
2955L</Old-style versus new-style block names>).
2956
2957To find the synonyms for single-forms, such as C<\p{Any}>, use
2958L</prop_aliases()> instead.
2959
2960C<prop_value_aliases> does not know about any user-defined properties, and
2961will return C<undef> if called with one of those.
2962
2963=cut
2964
2965sub prop_value_aliases ($$) {
2966    my ($prop, $value) = @_;
2967    return unless defined $prop && defined $value;
2968
2969    require "unicore/UCD.pl";
2970
2971    # Find the property name synonym that's used as the key in other hashes,
2972    # which is element 0 in the returned list.
2973    ($prop) = prop_aliases($prop);
2974    return if ! $prop;
2975    $prop = loose_name(lc $prop);
2976
2977    # Here is a legal property, but the hash below (created by mktables for
2978    # this purpose) only knows about the properties that have a very finite
2979    # number of potential values, that is not ones whose value could be
2980    # anything, like most (if not all) string properties.  These don't have
2981    # synonyms anyway.  Simply return the input.  For example, there is no
2982    # synonym for ('Uppercase_Mapping', A').
2983    if (! exists $prop_value_aliases{$prop}) {
2984
2985        # Here, we have a legal property, but an unknown value.  Since the
2986        # property is legal, if it isn't in the prop_aliases hash, it must be
2987        # a Perl-extension All perl extensions are binary, hence are
2988        # enumerateds, which means that we know that the input unknown value
2989        # is illegal.
2990        return if ! exists $prop_aliases{$prop};
2991
2992        # Otherwise, we assume it's valid, as documented.
2993        return $value;
2994    }
2995
2996    # The value name may be loosely or strictly matched; we don't know yet.
2997    # But both types use lower-case.
2998    $value = lc $value;
2999
3000    # If the name isn't found under loose matching, it certainly won't be
3001    # found under strict
3002    my $loose_value = loose_name($value);
3003    return unless exists $loose_to_standard_value{"$prop=$loose_value"};
3004
3005    # Similarly if the combination under loose matching doesn't exist, it
3006    # won't exist under strict.
3007    my $standard_value = $loose_to_standard_value{"$prop=$loose_value"};
3008    return unless exists $prop_value_aliases{$prop}{$standard_value};
3009
3010    # Here we did find a combination under loose matching rules.  But it could
3011    # be that is a strict property match that shouldn't have matched.
3012    # %prop_value_aliases is set up so that the strict matches will appear as
3013    # if they were in loose form.  Thus, if the non-loose version is legal,
3014    # we're ok, can skip the further check.
3015    if (! exists $stricter_to_file_of{"$prop=$value"}
3016
3017        # We're also ok and skip the further check if value loosely matches.
3018        # mktables has verified that no strict name under loose rules maps to
3019        # an existing loose name.  This code relies on the very limited
3020        # circumstances that strict names can be here.  Strict name matching
3021        # happens under two conditions:
3022        # 1) when the name begins with an underscore.  But this function
3023        #    doesn't accept those, and %prop_value_aliases doesn't have
3024        #    them.
3025        # 2) When the values are numeric, in which case we need to look
3026        #    further, but their squeezed-out loose values will be in
3027        #    %stricter_to_file_of
3028        && exists $stricter_to_file_of{"$prop=$loose_value"})
3029    {
3030        # The only thing that's legal loosely under strict is that can have an
3031        # underscore between digit pairs XXX
3032        while ($value =~ s/(\d)_(\d)/$1$2/g) {}
3033        return unless exists $stricter_to_file_of{"$prop=$value"};
3034    }
3035
3036    # Here, we know that the combination exists.  Return it.
3037    my $list_ref = $prop_value_aliases{$prop}{$standard_value};
3038    if (@$list_ref > 1) {
3039        # The full name is in element 1.
3040        return $list_ref->[1] unless wantarray;
3041
3042        return @{_dclone $list_ref};
3043    }
3044
3045    return $list_ref->[0] unless wantarray;
3046
3047    # Only 1 element means that it repeats
3048    return ( $list_ref->[0], $list_ref->[0] );
3049}
3050
3051# All 1 bits but the top one is the largest possible IV.
3052$MAX_CP = (~0) >> 1;
3053
3054=pod
3055
3056=head2 B<prop_invlist()>
3057
3058C<prop_invlist> returns an inversion list (described below) that defines all the
3059code points for the binary Unicode property (or "property=value" pair) given
3060by the input parameter string:
3061
3062 use feature 'say';
3063 use Unicode::UCD 'prop_invlist';
3064 say join ", ", prop_invlist("Any");
3065
3066 prints:
3067 0, 1114112
3068
3069If the input is unknown C<undef> is returned in scalar context; an empty-list
3070in list context.  If the input is known, the number of elements in
3071the list is returned if called in scalar context.
3072
3073L<perluniprops|perluniprops/Properties accessible through \p{} and \P{}> gives
3074the list of properties that this function accepts, as well as all the possible
3075forms for them (including with the optional "Is_" prefixes).  (Except this
3076function doesn't accept any Perl-internal properties, some of which are listed
3077there.) This function uses the same loose or tighter matching rules for
3078resolving the input property's name as is done for regular expressions.  These
3079are also specified in L<perluniprops|perluniprops/Properties accessible
3080through \p{} and \P{}>.  Examples of using the "property=value" form are:
3081
3082 say join ", ", prop_invlist("Script_Extensions=Shavian");
3083
3084 prints:
3085 66640, 66688
3086
3087 say join ", ", prop_invlist("ASCII_Hex_Digit=No");
3088
3089 prints:
3090 0, 48, 58, 65, 71, 97, 103
3091
3092 say join ", ", prop_invlist("ASCII_Hex_Digit=Yes");
3093
3094 prints:
3095 48, 58, 65, 71, 97, 103
3096
3097Inversion lists are a compact way of specifying Unicode property-value
3098definitions.  The 0th item in the list is the lowest code point that has the
3099property-value.  The next item (item [1]) is the lowest code point beyond that
3100one that does NOT have the property-value.  And the next item beyond that
3101([2]) is the lowest code point beyond that one that does have the
3102property-value, and so on.  Put another way, each element in the list gives
3103the beginning of a range that has the property-value (for even numbered
3104elements), or doesn't have the property-value (for odd numbered elements).
3105The name for this data structure stems from the fact that each element in the
3106list toggles (or inverts) whether the corresponding range is or isn't on the
3107list.
3108
3109In the final example above, the first ASCII Hex digit is code point 48, the
3110character "0", and all code points from it through 57 (a "9") are ASCII hex
3111digits.  Code points 58 through 64 aren't, but 65 (an "A") through 70 (an "F")
3112are, as are 97 ("a") through 102 ("f").  103 starts a range of code points
3113that aren't ASCII hex digits.  That range extends to infinity, which on your
3114computer can be found in the variable C<$Unicode::UCD::MAX_CP>.  (This
3115variable is as close to infinity as Perl can get on your platform, and may be
3116too high for some operations to work; you may wish to use a smaller number for
3117your purposes.)
3118
3119Note that the inversion lists returned by this function can possibly include
3120non-Unicode code points, that is anything above 0x10FFFF.  Unicode properties
3121are not defined on such code points.  You might wish to change the output to
3122not include these.  Simply add 0x110000 at the end of the non-empty returned
3123list if it isn't already that value; and pop that value if it is; like:
3124
3125 my @list = prop_invlist("foo");
3126 if (@list) {
3127     if ($list[-1] == 0x110000) {
3128         pop @list;  # Defeat the turning on for above Unicode
3129     }
3130     else {
3131         push @list, 0x110000; # Turn off for above Unicode
3132     }
3133 }
3134
3135It is a simple matter to expand out an inversion list to a full list of all
3136code points that have the property-value:
3137
3138 my @invlist = prop_invlist($property_name);
3139 die "empty" unless @invlist;
3140 my @full_list;
3141 for (my $i = 0; $i < @invlist; $i += 2) {
3142    my $upper = ($i + 1) < @invlist
3143                ? $invlist[$i+1] - 1      # In range
3144                : $Unicode::UCD::MAX_CP;  # To infinity.
3145    for my $j ($invlist[$i] .. $upper) {
3146        push @full_list, $j;
3147    }
3148 }
3149
3150C<prop_invlist> does not know about any user-defined nor Perl internal-only
3151properties, and will return C<undef> if called with one of those.
3152
3153The L</search_invlist()> function is provided for finding a code point within
3154an inversion list.
3155
3156=cut
3157
3158# User-defined properties could be handled with some changes to SWASHNEW;
3159# and implementing here of dealing with EXTRAS.  If done, consideration should
3160# be given to the fact that the user subroutine could return different results
3161# with each call; security issues need to be thought about.
3162
3163# These are created by mktables for this routine and stored in unicore/UCD.pl
3164# where their structures are described.
3165our %loose_defaults;
3166our $MAX_UNICODE_CODEPOINT;
3167
3168sub prop_invlist ($;$) {
3169    my $prop = $_[0];
3170
3171    # Undocumented way to get at Perl internal properties; it may be changed
3172    # or removed without notice at any time.
3173    my $internal_ok = defined $_[1] && $_[1] eq '_perl_core_internal_ok';
3174
3175    return if ! defined $prop;
3176
3177    # Warnings for these are only for regexes, so not applicable to us
3178    no warnings 'deprecated';
3179
3180    # Get the swash definition of the property-value.
3181    my $swash = SWASHNEW(__PACKAGE__, $prop, undef, 1, 0);
3182
3183    # Fail if not found, or isn't a boolean property-value, or is a
3184    # user-defined property, or is internal-only.
3185    return if ! $swash
3186              || ref $swash eq ""
3187              || $swash->{'BITS'} != 1
3188              || $swash->{'USER_DEFINED'}
3189              || (! $internal_ok && $prop =~ /^\s*_/);
3190
3191    if ($swash->{'EXTRAS'}) {
3192        carp __PACKAGE__, "::prop_invlist: swash returned for $prop unexpectedly has EXTRAS magic";
3193        return;
3194    }
3195    if ($swash->{'SPECIALS'}) {
3196        carp __PACKAGE__, "::prop_invlist: swash returned for $prop unexpectedly has SPECIALS magic";
3197        return;
3198    }
3199
3200    my @invlist;
3201
3202    if ($swash->{'LIST'} =~ /^V/) {
3203
3204        # A 'V' as the first character marks the input as already an inversion
3205        # list, in which case, all we need to do is put the remaining lines
3206        # into our array.
3207        @invlist = split "\n", $swash->{'LIST'} =~ s/ \s* (?: \# .* )? $ //xmgr;
3208        shift @invlist;
3209    }
3210    else {
3211        # The input lines look like:
3212        # 0041\t005A   # [26]
3213        # 005F
3214
3215        # Split into lines, stripped of trailing comments
3216        foreach my $range (split "\n",
3217                              $swash->{'LIST'} =~ s/ \s* (?: \# .* )? $ //xmgr)
3218        {
3219            # And find the beginning and end of the range on the line
3220            my ($hex_begin, $hex_end) = split "\t", $range;
3221            my $begin = hex $hex_begin;
3222
3223            # If the new range merely extends the old, we remove the marker
3224            # created the last time through the loop for the old's end, which
3225            # causes the new one's end to be used instead.
3226            if (@invlist && $begin == $invlist[-1]) {
3227                pop @invlist;
3228            }
3229            else {
3230                # Add the beginning of the range
3231                push @invlist, $begin;
3232            }
3233
3234            if (defined $hex_end) { # The next item starts with the code point 1
3235                                    # beyond the end of the range.
3236                no warnings 'portable';
3237                my $end = hex $hex_end;
3238                last if $end == $MAX_CP;
3239                push @invlist, $end + 1;
3240            }
3241            else {  # No end of range, is a single code point.
3242                push @invlist, $begin + 1;
3243            }
3244        }
3245    }
3246
3247    # Could need to be inverted: add or subtract a 0 at the beginning of the
3248    # list.
3249    if ($swash->{'INVERT_IT'}) {
3250        if (@invlist && $invlist[0] == 0) {
3251            shift @invlist;
3252        }
3253        else {
3254            unshift @invlist, 0;
3255        }
3256    }
3257
3258    return @invlist;
3259}
3260
3261=pod
3262
3263=head2 B<prop_invmap()>
3264
3265 use Unicode::UCD 'prop_invmap';
3266 my ($list_ref, $map_ref, $format, $default)
3267                                      = prop_invmap("General Category");
3268
3269C<prop_invmap> is used to get the complete mapping definition for a property,
3270in the form of an inversion map.  An inversion map consists of two parallel
3271arrays.  One is an ordered list of code points that mark range beginnings, and
3272the other gives the value (or mapping) that all code points in the
3273corresponding range have.
3274
3275C<prop_invmap> is called with the name of the desired property.  The name is
3276loosely matched, meaning that differences in case, white-space, hyphens, and
3277underscores are not meaningful (except for the trailing underscore in the
3278old-form grandfathered-in property C<"L_">, which is better written as C<"LC">,
3279or even better, C<"Gc=LC">).
3280
3281Many Unicode properties have more than one name (or alias).  C<prop_invmap>
3282understands all of these, including Perl extensions to them.  Ambiguities are
3283resolved as described above for L</prop_aliases()> (except if a property has
3284both a complete mapping, and a binary C<Y>/C<N> mapping, then specifying the
3285property name prefixed by C<"is"> causes the binary one to be returned).  The
3286Perl internal property "Perl_Decimal_Digit, described below, is also accepted.
3287An empty list is returned if the property name is unknown.
3288See L<perluniprops/Properties accessible through Unicode::UCD> for the
3289properties acceptable as inputs to this function.
3290
3291It is a fatal error to call this function except in list context.
3292
3293In addition to the two arrays that form the inversion map, C<prop_invmap>
3294returns two other values; one is a scalar that gives some details as to the
3295format of the entries of the map array; the other is a default value, useful
3296in maps whose format name begins with the letter C<"a">, as described
3297L<below in its subsection|/a>; and for specialized purposes, such as
3298converting to another data structure, described at the end of this main
3299section.
3300
3301This means that C<prop_invmap> returns a 4 element list.  For example,
3302
3303 my ($blocks_ranges_ref, $blocks_maps_ref, $format, $default)
3304                                                 = prop_invmap("Block");
3305
3306In this call, the two arrays will be populated as shown below (for Unicode
33076.0):
3308
3309 Index  @blocks_ranges  @blocks_maps
3310   0        0x0000      Basic Latin
3311   1        0x0080      Latin-1 Supplement
3312   2        0x0100      Latin Extended-A
3313   3        0x0180      Latin Extended-B
3314   4        0x0250      IPA Extensions
3315   5        0x02B0      Spacing Modifier Letters
3316   6        0x0300      Combining Diacritical Marks
3317   7        0x0370      Greek and Coptic
3318   8        0x0400      Cyrillic
3319  ...
3320 233        0x2B820     No_Block
3321 234        0x2F800     CJK Compatibility Ideographs Supplement
3322 235        0x2FA20     No_Block
3323 236        0xE0000     Tags
3324 237        0xE0080     No_Block
3325 238        0xE0100     Variation Selectors Supplement
3326 239        0xE01F0     No_Block
3327 240        0xF0000     Supplementary Private Use Area-A
3328 241        0x100000    Supplementary Private Use Area-B
3329 242        0x110000    No_Block
3330
3331The first line (with Index [0]) means that the value for code point 0 is "Basic
3332Latin".  The entry "0x0080" in the @blocks_ranges column in the second line
3333means that the value from the first line, "Basic Latin", extends to all code
3334points in the range from 0 up to but not including 0x0080, that is, through
3335127.  In other words, the code points from 0 to 127 are all in the "Basic
3336Latin" block.  Similarly, all code points in the range from 0x0080 up to (but
3337not including) 0x0100 are in the block named "Latin-1 Supplement", etc.
3338(Notice that the return is the old-style block names; see L</Old-style versus
3339new-style block names>).
3340
3341The final line (with Index [242]) means that the value for all code points above
3342the legal Unicode maximum code point have the value "No_Block", which is the
3343term Unicode uses for a non-existing block.
3344
3345The arrays completely specify the mappings for all possible code points.
3346The final element in an inversion map returned by this function will always be
3347for the range that consists of all the code points that aren't legal Unicode,
3348but that are expressible on the platform.  (That is, it starts with code point
33490x110000, the first code point above the legal Unicode maximum, and extends to
3350infinity.) The value for that range will be the same that any typical
3351unassigned code point has for the specified property.  (Certain unassigned
3352code points are not "typical"; for example the non-character code points, or
3353those in blocks that are to be written right-to-left.  The above-Unicode
3354range's value is not based on these atypical code points.)  It could be argued
3355that, instead of treating these as unassigned Unicode code points, the value
3356for this range should be C<undef>.  If you wish, you can change the returned
3357arrays accordingly.
3358
3359The maps for almost all properties are simple scalars that should be
3360interpreted as-is.
3361These values are those given in the Unicode-supplied data files, which may be
3362inconsistent as to capitalization and as to which synonym for a property-value
3363is given.  The results may be normalized by using the L</prop_value_aliases()>
3364function.
3365
3366There are exceptions to the simple scalar maps.  Some properties have some
3367elements in their map list that are themselves lists of scalars; and some
3368special strings are returned that are not to be interpreted as-is.  Element
3369[2] (placed into C<$format> in the example above) of the returned four element
3370list tells you if the map has any of these special elements or not, as follows:
3371
3372=over
3373
3374=item B<C<s>>
3375
3376means all the elements of the map array are simple scalars, with no special
3377elements.  Almost all properties are like this, like the C<block> example
3378above.
3379
3380=item B<C<sl>>
3381
3382means that some of the map array elements have the form given by C<"s">, and
3383the rest are lists of scalars.  For example, here is a portion of the output
3384of calling C<prop_invmap>() with the "Script Extensions" property:
3385
3386 @scripts_ranges  @scripts_maps
3387      ...
3388      0x0953      Devanagari
3389      0x0964      [ Bengali, Devanagari, Gurumukhi, Oriya ]
3390      0x0966      Devanagari
3391      0x0970      Common
3392
3393Here, the code points 0x964 and 0x965 are both used in Bengali,
3394Devanagari, Gurmukhi, and Oriya, but no other scripts.
3395
3396The Name_Alias property is also of this form.  But each scalar consists of two
3397components:  1) the name, and 2) the type of alias this is.  They are
3398separated by a colon and a space.  In Unicode 6.1, there are several alias types:
3399
3400=over
3401
3402=item C<correction>
3403
3404indicates that the name is a corrected form for the
3405original name (which remains valid) for the same code point.
3406
3407=item C<control>
3408
3409adds a new name for a control character.
3410
3411=item C<alternate>
3412
3413is an alternate name for a character
3414
3415=item C<figment>
3416
3417is a name for a character that has been documented but was never in any
3418actual standard.
3419
3420=item C<abbreviation>
3421
3422is a common abbreviation for a character
3423
3424=back
3425
3426The lists are ordered (roughly) so the most preferred names come before less
3427preferred ones.
3428
3429For example,
3430
3431 @aliases_ranges        @alias_maps
3432    ...
3433    0x009E        [ 'PRIVACY MESSAGE: control', 'PM: abbreviation' ]
3434    0x009F        [ 'APPLICATION PROGRAM COMMAND: control',
3435                    'APC: abbreviation'
3436                  ]
3437    0x00A0        'NBSP: abbreviation'
3438    0x00A1        ""
3439    0x00AD        'SHY: abbreviation'
3440    0x00AE        ""
3441    0x01A2        'LATIN CAPITAL LETTER GHA: correction'
3442    0x01A3        'LATIN SMALL LETTER GHA: correction'
3443    0x01A4        ""
3444    ...
3445
3446A map to the empty string means that there is no alias defined for the code
3447point.
3448
3449=item B<C<a>>
3450
3451is like C<"s"> in that all the map array elements are scalars, but here they are
3452restricted to all being integers, and some have to be adjusted (hence the name
3453C<"a">) to get the correct result.  For example, in:
3454
3455 my ($uppers_ranges_ref, $uppers_maps_ref, $format, $default)
3456                          = prop_invmap("Simple_Uppercase_Mapping");
3457
3458the returned arrays look like this:
3459
3460 @$uppers_ranges_ref    @$uppers_maps_ref   Note
3461       0                      0
3462      97                     65          'a' maps to 'A', b => B ...
3463     123                      0
3464     181                    924          MICRO SIGN => Greek Cap MU
3465     182                      0
3466     ...
3467
3468and C<$default> is 0.
3469
3470Let's start with the second line.  It says that the uppercase of code point 97
3471is 65; or C<uc("a")> == "A".  But the line is for the entire range of code
3472points 97 through 122.  To get the mapping for any code point in this range,
3473you take the offset it has from the beginning code point of the range, and add
3474that to the mapping for that first code point.  So, the mapping for 122 ("z")
3475is derived by taking the offset of 122 from 97 (=25) and adding that to 65,
3476yielding 90 ("z").  Likewise for everything in between.
3477
3478Requiring this simple adjustment allows the returned arrays to be
3479significantly smaller than otherwise, up to a factor of 10, speeding up
3480searching through them.
3481
3482Ranges that map to C<$default>, C<"0">, behave somewhat differently.  For
3483these, each code point maps to itself.  So, in the first line in the example,
3484S<C<ord(uc(chr(0)))>> is 0, S<C<ord(uc(chr(1)))>> is 1, ..
3485S<C<ord(uc(chr(96)))>> is 96.
3486
3487=item B<C<al>>
3488
3489means that some of the map array elements have the form given by C<"a">, and
3490the rest are ordered lists of code points.
3491For example, in:
3492
3493 my ($uppers_ranges_ref, $uppers_maps_ref, $format, $default)
3494                                 = prop_invmap("Uppercase_Mapping");
3495
3496the returned arrays look like this:
3497
3498 @$uppers_ranges_ref    @$uppers_maps_ref
3499       0                      0
3500      97                     65
3501     123                      0
3502     181                    924
3503     182                      0
3504     ...
3505    0x0149              [ 0x02BC 0x004E ]
3506    0x014A                    0
3507    0x014B                  330
3508     ...
3509
3510This is the full Uppercase_Mapping property (as opposed to the
3511Simple_Uppercase_Mapping given in the example for format C<"a">).  The only
3512difference between the two in the ranges shown is that the code point at
35130x0149 (LATIN SMALL LETTER N PRECEDED BY APOSTROPHE) maps to a string of two
3514characters, 0x02BC (MODIFIER LETTER APOSTROPHE) followed by 0x004E (LATIN
3515CAPITAL LETTER N).
3516
3517No adjustments are needed to entries that are references to arrays; each such
3518entry will have exactly one element in its range, so the offset is always 0.
3519
3520The fourth (index [3]) element (C<$default>) in the list returned for this
3521format is 0.
3522
3523=item B<C<ae>>
3524
3525This is like C<"a">, but some elements are the empty string, and should not be
3526adjusted.
3527The one internal Perl property accessible by C<prop_invmap> is of this type:
3528"Perl_Decimal_Digit" returns an inversion map which gives the numeric values
3529that are represented by the Unicode decimal digit characters.  Characters that
3530don't represent decimal digits map to the empty string, like so:
3531
3532 @digits    @values
3533 0x0000       ""
3534 0x0030        0
3535 0x003A:      ""
3536 0x0660:       0
3537 0x066A:      ""
3538 0x06F0:       0
3539 0x06FA:      ""
3540 0x07C0:       0
3541 0x07CA:      ""
3542 0x0966:       0
3543 ...
3544
3545This means that the code points from 0 to 0x2F do not represent decimal digits;
3546the code point 0x30 (DIGIT ZERO) represents 0;  code point 0x31, (DIGIT ONE),
3547represents 0+1-0 = 1; ... code point 0x39, (DIGIT NINE), represents 0+9-0 = 9;
3548... code points 0x3A through 0x65F do not represent decimal digits; 0x660
3549(ARABIC-INDIC DIGIT ZERO), represents 0; ... 0x07C1 (NKO DIGIT ONE),
3550represents 0+1-0 = 1 ...
3551
3552The fourth (index [3]) element (C<$default>) in the list returned for this
3553format is the empty string.
3554
3555=item B<C<ale>>
3556
3557is a combination of the C<"al"> type and the C<"ae"> type.  Some of
3558the map array elements have the forms given by C<"al">, and
3559the rest are the empty string.  The property C<NFKC_Casefold> has this form.
3560An example slice is:
3561
3562 @$ranges_ref  @$maps_ref         Note
3563    ...
3564   0x00AA       97                FEMININE ORDINAL INDICATOR => 'a'
3565   0x00AB        0
3566   0x00AD                         SOFT HYPHEN => ""
3567   0x00AE        0
3568   0x00AF     [ 0x0020, 0x0304 ]  MACRON => SPACE . COMBINING MACRON
3569   0x00B0        0
3570   ...
3571
3572The fourth (index [3]) element (C<$default>) in the list returned for this
3573format is 0.
3574
3575=item B<C<ar>>
3576
3577means that all the elements of the map array are either rational numbers or
3578the string C<"NaN">, meaning "Not a Number".  A rational number is either an
3579integer, or two integers separated by a solidus (C<"/">).  The second integer
3580represents the denominator of the division implied by the solidus, and is
3581actually always positive, so it is guaranteed not to be 0 and to not be
3582signed.  When the element is a plain integer (without the
3583solidus), it may need to be adjusted to get the correct value by adding the
3584offset, just as other C<"a"> properties.  No adjustment is needed for
3585fractions, as the range is guaranteed to have just a single element, and so
3586the offset is always 0.
3587
3588If you want to convert the returned map to entirely scalar numbers, you
3589can use something like this:
3590
3591 my ($invlist_ref, $invmap_ref, $format) = prop_invmap($property);
3592 if ($format && $format eq "ar") {
3593     map { $_ = eval $_ if $_ ne 'NaN' } @$map_ref;
3594 }
3595
3596Here's some entries from the output of the property "Nv", which has format
3597C<"ar">.
3598
3599 @numerics_ranges  @numerics_maps       Note
3600        0x00           "NaN"
3601        0x30             0           DIGIT 0 .. DIGIT 9
3602        0x3A           "NaN"
3603        0xB2             2           SUPERSCRIPTs 2 and 3
3604        0xB4           "NaN"
3605        0xB9             1           SUPERSCRIPT 1
3606        0xBA           "NaN"
3607        0xBC            1/4          VULGAR FRACTION 1/4
3608        0xBD            1/2          VULGAR FRACTION 1/2
3609        0xBE            3/4          VULGAR FRACTION 3/4
3610        0xBF           "NaN"
3611        0x660            0           ARABIC-INDIC DIGIT ZERO .. NINE
3612        0x66A          "NaN"
3613
3614The fourth (index [3]) element (C<$default>) in the list returned for this
3615format is C<"NaN">.
3616
3617=item B<C<n>>
3618
3619means the Name property.  All the elements of the map array are simple
3620scalars, but some of them contain special strings that require more work to
3621get the actual name.
3622
3623Entries such as:
3624
3625 CJK UNIFIED IDEOGRAPH-<code point>
3626
3627mean that the name for the code point is "CJK UNIFIED IDEOGRAPH-"
3628with the code point (expressed in hexadecimal) appended to it, like "CJK
3629UNIFIED IDEOGRAPH-3403" (similarly for S<C<CJK COMPATIBILITY IDEOGRAPH-E<lt>code
3630pointE<gt>>>).
3631
3632Also, entries like
3633
3634 <hangul syllable>
3635
3636means that the name is algorithmically calculated.  This is easily done by
3637the function L<charnames/charnames::viacode(code)>.
3638
3639Note that for control characters (C<Gc=cc>), Unicode's data files have the
3640string "C<E<lt>controlE<gt>>", but the real name of each of these characters is the empty
3641string.  This function returns that real name, the empty string.  (There are
3642names for these characters, but they are considered aliases, not the Name
3643property name, and are contained in the C<Name_Alias> property.)
3644
3645=item B<C<ad>>
3646
3647means the Decomposition_Mapping property.  This property is like C<"al">
3648properties, except that one of the scalar elements is of the form:
3649
3650 <hangul syllable>
3651
3652This signifies that this entry should be replaced by the decompositions for
3653all the code points whose decomposition is algorithmically calculated.  (All
3654of them are currently in one range and no others outside the range are likely
3655to ever be added to Unicode; the C<"n"> format
3656has this same entry.)  These can be generated via the function
3657L<Unicode::Normalize::NFD()|Unicode::Normalize>.
3658
3659Note that the mapping is the one that is specified in the Unicode data files,
3660and to get the final decomposition, it may need to be applied recursively.
3661Unicode in fact discourages use of this property except internally in
3662implementations of the Unicode Normalization Algorithm.
3663
3664The fourth (index [3]) element (C<$default>) in the list returned for this
3665format is 0.
3666
3667=back
3668
3669Note that a format begins with the letter "a" if and only the property it is
3670for requires adjustments by adding the offsets in multi-element ranges.  For
3671all these properties, an entry should be adjusted only if the map is a scalar
3672which is an integer.  That is, it must match the regular expression:
3673
3674    / ^ -? \d+ $ /xa
3675
3676Further, the first element in a range never needs adjustment, as the
3677adjustment would be just adding 0.
3678
3679A binary search such as that provided by L</search_invlist()>, can be used to
3680quickly find a code point in the inversion list, and hence its corresponding
3681mapping.
3682
3683The final, fourth element (index [3], assigned to C<$default> in the "block"
3684example) in the four element list returned by this function is used with the
3685C<"a"> format types; it may also be useful for applications
3686that wish to convert the returned inversion map data structure into some
3687other, such as a hash.  It gives the mapping that most code points map to
3688under the property.  If you establish the convention that any code point not
3689explicitly listed in your data structure maps to this value, you can
3690potentially make your data structure much smaller.  As you construct your data
3691structure from the one returned by this function, simply ignore those ranges
3692that map to this value.  For example, to
3693convert to the data structure searchable by L</charinrange()>, you can follow
3694this recipe for properties that don't require adjustments:
3695
3696 my ($list_ref, $map_ref, $format, $default) = prop_invmap($property);
3697 my @range_list;
3698
3699 # Look at each element in the list, but the -2 is needed because we
3700 # look at $i+1 in the loop, and the final element is guaranteed to map
3701 # to $default by prop_invmap(), so we would skip it anyway.
3702 for my $i (0 .. @$list_ref - 2) {
3703    next if $map_ref->[$i] eq $default;
3704    push @range_list, [ $list_ref->[$i],
3705                        $list_ref->[$i+1],
3706                        $map_ref->[$i]
3707                      ];
3708 }
3709
3710 print charinrange(\@range_list, $code_point), "\n";
3711
3712With this, C<charinrange()> will return C<undef> if its input code point maps
3713to C<$default>.  You can avoid this by omitting the C<next> statement, and adding
3714a line after the loop to handle the final element of the inversion map.
3715
3716Similarly, this recipe can be used for properties that do require adjustments:
3717
3718 for my $i (0 .. @$list_ref - 2) {
3719    next if $map_ref->[$i] eq $default;
3720
3721    # prop_invmap() guarantees that if the mapping is to an array, the
3722    # range has just one element, so no need to worry about adjustments.
3723    if (ref $map_ref->[$i]) {
3724        push @range_list,
3725                   [ $list_ref->[$i], $list_ref->[$i], $map_ref->[$i] ];
3726    }
3727    else {  # Otherwise each element is actually mapped to a separate
3728            # value, so the range has to be split into single code point
3729            # ranges.
3730
3731        my $adjustment = 0;
3732
3733        # For each code point that gets mapped to something...
3734        for my $j ($list_ref->[$i] .. $list_ref->[$i+1] -1 ) {
3735
3736            # ... add a range consisting of just it mapping to the
3737            # original plus the adjustment, which is incremented for the
3738            # next time through the loop, as the offset increases by 1
3739            # for each element in the range
3740            push @range_list,
3741                             [ $j, $j, $map_ref->[$i] + $adjustment++ ];
3742        }
3743    }
3744 }
3745
3746Note that the inversion maps returned for the C<Case_Folding> and
3747C<Simple_Case_Folding> properties do not include the Turkic-locale mappings.
3748Use L</casefold()> for these.
3749
3750C<prop_invmap> does not know about any user-defined properties, and will
3751return C<undef> if called with one of those.
3752
3753The returned values for the Perl extension properties, such as C<Any> and
3754C<Greek> are somewhat misleading.  The values are either C<"Y"> or C<"N>".
3755All Unicode properties are bipartite, so you can actually use the C<"Y"> or
3756C<"N>" in a Perl regular expression for these, like C<qr/\p{ID_Start=Y/}> or
3757C<qr/\p{Upper=N/}>.  But the Perl extensions aren't specified this way, only
3758like C</qr/\p{Any}>, I<etc>.  You can't actually use the C<"Y"> and C<"N>" in
3759them.
3760
3761=head3 Getting every available name
3762
3763Instead of reading the Unicode Database directly from files, as you were able
3764to do for a long time, you are encouraged to use the supplied functions. So,
3765instead of reading C<Name.pl> directly, which changed formats in 5.32, and may
3766do so again without notice in the future or even disappear, you ought to use
3767L</prop_invmap()> like this:
3768
3769  my (%name, %cp, %cps, $n);
3770  # All codepoints
3771  foreach my $cat (qw( Name Name_Alias )) {
3772      my ($codepoints, $names, $format, $default) = prop_invmap($cat);
3773      # $format => "n", $default => ""
3774      foreach my $i (0 .. @$codepoints - 2) {
3775          my ($cp, $n) = ($codepoints->[$i], $names->[$i]);
3776          # If $n is a ref, the same codepoint has multiple names
3777          foreach my $name (ref $n ? @$n : $n) {
3778              $name{$cp} //= $name;
3779              $cp{$name} //= $cp;
3780          }
3781      }
3782  }
3783  # Named sequences
3784  {   my %ns = namedseq();
3785      foreach my $name (sort { $ns{$a} cmp $ns{$b} } keys %ns) {
3786          $cp{$name} //= [ map { ord } split "" => $ns{$name} ];
3787      }
3788  }
3789
3790=cut
3791
3792# User-defined properties could be handled with some changes to SWASHNEW;
3793# if done, consideration should be given to the fact that the user subroutine
3794# could return different results with each call, which could lead to some
3795# security issues.
3796
3797# One could store things in memory so they don't have to be recalculated, but
3798# it is unlikely this will be called often, and some properties would take up
3799# significant memory.
3800
3801# These are created by mktables for this routine and stored in unicore/UCD.pl
3802# where their structures are described.
3803our @algorithmic_named_code_points;
3804our $HANGUL_BEGIN;
3805our $HANGUL_COUNT;
3806
3807sub prop_invmap ($;$) {
3808
3809    croak __PACKAGE__, "::prop_invmap: must be called in list context" unless wantarray;
3810
3811    my $prop = $_[0];
3812    return unless defined $prop;
3813
3814    # Undocumented way to get at Perl internal properties; it may be changed
3815    # or removed without notice at any time.  It currently also changes the
3816    # output to use the format specified in the file rather than the one we
3817    # normally compute and return
3818    my $internal_ok = defined $_[1] && $_[1] eq '_perl_core_internal_ok';
3819
3820    # Fail internal properties
3821    return if $prop =~ /^_/ && ! $internal_ok;
3822
3823    # The values returned by this function.
3824    my (@invlist, @invmap, $format, $missing);
3825
3826    # The swash has two components we look at, the base list, and a hash,
3827    # named 'SPECIALS', containing any additional members whose mappings don't
3828    # fit into the base list scheme of things.  These generally 'override'
3829    # any value in the base list for the same code point.
3830    my $overrides;
3831
3832    require "unicore/UCD.pl";
3833
3834RETRY:
3835
3836    # If there are multiple entries for a single code point
3837    my $has_multiples = 0;
3838
3839    # Try to get the map swash for the property.  They have 'To' prepended to
3840    # the property name, and 32 means we will accept 32 bit return values.
3841    # The 0 means we aren't calling this from tr///.
3842    my $swash = SWASHNEW(__PACKAGE__, "To$prop", undef, 32, 0);
3843
3844    # If didn't find it, could be because needs a proxy.  And if was the
3845    # 'Block' or 'Name' property, use a proxy even if did find it.  Finding it
3846    # in these cases would be the result of the installation changing mktables
3847    # to output the Block or Name tables.  The Block table gives block names
3848    # in the new-style, and this routine is supposed to return old-style block
3849    # names.  The Name table is valid, but we need to execute the special code
3850    # below to add in the algorithmic-defined name entries.
3851    # And NFKCCF needs conversion, so handle that here too.
3852    if (ref $swash eq ""
3853        || $swash->{'TYPE'} =~ / ^ To (?: Blk | Na | NFKCCF ) $ /x)
3854    {
3855
3856        # Get the short name of the input property, in standard form
3857        my ($second_try) = prop_aliases($prop);
3858        return unless $second_try;
3859        $second_try = loose_name(lc $second_try);
3860
3861        if ($second_try eq "in") {
3862
3863            # This property is identical to age for inversion map purposes
3864            $prop = "age";
3865            goto RETRY;
3866        }
3867        elsif ($second_try =~ / ^ s ( cf | fc | [ltu] c ) $ /x) {
3868
3869            # These properties use just the LIST part of the full mapping,
3870            # which includes the simple maps that are otherwise overridden by
3871            # the SPECIALS.  So all we need do is to not look at the SPECIALS;
3872            # set $overrides to indicate that
3873            $overrides = -1;
3874
3875            # The full name is the simple name stripped of its initial 's'
3876            $prop = $1;
3877
3878            # .. except for this case
3879            $prop = 'cf' if $prop eq 'fc';
3880
3881            goto RETRY;
3882        }
3883        elsif ($second_try eq "blk") {
3884
3885            # We use the old block names.  Just create a fake swash from its
3886            # data.
3887            _charblocks();
3888            my %blocks;
3889            $blocks{'LIST'} = "";
3890            $blocks{'TYPE'} = "ToBlk";
3891            $SwashInfo{ToBlk}{'missing'} = "No_Block";
3892            $SwashInfo{ToBlk}{'format'} = "s";
3893
3894            foreach my $block (@BLOCKS) {
3895                $blocks{'LIST'} .= sprintf "%x\t%x\t%s\n",
3896                                           $block->[0],
3897                                           $block->[1],
3898                                           $block->[2];
3899            }
3900            $swash = \%blocks;
3901        }
3902        elsif ($second_try eq "na") {
3903
3904            # Use the combo file that has all the Name-type properties in it,
3905            # extracting just the ones that are for the actual 'Name'
3906            # property.  And create a fake swash from it.
3907            my %names;
3908            $names{'LIST'} = "";
3909            my $original = do "unicore/Name.pl";
3910
3911            # Change the double \n format of the file back to single lines
3912            # with a tab
3913            $original =~ s/\n\n/\e/g;   # Use a control that shouldn't occur
3914                                        #in the file
3915            $original =~ s/\n/\t/g;
3916            $original =~ s/\e/\n/g;
3917
3918            my $algorithm_names = \@algorithmic_named_code_points;
3919
3920            # We need to remove the names from it that are aliases.  For that
3921            # we need to also read in that table.  Create a hash with the keys
3922            # being the code points, and the values being a list of the
3923            # aliases for the code point key.
3924            my ($aliases_code_points, $aliases_maps, undef, undef)
3925                  = &prop_invmap("_Perl_Name_Alias", '_perl_core_internal_ok');
3926            my %aliases;
3927            for (my $i = 0; $i < @$aliases_code_points; $i++) {
3928                my $code_point = $aliases_code_points->[$i];
3929                $aliases{$code_point} = $aliases_maps->[$i];
3930
3931                # If not already a list, make it into one, so that later we
3932                # can treat things uniformly
3933                if (! ref $aliases{$code_point}) {
3934                    $aliases{$code_point} = [ $aliases{$code_point} ];
3935                }
3936
3937                # Remove the alias type from the entry, retaining just the
3938                # name.
3939                map { s/:.*// } @{$aliases{$code_point}};
3940            }
3941
3942            my $i = 0;
3943            foreach my $line (split "\n", $original) {
3944                my ($hex_code_point, $name) = split "\t", $line;
3945
3946                # Weeds out any comments, blank lines, and named sequences
3947                next if $hex_code_point =~ /[^[:xdigit:]]/a;
3948
3949                my $code_point = hex $hex_code_point;
3950
3951                # The name of all controls is the default: the empty string.
3952                # The set of controls is immutable
3953                next if chr($code_point) =~ /[[:cntrl:]]/u;
3954
3955                # If this is a name_alias, it isn't a name
3956                next if grep { $_ eq $name } @{$aliases{$code_point}};
3957
3958                # If we are beyond where one of the special lines needs to
3959                # be inserted ...
3960                while ($i < @$algorithm_names
3961                    && $code_point > $algorithm_names->[$i]->{'low'})
3962                {
3963
3964                    # ... then insert it, ahead of what we were about to
3965                    # output
3966                    $names{'LIST'} .= sprintf "%x\t%x\t%s\n",
3967                                            $algorithm_names->[$i]->{'low'},
3968                                            $algorithm_names->[$i]->{'high'},
3969                                            $algorithm_names->[$i]->{'name'};
3970
3971                    # Done with this range.
3972                    $i++;
3973
3974                    # We loop until all special lines that precede the next
3975                    # regular one are output.
3976                }
3977
3978                # Here, is a normal name.
3979                $names{'LIST'} .= sprintf "%x\t\t%s\n", $code_point, $name;
3980            } # End of loop through all the names
3981
3982            $names{'TYPE'} = "ToNa";
3983            $SwashInfo{ToNa}{'missing'} = "";
3984            $SwashInfo{ToNa}{'format'} = "n";
3985            $swash = \%names;
3986        }
3987        elsif ($second_try =~ / ^ ( d [mt] ) $ /x) {
3988
3989            # The file is a combination of dt and dm properties.  Create a
3990            # fake swash from the portion that we want.
3991            my $original = do "unicore/Decomposition.pl";
3992            my %decomps;
3993
3994            if ($second_try eq 'dt') {
3995                $decomps{'TYPE'} = "ToDt";
3996                $SwashInfo{'ToDt'}{'missing'} = "None";
3997                $SwashInfo{'ToDt'}{'format'} = "s";
3998            }   # 'dm' is handled below, with 'nfkccf'
3999
4000            $decomps{'LIST'} = "";
4001
4002            # This property has one special range not in the file: for the
4003            # hangul syllables.  But not in Unicode version 1.
4004            UnicodeVersion() unless defined $v_unicode_version;
4005            my $done_hangul = ($v_unicode_version lt v2.0.0)
4006                              ? 1
4007                              : 0;    # Have we done the hangul range ?
4008            foreach my $line (split "\n", $original) {
4009                my ($hex_lower, $hex_upper, $type_and_map) = split "\t", $line;
4010                my $code_point = hex $hex_lower;
4011                my $value;
4012                my $redo = 0;
4013
4014                # The type, enclosed in <...>, precedes the mapping separated
4015                # by blanks
4016                if ($type_and_map =~ / ^ < ( .* ) > \s+ (.*) $ /x) {
4017                    $value = ($second_try eq 'dt') ? $1 : $2
4018                }
4019                else {  # If there is no type specified, it's canonical
4020                    $value = ($second_try eq 'dt')
4021                             ? "Canonical" :
4022                             $type_and_map;
4023                }
4024
4025                # Insert the hangul range at the appropriate spot.
4026                if (! $done_hangul && $code_point > $HANGUL_BEGIN) {
4027                    $done_hangul = 1;
4028                    $decomps{'LIST'} .=
4029                                sprintf "%x\t%x\t%s\n",
4030                                        $HANGUL_BEGIN,
4031                                        $HANGUL_BEGIN + $HANGUL_COUNT - 1,
4032                                        ($second_try eq 'dt')
4033                                        ? "Canonical"
4034                                        : "<hangul syllable>";
4035                }
4036
4037                if ($value =~ / / && $hex_upper ne "" && $hex_upper ne $hex_lower) {
4038                    $line = sprintf("%04X\t%s\t%s", hex($hex_lower) + 1, $hex_upper, $value);
4039                    $hex_upper = "";
4040                    $redo = 1;
4041                }
4042
4043                # And append this to our constructed LIST.
4044                $decomps{'LIST'} .= "$hex_lower\t$hex_upper\t$value\n";
4045
4046                redo if $redo;
4047            }
4048            $swash = \%decomps;
4049        }
4050        elsif ($second_try ne 'nfkccf') { # Don't know this property. Fail.
4051            return;
4052        }
4053
4054        if ($second_try eq 'nfkccf' || $second_try eq 'dm') {
4055
4056            # The 'nfkccf' property is stored in the old format for backwards
4057            # compatibility for any applications that has read its file
4058            # directly before prop_invmap() existed.
4059            # And the code above has extracted the 'dm' property from its file
4060            # yielding the same format.  So here we convert them to adjusted
4061            # format for compatibility with the other properties similar to
4062            # them.
4063            my %revised_swash;
4064
4065            # We construct a new converted list.
4066            my $list = "";
4067
4068            my @ranges = split "\n", $swash->{'LIST'};
4069            for (my $i = 0; $i < @ranges; $i++) {
4070                my ($hex_begin, $hex_end, $map) = split "\t", $ranges[$i];
4071
4072                # The dm property has maps that are space separated sequences
4073                # of code points, as well as the special entry "<hangul
4074                # syllable>, which also contains a blank.
4075                my @map = split " ", $map;
4076                if (@map > 1) {
4077
4078                    # If it's just the special entry, append as-is.
4079                    if ($map eq '<hangul syllable>') {
4080                        $list .= "$ranges[$i]\n";
4081                    }
4082                    else {
4083
4084                        # These should all be single-element ranges.
4085                        croak __PACKAGE__, "::prop_invmap: Not expecting a mapping with multiple code points in a multi-element range, $ranges[$i]" if $hex_end ne "" && $hex_end ne $hex_begin;
4086
4087                        # Convert them to decimal, as that's what's expected.
4088                        $list .= "$hex_begin\t\t"
4089                            . join(" ", map { hex } @map)
4090                            . "\n";
4091                    }
4092                    next;
4093                }
4094
4095                # Here, the mapping doesn't have a blank, is for a single code
4096                # point.
4097                my $begin = hex $hex_begin;
4098                my $end = (defined $hex_end && $hex_end ne "")
4099                        ? hex $hex_end
4100                        : $begin;
4101
4102                # Again, the output is to be in decimal.
4103                my $decimal_map = hex $map;
4104
4105                # We know that multi-element ranges with the same mapping
4106                # should not be adjusted, as after the adjustment
4107                # multi-element ranges are for consecutive increasing code
4108                # points.  Further, the final element in the list won't be
4109                # adjusted, as there is nothing after it to include in the
4110                # adjustment
4111                if ($begin != $end || $i == @ranges -1) {
4112
4113                    # So just convert these to single-element ranges
4114                    foreach my $code_point ($begin .. $end) {
4115                        $list .= sprintf("%04X\t\t%d\n",
4116                                        $code_point, $decimal_map);
4117                    }
4118                }
4119                else {
4120
4121                    # Here, we have a candidate for adjusting.  What we do is
4122                    # look through the subsequent adjacent elements in the
4123                    # input.  If the map to the next one differs by 1 from the
4124                    # one before, then we combine into a larger range with the
4125                    # initial map.  Loop doing this until we find one that
4126                    # can't be combined.
4127
4128                    my $offset = 0;     # How far away are we from the initial
4129                                        # map
4130                    my $squished = 0;   # ? Did we squish at least two
4131                                        # elements together into one range
4132                    for ( ; $i < @ranges; $i++) {
4133                        my ($next_hex_begin, $next_hex_end, $next_map)
4134                                                = split "\t", $ranges[$i+1];
4135
4136                        # In the case of 'dm', the map may be a sequence of
4137                        # multiple code points, which are never combined with
4138                        # another range
4139                        last if $next_map =~ / /;
4140
4141                        $offset++;
4142                        my $next_decimal_map = hex $next_map;
4143
4144                        # If the next map is not next in sequence, it
4145                        # shouldn't be combined.
4146                        last if $next_decimal_map != $decimal_map + $offset;
4147
4148                        my $next_begin = hex $next_hex_begin;
4149
4150                        # Likewise, if the next element isn't adjacent to the
4151                        # previous one, it shouldn't be combined.
4152                        last if $next_begin != $begin + $offset;
4153
4154                        my $next_end = (defined $next_hex_end
4155                                        && $next_hex_end ne "")
4156                                            ? hex $next_hex_end
4157                                            : $next_begin;
4158
4159                        # And finally, if the next element is a multi-element
4160                        # range, it shouldn't be combined.
4161                        last if $next_end != $next_begin;
4162
4163                        # Here, we will combine.  Loop to see if we should
4164                        # combine the next element too.
4165                        $squished = 1;
4166                    }
4167
4168                    if ($squished) {
4169
4170                        # Here, 'i' is the element number of the last element to
4171                        # be combined, and the range is single-element, or we
4172                        # wouldn't be combining.  Get it's code point.
4173                        my ($hex_end, undef, undef) = split "\t", $ranges[$i];
4174                        $list .= "$hex_begin\t$hex_end\t$decimal_map\n";
4175                    } else {
4176
4177                        # Here, no combining done.  Just append the initial
4178                        # (and current) values.
4179                        $list .= "$hex_begin\t\t$decimal_map\n";
4180                    }
4181                }
4182            } # End of loop constructing the converted list
4183
4184            # Finish up the data structure for our converted swash
4185            my $type = ($second_try eq 'nfkccf') ? 'ToNFKCCF' : 'ToDm';
4186            $revised_swash{'LIST'} = $list;
4187            $revised_swash{'TYPE'} = $type;
4188            $revised_swash{'SPECIALS'} = $swash->{'SPECIALS'};
4189            $swash = \%revised_swash;
4190
4191            $SwashInfo{$type}{'missing'} = 0;
4192            $SwashInfo{$type}{'format'} = 'a';
4193        }
4194    }
4195
4196    if ($swash->{'EXTRAS'}) {
4197        carp __PACKAGE__, "::prop_invmap: swash returned for $prop unexpectedly has EXTRAS magic";
4198        return;
4199    }
4200
4201    # Here, have a valid swash return.  Examine it.
4202    my $returned_prop = $swash->{'TYPE'};
4203
4204    # All properties but binary ones should have 'missing' and 'format'
4205    # entries
4206    $missing = $SwashInfo{$returned_prop}{'missing'};
4207    $missing = 'N' unless defined $missing;
4208
4209    $format = $SwashInfo{$returned_prop}{'format'};
4210    $format = 'b' unless defined $format;
4211
4212    my $requires_adjustment = $format =~ /^a/;
4213
4214    if ($swash->{'LIST'} =~ /^V/) {
4215        @invlist = split "\n", $swash->{'LIST'} =~ s/ \s* (?: \# .* )? $ //xmgr;
4216
4217        shift @invlist;     # Get rid of 'V';
4218
4219        # Could need to be inverted: add or subtract a 0 at the beginning of
4220        # the list.
4221        if ($swash->{'INVERT_IT'}) {
4222            if (@invlist && $invlist[0] == 0) {
4223                shift @invlist;
4224            }
4225            else {
4226                unshift @invlist, 0;
4227            }
4228        }
4229
4230        if (@invlist) {
4231            foreach my $i (0 .. @invlist - 1) {
4232                $invmap[$i] = ($i % 2 == 0) ? 'Y' : 'N'
4233            }
4234
4235            # The map includes lines for all code points; add one for the range
4236            # from 0 to the first Y.
4237            if ($invlist[0] != 0) {
4238                unshift @invlist, 0;
4239                unshift @invmap, 'N';
4240            }
4241        }
4242    }
4243    else {
4244        if ($swash->{'INVERT_IT'}) {
4245            croak __PACKAGE__, ":prop_invmap: Don't know how to deal with inverted";
4246        }
4247
4248        # The LIST input lines look like:
4249        # ...
4250        # 0374\t\tCommon
4251        # 0375\t0377\tGreek   # [3]
4252        # 037A\t037D\tGreek   # [4]
4253        # 037E\t\tCommon
4254        # 0384\t\tGreek
4255        # ...
4256        #
4257        # Convert them to like
4258        # 0374 => Common
4259        # 0375 => Greek
4260        # 0378 => $missing
4261        # 037A => Greek
4262        # 037E => Common
4263        # 037F => $missing
4264        # 0384 => Greek
4265        #
4266        # For binary properties, the final non-comment column is absent, and
4267        # assumed to be 'Y'.
4268
4269        foreach my $range (split "\n", $swash->{'LIST'}) {
4270            $range =~ s/ \s* (?: \# .* )? $ //xg; # rmv trailing space, comments
4271
4272            # Find the beginning and end of the range on the line
4273            my ($hex_begin, $hex_end, $map) = split "\t", $range;
4274            my $begin = hex $hex_begin;
4275            no warnings 'portable';
4276            my $end = (defined $hex_end && $hex_end ne "")
4277                    ? hex $hex_end
4278                    : $begin;
4279
4280            # Each time through the loop (after the first):
4281            # $invlist[-2] contains the beginning of the previous range processed
4282            # $invlist[-1] contains the end+1 of the previous range processed
4283            # $invmap[-2] contains the value of the previous range processed
4284            # $invmap[-1] contains the default value for missing ranges
4285            #                                                       ($missing)
4286            #
4287            # Thus, things are set up for the typical case of a new
4288            # non-adjacent range of non-missings to be added.  But, if the new
4289            # range is adjacent, it needs to replace the [-1] element; and if
4290            # the new range is a multiple value of the previous one, it needs
4291            # to be added to the [-2] map element.
4292
4293            # The first time through, everything will be empty.  If the
4294            # property doesn't have a range that begins at 0, add one that
4295            # maps to $missing
4296            if (! @invlist) {
4297                if ($begin != 0) {
4298                    push @invlist, 0;
4299                    push @invmap, $missing;
4300                }
4301            }
4302            elsif (@invlist > 1 && $invlist[-2] == $begin) {
4303
4304                # Here we handle the case where the input has multiple entries
4305                # for each code point.  mktables should have made sure that
4306                # each such range contains only one code point.  At this
4307                # point, $invlist[-1] is the $missing that was added at the
4308                # end of the last loop iteration, and [-2] is the last real
4309                # input code point, and that code point is the same as the one
4310                # we are adding now, making the new one a multiple entry.  Add
4311                # it to the existing entry, either by pushing it to the
4312                # existing list of multiple entries, or converting the single
4313                # current entry into a list with both on it.  This is all we
4314                # need do for this iteration.
4315
4316                if ($end != $begin) {
4317                    croak __PACKAGE__, ":prop_invmap: Multiple maps per code point in '$prop' require single-element ranges: begin=$begin, end=$end, map=$map";
4318                }
4319                if (! ref $invmap[-2]) {
4320                    $invmap[-2] = [ $invmap[-2], $map ];
4321                }
4322                else {
4323                    push @{$invmap[-2]}, $map;
4324                }
4325                $has_multiples = 1;
4326                next;
4327            }
4328            elsif ($invlist[-1] == $begin) {
4329
4330                # If the input isn't in the most compact form, so that there
4331                # are two adjacent ranges that map to the same thing, they
4332                # should be combined (EXCEPT where the arrays require
4333                # adjustments, in which case everything is already set up
4334                # correctly).  This happens in our constructed dt mapping, as
4335                # Element [-2] is the map for the latest range so far
4336                # processed.  Just set the beginning point of the map to
4337                # $missing (in invlist[-1]) to 1 beyond where this range ends.
4338                # For example, in
4339                # 12\t13\tXYZ
4340                # 14\t17\tXYZ
4341                # we have set it up so that it looks like
4342                # 12 => XYZ
4343                # 14 => $missing
4344                #
4345                # We now see that it should be
4346                # 12 => XYZ
4347                # 18 => $missing
4348                if (! $requires_adjustment && @invlist > 1 && ( (defined $map)
4349                                    ? $invmap[-2] eq $map
4350                                    : $invmap[-2] eq 'Y'))
4351                {
4352                    $invlist[-1] = $end + 1;
4353                    next;
4354                }
4355
4356                # Here, the range started in the previous iteration that maps
4357                # to $missing starts at the same code point as this range.
4358                # That means there is no gap to fill that that range was
4359                # intended for, so we just pop it off the parallel arrays.
4360                pop @invlist;
4361                pop @invmap;
4362            }
4363
4364            # Add the range beginning, and the range's map.
4365            push @invlist, $begin;
4366            if ($returned_prop eq 'ToDm') {
4367
4368                # The decomposition maps are either a line like <hangul
4369                # syllable> which are to be taken as is; or a sequence of code
4370                # points in hex and separated by blanks.  Convert them to
4371                # decimal, and if there is more than one, use an anonymous
4372                # array as the map.
4373                if ($map =~ /^ < /x) {
4374                    push @invmap, $map;
4375                }
4376                else {
4377                    my @map = split " ", $map;
4378                    if (@map == 1) {
4379                        push @invmap, $map[0];
4380                    }
4381                    else {
4382                        push @invmap, \@map;
4383                    }
4384                }
4385            }
4386            else {
4387
4388                # Otherwise, convert hex formatted list entries to decimal;
4389                # add a 'Y' map for the missing value in binary properties, or
4390                # otherwise, use the input map unchanged.
4391                $map = ($format eq 'x' || $format eq 'ax')
4392                    ? hex $map
4393                    : $format eq 'b'
4394                    ? 'Y'
4395                    : $map;
4396                push @invmap, $map;
4397            }
4398
4399            # We just started a range.  It ends with $end.  The gap between it
4400            # and the next element in the list must be filled with a range
4401            # that maps to the default value.  If there is no gap, the next
4402            # iteration will pop this, unless there is no next iteration, and
4403            # we have filled all of the Unicode code space, so check for that
4404            # and skip.
4405            if ($end < $MAX_CP) {
4406                push @invlist, $end + 1;
4407                push @invmap, $missing;
4408            }
4409        }
4410    }
4411
4412    # If the property is empty, make all code points use the value for missing
4413    # ones.
4414    if (! @invlist) {
4415        push @invlist, 0;
4416        push @invmap, $missing;
4417    }
4418
4419    # The final element is always for just the above-Unicode code points.  If
4420    # not already there, add it.  It merely splits the current final range
4421    # that extends to infinity into two elements, each with the same map.
4422    # (This is to conform with the API that says the final element is for
4423    # $MAX_UNICODE_CODEPOINT + 1 .. INFINITY.)
4424    if ($invlist[-1] != $MAX_UNICODE_CODEPOINT + 1) {
4425        push @invmap, $invmap[-1];
4426        push @invlist, $MAX_UNICODE_CODEPOINT + 1;
4427    }
4428
4429    # The second component of the map are those values that require
4430    # non-standard specification, stored in SPECIALS.  These override any
4431    # duplicate code points in LIST.  If we are using a proxy, we may have
4432    # already set $overrides based on the proxy.
4433    $overrides = $swash->{'SPECIALS'} unless defined $overrides;
4434    if ($overrides) {
4435
4436        # A negative $overrides implies that the SPECIALS should be ignored,
4437        # and a simple 'a' list is the value.
4438        if ($overrides < 0) {
4439            $format = 'a';
4440        }
4441        else {
4442
4443            # Currently, all overrides are for properties that normally map to
4444            # single code points, but now some will map to lists of code
4445            # points (but there is an exception case handled below).
4446            $format = 'al';
4447
4448            # Look through the overrides.
4449            foreach my $cp_maybe_utf8 (keys %$overrides) {
4450                my $cp;
4451                my @map;
4452
4453                # If the overrides came from SPECIALS, the code point keys are
4454                # packed UTF-8.
4455                if ($overrides == $swash->{'SPECIALS'}) {
4456                    $cp = $cp_maybe_utf8;
4457                    if (! utf8::decode($cp)) {
4458                        croak __PACKAGE__, "::prop_invmap: Malformed UTF-8: ",
4459                              map { sprintf("\\x{%02X}", unpack("C", $_)) }
4460                                                                split "", $cp;
4461                    }
4462
4463                    $cp = unpack("W", $cp);
4464                    @map = unpack "W*", $swash->{'SPECIALS'}{$cp_maybe_utf8};
4465
4466                    # The empty string will show up unpacked as an empty
4467                    # array.
4468                    $format = 'ale' if @map == 0;
4469                }
4470                else {
4471
4472                    # But if we generated the overrides, we didn't bother to
4473                    # pack them, and we, so far, do this only for properties
4474                    # that are 'a' ones.
4475                    $cp = $cp_maybe_utf8;
4476                    @map = hex $overrides->{$cp};
4477                    $format = 'a';
4478                }
4479
4480                # Find the range that the override applies to.
4481                my $i = search_invlist(\@invlist, $cp);
4482                if ($cp < $invlist[$i] || $cp >= $invlist[$i + 1]) {
4483                    croak __PACKAGE__, "::prop_invmap: wrong_range, cp=$cp; i=$i, current=$invlist[$i]; next=$invlist[$i + 1]"
4484                }
4485
4486                # And what that range currently maps to
4487                my $cur_map = $invmap[$i];
4488
4489                # If there is a gap between the next range and the code point
4490                # we are overriding, we have to add elements to both arrays to
4491                # fill that gap, using the map that applies to it, which is
4492                # $cur_map, since it is part of the current range.
4493                if ($invlist[$i + 1] > $cp + 1) {
4494                    #use feature 'say';
4495                    #say "Before splice:";
4496                    #say 'i-2=[', $i-2, ']', sprintf("%04X maps to %s", $invlist[$i-2], $invmap[$i-2]) if $i >= 2;
4497                    #say 'i-1=[', $i-1, ']', sprintf("%04X maps to %s", $invlist[$i-1], $invmap[$i-1]) if $i >= 1;
4498                    #say 'i  =[', $i, ']', sprintf("%04X maps to %s", $invlist[$i], $invmap[$i]);
4499                    #say 'i+1=[', $i+1, ']', sprintf("%04X maps to %s", $invlist[$i+1], $invmap[$i+1]) if $i < @invlist + 1;
4500                    #say 'i+2=[', $i+2, ']', sprintf("%04X maps to %s", $invlist[$i+2], $invmap[$i+2]) if $i < @invlist + 2;
4501
4502                    splice @invlist, $i + 1, 0, $cp + 1;
4503                    splice @invmap, $i + 1, 0, $cur_map;
4504
4505                    #say "After splice:";
4506                    #say 'i-2=[', $i-2, ']', sprintf("%04X maps to %s", $invlist[$i-2], $invmap[$i-2]) if $i >= 2;
4507                    #say 'i-1=[', $i-1, ']', sprintf("%04X maps to %s", $invlist[$i-1], $invmap[$i-1]) if $i >= 1;
4508                    #say 'i  =[', $i, ']', sprintf("%04X maps to %s", $invlist[$i], $invmap[$i]);
4509                    #say 'i+1=[', $i+1, ']', sprintf("%04X maps to %s", $invlist[$i+1], $invmap[$i+1]) if $i < @invlist + 1;
4510                    #say 'i+2=[', $i+2, ']', sprintf("%04X maps to %s", $invlist[$i+2], $invmap[$i+2]) if $i < @invlist + 2;
4511                }
4512
4513                # If the remaining portion of the range is multiple code
4514                # points (ending with the one we are replacing, guaranteed by
4515                # the earlier splice).  We must split it into two
4516                if ($invlist[$i] < $cp) {
4517                    $i++;   # Compensate for the new element
4518
4519                    #use feature 'say';
4520                    #say "Before splice:";
4521                    #say 'i-2=[', $i-2, ']', sprintf("%04X maps to %s", $invlist[$i-2], $invmap[$i-2]) if $i >= 2;
4522                    #say 'i-1=[', $i-1, ']', sprintf("%04X maps to %s", $invlist[$i-1], $invmap[$i-1]) if $i >= 1;
4523                    #say 'i  =[', $i, ']', sprintf("%04X maps to %s", $invlist[$i], $invmap[$i]);
4524                    #say 'i+1=[', $i+1, ']', sprintf("%04X maps to %s", $invlist[$i+1], $invmap[$i+1]) if $i < @invlist + 1;
4525                    #say 'i+2=[', $i+2, ']', sprintf("%04X maps to %s", $invlist[$i+2], $invmap[$i+2]) if $i < @invlist + 2;
4526
4527                    splice @invlist, $i, 0, $cp;
4528                    splice @invmap, $i, 0, 'dummy';
4529
4530                    #say "After splice:";
4531                    #say 'i-2=[', $i-2, ']', sprintf("%04X maps to %s", $invlist[$i-2], $invmap[$i-2]) if $i >= 2;
4532                    #say 'i-1=[', $i-1, ']', sprintf("%04X maps to %s", $invlist[$i-1], $invmap[$i-1]) if $i >= 1;
4533                    #say 'i  =[', $i, ']', sprintf("%04X maps to %s", $invlist[$i], $invmap[$i]);
4534                    #say 'i+1=[', $i+1, ']', sprintf("%04X maps to %s", $invlist[$i+1], $invmap[$i+1]) if $i < @invlist + 1;
4535                    #say 'i+2=[', $i+2, ']', sprintf("%04X maps to %s", $invlist[$i+2], $invmap[$i+2]) if $i < @invlist + 2;
4536                }
4537
4538                # Here, the range we are overriding contains a single code
4539                # point.  The result could be the empty string, a single
4540                # value, or a list.  If the last case, we use an anonymous
4541                # array.
4542                $invmap[$i] = (scalar @map == 0)
4543                               ? ""
4544                               : (scalar @map > 1)
4545                                  ? \@map
4546                                  : $map[0];
4547            }
4548        }
4549    }
4550    elsif ($format eq 'x') {
4551
4552        # All hex-valued properties are really to code points, and have been
4553        # converted to decimal.
4554        $format = 's';
4555    }
4556    elsif ($returned_prop eq 'ToDm') {
4557        $format = 'ad';
4558    }
4559    elsif ($format eq 'sw') { # blank-separated elements to form a list.
4560        map { $_ = [ split " ", $_  ] if $_ =~ / / } @invmap;
4561        $format = 'sl';
4562    }
4563    elsif ($returned_prop =~ / To ( _Perl )? NameAlias/x) {
4564
4565        # This property currently doesn't have any lists, but theoretically
4566        # could
4567        $format = 'sl';
4568    }
4569    elsif ($returned_prop eq 'ToPerlDecimalDigit') {
4570        $format = 'ae';
4571    }
4572    elsif ($returned_prop eq 'ToNv') {
4573
4574        # The one property that has this format is stored as a delta, so needs
4575        # to indicate that need to add code point to it.
4576        $format = 'ar';
4577    }
4578    elsif ($format eq 'ax') {
4579
4580        # Normally 'ax' properties have overrides, and will have been handled
4581        # above, but if not, they still need adjustment, and the hex values
4582        # have already been converted to decimal
4583        $format = 'a';
4584    }
4585    elsif ($format ne 'n' && $format !~ / ^ a /x) {
4586
4587        # All others are simple scalars
4588        $format = 's';
4589    }
4590    if ($has_multiples &&  $format !~ /l/) {
4591	croak __PACKAGE__, "::prop_invmap: Wrong format '$format' for prop_invmap('$prop'); should indicate has lists";
4592    }
4593
4594    return (\@invlist, \@invmap, $format, $missing);
4595}
4596
4597sub search_invlist {
4598
4599=pod
4600
4601=head2 B<search_invlist()>
4602
4603 use Unicode::UCD qw(prop_invmap prop_invlist);
4604 use Unicode::UCD 'search_invlist';
4605
4606 my @invlist = prop_invlist($property_name);
4607 print $code_point, ((search_invlist(\@invlist, $code_point) // -1) % 2)
4608                     ? " isn't"
4609                     : " is",
4610     " in $property_name\n";
4611
4612 my ($blocks_ranges_ref, $blocks_map_ref) = prop_invmap("Block");
4613 my $index = search_invlist($blocks_ranges_ref, $code_point);
4614 print "$code_point is in block ", $blocks_map_ref->[$index], "\n";
4615
4616C<search_invlist> is used to search an inversion list returned by
4617C<prop_invlist> or C<prop_invmap> for a particular L</code point argument>.
4618C<undef> is returned if the code point is not found in the inversion list
4619(this happens only when it is not a legal L</code point argument>, or is less
4620than the list's first element).  A warning is raised in the first instance.
4621
4622Otherwise, it returns the index into the list of the range that contains the
4623code point.; that is, find C<i> such that
4624
4625    list[i]<= code_point < list[i+1].
4626
4627As explained in L</prop_invlist()>, whether a code point is in the list or not
4628depends on if the index is even (in) or odd (not in).  And as explained in
4629L</prop_invmap()>, the index is used with the returned parallel array to find
4630the mapping.
4631
4632=cut
4633
4634
4635    my $list_ref = shift;
4636    my $input_code_point = shift;
4637    my $code_point = _getcode($input_code_point);
4638
4639    if (! defined $code_point) {
4640        carp __PACKAGE__, "::search_invlist: unknown code '$input_code_point'";
4641        return;
4642    }
4643
4644    my $max_element = @$list_ref - 1;
4645
4646    # Return undef if list is empty or requested item is before the first element.
4647    return if $max_element < 0;
4648    return if $code_point < $list_ref->[0];
4649
4650    # Short cut something at the far-end of the table.  This also allows us to
4651    # refer to element [$i+1] without fear of being out-of-bounds in the loop
4652    # below.
4653    return $max_element if $code_point >= $list_ref->[$max_element];
4654
4655    use integer;        # want integer division
4656
4657    my $i = $max_element / 2;
4658
4659    my $lower = 0;
4660    my $upper = $max_element;
4661    while (1) {
4662
4663        if ($code_point >= $list_ref->[$i]) {
4664
4665            # Here we have met the lower constraint.  We can quit if we
4666            # also meet the upper one.
4667            last if $code_point < $list_ref->[$i+1];
4668
4669            $lower = $i;        # Still too low.
4670
4671        }
4672        else {
4673
4674            # Here, $code_point < $list_ref[$i], so look lower down.
4675            $upper = $i;
4676        }
4677
4678        # Split search domain in half to try again.
4679        my $temp = ($upper + $lower) / 2;
4680
4681        # No point in continuing unless $i changes for next time
4682        # in the loop.
4683        return $i if $temp == $i;
4684        $i = $temp;
4685    } # End of while loop
4686
4687    # Here we have found the offset
4688    return $i;
4689}
4690
4691=head2 Unicode::UCD::UnicodeVersion
4692
4693This returns the version of the Unicode Character Database, in other words, the
4694version of the Unicode standard the database implements.  The version is a
4695string of numbers delimited by dots (C<'.'>).
4696
4697=cut
4698
4699my $UNICODEVERSION;
4700
4701sub UnicodeVersion {
4702    unless (defined $UNICODEVERSION) {
4703	my $versionfh = openunicode("version");
4704	local $/ = "\n";
4705	chomp($UNICODEVERSION = <$versionfh>);
4706	croak __PACKAGE__, "::VERSION: strange version '$UNICODEVERSION'"
4707	    unless $UNICODEVERSION =~ /^\d+(?:\.\d+)+$/;
4708    }
4709    $v_unicode_version = pack "C*", split /\./, $UNICODEVERSION;
4710    return $UNICODEVERSION;
4711}
4712
4713=head2 B<Blocks versus Scripts>
4714
4715The difference between a block and a script is that scripts are closer
4716to the linguistic notion of a set of code points required to represent
4717languages, while block is more of an artifact of the Unicode code point
4718numbering and separation into blocks of consecutive code points (so far the
4719size of a block is some multiple of 16, like 128 or 256).
4720
4721For example the Latin B<script> is spread over several B<blocks>, such
4722as C<Basic Latin>, C<Latin 1 Supplement>, C<Latin Extended-A>, and
4723C<Latin Extended-B>.  On the other hand, the Latin script does not
4724contain all the characters of the C<Basic Latin> block (also known as
4725ASCII): it includes only the letters, and not, for example, the digits
4726nor the punctuation.
4727
4728For blocks see L<http://www.unicode.org/Public/UNIDATA/Blocks.txt>
4729
4730For scripts see UTR #24: L<http://www.unicode.org/unicode/reports/tr24/>
4731
4732=head2 B<Matching Scripts and Blocks>
4733
4734Scripts are matched with the regular-expression construct
4735C<\p{...}> (e.g. C<\p{Tibetan}> matches characters of the Tibetan script),
4736while C<\p{Blk=...}> is used for blocks (e.g. C<\p{Blk=Tibetan}> matches
4737any of the 256 code points in the Tibetan block).
4738
4739=head2 Old-style versus new-style block names
4740
4741Unicode publishes the names of blocks in two different styles, though the two
4742are equivalent under Unicode's loose matching rules.
4743
4744The original style uses blanks and hyphens in the block names (except for
4745C<No_Block>), like so:
4746
4747 Miscellaneous Mathematical Symbols-B
4748
4749The newer style replaces these with underscores, like this:
4750
4751 Miscellaneous_Mathematical_Symbols_B
4752
4753This newer style is consistent with the values of other Unicode properties.
4754To preserve backward compatibility, all the functions in Unicode::UCD that
4755return block names (except as noted) return the old-style ones.
4756L</prop_value_aliases()> returns the new-style and can be used to convert from
4757old-style to new-style:
4758
4759 my $new_style = prop_values_aliases("block", $old_style);
4760
4761Perl also has single-form extensions that refer to blocks, C<In_Cyrillic>,
4762meaning C<Block=Cyrillic>.  These have always been written in the new style.
4763
4764To convert from new-style to old-style, follow this recipe:
4765
4766 $old_style = charblock((prop_invlist("block=$new_style"))[0]);
4767
4768(which finds the range of code points in the block using C<prop_invlist>,
4769gets the lower end of the range (0th element) and then looks up the old name
4770for its block using C<charblock>).
4771
4772Note that starting in Unicode 6.1, many of the block names have shorter
4773synonyms.  These are always given in the new style.
4774
4775=head2 Use with older Unicode versions
4776
4777The functions in this module work as well as can be expected when
4778used on earlier Unicode versions.  But, obviously, they use the available data
4779from that Unicode version.  For example, if the Unicode version predates the
4780definition of the script property (Unicode 3.1), then any function that deals
4781with scripts is going to return C<undef> for the script portion of the return
4782value.
4783
4784=head1 AUTHOR
4785
4786Jarkko Hietaniemi.  Now maintained by perl5 porters.
4787
4788=cut
4789
47901;
4791