1package Unicode::UCD;
2
3use strict;
4use warnings;
5no warnings 'surrogate';    # surrogates can be inputs to this
6use charnames ();
7
8our $VERSION = '0.78';
9
10sub DEBUG () { 0 }
11$|=1 if DEBUG;
12
13require Exporter;
14
15our @ISA = qw(Exporter);
16
17our @EXPORT_OK = qw(charinfo
18		    charblock charscript
19		    charblocks charscripts
20		    charinrange
21		    charprop
22		    charprops_all
23		    general_categories bidi_types
24		    compexcl
25		    casefold all_casefolds casespec
26		    namedseq
27                    num
28                    prop_aliases
29                    prop_value_aliases
30                    prop_values
31                    prop_invlist
32                    prop_invmap
33                    search_invlist
34                    MAX_CP
35                );
36
37use Carp;
38
39sub IS_ASCII_PLATFORM { ord("A") == 65 }
40
41=head1 NAME
42
43Unicode::UCD - Unicode character database
44
45=head1 SYNOPSIS
46
47    use Unicode::UCD 'charinfo';
48    my $charinfo   = charinfo($codepoint);
49
50    use Unicode::UCD 'charprop';
51    my $value  = charprop($codepoint, $property);
52
53    use Unicode::UCD 'charprops_all';
54    my $all_values_hash_ref = charprops_all($codepoint);
55
56    use Unicode::UCD 'casefold';
57    my $casefold = casefold($codepoint);
58
59    use Unicode::UCD 'all_casefolds';
60    my $all_casefolds_ref = all_casefolds();
61
62    use Unicode::UCD 'casespec';
63    my $casespec = casespec($codepoint);
64
65    use Unicode::UCD 'charblock';
66    my $charblock  = charblock($codepoint);
67
68    use Unicode::UCD 'charscript';
69    my $charscript = charscript($codepoint);
70
71    use Unicode::UCD 'charblocks';
72    my $charblocks = charblocks();
73
74    use Unicode::UCD 'charscripts';
75    my $charscripts = charscripts();
76
77    use Unicode::UCD qw(charscript charinrange);
78    my $range = charscript($script);
79    print "looks like $script\n" if charinrange($range, $codepoint);
80
81    use Unicode::UCD qw(general_categories bidi_types);
82    my $categories = general_categories();
83    my $types = bidi_types();
84
85    use Unicode::UCD 'prop_aliases';
86    my @space_names = prop_aliases("space");
87
88    use Unicode::UCD 'prop_value_aliases';
89    my @gc_punct_names = prop_value_aliases("Gc", "Punct");
90
91    use Unicode::UCD 'prop_values';
92    my @all_EA_short_names = prop_values("East_Asian_Width");
93
94    use Unicode::UCD 'prop_invlist';
95    my @puncts = prop_invlist("gc=punctuation");
96
97    use Unicode::UCD 'prop_invmap';
98    my ($list_ref, $map_ref, $format, $missing)
99                                      = prop_invmap("General Category");
100
101    use Unicode::UCD 'search_invlist';
102    my $index = search_invlist(\@invlist, $code_point);
103
104    # The following function should be used only internally in
105    # implementations of the Unicode Normalization Algorithm, and there
106    # are better choices than it.
107    use Unicode::UCD 'compexcl';
108    my $compexcl = compexcl($codepoint);
109
110    use Unicode::UCD 'namedseq';
111    my $namedseq = namedseq($named_sequence_name);
112
113    my $unicode_version = Unicode::UCD::UnicodeVersion();
114
115    my $convert_to_numeric =
116              Unicode::UCD::num("\N{RUMI DIGIT ONE}\N{RUMI DIGIT TWO}");
117
118=head1 DESCRIPTION
119
120The Unicode::UCD module offers a series of functions that
121provide a simple interface to the Unicode
122Character Database.
123
124=head2 code point argument
125
126Some of the functions are called with a I<code point argument>, which is either
127a decimal or a hexadecimal scalar designating a code point in the platform's
128native character set (extended to Unicode), or a string containing C<U+>
129followed by hexadecimals
130designating a Unicode code point.  A leading 0 will force a hexadecimal
131interpretation, as will a hexadecimal digit that isn't a decimal digit.
132
133Examples:
134
135    223     # Decimal 223 in native character set
136    0223    # Hexadecimal 223, native (= 547 decimal)
137    0xDF    # Hexadecimal DF, native (= 223 decimal)
138    '0xDF'  # String form of hexadecimal (= 223 decimal)
139    'U+DF'  # Hexadecimal DF, in Unicode's character set
140                              (= LATIN SMALL LETTER SHARP S)
141
142Note that the largest code point in Unicode is U+10FFFF.
143
144=cut
145
146our %caseless_equivalent;
147our $e_precision;
148our %file_to_swash_name;
149our @inline_definitions;
150our %loose_property_name_of;
151our %loose_property_to_file_of;
152our %loose_to_file_of;
153our $MAX_CP;
154our %nv_floating_to_rational;
155our %prop_aliases;
156our %stricter_to_file_of;
157our %strict_property_to_file_of;
158our %SwashInfo;
159our %why_deprecated;
160
161my $v_unicode_version;  # v-string.
162
163sub openunicode {
164    my (@path) = @_;
165    my $rfh;
166    for my $d (@INC) {
167        use File::Spec;
168        my $f = File::Spec->catfile($d, "unicore", @path);
169        return $rfh if open($rfh, '<', $f);
170    }
171    croak __PACKAGE__, ": failed to find ",
172        File::Spec->catfile("unicore", @path), " in @INC";
173}
174
175sub _dclone ($) {   # Use Storable::dclone if available; otherwise emulate it.
176
177    use if defined &DynaLoader::boot_DynaLoader, Storable => qw(dclone);
178
179    return dclone(shift) if defined &dclone;
180
181    my $arg = shift;
182    my $type = ref $arg;
183    return $arg unless $type;   # No deep cloning needed for scalars
184
185    if ($type eq 'ARRAY') {
186        my @return;
187        foreach my $element (@$arg) {
188            push @return, &_dclone($element);
189        }
190        return \@return;
191    }
192    elsif ($type eq 'HASH') {
193        my %return;
194        foreach my $key (keys %$arg) {
195            $return{$key} = &_dclone($arg->{$key});
196        }
197        return \%return;
198    }
199    else {
200        croak "_dclone can't handle " . $type;
201    }
202}
203
204=head2 B<charinfo()>
205
206    use Unicode::UCD 'charinfo';
207
208    my $charinfo = charinfo(0x41);
209
210This returns information about the input L</code point argument>
211as a reference to a hash of fields as defined by the Unicode
212standard.  If the L</code point argument> is not assigned in the standard
213(i.e., has the general category C<Cn> meaning C<Unassigned>)
214or is a non-character (meaning it is guaranteed to never be assigned in
215the standard),
216C<undef> is returned.
217
218Fields that aren't applicable to the particular code point argument exist in the
219returned hash, and are empty.
220
221For results that are less "raw" than this function returns, or to get the values for
222any property, not just the few covered by this function, use the
223L</charprop()> function.
224
225The keys in the hash with the meanings of their values are:
226
227=over
228
229=item B<code>
230
231the input native L</code point argument> expressed in hexadecimal, with
232leading zeros
233added if necessary to make it contain at least four hexdigits
234
235=item B<name>
236
237name of I<code>, all IN UPPER CASE.
238Some control-type code points do not have names.
239This field will be empty for C<Surrogate> and C<Private Use> code points,
240and for the others without a name,
241it will contain a description enclosed in angle brackets, like
242C<E<lt>controlE<gt>>.
243
244
245=item B<category>
246
247The short name of the general category of I<code>.
248This will match one of the keys in the hash returned by L</general_categories()>.
249
250The L</prop_value_aliases()> function can be used to get all the synonyms
251of the category name.
252
253=item B<combining>
254
255the combining class number for I<code> used in the Canonical Ordering Algorithm.
256For Unicode 5.1, this is described in Section 3.11 C<Canonical Ordering Behavior>
257available at
258L<http://www.unicode.org/versions/Unicode5.1.0/>
259
260The L</prop_value_aliases()> function can be used to get all the synonyms
261of the combining class number.
262
263=item B<bidi>
264
265bidirectional type of I<code>.
266This will match one of the keys in the hash returned by L</bidi_types()>.
267
268The L</prop_value_aliases()> function can be used to get all the synonyms
269of the bidi type name.
270
271=item B<decomposition>
272
273is empty if I<code> has no decomposition; or is one or more codes
274(separated by spaces) that, taken in order, represent a decomposition for
275I<code>.  Each has at least four hexdigits.
276The codes may be preceded by a word enclosed in angle brackets, then a space,
277like C<E<lt>compatE<gt> >, giving the type of decomposition
278
279This decomposition may be an intermediate one whose components are also
280decomposable.  Use L<Unicode::Normalize> to get the final decomposition in one
281step.
282
283=item B<decimal>
284
285if I<code> represents a decimal digit this is its integer numeric value
286
287=item B<digit>
288
289if I<code> represents some other digit-like number, this is its integer
290numeric value
291
292=item B<numeric>
293
294if I<code> represents a whole or rational number, this is its numeric value.
295Rational values are expressed as a string like C<1/4>.
296
297=item B<mirrored>
298
299C<Y> or C<N> designating if I<code> is mirrored in bidirectional text
300
301=item B<unicode10>
302
303name of I<code> in the Unicode 1.0 standard if one
304existed for this code point and is different from the current name
305
306=item B<comment>
307
308As of Unicode 6.0, this is always empty.
309
310=item B<upper>
311
312is, if non-empty, the uppercase mapping for I<code> expressed as at least four
313hexdigits.  This indicates that the full uppercase mapping is a single
314character, and is identical to the simple (single-character only) mapping.
315When this field is empty, it means that the simple uppercase mapping is
316I<code> itself; you'll need some other means, (like L</charprop()> or
317L</casespec()> to get the full mapping.
318
319=item B<lower>
320
321is, if non-empty, the lowercase mapping for I<code> expressed as at least four
322hexdigits.  This indicates that the full lowercase mapping is a single
323character, and is identical to the simple (single-character only) mapping.
324When this field is empty, it means that the simple lowercase mapping is
325I<code> itself; you'll need some other means, (like L</charprop()> or
326L</casespec()> to get the full mapping.
327
328=item B<title>
329
330is, if non-empty, the titlecase mapping for I<code> expressed as at least four
331hexdigits.  This indicates that the full titlecase mapping is a single
332character, and is identical to the simple (single-character only) mapping.
333When this field is empty, it means that the simple titlecase mapping is
334I<code> itself; you'll need some other means, (like L</charprop()> or
335L</casespec()> to get the full mapping.
336
337=item B<block>
338
339the block I<code> belongs to (used in C<\p{Blk=...}>).
340The L</prop_value_aliases()> function can be used to get all the synonyms
341of the block name.
342
343See L</Blocks versus Scripts>.
344
345=item B<script>
346
347the script I<code> belongs to.
348The L</prop_value_aliases()> function can be used to get all the synonyms
349of the script name.  Note that this is the older "Script" property value, and
350not the improved "Script_Extensions" value.
351
352See L</Blocks versus Scripts>.
353
354=back
355
356Note that you cannot do (de)composition and casing based solely on the
357I<decomposition>, I<combining>, I<lower>, I<upper>, and I<title> fields; you
358will need also the L</casespec()> function and the C<Composition_Exclusion>
359property.  (Or you could just use the L<lc()|perlfunc/lc>,
360L<uc()|perlfunc/uc>, and L<ucfirst()|perlfunc/ucfirst> functions, and the
361L<Unicode::Normalize> module.)
362
363=cut
364
365my %Cache;
366
367# Digits may be separated by a single underscore
368my $digits = qr/ ( [0-9] _? )+ (?!:_) /x;
369
370# A sign can be surrounded by white space
371my $sign = qr/ \s* [+-]? \s* /x;
372
373my $f_float = qr/  $sign $digits+ \. $digits*    # e.g., 5.0, 5.
374                 | $sign $digits* \. $digits+/x; # 0.7, .7
375
376# A number may be an integer, a rational, or a float with an optional exponent
377# We (shudder) accept a signed denominator
378my $number = qr{  ^ $sign $digits+ $
379                | ^ $sign $digits+ \/ $sign $digits+ $
380                | ^ $f_float (?: [Ee] [+-]? $digits )? $}x;
381
382sub loose_name ($) {
383    # Given a lowercase property or property-value name, return its
384    # standardized version that is expected for look-up in the 'loose' hashes
385    # in UCD.pl (hence, this depends on what mktables does).  This squeezes
386    # out blanks, underscores and dashes.  The complication stems from the
387    # grandfathered-in 'L_', which retains a single trailing underscore.
388
389# integer or float (no exponent)
390my $integer_or_float_re = qr/ ^ -? \d+ (:? \. \d+ )? $ /x;
391
392# Also includes rationals
393my $numeric_re = qr! $integer_or_float_re | ^ -? \d+ / \d+ $ !x;
394    return $_[0] if $_[0] =~ $numeric_re;
395
396    (my $loose = $_[0]) =~ s/[-_ \t]//g;
397
398    return $loose if $loose !~ / ^ (?: is | to )? l $/x;
399    return 'l_' if $_[0] =~ / l .* _ /x;    # If original had a trailing '_'
400    return $loose;
401}
402
403##
404## "SWASH" == "SWATCH HASH". A "swatch" is a swatch of the Unicode landscape.
405## It's a data structure that encodes a set of Unicode characters.
406##
407
408{
409    use re "/aa";  # Nothing here uses above Latin1.
410
411    # If a floating point number is within this distance from the value of a
412    # fraction, it is considered to be that fraction, even if many more digits
413    # are specified that don't exactly match.
414    my $min_floating_slop;
415
416    # To guard against this program calling something that in turn ends up
417    # calling this program with the same inputs, and hence infinitely
418    # recursing, we keep a stack of the properties that are currently in
419    # progress, pushed upon entry, popped upon return.
420    my @recursed;
421
422    sub SWASHNEW {
423        my ($class, $type, $list, $minbits) = @_;
424        my $user_defined = 0;
425        local $^D = 0 if $^D;
426
427        $class = "" unless defined $class;
428        print STDERR __LINE__, ": class=$class, type=$type, list=",
429                                (defined $list) ? $list : ':undef:',
430                                ", minbits=$minbits\n" if DEBUG;
431
432        ##
433        ## Get the list of codepoints for the type.
434        ## Called from swash_init (see utf8.c) or SWASHNEW itself.
435        ##
436        ## Callers of swash_init:
437        ##     prop_invlist
438        ##     Unicode::UCD::prop_invmap
439        ##
440        ## Given a $type, our goal is to fill $list with the set of codepoint
441        ## ranges. If $type is false, $list passed is used.
442        ##
443        ## $minbits:
444        ##     For binary properties, $minbits must be 1.
445        ##     For character mappings (case and transliteration), $minbits must
446        ##     be a number except 1.
447        ##
448        ## $list (or that filled according to $type):
449        ##     Refer to perlunicode.pod, "User-Defined Character Properties."
450        ##
451        ##     For binary properties, only characters with the property value
452        ##     of True should be listed. The 3rd column, if any, will be ignored
453        ##
454        ## To make the parsing of $type clear, this code takes the a rather
455        ## unorthodox approach of last'ing out of the block once we have the
456        ## info we need. Were this to be a subroutine, the 'last' would just
457        ## be a 'return'.
458        ##
459        #   If a problem is found $type is returned;
460        #   Upon success, a new (or cached) blessed object is returned with
461        #   keys TYPE, BITS, EXTRAS, LIST, and with values having the
462        #   same meanings as the input parameters.
463        #   SPECIALS contains a reference to any special-treatment hash in the
464        #       property.
465        #   INVERT_IT is non-zero if the result should be inverted before use
466        #   USER_DEFINED is non-zero if the result came from a user-defined
467        my $file; ## file to load data from, and also part of the %Cache key.
468
469        # Change this to get a different set of Unicode tables
470        my $unicore_dir = 'unicore';
471        my $invert_it = 0;
472        my $list_is_from_mktables = 0;  # Is $list returned from a mktables
473                                        # generated file?  If so, we know it's
474                                        # well behaved.
475
476        if ($type)
477        {
478            # Verify that this isn't a recursive call for this property.
479            # Can't use croak, as it may try to recurse to here itself.
480            my $class_type = $class . "::$type";
481            if (grep { $_ eq $class_type } @recursed) {
482                CORE::die "panic: Infinite recursion in SWASHNEW for '$type'\n";
483            }
484            push @recursed, $class_type;
485
486            $type =~ s/^\s+//;
487            $type =~ s/\s+$//;
488
489            # regcomp.c surrounds the property name with '__" and '_i' if this
490            # is to be caseless matching.
491            my $caseless = $type =~ s/^(.*)__(.*)_i$/$1$2/;
492
493            print STDERR __LINE__, ": type=$type, caseless=$caseless\n" if DEBUG;
494
495        GETFILE:
496            {
497                ##
498                ## It could be a user-defined property.  Look in current
499                ## package if no package given
500                ##
501
502
503                my $caller0 = caller(0);
504                my $caller1 = $type =~ s/(.+):://
505                              ? $1
506                              : $caller0 eq 'main'
507                                ? 'main'
508                                : caller(1);
509
510                if (defined $caller1 && $type =~ /^I[ns]\w+$/) {
511                    my $prop = "${caller1}::$type";
512                    if (exists &{$prop}) {
513                        # stolen from Scalar::Util::PP::tainted()
514                        my $tainted;
515                        {
516                            local($@, $SIG{__DIE__}, $SIG{__WARN__});
517                            local $^W = 0;
518                            no warnings;
519                            eval { kill 0 * $prop };
520                            $tainted = 1 if $@ =~ /^Insecure/;
521                        }
522                        die "Insecure user-defined property \\p{$prop}\n"
523                            if $tainted;
524                        no strict 'refs';
525                        $list = &{$prop}($caseless);
526                        $user_defined = 1;
527                        last GETFILE;
528                    }
529                }
530
531                require "$unicore_dir/UCD.pl";
532
533                # All property names are matched caselessly
534                my $property_and_table = CORE::lc $type;
535                print STDERR __LINE__, ": $property_and_table\n" if DEBUG;
536
537                # See if is of the compound form 'property=value', where the
538                # value indicates the table we should use.
539                my ($property, $table, @remainder) =
540                                    split /\s*[:=]\s*/, $property_and_table, -1;
541                if (@remainder) {
542                    pop @recursed if @recursed;
543                    return $type;
544                }
545
546                my $prefix;
547                if (! defined $table) {
548
549                    # Here, is the single form.  The property becomes empty, and
550                    # the whole value is the table.
551                    $table = $property;
552                    $prefix = $property = "";
553                } else {
554                    print STDERR __LINE__, ": $property\n" if DEBUG;
555
556                    # Here it is the compound property=table form.  The property
557                    # name is always loosely matched, and always can have an
558                    # optional 'is' prefix (which isn't true in the single
559                    # form).
560                    $property = loose_name($property) =~ s/^is//r;
561
562                    # And convert to canonical form.  Quit if not valid.
563                    $property = $loose_property_name_of{$property};
564                    if (! defined $property) {
565                        pop @recursed if @recursed;
566                        return $type;
567                    }
568
569                    $prefix = "$property=";
570
571                    # If the rhs looks like it is a number...
572                    print STDERR __LINE__, ": table=$table\n" if DEBUG;
573
574                    if ($table =~ $number) {
575                        print STDERR __LINE__, ": table=$table\n" if DEBUG;
576
577                        # Split on slash, in case it is a rational, like \p{1/5}
578                        my @parts = split m{ \s* / \s* }x, $table, -1;
579                        print __LINE__, ": $type\n" if @parts > 2 && DEBUG;
580
581                        foreach my $part (@parts) {
582                            print __LINE__, ": part=$part\n" if DEBUG;
583
584                            $part =~ s/^\+\s*//;    # Remove leading plus
585                            $part =~ s/^-\s*/-/;    # Remove blanks after unary
586                                                    # minus
587
588                            # Remove underscores between digits.
589                            $part =~ s/(?<= [0-9] ) _ (?= [0-9] ) //xg;
590
591                            # No leading zeros (but don't make a single '0'
592                            # into a null string)
593                            $part =~ s/ ^ ( -? ) 0+ /$1/x;
594                            $part .= '0' if $part eq '-' || $part eq "";
595
596                            # No trailing zeros after a decimal point
597                            $part =~ s/ ( \. [0-9]*? ) 0+ $ /$1/x;
598
599                            # Begin with a 0 if a leading decimal point
600                            $part =~ s/ ^ ( -? ) \. /${1}0./x;
601
602                            # Ensure not a trailing decimal point: turn into an
603                            # integer
604                            $part =~ s/ \. $ //x;
605
606                            print STDERR __LINE__, ": part=$part\n" if DEBUG;
607                            #return $type if $part eq "";
608                        }
609
610                        #  If a rational...
611                        if (@parts == 2) {
612
613                            # If denominator is negative, get rid of it, and ...
614                            if ($parts[1] =~ s/^-//) {
615
616                                # If numerator is also negative, convert the
617                                # whole thing to positive, else move the minus
618                                # to the numerator
619                                if ($parts[0] !~ s/^-//) {
620                                    $parts[0] = '-' . $parts[0];
621                                }
622                            }
623                            $table = join '/', @parts;
624                        }
625                        elsif ($property ne 'nv' || $parts[0] !~ /\./) {
626
627                            # Here is not numeric value, or doesn't have a
628                            # decimal point.  No further manipulation is
629                            # necessary.  (Note the hard-coded property name.
630                            # This could fail if other properties eventually
631                            # had fractions as well; perhaps the cjk ones
632                            # could evolve to do that.  This hard-coding could
633                            # be fixed by mktables generating a list of
634                            # properties that could have fractions.)
635                            $table = $parts[0];
636                        } else {
637
638                            # Here is a floating point numeric_value.  Convert
639                            # to rational.  Get a normalized form, like
640                            # 5.00E-01, and look that up in the hash
641
642                            my $float = sprintf "%.*e",
643                                                $e_precision,
644                                                0 + $parts[0];
645
646                            if (exists $nv_floating_to_rational{$float}) {
647                                $table = $nv_floating_to_rational{$float};
648                            } else {
649                                pop @recursed if @recursed;
650                                return $type;
651                            }
652                        }
653                        print STDERR __LINE__, ": $property=$table\n" if DEBUG;
654                    }
655                }
656
657                # Combine lhs (if any) and rhs to get something that matches
658                # the syntax of the lookups.
659                $property_and_table = "$prefix$table";
660                print STDERR __LINE__, ": $property_and_table\n" if DEBUG;
661
662                # First try stricter matching.
663                $file = $stricter_to_file_of{$property_and_table};
664
665                # If didn't find it, try again with looser matching by editing
666                # out the applicable characters on the rhs and looking up
667                # again.
668                my $strict_property_and_table;
669                if (! defined $file) {
670
671                    # This isn't used unless the name begins with 'to'
672                    $strict_property_and_table = $property_and_table =~  s/^to//r;
673                    $table = loose_name($table);
674                    $property_and_table = "$prefix$table";
675                    print STDERR __LINE__, ": $property_and_table\n" if DEBUG;
676                    $file = $loose_to_file_of{$property_and_table};
677                    print STDERR __LINE__, ": $property_and_table\n" if DEBUG;
678                }
679
680                # Add the constant and go fetch it in.
681                if (defined $file) {
682
683                    # If the file name contains a !, it means to invert.  The
684                    # 0+ makes sure result is numeric
685                    $invert_it = 0 + $file =~ s/!//;
686
687                    if ($caseless
688                        && exists $caseless_equivalent{$property_and_table})
689                    {
690                        $file = $caseless_equivalent{$property_and_table};
691                    }
692
693                    # The pseudo-directory '#' means that there really isn't a
694                    # file to read, the data is in-line as part of the string;
695                    # we extract it below.
696                    $file = "$unicore_dir/lib/$file.pl" unless $file =~ m!^#/!;
697                    last GETFILE;
698                }
699                print STDERR __LINE__, ": didn't find $property_and_table\n" if DEBUG;
700
701                ##
702                ## Last attempt -- see if it's a standard "To" name
703                ## (e.g. "ToLower")  ToTitle is used by ucfirst().
704                ## The user-level way to access ToDigit() and ToFold()
705                ## is to use Unicode::UCD.
706                ##
707                # Only check if caller wants non-binary
708                if ($minbits != 1) {
709                    if ($property_and_table =~ s/^to//) {
710                    # Look input up in list of properties for which we have
711                    # mapping files.  First do it with the strict approach
712                        if (defined ($file = $strict_property_to_file_of{
713                                                    $strict_property_and_table}))
714                        {
715                            $type = $file_to_swash_name{$file};
716                            print STDERR __LINE__, ": type set to $type\n"
717                                                                        if DEBUG;
718                            $file = "$unicore_dir/$file.pl";
719                            last GETFILE;
720                        }
721                        elsif (defined ($file =
722                          $loose_property_to_file_of{$property_and_table}))
723                        {
724                            $type = $file_to_swash_name{$file};
725                            print STDERR __LINE__, ": type set to $type\n"
726                                                                        if DEBUG;
727                            $file = "$unicore_dir/$file.pl";
728                            last GETFILE;
729                        }   # If that fails see if there is a corresponding binary
730                            # property file
731                        elsif (defined ($file =
732                                    $loose_to_file_of{$property_and_table}))
733                        {
734
735                            # Here, there is no map file for the property we
736                            # are trying to get the map of, but this is a
737                            # binary property, and there is a file for it that
738                            # can easily be translated to a mapping, so use
739                            # that, treating this as a binary property.
740                            # Setting 'minbits' here causes it to be stored as
741                            # such in the cache, so if someone comes along
742                            # later looking for just a binary, they get it.
743                            $minbits = 1;
744
745                            # The 0+ makes sure is numeric
746                            $invert_it = 0 + $file =~ s/!//;
747                            $file = "$unicore_dir/lib/$file.pl"
748                                                         unless $file =~ m!^#/!;
749                            last GETFILE;
750                        }
751                    }
752                }
753
754                ##
755                ## If we reach this line, it's because we couldn't figure
756                ## out what to do with $type. Ouch.
757                ##
758
759                pop @recursed if @recursed;
760                return $type;
761            } # end of GETFILE block
762
763            if (defined $file) {
764                print STDERR __LINE__, ": found it (file='$file')\n" if DEBUG;
765
766                ##
767                ## If we reach here, it was due to a 'last GETFILE' above
768                ## (exception: user-defined properties and mappings), so we
769                ## have a filename, so now we load it if we haven't already.
770
771                # The pseudo-directory '#' means the result isn't really a
772                # file, but is in-line, with semi-colons to be turned into
773                # new-lines.  Since it is in-line there is no advantage to
774                # caching the result
775                if ($file =~ s!^#/!!) {
776                    $list = $inline_definitions[$file];
777                }
778                else {
779                    # Here, we have an actual file to read in and load, but it
780                    # may already have been read-in and cached.  The cache key
781                    # is the class and file to load, and whether the results
782                    # need to be inverted.
783                    my $found = $Cache{$class, $file, $invert_it};
784                    if ($found and ref($found) eq $class) {
785                        print STDERR __LINE__, ": Returning cached swash for '$class,$file,$invert_it' for \\p{$type}\n" if DEBUG;
786                        pop @recursed if @recursed;
787                        return $found;
788                    }
789
790                    local $@;
791                    local $!;
792                    $list = do $file; die $@ if $@;
793                }
794
795                $list_is_from_mktables = 1;
796            }
797        } # End of $type is non-null
798
799        # Here, either $type was null, or we found the requested property and
800        # read it into $list
801
802        my $extras = "";
803
804        my $bits = $minbits;
805
806        # mktables lists don't have extras, like '&prop', so don't need
807        # to separate them; also lists are already sorted, so don't need to do
808        # that.
809        if ($list && ! $list_is_from_mktables) {
810            my $taint = substr($list,0,0); # maintain taint
811
812            # Separate the extras from the code point list, and make sure
813            # user-defined properties are well-behaved for
814            # downstream code.
815            if ($user_defined) {
816                my @tmp = split(/^/m, $list);
817                my %seen;
818                no warnings;
819
820                # The extras are anything that doesn't begin with a hex digit.
821                $extras = join '', $taint, grep /^[^0-9a-fA-F]/, @tmp;
822
823                # Remove the extras, and sort the remaining entries by the
824                # numeric value of their beginning hex digits, removing any
825                # duplicates.
826                $list = join '', $taint,
827                        map  { $_->[1] }
828                        sort { $a->[0] <=> $b->[0] }
829                        map  { /^([0-9a-fA-F]+)/ && !$seen{$1}++ ? [ CORE::hex($1), $_ ] : () }
830                        @tmp; # XXX doesn't do ranges right
831            }
832            else {
833                # mktables has gone to some trouble to make non-user defined
834                # properties well-behaved, so we can skip the effort we do for
835                # user-defined ones.  Any extras are at the very beginning of
836                # the string.
837
838                # This regex splits out the first lines of $list into $1 and
839                # strips them off from $list, until we get one that begins
840                # with a hex number, alone on the line, or followed by a tab.
841                # Either portion may be empty.
842                $list =~ s/ \A ( .*? )
843                            (?: \z | (?= ^ [0-9a-fA-F]+ (?: \t | $) ) )
844                          //msx;
845
846                $extras = "$taint$1";
847            }
848        }
849
850        if ($minbits != 1 && $minbits < 32) { # not binary property
851            my $top = 0;
852            while ($list =~ /^([0-9a-fA-F]+)(?:[\t]([0-9a-fA-F]+)?)(?:[ \t]([0-9a-fA-F]+))?/mg) {
853                my $min = CORE::hex $1;
854                my $max = defined $2 ? CORE::hex $2 : $min;
855                my $val = defined $3 ? CORE::hex $3 : 0;
856                $val += $max - $min if defined $3;
857                $top = $val if $val > $top;
858            }
859            my $topbits =
860                $top > 0xffff ? 32 :
861                $top > 0xff ? 16 : 8;
862            $bits = $topbits if $bits < $topbits;
863        }
864
865        my @extras;
866        if ($extras) {
867            for my $x ($extras) {
868                my $taint = substr($x,0,0); # maintain taint
869                pos $x = 0;
870                while ($x =~ /^([^0-9a-fA-F\n])(.*)/mg) {
871                    my $char = "$1$taint";
872                    my $name = "$2$taint";
873                    print STDERR __LINE__, ": char [$char] => name [$name]\n"
874                        if DEBUG;
875                    if ($char =~ /[-+!&]/) {
876                        my ($c,$t) = split(/::/, $name, 2);	# bogus use of ::, really
877                        my $subobj;
878                        if ($c eq 'utf8') { # khw is unsure of this
879                            $subobj = SWASHNEW($t, "", $minbits, 0);
880                        }
881                        elsif (exists &$name) {
882                            $subobj = SWASHNEW($name, "", $minbits, 0);
883                        }
884                        elsif ($c =~ /^([0-9a-fA-F]+)/) {
885                            $subobj = SWASHNEW("", $c, $minbits, 0);
886                        }
887                        print STDERR __LINE__, ": returned from getting sub object for $name\n" if DEBUG;
888                        if (! ref $subobj) {
889                            pop @recursed if @recursed && $type;
890                            return $subobj;
891                        }
892                        push @extras, $name => $subobj;
893                        $bits = $subobj->{BITS} if $bits < $subobj->{BITS};
894                        $user_defined = $subobj->{USER_DEFINED}
895                                              if $subobj->{USER_DEFINED};
896                    }
897                }
898            }
899        }
900
901        if (DEBUG) {
902            print STDERR __LINE__, ": CLASS = $class, TYPE => $type, BITS => $bits, INVERT_IT => $invert_it, USER_DEFINED => $user_defined";
903            print STDERR "\nLIST =>\n$list" if defined $list;
904            print STDERR "\nEXTRAS =>\n$extras" if defined $extras;
905            print STDERR "\n";
906        }
907
908        my $SWASH = bless {
909            TYPE => $type,
910            BITS => $bits,
911            EXTRAS => $extras,
912            LIST => $list,
913            USER_DEFINED => $user_defined,
914            @extras,
915        } => $class;
916
917        if ($file) {
918            $Cache{$class, $file, $invert_it} = $SWASH;
919            if ($type
920                && exists $SwashInfo{$type}
921                && exists $SwashInfo{$type}{'specials_name'})
922            {
923                my $specials_name = $SwashInfo{$type}{'specials_name'};
924                no strict "refs";
925                print STDERR "\nspecials_name => $specials_name\n" if DEBUG;
926                $SWASH->{'SPECIALS'} = \%$specials_name;
927            }
928            $SWASH->{'INVERT_IT'} = $invert_it;
929        }
930
931        pop @recursed if @recursed && $type;
932
933        return $SWASH;
934    }
935}
936
937# NB: This function is nearly duplicated in charnames.pm
938sub _getcode {
939    my $arg = shift;
940
941    if ($arg =~ /^[1-9]\d*$/) {
942	return $arg;
943    }
944    elsif ($arg =~ /^(?:0[xX])?([[:xdigit:]]+)$/) {
945	return CORE::hex($1);
946    }
947    elsif ($arg =~ /^[Uu]\+([[:xdigit:]]+)$/) { # Is of form U+0000, means
948                                                # wants the Unicode code
949                                                # point, not the native one
950        my $decimal = CORE::hex($1);
951        return $decimal if IS_ASCII_PLATFORM;
952        return utf8::unicode_to_native($decimal);
953    }
954
955    return;
956}
957
958# Populated by _num.  Converts real number back to input rational
959my %real_to_rational;
960
961# To store the contents of files found on disk.
962my @BIDIS;
963my @CATEGORIES;
964my @DECOMPOSITIONS;
965my @NUMERIC_TYPES;
966my %SIMPLE_LOWER;
967my %SIMPLE_TITLE;
968my %SIMPLE_UPPER;
969my %UNICODE_1_NAMES;
970my %ISO_COMMENT;
971
972# Eval'd so can run on versions earlier than the property is available in
973my $Hangul_Syllables_re = eval 'qr/\p{Block=Hangul_Syllables}/';
974
975sub charinfo {
976
977    # This function has traditionally mimicked what is in UnicodeData.txt,
978    # warts and all.  This is a re-write that avoids UnicodeData.txt so that
979    # it can be removed to save disk space.  Instead, this assembles
980    # information gotten by other methods that get data from various other
981    # files.  It uses charnames to get the character name; and various
982    # mktables tables.
983
984    use feature 'unicode_strings';
985
986    # Will fail if called under minitest
987    use if defined &DynaLoader::boot_DynaLoader, "Unicode::Normalize" => qw(getCombinClass NFD);
988
989    my $arg  = shift;
990    my $code = _getcode($arg);
991    croak __PACKAGE__, "::charinfo: unknown code '$arg'" unless defined $code;
992
993    # Non-unicode implies undef.
994    return if $code > 0x10FFFF;
995
996    my %prop;
997    my $char = chr($code);
998
999    @CATEGORIES =_read_table("To/Gc.pl") unless @CATEGORIES;
1000    $prop{'category'} = _search(\@CATEGORIES, 0, $#CATEGORIES, $code)
1001                        // $SwashInfo{'ToGc'}{'missing'};
1002    # Return undef if category value is 'Unassigned' or one of its synonyms
1003    return if grep { lc $_ eq 'unassigned' }
1004                                    prop_value_aliases('Gc', $prop{'category'});
1005
1006    $prop{'code'} = sprintf "%04X", $code;
1007    $prop{'name'} = ($char =~ /\p{Cntrl}/) ? '<control>'
1008                                           : (charnames::viacode($code) // "");
1009
1010    $prop{'combining'} = getCombinClass($code);
1011
1012    @BIDIS =_read_table("To/Bc.pl") unless @BIDIS;
1013    $prop{'bidi'} = _search(\@BIDIS, 0, $#BIDIS, $code)
1014                    // $SwashInfo{'ToBc'}{'missing'};
1015
1016    # For most code points, we can just read in "unicore/Decomposition.pl", as
1017    # its contents are exactly what should be output.  But that file doesn't
1018    # contain the data for the Hangul syllable decompositions, which can be
1019    # algorithmically computed, and NFD() does that, so we call NFD() for
1020    # those.  We can't use NFD() for everything, as it does a complete
1021    # recursive decomposition, and what this function has always done is to
1022    # return what's in UnicodeData.txt which doesn't show that recursiveness.
1023    # Fortunately, the NFD() of the Hanguls doesn't have any recursion
1024    # issues.
1025    # Having no decomposition implies an empty field; otherwise, all but
1026    # "Canonical" imply a compatible decomposition, and the type is prefixed
1027    # to that, as it is in UnicodeData.txt
1028    UnicodeVersion() unless defined $v_unicode_version;
1029    if ($v_unicode_version ge v2.0.0 && $char =~ $Hangul_Syllables_re) {
1030        # The code points of the decomposition are output in standard Unicode
1031        # hex format, separated by blanks.
1032        $prop{'decomposition'} = join " ", map { sprintf("%04X", $_)}
1033                                           unpack "U*", NFD($char);
1034    }
1035    else {
1036        @DECOMPOSITIONS = _read_table("Decomposition.pl")
1037                          unless @DECOMPOSITIONS;
1038        $prop{'decomposition'} = _search(\@DECOMPOSITIONS, 0, $#DECOMPOSITIONS,
1039                                                                $code) // "";
1040    }
1041
1042    # Can use num() to get the numeric values, if any.
1043    if (! defined (my $value = num($char))) {
1044        $prop{'decimal'} = $prop{'digit'} = $prop{'numeric'} = "";
1045    }
1046    else {
1047        if ($char =~ /\d/) {
1048            $prop{'decimal'} = $prop{'digit'} = $prop{'numeric'} = $value;
1049        }
1050        else {
1051
1052            # For non-decimal-digits, we have to read in the Numeric type
1053            # to distinguish them.  It is not just a matter of integer vs.
1054            # rational, as some whole number values are not considered digits,
1055            # e.g., TAMIL NUMBER TEN.
1056            $prop{'decimal'} = "";
1057
1058            @NUMERIC_TYPES =_read_table("To/Nt.pl") unless @NUMERIC_TYPES;
1059            if ((_search(\@NUMERIC_TYPES, 0, $#NUMERIC_TYPES, $code) // "")
1060                eq 'Digit')
1061            {
1062                $prop{'digit'} = $prop{'numeric'} = $value;
1063            }
1064            else {
1065                $prop{'digit'} = "";
1066                $prop{'numeric'} = $real_to_rational{$value} // $value;
1067            }
1068        }
1069    }
1070
1071    $prop{'mirrored'} = ($char =~ /\p{Bidi_Mirrored}/) ? 'Y' : 'N';
1072
1073    %UNICODE_1_NAMES =_read_table("To/Na1.pl", "use_hash") unless %UNICODE_1_NAMES;
1074    $prop{'unicode10'} = $UNICODE_1_NAMES{$code} // "";
1075
1076    UnicodeVersion() unless defined $v_unicode_version;
1077    if ($v_unicode_version ge v6.0.0) {
1078        $prop{'comment'} = "";
1079    }
1080    else {
1081        %ISO_COMMENT = _read_table("To/Isc.pl", "use_hash") unless %ISO_COMMENT;
1082        $prop{'comment'} = (defined $ISO_COMMENT{$code})
1083                           ? $ISO_COMMENT{$code}
1084                           : "";
1085    }
1086
1087    %SIMPLE_UPPER = _read_table("To/Uc.pl", "use_hash") unless %SIMPLE_UPPER;
1088    $prop{'upper'} = (defined $SIMPLE_UPPER{$code})
1089                     ? sprintf("%04X", $SIMPLE_UPPER{$code})
1090                     : "";
1091
1092    %SIMPLE_LOWER = _read_table("To/Lc.pl", "use_hash") unless %SIMPLE_LOWER;
1093    $prop{'lower'} = (defined $SIMPLE_LOWER{$code})
1094                     ? sprintf("%04X", $SIMPLE_LOWER{$code})
1095                     : "";
1096
1097    %SIMPLE_TITLE = _read_table("To/Tc.pl", "use_hash") unless %SIMPLE_TITLE;
1098    $prop{'title'} = (defined $SIMPLE_TITLE{$code})
1099                     ? sprintf("%04X", $SIMPLE_TITLE{$code})
1100                     : "";
1101
1102    $prop{block}  = charblock($code);
1103    $prop{script} = charscript($code);
1104    return \%prop;
1105}
1106
1107sub _search { # Binary search in a [[lo,hi,prop],[...],...] table.
1108    my ($table, $lo, $hi, $code) = @_;
1109
1110    return if $lo > $hi;
1111
1112    my $mid = int(($lo+$hi) / 2);
1113
1114    if ($table->[$mid]->[0] < $code) {
1115	if ($table->[$mid]->[1] >= $code) {
1116	    return $table->[$mid]->[2];
1117	} else {
1118	    _search($table, $mid + 1, $hi, $code);
1119	}
1120    } elsif ($table->[$mid]->[0] > $code) {
1121	_search($table, $lo, $mid - 1, $code);
1122    } else {
1123	return $table->[$mid]->[2];
1124    }
1125}
1126
1127sub _read_table ($;$) {
1128
1129    # Returns the contents of the mktables generated table file located at $1
1130    # in the form of either an array of arrays or a hash, depending on if the
1131    # optional second parameter is true (for hash return) or not.  In the case
1132    # of a hash return, each key is a code point, and its corresponding value
1133    # is what the table gives as the code point's corresponding value.  In the
1134    # case of an array return, each outer array denotes a range with [0] the
1135    # start point of that range; [1] the end point; and [2] the value that
1136    # every code point in the range has.  The hash return is useful for fast
1137    # lookup when the table contains only single code point ranges.  The array
1138    # return takes much less memory when there are large ranges.
1139    #
1140    # This function has the side effect of setting
1141    # $SwashInfo{$property}{'format'} to be the mktables format of the
1142    #                                       table; and
1143    # $SwashInfo{$property}{'missing'} to be the value for all entries
1144    #                                        not listed in the table.
1145    # where $property is the Unicode property name, preceded by 'To' for map
1146    # properties., e.g., 'ToSc'.
1147    #
1148    # Table entries look like one of:
1149    # 0000	0040	Common	# [65]
1150    # 00AA		Latin
1151
1152    my $table = shift;
1153    my $return_hash = shift;
1154    $return_hash = 0 unless defined $return_hash;
1155    my @return;
1156    my %return;
1157    local $_;
1158    my $list = do "unicore/$table";
1159
1160    # Look up if this property requires adjustments, which we do below if it
1161    # does.
1162    require "unicore/UCD.pl";
1163    my $property = $table =~ s/\.pl//r;
1164    $property = $file_to_swash_name{$property};
1165    my $to_adjust = defined $property
1166                    && $SwashInfo{$property}{'format'} =~ / ^ a /x;
1167
1168    for (split /^/m, $list) {
1169        my ($start, $end, $value) = / ^ (.+?) \t (.*?) \t (.+?)
1170                                        \s* ( \# .* )?  # Optional comment
1171                                        $ /x;
1172        my $decimal_start = hex $start;
1173        my $decimal_end = ($end eq "") ? $decimal_start : hex $end;
1174        $value = hex $value if $to_adjust
1175                               && $SwashInfo{$property}{'format'} eq 'ax';
1176        if ($return_hash) {
1177            foreach my $i ($decimal_start .. $decimal_end) {
1178                $return{$i} = ($to_adjust)
1179                              ? $value + $i - $decimal_start
1180                              : $value;
1181            }
1182        }
1183        elsif (! $to_adjust
1184               && @return
1185               && $return[-1][1] == $decimal_start - 1
1186               && $return[-1][2] eq $value)
1187        {
1188            # If this is merely extending the previous range, do just that.
1189            $return[-1]->[1] = $decimal_end;
1190        }
1191        else {
1192            push @return, [ $decimal_start, $decimal_end, $value ];
1193        }
1194    }
1195    return ($return_hash) ? %return : @return;
1196}
1197
1198sub charinrange {
1199    my ($range, $arg) = @_;
1200    my $code = _getcode($arg);
1201    croak __PACKAGE__, "::charinrange: unknown code '$arg'"
1202	unless defined $code;
1203    _search($range, 0, $#$range, $code);
1204}
1205
1206=head2 B<charprop()>
1207
1208    use Unicode::UCD 'charprop';
1209
1210    print charprop(0x41, "Gc"), "\n";
1211    print charprop(0x61, "General_Category"), "\n";
1212
1213  prints
1214    Lu
1215    Ll
1216
1217This returns the value of the Unicode property given by the second parameter
1218for the  L</code point argument> given by the first.
1219
1220The passed-in property may be specified as any of the synonyms returned by
1221L</prop_aliases()>.
1222
1223The return value is always a scalar, either a string or a number.  For
1224properties where there are synonyms for the values, the synonym returned by
1225this function is the longest, most descriptive form, the one returned by
1226L</prop_value_aliases()> when called in a scalar context.  Of course, you can
1227call L</prop_value_aliases()> on the result to get other synonyms.
1228
1229The return values are more "cooked" than the L</charinfo()> ones.  For
1230example, the C<"uc"> property value is the actual string containing the full
1231uppercase mapping of the input code point.  You have to go to extra trouble
1232with C<charinfo> to get this value from its C<upper> hash element when the
1233full mapping differs from the simple one.
1234
1235Special note should be made of the return values for a few properties:
1236
1237=over
1238
1239=item Block
1240
1241The value returned is the new-style (see L</Old-style versus new-style block
1242names>).
1243
1244=item Decomposition_Mapping
1245
1246Like L</charinfo()>, the result may be an intermediate decomposition whose
1247components are also decomposable.  Use L<Unicode::Normalize> to get the final
1248decomposition in one step.
1249
1250Unlike L</charinfo()>, this does not include the decomposition type.  Use the
1251C<Decomposition_Type> property to get that.
1252
1253=item Name_Alias
1254
1255If the input code point's name has more than one synonym, they are returned
1256joined into a single comma-separated string.
1257
1258=item Numeric_Value
1259
1260If the result is a fraction, it is converted into a floating point number to
1261the accuracy of your platform.
1262
1263=item Script_Extensions
1264
1265If the result is multiple script names, they are returned joined into a single
1266comma-separated string.
1267
1268=back
1269
1270When called with a property that is a Perl extension that isn't expressible in
1271a compound form, this function currently returns C<undef>, as the only two
1272possible values are I<true> or I<false> (1 or 0 I suppose).  This behavior may
1273change in the future, so don't write code that relies on it.  C<Present_In> is
1274a Perl extension that is expressible in a bipartite or compound form (for
1275example, C<\p{Present_In=4.0}>), so C<charprop> accepts it.  But C<Any> is a
1276Perl extension that isn't expressible that way, so C<charprop> returns
1277C<undef> for it.  Also C<charprop> returns C<undef> for all Perl extensions
1278that are internal-only.
1279
1280=cut
1281
1282sub charprop ($$;$) {
1283    my ($input_cp, $prop, $internal_ok) = @_;
1284
1285    my $cp = _getcode($input_cp);
1286    croak __PACKAGE__, "::charprop: unknown code point '$input_cp'" unless defined $cp;
1287
1288    my ($list_ref, $map_ref, $format, $default)
1289                                      = prop_invmap($prop, $internal_ok);
1290    return undef unless defined $list_ref;
1291
1292    my $i = search_invlist($list_ref, $cp);
1293    croak __PACKAGE__, "::charprop: prop_invmap return is invalid for charprop('$input_cp', '$prop)" unless defined $i;
1294
1295    # $i is the index into both the inversion list and map of $cp.
1296    my $map = $map_ref->[$i];
1297
1298    # Convert enumeration values to their most complete form.
1299    if (! ref $map) {
1300        my $long_form = prop_value_aliases($prop, $map);
1301        $map = $long_form if defined $long_form;
1302    }
1303
1304    if ($format =~ / ^ s /x) {  # Scalars
1305        return join ",", @$map if ref $map; # Convert to scalar with comma
1306                                            # separated array elements
1307
1308        # Resolve ambiguity as to whether an all digit value is a code point
1309        # that should be converted to a character, or whether it is really
1310        # just a number.  To do this, look at the default.  If it is a
1311        # non-empty number, we can safely assume the result is also a number.
1312        if ($map =~ / ^ \d+ $ /ax && $default !~ / ^ \d+ $ /ax) {
1313            $map = chr $map;
1314        }
1315        elsif ($map =~ / ^ (?: Y | N ) $ /x) {
1316
1317            # prop_invmap() returns these values for properties that are Perl
1318            # extensions.  But this is misleading.  For now, return undef for
1319            # these, as currently documented.
1320            undef $map unless
1321                exists $prop_aliases{loose_name(lc $prop)};
1322        }
1323        return $map;
1324    }
1325    elsif ($format eq 'ar') {   # numbers, including rationals
1326        my $offset = $cp - $list_ref->[$i];
1327        return $map if $map =~ /nan/i;
1328        return $map + $offset if $offset != 0;  # If needs adjustment
1329        return eval $map;   # Convert e.g., 1/2 to 0.5
1330    }
1331    elsif ($format =~ /^a/) {   # Some entries need adjusting
1332
1333        # Linearize sequences into a string.
1334        return join "", map { chr $_ } @$map if ref $map; # XXX && $format =~ /^ a [dl] /x;
1335
1336        return "" if $map eq "" && $format =~ /^a.*e/;
1337
1338        # These are all character mappings.  Return the chr if no adjustment
1339        # is needed
1340        return chr $cp if $map eq "0";
1341
1342        # Convert special entry.
1343        if ($map eq '<hangul syllable>' && $format eq 'ad') {
1344            use Unicode::Normalize qw(NFD);
1345            return NFD(chr $cp);
1346        }
1347
1348        # The rest need adjustment from the first entry in the inversion list
1349        # corresponding to this map.
1350        my $offset = $cp - $list_ref->[$i];
1351        return chr($map + $cp - $list_ref->[$i]);
1352    }
1353    elsif ($format eq 'n') {    # The name property
1354
1355        # There are two special cases, handled here.
1356        if ($map =~ / ( .+ ) <code\ point> $ /x) {
1357            $map = sprintf("$1%04X", $cp);
1358        }
1359        elsif ($map eq '<hangul syllable>') {
1360            $map = charnames::viacode($cp);
1361        }
1362        return $map;
1363    }
1364    else {
1365        croak __PACKAGE__, "::charprop: Internal error: unknown format '$format'.  Please perlbug this";
1366    }
1367}
1368
1369=head2 B<charprops_all()>
1370
1371    use Unicode::UCD 'charprops_all';
1372
1373    my $%properties_of_A_hash_ref = charprops_all("U+41");
1374
1375This returns a reference to a hash whose keys are all the distinct Unicode (no
1376Perl extension) properties, and whose values are the respective values for
1377those properties for the input L</code point argument>.
1378
1379Each key is the property name in its longest, most descriptive form.  The
1380values are what L</charprop()> would return.
1381
1382This function is expensive in time and memory.
1383
1384=cut
1385
1386sub charprops_all($) {
1387    my $input_cp = shift;
1388
1389    my $cp = _getcode($input_cp);
1390    croak __PACKAGE__, "::charprops_all: unknown code point '$input_cp'" unless defined $cp;
1391
1392    my %return;
1393
1394    require "unicore/UCD.pl";
1395
1396    foreach my $prop (keys %prop_aliases) {
1397
1398        # Don't return a Perl extension.  (This is the only one that
1399        # %prop_aliases has in it.)
1400        next if $prop eq 'perldecimaldigit';
1401
1402        # Use long name for $prop in the hash
1403        $return{scalar prop_aliases($prop)} = charprop($cp, $prop);
1404    }
1405
1406    return \%return;
1407}
1408
1409=head2 B<charblock()>
1410
1411    use Unicode::UCD 'charblock';
1412
1413    my $charblock = charblock(0x41);
1414    my $charblock = charblock(1234);
1415    my $charblock = charblock(0x263a);
1416    my $charblock = charblock("U+263a");
1417
1418    my $range     = charblock('Armenian');
1419
1420With a L</code point argument> C<charblock()> returns the I<block> the code point
1421belongs to, e.g.  C<Basic Latin>.  The old-style block name is returned (see
1422L</Old-style versus new-style block names>).
1423The L</prop_value_aliases()> function can be used to get all the synonyms
1424of the block name.
1425
1426If the code point is unassigned, this returns the block it would belong to if
1427it were assigned.  (If the Unicode version being used is so early as to not
1428have blocks, all code points are considered to be in C<No_Block>.)
1429
1430See also L</Blocks versus Scripts>.
1431
1432If supplied with an argument that can't be a code point, C<charblock()> tries to
1433do the opposite and interpret the argument as an old-style block name.  On an
1434ASCII platform, the return value is a I<range set> with one range: an
1435anonymous array with a single element that consists of another anonymous array
1436whose first element is the first code point in the block, and whose second
1437element is the final code point in the block.  On an EBCDIC
1438platform, the first two Unicode blocks are not contiguous.  Their range sets
1439are lists containing I<start-of-range>, I<end-of-range> code point pairs.  You
1440can test whether a code point is in a range set using the L</charinrange()>
1441function.  (To be precise, each I<range set> contains a third array element,
1442after the range boundary ones: the old_style block name.)
1443
1444If the argument to C<charblock()> is not a known block, C<undef> is
1445returned.
1446
1447=cut
1448
1449my @BLOCKS;
1450my %BLOCKS;
1451
1452sub _charblocks {
1453
1454    # Can't read from the mktables table because it loses the hyphens in the
1455    # original.
1456    unless (@BLOCKS) {
1457        UnicodeVersion() unless defined $v_unicode_version;
1458        if ($v_unicode_version lt v2.0.0) {
1459            my $subrange = [ 0, 0x10FFFF, 'No_Block' ];
1460            push @BLOCKS, $subrange;
1461            push @{$BLOCKS{'No_Block'}}, $subrange;
1462        }
1463        else {
1464            my $blocksfh = openunicode("Blocks.txt");
1465	    local $_;
1466	    local $/ = "\n";
1467	    while (<$blocksfh>) {
1468
1469                # Old versions used a different syntax to mark the range.
1470                $_ =~ s/;\s+/../ if $v_unicode_version lt v3.1.0;
1471
1472		if (/^([0-9A-F]+)\.\.([0-9A-F]+);\s+(.+)/) {
1473		    my ($lo, $hi) = (hex($1), hex($2));
1474		    my $subrange = [ $lo, $hi, $3 ];
1475		    push @BLOCKS, $subrange;
1476		    push @{$BLOCKS{$3}}, $subrange;
1477		}
1478	    }
1479            if (! IS_ASCII_PLATFORM) {
1480                # The first two blocks, through 0xFF, are wrong on EBCDIC
1481                # platforms.
1482
1483                my @new_blocks = _read_table("To/Blk.pl");
1484
1485                # Get rid of the first two ranges in the Unicode version, and
1486                # replace them with the ones computed by mktables.
1487                shift @BLOCKS;
1488                shift @BLOCKS;
1489                delete $BLOCKS{'Basic Latin'};
1490                delete $BLOCKS{'Latin-1 Supplement'};
1491
1492                # But there are multiple entries in the computed versions, and
1493                # we change their names to (which we know) to be the old-style
1494                # ones.
1495                for my $i (0.. @new_blocks - 1) {
1496                    if ($new_blocks[$i][2] =~ s/Basic_Latin/Basic Latin/
1497                        or $new_blocks[$i][2] =~
1498                                    s/Latin_1_Supplement/Latin-1 Supplement/)
1499                    {
1500                        push @{$BLOCKS{$new_blocks[$i][2]}}, $new_blocks[$i];
1501                    }
1502                    else {
1503                        splice @new_blocks, $i;
1504                        last;
1505                    }
1506                }
1507                unshift @BLOCKS, @new_blocks;
1508            }
1509	}
1510    }
1511}
1512
1513sub charblock {
1514    my $arg = shift;
1515
1516    _charblocks() unless @BLOCKS;
1517
1518    my $code = _getcode($arg);
1519
1520    if (defined $code) {
1521	my $result = _search(\@BLOCKS, 0, $#BLOCKS, $code);
1522        return $result if defined $result;
1523        return 'No_Block';
1524    }
1525    elsif (exists $BLOCKS{$arg}) {
1526        return _dclone $BLOCKS{$arg};
1527    }
1528
1529    carp __PACKAGE__, "::charblock: unknown code '$arg'";
1530    return;
1531}
1532
1533=head2 B<charscript()>
1534
1535    use Unicode::UCD 'charscript';
1536
1537    my $charscript = charscript(0x41);
1538    my $charscript = charscript(1234);
1539    my $charscript = charscript("U+263a");
1540
1541    my $range      = charscript('Thai');
1542
1543With a L</code point argument>, C<charscript()> returns the I<script> the
1544code point belongs to, e.g., C<Latin>, C<Greek>, C<Han>.
1545If the code point is unassigned or the Unicode version being used is so early
1546that it doesn't have scripts, this function returns C<"Unknown">.
1547The L</prop_value_aliases()> function can be used to get all the synonyms
1548of the script name.
1549
1550Note that the Script_Extensions property is an improved version of the Script
1551property, and you should probably be using that instead, with the
1552L</charprop()> function.
1553
1554If supplied with an argument that can't be a code point, charscript() tries
1555to do the opposite and interpret the argument as a script name. The
1556return value is a I<range set>: an anonymous array of arrays that contain
1557I<start-of-range>, I<end-of-range> code point pairs. You can test whether a
1558code point is in a range set using the L</charinrange()> function.
1559(To be precise, each I<range set> contains a third array element,
1560after the range boundary ones: the script name.)
1561
1562If the C<charscript()> argument is not a known script, C<undef> is returned.
1563
1564See also L</Blocks versus Scripts>.
1565
1566=cut
1567
1568my @SCRIPTS;
1569my %SCRIPTS;
1570
1571sub _charscripts {
1572    unless (@SCRIPTS) {
1573        UnicodeVersion() unless defined $v_unicode_version;
1574        if ($v_unicode_version lt v3.1.0) {
1575            push @SCRIPTS, [ 0, 0x10FFFF, 'Unknown' ];
1576        }
1577        else {
1578            @SCRIPTS =_read_table("To/Sc.pl");
1579        }
1580    }
1581    foreach my $entry (@SCRIPTS) {
1582        $entry->[2] =~ s/(_\w)/\L$1/g;  # Preserve old-style casing
1583        push @{$SCRIPTS{$entry->[2]}}, $entry;
1584    }
1585}
1586
1587sub charscript {
1588    my $arg = shift;
1589
1590    _charscripts() unless @SCRIPTS;
1591
1592    my $code = _getcode($arg);
1593
1594    if (defined $code) {
1595	my $result = _search(\@SCRIPTS, 0, $#SCRIPTS, $code);
1596        return $result if defined $result;
1597        return $SwashInfo{'ToSc'}{'missing'};
1598    } elsif (exists $SCRIPTS{$arg}) {
1599        return _dclone $SCRIPTS{$arg};
1600    }
1601
1602    carp __PACKAGE__, "::charscript: unknown code '$arg'";
1603    return;
1604}
1605
1606=head2 B<charblocks()>
1607
1608    use Unicode::UCD 'charblocks';
1609
1610    my $charblocks = charblocks();
1611
1612C<charblocks()> returns a reference to a hash with the known block names
1613as the keys, and the code point ranges (see L</charblock()>) as the values.
1614
1615The names are in the old-style (see L</Old-style versus new-style block
1616names>).
1617
1618L<prop_invmap("block")|/prop_invmap()> can be used to get this same data in a
1619different type of data structure.
1620
1621L<prop_values("Block")|/prop_values()> can be used to get all
1622the known new-style block names as a list, without the code point ranges.
1623
1624See also L</Blocks versus Scripts>.
1625
1626=cut
1627
1628sub charblocks {
1629    _charblocks() unless %BLOCKS;
1630    return _dclone \%BLOCKS;
1631}
1632
1633=head2 B<charscripts()>
1634
1635    use Unicode::UCD 'charscripts';
1636
1637    my $charscripts = charscripts();
1638
1639C<charscripts()> returns a reference to a hash with the known script
1640names as the keys, and the code point ranges (see L</charscript()>) as
1641the values.
1642
1643L<prop_invmap("script")|/prop_invmap()> can be used to get this same data in a
1644different type of data structure.  Since the Script_Extensions property is an
1645improved version of the Script property, you should instead use
1646L<prop_invmap("scx")|/prop_invmap()>.
1647
1648L<C<prop_values("Script")>|/prop_values()> can be used to get all
1649the known script names as a list, without the code point ranges.
1650
1651See also L</Blocks versus Scripts>.
1652
1653=cut
1654
1655sub charscripts {
1656    _charscripts() unless %SCRIPTS;
1657    return _dclone \%SCRIPTS;
1658}
1659
1660=head2 B<charinrange()>
1661
1662In addition to using the C<\p{Blk=...}> and C<\P{Blk=...}> constructs, you
1663can also test whether a code point is in the I<range> as returned by
1664L</charblock()> and L</charscript()> or as the values of the hash returned
1665by L</charblocks()> and L</charscripts()> by using C<charinrange()>:
1666
1667    use Unicode::UCD qw(charscript charinrange);
1668
1669    $range = charscript('Hiragana');
1670    print "looks like hiragana\n" if charinrange($range, $codepoint);
1671
1672=cut
1673
1674my %GENERAL_CATEGORIES =
1675 (
1676    'L'  =>         'Letter',
1677    'LC' =>         'CasedLetter',
1678    'Lu' =>         'UppercaseLetter',
1679    'Ll' =>         'LowercaseLetter',
1680    'Lt' =>         'TitlecaseLetter',
1681    'Lm' =>         'ModifierLetter',
1682    'Lo' =>         'OtherLetter',
1683    'M'  =>         'Mark',
1684    'Mn' =>         'NonspacingMark',
1685    'Mc' =>         'SpacingMark',
1686    'Me' =>         'EnclosingMark',
1687    'N'  =>         'Number',
1688    'Nd' =>         'DecimalNumber',
1689    'Nl' =>         'LetterNumber',
1690    'No' =>         'OtherNumber',
1691    'P'  =>         'Punctuation',
1692    'Pc' =>         'ConnectorPunctuation',
1693    'Pd' =>         'DashPunctuation',
1694    'Ps' =>         'OpenPunctuation',
1695    'Pe' =>         'ClosePunctuation',
1696    'Pi' =>         'InitialPunctuation',
1697    'Pf' =>         'FinalPunctuation',
1698    'Po' =>         'OtherPunctuation',
1699    'S'  =>         'Symbol',
1700    'Sm' =>         'MathSymbol',
1701    'Sc' =>         'CurrencySymbol',
1702    'Sk' =>         'ModifierSymbol',
1703    'So' =>         'OtherSymbol',
1704    'Z'  =>         'Separator',
1705    'Zs' =>         'SpaceSeparator',
1706    'Zl' =>         'LineSeparator',
1707    'Zp' =>         'ParagraphSeparator',
1708    'C'  =>         'Other',
1709    'Cc' =>         'Control',
1710    'Cf' =>         'Format',
1711    'Cs' =>         'Surrogate',
1712    'Co' =>         'PrivateUse',
1713    'Cn' =>         'Unassigned',
1714 );
1715
1716sub general_categories {
1717    return _dclone \%GENERAL_CATEGORIES;
1718}
1719
1720=head2 B<general_categories()>
1721
1722    use Unicode::UCD 'general_categories';
1723
1724    my $categories = general_categories();
1725
1726This returns a reference to a hash which has short
1727general category names (such as C<Lu>, C<Nd>, C<Zs>, C<S>) as keys and long
1728names (such as C<UppercaseLetter>, C<DecimalNumber>, C<SpaceSeparator>,
1729C<Symbol>) as values.  The hash is reversible in case you need to go
1730from the long names to the short names.  The general category is the
1731one returned from
1732L</charinfo()> under the C<category> key.
1733
1734The L</prop_values()> and L</prop_value_aliases()> functions can be used as an
1735alternative to this function; the first returning a simple list of the short
1736category names; and the second gets all the synonyms of a given category name.
1737
1738=cut
1739
1740my %BIDI_TYPES =
1741 (
1742   'L'   => 'Left-to-Right',
1743   'LRE' => 'Left-to-Right Embedding',
1744   'LRO' => 'Left-to-Right Override',
1745   'R'   => 'Right-to-Left',
1746   'AL'  => 'Right-to-Left Arabic',
1747   'RLE' => 'Right-to-Left Embedding',
1748   'RLO' => 'Right-to-Left Override',
1749   'PDF' => 'Pop Directional Format',
1750   'EN'  => 'European Number',
1751   'ES'  => 'European Number Separator',
1752   'ET'  => 'European Number Terminator',
1753   'AN'  => 'Arabic Number',
1754   'CS'  => 'Common Number Separator',
1755   'NSM' => 'Non-Spacing Mark',
1756   'BN'  => 'Boundary Neutral',
1757   'B'   => 'Paragraph Separator',
1758   'S'   => 'Segment Separator',
1759   'WS'  => 'Whitespace',
1760   'ON'  => 'Other Neutrals',
1761 );
1762
1763=head2 B<bidi_types()>
1764
1765    use Unicode::UCD 'bidi_types';
1766
1767    my $categories = bidi_types();
1768
1769This returns a reference to a hash which has the short
1770bidi (bidirectional) type names (such as C<L>, C<R>) as keys and long
1771names (such as C<Left-to-Right>, C<Right-to-Left>) as values.  The
1772hash is reversible in case you need to go from the long names to the
1773short names.  The bidi type is the one returned from
1774L</charinfo()>
1775under the C<bidi> key.  For the exact meaning of the various bidi classes
1776the Unicode TR9 is recommended reading:
1777L<http://www.unicode.org/reports/tr9/>
1778(as of Unicode 5.0.0)
1779
1780The L</prop_values()> and L</prop_value_aliases()> functions can be used as an
1781alternative to this function; the first returning a simple list of the short
1782bidi type names; and the second gets all the synonyms of a given bidi type
1783name.
1784
1785=cut
1786
1787sub bidi_types {
1788    return _dclone \%BIDI_TYPES;
1789}
1790
1791=head2 B<compexcl()>
1792
1793WARNING: Unicode discourages the use of this function or any of the
1794alternative mechanisms listed in this section (the documentation of
1795C<compexcl()>), except internally in implementations of the Unicode
1796Normalization Algorithm.  You should be using L<Unicode::Normalize> directly
1797instead of these.  Using these will likely lead to half-baked results.
1798
1799    use Unicode::UCD 'compexcl';
1800
1801    my $compexcl = compexcl(0x09dc);
1802
1803This routine returns C<undef> if the Unicode version being used is so early
1804that it doesn't have this property.
1805
1806C<compexcl()> is included for backwards
1807compatibility, but as of Perl 5.12 and more modern Unicode versions, for
1808most purposes it is probably more convenient to use one of the following
1809instead:
1810
1811    my $compexcl = chr(0x09dc) =~ /\p{Comp_Ex};
1812    my $compexcl = chr(0x09dc) =~ /\p{Full_Composition_Exclusion};
1813
1814or even
1815
1816    my $compexcl = chr(0x09dc) =~ /\p{CE};
1817    my $compexcl = chr(0x09dc) =~ /\p{Composition_Exclusion};
1818
1819The first two forms return B<true> if the L</code point argument> should not
1820be produced by composition normalization.  For the final two forms to return
1821B<true>, it is additionally required that this fact not otherwise be
1822determinable from the Unicode data base.
1823
1824This routine behaves identically to the final two forms.  That is,
1825it does not return B<true> if the code point has a decomposition
1826consisting of another single code point, nor if its decomposition starts
1827with a code point whose combining class is non-zero.  Code points that meet
1828either of these conditions should also not be produced by composition
1829normalization, which is probably why you should use the
1830C<Full_Composition_Exclusion> property instead, as shown above.
1831
1832The routine returns B<false> otherwise.
1833
1834=cut
1835
1836# Eval'd so can run on versions earlier than the property is available in
1837my $Composition_Exclusion_re = eval 'qr/\p{Composition_Exclusion}/';
1838
1839sub compexcl {
1840    my $arg  = shift;
1841    my $code = _getcode($arg);
1842    croak __PACKAGE__, "::compexcl: unknown code '$arg'"
1843	unless defined $code;
1844
1845    UnicodeVersion() unless defined $v_unicode_version;
1846    return if $v_unicode_version lt v3.0.0;
1847
1848    no warnings "non_unicode";     # So works on non-Unicode code points
1849    return chr($code) =~ $Composition_Exclusion_re
1850}
1851
1852=head2 B<casefold()>
1853
1854    use Unicode::UCD 'casefold';
1855
1856    my $casefold = casefold(0xDF);
1857    if (defined $casefold) {
1858        my @full_fold_hex = split / /, $casefold->{'full'};
1859        my $full_fold_string =
1860                    join "", map {chr(hex($_))} @full_fold_hex;
1861        my @turkic_fold_hex =
1862                        split / /, ($casefold->{'turkic'} ne "")
1863                                        ? $casefold->{'turkic'}
1864                                        : $casefold->{'full'};
1865        my $turkic_fold_string =
1866                        join "", map {chr(hex($_))} @turkic_fold_hex;
1867    }
1868    if (defined $casefold && $casefold->{'simple'} ne "") {
1869        my $simple_fold_hex = $casefold->{'simple'};
1870        my $simple_fold_string = chr(hex($simple_fold_hex));
1871    }
1872
1873This returns the (almost) locale-independent case folding of the
1874character specified by the L</code point argument>.  (Starting in Perl v5.16,
1875the core function C<fc()> returns the C<full> mapping (described below)
1876faster than this does, and for entire strings.)
1877
1878If there is no case folding for the input code point, C<undef> is returned.
1879
1880If there is a case folding for that code point, a reference to a hash
1881with the following fields is returned:
1882
1883=over
1884
1885=item B<code>
1886
1887the input native L</code point argument> expressed in hexadecimal, with
1888leading zeros
1889added if necessary to make it contain at least four hexdigits
1890
1891=item B<full>
1892
1893one or more codes (separated by spaces) that, taken in order, give the
1894code points for the case folding for I<code>.
1895Each has at least four hexdigits.
1896
1897=item B<simple>
1898
1899is empty, or is exactly one code with at least four hexdigits which can be used
1900as an alternative case folding when the calling program cannot cope with the
1901fold being a sequence of multiple code points.  If I<full> is just one code
1902point, then I<simple> equals I<full>.  If there is no single code point folding
1903defined for I<code>, then I<simple> is the empty string.  Otherwise, it is an
1904inferior, but still better-than-nothing alternative folding to I<full>.
1905
1906=item B<mapping>
1907
1908is the same as I<simple> if I<simple> is not empty, and it is the same as I<full>
1909otherwise.  It can be considered to be the simplest possible folding for
1910I<code>.  It is defined primarily for backwards compatibility.
1911
1912=item B<status>
1913
1914is C<C> (for C<common>) if the best possible fold is a single code point
1915(I<simple> equals I<full> equals I<mapping>).  It is C<S> if there are distinct
1916folds, I<simple> and I<full> (I<mapping> equals I<simple>).  And it is C<F> if
1917there is only a I<full> fold (I<mapping> equals I<full>; I<simple> is empty).
1918Note that this
1919describes the contents of I<mapping>.  It is defined primarily for backwards
1920compatibility.
1921
1922For Unicode versions between 3.1 and 3.1.1 inclusive, I<status> can also be
1923C<I> which is the same as C<C> but is a special case for dotted uppercase I and
1924dotless lowercase i:
1925
1926=over
1927
1928=item Z<>B<*> If you use this C<I> mapping
1929
1930the result is case-insensitive,
1931but dotless and dotted I's are not distinguished
1932
1933=item Z<>B<*> If you exclude this C<I> mapping
1934
1935the result is not fully case-insensitive, but
1936dotless and dotted I's are distinguished
1937
1938=back
1939
1940=item B<turkic>
1941
1942contains any special folding for Turkic languages.  For versions of Unicode
1943starting with 3.2, this field is empty unless I<code> has a different folding
1944in Turkic languages, in which case it is one or more codes (separated by
1945spaces) that, taken in order, give the code points for the case folding for
1946I<code> in those languages.
1947Each code has at least four hexdigits.
1948Note that this folding does not maintain canonical equivalence without
1949additional processing.
1950
1951For Unicode versions between 3.1 and 3.1.1 inclusive, this field is empty unless
1952there is a
1953special folding for Turkic languages, in which case I<status> is C<I>, and
1954I<mapping>, I<full>, I<simple>, and I<turkic> are all equal.
1955
1956=back
1957
1958Programs that want complete generality and the best folding results should use
1959the folding contained in the I<full> field.  But note that the fold for some
1960code points will be a sequence of multiple code points.
1961
1962Programs that can't cope with the fold mapping being multiple code points can
1963use the folding contained in the I<simple> field, with the loss of some
1964generality.  In Unicode 5.1, about 7% of the defined foldings have no single
1965code point folding.
1966
1967The I<mapping> and I<status> fields are provided for backwards compatibility for
1968existing programs.  They contain the same values as in previous versions of
1969this function.
1970
1971Locale is not completely independent.  The I<turkic> field contains results to
1972use when the locale is a Turkic language.
1973
1974For more information about case mappings see
1975L<http://www.unicode.org/reports/tr21>
1976
1977=cut
1978
1979my %CASEFOLD;
1980
1981sub _casefold {
1982    unless (%CASEFOLD) {   # Populate the hash
1983        my ($full_invlist_ref, $full_invmap_ref, undef, $default)
1984                                                = prop_invmap('Case_Folding');
1985
1986        # Use the recipe given in the prop_invmap() pod to convert the
1987        # inversion map into the hash.
1988        for my $i (0 .. @$full_invlist_ref - 1 - 1) {
1989            next if $full_invmap_ref->[$i] == $default;
1990            my $adjust = -1;
1991            for my $j ($full_invlist_ref->[$i] .. $full_invlist_ref->[$i+1] -1) {
1992                $adjust++;
1993                if (! ref $full_invmap_ref->[$i]) {
1994
1995                    # This is a single character mapping
1996                    $CASEFOLD{$j}{'status'} = 'C';
1997                    $CASEFOLD{$j}{'simple'}
1998                        = $CASEFOLD{$j}{'full'}
1999                        = $CASEFOLD{$j}{'mapping'}
2000                        = sprintf("%04X", $full_invmap_ref->[$i] + $adjust);
2001                    $CASEFOLD{$j}{'code'} = sprintf("%04X", $j);
2002                    $CASEFOLD{$j}{'turkic'} = "";
2003                }
2004                else {  # prop_invmap ensures that $adjust is 0 for a ref
2005                    $CASEFOLD{$j}{'status'} = 'F';
2006                    $CASEFOLD{$j}{'full'}
2007                    = $CASEFOLD{$j}{'mapping'}
2008                    = join " ", map { sprintf "%04X", $_ }
2009                                                    @{$full_invmap_ref->[$i]};
2010                    $CASEFOLD{$j}{'simple'} = "";
2011                    $CASEFOLD{$j}{'code'} = sprintf("%04X", $j);
2012                    $CASEFOLD{$j}{'turkic'} = "";
2013                }
2014            }
2015        }
2016
2017        # We have filled in the full mappings above, assuming there were no
2018        # simple ones for the ones with multi-character maps.  Now, we find
2019        # and fix the cases where that assumption was false.
2020        (my ($simple_invlist_ref, $simple_invmap_ref, undef), $default)
2021                                        = prop_invmap('Simple_Case_Folding');
2022        for my $i (0 .. @$simple_invlist_ref - 1 - 1) {
2023            next if $simple_invmap_ref->[$i] == $default;
2024            my $adjust = -1;
2025            for my $j ($simple_invlist_ref->[$i]
2026                       .. $simple_invlist_ref->[$i+1] -1)
2027            {
2028                $adjust++;
2029                next if $CASEFOLD{$j}{'status'} eq 'C';
2030                $CASEFOLD{$j}{'status'} = 'S';
2031                $CASEFOLD{$j}{'simple'}
2032                    = $CASEFOLD{$j}{'mapping'}
2033                    = sprintf("%04X", $simple_invmap_ref->[$i] + $adjust);
2034                $CASEFOLD{$j}{'code'} = sprintf("%04X", $j);
2035                $CASEFOLD{$j}{'turkic'} = "";
2036            }
2037        }
2038
2039        # We hard-code in the turkish rules
2040        UnicodeVersion() unless defined $v_unicode_version;
2041        if ($v_unicode_version ge v3.2.0) {
2042
2043            # These two code points should already have regular entries, so
2044            # just fill in the turkish fields
2045            $CASEFOLD{ord('I')}{'turkic'} = '0131';
2046            $CASEFOLD{0x130}{'turkic'} = sprintf "%04X", ord('i');
2047        }
2048        elsif ($v_unicode_version ge v3.1.0) {
2049
2050            # These two code points don't have entries otherwise.
2051            $CASEFOLD{0x130}{'code'} = '0130';
2052            $CASEFOLD{0x131}{'code'} = '0131';
2053            $CASEFOLD{0x130}{'status'} = $CASEFOLD{0x131}{'status'} = 'I';
2054            $CASEFOLD{0x130}{'turkic'}
2055                = $CASEFOLD{0x130}{'mapping'}
2056                = $CASEFOLD{0x130}{'full'}
2057                = $CASEFOLD{0x130}{'simple'}
2058                = $CASEFOLD{0x131}{'turkic'}
2059                = $CASEFOLD{0x131}{'mapping'}
2060                = $CASEFOLD{0x131}{'full'}
2061                = $CASEFOLD{0x131}{'simple'}
2062                = sprintf "%04X", ord('i');
2063        }
2064    }
2065}
2066
2067sub casefold {
2068    my $arg  = shift;
2069    my $code = _getcode($arg);
2070    croak __PACKAGE__, "::casefold: unknown code '$arg'"
2071	unless defined $code;
2072
2073    _casefold() unless %CASEFOLD;
2074
2075    return $CASEFOLD{$code};
2076}
2077
2078=head2 B<all_casefolds()>
2079
2080
2081    use Unicode::UCD 'all_casefolds';
2082
2083    my $all_folds_ref = all_casefolds();
2084    foreach my $char_with_casefold (sort { $a <=> $b }
2085                                    keys %$all_folds_ref)
2086    {
2087        printf "%04X:", $char_with_casefold;
2088        my $casefold = $all_folds_ref->{$char_with_casefold};
2089
2090        # Get folds for $char_with_casefold
2091
2092        my @full_fold_hex = split / /, $casefold->{'full'};
2093        my $full_fold_string =
2094                    join "", map {chr(hex($_))} @full_fold_hex;
2095        print " full=", join " ", @full_fold_hex;
2096        my @turkic_fold_hex =
2097                        split / /, ($casefold->{'turkic'} ne "")
2098                                        ? $casefold->{'turkic'}
2099                                        : $casefold->{'full'};
2100        my $turkic_fold_string =
2101                        join "", map {chr(hex($_))} @turkic_fold_hex;
2102        print "; turkic=", join " ", @turkic_fold_hex;
2103        if (defined $casefold && $casefold->{'simple'} ne "") {
2104            my $simple_fold_hex = $casefold->{'simple'};
2105            my $simple_fold_string = chr(hex($simple_fold_hex));
2106            print "; simple=$simple_fold_hex";
2107        }
2108        print "\n";
2109    }
2110
2111This returns all the case foldings in the current version of Unicode in the
2112form of a reference to a hash.  Each key to the hash is the decimal
2113representation of a Unicode character that has a casefold to other than
2114itself.  The casefold of a semi-colon is itself, so it isn't in the hash;
2115likewise for a lowercase "a", but there is an entry for a capital "A".  The
2116hash value for each key is another hash, identical to what is returned by
2117L</casefold()> if called with that code point as its argument.  So the value
2118C<< all_casefolds()->{ord("A")}' >> is equivalent to C<casefold(ord("A"))>;
2119
2120=cut
2121
2122sub all_casefolds () {
2123    _casefold() unless %CASEFOLD;
2124    return _dclone \%CASEFOLD;
2125}
2126
2127=head2 B<casespec()>
2128
2129    use Unicode::UCD 'casespec';
2130
2131    my $casespec = casespec(0xFB00);
2132
2133This returns the potentially locale-dependent case mappings of the L</code point
2134argument>.  The mappings may be longer than a single code point (which the basic
2135Unicode case mappings as returned by L</charinfo()> never are).
2136
2137If there are no case mappings for the L</code point argument>, or if all three
2138possible mappings (I<lower>, I<title> and I<upper>) result in single code
2139points and are locale independent and unconditional, C<undef> is returned
2140(which means that the case mappings, if any, for the code point are those
2141returned by L</charinfo()>).
2142
2143Otherwise, a reference to a hash giving the mappings (or a reference to a hash
2144of such hashes, explained below) is returned with the following keys and their
2145meanings:
2146
2147The keys in the bottom layer hash with the meanings of their values are:
2148
2149=over
2150
2151=item B<code>
2152
2153the input native L</code point argument> expressed in hexadecimal, with
2154leading zeros
2155added if necessary to make it contain at least four hexdigits
2156
2157=item B<lower>
2158
2159one or more codes (separated by spaces) that, taken in order, give the
2160code points for the lower case of I<code>.
2161Each has at least four hexdigits.
2162
2163=item B<title>
2164
2165one or more codes (separated by spaces) that, taken in order, give the
2166code points for the title case of I<code>.
2167Each has at least four hexdigits.
2168
2169=item B<upper>
2170
2171one or more codes (separated by spaces) that, taken in order, give the
2172code points for the upper case of I<code>.
2173Each has at least four hexdigits.
2174
2175=item B<condition>
2176
2177the conditions for the mappings to be valid.
2178If C<undef>, the mappings are always valid.
2179When defined, this field is a list of conditions,
2180all of which must be true for the mappings to be valid.
2181The list consists of one or more
2182I<locales> (see below)
2183and/or I<contexts> (explained in the next paragraph),
2184separated by spaces.
2185(Other than as used to separate elements, spaces are to be ignored.)
2186Case distinctions in the condition list are not significant.
2187Conditions preceded by "NON_" represent the negation of the condition.
2188
2189A I<context> is one of those defined in the Unicode standard.
2190For Unicode 5.1, they are defined in Section 3.13 C<Default Case Operations>
2191available at
2192L<http://www.unicode.org/versions/Unicode5.1.0/>.
2193These are for context-sensitive casing.
2194
2195=back
2196
2197The hash described above is returned for locale-independent casing, where
2198at least one of the mappings has length longer than one.  If C<undef> is
2199returned, the code point may have mappings, but if so, all are length one,
2200and are returned by L</charinfo()>.
2201Note that when this function does return a value, it will be for the complete
2202set of mappings for a code point, even those whose length is one.
2203
2204If there are additional casing rules that apply only in certain locales,
2205an additional key for each will be defined in the returned hash.  Each such key
2206will be its locale name, defined as a 2-letter ISO 3166 country code, possibly
2207followed by a "_" and a 2-letter ISO language code (possibly followed by a "_"
2208and a variant code).  You can find the lists of all possible locales, see
2209L<Locale::Country> and L<Locale::Language>.
2210(In Unicode 6.0, the only locales returned by this function
2211are C<lt>, C<tr>, and C<az>.)
2212
2213Each locale key is a reference to a hash that has the form above, and gives
2214the casing rules for that particular locale, which take precedence over the
2215locale-independent ones when in that locale.
2216
2217If the only casing for a code point is locale-dependent, then the returned
2218hash will not have any of the base keys, like C<code>, C<upper>, etc., but
2219will contain only locale keys.
2220
2221For more information about case mappings see
2222L<http://www.unicode.org/reports/tr21/>
2223
2224=cut
2225
2226my %CASESPEC;
2227
2228sub _casespec {
2229    unless (%CASESPEC) {
2230        UnicodeVersion() unless defined $v_unicode_version;
2231        if ($v_unicode_version ge v2.1.8) {
2232            my $casespecfh = openunicode("SpecialCasing.txt");
2233	    local $_;
2234	    local $/ = "\n";
2235	    while (<$casespecfh>) {
2236		if (/^([0-9A-F]+); ([0-9A-F]+(?: [0-9A-F]+)*)?; ([0-9A-F]+(?: [0-9A-F]+)*)?; ([0-9A-F]+(?: [0-9A-F]+)*)?; (\w+(?: \w+)*)?/) {
2237
2238		    my ($hexcode, $lower, $title, $upper, $condition) =
2239			($1, $2, $3, $4, $5);
2240                    if (! IS_ASCII_PLATFORM) { # Remap entry to native
2241                        foreach my $var_ref (\$hexcode,
2242                                             \$lower,
2243                                             \$title,
2244                                             \$upper)
2245                        {
2246                            next unless defined $$var_ref;
2247                            $$var_ref = join " ",
2248                                        map { sprintf("%04X",
2249                                              utf8::unicode_to_native(hex $_)) }
2250                                        split " ", $$var_ref;
2251                        }
2252                    }
2253
2254		    my $code = hex($hexcode);
2255
2256                    # In 2.1.8, there were duplicate entries; ignore all but
2257                    # the first one -- there were no conditions in the file
2258                    # anyway.
2259		    if (exists $CASESPEC{$code} && $v_unicode_version ne v2.1.8)
2260                    {
2261			if (exists $CASESPEC{$code}->{code}) {
2262			    my ($oldlower,
2263				$oldtitle,
2264				$oldupper,
2265				$oldcondition) =
2266				    @{$CASESPEC{$code}}{qw(lower
2267							   title
2268							   upper
2269							   condition)};
2270			    if (defined $oldcondition) {
2271				my ($oldlocale) =
2272				($oldcondition =~ /^([a-z][a-z](?:_\S+)?)/);
2273				delete $CASESPEC{$code};
2274				$CASESPEC{$code}->{$oldlocale} =
2275				{ code      => $hexcode,
2276				  lower     => $oldlower,
2277				  title     => $oldtitle,
2278				  upper     => $oldupper,
2279				  condition => $oldcondition };
2280			    }
2281			}
2282			my ($locale) =
2283			    ($condition =~ /^([a-z][a-z](?:_\S+)?)/);
2284			$CASESPEC{$code}->{$locale} =
2285			{ code      => $hexcode,
2286			  lower     => $lower,
2287			  title     => $title,
2288			  upper     => $upper,
2289			  condition => $condition };
2290		    } else {
2291			$CASESPEC{$code} =
2292			{ code      => $hexcode,
2293			  lower     => $lower,
2294			  title     => $title,
2295			  upper     => $upper,
2296			  condition => $condition };
2297		    }
2298		}
2299	    }
2300	}
2301    }
2302}
2303
2304sub casespec {
2305    my $arg  = shift;
2306    my $code = _getcode($arg);
2307    croak __PACKAGE__, "::casespec: unknown code '$arg'"
2308	unless defined $code;
2309
2310    _casespec() unless %CASESPEC;
2311
2312    return ref $CASESPEC{$code} ? _dclone $CASESPEC{$code} : $CASESPEC{$code};
2313}
2314
2315=head2 B<namedseq()>
2316
2317    use Unicode::UCD 'namedseq';
2318
2319    my $namedseq = namedseq("KATAKANA LETTER AINU P");
2320    my @namedseq = namedseq("KATAKANA LETTER AINU P");
2321    my %namedseq = namedseq();
2322
2323If used with a single argument in a scalar context, returns the string
2324consisting of the code points of the named sequence, or C<undef> if no
2325named sequence by that name exists.  If used with a single argument in
2326a list context, it returns the list of the ordinals of the code points.
2327
2328If used with no
2329arguments in a list context, it returns a hash with the names of all the
2330named sequences as the keys and their sequences as strings as
2331the values.  Otherwise, it returns C<undef> or an empty list depending
2332on the context.
2333
2334This function only operates on officially approved (not provisional) named
2335sequences.
2336
2337Note that as of Perl 5.14, C<\N{KATAKANA LETTER AINU P}> will insert the named
2338sequence into double-quoted strings, and C<charnames::string_vianame("KATAKANA
2339LETTER AINU P")> will return the same string this function does, but will also
2340operate on character names that aren't named sequences, without you having to
2341know which are which.  See L<charnames>.
2342
2343=cut
2344
2345my %NAMEDSEQ;
2346
2347sub _namedseq {
2348    unless (%NAMEDSEQ) {
2349        my @list = split "\n", do "unicore/Name.pl";
2350        for (my $i = 0; $i < @list; $i += 3) {
2351            # Each entry is currently three lines.  The first contains the code
2352            # points in the sequence separated by spaces.  If this entry
2353            # doesn't have spaces, it isn't a named sequence.
2354            next unless $list[$i] =~ /^ [0-9A-F]{4,5} (?: \  [0-9A-F]{4,5} )+ $ /x;
2355
2356            my $sequence = $list[$i];
2357            chomp $sequence;
2358
2359            # And the second is the name
2360            my $name = $list[$i+1];
2361            chomp $name;
2362            my @s = map { chr(hex($_)) } split(' ', $sequence);
2363            $NAMEDSEQ{$name} = join("", @s);
2364
2365            # And the third is empty
2366        }
2367    }
2368}
2369
2370sub namedseq {
2371
2372    # Use charnames::string_vianame() which now returns this information,
2373    # unless the caller wants the hash returned, in which case we read it in,
2374    # and thereafter use it instead of calling charnames, as it is faster.
2375
2376    my $wantarray = wantarray();
2377    if (defined $wantarray) {
2378	if ($wantarray) {
2379	    if (@_ == 0) {
2380                _namedseq() unless %NAMEDSEQ;
2381		return %NAMEDSEQ;
2382	    } elsif (@_ == 1) {
2383		my $s;
2384                if (%NAMEDSEQ) {
2385                    $s = $NAMEDSEQ{ $_[0] };
2386                }
2387                else {
2388                    $s = charnames::string_vianame($_[0]);
2389                }
2390		return defined $s ? map { ord($_) } split('', $s) : ();
2391	    }
2392	} elsif (@_ == 1) {
2393            return $NAMEDSEQ{ $_[0] } if %NAMEDSEQ;
2394            return charnames::string_vianame($_[0]);
2395	}
2396    }
2397    return;
2398}
2399
2400my %NUMERIC;
2401
2402sub _numeric {
2403    my @numbers = _read_table("To/Nv.pl");
2404    foreach my $entry (@numbers) {
2405        my ($start, $end, $value) = @$entry;
2406
2407        # If value contains a slash, convert to decimal, add a reverse hash
2408        # used by charinfo.
2409        if ((my @rational = split /\//, $value) == 2) {
2410            my $real = $rational[0] / $rational[1];
2411            $real_to_rational{$real} = $value;
2412            $value = $real;
2413
2414            # Should only be single element, but just in case...
2415            for my $i ($start .. $end) {
2416                $NUMERIC{$i} = $value;
2417            }
2418        }
2419        else {
2420            # The values require adjusting, as is in 'a' format
2421            for my $i ($start .. $end) {
2422                $NUMERIC{$i} = $value + $i - $start;
2423            }
2424        }
2425    }
2426
2427    # Decided unsafe to use these that aren't officially part of the Unicode
2428    # standard.
2429    #use Math::Trig;
2430    #my $pi = acos(-1.0);
2431    #$NUMERIC{0x03C0} = $pi;
2432
2433    # Euler's constant, not to be confused with Euler's number
2434    #$NUMERIC{0x2107} = 0.57721566490153286060651209008240243104215933593992;
2435
2436    # Euler's number
2437    #$NUMERIC{0x212F} = 2.7182818284590452353602874713526624977572;
2438
2439    return;
2440}
2441
2442=pod
2443
2444=head2 B<num()>
2445
2446    use Unicode::UCD 'num';
2447
2448    my $val = num("123");
2449    my $one_quarter = num("\N{VULGAR FRACTION ONE QUARTER}");
2450    my $val = num("12a", \$valid_length);  # $valid_length contains 2
2451
2452C<num()> returns the numeric value of the input Unicode string; or C<undef> if it
2453doesn't think the entire string has a completely valid, safe numeric value.
2454If called with an optional second parameter, a reference to a scalar, C<num()>
2455will set the scalar to the length of any valid initial substring; or to 0 if none.
2456
2457If the string is just one character in length, the Unicode numeric value
2458is returned if it has one, or C<undef> otherwise.  If the optional scalar ref
2459is passed, it would be set to 1 if the return is valid; or 0 if the return is
2460C<undef>.  Note that the numeric value returned need not be a whole number.
2461C<num("\N{TIBETAN DIGIT HALF ZERO}")>, for example returns -0.5.
2462
2463=cut
2464
2465#A few characters to which Unicode doesn't officially
2466#assign a numeric value are considered numeric by C<num>.
2467#These are:
2468
2469# EULER CONSTANT             0.5772...  (this is NOT Euler's number)
2470# SCRIPT SMALL E             2.71828... (this IS Euler's number)
2471# GREEK SMALL LETTER PI      3.14159...
2472
2473=pod
2474
2475If the string is more than one character, C<undef> is returned unless
2476all its characters are decimal digits (that is, they would match C<\d+>),
2477from the same script.  For example if you have an ASCII '0' and a Bengali
2478'3', mixed together, they aren't considered a valid number, and C<undef>
2479is returned.  A further restriction is that the digits all have to be of
2480the same form.  A half-width digit mixed with a full-width one will
2481return C<undef>.  The Arabic script has two sets of digits;  C<num> will
2482return C<undef> unless all the digits in the string come from the same
2483set.  In all cases, the optional scalar ref parameter is set to how
2484long any valid initial substring of digits is; hence it will be set to the
2485entire string length if the main return value is not C<undef>.
2486
2487C<num> errs on the side of safety, and there may be valid strings of
2488decimal digits that it doesn't recognize.  Note that Unicode defines
2489a number of "digit" characters that aren't "decimal digit" characters.
2490"Decimal digits" have the property that they have a positional value, i.e.,
2491there is a units position, a 10's position, a 100's, etc, AND they are
2492arranged in Unicode in blocks of 10 contiguous code points.  The Chinese
2493digits, for example, are not in such a contiguous block, and so Unicode
2494doesn't view them as decimal digits, but merely digits, and so C<\d> will not
2495match them.  A single-character string containing one of these digits will
2496have its decimal value returned by C<num>, but any longer string containing
2497only these digits will return C<undef>.
2498
2499Strings of multiple sub- and superscripts are not recognized as numbers.  You
2500can use either of the compatibility decompositions in Unicode::Normalize to
2501change these into digits, and then call C<num> on the result.
2502
2503=cut
2504
2505# To handle sub, superscripts, this could if called in list context,
2506# consider those, and return the <decomposition> type in the second
2507# array element.
2508
2509sub num ($;$) {
2510    my ($string, $retlen_ref) = @_;
2511
2512    use feature 'unicode_strings';
2513
2514    _numeric unless %NUMERIC;
2515    $$retlen_ref = 0 if $retlen_ref;    # Assume will fail
2516
2517    my $length = length $string;
2518    return if $length == 0;
2519
2520    my $first_ord = ord(substr($string, 0, 1));
2521    return if ! exists  $NUMERIC{$first_ord}
2522           || ! defined $NUMERIC{$first_ord};
2523
2524    # Here, we know the first character is numeric
2525    my $value = $NUMERIC{$first_ord};
2526    $$retlen_ref = 1 if $retlen_ref;    # Assume only this one is numeric
2527
2528    return $value if $length == 1;
2529
2530    # Here, the input is longer than a single character.  To be valid, it must
2531    # be entirely decimal digits, which means it must start with one.
2532    return if $string =~ / ^ \D /x;
2533
2534    # To be a valid decimal number, it should be in a block of 10 consecutive
2535    # characters, whose values are 0, 1, 2, ... 9.  Therefore this digit's
2536    # value is its offset in that block from the character that means zero.
2537    my $zero_ord = $first_ord - $value;
2538
2539    # Unicode 6.0 instituted the rule that only digits in a consecutive
2540    # block of 10 would be considered decimal digits.  If this is an earlier
2541    # release, we verify that this first character is a member of such a
2542    # block.  That is, that the block of characters surrounding this one
2543    # consists of all \d characters whose numeric values are the expected
2544    # ones.  If not, then this single character is numeric, but the string as
2545    # a whole is not considered to be.
2546    UnicodeVersion() unless defined $v_unicode_version;
2547    if ($v_unicode_version lt v6.0.0) {
2548        for my $i (0 .. 9) {
2549            my $ord = $zero_ord + $i;
2550            return unless chr($ord) =~ /\d/;
2551            my $numeric = $NUMERIC{$ord};
2552            return unless defined $numeric;
2553            return unless $numeric == $i;
2554        }
2555    }
2556
2557    for my $i (1 .. $length -1) {
2558
2559        # Here we know either by verifying, or by fact of the first character
2560        # being a \d in Unicode 6.0 or later, that any character between the
2561        # character that means 0, and 9 positions above it must be \d, and
2562        # must have its value correspond to its offset from the zero.  Any
2563        # characters outside these 10 do not form a legal number for this
2564        # function.
2565        my $ord = ord(substr($string, $i, 1));
2566        my $digit = $ord - $zero_ord;
2567        if ($digit < 0 || $digit > 9) {
2568            $$retlen_ref = $i if $retlen_ref;
2569            return;
2570        }
2571        $value = $value * 10 + $digit;
2572    }
2573
2574    $$retlen_ref = $length if $retlen_ref;
2575    return $value;
2576}
2577
2578=pod
2579
2580=head2 B<prop_aliases()>
2581
2582    use Unicode::UCD 'prop_aliases';
2583
2584    my ($short_name, $full_name, @other_names) = prop_aliases("space");
2585    my $same_full_name = prop_aliases("Space");     # Scalar context
2586    my ($same_short_name) = prop_aliases("Space");  # gets 0th element
2587    print "The full name is $full_name\n";
2588    print "The short name is $short_name\n";
2589    print "The other aliases are: ", join(", ", @other_names), "\n";
2590
2591    prints:
2592    The full name is White_Space
2593    The short name is WSpace
2594    The other aliases are: Space
2595
2596Most Unicode properties have several synonymous names.  Typically, there is at
2597least a short name, convenient to type, and a long name that more fully
2598describes the property, and hence is more easily understood.
2599
2600If you know one name for a Unicode property, you can use C<prop_aliases> to find
2601either the long name (when called in scalar context), or a list of all of the
2602names, somewhat ordered so that the short name is in the 0th element, the long
2603name in the next element, and any other synonyms are in the remaining
2604elements, in no particular order.
2605
2606The long name is returned in a form nicely capitalized, suitable for printing.
2607
2608The input parameter name is loosely matched, which means that white space,
2609hyphens, and underscores are ignored (except for the trailing underscore in
2610the old_form grandfathered-in C<"L_">, which is better written as C<"LC">, and
2611both of which mean C<General_Category=Cased Letter>).
2612
2613If the name is unknown, C<undef> is returned (or an empty list in list
2614context).  Note that Perl typically recognizes property names in regular
2615expressions with an optional C<"Is_>" (with or without the underscore)
2616prefixed to them, such as C<\p{isgc=punct}>.  This function does not recognize
2617those in the input, returning C<undef>.  Nor are they included in the output
2618as possible synonyms.
2619
2620C<prop_aliases> does know about the Perl extensions to Unicode properties,
2621such as C<Any> and C<XPosixAlpha>, and the single form equivalents to Unicode
2622properties such as C<XDigit>, C<Greek>, C<In_Greek>, and C<Is_Greek>.  The
2623final example demonstrates that the C<"Is_"> prefix is recognized for these
2624extensions; it is needed to resolve ambiguities.  For example,
2625C<prop_aliases('lc')> returns the list C<(lc, Lowercase_Mapping)>, but
2626C<prop_aliases('islc')> returns C<(Is_LC, Cased_Letter)>.  This is
2627because C<islc> is a Perl extension which is short for
2628C<General_Category=Cased Letter>.  The lists returned for the Perl extensions
2629will not include the C<"Is_"> prefix (whether or not the input had it) unless
2630needed to resolve ambiguities, as shown in the C<"islc"> example, where the
2631returned list had one element containing C<"Is_">, and the other without.
2632
2633It is also possible for the reverse to happen:  C<prop_aliases('isc')> returns
2634the list C<(isc, ISO_Comment)>; whereas C<prop_aliases('c')> returns
2635C<(C, Other)> (the latter being a Perl extension meaning
2636C<General_Category=Other>.
2637L<perluniprops/Properties accessible through Unicode::UCD> lists the available
2638forms, including which ones are discouraged from use.
2639
2640Those discouraged forms are accepted as input to C<prop_aliases>, but are not
2641returned in the lists.  C<prop_aliases('isL&')> and C<prop_aliases('isL_')>,
2642which are old synonyms for C<"Is_LC"> and should not be used in new code, are
2643examples of this.  These both return C<(Is_LC, Cased_Letter)>.  Thus this
2644function allows you to take a discouraged form, and find its acceptable
2645alternatives.  The same goes with single-form Block property equivalences.
2646Only the forms that begin with C<"In_"> are not discouraged; if you pass
2647C<prop_aliases> a discouraged form, you will get back the equivalent ones that
2648begin with C<"In_">.  It will otherwise look like a new-style block name (see.
2649L</Old-style versus new-style block names>).
2650
2651C<prop_aliases> does not know about any user-defined properties, and will
2652return C<undef> if called with one of those.  Likewise for Perl internal
2653properties, with the exception of "Perl_Decimal_Digit" which it does know
2654about (and which is documented below in L</prop_invmap()>).
2655
2656=cut
2657
2658# It may be that there are use cases where the discouraged forms should be
2659# returned.  If that comes up, an optional boolean second parameter to the
2660# function could be created, for example.
2661
2662# These are created by mktables for this routine and stored in unicore/UCD.pl
2663# where their structures are described.
2664our %string_property_loose_to_name;
2665our %ambiguous_names;
2666our %loose_perlprop_to_name;
2667
2668sub prop_aliases ($) {
2669    my $prop = $_[0];
2670    return unless defined $prop;
2671
2672    require "unicore/UCD.pl";
2673
2674    # The property name may be loosely or strictly matched; we don't know yet.
2675    # But both types use lower-case.
2676    $prop = lc $prop;
2677
2678    # It is loosely matched if its lower case isn't known to be strict.
2679    my $list_ref;
2680    if (! exists $stricter_to_file_of{$prop}) {
2681        my $loose = loose_name($prop);
2682
2683        # There is a hash that converts from any loose name to its standard
2684        # form, mapping all synonyms for a  name to one name that can be used
2685        # as a key into another hash.  The whole concept is for memory
2686        # savings, as the second hash doesn't have to have all the
2687        # combinations.  Actually, there are two hashes that do the
2688        # conversion.  One is stored in UCD.pl) for looking up properties
2689        # matchable in regexes.  This function needs to access string
2690        # properties, which aren't available in regexes, so a second
2691        # conversion hash is made for them (stored in UCD.pl).  Look in the
2692        # string one now, as the rest can have an optional 'is' prefix, which
2693        # these don't.
2694        if (exists $string_property_loose_to_name{$loose}) {
2695
2696            # Convert to its standard loose name.
2697            $prop = $string_property_loose_to_name{$loose};
2698        }
2699        else {
2700            my $retrying = 0;   # bool.  ? Has an initial 'is' been stripped
2701        RETRY:
2702            if (exists $loose_property_name_of{$loose}
2703                && (! $retrying
2704                    || ! exists $ambiguous_names{$loose}))
2705            {
2706                # Found an entry giving the standard form.  We don't get here
2707                # (in the test above) when we've stripped off an
2708                # 'is' and the result is an ambiguous name.  That is because
2709                # these are official Unicode properties (though Perl can have
2710                # an optional 'is' prefix meaning the official property), and
2711                # all ambiguous cases involve a Perl single-form extension
2712                # for the gc, script, or block properties, and the stripped
2713                # 'is' means that they mean one of those, and not one of
2714                # these
2715                $prop = $loose_property_name_of{$loose};
2716            }
2717            elsif (exists $loose_perlprop_to_name{$loose}) {
2718
2719                # This hash is specifically for this function to list Perl
2720                # extensions that aren't in the earlier hashes.  If there is
2721                # only one element, the short and long names are identical.
2722                # Otherwise the form is already in the same form as
2723                # %prop_aliases, which is handled at the end of the function.
2724                $list_ref = $loose_perlprop_to_name{$loose};
2725                if (@$list_ref == 1) {
2726                    my @list = ($list_ref->[0], $list_ref->[0]);
2727                    $list_ref = \@list;
2728                }
2729            }
2730            elsif (! exists $loose_to_file_of{$loose}) {
2731
2732                # loose_to_file_of is a complete list of loose names.  If not
2733                # there, the input is unknown.
2734                return;
2735            }
2736            elsif ($loose =~ / [:=] /x) {
2737
2738                # Here we found the name but not its aliases, so it has to
2739                # exist.  Exclude property-value combinations.  (This shows up
2740                # for something like ccc=vr which matches loosely, but is a
2741                # synonym for ccc=9 which matches only strictly.
2742                return;
2743            }
2744            else {
2745
2746                # Here it has to exist, and isn't a property-value
2747                # combination.  This means it must be one of the Perl
2748                # single-form extensions.  First see if it is for a
2749                # property-value combination in one of the following
2750                # properties.
2751                my @list;
2752                foreach my $property ("gc", "script") {
2753                    @list = prop_value_aliases($property, $loose);
2754                    last if @list;
2755                }
2756                if (@list) {
2757
2758                    # Here, it is one of those property-value combination
2759                    # single-form synonyms.  There are ambiguities with some
2760                    # of these.  Check against the list for these, and adjust
2761                    # if necessary.
2762                    for my $i (0 .. @list -1) {
2763                        if (exists $ambiguous_names
2764                                   {loose_name(lc $list[$i])})
2765                        {
2766                            # The ambiguity is resolved by toggling whether or
2767                            # not it has an 'is' prefix
2768                            $list[$i] =~ s/^Is_// or $list[$i] =~ s/^/Is_/;
2769                        }
2770                    }
2771                    return @list;
2772                }
2773
2774                # Here, it wasn't one of the gc or script single-form
2775                # extensions.  It could be a block property single-form
2776                # extension.  An 'in' prefix definitely means that, and should
2777                # be looked up without the prefix.  However, starting in
2778                # Unicode 6.1, we have to special case 'indic...', as there
2779                # is a property that begins with that name.   We shouldn't
2780                # strip the 'in' from that.   I'm (khw) generalizing this to
2781                # 'indic' instead of the single property, because I suspect
2782                # that others of this class may come along in the future.
2783                # However, this could backfire and a block created whose name
2784                # begins with 'dic...', and we would want to strip the 'in'.
2785                # At which point this would have to be tweaked.
2786                my $began_with_in = $loose =~ s/^in(?!dic)//;
2787                @list = prop_value_aliases("block", $loose);
2788                if (@list) {
2789                    map { $_ =~ s/^/In_/ } @list;
2790                    return @list;
2791                }
2792
2793                # Here still haven't found it.  The last opportunity for it
2794                # being valid is only if it began with 'is'.  We retry without
2795                # the 'is', setting a flag to that effect so that we don't
2796                # accept things that begin with 'isis...'
2797                if (! $retrying && ! $began_with_in && $loose =~ s/^is//) {
2798                    $retrying = 1;
2799                    goto RETRY;
2800                }
2801
2802                # Here, didn't find it.  Since it was in %loose_to_file_of, we
2803                # should have been able to find it.
2804                carp __PACKAGE__, "::prop_aliases: Unexpectedly could not find '$prop'.  Send bug report to perlbug\@perl.org";
2805                return;
2806            }
2807        }
2808    }
2809
2810    if (! $list_ref) {
2811        # Here, we have set $prop to a standard form name of the input.  Look
2812        # it up in the structure created by mktables for this purpose, which
2813        # contains both strict and loosely matched properties.  Avoid
2814        # autovivifying.
2815        $list_ref = $prop_aliases{$prop} if exists $prop_aliases{$prop};
2816        return unless $list_ref;
2817    }
2818
2819    # The full name is in element 1.
2820    return $list_ref->[1] unless wantarray;
2821
2822    return @{_dclone $list_ref};
2823}
2824
2825=pod
2826
2827=head2 B<prop_values()>
2828
2829    use Unicode::UCD 'prop_values';
2830
2831    print "AHex values are: ", join(", ", prop_values("AHex")),
2832                               "\n";
2833  prints:
2834    AHex values are: N, Y
2835
2836Some Unicode properties have a restricted set of legal values.  For example,
2837all binary properties are restricted to just C<true> or C<false>; and there
2838are only a few dozen possible General Categories.  Use C<prop_values>
2839to find out if a given property is one such, and if so, to get a list of the
2840values:
2841
2842    print join ", ", prop_values("NFC_Quick_Check");
2843  prints:
2844    M, N, Y
2845
2846If the property doesn't have such a restricted set, C<undef> is returned.
2847
2848There are usually several synonyms for each possible value.  Use
2849L</prop_value_aliases()> to access those.
2850
2851Case, white space, hyphens, and underscores are ignored in the input property
2852name (except for the trailing underscore in the old-form grandfathered-in
2853general category property value C<"L_">, which is better written as C<"LC">).
2854
2855If the property name is unknown, C<undef> is returned.  Note that Perl typically
2856recognizes property names in regular expressions with an optional C<"Is_>"
2857(with or without the underscore) prefixed to them, such as C<\p{isgc=punct}>.
2858This function does not recognize those in the property parameter, returning
2859C<undef>.
2860
2861For the block property, new-style block names are returned (see
2862L</Old-style versus new-style block names>).
2863
2864C<prop_values> does not know about any user-defined properties, and
2865will return C<undef> if called with one of those.
2866
2867=cut
2868
2869# These are created by mktables for this module and stored in unicore/UCD.pl
2870# where their structures are described.
2871our %loose_to_standard_value;
2872our %prop_value_aliases;
2873
2874sub prop_values ($) {
2875    my $prop = shift;
2876    return undef unless defined $prop;
2877
2878    require "unicore/UCD.pl";
2879
2880    # Find the property name synonym that's used as the key in other hashes,
2881    # which is element 0 in the returned list.
2882    ($prop) = prop_aliases($prop);
2883    return undef if ! $prop;
2884    $prop = loose_name(lc $prop);
2885
2886    # Here is a legal property.
2887    return undef unless exists $prop_value_aliases{$prop};
2888    my @return;
2889    foreach my $value_key (sort { lc $a cmp lc $b }
2890                            keys %{$prop_value_aliases{$prop}})
2891    {
2892        push @return, $prop_value_aliases{$prop}{$value_key}[0];
2893    }
2894    return @return;
2895}
2896
2897=pod
2898
2899=head2 B<prop_value_aliases()>
2900
2901    use Unicode::UCD 'prop_value_aliases';
2902
2903    my ($short_name, $full_name, @other_names)
2904                                   = prop_value_aliases("Gc", "Punct");
2905    my $same_full_name = prop_value_aliases("Gc", "P");   # Scalar cntxt
2906    my ($same_short_name) = prop_value_aliases("Gc", "P"); # gets 0th
2907                                                           # element
2908    print "The full name is $full_name\n";
2909    print "The short name is $short_name\n";
2910    print "The other aliases are: ", join(", ", @other_names), "\n";
2911
2912  prints:
2913    The full name is Punctuation
2914    The short name is P
2915    The other aliases are: Punct
2916
2917Some Unicode properties have a restricted set of legal values.  For example,
2918all binary properties are restricted to just C<true> or C<false>; and there
2919are only a few dozen possible General Categories.
2920
2921You can use L</prop_values()> to find out if a given property is one which has
2922a restricted set of values, and if so, what those values are.  But usually
2923each value actually has several synonyms.  For example, in Unicode binary
2924properties, I<truth> can be represented by any of the strings "Y", "Yes", "T",
2925or "True"; and the General Category "Punctuation" by that string, or "Punct",
2926or simply "P".
2927
2928Like property names, there is typically at least a short name for each such
2929property-value, and a long name.  If you know any name of the property-value
2930(which you can get by L</prop_values()>, you can use C<prop_value_aliases>()
2931to get the long name (when called in scalar context), or a list of all the
2932names, with the short name in the 0th element, the long name in the next
2933element, and any other synonyms in the remaining elements, in no particular
2934order, except that any all-numeric synonyms will be last.
2935
2936The long name is returned in a form nicely capitalized, suitable for printing.
2937
2938Case, white space, hyphens, and underscores are ignored in the input parameters
2939(except for the trailing underscore in the old-form grandfathered-in general
2940category property value C<"L_">, which is better written as C<"LC">).
2941
2942If either name is unknown, C<undef> is returned.  Note that Perl typically
2943recognizes property names in regular expressions with an optional C<"Is_>"
2944(with or without the underscore) prefixed to them, such as C<\p{isgc=punct}>.
2945This function does not recognize those in the property parameter, returning
2946C<undef>.
2947
2948If called with a property that doesn't have synonyms for its values, it
2949returns the input value, possibly normalized with capitalization and
2950underscores, but not necessarily checking that the input value is valid.
2951
2952For the block property, new-style block names are returned (see
2953L</Old-style versus new-style block names>).
2954
2955To find the synonyms for single-forms, such as C<\p{Any}>, use
2956L</prop_aliases()> instead.
2957
2958C<prop_value_aliases> does not know about any user-defined properties, and
2959will return C<undef> if called with one of those.
2960
2961=cut
2962
2963sub prop_value_aliases ($$) {
2964    my ($prop, $value) = @_;
2965    return unless defined $prop && defined $value;
2966
2967    require "unicore/UCD.pl";
2968
2969    # Find the property name synonym that's used as the key in other hashes,
2970    # which is element 0 in the returned list.
2971    ($prop) = prop_aliases($prop);
2972    return if ! $prop;
2973    $prop = loose_name(lc $prop);
2974
2975    # Here is a legal property, but the hash below (created by mktables for
2976    # this purpose) only knows about the properties that have a very finite
2977    # number of potential values, that is not ones whose value could be
2978    # anything, like most (if not all) string properties.  These don't have
2979    # synonyms anyway.  Simply return the input.  For example, there is no
2980    # synonym for ('Uppercase_Mapping', A').
2981    if (! exists $prop_value_aliases{$prop}) {
2982
2983        # Here, we have a legal property, but an unknown value.  Since the
2984        # property is legal, if it isn't in the prop_aliases hash, it must be
2985        # a Perl-extension All perl extensions are binary, hence are
2986        # enumerateds, which means that we know that the input unknown value
2987        # is illegal.
2988        return if ! exists $prop_aliases{$prop};
2989
2990        # Otherwise, we assume it's valid, as documented.
2991        return $value;
2992    }
2993
2994    # The value name may be loosely or strictly matched; we don't know yet.
2995    # But both types use lower-case.
2996    $value = lc $value;
2997
2998    # If the name isn't found under loose matching, it certainly won't be
2999    # found under strict
3000    my $loose_value = loose_name($value);
3001    return unless exists $loose_to_standard_value{"$prop=$loose_value"};
3002
3003    # Similarly if the combination under loose matching doesn't exist, it
3004    # won't exist under strict.
3005    my $standard_value = $loose_to_standard_value{"$prop=$loose_value"};
3006    return unless exists $prop_value_aliases{$prop}{$standard_value};
3007
3008    # Here we did find a combination under loose matching rules.  But it could
3009    # be that is a strict property match that shouldn't have matched.
3010    # %prop_value_aliases is set up so that the strict matches will appear as
3011    # if they were in loose form.  Thus, if the non-loose version is legal,
3012    # we're ok, can skip the further check.
3013    if (! exists $stricter_to_file_of{"$prop=$value"}
3014
3015        # We're also ok and skip the further check if value loosely matches.
3016        # mktables has verified that no strict name under loose rules maps to
3017        # an existing loose name.  This code relies on the very limited
3018        # circumstances that strict names can be here.  Strict name matching
3019        # happens under two conditions:
3020        # 1) when the name begins with an underscore.  But this function
3021        #    doesn't accept those, and %prop_value_aliases doesn't have
3022        #    them.
3023        # 2) When the values are numeric, in which case we need to look
3024        #    further, but their squeezed-out loose values will be in
3025        #    %stricter_to_file_of
3026        && exists $stricter_to_file_of{"$prop=$loose_value"})
3027    {
3028        # The only thing that's legal loosely under strict is that can have an
3029        # underscore between digit pairs XXX
3030        while ($value =~ s/(\d)_(\d)/$1$2/g) {}
3031        return unless exists $stricter_to_file_of{"$prop=$value"};
3032    }
3033
3034    # Here, we know that the combination exists.  Return it.
3035    my $list_ref = $prop_value_aliases{$prop}{$standard_value};
3036    if (@$list_ref > 1) {
3037        # The full name is in element 1.
3038        return $list_ref->[1] unless wantarray;
3039
3040        return @{_dclone $list_ref};
3041    }
3042
3043    return $list_ref->[0] unless wantarray;
3044
3045    # Only 1 element means that it repeats
3046    return ( $list_ref->[0], $list_ref->[0] );
3047}
3048
3049# All 1 bits but the top one is the largest possible IV.
3050$MAX_CP = (~0) >> 1;
3051
3052=pod
3053
3054=head2 B<prop_invlist()>
3055
3056C<prop_invlist> returns an inversion list (described below) that defines all the
3057code points for the binary Unicode property (or "property=value" pair) given
3058by the input parameter string:
3059
3060 use feature 'say';
3061 use Unicode::UCD 'prop_invlist';
3062 say join ", ", prop_invlist("Any");
3063
3064 prints:
3065 0, 1114112
3066
3067If the input is unknown C<undef> is returned in scalar context; an empty-list
3068in list context.  If the input is known, the number of elements in
3069the list is returned if called in scalar context.
3070
3071L<perluniprops|perluniprops/Properties accessible through \p{} and \P{}> gives
3072the list of properties that this function accepts, as well as all the possible
3073forms for them (including with the optional "Is_" prefixes).  (Except this
3074function doesn't accept any Perl-internal properties, some of which are listed
3075there.) This function uses the same loose or tighter matching rules for
3076resolving the input property's name as is done for regular expressions.  These
3077are also specified in L<perluniprops|perluniprops/Properties accessible
3078through \p{} and \P{}>.  Examples of using the "property=value" form are:
3079
3080 say join ", ", prop_invlist("Script_Extensions=Shavian");
3081
3082 prints:
3083 66640, 66688
3084
3085 say join ", ", prop_invlist("ASCII_Hex_Digit=No");
3086
3087 prints:
3088 0, 48, 58, 65, 71, 97, 103
3089
3090 say join ", ", prop_invlist("ASCII_Hex_Digit=Yes");
3091
3092 prints:
3093 48, 58, 65, 71, 97, 103
3094
3095Inversion lists are a compact way of specifying Unicode property-value
3096definitions.  The 0th item in the list is the lowest code point that has the
3097property-value.  The next item (item [1]) is the lowest code point beyond that
3098one that does NOT have the property-value.  And the next item beyond that
3099([2]) is the lowest code point beyond that one that does have the
3100property-value, and so on.  Put another way, each element in the list gives
3101the beginning of a range that has the property-value (for even numbered
3102elements), or doesn't have the property-value (for odd numbered elements).
3103The name for this data structure stems from the fact that each element in the
3104list toggles (or inverts) whether the corresponding range is or isn't on the
3105list.
3106
3107In the final example above, the first ASCII Hex digit is code point 48, the
3108character "0", and all code points from it through 57 (a "9") are ASCII hex
3109digits.  Code points 58 through 64 aren't, but 65 (an "A") through 70 (an "F")
3110are, as are 97 ("a") through 102 ("f").  103 starts a range of code points
3111that aren't ASCII hex digits.  That range extends to infinity, which on your
3112computer can be found in the variable C<$Unicode::UCD::MAX_CP>.  (This
3113variable is as close to infinity as Perl can get on your platform, and may be
3114too high for some operations to work; you may wish to use a smaller number for
3115your purposes.)
3116
3117Note that the inversion lists returned by this function can possibly include
3118non-Unicode code points, that is anything above 0x10FFFF.  Unicode properties
3119are not defined on such code points.  You might wish to change the output to
3120not include these.  Simply add 0x110000 at the end of the non-empty returned
3121list if it isn't already that value; and pop that value if it is; like:
3122
3123 my @list = prop_invlist("foo");
3124 if (@list) {
3125     if ($list[-1] == 0x110000) {
3126         pop @list;  # Defeat the turning on for above Unicode
3127     }
3128     else {
3129         push @list, 0x110000; # Turn off for above Unicode
3130     }
3131 }
3132
3133It is a simple matter to expand out an inversion list to a full list of all
3134code points that have the property-value:
3135
3136 my @invlist = prop_invlist($property_name);
3137 die "empty" unless @invlist;
3138 my @full_list;
3139 for (my $i = 0; $i < @invlist; $i += 2) {
3140    my $upper = ($i + 1) < @invlist
3141                ? $invlist[$i+1] - 1      # In range
3142                : $Unicode::UCD::MAX_CP;  # To infinity.
3143    for my $j ($invlist[$i] .. $upper) {
3144        push @full_list, $j;
3145    }
3146 }
3147
3148C<prop_invlist> does not know about any user-defined nor Perl internal-only
3149properties, and will return C<undef> if called with one of those.
3150
3151The L</search_invlist()> function is provided for finding a code point within
3152an inversion list.
3153
3154=cut
3155
3156# User-defined properties could be handled with some changes to SWASHNEW;
3157# and implementing here of dealing with EXTRAS.  If done, consideration should
3158# be given to the fact that the user subroutine could return different results
3159# with each call; security issues need to be thought about.
3160
3161# These are created by mktables for this routine and stored in unicore/UCD.pl
3162# where their structures are described.
3163our %loose_defaults;
3164our $MAX_UNICODE_CODEPOINT;
3165
3166sub prop_invlist ($;$) {
3167    my $prop = $_[0];
3168
3169    # Undocumented way to get at Perl internal properties; it may be changed
3170    # or removed without notice at any time.
3171    my $internal_ok = defined $_[1] && $_[1] eq '_perl_core_internal_ok';
3172
3173    return if ! defined $prop;
3174
3175    # Warnings for these are only for regexes, so not applicable to us
3176    no warnings 'deprecated';
3177
3178    # Get the swash definition of the property-value.
3179    my $swash = SWASHNEW(__PACKAGE__, $prop, undef, 1, 0);
3180
3181    # Fail if not found, or isn't a boolean property-value, or is a
3182    # user-defined property, or is internal-only.
3183    return if ! $swash
3184              || ref $swash eq ""
3185              || $swash->{'BITS'} != 1
3186              || $swash->{'USER_DEFINED'}
3187              || (! $internal_ok && $prop =~ /^\s*_/);
3188
3189    if ($swash->{'EXTRAS'}) {
3190        carp __PACKAGE__, "::prop_invlist: swash returned for $prop unexpectedly has EXTRAS magic";
3191        return;
3192    }
3193    if ($swash->{'SPECIALS'}) {
3194        carp __PACKAGE__, "::prop_invlist: swash returned for $prop unexpectedly has SPECIALS magic";
3195        return;
3196    }
3197
3198    my @invlist;
3199
3200    if ($swash->{'LIST'} =~ /^V/) {
3201
3202        # A 'V' as the first character marks the input as already an inversion
3203        # list, in which case, all we need to do is put the remaining lines
3204        # into our array.
3205        @invlist = split "\n", $swash->{'LIST'} =~ s/ \s* (?: \# .* )? $ //xmgr;
3206        shift @invlist;
3207    }
3208    else {
3209        # The input lines look like:
3210        # 0041\t005A   # [26]
3211        # 005F
3212
3213        # Split into lines, stripped of trailing comments
3214        foreach my $range (split "\n",
3215                              $swash->{'LIST'} =~ s/ \s* (?: \# .* )? $ //xmgr)
3216        {
3217            # And find the beginning and end of the range on the line
3218            my ($hex_begin, $hex_end) = split "\t", $range;
3219            my $begin = hex $hex_begin;
3220
3221            # If the new range merely extends the old, we remove the marker
3222            # created the last time through the loop for the old's end, which
3223            # causes the new one's end to be used instead.
3224            if (@invlist && $begin == $invlist[-1]) {
3225                pop @invlist;
3226            }
3227            else {
3228                # Add the beginning of the range
3229                push @invlist, $begin;
3230            }
3231
3232            if (defined $hex_end) { # The next item starts with the code point 1
3233                                    # beyond the end of the range.
3234                no warnings 'portable';
3235                my $end = hex $hex_end;
3236                last if $end == $MAX_CP;
3237                push @invlist, $end + 1;
3238            }
3239            else {  # No end of range, is a single code point.
3240                push @invlist, $begin + 1;
3241            }
3242        }
3243    }
3244
3245    # Could need to be inverted: add or subtract a 0 at the beginning of the
3246    # list.
3247    if ($swash->{'INVERT_IT'}) {
3248        if (@invlist && $invlist[0] == 0) {
3249            shift @invlist;
3250        }
3251        else {
3252            unshift @invlist, 0;
3253        }
3254    }
3255
3256    return @invlist;
3257}
3258
3259=pod
3260
3261=head2 B<prop_invmap()>
3262
3263 use Unicode::UCD 'prop_invmap';
3264 my ($list_ref, $map_ref, $format, $default)
3265                                      = prop_invmap("General Category");
3266
3267C<prop_invmap> is used to get the complete mapping definition for a property,
3268in the form of an inversion map.  An inversion map consists of two parallel
3269arrays.  One is an ordered list of code points that mark range beginnings, and
3270the other gives the value (or mapping) that all code points in the
3271corresponding range have.
3272
3273C<prop_invmap> is called with the name of the desired property.  The name is
3274loosely matched, meaning that differences in case, white-space, hyphens, and
3275underscores are not meaningful (except for the trailing underscore in the
3276old-form grandfathered-in property C<"L_">, which is better written as C<"LC">,
3277or even better, C<"Gc=LC">).
3278
3279Many Unicode properties have more than one name (or alias).  C<prop_invmap>
3280understands all of these, including Perl extensions to them.  Ambiguities are
3281resolved as described above for L</prop_aliases()> (except if a property has
3282both a complete mapping, and a binary C<Y>/C<N> mapping, then specifying the
3283property name prefixed by C<"is"> causes the binary one to be returned).  The
3284Perl internal property "Perl_Decimal_Digit, described below, is also accepted.
3285An empty list is returned if the property name is unknown.
3286See L<perluniprops/Properties accessible through Unicode::UCD> for the
3287properties acceptable as inputs to this function.
3288
3289It is a fatal error to call this function except in list context.
3290
3291In addition to the two arrays that form the inversion map, C<prop_invmap>
3292returns two other values; one is a scalar that gives some details as to the
3293format of the entries of the map array; the other is a default value, useful
3294in maps whose format name begins with the letter C<"a">, as described
3295L<below in its subsection|/a>; and for specialized purposes, such as
3296converting to another data structure, described at the end of this main
3297section.
3298
3299This means that C<prop_invmap> returns a 4 element list.  For example,
3300
3301 my ($blocks_ranges_ref, $blocks_maps_ref, $format, $default)
3302                                                 = prop_invmap("Block");
3303
3304In this call, the two arrays will be populated as shown below (for Unicode
33056.0):
3306
3307 Index  @blocks_ranges  @blocks_maps
3308   0        0x0000      Basic Latin
3309   1        0x0080      Latin-1 Supplement
3310   2        0x0100      Latin Extended-A
3311   3        0x0180      Latin Extended-B
3312   4        0x0250      IPA Extensions
3313   5        0x02B0      Spacing Modifier Letters
3314   6        0x0300      Combining Diacritical Marks
3315   7        0x0370      Greek and Coptic
3316   8        0x0400      Cyrillic
3317  ...
3318 233        0x2B820     No_Block
3319 234        0x2F800     CJK Compatibility Ideographs Supplement
3320 235        0x2FA20     No_Block
3321 236        0xE0000     Tags
3322 237        0xE0080     No_Block
3323 238        0xE0100     Variation Selectors Supplement
3324 239        0xE01F0     No_Block
3325 240        0xF0000     Supplementary Private Use Area-A
3326 241        0x100000    Supplementary Private Use Area-B
3327 242        0x110000    No_Block
3328
3329The first line (with Index [0]) means that the value for code point 0 is "Basic
3330Latin".  The entry "0x0080" in the @blocks_ranges column in the second line
3331means that the value from the first line, "Basic Latin", extends to all code
3332points in the range from 0 up to but not including 0x0080, that is, through
3333127.  In other words, the code points from 0 to 127 are all in the "Basic
3334Latin" block.  Similarly, all code points in the range from 0x0080 up to (but
3335not including) 0x0100 are in the block named "Latin-1 Supplement", etc.
3336(Notice that the return is the old-style block names; see L</Old-style versus
3337new-style block names>).
3338
3339The final line (with Index [242]) means that the value for all code points above
3340the legal Unicode maximum code point have the value "No_Block", which is the
3341term Unicode uses for a non-existing block.
3342
3343The arrays completely specify the mappings for all possible code points.
3344The final element in an inversion map returned by this function will always be
3345for the range that consists of all the code points that aren't legal Unicode,
3346but that are expressible on the platform.  (That is, it starts with code point
33470x110000, the first code point above the legal Unicode maximum, and extends to
3348infinity.) The value for that range will be the same that any typical
3349unassigned code point has for the specified property.  (Certain unassigned
3350code points are not "typical"; for example the non-character code points, or
3351those in blocks that are to be written right-to-left.  The above-Unicode
3352range's value is not based on these atypical code points.)  It could be argued
3353that, instead of treating these as unassigned Unicode code points, the value
3354for this range should be C<undef>.  If you wish, you can change the returned
3355arrays accordingly.
3356
3357The maps for almost all properties are simple scalars that should be
3358interpreted as-is.
3359These values are those given in the Unicode-supplied data files, which may be
3360inconsistent as to capitalization and as to which synonym for a property-value
3361is given.  The results may be normalized by using the L</prop_value_aliases()>
3362function.
3363
3364There are exceptions to the simple scalar maps.  Some properties have some
3365elements in their map list that are themselves lists of scalars; and some
3366special strings are returned that are not to be interpreted as-is.  Element
3367[2] (placed into C<$format> in the example above) of the returned four element
3368list tells you if the map has any of these special elements or not, as follows:
3369
3370=over
3371
3372=item B<C<s>>
3373
3374means all the elements of the map array are simple scalars, with no special
3375elements.  Almost all properties are like this, like the C<block> example
3376above.
3377
3378=item B<C<sl>>
3379
3380means that some of the map array elements have the form given by C<"s">, and
3381the rest are lists of scalars.  For example, here is a portion of the output
3382of calling C<prop_invmap>() with the "Script Extensions" property:
3383
3384 @scripts_ranges  @scripts_maps
3385      ...
3386      0x0953      Devanagari
3387      0x0964      [ Bengali, Devanagari, Gurumukhi, Oriya ]
3388      0x0966      Devanagari
3389      0x0970      Common
3390
3391Here, the code points 0x964 and 0x965 are both used in Bengali,
3392Devanagari, Gurmukhi, and Oriya, but no other scripts.
3393
3394The Name_Alias property is also of this form.  But each scalar consists of two
3395components:  1) the name, and 2) the type of alias this is.  They are
3396separated by a colon and a space.  In Unicode 6.1, there are several alias types:
3397
3398=over
3399
3400=item C<correction>
3401
3402indicates that the name is a corrected form for the
3403original name (which remains valid) for the same code point.
3404
3405=item C<control>
3406
3407adds a new name for a control character.
3408
3409=item C<alternate>
3410
3411is an alternate name for a character
3412
3413=item C<figment>
3414
3415is a name for a character that has been documented but was never in any
3416actual standard.
3417
3418=item C<abbreviation>
3419
3420is a common abbreviation for a character
3421
3422=back
3423
3424The lists are ordered (roughly) so the most preferred names come before less
3425preferred ones.
3426
3427For example,
3428
3429 @aliases_ranges        @alias_maps
3430    ...
3431    0x009E        [ 'PRIVACY MESSAGE: control', 'PM: abbreviation' ]
3432    0x009F        [ 'APPLICATION PROGRAM COMMAND: control',
3433                    'APC: abbreviation'
3434                  ]
3435    0x00A0        'NBSP: abbreviation'
3436    0x00A1        ""
3437    0x00AD        'SHY: abbreviation'
3438    0x00AE        ""
3439    0x01A2        'LATIN CAPITAL LETTER GHA: correction'
3440    0x01A3        'LATIN SMALL LETTER GHA: correction'
3441    0x01A4        ""
3442    ...
3443
3444A map to the empty string means that there is no alias defined for the code
3445point.
3446
3447=item B<C<a>>
3448
3449is like C<"s"> in that all the map array elements are scalars, but here they are
3450restricted to all being integers, and some have to be adjusted (hence the name
3451C<"a">) to get the correct result.  For example, in:
3452
3453 my ($uppers_ranges_ref, $uppers_maps_ref, $format, $default)
3454                          = prop_invmap("Simple_Uppercase_Mapping");
3455
3456the returned arrays look like this:
3457
3458 @$uppers_ranges_ref    @$uppers_maps_ref   Note
3459       0                      0
3460      97                     65          'a' maps to 'A', b => B ...
3461     123                      0
3462     181                    924          MICRO SIGN => Greek Cap MU
3463     182                      0
3464     ...
3465
3466and C<$default> is 0.
3467
3468Let's start with the second line.  It says that the uppercase of code point 97
3469is 65; or C<uc("a")> == "A".  But the line is for the entire range of code
3470points 97 through 122.  To get the mapping for any code point in this range,
3471you take the offset it has from the beginning code point of the range, and add
3472that to the mapping for that first code point.  So, the mapping for 122 ("z")
3473is derived by taking the offset of 122 from 97 (=25) and adding that to 65,
3474yielding 90 ("Z").  Likewise for everything in between.
3475
3476Requiring this simple adjustment allows the returned arrays to be
3477significantly smaller than otherwise, up to a factor of 10, speeding up
3478searching through them.
3479
3480Ranges that map to C<$default>, C<"0">, behave somewhat differently.  For
3481these, each code point maps to itself.  So, in the first line in the example,
3482S<C<ord(uc(chr(0)))>> is 0, S<C<ord(uc(chr(1)))>> is 1, ..
3483S<C<ord(uc(chr(96)))>> is 96.
3484
3485=item B<C<al>>
3486
3487means that some of the map array elements have the form given by C<"a">, and
3488the rest are ordered lists of code points.
3489For example, in:
3490
3491 my ($uppers_ranges_ref, $uppers_maps_ref, $format, $default)
3492                                 = prop_invmap("Uppercase_Mapping");
3493
3494the returned arrays look like this:
3495
3496 @$uppers_ranges_ref    @$uppers_maps_ref
3497       0                      0
3498      97                     65
3499     123                      0
3500     181                    924
3501     182                      0
3502     ...
3503    0x0149              [ 0x02BC 0x004E ]
3504    0x014A                    0
3505    0x014B                  330
3506     ...
3507
3508This is the full Uppercase_Mapping property (as opposed to the
3509Simple_Uppercase_Mapping given in the example for format C<"a">).  The only
3510difference between the two in the ranges shown is that the code point at
35110x0149 (LATIN SMALL LETTER N PRECEDED BY APOSTROPHE) maps to a string of two
3512characters, 0x02BC (MODIFIER LETTER APOSTROPHE) followed by 0x004E (LATIN
3513CAPITAL LETTER N).
3514
3515No adjustments are needed to entries that are references to arrays; each such
3516entry will have exactly one element in its range, so the offset is always 0.
3517
3518The fourth (index [3]) element (C<$default>) in the list returned for this
3519format is 0.
3520
3521=item B<C<ae>>
3522
3523This is like C<"a">, but some elements are the empty string, and should not be
3524adjusted.
3525The one internal Perl property accessible by C<prop_invmap> is of this type:
3526"Perl_Decimal_Digit" returns an inversion map which gives the numeric values
3527that are represented by the Unicode decimal digit characters.  Characters that
3528don't represent decimal digits map to the empty string, like so:
3529
3530 @digits    @values
3531 0x0000       ""
3532 0x0030        0
3533 0x003A:      ""
3534 0x0660:       0
3535 0x066A:      ""
3536 0x06F0:       0
3537 0x06FA:      ""
3538 0x07C0:       0
3539 0x07CA:      ""
3540 0x0966:       0
3541 ...
3542
3543This means that the code points from 0 to 0x2F do not represent decimal digits;
3544the code point 0x30 (DIGIT ZERO) represents 0;  code point 0x31, (DIGIT ONE),
3545represents 0+1-0 = 1; ... code point 0x39, (DIGIT NINE), represents 0+9-0 = 9;
3546... code points 0x3A through 0x65F do not represent decimal digits; 0x660
3547(ARABIC-INDIC DIGIT ZERO), represents 0; ... 0x07C1 (NKO DIGIT ONE),
3548represents 0+1-0 = 1 ...
3549
3550The fourth (index [3]) element (C<$default>) in the list returned for this
3551format is the empty string.
3552
3553=item B<C<ale>>
3554
3555is a combination of the C<"al"> type and the C<"ae"> type.  Some of
3556the map array elements have the forms given by C<"al">, and
3557the rest are the empty string.  The property C<NFKC_Casefold> has this form.
3558An example slice is:
3559
3560 @$ranges_ref  @$maps_ref         Note
3561    ...
3562   0x00AA       97                FEMININE ORDINAL INDICATOR => 'a'
3563   0x00AB        0
3564   0x00AD                         SOFT HYPHEN => ""
3565   0x00AE        0
3566   0x00AF     [ 0x0020, 0x0304 ]  MACRON => SPACE . COMBINING MACRON
3567   0x00B0        0
3568   ...
3569
3570The fourth (index [3]) element (C<$default>) in the list returned for this
3571format is 0.
3572
3573=item B<C<ar>>
3574
3575means that all the elements of the map array are either rational numbers or
3576the string C<"NaN">, meaning "Not a Number".  A rational number is either an
3577integer, or two integers separated by a solidus (C<"/">).  The second integer
3578represents the denominator of the division implied by the solidus, and is
3579actually always positive, so it is guaranteed not to be 0 and to not be
3580signed.  When the element is a plain integer (without the
3581solidus), it may need to be adjusted to get the correct value by adding the
3582offset, just as other C<"a"> properties.  No adjustment is needed for
3583fractions, as the range is guaranteed to have just a single element, and so
3584the offset is always 0.
3585
3586If you want to convert the returned map to entirely scalar numbers, you
3587can use something like this:
3588
3589 my ($invlist_ref, $invmap_ref, $format) = prop_invmap($property);
3590 if ($format && $format eq "ar") {
3591     map { $_ = eval $_ if $_ ne 'NaN' } @$map_ref;
3592 }
3593
3594Here's some entries from the output of the property "Nv", which has format
3595C<"ar">.
3596
3597 @numerics_ranges  @numerics_maps       Note
3598        0x00           "NaN"
3599        0x30             0           DIGIT 0 .. DIGIT 9
3600        0x3A           "NaN"
3601        0xB2             2           SUPERSCRIPTs 2 and 3
3602        0xB4           "NaN"
3603        0xB9             1           SUPERSCRIPT 1
3604        0xBA           "NaN"
3605        0xBC            1/4          VULGAR FRACTION 1/4
3606        0xBD            1/2          VULGAR FRACTION 1/2
3607        0xBE            3/4          VULGAR FRACTION 3/4
3608        0xBF           "NaN"
3609        0x660            0           ARABIC-INDIC DIGIT ZERO .. NINE
3610        0x66A          "NaN"
3611
3612The fourth (index [3]) element (C<$default>) in the list returned for this
3613format is C<"NaN">.
3614
3615=item B<C<n>>
3616
3617means the Name property.  All the elements of the map array are simple
3618scalars, but some of them contain special strings that require more work to
3619get the actual name.
3620
3621Entries such as:
3622
3623 CJK UNIFIED IDEOGRAPH-<code point>
3624
3625mean that the name for the code point is "CJK UNIFIED IDEOGRAPH-"
3626with the code point (expressed in hexadecimal) appended to it, like "CJK
3627UNIFIED IDEOGRAPH-3403" (similarly for S<C<CJK COMPATIBILITY IDEOGRAPH-E<lt>code
3628pointE<gt>>>).
3629
3630Also, entries like
3631
3632 <hangul syllable>
3633
3634means that the name is algorithmically calculated.  This is easily done by
3635the function L<charnames/charnames::viacode(code)>.
3636
3637Note that for control characters (C<Gc=cc>), Unicode's data files have the
3638string "C<E<lt>controlE<gt>>", but the real name of each of these characters is the empty
3639string.  This function returns that real name, the empty string.  (There are
3640names for these characters, but they are considered aliases, not the Name
3641property name, and are contained in the C<Name_Alias> property.)
3642
3643=item B<C<ad>>
3644
3645means the Decomposition_Mapping property.  This property is like C<"al">
3646properties, except that one of the scalar elements is of the form:
3647
3648 <hangul syllable>
3649
3650This signifies that this entry should be replaced by the decompositions for
3651all the code points whose decomposition is algorithmically calculated.  (All
3652of them are currently in one range and no others outside the range are likely
3653to ever be added to Unicode; the C<"n"> format
3654has this same entry.)  These can be generated via the function
3655L<Unicode::Normalize::NFD()|Unicode::Normalize>.
3656
3657Note that the mapping is the one that is specified in the Unicode data files,
3658and to get the final decomposition, it may need to be applied recursively.
3659Unicode in fact discourages use of this property except internally in
3660implementations of the Unicode Normalization Algorithm.
3661
3662The fourth (index [3]) element (C<$default>) in the list returned for this
3663format is 0.
3664
3665=back
3666
3667Note that a format begins with the letter "a" if and only the property it is
3668for requires adjustments by adding the offsets in multi-element ranges.  For
3669all these properties, an entry should be adjusted only if the map is a scalar
3670which is an integer.  That is, it must match the regular expression:
3671
3672    / ^ -? \d+ $ /xa
3673
3674Further, the first element in a range never needs adjustment, as the
3675adjustment would be just adding 0.
3676
3677A binary search such as that provided by L</search_invlist()>, can be used to
3678quickly find a code point in the inversion list, and hence its corresponding
3679mapping.
3680
3681The final, fourth element (index [3], assigned to C<$default> in the "block"
3682example) in the four element list returned by this function is used with the
3683C<"a"> format types; it may also be useful for applications
3684that wish to convert the returned inversion map data structure into some
3685other, such as a hash.  It gives the mapping that most code points map to
3686under the property.  If you establish the convention that any code point not
3687explicitly listed in your data structure maps to this value, you can
3688potentially make your data structure much smaller.  As you construct your data
3689structure from the one returned by this function, simply ignore those ranges
3690that map to this value.  For example, to
3691convert to the data structure searchable by L</charinrange()>, you can follow
3692this recipe for properties that don't require adjustments:
3693
3694 my ($list_ref, $map_ref, $format, $default) = prop_invmap($property);
3695 my @range_list;
3696
3697 # Look at each element in the list, but the -2 is needed because we
3698 # look at $i+1 in the loop, and the final element is guaranteed to map
3699 # to $default by prop_invmap(), so we would skip it anyway.
3700 for my $i (0 .. @$list_ref - 2) {
3701    next if $map_ref->[$i] eq $default;
3702    push @range_list, [ $list_ref->[$i],
3703                        $list_ref->[$i+1],
3704                        $map_ref->[$i]
3705                      ];
3706 }
3707
3708 print charinrange(\@range_list, $code_point), "\n";
3709
3710With this, C<charinrange()> will return C<undef> if its input code point maps
3711to C<$default>.  You can avoid this by omitting the C<next> statement, and adding
3712a line after the loop to handle the final element of the inversion map.
3713
3714Similarly, this recipe can be used for properties that do require adjustments:
3715
3716 for my $i (0 .. @$list_ref - 2) {
3717    next if $map_ref->[$i] eq $default;
3718
3719    # prop_invmap() guarantees that if the mapping is to an array, the
3720    # range has just one element, so no need to worry about adjustments.
3721    if (ref $map_ref->[$i]) {
3722        push @range_list,
3723                   [ $list_ref->[$i], $list_ref->[$i], $map_ref->[$i] ];
3724    }
3725    else {  # Otherwise each element is actually mapped to a separate
3726            # value, so the range has to be split into single code point
3727            # ranges.
3728
3729        my $adjustment = 0;
3730
3731        # For each code point that gets mapped to something...
3732        for my $j ($list_ref->[$i] .. $list_ref->[$i+1] -1 ) {
3733
3734            # ... add a range consisting of just it mapping to the
3735            # original plus the adjustment, which is incremented for the
3736            # next time through the loop, as the offset increases by 1
3737            # for each element in the range
3738            push @range_list,
3739                             [ $j, $j, $map_ref->[$i] + $adjustment++ ];
3740        }
3741    }
3742 }
3743
3744Note that the inversion maps returned for the C<Case_Folding> and
3745C<Simple_Case_Folding> properties do not include the Turkic-locale mappings.
3746Use L</casefold()> for these.
3747
3748C<prop_invmap> does not know about any user-defined properties, and will
3749return C<undef> if called with one of those.
3750
3751The returned values for the Perl extension properties, such as C<Any> and
3752C<Greek> are somewhat misleading.  The values are either C<"Y"> or C<"N>".
3753All Unicode properties are bipartite, so you can actually use the C<"Y"> or
3754C<"N>" in a Perl regular expression for these, like C<qr/\p{ID_Start=Y/}> or
3755C<qr/\p{Upper=N/}>.  But the Perl extensions aren't specified this way, only
3756like C</qr/\p{Any}>, I<etc>.  You can't actually use the C<"Y"> and C<"N>" in
3757them.
3758
3759=head3 Getting every available name
3760
3761Instead of reading the Unicode Database directly from files, as you were able
3762to do for a long time, you are encouraged to use the supplied functions. So,
3763instead of reading C<Name.pl> directly, which changed formats in 5.32, and may
3764do so again without notice in the future or even disappear, you ought to use
3765L</prop_invmap()> like this:
3766
3767  my (%name, %cp, %cps, $n);
3768  # All codepoints
3769  foreach my $cat (qw( Name Name_Alias )) {
3770      my ($codepoints, $names, $format, $default) = prop_invmap($cat);
3771      # $format => "n", $default => ""
3772      foreach my $i (0 .. @$codepoints - 2) {
3773          my ($cp, $n) = ($codepoints->[$i], $names->[$i]);
3774          # If $n is a ref, the same codepoint has multiple names
3775          foreach my $name (ref $n ? @$n : $n) {
3776              $name{$cp} //= $name;
3777              $cp{$name} //= $cp;
3778          }
3779      }
3780  }
3781  # Named sequences
3782  {   my %ns = namedseq();
3783      foreach my $name (sort { $ns{$a} cmp $ns{$b} } keys %ns) {
3784          $cp{$name} //= [ map { ord } split "" => $ns{$name} ];
3785      }
3786  }
3787
3788=cut
3789
3790# User-defined properties could be handled with some changes to SWASHNEW;
3791# if done, consideration should be given to the fact that the user subroutine
3792# could return different results with each call, which could lead to some
3793# security issues.
3794
3795# One could store things in memory so they don't have to be recalculated, but
3796# it is unlikely this will be called often, and some properties would take up
3797# significant memory.
3798
3799# These are created by mktables for this routine and stored in unicore/UCD.pl
3800# where their structures are described.
3801our @algorithmic_named_code_points;
3802our $HANGUL_BEGIN;
3803our $HANGUL_COUNT;
3804
3805sub prop_invmap ($;$) {
3806
3807    croak __PACKAGE__, "::prop_invmap: must be called in list context" unless wantarray;
3808
3809    my $prop = $_[0];
3810    return unless defined $prop;
3811
3812    # Undocumented way to get at Perl internal properties; it may be changed
3813    # or removed without notice at any time.  It currently also changes the
3814    # output to use the format specified in the file rather than the one we
3815    # normally compute and return
3816    my $internal_ok = defined $_[1] && $_[1] eq '_perl_core_internal_ok';
3817
3818    # Fail internal properties
3819    return if $prop =~ /^_/ && ! $internal_ok;
3820
3821    # The values returned by this function.
3822    my (@invlist, @invmap, $format, $missing);
3823
3824    # The swash has two components we look at, the base list, and a hash,
3825    # named 'SPECIALS', containing any additional members whose mappings don't
3826    # fit into the base list scheme of things.  These generally 'override'
3827    # any value in the base list for the same code point.
3828    my $overrides;
3829
3830    require "unicore/UCD.pl";
3831
3832RETRY:
3833
3834    # If there are multiple entries for a single code point
3835    my $has_multiples = 0;
3836
3837    # Try to get the map swash for the property.  They have 'To' prepended to
3838    # the property name, and 32 means we will accept 32 bit return values.
3839    # The 0 means we aren't calling this from tr///.
3840    my $swash = SWASHNEW(__PACKAGE__, "To$prop", undef, 32, 0);
3841
3842    # If didn't find it, could be because needs a proxy.  And if was the
3843    # 'Block' or 'Name' property, use a proxy even if did find it.  Finding it
3844    # in these cases would be the result of the installation changing mktables
3845    # to output the Block or Name tables.  The Block table gives block names
3846    # in the new-style, and this routine is supposed to return old-style block
3847    # names.  The Name table is valid, but we need to execute the special code
3848    # below to add in the algorithmic-defined name entries.
3849    # And NFKCCF needs conversion, so handle that here too.
3850    if (ref $swash eq ""
3851        || $swash->{'TYPE'} =~ / ^ To (?: Blk | Na | NFKCCF ) $ /x)
3852    {
3853
3854        # Get the short name of the input property, in standard form
3855        my ($second_try) = prop_aliases($prop);
3856        return unless $second_try;
3857        $second_try = loose_name(lc $second_try);
3858
3859        if ($second_try eq "in") {
3860
3861            # This property is identical to age for inversion map purposes
3862            $prop = "age";
3863            goto RETRY;
3864        }
3865        elsif ($second_try =~ / ^ s ( cf | fc | [ltu] c ) $ /x) {
3866
3867            # These properties use just the LIST part of the full mapping,
3868            # which includes the simple maps that are otherwise overridden by
3869            # the SPECIALS.  So all we need do is to not look at the SPECIALS;
3870            # set $overrides to indicate that
3871            $overrides = -1;
3872
3873            # The full name is the simple name stripped of its initial 's'
3874            $prop = $1;
3875
3876            # .. except for this case
3877            $prop = 'cf' if $prop eq 'fc';
3878
3879            goto RETRY;
3880        }
3881        elsif ($second_try eq "blk") {
3882
3883            # We use the old block names.  Just create a fake swash from its
3884            # data.
3885            _charblocks();
3886            my %blocks;
3887            $blocks{'LIST'} = "";
3888            $blocks{'TYPE'} = "ToBlk";
3889            $SwashInfo{ToBlk}{'missing'} = "No_Block";
3890            $SwashInfo{ToBlk}{'format'} = "s";
3891
3892            foreach my $block (@BLOCKS) {
3893                $blocks{'LIST'} .= sprintf "%x\t%x\t%s\n",
3894                                           $block->[0],
3895                                           $block->[1],
3896                                           $block->[2];
3897            }
3898            $swash = \%blocks;
3899        }
3900        elsif ($second_try eq "na") {
3901
3902            # Use the combo file that has all the Name-type properties in it,
3903            # extracting just the ones that are for the actual 'Name'
3904            # property.  And create a fake swash from it.
3905            my %names;
3906            $names{'LIST'} = "";
3907            my $original = do "unicore/Name.pl";
3908
3909            # Change the double \n format of the file back to single lines
3910            # with a tab
3911            $original =~ s/\n\n/\e/g;   # Use a control that shouldn't occur
3912                                        #in the file
3913            $original =~ s/\n/\t/g;
3914            $original =~ s/\e/\n/g;
3915
3916            my $algorithm_names = \@algorithmic_named_code_points;
3917
3918            # We need to remove the names from it that are aliases.  For that
3919            # we need to also read in that table.  Create a hash with the keys
3920            # being the code points, and the values being a list of the
3921            # aliases for the code point key.
3922            my ($aliases_code_points, $aliases_maps, undef, undef)
3923                  = &prop_invmap("_Perl_Name_Alias", '_perl_core_internal_ok');
3924            my %aliases;
3925            for (my $i = 0; $i < @$aliases_code_points; $i++) {
3926                my $code_point = $aliases_code_points->[$i];
3927                $aliases{$code_point} = $aliases_maps->[$i];
3928
3929                # If not already a list, make it into one, so that later we
3930                # can treat things uniformly
3931                if (! ref $aliases{$code_point}) {
3932                    $aliases{$code_point} = [ $aliases{$code_point} ];
3933                }
3934
3935                # Remove the alias type from the entry, retaining just the
3936                # name.
3937                map { s/:.*// } @{$aliases{$code_point}};
3938            }
3939
3940            my $i = 0;
3941            foreach my $line (split "\n", $original) {
3942                my ($hex_code_point, $name) = split "\t", $line;
3943
3944                # Weeds out any comments, blank lines, and named sequences
3945                next if $hex_code_point =~ /[^[:xdigit:]]/a;
3946
3947                my $code_point = hex $hex_code_point;
3948
3949                # The name of all controls is the default: the empty string.
3950                # The set of controls is immutable
3951                next if chr($code_point) =~ /[[:cntrl:]]/u;
3952
3953                # If this is a name_alias, it isn't a name
3954                next if grep { $_ eq $name } @{$aliases{$code_point}};
3955
3956                # If we are beyond where one of the special lines needs to
3957                # be inserted ...
3958                while ($i < @$algorithm_names
3959                    && $code_point > $algorithm_names->[$i]->{'low'})
3960                {
3961
3962                    # ... then insert it, ahead of what we were about to
3963                    # output
3964                    $names{'LIST'} .= sprintf "%x\t%x\t%s\n",
3965                                            $algorithm_names->[$i]->{'low'},
3966                                            $algorithm_names->[$i]->{'high'},
3967                                            $algorithm_names->[$i]->{'name'};
3968
3969                    # Done with this range.
3970                    $i++;
3971
3972                    # We loop until all special lines that precede the next
3973                    # regular one are output.
3974                }
3975
3976                # Here, is a normal name.
3977                $names{'LIST'} .= sprintf "%x\t\t%s\n", $code_point, $name;
3978            } # End of loop through all the names
3979
3980            $names{'TYPE'} = "ToNa";
3981            $SwashInfo{ToNa}{'missing'} = "";
3982            $SwashInfo{ToNa}{'format'} = "n";
3983            $swash = \%names;
3984        }
3985        elsif ($second_try =~ / ^ ( d [mt] ) $ /x) {
3986
3987            # The file is a combination of dt and dm properties.  Create a
3988            # fake swash from the portion that we want.
3989            my $original = do "unicore/Decomposition.pl";
3990            my %decomps;
3991
3992            if ($second_try eq 'dt') {
3993                $decomps{'TYPE'} = "ToDt";
3994                $SwashInfo{'ToDt'}{'missing'} = "None";
3995                $SwashInfo{'ToDt'}{'format'} = "s";
3996            }   # 'dm' is handled below, with 'nfkccf'
3997
3998            $decomps{'LIST'} = "";
3999
4000            # This property has one special range not in the file: for the
4001            # hangul syllables.  But not in Unicode version 1.
4002            UnicodeVersion() unless defined $v_unicode_version;
4003            my $done_hangul = ($v_unicode_version lt v2.0.0)
4004                              ? 1
4005                              : 0;    # Have we done the hangul range ?
4006            foreach my $line (split "\n", $original) {
4007                my ($hex_lower, $hex_upper, $type_and_map) = split "\t", $line;
4008                my $code_point = hex $hex_lower;
4009                my $value;
4010                my $redo = 0;
4011
4012                # The type, enclosed in <...>, precedes the mapping separated
4013                # by blanks
4014                if ($type_and_map =~ / ^ < ( .* ) > \s+ (.*) $ /x) {
4015                    $value = ($second_try eq 'dt') ? $1 : $2
4016                }
4017                else {  # If there is no type specified, it's canonical
4018                    $value = ($second_try eq 'dt')
4019                             ? "Canonical" :
4020                             $type_and_map;
4021                }
4022
4023                # Insert the hangul range at the appropriate spot.
4024                if (! $done_hangul && $code_point > $HANGUL_BEGIN) {
4025                    $done_hangul = 1;
4026                    $decomps{'LIST'} .=
4027                                sprintf "%x\t%x\t%s\n",
4028                                        $HANGUL_BEGIN,
4029                                        $HANGUL_BEGIN + $HANGUL_COUNT - 1,
4030                                        ($second_try eq 'dt')
4031                                        ? "Canonical"
4032                                        : "<hangul syllable>";
4033                }
4034
4035                if ($value =~ / / && $hex_upper ne "" && $hex_upper ne $hex_lower) {
4036                    $line = sprintf("%04X\t%s\t%s", hex($hex_lower) + 1, $hex_upper, $value);
4037                    $hex_upper = "";
4038                    $redo = 1;
4039                }
4040
4041                # And append this to our constructed LIST.
4042                $decomps{'LIST'} .= "$hex_lower\t$hex_upper\t$value\n";
4043
4044                redo if $redo;
4045            }
4046            $swash = \%decomps;
4047        }
4048        elsif ($second_try ne 'nfkccf') { # Don't know this property. Fail.
4049            return;
4050        }
4051
4052        if ($second_try eq 'nfkccf' || $second_try eq 'dm') {
4053
4054            # The 'nfkccf' property is stored in the old format for backwards
4055            # compatibility for any applications that has read its file
4056            # directly before prop_invmap() existed.
4057            # And the code above has extracted the 'dm' property from its file
4058            # yielding the same format.  So here we convert them to adjusted
4059            # format for compatibility with the other properties similar to
4060            # them.
4061            my %revised_swash;
4062
4063            # We construct a new converted list.
4064            my $list = "";
4065
4066            my @ranges = split "\n", $swash->{'LIST'};
4067            for (my $i = 0; $i < @ranges; $i++) {
4068                my ($hex_begin, $hex_end, $map) = split "\t", $ranges[$i];
4069
4070                # The dm property has maps that are space separated sequences
4071                # of code points, as well as the special entry "<hangul
4072                # syllable>, which also contains a blank.
4073                my @map = split " ", $map;
4074                if (@map > 1) {
4075
4076                    # If it's just the special entry, append as-is.
4077                    if ($map eq '<hangul syllable>') {
4078                        $list .= "$ranges[$i]\n";
4079                    }
4080                    else {
4081
4082                        # These should all be single-element ranges.
4083                        croak __PACKAGE__, "::prop_invmap: Not expecting a mapping with multiple code points in a multi-element range, $ranges[$i]" if $hex_end ne "" && $hex_end ne $hex_begin;
4084
4085                        # Convert them to decimal, as that's what's expected.
4086                        $list .= "$hex_begin\t\t"
4087                            . join(" ", map { hex } @map)
4088                            . "\n";
4089                    }
4090                    next;
4091                }
4092
4093                # Here, the mapping doesn't have a blank, is for a single code
4094                # point.
4095                my $begin = hex $hex_begin;
4096                my $end = (defined $hex_end && $hex_end ne "")
4097                        ? hex $hex_end
4098                        : $begin;
4099
4100                # Again, the output is to be in decimal.
4101                my $decimal_map = hex $map;
4102
4103                # We know that multi-element ranges with the same mapping
4104                # should not be adjusted, as after the adjustment
4105                # multi-element ranges are for consecutive increasing code
4106                # points.  Further, the final element in the list won't be
4107                # adjusted, as there is nothing after it to include in the
4108                # adjustment
4109                if ($begin != $end || $i == @ranges -1) {
4110
4111                    # So just convert these to single-element ranges
4112                    foreach my $code_point ($begin .. $end) {
4113                        $list .= sprintf("%04X\t\t%d\n",
4114                                        $code_point, $decimal_map);
4115                    }
4116                }
4117                else {
4118
4119                    # Here, we have a candidate for adjusting.  What we do is
4120                    # look through the subsequent adjacent elements in the
4121                    # input.  If the map to the next one differs by 1 from the
4122                    # one before, then we combine into a larger range with the
4123                    # initial map.  Loop doing this until we find one that
4124                    # can't be combined.
4125
4126                    my $offset = 0;     # How far away are we from the initial
4127                                        # map
4128                    my $squished = 0;   # ? Did we squish at least two
4129                                        # elements together into one range
4130                    for ( ; $i < @ranges; $i++) {
4131                        my ($next_hex_begin, $next_hex_end, $next_map)
4132                                                = split "\t", $ranges[$i+1];
4133
4134                        # In the case of 'dm', the map may be a sequence of
4135                        # multiple code points, which are never combined with
4136                        # another range
4137                        last if $next_map =~ / /;
4138
4139                        $offset++;
4140                        my $next_decimal_map = hex $next_map;
4141
4142                        # If the next map is not next in sequence, it
4143                        # shouldn't be combined.
4144                        last if $next_decimal_map != $decimal_map + $offset;
4145
4146                        my $next_begin = hex $next_hex_begin;
4147
4148                        # Likewise, if the next element isn't adjacent to the
4149                        # previous one, it shouldn't be combined.
4150                        last if $next_begin != $begin + $offset;
4151
4152                        my $next_end = (defined $next_hex_end
4153                                        && $next_hex_end ne "")
4154                                            ? hex $next_hex_end
4155                                            : $next_begin;
4156
4157                        # And finally, if the next element is a multi-element
4158                        # range, it shouldn't be combined.
4159                        last if $next_end != $next_begin;
4160
4161                        # Here, we will combine.  Loop to see if we should
4162                        # combine the next element too.
4163                        $squished = 1;
4164                    }
4165
4166                    if ($squished) {
4167
4168                        # Here, 'i' is the element number of the last element to
4169                        # be combined, and the range is single-element, or we
4170                        # wouldn't be combining.  Get it's code point.
4171                        my ($hex_end, undef, undef) = split "\t", $ranges[$i];
4172                        $list .= "$hex_begin\t$hex_end\t$decimal_map\n";
4173                    } else {
4174
4175                        # Here, no combining done.  Just append the initial
4176                        # (and current) values.
4177                        $list .= "$hex_begin\t\t$decimal_map\n";
4178                    }
4179                }
4180            } # End of loop constructing the converted list
4181
4182            # Finish up the data structure for our converted swash
4183            my $type = ($second_try eq 'nfkccf') ? 'ToNFKCCF' : 'ToDm';
4184            $revised_swash{'LIST'} = $list;
4185            $revised_swash{'TYPE'} = $type;
4186            $revised_swash{'SPECIALS'} = $swash->{'SPECIALS'};
4187            $swash = \%revised_swash;
4188
4189            $SwashInfo{$type}{'missing'} = 0;
4190            $SwashInfo{$type}{'format'} = 'a';
4191        }
4192    }
4193
4194    if ($swash->{'EXTRAS'}) {
4195        carp __PACKAGE__, "::prop_invmap: swash returned for $prop unexpectedly has EXTRAS magic";
4196        return;
4197    }
4198
4199    # Here, have a valid swash return.  Examine it.
4200    my $returned_prop = $swash->{'TYPE'};
4201
4202    # All properties but binary ones should have 'missing' and 'format'
4203    # entries
4204    $missing = $SwashInfo{$returned_prop}{'missing'};
4205    $missing = 'N' unless defined $missing;
4206
4207    $format = $SwashInfo{$returned_prop}{'format'};
4208    $format = 'b' unless defined $format;
4209
4210    my $requires_adjustment = $format =~ /^a/;
4211
4212    if ($swash->{'LIST'} =~ /^V/) {
4213        @invlist = split "\n", $swash->{'LIST'} =~ s/ \s* (?: \# .* )? $ //xmgr;
4214
4215        shift @invlist;     # Get rid of 'V';
4216
4217        # Could need to be inverted: add or subtract a 0 at the beginning of
4218        # the list.
4219        if ($swash->{'INVERT_IT'}) {
4220            if (@invlist && $invlist[0] == 0) {
4221                shift @invlist;
4222            }
4223            else {
4224                unshift @invlist, 0;
4225            }
4226        }
4227
4228        if (@invlist) {
4229            foreach my $i (0 .. @invlist - 1) {
4230                $invmap[$i] = ($i % 2 == 0) ? 'Y' : 'N'
4231            }
4232
4233            # The map includes lines for all code points; add one for the range
4234            # from 0 to the first Y.
4235            if ($invlist[0] != 0) {
4236                unshift @invlist, 0;
4237                unshift @invmap, 'N';
4238            }
4239        }
4240    }
4241    else {
4242        if ($swash->{'INVERT_IT'}) {
4243            croak __PACKAGE__, ":prop_invmap: Don't know how to deal with inverted";
4244        }
4245
4246        # The LIST input lines look like:
4247        # ...
4248        # 0374\t\tCommon
4249        # 0375\t0377\tGreek   # [3]
4250        # 037A\t037D\tGreek   # [4]
4251        # 037E\t\tCommon
4252        # 0384\t\tGreek
4253        # ...
4254        #
4255        # Convert them to like
4256        # 0374 => Common
4257        # 0375 => Greek
4258        # 0378 => $missing
4259        # 037A => Greek
4260        # 037E => Common
4261        # 037F => $missing
4262        # 0384 => Greek
4263        #
4264        # For binary properties, the final non-comment column is absent, and
4265        # assumed to be 'Y'.
4266
4267        foreach my $range (split "\n", $swash->{'LIST'}) {
4268            $range =~ s/ \s* (?: \# .* )? $ //xg; # rmv trailing space, comments
4269
4270            # Find the beginning and end of the range on the line
4271            my ($hex_begin, $hex_end, $map) = split "\t", $range;
4272            my $begin = hex $hex_begin;
4273            no warnings 'portable';
4274            my $end = (defined $hex_end && $hex_end ne "")
4275                    ? hex $hex_end
4276                    : $begin;
4277
4278            # Each time through the loop (after the first):
4279            # $invlist[-2] contains the beginning of the previous range processed
4280            # $invlist[-1] contains the end+1 of the previous range processed
4281            # $invmap[-2] contains the value of the previous range processed
4282            # $invmap[-1] contains the default value for missing ranges
4283            #                                                       ($missing)
4284            #
4285            # Thus, things are set up for the typical case of a new
4286            # non-adjacent range of non-missings to be added.  But, if the new
4287            # range is adjacent, it needs to replace the [-1] element; and if
4288            # the new range is a multiple value of the previous one, it needs
4289            # to be added to the [-2] map element.
4290
4291            # The first time through, everything will be empty.  If the
4292            # property doesn't have a range that begins at 0, add one that
4293            # maps to $missing
4294            if (! @invlist) {
4295                if ($begin != 0) {
4296                    push @invlist, 0;
4297                    push @invmap, $missing;
4298                }
4299            }
4300            elsif (@invlist > 1 && $invlist[-2] == $begin) {
4301
4302                # Here we handle the case where the input has multiple entries
4303                # for each code point.  mktables should have made sure that
4304                # each such range contains only one code point.  At this
4305                # point, $invlist[-1] is the $missing that was added at the
4306                # end of the last loop iteration, and [-2] is the last real
4307                # input code point, and that code point is the same as the one
4308                # we are adding now, making the new one a multiple entry.  Add
4309                # it to the existing entry, either by pushing it to the
4310                # existing list of multiple entries, or converting the single
4311                # current entry into a list with both on it.  This is all we
4312                # need do for this iteration.
4313
4314                if ($end != $begin) {
4315                    croak __PACKAGE__, ":prop_invmap: Multiple maps per code point in '$prop' require single-element ranges: begin=$begin, end=$end, map=$map";
4316                }
4317                if (! ref $invmap[-2]) {
4318                    $invmap[-2] = [ $invmap[-2], $map ];
4319                }
4320                else {
4321                    push @{$invmap[-2]}, $map;
4322                }
4323                $has_multiples = 1;
4324                next;
4325            }
4326            elsif ($invlist[-1] == $begin) {
4327
4328                # If the input isn't in the most compact form, so that there
4329                # are two adjacent ranges that map to the same thing, they
4330                # should be combined (EXCEPT where the arrays require
4331                # adjustments, in which case everything is already set up
4332                # correctly).  This happens in our constructed dt mapping, as
4333                # Element [-2] is the map for the latest range so far
4334                # processed.  Just set the beginning point of the map to
4335                # $missing (in invlist[-1]) to 1 beyond where this range ends.
4336                # For example, in
4337                # 12\t13\tXYZ
4338                # 14\t17\tXYZ
4339                # we have set it up so that it looks like
4340                # 12 => XYZ
4341                # 14 => $missing
4342                #
4343                # We now see that it should be
4344                # 12 => XYZ
4345                # 18 => $missing
4346                if (! $requires_adjustment && @invlist > 1 && ( (defined $map)
4347                                    ? $invmap[-2] eq $map
4348                                    : $invmap[-2] eq 'Y'))
4349                {
4350                    $invlist[-1] = $end + 1;
4351                    next;
4352                }
4353
4354                # Here, the range started in the previous iteration that maps
4355                # to $missing starts at the same code point as this range.
4356                # That means there is no gap to fill that that range was
4357                # intended for, so we just pop it off the parallel arrays.
4358                pop @invlist;
4359                pop @invmap;
4360            }
4361
4362            # Add the range beginning, and the range's map.
4363            push @invlist, $begin;
4364            if ($returned_prop eq 'ToDm') {
4365
4366                # The decomposition maps are either a line like <hangul
4367                # syllable> which are to be taken as is; or a sequence of code
4368                # points in hex and separated by blanks.  Convert them to
4369                # decimal, and if there is more than one, use an anonymous
4370                # array as the map.
4371                if ($map =~ /^ < /x) {
4372                    push @invmap, $map;
4373                }
4374                else {
4375                    my @map = split " ", $map;
4376                    if (@map == 1) {
4377                        push @invmap, $map[0];
4378                    }
4379                    else {
4380                        push @invmap, \@map;
4381                    }
4382                }
4383            }
4384            else {
4385
4386                # Otherwise, convert hex formatted list entries to decimal;
4387                # add a 'Y' map for the missing value in binary properties, or
4388                # otherwise, use the input map unchanged.
4389                $map = ($format eq 'x' || $format eq 'ax')
4390                    ? hex $map
4391                    : $format eq 'b'
4392                    ? 'Y'
4393                    : $map;
4394                push @invmap, $map;
4395            }
4396
4397            # We just started a range.  It ends with $end.  The gap between it
4398            # and the next element in the list must be filled with a range
4399            # that maps to the default value.  If there is no gap, the next
4400            # iteration will pop this, unless there is no next iteration, and
4401            # we have filled all of the Unicode code space, so check for that
4402            # and skip.
4403            if ($end < $MAX_CP) {
4404                push @invlist, $end + 1;
4405                push @invmap, $missing;
4406            }
4407        }
4408    }
4409
4410    # If the property is empty, make all code points use the value for missing
4411    # ones.
4412    if (! @invlist) {
4413        push @invlist, 0;
4414        push @invmap, $missing;
4415    }
4416
4417    # The final element is always for just the above-Unicode code points.  If
4418    # not already there, add it.  It merely splits the current final range
4419    # that extends to infinity into two elements, each with the same map.
4420    # (This is to conform with the API that says the final element is for
4421    # $MAX_UNICODE_CODEPOINT + 1 .. INFINITY.)
4422    if ($invlist[-1] != $MAX_UNICODE_CODEPOINT + 1) {
4423        push @invmap, $invmap[-1];
4424        push @invlist, $MAX_UNICODE_CODEPOINT + 1;
4425    }
4426
4427    # The second component of the map are those values that require
4428    # non-standard specification, stored in SPECIALS.  These override any
4429    # duplicate code points in LIST.  If we are using a proxy, we may have
4430    # already set $overrides based on the proxy.
4431    $overrides = $swash->{'SPECIALS'} unless defined $overrides;
4432    if ($overrides) {
4433
4434        # A negative $overrides implies that the SPECIALS should be ignored,
4435        # and a simple 'a' list is the value.
4436        if ($overrides < 0) {
4437            $format = 'a';
4438        }
4439        else {
4440
4441            # Currently, all overrides are for properties that normally map to
4442            # single code points, but now some will map to lists of code
4443            # points (but there is an exception case handled below).
4444            $format = 'al';
4445
4446            # Look through the overrides.
4447            foreach my $cp_maybe_utf8 (keys %$overrides) {
4448                my $cp;
4449                my @map;
4450
4451                # If the overrides came from SPECIALS, the code point keys are
4452                # packed UTF-8.
4453                if ($overrides == $swash->{'SPECIALS'}) {
4454                    $cp = $cp_maybe_utf8;
4455                    if (! utf8::decode($cp)) {
4456                        croak __PACKAGE__, "::prop_invmap: Malformed UTF-8: ",
4457                              map { sprintf("\\x{%02X}", unpack("C", $_)) }
4458                                                                split "", $cp;
4459                    }
4460
4461                    $cp = unpack("W", $cp);
4462                    @map = unpack "W*", $swash->{'SPECIALS'}{$cp_maybe_utf8};
4463
4464                    # The empty string will show up unpacked as an empty
4465                    # array.
4466                    $format = 'ale' if @map == 0;
4467                }
4468                else {
4469
4470                    # But if we generated the overrides, we didn't bother to
4471                    # pack them, and we, so far, do this only for properties
4472                    # that are 'a' ones.
4473                    $cp = $cp_maybe_utf8;
4474                    @map = hex $overrides->{$cp};
4475                    $format = 'a';
4476                }
4477
4478                # Find the range that the override applies to.
4479                my $i = search_invlist(\@invlist, $cp);
4480                if ($cp < $invlist[$i] || $cp >= $invlist[$i + 1]) {
4481                    croak __PACKAGE__, "::prop_invmap: wrong_range, cp=$cp; i=$i, current=$invlist[$i]; next=$invlist[$i + 1]"
4482                }
4483
4484                # And what that range currently maps to
4485                my $cur_map = $invmap[$i];
4486
4487                # If there is a gap between the next range and the code point
4488                # we are overriding, we have to add elements to both arrays to
4489                # fill that gap, using the map that applies to it, which is
4490                # $cur_map, since it is part of the current range.
4491                if ($invlist[$i + 1] > $cp + 1) {
4492                    #use feature 'say';
4493                    #say "Before splice:";
4494                    #say 'i-2=[', $i-2, ']', sprintf("%04X maps to %s", $invlist[$i-2], $invmap[$i-2]) if $i >= 2;
4495                    #say 'i-1=[', $i-1, ']', sprintf("%04X maps to %s", $invlist[$i-1], $invmap[$i-1]) if $i >= 1;
4496                    #say 'i  =[', $i, ']', sprintf("%04X maps to %s", $invlist[$i], $invmap[$i]);
4497                    #say 'i+1=[', $i+1, ']', sprintf("%04X maps to %s", $invlist[$i+1], $invmap[$i+1]) if $i < @invlist + 1;
4498                    #say 'i+2=[', $i+2, ']', sprintf("%04X maps to %s", $invlist[$i+2], $invmap[$i+2]) if $i < @invlist + 2;
4499
4500                    splice @invlist, $i + 1, 0, $cp + 1;
4501                    splice @invmap, $i + 1, 0, $cur_map;
4502
4503                    #say "After splice:";
4504                    #say 'i-2=[', $i-2, ']', sprintf("%04X maps to %s", $invlist[$i-2], $invmap[$i-2]) if $i >= 2;
4505                    #say 'i-1=[', $i-1, ']', sprintf("%04X maps to %s", $invlist[$i-1], $invmap[$i-1]) if $i >= 1;
4506                    #say 'i  =[', $i, ']', sprintf("%04X maps to %s", $invlist[$i], $invmap[$i]);
4507                    #say 'i+1=[', $i+1, ']', sprintf("%04X maps to %s", $invlist[$i+1], $invmap[$i+1]) if $i < @invlist + 1;
4508                    #say 'i+2=[', $i+2, ']', sprintf("%04X maps to %s", $invlist[$i+2], $invmap[$i+2]) if $i < @invlist + 2;
4509                }
4510
4511                # If the remaining portion of the range is multiple code
4512                # points (ending with the one we are replacing, guaranteed by
4513                # the earlier splice).  We must split it into two
4514                if ($invlist[$i] < $cp) {
4515                    $i++;   # Compensate for the new element
4516
4517                    #use feature 'say';
4518                    #say "Before splice:";
4519                    #say 'i-2=[', $i-2, ']', sprintf("%04X maps to %s", $invlist[$i-2], $invmap[$i-2]) if $i >= 2;
4520                    #say 'i-1=[', $i-1, ']', sprintf("%04X maps to %s", $invlist[$i-1], $invmap[$i-1]) if $i >= 1;
4521                    #say 'i  =[', $i, ']', sprintf("%04X maps to %s", $invlist[$i], $invmap[$i]);
4522                    #say 'i+1=[', $i+1, ']', sprintf("%04X maps to %s", $invlist[$i+1], $invmap[$i+1]) if $i < @invlist + 1;
4523                    #say 'i+2=[', $i+2, ']', sprintf("%04X maps to %s", $invlist[$i+2], $invmap[$i+2]) if $i < @invlist + 2;
4524
4525                    splice @invlist, $i, 0, $cp;
4526                    splice @invmap, $i, 0, 'dummy';
4527
4528                    #say "After splice:";
4529                    #say 'i-2=[', $i-2, ']', sprintf("%04X maps to %s", $invlist[$i-2], $invmap[$i-2]) if $i >= 2;
4530                    #say 'i-1=[', $i-1, ']', sprintf("%04X maps to %s", $invlist[$i-1], $invmap[$i-1]) if $i >= 1;
4531                    #say 'i  =[', $i, ']', sprintf("%04X maps to %s", $invlist[$i], $invmap[$i]);
4532                    #say 'i+1=[', $i+1, ']', sprintf("%04X maps to %s", $invlist[$i+1], $invmap[$i+1]) if $i < @invlist + 1;
4533                    #say 'i+2=[', $i+2, ']', sprintf("%04X maps to %s", $invlist[$i+2], $invmap[$i+2]) if $i < @invlist + 2;
4534                }
4535
4536                # Here, the range we are overriding contains a single code
4537                # point.  The result could be the empty string, a single
4538                # value, or a list.  If the last case, we use an anonymous
4539                # array.
4540                $invmap[$i] = (scalar @map == 0)
4541                               ? ""
4542                               : (scalar @map > 1)
4543                                  ? \@map
4544                                  : $map[0];
4545            }
4546        }
4547    }
4548    elsif ($format eq 'x') {
4549
4550        # All hex-valued properties are really to code points, and have been
4551        # converted to decimal.
4552        $format = 's';
4553    }
4554    elsif ($returned_prop eq 'ToDm') {
4555        $format = 'ad';
4556    }
4557    elsif ($format eq 'sw') { # blank-separated elements to form a list.
4558        map { $_ = [ split " ", $_  ] if $_ =~ / / } @invmap;
4559        $format = 'sl';
4560    }
4561    elsif ($returned_prop =~ / To ( _Perl )? NameAlias/x) {
4562
4563        # This property currently doesn't have any lists, but theoretically
4564        # could
4565        $format = 'sl';
4566    }
4567    elsif ($returned_prop eq 'ToPerlDecimalDigit') {
4568        $format = 'ae';
4569    }
4570    elsif ($returned_prop eq 'ToNv') {
4571
4572        # The one property that has this format is stored as a delta, so needs
4573        # to indicate that need to add code point to it.
4574        $format = 'ar';
4575    }
4576    elsif ($format eq 'ax') {
4577
4578        # Normally 'ax' properties have overrides, and will have been handled
4579        # above, but if not, they still need adjustment, and the hex values
4580        # have already been converted to decimal
4581        $format = 'a';
4582    }
4583    elsif ($format ne 'n' && $format !~ / ^ a /x) {
4584
4585        # All others are simple scalars
4586        $format = 's';
4587    }
4588    if ($has_multiples &&  $format !~ /l/) {
4589	croak __PACKAGE__, "::prop_invmap: Wrong format '$format' for prop_invmap('$prop'); should indicate has lists";
4590    }
4591
4592    return (\@invlist, \@invmap, $format, $missing);
4593}
4594
4595sub search_invlist {
4596
4597=pod
4598
4599=head2 B<search_invlist()>
4600
4601 use Unicode::UCD qw(prop_invmap prop_invlist);
4602 use Unicode::UCD 'search_invlist';
4603
4604 my @invlist = prop_invlist($property_name);
4605 print $code_point, ((search_invlist(\@invlist, $code_point) // -1) % 2)
4606                     ? " isn't"
4607                     : " is",
4608     " in $property_name\n";
4609
4610 my ($blocks_ranges_ref, $blocks_map_ref) = prop_invmap("Block");
4611 my $index = search_invlist($blocks_ranges_ref, $code_point);
4612 print "$code_point is in block ", $blocks_map_ref->[$index], "\n";
4613
4614C<search_invlist> is used to search an inversion list returned by
4615C<prop_invlist> or C<prop_invmap> for a particular L</code point argument>.
4616C<undef> is returned if the code point is not found in the inversion list
4617(this happens only when it is not a legal L</code point argument>, or is less
4618than the list's first element).  A warning is raised in the first instance.
4619
4620Otherwise, it returns the index into the list of the range that contains the
4621code point.; that is, find C<i> such that
4622
4623    list[i]<= code_point < list[i+1].
4624
4625As explained in L</prop_invlist()>, whether a code point is in the list or not
4626depends on if the index is even (in) or odd (not in).  And as explained in
4627L</prop_invmap()>, the index is used with the returned parallel array to find
4628the mapping.
4629
4630=cut
4631
4632
4633    my $list_ref = shift;
4634    my $input_code_point = shift;
4635    my $code_point = _getcode($input_code_point);
4636
4637    if (! defined $code_point) {
4638        carp __PACKAGE__, "::search_invlist: unknown code '$input_code_point'";
4639        return;
4640    }
4641
4642    my $max_element = @$list_ref - 1;
4643
4644    # Return undef if list is empty or requested item is before the first element.
4645    return if $max_element < 0;
4646    return if $code_point < $list_ref->[0];
4647
4648    # Short cut something at the far-end of the table.  This also allows us to
4649    # refer to element [$i+1] without fear of being out-of-bounds in the loop
4650    # below.
4651    return $max_element if $code_point >= $list_ref->[$max_element];
4652
4653    use integer;        # want integer division
4654
4655    my $i = $max_element / 2;
4656
4657    my $lower = 0;
4658    my $upper = $max_element;
4659    while (1) {
4660
4661        if ($code_point >= $list_ref->[$i]) {
4662
4663            # Here we have met the lower constraint.  We can quit if we
4664            # also meet the upper one.
4665            last if $code_point < $list_ref->[$i+1];
4666
4667            $lower = $i;        # Still too low.
4668
4669        }
4670        else {
4671
4672            # Here, $code_point < $list_ref[$i], so look lower down.
4673            $upper = $i;
4674        }
4675
4676        # Split search domain in half to try again.
4677        my $temp = ($upper + $lower) / 2;
4678
4679        # No point in continuing unless $i changes for next time
4680        # in the loop.
4681        return $i if $temp == $i;
4682        $i = $temp;
4683    } # End of while loop
4684
4685    # Here we have found the offset
4686    return $i;
4687}
4688
4689=head2 Unicode::UCD::UnicodeVersion
4690
4691This returns the version of the Unicode Character Database, in other words, the
4692version of the Unicode standard the database implements.  The version is a
4693string of numbers delimited by dots (C<'.'>).
4694
4695=cut
4696
4697my $UNICODEVERSION;
4698
4699sub UnicodeVersion {
4700    unless (defined $UNICODEVERSION) {
4701	my $versionfh = openunicode("version");
4702	local $/ = "\n";
4703	chomp($UNICODEVERSION = <$versionfh>);
4704	croak __PACKAGE__, "::VERSION: strange version '$UNICODEVERSION'"
4705	    unless $UNICODEVERSION =~ /^\d+(?:\.\d+)+$/;
4706    }
4707    $v_unicode_version = pack "C*", split /\./, $UNICODEVERSION;
4708    return $UNICODEVERSION;
4709}
4710
4711=head2 B<Blocks versus Scripts>
4712
4713The difference between a block and a script is that scripts are closer
4714to the linguistic notion of a set of code points required to represent
4715languages, while block is more of an artifact of the Unicode code point
4716numbering and separation into blocks of consecutive code points (so far the
4717size of a block is some multiple of 16, like 128 or 256).
4718
4719For example the Latin B<script> is spread over several B<blocks>, such
4720as C<Basic Latin>, C<Latin 1 Supplement>, C<Latin Extended-A>, and
4721C<Latin Extended-B>.  On the other hand, the Latin script does not
4722contain all the characters of the C<Basic Latin> block (also known as
4723ASCII): it includes only the letters, and not, for example, the digits
4724nor the punctuation.
4725
4726For blocks see L<http://www.unicode.org/Public/UNIDATA/Blocks.txt>
4727
4728For scripts see UTR #24: L<http://www.unicode.org/reports/tr24/>
4729
4730=head2 B<Matching Scripts and Blocks>
4731
4732Scripts are matched with the regular-expression construct
4733C<\p{...}> (e.g. C<\p{Tibetan}> matches characters of the Tibetan script),
4734while C<\p{Blk=...}> is used for blocks (e.g. C<\p{Blk=Tibetan}> matches
4735any of the 256 code points in the Tibetan block).
4736
4737=head2 Old-style versus new-style block names
4738
4739Unicode publishes the names of blocks in two different styles, though the two
4740are equivalent under Unicode's loose matching rules.
4741
4742The original style uses blanks and hyphens in the block names (except for
4743C<No_Block>), like so:
4744
4745 Miscellaneous Mathematical Symbols-B
4746
4747The newer style replaces these with underscores, like this:
4748
4749 Miscellaneous_Mathematical_Symbols_B
4750
4751This newer style is consistent with the values of other Unicode properties.
4752To preserve backward compatibility, all the functions in Unicode::UCD that
4753return block names (except as noted) return the old-style ones.
4754L</prop_value_aliases()> returns the new-style and can be used to convert from
4755old-style to new-style:
4756
4757 my $new_style = prop_values_aliases("block", $old_style);
4758
4759Perl also has single-form extensions that refer to blocks, C<In_Cyrillic>,
4760meaning C<Block=Cyrillic>.  These have always been written in the new style.
4761
4762To convert from new-style to old-style, follow this recipe:
4763
4764 $old_style = charblock((prop_invlist("block=$new_style"))[0]);
4765
4766(which finds the range of code points in the block using C<prop_invlist>,
4767gets the lower end of the range (0th element) and then looks up the old name
4768for its block using C<charblock>).
4769
4770Note that starting in Unicode 6.1, many of the block names have shorter
4771synonyms.  These are always given in the new style.
4772
4773=head2 Use with older Unicode versions
4774
4775The functions in this module work as well as can be expected when
4776used on earlier Unicode versions.  But, obviously, they use the available data
4777from that Unicode version.  For example, if the Unicode version predates the
4778definition of the script property (Unicode 3.1), then any function that deals
4779with scripts is going to return C<undef> for the script portion of the return
4780value.
4781
4782=head1 AUTHOR
4783
4784Jarkko Hietaniemi.  Now maintained by perl5 porters.
4785
4786=cut
4787
47881;
4789