xref: /openbsd/gnu/usr.bin/perl/lib/unicore/mktables (revision 097a140d)
1#!/usr/bin/perl -w
2
3# !!!!!!!!!!!!!!       IF YOU MODIFY THIS FILE       !!!!!!!!!!!!!!!!!!!!!!!!!
4# Any files created or read by this program should be listed in 'mktables.lst'
5# Use -makelist to regenerate it.
6
7# There was an attempt when this was first rewritten to make it 5.8
8# compatible, but that has now been abandoned, and newer constructs are used
9# as convenient.
10
11# NOTE: this script can run quite slowly in older/slower systems.
12# It can also consume a lot of memory (128 MB or more), you may need
13# to raise your process resource limits (e.g. in bash, "ulimit -a"
14# to inspect, and "ulimit -d ..." or "ulimit -m ..." to set)
15
16my $start_time;
17BEGIN { # Get the time the script started running; do it at compilation to
18        # get it as close as possible
19    $start_time= time;
20}
21
22require 5.010_001;
23use strict;
24use warnings;
25use Carp;
26use Config;
27use File::Find;
28use File::Path;
29use File::Spec;
30use Text::Tabs;
31use re "/aa";
32
33use feature 'state';
34use feature 'signatures';
35no warnings 'experimental::signatures';
36
37sub DEBUG () { 0 }  # Set to 0 for production; 1 for development
38my $debugging_build = $Config{"ccflags"} =~ /-DDEBUGGING/;
39
40sub NON_ASCII_PLATFORM { ord("A") != 65 }
41
42# When a new version of Unicode is published, unfortunately the algorithms for
43# dealing with various bounds, like \b{gcb}, \b{lb} may have to be updated
44# manually.  The changes may or may not be backward compatible with older
45# releases.  The code is in regen/mk_invlist.pl and regexec.c.  Make the
46# changes, then come back here and set the variable below to what version the
47# code is expecting.  If a newer version of Unicode is being compiled than
48# expected, a warning will be generated.  If an older version is being
49# compiled, any bounds tests that fail in the generated test file (-maketest
50# option) will be marked as TODO.
51my $version_of_mk_invlist_bounds = v13.0.0;
52
53##########################################################################
54#
55# mktables -- create the runtime Perl Unicode files (lib/unicore/.../*.pl),
56# from the Unicode database files (lib/unicore/.../*.txt),  It also generates
57# a pod file and .t files, depending on option parameters.
58#
59# The structure of this file is:
60#   First these introductory comments; then
61#   code needed for everywhere, such as debugging stuff; then
62#   code to handle input parameters; then
63#   data structures likely to be of external interest (some of which depend on
64#       the input parameters, so follows them; then
65#   more data structures and subroutine and package (class) definitions; then
66#   the small actual loop to process the input files and finish up; then
67#   a __DATA__ section, for the .t tests
68#
69# This program works on all releases of Unicode so far.  The outputs have been
70# scrutinized most intently for release 5.1.  The others have been checked for
71# somewhat more than just sanity.  It can handle all non-provisional Unicode
72# character properties in those releases.
73#
74# This program is mostly about Unicode character (or code point) properties.
75# A property describes some attribute or quality of a code point, like if it
76# is lowercase or not, its name, what version of Unicode it was first defined
77# in, or what its uppercase equivalent is.  Unicode deals with these disparate
78# possibilities by making all properties into mappings from each code point
79# into some corresponding value.  In the case of it being lowercase or not,
80# the mapping is either to 'Y' or 'N' (or various synonyms thereof).  Each
81# property maps each Unicode code point to a single value, called a "property
82# value".  (Some more recently defined properties, map a code point to a set
83# of values.)
84#
85# When using a property in a regular expression, what is desired isn't the
86# mapping of the code point to its property's value, but the reverse (or the
87# mathematical "inverse relation"): starting with the property value, "Does a
88# code point map to it?"  These are written in a "compound" form:
89# \p{property=value}, e.g., \p{category=punctuation}.  This program generates
90# files containing the lists of code points that map to each such regular
91# expression property value, one file per list
92#
93# There is also a single form shortcut that Perl adds for many of the commonly
94# used properties.  This happens for all binary properties, plus script,
95# general_category, and block properties.
96#
97# Thus the outputs of this program are files.  There are map files, mostly in
98# the 'To' directory; and there are list files for use in regular expression
99# matching, all in subdirectories of the 'lib' directory, with each
100# subdirectory being named for the property that the lists in it are for.
101# Bookkeeping, test, and documentation files are also generated.
102
103my $matches_directory = 'lib';   # Where match (\p{}) files go.
104my $map_directory = 'To';        # Where map files go.
105
106# DATA STRUCTURES
107#
108# The major data structures of this program are Property, of course, but also
109# Table.  There are two kinds of tables, very similar to each other.
110# "Match_Table" is the data structure giving the list of code points that have
111# a particular property value, mentioned above.  There is also a "Map_Table"
112# data structure which gives the property's mapping from code point to value.
113# There are two structures because the match tables need to be combined in
114# various ways, such as constructing unions, intersections, complements, etc.,
115# and the map ones don't.  And there would be problems, perhaps subtle, if
116# a map table were inadvertently operated on in some of those ways.
117# The use of separate classes with operations defined on one but not the other
118# prevents accidentally confusing the two.
119#
120# At the heart of each table's data structure is a "Range_List", which is just
121# an ordered list of "Ranges", plus ancillary information, and methods to
122# operate on them.  A Range is a compact way to store property information.
123# Each range has a starting code point, an ending code point, and a value that
124# is meant to apply to all the code points between the two end points,
125# inclusive.  For a map table, this value is the property value for those
126# code points.  Two such ranges could be written like this:
127#   0x41 .. 0x5A, 'Upper',
128#   0x61 .. 0x7A, 'Lower'
129#
130# Each range also has a type used as a convenience to classify the values.
131# Most ranges in this program will be Type 0, or normal, but there are some
132# ranges that have a non-zero type.  These are used only in map tables, and
133# are for mappings that don't fit into the normal scheme of things.  Mappings
134# that require a hash entry to communicate with utf8.c are one example;
135# another example is mappings for charnames.pm to use which indicate a name
136# that is algorithmically determinable from its code point (and the reverse).
137# These are used to significantly compact these tables, instead of listing
138# each one of the tens of thousands individually.
139#
140# In a match table, the value of a range is irrelevant (and hence the type as
141# well, which will always be 0), and arbitrarily set to the empty string.
142# Using the example above, there would be two match tables for those two
143# entries, one named Upper would contain the 0x41..0x5A range, and the other
144# named Lower would contain 0x61..0x7A.
145#
146# Actually, there are two types of range lists, "Range_Map" is the one
147# associated with map tables, and "Range_List" with match tables.
148# Again, this is so that methods can be defined on one and not the others so
149# as to prevent operating on them in incorrect ways.
150#
151# Eventually, most tables are written out to files to be read by Unicode::UCD.
152# All tables could in theory be written, but some are suppressed because there
153# is no current practical use for them.  It is easy to change which get
154# written by changing various lists that are near the top of the actual code
155# in this file.  The table data structures contain enough ancillary
156# information to allow them to be treated as separate entities for writing,
157# such as the path to each one's file.  There is a heading in each map table
158# that gives the format of its entries, and what the map is for all the code
159# points missing from it.  (This allows tables to be more compact.)
160#
161# The Property data structure contains one or more tables.  All properties
162# contain a map table (except the $perl property which is a
163# pseudo-property containing only match tables), and any properties that
164# are usable in regular expression matches also contain various matching
165# tables, one for each value the property can have.  A binary property can
166# have two values, True and False (or Y and N, which are preferred by Unicode
167# terminology).  Thus each of these properties will have a map table that
168# takes every code point and maps it to Y or N (but having ranges cuts the
169# number of entries in that table way down), and two match tables, one
170# which has a list of all the code points that map to Y, and one for all the
171# code points that map to N.  (For each binary property, a third table is also
172# generated for the pseudo Perl property.  It contains the identical code
173# points as the Y table, but can be written in regular expressions, not in the
174# compound form, but in a "single" form like \p{IsUppercase}.)  Many
175# properties are binary, but some properties have several possible values,
176# some have many, and properties like Name have a different value for every
177# named code point.  Those will not, unless the controlling lists are changed,
178# have their match tables written out.  But all the ones which can be used in
179# regular expression \p{} and \P{} constructs will.  Prior to 5.14, generally
180# a property would have either its map table or its match tables written but
181# not both.  Again, what gets written is controlled by lists which can easily
182# be changed.  Starting in 5.14, advantage was taken of this, and all the map
183# tables needed to reconstruct the Unicode db are now written out, while
184# suppressing the Unicode .txt files that contain the data.  Our tables are
185# much more compact than the .txt files, so a significant space savings was
186# achieved.  Also, tables are not written out that are trivially derivable
187# from tables that do get written.  So, there typically is no file containing
188# the code points not matched by a binary property (the table for \P{} versus
189# lowercase \p{}), since you just need to invert the True table to get the
190# False table.
191
192# Properties have a 'Type', like 'binary', or 'string', or 'enum' depending on
193# how many match tables there are and the content of the maps.  This 'Type' is
194# different than a range 'Type', so don't get confused by the two concepts
195# having the same name.
196#
197# For information about the Unicode properties, see Unicode's UAX44 document:
198
199my $unicode_reference_url = 'http://www.unicode.org/reports/tr44/';
200
201# As stated earlier, this program will work on any release of Unicode so far.
202# Most obvious problems in earlier data have NOT been corrected except when
203# necessary to make Perl or this program work reasonably, and to keep out
204# potential security issues.  For example, no folding information was given in
205# early releases, so this program substitutes lower case instead, just so that
206# a regular expression with the /i option will do something that actually
207# gives the right results in many cases.  There are also a couple other
208# corrections for version 1.1.5, commented at the point they are made.  As an
209# example of corrections that weren't made (but could be) is this statement
210# from DerivedAge.txt: "The supplementary private use code points and the
211# non-character code points were assigned in version 2.0, but not specifically
212# listed in the UCD until versions 3.0 and 3.1 respectively."  (To be precise
213# it was 3.0.1 not 3.0.0)  More information on Unicode version glitches is
214# further down in these introductory comments.
215#
216# This program works on all non-provisional properties as of the current
217# Unicode release, though the files for some are suppressed for various
218# reasons.  You can change which are output by changing lists in this program.
219#
220# The old version of mktables emphasized the term "Fuzzy" to mean Unicode's
221# loose matchings rules (from Unicode TR18):
222#
223#    The recommended names for UCD properties and property values are in
224#    PropertyAliases.txt [Prop] and PropertyValueAliases.txt
225#    [PropValue]. There are both abbreviated names and longer, more
226#    descriptive names. It is strongly recommended that both names be
227#    recognized, and that loose matching of property names be used,
228#    whereby the case distinctions, whitespace, hyphens, and underbar
229#    are ignored.
230#
231# The program still allows Fuzzy to override its determination of if loose
232# matching should be used, but it isn't currently used, as it is no longer
233# needed; the calculations it makes are good enough.
234#
235# SUMMARY OF HOW IT WORKS:
236#
237#   Process arguments
238#
239#   A list is constructed containing each input file that is to be processed
240#
241#   Each file on the list is processed in a loop, using the associated handler
242#   code for each:
243#        The PropertyAliases.txt and PropValueAliases.txt files are processed
244#            first.  These files name the properties and property values.
245#            Objects are created of all the property and property value names
246#            that the rest of the input should expect, including all synonyms.
247#        The other input files give mappings from properties to property
248#           values.  That is, they list code points and say what the mapping
249#           is under the given property.  Some files give the mappings for
250#           just one property; and some for many.  This program goes through
251#           each file and populates the properties and their map tables from
252#           them.  Some properties are listed in more than one file, and
253#           Unicode has set up a precedence as to which has priority if there
254#           is a conflict.  Thus the order of processing matters, and this
255#           program handles the conflict possibility by processing the
256#           overriding input files last, so that if necessary they replace
257#           earlier values.
258#        After this is all done, the program creates the property mappings not
259#            furnished by Unicode, but derivable from what it does give.
260#        The tables of code points that match each property value in each
261#            property that is accessible by regular expressions are created.
262#        The Perl-defined properties are created and populated.  Many of these
263#            require data determined from the earlier steps
264#        Any Perl-defined synonyms are created, and name clashes between Perl
265#            and Unicode are reconciled and warned about.
266#        All the properties are written to files
267#        Any other files are written, and final warnings issued.
268#
269# For clarity, a number of operators have been overloaded to work on tables:
270#   ~ means invert (take all characters not in the set).  The more
271#       conventional '!' is not used because of the possibility of confusing
272#       it with the actual boolean operation.
273#   + means union
274#   - means subtraction
275#   & means intersection
276# The precedence of these is the order listed.  Parentheses should be
277# copiously used.  These are not a general scheme.  The operations aren't
278# defined for a number of things, deliberately, to avoid getting into trouble.
279# Operations are done on references and affect the underlying structures, so
280# that the copy constructors for them have been overloaded to not return a new
281# clone, but the input object itself.
282#
283# The bool operator is deliberately not overloaded to avoid confusion with
284# "should it mean if the object merely exists, or also is non-empty?".
285#
286# WHY CERTAIN DESIGN DECISIONS WERE MADE
287#
288# This program needs to be able to run under miniperl.  Therefore, it uses a
289# minimum of other modules, and hence implements some things itself that could
290# be gotten from CPAN
291#
292# This program uses inputs published by the Unicode Consortium.  These can
293# change incompatibly between releases without the Perl maintainers realizing
294# it.  Therefore this program is now designed to try to flag these.  It looks
295# at the directories where the inputs are, and flags any unrecognized files.
296# It keeps track of all the properties in the files it handles, and flags any
297# that it doesn't know how to handle.  It also flags any input lines that
298# don't match the expected syntax, among other checks.
299#
300# It is also designed so if a new input file matches one of the known
301# templates, one hopefully just needs to add it to a list to have it
302# processed.
303#
304# As mentioned earlier, some properties are given in more than one file.  In
305# particular, the files in the extracted directory are supposedly just
306# reformattings of the others.  But they contain information not easily
307# derivable from the other files, including results for Unihan (which isn't
308# usually available to this program) and for unassigned code points.  They
309# also have historically had errors or been incomplete.  In an attempt to
310# create the best possible data, this program thus processes them first to
311# glean information missing from the other files; then processes those other
312# files to override any errors in the extracted ones.  Much of the design was
313# driven by this need to store things and then possibly override them.
314#
315# It tries to keep fatal errors to a minimum, to generate something usable for
316# testing purposes.  It always looks for files that could be inputs, and will
317# warn about any that it doesn't know how to handle (the -q option suppresses
318# the warning).
319#
320# Why is there more than one type of range?
321#   This simplified things.  There are some very specialized code points that
322#   have to be handled specially for output, such as Hangul syllable names.
323#   By creating a range type (done late in the development process), it
324#   allowed this to be stored with the range, and overridden by other input.
325#   Originally these were stored in another data structure, and it became a
326#   mess trying to decide if a second file that was for the same property was
327#   overriding the earlier one or not.
328#
329# Why are there two kinds of tables, match and map?
330#   (And there is a base class shared by the two as well.)  As stated above,
331#   they actually are for different things.  Development proceeded much more
332#   smoothly when I (khw) realized the distinction.  Map tables are used to
333#   give the property value for every code point (actually every code point
334#   that doesn't map to a default value).  Match tables are used for regular
335#   expression matches, and are essentially the inverse mapping.  Separating
336#   the two allows more specialized methods, and error checks so that one
337#   can't just take the intersection of two map tables, for example, as that
338#   is nonsensical.
339#
340# What about 'fate' and 'status'.  The concept of a table's fate was created
341#   late when it became clear that something more was needed.  The difference
342#   between this and 'status' is unclean, and could be improved if someone
343#   wanted to spend the effort.
344#
345# DEBUGGING
346#
347# This program is written so it will run under miniperl.  Occasionally changes
348# will cause an error where the backtrace doesn't work well under miniperl.
349# To diagnose the problem, you can instead run it under regular perl, if you
350# have one compiled.
351#
352# There is a good trace facility.  To enable it, first sub DEBUG must be set
353# to return true.  Then a line like
354#
355# local $to_trace = 1 if main::DEBUG;
356#
357# can be added to enable tracing in its lexical scope (plus dynamic) or until
358# you insert another line:
359#
360# local $to_trace = 0 if main::DEBUG;
361#
362# To actually trace, use a line like "trace $a, @b, %c, ...;
363#
364# Some of the more complex subroutines already have trace statements in them.
365# Permanent trace statements should be like:
366#
367# trace ... if main::DEBUG && $to_trace;
368#
369# main::stack_trace() will display what its name implies
370#
371# If there is just one or a few files that you're debugging, you can easily
372# cause most everything else to be skipped.  Change the line
373#
374# my $debug_skip = 0;
375#
376# to 1, and every file whose object is in @input_file_objects and doesn't have
377# a, 'non_skip => 1,' in its constructor will be skipped.  However, skipping
378# Jamo.txt or UnicodeData.txt will likely cause fatal errors.
379#
380# To compare the output tables, it may be useful to specify the -annotate
381# flag.  (As of this writing, this can't be done on a clean workspace, due to
382# requirements in Text::Tabs used in this option; so first run mktables
383# without this option.)  This option adds comment lines to each table, one for
384# each non-algorithmically named character giving, currently its code point,
385# name, and graphic representation if printable (and you have a font that
386# knows about it).  This makes it easier to see what the particular code
387# points are in each output table.  Non-named code points are annotated with a
388# description of their status, and contiguous ones with the same description
389# will be output as a range rather than individually.  Algorithmically named
390# characters are also output as ranges, except when there are just a few
391# contiguous ones.
392#
393# FUTURE ISSUES
394#
395# The program would break if Unicode were to change its names so that
396# interior white space, underscores, or dashes differences were significant
397# within property and property value names.
398#
399# It might be easier to use the xml versions of the UCD if this program ever
400# would need heavy revision, and the ability to handle old versions was not
401# required.
402#
403# There is the potential for name collisions, in that Perl has chosen names
404# that Unicode could decide it also likes.  There have been such collisions in
405# the past, with mostly Perl deciding to adopt the Unicode definition of the
406# name.  However in the 5.2 Unicode beta testing, there were a number of such
407# collisions, which were withdrawn before the final release, because of Perl's
408# and other's protests.  These all involved new properties which began with
409# 'Is'.  Based on the protests, Unicode is unlikely to try that again.  Also,
410# many of the Perl-defined synonyms, like Any, Word, etc, are listed in a
411# Unicode document, so they are unlikely to be used by Unicode for another
412# purpose.  However, they might try something beginning with 'In', or use any
413# of the other Perl-defined properties.  This program will warn you of name
414# collisions, and refuse to generate tables with them, but manual intervention
415# will be required in this event.  One scheme that could be implemented, if
416# necessary, would be to have this program generate another file, or add a
417# field to mktables.lst that gives the date of first definition of a property.
418# Each new release of Unicode would use that file as a basis for the next
419# iteration.  And the Perl synonym addition code could sort based on the age
420# of the property, so older properties get priority, and newer ones that clash
421# would be refused; hence existing code would not be impacted, and some other
422# synonym would have to be used for the new property.  This is ugly, and
423# manual intervention would certainly be easier to do in the short run; lets
424# hope it never comes to this.
425#
426# A NOTE ON UNIHAN
427#
428# This program can generate tables from the Unihan database.  But that DB
429# isn't normally available, so it is marked as optional.  Prior to version
430# 5.2, this database was in a single file, Unihan.txt.  In 5.2 the database
431# was split into 8 different files, all beginning with the letters 'Unihan'.
432# If you plunk those files down into the directory mktables ($0) is in, this
433# program will read them and automatically create tables for the properties
434# from it that are listed in PropertyAliases.txt and PropValueAliases.txt,
435# plus any you add to the @cjk_properties array and the @cjk_property_values
436# array, being sure to add necessary '# @missings' lines to the latter.  For
437# Unicode versions earlier than 5.2, most of the Unihan properties are not
438# listed at all in PropertyAliases nor PropValueAliases.  This program assumes
439# for these early releases that you want the properties that are specified in
440# the 5.2 release.
441#
442# You may need to adjust the entries to suit your purposes.  setup_unihan(),
443# and filter_unihan_line() are the functions where this is done.  This program
444# already does some adjusting to make the lines look more like the rest of the
445# Unicode DB;  You can see what that is in filter_unihan_line()
446#
447# There is a bug in the 3.2 data file in which some values for the
448# kPrimaryNumeric property have commas and an unexpected comment.  A filter
449# could be added to correct these; or for a particular installation, the
450# Unihan.txt file could be edited to fix them.
451#
452# HOW TO ADD A FILE TO BE PROCESSED
453#
454# A new file from Unicode needs to have an object constructed for it in
455# @input_file_objects, probably at the end or at the end of the extracted
456# ones.  The program should warn you if its name will clash with others on
457# restrictive file systems, like DOS.  If so, figure out a better name, and
458# add lines to the README.perl file giving that.  If the file is a character
459# property, it should be in the format that Unicode has implicitly
460# standardized for such files for the more recently introduced ones.
461# If so, the Input_file constructor for @input_file_objects can just be the
462# file name and release it first appeared in.  If not, then it should be
463# possible to construct an each_line_handler() to massage the line into the
464# standardized form.
465#
466# For non-character properties, more code will be needed.  You can look at
467# the existing entries for clues.
468#
469# UNICODE VERSIONS NOTES
470#
471# The Unicode UCD has had a number of errors in it over the versions.  And
472# these remain, by policy, in the standard for that version.  Therefore it is
473# risky to correct them, because code may be expecting the error.  So this
474# program doesn't generally make changes, unless the error breaks the Perl
475# core.  As an example, some versions of 2.1.x Jamo.txt have the wrong value
476# for U+1105, which causes real problems for the algorithms for Jamo
477# calculations, so it is changed here.
478#
479# But it isn't so clear cut as to what to do about concepts that are
480# introduced in a later release; should they extend back to earlier releases
481# where the concept just didn't exist?  It was easier to do this than to not,
482# so that's what was done.  For example, the default value for code points not
483# in the files for various properties was probably undefined until changed by
484# some version.  No_Block for blocks is such an example.  This program will
485# assign No_Block even in Unicode versions that didn't have it.  This has the
486# benefit that code being written doesn't have to special case earlier
487# versions; and the detriment that it doesn't match the Standard precisely for
488# the affected versions.
489#
490# Here are some observations about some of the issues in early versions:
491#
492# Prior to version 3.0, there were 3 character decompositions.  These are not
493# handled by Unicode::Normalize, nor will it compile when presented a version
494# that has them.  However, you can trivially get it to compile by simply
495# ignoring those decompositions, by changing the croak to a carp.  At the time
496# of this writing, the line (in dist/Unicode-Normalize/Normalize.pm or
497# dist/Unicode-Normalize/mkheader) reads
498#
499#   croak("Weird Canonical Decomposition of U+$h");
500#
501# Simply comment it out.  It will compile, but will not know about any three
502# character decompositions.
503
504# The number of code points in \p{alpha=True} halved in 2.1.9.  It turns out
505# that the reason is that the CJK block starting at 4E00 was removed from
506# PropList, and was not put back in until 3.1.0.  The Perl extension (the
507# single property name \p{alpha}) has the correct values.  But the compound
508# form is simply not generated until 3.1, as it can be argued that prior to
509# this release, this was not an official property.  The comments for
510# filter_old_style_proplist() give more details.
511#
512# Unicode introduced the synonym Space for White_Space in 4.1.  Perl has
513# always had a \p{Space}.  In release 3.2 only, they are not synonymous.  The
514# reason is that 3.2 introduced U+205F=medium math space, which was not
515# classed as white space, but Perl figured out that it should have been. 4.0
516# reclassified it correctly.
517#
518# Another change between 3.2 and 4.0 is the CCC property value ATBL.  In 3.2
519# this was erroneously a synonym for 202 (it should be 200).  In 4.0, ATB
520# became 202, and ATBL was left with no code points, as all the ones that
521# mapped to 202 stayed mapped to 202.  Thus if your program used the numeric
522# name for the class, it would not have been affected, but if it used the
523# mnemonic, it would have been.
524#
525# \p{Script=Hrkt} (Katakana_Or_Hiragana) came in 4.0.1.  Before that, code
526# points which eventually came to have this script property value, instead
527# mapped to "Unknown".  But in the next release all these code points were
528# moved to \p{sc=common} instead.
529
530# The tests furnished  by Unicode for testing WordBreak and SentenceBreak
531# generate errors in 5.0 and earlier.
532#
533# The default for missing code points for BidiClass is complicated.  Starting
534# in 3.1.1, the derived file DBidiClass.txt handles this, but this program
535# tries to do the best it can for earlier releases.  It is done in
536# process_PropertyAliases()
537#
538# In version 2.1.2, the entry in UnicodeData.txt:
539#   0275;LATIN SMALL LETTER BARRED O;Ll;0;L;;;;;N;;;;019F;
540# should instead be
541#   0275;LATIN SMALL LETTER BARRED O;Ll;0;L;;;;;N;;;019F;;019F
542# Without this change, there are casing problems for this character.
543#
544# Search for $string_compare_versions to see how to compare changes to
545# properties between Unicode versions
546#
547##############################################################################
548
549my $UNDEF = ':UNDEF:';  # String to print out for undefined values in tracing
550                        # and errors
551my $MAX_LINE_WIDTH = 78;
552
553# Debugging aid to skip most files so as to not be distracted by them when
554# concentrating on the ones being debugged.  Add
555# non_skip => 1,
556# to the constructor for those files you want processed when you set this.
557# Files with a first version number of 0 are special: they are always
558# processed regardless of the state of this flag.  Generally, Jamo.txt and
559# UnicodeData.txt must not be skipped if you want this program to not die
560# before normal completion.
561my $debug_skip = 0;
562
563
564# Normally these are suppressed.
565my $write_Unicode_deprecated_tables = 0;
566
567# Set to 1 to enable tracing.
568our $to_trace = 0;
569
570{ # Closure for trace: debugging aid
571    my $print_caller = 1;        # ? Include calling subroutine name
572    my $main_with_colon = 'main::';
573    my $main_colon_length = length($main_with_colon);
574
575    sub trace {
576        return unless $to_trace;        # Do nothing if global flag not set
577
578        my @input = @_;
579
580        local $DB::trace = 0;
581        $DB::trace = 0;          # Quiet 'used only once' message
582
583        my $line_number;
584
585        # Loop looking up the stack to get the first non-trace caller
586        my $caller_line;
587        my $caller_name;
588        my $i = 0;
589        do {
590            $line_number = $caller_line;
591            (my $pkg, my $file, $caller_line, my $caller) = caller $i++;
592            $caller = $main_with_colon unless defined $caller;
593
594            $caller_name = $caller;
595
596            # get rid of pkg
597            $caller_name =~ s/.*:://;
598            if (substr($caller_name, 0, $main_colon_length)
599                eq $main_with_colon)
600            {
601                $caller_name = substr($caller_name, $main_colon_length);
602            }
603
604        } until ($caller_name ne 'trace');
605
606        # If the stack was empty, we were called from the top level
607        $caller_name = 'main' if ($caller_name eq ""
608                                    || $caller_name eq 'trace');
609
610        my $output = "";
611        #print STDERR __LINE__, ": ", join ", ", @input, "\n";
612        foreach my $string (@input) {
613            if (ref $string eq 'ARRAY' || ref $string eq 'HASH') {
614                $output .= simple_dumper($string);
615            }
616            else {
617                $string = "$string" if ref $string;
618                $string = $UNDEF unless defined $string;
619                chomp $string;
620                $string = '""' if $string eq "";
621                $output .= " " if $output ne ""
622                                && $string ne ""
623                                && substr($output, -1, 1) ne " "
624                                && substr($string, 0, 1) ne " ";
625                $output .= $string;
626            }
627        }
628
629        print STDERR sprintf "%4d: ", $line_number if defined $line_number;
630        print STDERR "$caller_name: " if $print_caller;
631        print STDERR $output, "\n";
632        return;
633    }
634}
635
636sub stack_trace() {
637    local $to_trace = 1 if main::DEBUG;
638    my $line = (caller(0))[2];
639    my $i = 1;
640
641    # Accumulate the stack trace
642    while (1) {
643        my ($pkg, $file, $caller_line, $caller) = caller $i++;
644
645        last unless defined $caller;
646
647        trace "called from $caller() at line $line";
648        $line = $caller_line;
649    }
650}
651
652# This is for a rarely used development feature that allows you to compare two
653# versions of the Unicode standard without having to deal with changes caused
654# by the code points introduced in the later version.  You probably also want
655# to use the -annotate option when using this.  Run this program on a unicore
656# containing the starting release you want to compare.  Save that output
657# structure.  Then, switching to a unicore with the ending release, change the
658# "" in the $string_compare_versions definition just below to a string
659# containing a SINGLE dotted Unicode release number (e.g. "2.1") corresponding
660# to the starting release.  This program will then compile, but throw away all
661# code points introduced after the starting release.  Finally use a diff tool
662# to compare the two directory structures.  They include only the code points
663# common to both releases, and you can see the changes caused just by the
664# underlying release semantic changes.  For versions earlier than 3.2, you
665# must copy a version of DAge.txt into the directory.
666my $string_compare_versions = DEBUG && "";
667my $compare_versions = DEBUG
668                       && $string_compare_versions
669                       && pack "C*", split /\./, $string_compare_versions;
670
671sub uniques {
672    # Returns non-duplicated input values.  From "Perl Best Practices:
673    # Encapsulated Cleverness".  p. 455 in first edition.
674
675    my %seen;
676    # Arguably this breaks encapsulation, if the goal is to permit multiple
677    # distinct objects to stringify to the same value, and be interchangeable.
678    # However, for this program, no two objects stringify identically, and all
679    # lists passed to this function are either objects or strings. So this
680    # doesn't affect correctness, but it does give a couple of percent speedup.
681    no overloading;
682    return grep { ! $seen{$_}++ } @_;
683}
684
685$0 = File::Spec->canonpath($0);
686
687my $make_test_script = 0;      # ? Should we output a test script
688my $make_norm_test_script = 0; # ? Should we output a normalization test script
689my $write_unchanged_files = 0; # ? Should we update the output files even if
690                               #    we don't think they have changed
691my $use_directory = "";        # ? Should we chdir somewhere.
692my $pod_directory;             # input directory to store the pod file.
693my $pod_file = 'perluniprops';
694my $t_path;                     # Path to the .t test file
695my $file_list = 'mktables.lst'; # File to store input and output file names.
696                               # This is used to speed up the build, by not
697                               # executing the main body of the program if
698                               # nothing on the list has changed since the
699                               # previous build
700my $make_list = 1;             # ? Should we write $file_list.  Set to always
701                               # make a list so that when the pumpking is
702                               # preparing a release, s/he won't have to do
703                               # special things
704my $glob_list = 0;             # ? Should we try to include unknown .txt files
705                               # in the input.
706my $output_range_counts = $debugging_build;   # ? Should we include the number
707                                              # of code points in ranges in
708                                              # the output
709my $annotate = 0;              # ? Should character names be in the output
710
711# Verbosity levels; 0 is quiet
712my $NORMAL_VERBOSITY = 1;
713my $PROGRESS = 2;
714my $VERBOSE = 3;
715
716my $verbosity = $NORMAL_VERBOSITY;
717
718# Stored in mktables.lst so that if this program is called with different
719# options, will regenerate even if the files otherwise look like they're
720# up-to-date.
721my $command_line_arguments = join " ", @ARGV;
722
723# Process arguments
724while (@ARGV) {
725    my $arg = shift @ARGV;
726    if ($arg eq '-v') {
727        $verbosity = $VERBOSE;
728    }
729    elsif ($arg eq '-p') {
730        $verbosity = $PROGRESS;
731        $| = 1;     # Flush buffers as we go.
732    }
733    elsif ($arg eq '-q') {
734        $verbosity = 0;
735    }
736    elsif ($arg eq '-w') {
737        # update the files even if they haven't changed
738        $write_unchanged_files = 1;
739    }
740    elsif ($arg eq '-check') {
741        my $this = shift @ARGV;
742        my $ok = shift @ARGV;
743        if ($this ne $ok) {
744            print "Skipping as check params are not the same.\n";
745            exit(0);
746        }
747    }
748    elsif ($arg eq '-P' && defined ($pod_directory = shift)) {
749        -d $pod_directory or croak "Directory '$pod_directory' doesn't exist";
750    }
751    elsif ($arg eq '-maketest' || ($arg eq '-T' && defined ($t_path = shift)))
752    {
753        $make_test_script = 1;
754    }
755    elsif ($arg eq '-makenormtest')
756    {
757        $make_norm_test_script = 1;
758    }
759    elsif ($arg eq '-makelist') {
760        $make_list = 1;
761    }
762    elsif ($arg eq '-C' && defined ($use_directory = shift)) {
763        -d $use_directory or croak "Unknown directory '$use_directory'";
764    }
765    elsif ($arg eq '-L') {
766
767        # Existence not tested until have chdir'd
768        $file_list = shift;
769    }
770    elsif ($arg eq '-globlist') {
771        $glob_list = 1;
772    }
773    elsif ($arg eq '-c') {
774        $output_range_counts = ! $output_range_counts
775    }
776    elsif ($arg eq '-annotate') {
777        $annotate = 1;
778        $debugging_build = 1;
779        $output_range_counts = 1;
780    }
781    else {
782        my $with_c = 'with';
783        $with_c .= 'out' if $output_range_counts;   # Complements the state
784        croak <<END;
785usage: $0 [-c|-p|-q|-v|-w] [-C dir] [-L filelist] [ -P pod_dir ]
786          [ -T test_file_path ] [-globlist] [-makelist] [-maketest]
787          [-check A B ]
788  -c          : Output comments $with_c number of code points in ranges
789  -q          : Quiet Mode: Only output serious warnings.
790  -p          : Set verbosity level to normal plus show progress.
791  -v          : Set Verbosity level high:  Show progress and non-serious
792                warnings
793  -w          : Write files regardless
794  -C dir      : Change to this directory before proceeding. All relative paths
795                except those specified by the -P and -T options will be done
796                with respect to this directory.
797  -P dir      : Output $pod_file file to directory 'dir'.
798  -T path     : Create a test script as 'path'; overrides -maketest
799  -L filelist : Use alternate 'filelist' instead of standard one
800  -globlist   : Take as input all non-Test *.txt files in current and sub
801                directories
802  -maketest   : Make test script 'TestProp.pl' in current (or -C directory),
803                overrides -T
804  -makelist   : Rewrite the file list $file_list based on current setup
805  -annotate   : Output an annotation for each character in the table files;
806                useful for debugging mktables, looking at diffs; but is slow
807                and memory intensive
808  -check A B  : Executes $0 only if A and B are the same
809END
810    }
811}
812
813# Stores the most-recently changed file.  If none have changed, can skip the
814# build
815my $most_recent = (stat $0)[9];   # Do this before the chdir!
816
817# Change directories now, because need to read 'version' early.
818if ($use_directory) {
819    if ($pod_directory && ! File::Spec->file_name_is_absolute($pod_directory)) {
820        $pod_directory = File::Spec->rel2abs($pod_directory);
821    }
822    if ($t_path && ! File::Spec->file_name_is_absolute($t_path)) {
823        $t_path = File::Spec->rel2abs($t_path);
824    }
825    chdir $use_directory or croak "Failed to chdir to '$use_directory':$!";
826    if ($pod_directory && File::Spec->file_name_is_absolute($pod_directory)) {
827        $pod_directory = File::Spec->abs2rel($pod_directory);
828    }
829    if ($t_path && File::Spec->file_name_is_absolute($t_path)) {
830        $t_path = File::Spec->abs2rel($t_path);
831    }
832}
833
834# Get Unicode version into regular and v-string.  This is done now because
835# various tables below get populated based on it.  These tables are populated
836# here to be near the top of the file, and so easily seeable by those needing
837# to modify things.
838open my $VERSION, "<", "version"
839                    or croak "$0: can't open required file 'version': $!\n";
840my $string_version = <$VERSION>;
841close $VERSION;
842chomp $string_version;
843my $v_version = pack "C*", split /\./, $string_version;        # v string
844
845my $unicode_version = ($compare_versions)
846                      ? (  "$string_compare_versions (using "
847                         . "$string_version rules)")
848                      : $string_version;
849
850# The following are the complete names of properties with property values that
851# are known to not match any code points in some versions of Unicode, but that
852# may change in the future so they should be matchable, hence an empty file is
853# generated for them.
854my @tables_that_may_be_empty;
855push @tables_that_may_be_empty, 'Joining_Type=Left_Joining'
856                                                    if $v_version lt v6.3.0;
857push @tables_that_may_be_empty, 'Script=Common' if $v_version le v4.0.1;
858push @tables_that_may_be_empty, 'Title' if $v_version lt v2.0.0;
859push @tables_that_may_be_empty, 'Script=Katakana_Or_Hiragana'
860                                                    if $v_version ge v4.1.0;
861push @tables_that_may_be_empty, 'Script_Extensions=Katakana_Or_Hiragana'
862                                                    if $v_version ge v6.0.0;
863push @tables_that_may_be_empty, 'Grapheme_Cluster_Break=Prepend'
864                                                    if $v_version ge v6.1.0;
865push @tables_that_may_be_empty, 'Canonical_Combining_Class=CCC133'
866                                                    if $v_version ge v6.2.0;
867
868# The lists below are hashes, so the key is the item in the list, and the
869# value is the reason why it is in the list.  This makes generation of
870# documentation easier.
871
872my %why_suppressed;  # No file generated for these.
873
874# Files aren't generated for empty extraneous properties.  This is arguable.
875# Extraneous properties generally come about because a property is no longer
876# used in a newer version of Unicode.  If we generated a file without code
877# points, programs that used to work on that property will still execute
878# without errors.  It just won't ever match (or will always match, with \P{}).
879# This means that the logic is now likely wrong.  I (khw) think its better to
880# find this out by getting an error message.  Just move them to the table
881# above to change this behavior
882my %why_suppress_if_empty_warn_if_not = (
883
884   # It is the only property that has ever officially been removed from the
885   # Standard.  The database never contained any code points for it.
886   'Special_Case_Condition' => 'Obsolete',
887
888   # Apparently never official, but there were code points in some versions of
889   # old-style PropList.txt
890   'Non_Break' => 'Obsolete',
891);
892
893# These would normally go in the warn table just above, but they were changed
894# a long time before this program was written, so warnings about them are
895# moot.
896if ($v_version gt v3.2.0) {
897    push @tables_that_may_be_empty,
898                                'Canonical_Combining_Class=Attached_Below_Left'
899}
900
901# Obsoleted
902if ($v_version ge v11.0.0) {
903    push @tables_that_may_be_empty, qw(
904                                       Grapheme_Cluster_Break=E_Base
905                                       Grapheme_Cluster_Break=E_Base_GAZ
906                                       Grapheme_Cluster_Break=E_Modifier
907                                       Grapheme_Cluster_Break=Glue_After_Zwj
908                                       Word_Break=E_Base
909                                       Word_Break=E_Base_GAZ
910                                       Word_Break=E_Modifier
911                                       Word_Break=Glue_After_Zwj);
912}
913
914# Enum values for to_output_map() method in the Map_Table package. (0 is don't
915# output)
916my $EXTERNAL_MAP = 1;
917my $INTERNAL_MAP = 2;
918my $OUTPUT_ADJUSTED = 3;
919
920# To override computed values for writing the map tables for these properties.
921# The default for enum map tables is to write them out, so that the Unicode
922# .txt files can be removed, but all the data to compute any property value
923# for any code point is available in a more compact form.
924my %global_to_output_map = (
925    # Needed by UCD.pm, but don't want to publicize that it exists, so won't
926    # get stuck supporting it if things change.  Since it is a STRING
927    # property, it normally would be listed in the pod, but INTERNAL_MAP
928    # suppresses that.
929    Unicode_1_Name => $INTERNAL_MAP,
930
931    Present_In => 0,                # Suppress, as easily computed from Age
932    Block => (NON_ASCII_PLATFORM) ? 1 : 0,  # Suppress, as Blocks.txt is
933                                            # retained, but needed for
934                                            # non-ASCII
935
936    # Suppress, as mapping can be found instead from the
937    # Perl_Decomposition_Mapping file
938    Decomposition_Type => 0,
939);
940
941# There are several types of obsolete properties defined by Unicode.  These
942# must be hand-edited for every new Unicode release.
943my %why_deprecated;  # Generates a deprecated warning message if used.
944my %why_stabilized;  # Documentation only
945my %why_obsolete;    # Documentation only
946
947{   # Closure
948    my $simple = 'Perl uses the more complete version';
949    my $unihan = 'Unihan properties are by default not enabled in the Perl core.  Instead use CPAN: Unicode::Unihan';
950
951    my $other_properties = 'other properties';
952    my $contributory = "Used by Unicode internally for generating $other_properties and not intended to be used stand-alone";
953    my $why_no_expand  = "Deprecated by Unicode.  These are characters that expand to more than one character in the specified normalization form, but whether they actually take up more bytes or not depends on the encoding being used.  For example, a UTF-8 encoded character may expand to a different number of bytes than a UTF-32 encoded character.";
954
955    %why_deprecated = (
956        'Grapheme_Link' => 'Duplicates ccc=vr (Canonical_Combining_Class=Virama)',
957        'Jamo_Short_Name' => $contributory,
958        'Line_Break=Surrogate' => 'Surrogates should never appear in well-formed text, and therefore shouldn\'t be the basis for line breaking',
959        'Other_Alphabetic' => $contributory,
960        'Other_Default_Ignorable_Code_Point' => $contributory,
961        'Other_Grapheme_Extend' => $contributory,
962        'Other_ID_Continue' => $contributory,
963        'Other_ID_Start' => $contributory,
964        'Other_Lowercase' => $contributory,
965        'Other_Math' => $contributory,
966        'Other_Uppercase' => $contributory,
967        'Expands_On_NFC' => $why_no_expand,
968        'Expands_On_NFD' => $why_no_expand,
969        'Expands_On_NFKC' => $why_no_expand,
970        'Expands_On_NFKD' => $why_no_expand,
971    );
972
973    %why_suppressed = (
974        # There is a lib/unicore/Decomposition.pl (used by Normalize.pm) which
975        # contains the same information, but without the algorithmically
976        # determinable Hangul syllables'.  This file is not published, so it's
977        # existence is not noted in the comment.
978        'Decomposition_Mapping' => 'Accessible via Unicode::Normalize or prop_invmap() or charprop() in Unicode::UCD::',
979
980        # Don't suppress ISO_Comment, as otherwise special handling is needed
981        # to differentiate between it and gc=c, which can be written as 'isc',
982        # which is the same characters as ISO_Comment's short name.
983
984        'Name' => "Accessible via \\N{...} or 'use charnames;' or charprop() or prop_invmap() in Unicode::UCD::",
985
986        'Simple_Case_Folding' => "$simple.  Can access this through casefold(), charprop(), or prop_invmap() in Unicode::UCD",
987        'Simple_Lowercase_Mapping' => "$simple.  Can access this through charinfo(), charprop(), or prop_invmap() in Unicode::UCD",
988        'Simple_Titlecase_Mapping' => "$simple.  Can access this through charinfo(), charprop(), or prop_invmap() in Unicode::UCD",
989        'Simple_Uppercase_Mapping' => "$simple.  Can access this through charinfo(), charprop(), or prop_invmap() in Unicode::UCD",
990
991        FC_NFKC_Closure => 'Deprecated by Unicode, and supplanted in usage by NFKC_Casefold; otherwise not useful',
992    );
993
994    foreach my $property (
995
996            # The following are suppressed because they were made contributory
997            # or deprecated by Unicode before Perl ever thought about
998            # supporting them.
999            'Jamo_Short_Name',
1000            'Grapheme_Link',
1001            'Expands_On_NFC',
1002            'Expands_On_NFD',
1003            'Expands_On_NFKC',
1004            'Expands_On_NFKD',
1005
1006            # The following are suppressed because they have been marked
1007            # as deprecated for a sufficient amount of time
1008            'Other_Alphabetic',
1009            'Other_Default_Ignorable_Code_Point',
1010            'Other_Grapheme_Extend',
1011            'Other_ID_Continue',
1012            'Other_ID_Start',
1013            'Other_Lowercase',
1014            'Other_Math',
1015            'Other_Uppercase',
1016    ) {
1017        $why_suppressed{$property} = $why_deprecated{$property};
1018    }
1019
1020    # Customize the message for all the 'Other_' properties
1021    foreach my $property (keys %why_deprecated) {
1022        next if (my $main_property = $property) !~ s/^Other_//;
1023        $why_deprecated{$property} =~ s/$other_properties/the $main_property property (which should be used instead)/;
1024    }
1025}
1026
1027if ($write_Unicode_deprecated_tables) {
1028    foreach my $property (keys %why_suppressed) {
1029        delete $why_suppressed{$property} if $property =~
1030                                                    / ^ Other | Grapheme /x;
1031    }
1032}
1033
1034if ($v_version ge 4.0.0) {
1035    $why_stabilized{'Hyphen'} = 'Use the Line_Break property instead; see www.unicode.org/reports/tr14';
1036    if ($v_version ge 6.0.0) {
1037        $why_deprecated{'Hyphen'} = 'Supplanted by Line_Break property values; see www.unicode.org/reports/tr14';
1038    }
1039}
1040if ($v_version ge 5.2.0 && $v_version lt 6.0.0) {
1041    $why_obsolete{'ISO_Comment'} = 'Code points for it have been removed';
1042    if ($v_version ge 6.0.0) {
1043        $why_deprecated{'ISO_Comment'} = 'No longer needed for Unicode\'s internal chart generation; otherwise not useful, and code points for it have been removed';
1044    }
1045}
1046
1047# Probably obsolete forever
1048if ($v_version ge v4.1.0) {
1049    $why_suppressed{'Script=Katakana_Or_Hiragana'} = 'Obsolete.  All code points previously matched by this have been moved to "Script=Common".';
1050}
1051if ($v_version ge v6.0.0) {
1052    $why_suppressed{'Script=Katakana_Or_Hiragana'} .= '  Consider instead using "Script_Extensions=Katakana" or "Script_Extensions=Hiragana" (or both)';
1053    $why_suppressed{'Script_Extensions=Katakana_Or_Hiragana'} = 'All code points that would be matched by this are matched by either "Script_Extensions=Katakana" or "Script_Extensions=Hiragana"';
1054}
1055
1056# This program can create files for enumerated-like properties, such as
1057# 'Numeric_Type'.  This file would be the same format as for a string
1058# property, with a mapping from code point to its value, so you could look up,
1059# for example, the script a code point is in.  But no one so far wants this
1060# mapping, or they have found another way to get it since this is a new
1061# feature.  So no file is generated except if it is in this list.
1062my @output_mapped_properties = split "\n", <<END;
1063END
1064
1065# If you want more Unihan properties than the default, you need to add them to
1066# these arrays.  Depending on the property type, @missing lines might have to
1067# be added to the second array.  A sample entry would be (including the '#'):
1068# @missing: 0000..10FFFF; cjkAccountingNumeric; NaN
1069my @cjk_properties = split "\n", <<'END';
1070END
1071my @cjk_property_values = split "\n", <<'END';
1072END
1073
1074# The input files don't list every code point.  Those not listed are to be
1075# defaulted to some value.  Below are hard-coded what those values are for
1076# non-binary properties as of 5.1.  Starting in 5.0, there are
1077# machine-parsable comment lines in the files that give the defaults; so this
1078# list shouldn't have to be extended.  The claim is that all missing entries
1079# for binary properties will default to 'N'.  Unicode tried to change that in
1080# 5.2, but the beta period produced enough protest that they backed off.
1081#
1082# The defaults for the fields that appear in UnicodeData.txt in this hash must
1083# be in the form that it expects.  The others may be synonyms.
1084my $CODE_POINT = '<code point>';
1085my %default_mapping = (
1086    Age => "Unassigned",
1087    # Bidi_Class => Complicated; set in code
1088    Bidi_Mirroring_Glyph => "",
1089    Block => 'No_Block',
1090    Canonical_Combining_Class => 0,
1091    Case_Folding => $CODE_POINT,
1092    Decomposition_Mapping => $CODE_POINT,
1093    Decomposition_Type => 'None',
1094    East_Asian_Width => "Neutral",
1095    FC_NFKC_Closure => $CODE_POINT,
1096    General_Category => ($v_version le 6.3.0) ? 'Cn' : 'Unassigned',
1097    Grapheme_Cluster_Break => 'Other',
1098    Hangul_Syllable_Type => 'NA',
1099    ISO_Comment => "",
1100    Jamo_Short_Name => "",
1101    Joining_Group => "No_Joining_Group",
1102    # Joining_Type => Complicated; set in code
1103    kIICore => 'N',   #                       Is converted to binary
1104    #Line_Break => Complicated; set in code
1105    Lowercase_Mapping => $CODE_POINT,
1106    Name => "",
1107    Name_Alias => "",
1108    NFC_QC => 'Yes',
1109    NFD_QC => 'Yes',
1110    NFKC_QC => 'Yes',
1111    NFKD_QC => 'Yes',
1112    Numeric_Type => 'None',
1113    Numeric_Value => 'NaN',
1114    Script => ($v_version le 4.1.0) ? 'Common' : 'Unknown',
1115    Sentence_Break => 'Other',
1116    Simple_Case_Folding => $CODE_POINT,
1117    Simple_Lowercase_Mapping => $CODE_POINT,
1118    Simple_Titlecase_Mapping => $CODE_POINT,
1119    Simple_Uppercase_Mapping => $CODE_POINT,
1120    Titlecase_Mapping => $CODE_POINT,
1121    Unicode_1_Name => "",
1122    Unicode_Radical_Stroke => "",
1123    Uppercase_Mapping => $CODE_POINT,
1124    Word_Break => 'Other',
1125);
1126
1127### End of externally interesting definitions, except for @input_file_objects
1128
1129my $HEADER=<<"EOF";
1130# !!!!!!!   DO NOT EDIT THIS FILE   !!!!!!!
1131# This file is machine-generated by $0 from the Unicode
1132# database, Version $unicode_version.  Any changes made here will be lost!
1133EOF
1134
1135my $INTERNAL_ONLY_HEADER = <<"EOF";
1136
1137# !!!!!!!   INTERNAL PERL USE ONLY   !!!!!!!
1138# This file is for internal use by core Perl only.  The format and even the
1139# name or existence of this file are subject to change without notice.  Don't
1140# use it directly.  Use Unicode::UCD to access the Unicode character data
1141# base.
1142EOF
1143
1144my $DEVELOPMENT_ONLY=<<"EOF";
1145# !!!!!!!   DEVELOPMENT USE ONLY   !!!!!!!
1146# This file contains information artificially constrained to code points
1147# present in Unicode release $string_compare_versions.
1148# IT CANNOT BE RELIED ON.  It is for use during development only and should
1149# not be used for production.
1150
1151EOF
1152
1153my $MAX_UNICODE_CODEPOINT_STRING = ($v_version ge v2.0.0)
1154                                   ? "10FFFF"
1155                                   : "FFFF";
1156my $MAX_UNICODE_CODEPOINT = hex $MAX_UNICODE_CODEPOINT_STRING;
1157my $MAX_UNICODE_CODEPOINTS = $MAX_UNICODE_CODEPOINT + 1;
1158
1159# We work with above-Unicode code points, up to IV_MAX, but we may want to use
1160# sentinels above that number.  Therefore for internal use, we use a much
1161# smaller number, translating it to IV_MAX only for output.  The exact number
1162# is immaterial (all above-Unicode code points are treated exactly the same),
1163# but the algorithm requires it to be at least
1164# 2 * $MAX_UNICODE_CODEPOINTS + 1
1165my $MAX_WORKING_CODEPOINTS= $MAX_UNICODE_CODEPOINT * 8;
1166my $MAX_WORKING_CODEPOINT = $MAX_WORKING_CODEPOINTS - 1;
1167my $MAX_WORKING_CODEPOINT_STRING = sprintf("%X", $MAX_WORKING_CODEPOINT);
1168
1169my $MAX_PLATFORM_CODEPOINT = ~0 >> 1;
1170
1171# Matches legal code point.  4-6 hex numbers, If there are 6, the first
1172# two must be 10; if there are 5, the first must not be a 0.  Written this way
1173# to decrease backtracking.  The first regex allows the code point to be at
1174# the end of a word, but to work properly, the word shouldn't end with a valid
1175# hex character.  The second one won't match a code point at the end of a
1176# word, and doesn't have the run-on issue
1177my $run_on_code_point_re =
1178            qr/ (?: 10[0-9A-F]{4} | [1-9A-F][0-9A-F]{4} | [0-9A-F]{4} ) \b/x;
1179my $code_point_re = qr/\b$run_on_code_point_re/;
1180
1181# This matches the beginning of the line in the Unicode DB files that give the
1182# defaults for code points not listed (i.e., missing) in the file.  The code
1183# depends on this ending with a semi-colon, so it can assume it is a valid
1184# field when the line is split() by semi-colons
1185my $missing_defaults_prefix = qr/^#\s+\@missing:\s+0000\.\.10FFFF\s*;/;
1186
1187# Property types.  Unicode has more types, but these are sufficient for our
1188# purposes.
1189my $UNKNOWN = -1;   # initialized to illegal value
1190my $NON_STRING = 1; # Either binary or enum
1191my $BINARY = 2;
1192my $FORCED_BINARY = 3; # Not a binary property, but, besides its normal
1193                       # tables, additional true and false tables are
1194                       # generated so that false is anything matching the
1195                       # default value, and true is everything else.
1196my $ENUM = 4;       # Include catalog
1197my $STRING = 5;     # Anything else: string or misc
1198
1199# Some input files have lines that give default values for code points not
1200# contained in the file.  Sometimes these should be ignored.
1201my $NO_DEFAULTS = 0;        # Must evaluate to false
1202my $NOT_IGNORED = 1;
1203my $IGNORED = 2;
1204
1205# Range types.  Each range has a type.  Most ranges are type 0, for normal,
1206# and will appear in the main body of the tables in the output files, but
1207# there are other types of ranges as well, listed below, that are specially
1208# handled.   There are pseudo-types as well that will never be stored as a
1209# type, but will affect the calculation of the type.
1210
1211# 0 is for normal, non-specials
1212my $MULTI_CP = 1;           # Sequence of more than code point
1213my $HANGUL_SYLLABLE = 2;
1214my $CP_IN_NAME = 3;         # The NAME contains the code point appended to it.
1215my $NULL = 4;               # The map is to the null string; utf8.c can't
1216                            # handle these, nor is there an accepted syntax
1217                            # for them in \p{} constructs
1218my $COMPUTE_NO_MULTI_CP = 5; # Pseudo-type; means that ranges that would
1219                             # otherwise be $MULTI_CP type are instead type 0
1220
1221# process_generic_property_file() can accept certain overrides in its input.
1222# Each of these must begin AND end with $CMD_DELIM.
1223my $CMD_DELIM = "\a";
1224my $REPLACE_CMD = 'replace';    # Override the Replace
1225my $MAP_TYPE_CMD = 'map_type';  # Override the Type
1226
1227my $NO = 0;
1228my $YES = 1;
1229
1230# Values for the Replace argument to add_range.
1231# $NO                      # Don't replace; add only the code points not
1232                           # already present.
1233my $IF_NOT_EQUIVALENT = 1; # Replace only under certain conditions; details in
1234                           # the comments at the subroutine definition.
1235my $UNCONDITIONALLY = 2;   # Replace without conditions.
1236my $MULTIPLE_BEFORE = 4;   # Don't replace, but add a duplicate record if
1237                           # already there
1238my $MULTIPLE_AFTER = 5;    # Don't replace, but add a duplicate record if
1239                           # already there
1240my $CROAK = 6;             # Die with an error if is already there
1241
1242# Flags to give property statuses.  The phrases are to remind maintainers that
1243# if the flag is changed, the indefinite article referring to it in the
1244# documentation may need to be as well.
1245my $NORMAL = "";
1246my $DEPRECATED = 'D';
1247my $a_bold_deprecated = "a 'B<$DEPRECATED>'";
1248my $A_bold_deprecated = "A 'B<$DEPRECATED>'";
1249my $DISCOURAGED = 'X';
1250my $a_bold_discouraged = "an 'B<$DISCOURAGED>'";
1251my $A_bold_discouraged = "An 'B<$DISCOURAGED>'";
1252my $STRICTER = 'T';
1253my $a_bold_stricter = "a 'B<$STRICTER>'";
1254my $A_bold_stricter = "A 'B<$STRICTER>'";
1255my $STABILIZED = 'S';
1256my $a_bold_stabilized = "an 'B<$STABILIZED>'";
1257my $A_bold_stabilized = "An 'B<$STABILIZED>'";
1258my $OBSOLETE = 'O';
1259my $a_bold_obsolete = "an 'B<$OBSOLETE>'";
1260my $A_bold_obsolete = "An 'B<$OBSOLETE>'";
1261
1262# Aliases can also have an extra status:
1263my $INTERNAL_ALIAS = 'P';
1264
1265my %status_past_participles = (
1266    $DISCOURAGED => 'discouraged',
1267    $STABILIZED => 'stabilized',
1268    $OBSOLETE => 'obsolete',
1269    $DEPRECATED => 'deprecated',
1270    $INTERNAL_ALIAS => 'reserved for Perl core internal use only',
1271);
1272
1273# Table fates.  These are somewhat ordered, so that fates < $MAP_PROXIED should be
1274# externally documented.
1275my $ORDINARY = 0;       # The normal fate.
1276my $MAP_PROXIED = 1;    # The map table for the property isn't written out,
1277                        # but there is a file written that can be used to
1278                        # reconstruct this table
1279my $INTERNAL_ONLY = 2;  # The file for this table is written out, but it is
1280                        # for Perl's internal use only
1281my $LEGACY_ONLY = 3;    # Like $INTERNAL_ONLY, but not actually used by Perl.
1282                        # Is for backwards compatibility for applications that
1283                        # read the file directly, so it's format is
1284                        # unchangeable.
1285my $SUPPRESSED = 4;     # The file for this table is not written out, and as a
1286                        # result, we don't bother to do many computations on
1287                        # it.
1288my $PLACEHOLDER = 5;    # Like $SUPPRESSED, but we go through all the
1289                        # computations anyway, as the values are needed for
1290                        # things to work.  This happens when we have Perl
1291                        # extensions that depend on Unicode tables that
1292                        # wouldn't normally be in a given Unicode version.
1293
1294# The format of the values of the tables:
1295my $EMPTY_FORMAT = "";
1296my $BINARY_FORMAT = 'b';
1297my $DECIMAL_FORMAT = 'd';
1298my $FLOAT_FORMAT = 'f';
1299my $INTEGER_FORMAT = 'i';
1300my $HEX_FORMAT = 'x';
1301my $RATIONAL_FORMAT = 'r';
1302my $STRING_FORMAT = 's';
1303my $ADJUST_FORMAT = 'a';
1304my $HEX_ADJUST_FORMAT = 'ax';
1305my $DECOMP_STRING_FORMAT = 'c';
1306my $STRING_WHITE_SPACE_LIST = 'sw';
1307
1308my %map_table_formats = (
1309    $BINARY_FORMAT => 'binary',
1310    $DECIMAL_FORMAT => 'single decimal digit',
1311    $FLOAT_FORMAT => 'floating point number',
1312    $INTEGER_FORMAT => 'integer',
1313    $HEX_FORMAT => 'non-negative hex whole number; a code point',
1314    $RATIONAL_FORMAT => 'rational: an integer or a fraction',
1315    $STRING_FORMAT => 'string',
1316    $ADJUST_FORMAT => 'some entries need adjustment',
1317    $HEX_ADJUST_FORMAT => 'mapped value in hex; some entries need adjustment',
1318    $DECOMP_STRING_FORMAT => 'Perl\'s internal (Normalize.pm) decomposition mapping',
1319    $STRING_WHITE_SPACE_LIST => 'string, but some elements are interpreted as a list; white space occurs only as list item separators'
1320);
1321
1322# Unicode didn't put such derived files in a separate directory at first.
1323my $EXTRACTED_DIR = (-d 'extracted') ? 'extracted' : "";
1324my $EXTRACTED = ($EXTRACTED_DIR) ? "$EXTRACTED_DIR/" : "";
1325my $AUXILIARY = 'auxiliary';
1326my $EMOJI = 'emoji';
1327
1328# Hashes and arrays that will eventually go into UCD.pl for the use of UCD.pm
1329my %loose_to_file_of;       # loosely maps table names to their respective
1330                            # files
1331my %stricter_to_file_of;    # same; but for stricter mapping.
1332my %loose_property_to_file_of; # Maps a loose property name to its map file
1333my %strict_property_to_file_of; # Same, but strict
1334my @inline_definitions = "V0"; # Each element gives a definition of a unique
1335                            # inversion list.  When a definition is inlined,
1336                            # its value in the hash it's in (one of the two
1337                            # defined just above) will include an index into
1338                            # this array.  The 0th element is initialized to
1339                            # the definition for a zero length inversion list
1340my %file_to_swash_name;     # Maps the file name to its corresponding key name
1341                            # in the hash %Unicode::UCD::SwashInfo
1342my %nv_floating_to_rational; # maps numeric values floating point numbers to
1343                             # their rational equivalent
1344my %loose_property_name_of; # Loosely maps (non_string) property names to
1345                            # standard form
1346my %strict_property_name_of; # Strictly maps (non_string) property names to
1347                            # standard form
1348my %string_property_loose_to_name; # Same, for string properties.
1349my %loose_defaults;         # keys are of form "prop=value", where 'prop' is
1350                            # the property name in standard loose form, and
1351                            # 'value' is the default value for that property,
1352                            # also in standard loose form.
1353my %loose_to_standard_value; # loosely maps table names to the canonical
1354                            # alias for them
1355my %ambiguous_names;        # keys are alias names (in standard form) that
1356                            # have more than one possible meaning.
1357my %combination_property;   # keys are alias names (in standard form) that
1358                            # have both a map table, and a binary one that
1359                            # yields true for all non-null maps.
1360my %prop_aliases;           # Keys are standard property name; values are each
1361                            # one's aliases
1362my %prop_value_aliases;     # Keys of top level are standard property name;
1363                            # values are keys to another hash,  Each one is
1364                            # one of the property's values, in standard form.
1365                            # The values are that prop-val's aliases.
1366my %skipped_files;          # List of files that we skip
1367my %ucd_pod;    # Holds entries that will go into the UCD section of the pod
1368
1369# Most properties are immune to caseless matching, otherwise you would get
1370# nonsensical results, as properties are a function of a code point, not
1371# everything that is caselessly equivalent to that code point.  For example,
1372# Changes_When_Case_Folded('s') should be false, whereas caselessly it would
1373# be true because 's' and 'S' are equivalent caselessly.  However,
1374# traditionally, [:upper:] and [:lower:] are equivalent caselessly, so we
1375# extend that concept to those very few properties that are like this.  Each
1376# such property will match the full range caselessly.  They are hard-coded in
1377# the program; it's not worth trying to make it general as it's extremely
1378# unlikely that they will ever change.
1379my %caseless_equivalent_to;
1380
1381# This is the range of characters that were in Release 1 of Unicode, and
1382# removed in Release 2 (replaced with the current Hangul syllables starting at
1383# U+AC00).  The range was reused starting in Release 3 for other purposes.
1384my $FIRST_REMOVED_HANGUL_SYLLABLE = 0x3400;
1385my $FINAL_REMOVED_HANGUL_SYLLABLE = 0x4DFF;
1386
1387# These constants names and values were taken from the Unicode standard,
1388# version 5.1, section 3.12.  They are used in conjunction with Hangul
1389# syllables.  The '_string' versions are so generated tables can retain the
1390# hex format, which is the more familiar value
1391my $SBase_string = "0xAC00";
1392my $SBase = CORE::hex $SBase_string;
1393my $LBase_string = "0x1100";
1394my $LBase = CORE::hex $LBase_string;
1395my $VBase_string = "0x1161";
1396my $VBase = CORE::hex $VBase_string;
1397my $TBase_string = "0x11A7";
1398my $TBase = CORE::hex $TBase_string;
1399my $SCount = 11172;
1400my $LCount = 19;
1401my $VCount = 21;
1402my $TCount = 28;
1403my $NCount = $VCount * $TCount;
1404
1405# For Hangul syllables;  These store the numbers from Jamo.txt in conjunction
1406# with the above published constants.
1407my %Jamo;
1408my %Jamo_L;     # Leading consonants
1409my %Jamo_V;     # Vowels
1410my %Jamo_T;     # Trailing consonants
1411
1412# For code points whose name contains its ordinal as a '-ABCD' suffix.
1413# The key is the base name of the code point, and the value is an
1414# array giving all the ranges that use this base name.  Each range
1415# is actually a hash giving the 'low' and 'high' values of it.
1416my %names_ending_in_code_point;
1417my %loose_names_ending_in_code_point;   # Same as above, but has blanks, dashes
1418                                        # removed from the names
1419# Inverse mapping.  The list of ranges that have these kinds of
1420# names.  Each element contains the low, high, and base names in an
1421# anonymous hash.
1422my @code_points_ending_in_code_point;
1423
1424# To hold Unicode's normalization test suite
1425my @normalization_tests;
1426
1427# Boolean: does this Unicode version have the hangul syllables, and are we
1428# writing out a table for them?
1429my $has_hangul_syllables = 0;
1430
1431# Does this Unicode version have code points whose names end in their
1432# respective code points, and are we writing out a table for them?  0 for no;
1433# otherwise points to first property that a table is needed for them, so that
1434# if multiple tables are needed, we don't create duplicates
1435my $needing_code_points_ending_in_code_point = 0;
1436
1437my @backslash_X_tests;     # List of tests read in for testing \X
1438my @LB_tests;              # List of tests read in for testing \b{lb}
1439my @SB_tests;              # List of tests read in for testing \b{sb}
1440my @WB_tests;              # List of tests read in for testing \b{wb}
1441my @unhandled_properties;  # Will contain a list of properties found in
1442                           # the input that we didn't process.
1443my @match_properties;      # Properties that have match tables, to be
1444                           # listed in the pod
1445my @map_properties;        # Properties that get map files written
1446my @named_sequences;       # NamedSequences.txt contents.
1447my %potential_files;       # Generated list of all .txt files in the directory
1448                           # structure so we can warn if something is being
1449                           # ignored.
1450my @missing_early_files;   # Generated list of absent files that we need to
1451                           # proceed in compiling this early Unicode version
1452my @files_actually_output; # List of files we generated.
1453my @more_Names;            # Some code point names are compound; this is used
1454                           # to store the extra components of them.
1455my $E_FLOAT_PRECISION = 2; # The minimum number of digits after the decimal
1456                           # point of a normalized floating point number
1457                           # needed to match before we consider it equivalent
1458                           # to a candidate rational
1459
1460# These store references to certain commonly used property objects
1461my $age;
1462my $ccc;
1463my $gc;
1464my $perl;
1465my $block;
1466my $perl_charname;
1467my $print;
1468my $All;
1469my $Assigned;   # All assigned characters in this Unicode release
1470my $DI;         # Default_Ignorable_Code_Point property
1471my $NChar;      # Noncharacter_Code_Point property
1472my $script;
1473my $scx;        # Script_Extensions property
1474my $idt;        # Identifier_Type property
1475
1476# Are there conflicting names because of beginning with 'In_', or 'Is_'
1477my $has_In_conflicts = 0;
1478my $has_Is_conflicts = 0;
1479
1480sub internal_file_to_platform ($file=undef) {
1481    # Convert our file paths which have '/' separators to those of the
1482    # platform.
1483
1484    return undef unless defined $file;
1485
1486    return File::Spec->join(split '/', $file);
1487}
1488
1489sub file_exists ($file=undef) {   # platform independent '-e'.  This program internally
1490                        # uses slash as a path separator.
1491    return 0 unless defined $file;
1492    return -e internal_file_to_platform($file);
1493}
1494
1495sub objaddr($addr) {
1496    # Returns the address of the blessed input object.
1497    # It doesn't check for blessedness because that would do a string eval
1498    # every call, and the program is structured so that this is never called
1499    # for a non-blessed object.
1500
1501    no overloading; # If overloaded, numifying below won't work.
1502
1503    # Numifying a ref gives its address.
1504    return pack 'J', $addr;
1505}
1506
1507# These are used only if $annotate is true.
1508# The entire range of Unicode characters is examined to populate these
1509# after all the input has been processed.  But most can be skipped, as they
1510# have the same descriptive phrases, such as being unassigned
1511my @viacode;            # Contains the 1 million character names
1512my @age;                # And their ages ("" if none)
1513my @printable;          # boolean: And are those characters printable?
1514my @annotate_char_type; # Contains a type of those characters, specifically
1515                        # for the purposes of annotation.
1516my $annotate_ranges;    # A map of ranges of code points that have the same
1517                        # name for the purposes of annotation.  They map to the
1518                        # upper edge of the range, so that the end point can
1519                        # be immediately found.  This is used to skip ahead to
1520                        # the end of a range, and avoid processing each
1521                        # individual code point in it.
1522my $unassigned_sans_noncharacters; # A Range_List of the unassigned
1523                                   # characters, but excluding those which are
1524                                   # also noncharacter code points
1525
1526# The annotation types are an extension of the regular range types, though
1527# some of the latter are folded into one.  Make the new types negative to
1528# avoid conflicting with the regular types
1529my $SURROGATE_TYPE = -1;
1530my $UNASSIGNED_TYPE = -2;
1531my $PRIVATE_USE_TYPE = -3;
1532my $NONCHARACTER_TYPE = -4;
1533my $CONTROL_TYPE = -5;
1534my $ABOVE_UNICODE_TYPE = -6;
1535my $UNKNOWN_TYPE = -7;  # Used only if there is a bug in this program
1536
1537sub populate_char_info ($i) {
1538    # Used only with the $annotate option.  Populates the arrays with the
1539    # input code point's info that are needed for outputting more detailed
1540    # comments.  If calling context wants a return, it is the end point of
1541    # any contiguous range of characters that share essentially the same info
1542
1543    $viacode[$i] = $perl_charname->value_of($i) || "";
1544    $age[$i] = (defined $age)
1545               ? (($age->value_of($i) =~ / ^ \d+ \. \d+ $ /x)
1546                  ? $age->value_of($i)
1547                  : "")
1548               : "";
1549
1550    # A character is generally printable if Unicode says it is,
1551    # but below we make sure that most Unicode general category 'C' types
1552    # aren't.
1553    $printable[$i] = $print->contains($i);
1554
1555    # But the characters in this range were removed in v2.0 and replaced by
1556    # different ones later.  Modern fonts will be for the replacement
1557    # characters, so suppress printing them.
1558    if (($v_version lt v2.0
1559         || ($compare_versions && $compare_versions lt v2.0))
1560        && (   $i >= $FIRST_REMOVED_HANGUL_SYLLABLE
1561            && $i <= $FINAL_REMOVED_HANGUL_SYLLABLE))
1562    {
1563        $printable[$i] = 0;
1564    }
1565
1566    $annotate_char_type[$i] = $perl_charname->type_of($i) || 0;
1567
1568    # Only these two regular types are treated specially for annotations
1569    # purposes
1570    $annotate_char_type[$i] = 0 if $annotate_char_type[$i] != $CP_IN_NAME
1571                                && $annotate_char_type[$i] != $HANGUL_SYLLABLE;
1572
1573    # Give a generic name to all code points that don't have a real name.
1574    # We output ranges, if applicable, for these.  Also calculate the end
1575    # point of the range.
1576    my $end;
1577    if (! $viacode[$i]) {
1578        if ($i > $MAX_UNICODE_CODEPOINT) {
1579            $viacode[$i] = 'Above-Unicode';
1580            $annotate_char_type[$i] = $ABOVE_UNICODE_TYPE;
1581            $printable[$i] = 0;
1582            $end = $MAX_WORKING_CODEPOINT;
1583        }
1584        elsif ($gc-> table('Private_use')->contains($i)) {
1585            $viacode[$i] = 'Private Use';
1586            $annotate_char_type[$i] = $PRIVATE_USE_TYPE;
1587            $printable[$i] = 0;
1588            $end = $gc->table('Private_Use')->containing_range($i)->end;
1589        }
1590        elsif ($NChar->contains($i)) {
1591            $viacode[$i] = 'Noncharacter';
1592            $annotate_char_type[$i] = $NONCHARACTER_TYPE;
1593            $printable[$i] = 0;
1594            $end = $NChar->containing_range($i)->end;
1595        }
1596        elsif ($gc-> table('Control')->contains($i)) {
1597            my $name_ref = property_ref('Name_Alias');
1598            $name_ref = property_ref('Unicode_1_Name') if ! defined $name_ref;
1599            $viacode[$i] = (defined $name_ref)
1600                           ? $name_ref->value_of($i)
1601                           : 'Control';
1602            $annotate_char_type[$i] = $CONTROL_TYPE;
1603            $printable[$i] = 0;
1604        }
1605        elsif ($gc-> table('Unassigned')->contains($i)) {
1606            $annotate_char_type[$i] = $UNASSIGNED_TYPE;
1607            $printable[$i] = 0;
1608            $viacode[$i] = 'Unassigned';
1609
1610            if (defined $block) { # No blocks in earliest releases
1611                $viacode[$i] .= ', block=' . $block-> value_of($i);
1612                $end = $gc-> table('Unassigned')->containing_range($i)->end;
1613
1614                # Because we name the unassigned by the blocks they are in, it
1615                # can't go past the end of that block, and it also can't go
1616                # past the unassigned range it is in.  The special table makes
1617                # sure that the non-characters, which are unassigned, are
1618                # separated out.
1619                $end = min($block->containing_range($i)->end,
1620                           $unassigned_sans_noncharacters->
1621                                                    containing_range($i)->end);
1622            }
1623            else {
1624                $end = $i + 1;
1625                while ($unassigned_sans_noncharacters->contains($end)) {
1626                    $end++;
1627                }
1628                $end--;
1629            }
1630        }
1631        elsif ($perl->table('_Perl_Surrogate')->contains($i)) {
1632            $viacode[$i] = 'Surrogate';
1633            $annotate_char_type[$i] = $SURROGATE_TYPE;
1634            $printable[$i] = 0;
1635            $end = $gc->table('Surrogate')->containing_range($i)->end;
1636        }
1637        else {
1638            Carp::my_carp_bug("Can't figure out how to annotate "
1639                              . sprintf("U+%04X", $i)
1640                              . ".  Proceeding anyway.");
1641            $viacode[$i] = 'UNKNOWN';
1642            $annotate_char_type[$i] = $UNKNOWN_TYPE;
1643            $printable[$i] = 0;
1644        }
1645    }
1646
1647    # Here, has a name, but if it's one in which the code point number is
1648    # appended to the name, do that.
1649    elsif ($annotate_char_type[$i] == $CP_IN_NAME) {
1650        $viacode[$i] .= sprintf("-%04X", $i);
1651
1652        my $limit = $perl_charname->containing_range($i)->end;
1653        if (defined $age) {
1654            # Do all these as groups of the same age, instead of individually,
1655            # because their names are so meaningless, and there are typically
1656            # large quantities of them.
1657            $end = $i + 1;
1658            while ($end <= $limit && $age->value_of($end) == $age[$i]) {
1659                $end++;
1660            }
1661            $end--;
1662        }
1663        else {
1664            $end = $limit;
1665        }
1666    }
1667
1668    # And here, has a name, but if it's a hangul syllable one, replace it with
1669    # the correct name from the Unicode algorithm
1670    elsif ($annotate_char_type[$i] == $HANGUL_SYLLABLE) {
1671        use integer;
1672        my $SIndex = $i - $SBase;
1673        my $L = $LBase + $SIndex / $NCount;
1674        my $V = $VBase + ($SIndex % $NCount) / $TCount;
1675        my $T = $TBase + $SIndex % $TCount;
1676        $viacode[$i] = "HANGUL SYLLABLE $Jamo{$L}$Jamo{$V}";
1677        $viacode[$i] .= $Jamo{$T} if $T != $TBase;
1678        $end = $perl_charname->containing_range($i)->end;
1679    }
1680
1681    return if ! defined wantarray;
1682    return $i if ! defined $end;    # If not a range, return the input
1683
1684    # Save this whole range so can find the end point quickly
1685    $annotate_ranges->add_map($i, $end, $end);
1686
1687    return $end;
1688}
1689
1690sub max($a, $b) {
1691    return $a >= $b ? $a : $b;
1692}
1693
1694sub min($a, $b) {
1695    return $a <= $b ? $a : $b;
1696}
1697
1698sub clarify_number ($number) {
1699    # This returns the input number with underscores inserted every 3 digits
1700    # in large (5 digits or more) numbers.  Input must be entirely digits, not
1701    # checked.
1702
1703    my $pos = length($number) - 3;
1704    return $number if $pos <= 1;
1705    while ($pos > 0) {
1706        substr($number, $pos, 0) = '_';
1707        $pos -= 3;
1708    }
1709    return $number;
1710}
1711
1712sub clarify_code_point_count ($number) {
1713    # This is like clarify_number(), but the input is assumed to be a count of
1714    # code points, rather than a generic number.
1715
1716    my $append = "";
1717
1718    if ($number > $MAX_UNICODE_CODEPOINTS) {
1719        $number -= ($MAX_WORKING_CODEPOINTS - $MAX_UNICODE_CODEPOINTS);
1720        return "All above-Unicode code points" if $number == 0;
1721        $append = " + all above-Unicode code points";
1722    }
1723    return clarify_number($number) . $append;
1724}
1725
1726package Carp;
1727
1728# These routines give a uniform treatment of messages in this program.  They
1729# are placed in the Carp package to cause the stack trace to not include them,
1730# although an alternative would be to use another package and set @CARP_NOT
1731# for it.
1732
1733our $Verbose = 1 if main::DEBUG;  # Useful info when debugging
1734
1735# This is a work-around suggested by Nicholas Clark to fix a problem with Carp
1736# and overload trying to load Scalar:Util under miniperl.  See
1737# http://www.xray.mpe.mpg.de/mailing-lists/perl5-porters/2009-11/msg01057.html
1738undef $overload::VERSION;
1739
1740sub my_carp($message="", $nofold=0) {
1741
1742    if ($message) {
1743        $message = main::join_lines($message);
1744        $message =~ s/^$0: *//;     # Remove initial program name
1745        $message =~ s/[.;,]+$//;    # Remove certain ending punctuation
1746        $message = "\n$0: $message;";
1747
1748        # Fold the message with program name, semi-colon end punctuation
1749        # (which looks good with the message that carp appends to it), and a
1750        # hanging indent for continuation lines.
1751        $message = main::simple_fold($message, "", 4) unless $nofold;
1752        $message =~ s/\n$//;        # Remove the trailing nl so what carp
1753                                    # appends is to the same line
1754    }
1755
1756    return $message if defined wantarray;   # If a caller just wants the msg
1757
1758    carp $message;
1759    return;
1760}
1761
1762sub my_carp_bug($message="") {
1763    # This is called when it is clear that the problem is caused by a bug in
1764    # this program.
1765    $message =~ s/^$0: *//;
1766    $message = my_carp("Bug in $0.  Please report it by running perlbug or if that is unavailable, by sending email to perbug\@perl.org:\n$message");
1767    carp $message;
1768    return;
1769}
1770
1771sub carp_too_few_args($args_ref, $count) {
1772    my_carp_bug("Need at least $count arguments to "
1773        . (caller 1)[3]
1774        . ".  Instead got: '"
1775        . join ', ', @$args_ref
1776        . "'.  No action taken.");
1777    return;
1778}
1779
1780sub carp_extra_args($args_ref) {
1781    unless (ref $args_ref) {
1782        my_carp_bug("Argument to 'carp_extra_args' ($args_ref) must be a ref.  Not checking arguments.");
1783        return;
1784    }
1785    my ($package, $file, $line) = caller;
1786    my $subroutine = (caller 1)[3];
1787
1788    my $list;
1789    if (ref $args_ref eq 'HASH') {
1790        foreach my $key (keys %$args_ref) {
1791            $args_ref->{$key} = $UNDEF unless defined $args_ref->{$key};
1792        }
1793        $list = join ', ', each %{$args_ref};
1794    }
1795    elsif (ref $args_ref eq 'ARRAY') {
1796        foreach my $arg (@$args_ref) {
1797            $arg = $UNDEF unless defined $arg;
1798        }
1799        $list = join ', ', @$args_ref;
1800    }
1801    else {
1802        my_carp_bug("Can't cope with ref "
1803                . ref($args_ref)
1804                . " . argument to 'carp_extra_args'.  Not checking arguments.");
1805        return;
1806    }
1807
1808    my_carp_bug("Unrecognized parameters in options: '$list' to $subroutine.  Skipped.");
1809    return;
1810}
1811
1812package main;
1813
1814{ # Closure
1815
1816    # This program uses the inside-out method for objects, as recommended in
1817    # "Perl Best Practices".  (This is the best solution still, since this has
1818    # to run under miniperl.)  This closure aids in generating those.  There
1819    # are two routines.  setup_package() is called once per package to set
1820    # things up, and then set_access() is called for each hash representing a
1821    # field in the object.  These routines arrange for the object to be
1822    # properly destroyed when no longer used, and for standard accessor
1823    # functions to be generated.  If you need more complex accessors, just
1824    # write your own and leave those accesses out of the call to set_access().
1825    # More details below.
1826
1827    my %constructor_fields; # fields that are to be used in constructors; see
1828                            # below
1829
1830    # The values of this hash will be the package names as keys to other
1831    # hashes containing the name of each field in the package as keys, and
1832    # references to their respective hashes as values.
1833    my %package_fields;
1834
1835    sub setup_package {
1836        # Sets up the package, creating standard DESTROY and dump methods
1837        # (unless already defined).  The dump method is used in debugging by
1838        # simple_dumper().
1839        # The optional parameters are:
1840        #   a)  a reference to a hash, that gets populated by later
1841        #       set_access() calls with one of the accesses being
1842        #       'constructor'.  The caller can then refer to this, but it is
1843        #       not otherwise used by these two routines.
1844        #   b)  a reference to a callback routine to call during destruction
1845        #       of the object, before any fields are actually destroyed
1846
1847        my %args = @_;
1848        my $constructor_ref = delete $args{'Constructor_Fields'};
1849        my $destroy_callback = delete $args{'Destroy_Callback'};
1850        Carp::carp_extra_args(\@_) if main::DEBUG && %args;
1851
1852        my %fields;
1853        my $package = (caller)[0];
1854
1855        $package_fields{$package} = \%fields;
1856        $constructor_fields{$package} = $constructor_ref;
1857
1858        unless ($package->can('DESTROY')) {
1859            my $destroy_name = "${package}::DESTROY";
1860            no strict "refs";
1861
1862            # Use typeglob to give the anonymous subroutine the name we want
1863            *$destroy_name = sub {
1864                my $self = shift;
1865                my $addr = do { no overloading; pack 'J', $self; };
1866
1867                $self->$destroy_callback if $destroy_callback;
1868                foreach my $field (keys %{$package_fields{$package}}) {
1869                    #print STDERR __LINE__, ": Destroying ", ref $self, " ", sprintf("%04X", $addr), ": ", $field, "\n";
1870                    delete $package_fields{$package}{$field}{$addr};
1871                }
1872                return;
1873            }
1874        }
1875
1876        unless ($package->can('dump')) {
1877            my $dump_name = "${package}::dump";
1878            no strict "refs";
1879            *$dump_name = sub {
1880                my $self = shift;
1881                return dump_inside_out($self, $package_fields{$package}, @_);
1882            }
1883        }
1884        return;
1885    }
1886
1887    sub set_access($name, $field, @accessors) {
1888        # Arrange for the input field to be garbage collected when no longer
1889        # needed.  Also, creates standard accessor functions for the field
1890        # based on the optional parameters-- none if none of these parameters:
1891        #   'addable'    creates an 'add_NAME()' accessor function.
1892        #   'readable' or 'readable_array'   creates a 'NAME()' accessor
1893        #                function.
1894        #   'settable'   creates a 'set_NAME()' accessor function.
1895        #   'constructor' doesn't create an accessor function, but adds the
1896        #                field to the hash that was previously passed to
1897        #                setup_package();
1898        # Any of the accesses can be abbreviated down, so that 'a', 'ad',
1899        # 'add' etc. all mean 'addable'.
1900        # The read accessor function will work on both array and scalar
1901        # values.  If another accessor in the parameter list is 'a', the read
1902        # access assumes an array.  You can also force it to be array access
1903        # by specifying 'readable_array' instead of 'readable'
1904        #
1905        # A sort-of 'protected' access can be set-up by preceding the addable,
1906        # readable or settable with some initial portion of 'protected_' (but,
1907        # the underscore is required), like 'p_a', 'pro_set', etc.  The
1908        # "protection" is only by convention.  All that happens is that the
1909        # accessor functions' names begin with an underscore.  So instead of
1910        # calling set_foo, the call is _set_foo.  (Real protection could be
1911        # accomplished by having a new subroutine, end_package, called at the
1912        # end of each package, and then storing the __LINE__ ranges and
1913        # checking them on every accessor.  But that is way overkill.)
1914
1915        # We create anonymous subroutines as the accessors and then use
1916        # typeglobs to assign them to the proper package and name
1917
1918        # $name 	Name of the field
1919        # $field 	Reference to the inside-out hash containing the
1920		# 			field
1921
1922        my $package = (caller)[0];
1923
1924        if (! exists $package_fields{$package}) {
1925            croak "$0: Must call 'setup_package' before 'set_access'";
1926        }
1927
1928        # Stash the field so DESTROY can get it.
1929        $package_fields{$package}{$name} = $field;
1930
1931        # Remaining arguments are the accessors.  For each...
1932        foreach my $access (@accessors) {
1933            my $access = lc $access;
1934
1935            my $protected = "";
1936
1937            # Match the input as far as it goes.
1938            if ($access =~ /^(p[^_]*)_/) {
1939                $protected = $1;
1940                if (substr('protected_', 0, length $protected)
1941                    eq $protected)
1942                {
1943
1944                    # Add 1 for the underscore not included in $protected
1945                    $access = substr($access, length($protected) + 1);
1946                    $protected = '_';
1947                }
1948                else {
1949                    $protected = "";
1950                }
1951            }
1952
1953            if (substr('addable', 0, length $access) eq $access) {
1954                my $subname = "${package}::${protected}add_$name";
1955                no strict "refs";
1956
1957                # add_ accessor.  Don't add if already there, which we
1958                # determine using 'eq' for scalars and '==' otherwise.
1959                *$subname = sub ($self, $value) {
1960                    use strict "refs";
1961                    my $addr = do { no overloading; pack 'J', $self; };
1962                    if (ref $value) {
1963                        return if grep { $value == $_ } @{$field->{$addr}};
1964                    }
1965                    else {
1966                        return if grep { $value eq $_ } @{$field->{$addr}};
1967                    }
1968                    push @{$field->{$addr}}, $value;
1969                    return;
1970                }
1971            }
1972            elsif (substr('constructor', 0, length $access) eq $access) {
1973                if ($protected) {
1974                    Carp::my_carp_bug("Can't set-up 'protected' constructors")
1975                }
1976                else {
1977                    $constructor_fields{$package}{$name} = $field;
1978                }
1979            }
1980            elsif (substr('readable_array', 0, length $access) eq $access) {
1981
1982                # Here has read access.  If one of the other parameters for
1983                # access is array, or this one specifies array (by being more
1984                # than just 'readable_'), then create a subroutine that
1985                # assumes the data is an array.  Otherwise just a scalar
1986                my $subname = "${package}::${protected}$name";
1987                if (grep { /^a/i } @_
1988                    or length($access) > length('readable_'))
1989                {
1990                    no strict "refs";
1991                    *$subname = sub ($_addr) {
1992                        use strict "refs";
1993                        my $addr = do { no overloading; pack 'J', $_addr; };
1994                        if (ref $field->{$addr} ne 'ARRAY') {
1995                            my $type = ref $field->{$addr};
1996                            $type = 'scalar' unless $type;
1997                            Carp::my_carp_bug("Trying to read $name as an array when it is a $type.  Big problems.");
1998                            return;
1999                        }
2000                        return scalar @{$field->{$addr}} unless wantarray;
2001
2002                        # Make a copy; had problems with caller modifying the
2003                        # original otherwise
2004                        my @return = @{$field->{$addr}};
2005                        return @return;
2006                    }
2007                }
2008                else {
2009
2010                    # Here not an array value, a simpler function.
2011                    no strict "refs";
2012                    *$subname = sub ($addr) {
2013                        use strict "refs";
2014                        no overloading;
2015                        return $field->{pack 'J', $addr};
2016                    }
2017                }
2018            }
2019            elsif (substr('settable', 0, length $access) eq $access) {
2020                my $subname = "${package}::${protected}set_$name";
2021                no strict "refs";
2022                *$subname = sub ($self, $value) {
2023                    use strict "refs";
2024                    # $self is $_[0]; $value is $_[1]
2025                    no overloading;
2026                    $field->{pack 'J', $self} = $value;
2027                    return;
2028                }
2029            }
2030            else {
2031                Carp::my_carp_bug("Unknown accessor type $access.  No accessor set.");
2032            }
2033        }
2034        return;
2035    }
2036}
2037
2038package Input_file;
2039
2040# All input files use this object, which stores various attributes about them,
2041# and provides for convenient, uniform handling.  The run method wraps the
2042# processing.  It handles all the bookkeeping of opening, reading, and closing
2043# the file, returning only significant input lines.
2044#
2045# Each object gets a handler which processes the body of the file, and is
2046# called by run().  All character property files must use the generic,
2047# default handler, which has code scrubbed to handle things you might not
2048# expect, including automatic EBCDIC handling.  For files that don't deal with
2049# mapping code points to a property value, such as test files,
2050# PropertyAliases, PropValueAliases, and named sequences, you can override the
2051# handler to be a custom one.  Such a handler should basically be a
2052# while(next_line()) {...} loop.
2053#
2054# You can also set up handlers to
2055#   0) call during object construction time, after everything else is done
2056#   1) call before the first line is read, for pre processing
2057#   2) call to adjust each line of the input before the main handler gets
2058#      them.  This can be automatically generated, if appropriately simple
2059#      enough, by specifying a Properties parameter in the constructor.
2060#   3) call upon EOF before the main handler exits its loop
2061#   4) call at the end, for post processing
2062#
2063# $_ is used to store the input line, and is to be filtered by the
2064# each_line_handler()s.  So, if the format of the line is not in the desired
2065# format for the main handler, these are used to do that adjusting.  They can
2066# be stacked (by enclosing them in an [ anonymous array ] in the constructor,
2067# so the $_ output of one is used as the input to the next.  The EOF handler
2068# is also stackable, but none of the others are, but could easily be changed
2069# to be so.
2070#
2071# Some properties are used by the Perl core but aren't defined until later
2072# Unicode releases.  The perl interpreter would have problems working when
2073# compiled with an earlier Unicode version that doesn't have them, so we need
2074# to define them somehow for those releases.  The 'Early' constructor
2075# parameter can be used to automatically handle this.  It is essentially
2076# ignored if the Unicode version being compiled has a data file for this
2077# property.  Either code to execute or a file to read can be specified.
2078# Details are at the %early definition.
2079#
2080# Most of the handlers can call insert_lines() or insert_adjusted_lines()
2081# which insert the parameters as lines to be processed before the next input
2082# file line is read.  This allows the EOF handler(s) to flush buffers, for
2083# example.  The difference between the two routines is that the lines inserted
2084# by insert_lines() are subjected to the each_line_handler()s.  (So if you
2085# called it from such a handler, you would get infinite recursion without some
2086# mechanism to prevent that.)  Lines inserted by insert_adjusted_lines() go
2087# directly to the main handler without any adjustments.  If the
2088# post-processing handler calls any of these, there will be no effect.  Some
2089# error checking for these conditions could be added, but it hasn't been done.
2090#
2091# carp_bad_line() should be called to warn of bad input lines, which clears $_
2092# to prevent further processing of the line.  This routine will output the
2093# message as a warning once, and then keep a count of the lines that have the
2094# same message, and output that count at the end of the file's processing.
2095# This keeps the number of messages down to a manageable amount.
2096#
2097# get_missings() should be called to retrieve any @missing input lines.
2098# Messages will be raised if this isn't done if the options aren't to ignore
2099# missings.
2100
2101sub trace { return main::trace(@_); }
2102
2103{ # Closure
2104    # Keep track of fields that are to be put into the constructor.
2105    my %constructor_fields;
2106
2107    main::setup_package(Constructor_Fields => \%constructor_fields);
2108
2109    my %file; # Input file name, required
2110    main::set_access('file', \%file, qw{ c r });
2111
2112    my %first_released; # Unicode version file was first released in, required
2113    main::set_access('first_released', \%first_released, qw{ c r });
2114
2115    my %handler;    # Subroutine to process the input file, defaults to
2116                    # 'process_generic_property_file'
2117    main::set_access('handler', \%handler, qw{ c });
2118
2119    my %property;
2120    # name of property this file is for.  defaults to none, meaning not
2121    # applicable, or is otherwise determinable, for example, from each line.
2122    main::set_access('property', \%property, qw{ c r });
2123
2124    my %optional;
2125    # This is either an unsigned number, or a list of property names.  In the
2126    # former case, if it is non-zero, it means the file is optional, so if the
2127    # file is absent, no warning about that is output.  In the latter case, it
2128    # is a list of properties that the file (exclusively) defines.  If the
2129    # file is present, tables for those properties will be produced; if
2130    # absent, none will, even if they are listed elsewhere (namely
2131    # PropertyAliases.txt and PropValueAliases.txt) as being in this release,
2132    # and no warnings will be raised about them not being available.  (And no
2133    # warning about the file itself will be raised.)
2134    main::set_access('optional', \%optional, qw{ c readable_array } );
2135
2136    my %non_skip;
2137    # This is used for debugging, to skip processing of all but a few input
2138    # files.  Add 'non_skip => 1' to the constructor for those files you want
2139    # processed when you set the $debug_skip global.
2140    main::set_access('non_skip', \%non_skip, 'c');
2141
2142    my %skip;
2143    # This is used to skip processing of this input file (semi-) permanently.
2144    # The value should be the reason the file is being skipped.  It is used
2145    # for files that we aren't planning to process anytime soon, but want to
2146    # allow to be in the directory and be checked for their names not
2147    # conflicting with any other files on a DOS 8.3 name filesystem, but to
2148    # not otherwise be processed, and to not raise a warning about not being
2149    # handled.  In the constructor call, any value that evaluates to a numeric
2150    # 0 or undef means don't skip.  Any other value is a string giving the
2151    # reason it is being skipped, and this will appear in generated pod.
2152    # However, an empty string reason will suppress the pod entry.
2153    # Internally, calls that evaluate to numeric 0 are changed into undef to
2154    # distinguish them from an empty string call.
2155    main::set_access('skip', \%skip, 'c', 'r');
2156
2157    my %each_line_handler;
2158    # list of subroutines to look at and filter each non-comment line in the
2159    # file.  defaults to none.  The subroutines are called in order, each is
2160    # to adjust $_ for the next one, and the final one adjusts it for
2161    # 'handler'
2162    main::set_access('each_line_handler', \%each_line_handler, 'c');
2163
2164    my %retain_trailing_comments;
2165    # This is used to not discard the comments that end data lines.  This
2166    # would be used only for files with non-typical syntax, and most code here
2167    # assumes that comments have been stripped, so special handlers would have
2168    # to be written.  It is assumed that the code will use these in
2169    # single-quoted contexts, and so any "'" marks in the comment will be
2170    # prefixed by a backslash.
2171    main::set_access('retain_trailing_comments', \%retain_trailing_comments, 'c');
2172
2173    my %properties; # Optional ordered list of the properties that occur in each
2174    # meaningful line of the input file.  If present, an appropriate
2175    # each_line_handler() is automatically generated and pushed onto the stack
2176    # of such handlers.  This is useful when a file contains multiple
2177    # properties per line, but no other special considerations are necessary.
2178    # The special value "<ignored>" means to discard the corresponding input
2179    # field.
2180    # Any @missing lines in the file should also match this syntax; no such
2181    # files exist as of 6.3.  But if it happens in a future release, the code
2182    # could be expanded to properly parse them.
2183    main::set_access('properties', \%properties, qw{ c r });
2184
2185    my %has_missings_defaults;
2186    # ? Are there lines in the file giving default values for code points
2187    # missing from it?.  Defaults to NO_DEFAULTS.  Otherwise NOT_IGNORED is
2188    # the norm, but IGNORED means it has such lines, but the handler doesn't
2189    # use them.  Having these three states allows us to catch changes to the
2190    # UCD that this program should track.  XXX This could be expanded to
2191    # specify the syntax for such lines, like %properties above.
2192    main::set_access('has_missings_defaults',
2193                                        \%has_missings_defaults, qw{ c r });
2194
2195    my %construction_time_handler;
2196    # Subroutine to call at the end of the new method.  If undef, no such
2197    # handler is called.
2198    main::set_access('construction_time_handler',
2199                                        \%construction_time_handler, qw{ c });
2200
2201    my %pre_handler;
2202    # Subroutine to call before doing anything else in the file.  If undef, no
2203    # such handler is called.
2204    main::set_access('pre_handler', \%pre_handler, qw{ c });
2205
2206    my %eof_handler;
2207    # Subroutines to call upon getting an EOF on the input file, but before
2208    # that is returned to the main handler.  This is to allow buffers to be
2209    # flushed.  The handler is expected to call insert_lines() or
2210    # insert_adjusted() with the buffered material
2211    main::set_access('eof_handler', \%eof_handler, qw{ c });
2212
2213    my %post_handler;
2214    # Subroutine to call after all the lines of the file are read in and
2215    # processed.  If undef, no such handler is called.  Note that this cannot
2216    # add lines to be processed; instead use eof_handler
2217    main::set_access('post_handler', \%post_handler, qw{ c });
2218
2219    my %progress_message;
2220    # Message to print to display progress in lieu of the standard one
2221    main::set_access('progress_message', \%progress_message, qw{ c });
2222
2223    my %handle;
2224    # cache open file handle, internal.  Is undef if file hasn't been
2225    # processed at all, empty if has;
2226    main::set_access('handle', \%handle);
2227
2228    my %added_lines;
2229    # cache of lines added virtually to the file, internal
2230    main::set_access('added_lines', \%added_lines);
2231
2232    my %remapped_lines;
2233    # cache of lines added virtually to the file, internal
2234    main::set_access('remapped_lines', \%remapped_lines);
2235
2236    my %errors;
2237    # cache of errors found, internal
2238    main::set_access('errors', \%errors);
2239
2240    my %missings;
2241    # storage of '@missing' defaults lines
2242    main::set_access('missings', \%missings);
2243
2244    my %early;
2245    # Used for properties that must be defined (for Perl's purposes) on
2246    # versions of Unicode earlier than Unicode itself defines them.  The
2247    # parameter is an array (it would be better to be a hash, but not worth
2248    # bothering about due to its rare use).
2249    #
2250    # The first element is either a code reference to call when in a release
2251    # earlier than the Unicode file is available in, or it is an alternate
2252    # file to use instead of the non-existent one.  This file must have been
2253    # plunked down in the same directory as mktables.  Should you be compiling
2254    # on a release that needs such a file, mktables will abort the
2255    # compilation, and tell you where to get the necessary file(s), and what
2256    # name(s) to use to store them as.
2257    # In the case of specifying an alternate file, the array must contain two
2258    # further elements:
2259    #
2260    # [1] is the name of the property that will be generated by this file.
2261    # The class automatically takes the input file and excludes any code
2262    # points in it that were not assigned in the Unicode version being
2263    # compiled.  It then uses this result to define the property in the given
2264    # version.  Since the property doesn't actually exist in the Unicode
2265    # version being compiled, this should be a name accessible only by core
2266    # perl.  If it is the same name as the regular property, the constructor
2267    # will mark the output table as a $PLACEHOLDER so that it doesn't actually
2268    # get output, and so will be unusable by non-core code.  Otherwise it gets
2269    # marked as $INTERNAL_ONLY.
2270    #
2271    # [2] is a property value to assign (only when compiling Unicode 1.1.5) to
2272    # the Hangul syllables in that release (which were ripped out in version
2273    # 2) for the given property .  (Hence it is ignored except when compiling
2274    # version 1.  You only get one value that applies to all of them, which
2275    # may not be the actual reality, but probably nobody cares anyway for
2276    # these obsolete characters.)
2277    #
2278    # [3] if present is the default value for the property to assign for code
2279    # points not given in the input.  If not present, the default from the
2280    # normal property is used
2281    #
2282    # [-1] If there is an extra final element that is the string 'ONLY_EARLY'.
2283    # it means to not add the name in [1] as an alias to the property name
2284    # used for these.  Normally, when compiling Unicode versions that don't
2285    # invoke the early handling, the name is added as a synonym.
2286    #
2287    # Not all files can be handled in the above way, and so the code ref
2288    # alternative is available.  It can do whatever it needs to.  The other
2289    # array elements are optional in this case, and the code is free to use or
2290    # ignore them if they are present.
2291    #
2292    # Internally, the constructor unshifts a 0 or 1 onto this array to
2293    # indicate if an early alternative is actually being used or not.  This
2294    # makes for easier testing later on.
2295    main::set_access('early', \%early, 'c');
2296
2297    my %only_early;
2298    main::set_access('only_early', \%only_early, 'c');
2299
2300    my %required_even_in_debug_skip;
2301    # debug_skip is used to speed up compilation during debugging by skipping
2302    # processing files that are not needed for the task at hand.  However,
2303    # some files pretty much can never be skipped, and this is used to specify
2304    # that this is one of them.  In order to skip this file, the call to the
2305    # constructor must be edited to comment out this parameter.
2306    main::set_access('required_even_in_debug_skip',
2307                     \%required_even_in_debug_skip, 'c');
2308
2309    my %withdrawn;
2310    # Some files get removed from the Unicode DB.  This is a version object
2311    # giving the first release without this file.
2312    main::set_access('withdrawn', \%withdrawn, 'c');
2313
2314    my %ucd;
2315    # Some files are not actually part of the Unicode Character Database.
2316    # These typically have a different way of indicating their version
2317    main::set_access('ucd', \%ucd, 'c');
2318
2319    my %in_this_release;
2320    # Calculated value from %first_released and %withdrawn.  Are we compiling
2321    # a Unicode release which includes this file?
2322    main::set_access('in_this_release', \%in_this_release);
2323
2324    sub _next_line;
2325    sub _next_line_with_remapped_range;
2326
2327    sub new {
2328        my $class = shift;
2329
2330        my $self = bless \do{ my $anonymous_scalar }, $class;
2331        my $addr = do { no overloading; pack 'J', $self; };
2332
2333        # Set defaults
2334        $handler{$addr} = \&main::process_generic_property_file;
2335        $retain_trailing_comments{$addr} = 0;
2336        $non_skip{$addr} = 0;
2337        $skip{$addr} = undef;
2338        $has_missings_defaults{$addr} = $NO_DEFAULTS;
2339        $handle{$addr} = undef;
2340        $added_lines{$addr} = [ ];
2341        $remapped_lines{$addr} = [ ];
2342        $each_line_handler{$addr} = [ ];
2343        $eof_handler{$addr} = [ ];
2344        $errors{$addr} = { };
2345        $missings{$addr} = [ ];
2346        $early{$addr} = [ ];
2347        $optional{$addr} = [ ];
2348        $ucd{$addr} = 1;
2349
2350        # Two positional parameters.
2351        return Carp::carp_too_few_args(\@_, 2) if main::DEBUG && @_ < 2;
2352        $file{$addr} = main::internal_file_to_platform(shift);
2353        $first_released{$addr} = shift;
2354
2355        # The rest of the arguments are key => value pairs
2356        # %constructor_fields has been set up earlier to list all possible
2357        # ones.  Either set or push, depending on how the default has been set
2358        # up just above.
2359        my %args = @_;
2360        foreach my $key (keys %args) {
2361            my $argument = $args{$key};
2362
2363            # Note that the fields are the lower case of the constructor keys
2364            my $hash = $constructor_fields{lc $key};
2365            if (! defined $hash) {
2366                Carp::my_carp_bug("Unrecognized parameters '$key => $argument' to new() for $self.  Skipped");
2367                next;
2368            }
2369            if (ref $hash->{$addr} eq 'ARRAY') {
2370                if (ref $argument eq 'ARRAY') {
2371                    foreach my $argument (@{$argument}) {
2372                        next if ! defined $argument;
2373                        push @{$hash->{$addr}}, $argument;
2374                    }
2375                }
2376                else {
2377                    push @{$hash->{$addr}}, $argument if defined $argument;
2378                }
2379            }
2380            else {
2381                $hash->{$addr} = $argument;
2382            }
2383            delete $args{$key};
2384        };
2385
2386        $non_skip{$addr} = 1 if $required_even_in_debug_skip{$addr};
2387
2388        # Convert 0 (meaning don't skip) to undef
2389        undef $skip{$addr} unless $skip{$addr};
2390
2391        # Handle the case where this file is optional
2392        my $pod_message_for_non_existent_optional = "";
2393        if ($optional{$addr}->@*) {
2394
2395            # First element is the pod message
2396            $pod_message_for_non_existent_optional
2397                                                = shift $optional{$addr}->@*;
2398            # Convert a 0 'Optional' argument to an empty list to make later
2399            # code more concise.
2400            if (   $optional{$addr}->@*
2401                && $optional{$addr}->@* == 1
2402                && $optional{$addr}[0] ne ""
2403                && $optional{$addr}[0] !~ /\D/
2404                && $optional{$addr}[0] == 0)
2405            {
2406                $optional{$addr} = [ ];
2407            }
2408            else {  # But if the only element doesn't evaluate to 0, make sure
2409                    # that this file is indeed considered optional below.
2410                unshift $optional{$addr}->@*, 1;
2411            }
2412        }
2413
2414        my $progress;
2415        my $function_instead_of_file = 0;
2416
2417        if ($early{$addr}->@* && $early{$addr}[-1] eq 'ONLY_EARLY') {
2418            $only_early{$addr} = 1;
2419            pop $early{$addr}->@*;
2420        }
2421
2422        # If we are compiling a Unicode release earlier than the file became
2423        # available, the constructor may have supplied a substitute
2424        if ($first_released{$addr} gt $v_version && $early{$addr}->@*) {
2425
2426            # Yes, we have a substitute, that we will use; mark it so
2427            unshift $early{$addr}->@*, 1;
2428
2429            # See the definition of %early for what the array elements mean.
2430            # Note that we have just unshifted onto the array, so the numbers
2431            # below are +1 of those in the %early description.
2432            # If we have a property this defines, create a table and default
2433            # map for it now (at essentially compile time), so that it will be
2434            # available for the whole of run time.  (We will want to add this
2435            # name as an alias when we are using the official property name;
2436            # but this must be deferred until run(), because at construction
2437            # time the official names have yet to be defined.)
2438            if ($early{$addr}[2]) {
2439                my $fate = ($property{$addr}
2440                            && $property{$addr} eq $early{$addr}[2])
2441                          ? $PLACEHOLDER
2442                          : $INTERNAL_ONLY;
2443                my $prop_object = Property->new($early{$addr}[2],
2444                                                Fate => $fate,
2445                                                Perl_Extension => 1,
2446                                                );
2447
2448                # If not specified by the constructor, use the default mapping
2449                # for the regular property for this substitute one.
2450                if ($early{$addr}[4]) {
2451                    $prop_object->set_default_map($early{$addr}[4]);
2452                }
2453                elsif (    defined $property{$addr}
2454                       &&  defined $default_mapping{$property{$addr}})
2455                {
2456                    $prop_object
2457                        ->set_default_map($default_mapping{$property{$addr}});
2458                }
2459            }
2460
2461            if (ref $early{$addr}[1] eq 'CODE') {
2462                $function_instead_of_file = 1;
2463
2464                # If the first element of the array is a code ref, the others
2465                # are optional.
2466                $handler{$addr} = $early{$addr}[1];
2467                $property{$addr} = $early{$addr}[2]
2468                                                if defined $early{$addr}[2];
2469                $progress = "substitute $file{$addr}";
2470
2471                undef $file{$addr};
2472            }
2473            else {  # Specifying a substitute file
2474
2475                if (! main::file_exists($early{$addr}[1])) {
2476
2477                    # If we don't see the substitute file, generate an error
2478                    # message giving the needed things, and add it to the list
2479                    # of such to output before actual processing happens
2480                    # (hence the user finds out all of them in one run).
2481                    # Instead of creating a general method for NameAliases,
2482                    # hard-code it here, as there is unlikely to ever be a
2483                    # second one which needs special handling.
2484                    my $string_version = ($file{$addr} eq "NameAliases.txt")
2485                                    ? 'at least 6.1 (the later, the better)'
2486                                    : sprintf "%vd", $first_released{$addr};
2487                    push @missing_early_files, <<END;
2488'$file{$addr}' version $string_version should be copied to '$early{$addr}[1]'.
2489END
2490                    ;
2491                    return;
2492                }
2493                $progress = $early{$addr}[1];
2494                $progress .= ", substituting for $file{$addr}" if $file{$addr};
2495                $file{$addr} = $early{$addr}[1];
2496                $property{$addr} = $early{$addr}[2];
2497
2498                # Ignore code points not in the version being compiled
2499                push $each_line_handler{$addr}->@*, \&_exclude_unassigned;
2500
2501                if (   $v_version lt v2.0        # Hanguls in this release ...
2502                    && defined $early{$addr}[3]) # ... need special treatment
2503                {
2504                    push $eof_handler{$addr}->@*, \&_fixup_obsolete_hanguls;
2505                }
2506            }
2507
2508            # And this substitute is valid for all releases.
2509            $first_released{$addr} = v0;
2510        }
2511        else {  # Normal behavior
2512            $progress = $file{$addr};
2513            unshift $early{$addr}->@*, 0; # No substitute
2514        }
2515
2516        my $file = $file{$addr};
2517        $progress_message{$addr} = "Processing $progress"
2518                                            unless $progress_message{$addr};
2519
2520        # A file should be there if it is within the window of versions for
2521        # which Unicode supplies it
2522        if ($withdrawn{$addr} && $withdrawn{$addr} le $v_version) {
2523            $in_this_release{$addr} = 0;
2524            $skip{$addr} = "";
2525        }
2526        else {
2527            $in_this_release{$addr} = $first_released{$addr} le $v_version;
2528
2529            # Check that the file for this object (possibly using a substitute
2530            # for early releases) exists or we have a function alternative
2531            if (   ! $function_instead_of_file
2532                && ! main::file_exists($file))
2533            {
2534                # Here there is nothing available for this release.  This is
2535                # fine if we aren't expecting anything in this release.
2536                if (! $in_this_release{$addr}) {
2537                    $skip{$addr} = "";  # Don't remark since we expected
2538                                        # nothing and got nothing
2539                }
2540                elsif ($optional{$addr}->@*) {
2541
2542                    # Here the file is optional in this release; Use the
2543                    # passed in text to document this case in the pod.
2544                    $skip{$addr} = $pod_message_for_non_existent_optional;
2545                }
2546                elsif (   $in_this_release{$addr}
2547                       && ! defined $skip{$addr}
2548                       && defined $file)
2549                { # Doesn't exist but should.
2550                    $skip{$addr} = "'$file' not found.  Possibly Big problems";
2551                    Carp::my_carp($skip{$addr});
2552                }
2553            }
2554            elsif ($debug_skip && ! defined $skip{$addr} && ! $non_skip{$addr})
2555            {
2556
2557                # The file exists; if not skipped for another reason, and we are
2558                # skipping most everything during debugging builds, use that as
2559                # the skip reason.
2560                $skip{$addr} = '$debug_skip is on'
2561            }
2562        }
2563
2564        if (   ! $debug_skip
2565            && $non_skip{$addr}
2566            && ! $required_even_in_debug_skip{$addr}
2567            && $verbosity)
2568        {
2569            print "Warning: " . __PACKAGE__ . " constructor for $file has useless 'non_skip' in it\n";
2570        }
2571
2572        # Here, we have figured out if we will be skipping this file or not.
2573        # If so, we add any single property it defines to any passed in
2574        # optional property list.  These will be dealt with at run time.
2575        if (defined $skip{$addr}) {
2576            if ($property{$addr}) {
2577                push $optional{$addr}->@*, $property{$addr};
2578            }
2579        } # Otherwise, are going to process the file.
2580        elsif ($property{$addr}) {
2581
2582            # If the file has a property defined in the constructor for it, it
2583            # means that the property is not listed in the file's entries.  So
2584            # add a handler (to the list of line handlers) to insert the
2585            # property name into the lines, to provide a uniform interface to
2586            # the final processing subroutine.
2587            push @{$each_line_handler{$addr}}, \&_insert_property_into_line;
2588        }
2589        elsif ($properties{$addr}) {
2590
2591            # Similarly, there may be more than one property represented on
2592            # each line, with no clue but the constructor input what those
2593            # might be.  Add a handler for each line in the input so that it
2594            # creates a separate input line for each property in those input
2595            # lines, thus making them suitable to handle generically.
2596
2597            push @{$each_line_handler{$addr}},
2598                 sub {
2599                    my $file = shift;
2600                    Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2601                    my @fields = split /\s*;\s*/, $_, -1;
2602
2603                    if (@fields - 1 > @{$properties{$addr}}) {
2604                        $file->carp_bad_line('Extra fields');
2605                        $_ = "";
2606                        return;
2607                    }
2608                    my $range = shift @fields;  # 0th element is always the
2609                                                # range
2610
2611                    # The next fields in the input line correspond
2612                    # respectively to the stored properties.
2613                    for my $i (0 ..  @{$properties{$addr}} - 1) {
2614                        my $property_name = $properties{$addr}[$i];
2615                        next if $property_name eq '<ignored>';
2616                        $file->insert_adjusted_lines(
2617                              "$range; $property_name; $fields[$i]");
2618                    }
2619                    $_ = "";
2620
2621                    return;
2622                };
2623        }
2624
2625        {   # On non-ascii platforms, we use a special pre-handler
2626            no strict;
2627            no warnings 'once';
2628            *next_line = (main::NON_ASCII_PLATFORM)
2629                         ? *_next_line_with_remapped_range
2630                         : *_next_line;
2631        }
2632
2633        &{$construction_time_handler{$addr}}($self)
2634                                        if $construction_time_handler{$addr};
2635
2636        return $self;
2637    }
2638
2639
2640    use overload
2641        fallback => 0,
2642        qw("") => "_operator_stringify",
2643        "." => \&main::_operator_dot,
2644        ".=" => \&main::_operator_dot_equal,
2645    ;
2646
2647    sub _operator_stringify($self) {
2648        return __PACKAGE__ . " object for " . $self->file;
2649    }
2650
2651    sub run($self) {
2652        # Process the input object $self.  This opens and closes the file and
2653        # calls all the handlers for it.  Currently,  this can only be called
2654        # once per file, as it destroy's the EOF handlers
2655
2656        # flag to make sure extracted files are processed early
2657        state $seen_non_extracted = 0;
2658
2659        my $addr = do { no overloading; pack 'J', $self; };
2660
2661        my $file = $file{$addr};
2662
2663        if (! $file) {
2664            $handle{$addr} = 'pretend_is_open';
2665        }
2666        else {
2667            if ($seen_non_extracted) {
2668                if ($file =~ /$EXTRACTED/i) # Some platforms may change the
2669                                            # case of the file's name
2670                {
2671                    Carp::my_carp_bug(main::join_lines(<<END
2672$file should be processed just after the 'Prop...Alias' files, and before
2673anything not in the $EXTRACTED_DIR directory.  Proceeding, but the results may
2674have subtle problems
2675END
2676                    ));
2677                }
2678            }
2679            elsif ($EXTRACTED_DIR
2680
2681                    # We only do this check for generic property files
2682                    && $handler{$addr} == \&main::process_generic_property_file
2683
2684                    && $file !~ /$EXTRACTED/i)
2685            {
2686                # We don't set this (by the 'if' above) if we have no
2687                # extracted directory, so if running on an early version,
2688                # this test won't work.  Not worth worrying about.
2689                $seen_non_extracted = 1;
2690            }
2691
2692            # Mark the file as having being processed, and warn if it
2693            # isn't a file we are expecting.  As we process the files,
2694            # they are deleted from the hash, so any that remain at the
2695            # end of the program are files that we didn't process.
2696            my $fkey = File::Spec->rel2abs($file);
2697            my $exists = delete $potential_files{lc($fkey)};
2698
2699            Carp::my_carp("Was not expecting '$file'.")
2700                                    if $exists && ! $in_this_release{$addr};
2701
2702            # If there is special handling for compiling Unicode releases
2703            # earlier than the first one in which Unicode defines this
2704            # property ...
2705            if ($early{$addr}->@* > 1) {
2706
2707                # Mark as processed any substitute file that would be used in
2708                # such a release
2709                $fkey = File::Spec->rel2abs($early{$addr}[1]);
2710                delete $potential_files{lc($fkey)};
2711
2712                # As commented in the constructor code, when using the
2713                # official property, we still have to allow the publicly
2714                # inaccessible early name so that the core code which uses it
2715                # will work regardless.
2716                if (   ! $only_early{$addr}
2717                    && ! $early{$addr}[0]
2718                    && $early{$addr}->@* > 2)
2719                {
2720                    my $early_property_name = $early{$addr}[2];
2721                    if ($property{$addr} ne $early_property_name) {
2722                        main::property_ref($property{$addr})
2723                                            ->add_alias($early_property_name);
2724                    }
2725                }
2726            }
2727
2728            # We may be skipping this file ...
2729            if (defined $skip{$addr}) {
2730
2731                # If the file isn't supposed to be in this release, there is
2732                # nothing to do
2733                if ($in_this_release{$addr}) {
2734
2735                    # But otherwise, we may print a message
2736                    if ($debug_skip) {
2737                        print STDERR "Skipping input file '$file'",
2738                                     " because '$skip{$addr}'\n";
2739                    }
2740
2741                    # And add it to the list of skipped files, which is later
2742                    # used to make the pod
2743                    $skipped_files{$file} = $skip{$addr};
2744
2745                    # The 'optional' list contains properties that are also to
2746                    # be skipped along with the file.  (There may also be
2747                    # digits which are just placeholders to make sure it isn't
2748                    # an empty list
2749                    foreach my $property ($optional{$addr}->@*) {
2750                        next unless $property =~ /\D/;
2751                        my $prop_object = main::property_ref($property);
2752                        next unless defined $prop_object;
2753                        $prop_object->set_fate($SUPPRESSED, $skip{$addr});
2754                    }
2755                }
2756
2757                return;
2758            }
2759
2760            # Here, we are going to process the file.  Open it, converting the
2761            # slashes used in this program into the proper form for the OS
2762            my $file_handle;
2763            if (not open $file_handle, "<", $file) {
2764                Carp::my_carp("Can't open $file.  Skipping: $!");
2765                return;
2766            }
2767            $handle{$addr} = $file_handle; # Cache the open file handle
2768
2769            # If possible, make sure that the file is the correct version.
2770            # (This data isn't available on early Unicode releases or in
2771            # UnicodeData.txt.)  We don't do this check if we are using a
2772            # substitute file instead of the official one (though the code
2773            # could be extended to do so).
2774            if ($in_this_release{$addr}
2775                && ! $early{$addr}[0]
2776                && lc($file) ne 'unicodedata.txt')
2777            {
2778                my $this_version;
2779
2780                if ($file !~ /^Unihan/i) {
2781
2782                    # The non-Unihan files started getting version numbers in
2783                    # 3.2, but some files in 4.0 are unchanged from 3.2, and
2784                    # marked as 3.2.  4.0.1 is the first version where there
2785                    # are no files marked as being from less than 4.0, though
2786                    # some are marked as 4.0.  In versions after that, the
2787                    # numbers are correct.
2788                    if ($v_version ge v4.0.1) {
2789                        $_ = <$file_handle>;    # The version number is in the
2790                                                # very first line if it is a
2791                                                # UCD file; otherwise, it
2792                                                # might be
2793                        goto valid_version if $_ =~ / - $string_version \. /x;
2794                        chomp;
2795                        if ($ucd{$addr}) {
2796                            $_ =~ s/^#\s*//;
2797
2798                            # 4.0.1 had some valid files that weren't updated.
2799                            goto valid_version
2800                                    if $v_version eq v4.0.1 && $_ =~ /4\.0\.0/;
2801                            $this_version = $_;
2802                            goto wrong_version;
2803                        }
2804                        else {
2805                            my $BOM = "\x{FEFF}";
2806                            utf8::encode($BOM);
2807                            my $BOM_re = qr/ ^ (?:$BOM)? /x;
2808
2809                            while ($_ =~ s/$BOM_re//) { # BOM; seems to be on
2810                                                        # many lines in some files!!
2811                                $_ = <$file_handle>;
2812                                chomp;
2813                                if ($_ =~ /^# Version: (.*)/) {
2814                                    $this_version = $1;
2815                                    goto valid_version
2816                                        if $this_version eq $string_version;
2817                                    goto valid_version
2818                                        if "$this_version.0" eq $string_version;
2819                                    goto wrong_version;
2820                                }
2821                            }
2822                            goto no_version;
2823                        }
2824                    }
2825                }
2826                elsif ($v_version ge v6.0.0) { # Unihan
2827
2828                    # Unihan files didn't get accurate version numbers until
2829                    # 6.0.  The version is somewhere in the first comment
2830                    # block
2831                    while (<$file_handle>) {
2832                        goto no_version if $_ !~ /^#/;
2833                        chomp;
2834                        $_ =~ s/^#\s*//;
2835                        next if $_ !~ / version: /x;
2836                        goto valid_version if $_ =~ /$string_version/;
2837                        goto wrong_version;
2838                    }
2839                    goto no_version;
2840                }
2841                else {  # Old Unihan; have to assume is valid
2842                    goto valid_version;
2843                }
2844
2845              wrong_version:
2846                die Carp::my_carp("File '$file' is version "
2847                                . "'$this_version'.  It should be "
2848                                . "version $string_version");
2849              no_version:
2850                Carp::my_carp_bug("Could not find the expected "
2851                                . "version info in file '$file'");
2852            }
2853        }
2854
2855      valid_version:
2856        print "$progress_message{$addr}\n" if $verbosity >= $PROGRESS;
2857
2858        # Call any special handler for before the file.
2859        &{$pre_handler{$addr}}($self) if $pre_handler{$addr};
2860
2861        # Then the main handler
2862        &{$handler{$addr}}($self);
2863
2864        # Then any special post-file handler.
2865        &{$post_handler{$addr}}($self) if $post_handler{$addr};
2866
2867        # If any errors have been accumulated, output the counts (as the first
2868        # error message in each class was output when it was encountered).
2869        if ($errors{$addr}) {
2870            my $total = 0;
2871            my $types = 0;
2872            foreach my $error (keys %{$errors{$addr}}) {
2873                $total += $errors{$addr}->{$error};
2874                delete $errors{$addr}->{$error};
2875                $types++;
2876            }
2877            if ($total > 1) {
2878                my $message
2879                        = "A total of $total lines had errors in $file.  ";
2880
2881                $message .= ($types == 1)
2882                            ? '(Only the first one was displayed.)'
2883                            : '(Only the first of each type was displayed.)';
2884                Carp::my_carp($message);
2885            }
2886        }
2887
2888        if (@{$missings{$addr}}) {
2889            Carp::my_carp_bug("Handler for $file didn't look at all the \@missing lines.  Generated tables likely are wrong");
2890        }
2891
2892        # If a real file handle, close it.
2893        close $handle{$addr} or Carp::my_carp("Can't close $file: $!") if
2894                                                        ref $handle{$addr};
2895        $handle{$addr} = "";   # Uses empty to indicate that has already seen
2896                               # the file, as opposed to undef
2897        return;
2898    }
2899
2900    sub _next_line($self) {
2901        # Sets $_ to be the next logical input line, if any.  Returns non-zero
2902        # if such a line exists.  'logical' means that any lines that have
2903        # been added via insert_lines() will be returned in $_ before the file
2904        # is read again.
2905
2906        my $addr = do { no overloading; pack 'J', $self; };
2907
2908        # Here the file is open (or if the handle is not a ref, is an open
2909        # 'virtual' file).  Get the next line; any inserted lines get priority
2910        # over the file itself.
2911        my $adjusted;
2912
2913        LINE:
2914        while (1) { # Loop until find non-comment, non-empty line
2915            #local $to_trace = 1 if main::DEBUG;
2916            my $inserted_ref = shift @{$added_lines{$addr}};
2917            if (defined $inserted_ref) {
2918                ($adjusted, $_) = @{$inserted_ref};
2919                trace $adjusted, $_ if main::DEBUG && $to_trace;
2920                return 1 if $adjusted;
2921            }
2922            else {
2923                last if ! ref $handle{$addr}; # Don't read unless is real file
2924                last if ! defined ($_ = readline $handle{$addr});
2925            }
2926            chomp;
2927            trace $_ if main::DEBUG && $to_trace;
2928
2929            # See if this line is the comment line that defines what property
2930            # value that code points that are not listed in the file should
2931            # have.  The format or existence of these lines is not guaranteed
2932            # by Unicode since they are comments, but the documentation says
2933            # that this was added for machine-readability, so probably won't
2934            # change.  This works starting in Unicode Version 5.0.  They look
2935            # like:
2936            #
2937            # @missing: 0000..10FFFF; Not_Reordered
2938            # @missing: 0000..10FFFF; Decomposition_Mapping; <code point>
2939            # @missing: 0000..10FFFF; ; NaN
2940            #
2941            # Save the line for a later get_missings() call.
2942            if (/$missing_defaults_prefix/) {
2943                if ($has_missings_defaults{$addr} == $NO_DEFAULTS) {
2944                    $self->carp_bad_line("Unexpected \@missing line.  Assuming no missing entries");
2945                }
2946                elsif ($has_missings_defaults{$addr} == $NOT_IGNORED) {
2947                    my @defaults = split /\s* ; \s*/x, $_;
2948
2949                    # The first field is the @missing, which ends in a
2950                    # semi-colon, so can safely shift.
2951                    shift @defaults;
2952
2953                    # Some of these lines may have empty field placeholders
2954                    # which get in the way.  An example is:
2955                    # @missing: 0000..10FFFF; ; NaN
2956                    # Remove them.  Process starting from the top so the
2957                    # splice doesn't affect things still to be looked at.
2958                    for (my $i = @defaults - 1; $i >= 0; $i--) {
2959                        next if $defaults[$i] ne "";
2960                        splice @defaults, $i, 1;
2961                    }
2962
2963                    # What's left should be just the property (maybe) and the
2964                    # default.  Having only one element means it doesn't have
2965                    # the property.
2966                    my $default;
2967                    my $property;
2968                    if (@defaults >= 1) {
2969                        if (@defaults == 1) {
2970                            $default = $defaults[0];
2971                        }
2972                        else {
2973                            $property = $defaults[0];
2974                            $default = $defaults[1];
2975                        }
2976                    }
2977
2978                    if (@defaults < 1
2979                        || @defaults > 2
2980                        || ($default =~ /^</
2981                            && $default !~ /^<code *point>$/i
2982                            && $default !~ /^<none>$/i
2983                            && $default !~ /^<script>$/i))
2984                    {
2985                        $self->carp_bad_line("Unrecognized \@missing line: $_.  Assuming no missing entries");
2986                    }
2987                    else {
2988
2989                        # If the property is missing from the line, it should
2990                        # be the one for the whole file
2991                        $property = $property{$addr} if ! defined $property;
2992
2993                        # Change <none> to the null string, which is what it
2994                        # really means.  If the default is the code point
2995                        # itself, set it to <code point>, which is what
2996                        # Unicode uses (but sometimes they've forgotten the
2997                        # space)
2998                        if ($default =~ /^<none>$/i) {
2999                            $default = "";
3000                        }
3001                        elsif ($default =~ /^<code *point>$/i) {
3002                            $default = $CODE_POINT;
3003                        }
3004                        elsif ($default =~ /^<script>$/i) {
3005
3006                            # Special case this one.  Currently is from
3007                            # ScriptExtensions.txt, and means for all unlisted
3008                            # code points, use their Script property values.
3009                            # For the code points not listed in that file, the
3010                            # default value is 'Unknown'.
3011                            $default = "Unknown";
3012                        }
3013
3014                        # Store them as a sub-arrays with both components.
3015                        push @{$missings{$addr}}, [ $default, $property ];
3016                    }
3017                }
3018
3019                # There is nothing for the caller to process on this comment
3020                # line.
3021                next;
3022            }
3023
3024            # Unless to keep, remove comments.  If to keep, ignore
3025            # comment-only lines
3026            if ($retain_trailing_comments{$addr}) {
3027                next if / ^ \s* \# /x;
3028
3029                # But escape any single quotes (done in both the comment and
3030                # non-comment portion; this could be a bug someday, but not
3031                # likely)
3032                s/'/\\'/g;
3033            }
3034            else {
3035                s/#.*//;
3036            }
3037
3038            # Remove trailing space, and skip this line if the result is empty
3039            s/\s+$//;
3040            next if /^$/;
3041
3042            # Call any handlers for this line, and skip further processing of
3043            # the line if the handler sets the line to null.
3044            foreach my $sub_ref (@{$each_line_handler{$addr}}) {
3045                &{$sub_ref}($self);
3046                next LINE if /^$/;
3047            }
3048
3049            # Here the line is ok.  return success.
3050            return 1;
3051        } # End of looping through lines.
3052
3053        # If there are EOF handlers, call each (only once) and if it generates
3054        # more lines to process go back in the loop to handle them.
3055        while ($eof_handler{$addr}->@*) {
3056            &{$eof_handler{$addr}[0]}($self);
3057            shift $eof_handler{$addr}->@*;   # Currently only get one shot at it.
3058            goto LINE if $added_lines{$addr};
3059        }
3060
3061        # Return failure -- no more lines.
3062        return 0;
3063
3064    }
3065
3066    sub _next_line_with_remapped_range($self) {
3067        # like _next_line(), but for use on non-ASCII platforms.  It sets $_
3068        # to be the next logical input line, if any.  Returns non-zero if such
3069        # a line exists.  'logical' means that any lines that have been added
3070        # via insert_lines() will be returned in $_ before the file is read
3071        # again.
3072        #
3073        # The difference from _next_line() is that this remaps the Unicode
3074        # code points in the input to those of the native platform.  Each
3075        # input line contains a single code point, or a single contiguous
3076        # range of them  This routine splits each range into its individual
3077        # code points and caches them.  It returns the cached values,
3078        # translated into their native equivalents, one at a time, for each
3079        # call, before reading the next line.  Since native values can only be
3080        # a single byte wide, no translation is needed for code points above
3081        # 0xFF, and ranges that are entirely above that number are not split.
3082        # If an input line contains the range 254-1000, it would be split into
3083        # three elements: 254, 255, and 256-1000.  (The downstream table
3084        # insertion code will sort and coalesce the individual code points
3085        # into appropriate ranges.)
3086
3087        my $addr = do { no overloading; pack 'J', $self; };
3088
3089        while (1) {
3090
3091            # Look in cache before reading the next line.  Return any cached
3092            # value, translated
3093            my $inserted = shift @{$remapped_lines{$addr}};
3094            if (defined $inserted) {
3095                trace $inserted if main::DEBUG && $to_trace;
3096                $_ = $inserted =~ s/^ ( \d+ ) /sprintf("%04X", utf8::unicode_to_native($1))/xer;
3097                trace $_ if main::DEBUG && $to_trace;
3098                return 1;
3099            }
3100
3101            # Get the next line.
3102            return 0 unless _next_line($self);
3103
3104            # If there is a special handler for it, return the line,
3105            # untranslated.  This should happen only for files that are
3106            # special, not being code-point related, such as property names.
3107            return 1 if $handler{$addr}
3108                                    != \&main::process_generic_property_file;
3109
3110            my ($range, $property_name, $map, @remainder)
3111                = split /\s*;\s*/, $_, -1; # -1 => retain trailing null fields
3112
3113            if (@remainder
3114                || ! defined $property_name
3115                || $range !~ /^ ($code_point_re) (?:\.\. ($code_point_re) )? $/x)
3116            {
3117                Carp::my_carp_bug("Unrecognized input line '$_'.  Ignored");
3118            }
3119
3120            my $low = hex $1;
3121            my $high = (defined $2) ? hex $2 : $low;
3122
3123            # If the input maps the range to another code point, remap the
3124            # target if it is between 0 and 255.
3125            my $tail;
3126            if (defined $map) {
3127                $map =~ s/\b 00 ( [0-9A-F]{2} ) \b/sprintf("%04X", utf8::unicode_to_native(hex $1))/gxe;
3128                $tail = "$property_name; $map";
3129                $_ = "$range; $tail";
3130            }
3131            else {
3132                $tail = $property_name;
3133            }
3134
3135            # If entire range is above 255, just return it, unchanged (except
3136            # any mapped-to code point, already changed above)
3137            return 1 if $low > 255;
3138
3139            # Cache an entry for every code point < 255.  For those in the
3140            # range above 255, return a dummy entry for just that portion of
3141            # the range.  Note that this will be out-of-order, but that is not
3142            # a problem.
3143            foreach my $code_point ($low .. $high) {
3144                if ($code_point > 255) {
3145                    $_ = sprintf "%04X..%04X; $tail", $code_point, $high;
3146                    return 1;
3147                }
3148                push @{$remapped_lines{$addr}}, "$code_point; $tail";
3149            }
3150        } # End of looping through lines.
3151
3152        # NOTREACHED
3153    }
3154
3155#   Not currently used, not fully tested.
3156#    sub peek {
3157#        # Non-destructive lookahead one non-adjusted, non-comment, non-blank
3158#        # record.  Not callable from an each_line_handler(), nor does it call
3159#        # an each_line_handler() on the line.
3160#
3161#        my $self = shift;
3162#        my $addr = do { no overloading; pack 'J', $self; };
3163#
3164#        foreach my $inserted_ref (@{$added_lines{$addr}}) {
3165#            my ($adjusted, $line) = @{$inserted_ref};
3166#            next if $adjusted;
3167#
3168#            # Remove comments and trailing space, and return a non-empty
3169#            # resulting line
3170#            $line =~ s/#.*//;
3171#            $line =~ s/\s+$//;
3172#            return $line if $line ne "";
3173#        }
3174#
3175#        return if ! ref $handle{$addr}; # Don't read unless is real file
3176#        while (1) { # Loop until find non-comment, non-empty line
3177#            local $to_trace = 1 if main::DEBUG;
3178#            trace $_ if main::DEBUG && $to_trace;
3179#            return if ! defined (my $line = readline $handle{$addr});
3180#            chomp $line;
3181#            push @{$added_lines{$addr}}, [ 0, $line ];
3182#
3183#            $line =~ s/#.*//;
3184#            $line =~ s/\s+$//;
3185#            return $line if $line ne "";
3186#        }
3187#
3188#        return;
3189#    }
3190
3191
3192    sub insert_lines($self, @lines) {
3193        # Lines can be inserted so that it looks like they were in the input
3194        # file at the place it was when this routine is called.  See also
3195        # insert_adjusted_lines().  Lines inserted via this routine go through
3196        # any each_line_handler()
3197
3198        # Each inserted line is an array, with the first element being 0 to
3199        # indicate that this line hasn't been adjusted, and needs to be
3200        # processed.
3201        no overloading;
3202        push @{$added_lines{pack 'J', $self}}, map { [ 0, $_ ] } @lines;
3203        return;
3204    }
3205
3206    sub insert_adjusted_lines($self, @lines) {
3207        # Lines can be inserted so that it looks like they were in the input
3208        # file at the place it was when this routine is called.  See also
3209        # insert_lines().  Lines inserted via this routine are already fully
3210        # adjusted, ready to be processed; each_line_handler()s handlers will
3211        # not be called.  This means this is not a completely general
3212        # facility, as only the last each_line_handler on the stack should
3213        # call this.  It could be made more general, by passing to each of the
3214        # line_handlers their position on the stack, which they would pass on
3215        # to this routine, and that would replace the boolean first element in
3216        # the anonymous array pushed here, so that the next_line routine could
3217        # use that to call only those handlers whose index is after it on the
3218        # stack.  But this is overkill for what is needed now.
3219
3220        trace $_[0] if main::DEBUG && $to_trace;
3221
3222        # Each inserted line is an array, with the first element being 1 to
3223        # indicate that this line has been adjusted
3224        no overloading;
3225        push @{$added_lines{pack 'J', $self}}, map { [ 1, $_ ] } @lines;
3226        return;
3227    }
3228
3229    sub get_missings($self) {
3230        # Returns the stored up @missings lines' values, and clears the list.
3231        # The values are in an array, consisting of the default in the first
3232        # element, and the property in the 2nd.  However, since these lines
3233        # can be stacked up, the return is an array of all these arrays.
3234
3235        my $addr = do { no overloading; pack 'J', $self; };
3236
3237        # If not accepting a list return, just return the first one.
3238        return shift @{$missings{$addr}} unless wantarray;
3239
3240        my @return = @{$missings{$addr}};
3241        undef @{$missings{$addr}};
3242        return @return;
3243    }
3244
3245    sub _exclude_unassigned($self) {
3246
3247        # Takes the range in $_ and excludes code points that aren't assigned
3248        # in this release
3249
3250        state $skip_inserted_count = 0;
3251
3252        # Ignore recursive calls.
3253        if ($skip_inserted_count) {
3254            $skip_inserted_count--;
3255            return;
3256        }
3257
3258        # Find what code points are assigned in this release
3259        main::calculate_Assigned() if ! defined $Assigned;
3260
3261        my $addr = do { no overloading; pack 'J', $self; };
3262
3263        my ($range, @remainder)
3264            = split /\s*;\s*/, $_, -1; # -1 => retain trailing null fields
3265
3266        # Examine the range.
3267        if ($range =~ /^ ($code_point_re) (?:\.\. ($code_point_re) )? $/x)
3268        {
3269            my $low = hex $1;
3270            my $high = (defined $2) ? hex $2 : $low;
3271
3272            # Split the range into subranges of just those code points in it
3273            # that are assigned.
3274            my @ranges = (Range_List->new(Initialize
3275                              => Range->new($low, $high)) & $Assigned)->ranges;
3276
3277            # Do nothing if nothing in the original range is assigned in this
3278            # release; handle normally if everything is in this release.
3279            if (! @ranges) {
3280                $_ = "";
3281            }
3282            elsif (@ranges != 1) {
3283
3284                # Here, some code points in the original range aren't in this
3285                # release; @ranges gives the ones that are.  Create fake input
3286                # lines for each of the ranges, and set things up so that when
3287                # this routine is called on that fake input, it will do
3288                # nothing.
3289                $skip_inserted_count = @ranges;
3290                my $remainder = join ";", @remainder;
3291                for my $range (@ranges) {
3292                    $self->insert_lines(sprintf("%04X..%04X;%s",
3293                                    $range->start, $range->end, $remainder));
3294                }
3295                $_ = "";    # The original range is now defunct.
3296            }
3297        }
3298
3299        return;
3300    }
3301
3302    sub _fixup_obsolete_hanguls($self) {
3303
3304        # This is called only when compiling Unicode version 1.  All Unicode
3305        # data for subsequent releases assumes that the code points that were
3306        # Hangul syllables in this release only are something else, so if
3307        # using such data, we have to override it
3308
3309        my $addr = do { no overloading; pack 'J', $self; };
3310
3311        my $object = main::property_ref($property{$addr});
3312        $object->add_map($FIRST_REMOVED_HANGUL_SYLLABLE,
3313                         $FINAL_REMOVED_HANGUL_SYLLABLE,
3314                         $early{$addr}[3],  # Passed-in value for these
3315                         Replace => $UNCONDITIONALLY);
3316    }
3317
3318    sub _insert_property_into_line($self) {
3319        # Add a property field to $_, if this file requires it.
3320
3321        my $addr = do { no overloading; pack 'J', $self; };
3322        my $property = $property{$addr};
3323
3324        $_ =~ s/(;|$)/; $property$1/;
3325        return;
3326    }
3327
3328    sub carp_bad_line($self, $message="") {
3329        # Output consistent error messages, using either a generic one, or the
3330        # one given by the optional parameter.  To avoid gazillions of the
3331        # same message in case the syntax of a  file is way off, this routine
3332        # only outputs the first instance of each message, incrementing a
3333        # count so the totals can be output at the end of the file.
3334
3335        my $addr = do { no overloading; pack 'J', $self; };
3336
3337        $message = 'Unexpected line' unless $message;
3338
3339        # No trailing punctuation so as to fit with our addenda.
3340        $message =~ s/[.:;,]$//;
3341
3342        # If haven't seen this exact message before, output it now.  Otherwise
3343        # increment the count of how many times it has occurred
3344        unless ($errors{$addr}->{$message}) {
3345            Carp::my_carp("$message in '$_' in "
3346                            . $file{$addr}
3347                            . " at line $..  Skipping this line;");
3348            $errors{$addr}->{$message} = 1;
3349        }
3350        else {
3351            $errors{$addr}->{$message}++;
3352        }
3353
3354        # Clear the line to prevent any further (meaningful) processing of it.
3355        $_ = "";
3356
3357        return;
3358    }
3359} # End closure
3360
3361package Multi_Default;
3362
3363# Certain properties in early versions of Unicode had more than one possible
3364# default for code points missing from the files.  In these cases, one
3365# default applies to everything left over after all the others are applied,
3366# and for each of the others, there is a description of which class of code
3367# points applies to it.  This object helps implement this by storing the
3368# defaults, and for all but that final default, an eval string that generates
3369# the class that it applies to.
3370
3371use strict;
3372use warnings;
3373
3374use feature 'signatures';
3375no warnings 'experimental::signatures';
3376
3377{   # Closure
3378
3379    main::setup_package();
3380
3381    my %class_defaults;
3382    # The defaults structure for the classes
3383    main::set_access('class_defaults', \%class_defaults);
3384
3385    my %other_default;
3386    # The default that applies to everything left over.
3387    main::set_access('other_default', \%other_default, 'r');
3388
3389
3390    sub new {
3391        # The constructor is called with default => eval pairs, terminated by
3392        # the left-over default. e.g.
3393        # Multi_Default->new(
3394        #        'T' => '$gc->table("Mn") + $gc->table("Cf") - 0x200C
3395        #               -  0x200D',
3396        #        'R' => 'some other expression that evaluates to code points',
3397        #        .
3398        #        .
3399        #        .
3400        #        'U'));
3401        # It is best to leave the final value be the one that matches the
3402        # above-Unicode code points.
3403
3404        my $class = shift;
3405
3406        my $self = bless \do{my $anonymous_scalar}, $class;
3407        my $addr = do { no overloading; pack 'J', $self; };
3408
3409        while (@_ > 1) {
3410            my $default = shift;
3411            my $eval = shift;
3412            $class_defaults{$addr}->{$default} = $eval;
3413        }
3414
3415        $other_default{$addr} = shift;
3416
3417        return $self;
3418    }
3419
3420    sub get_next_defaults($self) {
3421        # Iterates and returns the next class of defaults.
3422
3423        my $addr = do { no overloading; pack 'J', $self; };
3424
3425        return each %{$class_defaults{$addr}};
3426    }
3427}
3428
3429package Alias;
3430
3431# An alias is one of the names that a table goes by.  This class defines them
3432# including some attributes.  Everything is currently setup in the
3433# constructor.
3434
3435use strict;
3436use warnings;
3437
3438use feature 'signatures';
3439no warnings 'experimental::signatures';
3440
3441
3442{   # Closure
3443
3444    main::setup_package();
3445
3446    my %name;
3447    main::set_access('name', \%name, 'r');
3448
3449    my %loose_match;
3450    # Should this name match loosely or not.
3451    main::set_access('loose_match', \%loose_match, 'r');
3452
3453    my %make_re_pod_entry;
3454    # Some aliases should not get their own entries in the re section of the
3455    # pod, because they are covered by a wild-card, and some we want to
3456    # discourage use of.  Binary
3457    main::set_access('make_re_pod_entry', \%make_re_pod_entry, 'r', 's');
3458
3459    my %ucd;
3460    # Is this documented to be accessible via Unicode::UCD
3461    main::set_access('ucd', \%ucd, 'r', 's');
3462
3463    my %status;
3464    # Aliases have a status, like deprecated, or even suppressed (which means
3465    # they don't appear in documentation).  Enum
3466    main::set_access('status', \%status, 'r');
3467
3468    my %ok_as_filename;
3469    # Similarly, some aliases should not be considered as usable ones for
3470    # external use, such as file names, or we don't want documentation to
3471    # recommend them.  Boolean
3472    main::set_access('ok_as_filename', \%ok_as_filename, 'r');
3473
3474    sub new {
3475        my $class = shift;
3476
3477        my $self = bless \do { my $anonymous_scalar }, $class;
3478        my $addr = do { no overloading; pack 'J', $self; };
3479
3480        $name{$addr} = shift;
3481        $loose_match{$addr} = shift;
3482        $make_re_pod_entry{$addr} = shift;
3483        $ok_as_filename{$addr} = shift;
3484        $status{$addr} = shift;
3485        $ucd{$addr} = shift;
3486
3487        Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3488
3489        # Null names are never ok externally
3490        $ok_as_filename{$addr} = 0 if $name{$addr} eq "";
3491
3492        return $self;
3493    }
3494}
3495
3496package Range;
3497
3498# A range is the basic unit for storing code points, and is described in the
3499# comments at the beginning of the program.  Each range has a starting code
3500# point; an ending code point (not less than the starting one); a value
3501# that applies to every code point in between the two end-points, inclusive;
3502# and an enum type that applies to the value.  The type is for the user's
3503# convenience, and has no meaning here, except that a non-zero type is
3504# considered to not obey the normal Unicode rules for having standard forms.
3505#
3506# The same structure is used for both map and match tables, even though in the
3507# latter, the value (and hence type) is irrelevant and could be used as a
3508# comment.  In map tables, the value is what all the code points in the range
3509# map to.  Type 0 values have the standardized version of the value stored as
3510# well, so as to not have to recalculate it a lot.
3511
3512use strict;
3513use warnings;
3514
3515use feature 'signatures';
3516no warnings 'experimental::signatures';
3517
3518sub trace { return main::trace(@_); }
3519
3520{   # Closure
3521
3522    main::setup_package();
3523
3524    my %start;
3525    main::set_access('start', \%start, 'r', 's');
3526
3527    my %end;
3528    main::set_access('end', \%end, 'r', 's');
3529
3530    my %value;
3531    main::set_access('value', \%value, 'r', 's');
3532
3533    my %type;
3534    main::set_access('type', \%type, 'r');
3535
3536    my %standard_form;
3537    # The value in internal standard form.  Defined only if the type is 0.
3538    main::set_access('standard_form', \%standard_form);
3539
3540    # Note that if these fields change, the dump() method should as well
3541
3542    sub new($class, $_addr, $_end, @_args) {
3543        my $self = bless \do { my $anonymous_scalar }, $class;
3544        my $addr = do { no overloading; pack 'J', $self; };
3545
3546        $start{$addr} = $_addr;
3547        $end{$addr}   = $_end;
3548
3549        my %args = @_args;
3550
3551        my $value = delete $args{'Value'};  # Can be 0
3552        $value = "" unless defined $value;
3553        $value{$addr} = $value;
3554
3555        $type{$addr} = delete $args{'Type'} || 0;
3556
3557        Carp::carp_extra_args(\%args) if main::DEBUG && %args;
3558
3559        return $self;
3560    }
3561
3562    use overload
3563        fallback => 0,
3564        qw("") => "_operator_stringify",
3565        "." => \&main::_operator_dot,
3566        ".=" => \&main::_operator_dot_equal,
3567    ;
3568
3569    sub _operator_stringify($self) {
3570        my $addr = do { no overloading; pack 'J', $self; };
3571
3572        # Output it like '0041..0065 (value)'
3573        my $return = sprintf("%04X", $start{$addr})
3574                        .  '..'
3575                        . sprintf("%04X", $end{$addr});
3576        my $value = $value{$addr};
3577        my $type = $type{$addr};
3578        $return .= ' (';
3579        $return .= "$value";
3580        $return .= ", Type=$type" if $type != 0;
3581        $return .= ')';
3582
3583        return $return;
3584    }
3585
3586    sub standard_form($self) {
3587        # Calculate the standard form only if needed, and cache the result.
3588        # The standard form is the value itself if the type is special.
3589        # This represents a considerable CPU and memory saving - at the time
3590        # of writing there are 368676 non-special objects, but the standard
3591        # form is only requested for 22047 of them - ie about 6%.
3592
3593        my $addr = do { no overloading; pack 'J', $self; };
3594
3595        return $standard_form{$addr} if defined $standard_form{$addr};
3596
3597        my $value = $value{$addr};
3598        return $value if $type{$addr};
3599        return $standard_form{$addr} = main::standardize($value);
3600    }
3601
3602    sub dump($self, $indent) {
3603        # Human, not machine readable.  For machine readable, comment out this
3604        # entire routine and let the standard one take effect.
3605        my $addr = do { no overloading; pack 'J', $self; };
3606
3607        my $return = $indent
3608                    . sprintf("%04X", $start{$addr})
3609                    . '..'
3610                    . sprintf("%04X", $end{$addr})
3611                    . " '$value{$addr}';";
3612        if (! defined $standard_form{$addr}) {
3613            $return .= "(type=$type{$addr})";
3614        }
3615        elsif ($standard_form{$addr} ne $value{$addr}) {
3616            $return .= "(standard '$standard_form{$addr}')";
3617        }
3618        return $return;
3619    }
3620} # End closure
3621
3622package _Range_List_Base;
3623
3624use strict;
3625use warnings;
3626
3627use feature 'signatures';
3628no warnings 'experimental::signatures';
3629
3630# Base class for range lists.  A range list is simply an ordered list of
3631# ranges, so that the ranges with the lowest starting numbers are first in it.
3632#
3633# When a new range is added that is adjacent to an existing range that has the
3634# same value and type, it merges with it to form a larger range.
3635#
3636# Ranges generally do not overlap, except that there can be multiple entries
3637# of single code point ranges.  This is because of NameAliases.txt.
3638#
3639# In this program, there is a standard value such that if two different
3640# values, have the same standard value, they are considered equivalent.  This
3641# value was chosen so that it gives correct results on Unicode data
3642
3643# There are a number of methods to manipulate range lists, and some operators
3644# are overloaded to handle them.
3645
3646sub trace { return main::trace(@_); }
3647
3648{ # Closure
3649
3650    our $addr;
3651
3652    # Max is initialized to a negative value that isn't adjacent to 0, for
3653    # simpler tests
3654    my $max_init = -2;
3655
3656    main::setup_package();
3657
3658    my %ranges;
3659    # The list of ranges
3660    main::set_access('ranges', \%ranges, 'readable_array');
3661
3662    my %max;
3663    # The highest code point in the list.  This was originally a method, but
3664    # actual measurements said it was used a lot.
3665    main::set_access('max', \%max, 'r');
3666
3667    my %each_range_iterator;
3668    # Iterator position for each_range()
3669    main::set_access('each_range_iterator', \%each_range_iterator);
3670
3671    my %owner_name_of;
3672    # Name of parent this is attached to, if any.  Solely for better error
3673    # messages.
3674    main::set_access('owner_name_of', \%owner_name_of, 'p_r');
3675
3676    my %_search_ranges_cache;
3677    # A cache of the previous result from _search_ranges(), for better
3678    # performance
3679    main::set_access('_search_ranges_cache', \%_search_ranges_cache);
3680
3681    sub new {
3682        my $class = shift;
3683        my %args = @_;
3684
3685        # Optional initialization data for the range list.
3686        my $initialize = delete $args{'Initialize'};
3687
3688        my $self;
3689
3690        # Use _union() to initialize.  _union() returns an object of this
3691        # class, which means that it will call this constructor recursively.
3692        # But it won't have this $initialize parameter so that it won't
3693        # infinitely loop on this.
3694        return _union($class, $initialize, %args) if defined $initialize;
3695
3696        $self = bless \do { my $anonymous_scalar }, $class;
3697        my $addr = do { no overloading; pack 'J', $self; };
3698
3699        # Optional parent object, only for debug info.
3700        $owner_name_of{$addr} = delete $args{'Owner'};
3701        $owner_name_of{$addr} = "" if ! defined $owner_name_of{$addr};
3702
3703        # Stringify, in case it is an object.
3704        $owner_name_of{$addr} = "$owner_name_of{$addr}";
3705
3706        # This is used only for error messages, and so a colon is added
3707        $owner_name_of{$addr} .= ": " if $owner_name_of{$addr} ne "";
3708
3709        Carp::carp_extra_args(\%args) if main::DEBUG && %args;
3710
3711        $max{$addr} = $max_init;
3712
3713        $_search_ranges_cache{$addr} = 0;
3714        $ranges{$addr} = [];
3715
3716        return $self;
3717    }
3718
3719    use overload
3720        fallback => 0,
3721        qw("") => "_operator_stringify",
3722        "." => \&main::_operator_dot,
3723        ".=" => \&main::_operator_dot_equal,
3724    ;
3725
3726    sub _operator_stringify($self) {
3727        my $addr = do { no overloading; pack 'J', $self; };
3728
3729        return "Range_List attached to '$owner_name_of{$addr}'"
3730                                                if $owner_name_of{$addr};
3731        return "anonymous Range_List " . \$self;
3732    }
3733
3734    sub _union {
3735        # Returns the union of the input code points.  It can be called as
3736        # either a constructor or a method.  If called as a method, the result
3737        # will be a new() instance of the calling object, containing the union
3738        # of that object with the other parameter's code points;  if called as
3739        # a constructor, the first parameter gives the class that the new object
3740        # should be, and the second parameter gives the code points to go into
3741        # it.
3742        # In either case, there are two parameters looked at by this routine;
3743        # any additional parameters are passed to the new() constructor.
3744        #
3745        # The code points can come in the form of some object that contains
3746        # ranges, and has a conventionally named method to access them; or
3747        # they can be an array of individual code points (as integers); or
3748        # just a single code point.
3749        #
3750        # If they are ranges, this routine doesn't make any effort to preserve
3751        # the range values and types of one input over the other.  Therefore
3752        # this base class should not allow _union to be called from other than
3753        # initialization code, so as to prevent two tables from being added
3754        # together where the range values matter.  The general form of this
3755        # routine therefore belongs in a derived class, but it was moved here
3756        # to avoid duplication of code.  The failure to overload this in this
3757        # class keeps it safe.
3758        #
3759        # It does make the effort during initialization to accept tables with
3760        # multiple values for the same code point, and to preserve the order
3761        # of these.  If there is only one input range or range set, it doesn't
3762        # sort (as it should already be sorted to the desired order), and will
3763        # accept multiple values per code point.  Otherwise it will merge
3764        # multiple values into a single one.
3765
3766        my $self;
3767        my @args;   # Arguments to pass to the constructor
3768
3769        my $class = shift;
3770
3771        # If a method call, will start the union with the object itself, and
3772        # the class of the new object will be the same as self.
3773        if (ref $class) {
3774            $self = $class;
3775            $class = ref $self;
3776            push @args, $self;
3777        }
3778
3779        # Add the other required parameter.
3780        push @args, shift;
3781        # Rest of parameters are passed on to the constructor
3782
3783        # Accumulate all records from both lists.
3784        my @records;
3785        my $input_count = 0;
3786        for my $arg (@args) {
3787            #local $to_trace = 0 if main::DEBUG;
3788            trace "argument = $arg" if main::DEBUG && $to_trace;
3789            if (! defined $arg) {
3790                my $message = "";
3791                if (defined $self) {
3792                    no overloading;
3793                    $message .= $owner_name_of{pack 'J', $self};
3794                }
3795                Carp::my_carp_bug($message . "Undefined argument to _union.  No union done.");
3796                return;
3797            }
3798
3799            $arg = [ $arg ] if ! ref $arg;
3800            my $type = ref $arg;
3801            if ($type eq 'ARRAY') {
3802                foreach my $element (@$arg) {
3803                    push @records, Range->new($element, $element);
3804                    $input_count++;
3805                }
3806            }
3807            elsif ($arg->isa('Range')) {
3808                push @records, $arg;
3809                $input_count++;
3810            }
3811            elsif ($arg->can('ranges')) {
3812                push @records, $arg->ranges;
3813                $input_count++;
3814            }
3815            else {
3816                my $message = "";
3817                if (defined $self) {
3818                    no overloading;
3819                    $message .= $owner_name_of{pack 'J', $self};
3820                }
3821                Carp::my_carp_bug($message . "Cannot take the union of a $type.  No union done.");
3822                return;
3823            }
3824        }
3825
3826        # Sort with the range containing the lowest ordinal first, but if
3827        # two ranges start at the same code point, sort with the bigger range
3828        # of the two first, because it takes fewer cycles.
3829        if ($input_count > 1) {
3830            @records = sort { ($a->start <=> $b->start)
3831                                      or
3832                                    # if b is shorter than a, b->end will be
3833                                    # less than a->end, and we want to select
3834                                    # a, so want to return -1
3835                                    ($b->end <=> $a->end)
3836                                   } @records;
3837        }
3838
3839        my $new = $class->new(@_);
3840
3841        # Fold in records so long as they add new information.
3842        for my $set (@records) {
3843            my $start = $set->start;
3844            my $end   = $set->end;
3845            my $value = $set->value;
3846            my $type  = $set->type;
3847            if ($start > $new->max) {
3848                $new->_add_delete('+', $start, $end, $value, Type => $type);
3849            }
3850            elsif ($end > $new->max) {
3851                $new->_add_delete('+', $new->max +1, $end, $value,
3852                                                                Type => $type);
3853            }
3854            elsif ($input_count == 1) {
3855                # Here, overlaps existing range, but is from a single input,
3856                # so preserve the multiple values from that input.
3857                $new->_add_delete('+', $start, $end, $value, Type => $type,
3858                                                Replace => $MULTIPLE_AFTER);
3859            }
3860        }
3861
3862        return $new;
3863    }
3864
3865    sub range_count($self) {        # Return the number of ranges in the range list
3866        no overloading;
3867        return scalar @{$ranges{pack 'J', $self}};
3868    }
3869
3870    sub min($self) {
3871        # Returns the minimum code point currently in the range list, or if
3872        # the range list is empty, 2 beyond the max possible.  This is a
3873        # method because used so rarely, that not worth saving between calls,
3874        # and having to worry about changing it as ranges are added and
3875        # deleted.
3876
3877        my $addr = do { no overloading; pack 'J', $self; };
3878
3879        # If the range list is empty, return a large value that isn't adjacent
3880        # to any that could be in the range list, for simpler tests
3881        return $MAX_WORKING_CODEPOINT + 2 unless scalar @{$ranges{$addr}};
3882        return $ranges{$addr}->[0]->start;
3883    }
3884
3885    sub contains($self, $codepoint) {
3886        # Boolean: Is argument in the range list?  If so returns $i such that:
3887        #   range[$i]->end < $codepoint <= range[$i+1]->end
3888        # which is one beyond what you want; this is so that the 0th range
3889        # doesn't return false
3890
3891        my $i = $self->_search_ranges($codepoint);
3892        return 0 unless defined $i;
3893
3894        # The search returns $i, such that
3895        #   range[$i-1]->end < $codepoint <= range[$i]->end
3896        # So is in the table if and only iff it is at least the start position
3897        # of range $i.
3898        no overloading;
3899        return 0 if $ranges{pack 'J', $self}->[$i]->start > $codepoint;
3900        return $i + 1;
3901    }
3902
3903    sub containing_range($self, $codepoint) {
3904        # Returns the range object that contains the code point, undef if none
3905        my $i = $self->contains($codepoint);
3906        return unless $i;
3907
3908        # contains() returns 1 beyond where we should look
3909        no overloading;
3910        return $ranges{pack 'J', $self}->[$i-1];
3911    }
3912
3913    sub value_of($self, $codepoint) {
3914        # Returns the value associated with the code point, undef if none
3915        my $range = $self->containing_range($codepoint);
3916        return unless defined $range;
3917
3918        return $range->value;
3919    }
3920
3921    sub type_of($self, $codepoint) {
3922        # Returns the type of the range containing the code point, undef if
3923        # the code point is not in the table
3924        my $range = $self->containing_range($codepoint);
3925        return unless defined $range;
3926
3927        return $range->type;
3928    }
3929
3930    sub _search_ranges($self, $code_point) {
3931        # Find the range in the list which contains a code point, or where it
3932        # should go if were to add it.  That is, it returns $i, such that:
3933        #   range[$i-1]->end < $codepoint <= range[$i]->end
3934        # Returns undef if no such $i is possible (e.g. at end of table), or
3935        # if there is an error.
3936        my $addr = do { no overloading; pack 'J', $self; };
3937
3938        return if $code_point > $max{$addr};
3939        my $r = $ranges{$addr};                # The current list of ranges
3940        my $range_list_size = scalar @$r;
3941        my $i;
3942
3943        use integer;        # want integer division
3944
3945        # Use the cached result as the starting guess for this one, because,
3946        # an experiment on 5.1 showed that 90% of the time the cache was the
3947        # same as the result on the next call (and 7% it was one less).
3948        $i = $_search_ranges_cache{$addr};
3949        $i = 0 if $i >= $range_list_size;   # Reset if no longer valid (prob.
3950                                            # from an intervening deletion
3951        #local $to_trace = 1 if main::DEBUG;
3952        trace "previous \$i is still valid: $i" if main::DEBUG && $to_trace && $code_point <= $r->[$i]->end && ($i == 0 || $r->[$i-1]->end < $code_point);
3953        return $i if $code_point <= $r->[$i]->end
3954                     && ($i == 0 || $r->[$i-1]->end < $code_point);
3955
3956        # Here the cache doesn't yield the correct $i.  Try adding 1.
3957        if ($i < $range_list_size - 1
3958            && $r->[$i]->end < $code_point &&
3959            $code_point <= $r->[$i+1]->end)
3960        {
3961            $i++;
3962            trace "next \$i is correct: $i" if main::DEBUG && $to_trace;
3963            $_search_ranges_cache{$addr} = $i;
3964            return $i;
3965        }
3966
3967        # Here, adding 1 also didn't work.  We do a binary search to
3968        # find the correct position, starting with current $i
3969        my $lower = 0;
3970        my $upper = $range_list_size - 1;
3971        while (1) {
3972            trace "top of loop i=$i:", sprintf("%04X", $r->[$lower]->start), "[$lower] .. ", sprintf("%04X", $r->[$i]->start), "[$i] .. ", sprintf("%04X", $r->[$upper]->start), "[$upper]" if main::DEBUG && $to_trace;
3973
3974            if ($code_point <= $r->[$i]->end) {
3975
3976                # Here we have met the upper constraint.  We can quit if we
3977                # also meet the lower one.
3978                last if $i == 0 || $r->[$i-1]->end < $code_point;
3979
3980                $upper = $i;        # Still too high.
3981
3982            }
3983            else {
3984
3985                # Here, $r[$i]->end < $code_point, so look higher up.
3986                $lower = $i;
3987            }
3988
3989            # Split search domain in half to try again.
3990            my $temp = ($upper + $lower) / 2;
3991
3992            # No point in continuing unless $i changes for next time
3993            # in the loop.
3994            if ($temp == $i) {
3995
3996                # We can't reach the highest element because of the averaging.
3997                # So if one below the upper edge, force it there and try one
3998                # more time.
3999                if ($i == $range_list_size - 2) {
4000
4001                    trace "Forcing to upper edge" if main::DEBUG && $to_trace;
4002                    $i = $range_list_size - 1;
4003
4004                    # Change $lower as well so if fails next time through,
4005                    # taking the average will yield the same $i, and we will
4006                    # quit with the error message just below.
4007                    $lower = $i;
4008                    next;
4009                }
4010                Carp::my_carp_bug("$owner_name_of{$addr}Can't find where the range ought to go.  No action taken.");
4011                return;
4012            }
4013            $i = $temp;
4014        } # End of while loop
4015
4016        if (main::DEBUG && $to_trace) {
4017            trace 'i-1=[', $i-1, ']', $r->[$i-1] if $i;
4018            trace "i=  [ $i ]", $r->[$i];
4019            trace 'i+1=[', $i+1, ']', $r->[$i+1] if $i < $range_list_size - 1;
4020        }
4021
4022        # Here we have found the offset.  Cache it as a starting point for the
4023        # next call.
4024        $_search_ranges_cache{$addr} = $i;
4025        return $i;
4026    }
4027
4028    sub _add_delete {
4029        # Add, replace or delete ranges to or from a list.  The $type
4030        # parameter gives which:
4031        #   '+' => insert or replace a range, returning a list of any changed
4032        #          ranges.
4033        #   '-' => delete a range, returning a list of any deleted ranges.
4034        #
4035        # The next three parameters give respectively the start, end, and
4036        # value associated with the range.  'value' should be null unless the
4037        # operation is '+';
4038        #
4039        # The range list is kept sorted so that the range with the lowest
4040        # starting position is first in the list, and generally, adjacent
4041        # ranges with the same values are merged into a single larger one (see
4042        # exceptions below).
4043        #
4044        # There are more parameters; all are key => value pairs:
4045        #   Type    gives the type of the value.  It is only valid for '+'.
4046        #           All ranges have types; if this parameter is omitted, 0 is
4047        #           assumed.  Ranges with type 0 are assumed to obey the
4048        #           Unicode rules for casing, etc; ranges with other types are
4049        #           not.  Otherwise, the type is arbitrary, for the caller's
4050        #           convenience, and looked at only by this routine to keep
4051        #           adjacent ranges of different types from being merged into
4052        #           a single larger range, and when Replace =>
4053        #           $IF_NOT_EQUIVALENT is specified (see just below).
4054        #   Replace  determines what to do if the range list already contains
4055        #            ranges which coincide with all or portions of the input
4056        #            range.  It is only valid for '+':
4057        #       => $NO            means that the new value is not to replace
4058        #                         any existing ones, but any empty gaps of the
4059        #                         range list coinciding with the input range
4060        #                         will be filled in with the new value.
4061        #       => $UNCONDITIONALLY  means to replace the existing values with
4062        #                         this one unconditionally.  However, if the
4063        #                         new and old values are identical, the
4064        #                         replacement is skipped to save cycles
4065        #       => $IF_NOT_EQUIVALENT means to replace the existing values
4066        #          (the default)  with this one if they are not equivalent.
4067        #                         Ranges are equivalent if their types are the
4068        #                         same, and they are the same string; or if
4069        #                         both are type 0 ranges, if their Unicode
4070        #                         standard forms are identical.  In this last
4071        #                         case, the routine chooses the more "modern"
4072        #                         one to use.  This is because some of the
4073        #                         older files are formatted with values that
4074        #                         are, for example, ALL CAPs, whereas the
4075        #                         derived files have a more modern style,
4076        #                         which looks better.  By looking for this
4077        #                         style when the pre-existing and replacement
4078        #                         standard forms are the same, we can move to
4079        #                         the modern style
4080        #       => $MULTIPLE_BEFORE means that if this range duplicates an
4081        #                         existing one, but has a different value,
4082        #                         don't replace the existing one, but insert
4083        #                         this one so that the same range can occur
4084        #                         multiple times.  They are stored LIFO, so
4085        #                         that the final one inserted is the first one
4086        #                         returned in an ordered search of the table.
4087        #                         If this is an exact duplicate, including the
4088        #                         value, the original will be moved to be
4089        #                         first, before any other duplicate ranges
4090        #                         with different values.
4091        #       => $MULTIPLE_AFTER is like $MULTIPLE_BEFORE, but is stored
4092        #                         FIFO, so that this one is inserted after all
4093        #                         others that currently exist.  If this is an
4094        #                         exact duplicate, including value, of an
4095        #                         existing range, this one is discarded
4096        #                         (leaving the existing one in its original,
4097        #                         higher priority position
4098        #       => $CROAK         Die with an error if is already there
4099        #       => anything else  is the same as => $IF_NOT_EQUIVALENT
4100        #
4101        # "same value" means identical for non-type-0 ranges, and it means
4102        # having the same standard forms for type-0 ranges.
4103
4104        return Carp::carp_too_few_args(\@_, 5) if main::DEBUG && @_ < 5;
4105
4106        my $self = shift;
4107        my $operation = shift;   # '+' for add/replace; '-' for delete;
4108        my $start = shift;
4109        my $end   = shift;
4110        my $value = shift;
4111
4112        my %args = @_;
4113
4114        $value = "" if not defined $value;        # warning: $value can be "0"
4115
4116        my $replace = delete $args{'Replace'};
4117        $replace = $IF_NOT_EQUIVALENT unless defined $replace;
4118
4119        my $type = delete $args{'Type'};
4120        $type = 0 unless defined $type;
4121
4122        Carp::carp_extra_args(\%args) if main::DEBUG && %args;
4123
4124        my $addr = do { no overloading; pack 'J', $self; };
4125
4126        if ($operation ne '+' && $operation ne '-') {
4127            Carp::my_carp_bug("$owner_name_of{$addr}First parameter to _add_delete must be '+' or '-'.  No action taken.");
4128            return;
4129        }
4130        unless (defined $start && defined $end) {
4131            Carp::my_carp_bug("$owner_name_of{$addr}Undefined start and/or end to _add_delete.  No action taken.");
4132            return;
4133        }
4134        unless ($end >= $start) {
4135            Carp::my_carp_bug("$owner_name_of{$addr}End of range (" . sprintf("%04X", $end) . ") must not be before start (" . sprintf("%04X", $start) . ").  No action taken.");
4136            return;
4137        }
4138        #local $to_trace = 1 if main::DEBUG;
4139
4140        if ($operation eq '-') {
4141            if ($replace != $IF_NOT_EQUIVALENT) {
4142                Carp::my_carp_bug("$owner_name_of{$addr}Replace => \$IF_NOT_EQUIVALENT is required when deleting a range from a range list.  Assuming Replace => \$IF_NOT_EQUIVALENT.");
4143                $replace = $IF_NOT_EQUIVALENT;
4144            }
4145            if ($type) {
4146                Carp::my_carp_bug("$owner_name_of{$addr}Type => 0 is required when deleting a range from a range list.  Assuming Type => 0.");
4147                $type = 0;
4148            }
4149            if ($value ne "") {
4150                Carp::my_carp_bug("$owner_name_of{$addr}Value => \"\" is required when deleting a range from a range list.  Assuming Value => \"\".");
4151                $value = "";
4152            }
4153        }
4154
4155        my $r = $ranges{$addr};               # The current list of ranges
4156        my $range_list_size = scalar @$r;     # And its size
4157        my $max = $max{$addr};                # The current high code point in
4158                                              # the list of ranges
4159
4160        # Do a special case requiring fewer machine cycles when the new range
4161        # starts after the current highest point.  The Unicode input data is
4162        # structured so this is common.
4163        if ($start > $max) {
4164
4165            trace "$owner_name_of{$addr} $operation", sprintf("%04X..%04X (%s) type=%d; prev max=%04X", $start, $end, $value, $type, $max) if main::DEBUG && $to_trace;
4166            return if $operation eq '-'; # Deleting a non-existing range is a
4167                                         # no-op
4168
4169            # If the new range doesn't logically extend the current final one
4170            # in the range list, create a new range at the end of the range
4171            # list.  (max cleverly is initialized to a negative number not
4172            # adjacent to 0 if the range list is empty, so even adding a range
4173            # to an empty range list starting at 0 will have this 'if'
4174            # succeed.)
4175            if ($start > $max + 1        # non-adjacent means can't extend.
4176                || @{$r}[-1]->value ne $value # values differ, can't extend.
4177                || @{$r}[-1]->type != $type # types differ, can't extend.
4178            ) {
4179                push @$r, Range->new($start, $end,
4180                                     Value => $value,
4181                                     Type => $type);
4182            }
4183            else {
4184
4185                # Here, the new range starts just after the current highest in
4186                # the range list, and they have the same type and value.
4187                # Extend the existing range to incorporate the new one.
4188                @{$r}[-1]->set_end($end);
4189            }
4190
4191            # This becomes the new maximum.
4192            $max{$addr} = $end;
4193
4194            return;
4195        }
4196        #local $to_trace = 0 if main::DEBUG;
4197
4198        trace "$owner_name_of{$addr} $operation", sprintf("%04X", $start) . '..' . sprintf("%04X", $end) . " ($value) replace=$replace" if main::DEBUG && $to_trace;
4199
4200        # Here, the input range isn't after the whole rest of the range list.
4201        # Most likely 'splice' will be needed.  The rest of the routine finds
4202        # the needed splice parameters, and if necessary, does the splice.
4203        # First, find the offset parameter needed by the splice function for
4204        # the input range.  Note that the input range may span multiple
4205        # existing ones, but we'll worry about that later.  For now, just find
4206        # the beginning.  If the input range is to be inserted starting in a
4207        # position not currently in the range list, it must (obviously) come
4208        # just after the range below it, and just before the range above it.
4209        # Slightly less obviously, it will occupy the position currently
4210        # occupied by the range that is to come after it.  More formally, we
4211        # are looking for the position, $i, in the array of ranges, such that:
4212        #
4213        # r[$i-1]->start <= r[$i-1]->end < $start < r[$i]->start <= r[$i]->end
4214        #
4215        # (The ordered relationships within existing ranges are also shown in
4216        # the equation above).  However, if the start of the input range is
4217        # within an existing range, the splice offset should point to that
4218        # existing range's position in the list; that is $i satisfies a
4219        # somewhat different equation, namely:
4220        #
4221        #r[$i-1]->start <= r[$i-1]->end < r[$i]->start <= $start <= r[$i]->end
4222        #
4223        # More briefly, $start can come before or after r[$i]->start, and at
4224        # this point, we don't know which it will be.  However, these
4225        # two equations share these constraints:
4226        #
4227        #   r[$i-1]->end < $start <= r[$i]->end
4228        #
4229        # And that is good enough to find $i.
4230
4231        my $i = $self->_search_ranges($start);
4232        if (! defined $i) {
4233            Carp::my_carp_bug("Searching $self for range beginning with $start unexpectedly returned undefined.  Operation '$operation' not performed");
4234            return;
4235        }
4236
4237        # The search function returns $i such that:
4238        #
4239        # r[$i-1]->end < $start <= r[$i]->end
4240        #
4241        # That means that $i points to the first range in the range list
4242        # that could possibly be affected by this operation.  We still don't
4243        # know if the start of the input range is within r[$i], or if it
4244        # points to empty space between r[$i-1] and r[$i].
4245        trace "[$i] is the beginning splice point.  Existing range there is ", $r->[$i] if main::DEBUG && $to_trace;
4246
4247        # Special case the insertion of data that is not to replace any
4248        # existing data.
4249        if ($replace == $NO) {  # If $NO, has to be operation '+'
4250            #local $to_trace = 1 if main::DEBUG;
4251            trace "Doesn't replace" if main::DEBUG && $to_trace;
4252
4253            # Here, the new range is to take effect only on those code points
4254            # that aren't already in an existing range.  This can be done by
4255            # looking through the existing range list and finding the gaps in
4256            # the ranges that this new range affects, and then calling this
4257            # function recursively on each of those gaps, leaving untouched
4258            # anything already in the list.  Gather up a list of the changed
4259            # gaps first so that changes to the internal state as new ranges
4260            # are added won't be a problem.
4261            my @gap_list;
4262
4263            # First, if the starting point of the input range is outside an
4264            # existing one, there is a gap from there to the beginning of the
4265            # existing range -- add a span to fill the part that this new
4266            # range occupies
4267            if ($start < $r->[$i]->start) {
4268                push @gap_list, Range->new($start,
4269                                           main::min($end,
4270                                                     $r->[$i]->start - 1),
4271                                           Type => $type);
4272                trace "gap before $r->[$i] [$i], will add", $gap_list[-1] if main::DEBUG && $to_trace;
4273            }
4274
4275            # Then look through the range list for other gaps until we reach
4276            # the highest range affected by the input one.
4277            my $j;
4278            for ($j = $i+1; $j < $range_list_size; $j++) {
4279                trace "j=[$j]", $r->[$j] if main::DEBUG && $to_trace;
4280                last if $end < $r->[$j]->start;
4281
4282                # If there is a gap between when this range starts and the
4283                # previous one ends, add a span to fill it.  Note that just
4284                # because there are two ranges doesn't mean there is a
4285                # non-zero gap between them.  It could be that they have
4286                # different values or types
4287                if ($r->[$j-1]->end + 1 != $r->[$j]->start) {
4288                    push @gap_list,
4289                        Range->new($r->[$j-1]->end + 1,
4290                                   $r->[$j]->start - 1,
4291                                   Type => $type);
4292                    trace "gap between $r->[$j-1] and $r->[$j] [$j], will add: $gap_list[-1]" if main::DEBUG && $to_trace;
4293                }
4294            }
4295
4296            # Here, we have either found an existing range in the range list,
4297            # beyond the area affected by the input one, or we fell off the
4298            # end of the loop because the input range affects the whole rest
4299            # of the range list.  In either case, $j is 1 higher than the
4300            # highest affected range.  If $j == $i, it means that there are no
4301            # affected ranges, that the entire insertion is in the gap between
4302            # r[$i-1], and r[$i], which we already have taken care of before
4303            # the loop.
4304            # On the other hand, if there are affected ranges, it might be
4305            # that there is a gap that needs filling after the final such
4306            # range to the end of the input range
4307            if ($r->[$j-1]->end < $end) {
4308                    push @gap_list, Range->new(main::max($start,
4309                                                         $r->[$j-1]->end + 1),
4310                                               $end,
4311                                               Type => $type);
4312                    trace "gap after $r->[$j-1], will add $gap_list[-1]" if main::DEBUG && $to_trace;
4313            }
4314
4315            # Call recursively to fill in all the gaps.
4316            foreach my $gap (@gap_list) {
4317                $self->_add_delete($operation,
4318                                   $gap->start,
4319                                   $gap->end,
4320                                   $value,
4321                                   Type => $type);
4322            }
4323
4324            return;
4325        }
4326
4327        # Here, we have taken care of the case where $replace is $NO.
4328        # Remember that here, r[$i-1]->end < $start <= r[$i]->end
4329        # If inserting a multiple record, this is where it goes, before the
4330        # first (if any) existing one if inserting LIFO.  (If this is to go
4331        # afterwards, FIFO, we below move the pointer to there.)  These imply
4332        # an insertion, and no change to any existing ranges.  Note that $i
4333        # can be -1 if this new range doesn't actually duplicate any existing,
4334        # and comes at the beginning of the list.
4335        if ($replace == $MULTIPLE_BEFORE || $replace == $MULTIPLE_AFTER) {
4336
4337            if ($start != $end) {
4338                Carp::my_carp_bug("$owner_name_of{$addr}Can't cope with adding a multiple record when the range ($start..$end) contains more than one code point.  No action taken.");
4339                return;
4340            }
4341
4342            # If the new code point is within a current range ...
4343            if ($end >= $r->[$i]->start) {
4344
4345                # Don't add an exact duplicate, as it isn't really a multiple
4346                my $existing_value = $r->[$i]->value;
4347                my $existing_type = $r->[$i]->type;
4348                return if $value eq $existing_value && $type eq $existing_type;
4349
4350                # If the multiple value is part of an existing range, we want
4351                # to split up that range, so that only the single code point
4352                # is affected.  To do this, we first call ourselves
4353                # recursively to delete that code point from the table, having
4354                # preserved its current data above.  Then we call ourselves
4355                # recursively again to add the new multiple, which we know by
4356                # the test just above is different than the current code
4357                # point's value, so it will become a range containing a single
4358                # code point: just itself.  Finally, we add back in the
4359                # pre-existing code point, which will again be a single code
4360                # point range.  Because 'i' likely will have changed as a
4361                # result of these operations, we can't just continue on, but
4362                # do this operation recursively as well.  If we are inserting
4363                # LIFO, the pre-existing code point needs to go after the new
4364                # one, so use MULTIPLE_AFTER; and vice versa.
4365                if ($r->[$i]->start != $r->[$i]->end) {
4366                    $self->_add_delete('-', $start, $end, "");
4367                    $self->_add_delete('+', $start, $end, $value, Type => $type);
4368                    return $self->_add_delete('+',
4369                            $start, $end,
4370                            $existing_value,
4371                            Type => $existing_type,
4372                            Replace => ($replace == $MULTIPLE_BEFORE)
4373                                       ? $MULTIPLE_AFTER
4374                                       : $MULTIPLE_BEFORE);
4375                }
4376            }
4377
4378            # If to place this new record after, move to beyond all existing
4379            # ones; but don't add this one if identical to any of them, as it
4380            # isn't really a multiple.  This leaves the original order, so
4381            # that the current request is ignored.  The reasoning is that the
4382            # previous request that wanted this record to have high priority
4383            # should have precedence.
4384            if ($replace == $MULTIPLE_AFTER) {
4385                while ($i < @$r && $r->[$i]->start == $start) {
4386                    return if $value eq $r->[$i]->value
4387                              && $type eq $r->[$i]->type;
4388                    $i++;
4389                }
4390            }
4391            else {
4392                # If instead we are to place this new record before any
4393                # existing ones, remove any identical ones that come after it.
4394                # This changes the existing order so that the new one is
4395                # first, as is being requested.
4396                for (my $j = $i + 1;
4397                     $j < @$r && $r->[$j]->start == $start;
4398                     $j++)
4399                {
4400                    if ($value eq $r->[$j]->value && $type eq $r->[$j]->type) {
4401                        splice @$r, $j, 1;
4402                        last;   # There should only be one instance, so no
4403                                # need to keep looking
4404                    }
4405                }
4406            }
4407
4408            trace "Adding multiple record at $i with $start..$end, $value" if main::DEBUG && $to_trace;
4409            my @return = splice @$r,
4410                                $i,
4411                                0,
4412                                Range->new($start,
4413                                           $end,
4414                                           Value => $value,
4415                                           Type => $type);
4416            if (main::DEBUG && $to_trace) {
4417                trace "After splice:";
4418                trace 'i-2=[', $i-2, ']', $r->[$i-2] if $i >= 2;
4419                trace 'i-1=[', $i-1, ']', $r->[$i-1] if $i >= 1;
4420                trace "i  =[", $i, "]", $r->[$i] if $i >= 0;
4421                trace 'i+1=[', $i+1, ']', $r->[$i+1] if $i < @$r - 1;
4422                trace 'i+2=[', $i+2, ']', $r->[$i+2] if $i < @$r - 2;
4423                trace 'i+3=[', $i+3, ']', $r->[$i+3] if $i < @$r - 3;
4424            }
4425            return @return;
4426        }
4427
4428        # Here, we have taken care of $NO and $MULTIPLE_foo replaces.  This
4429        # leaves delete, insert, and replace either unconditionally or if not
4430        # equivalent.  $i still points to the first potential affected range.
4431        # Now find the highest range affected, which will determine the length
4432        # parameter to splice.  (The input range can span multiple existing
4433        # ones.)  If this isn't a deletion, while we are looking through the
4434        # range list, see also if this is a replacement rather than a clean
4435        # insertion; that is if it will change the values of at least one
4436        # existing range.  Start off assuming it is an insert, until find it
4437        # isn't.
4438        my $clean_insert = $operation eq '+';
4439        my $j;        # This will point to the highest affected range
4440
4441        # For non-zero types, the standard form is the value itself;
4442        my $standard_form = ($type) ? $value : main::standardize($value);
4443
4444        for ($j = $i; $j < $range_list_size; $j++) {
4445            trace "Looking for highest affected range; the one at $j is ", $r->[$j] if main::DEBUG && $to_trace;
4446
4447            # If find a range that it doesn't overlap into, we can stop
4448            # searching
4449            last if $end < $r->[$j]->start;
4450
4451            # Here, overlaps the range at $j.  If the values don't match,
4452            # and so far we think this is a clean insertion, it becomes a
4453            # non-clean insertion, i.e., a 'change' or 'replace' instead.
4454            if ($clean_insert) {
4455                if ($r->[$j]->standard_form ne $standard_form) {
4456                    $clean_insert = 0;
4457                    if ($replace == $CROAK) {
4458                        main::croak("The range to add "
4459                        . sprintf("%04X", $start)
4460                        . '-'
4461                        . sprintf("%04X", $end)
4462                        . " with value '$value' overlaps an existing range $r->[$j]");
4463                    }
4464                }
4465                else {
4466
4467                    # Here, the two values are essentially the same.  If the
4468                    # two are actually identical, replacing wouldn't change
4469                    # anything so skip it.
4470                    my $pre_existing = $r->[$j]->value;
4471                    if ($pre_existing ne $value) {
4472
4473                        # Here the new and old standardized values are the
4474                        # same, but the non-standardized values aren't.  If
4475                        # replacing unconditionally, then replace
4476                        if( $replace == $UNCONDITIONALLY) {
4477                            $clean_insert = 0;
4478                        }
4479                        else {
4480
4481                            # Here, are replacing conditionally.  Decide to
4482                            # replace or not based on which appears to look
4483                            # the "nicest".  If one is mixed case and the
4484                            # other isn't, choose the mixed case one.
4485                            my $new_mixed = $value =~ /[A-Z]/
4486                                            && $value =~ /[a-z]/;
4487                            my $old_mixed = $pre_existing =~ /[A-Z]/
4488                                            && $pre_existing =~ /[a-z]/;
4489
4490                            if ($old_mixed != $new_mixed) {
4491                                $clean_insert = 0 if $new_mixed;
4492                                if (main::DEBUG && $to_trace) {
4493                                    if ($clean_insert) {
4494                                        trace "Retaining $pre_existing over $value";
4495                                    }
4496                                    else {
4497                                        trace "Replacing $pre_existing with $value";
4498                                    }
4499                                }
4500                            }
4501                            else {
4502
4503                                # Here casing wasn't different between the two.
4504                                # If one has hyphens or underscores and the
4505                                # other doesn't, choose the one with the
4506                                # punctuation.
4507                                my $new_punct = $value =~ /[-_]/;
4508                                my $old_punct = $pre_existing =~ /[-_]/;
4509
4510                                if ($old_punct != $new_punct) {
4511                                    $clean_insert = 0 if $new_punct;
4512                                    if (main::DEBUG && $to_trace) {
4513                                        if ($clean_insert) {
4514                                            trace "Retaining $pre_existing over $value";
4515                                        }
4516                                        else {
4517                                            trace "Replacing $pre_existing with $value";
4518                                        }
4519                                    }
4520                                }   # else existing one is just as "good";
4521                                    # retain it to save cycles.
4522                            }
4523                        }
4524                    }
4525                }
4526            }
4527        } # End of loop looking for highest affected range.
4528
4529        # Here, $j points to one beyond the highest range that this insertion
4530        # affects (hence to beyond the range list if that range is the final
4531        # one in the range list).
4532
4533        # The splice length is all the affected ranges.  Get it before
4534        # subtracting, for efficiency, so we don't have to later add 1.
4535        my $length = $j - $i;
4536
4537        $j--;        # $j now points to the highest affected range.
4538        trace "Final affected range is $j: $r->[$j]" if main::DEBUG && $to_trace;
4539
4540        # Here, have taken care of $NO and $MULTIPLE_foo replaces.
4541        # $j points to the highest affected range.  But it can be < $i or even
4542        # -1.  These happen only if the insertion is entirely in the gap
4543        # between r[$i-1] and r[$i].  Here's why: j < i means that the j loop
4544        # above exited first time through with $end < $r->[$i]->start.  (And
4545        # then we subtracted one from j)  This implies also that $start <
4546        # $r->[$i]->start, but we know from above that $r->[$i-1]->end <
4547        # $start, so the entire input range is in the gap.
4548        if ($j < $i) {
4549
4550            # Here the entire input range is in the gap before $i.
4551
4552            if (main::DEBUG && $to_trace) {
4553                if ($i) {
4554                    trace "Entire range is between $r->[$i-1] and $r->[$i]";
4555                }
4556                else {
4557                    trace "Entire range is before $r->[$i]";
4558                }
4559            }
4560            return if $operation ne '+'; # Deletion of a non-existent range is
4561                                         # a no-op
4562        }
4563        else {
4564
4565            # Here part of the input range is not in the gap before $i.  Thus,
4566            # there is at least one affected one, and $j points to the highest
4567            # such one.
4568
4569            # At this point, here is the situation:
4570            # This is not an insertion of a multiple, nor of tentative ($NO)
4571            # data.
4572            #   $i  points to the first element in the current range list that
4573            #            may be affected by this operation.  In fact, we know
4574            #            that the range at $i is affected because we are in
4575            #            the else branch of this 'if'
4576            #   $j  points to the highest affected range.
4577            # In other words,
4578            #   r[$i-1]->end < $start <= r[$i]->end
4579            # And:
4580            #   r[$i-1]->end < $start <= $end < r[$j+1]->start
4581            #
4582            # Also:
4583            #   $clean_insert is a boolean which is set true if and only if
4584            #        this is a "clean insertion", i.e., not a change nor a
4585            #        deletion (multiple was handled above).
4586
4587            # We now have enough information to decide if this call is a no-op
4588            # or not.  It is a no-op if this is an insertion of already
4589            # existing data.  To be so, it must be contained entirely in one
4590            # range.
4591
4592            if (main::DEBUG && $to_trace && $clean_insert
4593                                         && $start >= $r->[$i]->start
4594                                         && $end   <= $r->[$i]->end)
4595            {
4596                    trace "no-op";
4597            }
4598            return if $clean_insert
4599                      && $start >= $r->[$i]->start
4600                      && $end   <= $r->[$i]->end;
4601        }
4602
4603        # Here, we know that some action will have to be taken.  We have
4604        # calculated the offset and length (though adjustments may be needed)
4605        # for the splice.  Now start constructing the replacement list.
4606        my @replacement;
4607        my $splice_start = $i;
4608
4609        my $extends_below;
4610        my $extends_above;
4611
4612        # See if should extend any adjacent ranges.
4613        if ($operation eq '-') { # Don't extend deletions
4614            $extends_below = $extends_above = 0;
4615        }
4616        else {  # Here, should extend any adjacent ranges.  See if there are
4617                # any.
4618            $extends_below = ($i > 0
4619                            # can't extend unless adjacent
4620                            && $r->[$i-1]->end == $start -1
4621                            # can't extend unless are same standard value
4622                            && $r->[$i-1]->standard_form eq $standard_form
4623                            # can't extend unless share type
4624                            && $r->[$i-1]->type == $type);
4625            $extends_above = ($j+1 < $range_list_size
4626                            && $r->[$j+1]->start == $end +1
4627                            && $r->[$j+1]->standard_form eq $standard_form
4628                            && $r->[$j+1]->type == $type);
4629        }
4630        if ($extends_below && $extends_above) { # Adds to both
4631            $splice_start--;     # start replace at element below
4632            $length += 2;        # will replace on both sides
4633            trace "Extends both below and above ranges" if main::DEBUG && $to_trace;
4634
4635            # The result will fill in any gap, replacing both sides, and
4636            # create one large range.
4637            @replacement = Range->new($r->[$i-1]->start,
4638                                      $r->[$j+1]->end,
4639                                      Value => $value,
4640                                      Type => $type);
4641        }
4642        else {
4643
4644            # Here we know that the result won't just be the conglomeration of
4645            # a new range with both its adjacent neighbors.  But it could
4646            # extend one of them.
4647
4648            if ($extends_below) {
4649
4650                # Here the new element adds to the one below, but not to the
4651                # one above.  If inserting, and only to that one range,  can
4652                # just change its ending to include the new one.
4653                if ($length == 0 && $clean_insert) {
4654                    $r->[$i-1]->set_end($end);
4655                    trace "inserted range extends range to below so it is now $r->[$i-1]" if main::DEBUG && $to_trace;
4656                    return;
4657                }
4658                else {
4659                    trace "Changing inserted range to start at ", sprintf("%04X",  $r->[$i-1]->start), " instead of ", sprintf("%04X", $start) if main::DEBUG && $to_trace;
4660                    $splice_start--;        # start replace at element below
4661                    $length++;              # will replace the element below
4662                    $start = $r->[$i-1]->start;
4663                }
4664            }
4665            elsif ($extends_above) {
4666
4667                # Here the new element adds to the one above, but not below.
4668                # Mirror the code above
4669                if ($length == 0 && $clean_insert) {
4670                    $r->[$j+1]->set_start($start);
4671                    trace "inserted range extends range to above so it is now $r->[$j+1]" if main::DEBUG && $to_trace;
4672                    return;
4673                }
4674                else {
4675                    trace "Changing inserted range to end at ", sprintf("%04X",  $r->[$j+1]->end), " instead of ", sprintf("%04X", $end) if main::DEBUG && $to_trace;
4676                    $length++;        # will replace the element above
4677                    $end = $r->[$j+1]->end;
4678                }
4679            }
4680
4681            trace "Range at $i is $r->[$i]" if main::DEBUG && $to_trace;
4682
4683            # Finally, here we know there will have to be a splice.
4684            # If the change or delete affects only the highest portion of the
4685            # first affected range, the range will have to be split.  The
4686            # splice will remove the whole range, but will replace it by a new
4687            # range containing just the unaffected part.  So, in this case,
4688            # add to the replacement list just this unaffected portion.
4689            if (! $extends_below
4690                && $start > $r->[$i]->start && $start <= $r->[$i]->end)
4691            {
4692                push @replacement,
4693                    Range->new($r->[$i]->start,
4694                               $start - 1,
4695                               Value => $r->[$i]->value,
4696                               Type => $r->[$i]->type);
4697            }
4698
4699            # In the case of an insert or change, but not a delete, we have to
4700            # put in the new stuff;  this comes next.
4701            if ($operation eq '+') {
4702                push @replacement, Range->new($start,
4703                                              $end,
4704                                              Value => $value,
4705                                              Type => $type);
4706            }
4707
4708            trace "Range at $j is $r->[$j]" if main::DEBUG && $to_trace && $j != $i;
4709            #trace "$end >=", $r->[$j]->start, " && $end <", $r->[$j]->end if main::DEBUG && $to_trace;
4710
4711            # And finally, if we're changing or deleting only a portion of the
4712            # highest affected range, it must be split, as the lowest one was.
4713            if (! $extends_above
4714                && $j >= 0  # Remember that j can be -1 if before first
4715                            # current element
4716                && $end >= $r->[$j]->start
4717                && $end < $r->[$j]->end)
4718            {
4719                push @replacement,
4720                    Range->new($end + 1,
4721                               $r->[$j]->end,
4722                               Value => $r->[$j]->value,
4723                               Type => $r->[$j]->type);
4724            }
4725        }
4726
4727        # And do the splice, as calculated above
4728        if (main::DEBUG && $to_trace) {
4729            trace "replacing $length element(s) at $i with ";
4730            foreach my $replacement (@replacement) {
4731                trace "    $replacement";
4732            }
4733            trace "Before splice:";
4734            trace 'i-2=[', $i-2, ']', $r->[$i-2] if $i >= 2;
4735            trace 'i-1=[', $i-1, ']', $r->[$i-1] if $i >= 1;
4736            trace "i  =[", $i, "]", $r->[$i];
4737            trace 'i+1=[', $i+1, ']', $r->[$i+1] if $i < @$r - 1;
4738            trace 'i+2=[', $i+2, ']', $r->[$i+2] if $i < @$r - 2;
4739        }
4740
4741        my @return = splice @$r, $splice_start, $length, @replacement;
4742
4743        if (main::DEBUG && $to_trace) {
4744            trace "After splice:";
4745            trace 'i-2=[', $i-2, ']', $r->[$i-2] if $i >= 2;
4746            trace 'i-1=[', $i-1, ']', $r->[$i-1] if $i >= 1;
4747            trace "i  =[", $i, "]", $r->[$i];
4748            trace 'i+1=[', $i+1, ']', $r->[$i+1] if $i < @$r - 1;
4749            trace 'i+2=[', $i+2, ']', $r->[$i+2] if $i < @$r - 2;
4750            trace "removed ", @return if @return;
4751        }
4752
4753        # An actual deletion could have changed the maximum in the list.
4754        # There was no deletion if the splice didn't return something, but
4755        # otherwise recalculate it.  This is done too rarely to worry about
4756        # performance.
4757        if ($operation eq '-' && @return) {
4758            if (@$r) {
4759                $max{$addr} = $r->[-1]->end;
4760            }
4761            else {  # Now empty
4762                $max{$addr} = $max_init;
4763            }
4764        }
4765        return @return;
4766    }
4767
4768    sub reset_each_range($self) {  # reset the iterator for each_range();
4769        no overloading;
4770        undef $each_range_iterator{pack 'J', $self};
4771        return;
4772    }
4773
4774    sub each_range($self) {
4775        # Iterate over each range in a range list.  Results are undefined if
4776        # the range list is changed during the iteration.
4777        my $addr = do { no overloading; pack 'J', $self; };
4778
4779        return if $self->is_empty;
4780
4781        $each_range_iterator{$addr} = -1
4782                                if ! defined $each_range_iterator{$addr};
4783        $each_range_iterator{$addr}++;
4784        return $ranges{$addr}->[$each_range_iterator{$addr}]
4785                        if $each_range_iterator{$addr} < @{$ranges{$addr}};
4786        undef $each_range_iterator{$addr};
4787        return;
4788    }
4789
4790    sub count($self) {        # Returns count of code points in range list
4791        my $addr = do { no overloading; pack 'J', $self; };
4792
4793        my $count = 0;
4794        foreach my $range (@{$ranges{$addr}}) {
4795            $count += $range->end - $range->start + 1;
4796        }
4797        return $count;
4798    }
4799
4800    sub delete_range($self, $start, $end) {    # Delete a range
4801        return $self->_add_delete('-', $start, $end, "");
4802    }
4803
4804    sub is_empty($self) { # Returns boolean as to if a range list is empty
4805        no overloading;
4806        return scalar @{$ranges{pack 'J', $self}} == 0;
4807    }
4808
4809    sub hash($self) {
4810        # Quickly returns a scalar suitable for separating tables into
4811        # buckets, i.e. it is a hash function of the contents of a table, so
4812        # there are relatively few conflicts.
4813        my $addr = do { no overloading; pack 'J', $self; };
4814
4815        # These are quickly computable.  Return looks like 'min..max;count'
4816        return $self->min . "..$max{$addr};" . scalar @{$ranges{$addr}};
4817    }
4818} # End closure for _Range_List_Base
4819
4820package Range_List;
4821use parent '-norequire', '_Range_List_Base';
4822
4823use warnings;
4824use strict;
4825
4826use feature 'signatures';
4827no warnings 'experimental::signatures';
4828
4829# A Range_List is a range list for match tables; i.e. the range values are
4830# not significant.  Thus a number of operations can be safely added to it,
4831# such as inversion, intersection.  Note that union is also an unsafe
4832# operation when range values are cared about, and that method is in the base
4833# class, not here.  But things are set up so that that method is callable only
4834# during initialization.  Only in this derived class, is there an operation
4835# that combines two tables.  A Range_Map can thus be used to initialize a
4836# Range_List, and its mappings will be in the list, but are not significant to
4837# this class.
4838
4839sub trace { return main::trace(@_); }
4840
4841{ # Closure
4842
4843    use overload
4844        fallback => 0,
4845        '+' => sub { my $self = shift;
4846                    my $other = shift;
4847
4848                    return $self->_union($other)
4849                },
4850        '+=' => sub { my $self = shift;
4851                    my $other = shift;
4852                    my $reversed = shift;
4853
4854                    if ($reversed) {
4855                        Carp::my_carp_bug("Bad news.  Can't cope with '"
4856                        . ref($other)
4857                        . ' += '
4858                        . ref($self)
4859                        . "'.  undef returned.");
4860                        return;
4861                    }
4862
4863                    return $self->_union($other)
4864                },
4865        '&' => sub { my $self = shift;
4866                    my $other = shift;
4867
4868                    return $self->_intersect($other, 0);
4869                },
4870        '&=' => sub { my $self = shift;
4871                    my $other = shift;
4872                    my $reversed = shift;
4873
4874                    if ($reversed) {
4875                        Carp::my_carp_bug("Bad news.  Can't cope with '"
4876                        . ref($other)
4877                        . ' &= '
4878                        . ref($self)
4879                        . "'.  undef returned.");
4880                        return;
4881                    }
4882
4883                    return $self->_intersect($other, 0);
4884                },
4885        '~' => "_invert",
4886        '-' => "_subtract",
4887    ;
4888
4889    sub _invert($self, @) {
4890        # Returns a new Range_List that gives all code points not in $self.
4891        my $new = Range_List->new;
4892
4893        # Go through each range in the table, finding the gaps between them
4894        my $max = -1;   # Set so no gap before range beginning at 0
4895        for my $range ($self->ranges) {
4896            my $start = $range->start;
4897            my $end   = $range->end;
4898
4899            # If there is a gap before this range, the inverse will contain
4900            # that gap.
4901            if ($start > $max + 1) {
4902                $new->add_range($max + 1, $start - 1);
4903            }
4904            $max = $end;
4905        }
4906
4907        # And finally, add the gap from the end of the table to the max
4908        # possible code point
4909        if ($max < $MAX_WORKING_CODEPOINT) {
4910            $new->add_range($max + 1, $MAX_WORKING_CODEPOINT);
4911        }
4912        return $new;
4913    }
4914
4915    sub _subtract($self, $other, $reversed=0) {
4916        # Returns a new Range_List with the argument deleted from it.  The
4917        # argument can be a single code point, a range, or something that has
4918        # a range, with the _range_list() method on it returning them
4919
4920        if ($reversed) {
4921            Carp::my_carp_bug("Bad news.  Can't cope with '"
4922            . ref($other)
4923            . ' - '
4924            . ref($self)
4925            . "'.  undef returned.");
4926            return;
4927        }
4928
4929        my $new = Range_List->new(Initialize => $self);
4930
4931        if (! ref $other) { # Single code point
4932            $new->delete_range($other, $other);
4933        }
4934        elsif ($other->isa('Range')) {
4935            $new->delete_range($other->start, $other->end);
4936        }
4937        elsif ($other->can('_range_list')) {
4938            foreach my $range ($other->_range_list->ranges) {
4939                $new->delete_range($range->start, $range->end);
4940            }
4941        }
4942        else {
4943            Carp::my_carp_bug("Can't cope with a "
4944                        . ref($other)
4945                        . " argument to '-'.  Subtraction ignored."
4946                        );
4947            return $self;
4948        }
4949
4950        return $new;
4951    }
4952
4953    sub _intersect($a_object, $b_object, $check_if_overlapping=0) {
4954        # Returns either a boolean giving whether the two inputs' range lists
4955        # intersect (overlap), or a new Range_List containing the intersection
4956        # of the two lists.  The optional final parameter being true indicates
4957        # to do the check instead of the intersection.
4958
4959        if (! defined $b_object) {
4960            my $message = "";
4961            $message .= $a_object->_owner_name_of if defined $a_object;
4962            Carp::my_carp_bug($message .= "Called with undefined value.  Intersection not done.");
4963            return;
4964        }
4965
4966        # a & b = !(!a | !b), or in our terminology = ~ ( ~a + -b )
4967        # Thus the intersection could be much more simply be written:
4968        #   return ~(~$a_object + ~$b_object);
4969        # But, this is slower, and when taking the inverse of a large
4970        # range_size_1 table, back when such tables were always stored that
4971        # way, it became prohibitively slow, hence the code was changed to the
4972        # below
4973
4974        if ($b_object->isa('Range')) {
4975            $b_object = Range_List->new(Initialize => $b_object,
4976                                        Owner => $a_object->_owner_name_of);
4977        }
4978        $b_object = $b_object->_range_list if $b_object->can('_range_list');
4979
4980        my @a_ranges = $a_object->ranges;
4981        my @b_ranges = $b_object->ranges;
4982
4983        #local $to_trace = 1 if main::DEBUG;
4984        trace "intersecting $a_object with ", scalar @a_ranges, "ranges and $b_object with", scalar @b_ranges, " ranges" if main::DEBUG && $to_trace;
4985
4986        # Start with the first range in each list
4987        my $a_i = 0;
4988        my $range_a = $a_ranges[$a_i];
4989        my $b_i = 0;
4990        my $range_b = $b_ranges[$b_i];
4991
4992        my $new = __PACKAGE__->new(Owner => $a_object->_owner_name_of)
4993                                                if ! $check_if_overlapping;
4994
4995        # If either list is empty, there is no intersection and no overlap
4996        if (! defined $range_a || ! defined $range_b) {
4997            return $check_if_overlapping ? 0 : $new;
4998        }
4999        trace "range_a[$a_i]=$range_a; range_b[$b_i]=$range_b" if main::DEBUG && $to_trace;
5000
5001        # Otherwise, must calculate the intersection/overlap.  Start with the
5002        # very first code point in each list
5003        my $a = $range_a->start;
5004        my $b = $range_b->start;
5005
5006        # Loop through all the ranges of each list; in each iteration, $a and
5007        # $b are the current code points in their respective lists
5008        while (1) {
5009
5010            # If $a and $b are the same code point, ...
5011            if ($a == $b) {
5012
5013                # it means the lists overlap.  If just checking for overlap
5014                # know the answer now,
5015                return 1 if $check_if_overlapping;
5016
5017                # The intersection includes this code point plus anything else
5018                # common to both current ranges.
5019                my $start = $a;
5020                my $end = main::min($range_a->end, $range_b->end);
5021                if (! $check_if_overlapping) {
5022                    trace "adding intersection range ", sprintf("%04X", $start) . ".." . sprintf("%04X", $end) if main::DEBUG && $to_trace;
5023                    $new->add_range($start, $end);
5024                }
5025
5026                # Skip ahead to the end of the current intersect
5027                $a = $b = $end;
5028
5029                # If the current intersect ends at the end of either range (as
5030                # it must for at least one of them), the next possible one
5031                # will be the beginning code point in it's list's next range.
5032                if ($a == $range_a->end) {
5033                    $range_a = $a_ranges[++$a_i];
5034                    last unless defined $range_a;
5035                    $a = $range_a->start;
5036                }
5037                if ($b == $range_b->end) {
5038                    $range_b = $b_ranges[++$b_i];
5039                    last unless defined $range_b;
5040                    $b = $range_b->start;
5041                }
5042
5043                trace "range_a[$a_i]=$range_a; range_b[$b_i]=$range_b" if main::DEBUG && $to_trace;
5044            }
5045            elsif ($a < $b) {
5046
5047                # Not equal, but if the range containing $a encompasses $b,
5048                # change $a to be the middle of the range where it does equal
5049                # $b, so the next iteration will get the intersection
5050                if ($range_a->end >= $b) {
5051                    $a = $b;
5052                }
5053                else {
5054
5055                    # Here, the current range containing $a is entirely below
5056                    # $b.  Go try to find a range that could contain $b.
5057                    $a_i = $a_object->_search_ranges($b);
5058
5059                    # If no range found, quit.
5060                    last unless defined $a_i;
5061
5062                    # The search returns $a_i, such that
5063                    #   range_a[$a_i-1]->end < $b <= range_a[$a_i]->end
5064                    # Set $a to the beginning of this new range, and repeat.
5065                    $range_a = $a_ranges[$a_i];
5066                    $a = $range_a->start;
5067                }
5068            }
5069            else { # Here, $b < $a.
5070
5071                # Mirror image code to the leg just above
5072                if ($range_b->end >= $a) {
5073                    $b = $a;
5074                }
5075                else {
5076                    $b_i = $b_object->_search_ranges($a);
5077                    last unless defined $b_i;
5078                    $range_b = $b_ranges[$b_i];
5079                    $b = $range_b->start;
5080                }
5081            }
5082        } # End of looping through ranges.
5083
5084        # Intersection fully computed, or now know that there is no overlap
5085        return $check_if_overlapping ? 0 : $new;
5086    }
5087
5088    sub overlaps($self, $other) {
5089        # Returns boolean giving whether the two arguments overlap somewhere
5090        return $self->_intersect($other, 1);
5091    }
5092
5093    sub add_range($self, $start, $end) {
5094        # Add a range to the list.
5095        return $self->_add_delete('+', $start, $end, "");
5096    }
5097
5098    sub matches_identically_to($self, $other) {
5099        # Return a boolean as to whether or not two Range_Lists match identical
5100        # sets of code points.
5101        # These are ordered in increasing real time to figure out (at least
5102        # until a patch changes that and doesn't change this)
5103        return 0 if $self->max != $other->max;
5104        return 0 if $self->min != $other->min;
5105        return 0 if $self->range_count != $other->range_count;
5106        return 0 if $self->count != $other->count;
5107
5108        # Here they could be identical because all the tests above passed.
5109        # The loop below is somewhat simpler since we know they have the same
5110        # number of elements.  Compare range by range, until reach the end or
5111        # find something that differs.
5112        my @a_ranges = $self->ranges;
5113        my @b_ranges = $other->ranges;
5114        for my $i (0 .. @a_ranges - 1) {
5115            my $a = $a_ranges[$i];
5116            my $b = $b_ranges[$i];
5117            trace "self $a; other $b" if main::DEBUG && $to_trace;
5118            return 0 if ! defined $b
5119                        || $a->start != $b->start
5120                        || $a->end != $b->end;
5121        }
5122        return 1;
5123    }
5124
5125    sub is_code_point_usable($code, $try_hard) {
5126        # This used only for making the test script.  See if the input
5127        # proposed trial code point is one that Perl will handle.  If second
5128        # parameter is 0, it won't select some code points for various
5129        # reasons, noted below.
5130        return 0 if $code < 0;                # Never use a negative
5131
5132        # shun null.  I'm (khw) not sure why this was done, but NULL would be
5133        # the character very frequently used.
5134        return $try_hard if $code == 0x0000;
5135
5136        # shun non-character code points.
5137        return $try_hard if $code >= 0xFDD0 && $code <= 0xFDEF;
5138        return $try_hard if ($code & 0xFFFE) == 0xFFFE; # includes FFFF
5139
5140        return $try_hard if $code > $MAX_UNICODE_CODEPOINT;   # keep in range
5141        return $try_hard if $code >= 0xD800 && $code <= 0xDFFF; # no surrogate
5142
5143        return 1;
5144    }
5145
5146    sub get_valid_code_point($self) {
5147        # Return a code point that's part of the range list.  Returns nothing
5148        # if the table is empty or we can't find a suitable code point.  This
5149        # used only for making the test script.
5150        my $addr = do { no overloading; pack 'J', $self; };
5151
5152        # On first pass, don't choose less desirable code points; if no good
5153        # one is found, repeat, allowing a less desirable one to be selected.
5154        for my $try_hard (0, 1) {
5155
5156            # Look through all the ranges for a usable code point.
5157            for my $set (reverse $self->ranges) {
5158
5159                # Try the edge cases first, starting with the end point of the
5160                # range.
5161                my $end = $set->end;
5162                return $end if is_code_point_usable($end, $try_hard);
5163                $end = $MAX_UNICODE_CODEPOINT + 1 if $end > $MAX_UNICODE_CODEPOINT;
5164
5165                # End point didn't, work.  Start at the beginning and try
5166                # every one until find one that does work.
5167                for my $trial ($set->start .. $end - 1) {
5168                    return $trial if is_code_point_usable($trial, $try_hard);
5169                }
5170            }
5171        }
5172        return ();  # If none found, give up.
5173    }
5174
5175    sub get_invalid_code_point($self) {
5176        # Return a code point that's not part of the table.  Returns nothing
5177        # if the table covers all code points or a suitable code point can't
5178        # be found.  This used only for making the test script.
5179
5180        # Just find a valid code point of the inverse, if any.
5181        return Range_List->new(Initialize => ~ $self)->get_valid_code_point;
5182    }
5183} # end closure for Range_List
5184
5185package Range_Map;
5186use parent '-norequire', '_Range_List_Base';
5187
5188use strict;
5189use warnings;
5190
5191use feature 'signatures';
5192no warnings 'experimental::signatures';
5193
5194# A Range_Map is a range list in which the range values (called maps) are
5195# significant, and hence shouldn't be manipulated by our other code, which
5196# could be ambiguous or lose things.  For example, in taking the union of two
5197# lists, which share code points, but which have differing values, which one
5198# has precedence in the union?
5199# It turns out that these operations aren't really necessary for map tables,
5200# and so this class was created to make sure they aren't accidentally
5201# applied to them.
5202
5203{ # Closure
5204
5205    sub add_map($self, @add) {
5206        # Add a range containing a mapping value to the list
5207        return $self->_add_delete('+', @add);
5208    }
5209
5210    sub replace_map($self, @list) {
5211        # Replace a range
5212        return $self->_add_delete('+', @list, Replace => $UNCONDITIONALLY);
5213    }
5214
5215    sub add_duplicate {
5216        # Adds entry to a range list which can duplicate an existing entry
5217
5218        my $self = shift;
5219        my $code_point = shift;
5220        my $value = shift;
5221        my %args = @_;
5222        my $replace = delete $args{'Replace'} // $MULTIPLE_BEFORE;
5223        Carp::carp_extra_args(\%args) if main::DEBUG && %args;
5224
5225        return $self->add_map($code_point, $code_point,
5226                                $value, Replace => $replace);
5227    }
5228} # End of closure for package Range_Map
5229
5230package _Base_Table;
5231
5232use strict;
5233use warnings;
5234
5235use feature 'signatures';
5236no warnings 'experimental::signatures';
5237
5238# A table is the basic data structure that gets written out into a file for
5239# use by the Perl core.  This is the abstract base class implementing the
5240# common elements from the derived ones.  A list of the methods to be
5241# furnished by an implementing class is just after the constructor.
5242
5243sub standardize { return main::standardize($_[0]); }
5244sub trace { return main::trace(@_); }
5245
5246{ # Closure
5247
5248    main::setup_package();
5249
5250    my %range_list;
5251    # Object containing the ranges of the table.
5252    main::set_access('range_list', \%range_list, 'p_r', 'p_s');
5253
5254    my %full_name;
5255    # The full table name.
5256    main::set_access('full_name', \%full_name, 'r');
5257
5258    my %name;
5259    # The table name, almost always shorter
5260    main::set_access('name', \%name, 'r');
5261
5262    my %short_name;
5263    # The shortest of all the aliases for this table, with underscores removed
5264    main::set_access('short_name', \%short_name);
5265
5266    my %nominal_short_name_length;
5267    # The length of short_name before removing underscores
5268    main::set_access('nominal_short_name_length',
5269                    \%nominal_short_name_length);
5270
5271    my %complete_name;
5272    # The complete name, including property.
5273    main::set_access('complete_name', \%complete_name, 'r');
5274
5275    my %property;
5276    # Parent property this table is attached to.
5277    main::set_access('property', \%property, 'r');
5278
5279    my %aliases;
5280    # Ordered list of alias objects of the table's name.  The first ones in
5281    # the list are output first in comments
5282    main::set_access('aliases', \%aliases, 'readable_array');
5283
5284    my %comment;
5285    # A comment associated with the table for human readers of the files
5286    main::set_access('comment', \%comment, 's');
5287
5288    my %description;
5289    # A comment giving a short description of the table's meaning for human
5290    # readers of the files.
5291    main::set_access('description', \%description, 'readable_array');
5292
5293    my %note;
5294    # A comment giving a short note about the table for human readers of the
5295    # files.
5296    main::set_access('note', \%note, 'readable_array');
5297
5298    my %fate;
5299    # Enum; there are a number of possibilities for what happens to this
5300    # table: it could be normal, or suppressed, or not for external use.  See
5301    # values at definition for $SUPPRESSED.
5302    main::set_access('fate', \%fate, 'r');
5303
5304    my %find_table_from_alias;
5305    # The parent property passes this pointer to a hash which this class adds
5306    # all its aliases to, so that the parent can quickly take an alias and
5307    # find this table.
5308    main::set_access('find_table_from_alias', \%find_table_from_alias, 'p_r');
5309
5310    my %locked;
5311    # After this table is made equivalent to another one; we shouldn't go
5312    # changing the contents because that could mean it's no longer equivalent
5313    main::set_access('locked', \%locked, 'r');
5314
5315    my %file_path;
5316    # This gives the final path to the file containing the table.  Each
5317    # directory in the path is an element in the array
5318    main::set_access('file_path', \%file_path, 'readable_array');
5319
5320    my %status;
5321    # What is the table's status, normal, $OBSOLETE, etc.  Enum
5322    main::set_access('status', \%status, 'r');
5323
5324    my %status_info;
5325    # A comment about its being obsolete, or whatever non normal status it has
5326    main::set_access('status_info', \%status_info, 'r');
5327
5328    my %caseless_equivalent;
5329    # The table this is equivalent to under /i matching, if any.
5330    main::set_access('caseless_equivalent', \%caseless_equivalent, 'r', 's');
5331
5332    my %range_size_1;
5333    # Is the table to be output with each range only a single code point?
5334    # This is done to avoid breaking existing code that may have come to rely
5335    # on this behavior in previous versions of this program.)
5336    main::set_access('range_size_1', \%range_size_1, 'r', 's');
5337
5338    my %perl_extension;
5339    # A boolean set iff this table is a Perl extension to the Unicode
5340    # standard.
5341    main::set_access('perl_extension', \%perl_extension, 'r');
5342
5343    my %output_range_counts;
5344    # A boolean set iff this table is to have comments written in the
5345    # output file that contain the number of code points in the range.
5346    # The constructor can override the global flag of the same name.
5347    main::set_access('output_range_counts', \%output_range_counts, 'r');
5348
5349    my %write_as_invlist;
5350    # A boolean set iff the output file for this table is to be in the form of
5351    # an inversion list/map.
5352    main::set_access('write_as_invlist', \%write_as_invlist, 'r');
5353
5354    my %format;
5355    # The format of the entries of the table.  This is calculated from the
5356    # data in the table (or passed in the constructor).  This is an enum e.g.,
5357    # $STRING_FORMAT.  It is marked protected as it should not be generally
5358    # used to override calculations.
5359    main::set_access('format', \%format, 'r', 'p_s');
5360
5361    my %has_dependency;
5362    # A boolean that gives whether some other table in this property is
5363    # defined as the complement of this table.  This is a crude, but currently
5364    # sufficient, mechanism to make this table not get destroyed before what
5365    # is dependent on it is.  Other dependencies could be added, so the name
5366    # was chosen to reflect a more general situation than actually is
5367    # currently the case.
5368    main::set_access('has_dependency', \%has_dependency, 'r', 's');
5369
5370    sub new {
5371        # All arguments are key => value pairs, which you can see below, most
5372        # of which match fields documented above.  Otherwise: Re_Pod_Entry,
5373        # OK_as_Filename, and Fuzzy apply to the names of the table, and are
5374        # documented in the Alias package
5375
5376        return Carp::carp_too_few_args(\@_, 2) if main::DEBUG && @_ < 2;
5377
5378        my $class = shift;
5379
5380        my $self = bless \do { my $anonymous_scalar }, $class;
5381        my $addr = do { no overloading; pack 'J', $self; };
5382
5383        my %args = @_;
5384
5385        $name{$addr} = delete $args{'Name'};
5386        $find_table_from_alias{$addr} = delete $args{'_Alias_Hash'};
5387        $full_name{$addr} = delete $args{'Full_Name'};
5388        my $complete_name = $complete_name{$addr}
5389                          = delete $args{'Complete_Name'};
5390        $format{$addr} = delete $args{'Format'};
5391        $output_range_counts{$addr} = delete $args{'Output_Range_Counts'};
5392        $property{$addr} = delete $args{'_Property'};
5393        $range_list{$addr} = delete $args{'_Range_List'};
5394        $status{$addr} = delete $args{'Status'} || $NORMAL;
5395        $status_info{$addr} = delete $args{'_Status_Info'} || "";
5396        $range_size_1{$addr} = delete $args{'Range_Size_1'} || 0;
5397        $caseless_equivalent{$addr} = delete $args{'Caseless_Equivalent'} || 0;
5398        $fate{$addr} = delete $args{'Fate'} || $ORDINARY;
5399        $write_as_invlist{$addr} = delete $args{'Write_As_Invlist'};# No default
5400        my $ucd = delete $args{'UCD'};
5401
5402        my $description = delete $args{'Description'};
5403        my $ok_as_filename = delete $args{'OK_as_Filename'};
5404        my $loose_match = delete $args{'Fuzzy'};
5405        my $note = delete $args{'Note'};
5406        my $make_re_pod_entry = delete $args{'Re_Pod_Entry'};
5407        my $perl_extension = delete $args{'Perl_Extension'};
5408        my $suppression_reason = delete $args{'Suppression_Reason'};
5409
5410        # Shouldn't have any left over
5411        Carp::carp_extra_args(\%args) if main::DEBUG && %args;
5412
5413        # Can't use || above because conceivably the name could be 0, and
5414        # can't use // operator in case this program gets used in Perl 5.8
5415        $full_name{$addr} = $name{$addr} if ! defined $full_name{$addr};
5416        $output_range_counts{$addr} = $output_range_counts if
5417                                        ! defined $output_range_counts{$addr};
5418
5419        $aliases{$addr} = [ ];
5420        $comment{$addr} = [ ];
5421        $description{$addr} = [ ];
5422        $note{$addr} = [ ];
5423        $file_path{$addr} = [ ];
5424        $locked{$addr} = "";
5425        $has_dependency{$addr} = 0;
5426
5427        push @{$description{$addr}}, $description if $description;
5428        push @{$note{$addr}}, $note if $note;
5429
5430        if ($fate{$addr} == $PLACEHOLDER) {
5431
5432            # A placeholder table doesn't get documented, is a perl extension,
5433            # and quite likely will be empty
5434            $make_re_pod_entry = 0 if ! defined $make_re_pod_entry;
5435            $perl_extension = 1 if ! defined $perl_extension;
5436            $ucd = 0 if ! defined $ucd;
5437            push @tables_that_may_be_empty, $complete_name{$addr};
5438            $self->add_comment(<<END);
5439This is a placeholder because it is not in Version $string_version of Unicode,
5440but is needed by the Perl core to work gracefully.  Because it is not in this
5441version of Unicode, it will not be listed in $pod_file.pod
5442END
5443        }
5444        elsif (exists $why_suppressed{$complete_name}
5445                # Don't suppress if overridden
5446                && ! grep { $_ eq $complete_name{$addr} }
5447                                                    @output_mapped_properties)
5448        {
5449            $fate{$addr} = $SUPPRESSED;
5450        }
5451        elsif ($fate{$addr} == $SUPPRESSED) {
5452            Carp::my_carp_bug("Need reason for suppressing") unless $suppression_reason;
5453            # Though currently unused
5454        }
5455        elsif ($suppression_reason) {
5456            Carp::my_carp_bug("A reason was given for suppressing, but not suppressed");
5457        }
5458
5459        # If hasn't set its status already, see if it is on one of the
5460        # lists of properties or tables that have particular statuses; if
5461        # not, is normal.  The lists are prioritized so the most serious
5462        # ones are checked first
5463        if (! $status{$addr}) {
5464            if (exists $why_deprecated{$complete_name}) {
5465                $status{$addr} = $DEPRECATED;
5466            }
5467            elsif (exists $why_stabilized{$complete_name}) {
5468                $status{$addr} = $STABILIZED;
5469            }
5470            elsif (exists $why_obsolete{$complete_name}) {
5471                $status{$addr} = $OBSOLETE;
5472            }
5473
5474            # Existence above doesn't necessarily mean there is a message
5475            # associated with it.  Use the most serious message.
5476            if ($status{$addr}) {
5477                if ($why_deprecated{$complete_name}) {
5478                    $status_info{$addr}
5479                                = $why_deprecated{$complete_name};
5480                }
5481                elsif ($why_stabilized{$complete_name}) {
5482                    $status_info{$addr}
5483                                = $why_stabilized{$complete_name};
5484                }
5485                elsif ($why_obsolete{$complete_name}) {
5486                    $status_info{$addr}
5487                                = $why_obsolete{$complete_name};
5488                }
5489            }
5490        }
5491
5492        $perl_extension{$addr} = $perl_extension || 0;
5493
5494        # Don't list a property by default that is internal only
5495        if ($fate{$addr} > $MAP_PROXIED) {
5496            $make_re_pod_entry = 0 if ! defined $make_re_pod_entry;
5497            $ucd = 0 if ! defined $ucd;
5498        }
5499        else {
5500            $ucd = 1 if ! defined $ucd;
5501        }
5502
5503        # By convention what typically gets printed only or first is what's
5504        # first in the list, so put the full name there for good output
5505        # clarity.  Other routines rely on the full name being first on the
5506        # list
5507        $self->add_alias($full_name{$addr},
5508                            OK_as_Filename => $ok_as_filename,
5509                            Fuzzy => $loose_match,
5510                            Re_Pod_Entry => $make_re_pod_entry,
5511                            Status => $status{$addr},
5512                            UCD => $ucd,
5513                            );
5514
5515        # Then comes the other name, if meaningfully different.
5516        if (standardize($full_name{$addr}) ne standardize($name{$addr})) {
5517            $self->add_alias($name{$addr},
5518                            OK_as_Filename => $ok_as_filename,
5519                            Fuzzy => $loose_match,
5520                            Re_Pod_Entry => $make_re_pod_entry,
5521                            Status => $status{$addr},
5522                            UCD => $ucd,
5523                            );
5524        }
5525
5526        return $self;
5527    }
5528
5529    # Here are the methods that are required to be defined by any derived
5530    # class
5531    for my $sub (qw(
5532                    handle_special_range
5533                    append_to_body
5534                    pre_body
5535                ))
5536                # write() knows how to write out normal ranges, but it calls
5537                # handle_special_range() when it encounters a non-normal one.
5538                # append_to_body() is called by it after it has handled all
5539                # ranges to add anything after the main portion of the table.
5540                # And finally, pre_body() is called after all this to build up
5541                # anything that should appear before the main portion of the
5542                # table.  Doing it this way allows things in the middle to
5543                # affect what should appear before the main portion of the
5544                # table.
5545    {
5546        no strict "refs";
5547        *$sub = sub {
5548            Carp::my_carp_bug( __LINE__
5549                              . ": Must create method '$sub()' for "
5550                              . ref shift);
5551            return;
5552        }
5553    }
5554
5555    use overload
5556        fallback => 0,
5557        "." => \&main::_operator_dot,
5558        ".=" => \&main::_operator_dot_equal,
5559        '!=' => \&main::_operator_not_equal,
5560        '==' => \&main::_operator_equal,
5561    ;
5562
5563    sub ranges {
5564        # Returns the array of ranges associated with this table.
5565
5566        no overloading;
5567        return $range_list{pack 'J', shift}->ranges;
5568    }
5569
5570    sub add_alias {
5571        # Add a synonym for this table.
5572
5573        return Carp::carp_too_few_args(\@_, 3) if main::DEBUG && @_ < 3;
5574
5575        my $self = shift;
5576        my $name = shift;       # The name to add.
5577        my $pointer = shift;    # What the alias hash should point to.  For
5578                                # map tables, this is the parent property;
5579                                # for match tables, it is the table itself.
5580
5581        my %args = @_;
5582        my $loose_match = delete $args{'Fuzzy'};
5583
5584        my $ok_as_filename = delete $args{'OK_as_Filename'};
5585        $ok_as_filename = 1 unless defined $ok_as_filename;
5586
5587        # An internal name does not get documented, unless overridden by the
5588        # input; same for making tests for it.
5589        my $status = delete $args{'Status'} || (($name =~ /^_/)
5590                                                ? $INTERNAL_ALIAS
5591                                                : $NORMAL);
5592        my $make_re_pod_entry = delete $args{'Re_Pod_Entry'}
5593                                            // (($status ne $INTERNAL_ALIAS)
5594                                               ? (($name =~ /^_/) ? $NO : $YES)
5595                                               : $NO);
5596        my $ucd = delete $args{'UCD'} // (($name =~ /^_/) ? 0 : 1);
5597
5598        Carp::carp_extra_args(\%args) if main::DEBUG && %args;
5599
5600        # Capitalize the first letter of the alias unless it is one of the CJK
5601        # ones which specifically begins with a lower 'k'.  Do this because
5602        # Unicode has varied whether they capitalize first letters or not, and
5603        # have later changed their minds and capitalized them, but not the
5604        # other way around.  So do it always and avoid changes from release to
5605        # release
5606        $name = ucfirst($name) unless $name =~ /^k[A-Z]/;
5607
5608        my $addr = do { no overloading; pack 'J', $self; };
5609
5610        # Figure out if should be loosely matched if not already specified.
5611        if (! defined $loose_match) {
5612
5613            # Is a loose_match if isn't null, and doesn't begin with an
5614            # underscore and isn't just a number
5615            if ($name ne ""
5616                && substr($name, 0, 1) ne '_'
5617                && $name !~ qr{^[0-9_.+-/]+$})
5618            {
5619                $loose_match = 1;
5620            }
5621            else {
5622                $loose_match = 0;
5623            }
5624        }
5625
5626        # If this alias has already been defined, do nothing.
5627        return if defined $find_table_from_alias{$addr}->{$name};
5628
5629        # That includes if it is standardly equivalent to an existing alias,
5630        # in which case, add this name to the list, so won't have to search
5631        # for it again.
5632        my $standard_name = main::standardize($name);
5633        if (defined $find_table_from_alias{$addr}->{$standard_name}) {
5634            $find_table_from_alias{$addr}->{$name}
5635                        = $find_table_from_alias{$addr}->{$standard_name};
5636            return;
5637        }
5638
5639        # Set the index hash for this alias for future quick reference.
5640        $find_table_from_alias{$addr}->{$name} = $pointer;
5641        $find_table_from_alias{$addr}->{$standard_name} = $pointer;
5642        local $to_trace = 0 if main::DEBUG;
5643        trace "adding alias $name to $pointer" if main::DEBUG && $to_trace;
5644        trace "adding alias $standard_name to $pointer" if main::DEBUG && $to_trace;
5645
5646
5647        # Put the new alias at the end of the list of aliases unless the final
5648        # element begins with an underscore (meaning it is for internal perl
5649        # use) or is all numeric, in which case, put the new one before that
5650        # one.  This floats any all-numeric or underscore-beginning aliases to
5651        # the end.  This is done so that they are listed last in output lists,
5652        # to encourage the user to use a better name (either more descriptive
5653        # or not an internal-only one) instead.  This ordering is relied on
5654        # implicitly elsewhere in this program, like in short_name()
5655        my $list = $aliases{$addr};
5656        my $insert_position = (@$list == 0
5657                                || (substr($list->[-1]->name, 0, 1) ne '_'
5658                                    && $list->[-1]->name =~ /\D/))
5659                            ? @$list
5660                            : @$list - 1;
5661        splice @$list,
5662                $insert_position,
5663                0,
5664                Alias->new($name, $loose_match, $make_re_pod_entry,
5665                           $ok_as_filename, $status, $ucd);
5666
5667        # This name may be shorter than any existing ones, so clear the cache
5668        # of the shortest, so will have to be recalculated.
5669        no overloading;
5670        undef $short_name{pack 'J', $self};
5671        return;
5672    }
5673
5674    sub short_name($self, $nominal_length_ptr=undef) {
5675        # Returns a name suitable for use as the base part of a file name.
5676        # That is, shorter wins.  It can return undef if there is no suitable
5677        # name.  The name has all non-essential underscores removed.
5678
5679        # The optional second parameter is a reference to a scalar in which
5680        # this routine will store the length the returned name had before the
5681        # underscores were removed, or undef if the return is undef.
5682
5683        # The shortest name can change if new aliases are added.  So using
5684        # this should be deferred until after all these are added.  The code
5685        # that does that should clear this one's cache.
5686        # Any name with alphabetics is preferred over an all numeric one, even
5687        # if longer.
5688
5689        my $addr = do { no overloading; pack 'J', $self; };
5690
5691        # For efficiency, don't recalculate, but this means that adding new
5692        # aliases could change what the shortest is, so the code that does
5693        # that needs to undef this.
5694        if (defined $short_name{$addr}) {
5695            if ($nominal_length_ptr) {
5696                $$nominal_length_ptr = $nominal_short_name_length{$addr};
5697            }
5698            return $short_name{$addr};
5699        }
5700
5701        # Look at each alias
5702        my $is_last_resort = 0;
5703        my $deprecated_or_discouraged
5704                                = qr/ ^ (?: $DEPRECATED | $DISCOURAGED ) $/x;
5705        foreach my $alias ($self->aliases()) {
5706
5707            # Don't use an alias that isn't ok to use for an external name.
5708            next if ! $alias->ok_as_filename;
5709
5710            my $name = main::Standardize($alias->name);
5711            trace $self, $name if main::DEBUG && $to_trace;
5712
5713            # Take the first one, or any non-deprecated non-discouraged one
5714            # over one that is, or a shorter one that isn't numeric.  This
5715            # relies on numeric aliases always being last in the array
5716            # returned by aliases().  Any alpha one will have precedence.
5717            if (   ! defined $short_name{$addr}
5718                || (   $is_last_resort
5719                    && $alias->status !~ $deprecated_or_discouraged)
5720                || ($name =~ /\D/
5721                    && length($name) < length($short_name{$addr})))
5722            {
5723                # Remove interior underscores.
5724                ($short_name{$addr} = $name) =~ s/ (?<= . ) _ (?= . ) //xg;
5725
5726                $nominal_short_name_length{$addr} = length $name;
5727                $is_last_resort = $alias->status =~ $deprecated_or_discouraged;
5728            }
5729        }
5730
5731        # If the short name isn't a nice one, perhaps an equivalent table has
5732        # a better one.
5733        if (   $self->can('children')
5734            && (   ! defined $short_name{$addr}
5735                || $short_name{$addr} eq ""
5736                || $short_name{$addr} eq "_"))
5737        {
5738            my $return;
5739            foreach my $follower ($self->children) {    # All equivalents
5740                my $follower_name = $follower->short_name;
5741                next unless defined $follower_name;
5742
5743                # Anything (except undefined) is better than underscore or
5744                # empty
5745                if (! defined $return || $return eq "_") {
5746                    $return = $follower_name;
5747                    next;
5748                }
5749
5750                # If the new follower name isn't "_" and is shorter than the
5751                # current best one, prefer the new one.
5752                next if $follower_name eq "_";
5753                next if length $follower_name > length $return;
5754                $return = $follower_name;
5755            }
5756            $short_name{$addr} = $return if defined $return;
5757        }
5758
5759        # If no suitable external name return undef
5760        if (! defined $short_name{$addr}) {
5761            $$nominal_length_ptr = undef if $nominal_length_ptr;
5762            return;
5763        }
5764
5765        # Don't allow a null short name.
5766        if ($short_name{$addr} eq "") {
5767            $short_name{$addr} = '_';
5768            $nominal_short_name_length{$addr} = 1;
5769        }
5770
5771        trace $self, $short_name{$addr} if main::DEBUG && $to_trace;
5772
5773        if ($nominal_length_ptr) {
5774            $$nominal_length_ptr = $nominal_short_name_length{$addr};
5775        }
5776        return $short_name{$addr};
5777    }
5778
5779    sub external_name($self) {
5780        # Returns the external name that this table should be known by.  This
5781        # is usually the short_name, but not if the short_name is undefined,
5782        # in which case the external_name is arbitrarily set to the
5783        # underscore.
5784
5785        my $short = $self->short_name;
5786        return $short if defined $short;
5787
5788        return '_';
5789    }
5790
5791    sub add_description($self, $description) { # Adds the parameter as a short description.
5792        no overloading;
5793        push @{$description{pack 'J', $self}}, $description;
5794
5795        return;
5796    }
5797
5798    sub add_note($self, $note) { # Adds the parameter as a short note.
5799        no overloading;
5800        push @{$note{pack 'J', $self}}, $note;
5801
5802        return;
5803    }
5804
5805    sub add_comment($self, $comment) { # Adds the parameter as a comment.
5806
5807        return unless $debugging_build;
5808
5809        chomp $comment;
5810
5811        no overloading;
5812        push @{$comment{pack 'J', $self}}, $comment;
5813
5814        return;
5815    }
5816
5817    sub comment($self) {
5818        # Return the current comment for this table.  If called in list
5819        # context, returns the array of comments.  In scalar, returns a string
5820        # of each element joined together with a period ending each.
5821
5822        my $addr = do { no overloading; pack 'J', $self; };
5823        my @list = @{$comment{$addr}};
5824        return @list if wantarray;
5825        my $return = "";
5826        foreach my $sentence (@list) {
5827            $return .= '.  ' if $return;
5828            $return .= $sentence;
5829            $return =~ s/\.$//;
5830        }
5831        $return .= '.' if $return;
5832        return $return;
5833    }
5834
5835    sub initialize($self, $initialization) {
5836        # Initialize the table with the argument which is any valid
5837        # initialization for range lists.
5838
5839        my $addr = do { no overloading; pack 'J', $self; };
5840
5841        # Replace the current range list with a new one of the same exact
5842        # type.
5843        my $class = ref $range_list{$addr};
5844        $range_list{$addr} = $class->new(Owner => $self,
5845                                        Initialize => $initialization);
5846        return;
5847
5848    }
5849
5850    sub header($self) {
5851        # The header that is output for the table in the file it is written
5852        # in.
5853        my $return = "";
5854        $return .= $DEVELOPMENT_ONLY if $compare_versions;
5855        $return .= $HEADER;
5856        return $return;
5857    }
5858
5859    sub merge_single_annotation_line ($output, $annotation, $annotation_column) {
5860
5861        # This appends an annotation comment, $annotation, to $output,
5862        # starting in or after column $annotation_column, removing any
5863        # pre-existing comment from $output.
5864
5865        $annotation =~ s/^ \s* \# \  //x;
5866        $output =~ s/ \s* ( \# \N* )? \n //x;
5867        $output = Text::Tabs::expand($output);
5868
5869        my $spaces = $annotation_column - length $output;
5870        $spaces = 2 if $spaces < 0;  # Have 2 blanks before the comment
5871
5872        $output = sprintf "%s%*s# %s",
5873                            $output,
5874                            $spaces,
5875                            " ",
5876                            $annotation;
5877        return Text::Tabs::unexpand $output;
5878    }
5879
5880    sub write($self, $use_adjustments=0, $suppress_value=0) {
5881        # Write a representation of the table to its file.  It calls several
5882        # functions furnished by sub-classes of this abstract base class to
5883        # handle non-normal ranges, to add stuff before the table, and at its
5884        # end.  If the table is to be written so that adjustments are
5885        # required, this does that conversion.
5886
5887
5888        # $use_adjustments ? output in adjusted format or not
5889        # $suppress_value Optional, if the value associated with
5890        # a range equals this one, don't write
5891        # the range
5892
5893        my $addr = do { no overloading; pack 'J', $self; };
5894        my $write_as_invlist = $write_as_invlist{$addr};
5895
5896        # Start with the header
5897        my @HEADER = $self->header;
5898
5899        # Then the comments
5900        push @HEADER, "\n", main::simple_fold($comment{$addr}, '# '), "\n"
5901                                                        if $comment{$addr};
5902
5903        # Things discovered processing the main body of the document may
5904        # affect what gets output before it, therefore pre_body() isn't called
5905        # until after all other processing of the table is done.
5906
5907        # The main body looks like a 'here' document.  If there are comments,
5908        # get rid of them when processing it.
5909        my @OUT;
5910        if ($annotate || $output_range_counts) {
5911            # Use the line below in Perls that don't have /r
5912            #push @OUT, 'return join "\n",  map { s/\s*#.*//mg; $_ } split "\n", <<\'END\';' . "\n";
5913            push @OUT, "return <<'END' =~ s/\\s*#.*//mgr;\n";
5914        } else {
5915            push @OUT, "return <<'END';\n";
5916        }
5917
5918        if ($range_list{$addr}->is_empty) {
5919
5920            # This is a kludge for empty tables to silence a warning in
5921            # utf8.c, which can't really deal with empty tables, but it can
5922            # deal with a table that matches nothing, as the inverse of 'All'
5923            # does.
5924            push @OUT, "!Unicode::UCD::All\n";
5925        }
5926        elsif ($self->name eq 'N'
5927
5928               # To save disk space and table cache space, avoid putting out
5929               # binary N tables, but instead create a file which just inverts
5930               # the Y table.  Since the file will still exist and occupy a
5931               # certain number of blocks, might as well output the whole
5932               # thing if it all will fit in one block.   The number of
5933               # ranges below is an approximate number for that.
5934               && ($self->property->type == $BINARY
5935                   || $self->property->type == $FORCED_BINARY)
5936               # && $self->property->tables == 2  Can't do this because the
5937               #        non-binary properties, like NFDQC aren't specifiable
5938               #        by the notation
5939               && $range_list{$addr}->ranges > 15
5940               && ! $annotate)  # Under --annotate, want to see everything
5941        {
5942            push @OUT, "!Unicode::UCD::" . $self->property->name . "\n";
5943        }
5944        else {
5945            my $range_size_1 = $range_size_1{$addr};
5946
5947            # To make it more readable, use a minimum indentation
5948            my $comment_indent;
5949
5950            # These are used only in $annotate option
5951            my $format;         # e.g. $HEX_ADJUST_FORMAT
5952            my $include_name;   # ? Include the character's name in the
5953                                # annotation?
5954            my $include_cp;     # ? Include its code point
5955
5956            if (! $annotate) {
5957                $comment_indent = ($self->isa('Map_Table'))
5958                                  ? 24
5959                                  : ($write_as_invlist)
5960                                    ? 8
5961                                    : 16;
5962            }
5963            else {
5964                $format = $self->format;
5965
5966                # The name of the character is output only for tables that
5967                # don't already include the name in the output.
5968                my $property = $self->property;
5969                $include_name =
5970                    !  ($property == $perl_charname
5971                        || $property == main::property_ref('Unicode_1_Name')
5972                        || $property == main::property_ref('Name')
5973                        || $property == main::property_ref('Name_Alias')
5974                       );
5975
5976                # Don't include the code point in the annotation where all
5977                # lines are a single code point, so it can be easily found in
5978                # the first column
5979                $include_cp = ! $range_size_1;
5980
5981                if (! $self->isa('Map_Table')) {
5982                    $comment_indent = ($write_as_invlist) ? 8 : 16;
5983                }
5984                else {
5985                    $comment_indent = 16;
5986
5987                    # There are just a few short ranges in this table, so no
5988                    # need to include the code point in the annotation.
5989                    $include_cp = 0 if $format eq $DECOMP_STRING_FORMAT;
5990
5991                    # We're trying to get this to look good, as the whole
5992                    # point is to make human-readable tables.  It is easier to
5993                    # read if almost all the annotation comments begin in the
5994                    # same column.  Map tables have varying width maps, so can
5995                    # create a jagged comment appearance.  This code does a
5996                    # preliminary pass through these tables looking for the
5997                    # maximum width map in each, and causing the comments to
5998                    # begin just to the right of that.  However, if the
5999                    # comments begin too far to the right of most lines, it's
6000                    # hard to line them up horizontally with their real data.
6001                    # Therefore we ignore the longest outliers
6002                    my $ignore_longest_X_percent = 2;  # Discard longest X%
6003
6004                    # Each key in this hash is a width of at least one of the
6005                    # maps in the table.  Its value is how many lines have
6006                    # that width.
6007                    my %widths;
6008
6009                    # We won't space things further left than one tab stop
6010                    # after the rest of the line; initializing it to that
6011                    # number saves some work.
6012                    my $max_map_width = 8;
6013
6014                    # Fill in the %widths hash
6015                    my $total = 0;
6016                    for my $set ($range_list{$addr}->ranges) {
6017                        my $value = $set->value;
6018
6019                        # These range types don't appear in the main table
6020                        next if $set->type == 0
6021                                && defined $suppress_value
6022                                && $value eq $suppress_value;
6023                        next if $set->type == $MULTI_CP
6024                                || $set->type == $NULL;
6025
6026                        # Include 2 spaces before the beginning of the
6027                        # comment
6028                        my $this_width = length($value) + 2;
6029
6030                        # Ranges of the remaining non-zero types usually
6031                        # occupy just one line (maybe occasionally two, but
6032                        # this doesn't have to be dead accurate).  This is
6033                        # because these ranges are like "unassigned code
6034                        # points"
6035                        my $count = ($set->type != 0)
6036                                    ? 1
6037                                    : $set->end - $set->start + 1;
6038                        $widths{$this_width} += $count;
6039                        $total += $count;
6040                        $max_map_width = $this_width
6041                                            if $max_map_width < $this_width;
6042                    }
6043
6044                    # If the widest map gives us less than two tab stops
6045                    # worth, just take it as-is.
6046                    if ($max_map_width > 16) {
6047
6048                        # Otherwise go through %widths until we have included
6049                        # the desired percentage of lines in the whole table.
6050                        my $running_total = 0;
6051                        foreach my $width (sort { $a <=> $b } keys %widths)
6052                        {
6053                            $running_total += $widths{$width};
6054                            use integer;
6055                            if ($running_total * 100 / $total
6056                                            >= 100 - $ignore_longest_X_percent)
6057                            {
6058                                $max_map_width = $width;
6059                                last;
6060                            }
6061                        }
6062                    }
6063                    $comment_indent += $max_map_width;
6064                }
6065            }
6066
6067            # Values for previous time through the loop.  Initialize to
6068            # something that won't be adjacent to the first iteration;
6069            # only $previous_end matters for that.
6070            my $previous_start;
6071            my $previous_end = -2;
6072            my $previous_value;
6073
6074            # Values for next time through the portion of the loop that splits
6075            # the range.  0 in $next_start means there is no remaining portion
6076            # to deal with.
6077            my $next_start = 0;
6078            my $next_end;
6079            my $next_value;
6080            my $offset = 0;
6081            my $invlist_count = 0;
6082
6083            my $output_value_in_hex = $self->isa('Map_Table')
6084                                && ($self->format eq $HEX_ADJUST_FORMAT
6085                                    || $self->to_output_map == $EXTERNAL_MAP);
6086            # Use leading zeroes just for files whose format should not be
6087            # changed from what it has been.  Otherwise, they just take up
6088            # space and time to process.
6089            my $hex_format = ($self->isa('Map_Table')
6090                              && $self->to_output_map == $EXTERNAL_MAP)
6091                             ? "%04X"
6092                             : "%X";
6093
6094            # The values for some of these tables are stored in mktables as
6095            # hex strings.  Normally, these are just output as strings without
6096            # change, but when we are doing adjustments, we have to operate on
6097            # these numerically, so we convert those to decimal to do that,
6098            # and back to hex for output
6099            my $convert_map_to_from_hex = 0;
6100            my $output_map_in_hex = 0;
6101            if ($self->isa('Map_Table')) {
6102                $convert_map_to_from_hex
6103                   = ($use_adjustments && $self->format eq $HEX_ADJUST_FORMAT)
6104                      || ($annotate && $self->format eq $HEX_FORMAT);
6105                $output_map_in_hex = $convert_map_to_from_hex
6106                                 || $self->format eq $HEX_FORMAT;
6107            }
6108
6109            # To store any annotations about the characters.
6110            my @annotation;
6111
6112            # Output each range as part of the here document.
6113            RANGE:
6114            for my $set ($range_list{$addr}->ranges) {
6115                if ($set->type != 0) {
6116                    $self->handle_special_range($set);
6117                    next RANGE;
6118                }
6119                my $start = $set->start;
6120                my $end   = $set->end;
6121                my $value  = $set->value;
6122
6123                # Don't output ranges whose value is the one to suppress
6124                next RANGE if defined $suppress_value
6125                              && $value eq $suppress_value;
6126
6127                $value = CORE::hex $value if $convert_map_to_from_hex;
6128
6129
6130                {   # This bare block encloses the scope where we may need to
6131                    # 'redo' to.  Consider a table that is to be written out
6132                    # using single item ranges.  This is given in the
6133                    # $range_size_1 boolean.  To accomplish this, we split the
6134                    # range each time through the loop into two portions, the
6135                    # first item, and the rest.  We handle that first item
6136                    # this time in the loop, and 'redo' to repeat the process
6137                    # for the rest of the range.
6138                    #
6139                    # We may also have to do it, with other special handling,
6140                    # if the table has adjustments.  Consider the table that
6141                    # contains the lowercasing maps.  mktables stores the
6142                    # ASCII range ones as 26 ranges:
6143                    #       ord('A') => ord('a'), .. ord('Z') => ord('z')
6144                    # For compactness, the table that gets written has this as
6145                    # just one range
6146                    #       ( ord('A') .. ord('Z') ) => ord('a')
6147                    # and the software that reads the tables is smart enough
6148                    # to "connect the dots".  This change is accomplished in
6149                    # this loop by looking to see if the current iteration
6150                    # fits the paradigm of the previous iteration, and if so,
6151                    # we merge them by replacing the final output item with
6152                    # the merged data.  Repeated 25 times, this gets A-Z.  But
6153                    # we also have to make sure we don't screw up cases where
6154                    # we have internally stored
6155                    #       ( 0x1C4 .. 0x1C6 ) => 0x1C5
6156                    # This single internal range has to be output as 3 ranges,
6157                    # which is done by splitting, like we do for $range_size_1
6158                    # tables.  (There are very few of such ranges that need to
6159                    # be split, so the gain of doing the combining of other
6160                    # ranges far outweighs the splitting of these.)  The
6161                    # values to use for the redo at the end of this block are
6162                    # set up just below in the scalars whose names begin with
6163                    # '$next_'.
6164
6165                    if (($use_adjustments || $range_size_1) && $end != $start)
6166                    {
6167                        $next_start = $start + 1;
6168                        $next_end = $end;
6169                        $next_value = $value;
6170                        $end = $start;
6171                    }
6172
6173                    if ($use_adjustments && ! $range_size_1) {
6174
6175                        # If this range is adjacent to the previous one, and
6176                        # the values in each are integers that are also
6177                        # adjacent (differ by 1), then this range really
6178                        # extends the previous one that is already in element
6179                        # $OUT[-1].  So we pop that element, and pretend that
6180                        # the range starts with whatever it started with.
6181                        # $offset is incremented by 1 each time so that it
6182                        # gives the current offset from the first element in
6183                        # the accumulating range, and we keep in $value the
6184                        # value of that first element.
6185                        if ($start == $previous_end + 1
6186                            && $value =~ /^ -? \d+ $/xa
6187                            && $previous_value =~ /^ -? \d+ $/xa
6188                            && ($value == ($previous_value + ++$offset)))
6189                        {
6190                            pop @OUT;
6191                            $start = $previous_start;
6192                            $value = $previous_value;
6193                        }
6194                        else {
6195                            $offset = 0;
6196                            if (@annotation == 1) {
6197                                $OUT[-1] = merge_single_annotation_line(
6198                                    $OUT[-1], $annotation[0], $comment_indent);
6199                            }
6200                            else {
6201                                push @OUT, @annotation;
6202                            }
6203                        }
6204                        undef @annotation;
6205
6206                        # Save the current values for the next time through
6207                        # the loop.
6208                        $previous_start = $start;
6209                        $previous_end = $end;
6210                        $previous_value = $value;
6211                    }
6212
6213                    if ($write_as_invlist) {
6214                        if (   $previous_end > 0
6215                            && $output_range_counts{$addr})
6216                        {
6217                            my $complement_count = $start - $previous_end - 1;
6218                            if ($complement_count > 1) {
6219                                $OUT[-1] = merge_single_annotation_line(
6220                                    $OUT[-1],
6221                                       "#"
6222                                     . (" " x 17)
6223                                     . "["
6224                                     .  main::clarify_code_point_count(
6225                                                            $complement_count)
6226                                      . "] in complement\n",
6227                                    $comment_indent);
6228                            }
6229                        }
6230
6231                        # Inversion list format has a single number per line,
6232                        # the starting code point of a range that matches the
6233                        # property
6234                        push @OUT, $start, "\n";
6235                        $invlist_count++;
6236
6237                        # Add a comment with the size of the range, if
6238                        # requested.
6239                        if ($output_range_counts{$addr}) {
6240                            $OUT[-1] = merge_single_annotation_line(
6241                                    $OUT[-1],
6242                                    "# ["
6243                                      . main::clarify_code_point_count($end - $start + 1)
6244                                      . "]\n",
6245                                    $comment_indent);
6246                        }
6247                    }
6248                    elsif ($start != $end) { # If there is a range
6249                        if ($end == $MAX_WORKING_CODEPOINT) {
6250                            push @OUT, sprintf "$hex_format\t$hex_format",
6251                                                $start,
6252                                                $MAX_PLATFORM_CODEPOINT;
6253                        }
6254                        else {
6255                            push @OUT, sprintf "$hex_format\t$hex_format",
6256                                                $start,       $end;
6257                        }
6258                        if (length $value) {
6259                            if ($convert_map_to_from_hex) {
6260                                $OUT[-1] .= sprintf "\t$hex_format\n", $value;
6261                            }
6262                            else {
6263                                $OUT[-1] .= "\t$value\n";
6264                            }
6265                        }
6266
6267                        # Add a comment with the size of the range, if
6268                        # requested.
6269                        if ($output_range_counts{$addr}) {
6270                            $OUT[-1] = merge_single_annotation_line(
6271                                    $OUT[-1],
6272                                    "# ["
6273                                      . main::clarify_code_point_count($end - $start + 1)
6274                                      . "]\n",
6275                                    $comment_indent);
6276                        }
6277                    }
6278                    else { # Here to output a single code point per line.
6279
6280                        # Use any passed in subroutine to output.
6281                        if (ref $range_size_1 eq 'CODE') {
6282                            for my $i ($start .. $end) {
6283                                push @OUT, &{$range_size_1}($i, $value);
6284                            }
6285                        }
6286                        else {
6287
6288                            # Here, caller is ok with default output.
6289                            for (my $i = $start; $i <= $end; $i++) {
6290                                if ($convert_map_to_from_hex) {
6291                                    push @OUT,
6292                                        sprintf "$hex_format\t\t$hex_format\n",
6293                                                 $i,            $value;
6294                                }
6295                                else {
6296                                    push @OUT, sprintf $hex_format, $i;
6297                                    $OUT[-1] .= "\t\t$value" if $value ne "";
6298                                    $OUT[-1] .= "\n";
6299                                }
6300                            }
6301                        }
6302                    }
6303
6304                    if ($annotate) {
6305                        for (my $i = $start; $i <= $end; $i++) {
6306                            my $annotation = "";
6307
6308                            # Get character information if don't have it already
6309                            main::populate_char_info($i)
6310                                                     if ! defined $viacode[$i];
6311                            my $type = $annotate_char_type[$i];
6312
6313                            # Figure out if should output the next code points
6314                            # as part of a range or not.  If this is not in an
6315                            # annotation range, then won't output as a range,
6316                            # so returns $i.  Otherwise use the end of the
6317                            # annotation range, but no further than the
6318                            # maximum possible end point of the loop.
6319                            my $range_end =
6320                                        $range_size_1
6321                                        ? $start
6322                                        : main::min(
6323                                          $annotate_ranges->value_of($i) || $i,
6324                                          $end);
6325
6326                            # Use a range if it is a range, and either is one
6327                            # of the special annotation ranges, or the range
6328                            # is at most 3 long.  This last case causes the
6329                            # algorithmically named code points to be output
6330                            # individually in spans of at most 3, as they are
6331                            # the ones whose $type is > 0.
6332                            if ($range_end != $i
6333                                && ( $type < 0 || $range_end - $i > 2))
6334                            {
6335                                # Here is to output a range.  We don't allow a
6336                                # caller-specified output format--just use the
6337                                # standard one.
6338                                my $range_name = $viacode[$i];
6339
6340                                # For the code points which end in their hex
6341                                # value, we eliminate that from the output
6342                                # annotation, and capitalize only the first
6343                                # letter of each word.
6344                                if ($type == $CP_IN_NAME) {
6345                                    my $hex = sprintf $hex_format, $i;
6346                                    $range_name =~ s/-$hex$//;
6347                                    my @words = split " ", $range_name;
6348                                    for my $word (@words) {
6349                                        $word =
6350                                          ucfirst(lc($word)) if $word ne 'CJK';
6351                                    }
6352                                    $range_name = join " ", @words;
6353                                }
6354                                elsif ($type == $HANGUL_SYLLABLE) {
6355                                    $range_name = "Hangul Syllable";
6356                                }
6357
6358                                # If the annotation would just repeat what's
6359                                # already being output as the range, skip it.
6360                                # (When an inversion list is being written, it
6361                                # isn't a repeat, as that always is in
6362                                # decimal)
6363                                if (   $write_as_invlist
6364                                    || $i != $start
6365                                    || $range_end < $end)
6366                                {
6367                                    if ($range_end < $MAX_WORKING_CODEPOINT)
6368                                    {
6369                                        $annotation = sprintf "%04X..%04X",
6370                                                              $i,   $range_end;
6371                                    }
6372                                    else {
6373                                        $annotation = sprintf "%04X..INFINITY",
6374                                                               $i;
6375                                    }
6376                                }
6377                                else { # Indent if not displaying code points
6378                                    $annotation = " " x 4;
6379                                }
6380
6381                                if ($range_name) {
6382                                    $annotation .= " $age[$i]" if $age[$i];
6383                                    $annotation .= " $range_name";
6384                                }
6385
6386                                # Include the number of code points in the
6387                                # range
6388                                my $count =
6389                                    main::clarify_code_point_count($range_end - $i + 1);
6390                                $annotation .= " [$count]\n";
6391
6392                                # Skip to the end of the range
6393                                $i = $range_end;
6394                            }
6395                            else { # Not in a range.
6396                                my $comment = "";
6397
6398                                # When outputting the names of each character,
6399                                # use the character itself if printable
6400                                $comment .= "'" . main::display_chr($i) . "' "
6401                                                            if $printable[$i];
6402
6403                                my $output_value = $value;
6404
6405                                # Determine the annotation
6406                                if ($format eq $DECOMP_STRING_FORMAT) {
6407
6408                                    # This is very specialized, with the type
6409                                    # of decomposition beginning the line
6410                                    # enclosed in <...>, and the code points
6411                                    # that the code point decomposes to
6412                                    # separated by blanks.  Create two
6413                                    # strings, one of the printable
6414                                    # characters, and one of their official
6415                                    # names.
6416                                    (my $map = $output_value)
6417                                                    =~ s/ \ * < .*? > \ +//x;
6418                                    my $tostr = "";
6419                                    my $to_name = "";
6420                                    my $to_chr = "";
6421                                    foreach my $to (split " ", $map) {
6422                                        $to = CORE::hex $to;
6423                                        $to_name .= " + " if $to_name;
6424                                        $to_chr .= main::display_chr($to);
6425                                        main::populate_char_info($to)
6426                                                    if ! defined $viacode[$to];
6427                                        $to_name .=  $viacode[$to];
6428                                    }
6429
6430                                    $comment .=
6431                                    "=> '$to_chr'; $viacode[$i] => $to_name";
6432                                }
6433                                else {
6434                                    $output_value += $i - $start
6435                                                   if $use_adjustments
6436                                                      # Don't try to adjust a
6437                                                      # non-integer
6438                                                   && $output_value !~ /[-\D]/;
6439
6440                                    if ($output_map_in_hex) {
6441                                        main::populate_char_info($output_value)
6442                                          if ! defined $viacode[$output_value];
6443                                        $comment .= " => '"
6444                                        . main::display_chr($output_value)
6445                                        . "'; " if $printable[$output_value];
6446                                    }
6447                                    if ($include_name && $viacode[$i]) {
6448                                        $comment .= " " if $comment;
6449                                        $comment .= $viacode[$i];
6450                                    }
6451                                    if ($output_map_in_hex) {
6452                                        $comment .=
6453                                                " => $viacode[$output_value]"
6454                                                    if $viacode[$output_value];
6455                                        $output_value = sprintf($hex_format,
6456                                                                $output_value);
6457                                    }
6458                                }
6459
6460                                if ($include_cp) {
6461                                    $annotation = sprintf "%04X %s", $i, $age[$i];
6462                                    if ($use_adjustments) {
6463                                        $annotation .= " => $output_value";
6464                                    }
6465                                }
6466
6467                                if ($comment ne "") {
6468                                    $annotation .= " " if $annotation ne "";
6469                                    $annotation .= $comment;
6470                                }
6471                                $annotation .= "\n" if $annotation ne "";
6472                            }
6473
6474                            if ($annotation ne "") {
6475                                push @annotation, (" " x $comment_indent)
6476                                                  .  "# $annotation";
6477                            }
6478                        }
6479
6480                        # If not adjusting, we don't have to go through the
6481                        # loop again to know that the annotation comes next
6482                        # in the output.
6483                        if (! $use_adjustments) {
6484                            if (@annotation == 1) {
6485                                $OUT[-1] = merge_single_annotation_line(
6486                                    $OUT[-1], $annotation[0], $comment_indent);
6487                            }
6488                            else {
6489                                push @OUT, map { Text::Tabs::unexpand $_ }
6490                                               @annotation;
6491                            }
6492                            undef @annotation;
6493                        }
6494                    }
6495
6496                    # Add the beginning of the range that doesn't match the
6497                    # property, except if the just added match range extends
6498                    # to infinity.  We do this after any annotations for the
6499                    # match range.
6500                    if ($write_as_invlist && $end < $MAX_WORKING_CODEPOINT) {
6501                        push @OUT, $end + 1, "\n";
6502                        $invlist_count++;
6503                    }
6504
6505                    # If we split the range, set up so the next time through
6506                    # we get the remainder, and redo.
6507                    if ($next_start) {
6508                        $start = $next_start;
6509                        $end = $next_end;
6510                        $value = $next_value;
6511                        $next_start = 0;
6512                        redo;
6513                    }
6514                }
6515            } # End of loop through all the table's ranges
6516
6517            push @OUT, @annotation; # Add orphaned annotation, if any
6518
6519            splice @OUT, 1, 0, "V$invlist_count\n" if $invlist_count;
6520        }
6521
6522        # Add anything that goes after the main body, but within the here
6523        # document,
6524        my $append_to_body = $self->append_to_body;
6525        push @OUT, $append_to_body if $append_to_body;
6526
6527        # And finish the here document.
6528        push @OUT, "END\n";
6529
6530        # Done with the main portion of the body.  Can now figure out what
6531        # should appear before it in the file.
6532        my $pre_body = $self->pre_body;
6533        push @HEADER, $pre_body, "\n" if $pre_body;
6534
6535        # All these files should have a .pl suffix added to them.
6536        my @file_with_pl = @{$file_path{$addr}};
6537        $file_with_pl[-1] .= '.pl';
6538
6539        main::write(\@file_with_pl,
6540                    $annotate,      # utf8 iff annotating
6541                    \@HEADER,
6542                    \@OUT);
6543        return;
6544    }
6545
6546    sub set_status($self, $status, $info) {    # Set the table's status
6547        # status The status enum value
6548        # info Any message associated with it.
6549        my $addr = do { no overloading; pack 'J', $self; };
6550
6551        $status{$addr} = $status;
6552        $status_info{$addr} = $info;
6553        return;
6554    }
6555
6556    sub set_fate($self, $fate, $reason=undef) {  # Set the fate of a table
6557        my $addr = do { no overloading; pack 'J', $self; };
6558
6559        return if $fate{$addr} == $fate;    # If no-op
6560
6561        # Can only change the ordinary fate, except if going to $MAP_PROXIED
6562        return if $fate{$addr} != $ORDINARY && $fate != $MAP_PROXIED;
6563
6564        $fate{$addr} = $fate;
6565
6566        # Don't document anything to do with a non-normal fated table
6567        if ($fate != $ORDINARY) {
6568            my $put_in_pod = ($fate == $MAP_PROXIED) ? 1 : 0;
6569            foreach my $alias ($self->aliases) {
6570                $alias->set_ucd($put_in_pod);
6571
6572                # MAP_PROXIED doesn't affect the match tables
6573                next if $fate == $MAP_PROXIED;
6574                $alias->set_make_re_pod_entry($put_in_pod);
6575            }
6576        }
6577
6578        # Save the reason for suppression for output
6579        if ($fate >= $SUPPRESSED) {
6580            $reason = "" unless defined $reason;
6581            $why_suppressed{$complete_name{$addr}} = $reason;
6582        }
6583
6584        return;
6585    }
6586
6587    sub lock($self) {
6588        # Don't allow changes to the table from now on.  This stores a stack
6589        # trace of where it was called, so that later attempts to modify it
6590        # can immediately show where it got locked.
6591        my $addr = do { no overloading; pack 'J', $self; };
6592
6593        $locked{$addr} = "";
6594
6595        my $line = (caller(0))[2];
6596        my $i = 1;
6597
6598        # Accumulate the stack trace
6599        while (1) {
6600            my ($pkg, $file, $caller_line, $caller) = caller $i++;
6601
6602            last unless defined $caller;
6603
6604            $locked{$addr} .= "    called from $caller() at line $line\n";
6605            $line = $caller_line;
6606        }
6607        $locked{$addr} .= "    called from main at line $line\n";
6608
6609        return;
6610    }
6611
6612    sub carp_if_locked($self) {
6613        # Return whether a table is locked or not, and, by the way, complain
6614        # if is locked
6615        my $addr = do { no overloading; pack 'J', $self; };
6616
6617        return 0 if ! $locked{$addr};
6618        Carp::my_carp_bug("Can't modify a locked table. Stack trace of locking:\n$locked{$addr}\n\n");
6619        return 1;
6620    }
6621
6622    sub set_file_path($self, @path) { # Set the final directory path for this table
6623        no overloading;
6624        @{$file_path{pack 'J', $self}} = @path;
6625        return
6626    }
6627
6628    # Accessors for the range list stored in this table.  First for
6629    # unconditional
6630    for my $sub (qw(
6631                    containing_range
6632                    contains
6633                    count
6634                    each_range
6635                    hash
6636                    is_empty
6637                    matches_identically_to
6638                    max
6639                    min
6640                    range_count
6641                    reset_each_range
6642                    type_of
6643                    value_of
6644                ))
6645    {
6646        no strict "refs";
6647        *$sub = sub {
6648            use strict "refs";
6649            my $self = shift;
6650            return $self->_range_list->$sub(@_);
6651        }
6652    }
6653
6654    # Then for ones that should fail if locked
6655    for my $sub (qw(
6656                    delete_range
6657                ))
6658    {
6659        no strict "refs";
6660        *$sub = sub {
6661            use strict "refs";
6662            my $self = shift;
6663
6664            return if $self->carp_if_locked;
6665            no overloading;
6666            return $self->_range_list->$sub(@_);
6667        }
6668    }
6669
6670} # End closure
6671
6672package Map_Table;
6673use parent '-norequire', '_Base_Table';
6674
6675# A Map Table is a table that contains the mappings from code points to
6676# values.  There are two weird cases:
6677# 1) Anomalous entries are ones that aren't maps of ranges of code points, but
6678#    are written in the table's file at the end of the table nonetheless.  It
6679#    requires specially constructed code to handle these; utf8.c can not read
6680#    these in, so they should not go in $map_directory.  As of this writing,
6681#    the only case that these happen is for named sequences used in
6682#    charnames.pm.   But this code doesn't enforce any syntax on these, so
6683#    something else could come along that uses it.
6684# 2) Specials are anything that doesn't fit syntactically into the body of the
6685#    table.  The ranges for these have a map type of non-zero.  The code below
6686#    knows about and handles each possible type.   In most cases, these are
6687#    written as part of the header.
6688#
6689# A map table deliberately can't be manipulated at will unlike match tables.
6690# This is because of the ambiguities having to do with what to do with
6691# overlapping code points.  And there just isn't a need for those things;
6692# what one wants to do is just query, add, replace, or delete mappings, plus
6693# write the final result.
6694# However, there is a method to get the list of possible ranges that aren't in
6695# this table to use for defaulting missing code point mappings.  And,
6696# map_add_or_replace_non_nulls() does allow one to add another table to this
6697# one, but it is clearly very specialized, and defined that the other's
6698# non-null values replace this one's if there is any overlap.
6699
6700sub trace { return main::trace(@_); }
6701
6702{ # Closure
6703
6704    main::setup_package();
6705
6706    my %default_map;
6707    # Many input files omit some entries; this gives what the mapping for the
6708    # missing entries should be
6709    main::set_access('default_map', \%default_map, 'r');
6710
6711    my %anomalous_entries;
6712    # Things that go in the body of the table which don't fit the normal
6713    # scheme of things, like having a range.  Not much can be done with these
6714    # once there except to output them.  This was created to handle named
6715    # sequences.
6716    main::set_access('anomalous_entry', \%anomalous_entries, 'a');
6717    main::set_access('anomalous_entries',       # Append singular, read plural
6718                    \%anomalous_entries,
6719                    'readable_array');
6720
6721    my %replacement_property;
6722    # Certain files are unused by Perl itself, and are kept only for backwards
6723    # compatibility for programs that used them before Unicode::UCD existed.
6724    # These are termed legacy properties.  At some point they may be removed,
6725    # but for now mark them as legacy.  If non empty, this is the name of the
6726    # property to use instead (i.e., the modern equivalent).
6727    main::set_access('replacement_property', \%replacement_property, 'r');
6728
6729    my %to_output_map;
6730    # Enum as to whether or not to write out this map table, and how:
6731    #   0               don't output
6732    #   $EXTERNAL_MAP   means its existence is noted in the documentation, and
6733    #                   it should not be removed nor its format changed.  This
6734    #                   is done for those files that have traditionally been
6735    #                   output.  Maps of legacy-only properties default to
6736    #                   this.
6737    #   $INTERNAL_MAP   means Perl reserves the right to do anything it wants
6738    #                   with this file
6739    #   $OUTPUT_ADJUSTED means that it is an $INTERNAL_MAP, and instead of
6740    #                   outputting the actual mappings as-is, we adjust things
6741    #                   to create a much more compact table. Only those few
6742    #                   tables where the mapping is convertible at least to an
6743    #                   integer and compacting makes a big difference should
6744    #                   have this.  Hence, the default is to not do this
6745    #                   unless the table's default mapping is to $CODE_POINT,
6746    #                   and the range size is not 1.
6747    main::set_access('to_output_map', \%to_output_map, 's');
6748
6749    sub new {
6750        my $class = shift;
6751        my $name = shift;
6752
6753        my %args = @_;
6754
6755        # Optional initialization data for the table.
6756        my $initialize = delete $args{'Initialize'};
6757
6758        my $default_map = delete $args{'Default_Map'};
6759        my $property = delete $args{'_Property'};
6760        my $full_name = delete $args{'Full_Name'};
6761        my $replacement_property = delete $args{'Replacement_Property'} // "";
6762        my $to_output_map = delete $args{'To_Output_Map'};
6763
6764        # Rest of parameters passed on; legacy properties have several common
6765        # other attributes
6766        if ($replacement_property) {
6767            $args{"Fate"} = $LEGACY_ONLY;
6768            $args{"Range_Size_1"} = 1;
6769            $args{"Perl_Extension"} = 1;
6770            $args{"UCD"} = 0;
6771        }
6772
6773        my $range_list = Range_Map->new(Owner => $property);
6774
6775        my $self = $class->SUPER::new(
6776                                    Name => $name,
6777                                    Complete_Name =>  $full_name,
6778                                    Full_Name => $full_name,
6779                                    _Property => $property,
6780                                    _Range_List => $range_list,
6781                                    Write_As_Invlist => 0,
6782                                    %args);
6783
6784        my $addr = do { no overloading; pack 'J', $self; };
6785
6786        $anomalous_entries{$addr} = [];
6787        $default_map{$addr} = $default_map;
6788        $replacement_property{$addr} = $replacement_property;
6789        $to_output_map = $EXTERNAL_MAP if ! defined $to_output_map
6790                                          && $replacement_property;
6791        $to_output_map{$addr} = $to_output_map;
6792
6793        $self->initialize($initialize) if defined $initialize;
6794
6795        return $self;
6796    }
6797
6798    use overload
6799        fallback => 0,
6800        qw("") => "_operator_stringify",
6801    ;
6802
6803    sub _operator_stringify {
6804        my $self = shift;
6805
6806        my $name = $self->property->full_name;
6807        $name = '""' if $name eq "";
6808        return "Map table for Property '$name'";
6809    }
6810
6811    sub add_alias {
6812        # Add a synonym for this table (which means the property itself)
6813        my $self = shift;
6814        my $name = shift;
6815        # Rest of parameters passed on.
6816
6817        $self->SUPER::add_alias($name, $self->property, @_);
6818        return;
6819    }
6820
6821    sub add_map {
6822        # Add a range of code points to the list of specially-handled code
6823        # points.  $MULTI_CP is assumed if the type of special is not passed
6824        # in.
6825
6826        my $self = shift;
6827        my $lower = shift;
6828        my $upper = shift;
6829        my $string = shift;
6830        my %args = @_;
6831
6832        my $type = delete $args{'Type'} || 0;
6833        # Rest of parameters passed on
6834
6835        # Can't change the table if locked.
6836        return if $self->carp_if_locked;
6837
6838        my $addr = do { no overloading; pack 'J', $self; };
6839
6840        $self->_range_list->add_map($lower, $upper,
6841                                    $string,
6842                                    @_,
6843                                    Type => $type);
6844        return;
6845    }
6846
6847    sub append_to_body($self) {
6848        # Adds to the written HERE document of the table's body any anomalous
6849        # entries in the table..
6850        my $addr = do { no overloading; pack 'J', $self; };
6851
6852        return "" unless @{$anomalous_entries{$addr}};
6853        return join("\n", @{$anomalous_entries{$addr}}) . "\n";
6854    }
6855
6856    sub map_add_or_replace_non_nulls($self, $other) {
6857        # This adds the mappings in the table $other to $self.  Non-null
6858        # mappings from $other override those in $self.  It essentially merges
6859        # the two tables, with the second having priority except for null
6860        # mappings.
6861        return if $self->carp_if_locked;
6862
6863        if (! $other->isa(__PACKAGE__)) {
6864            Carp::my_carp_bug("$other should be a "
6865                        . __PACKAGE__
6866                        . ".  Not a '"
6867                        . ref($other)
6868                        . "'.  Not added;");
6869            return;
6870        }
6871
6872        my $addr = do { no overloading; pack 'J', $self; };
6873        my $other_addr = do { no overloading; pack 'J', $other; };
6874
6875        local $to_trace = 0 if main::DEBUG;
6876
6877        my $self_range_list = $self->_range_list;
6878        my $other_range_list = $other->_range_list;
6879        foreach my $range ($other_range_list->ranges) {
6880            my $value = $range->value;
6881            next if $value eq "";
6882            $self_range_list->_add_delete('+',
6883                                          $range->start,
6884                                          $range->end,
6885                                          $value,
6886                                          Type => $range->type,
6887                                          Replace => $UNCONDITIONALLY);
6888        }
6889
6890        return;
6891    }
6892
6893    sub set_default_map($self, $map, $use_full_name=0) {
6894        # Define what code points that are missing from the input files should
6895        # map to.  The optional second parameter 'full_name' indicates to
6896        # force using the full name of the map instead of its standard name.
6897        if ($use_full_name && $use_full_name ne 'full_name') {
6898            Carp::my_carp_bug("Second parameter to set_default_map() if"
6899                            . " present, must be 'full_name'");
6900        }
6901
6902        my $addr = do { no overloading; pack 'J', $self; };
6903
6904        # Convert the input to the standard equivalent, if any (won't have any
6905        # for $STRING properties)
6906        my $standard = $self->property->table($map);
6907        if (defined $standard) {
6908            $map = ($use_full_name)
6909                   ? $standard->full_name
6910                   : $standard->name;
6911        }
6912
6913        # Warn if there already is a non-equivalent default map for this
6914        # property.  Note that a default map can be a ref, which means that
6915        # what it actually means is delayed until later in the program, and it
6916        # IS permissible to override it here without a message.
6917        my $default_map = $default_map{$addr};
6918        if (defined $default_map
6919            && ! ref($default_map)
6920            && $default_map ne $map
6921            && main::Standardize($map) ne $default_map)
6922        {
6923            my $property = $self->property;
6924            my $map_table = $property->table($map);
6925            my $default_table = $property->table($default_map);
6926            if (defined $map_table
6927                && defined $default_table
6928                && $map_table != $default_table)
6929            {
6930                Carp::my_carp("Changing the default mapping for "
6931                            . $property
6932                            . " from $default_map to $map'");
6933            }
6934        }
6935
6936        $default_map{$addr} = $map;
6937
6938        # Don't also create any missing table for this map at this point,
6939        # because if we did, it could get done before the main table add is
6940        # done for PropValueAliases.txt; instead the caller will have to make
6941        # sure it exists, if desired.
6942        return;
6943    }
6944
6945    sub to_output_map($self) {
6946        # Returns boolean: should we write this map table?
6947        my $addr = do { no overloading; pack 'J', $self; };
6948
6949        # If overridden, use that
6950        return $to_output_map{$addr} if defined $to_output_map{$addr};
6951
6952        my $full_name = $self->full_name;
6953        return $global_to_output_map{$full_name}
6954                                if defined $global_to_output_map{$full_name};
6955
6956        # If table says to output, do so; if says to suppress it, do so.
6957        my $fate = $self->fate;
6958        return $INTERNAL_MAP if $fate == $INTERNAL_ONLY;
6959        return $EXTERNAL_MAP if grep { $_ eq $full_name } @output_mapped_properties;
6960        return 0 if $fate == $SUPPRESSED || $fate == $MAP_PROXIED;
6961
6962        my $type = $self->property->type;
6963
6964        # Don't want to output binary map tables even for debugging.
6965        return 0 if $type == $BINARY;
6966
6967        # But do want to output string ones.  All the ones that remain to
6968        # be dealt with (i.e. which haven't explicitly been set to external)
6969        # are for internal Perl use only.  The default for those that map to
6970        # $CODE_POINT and haven't been restricted to a single element range
6971        # is to use the adjusted form.
6972        if ($type == $STRING) {
6973            return $INTERNAL_MAP if $self->range_size_1
6974                                    || $default_map{$addr} ne $CODE_POINT;
6975            return $OUTPUT_ADJUSTED;
6976        }
6977
6978        # Otherwise is an $ENUM, do output it, for Perl's purposes
6979        return $INTERNAL_MAP;
6980    }
6981
6982    sub inverse_list($self) {
6983        # Returns a Range_List that is gaps of the current table.  That is,
6984        # the inversion
6985        my $current = Range_List->new(Initialize => $self->_range_list,
6986                                Owner => $self->property);
6987        return ~ $current;
6988    }
6989
6990    sub header($self) {
6991        my $return = $self->SUPER::header();
6992
6993        if ($self->to_output_map >= $INTERNAL_MAP) {
6994            $return .= $INTERNAL_ONLY_HEADER;
6995        }
6996        else {
6997            my $property_name = $self->property->replacement_property;
6998
6999            # The legacy-only properties were gotten above; but there are some
7000            # other properties whose files are in current use that have fixed
7001            # formats.
7002            $property_name = $self->property->full_name unless $property_name;
7003
7004            $return .= <<END;
7005
7006# !!!!!!!   IT IS DEPRECATED TO USE THIS FILE   !!!!!!!
7007
7008# This file is for internal use by core Perl only.  It is retained for
7009# backwards compatibility with applications that may have come to rely on it,
7010# but its format and even its name or existence are subject to change without
7011# notice in a future Perl version.  Don't use it directly.  Instead, its
7012# contents are now retrievable through a stable API in the Unicode::UCD
7013# module: Unicode::UCD::prop_invmap('$property_name') (Values for individual
7014# code points can be retrieved via Unicode::UCD::charprop());
7015END
7016        }
7017        return $return;
7018    }
7019
7020    sub set_final_comment($self) {
7021        # Just before output, create the comment that heads the file
7022        # containing this table.
7023
7024        return unless $debugging_build;
7025
7026        # No sense generating a comment if aren't going to write it out.
7027        return if ! $self->to_output_map;
7028
7029        my $addr = do { no overloading; pack 'J', $self; };
7030
7031        my $property = $self->property;
7032
7033        # Get all the possible names for this property.  Don't use any that
7034        # aren't ok for use in a file name, etc.  This is perhaps causing that
7035        # flag to do double duty, and may have to be changed in the future to
7036        # have our own flag for just this purpose; but it works now to exclude
7037        # Perl generated synonyms from the lists for properties, where the
7038        # name is always the proper Unicode one.
7039        my @property_aliases = grep { $_->ok_as_filename } $self->aliases;
7040
7041        my $count = $self->count;
7042        my $default_map = $default_map{$addr};
7043
7044        # The ranges that map to the default aren't output, so subtract that
7045        # to get those actually output.  A property with matching tables
7046        # already has the information calculated.
7047        if ($property->type != $STRING && $property->type != $FORCED_BINARY) {
7048            $count -= $property->table($default_map)->count;
7049        }
7050        elsif (defined $default_map) {
7051
7052            # But for $STRING properties, must calculate now.  Subtract the
7053            # count from each range that maps to the default.
7054            foreach my $range ($self->_range_list->ranges) {
7055                if ($range->value eq $default_map) {
7056                    $count -= $range->end +1 - $range->start;
7057                }
7058            }
7059
7060        }
7061
7062        # Get a  string version of $count with underscores in large numbers,
7063        # for clarity.
7064        my $string_count = main::clarify_code_point_count($count);
7065
7066        my $code_points = ($count == 1)
7067                        ? 'single code point'
7068                        : "$string_count code points";
7069
7070        my $mapping;
7071        my $these_mappings;
7072        my $are;
7073        if (@property_aliases <= 1) {
7074            $mapping = 'mapping';
7075            $these_mappings = 'this mapping';
7076            $are = 'is'
7077        }
7078        else {
7079            $mapping = 'synonymous mappings';
7080            $these_mappings = 'these mappings';
7081            $are = 'are'
7082        }
7083        my $cp;
7084        if ($count >= $MAX_UNICODE_CODEPOINTS) {
7085            $cp = "any code point in Unicode Version $string_version";
7086        }
7087        else {
7088            my $map_to;
7089            if ($default_map eq "") {
7090                $map_to = 'the null string';
7091            }
7092            elsif ($default_map eq $CODE_POINT) {
7093                $map_to = "itself";
7094            }
7095            else {
7096                $map_to = "'$default_map'";
7097            }
7098            if ($count == 1) {
7099                $cp = "the single code point";
7100            }
7101            else {
7102                $cp = "one of the $code_points";
7103            }
7104            $cp .= " in Unicode Version $unicode_version for which the mapping is not to $map_to";
7105        }
7106
7107        my $comment = "";
7108
7109        my $status = $self->status;
7110        if ($status ne $NORMAL) {
7111            my $warn = uc $status_past_participles{$status};
7112            $comment .= <<END;
7113
7114!!!!!!!   $warn !!!!!!!!!!!!!!!!!!!
7115 All property or property=value combinations contained in this file are $warn.
7116 See $unicode_reference_url for what this means.
7117
7118END
7119        }
7120        $comment .= "This file returns the $mapping:\n";
7121
7122        my $ucd_accessible_name = "";
7123        my $has_underscore_name = 0;
7124        my $full_name = $self->property->full_name;
7125        for my $i (0 .. @property_aliases - 1) {
7126            my $name = $property_aliases[$i]->name;
7127            $has_underscore_name = 1 if $name =~ /^_/;
7128            $comment .= sprintf("%-8s%s\n", " ", $name . '(cp)');
7129            if ($property_aliases[$i]->ucd) {
7130                if ($name eq $full_name) {
7131                    $ucd_accessible_name = $full_name;
7132                }
7133                elsif (! $ucd_accessible_name) {
7134                    $ucd_accessible_name = $name;
7135                }
7136            }
7137        }
7138        $comment .= "\nwhere 'cp' is $cp.";
7139        if ($ucd_accessible_name) {
7140            $comment .= "  Note that $these_mappings";
7141            if ($has_underscore_name) {
7142                $comment .= " (except for the one(s) that begin with an underscore)";
7143            }
7144            $comment .= " $are accessible via the functions prop_invmap('$full_name') or charprop() in Unicode::UCD";
7145
7146        }
7147
7148        # And append any commentary already set from the actual property.
7149        $comment .= "\n\n" . $self->comment if $self->comment;
7150        if ($self->description) {
7151            $comment .= "\n\n" . join " ", $self->description;
7152        }
7153        if ($self->note) {
7154            $comment .= "\n\n" . join " ", $self->note;
7155        }
7156        $comment .= "\n";
7157
7158        if (! $self->perl_extension) {
7159            $comment .= <<END;
7160
7161For information about what this property really means, see:
7162$unicode_reference_url
7163END
7164        }
7165
7166        if ($count) {        # Format differs for empty table
7167                $comment.= "\nThe format of the ";
7168            if ($self->range_size_1) {
7169                $comment.= <<END;
7170main body of lines of this file is: CODE_POINT\\t\\tMAPPING where CODE_POINT
7171is in hex; MAPPING is what CODE_POINT maps to.
7172END
7173            }
7174            else {
7175
7176                # There are tables which end up only having one element per
7177                # range, but it is not worth keeping track of for making just
7178                # this comment a little better.
7179                $comment .= <<END;
7180non-comment portions of the main body of lines of this file is:
7181START\\tSTOP\\tMAPPING where START is the starting code point of the
7182range, in hex; STOP is the ending point, or if omitted, the range has just one
7183code point; MAPPING is what each code point between START and STOP maps to.
7184END
7185                if ($self->output_range_counts) {
7186                    $comment .= <<END;
7187Numbers in comments in [brackets] indicate how many code points are in the
7188range (omitted when the range is a single code point or if the mapping is to
7189the null string).
7190END
7191                }
7192            }
7193        }
7194        $self->set_comment(main::join_lines($comment));
7195        return;
7196    }
7197
7198    my %swash_keys; # Makes sure don't duplicate swash names.
7199
7200    # The remaining variables are temporaries used while writing each table,
7201    # to output special ranges.
7202    my @multi_code_point_maps;  # Map is to more than one code point.
7203
7204    sub handle_special_range($self, $range) {
7205        # Called in the middle of write when it finds a range it doesn't know
7206        # how to handle.
7207
7208        my $addr = do { no overloading; pack 'J', $self; };
7209
7210        my $type = $range->type;
7211
7212        my $low = $range->start;
7213        my $high = $range->end;
7214        my $map = $range->value;
7215
7216        # No need to output the range if it maps to the default.
7217        return if $map eq $default_map{$addr};
7218
7219        my $property = $self->property;
7220
7221        # Switch based on the map type...
7222        if ($type == $HANGUL_SYLLABLE) {
7223
7224            # These are entirely algorithmically determinable based on
7225            # some constants furnished by Unicode; for now, just set a
7226            # flag to indicate that have them.  After everything is figured
7227            # out, we will output the code that does the algorithm.  (Don't
7228            # output them if not needed because we are suppressing this
7229            # property.)
7230            $has_hangul_syllables = 1 if $property->to_output_map;
7231        }
7232        elsif ($type == $CP_IN_NAME) {
7233
7234            # Code points whose name ends in their code point are also
7235            # algorithmically determinable, but need information about the map
7236            # to do so.  Both the map and its inverse are stored in data
7237            # structures output in the file.  They are stored in the mean time
7238            # in global lists The lists will be written out later into Name.pm,
7239            # which is created only if needed.  In order to prevent duplicates
7240            # in the list, only add to them for one property, should multiple
7241            # ones need them.
7242            if ($needing_code_points_ending_in_code_point == 0) {
7243                $needing_code_points_ending_in_code_point = $property;
7244            }
7245            if ($property == $needing_code_points_ending_in_code_point) {
7246                push @{$names_ending_in_code_point{$map}->{'low'}}, $low;
7247                push @{$names_ending_in_code_point{$map}->{'high'}}, $high;
7248
7249                my $squeezed = $map =~ s/[-\s]+//gr;
7250                push @{$loose_names_ending_in_code_point{$squeezed}->{'low'}},
7251                                                                          $low;
7252                push @{$loose_names_ending_in_code_point{$squeezed}->{'high'}},
7253                                                                         $high;
7254
7255                # Calculate the set of legal characters in names of this
7256                # series.  It includes every character in the name prefix.
7257                my %legal;
7258                $legal{$_} = 1 for split //, $map;
7259
7260                # Plus the hex code point chars, blank, and minus.  Also \n
7261                # can show up as being required due to anchoring
7262                for my $i ('0' .. '9', 'A' .. 'F', '-', ' ', "\n") {
7263                    $legal{$i} = 1;
7264                }
7265                my $legal = join "", sort { $a cmp $b } keys %legal;
7266
7267                # The legal chars can be used in match optimizations
7268                push @code_points_ending_in_code_point, { low => $low,
7269                                                        high => $high,
7270                                                        name => $map,
7271                                                        legal => $legal,
7272                                                        };
7273            }
7274        }
7275        elsif ($range->type == $MULTI_CP || $range->type == $NULL) {
7276
7277            # Multi-code point maps and null string maps have an entry
7278            # for each code point in the range.  They use the same
7279            # output format.
7280            for my $code_point ($low .. $high) {
7281
7282                # The pack() below can't cope with surrogates.  XXX This may
7283                # no longer be true
7284                if ($code_point >= 0xD800 && $code_point <= 0xDFFF) {
7285                    Carp::my_carp("Surrogate code point '$code_point' in mapping to '$map' in $self.  No map created");
7286                    next;
7287                }
7288
7289                # Generate the hash entries for these in the form that
7290                # utf8.c understands.
7291                my $tostr = "";
7292                my $to_name = "";
7293                my $to_chr = "";
7294                foreach my $to (split " ", $map) {
7295                    if ($to !~ /^$code_point_re$/) {
7296                        Carp::my_carp("Illegal code point '$to' in mapping '$map' from $code_point in $self.  No map created");
7297                        next;
7298                    }
7299                    $tostr .= sprintf "\\x{%s}", $to;
7300                    $to = CORE::hex $to;
7301                    if ($annotate) {
7302                        $to_name .= " + " if $to_name;
7303                        $to_chr .= main::display_chr($to);
7304                        main::populate_char_info($to)
7305                                            if ! defined $viacode[$to];
7306                        $to_name .=  $viacode[$to];
7307                    }
7308                }
7309
7310                # The unpack yields a list of the bytes that comprise the
7311                # UTF-8 of $code_point, which are each placed in \xZZ format
7312                # and output in the %s to map to $tostr, so the result looks
7313                # like:
7314                # "\xC4\xB0" => "\x{0069}\x{0307}",
7315                my $utf8 = sprintf(qq["%s" => "$tostr",],
7316                        join("", map { sprintf "\\x%02X", $_ }
7317                            unpack("U0C*", chr $code_point)));
7318
7319                # Add a comment so that a human reader can more easily
7320                # see what's going on.
7321                push @multi_code_point_maps,
7322                        sprintf("%-45s # U+%04X", $utf8, $code_point);
7323                if (! $annotate) {
7324                    $multi_code_point_maps[-1] .= " => $map";
7325                }
7326                else {
7327                    main::populate_char_info($code_point)
7328                                    if ! defined $viacode[$code_point];
7329                    $multi_code_point_maps[-1] .= " '"
7330                        . main::display_chr($code_point)
7331                        . "' => '$to_chr'; $viacode[$code_point] => $to_name";
7332                }
7333            }
7334        }
7335        else {
7336            Carp::my_carp("Unrecognized map type '$range->type' in '$range' in $self.  Not written");
7337        }
7338
7339        return;
7340    }
7341
7342    sub pre_body($self) {
7343        # Returns the string that should be output in the file before the main
7344        # body of this table.  It isn't called until the main body is
7345        # calculated, saving a pass.  The string includes some hash entries
7346        # identifying the format of the body, and what the single value should
7347        # be for all ranges missing from it.  It also includes any code points
7348        # which have map_types that don't go in the main table.
7349
7350        my $addr = do { no overloading; pack 'J', $self; };
7351
7352        my $name = $self->property->swash_name;
7353
7354        # Currently there is nothing in the pre_body unless a swash is being
7355        # generated.
7356        return unless defined $name;
7357
7358        if (defined $swash_keys{$name}) {
7359            Carp::my_carp(main::join_lines(<<END
7360Already created a swash name '$name' for $swash_keys{$name}.  This means that
7361the same name desired for $self shouldn't be used.  Bad News.  This must be
7362fixed before production use, but proceeding anyway
7363END
7364            ));
7365        }
7366        $swash_keys{$name} = "$self";
7367
7368        my $pre_body = "";
7369
7370        # Here we assume we were called after have gone through the whole
7371        # file.  If we actually generated anything for each map type, add its
7372        # respective header and trailer
7373        my $specials_name = "";
7374        if (@multi_code_point_maps) {
7375            $specials_name = "Unicode::UCD::ToSpec$name";
7376            $pre_body .= <<END;
7377
7378# Some code points require special handling because their mappings are each to
7379# multiple code points.  These do not appear in the main body, but are defined
7380# in the hash below.
7381
7382# Each key is the string of N bytes that together make up the UTF-8 encoding
7383# for the code point.  (i.e. the same as looking at the code point's UTF-8
7384# under "use bytes").  Each value is the UTF-8 of the translation, for speed.
7385\%$specials_name = (
7386END
7387            $pre_body .= join("\n", @multi_code_point_maps) . "\n);\n";
7388        }
7389
7390        my $format = $self->format;
7391
7392        my $return = "";
7393
7394        my $output_adjusted = ($self->to_output_map == $OUTPUT_ADJUSTED);
7395        if ($output_adjusted) {
7396            if ($specials_name) {
7397                $return .= <<END;
7398# The mappings in the non-hash portion of this file must be modified to get the
7399# correct values by adding the code point ordinal number to each one that is
7400# numeric.
7401END
7402            }
7403            else {
7404                $return .= <<END;
7405# The mappings must be modified to get the correct values by adding the code
7406# point ordinal number to each one that is numeric.
7407END
7408            }
7409        }
7410
7411        $return .= <<END;
7412
7413# The name this table is to be known by, with the format of the mappings in
7414# the main body of the table, and what all code points missing from this file
7415# map to.
7416\$Unicode::UCD::SwashInfo{'To$name'}{'format'} = '$format'; # $map_table_formats{$format}
7417END
7418        if ($specials_name) {
7419            $return .= <<END;
7420\$Unicode::UCD::SwashInfo{'To$name'}{'specials_name'} = '$specials_name'; # Name of hash of special mappings
7421END
7422        }
7423        my $default_map = $default_map{$addr};
7424
7425        # For $CODE_POINT default maps and using adjustments, instead the default
7426        # becomes zero.
7427        $return .= "\$Unicode::UCD::SwashInfo{'To$name'}{'missing'} = '"
7428                .  (($output_adjusted && $default_map eq $CODE_POINT)
7429                   ? "0"
7430                   : $default_map)
7431                . "';";
7432
7433        if ($default_map eq $CODE_POINT) {
7434            $return .= ' # code point maps to itself';
7435        }
7436        elsif ($default_map eq "") {
7437            $return .= ' # code point maps to the null string';
7438        }
7439        $return .= "\n";
7440
7441        $return .= $pre_body;
7442
7443        return $return;
7444    }
7445
7446    sub write($self) {
7447        # Write the table to the file.
7448
7449        my $addr = do { no overloading; pack 'J', $self; };
7450
7451        # Clear the temporaries
7452        undef @multi_code_point_maps;
7453
7454        # Calculate the format of the table if not already done.
7455        my $format = $self->format;
7456        my $type = $self->property->type;
7457        my $default_map = $self->default_map;
7458        if (! defined $format) {
7459            if ($type == $BINARY) {
7460
7461                # Don't bother checking the values, because we elsewhere
7462                # verify that a binary table has only 2 values.
7463                $format = $BINARY_FORMAT;
7464            }
7465            else {
7466                my @ranges = $self->_range_list->ranges;
7467
7468                # default an empty table based on its type and default map
7469                if (! @ranges) {
7470
7471                    # But it turns out that the only one we can say is a
7472                    # non-string (besides binary, handled above) is when the
7473                    # table is a string and the default map is to a code point
7474                    if ($type == $STRING && $default_map eq $CODE_POINT) {
7475                        $format = $HEX_FORMAT;
7476                    }
7477                    else {
7478                        $format = $STRING_FORMAT;
7479                    }
7480                }
7481                else {
7482
7483                    # Start with the most restrictive format, and as we find
7484                    # something that doesn't fit with that, change to the next
7485                    # most restrictive, and so on.
7486                    $format = $DECIMAL_FORMAT;
7487                    foreach my $range (@ranges) {
7488                        next if $range->type != 0;  # Non-normal ranges don't
7489                                                    # affect the main body
7490                        my $map = $range->value;
7491                        if ($map ne $default_map) {
7492                            last if $format eq $STRING_FORMAT;  # already at
7493                                                                # least
7494                                                                # restrictive
7495                            $format = $INTEGER_FORMAT
7496                                                if $format eq $DECIMAL_FORMAT
7497                                                    && $map !~ / ^ [0-9] $ /x;
7498                            $format = $FLOAT_FORMAT
7499                                            if $format eq $INTEGER_FORMAT
7500                                                && $map !~ / ^ -? [0-9]+ $ /x;
7501                            $format = $RATIONAL_FORMAT
7502                                if $format eq $FLOAT_FORMAT
7503                                    && $map !~ / ^ -? [0-9]+ \. [0-9]* $ /x;
7504                            $format = $HEX_FORMAT
7505                                if ($format eq $RATIONAL_FORMAT
7506                                       && $map !~
7507                                           m/ ^ -? [0-9]+ ( \/ [0-9]+ )? $ /x)
7508                                        # Assume a leading zero means hex,
7509                                        # even if all digits are 0-9
7510                                    || ($format eq $INTEGER_FORMAT
7511                                        && $map =~ /^0[0-9A-F]/);
7512                            $format = $STRING_FORMAT if $format eq $HEX_FORMAT
7513                                                       && $map =~ /[^0-9A-F]/;
7514                        }
7515                    }
7516                }
7517            }
7518        } # end of calculating format
7519
7520        if ($default_map eq $CODE_POINT
7521            && $format ne $HEX_FORMAT
7522            && ! defined $self->format)    # manual settings are always
7523                                           # considered ok
7524        {
7525            Carp::my_carp_bug("Expecting hex format for mapping table for $self, instead got '$format'")
7526        }
7527
7528        # If the output is to be adjusted, the format of the table that gets
7529        # output is actually 'a' or 'ax' instead of whatever it is stored
7530        # internally as.
7531        my $output_adjusted = ($self->to_output_map == $OUTPUT_ADJUSTED);
7532        if ($output_adjusted) {
7533            if ($default_map eq $CODE_POINT) {
7534                $format = $HEX_ADJUST_FORMAT;
7535            }
7536            else {
7537                $format = $ADJUST_FORMAT;
7538            }
7539        }
7540
7541        $self->_set_format($format);
7542
7543        return $self->SUPER::write(
7544            $output_adjusted,
7545            $default_map);   # don't write defaulteds
7546    }
7547
7548    # Accessors for the underlying list that should fail if locked.
7549    for my $sub (qw(
7550                    add_duplicate
7551                    replace_map
7552                ))
7553    {
7554        no strict "refs";
7555        *$sub = sub {
7556            use strict "refs";
7557            my $self = shift;
7558
7559            return if $self->carp_if_locked;
7560            return $self->_range_list->$sub(@_);
7561        }
7562    }
7563} # End closure for Map_Table
7564
7565package Match_Table;
7566use parent '-norequire', '_Base_Table';
7567
7568# A Match table is one which is a list of all the code points that have
7569# the same property and property value, for use in \p{property=value}
7570# constructs in regular expressions.  It adds very little data to the base
7571# structure, but many methods, as these lists can be combined in many ways to
7572# form new ones.
7573# There are only a few concepts added:
7574# 1) Equivalents and Relatedness.
7575#    Two tables can match the identical code points, but have different names.
7576#    This always happens when there is a perl single form extension
7577#    \p{IsProperty} for the Unicode compound form \P{Property=True}.  The two
7578#    tables are set to be related, with the Perl extension being a child, and
7579#    the Unicode property being the parent.
7580#
7581#    It may be that two tables match the identical code points and we don't
7582#    know if they are related or not.  This happens most frequently when the
7583#    Block and Script properties have the exact range.  But note that a
7584#    revision to Unicode could add new code points to the script, which would
7585#    now have to be in a different block (as the block was filled, or there
7586#    would have been 'Unknown' script code points in it and they wouldn't have
7587#    been identical).  So we can't rely on any two properties from Unicode
7588#    always matching the same code points from release to release, and thus
7589#    these tables are considered coincidentally equivalent--not related.  When
7590#    two tables are unrelated but equivalent, one is arbitrarily chosen as the
7591#    'leader', and the others are 'equivalents'.  This concept is useful
7592#    to minimize the number of tables written out.  Only one file is used for
7593#    any identical set of code points, with entries in UCD.pl mapping all
7594#    the involved tables to it.
7595#
7596#    Related tables will always be identical; we set them up to be so.  Thus
7597#    if the Unicode one is deprecated, the Perl one will be too.  Not so for
7598#    unrelated tables.  Relatedness makes generating the documentation easier.
7599#
7600# 2) Complement.
7601#    Like equivalents, two tables may be the inverses of each other, the
7602#    intersection between them is null, and the union is every Unicode code
7603#    point.  The two tables that occupy a binary property are necessarily like
7604#    this.  By specifying one table as the complement of another, we can avoid
7605#    storing it on disk (using the other table and performing a fast
7606#    transform), and some memory and calculations.
7607#
7608# 3) Conflicting.  It may be that there will eventually be name clashes, with
7609#    the same name meaning different things.  For a while, there actually were
7610#    conflicts, but they have so far been resolved by changing Perl's or
7611#    Unicode's definitions to match the other, but when this code was written,
7612#    it wasn't clear that that was what was going to happen.  (Unicode changed
7613#    because of protests during their beta period.)  Name clashes are warned
7614#    about during compilation, and the documentation.  The generated tables
7615#    are sane, free of name clashes, because the code suppresses the Perl
7616#    version.  But manual intervention to decide what the actual behavior
7617#    should be may be required should this happen.  The introductory comments
7618#    have more to say about this.
7619#
7620# 4) Definition.  This is a string for human consumption that specifies the
7621#    code points that this table matches.  This is used only for the generated
7622#    pod file.  It may be specified explicitly, or automatically computed.
7623#    Only the first portion of complicated definitions is computed and
7624#    displayed.
7625
7626sub standardize { return main::standardize($_[0]); }
7627sub trace { return main::trace(@_); }
7628
7629
7630{ # Closure
7631
7632    main::setup_package();
7633
7634    my %leader;
7635    # The leader table of this one; initially $self.
7636    main::set_access('leader', \%leader, 'r');
7637
7638    my %equivalents;
7639    # An array of any tables that have this one as their leader
7640    main::set_access('equivalents', \%equivalents, 'readable_array');
7641
7642    my %parent;
7643    # The parent table to this one, initially $self.  This allows us to
7644    # distinguish between equivalent tables that are related (for which this
7645    # is set to), and those which may not be, but share the same output file
7646    # because they match the exact same set of code points in the current
7647    # Unicode release.
7648    main::set_access('parent', \%parent, 'r');
7649
7650    my %children;
7651    # An array of any tables that have this one as their parent
7652    main::set_access('children', \%children, 'readable_array');
7653
7654    my %conflicting;
7655    # Array of any tables that would have the same name as this one with
7656    # a different meaning.  This is used for the generated documentation.
7657    main::set_access('conflicting', \%conflicting, 'readable_array');
7658
7659    my %matches_all;
7660    # Set in the constructor for tables that are expected to match all code
7661    # points.
7662    main::set_access('matches_all', \%matches_all, 'r');
7663
7664    my %complement;
7665    # Points to the complement that this table is expressed in terms of; 0 if
7666    # none.
7667    main::set_access('complement', \%complement, 'r');
7668
7669    my %definition;
7670    # Human readable string of the first few ranges of code points matched by
7671    # this table
7672    main::set_access('definition', \%definition, 'r', 's');
7673
7674    sub new {
7675        my $class = shift;
7676
7677        my %args = @_;
7678
7679        # The property for which this table is a listing of property values.
7680        my $property = delete $args{'_Property'};
7681
7682        my $name = delete $args{'Name'};
7683        my $full_name = delete $args{'Full_Name'};
7684        $full_name = $name if ! defined $full_name;
7685
7686        # Optional
7687        my $initialize = delete $args{'Initialize'};
7688        my $matches_all = delete $args{'Matches_All'} || 0;
7689        my $format = delete $args{'Format'};
7690        my $definition = delete $args{'Definition'} // "";
7691        # Rest of parameters passed on.
7692
7693        my $range_list = Range_List->new(Initialize => $initialize,
7694                                         Owner => $property);
7695
7696        my $complete = $full_name;
7697        $complete = '""' if $complete eq "";  # A null name shouldn't happen,
7698                                              # but this helps debug if it
7699                                              # does
7700        # The complete name for a match table includes it's property in a
7701        # compound form 'property=table', except if the property is the
7702        # pseudo-property, perl, in which case it is just the single form,
7703        # 'table' (If you change the '=' must also change the ':' in lots of
7704        # places in this program that assume an equal sign)
7705        $complete = $property->full_name . "=$complete" if $property != $perl;
7706
7707        my $self = $class->SUPER::new(%args,
7708                                      Name => $name,
7709                                      Complete_Name => $complete,
7710                                      Full_Name => $full_name,
7711                                      _Property => $property,
7712                                      _Range_List => $range_list,
7713                                      Format => $EMPTY_FORMAT,
7714                                      Write_As_Invlist => 1,
7715                                      );
7716        my $addr = do { no overloading; pack 'J', $self; };
7717
7718        $conflicting{$addr} = [ ];
7719        $equivalents{$addr} = [ ];
7720        $children{$addr} = [ ];
7721        $matches_all{$addr} = $matches_all;
7722        $leader{$addr} = $self;
7723        $parent{$addr} = $self;
7724        $complement{$addr} = 0;
7725        $definition{$addr} = $definition;
7726
7727        if (defined $format && $format ne $EMPTY_FORMAT) {
7728            Carp::my_carp_bug("'Format' must be '$EMPTY_FORMAT' in a match table instead of '$format'.  Using '$EMPTY_FORMAT'");
7729        }
7730
7731        return $self;
7732    }
7733
7734    # See this program's beginning comment block about overloading these.
7735    use overload
7736        fallback => 0,
7737        qw("") => "_operator_stringify",
7738        '=' => sub {
7739                    my $self = shift;
7740
7741                    return if $self->carp_if_locked;
7742                    return $self;
7743                },
7744
7745        '+' => sub {
7746                        my $self = shift;
7747                        my $other = shift;
7748
7749                        return $self->_range_list + $other;
7750                    },
7751        '&' => sub {
7752                        my $self = shift;
7753                        my $other = shift;
7754
7755                        return $self->_range_list & $other;
7756                    },
7757        '+=' => sub {
7758                        my $self = shift;
7759                        my $other = shift;
7760                        my $reversed = shift;
7761
7762                        if ($reversed) {
7763                            Carp::my_carp_bug("Bad news.  Can't cope with '"
7764                            . ref($other)
7765                            . ' += '
7766                            . ref($self)
7767                            . "'.  undef returned.");
7768                            return;
7769                        }
7770
7771                        return if $self->carp_if_locked;
7772
7773                        my $addr = do { no overloading; pack 'J', $self; };
7774
7775                        if (ref $other) {
7776
7777                            # Change the range list of this table to be the
7778                            # union of the two.
7779                            $self->_set_range_list($self->_range_list
7780                                                    + $other);
7781                        }
7782                        else {    # $other is just a simple value
7783                            $self->add_range($other, $other);
7784                        }
7785                        return $self;
7786                    },
7787        '&=' => sub {
7788                        my $self = shift;
7789                        my $other = shift;
7790                        my $reversed = shift;
7791
7792                        if ($reversed) {
7793                            Carp::my_carp_bug("Bad news.  Can't cope with '"
7794                            . ref($other)
7795                            . ' &= '
7796                            . ref($self)
7797                            . "'.  undef returned.");
7798                            return;
7799                        }
7800
7801                        return if $self->carp_if_locked;
7802                        $self->_set_range_list($self->_range_list & $other);
7803                        return $self;
7804                    },
7805        '-' => sub { my $self = shift;
7806                    my $other = shift;
7807                    my $reversed = shift;
7808                    if ($reversed) {
7809                        Carp::my_carp_bug("Bad news.  Can't cope with '"
7810                        . ref($other)
7811                        . ' - '
7812                        . ref($self)
7813                        . "'.  undef returned.");
7814                        return;
7815                    }
7816
7817                    return $self->_range_list - $other;
7818                },
7819        '~' => sub { my $self = shift;
7820                    return ~ $self->_range_list;
7821                },
7822    ;
7823
7824    sub _operator_stringify {
7825        my $self = shift;
7826
7827        my $name = $self->complete_name;
7828        return "Table '$name'";
7829    }
7830
7831    sub _range_list {
7832        # Returns the range list associated with this table, which will be the
7833        # complement's if it has one.
7834
7835        my $self = shift;
7836        my $complement = $self->complement;
7837
7838        # In order to avoid re-complementing on each access, only do the
7839        # complement the first time, and store the result in this table's
7840        # range list to use henceforth.  However, this wouldn't work if the
7841        # controlling (complement) table changed after we do this, so lock it.
7842        # Currently, the value of the complement isn't needed until after it
7843        # is fully constructed, so this works.  If this were to change, the
7844        # each_range iteration functionality would no longer work on this
7845        # complement.
7846        if ($complement != 0 && $self->SUPER::_range_list->count == 0) {
7847            $self->_set_range_list($self->SUPER::_range_list
7848                                + ~ $complement->_range_list);
7849            $complement->lock;
7850        }
7851
7852        return $self->SUPER::_range_list;
7853    }
7854
7855    sub add_alias {
7856        # Add a synonym for this table.  See the comments in the base class
7857
7858        my $self = shift;
7859        my $name = shift;
7860        # Rest of parameters passed on.
7861
7862        $self->SUPER::add_alias($name, $self, @_);
7863        return;
7864    }
7865
7866    sub add_conflicting {
7867        # Add the name of some other object to the list of ones that name
7868        # clash with this match table.
7869
7870        my $self = shift;
7871        my $conflicting_name = shift;   # The name of the conflicting object
7872        my $p = shift || 'p';           # Optional, is this a \p{} or \P{} ?
7873        my $conflicting_object = shift; # Optional, the conflicting object
7874                                        # itself.  This is used to
7875                                        # disambiguate the text if the input
7876                                        # name is identical to any of the
7877                                        # aliases $self is known by.
7878                                        # Sometimes the conflicting object is
7879                                        # merely hypothetical, so this has to
7880                                        # be an optional parameter.
7881        Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7882
7883        my $addr = do { no overloading; pack 'J', $self; };
7884
7885        # Check if the conflicting name is exactly the same as any existing
7886        # alias in this table (as long as there is a real object there to
7887        # disambiguate with).
7888        if (defined $conflicting_object) {
7889            foreach my $alias ($self->aliases) {
7890                if (standardize($alias->name) eq standardize($conflicting_name)) {
7891
7892                    # Here, there is an exact match.  This results in
7893                    # ambiguous comments, so disambiguate by changing the
7894                    # conflicting name to its object's complete equivalent.
7895                    $conflicting_name = $conflicting_object->complete_name;
7896                    last;
7897                }
7898            }
7899        }
7900
7901        # Convert to the \p{...} final name
7902        $conflicting_name = "\\$p" . "{$conflicting_name}";
7903
7904        # Only add once
7905        return if grep { $conflicting_name eq $_ } @{$conflicting{$addr}};
7906
7907        push @{$conflicting{$addr}}, $conflicting_name;
7908
7909        return;
7910    }
7911
7912    sub is_set_equivalent_to($self, $other=undef) {
7913        # Return boolean of whether or not the other object is a table of this
7914        # type and has been marked equivalent to this one.
7915
7916        return 0 if ! defined $other; # Can happen for incomplete early
7917                                      # releases
7918        unless ($other->isa(__PACKAGE__)) {
7919            my $ref_other = ref $other;
7920            my $ref_self = ref $self;
7921            Carp::my_carp_bug("Argument to 'is_set_equivalent_to' must be another $ref_self, not a '$ref_other'.  $other not set equivalent to $self.");
7922            return 0;
7923        }
7924
7925        # Two tables are equivalent if they have the same leader.
7926        no overloading;
7927        return $leader{pack 'J', $self} == $leader{pack 'J', $other};
7928        return;
7929    }
7930
7931    sub set_equivalent_to {
7932        # Set $self equivalent to the parameter table.
7933        # The required Related => 'x' parameter is a boolean indicating
7934        # whether these tables are related or not.  If related, $other becomes
7935        # the 'parent' of $self; if unrelated it becomes the 'leader'
7936        #
7937        # Related tables share all characteristics except names; equivalents
7938        # not quite so many.
7939        # If they are related, one must be a perl extension.  This is because
7940        # we can't guarantee that Unicode won't change one or the other in a
7941        # later release even if they are identical now.
7942
7943        my $self = shift;
7944        my $other = shift;
7945
7946        my %args = @_;
7947        my $related = delete $args{'Related'};
7948
7949        Carp::carp_extra_args(\%args) if main::DEBUG && %args;
7950
7951        return if ! defined $other;     # Keep on going; happens in some early
7952                                        # Unicode releases.
7953
7954        if (! defined $related) {
7955            Carp::my_carp_bug("set_equivalent_to must have 'Related => [01] parameter.  Assuming $self is not related to $other");
7956            $related = 0;
7957        }
7958
7959        # If already are equivalent, no need to re-do it;  if subroutine
7960        # returns null, it found an error, also do nothing
7961        my $are_equivalent = $self->is_set_equivalent_to($other);
7962        return if ! defined $are_equivalent || $are_equivalent;
7963
7964        my $addr = do { no overloading; pack 'J', $self; };
7965        my $current_leader = ($related) ? $parent{$addr} : $leader{$addr};
7966
7967        if ($related) {
7968            if ($current_leader->perl_extension) {
7969                if ($other->perl_extension) {
7970                    Carp::my_carp_bug("Use add_alias() to set two Perl tables '$self' and '$other', equivalent.");
7971                    return;
7972                }
7973            } elsif ($self->property != $other->property    # Depending on
7974                                                            # situation, might
7975                                                            # be better to use
7976                                                            # add_alias()
7977                                                            # instead for same
7978                                                            # property
7979                     && ! $other->perl_extension
7980
7981                         # We allow the sc and scx properties to be marked as
7982                         # related.  They are in fact related, and this allows
7983                         # the pod to show that better.  This test isn't valid
7984                         # if this is an early Unicode release without the scx
7985                         # property (having that also implies the sc property
7986                         # exists, so don't have to test for no 'sc')
7987                     && (   ! defined $scx
7988                         && ! (   (   $self->property == $script
7989                                   || $self->property == $scx)
7990                               && (   $self->property == $script
7991                                   || $self->property == $scx))))
7992            {
7993                Carp::my_carp_bug("set_equivalent_to should have 'Related => 0 for equivalencing two Unicode properties.  Assuming $self is not related to $other");
7994                $related = 0;
7995            }
7996        }
7997
7998        if (! $self->is_empty && ! $self->matches_identically_to($other)) {
7999            Carp::my_carp_bug("$self should be empty or match identically to $other.  Not setting equivalent");
8000            return;
8001        }
8002
8003        my $leader = do { no overloading; pack 'J', $current_leader; };
8004        my $other_addr = do { no overloading; pack 'J', $other; };
8005
8006        # Any tables that are equivalent to or children of this table must now
8007        # instead be equivalent to or (children) to the new leader (parent),
8008        # still equivalent.  The equivalency includes their matches_all info,
8009        # and for related tables, their fate and status.
8010        # All related tables are of necessity equivalent, but the converse
8011        # isn't necessarily true
8012        my $status = $other->status;
8013        my $status_info = $other->status_info;
8014        my $fate = $other->fate;
8015        my $matches_all = $matches_all{other_addr};
8016        my $caseless_equivalent = $other->caseless_equivalent;
8017        foreach my $table ($current_leader, @{$equivalents{$leader}}) {
8018            next if $table == $other;
8019            trace "setting $other to be the leader of $table, status=$status" if main::DEBUG && $to_trace;
8020
8021            my $table_addr = do { no overloading; pack 'J', $table; };
8022            $leader{$table_addr} = $other;
8023            $matches_all{$table_addr} = $matches_all;
8024            $self->_set_range_list($other->_range_list);
8025            push @{$equivalents{$other_addr}}, $table;
8026            if ($related) {
8027                $parent{$table_addr} = $other;
8028                push @{$children{$other_addr}}, $table;
8029                $table->set_status($status, $status_info);
8030
8031                # This reason currently doesn't get exposed outside; otherwise
8032                # would have to look up the parent's reason and use it instead.
8033                $table->set_fate($fate, "Parent's fate");
8034
8035                $self->set_caseless_equivalent($caseless_equivalent);
8036            }
8037        }
8038
8039        # Now that we've declared these to be equivalent, any changes to one
8040        # of the tables would invalidate that equivalency.
8041        $self->lock;
8042        $other->lock;
8043        return;
8044    }
8045
8046    sub set_complement($self, $other) {
8047        # Set $self to be the complement of the parameter table.  $self is
8048        # locked, as what it contains should all come from the other table.
8049
8050        if ($other->complement != 0) {
8051            Carp::my_carp_bug("Can't set $self to be the complement of $other, which itself is the complement of " . $other->complement);
8052            return;
8053        }
8054        my $addr = do { no overloading; pack 'J', $self; };
8055        $complement{$addr} = $other;
8056
8057        # Be sure the other property knows we are depending on them; or the
8058        # other table if it is one in the current property.
8059        if ($self->property != $other->property) {
8060            $other->property->set_has_dependency(1);
8061        }
8062        else {
8063            $other->set_has_dependency(1);
8064        }
8065        $self->lock;
8066        return;
8067    }
8068
8069    sub add_range($self, @range) { # Add a range to the list for this table.
8070        # Rest of parameters passed on
8071
8072        return if $self->carp_if_locked;
8073        return $self->_range_list->add_range(@range);
8074    }
8075
8076    sub header($self) {
8077        # All match tables are to be used only by the Perl core.
8078        return $self->SUPER::header() . $INTERNAL_ONLY_HEADER;
8079    }
8080
8081    sub pre_body {  # Does nothing for match tables.
8082        return
8083    }
8084
8085    sub append_to_body {  # Does nothing for match tables.
8086        return
8087    }
8088
8089    sub set_fate($self, $fate, $reason=undef) {
8090        $self->SUPER::set_fate($fate, $reason);
8091
8092        # All children share this fate
8093        foreach my $child ($self->children) {
8094            $child->set_fate($fate, $reason);
8095        }
8096        return;
8097    }
8098
8099    sub calculate_table_definition
8100    {
8101        # Returns a human-readable string showing some or all of the code
8102        # points matched by this table.  The string will include a
8103        # bracketed-character class for all characters matched in the 00-FF
8104        # range, and the first few ranges matched beyond that.
8105        my $max_ranges = 6;
8106
8107        my $self = shift;
8108        my $definition = $self->definition || "";
8109
8110        # Skip this if already have a definition.
8111        return $definition if $definition;
8112
8113        my $lows_string = "";   # The string representation of the 0-FF
8114                                # characters
8115        my $string_range = "";  # The string rep. of the above FF ranges
8116        my $range_count = 0;    # How many ranges in $string_rage
8117
8118        my @lows_invlist;       # The inversion list of the 0-FF code points
8119        my $first_non_control = ord(" ");   # Everything below this is a
8120                                            # control, on ASCII or EBCDIC
8121        my $max_table_code_point = $self->max;
8122
8123        # On ASCII platforms, the range 80-FF contains no printables.
8124        my $highest_printable = ((main::NON_ASCII_PLATFORM) ? 255 : 126);
8125
8126
8127        # Look through the first few ranges matched by this table.
8128        $self->reset_each_range;    # Defensive programming
8129        while (defined (my $range = $self->each_range())) {
8130            my $start = $range->start;
8131            my $end = $range->end;
8132
8133            # Accumulate an inversion list of the 00-FF code points
8134            if ($start < 256 && ($start > 0 || $end < 256)) {
8135                push @lows_invlist, $start;
8136                push @lows_invlist, 1 + (($end < 256) ? $end : 255);
8137
8138                # Get next range if there are more ranges below 256
8139                next if $end < 256 && $end < $max_table_code_point;
8140
8141                # If the range straddles the 255/256 boundary, we split it
8142                # there.  We already added above the low portion to the
8143                # inversion list
8144                $start = 256 if $end > 256;
8145            }
8146
8147            # Here, @lows_invlist contains the code points below 256, and
8148            # there is no other range, or the current one starts at or above
8149            # 256.  Generate the [char class] for the 0-255 ones.
8150            while (@lows_invlist) {
8151
8152                # If this range (necessarily the first one, by the way) starts
8153                # at 0 ...
8154                if ($lows_invlist[0] == 0) {
8155
8156                    # If it ends within the block of controls, that means that
8157                    # some controls are in it and some aren't.  Since Unicode
8158                    # properties pretty much only know about a few of the
8159                    # controls, like \n, \t, this means that its one of them
8160                    # that isn't in the range.  Complement the inversion list
8161                    # which will likely cause these to be output using their
8162                    # mnemonics, hence being clearer.
8163                    if ($lows_invlist[1] < $first_non_control) {
8164                        $lows_string .= '^';
8165                        shift @lows_invlist;
8166                        push @lows_invlist, 256;
8167                    }
8168                    elsif ($lows_invlist[1] <= $highest_printable) {
8169
8170                        # Here, it extends into the printables block.  Split
8171                        # into two ranges so that the controls are separate.
8172                        $lows_string .= sprintf "\\x00-\\x%02x",
8173                                                    $first_non_control - 1;
8174                        $lows_invlist[0] = $first_non_control;
8175                    }
8176                }
8177
8178                # If the range completely contains the printables, don't
8179                # individually spell out the printables.
8180                if (    $lows_invlist[0] <= $first_non_control
8181                    && $lows_invlist[1] > $highest_printable)
8182                {
8183                    $lows_string .= sprintf "\\x%02x-\\x%02x",
8184                                        $lows_invlist[0], $lows_invlist[1] - 1;
8185                    shift @lows_invlist;
8186                    shift @lows_invlist;
8187                    next;
8188                }
8189
8190                # Here, the range may include some but not all printables.
8191                # Look at each one individually
8192                foreach my $ord (shift @lows_invlist .. shift(@lows_invlist) - 1) {
8193                    my $char = chr $ord;
8194
8195                    # If there is already something in the list, an
8196                    # alphanumeric char could be the next in sequence.  If so,
8197                    # we start or extend a range.  That is, we could have so
8198                    # far something like 'a-c', and the next char is a 'd', so
8199                    # we change it to 'a-d'.  We use native_to_unicode()
8200                    # because a-z on EBCDIC means 26 chars, and excludes the
8201                    # gap ones.
8202                    if ($lows_string ne "" && $char =~ /[[:alnum:]]/) {
8203                        my $prev = substr($lows_string, -1);
8204                        if (   $prev !~ /[[:alnum:]]/
8205                            ||   utf8::native_to_unicode(ord $prev) + 1
8206                              != utf8::native_to_unicode(ord $char))
8207                        {
8208                            # Not extending the range
8209                            $lows_string .= $char;
8210                        }
8211                        elsif (   length $lows_string > 1
8212                               && substr($lows_string, -2, 1) eq '-')
8213                        {
8214                            # We had a sequence like '-c' and the current
8215                            # character is 'd'.  Extend the range.
8216                            substr($lows_string, -1, 1) = $char;
8217                        }
8218                        else {
8219                            # We had something like 'd' and this is 'e'.
8220                            # Start a range.
8221                            $lows_string .= "-$char";
8222                        }
8223                    }
8224                    elsif ($char =~ /[[:graph:]]/) {
8225
8226                        # We output a graphic char as-is, preceded by a
8227                        # backslash if it is a metacharacter
8228                        $lows_string .= '\\'
8229                                if $char =~ /[\\\^\$\@\%\|()\[\]\{\}\-\/"']/;
8230                        $lows_string .= $char;
8231                    } # Otherwise use mnemonic for any that have them
8232                    elsif ($char =~ /[\a]/) {
8233                        $lows_string .= '\a';
8234                    }
8235                    elsif ($char =~ /[\b]/) {
8236                        $lows_string .= '\b';
8237                    }
8238                    elsif ($char eq "\e") {
8239                        $lows_string .= '\e';
8240                    }
8241                    elsif ($char eq "\f") {
8242                        $lows_string .= '\f';
8243                    }
8244                    elsif ($char eq "\cK") {
8245                        $lows_string .= '\cK';
8246                    }
8247                    elsif ($char eq "\n") {
8248                        $lows_string .= '\n';
8249                    }
8250                    elsif ($char eq "\r") {
8251                        $lows_string .= '\r';
8252                    }
8253                    elsif ($char eq "\t") {
8254                        $lows_string .= '\t';
8255                    }
8256                    else {
8257
8258                        # Here is a non-graphic without a mnemonic.  We use \x
8259                        # notation.  But if the ordinal of this is one above
8260                        # the previous, create or extend the range
8261                        my $hex_representation = sprintf("%02x", ord $char);
8262                        if (   length $lows_string >= 4
8263                            && substr($lows_string, -4, 2) eq '\\x'
8264                            && hex(substr($lows_string, -2)) + 1 == ord $char)
8265                        {
8266                            if (       length $lows_string >= 5
8267                                &&     substr($lows_string, -5, 1) eq '-'
8268                                && (   length $lows_string == 5
8269                                    || substr($lows_string, -6, 1) ne '\\'))
8270                            {
8271                                substr($lows_string, -2) = $hex_representation;
8272                            }
8273                            else {
8274                                $lows_string .= '-\\x' . $hex_representation;
8275                            }
8276                        }
8277                        else {
8278                            $lows_string .= '\\x' . $hex_representation;
8279                        }
8280                    }
8281                }
8282            }
8283
8284            # Done with assembling the string of all lows.  If there are only
8285            # lows in the property, are completely done.
8286            if ($max_table_code_point < 256) {
8287                $self->reset_each_range;
8288                last;
8289            }
8290
8291            # Otherwise, quit if reached max number of non-lows ranges.  If
8292            # there are lows, count them as one unit towards the maximum.
8293            $range_count++;
8294            if ($range_count > (($lows_string eq "") ? $max_ranges : $max_ranges - 1)) {
8295                $string_range .= " ...";
8296                $self->reset_each_range;
8297                last;
8298            }
8299
8300            # Otherwise add this range.
8301            $string_range .= ", " if $string_range ne "";
8302            if ($start == $end) {
8303                $string_range .= sprintf("U+%04X", $start);
8304            }
8305            elsif ($end >= $MAX_WORKING_CODEPOINT)  {
8306                $string_range .= sprintf("U+%04X..infinity", $start);
8307            }
8308            else  {
8309                $string_range .= sprintf("U+%04X..%04X",
8310                                        $start, $end);
8311            }
8312        }
8313
8314        # Done with all the ranges we're going to look at.  Assemble the
8315        # definition from the lows + non-lows.
8316
8317        if ($lows_string ne "" || $string_range ne "") {
8318            if ($lows_string ne "") {
8319                $definition .= "[$lows_string]";
8320                $definition .= ", " if $string_range;
8321            }
8322            $definition .= $string_range;
8323        }
8324
8325        return $definition;
8326    }
8327
8328    sub write($self) {
8329        return $self->SUPER::write(0); # No adjustments
8330    }
8331
8332    # $leader - Should only be called on the leader table of an equivalent group
8333    sub set_final_comment($leader) {
8334        # This creates a comment for the file that is to hold the match table
8335        # $self.  It is somewhat convoluted to make the English read nicely,
8336        # but, heh, it's just a comment.
8337        # This should be called only with the leader match table of all the
8338        # ones that share the same file.  It lists all such tables, ordered so
8339        # that related ones are together.
8340
8341        return unless $debugging_build;
8342
8343        my $addr = do { no overloading; pack 'J', $leader; };
8344
8345        if ($leader{$addr} != $leader) {
8346            Carp::my_carp_bug(<<END
8347set_final_comment() must be called on a leader table, which $leader is not.
8348It is equivalent to $leader{$addr}.  No comment created
8349END
8350            );
8351            return;
8352        }
8353
8354        # Get the number of code points matched by each of the tables in this
8355        # file, and add underscores for clarity.
8356        my $count = $leader->count;
8357        my $unicode_count;
8358        my $non_unicode_string;
8359        if ($count > $MAX_UNICODE_CODEPOINTS) {
8360            $unicode_count = $count - ($MAX_WORKING_CODEPOINT
8361                                       - $MAX_UNICODE_CODEPOINT);
8362            $non_unicode_string = "All above-Unicode code points match as well, and are also returned";
8363        }
8364        else {
8365            $unicode_count = $count;
8366            $non_unicode_string = "";
8367        }
8368        my $string_count = main::clarify_code_point_count($unicode_count);
8369
8370        my $loose_count = 0;        # how many aliases loosely matched
8371        my $compound_name = "";     # ? Are any names compound?, and if so, an
8372                                    # example
8373        my $properties_with_compound_names = 0;    # count of these
8374
8375
8376        my %flags;              # The status flags used in the file
8377        my $total_entries = 0;  # number of entries written in the comment
8378        my $matches_comment = ""; # The portion of the comment about the
8379                                  # \p{}'s
8380        my @global_comments;    # List of all the tables' comments that are
8381                                # there before this routine was called.
8382        my $has_ucd_alias = 0;  # If there is an alias that is accessible via
8383                                # Unicode::UCD.  If not, then don't say it is
8384                                # in the comment
8385
8386        # Get list of all the parent tables that are equivalent to this one
8387        # (including itself).
8388        my @parents = grep { $parent{main::objaddr $_} == $_ }
8389                            main::uniques($leader, @{$equivalents{$addr}});
8390        my $has_unrelated = (@parents >= 2);  # boolean, ? are there unrelated
8391                                              # tables
8392        for my $parent (@parents) {
8393
8394            my $property = $parent->property;
8395
8396            # Special case 'N' tables in properties with two match tables when
8397            # the other is a 'Y' one.  These are likely to be binary tables,
8398            # but not necessarily.  In either case, \P{} will match the
8399            # complement of \p{}, and so if something is a synonym of \p, the
8400            # complement of that something will be the synonym of \P.  This
8401            # would be true of any property with just two match tables, not
8402            # just those whose values are Y and N; but that would require a
8403            # little extra work, and there are none such so far in Unicode.
8404            my $perl_p = 'p';        # which is it?  \p{} or \P{}
8405            my @yes_perl_synonyms;   # list of any synonyms for the 'Y' table
8406
8407            if (scalar $property->tables == 2
8408                && $parent == $property->table('N')
8409                && defined (my $yes = $property->table('Y')))
8410            {
8411                my $yes_addr = do { no overloading; pack 'J', $yes; };
8412                @yes_perl_synonyms
8413                    = grep { $_->property == $perl }
8414                                    main::uniques($yes,
8415                                                $parent{$yes_addr},
8416                                                $parent{$yes_addr}->children);
8417
8418                # But these synonyms are \P{} ,not \p{}
8419                $perl_p = 'P';
8420            }
8421
8422            my @description;        # Will hold the table description
8423            my @note;               # Will hold the table notes.
8424            my @conflicting;        # Will hold the table conflicts.
8425
8426            # Look at the parent, any yes synonyms, and all the children
8427            my $parent_addr = do { no overloading; pack 'J', $parent; };
8428            for my $table ($parent,
8429                           @yes_perl_synonyms,
8430                           @{$children{$parent_addr}})
8431            {
8432                my $table_addr = do { no overloading; pack 'J', $table; };
8433                my $table_property = $table->property;
8434
8435                # Tables are separated by a blank line to create a grouping.
8436                $matches_comment .= "\n" if $matches_comment;
8437
8438                # The table is named based on the property and value
8439                # combination it is for, like script=greek.  But there may be
8440                # a number of synonyms for each side, like 'sc' for 'script',
8441                # and 'grek' for 'greek'.  Any combination of these is a valid
8442                # name for this table.  In this case, there are three more,
8443                # 'sc=grek', 'sc=greek', and 'script='grek'.  Rather than
8444                # listing all possible combinations in the comment, we make
8445                # sure that each synonym occurs at least once, and add
8446                # commentary that the other combinations are possible.
8447                # Because regular expressions don't recognize things like
8448                # \p{jsn=}, only look at non-null right-hand-sides
8449                my @property_aliases = grep { $_->status ne $INTERNAL_ALIAS } $table_property->aliases;
8450                my @table_aliases = grep { $_->name ne "" } $table->aliases;
8451
8452                # The alias lists above are already ordered in the order we
8453                # want to output them.  To ensure that each synonym is listed,
8454                # we must use the max of the two numbers.  But if there are no
8455                # legal synonyms (nothing in @table_aliases), then we don't
8456                # list anything.
8457                my $listed_combos = (@table_aliases)
8458                                    ?  main::max(scalar @table_aliases,
8459                                                 scalar @property_aliases)
8460                                    : 0;
8461                trace "$listed_combos, tables=", scalar @table_aliases, "; property names=", scalar @property_aliases if main::DEBUG;
8462
8463                my $property_had_compound_name = 0;
8464
8465                for my $i (0 .. $listed_combos - 1) {
8466                    $total_entries++;
8467
8468                    # The current alias for the property is the next one on
8469                    # the list, or if beyond the end, start over.  Similarly
8470                    # for the table (\p{prop=table})
8471                    my $property_alias = $property_aliases
8472                                            [$i % @property_aliases]->name;
8473                    my $table_alias_object = $table_aliases
8474                                                        [$i % @table_aliases];
8475                    my $table_alias = $table_alias_object->name;
8476                    my $loose_match = $table_alias_object->loose_match;
8477                    $has_ucd_alias |= $table_alias_object->ucd;
8478
8479                    if ($table_alias !~ /\D/) { # Clarify large numbers.
8480                        $table_alias = main::clarify_number($table_alias)
8481                    }
8482
8483                    # Add a comment for this alias combination
8484                    my $current_match_comment;
8485                    if ($table_property == $perl) {
8486                        $current_match_comment = "\\$perl_p"
8487                                                    . "{$table_alias}";
8488                    }
8489                    else {
8490                        $current_match_comment
8491                                        = "\\p{$property_alias=$table_alias}";
8492                        $property_had_compound_name = 1;
8493                    }
8494
8495                    # Flag any abnormal status for this table.
8496                    my $flag = $property->status
8497                                || $table->status
8498                                || $table_alias_object->status;
8499                    if ($flag && $flag ne $PLACEHOLDER) {
8500                        $flags{$flag} = $status_past_participles{$flag};
8501                    }
8502
8503                    $loose_count++;
8504
8505                    # Pretty up the comment.  Note the \b; it says don't make
8506                    # this line a continuation.
8507                    $matches_comment .= sprintf("\b%-1s%-s%s\n",
8508                                        $flag,
8509                                        " " x 7,
8510                                        $current_match_comment);
8511                } # End of generating the entries for this table.
8512
8513                # Save these for output after this group of related tables.
8514                push @description, $table->description;
8515                push @note, $table->note;
8516                push @conflicting, $table->conflicting;
8517
8518                # And this for output after all the tables.
8519                push @global_comments, $table->comment;
8520
8521                # Compute an alternate compound name using the final property
8522                # synonym and the first table synonym with a colon instead of
8523                # the equal sign used elsewhere.
8524                if ($property_had_compound_name) {
8525                    $properties_with_compound_names ++;
8526                    if (! $compound_name || @property_aliases > 1) {
8527                        $compound_name = $property_aliases[-1]->name
8528                                        . ': '
8529                                        . $table_aliases[0]->name;
8530                    }
8531                }
8532            } # End of looping through all children of this table
8533
8534            # Here have assembled in $matches_comment all the related tables
8535            # to the current parent (preceded by the same info for all the
8536            # previous parents).  Put out information that applies to all of
8537            # the current family.
8538            if (@conflicting) {
8539
8540                # But output the conflicting information now, as it applies to
8541                # just this table.
8542                my $conflicting = join ", ", @conflicting;
8543                if ($conflicting) {
8544                    $matches_comment .= <<END;
8545
8546    Note that contrary to what you might expect, the above is NOT the same as
8547END
8548                    $matches_comment .= "any of: " if @conflicting > 1;
8549                    $matches_comment .= "$conflicting\n";
8550                }
8551            }
8552            if (@description) {
8553                $matches_comment .= "\n    Meaning: "
8554                                    . join('; ', @description)
8555                                    . "\n";
8556            }
8557            if (@note) {
8558                $matches_comment .= "\n    Note: "
8559                                    . join("\n    ", @note)
8560                                    . "\n";
8561            }
8562        } # End of looping through all tables
8563
8564        $matches_comment .= "\n$non_unicode_string\n" if $non_unicode_string;
8565
8566
8567        my $code_points;
8568        my $match;
8569        my $any_of_these;
8570        if ($unicode_count == 1) {
8571            $match = 'matches';
8572            $code_points = 'single code point';
8573        }
8574        else {
8575            $match = 'match';
8576            $code_points = "$string_count code points";
8577        }
8578
8579        my $synonyms;
8580        my $entries;
8581        if ($total_entries == 1) {
8582            $synonyms = "";
8583            $entries = 'entry';
8584            $any_of_these = 'this'
8585        }
8586        else {
8587            $synonyms = " any of the following regular expression constructs";
8588            $entries = 'entries';
8589            $any_of_these = 'any of these'
8590        }
8591
8592        my $comment = "";
8593        if ($has_ucd_alias) {
8594            $comment .= "Use Unicode::UCD::prop_invlist() to access the contents of this file.\n\n";
8595        }
8596        if ($has_unrelated) {
8597            $comment .= <<END;
8598This file is for tables that are not necessarily related:  To conserve
8599resources, every table that matches the identical set of code points in this
8600version of Unicode uses this file.  Each one is listed in a separate group
8601below.  It could be that the tables will match the same set of code points in
8602other Unicode releases, or it could be purely coincidence that they happen to
8603be the same in Unicode $unicode_version, and hence may not in other versions.
8604
8605END
8606        }
8607
8608        if (%flags) {
8609            foreach my $flag (sort keys %flags) {
8610                $comment .= <<END;
8611'$flag' below means that this form is $flags{$flag}.
8612END
8613                if ($flag eq $INTERNAL_ALIAS) {
8614                    $comment .= "DO NOT USE!!!";
8615                }
8616                else {
8617                    $comment .= "Consult $pod_file.pod";
8618                }
8619                $comment .= "\n";
8620            }
8621            $comment .= "\n";
8622        }
8623
8624        if ($total_entries == 0) {
8625            Carp::my_carp("No regular expression construct can match $leader, as all names for it are the null string.  Creating file anyway.");
8626            $comment .= <<END;
8627This file returns the $code_points in Unicode Version
8628$unicode_version for
8629$leader, but it is inaccessible through Perl regular expressions, as
8630"\\p{prop=}" is not recognized.
8631END
8632
8633        } else {
8634            $comment .= <<END;
8635This file returns the $code_points in Unicode Version
8636$unicode_version that
8637$match$synonyms:
8638
8639$matches_comment
8640$pod_file.pod should be consulted for the syntax rules for $any_of_these,
8641including if adding or subtracting white space, underscore, and hyphen
8642characters matters or doesn't matter, and other permissible syntactic
8643variants.  Upper/lower case distinctions never matter.
8644END
8645
8646        }
8647        if ($compound_name) {
8648            $comment .= <<END;
8649
8650A colon can be substituted for the equals sign, and
8651END
8652            if ($properties_with_compound_names > 1) {
8653                $comment .= <<END;
8654within each group above,
8655END
8656            }
8657            $compound_name = sprintf("%-8s\\p{%s}", " ", $compound_name);
8658
8659            # Note the \b below, it says don't make that line a continuation.
8660            $comment .= <<END;
8661anything to the left of the equals (or colon) can be combined with anything to
8662the right.  Thus, for example,
8663$compound_name
8664\bis also valid.
8665END
8666        }
8667
8668        # And append any comment(s) from the actual tables.  They are all
8669        # gathered here, so may not read all that well.
8670        if (@global_comments) {
8671            $comment .= "\n" . join("\n\n", @global_comments) . "\n";
8672        }
8673
8674        if ($count) {   # The format differs if no code points, and needs no
8675                        # explanation in that case
8676            if ($leader->write_as_invlist) {
8677                $comment.= <<END;
8678
8679The first data line of this file begins with the letter V to indicate it is in
8680inversion list format.  The number following the V gives the number of lines
8681remaining.  Each of those remaining lines is a single number representing the
8682starting code point of a range which goes up to but not including the number
8683on the next line; The 0th, 2nd, 4th... ranges are for code points that match
8684the property; the 1st, 3rd, 5th... are ranges of code points that don't match
8685the property.  The final line's range extends to the platform's infinity.
8686END
8687            }
8688            else {
8689                $comment.= <<END;
8690The format of the lines of this file is:
8691START\\tSTOP\\twhere START is the starting code point of the range, in hex;
8692STOP is the ending point, or if omitted, the range has just one code point.
8693END
8694            }
8695            if ($leader->output_range_counts) {
8696                $comment .= <<END;
8697Numbers in comments in [brackets] indicate how many code points are in the
8698range.
8699END
8700            }
8701        }
8702
8703        $leader->set_comment(main::join_lines($comment));
8704        return;
8705    }
8706
8707    # Accessors for the underlying list
8708    for my $sub (qw(
8709                    get_valid_code_point
8710                    get_invalid_code_point
8711                ))
8712    {
8713        no strict "refs";
8714        *$sub = sub {
8715            use strict "refs";
8716            my $self = shift;
8717
8718            return $self->_range_list->$sub(@_);
8719        }
8720    }
8721} # End closure for Match_Table
8722
8723package Property;
8724
8725# The Property class represents a Unicode property, or the $perl
8726# pseudo-property.  It contains a map table initialized empty at construction
8727# time, and for properties accessible through regular expressions, various
8728# match tables, created through the add_match_table() method, and referenced
8729# by the table('NAME') or tables() methods, the latter returning a list of all
8730# of the match tables.  Otherwise table operations implicitly are for the map
8731# table.
8732#
8733# Most of the data in the property is actually about its map table, so it
8734# mostly just uses that table's accessors for most methods.  The two could
8735# have been combined into one object, but for clarity because of their
8736# differing semantics, they have been kept separate.  It could be argued that
8737# the 'file' and 'directory' fields should be kept with the map table.
8738#
8739# Each property has a type.  This can be set in the constructor, or in the
8740# set_type accessor, but mostly it is figured out by the data.  Every property
8741# starts with unknown type, overridden by a parameter to the constructor, or
8742# as match tables are added, or ranges added to the map table, the data is
8743# inspected, and the type changed.  After the table is mostly or entirely
8744# filled, compute_type() should be called to finalize they analysis.
8745#
8746# There are very few operations defined.  One can safely remove a range from
8747# the map table, and property_add_or_replace_non_nulls() adds the maps from another
8748# table to this one, replacing any in the intersection of the two.
8749
8750sub standardize { return main::standardize($_[0]); }
8751sub trace { return main::trace(@_) if main::DEBUG && $to_trace }
8752
8753{   # Closure
8754
8755    # This hash will contain as keys, all the aliases of all properties, and
8756    # as values, pointers to their respective property objects.  This allows
8757    # quick look-up of a property from any of its names.
8758    my %alias_to_property_of;
8759
8760    sub dump_alias_to_property_of {
8761        # For debugging
8762
8763        print "\n", main::simple_dumper (\%alias_to_property_of), "\n";
8764        return;
8765    }
8766
8767    sub property_ref($name) {
8768        # This is a package subroutine, not called as a method.
8769        # If the single parameter is a literal '*' it returns a list of all
8770        # defined properties.
8771        # Otherwise, the single parameter is a name, and it returns a pointer
8772        # to the corresponding property object, or undef if none.
8773        #
8774        # Properties can have several different names.  The 'standard' form of
8775        # each of them is stored in %alias_to_property_of as they are defined.
8776        # But it's possible that this subroutine will be called with some
8777        # variant, so if the initial lookup fails, it is repeated with the
8778        # standardized form of the input name.  If found, besides returning the
8779        # result, the input name is added to the list so future calls won't
8780        # have to do the conversion again.
8781
8782        if (! defined $name) {
8783            Carp::my_carp_bug("Undefined input property.  No action taken.");
8784            return;
8785        }
8786
8787        return main::uniques(values %alias_to_property_of) if $name eq '*';
8788
8789        # Return cached result if have it.
8790        my $result = $alias_to_property_of{$name};
8791        return $result if defined $result;
8792
8793        # Convert the input to standard form.
8794        my $standard_name = standardize($name);
8795
8796        $result = $alias_to_property_of{$standard_name};
8797        return unless defined $result;        # Don't cache undefs
8798
8799        # Cache the result before returning it.
8800        $alias_to_property_of{$name} = $result;
8801        return $result;
8802    }
8803
8804
8805    main::setup_package();
8806
8807    my %map;
8808    # A pointer to the map table object for this property
8809    main::set_access('map', \%map);
8810
8811    my %full_name;
8812    # The property's full name.  This is a duplicate of the copy kept in the
8813    # map table, but is needed because stringify needs it during
8814    # construction of the map table, and then would have a chicken before egg
8815    # problem.
8816    main::set_access('full_name', \%full_name, 'r');
8817
8818    my %table_ref;
8819    # This hash will contain as keys, all the aliases of any match tables
8820    # attached to this property, and as values, the pointers to their
8821    # respective tables.  This allows quick look-up of a table from any of its
8822    # names.
8823    main::set_access('table_ref', \%table_ref);
8824
8825    my %type;
8826    # The type of the property, $ENUM, $BINARY, etc
8827    main::set_access('type', \%type, 'r');
8828
8829    my %file;
8830    # The filename where the map table will go (if actually written).
8831    # Normally defaulted, but can be overridden.
8832    main::set_access('file', \%file, 'r', 's');
8833
8834    my %directory;
8835    # The directory where the map table will go (if actually written).
8836    # Normally defaulted, but can be overridden.
8837    main::set_access('directory', \%directory, 's');
8838
8839    my %pseudo_map_type;
8840    # This is used to affect the calculation of the map types for all the
8841    # ranges in the table.  It should be set to one of the values that signify
8842    # to alter the calculation.
8843    main::set_access('pseudo_map_type', \%pseudo_map_type, 'r');
8844
8845    my %has_only_code_point_maps;
8846    # A boolean used to help in computing the type of data in the map table.
8847    main::set_access('has_only_code_point_maps', \%has_only_code_point_maps);
8848
8849    my %unique_maps;
8850    # A list of the first few distinct mappings this property has.  This is
8851    # used to disambiguate between binary and enum property types, so don't
8852    # have to keep more than three.
8853    main::set_access('unique_maps', \%unique_maps);
8854
8855    my %pre_declared_maps;
8856    # A boolean that gives whether the input data should declare all the
8857    # tables used, or not.  If the former, unknown ones raise a warning.
8858    main::set_access('pre_declared_maps',
8859                                    \%pre_declared_maps, 'r', 's');
8860
8861    my %match_subdir;
8862    # For properties whose shortest names are too long for a DOS 8.3
8863    # filesystem to distinguish between, this is used to manually give short
8864    # names for the directory name immediately under $match_tables that the
8865    # match tables for this property should be placed in.
8866    main::set_access('match_subdir', \%match_subdir, 'r');
8867
8868    my %has_dependency;
8869    # A boolean that gives whether some table somewhere is defined as the
8870    # complement of a table in this property.  This is a crude, but currently
8871    # sufficient, mechanism to make this property not get destroyed before
8872    # what is dependent on it is.  Other dependencies could be added, so the
8873    # name was chosen to reflect a more general situation than actually is
8874    # currently the case.
8875    main::set_access('has_dependency', \%has_dependency, 'r', 's');
8876
8877    sub new {
8878        # The only required parameter is the positionally first, name.  All
8879        # other parameters are key => value pairs.  See the documentation just
8880        # above for the meanings of the ones not passed directly on to the map
8881        # table constructor.
8882
8883        my $class = shift;
8884        my $name = shift || "";
8885
8886        my $self = property_ref($name);
8887        if (defined $self) {
8888            my $options_string = join ", ", @_;
8889            $options_string = ".  Ignoring options $options_string" if $options_string;
8890            Carp::my_carp("$self is already in use.  Using existing one$options_string;");
8891            return $self;
8892        }
8893
8894        my %args = @_;
8895
8896        $self = bless \do { my $anonymous_scalar }, $class;
8897        my $addr = do { no overloading; pack 'J', $self; };
8898
8899        $directory{$addr} = delete $args{'Directory'};
8900        $file{$addr} = delete $args{'File'};
8901        $full_name{$addr} = delete $args{'Full_Name'} || $name;
8902        $type{$addr} = delete $args{'Type'} || $UNKNOWN;
8903        $pseudo_map_type{$addr} = delete $args{'Map_Type'};
8904        $pre_declared_maps{$addr} = delete $args{'Pre_Declared_Maps'}
8905                                    # Starting in this release, property
8906                                    # values should be defined for all
8907                                    # properties, except those overriding this
8908                                    // $v_version ge v5.1.0;
8909        $match_subdir{$addr} = delete $args{'Match_SubDir'};
8910
8911        # Rest of parameters passed on.
8912
8913        $has_only_code_point_maps{$addr} = 1;
8914        $table_ref{$addr} = { };
8915        $unique_maps{$addr} = { };
8916        $has_dependency{$addr} = 0;
8917
8918        $map{$addr} = Map_Table->new($name,
8919                                    Full_Name => $full_name{$addr},
8920                                    _Alias_Hash => \%alias_to_property_of,
8921                                    _Property => $self,
8922                                    %args);
8923        return $self;
8924    }
8925
8926    # See this program's beginning comment block about overloading the copy
8927    # constructor.  Few operations are defined on properties, but a couple are
8928    # useful.  It is safe to take the inverse of a property, and to remove a
8929    # single code point from it.
8930    use overload
8931        fallback => 0,
8932        qw("") => "_operator_stringify",
8933        "." => \&main::_operator_dot,
8934        ".=" => \&main::_operator_dot_equal,
8935        '==' => \&main::_operator_equal,
8936        '!=' => \&main::_operator_not_equal,
8937        '=' => sub { return shift },
8938        '-=' => "_minus_and_equal",
8939    ;
8940
8941    sub _operator_stringify {
8942        return "Property '" .  shift->full_name . "'";
8943    }
8944
8945    sub _minus_and_equal($self, $other, $reversed=0) {
8946        # Remove a single code point from the map table of a property.
8947        if (ref $other) {
8948            Carp::my_carp_bug("Bad news.  Can't cope with a "
8949                        . ref($other)
8950                        . " argument to '-='.  Subtraction ignored.");
8951            return $self;
8952        }
8953        elsif ($reversed) {   # Shouldn't happen in a -=, but just in case
8954            Carp::my_carp_bug("Bad news.  Can't cope with subtracting a "
8955            . ref $self
8956            . " from a non-object.  undef returned.");
8957            return;
8958        }
8959        else {
8960            no overloading;
8961            $map{pack 'J', $self}->delete_range($other, $other);
8962        }
8963        return $self;
8964    }
8965
8966    sub add_match_table {
8967        # Add a new match table for this property, with name given by the
8968        # parameter.  It returns a pointer to the table.
8969
8970        my $self = shift;
8971        my $name = shift;
8972        my %args = @_;
8973
8974        my $addr = do { no overloading; pack 'J', $self; };
8975
8976        my $table = $table_ref{$addr}{$name};
8977        my $standard_name = main::standardize($name);
8978        if (defined $table
8979            || (defined ($table = $table_ref{$addr}{$standard_name})))
8980        {
8981            Carp::my_carp("Table '$name' in $self is already in use.  Using existing one");
8982            $table_ref{$addr}{$name} = $table;
8983            return $table;
8984        }
8985        else {
8986
8987            # See if this is a perl extension, if not passed in.
8988            my $perl_extension = delete $args{'Perl_Extension'};
8989            $perl_extension
8990                        = $self->perl_extension if ! defined $perl_extension;
8991
8992            my $fate;
8993            my $suppression_reason = "";
8994            if ($self->name =~ /^_/) {
8995                $fate = $SUPPRESSED;
8996                $suppression_reason = "Parent property is internal only";
8997            }
8998            elsif ($self->fate >= $SUPPRESSED) {
8999                $fate = $self->fate;
9000                $suppression_reason = $why_suppressed{$self->complete_name};
9001
9002            }
9003            elsif ($name =~ /^_/) {
9004                $fate = $INTERNAL_ONLY;
9005            }
9006            $table = Match_Table->new(
9007                                Name => $name,
9008                                Perl_Extension => $perl_extension,
9009                                _Alias_Hash => $table_ref{$addr},
9010                                _Property => $self,
9011                                Fate => $fate,
9012                                Suppression_Reason => $suppression_reason,
9013                                Status => $self->status,
9014                                _Status_Info => $self->status_info,
9015                                %args);
9016            return unless defined $table;
9017        }
9018
9019        # Save the names for quick look up
9020        $table_ref{$addr}{$standard_name} = $table;
9021        $table_ref{$addr}{$name} = $table;
9022
9023        # Perhaps we can figure out the type of this property based on the
9024        # fact of adding this match table.  First, string properties don't
9025        # have match tables; second, a binary property can't have 3 match
9026        # tables
9027        if ($type{$addr} == $UNKNOWN) {
9028            $type{$addr} = $NON_STRING;
9029        }
9030        elsif ($type{$addr} == $STRING) {
9031            Carp::my_carp("$self Added a match table '$name' to a string property '$self'.  Changed it to a non-string property.  Bad News.");
9032            $type{$addr} = $NON_STRING;
9033        }
9034        elsif ($type{$addr} != $ENUM && $type{$addr} != $FORCED_BINARY) {
9035            if (scalar main::uniques(values %{$table_ref{$addr}}) > 2) {
9036                if ($type{$addr} == $BINARY) {
9037                    Carp::my_carp("$self now has more than 2 tables (with the addition of '$name'), and so is no longer binary.  Changing its type to 'enum'.  Bad News.");
9038                }
9039                $type{$addr} = $ENUM;
9040            }
9041        }
9042
9043        return $table;
9044    }
9045
9046    sub delete_match_table($self, $table_to_remove) {
9047        # Delete the table referred to by $2 from the property $1.
9048        my $addr = do { no overloading; pack 'J', $self; };
9049
9050        # Remove all names that refer to it.
9051        foreach my $key (keys %{$table_ref{$addr}}) {
9052            delete $table_ref{$addr}{$key}
9053                                if $table_ref{$addr}{$key} == $table_to_remove;
9054        }
9055
9056        $table_to_remove->DESTROY;
9057        return;
9058    }
9059
9060    sub table($self, $name) {
9061        # Return a pointer to the match table (with name given by the
9062        # parameter) associated with this property; undef if none.
9063        my $addr = do { no overloading; pack 'J', $self; };
9064
9065        return $table_ref{$addr}{$name} if defined $table_ref{$addr}{$name};
9066
9067        # If quick look-up failed, try again using the standard form of the
9068        # input name.  If that succeeds, cache the result before returning so
9069        # won't have to standardize this input name again.
9070        my $standard_name = main::standardize($name);
9071        return unless defined $table_ref{$addr}{$standard_name};
9072
9073        $table_ref{$addr}{$name} = $table_ref{$addr}{$standard_name};
9074        return $table_ref{$addr}{$name};
9075    }
9076
9077    sub tables {
9078        # Return a list of pointers to all the match tables attached to this
9079        # property
9080
9081        no overloading;
9082        return main::uniques(values %{$table_ref{pack 'J', shift}});
9083    }
9084
9085    sub directory {
9086        # Returns the directory the map table for this property should be
9087        # output in.  If a specific directory has been specified, that has
9088        # priority;  'undef' is returned if the type isn't defined;
9089        # or $map_directory for everything else.
9090
9091        my $addr = do { no overloading; pack 'J', shift; };
9092
9093        return $directory{$addr} if defined $directory{$addr};
9094        return undef if $type{$addr} == $UNKNOWN;
9095        return $map_directory;
9096    }
9097
9098    sub swash_name($self) {
9099        # Return the name that is used to both:
9100        #   1)  Name the file that the map table is written to.
9101        #   2)  The name of swash related stuff inside that file.
9102        # The reason for this is that the Perl core historically has used
9103        # certain names that aren't the same as the Unicode property names.
9104        # To continue using these, $file is hard-coded in this file for those,
9105        # but otherwise the standard name is used.  This is different from the
9106        # external_name, so that the rest of the files, like in lib can use
9107        # the standard name always, without regard to historical precedent.
9108        my $addr = do { no overloading; pack 'J', $self; };
9109
9110        # Swash names are used only on either
9111        # 1) legacy-only properties, because the formats for these are
9112        #    unchangeable, and they have had these lines in them; or
9113        # 2) regular or internal-only map tables
9114        # 3) otherwise there should be no access to the
9115        #    property map table from other parts of Perl.
9116        return if $map{$addr}->fate != $ORDINARY
9117                  && $map{$addr}->fate != $LEGACY_ONLY
9118                  && ! ($map{$addr}->name =~ /^_/
9119                        && $map{$addr}->fate == $INTERNAL_ONLY);
9120
9121        return $file{$addr} if defined $file{$addr};
9122        return $map{$addr}->external_name;
9123    }
9124
9125    sub to_create_match_tables($self) {
9126        # Returns a boolean as to whether or not match tables should be
9127        # created for this property.
9128
9129        # The whole point of this pseudo property is match tables.
9130        return 1 if $self == $perl;
9131
9132        my $addr = do { no overloading; pack 'J', $self; };
9133
9134        # Don't generate tables of code points that match the property values
9135        # of a string property.  Such a list would most likely have many
9136        # property values, each with just one or very few code points mapping
9137        # to it.
9138        return 0 if $type{$addr} == $STRING;
9139
9140        # Otherwise, do.
9141        return 1;
9142    }
9143
9144    sub property_add_or_replace_non_nulls($self, $other) {
9145        # This adds the mappings in the property $other to $self.  Non-null
9146        # mappings from $other override those in $self.  It essentially merges
9147        # the two properties, with the second having priority except for null
9148        # mappings.
9149
9150        if (! $other->isa(__PACKAGE__)) {
9151            Carp::my_carp_bug("$other should be a "
9152                            . __PACKAGE__
9153                            . ".  Not a '"
9154                            . ref($other)
9155                            . "'.  Not added;");
9156            return;
9157        }
9158
9159        no overloading;
9160        return $map{pack 'J', $self}->map_add_or_replace_non_nulls($map{pack 'J', $other});
9161    }
9162
9163    sub set_proxy_for {
9164        # Certain tables are not generally written out to files, but
9165        # Unicode::UCD has the intelligence to know that the file for $self
9166        # can be used to reconstruct those tables.  This routine just changes
9167        # things so that UCD pod entries for those suppressed tables are
9168        # generated, so the fact that a proxy is used is invisible to the
9169        # user.
9170
9171        my $self = shift;
9172
9173        foreach my $property_name (@_) {
9174            my $ref = property_ref($property_name);
9175            next if $ref->to_output_map;
9176            $ref->set_fate($MAP_PROXIED);
9177        }
9178    }
9179
9180    sub set_type($self, $type) {
9181        # Set the type of the property.  Mostly this is figured out by the
9182        # data in the table.  But this is used to set it explicitly.  The
9183        # reason it is not a standard accessor is that when setting a binary
9184        # property, we need to make sure that all the true/false aliases are
9185        # present, as they were omitted in early Unicode releases.
9186
9187        if ($type != $ENUM
9188            && $type != $BINARY
9189            && $type != $FORCED_BINARY
9190            && $type != $STRING)
9191        {
9192            Carp::my_carp("Unrecognized type '$type'.  Type not set");
9193            return;
9194        }
9195
9196        { no overloading; $type{pack 'J', $self} = $type; }
9197        return if $type != $BINARY && $type != $FORCED_BINARY;
9198
9199        my $yes = $self->table('Y');
9200        $yes = $self->table('Yes') if ! defined $yes;
9201        $yes = $self->add_match_table('Y', Full_Name => 'Yes')
9202                                                            if ! defined $yes;
9203
9204        # Add aliases in order wanted, duplicates will be ignored.  We use a
9205        # binary property present in all releases for its ordered lists of
9206        # true/false aliases.  Note, that could run into problems in
9207        # outputting things in that we don't distinguish between the name and
9208        # full name of these.  Hopefully, if the table was already created
9209        # before this code is executed, it was done with these set properly.
9210        my $bm = property_ref("Bidi_Mirrored");
9211        foreach my $alias ($bm->table("Y")->aliases) {
9212            $yes->add_alias($alias->name);
9213        }
9214        my $no = $self->table('N');
9215        $no = $self->table('No') if ! defined $no;
9216        $no = $self->add_match_table('N', Full_Name => 'No') if ! defined $no;
9217        foreach my $alias ($bm->table("N")->aliases) {
9218            $no->add_alias($alias->name);
9219        }
9220
9221        return;
9222    }
9223
9224    sub add_map {
9225        # Add a map to the property's map table.  This also keeps
9226        # track of the maps so that the property type can be determined from
9227        # its data.
9228
9229        my $self = shift;
9230        my $start = shift;  # First code point in range
9231        my $end = shift;    # Final code point in range
9232        my $map = shift;    # What the range maps to.
9233        # Rest of parameters passed on.
9234
9235        my $addr = do { no overloading; pack 'J', $self; };
9236
9237        # If haven't the type of the property, gather information to figure it
9238        # out.
9239        if ($type{$addr} == $UNKNOWN) {
9240
9241            # If the map contains an interior blank or dash, or most other
9242            # nonword characters, it will be a string property.  This
9243            # heuristic may actually miss some string properties.  If so, they
9244            # may need to have explicit set_types called for them.  This
9245            # happens in the Unihan properties.
9246            if ($map =~ / (?<= . ) [ -] (?= . ) /x
9247                || $map =~ / [^\w.\/\ -]  /x)
9248            {
9249                $self->set_type($STRING);
9250
9251                # $unique_maps is used for disambiguating between ENUM and
9252                # BINARY later; since we know the property is not going to be
9253                # one of those, no point in keeping the data around
9254                undef $unique_maps{$addr};
9255            }
9256            else {
9257
9258                # Not necessarily a string.  The final decision has to be
9259                # deferred until all the data are in.  We keep track of if all
9260                # the values are code points for that eventual decision.
9261                $has_only_code_point_maps{$addr} &=
9262                                            $map =~ / ^ $code_point_re $/x;
9263
9264                # For the purposes of disambiguating between binary and other
9265                # enumerations at the end, we keep track of the first three
9266                # distinct property values.  Once we get to three, we know
9267                # it's not going to be binary, so no need to track more.
9268                if (scalar keys %{$unique_maps{$addr}} < 3) {
9269                    $unique_maps{$addr}{main::standardize($map)} = 1;
9270                }
9271            }
9272        }
9273
9274        # Add the mapping by calling our map table's method
9275        return $map{$addr}->add_map($start, $end, $map, @_);
9276    }
9277
9278    sub compute_type($self) {
9279        # Compute the type of the property: $ENUM, $STRING, or $BINARY.  This
9280        # should be called after the property is mostly filled with its maps.
9281        # We have been keeping track of what the property values have been,
9282        # and now have the necessary information to figure out the type.
9283
9284        my $addr = do { no overloading; pack 'J', $self; };
9285
9286        my $type = $type{$addr};
9287
9288        # If already have figured these out, no need to do so again, but we do
9289        # a double check on ENUMS to make sure that a string property hasn't
9290        # improperly been classified as an ENUM, so continue on with those.
9291        return if $type == $STRING
9292                  || $type == $BINARY
9293                  || $type == $FORCED_BINARY;
9294
9295        # If every map is to a code point, is a string property.
9296        if ($type == $UNKNOWN
9297            && ($has_only_code_point_maps{$addr}
9298                || (defined $map{$addr}->default_map
9299                    && $map{$addr}->default_map eq "")))
9300        {
9301            $self->set_type($STRING);
9302        }
9303        else {
9304
9305            # Otherwise, it is to some sort of enumeration.  (The case where
9306            # it is a Unicode miscellaneous property, and treated like a
9307            # string in this program is handled in add_map()).  Distinguish
9308            # between binary and some other enumeration type.  Of course, if
9309            # there are more than two values, it's not binary.  But more
9310            # subtle is the test that the default mapping is defined means it
9311            # isn't binary.  This in fact may change in the future if Unicode
9312            # changes the way its data is structured.  But so far, no binary
9313            # properties ever have @missing lines for them, so the default map
9314            # isn't defined for them.  The few properties that are two-valued
9315            # and aren't considered binary have the default map defined
9316            # starting in Unicode 5.0, when the @missing lines appeared; and
9317            # this program has special code to put in a default map for them
9318            # for earlier than 5.0 releases.
9319            if ($type == $ENUM
9320                || scalar keys %{$unique_maps{$addr}} > 2
9321                || defined $self->default_map)
9322            {
9323                my $tables = $self->tables;
9324                my $count = $self->count;
9325                if ($verbosity && $tables > 500 && $tables/$count > .1) {
9326                    Carp::my_carp_bug("It appears that $self should be a \$STRING property, not an \$ENUM because it has too many match tables: $tables\n");
9327                }
9328                $self->set_type($ENUM);
9329            }
9330            else {
9331                $self->set_type($BINARY);
9332            }
9333        }
9334        undef $unique_maps{$addr};  # Garbage collect
9335        return;
9336    }
9337
9338    # $reaons - Ignored unless suppressing
9339    sub set_fate($self, $fate, $reason=undef) {
9340        my $addr = do { no overloading; pack 'J', $self; };
9341        if ($fate >= $SUPPRESSED) {
9342            $why_suppressed{$self->complete_name} = $reason;
9343        }
9344
9345        # Each table shares the property's fate, except that MAP_PROXIED
9346        # doesn't affect match tables
9347        $map{$addr}->set_fate($fate, $reason);
9348        if ($fate != $MAP_PROXIED) {
9349            foreach my $table ($map{$addr}, $self->tables) {
9350                $table->set_fate($fate, $reason);
9351            }
9352        }
9353        return;
9354    }
9355
9356
9357    # Most of the accessors for a property actually apply to its map table.
9358    # Setup up accessor functions for those, referring to %map
9359    for my $sub (qw(
9360                    add_alias
9361                    add_anomalous_entry
9362                    add_comment
9363                    add_conflicting
9364                    add_description
9365                    add_duplicate
9366                    add_note
9367                    aliases
9368                    comment
9369                    complete_name
9370                    containing_range
9371                    count
9372                    default_map
9373                    definition
9374                    delete_range
9375                    description
9376                    each_range
9377                    external_name
9378                    fate
9379                    file_path
9380                    format
9381                    initialize
9382                    inverse_list
9383                    is_empty
9384                    replacement_property
9385                    name
9386                    note
9387                    perl_extension
9388                    property
9389                    range_count
9390                    ranges
9391                    range_size_1
9392                    replace_map
9393                    reset_each_range
9394                    set_comment
9395                    set_default_map
9396                    set_file_path
9397                    set_final_comment
9398                    _set_format
9399                    set_range_size_1
9400                    set_status
9401                    set_to_output_map
9402                    short_name
9403                    status
9404                    status_info
9405                    to_output_map
9406                    type_of
9407                    value_of
9408                    write
9409                ))
9410                    # 'property' above is for symmetry, so that one can take
9411                    # the property of a property and get itself, and so don't
9412                    # have to distinguish between properties and tables in
9413                    # calling code
9414    {
9415        no strict "refs";
9416        *$sub = sub {
9417            use strict "refs";
9418            my $self = shift;
9419            no overloading;
9420            return $map{pack 'J', $self}->$sub(@_);
9421        }
9422    }
9423
9424
9425} # End closure
9426
9427package main;
9428
9429sub display_chr {
9430    # Converts an ordinal printable character value to a displayable string,
9431    # using a dotted circle to hold combining characters.
9432
9433    my $ord = shift;
9434    my $chr = chr $ord;
9435    return $chr if $ccc->table(0)->contains($ord);
9436    return "\x{25CC}$chr";
9437}
9438
9439sub join_lines($return) {
9440    # Returns lines of the input joined together, so that they can be folded
9441    # properly.
9442    # This causes continuation lines to be joined together into one long line
9443    # for folding.  A continuation line is any line that doesn't begin with a
9444    # space or "\b" (the latter is stripped from the output).  This is so
9445    # lines can be in a HERE document so as to fit nicely in the terminal
9446    # width, but be joined together in one long line, and then folded with
9447    # indents, '#' prefixes, etc, properly handled.
9448    # A blank separates the joined lines except if there is a break; an extra
9449    # blank is inserted after a period ending a line.
9450
9451    # Initialize the return with the first line.
9452    my ( @lines ) = split "\n", $return;
9453
9454    # If the first line is null, it was an empty line, add the \n back in
9455    $return = "\n" if $return eq "";
9456
9457    # Now join the remainder of the physical lines.
9458    for my $line (@lines) {
9459
9460        # An empty line means wanted a blank line, so add two \n's to get that
9461        # effect, and go to the next line.
9462        if (length $line == 0) {
9463            $return .= "\n\n";
9464            next;
9465        }
9466
9467        # Look at the last character of what we have so far.
9468        my $previous_char = substr($return, -1, 1);
9469
9470        # And at the next char to be output.
9471        my $next_char = substr($line, 0, 1);
9472
9473        if ($previous_char ne "\n") {
9474
9475            # Here didn't end wth a nl.  If the next char a blank or \b, it
9476            # means that here there is a break anyway.  So add a nl to the
9477            # output.
9478            if ($next_char eq " " || $next_char eq "\b") {
9479                $previous_char = "\n";
9480                $return .= $previous_char;
9481            }
9482
9483            # Add an extra space after periods.
9484            $return .= " " if $previous_char eq '.';
9485        }
9486
9487        # Here $previous_char is still the latest character to be output.  If
9488        # it isn't a nl, it means that the next line is to be a continuation
9489        # line, with a blank inserted between them.
9490        $return .= " " if $previous_char ne "\n";
9491
9492        # Get rid of any \b
9493        substr($line, 0, 1) = "" if $next_char eq "\b";
9494
9495        # And append this next line.
9496        $return .= $line;
9497    }
9498
9499    return $return;
9500}
9501
9502sub simple_fold( $line, $prefix="", $hanging_indent=0, $right_margin=0) {
9503    # Returns a string of the input (string or an array of strings) folded
9504    # into multiple-lines each of no more than $MAX_LINE_WIDTH characters plus
9505    # a \n
9506    # This is tailored for the kind of text written by this program,
9507    # especially the pod file, which can have very long names with
9508    # underscores in the middle, or words like AbcDefgHij....  We allow
9509    # breaking in the middle of such constructs if the line won't fit
9510    # otherwise.  The break in such cases will come either just after an
9511    # underscore, or just before one of the Capital letters.
9512
9513    local $to_trace = 0 if main::DEBUG;
9514
9515    # $prefix Optional string to prepend to each output line
9516    # $hanging_indent Optional number of spaces to indent
9517	# continuation lines
9518    # $right_margin  Optional number of spaces to narrow the
9519    # total width by.
9520
9521    # The space available doesn't include what's automatically prepended
9522    # to each line, or what's reserved on the right.
9523    my $max = $MAX_LINE_WIDTH - length($prefix) - $right_margin;
9524    # XXX Instead of using the 'nofold' perhaps better to look up the stack
9525
9526    if (DEBUG && $hanging_indent >= $max) {
9527        Carp::my_carp("Too large a hanging indent ($hanging_indent); must be < $max.  Using 0", 'nofold');
9528        $hanging_indent = 0;
9529    }
9530
9531    # First, split into the current physical lines.
9532    my @line;
9533    if (ref $line) {        # Better be an array, because not bothering to
9534                            # test
9535        foreach my $line (@{$line}) {
9536            push @line, split /\n/, $line;
9537        }
9538    }
9539    else {
9540        @line = split /\n/, $line;
9541    }
9542
9543    #local $to_trace = 1 if main::DEBUG;
9544    trace "", join(" ", @line), "\n" if main::DEBUG && $to_trace;
9545
9546    # Look at each current physical line.
9547    for (my $i = 0; $i < @line; $i++) {
9548        Carp::my_carp("Tabs don't work well.", 'nofold') if $line[$i] =~ /\t/;
9549        #local $to_trace = 1 if main::DEBUG;
9550        trace "i=$i: $line[$i]\n" if main::DEBUG && $to_trace;
9551
9552        # Remove prefix, because will be added back anyway, don't want
9553        # doubled prefix
9554        $line[$i] =~ s/^$prefix//;
9555
9556        # Remove trailing space
9557        $line[$i] =~ s/\s+\Z//;
9558
9559        # If the line is too long, fold it.
9560        if (length $line[$i] > $max) {
9561            my $remainder;
9562
9563            # Here needs to fold.  Save the leading space in the line for
9564            # later.
9565            $line[$i] =~ /^ ( \s* )/x;
9566            my $leading_space = $1;
9567            trace "line length", length $line[$i], "; lead length", length($leading_space) if main::DEBUG && $to_trace;
9568
9569            # If character at final permissible position is white space,
9570            # fold there, which will delete that white space
9571            if (substr($line[$i], $max - 1, 1) =~ /\s/) {
9572                $remainder = substr($line[$i], $max);
9573                $line[$i] = substr($line[$i], 0, $max - 1);
9574            }
9575            else {
9576
9577                # Otherwise fold at an acceptable break char closest to
9578                # the max length.  Look at just the maximal initial
9579                # segment of the line
9580                my $segment = substr($line[$i], 0, $max - 1);
9581                if ($segment =~
9582                    /^ ( .{$hanging_indent}   # Don't look before the
9583                                              #  indent.
9584                        \ *                   # Don't look in leading
9585                                              #  blanks past the indent
9586                            [^ ] .*           # Find the right-most
9587                        (?:                   #  acceptable break:
9588                            [ \s = ]          # space or equal
9589                            | - (?! [.0-9] )  # or non-unary minus.
9590                            | [^\\[(] (?= \\ )# break before single backslash
9591                                              #  not immediately after opening
9592                                              #  punctuation
9593                        )                     # $1 includes the character
9594                    )/x)
9595                {
9596                    # Split into the initial part that fits, and remaining
9597                    # part of the input
9598                    $remainder = substr($line[$i], length $1);
9599                    $line[$i] = $1;
9600                    trace $line[$i] if DEBUG && $to_trace;
9601                    trace $remainder if DEBUG && $to_trace;
9602                }
9603
9604                # If didn't find a good breaking spot, see if there is a
9605                # not-so-good breaking spot.  These are just after
9606                # underscores or where the case changes from lower to
9607                # upper.  Use \a as a soft hyphen, but give up
9608                # and don't break the line if there is actually a \a
9609                # already in the input.  We use an ascii character for the
9610                # soft-hyphen to avoid any attempt by miniperl to try to
9611                # access the files that this program is creating.
9612                elsif ($segment !~ /\a/
9613                       && ($segment =~ s/_/_\a/g
9614                       || $segment =~ s/ ( (?!\\) [a-z] ) (?= [A-Z] )/$1\a/xg))
9615                {
9616                    # Here were able to find at least one place to insert
9617                    # our substitute soft hyphen.  Find the right-most one
9618                    # and replace it by a real hyphen.
9619                    trace $segment if DEBUG && $to_trace;
9620                    substr($segment,
9621                            rindex($segment, "\a"),
9622                            1) = '-';
9623
9624                    # Then remove the soft hyphen substitutes.
9625                    $segment =~ s/\a//g;
9626                    trace $segment if DEBUG && $to_trace;
9627
9628                    # And split into the initial part that fits, and
9629                    # remainder of the line
9630                    my $pos = rindex($segment, '-');
9631                    $remainder = substr($line[$i], $pos);
9632                    trace $remainder if DEBUG && $to_trace;
9633                    $line[$i] = substr($segment, 0, $pos + 1);
9634                }
9635            }
9636
9637            # Here we know if we can fold or not.  If we can, $remainder
9638            # is what remains to be processed in the next iteration.
9639            if (defined $remainder) {
9640                trace "folded='$line[$i]'" if main::DEBUG && $to_trace;
9641
9642                # Insert the folded remainder of the line as a new element
9643                # of the array.  (It may still be too long, but we will
9644                # deal with that next time through the loop.)  Omit any
9645                # leading space in the remainder.
9646                $remainder =~ s/^\s+//;
9647                trace "remainder='$remainder'" if main::DEBUG && $to_trace;
9648
9649                # But then indent by whichever is larger of:
9650                # 1) the leading space on the input line;
9651                # 2) the hanging indent.
9652                # This preserves indentation in the original line.
9653                my $lead = ($leading_space)
9654                            ? length $leading_space
9655                            : $hanging_indent;
9656                $lead = max($lead, $hanging_indent);
9657                splice @line, $i+1, 0, (" " x $lead) . $remainder;
9658            }
9659        }
9660
9661        # Ready to output the line. Get rid of any trailing space
9662        # And prefix by the required $prefix passed in.
9663        $line[$i] =~ s/\s+$//;
9664        $line[$i] = "$prefix$line[$i]\n";
9665    } # End of looping through all the lines.
9666
9667    return join "", @line;
9668}
9669
9670sub property_ref {  # Returns a reference to a property object.
9671    return Property::property_ref(@_);
9672}
9673
9674sub force_unlink ($filename) {
9675    return unless file_exists($filename);
9676    return if CORE::unlink($filename);
9677
9678    # We might need write permission
9679    chmod 0777, $filename;
9680    CORE::unlink($filename) or Carp::my_carp("Couldn't unlink $filename.  Proceeding anyway: $!");
9681    return;
9682}
9683
9684sub write ($file, $use_utf8, @lines) {
9685    # Given a filename and references to arrays of lines, write the lines of
9686    # each array to the file
9687    # Filename can be given as an arrayref of directory names
9688
9689    # Get into a single string if an array, and get rid of, in Unix terms, any
9690    # leading '.'
9691    $file= File::Spec->join(@$file) if ref $file eq 'ARRAY';
9692    $file = File::Spec->canonpath($file);
9693
9694    # If has directories, make sure that they all exist
9695    (undef, my $directories, undef) = File::Spec->splitpath($file);
9696    File::Path::mkpath($directories) if $directories && ! -d $directories;
9697
9698    push @files_actually_output, $file;
9699
9700    force_unlink ($file);
9701
9702    my $OUT;
9703    if (not open $OUT, ">", $file) {
9704        Carp::my_carp("can't open $file for output.  Skipping this file: $!");
9705        return;
9706    }
9707
9708    binmode $OUT, ":utf8" if $use_utf8;
9709
9710    foreach my $lines_ref (@lines) {
9711        unless (@$lines_ref) {
9712            Carp::my_carp("An array of lines for writing to file '$file' is empty; writing it anyway;");
9713        }
9714
9715        print $OUT @$lines_ref or die Carp::my_carp("write to '$file' failed: $!");
9716    }
9717    close $OUT or die Carp::my_carp("close '$file' failed: $!");
9718
9719    print "$file written.\n" if $verbosity >= $VERBOSE;
9720
9721    return;
9722}
9723
9724
9725sub Standardize($name=undef) {
9726    # This converts the input name string into a standardized equivalent to
9727    # use internally.
9728
9729    unless (defined $name) {
9730      Carp::my_carp_bug("Standardize() called with undef.  Returning undef.");
9731      return;
9732    }
9733
9734    # Remove any leading or trailing white space
9735    $name =~ s/^\s+//g;
9736    $name =~ s/\s+$//g;
9737
9738    # Convert interior white space and hyphens into underscores.
9739    $name =~ s/ (?<= .) [ -]+ (.) /_$1/xg;
9740
9741    # Capitalize the letter following an underscore, and convert a sequence of
9742    # multiple underscores to a single one
9743    $name =~ s/ (?<= .) _+ (.) /_\u$1/xg;
9744
9745    # And capitalize the first letter, but not for the special cjk ones.
9746    $name = ucfirst($name) unless $name =~ /^k[A-Z]/;
9747    return $name;
9748}
9749
9750sub standardize ($str=undef) {
9751    # Returns a lower-cased standardized name, without underscores.  This form
9752    # is chosen so that it can distinguish between any real versus superficial
9753    # Unicode name differences.  It relies on the fact that Unicode doesn't
9754    # have interior underscores, white space, nor dashes in any
9755    # stricter-matched name.  It should not be used on Unicode code point
9756    # names (the Name property), as they mostly, but not always follow these
9757    # rules.
9758
9759    my $name = Standardize($str);
9760    return if !defined $name;
9761
9762    $name =~ s/ (?<= .) _ (?= . ) //xg;
9763    return lc $name;
9764}
9765
9766sub UCD_name ($table, $alias) {
9767    # Returns the name that Unicode::UCD will use to find a table.  XXX
9768    # perhaps this function should be placed somewhere, like UCD.pm so that
9769    # Unicode::UCD can use it directly without duplicating code that can get
9770    # out-of sync.
9771
9772    my $property = $table->property;
9773    $property = ($property == $perl)
9774                ? ""                # 'perl' is never explicitly stated
9775                : standardize($property->name) . '=';
9776    if ($alias->loose_match) {
9777        return $property . standardize($alias->name);
9778    }
9779    else {
9780        return lc ($property . $alias->name);
9781    }
9782
9783    return;
9784}
9785
9786{   # Closure
9787
9788    my $indent_increment = " " x ( $debugging_build ? 2 : 0);
9789    %main::already_output = ();
9790
9791    $main::simple_dumper_nesting = 0;
9792
9793    sub simple_dumper( $item, $indent = "" ) {
9794        # Like Simple Data::Dumper. Good enough for our needs. We can't use
9795        # the real thing as we have to run under miniperl.
9796
9797        # It is designed so that on input it is at the beginning of a line,
9798        # and the final thing output in any call is a trailing ",\n".
9799
9800        $indent = "" if ! $debugging_build;
9801
9802        # nesting level is localized, so that as the call stack pops, it goes
9803        # back to the prior value.
9804        local $main::simple_dumper_nesting = $main::simple_dumper_nesting;
9805        local %main::already_output = %main::already_output;
9806        $main::simple_dumper_nesting++;
9807        #print STDERR __LINE__, ": $main::simple_dumper_nesting: $indent$item\n";
9808
9809        # Determine the indent for recursive calls.
9810        my $next_indent = $indent . $indent_increment;
9811
9812        my $output;
9813        if (! ref $item) {
9814
9815            # Dump of scalar: just output it in quotes if not a number.  To do
9816            # so we must escape certain characters, and therefore need to
9817            # operate on a copy to avoid changing the original
9818            my $copy = $item;
9819            $copy = $UNDEF unless defined $copy;
9820
9821            # Quote non-integers (integers also have optional leading '-')
9822            if ($copy eq "" || $copy !~ /^ -? \d+ $/x) {
9823
9824                # Escape apostrophe and backslash
9825                $copy =~ s/ ( ['\\] ) /\\$1/xg;
9826                $copy = "'$copy'";
9827            }
9828            $output = "$indent$copy,\n";
9829        }
9830        else {
9831
9832            # Keep track of cycles in the input, and refuse to infinitely loop
9833            my $addr = do { no overloading; pack 'J', $item; };
9834            if (defined $main::already_output{$addr}) {
9835                return "${indent}ALREADY OUTPUT: $item\n";
9836            }
9837            $main::already_output{$addr} = $item;
9838
9839            if (ref $item eq 'ARRAY') {
9840                my $using_brackets;
9841                $output = $indent;
9842                if ($main::simple_dumper_nesting > 1) {
9843                    $output .= '[';
9844                    $using_brackets = 1;
9845                }
9846                else {
9847                    $using_brackets = 0;
9848                }
9849
9850                # If the array is empty, put the closing bracket on the same
9851                # line.  Otherwise, recursively add each array element
9852                if (@$item == 0) {
9853                    $output .= " ";
9854                }
9855                else {
9856                    $output .= "\n";
9857                    for (my $i = 0; $i < @$item; $i++) {
9858
9859                        # Indent array elements one level
9860                        $output .= &simple_dumper($item->[$i], $next_indent);
9861                        next if ! $debugging_build;
9862                        $output =~ s/\n$//;      # Remove any trailing nl so
9863                        $output .= " # [$i]\n";  # as to add a comment giving
9864                                                 # the array index
9865                    }
9866                    $output .= $indent;     # Indent closing ']' to orig level
9867                }
9868                $output .= ']' if $using_brackets;
9869                $output .= ",\n";
9870            }
9871            elsif (ref $item eq 'HASH') {
9872                my $is_first_line;
9873                my $using_braces;
9874                my $body_indent;
9875
9876                # No surrounding braces at top level
9877                $output .= $indent;
9878                if ($main::simple_dumper_nesting > 1) {
9879                    $output .= "{\n";
9880                    $is_first_line = 0;
9881                    $body_indent = $next_indent;
9882                    $next_indent .= $indent_increment;
9883                    $using_braces = 1;
9884                }
9885                else {
9886                    $is_first_line = 1;
9887                    $body_indent = $indent;
9888                    $using_braces = 0;
9889                }
9890
9891                # Output hashes sorted alphabetically instead of apparently
9892                # random.  Use caseless alphabetic sort
9893                foreach my $key (sort { lc $a cmp lc $b } keys %$item)
9894                {
9895                    if ($is_first_line) {
9896                        $is_first_line = 0;
9897                    }
9898                    else {
9899                        $output .= "$body_indent";
9900                    }
9901
9902                    # The key must be a scalar, but this recursive call quotes
9903                    # it
9904                    $output .= &simple_dumper($key);
9905
9906                    # And change the trailing comma and nl to the hash fat
9907                    # comma for clarity, and so the value can be on the same
9908                    # line
9909                    $output =~ s/,\n$/ => /;
9910
9911                    # Recursively call to get the value's dump.
9912                    my $next = &simple_dumper($item->{$key}, $next_indent);
9913
9914                    # If the value is all on one line, remove its indent, so
9915                    # will follow the => immediately.  If it takes more than
9916                    # one line, start it on a new line.
9917                    if ($next !~ /\n.*\n/) {
9918                        $next =~ s/^ *//;
9919                    }
9920                    else {
9921                        $output .= "\n";
9922                    }
9923                    $output .= $next;
9924                }
9925
9926                $output .= "$indent},\n" if $using_braces;
9927            }
9928            elsif (ref $item eq 'CODE' || ref $item eq 'GLOB') {
9929                $output = $indent . ref($item) . "\n";
9930                # XXX see if blessed
9931            }
9932            elsif ($item->can('dump')) {
9933
9934                # By convention in this program, objects furnish a 'dump'
9935                # method.  Since not doing any output at this level, just pass
9936                # on the input indent
9937                $output = $item->dump($indent);
9938            }
9939            else {
9940                Carp::my_carp("Can't cope with dumping a " . ref($item) . ".  Skipping.");
9941            }
9942        }
9943        return $output;
9944    }
9945}
9946
9947sub dump_inside_out( $object, $fields_ref ) {
9948    # Dump inside-out hashes in an object's state by converting them to a
9949    # regular hash and then calling simple_dumper on that.
9950
9951    my $addr = do { no overloading; pack 'J', $object; };
9952
9953    my %hash;
9954    foreach my $key (keys %$fields_ref) {
9955        $hash{$key} = $fields_ref->{$key}{$addr};
9956    }
9957
9958    return simple_dumper(\%hash, @_);
9959}
9960
9961sub _operator_dot($self, $other="", $reversed=0) {
9962    # Overloaded '.' method that is common to all packages.  It uses the
9963    # package's stringify method.
9964
9965    foreach my $which (\$self, \$other) {
9966        next unless ref $$which;
9967        if ($$which->can('_operator_stringify')) {
9968            $$which = $$which->_operator_stringify;
9969        }
9970        else {
9971            my $ref = ref $$which;
9972            my $addr = do { no overloading; pack 'J', $$which; };
9973            $$which = "$ref ($addr)";
9974        }
9975    }
9976    return ($reversed)
9977            ? "$other$self"
9978            : "$self$other";
9979}
9980
9981sub _operator_dot_equal($self, $other="", $reversed=0) {
9982    # Overloaded '.=' method that is common to all packages.
9983
9984    if ($reversed) {
9985        return $other .= "$self";
9986    }
9987    else {
9988        return "$self" . "$other";
9989    }
9990}
9991
9992sub _operator_equal($self, $other, @) {
9993    # Generic overloaded '==' routine.  To be equal, they must be the exact
9994    # same object
9995
9996    return 0 unless defined $other;
9997    return 0 unless ref $other;
9998    no overloading;
9999    return $self == $other;
10000}
10001
10002sub _operator_not_equal($self, $other, @) {
10003    return ! _operator_equal($self, $other);
10004}
10005
10006sub substitute_PropertyAliases($file_object) {
10007    # Deal with early releases that don't have the crucial PropertyAliases.txt
10008    # file.
10009
10010    $file_object->insert_lines(get_old_property_aliases());
10011
10012    process_PropertyAliases($file_object);
10013}
10014
10015
10016sub process_PropertyAliases($file) {
10017    # This reads in the PropertyAliases.txt file, which contains almost all
10018    # the character properties in Unicode and their equivalent aliases:
10019    # scf       ; Simple_Case_Folding         ; sfc
10020    #
10021    # Field 0 is the preferred short name for the property.
10022    # Field 1 is the full name.
10023    # Any succeeding ones are other accepted names.
10024
10025    # Add any cjk properties that may have been defined.
10026    $file->insert_lines(@cjk_properties);
10027
10028    while ($file->next_line) {
10029
10030        my @data = split /\s*;\s*/;
10031
10032        my $full = $data[1];
10033
10034        # This line is defective in early Perls.  The property in Unihan.txt
10035        # is kRSUnicode.
10036        if ($full eq 'Unicode_Radical_Stroke' && @data < 3) {
10037            push @data, qw(cjkRSUnicode kRSUnicode);
10038        }
10039
10040        my $this = Property->new($data[0], Full_Name => $full);
10041
10042        $this->set_fate($SUPPRESSED, $why_suppressed{$full})
10043                                                    if $why_suppressed{$full};
10044
10045        # Start looking for more aliases after these two.
10046        for my $i (2 .. @data - 1) {
10047            $this->add_alias($data[$i]);
10048        }
10049
10050    }
10051
10052    my $scf = property_ref("Simple_Case_Folding");
10053    $scf->add_alias("scf");
10054    $scf->add_alias("sfc");
10055
10056    return;
10057}
10058
10059sub finish_property_setup($file) {
10060    # Finishes setting up after PropertyAliases.
10061
10062    # This entry was missing from this file in earlier Unicode versions
10063    if (-e 'Jamo.txt' && ! defined property_ref('JSN')) {
10064        Property->new('JSN', Full_Name => 'Jamo_Short_Name');
10065    }
10066
10067    # These are used so much, that we set globals for them.
10068    $gc = property_ref('General_Category');
10069    $block = property_ref('Block');
10070    $script = property_ref('Script');
10071    $age = property_ref('Age');
10072
10073    # Perl adds this alias.
10074    $gc->add_alias('Category');
10075
10076    # Unicode::Normalize expects this file with this name and directory.
10077    $ccc = property_ref('Canonical_Combining_Class');
10078    if (defined $ccc) {
10079        $ccc->set_file('CombiningClass');
10080        $ccc->set_directory(File::Spec->curdir());
10081    }
10082
10083    # These two properties aren't actually used in the core, but unfortunately
10084    # the names just above that are in the core interfere with these, so
10085    # choose different names.  These aren't a problem unless the map tables
10086    # for these files get written out.
10087    my $lowercase = property_ref('Lowercase');
10088    $lowercase->set_file('IsLower') if defined $lowercase;
10089    my $uppercase = property_ref('Uppercase');
10090    $uppercase->set_file('IsUpper') if defined $uppercase;
10091
10092    # Set up the hard-coded default mappings, but only on properties defined
10093    # for this release
10094    foreach my $property (keys %default_mapping) {
10095        my $property_object = property_ref($property);
10096        next if ! defined $property_object;
10097        my $default_map = $default_mapping{$property};
10098        $property_object->set_default_map($default_map);
10099
10100        # A map of <code point> implies the property is string.
10101        if ($property_object->type == $UNKNOWN
10102            && $default_map eq $CODE_POINT)
10103        {
10104            $property_object->set_type($STRING);
10105        }
10106    }
10107
10108    # The following use the Multi_Default class to create objects for
10109    # defaults.
10110
10111    # Bidi class has a complicated default, but the derived file takes care of
10112    # the complications, leaving just 'L'.
10113    if (file_exists("${EXTRACTED}DBidiClass.txt")) {
10114        property_ref('Bidi_Class')->set_default_map('L');
10115    }
10116    else {
10117        my $default;
10118
10119        # The derived file was introduced in 3.1.1.  The values below are
10120        # taken from table 3-8, TUS 3.0
10121        my $default_R =
10122            'my $default = Range_List->new;
10123             $default->add_range(0x0590, 0x05FF);
10124             $default->add_range(0xFB1D, 0xFB4F);'
10125        ;
10126
10127        # The defaults apply only to unassigned characters
10128        $default_R .= '$gc->table("Unassigned") & $default;';
10129
10130        if ($v_version lt v3.0.0) {
10131            $default = Multi_Default->new(R => $default_R, 'L');
10132        }
10133        else {
10134
10135            # AL apparently not introduced until 3.0:  TUS 2.x references are
10136            # not on-line to check it out
10137            my $default_AL =
10138                'my $default = Range_List->new;
10139                 $default->add_range(0x0600, 0x07BF);
10140                 $default->add_range(0xFB50, 0xFDFF);
10141                 $default->add_range(0xFE70, 0xFEFF);'
10142            ;
10143
10144            # Non-character code points introduced in this release; aren't AL
10145            if ($v_version ge 3.1.0) {
10146                $default_AL .= '$default->delete_range(0xFDD0, 0xFDEF);';
10147            }
10148            $default_AL .= '$gc->table("Unassigned") & $default';
10149            $default = Multi_Default->new(AL => $default_AL,
10150                                          R => $default_R,
10151                                          'L');
10152        }
10153        property_ref('Bidi_Class')->set_default_map($default);
10154    }
10155
10156    # Joining type has a complicated default, but the derived file takes care
10157    # of the complications, leaving just 'U' (or Non_Joining), except the file
10158    # is bad in 3.1.0
10159    if (file_exists("${EXTRACTED}DJoinType.txt") || -e 'ArabicShaping.txt') {
10160        if (file_exists("${EXTRACTED}DJoinType.txt") && $v_version ne 3.1.0) {
10161            property_ref('Joining_Type')->set_default_map('Non_Joining');
10162        }
10163        else {
10164
10165            # Otherwise, there are not one, but two possibilities for the
10166            # missing defaults: T and U.
10167            # The missing defaults that evaluate to T are given by:
10168            # T = Mn + Cf - ZWNJ - ZWJ
10169            # where Mn and Cf are the general category values. In other words,
10170            # any non-spacing mark or any format control character, except
10171            # U+200C ZERO WIDTH NON-JOINER (joining type U) and U+200D ZERO
10172            # WIDTH JOINER (joining type C).
10173            my $default = Multi_Default->new(
10174               'T' => '$gc->table("Mn") + $gc->table("Cf") - 0x200C - 0x200D',
10175               'Non_Joining');
10176            property_ref('Joining_Type')->set_default_map($default);
10177        }
10178    }
10179
10180    # Line break has a complicated default in early releases. It is 'Unknown'
10181    # for non-assigned code points; 'AL' for assigned.
10182    if (file_exists("${EXTRACTED}DLineBreak.txt") || -e 'LineBreak.txt') {
10183        my $lb = property_ref('Line_Break');
10184        if (file_exists("${EXTRACTED}DLineBreak.txt")) {
10185            $lb->set_default_map('Unknown');
10186        }
10187        else {
10188            my $default = Multi_Default->new('AL' => '~ $gc->table("Cn")',
10189                                             'Unknown',
10190                                            );
10191            $lb->set_default_map($default);
10192        }
10193    }
10194
10195    # For backwards compatibility with applications that may read the mapping
10196    # file directly (it was documented in 5.12 and 5.14 as being thusly
10197    # usable), keep it from being adjusted.  (range_size_1 is
10198    # used to force the traditional format.)
10199    if (defined (my $nfkc_cf = property_ref('NFKC_Casefold'))) {
10200        $nfkc_cf->set_to_output_map($EXTERNAL_MAP);
10201        $nfkc_cf->set_range_size_1(1);
10202    }
10203    if (defined (my $bmg = property_ref('Bidi_Mirroring_Glyph'))) {
10204        $bmg->set_to_output_map($EXTERNAL_MAP);
10205        $bmg->set_range_size_1(1);
10206    }
10207
10208    property_ref('Numeric_Value')->set_to_output_map($OUTPUT_ADJUSTED);
10209
10210    return;
10211}
10212
10213sub get_old_property_aliases() {
10214    # Returns what would be in PropertyAliases.txt if it existed in very old
10215    # versions of Unicode.  It was derived from the one in 3.2, and pared
10216    # down based on the data that was actually in the older releases.
10217    # An attempt was made to use the existence of files to mean inclusion or
10218    # not of various aliases, but if this was not sufficient, using version
10219    # numbers was resorted to.
10220
10221    my @return;
10222
10223    # These are to be used in all versions (though some are constructed by
10224    # this program if missing)
10225    push @return, split /\n/, <<'END';
10226bc        ; Bidi_Class
10227Bidi_M    ; Bidi_Mirrored
10228cf        ; Case_Folding
10229ccc       ; Canonical_Combining_Class
10230dm        ; Decomposition_Mapping
10231dt        ; Decomposition_Type
10232gc        ; General_Category
10233isc       ; ISO_Comment
10234lc        ; Lowercase_Mapping
10235na        ; Name
10236na1       ; Unicode_1_Name
10237nt        ; Numeric_Type
10238nv        ; Numeric_Value
10239scf       ; Simple_Case_Folding
10240slc       ; Simple_Lowercase_Mapping
10241stc       ; Simple_Titlecase_Mapping
10242suc       ; Simple_Uppercase_Mapping
10243tc        ; Titlecase_Mapping
10244uc        ; Uppercase_Mapping
10245END
10246
10247    if (-e 'Blocks.txt') {
10248        push @return, "blk       ; Block\n";
10249    }
10250    if (-e 'ArabicShaping.txt') {
10251        push @return, split /\n/, <<'END';
10252jg        ; Joining_Group
10253jt        ; Joining_Type
10254END
10255    }
10256    if (-e 'PropList.txt') {
10257
10258        # This first set is in the original old-style proplist.
10259        push @return, split /\n/, <<'END';
10260Bidi_C    ; Bidi_Control
10261Dash      ; Dash
10262Dia       ; Diacritic
10263Ext       ; Extender
10264Hex       ; Hex_Digit
10265Hyphen    ; Hyphen
10266IDC       ; ID_Continue
10267Ideo      ; Ideographic
10268Join_C    ; Join_Control
10269Math      ; Math
10270QMark     ; Quotation_Mark
10271Term      ; Terminal_Punctuation
10272WSpace    ; White_Space
10273END
10274        # The next sets were added later
10275        if ($v_version ge v3.0.0) {
10276            push @return, split /\n/, <<'END';
10277Upper     ; Uppercase
10278Lower     ; Lowercase
10279END
10280        }
10281        if ($v_version ge v3.0.1) {
10282            push @return, split /\n/, <<'END';
10283NChar     ; Noncharacter_Code_Point
10284END
10285        }
10286        # The next sets were added in the new-style
10287        if ($v_version ge v3.1.0) {
10288            push @return, split /\n/, <<'END';
10289OAlpha    ; Other_Alphabetic
10290OLower    ; Other_Lowercase
10291OMath     ; Other_Math
10292OUpper    ; Other_Uppercase
10293END
10294        }
10295        if ($v_version ge v3.1.1) {
10296            push @return, "AHex      ; ASCII_Hex_Digit\n";
10297        }
10298    }
10299    if (-e 'EastAsianWidth.txt') {
10300        push @return, "ea        ; East_Asian_Width\n";
10301    }
10302    if (-e 'CompositionExclusions.txt') {
10303        push @return, "CE        ; Composition_Exclusion\n";
10304    }
10305    if (-e 'LineBreak.txt') {
10306        push @return, "lb        ; Line_Break\n";
10307    }
10308    if (-e 'BidiMirroring.txt') {
10309        push @return, "bmg       ; Bidi_Mirroring_Glyph\n";
10310    }
10311    if (-e 'Scripts.txt') {
10312        push @return, "sc        ; Script\n";
10313    }
10314    if (-e 'DNormalizationProps.txt') {
10315        push @return, split /\n/, <<'END';
10316Comp_Ex   ; Full_Composition_Exclusion
10317FC_NFKC   ; FC_NFKC_Closure
10318NFC_QC    ; NFC_Quick_Check
10319NFD_QC    ; NFD_Quick_Check
10320NFKC_QC   ; NFKC_Quick_Check
10321NFKD_QC   ; NFKD_Quick_Check
10322XO_NFC    ; Expands_On_NFC
10323XO_NFD    ; Expands_On_NFD
10324XO_NFKC   ; Expands_On_NFKC
10325XO_NFKD   ; Expands_On_NFKD
10326END
10327    }
10328    if (-e 'DCoreProperties.txt') {
10329        push @return, split /\n/, <<'END';
10330Alpha     ; Alphabetic
10331IDS       ; ID_Start
10332XIDC      ; XID_Continue
10333XIDS      ; XID_Start
10334END
10335        # These can also appear in some versions of PropList.txt
10336        push @return, "Lower     ; Lowercase\n"
10337                                    unless grep { $_ =~ /^Lower\b/} @return;
10338        push @return, "Upper     ; Uppercase\n"
10339                                    unless grep { $_ =~ /^Upper\b/} @return;
10340    }
10341
10342    # This flag requires the DAge.txt file to be copied into the directory.
10343    if (DEBUG && $compare_versions) {
10344        push @return, 'age       ; Age';
10345    }
10346
10347    return @return;
10348}
10349
10350sub substitute_PropValueAliases($file_object) {
10351    # Deal with early releases that don't have the crucial
10352    # PropValueAliases.txt file.
10353
10354    $file_object->insert_lines(get_old_property_value_aliases());
10355
10356    process_PropValueAliases($file_object);
10357}
10358
10359sub process_PropValueAliases($file) {
10360    # This file contains values that properties look like:
10361    # bc ; AL        ; Arabic_Letter
10362    # blk; n/a       ; Greek_And_Coptic                 ; Greek
10363    #
10364    # Field 0 is the property.
10365    # Field 1 is the short name of a property value or 'n/a' if no
10366    #                short name exists;
10367    # Field 2 is the full property value name;
10368    # Any other fields are more synonyms for the property value.
10369    # Purely numeric property values are omitted from the file; as are some
10370    # others, fewer and fewer in later releases
10371
10372    # Entries for the ccc property have an extra field before the
10373    # abbreviation:
10374    # ccc;   0; NR   ; Not_Reordered
10375    # It is the numeric value that the names are synonyms for.
10376
10377    # There are comment entries for values missing from this file:
10378    # # @missing: 0000..10FFFF; ISO_Comment; <none>
10379    # # @missing: 0000..10FFFF; Lowercase_Mapping; <code point>
10380
10381    if ($v_version lt 4.0.0) {
10382        $file->insert_lines(split /\n/, <<'END'
10383Hangul_Syllable_Type; L                                ; Leading_Jamo
10384Hangul_Syllable_Type; LV                               ; LV_Syllable
10385Hangul_Syllable_Type; LVT                              ; LVT_Syllable
10386Hangul_Syllable_Type; NA                               ; Not_Applicable
10387Hangul_Syllable_Type; T                                ; Trailing_Jamo
10388Hangul_Syllable_Type; V                                ; Vowel_Jamo
10389END
10390        );
10391    }
10392    if ($v_version lt 4.1.0) {
10393        $file->insert_lines(split /\n/, <<'END'
10394_Perl_GCB; CN                               ; Control
10395_Perl_GCB; CR                               ; CR
10396_Perl_GCB; EX                               ; Extend
10397_Perl_GCB; L                                ; L
10398_Perl_GCB; LF                               ; LF
10399_Perl_GCB; LV                               ; LV
10400_Perl_GCB; LVT                              ; LVT
10401_Perl_GCB; T                                ; T
10402_Perl_GCB; V                                ; V
10403_Perl_GCB; XX                               ; Other
10404END
10405        );
10406    }
10407
10408    # Add any explicit cjk values
10409    $file->insert_lines(@cjk_property_values);
10410
10411    # This line is used only for testing the code that checks for name
10412    # conflicts.  There is a script Inherited, and when this line is executed
10413    # it causes there to be a name conflict with the 'Inherited' that this
10414    # program generates for this block property value
10415    #$file->insert_lines('blk; n/a; Herited');
10416
10417    # Process each line of the file ...
10418    while ($file->next_line) {
10419
10420        # Fix typo in input file
10421        s/CCC133/CCC132/g if $v_version eq v6.1.0;
10422
10423        my ($property, @data) = split /\s*;\s*/;
10424
10425        # The ccc property has an extra field at the beginning, which is the
10426        # numeric value.  Move it to be after the other two, mnemonic, fields,
10427        # so that those will be used as the property value's names, and the
10428        # number will be an extra alias.  (Rightmost splice removes field 1-2,
10429        # returning them in a slice; left splice inserts that before anything,
10430        # thus shifting the former field 0 to after them.)
10431        splice (@data, 0, 0, splice(@data, 1, 2)) if $property eq 'ccc';
10432
10433        if ($v_version le v5.0.0 && $property eq 'blk' && $data[1] =~ /-/) {
10434            my $new_style = $data[1] =~ s/-/_/gr;
10435            splice @data, 1, 0, $new_style;
10436        }
10437
10438        # Field 0 is a short name unless "n/a"; field 1 is the full name.  If
10439        # there is no short name, use the full one in element 1
10440        if ($data[0] eq "n/a") {
10441            $data[0] = $data[1];
10442        }
10443        elsif ($data[0] ne $data[1]
10444               && standardize($data[0]) eq standardize($data[1])
10445               && $data[1] !~ /[[:upper:]]/)
10446        {
10447            # Also, there is a bug in the file in which "n/a" is omitted, and
10448            # the two fields are identical except for case, and the full name
10449            # is all lower case.  Copy the "short" name unto the full one to
10450            # give it some upper case.
10451
10452            $data[1] = $data[0];
10453        }
10454
10455        # Earlier releases had the pseudo property 'qc' that should expand to
10456        # the ones that replace it below.
10457        if ($property eq 'qc') {
10458            if (lc $data[0] eq 'y') {
10459                $file->insert_lines('NFC_QC; Y      ; Yes',
10460                                    'NFD_QC; Y      ; Yes',
10461                                    'NFKC_QC; Y     ; Yes',
10462                                    'NFKD_QC; Y     ; Yes',
10463                                    );
10464            }
10465            elsif (lc $data[0] eq 'n') {
10466                $file->insert_lines('NFC_QC; N      ; No',
10467                                    'NFD_QC; N      ; No',
10468                                    'NFKC_QC; N     ; No',
10469                                    'NFKD_QC; N     ; No',
10470                                    );
10471            }
10472            elsif (lc $data[0] eq 'm') {
10473                $file->insert_lines('NFC_QC; M      ; Maybe',
10474                                    'NFKC_QC; M     ; Maybe',
10475                                    );
10476            }
10477            else {
10478                $file->carp_bad_line("qc followed by unexpected '$data[0]");
10479            }
10480            next;
10481        }
10482
10483        # The first field is the short name, 2nd is the full one.
10484        my $property_object = property_ref($property);
10485        my $table = $property_object->add_match_table($data[0],
10486                                                Full_Name => $data[1]);
10487
10488        # Start looking for more aliases after these two.
10489        for my $i (2 .. @data - 1) {
10490            $table->add_alias($data[$i]);
10491        }
10492    } # End of looping through the file
10493
10494    # As noted in the comments early in the program, it generates tables for
10495    # the default values for all releases, even those for which the concept
10496    # didn't exist at the time.  Here we add those if missing.
10497    if (defined $age && ! defined $age->table('Unassigned')) {
10498        $age->add_match_table('Unassigned');
10499    }
10500    $block->add_match_table('No_Block') if -e 'Blocks.txt'
10501                                    && ! defined $block->table('No_Block');
10502
10503
10504    # Now set the default mappings of the properties from the file.  This is
10505    # done after the loop because a number of properties have only @missings
10506    # entries in the file, and may not show up until the end.
10507    my @defaults = $file->get_missings;
10508    foreach my $default_ref (@defaults) {
10509        my $default = $default_ref->[0];
10510        my $property = property_ref($default_ref->[1]);
10511        $property->set_default_map($default);
10512    }
10513    return;
10514}
10515
10516sub get_old_property_value_aliases () {
10517    # Returns what would be in PropValueAliases.txt if it existed in very old
10518    # versions of Unicode.  It was derived from the one in 3.2, and pared
10519    # down.  An attempt was made to use the existence of files to mean
10520    # inclusion or not of various aliases, but if this was not sufficient,
10521    # using version numbers was resorted to.
10522
10523    my @return = split /\n/, <<'END';
10524bc ; AN        ; Arabic_Number
10525bc ; B         ; Paragraph_Separator
10526bc ; CS        ; Common_Separator
10527bc ; EN        ; European_Number
10528bc ; ES        ; European_Separator
10529bc ; ET        ; European_Terminator
10530bc ; L         ; Left_To_Right
10531bc ; ON        ; Other_Neutral
10532bc ; R         ; Right_To_Left
10533bc ; WS        ; White_Space
10534
10535Bidi_M; N; No; F; False
10536Bidi_M; Y; Yes; T; True
10537
10538# The standard combining classes are very much different in v1, so only use
10539# ones that look right (not checked thoroughly)
10540ccc;   0; NR   ; Not_Reordered
10541ccc;   1; OV   ; Overlay
10542ccc;   7; NK   ; Nukta
10543ccc;   8; KV   ; Kana_Voicing
10544ccc;   9; VR   ; Virama
10545ccc; 202; ATBL ; Attached_Below_Left
10546ccc; 216; ATAR ; Attached_Above_Right
10547ccc; 218; BL   ; Below_Left
10548ccc; 220; B    ; Below
10549ccc; 222; BR   ; Below_Right
10550ccc; 224; L    ; Left
10551ccc; 228; AL   ; Above_Left
10552ccc; 230; A    ; Above
10553ccc; 232; AR   ; Above_Right
10554ccc; 234; DA   ; Double_Above
10555
10556dt ; can       ; canonical
10557dt ; enc       ; circle
10558dt ; fin       ; final
10559dt ; font      ; font
10560dt ; fra       ; fraction
10561dt ; init      ; initial
10562dt ; iso       ; isolated
10563dt ; med       ; medial
10564dt ; n/a       ; none
10565dt ; nb        ; noBreak
10566dt ; sqr       ; square
10567dt ; sub       ; sub
10568dt ; sup       ; super
10569
10570gc ; C         ; Other                            # Cc | Cf | Cn | Co | Cs
10571gc ; Cc        ; Control
10572gc ; Cn        ; Unassigned
10573gc ; Co        ; Private_Use
10574gc ; L         ; Letter                           # Ll | Lm | Lo | Lt | Lu
10575gc ; LC        ; Cased_Letter                     # Ll | Lt | Lu
10576gc ; Ll        ; Lowercase_Letter
10577gc ; Lm        ; Modifier_Letter
10578gc ; Lo        ; Other_Letter
10579gc ; Lu        ; Uppercase_Letter
10580gc ; M         ; Mark                             # Mc | Me | Mn
10581gc ; Mc        ; Spacing_Mark
10582gc ; Mn        ; Nonspacing_Mark
10583gc ; N         ; Number                           # Nd | Nl | No
10584gc ; Nd        ; Decimal_Number
10585gc ; No        ; Other_Number
10586gc ; P         ; Punctuation                      # Pc | Pd | Pe | Pf | Pi | Po | Ps
10587gc ; Pd        ; Dash_Punctuation
10588gc ; Pe        ; Close_Punctuation
10589gc ; Po        ; Other_Punctuation
10590gc ; Ps        ; Open_Punctuation
10591gc ; S         ; Symbol                           # Sc | Sk | Sm | So
10592gc ; Sc        ; Currency_Symbol
10593gc ; Sm        ; Math_Symbol
10594gc ; So        ; Other_Symbol
10595gc ; Z         ; Separator                        # Zl | Zp | Zs
10596gc ; Zl        ; Line_Separator
10597gc ; Zp        ; Paragraph_Separator
10598gc ; Zs        ; Space_Separator
10599
10600nt ; de        ; Decimal
10601nt ; di        ; Digit
10602nt ; n/a       ; None
10603nt ; nu        ; Numeric
10604END
10605
10606    if (-e 'ArabicShaping.txt') {
10607        push @return, split /\n/, <<'END';
10608jg ; n/a       ; AIN
10609jg ; n/a       ; ALEF
10610jg ; n/a       ; DAL
10611jg ; n/a       ; GAF
10612jg ; n/a       ; LAM
10613jg ; n/a       ; MEEM
10614jg ; n/a       ; NO_JOINING_GROUP
10615jg ; n/a       ; NOON
10616jg ; n/a       ; QAF
10617jg ; n/a       ; SAD
10618jg ; n/a       ; SEEN
10619jg ; n/a       ; TAH
10620jg ; n/a       ; WAW
10621
10622jt ; C         ; Join_Causing
10623jt ; D         ; Dual_Joining
10624jt ; L         ; Left_Joining
10625jt ; R         ; Right_Joining
10626jt ; U         ; Non_Joining
10627jt ; T         ; Transparent
10628END
10629        if ($v_version ge v3.0.0) {
10630            push @return, split /\n/, <<'END';
10631jg ; n/a       ; ALAPH
10632jg ; n/a       ; BEH
10633jg ; n/a       ; BETH
10634jg ; n/a       ; DALATH_RISH
10635jg ; n/a       ; E
10636jg ; n/a       ; FEH
10637jg ; n/a       ; FINAL_SEMKATH
10638jg ; n/a       ; GAMAL
10639jg ; n/a       ; HAH
10640jg ; n/a       ; HAMZA_ON_HEH_GOAL
10641jg ; n/a       ; HE
10642jg ; n/a       ; HEH
10643jg ; n/a       ; HEH_GOAL
10644jg ; n/a       ; HETH
10645jg ; n/a       ; KAF
10646jg ; n/a       ; KAPH
10647jg ; n/a       ; KNOTTED_HEH
10648jg ; n/a       ; LAMADH
10649jg ; n/a       ; MIM
10650jg ; n/a       ; NUN
10651jg ; n/a       ; PE
10652jg ; n/a       ; QAPH
10653jg ; n/a       ; REH
10654jg ; n/a       ; REVERSED_PE
10655jg ; n/a       ; SADHE
10656jg ; n/a       ; SEMKATH
10657jg ; n/a       ; SHIN
10658jg ; n/a       ; SWASH_KAF
10659jg ; n/a       ; TAW
10660jg ; n/a       ; TEH_MARBUTA
10661jg ; n/a       ; TETH
10662jg ; n/a       ; YEH
10663jg ; n/a       ; YEH_BARREE
10664jg ; n/a       ; YEH_WITH_TAIL
10665jg ; n/a       ; YUDH
10666jg ; n/a       ; YUDH_HE
10667jg ; n/a       ; ZAIN
10668END
10669        }
10670    }
10671
10672
10673    if (-e 'EastAsianWidth.txt') {
10674        push @return, split /\n/, <<'END';
10675ea ; A         ; Ambiguous
10676ea ; F         ; Fullwidth
10677ea ; H         ; Halfwidth
10678ea ; N         ; Neutral
10679ea ; Na        ; Narrow
10680ea ; W         ; Wide
10681END
10682    }
10683
10684    if (-e 'LineBreak.txt' || -e 'LBsubst.txt') {
10685        my @lb = split /\n/, <<'END';
10686lb ; AI        ; Ambiguous
10687lb ; AL        ; Alphabetic
10688lb ; B2        ; Break_Both
10689lb ; BA        ; Break_After
10690lb ; BB        ; Break_Before
10691lb ; BK        ; Mandatory_Break
10692lb ; CB        ; Contingent_Break
10693lb ; CL        ; Close_Punctuation
10694lb ; CM        ; Combining_Mark
10695lb ; CR        ; Carriage_Return
10696lb ; EX        ; Exclamation
10697lb ; GL        ; Glue
10698lb ; HY        ; Hyphen
10699lb ; ID        ; Ideographic
10700lb ; IN        ; Inseperable
10701lb ; IS        ; Infix_Numeric
10702lb ; LF        ; Line_Feed
10703lb ; NS        ; Nonstarter
10704lb ; NU        ; Numeric
10705lb ; OP        ; Open_Punctuation
10706lb ; PO        ; Postfix_Numeric
10707lb ; PR        ; Prefix_Numeric
10708lb ; QU        ; Quotation
10709lb ; SA        ; Complex_Context
10710lb ; SG        ; Surrogate
10711lb ; SP        ; Space
10712lb ; SY        ; Break_Symbols
10713lb ; XX        ; Unknown
10714lb ; ZW        ; ZWSpace
10715END
10716        # If this Unicode version predates the lb property, we use our
10717        # substitute one
10718        if (-e 'LBsubst.txt') {
10719            $_ = s/^lb/_Perl_LB/r for @lb;
10720        }
10721        push @return, @lb;
10722    }
10723
10724    if (-e 'DNormalizationProps.txt') {
10725        push @return, split /\n/, <<'END';
10726qc ; M         ; Maybe
10727qc ; N         ; No
10728qc ; Y         ; Yes
10729END
10730    }
10731
10732    if (-e 'Scripts.txt') {
10733        push @return, split /\n/, <<'END';
10734sc ; Arab      ; Arabic
10735sc ; Armn      ; Armenian
10736sc ; Beng      ; Bengali
10737sc ; Bopo      ; Bopomofo
10738sc ; Cans      ; Canadian_Aboriginal
10739sc ; Cher      ; Cherokee
10740sc ; Cyrl      ; Cyrillic
10741sc ; Deva      ; Devanagari
10742sc ; Dsrt      ; Deseret
10743sc ; Ethi      ; Ethiopic
10744sc ; Geor      ; Georgian
10745sc ; Goth      ; Gothic
10746sc ; Grek      ; Greek
10747sc ; Gujr      ; Gujarati
10748sc ; Guru      ; Gurmukhi
10749sc ; Hang      ; Hangul
10750sc ; Hani      ; Han
10751sc ; Hebr      ; Hebrew
10752sc ; Hira      ; Hiragana
10753sc ; Ital      ; Old_Italic
10754sc ; Kana      ; Katakana
10755sc ; Khmr      ; Khmer
10756sc ; Knda      ; Kannada
10757sc ; Laoo      ; Lao
10758sc ; Latn      ; Latin
10759sc ; Mlym      ; Malayalam
10760sc ; Mong      ; Mongolian
10761sc ; Mymr      ; Myanmar
10762sc ; Ogam      ; Ogham
10763sc ; Orya      ; Oriya
10764sc ; Qaai      ; Inherited
10765sc ; Runr      ; Runic
10766sc ; Sinh      ; Sinhala
10767sc ; Syrc      ; Syriac
10768sc ; Taml      ; Tamil
10769sc ; Telu      ; Telugu
10770sc ; Thaa      ; Thaana
10771sc ; Thai      ; Thai
10772sc ; Tibt      ; Tibetan
10773sc ; Yiii      ; Yi
10774sc ; Zyyy      ; Common
10775END
10776    }
10777
10778    if ($v_version ge v2.0.0) {
10779        push @return, split /\n/, <<'END';
10780dt ; com       ; compat
10781dt ; nar       ; narrow
10782dt ; sml       ; small
10783dt ; vert      ; vertical
10784dt ; wide      ; wide
10785
10786gc ; Cf        ; Format
10787gc ; Cs        ; Surrogate
10788gc ; Lt        ; Titlecase_Letter
10789gc ; Me        ; Enclosing_Mark
10790gc ; Nl        ; Letter_Number
10791gc ; Pc        ; Connector_Punctuation
10792gc ; Sk        ; Modifier_Symbol
10793END
10794    }
10795    if ($v_version ge v2.1.2) {
10796        push @return, "bc ; S         ; Segment_Separator\n";
10797    }
10798    if ($v_version ge v2.1.5) {
10799        push @return, split /\n/, <<'END';
10800gc ; Pf        ; Final_Punctuation
10801gc ; Pi        ; Initial_Punctuation
10802END
10803    }
10804    if ($v_version ge v2.1.8) {
10805        push @return, "ccc; 240; IS   ; Iota_Subscript\n";
10806    }
10807
10808    if ($v_version ge v3.0.0) {
10809        push @return, split /\n/, <<'END';
10810bc ; AL        ; Arabic_Letter
10811bc ; BN        ; Boundary_Neutral
10812bc ; LRE       ; Left_To_Right_Embedding
10813bc ; LRO       ; Left_To_Right_Override
10814bc ; NSM       ; Nonspacing_Mark
10815bc ; PDF       ; Pop_Directional_Format
10816bc ; RLE       ; Right_To_Left_Embedding
10817bc ; RLO       ; Right_To_Left_Override
10818
10819ccc; 233; DB   ; Double_Below
10820END
10821    }
10822
10823    if ($v_version ge v3.1.0) {
10824        push @return, "ccc; 226; R    ; Right\n";
10825    }
10826
10827    return @return;
10828}
10829
10830sub process_NormalizationsTest($file) {
10831
10832    # Each line looks like:
10833    #      source code point; NFC; NFD; NFKC; NFKD
10834    # e.g.
10835    #       1E0A;1E0A;0044 0307;1E0A;0044 0307;
10836
10837    # Process each line of the file ...
10838    while ($file->next_line) {
10839
10840        next if /^@/;
10841
10842        my ($c1, $c2, $c3, $c4, $c5) = split /\s*;\s*/;
10843
10844        foreach my $var (\$c1, \$c2, \$c3, \$c4, \$c5) {
10845            $$var = pack "U0U*", map { hex } split " ", $$var;
10846            $$var =~ s/(\\)/$1$1/g;
10847        }
10848
10849        push @normalization_tests,
10850                "Test_N(q$c1, q$c2, q$c3, q$c4, q$c5);\n";
10851    } # End of looping through the file
10852}
10853
10854sub output_perl_charnames_line ($a, $b) {
10855
10856    # Output the entries in Perl_charnames specially, using 5 digits instead
10857    # of four.  This makes the entries a constant length, and simplifies
10858    # charnames.pm which this table is for.  Unicode can have 6 digit
10859    # ordinals, but they are all private use or noncharacters which do not
10860    # have names, so won't be in this table.
10861
10862    return sprintf "%05X\n%s\n\n", $_[0], $_[1];
10863}
10864
10865{ # Closure
10866
10867    # These are constants to the $property_info hash in this subroutine, to
10868    # avoid using a quoted-string which might have a typo.
10869    my $TYPE  = 'type';
10870    my $DEFAULT_MAP = 'default_map';
10871    my $DEFAULT_TABLE = 'default_table';
10872    my $PSEUDO_MAP_TYPE = 'pseudo_map_type';
10873    my $MISSINGS = 'missings';
10874
10875    sub process_generic_property_file($file) {
10876        # This processes a file containing property mappings and puts them
10877        # into internal map tables.  It should be used to handle any property
10878        # files that have mappings from a code point or range thereof to
10879        # something else.  This means almost all the UCD .txt files.
10880        # each_line_handlers() should be set to adjust the lines of these
10881        # files, if necessary, to what this routine understands:
10882        #
10883        # 0374          ; NFD_QC; N
10884        # 003C..003E    ; Math
10885        #
10886        # the fields are: "codepoint-range ; property; map"
10887        #
10888        # meaning the codepoints in the range all have the value 'map' under
10889        # 'property'.
10890        # Beginning and trailing white space in each field are not significant.
10891        # Note there is not a trailing semi-colon in the above.  A trailing
10892        # semi-colon means the map is a null-string.  An omitted map, as
10893        # opposed to a null-string, is assumed to be 'Y', based on Unicode
10894        # table syntax.  (This could have been hidden from this routine by
10895        # doing it in the $file object, but that would require parsing of the
10896        # line there, so would have to parse it twice, or change the interface
10897        # to pass this an array.  So not done.)
10898        #
10899        # The map field may begin with a sequence of commands that apply to
10900        # this range.  Each such command begins and ends with $CMD_DELIM.
10901        # These are used to indicate, for example, that the mapping for a
10902        # range has a non-default type.
10903        #
10904        # This loops through the file, calling its next_line() method, and
10905        # then taking the map and adding it to the property's table.
10906        # Complications arise because any number of properties can be in the
10907        # file, in any order, interspersed in any way.  The first time a
10908        # property is seen, it gets information about that property and
10909        # caches it for quick retrieval later.  It also normalizes the maps
10910        # so that only one of many synonyms is stored.  The Unicode input
10911        # files do use some multiple synonyms.
10912
10913        my %property_info;               # To keep track of what properties
10914                                         # have already had entries in the
10915                                         # current file, and info about each,
10916                                         # so don't have to recompute.
10917        my $property_name;               # property currently being worked on
10918        my $property_type;               # and its type
10919        my $previous_property_name = ""; # name from last time through loop
10920        my $property_object;             # pointer to the current property's
10921                                         # object
10922        my $property_addr;               # the address of that object
10923        my $default_map;                 # the string that code points missing
10924                                         # from the file map to
10925        my $default_table;               # For non-string properties, a
10926                                         # reference to the match table that
10927                                         # will contain the list of code
10928                                         # points that map to $default_map.
10929
10930        # Get the next real non-comment line
10931        LINE:
10932        while ($file->next_line) {
10933
10934            # Default replacement type; means that if parts of the range have
10935            # already been stored in our tables, the new map overrides them if
10936            # they differ more than cosmetically
10937            my $replace = $IF_NOT_EQUIVALENT;
10938            my $map_type;            # Default type for the map of this range
10939
10940            #local $to_trace = 1 if main::DEBUG;
10941            trace $_ if main::DEBUG && $to_trace;
10942
10943            # Split the line into components
10944            my ($range, $property_name, $map, @remainder)
10945                = split /\s*;\s*/, $_, -1; # -1 => retain trailing null fields
10946
10947            # If more or less on the line than we are expecting, warn and skip
10948            # the line
10949            if (@remainder) {
10950                $file->carp_bad_line('Extra fields');
10951                next LINE;
10952            }
10953            elsif ( ! defined $property_name) {
10954                $file->carp_bad_line('Missing property');
10955                next LINE;
10956            }
10957
10958            # Examine the range.
10959            if ($range !~ /^ ($code_point_re) (?:\.\. ($code_point_re) )? $/x)
10960            {
10961                $file->carp_bad_line("Range '$range' not of the form 'CP1' or 'CP1..CP2' (where CP1,2 are code points in hex)");
10962                next LINE;
10963            }
10964            my $low = hex $1;
10965            my $high = (defined $2) ? hex $2 : $low;
10966
10967            # If changing to a new property, get the things constant per
10968            # property
10969            if ($previous_property_name ne $property_name) {
10970
10971                $property_object = property_ref($property_name);
10972                if (! defined $property_object) {
10973                    $file->carp_bad_line("Unexpected property '$property_name'.  Skipped");
10974                    next LINE;
10975                }
10976                { no overloading; $property_addr = pack 'J', $property_object; }
10977
10978                # Defer changing names until have a line that is acceptable
10979                # (the 'next' statement above means is unacceptable)
10980                $previous_property_name = $property_name;
10981
10982                # If not the first time for this property, retrieve info about
10983                # it from the cache
10984                if (defined ($property_info{$property_addr}{$TYPE})) {
10985                    $property_type = $property_info{$property_addr}{$TYPE};
10986                    $default_map = $property_info{$property_addr}{$DEFAULT_MAP};
10987                    $map_type
10988                        = $property_info{$property_addr}{$PSEUDO_MAP_TYPE};
10989                    $default_table
10990                            = $property_info{$property_addr}{$DEFAULT_TABLE};
10991                }
10992                else {
10993
10994                    # Here, is the first time for this property.  Set up the
10995                    # cache.
10996                    $property_type = $property_info{$property_addr}{$TYPE}
10997                                   = $property_object->type;
10998                    $map_type
10999                        = $property_info{$property_addr}{$PSEUDO_MAP_TYPE}
11000                        = $property_object->pseudo_map_type;
11001
11002                    # The Unicode files are set up so that if the map is not
11003                    # defined, it is a binary property
11004                    if (! defined $map && $property_type != $BINARY) {
11005                        if ($property_type != $UNKNOWN
11006                            && $property_type != $NON_STRING)
11007                        {
11008                            $file->carp_bad_line("No mapping defined on a non-binary property.  Using 'Y' for the map");
11009                        }
11010                        else {
11011                            $property_object->set_type($BINARY);
11012                            $property_type
11013                                = $property_info{$property_addr}{$TYPE}
11014                                = $BINARY;
11015                        }
11016                    }
11017
11018                    # Get any @missings default for this property.  This
11019                    # should precede the first entry for the property in the
11020                    # input file, and is located in a comment that has been
11021                    # stored by the Input_file class until we access it here.
11022                    # It's possible that there is more than one such line
11023                    # waiting for us; collect them all, and parse
11024                    my @missings_list = $file->get_missings
11025                                            if $file->has_missings_defaults;
11026                    foreach my $default_ref (@missings_list) {
11027                        my $default = $default_ref->[0];
11028                        my $addr = do { no overloading; pack 'J', property_ref($default_ref->[1]); };
11029
11030                        # For string properties, the default is just what the
11031                        # file says, but non-string properties should already
11032                        # have set up a table for the default property value;
11033                        # use the table for these, so can resolve synonyms
11034                        # later to a single standard one.
11035                        if ($property_type == $STRING
11036                            || $property_type == $UNKNOWN)
11037                        {
11038                            $property_info{$addr}{$MISSINGS} = $default;
11039                        }
11040                        else {
11041                            $property_info{$addr}{$MISSINGS}
11042                                        = $property_object->table($default);
11043                        }
11044                    }
11045
11046                    # Finished storing all the @missings defaults in the input
11047                    # file so far.  Get the one for the current property.
11048                    my $missings = $property_info{$property_addr}{$MISSINGS};
11049
11050                    # But we likely have separately stored what the default
11051                    # should be.  (This is to accommodate versions of the
11052                    # standard where the @missings lines are absent or
11053                    # incomplete.)  Hopefully the two will match.  But check
11054                    # it out.
11055                    $default_map = $property_object->default_map;
11056
11057                    # If the map is a ref, it means that the default won't be
11058                    # processed until later, so undef it, so next few lines
11059                    # will redefine it to something that nothing will match
11060                    undef $default_map if ref $default_map;
11061
11062                    # Create a $default_map if don't have one; maybe a dummy
11063                    # that won't match anything.
11064                    if (! defined $default_map) {
11065
11066                        # Use any @missings line in the file.
11067                        if (defined $missings) {
11068                            if (ref $missings) {
11069                                $default_map = $missings->full_name;
11070                                $default_table = $missings;
11071                            }
11072                            else {
11073                                $default_map = $missings;
11074                            }
11075
11076                            # And store it with the property for outside use.
11077                            $property_object->set_default_map($default_map);
11078                        }
11079                        else {
11080
11081                            # Neither an @missings nor a default map.  Create
11082                            # a dummy one, so won't have to test definedness
11083                            # in the main loop.
11084                            $default_map = '_Perl This will never be in a file
11085                                            from Unicode';
11086                        }
11087                    }
11088
11089                    # Here, we have $default_map defined, possibly in terms of
11090                    # $missings, but maybe not, and possibly is a dummy one.
11091                    if (defined $missings) {
11092
11093                        # Make sure there is no conflict between the two.
11094                        # $missings has priority.
11095                        if (ref $missings) {
11096                            $default_table
11097                                        = $property_object->table($default_map);
11098                            if (! defined $default_table
11099                                || $default_table != $missings)
11100                            {
11101                                if (! defined $default_table) {
11102                                    $default_table = $UNDEF;
11103                                }
11104                                $file->carp_bad_line(<<END
11105The \@missings line for $property_name in $file says that missings default to
11106$missings, but we expect it to be $default_table.  $missings used.
11107END
11108                                );
11109                                $default_table = $missings;
11110                                $default_map = $missings->full_name;
11111                            }
11112                            $property_info{$property_addr}{$DEFAULT_TABLE}
11113                                                        = $default_table;
11114                        }
11115                        elsif ($default_map ne $missings) {
11116                            $file->carp_bad_line(<<END
11117The \@missings line for $property_name in $file says that missings default to
11118$missings, but we expect it to be $default_map.  $missings used.
11119END
11120                            );
11121                            $default_map = $missings;
11122                        }
11123                    }
11124
11125                    $property_info{$property_addr}{$DEFAULT_MAP}
11126                                                    = $default_map;
11127
11128                    # If haven't done so already, find the table corresponding
11129                    # to this map for non-string properties.
11130                    if (! defined $default_table
11131                        && $property_type != $STRING
11132                        && $property_type != $UNKNOWN)
11133                    {
11134                        $default_table = $property_info{$property_addr}
11135                                                        {$DEFAULT_TABLE}
11136                                    = $property_object->table($default_map);
11137                    }
11138                } # End of is first time for this property
11139            } # End of switching properties.
11140
11141            # Ready to process the line.
11142            # The Unicode files are set up so that if the map is not defined,
11143            # it is a binary property with value 'Y'
11144            if (! defined $map) {
11145                $map = 'Y';
11146            }
11147            else {
11148
11149                # If the map begins with a special command to us (enclosed in
11150                # delimiters), extract the command(s).
11151                while ($map =~ s/ ^ $CMD_DELIM (.*?) $CMD_DELIM //x) {
11152                    my $command = $1;
11153                    if ($command =~  / ^ $REPLACE_CMD= (.*) /x) {
11154                        $replace = $1;
11155                    }
11156                    elsif ($command =~  / ^ $MAP_TYPE_CMD= (.*) /x) {
11157                        $map_type = $1;
11158                    }
11159                    else {
11160                        $file->carp_bad_line("Unknown command line: '$1'");
11161                        next LINE;
11162                    }
11163                }
11164            }
11165
11166            if ($default_map eq $CODE_POINT && $map =~ / ^ $code_point_re $/x)
11167            {
11168
11169                # Here, we have a map to a particular code point, and the
11170                # default map is to a code point itself.  If the range
11171                # includes the particular code point, change that portion of
11172                # the range to the default.  This makes sure that in the final
11173                # table only the non-defaults are listed.
11174                my $decimal_map = hex $map;
11175                if ($low <= $decimal_map && $decimal_map <= $high) {
11176
11177                    # If the range includes stuff before or after the map
11178                    # we're changing, split it and process the split-off parts
11179                    # later.
11180                    if ($low < $decimal_map) {
11181                        $file->insert_adjusted_lines(
11182                                            sprintf("%04X..%04X; %s; %s",
11183                                                    $low,
11184                                                    $decimal_map - 1,
11185                                                    $property_name,
11186                                                    $map));
11187                    }
11188                    if ($high > $decimal_map) {
11189                        $file->insert_adjusted_lines(
11190                                            sprintf("%04X..%04X; %s; %s",
11191                                                    $decimal_map + 1,
11192                                                    $high,
11193                                                    $property_name,
11194                                                    $map));
11195                    }
11196                    $low = $high = $decimal_map;
11197                    $map = $CODE_POINT;
11198                }
11199            }
11200
11201            # If we can tell that this is a synonym for the default map, use
11202            # the default one instead.
11203            if ($property_type != $STRING
11204                && $property_type != $UNKNOWN)
11205            {
11206                my $table = $property_object->table($map);
11207                if (defined $table && $table == $default_table) {
11208                    $map = $default_map;
11209                }
11210            }
11211
11212            # And figure out the map type if not known.
11213            if (! defined $map_type || $map_type == $COMPUTE_NO_MULTI_CP) {
11214                if ($map eq "") {   # Nulls are always $NULL map type
11215                    $map_type = $NULL;
11216                } # Otherwise, non-strings, and those that don't allow
11217                  # $MULTI_CP, and those that aren't multiple code points are
11218                  # 0
11219                elsif
11220                   (($property_type != $STRING && $property_type != $UNKNOWN)
11221                   || (defined $map_type && $map_type == $COMPUTE_NO_MULTI_CP)
11222                   || $map !~ /^ $code_point_re ( \  $code_point_re )+ $ /x)
11223                {
11224                    $map_type = 0;
11225                }
11226                else {
11227                    $map_type = $MULTI_CP;
11228                }
11229            }
11230
11231            $property_object->add_map($low, $high,
11232                                        $map,
11233                                        Type => $map_type,
11234                                        Replace => $replace);
11235        } # End of loop through file's lines
11236
11237        return;
11238    }
11239}
11240
11241{ # Closure for UnicodeData.txt handling
11242
11243    # This file was the first one in the UCD; its design leads to some
11244    # awkwardness in processing.  Here is a sample line:
11245    # 0041;LATIN CAPITAL LETTER A;Lu;0;L;;;;;N;;;;0061;
11246    # The fields in order are:
11247    my $i = 0;            # The code point is in field 0, and is shifted off.
11248    my $CHARNAME = $i++;  # character name (e.g. "LATIN CAPITAL LETTER A")
11249    my $CATEGORY = $i++;  # category (e.g. "Lu")
11250    my $CCC = $i++;       # Canonical combining class (e.g. "230")
11251    my $BIDI = $i++;      # directional class (e.g. "L")
11252    my $PERL_DECOMPOSITION = $i++;  # decomposition mapping
11253    my $PERL_DECIMAL_DIGIT = $i++;   # decimal digit value
11254    my $NUMERIC_TYPE_OTHER_DIGIT = $i++; # digit value, like a superscript
11255                                         # Dual-use in this program; see below
11256    my $NUMERIC = $i++;   # numeric value
11257    my $MIRRORED = $i++;  # ? mirrored
11258    my $UNICODE_1_NAME = $i++; # name in Unicode 1.0
11259    my $COMMENT = $i++;   # iso comment
11260    my $UPPER = $i++;     # simple uppercase mapping
11261    my $LOWER = $i++;     # simple lowercase mapping
11262    my $TITLE = $i++;     # simple titlecase mapping
11263    my $input_field_count = $i;
11264
11265    # This routine in addition outputs these extra fields:
11266
11267    my $DECOMP_TYPE = $i++; # Decomposition type
11268
11269    # These fields are modifications of ones above, and are usually
11270    # suppressed; they must come last, as for speed, the loop upper bound is
11271    # normally set to ignore them
11272    my $NAME = $i++;        # This is the strict name field, not the one that
11273                            # charnames uses.
11274    my $DECOMP_MAP = $i++;  # Strict decomposition mapping; not the one used
11275                            # by Unicode::Normalize
11276    my $last_field = $i - 1;
11277
11278    # All these are read into an array for each line, with the indices defined
11279    # above.  The empty fields in the example line above indicate that the
11280    # value is defaulted.  The handler called for each line of the input
11281    # changes these to their defaults.
11282
11283    # Here are the official names of the properties, in a parallel array:
11284    my @field_names;
11285    $field_names[$BIDI] = 'Bidi_Class';
11286    $field_names[$CATEGORY] = 'General_Category';
11287    $field_names[$CCC] = 'Canonical_Combining_Class';
11288    $field_names[$CHARNAME] = 'Perl_Charnames';
11289    $field_names[$COMMENT] = 'ISO_Comment';
11290    $field_names[$DECOMP_MAP] = 'Decomposition_Mapping';
11291    $field_names[$DECOMP_TYPE] = 'Decomposition_Type';
11292    $field_names[$LOWER] = 'Lowercase_Mapping';
11293    $field_names[$MIRRORED] = 'Bidi_Mirrored';
11294    $field_names[$NAME] = 'Name';
11295    $field_names[$NUMERIC] = 'Numeric_Value';
11296    $field_names[$NUMERIC_TYPE_OTHER_DIGIT] = 'Numeric_Type';
11297    $field_names[$PERL_DECIMAL_DIGIT] = 'Perl_Decimal_Digit';
11298    $field_names[$PERL_DECOMPOSITION] = 'Perl_Decomposition_Mapping';
11299    $field_names[$TITLE] = 'Titlecase_Mapping';
11300    $field_names[$UNICODE_1_NAME] = 'Unicode_1_Name';
11301    $field_names[$UPPER] = 'Uppercase_Mapping';
11302
11303    # Some of these need a little more explanation:
11304    # The $PERL_DECIMAL_DIGIT field does not lead to an official Unicode
11305    #   property, but is used in calculating the Numeric_Type.  Perl however,
11306    #   creates a file from this field, so a Perl property is created from it.
11307    # Similarly, the Other_Digit field is used only for calculating the
11308    #   Numeric_Type, and so it can be safely re-used as the place to store
11309    #   the value for Numeric_Type; hence it is referred to as
11310    #   $NUMERIC_TYPE_OTHER_DIGIT.
11311    # The input field named $PERL_DECOMPOSITION is a combination of both the
11312    #   decomposition mapping and its type.  Perl creates a file containing
11313    #   exactly this field, so it is used for that.  The two properties are
11314    #   separated into two extra output fields, $DECOMP_MAP and $DECOMP_TYPE.
11315    #   $DECOMP_MAP is usually suppressed (unless the lists are changed to
11316    #   output it), as Perl doesn't use it directly.
11317    # The input field named here $CHARNAME is used to construct the
11318    #   Perl_Charnames property, which is a combination of the Name property
11319    #   (which the input field contains), and the Unicode_1_Name property, and
11320    #   others from other files.  Since, the strict Name property is not used
11321    #   by Perl, this field is used for the table that Perl does use.  The
11322    #   strict Name property table is usually suppressed (unless the lists are
11323    #   changed to output it), so it is accumulated in a separate field,
11324    #   $NAME, which to save time is discarded unless the table is actually to
11325    #   be output
11326
11327    # This file is processed like most in this program.  Control is passed to
11328    # process_generic_property_file() which calls filter_UnicodeData_line()
11329    # for each input line.  This filter converts the input into line(s) that
11330    # process_generic_property_file() understands.  There is also a setup
11331    # routine called before any of the file is processed, and a handler for
11332    # EOF processing, all in this closure.
11333
11334    # A huge speed-up occurred at the cost of some added complexity when these
11335    # routines were altered to buffer the outputs into ranges.  Almost all the
11336    # lines of the input file apply to just one code point, and for most
11337    # properties, the map for the next code point up is the same as the
11338    # current one.  So instead of creating a line for each property for each
11339    # input line, filter_UnicodeData_line() remembers what the previous map
11340    # of a property was, and doesn't generate a line to pass on until it has
11341    # to, as when the map changes; and that passed-on line encompasses the
11342    # whole contiguous range of code points that have the same map for that
11343    # property.  This means a slight amount of extra setup, and having to
11344    # flush these buffers on EOF, testing if the maps have changed, plus
11345    # remembering state information in the closure.  But it means a lot less
11346    # real time in not having to change the data base for each property on
11347    # each line.
11348
11349    # Another complication is that there are already a few ranges designated
11350    # in the input.  There are two lines for each, with the same maps except
11351    # the code point and name on each line.  This was actually the hardest
11352    # thing to design around.  The code points in those ranges may actually
11353    # have real maps not given by these two lines.  These maps will either
11354    # be algorithmically determinable, or be in the extracted files furnished
11355    # with the UCD.  In the event of conflicts between these extracted files,
11356    # and this one, Unicode says that this one prevails.  But it shouldn't
11357    # prevail for conflicts that occur in these ranges.  The data from the
11358    # extracted files prevails in those cases.  So, this program is structured
11359    # so that those files are processed first, storing maps.  Then the other
11360    # files are processed, generally overwriting what the extracted files
11361    # stored.  But just the range lines in this input file are processed
11362    # without overwriting.  This is accomplished by adding a special string to
11363    # the lines output to tell process_generic_property_file() to turn off the
11364    # overwriting for just this one line.
11365    # A similar mechanism is used to tell it that the map is of a non-default
11366    # type.
11367
11368    sub setup_UnicodeData($file) { # Called before any lines of the input are read
11369
11370        # Create a new property specially located that is a combination of
11371        # various Name properties: Name, Unicode_1_Name, Named Sequences, and
11372        # _Perl_Name_Alias properties.  (The final one duplicates elements of the
11373        # first, and starting in v6.1, is the same as the 'Name_Alias
11374        # property.)  A comment for the new property will later be constructed
11375        # based on the actual properties present and used
11376        $perl_charname = Property->new('Perl_Charnames',
11377                       Default_Map => "",
11378                       Directory => File::Spec->curdir(),
11379                       File => 'Name',
11380                       Fate => $INTERNAL_ONLY,
11381                       Perl_Extension => 1,
11382                       Range_Size_1 => \&output_perl_charnames_line,
11383                       Type => $STRING,
11384                       );
11385        $perl_charname->set_proxy_for('Name');
11386
11387        my $Perl_decomp = Property->new('Perl_Decomposition_Mapping',
11388                                        Directory => File::Spec->curdir(),
11389                                        File => 'Decomposition',
11390                                        Format => $DECOMP_STRING_FORMAT,
11391                                        Fate => $INTERNAL_ONLY,
11392                                        Perl_Extension => 1,
11393                                        Default_Map => $CODE_POINT,
11394
11395                                        # normalize.pm can't cope with these
11396                                        Output_Range_Counts => 0,
11397
11398                                        # This is a specially formatted table
11399                                        # explicitly for normalize.pm, which
11400                                        # is expecting a particular format,
11401                                        # which means that mappings containing
11402                                        # multiple code points are in the main
11403                                        # body of the table
11404                                        Map_Type => $COMPUTE_NO_MULTI_CP,
11405                                        Type => $STRING,
11406                                        To_Output_Map => $INTERNAL_MAP,
11407                                        );
11408        $Perl_decomp->set_proxy_for('Decomposition_Mapping', 'Decomposition_Type');
11409        $Perl_decomp->add_comment(join_lines(<<END
11410This mapping is a combination of the Unicode 'Decomposition_Type' and
11411'Decomposition_Mapping' properties, formatted for use by normalize.pm.  It is
11412identical to the official Unicode 'Decomposition_Mapping' property except for
11413two things:
11414 1) It omits the algorithmically determinable Hangul syllable decompositions,
11415which normalize.pm handles algorithmically.
11416 2) It contains the decomposition type as well.  Non-canonical decompositions
11417begin with a word in angle brackets, like <super>, which denotes the
11418compatible decomposition type.  If the map does not begin with the <angle
11419brackets>, the decomposition is canonical.
11420END
11421        ));
11422
11423        my $Decimal_Digit = Property->new("Perl_Decimal_Digit",
11424                                        Default_Map => "",
11425                                        Perl_Extension => 1,
11426                                        Directory => $map_directory,
11427                                        Type => $STRING,
11428                                        To_Output_Map => $OUTPUT_ADJUSTED,
11429                                        );
11430        $Decimal_Digit->add_comment(join_lines(<<END
11431This file gives the mapping of all code points which represent a single
11432decimal digit [0-9] to their respective digits, but it has ranges of 10 code
11433points, and the mapping of each non-initial element of each range is actually
11434not to "0", but to the offset that element has from its corresponding DIGIT 0.
11435These code points are those that have Numeric_Type=Decimal; not special
11436things, like subscripts nor Roman numerals.
11437END
11438        ));
11439
11440        # These properties are not used for generating anything else, and are
11441        # usually not output.  By making them last in the list, we can just
11442        # change the high end of the loop downwards to avoid the work of
11443        # generating a table(s) that is/are just going to get thrown away.
11444        if (! property_ref('Decomposition_Mapping')->to_output_map
11445            && ! property_ref('Name')->to_output_map)
11446        {
11447            $last_field = min($NAME, $DECOMP_MAP) - 1;
11448        } elsif (property_ref('Decomposition_Mapping')->to_output_map) {
11449            $last_field = $DECOMP_MAP;
11450        } elsif (property_ref('Name')->to_output_map) {
11451            $last_field = $NAME;
11452        }
11453        return;
11454    }
11455
11456    my $first_time = 1;                 # ? Is this the first line of the file
11457    my $in_range = 0;                   # ? Are we in one of the file's ranges
11458    my $previous_cp;                    # hex code point of previous line
11459    my $decimal_previous_cp = -1;       # And its decimal equivalent
11460    my @start;                          # For each field, the current starting
11461                                        # code point in hex for the range
11462                                        # being accumulated.
11463    my @fields;                         # The input fields;
11464    my @previous_fields;                # And those from the previous call
11465
11466    sub filter_UnicodeData_line($file) {
11467        # Handle a single input line from UnicodeData.txt; see comments above
11468        # Conceptually this takes a single line from the file containing N
11469        # properties, and converts it into N lines with one property per line,
11470        # which is what the final handler expects.  But there are
11471        # complications due to the quirkiness of the input file, and to save
11472        # time, it accumulates ranges where the property values don't change
11473        # and only emits lines when necessary.  This is about an order of
11474        # magnitude fewer lines emitted.
11475
11476        # $_ contains the input line.
11477        # -1 in split means retain trailing null fields
11478        (my $cp, @fields) = split /\s*;\s*/, $_, -1;
11479
11480        #local $to_trace = 1 if main::DEBUG;
11481        trace $cp, @fields , $input_field_count if main::DEBUG && $to_trace;
11482        if (@fields > $input_field_count) {
11483            $file->carp_bad_line('Extra fields');
11484            $_ = "";
11485            return;
11486        }
11487
11488        my $decimal_cp = hex $cp;
11489
11490        # We have to output all the buffered ranges when the next code point
11491        # is not exactly one after the previous one, which means there is a
11492        # gap in the ranges.
11493        my $force_output = ($decimal_cp != $decimal_previous_cp + 1);
11494
11495        # The decomposition mapping field requires special handling.  It looks
11496        # like either:
11497        #
11498        # <compat> 0032 0020
11499        # 0041 0300
11500        #
11501        # The decomposition type is enclosed in <brackets>; if missing, it
11502        # means the type is canonical.  There are two decomposition mapping
11503        # tables: the one for use by Perl's normalize.pm has a special format
11504        # which is this field intact; the other, for general use is of
11505        # standard format.  In either case we have to find the decomposition
11506        # type.  Empty fields have None as their type, and map to the code
11507        # point itself
11508        if ($fields[$PERL_DECOMPOSITION] eq "") {
11509            $fields[$DECOMP_TYPE] = 'None';
11510            $fields[$DECOMP_MAP] = $fields[$PERL_DECOMPOSITION] = $CODE_POINT;
11511        }
11512        else {
11513            ($fields[$DECOMP_TYPE], my $map) = $fields[$PERL_DECOMPOSITION]
11514                                            =~ / < ( .+? ) > \s* ( .+ ) /x;
11515            if (! defined $fields[$DECOMP_TYPE]) {
11516                $fields[$DECOMP_TYPE] = 'Canonical';
11517                $fields[$DECOMP_MAP] = $fields[$PERL_DECOMPOSITION];
11518            }
11519            else {
11520                $fields[$DECOMP_MAP] = $map;
11521            }
11522        }
11523
11524        # The 3 numeric fields also require special handling.  The 2 digit
11525        # fields must be either empty or match the number field.  This means
11526        # that if it is empty, they must be as well, and the numeric type is
11527        # None, and the numeric value is 'Nan'.
11528        # The decimal digit field must be empty or match the other digit
11529        # field.  If the decimal digit field is non-empty, the code point is
11530        # a decimal digit, and the other two fields will have the same value.
11531        # If it is empty, but the other digit field is non-empty, the code
11532        # point is an 'other digit', and the number field will have the same
11533        # value as the other digit field.  If the other digit field is empty,
11534        # but the number field is non-empty, the code point is a generic
11535        # numeric type.
11536        if ($fields[$NUMERIC] eq "") {
11537            if ($fields[$PERL_DECIMAL_DIGIT] ne ""
11538                || $fields[$NUMERIC_TYPE_OTHER_DIGIT] ne ""
11539            ) {
11540                $file->carp_bad_line("Numeric values inconsistent.  Trying to process anyway");
11541            }
11542            $fields[$NUMERIC_TYPE_OTHER_DIGIT] = 'None';
11543            $fields[$NUMERIC] = 'NaN';
11544        }
11545        else {
11546            $file->carp_bad_line("'$fields[$NUMERIC]' should be a whole or rational number.  Processing as if it were") if $fields[$NUMERIC] !~ qr{ ^ -? \d+ ( / \d+ )? $ }x;
11547            if ($fields[$PERL_DECIMAL_DIGIT] ne "") {
11548                $file->carp_bad_line("$fields[$PERL_DECIMAL_DIGIT] should equal $fields[$NUMERIC].  Processing anyway") if $fields[$PERL_DECIMAL_DIGIT] != $fields[$NUMERIC];
11549                $file->carp_bad_line("$fields[$PERL_DECIMAL_DIGIT] should be empty since the general category ($fields[$CATEGORY]) isn't 'Nd'.  Processing as Decimal") if $fields[$CATEGORY] ne "Nd";
11550                $fields[$NUMERIC_TYPE_OTHER_DIGIT] = 'Decimal';
11551            }
11552            elsif ($fields[$NUMERIC_TYPE_OTHER_DIGIT] ne "") {
11553                $file->carp_bad_line("$fields[$NUMERIC_TYPE_OTHER_DIGIT] should equal $fields[$NUMERIC].  Processing anyway") if $fields[$NUMERIC_TYPE_OTHER_DIGIT] != $fields[$NUMERIC];
11554                $fields[$NUMERIC_TYPE_OTHER_DIGIT] = 'Digit';
11555            }
11556            else {
11557                $fields[$NUMERIC_TYPE_OTHER_DIGIT] = 'Numeric';
11558
11559                # Rationals require extra effort.
11560                if ($fields[$NUMERIC] =~ qr{/}) {
11561                    reduce_fraction(\$fields[$NUMERIC]);
11562                    register_fraction($fields[$NUMERIC])
11563                }
11564            }
11565        }
11566
11567        # For the properties that have empty fields in the file, and which
11568        # mean something different from empty, change them to that default.
11569        # Certain fields just haven't been empty so far in any Unicode
11570        # version, so don't look at those, namely $MIRRORED, $BIDI, $CCC,
11571        # $CATEGORY.  This leaves just the two fields, and so we hard-code in
11572        # the defaults; which are very unlikely to ever change.
11573        $fields[$UPPER] = $CODE_POINT if $fields[$UPPER] eq "";
11574        $fields[$LOWER] = $CODE_POINT if $fields[$LOWER] eq "";
11575
11576        # UAX44 says that if title is empty, it is the same as whatever upper
11577        # is,
11578        $fields[$TITLE] = $fields[$UPPER] if $fields[$TITLE] eq "";
11579
11580        # There are a few pairs of lines like:
11581        #   AC00;<Hangul Syllable, First>;Lo;0;L;;;;;N;;;;;
11582        #   D7A3;<Hangul Syllable, Last>;Lo;0;L;;;;;N;;;;;
11583        # that define ranges.  These should be processed after the fields are
11584        # adjusted above, as they may override some of them; but mostly what
11585        # is left is to possibly adjust the $CHARNAME field.  The names of all the
11586        # paired lines start with a '<', but this is also true of '<control>,
11587        # which isn't one of these special ones.
11588        if ($fields[$CHARNAME] eq '<control>') {
11589
11590            # Some code points in this file have the pseudo-name
11591            # '<control>', but the official name for such ones is the null
11592            # string.
11593            $fields[$NAME] = $fields[$CHARNAME] = "";
11594
11595            # We had better not be in between range lines.
11596            if ($in_range) {
11597                $file->carp_bad_line("Expecting a closing range line, not a $fields[$CHARNAME]'.  Trying anyway");
11598                $in_range = 0;
11599            }
11600        }
11601        elsif (substr($fields[$CHARNAME], 0, 1) ne '<') {
11602
11603            # Here is a non-range line.  We had better not be in between range
11604            # lines.
11605            if ($in_range) {
11606                $file->carp_bad_line("Expecting a closing range line, not a $fields[$CHARNAME]'.  Trying anyway");
11607                $in_range = 0;
11608            }
11609            if ($fields[$CHARNAME] =~ s/- $cp $//x) {
11610
11611                # These are code points whose names end in their code points,
11612                # which means the names are algorithmically derivable from the
11613                # code points.  To shorten the output Name file, the algorithm
11614                # for deriving these is placed in the file instead of each
11615                # code point, so they have map type $CP_IN_NAME
11616                $fields[$CHARNAME] = $CMD_DELIM
11617                                 . $MAP_TYPE_CMD
11618                                 . '='
11619                                 . $CP_IN_NAME
11620                                 . $CMD_DELIM
11621                                 . $fields[$CHARNAME];
11622            }
11623            $fields[$NAME] = $fields[$CHARNAME];
11624        }
11625        elsif ($fields[$CHARNAME] =~ /^<(.+), First>$/) {
11626            $fields[$CHARNAME] = $fields[$NAME] = $1;
11627
11628            # Here we are at the beginning of a range pair.
11629            if ($in_range) {
11630                $file->carp_bad_line("Expecting a closing range line, not a beginning one, $fields[$CHARNAME]'.  Trying anyway");
11631            }
11632            $in_range = 1;
11633
11634            # Because the properties in the range do not overwrite any already
11635            # in the db, we must flush the buffers of what's already there, so
11636            # they get handled in the normal scheme.
11637            $force_output = 1;
11638
11639        }
11640        elsif ($fields[$CHARNAME] !~ s/^<(.+), Last>$/$1/) {
11641            $file->carp_bad_line("Unexpected name starting with '<' $fields[$CHARNAME].  Ignoring this line.");
11642            $_ = "";
11643            return;
11644        }
11645        else { # Here, we are at the last line of a range pair.
11646
11647            if (! $in_range) {
11648                $file->carp_bad_line("Unexpected end of range $fields[$CHARNAME] when not in one.  Ignoring this line.");
11649                $_ = "";
11650                return;
11651            }
11652            $in_range = 0;
11653
11654            $fields[$NAME] = $fields[$CHARNAME];
11655
11656            # Check that the input is valid: that the closing of the range is
11657            # the same as the beginning.
11658            foreach my $i (0 .. $last_field) {
11659                next if $fields[$i] eq $previous_fields[$i];
11660                $file->carp_bad_line("Expecting '$fields[$i]' to be the same as '$previous_fields[$i]'.  Bad News.  Trying anyway");
11661            }
11662
11663            # The processing differs depending on the type of range,
11664            # determined by its $CHARNAME
11665            if ($fields[$CHARNAME] =~ /^Hangul Syllable/) {
11666
11667                # Check that the data looks right.
11668                if ($decimal_previous_cp != $SBase) {
11669                    $file->carp_bad_line("Unexpected Hangul syllable start = $previous_cp.  Bad News.  Results will be wrong");
11670                }
11671                if ($decimal_cp != $SBase + $SCount - 1) {
11672                    $file->carp_bad_line("Unexpected Hangul syllable end = $cp.  Bad News.  Results will be wrong");
11673                }
11674
11675                # The Hangul syllable range has a somewhat complicated name
11676                # generation algorithm.  Each code point in it has a canonical
11677                # decomposition also computable by an algorithm.  The
11678                # perl decomposition map table built from these is used only
11679                # by normalize.pm, which has the algorithm built in it, so the
11680                # decomposition maps are not needed, and are large, so are
11681                # omitted from it.  If the full decomposition map table is to
11682                # be output, the decompositions are generated for it, in the
11683                # EOF handling code for this input file.
11684
11685                $previous_fields[$DECOMP_TYPE] = 'Canonical';
11686
11687                # This range is stored in our internal structure with its
11688                # own map type, different from all others.
11689                $previous_fields[$CHARNAME] = $previous_fields[$NAME]
11690                                        = $CMD_DELIM
11691                                          . $MAP_TYPE_CMD
11692                                          . '='
11693                                          . $HANGUL_SYLLABLE
11694                                          . $CMD_DELIM
11695                                          . $fields[$CHARNAME];
11696            }
11697            elsif ($fields[$CATEGORY] eq 'Lo') {    # Is a letter
11698
11699                # All the CJK ranges like this have the name given as a
11700                # special case in the next code line.  And for the others, we
11701                # hope that Unicode continues to use the correct name in
11702                # future releases, so we don't have to make further special
11703                # cases.
11704                my $name = ($fields[$CHARNAME] =~ /^CJK/)
11705                           ? 'CJK UNIFIED IDEOGRAPH'
11706                           : uc $fields[$CHARNAME];
11707
11708                # The name for these contains the code point itself, and all
11709                # are defined to have the same base name, regardless of what
11710                # is in the file.  They are stored in our internal structure
11711                # with a map type of $CP_IN_NAME
11712                $previous_fields[$CHARNAME] = $previous_fields[$NAME]
11713                                        = $CMD_DELIM
11714                                           . $MAP_TYPE_CMD
11715                                           . '='
11716                                           . $CP_IN_NAME
11717                                           . $CMD_DELIM
11718                                           . $name;
11719
11720            }
11721            elsif ($fields[$CATEGORY] eq 'Co'
11722                     || $fields[$CATEGORY] eq 'Cs')
11723            {
11724                # The names of all the code points in these ranges are set to
11725                # null, as there are no names for the private use and
11726                # surrogate code points.
11727
11728                $previous_fields[$CHARNAME] = $previous_fields[$NAME] = "";
11729            }
11730            else {
11731                $file->carp_bad_line("Unexpected code point range $fields[$CHARNAME] because category is $fields[$CATEGORY].  Attempting to process it.");
11732            }
11733
11734            # The first line of the range caused everything else to be output,
11735            # and then its values were stored as the beginning values for the
11736            # next set of ranges, which this one ends.  Now, for each value,
11737            # add a command to tell the handler that these values should not
11738            # replace any existing ones in our database.
11739            foreach my $i (0 .. $last_field) {
11740                $previous_fields[$i] = $CMD_DELIM
11741                                        . $REPLACE_CMD
11742                                        . '='
11743                                        . $NO
11744                                        . $CMD_DELIM
11745                                        . $previous_fields[$i];
11746            }
11747
11748            # And change things so it looks like the entire range has been
11749            # gone through with this being the final part of it.  Adding the
11750            # command above to each field will cause this range to be flushed
11751            # during the next iteration, as it guaranteed that the stored
11752            # field won't match whatever value the next one has.
11753            $previous_cp = $cp;
11754            $decimal_previous_cp = $decimal_cp;
11755
11756            # We are now set up for the next iteration; so skip the remaining
11757            # code in this subroutine that does the same thing, but doesn't
11758            # know about these ranges.
11759            $_ = "";
11760
11761            return;
11762        }
11763
11764        # On the very first line, we fake it so the code below thinks there is
11765        # nothing to output, and initialize so that when it does get output it
11766        # uses the first line's values for the lowest part of the range.
11767        # (One could avoid this by using peek(), but then one would need to
11768        # know the adjustments done above and do the same ones in the setup
11769        # routine; not worth it)
11770        if ($first_time) {
11771            $first_time = 0;
11772            @previous_fields = @fields;
11773            @start = ($cp) x scalar @fields;
11774            $decimal_previous_cp = $decimal_cp - 1;
11775        }
11776
11777        # For each field, output the stored up ranges that this code point
11778        # doesn't fit in.  Earlier we figured out if all ranges should be
11779        # terminated because of changing the replace or map type styles, or if
11780        # there is a gap between this new code point and the previous one, and
11781        # that is stored in $force_output.  But even if those aren't true, we
11782        # need to output the range if this new code point's value for the
11783        # given property doesn't match the stored range's.
11784        #local $to_trace = 1 if main::DEBUG;
11785        foreach my $i (0 .. $last_field) {
11786            my $field = $fields[$i];
11787            if ($force_output || $field ne $previous_fields[$i]) {
11788
11789                # Flush the buffer of stored values.
11790                $file->insert_adjusted_lines("$start[$i]..$previous_cp; $field_names[$i]; $previous_fields[$i]");
11791
11792                # Start a new range with this code point and its value
11793                $start[$i] = $cp;
11794                $previous_fields[$i] = $field;
11795            }
11796        }
11797
11798        # Set the values for the next time.
11799        $previous_cp = $cp;
11800        $decimal_previous_cp = $decimal_cp;
11801
11802        # The input line has generated whatever adjusted lines are needed, and
11803        # should not be looked at further.
11804        $_ = "";
11805        return;
11806    }
11807
11808    sub EOF_UnicodeData($file) {
11809        # Called upon EOF to flush the buffers, and create the Hangul
11810        # decomposition mappings if needed.
11811
11812        # Flush the buffers.
11813        foreach my $i (0 .. $last_field) {
11814            $file->insert_adjusted_lines("$start[$i]..$previous_cp; $field_names[$i]; $previous_fields[$i]");
11815        }
11816
11817        if (-e 'Jamo.txt') {
11818
11819            # The algorithm is published by Unicode, based on values in
11820            # Jamo.txt, (which should have been processed before this
11821            # subroutine), and the results left in %Jamo
11822            unless (%Jamo) {
11823                Carp::my_carp_bug("Jamo.txt should be processed before Unicode.txt.  Hangul syllables not generated.");
11824                return;
11825            }
11826
11827            # If the full decomposition map table is being output, insert
11828            # into it the Hangul syllable mappings.  This is to avoid having
11829            # to publish a subroutine in it to compute them.  (which would
11830            # essentially be this code.)  This uses the algorithm published by
11831            # Unicode.  (No hangul syllables in version 1)
11832            if ($v_version ge v2.0.0
11833                && property_ref('Decomposition_Mapping')->to_output_map) {
11834                for (my $S = $SBase; $S < $SBase + $SCount; $S++) {
11835                    use integer;
11836                    my $SIndex = $S - $SBase;
11837                    my $L = $LBase + $SIndex / $NCount;
11838                    my $V = $VBase + ($SIndex % $NCount) / $TCount;
11839                    my $T = $TBase + $SIndex % $TCount;
11840
11841                    trace "L=$L, V=$V, T=$T" if main::DEBUG && $to_trace;
11842                    my $decomposition = sprintf("%04X %04X", $L, $V);
11843                    $decomposition .= sprintf(" %04X", $T) if $T != $TBase;
11844                    $file->insert_adjusted_lines(
11845                                sprintf("%04X; Decomposition_Mapping; %s",
11846                                        $S,
11847                                        $decomposition));
11848                }
11849            }
11850        }
11851
11852        return;
11853    }
11854
11855    sub filter_v1_ucd($file) {
11856        # Fix UCD lines in version 1.  This is probably overkill, but this
11857        # fixes some glaring errors in Version 1 UnicodeData.txt.  That file:
11858        # 1)    had many Hangul (U+3400 - U+4DFF) code points that were later
11859        #       removed.  This program retains them
11860        # 2)    didn't include ranges, which it should have, and which are now
11861        #       added in @corrected_lines below.  It was hand populated by
11862        #       taking the data from Version 2, verified by analyzing
11863        #       DAge.txt.
11864        # 3)    There is a syntax error in the entry for U+09F8 which could
11865        #       cause problems for Unicode::UCD, and so is changed.  It's
11866        #       numeric value was simply a minus sign, without any number.
11867        #       (Eventually Unicode changed the code point to non-numeric.)
11868        # 4)    The decomposition types often don't match later versions
11869        #       exactly, and the whole syntax of that field is different; so
11870        #       the syntax is changed as well as the types to their later
11871        #       terminology.  Otherwise normalize.pm would be very unhappy
11872        # 5)    Many ccc classes are different.  These are left intact.
11873        # 6)    U+FF10..U+FF19 are missing their numeric values in all three
11874        #       fields.  These are unchanged because it doesn't really cause
11875        #       problems for Perl.
11876        # 7)    A number of code points, such as controls, don't have their
11877        #       Unicode Version 1 Names in this file.  These are added.
11878        # 8)    A number of Symbols were marked as Lm.  This changes those in
11879        #       the Latin1 range, so that regexes work.
11880        # 9)    The odd characters U+03DB .. U+03E1 weren't encoded but are
11881        #       referred to by their lc equivalents.  Not fixed.
11882
11883        my @corrected_lines = split /\n/, <<'END';
118844E00;<CJK Ideograph, First>;Lo;0;L;;;;;N;;;;;
118859FA5;<CJK Ideograph, Last>;Lo;0;L;;;;;N;;;;;
11886E000;<Private Use, First>;Co;0;L;;;;;N;;;;;
11887F8FF;<Private Use, Last>;Co;0;L;;;;;N;;;;;
11888F900;<CJK Compatibility Ideograph, First>;Lo;0;L;;;;;N;;;;;
11889FA2D;<CJK Compatibility Ideograph, Last>;Lo;0;L;;;;;N;;;;;
11890END
11891
11892        #local $to_trace = 1 if main::DEBUG;
11893        trace $_ if main::DEBUG && $to_trace;
11894
11895        # -1 => retain trailing null fields
11896        my ($code_point, @fields) = split /\s*;\s*/, $_, -1;
11897
11898        # At the first place that is wrong in the input, insert all the
11899        # corrections, replacing the wrong line.
11900        if ($code_point eq '4E00') {
11901            my @copy = @corrected_lines;
11902            $_ = shift @copy;
11903            ($code_point, @fields) = split /\s*;\s*/, $_, -1;
11904
11905            $file->insert_lines(@copy);
11906        }
11907        elsif ($code_point =~ /^00/ && $fields[$CATEGORY] eq 'Lm') {
11908
11909            # There are no Lm characters in Latin1; these should be 'Sk', but
11910            # there isn't that in V1.
11911            $fields[$CATEGORY] = 'So';
11912        }
11913
11914        if ($fields[$NUMERIC] eq '-') {
11915            $fields[$NUMERIC] = '-1';  # This is what 2.0 made it.
11916        }
11917
11918        if  ($fields[$PERL_DECOMPOSITION] ne "") {
11919
11920            # Several entries have this change to superscript 2 or 3 in the
11921            # middle.  Convert these to the modern version, which is to use
11922            # the actual U+00B2 and U+00B3 (the superscript forms) instead.
11923            # So 'HHHH HHHH <+sup> 0033 <-sup> HHHH' becomes
11924            # 'HHHH HHHH 00B3 HHHH'.
11925            # It turns out that all of these that don't have another
11926            # decomposition defined at the beginning of the line have the
11927            # <square> decomposition in later releases.
11928            if ($code_point ne '00B2' && $code_point ne '00B3') {
11929                if  ($fields[$PERL_DECOMPOSITION]
11930                                    =~ s/<\+sup> 003([23]) <-sup>/00B$1/)
11931                {
11932                    if (substr($fields[$PERL_DECOMPOSITION], 0, 1) ne '<') {
11933                        $fields[$PERL_DECOMPOSITION] = '<square> '
11934                        . $fields[$PERL_DECOMPOSITION];
11935                    }
11936                }
11937            }
11938
11939            # If is like '<+circled> 0052 <-circled>', convert to
11940            # '<circled> 0052'
11941            $fields[$PERL_DECOMPOSITION] =~
11942                            s/ < \+ ( .*? ) > \s* (.*?) \s* <-\1> /<$1> $2/xg;
11943
11944            # Convert '<join> HHHH HHHH <join>' to '<medial> HHHH HHHH', etc.
11945            $fields[$PERL_DECOMPOSITION] =~
11946                            s/ <join> \s* (.*?) \s* <no-join> /<final> $1/x
11947            or $fields[$PERL_DECOMPOSITION] =~
11948                            s/ <join> \s* (.*?) \s* <join> /<medial> $1/x
11949            or $fields[$PERL_DECOMPOSITION] =~
11950                            s/ <no-join> \s* (.*?) \s* <join> /<initial> $1/x
11951            or $fields[$PERL_DECOMPOSITION] =~
11952                        s/ <no-join> \s* (.*?) \s* <no-join> /<isolated> $1/x;
11953
11954            # Convert '<break> HHHH HHHH <break>' to '<break> HHHH', etc.
11955            $fields[$PERL_DECOMPOSITION] =~
11956                    s/ <(break|no-break)> \s* (.*?) \s* <\1> /<$1> $2/x;
11957
11958            # Change names to modern form.
11959            $fields[$PERL_DECOMPOSITION] =~ s/<font variant>/<font>/g;
11960            $fields[$PERL_DECOMPOSITION] =~ s/<no-break>/<noBreak>/g;
11961            $fields[$PERL_DECOMPOSITION] =~ s/<circled>/<circle>/g;
11962            $fields[$PERL_DECOMPOSITION] =~ s/<break>/<fraction>/g;
11963
11964            # One entry has weird braces
11965            $fields[$PERL_DECOMPOSITION] =~ s/[{}]//g;
11966
11967            # One entry at U+2116 has an extra <sup>
11968            $fields[$PERL_DECOMPOSITION] =~ s/( < .*? > .* ) < .*? > \ * /$1/x;
11969        }
11970
11971        $_ = join ';', $code_point, @fields;
11972        trace $_ if main::DEBUG && $to_trace;
11973        return;
11974    }
11975
11976    sub filter_bad_Nd_ucd {
11977        # Early versions specified a value in the decimal digit field even
11978        # though the code point wasn't a decimal digit.  Clear the field in
11979        # that situation, so that the main code doesn't think it is a decimal
11980        # digit.
11981
11982        my ($code_point, @fields) = split /\s*;\s*/, $_, -1;
11983        if ($fields[$PERL_DECIMAL_DIGIT] ne "" && $fields[$CATEGORY] ne 'Nd') {
11984            $fields[$PERL_DECIMAL_DIGIT] = "";
11985            $_ = join ';', $code_point, @fields;
11986        }
11987        return;
11988    }
11989
11990    my @U1_control_names = split /\n/, <<'END';
11991NULL
11992START OF HEADING
11993START OF TEXT
11994END OF TEXT
11995END OF TRANSMISSION
11996ENQUIRY
11997ACKNOWLEDGE
11998BELL
11999BACKSPACE
12000HORIZONTAL TABULATION
12001LINE FEED
12002VERTICAL TABULATION
12003FORM FEED
12004CARRIAGE RETURN
12005SHIFT OUT
12006SHIFT IN
12007DATA LINK ESCAPE
12008DEVICE CONTROL ONE
12009DEVICE CONTROL TWO
12010DEVICE CONTROL THREE
12011DEVICE CONTROL FOUR
12012NEGATIVE ACKNOWLEDGE
12013SYNCHRONOUS IDLE
12014END OF TRANSMISSION BLOCK
12015CANCEL
12016END OF MEDIUM
12017SUBSTITUTE
12018ESCAPE
12019FILE SEPARATOR
12020GROUP SEPARATOR
12021RECORD SEPARATOR
12022UNIT SEPARATOR
12023DELETE
12024BREAK PERMITTED HERE
12025NO BREAK HERE
12026INDEX
12027NEXT LINE
12028START OF SELECTED AREA
12029END OF SELECTED AREA
12030CHARACTER TABULATION SET
12031CHARACTER TABULATION WITH JUSTIFICATION
12032LINE TABULATION SET
12033PARTIAL LINE DOWN
12034PARTIAL LINE UP
12035REVERSE LINE FEED
12036SINGLE SHIFT TWO
12037SINGLE SHIFT THREE
12038DEVICE CONTROL STRING
12039PRIVATE USE ONE
12040PRIVATE USE TWO
12041SET TRANSMIT STATE
12042CANCEL CHARACTER
12043MESSAGE WAITING
12044START OF GUARDED AREA
12045END OF GUARDED AREA
12046START OF STRING
12047SINGLE CHARACTER INTRODUCER
12048CONTROL SEQUENCE INTRODUCER
12049STRING TERMINATOR
12050OPERATING SYSTEM COMMAND
12051PRIVACY MESSAGE
12052APPLICATION PROGRAM COMMAND
12053END
12054
12055    sub filter_early_U1_names {
12056        # Very early versions did not have the Unicode_1_name field specified.
12057        # They differed in which ones were present; make sure a U1 name
12058        # exists, so that Unicode::UCD::charinfo will work
12059
12060        my ($code_point, @fields) = split /\s*;\s*/, $_, -1;
12061
12062
12063        # @U1_control names above are entirely positional, so we pull them out
12064        # in the exact order required, with gaps for the ones that don't have
12065        # names.
12066        if ($code_point =~ /^00[01]/
12067            || $code_point eq '007F'
12068            || $code_point =~ /^008[2-9A-F]/
12069            || $code_point =~ /^009[0-8A-F]/)
12070        {
12071            my $u1_name = shift @U1_control_names;
12072            $fields[$UNICODE_1_NAME] = $u1_name unless $fields[$UNICODE_1_NAME];
12073            $_ = join ';', $code_point, @fields;
12074        }
12075        return;
12076    }
12077
12078    sub filter_v2_1_5_ucd {
12079        # A dozen entries in this 2.1.5 file had the mirrored and numeric
12080        # columns swapped;  These all had mirrored be 'N'.  So if the numeric
12081        # column appears to be N, swap it back.
12082
12083        my ($code_point, @fields) = split /\s*;\s*/, $_, -1;
12084        if ($fields[$NUMERIC] eq 'N') {
12085            $fields[$NUMERIC] = $fields[$MIRRORED];
12086            $fields[$MIRRORED] = 'N';
12087            $_ = join ';', $code_point, @fields;
12088        }
12089        return;
12090    }
12091
12092    sub filter_v6_ucd {
12093
12094        # Unicode 6.0 co-opted the name BELL for U+1F514, but until 5.17,
12095        # it wasn't accepted, to allow for some deprecation cycles.  This
12096        # function is not called after 5.16
12097
12098        return if $_ !~ /^(?:0007|1F514|070F);/;
12099
12100        my ($code_point, @fields) = split /\s*;\s*/, $_, -1;
12101        if ($code_point eq '0007') {
12102            $fields[$CHARNAME] = "";
12103        }
12104        elsif ($code_point eq '070F') { # Unicode Corrigendum #8; see
12105                            # http://www.unicode.org/versions/corrigendum8.html
12106            $fields[$BIDI] = "AL";
12107        }
12108        elsif ($^V lt v5.18.0) { # For 5.18 will convert to use Unicode's name
12109            $fields[$CHARNAME] = "";
12110        }
12111
12112        $_ = join ';', $code_point, @fields;
12113
12114        return;
12115    }
12116} # End closure for UnicodeData
12117
12118sub process_GCB_test($file) {
12119
12120    while ($file->next_line) {
12121        push @backslash_X_tests, $_;
12122    }
12123
12124    return;
12125}
12126
12127sub process_LB_test($file) {
12128
12129    while ($file->next_line) {
12130        push @LB_tests, $_;
12131    }
12132
12133    return;
12134}
12135
12136sub process_SB_test($file) {
12137
12138    while ($file->next_line) {
12139        push @SB_tests, $_;
12140    }
12141
12142    return;
12143}
12144
12145sub process_WB_test($file) {
12146
12147    while ($file->next_line) {
12148        push @WB_tests, $_;
12149    }
12150
12151    return;
12152}
12153
12154sub process_NamedSequences($file) {
12155    # NamedSequences.txt entries are just added to an array.  Because these
12156    # don't look like the other tables, they have their own handler.
12157    # An example:
12158    # LATIN CAPITAL LETTER A WITH MACRON AND GRAVE;0100 0300
12159    #
12160    # This just adds the sequence to an array for later handling
12161
12162    while ($file->next_line) {
12163        my ($name, $sequence, @remainder) = split /\s*;\s*/, $_, -1;
12164        if (@remainder) {
12165            $file->carp_bad_line(
12166                "Doesn't look like 'KHMER VOWEL SIGN OM;17BB 17C6'");
12167            next;
12168        }
12169
12170        # Code points need to be 5 digits long like the other entries in
12171        # Name.pl, for regcomp.c parsing; and the ones below 0x0100 need to be
12172        # converted to native
12173        $sequence = join " ", map { sprintf("%05X",
12174                                    utf8::unicode_to_native(hex $_))
12175                                  } split / /, $sequence;
12176        push @named_sequences, "$sequence\n$name\n";
12177    }
12178    return;
12179}
12180
12181{ # Closure
12182
12183    my $first_range;
12184
12185    sub  filter_early_ea_lb {
12186        # Fixes early EastAsianWidth.txt and LineBreak.txt files.  These had a
12187        # third field be the name of the code point, which can be ignored in
12188        # most cases.  But it can be meaningful if it marks a range:
12189        # 33FE;W;IDEOGRAPHIC TELEGRAPH SYMBOL FOR DAY THIRTY-ONE
12190        # 3400;W;<CJK Ideograph Extension A, First>
12191        #
12192        # We need to see the First in the example above to know it's a range.
12193        # They did not use the later range syntaxes.  This routine changes it
12194        # to use the modern syntax.
12195        # $1 is the Input_file object.
12196
12197        my @fields = split /\s*;\s*/;
12198        if ($fields[2] =~ /^<.*, First>/) {
12199            $first_range = $fields[0];
12200            $_ = "";
12201        }
12202        elsif ($fields[2] =~ /^<.*, Last>/) {
12203            $_ = $_ = "$first_range..$fields[0]; $fields[1]";
12204        }
12205        else {
12206            undef $first_range;
12207            $_ = "$fields[0]; $fields[1]";
12208        }
12209
12210        return;
12211    }
12212}
12213
12214sub filter_substitute_lb {
12215    # Used on Unicodes that predate the LB property, where there is a
12216    # substitute file.  This just does the regular ea_lb handling for such
12217    # files, and then substitutes the long property value name for the short
12218    # one that comes with the file.  (The other break files have the long
12219    # names in them, so this is the odd one out.)  The reason for doing this
12220    # kludge is that regen/mk_invlists.pl is expecting the long name.  This
12221    # also fixes the typo 'Inseperable' that leads to problems.
12222
12223    filter_early_ea_lb;
12224    return unless $_;
12225
12226    my @fields = split /\s*;\s*/;
12227    $fields[1] = property_ref('_Perl_LB')->table($fields[1])->full_name;
12228    $fields[1] = 'Inseparable' if lc $fields[1] eq 'inseperable';
12229    $_ = join '; ', @fields;
12230}
12231
12232sub filter_old_style_arabic_shaping {
12233    # Early versions used a different term for the later one.
12234
12235    my @fields = split /\s*;\s*/;
12236    $fields[3] =~ s/<no shaping>/No_Joining_Group/;
12237    $fields[3] =~ s/\s+/_/g;                # Change spaces to underscores
12238    $_ = join ';', @fields;
12239    return;
12240}
12241
12242{ # Closure
12243    my $lc; # Table for lowercase mapping
12244    my $tc;
12245    my $uc;
12246    my %special_casing_code_points;
12247
12248    sub setup_special_casing($file) {
12249        # SpecialCasing.txt contains the non-simple case change mappings.  The
12250        # simple ones are in UnicodeData.txt, which should already have been
12251        # read in to the full property data structures, so as to initialize
12252        # these with the simple ones.  Then the SpecialCasing.txt entries
12253        # add or overwrite the ones which have different full mappings.
12254
12255        # This routine sees if the simple mappings are to be output, and if
12256        # so, copies what has already been put into the full mapping tables,
12257        # while they still contain only the simple mappings.
12258
12259        # The reason it is done this way is that the simple mappings are
12260        # probably not going to be output, so it saves work to initialize the
12261        # full tables with the simple mappings, and then overwrite those
12262        # relatively few entries in them that have different full mappings,
12263        # and thus skip the simple mapping tables altogether.
12264
12265        $lc = property_ref('lc');
12266        $tc = property_ref('tc');
12267        $uc = property_ref('uc');
12268
12269        # For each of the case change mappings...
12270        foreach my $full_casing_table ($lc, $tc, $uc) {
12271            my $full_casing_name = $full_casing_table->name;
12272            my $full_casing_full_name = $full_casing_table->full_name;
12273            unless (defined $full_casing_table
12274                    && ! $full_casing_table->is_empty)
12275            {
12276                Carp::my_carp_bug("Need to process UnicodeData before SpecialCasing.  Only special casing will be generated.");
12277            }
12278
12279            # Create a table in the old-style format and with the original
12280            # file name for backwards compatibility with applications that
12281            # read it directly.  The new tables contain both the simple and
12282            # full maps, and the old are missing simple maps when there is a
12283            # conflicting full one.  Probably it would have been ok to add
12284            # those to the legacy version, as was already done in 5.14 to the
12285            # case folding one, but this was not done, out of an abundance of
12286            # caution.  The tables are set up here before we deal with the
12287            # full maps so that as we handle those, we can override the simple
12288            # maps for them in the legacy table, and merely add them in the
12289            # new-style one.
12290            my $legacy = Property->new("Legacy_" . $full_casing_full_name,
12291                                File => $full_casing_full_name
12292                                                          =~ s/case_Mapping//r,
12293                                Format => $HEX_FORMAT,
12294                                Default_Map => $CODE_POINT,
12295                                Initialize => $full_casing_table,
12296                                Replacement_Property => $full_casing_full_name,
12297            );
12298
12299            $full_casing_table->add_comment(join_lines( <<END
12300This file includes both the simple and full case changing maps.  The simple
12301ones are in the main body of the table below, and the full ones adding to or
12302overriding them are in the hash.
12303END
12304            ));
12305
12306            # The simple version's name in each mapping merely has an 's' in
12307            # front of the full one's
12308            my $simple_name = 's' . $full_casing_name;
12309            my $simple = property_ref($simple_name);
12310            $simple->initialize($full_casing_table) if $simple->to_output_map();
12311        }
12312
12313        return;
12314    }
12315
12316    sub filter_2_1_8_special_casing_line {
12317
12318        # This version had duplicate entries in this file.  Delete all but the
12319        # first one
12320        my @fields = split /\s*;\s*/, $_, -1; # -1 => retain trailing null
12321                                              # fields
12322        if (exists $special_casing_code_points{$fields[0]}) {
12323            $_ = "";
12324            return;
12325        }
12326
12327        $special_casing_code_points{$fields[0]} = 1;
12328        filter_special_casing_line(@_);
12329    }
12330
12331    sub filter_special_casing_line($file) {
12332        # Change the format of $_ from SpecialCasing.txt into something that
12333        # the generic handler understands.  Each input line contains three
12334        # case mappings.  This will generate three lines to pass to the
12335        # generic handler for each of those.
12336
12337        # The input syntax (after stripping comments and trailing white space
12338        # is like one of the following (with the final two being entries that
12339        # we ignore):
12340        # 00DF; 00DF; 0053 0073; 0053 0053; # LATIN SMALL LETTER SHARP S
12341        # 03A3; 03C2; 03A3; 03A3; Final_Sigma;
12342        # 0307; ; 0307; 0307; tr After_I; # COMBINING DOT ABOVE
12343        # Note the trailing semi-colon, unlike many of the input files.  That
12344        # means that there will be an extra null field generated by the split
12345
12346        my @fields = split /\s*;\s*/, $_, -1; # -1 => retain trailing null
12347                                              # fields
12348
12349        # field #4 is when this mapping is conditional.  If any of these get
12350        # implemented, it would be by hard-coding in the casing functions in
12351        # the Perl core, not through tables.  But if there is a new condition
12352        # we don't know about, output a warning.  We know about all the
12353        # conditions through 6.0
12354        if ($fields[4] ne "") {
12355            my @conditions = split ' ', $fields[4];
12356            if ($conditions[0] ne 'tr'  # We know that these languages have
12357                                        # conditions, and some are multiple
12358                && $conditions[0] ne 'az'
12359                && $conditions[0] ne 'lt'
12360
12361                # And, we know about a single condition Final_Sigma, but
12362                # nothing else.
12363                && ($v_version gt v5.2.0
12364                    && (@conditions > 1 || $conditions[0] ne 'Final_Sigma')))
12365            {
12366                $file->carp_bad_line("Unknown condition '$fields[4]'.  You should inspect it and either add code to handle it, or add to list of those that are to ignore");
12367            }
12368            elsif ($conditions[0] ne 'Final_Sigma') {
12369
12370                    # Don't print out a message for Final_Sigma, because we
12371                    # have hard-coded handling for it.  (But the standard
12372                    # could change what the rule should be, but it wouldn't
12373                    # show up here anyway.
12374
12375                    print "# SKIPPING Special Casing: $_\n"
12376                                                    if $verbosity >= $VERBOSE;
12377            }
12378            $_ = "";
12379            return;
12380        }
12381        elsif (@fields > 6 || (@fields == 6 && $fields[5] ne "" )) {
12382            $file->carp_bad_line('Extra fields');
12383            $_ = "";
12384            return;
12385        }
12386
12387        my $decimal_code_point = hex $fields[0];
12388
12389        # Loop to handle each of the three mappings in the input line, in
12390        # order, with $i indicating the current field number.
12391        my $i = 0;
12392        for my $object ($lc, $tc, $uc) {
12393            $i++;   # First time through, $i = 0 ... 3rd time = 3
12394
12395            my $value = $object->value_of($decimal_code_point);
12396            $value = ($value eq $CODE_POINT)
12397                      ? $decimal_code_point
12398                      : hex $value;
12399
12400            # If this isn't a multi-character mapping, it should already have
12401            # been read in.
12402            if ($fields[$i] !~ / /) {
12403                if ($value != hex $fields[$i]) {
12404                    Carp::my_carp("Bad news. UnicodeData.txt thinks "
12405                                  . $object->name
12406                                  . "(0x$fields[0]) is $value"
12407                                  . " and SpecialCasing.txt thinks it is "
12408                                  . hex($fields[$i])
12409                                  . ".  Good luck.  Retaining UnicodeData value, and proceeding anyway.");
12410                }
12411            }
12412            else {
12413
12414                # The mapping goes into both the legacy table, in which it
12415                # replaces the simple one...
12416                $file->insert_adjusted_lines("$fields[0]; Legacy_"
12417                                             . $object->full_name
12418                                             . "; $fields[$i]");
12419
12420                # ... and the regular table, in which it is additional,
12421                # beyond the simple mapping.
12422                $file->insert_adjusted_lines("$fields[0]; "
12423                                             . $object->name
12424                                            . "; "
12425                                            . $CMD_DELIM
12426                                            . "$REPLACE_CMD=$MULTIPLE_BEFORE"
12427                                            . $CMD_DELIM
12428                                            . $fields[$i]);
12429            }
12430        }
12431
12432        # Everything has been handled by the insert_adjusted_lines()
12433        $_ = "";
12434
12435        return;
12436    }
12437}
12438
12439sub filter_old_style_case_folding($file) {
12440    # This transforms $_ containing the case folding style of 3.0.1, to 3.1
12441    # and later style.  Different letters were used in the earlier.
12442
12443    my @fields = split /\s*;\s*/;
12444
12445    if ($fields[1] eq 'L') {
12446        $fields[1] = 'C';             # L => C always
12447    }
12448    elsif ($fields[1] eq 'E') {
12449        if ($fields[2] =~ / /) {      # E => C if one code point; F otherwise
12450            $fields[1] = 'F'
12451        }
12452        else {
12453            $fields[1] = 'C'
12454        }
12455    }
12456    else {
12457        $file->carp_bad_line("Expecting L or E in second field");
12458        $_ = "";
12459        return;
12460    }
12461    $_ = join("; ", @fields) . ';';
12462    return;
12463}
12464
12465{ # Closure for case folding
12466
12467    # Create the map for simple only if are going to output it, for otherwise
12468    # it takes no part in anything we do.
12469    my $to_output_simple;
12470
12471    sub setup_case_folding {
12472        # Read in the case foldings in CaseFolding.txt.  This handles both
12473        # simple and full case folding.
12474
12475        $to_output_simple
12476                        = property_ref('Simple_Case_Folding')->to_output_map;
12477
12478        if (! $to_output_simple) {
12479            property_ref('Case_Folding')->set_proxy_for('Simple_Case_Folding');
12480        }
12481
12482        # If we ever wanted to show that these tables were combined, a new
12483        # property method could be created, like set_combined_props()
12484        property_ref('Case_Folding')->add_comment(join_lines( <<END
12485This file includes both the simple and full case folding maps.  The simple
12486ones are in the main body of the table below, and the full ones adding to or
12487overriding them are in the hash.
12488END
12489        ));
12490        return;
12491    }
12492
12493    sub filter_case_folding_line($file) {
12494        # Called for each line in CaseFolding.txt
12495        # Input lines look like:
12496        # 0041; C; 0061; # LATIN CAPITAL LETTER A
12497        # 00DF; F; 0073 0073; # LATIN SMALL LETTER SHARP S
12498        # 1E9E; S; 00DF; # LATIN CAPITAL LETTER SHARP S
12499        #
12500        # 'C' means that folding is the same for both simple and full
12501        # 'F' that it is only for full folding
12502        # 'S' that it is only for simple folding
12503        # 'T' is locale-dependent, and ignored
12504        # 'I' is a type of 'F' used in some early releases.
12505        # Note the trailing semi-colon, unlike many of the input files.  That
12506        # means that there will be an extra null field generated by the split
12507        # below, which we ignore and hence is not an error.
12508
12509        my ($range, $type, $map, @remainder) = split /\s*;\s*/, $_, -1;
12510        if (@remainder > 1 || (@remainder == 1 && $remainder[0] ne "" )) {
12511            $file->carp_bad_line('Extra fields');
12512            $_ = "";
12513            return;
12514        }
12515
12516        if ($type =~ / ^ [IT] $/x) {   # Skip Turkic case folding, is locale dependent
12517            $_ = "";
12518            return;
12519        }
12520
12521        # C: complete, F: full, or I: dotted uppercase I -> dotless lowercase
12522        # I are all full foldings; S is single-char.  For S, there is always
12523        # an F entry, so we must allow multiple values for the same code
12524        # point.  Fortunately this table doesn't need further manipulation
12525        # which would preclude using multiple-values.  The S is now included
12526        # so that _swash_inversion_hash() is able to construct closures
12527        # without having to worry about F mappings.
12528        if ($type eq 'C' || $type eq 'F' || $type eq 'I' || $type eq 'S') {
12529            $_ = "$range; Case_Folding; "
12530                 . "$CMD_DELIM$REPLACE_CMD=$MULTIPLE_BEFORE$CMD_DELIM$map";
12531        }
12532        else {
12533            $_ = "";
12534            $file->carp_bad_line('Expecting C F I S or T in second field');
12535        }
12536
12537        # C and S are simple foldings, but simple case folding is not needed
12538        # unless we explicitly want its map table output.
12539        if ($to_output_simple && $type eq 'C' || $type eq 'S') {
12540            $file->insert_adjusted_lines("$range; Simple_Case_Folding; $map");
12541        }
12542
12543        return;
12544    }
12545
12546} # End case fold closure
12547
12548sub filter_jamo_line {
12549    # Filter Jamo.txt lines.  This routine mainly is used to populate hashes
12550    # from this file that is used in generating the Name property for Jamo
12551    # code points.  But, it also is used to convert early versions' syntax
12552    # into the modern form.  Here are two examples:
12553    # 1100; G   # HANGUL CHOSEONG KIYEOK            # Modern syntax
12554    # U+1100; G; HANGUL CHOSEONG KIYEOK             # 2.0 syntax
12555    #
12556    # The input is $_, the output is $_ filtered.
12557
12558    my @fields = split /\s*;\s*/, $_, -1;  # -1 => retain trailing null fields
12559
12560    # Let the caller handle unexpected input.  In earlier versions, there was
12561    # a third field which is supposed to be a comment, but did not have a '#'
12562    # before it.
12563    return if @fields > (($v_version gt v3.0.0) ? 2 : 3);
12564
12565    $fields[0] =~ s/^U\+//;     # Also, early versions had this extraneous
12566                                # beginning.
12567
12568    # Some 2.1 versions had this wrong.  Causes havoc with the algorithm.
12569    $fields[1] = 'R' if $fields[0] eq '1105';
12570
12571    # Add to structure so can generate Names from it.
12572    my $cp = hex $fields[0];
12573    my $short_name = $fields[1];
12574    $Jamo{$cp} = $short_name;
12575    if ($cp <= $LBase + $LCount) {
12576        $Jamo_L{$short_name} = $cp - $LBase;
12577    }
12578    elsif ($cp <= $VBase + $VCount) {
12579        $Jamo_V{$short_name} = $cp - $VBase;
12580    }
12581    elsif ($cp <= $TBase + $TCount) {
12582        $Jamo_T{$short_name} = $cp - $TBase;
12583    }
12584    else {
12585        Carp::my_carp_bug("Unexpected Jamo code point in $_");
12586    }
12587
12588
12589    # Reassemble using just the first two fields to look like a typical
12590    # property file line
12591    $_ = "$fields[0]; $fields[1]";
12592
12593    return;
12594}
12595
12596sub register_fraction($rational) {
12597    # This registers the input rational number so that it can be passed on to
12598    # Unicode::UCD, both in rational and floating forms.
12599
12600    my $floating = eval $rational;
12601
12602    my @floats = sprintf "%.*e", $E_FLOAT_PRECISION, $floating;
12603
12604    # See if the denominator is a power of 2.
12605    $rational =~ m!.*/(.*)!;
12606    my $denominator = $1;
12607    if (defined $denominator && (($denominator & ($denominator - 1)) == 0)) {
12608
12609        # Here the denominator is a power of 2.  This means it has an exact
12610        # representation in binary, so rounding could go either way.  It turns
12611        # out that Windows doesn't necessarily round towards even, so output
12612        # an extra entry.  This happens when the final digit we output is even
12613        # and the next digits would be 50* to the precision of the machine.
12614        my $extra_digit_float = sprintf "%e", $floating;
12615        my $q = $E_FLOAT_PRECISION - 1;
12616        if ($extra_digit_float =~ / ( .* \. \d{$q} )
12617                                    ( [02468] ) 5 0* ( e .*)
12618                                  /ix)
12619        {
12620            push @floats, $1 . ($2 + 1) . $3;
12621        }
12622    }
12623
12624    foreach my $float (@floats) {
12625        # Strip off any leading zeros beyond 2 digits to make it C99
12626        # compliant.  (Windows has 3 digit exponents, contrary to C99)
12627        $float =~ s/ ( .* e [-+] ) 0* ( \d{2,}? ) /$1$2/x;
12628
12629        if (   defined $nv_floating_to_rational{$float}
12630            && $nv_floating_to_rational{$float} ne $rational)
12631        {
12632            die Carp::my_carp_bug("Both '$rational' and"
12633                            . " '$nv_floating_to_rational{$float}' evaluate to"
12634                            . " the same floating point number."
12635                            . "  \$E_FLOAT_PRECISION must be increased");
12636        }
12637        $nv_floating_to_rational{$float} = $rational;
12638    }
12639    return;
12640}
12641
12642sub gcd($a, $b) {   # Greatest-common-divisor; from
12643                # http://en.wikipedia.org/wiki/Euclidean_algorithm
12644    use integer;
12645
12646    while ($b != 0) {
12647       my $temp = $b;
12648       $b = $a % $b;
12649       $a = $temp;
12650    }
12651    return $a;
12652}
12653
12654sub reduce_fraction($fraction_ref) {
12655    # Reduce a fraction to lowest terms.  The Unicode data may be reducible,
12656    # hence this is needed.  The argument is a reference to the
12657    # string denoting the fraction, which must be of the form:
12658    if ($$fraction_ref !~ / ^ (-?) (\d+) \/ (\d+) $ /ax) {
12659        Carp::my_carp_bug("Non-fraction input '$$fraction_ref'.  Unchanged");
12660        return;
12661    }
12662
12663    my $sign = $1;
12664    my $numerator = $2;
12665    my $denominator = $3;
12666
12667    use integer;
12668
12669    # Find greatest common divisor
12670    my $gcd = gcd($numerator, $denominator);
12671
12672    # And reduce using the gcd.
12673    if ($gcd != 1) {
12674        $numerator    /= $gcd;
12675        $denominator  /= $gcd;
12676        $$fraction_ref = "$sign$numerator/$denominator";
12677    }
12678
12679    return;
12680}
12681
12682sub filter_numeric_value_line($file) {
12683    # DNumValues contains lines of a different syntax than the typical
12684    # property file:
12685    # 0F33          ; -0.5 ; ; -1/2 # No       TIBETAN DIGIT HALF ZERO
12686    #
12687    # This routine transforms $_ containing the anomalous syntax to the
12688    # typical, by filtering out the extra columns, and convert early version
12689    # decimal numbers to strings that look like rational numbers.
12690
12691    # Starting in 5.1, there is a rational field.  Just use that, omitting the
12692    # extra columns.  Otherwise convert the decimal number in the second field
12693    # to a rational, and omit extraneous columns.
12694    my @fields = split /\s*;\s*/, $_, -1;
12695    my $rational;
12696
12697    if ($v_version ge v5.1.0) {
12698        if (@fields != 4) {
12699            $file->carp_bad_line('Not 4 semi-colon separated fields');
12700            $_ = "";
12701            return;
12702        }
12703        reduce_fraction(\$fields[3]) if $fields[3] =~ qr{/};
12704        $rational = $fields[3];
12705
12706        $_ = join '; ', @fields[ 0, 3 ];
12707    }
12708    else {
12709
12710        # Here, is an older Unicode file, which has decimal numbers instead of
12711        # rationals in it.  Use the fraction to calculate the denominator and
12712        # convert to rational.
12713
12714        if (@fields != 2 && @fields != 3) {
12715            $file->carp_bad_line('Not 2 or 3 semi-colon separated fields');
12716            $_ = "";
12717            return;
12718        }
12719
12720        my $codepoints = $fields[0];
12721        my $decimal = $fields[1];
12722        if ($decimal =~ s/\.0+$//) {
12723
12724            # Anything ending with a decimal followed by nothing but 0's is an
12725            # integer
12726            $_ = "$codepoints; $decimal";
12727            $rational = $decimal;
12728        }
12729        else {
12730
12731            my $denominator;
12732            if ($decimal =~ /\.50*$/) {
12733                $denominator = 2;
12734            }
12735
12736            # Here have the hardcoded repeating decimals in the fraction, and
12737            # the denominator they imply.  There were only a few denominators
12738            # in the older Unicode versions of this file which this code
12739            # handles, so it is easy to convert them.
12740
12741            # The 4 is because of a round-off error in the Unicode 3.2 files
12742            elsif ($decimal =~ /\.33*[34]$/ || $decimal =~ /\.6+7$/) {
12743                $denominator = 3;
12744            }
12745            elsif ($decimal =~ /\.[27]50*$/) {
12746                $denominator = 4;
12747            }
12748            elsif ($decimal =~ /\.[2468]0*$/) {
12749                $denominator = 5;
12750            }
12751            elsif ($decimal =~ /\.16+7$/ || $decimal =~ /\.83+$/) {
12752                $denominator = 6;
12753            }
12754            elsif ($decimal =~ /\.(12|37|62|87)50*$/) {
12755                $denominator = 8;
12756            }
12757            if ($denominator) {
12758                my $sign = ($decimal < 0) ? "-" : "";
12759                my $numerator = int((abs($decimal) * $denominator) + .5);
12760                $rational = "$sign$numerator/$denominator";
12761                $_ = "$codepoints; $rational";
12762            }
12763            else {
12764                $file->carp_bad_line("Can't cope with number '$decimal'.");
12765                $_ = "";
12766                return;
12767            }
12768        }
12769    }
12770
12771    register_fraction($rational) if $rational =~ qr{/};
12772    return;
12773}
12774
12775{ # Closure
12776    my %unihan_properties;
12777
12778    sub construct_unihan($file_object) {
12779
12780        return unless file_exists($file_object->file);
12781
12782        if ($v_version lt v4.0.0) {
12783            push @cjk_properties, 'URS ; Unicode_Radical_Stroke';
12784            push @cjk_property_values, split "\n", <<'END';
12785# @missing: 0000..10FFFF; Unicode_Radical_Stroke; <none>
12786END
12787        }
12788
12789        if ($v_version ge v3.0.0) {
12790            push @cjk_properties, split "\n", <<'END';
12791cjkIRG_GSource; kIRG_GSource
12792cjkIRG_JSource; kIRG_JSource
12793cjkIRG_KSource; kIRG_KSource
12794cjkIRG_TSource; kIRG_TSource
12795cjkIRG_VSource; kIRG_VSource
12796END
12797        push @cjk_property_values, split "\n", <<'END';
12798# @missing: 0000..10FFFF; cjkIRG_GSource; <none>
12799# @missing: 0000..10FFFF; cjkIRG_JSource; <none>
12800# @missing: 0000..10FFFF; cjkIRG_KSource; <none>
12801# @missing: 0000..10FFFF; cjkIRG_TSource; <none>
12802# @missing: 0000..10FFFF; cjkIRG_VSource; <none>
12803END
12804        }
12805        if ($v_version ge v3.1.0) {
12806            push @cjk_properties, 'cjkIRG_HSource; kIRG_HSource';
12807            push @cjk_property_values, '# @missing: 0000..10FFFF; cjkIRG_HSource; <none>';
12808        }
12809        if ($v_version ge v3.1.1) {
12810            push @cjk_properties, 'cjkIRG_KPSource; kIRG_KPSource';
12811            push @cjk_property_values, '# @missing: 0000..10FFFF; cjkIRG_KPSource; <none>';
12812        }
12813        if ($v_version ge v3.2.0) {
12814            push @cjk_properties, split "\n", <<'END';
12815cjkAccountingNumeric; kAccountingNumeric
12816cjkCompatibilityVariant; kCompatibilityVariant
12817cjkOtherNumeric; kOtherNumeric
12818cjkPrimaryNumeric; kPrimaryNumeric
12819END
12820            push @cjk_property_values, split "\n", <<'END';
12821# @missing: 0000..10FFFF; cjkAccountingNumeric; NaN
12822# @missing: 0000..10FFFF; cjkCompatibilityVariant; <code point>
12823# @missing: 0000..10FFFF; cjkOtherNumeric; NaN
12824# @missing: 0000..10FFFF; cjkPrimaryNumeric; NaN
12825END
12826        }
12827        if ($v_version gt v4.0.0) {
12828            push @cjk_properties, 'cjkIRG_USource; kIRG_USource';
12829            push @cjk_property_values, '# @missing: 0000..10FFFF; cjkIRG_USource; <none>';
12830        }
12831
12832        if ($v_version ge v4.1.0) {
12833            push @cjk_properties, 'cjkIICore ; kIICore';
12834            push @cjk_property_values, '# @missing: 0000..10FFFF; cjkIICore; <none>';
12835        }
12836    }
12837
12838    sub setup_unihan {
12839        # Do any special setup for Unihan properties.
12840
12841        # This property gives the wrong computed type, so override.
12842        my $usource = property_ref('kIRG_USource');
12843        $usource->set_type($STRING) if defined $usource;
12844
12845        # This property is to be considered binary (it says so in
12846        # http://www.unicode.org/reports/tr38/)
12847        my $iicore = property_ref('kIICore');
12848        if (defined $iicore) {
12849            $iicore->set_type($FORCED_BINARY);
12850            $iicore->table("Y")->add_note("Matches any code point which has a non-null value for this property; see unicode.org UAX #38.");
12851
12852            # Unicode doesn't include the maps for this property, so don't
12853            # warn that they are missing.
12854            $iicore->set_pre_declared_maps(0);
12855            $iicore->add_comment(join_lines( <<END
12856This property contains string values, but any non-empty ones are considered to
12857be 'core', so Perl creates tables for both: 1) its string values, plus 2)
12858tables so that \\p{kIICore} matches any code point which has a non-empty
12859value for this property.
12860END
12861            ));
12862        }
12863
12864        return;
12865    }
12866
12867    sub filter_unihan_line {
12868        # Change unihan db lines to look like the others in the db.  Here is
12869        # an input sample:
12870        #   U+341C        kCangjie        IEKN
12871
12872        # Tabs are used instead of semi-colons to separate fields; therefore
12873        # they may have semi-colons embedded in them.  Change these to periods
12874        # so won't screw up the rest of the code.
12875        s/;/./g;
12876
12877        # Remove lines that don't look like ones we accept.
12878        if ($_ !~ /^ [^\t]* \t ( [^\t]* ) /x) {
12879            $_ = "";
12880            return;
12881        }
12882
12883        # Extract the property, and save a reference to its object.
12884        my $property = $1;
12885        if (! exists $unihan_properties{$property}) {
12886            $unihan_properties{$property} = property_ref($property);
12887        }
12888
12889        # Don't do anything unless the property is one we're handling, which
12890        # we determine by seeing if there is an object defined for it or not
12891        if (! defined $unihan_properties{$property}) {
12892            $_ = "";
12893            return;
12894        }
12895
12896        # Convert the tab separators to our standard semi-colons, and convert
12897        # the U+HHHH notation to the rest of the standard's HHHH
12898        s/\t/;/g;
12899        s/\b U \+ (?= $code_point_re )//xg;
12900
12901        #local $to_trace = 1 if main::DEBUG;
12902        trace $_ if main::DEBUG && $to_trace;
12903
12904        return;
12905    }
12906}
12907
12908sub filter_blocks_lines($file) {
12909    # In the Blocks.txt file, the names of the blocks don't quite match the
12910    # names given in PropertyValueAliases.txt, so this changes them so they
12911    # do match:  Blanks and hyphens are changed into underscores.  Also makes
12912    # early release versions look like later ones
12913    #
12914    # $_ is transformed to the correct value.
12915
12916    if ($v_version lt v3.2.0) {
12917        if (/FEFF.*Specials/) { # Bug in old versions: line wrongly inserted
12918            $_ = "";
12919            return;
12920        }
12921
12922        # Old versions used a different syntax to mark the range.
12923        $_ =~ s/;\s+/../ if $v_version lt v3.1.0;
12924    }
12925
12926    my @fields = split /\s*;\s*/, $_, -1;
12927    if (@fields != 2) {
12928        $file->carp_bad_line("Expecting exactly two fields");
12929        $_ = "";
12930        return;
12931    }
12932
12933    # Change hyphens and blanks in the block name field only
12934    $fields[1] =~ s/[ -]/_/g;
12935    $fields[1] =~ s/_ ( [a-z] ) /_\u$1/xg;   # Capitalize first letter of word
12936
12937    $_ = join("; ", @fields);
12938    return;
12939}
12940
12941{ # Closure
12942    my $current_property;
12943
12944    sub filter_old_style_proplist {
12945        # PropList.txt has been in Unicode since version 2.0.  Until 3.1, it
12946        # was in a completely different syntax.  Ken Whistler of Unicode says
12947        # that it was something he used as an aid for his own purposes, but
12948        # was never an official part of the standard.  Many of the properties
12949        # in it were incorporated into the later PropList.txt, but some were
12950        # not.  This program uses this early file to generate property tables
12951        # that are otherwise not accessible in the early UCD's.  It does this
12952        # for the ones that eventually became official, and don't appear to be
12953        # too different in their contents from the later official version, and
12954        # throws away the rest.  It could be argued that the ones it generates
12955        # were probably not really official at that time, so should be
12956        # ignored.  You can easily modify things to skip all of them by
12957        # changing this function to just set $_ to "", and return; and to skip
12958        # certain of them by simply removing their declarations from
12959        # get_old_property_aliases().
12960        #
12961        # Here is a list of all the ones that are thrown away:
12962        #   Alphabetic                   The definitions for this are very
12963        #                                defective, so better to not mislead
12964        #                                people into thinking it works.
12965        #                                Instead the Perl extension of the
12966        #                                same name is constructed from first
12967        #                                principles.
12968        #   Bidi=*                       duplicates UnicodeData.txt
12969        #   Combining                    never made into official property;
12970        #                                is \P{ccc=0}
12971        #   Composite                    never made into official property.
12972        #   Currency Symbol              duplicates UnicodeData.txt: gc=sc
12973        #   Decimal Digit                duplicates UnicodeData.txt: gc=nd
12974        #   Delimiter                    never made into official property;
12975        #                                removed in 3.0.1
12976        #   Format Control               never made into official property;
12977        #                                similar to gc=cf
12978        #   High Surrogate               duplicates Blocks.txt
12979        #   Ignorable Control            never made into official property;
12980        #                                similar to di=y
12981        #   ISO Control                  duplicates UnicodeData.txt: gc=cc
12982        #   Left of Pair                 never made into official property;
12983        #   Line Separator               duplicates UnicodeData.txt: gc=zl
12984        #   Low Surrogate                duplicates Blocks.txt
12985        #   Non-break                    was actually listed as a property
12986        #                                in 3.2, but without any code
12987        #                                points.  Unicode denies that this
12988        #                                was ever an official property
12989        #   Non-spacing                  duplicate UnicodeData.txt: gc=mn
12990        #   Numeric                      duplicates UnicodeData.txt: gc=cc
12991        #   Paired Punctuation           never made into official property;
12992        #                                appears to be gc=ps + gc=pe
12993        #   Paragraph Separator          duplicates UnicodeData.txt: gc=cc
12994        #   Private Use                  duplicates UnicodeData.txt: gc=co
12995        #   Private Use High Surrogate   duplicates Blocks.txt
12996        #   Punctuation                  duplicates UnicodeData.txt: gc=p
12997        #   Space                        different definition than eventual
12998        #                                one.
12999        #   Titlecase                    duplicates UnicodeData.txt: gc=lt
13000        #   Unassigned Code Value        duplicates UnicodeData.txt: gc=cn
13001        #   Zero-width                   never made into official property;
13002        #                                subset of gc=cf
13003        # Most of the properties have the same names in this file as in later
13004        # versions, but a couple do not.
13005        #
13006        # This subroutine filters $_, converting it from the old style into
13007        # the new style.  Here's a sample of the old-style
13008        #
13009        #   *******************************************
13010        #
13011        #   Property dump for: 0x100000A0 (Join Control)
13012        #
13013        #   200C..200D  (2 chars)
13014        #
13015        # In the example, the property is "Join Control".  It is kept in this
13016        # closure between calls to the subroutine.  The numbers beginning with
13017        # 0x were internal to Ken's program that generated this file.
13018
13019        # If this line contains the property name, extract it.
13020        if (/^Property dump for: [^(]*\((.*)\)/) {
13021            $_ = $1;
13022
13023            # Convert white space to underscores.
13024            s/ /_/g;
13025
13026            # Convert the few properties that don't have the same name as
13027            # their modern counterparts
13028            s/Identifier_Part/ID_Continue/
13029            or s/Not_a_Character/NChar/;
13030
13031            # If the name matches an existing property, use it.
13032            if (defined property_ref($_)) {
13033                trace "new property=", $_ if main::DEBUG && $to_trace;
13034                $current_property = $_;
13035            }
13036            else {        # Otherwise discard it
13037                trace "rejected property=", $_ if main::DEBUG && $to_trace;
13038                undef $current_property;
13039            }
13040            $_ = "";    # The property is saved for the next lines of the
13041                        # file, but this defining line is of no further use,
13042                        # so clear it so that the caller won't process it
13043                        # further.
13044        }
13045        elsif (! defined $current_property || $_ !~ /^$code_point_re/) {
13046
13047            # Here, the input line isn't a header defining a property for the
13048            # following section, and either we aren't in such a section, or
13049            # the line doesn't look like one that defines the code points in
13050            # such a section.  Ignore this line.
13051            $_ = "";
13052        }
13053        else {
13054
13055            # Here, we have a line defining the code points for the current
13056            # stashed property.  Anything starting with the first blank is
13057            # extraneous.  Otherwise, it should look like a normal range to
13058            # the caller.  Append the property name so that it looks just like
13059            # a modern PropList entry.
13060
13061            $_ =~ s/\s.*//;
13062            $_ .= "; $current_property";
13063        }
13064        trace $_ if main::DEBUG && $to_trace;
13065        return;
13066    }
13067} # End closure for old style proplist
13068
13069sub filter_old_style_normalization_lines {
13070    # For early releases of Unicode, the lines were like:
13071    #        74..2A76    ; NFKD_NO
13072    # For later releases this became:
13073    #        74..2A76    ; NFKD_QC; N
13074    # Filter $_ to look like those in later releases.
13075    # Similarly for MAYBEs
13076
13077    s/ _NO \b /_QC; N/x || s/ _MAYBE \b /_QC; M/x;
13078
13079    # Also, the property FC_NFKC was abbreviated to FNC
13080    s/FNC/FC_NFKC/;
13081    return;
13082}
13083
13084sub setup_script_extensions {
13085    # The Script_Extensions property starts out with a clone of the Script
13086    # property.
13087
13088    $scx = property_ref("Script_Extensions");
13089    return unless defined $scx;
13090
13091    $scx->_set_format($STRING_WHITE_SPACE_LIST);
13092    $scx->initialize($script);
13093    $scx->set_default_map($script->default_map);
13094    $scx->set_pre_declared_maps(0);     # PropValueAliases doesn't list these
13095    $scx->add_comment(join_lines( <<END
13096The values for code points that appear in one script are just the same as for
13097the 'Script' property.  Likewise the values for those that appear in many
13098scripts are either 'Common' or 'Inherited', same as with 'Script'.  But the
13099values of code points that appear in a few scripts are a space separated list
13100of those scripts.
13101END
13102    ));
13103
13104    # Initialize scx's tables and the aliases for them to be the same as sc's
13105    foreach my $table ($script->tables) {
13106        my $scx_table = $scx->add_match_table($table->name,
13107                                Full_Name => $table->full_name);
13108        foreach my $alias ($table->aliases) {
13109            $scx_table->add_alias($alias->name);
13110        }
13111    }
13112}
13113
13114sub  filter_script_extensions_line {
13115    # The Scripts file comes with the full name for the scripts; the
13116    # ScriptExtensions, with the short name.  The final mapping file is a
13117    # combination of these, and without adjustment, would have inconsistent
13118    # entries.  This filters the latter file to convert to full names.
13119    # Entries look like this:
13120    # 064B..0655    ; Arab Syrc # Mn  [11] ARABIC FATHATAN..ARABIC HAMZA BELOW
13121
13122    my @fields = split /\s*;\s*/;
13123
13124    # This script was erroneously omitted in this Unicode version.
13125    $fields[1] .= ' Takr' if $v_version eq v6.1.0 && $fields[0] =~ /^0964/;
13126
13127    my @full_names;
13128    foreach my $short_name (split " ", $fields[1]) {
13129        push @full_names, $script->table($short_name)->full_name;
13130    }
13131    $fields[1] = join " ", @full_names;
13132    $_ = join "; ", @fields;
13133
13134    return;
13135}
13136
13137sub setup_emojidata {
13138    my $prop_ref = Property->new('ExtPict',
13139                                 Full_Name => 'Extended_Pictographic',
13140    );
13141    $prop_ref->set_fate($PLACEHOLDER,
13142                        "Not part of the Unicode Character Database");
13143}
13144
13145sub filter_emojidata_line {
13146    # We only are interested in this single property from this non-UCD data
13147    # file, and we turn it into a Perl property, so that it isn't accessible
13148    # to the users
13149
13150    $_ = "" unless /\bExtended_Pictographic\b/;
13151
13152    return;
13153}
13154
13155sub setup_IdStatus {
13156    my $ids = Property->new('Identifier_Status',
13157                            Match_SubDir => 'IdStatus',
13158                            Default_Map => 'Restricted',
13159                           );
13160    $ids->add_match_table('Allowed');
13161}
13162
13163sub setup_IdType {
13164    $idt = Property->new('Identifier_Type',
13165                            Match_SubDir => 'IdType',
13166                            Default_Map => 'Not_Character',
13167                            Format => $STRING_WHITE_SPACE_LIST,
13168                           );
13169}
13170
13171sub  filter_IdType_line {
13172
13173    # Some code points have more than one type, separated by spaces on the
13174    # input.  For now, we just add everything as a property value.  Later when
13175    # we look for properties with format $STRING_WHITE_SPACE_LIST, we resolve
13176    # things
13177
13178    my @fields = split /\s*;\s*/;
13179    my $types = $fields[1];
13180    $idt->add_match_table($types) unless defined $idt->table($types);
13181
13182    return;
13183}
13184
13185sub generate_hst($file) {
13186
13187    # Populates the Hangul Syllable Type property from first principles
13188
13189    # These few ranges are hard-coded in.
13190    $file->insert_lines(split /\n/, <<'END'
131911100..1159    ; L
13192115F          ; L
131931160..11A2    ; V
1319411A8..11F9    ; T
13195END
13196);
13197
13198    # The Hangul syllables in version 1 are at different code points than
13199    # those that came along starting in version 2, and have different names;
13200    # they comprise about 60% of the code points of the later version.
13201    # From my (khw) research on them (see <558493EB.4000807@att.net>), the
13202    # initial set is a subset of the later version, with different English
13203    # transliterations.  I did not see an easy mapping between them.  The
13204    # later set includes essentially all possibilities, even ones that aren't
13205    # in modern use (if they ever were), and over 96% of the new ones are type
13206    # LVT.  Mathematically, the early set must also contain a preponderance of
13207    # LVT values.  In lieu of doing nothing, we just set them all to LVT, and
13208    # expect that this will be right most of the time, which is better than
13209    # not being right at all.
13210    if ($v_version lt v2.0.0) {
13211        my $property = property_ref($file->property);
13212        $file->insert_lines(sprintf("%04X..%04X; LVT\n",
13213                                    $FIRST_REMOVED_HANGUL_SYLLABLE,
13214                                    $FINAL_REMOVED_HANGUL_SYLLABLE));
13215        push @tables_that_may_be_empty, $property->table('LV')->complete_name;
13216        return;
13217    }
13218
13219    # The algorithmically derived syllables are almost all LVT ones, so
13220    # initialize the whole range with that.
13221    $file->insert_lines(sprintf "%04X..%04X; LVT\n",
13222                        $SBase, $SBase + $SCount -1);
13223
13224    # Those ones that aren't LVT are LV, and they occur at intervals of
13225    # $TCount code points, starting with the first code point, at $SBase.
13226    for (my $i = $SBase; $i < $SBase + $SCount; $i += $TCount) {
13227        $file->insert_lines(sprintf "%04X..%04X; LV\n", $i, $i);
13228    }
13229
13230    return;
13231}
13232
13233sub generate_GCB($file) {
13234
13235    # Populates the Grapheme Cluster Break property from first principles
13236
13237    # All these definitions are from
13238    # http://www.unicode.org/reports/tr29/tr29-3.html with confirmation
13239    # from http://www.unicode.org/reports/tr29/tr29-4.html
13240
13241    foreach my $range ($gc->ranges) {
13242
13243        # Extend includes gc=Me and gc=Mn, while Control includes gc=Cc
13244        # and gc=Cf
13245        if ($range->value =~ / ^ M [en] $ /x) {
13246            $file->insert_lines(sprintf "%04X..%04X; Extend",
13247                                $range->start,  $range->end);
13248        }
13249        elsif ($range->value =~ / ^ C [cf] $ /x) {
13250            $file->insert_lines(sprintf "%04X..%04X; Control",
13251                                $range->start,  $range->end);
13252        }
13253    }
13254    $file->insert_lines("2028; Control"); # Line Separator
13255    $file->insert_lines("2029; Control"); # Paragraph Separator
13256
13257    $file->insert_lines("000D; CR");
13258    $file->insert_lines("000A; LF");
13259
13260    # Also from http://www.unicode.org/reports/tr29/tr29-3.html.
13261    foreach my $code_point ( qw{
13262                                09BE 09D7 0B3E 0B57 0BBE 0BD7 0CC2 0CD5 0CD6
13263                                0D3E 0D57 0DCF 0DDF FF9E FF9F 1D165 1D16E 1D16F
13264                                }
13265    ) {
13266        my $category = $gc->value_of(hex $code_point);
13267        next if ! defined $category || $category eq 'Cn'; # But not if
13268                                                          # unassigned in this
13269                                                          # release
13270        $file->insert_lines("$code_point; Extend");
13271    }
13272
13273    my $hst = property_ref('Hangul_Syllable_Type');
13274    if ($hst->count > 0) {
13275        foreach my $range ($hst->ranges) {
13276            $file->insert_lines(sprintf "%04X..%04X; %s",
13277                                    $range->start, $range->end, $range->value);
13278        }
13279    }
13280    else {
13281        generate_hst($file);
13282    }
13283
13284    main::process_generic_property_file($file);
13285}
13286
13287
13288sub fixup_early_perl_name_alias($file) {
13289
13290    # Different versions of Unicode have varying support for the name synonyms
13291    # below.  Just include everything.  As of 6.1, all these are correct in
13292    # the Unicode-supplied file.
13293
13294    # ALERT did not come along until 6.0, at which point it became preferred
13295    # over BELL.  By inserting it last in early releases, BELL is preferred
13296    # over it; and vice-vers in 6.0
13297    my $type_for_bell = ($v_version lt v6.0.0)
13298               ? 'correction'
13299               : 'alternate';
13300    $file->insert_lines(split /\n/, <<END
133010007;BELL; $type_for_bell
13302000A;LINE FEED (LF);alternate
13303000C;FORM FEED (FF);alternate
13304000D;CARRIAGE RETURN (CR);alternate
133050085;NEXT LINE (NEL);alternate
13306END
13307
13308    );
13309
13310    # One might think that the 'Unicode_1_Name' field, could work for most
13311    # of the above names, but sadly that field varies depending on the
13312    # release.  Version 1.1.5 had no names for any of the controls; Version
13313    # 2.0 introduced names for the C0 controls, and 3.0 introduced C1 names.
13314    # 3.0.1 removed the name INDEX; and 3.2 changed some names:
13315    #   changed to parenthesized versions like "NEXT LINE" to
13316    #       "NEXT LINE (NEL)";
13317    #   changed PARTIAL LINE DOWN to PARTIAL LINE FORWARD
13318    #   changed PARTIAL LINE UP to PARTIAL LINE BACKWARD;;
13319    #   changed e.g. FILE SEPARATOR to INFORMATION SEPARATOR FOUR
13320    #
13321    # All these are present in the 6.1 NameAliases.txt
13322
13323    return;
13324}
13325
13326sub filter_later_version_name_alias_line {
13327
13328    # This file has an extra entry per line for the alias type.  This is
13329    # handled by creating a compound entry: "$alias: $type";  First, split
13330    # the line into components.
13331    my ($range, $alias, $type, @remainder)
13332        = split /\s*;\s*/, $_, -1; # -1 => retain trailing null fields
13333
13334    # This file contains multiple entries for some components, so tell the
13335    # downstream code to allow this in our internal tables; the
13336    # $MULTIPLE_AFTER preserves the input ordering.
13337    $_ = join ";", $range, $CMD_DELIM
13338                           . $REPLACE_CMD
13339                           . '='
13340                           . $MULTIPLE_AFTER
13341                           . $CMD_DELIM
13342                           . "$alias: $type",
13343                   @remainder;
13344    return;
13345}
13346
13347sub filter_early_version_name_alias_line {
13348
13349    # Early versions did not have the trailing alias type field; implicitly it
13350    # was 'correction'.
13351    $_ .= "; correction";
13352
13353    filter_later_version_name_alias_line;
13354    return;
13355}
13356
13357sub filter_all_caps_script_names {
13358
13359    # Some early Unicode releases had the script names in all CAPS.  This
13360    # converts them to just the first letter of each word being capital.
13361
13362    my ($range, $script, @remainder)
13363        = split /\s*;\s*/, $_, -1; # -1 => retain trailing null fields
13364    my @words = split /[_-]/, $script;
13365    for my $word (@words) {
13366        $word =
13367            ucfirst(lc($word)) if $word ne 'CJK';
13368    }
13369    $script = join "_", @words;
13370    $_ = join ";", $range, $script, @remainder;
13371}
13372
13373sub finish_Unicode() {
13374    # This routine should be called after all the Unicode files have been read
13375    # in.  It:
13376    # 1) Creates properties that are missing from the version of Unicode being
13377    #    compiled, and which, for whatever reason, are needed for the Perl
13378    #    core to function properly.  These are minimally populated as
13379    #    necessary.
13380    # 2) Adds the mappings for code points missing from the files which have
13381    #    defaults specified for them.
13382    # 3) At this point all mappings are known, so it computes the type of
13383    #    each property whose type hasn't been determined yet.
13384    # 4) Calculates all the regular expression match tables based on the
13385    #    mappings.
13386    # 5) Calculates and adds the tables which are defined by Unicode, but
13387    #    which aren't derived by them, and certain derived tables that Perl
13388    #    uses.
13389
13390    # Folding information was introduced later into Unicode data.  To get
13391    # Perl's case ignore (/i) to work at all in releases that don't have
13392    # folding, use the best available alternative, which is lower casing.
13393    my $fold = property_ref('Case_Folding');
13394    if ($fold->is_empty) {
13395        $fold->initialize(property_ref('Lowercase_Mapping'));
13396        $fold->add_note(join_lines(<<END
13397WARNING: This table uses lower case as a substitute for missing fold
13398information
13399END
13400        ));
13401    }
13402
13403    # Multiple-character mapping was introduced later into Unicode data, so it
13404    # is by default the simple version.  If to output the simple versions and
13405    # not present, just use the regular (which in these Unicode versions is
13406    # the simple as well).
13407    foreach my $map (qw {   Uppercase_Mapping
13408                            Lowercase_Mapping
13409                            Titlecase_Mapping
13410                            Case_Folding
13411                        } )
13412    {
13413        my $comment = <<END;
13414
13415Note that although the Perl core uses this file, it has the standard values
13416for code points from U+0000 to U+00FF compiled in, so changing this table will
13417not change the core's behavior with respect to these code points.  Use
13418Unicode::Casing to override this table.
13419END
13420        if ($map eq 'Case_Folding') {
13421            $comment .= <<END;
13422(/i regex matching is not overridable except by using a custom regex engine)
13423END
13424        }
13425        property_ref($map)->add_comment(join_lines($comment));
13426        my $simple = property_ref("Simple_$map");
13427        next if ! $simple->is_empty;
13428        if ($simple->to_output_map) {
13429            $simple->initialize(property_ref($map));
13430        }
13431        else {
13432            property_ref($map)->set_proxy_for($simple->name);
13433        }
13434    }
13435
13436    # For each property, fill in any missing mappings, and calculate the re
13437    # match tables.  If a property has more than one missing mapping, the
13438    # default is a reference to a data structure, and may require data from
13439    # other properties to resolve.  The sort is used to cause these to be
13440    # processed last, after all the other properties have been calculated.
13441    # (Fortunately, the missing properties so far don't depend on each other.)
13442    foreach my $property
13443        (sort { (defined $a->default_map && ref $a->default_map) ? 1 : -1 }
13444        property_ref('*'))
13445    {
13446        # $perl has been defined, but isn't one of the Unicode properties that
13447        # need to be finished up.
13448        next if $property == $perl;
13449
13450        # Nor do we need to do anything with properties that aren't going to
13451        # be output.
13452        next if $property->fate == $SUPPRESSED;
13453
13454        # Handle the properties that have more than one possible default
13455        if (ref $property->default_map) {
13456            my $default_map = $property->default_map;
13457
13458            # These properties have stored in the default_map:
13459            # One or more of:
13460            #   1)  A default map which applies to all code points in a
13461            #       certain class
13462            #   2)  an expression which will evaluate to the list of code
13463            #       points in that class
13464            # And
13465            #   3) the default map which applies to every other missing code
13466            #      point.
13467            #
13468            # Go through each list.
13469            while (my ($default, $eval) = $default_map->get_next_defaults) {
13470
13471                # Get the class list, and intersect it with all the so-far
13472                # unspecified code points yielding all the code points
13473                # in the class that haven't been specified.
13474                my $list = eval $eval;
13475                if ($@) {
13476                    Carp::my_carp("Can't set some defaults for missing code points for $property because eval '$eval' failed with '$@'");
13477                    last;
13478                }
13479
13480                # Narrow down the list to just those code points we don't have
13481                # maps for yet.
13482                $list = $list & $property->inverse_list;
13483
13484                # Add mappings to the property for each code point in the list
13485                foreach my $range ($list->ranges) {
13486                    $property->add_map($range->start, $range->end, $default,
13487                    Replace => $CROAK);
13488                }
13489            }
13490
13491            # All remaining code points have the other mapping.  Set that up
13492            # so the normal single-default mapping code will work on them
13493            $property->set_default_map($default_map->other_default);
13494
13495            # And fall through to do that
13496        }
13497
13498        # We should have enough data now to compute the type of the property.
13499        my $property_name = $property->name;
13500        $property->compute_type;
13501        my $property_type = $property->type;
13502
13503        next if ! $property->to_create_match_tables;
13504
13505        # Here want to create match tables for this property
13506
13507        # The Unicode db always (so far, and they claim into the future) have
13508        # the default for missing entries in binary properties be 'N' (unless
13509        # there is a '@missing' line that specifies otherwise)
13510        if (! defined $property->default_map) {
13511            if ($property_type == $BINARY) {
13512                $property->set_default_map('N');
13513            }
13514            elsif ($property_type == $ENUM) {
13515                Carp::my_carp("Property '$property_name doesn't have a default mapping.  Using a fake one");
13516                $property->set_default_map('XXX This makes sure there is a default map');
13517            }
13518        }
13519
13520        # Add any remaining code points to the mapping, using the default for
13521        # missing code points.
13522        my $default_table;
13523        my $default_map = $property->default_map;
13524        if ($property_type == $FORCED_BINARY) {
13525
13526            # A forced binary property creates a 'Y' table that matches all
13527            # non-default values.  The actual string values are also written out
13528            # as a map table.  (The default value will almost certainly be the
13529            # empty string, so the pod glosses over the distinction, and just
13530            # talks about empty vs non-empty.)
13531            my $yes = $property->table("Y");
13532            foreach my $range ($property->ranges) {
13533                next if $range->value eq $default_map;
13534                $yes->add_range($range->start, $range->end);
13535            }
13536            $property->table("N")->set_complement($yes);
13537        }
13538        else {
13539            if (defined $default_map) {
13540
13541                # Make sure there is a match table for the default
13542                if (! defined ($default_table = $property->table($default_map)))
13543                {
13544                    $default_table = $property->add_match_table($default_map);
13545                }
13546
13547                # And, if the property is binary, the default table will just
13548                # be the complement of the other table.
13549                if ($property_type == $BINARY) {
13550                    my $non_default_table;
13551
13552                    # Find the non-default table.
13553                    for my $table ($property->tables) {
13554                        if ($table == $default_table) {
13555                            if ($v_version le v5.0.0) {
13556                                $table->add_alias($_) for qw(N No F False);
13557                            }
13558                            next;
13559                        } elsif ($v_version le v5.0.0) {
13560                            $table->add_alias($_) for qw(Y Yes T True);
13561                        }
13562                        $non_default_table = $table;
13563                    }
13564                    $default_table->set_complement($non_default_table);
13565                }
13566                else {
13567
13568                    # This fills in any missing values with the default.  It's
13569                    # not necessary to do this with binary properties, as the
13570                    # default is defined completely in terms of the Y table.
13571                    $property->add_map(0, $MAX_WORKING_CODEPOINT,
13572                                    $default_map, Replace => $NO);
13573                }
13574            }
13575
13576            # Have all we need to populate the match tables.
13577            my $maps_should_be_defined = $property->pre_declared_maps;
13578            foreach my $range ($property->ranges) {
13579                my $map = $range->value;
13580                my $table = $property->table($map);
13581                if (! defined $table) {
13582
13583                    # Integral and rational property values are not
13584                    # necessarily defined in PropValueAliases, but whether all
13585                    # the other ones should be depends on the property.
13586                    if ($maps_should_be_defined
13587                        && $map !~ /^ -? \d+ ( \/ \d+ )? $/x)
13588                    {
13589                        Carp::my_carp("Table '$property_name=$map' should "
13590                                    . "have been defined.  Defining it now.")
13591                    }
13592                    $table = $property->add_match_table($map);
13593                }
13594
13595                next if $table->complement != 0; # Don't need to populate these
13596                $table->add_range($range->start, $range->end);
13597            }
13598        }
13599
13600        # For Perl 5.6 compatibility, all properties matchable in regexes can
13601        # have an optional 'Is_' prefix.  This is now done in Unicode::UCD.
13602        # But warn if this creates a conflict with a (new) Unicode property
13603        # name, although it appears that Unicode has made a decision never to
13604        # begin a property name with 'Is_', so this shouldn't happen.
13605        foreach my $alias ($property->aliases) {
13606            my $Is_name = 'Is_' . $alias->name;
13607            if (defined (my $pre_existing = property_ref($Is_name))) {
13608                Carp::my_carp(<<END
13609There is already an alias named $Is_name (from " . $pre_existing . "), so
13610creating one for $property won't work.  This is bad news.  If it is not too
13611late, get Unicode to back off.  Otherwise go back to the old scheme (findable
13612from the git blame log for this area of the code that suppressed individual
13613aliases that conflict with the new Unicode names.  Proceeding anyway.
13614END
13615                );
13616            }
13617        } # End of loop through aliases for this property
13618
13619
13620        # Properties that have sets of values for some characters are now
13621        # converted.  For example, the Script_Extensions property started out
13622        # as a clone of the Script property.  But processing its data file
13623        # caused some elements to be replaced with different data.  (These
13624        # elements were for the Common and Inherited properties.)  This data
13625        # is a qw() list of all the scripts that the code points in the given
13626        # range are in.  An example line is:
13627        #
13628        # 060C          ; Arab Syrc Thaa # Po       ARABIC COMMA
13629        #
13630        # Code executed earlier has created a new match table named "Arab Syrc
13631        # Thaa" which contains 060C.  (The cloned table started out with this
13632        # code point mapping to "Common".)  Now we add 060C to each of the
13633        # Arab, Syrc, and Thaa match tables.  Then we delete the now spurious
13634        # "Arab Syrc Thaa" match table.  This is repeated for all these tables
13635        # and ranges.  The map data is retained in the map table for
13636        # reference, but the spurious match tables are deleted.
13637        my $format = $property->format;
13638        if (defined $format && $format eq $STRING_WHITE_SPACE_LIST) {
13639            foreach my $table ($property->tables) {
13640
13641                # Space separates the entries which should go in multiple
13642                # tables
13643                next unless $table->name =~ /\s/;
13644
13645                # The list of the entries, hence the names of the tables that
13646                # everything in this combo table should be added to.
13647                my @list = split /\s+/, $table->name;
13648
13649                # Add the entries from the combo table to each individual
13650                # table
13651                foreach my $individual (@list) {
13652                    my $existing_table = $property->table($individual);
13653
13654                    # This should only be necessary if this particular entry
13655                    # occurs only in combo with others.
13656                    $existing_table = $property->add_match_table($individual)
13657                                                unless defined $existing_table;
13658                    $existing_table += $table;
13659                }
13660                $property->delete_match_table($table);
13661            }
13662        }
13663    } # End of loop through all Unicode properties.
13664
13665    # Fill in the mappings that Unicode doesn't completely furnish.  First the
13666    # single letter major general categories.  If Unicode were to start
13667    # delivering the values, this would be redundant, but better that than to
13668    # try to figure out if should skip and not get it right.  Ths could happen
13669    # if a new major category were to be introduced, and the hard-coded test
13670    # wouldn't know about it.
13671    # This routine depends on the standard names for the general categories
13672    # being what it thinks they are, like 'Cn'.  The major categories are the
13673    # union of all the general category tables which have the same first
13674    # letters. eg. L = Lu + Lt + Ll + Lo + Lm
13675    foreach my $minor_table ($gc->tables) {
13676        my $minor_name = $minor_table->name;
13677        next if length $minor_name == 1;
13678        if (length $minor_name != 2) {
13679            Carp::my_carp_bug("Unexpected general category '$minor_name'.  Skipped.");
13680            next;
13681        }
13682
13683        my $major_name = uc(substr($minor_name, 0, 1));
13684        my $major_table = $gc->table($major_name);
13685        $major_table += $minor_table;
13686    }
13687
13688    # LC is Ll, Lu, and Lt.  (used to be L& or L_, but PropValueAliases.txt
13689    # defines it as LC)
13690    my $LC = $gc->table('LC');
13691    $LC->add_alias('L_', Status => $DISCOURAGED);   # For backwards...
13692    $LC->add_alias('L&', Status => $DISCOURAGED);   # compatibility.
13693
13694
13695    if ($LC->is_empty) { # Assume if not empty that Unicode has started to
13696                         # deliver the correct values in it
13697        $LC->initialize($gc->table('Ll') + $gc->table('Lu'));
13698
13699        # Lt not in release 1.
13700        if (defined $gc->table('Lt')) {
13701            $LC += $gc->table('Lt');
13702            $gc->table('Lt')->set_caseless_equivalent($LC);
13703        }
13704    }
13705    $LC->add_description('[\p{Ll}\p{Lu}\p{Lt}]');
13706
13707    $gc->table('Ll')->set_caseless_equivalent($LC);
13708    $gc->table('Lu')->set_caseless_equivalent($LC);
13709
13710    # Create digit and case fold tables with the original file names for
13711    # backwards compatibility with applications that read them directly.
13712    my $Digit = Property->new("Legacy_Perl_Decimal_Digit",
13713                              Default_Map => "",
13714                              File => 'Digit',    # Trad. location
13715                              Directory => $map_directory,
13716                              Type => $STRING,
13717                              Replacement_Property => "Perl_Decimal_Digit",
13718                              Initialize => property_ref('Perl_Decimal_Digit'),
13719                            );
13720    $Digit->add_comment(join_lines(<<END
13721This file gives the mapping of all code points which represent a single
13722decimal digit [0-9] to their respective digits.  For example, the code point
13723U+0031 (an ASCII '1') is mapped to a numeric 1.  These code points are those
13724that have Numeric_Type=Decimal; not special things, like subscripts nor Roman
13725numerals.
13726END
13727    ));
13728
13729    # Make sure this assumption in perl core code is valid in this Unicode
13730    # release, with known exceptions
13731    foreach my $range (property_ref('Numeric-Type')->table('Decimal')->ranges) {
13732        next if $range->end - $range->start == 9;
13733        next if $range->start == 0x1D7CE;   # This whole range was added in 3.1
13734        next if $range->end == 0x19DA && $v_version eq v5.2.0;
13735        next if $range->end - $range->start < 9 && $v_version le 4.0.0;
13736        Carp::my_carp("Range $range unexpectedly doesn't contain 10"
13737                    . " decimal digits.  Code in regcomp.c assumes it does,"
13738                    . " and will have to be fixed.  Proceeding anyway.");
13739    }
13740
13741    Property->new('Legacy_Case_Folding',
13742                    File => "Fold",
13743                    Directory => $map_directory,
13744                    Default_Map => $CODE_POINT,
13745                    Type => $STRING,
13746                    Replacement_Property => "Case_Folding",
13747                    Format => $HEX_FORMAT,
13748                    Initialize => property_ref('cf'),
13749    );
13750
13751    # Mark the scx table as the parent of the corresponding sc table for those
13752    # which are identical.  This causes the pod for the script table to refer
13753    # to the corresponding scx one.  This is done after everything, so as to
13754    # wait until the tables are stabilized before checking for equivalency.
13755    if (defined $scx) {
13756        if (defined $pod_directory) {
13757            foreach my $table ($scx->tables) {
13758                my $plain_sc_equiv = $script->table($table->name);
13759                if ($table->matches_identically_to($plain_sc_equiv)) {
13760                    $plain_sc_equiv->set_equivalent_to($table, Related => 1);
13761                }
13762            }
13763        }
13764    }
13765
13766    return;
13767}
13768
13769sub pre_3_dot_1_Nl () {
13770
13771    # Return a range list for gc=nl for Unicode versions prior to 3.1, which
13772    # is when Unicode's became fully usable.  These code points were
13773    # determined by inspection and experimentation.  gc=nl is important for
13774    # certain Perl-extension properties that should be available in all
13775    # releases.
13776
13777    my $Nl = Range_List->new();
13778    if (defined (my $official = $gc->table('Nl'))) {
13779        $Nl += $official;
13780    }
13781    else {
13782        $Nl->add_range(0x2160, 0x2182);
13783        $Nl->add_range(0x3007, 0x3007);
13784        $Nl->add_range(0x3021, 0x3029);
13785    }
13786    $Nl->add_range(0xFE20, 0xFE23);
13787    $Nl->add_range(0x16EE, 0x16F0) if $v_version ge v3.0.0; # 3.0 was when
13788                                                            # these were added
13789    return $Nl;
13790}
13791
13792sub calculate_Assigned() {  # Set $Assigned to the gc != Cn code points; may be
13793                            # called before the Cn's are completely filled.
13794                            # Works on Unicodes earlier than ones that
13795                            # explicitly specify Cn.
13796    return if defined $Assigned;
13797
13798    if (! defined $gc || $gc->is_empty()) {
13799        Carp::my_carp_bug("calculate_Assigned() called before $gc is populated");
13800    }
13801
13802    $Assigned = $perl->add_match_table('Assigned',
13803                                Description  => "All assigned code points",
13804                                );
13805    while (defined (my $range = $gc->each_range())) {
13806        my $standard_value = standardize($range->value);
13807        next if $standard_value eq 'cn' || $standard_value eq 'unassigned';
13808        $Assigned->add_range($range->start, $range->end);
13809    }
13810}
13811
13812sub calculate_DI() {    # Set $DI to a Range_List equivalent to the
13813                        # Default_Ignorable_Code_Point property.  Works on
13814                        # Unicodes earlier than ones that explicitly specify
13815                        # DI.
13816    return if defined $DI;
13817
13818    if (defined (my $di = property_ref('Default_Ignorable_Code_Point'))) {
13819        $DI = $di->table('Y');
13820    }
13821    else {
13822        $DI = Range_List->new(Initialize => [ 0x180B .. 0x180D,
13823                                              0x2060 .. 0x206F,
13824                                              0xFE00 .. 0xFE0F,
13825                                              0xFFF0 .. 0xFFFB,
13826                                            ]);
13827        if ($v_version ge v2.0) {
13828            $DI += $gc->table('Cf')
13829                +  $gc->table('Cs');
13830
13831            # These are above the Unicode version 1 max
13832            $DI->add_range(0xE0000, 0xE0FFF);
13833        }
13834        $DI += $gc->table('Cc')
13835             - ord("\t")
13836             - utf8::unicode_to_native(0x0A)  # LINE FEED
13837             - utf8::unicode_to_native(0x0B)  # VERTICAL TAB
13838             - ord("\f")
13839             - utf8::unicode_to_native(0x0D)  # CARRIAGE RETURN
13840             - utf8::unicode_to_native(0x85); # NEL
13841    }
13842}
13843
13844sub calculate_NChar() {  # Create a Perl extension match table which is the
13845                         # same as the Noncharacter_Code_Point property, and
13846                         # set $NChar to point to it.  Works on Unicodes
13847                         # earlier than ones that explicitly specify NChar
13848    return if defined $NChar;
13849
13850    $NChar = $perl->add_match_table('_Perl_Nchar',
13851                                    Perl_Extension => 1,
13852                                    Fate => $INTERNAL_ONLY);
13853    if (defined (my $off_nchar = property_ref('NChar'))) {
13854        $NChar->initialize($off_nchar->table('Y'));
13855    }
13856    else {
13857        $NChar->initialize([ 0xFFFE .. 0xFFFF ]);
13858        if ($v_version ge v2.0) {   # First release with these nchars
13859            for (my $i = 0x1FFFE; $i <= 0x10FFFE; $i += 0x10000) {
13860                $NChar += [ $i .. $i+1 ];
13861            }
13862        }
13863    }
13864}
13865
13866sub handle_compare_versions () {
13867    # This fixes things up for the $compare_versions capability, where we
13868    # compare Unicode version X with version Y (with Y > X), and we are
13869    # running it on the Unicode Data for version Y.
13870    #
13871    # It works by calculating the code points whose meaning has been specified
13872    # after release X, by using the Age property.  The complement of this set
13873    # is the set of code points whose meaning is unchanged between the
13874    # releases.  This is the set the program restricts itself to.  It includes
13875    # everything whose meaning has been specified by the time version X came
13876    # along, plus those still unassigned by the time of version Y.  (We will
13877    # continue to use the word 'assigned' to mean 'meaning has been
13878    # specified', as it's shorter and is accurate in all cases except the
13879    # Noncharacter code points.)
13880    #
13881    # This function is run after all the properties specified by Unicode have
13882    # been calculated for release Y.  This makes sure we get all the nuances
13883    # of Y's rules.  (It is done before the Perl extensions are calculated, as
13884    # those are based entirely on the Unicode ones.)  But doing it after the
13885    # Unicode table calculations means we have to fix up the Unicode tables.
13886    # We do this by subtracting the code points that have been assigned since
13887    # X (which is actually done by ANDing each table of assigned code points
13888    # with the set of unchanged code points).  Most Unicode properties are of
13889    # the form such that all unassigned code points have a default, grab-bag,
13890    # property value which is changed when the code point gets assigned.  For
13891    # these, we just remove the changed code points from the table for the
13892    # latter property value, and add them back in to the grab-bag one.  A few
13893    # other properties are not entirely of this form and have values for some
13894    # or all unassigned code points that are not the grab-bag one.  These have
13895    # to be handled specially, and are hard-coded in to this routine based on
13896    # manual inspection of the Unicode character database.  A list of the
13897    # outlier code points is made for each of these properties, and those
13898    # outliers are excluded from adding and removing from tables.
13899    #
13900    # Note that there are glitches when comparing against Unicode 1.1, as some
13901    # Hangul syllables in it were later ripped out and eventually replaced
13902    # with other things.
13903
13904    print "Fixing up for version comparison\n" if $verbosity >= $PROGRESS;
13905
13906    my $after_first_version = "All matching code points were added after "
13907                            . "Unicode $string_compare_versions";
13908
13909    # Calculate the delta as those code points that have been newly assigned
13910    # since the first compare version.
13911    my $delta = Range_List->new();
13912    foreach my $table ($age->tables) {
13913        use version;
13914        next if $table == $age->table('Unassigned');
13915        next if version->parse($table->name)
13916             le version->parse($string_compare_versions);
13917        $delta += $table;
13918    }
13919    if ($delta->is_empty) {
13920        die ("No changes; perhaps you need a 'DAge.txt' file?");
13921    }
13922
13923    my $unchanged = ~ $delta;
13924
13925    calculate_Assigned() if ! defined $Assigned;
13926    $Assigned &= $unchanged;
13927
13928    # $Assigned now contains the code points that were assigned as of Unicode
13929    # version X.
13930
13931    # A block is all or nothing.  If nothing is assigned in it, it all goes
13932    # back to the No_Block pool; but if even one code point is assigned, the
13933    # block is retained.
13934    my $no_block = $block->table('No_Block');
13935    foreach my $this_block ($block->tables) {
13936        next if     $this_block == $no_block
13937                ||  ! ($this_block & $Assigned)->is_empty;
13938        $this_block->set_fate($SUPPRESSED, $after_first_version);
13939        foreach my $range ($this_block->ranges) {
13940            $block->replace_map($range->start, $range->end, 'No_Block')
13941        }
13942        $no_block += $this_block;
13943    }
13944
13945    my @special_delta_properties;   # List of properties that have to be
13946                                    # handled specially.
13947    my %restricted_delta;           # Keys are the entries in
13948                                    # @special_delta_properties;  values
13949                                    # are the range list of the code points
13950                                    # that behave normally when they get
13951                                    # assigned.
13952
13953    # In the next three properties, the Default Ignorable code points are
13954    # outliers.
13955    calculate_DI();
13956    $DI &= $unchanged;
13957
13958    push @special_delta_properties, property_ref('_Perl_GCB');
13959    $restricted_delta{$special_delta_properties[-1]} = ~ $DI;
13960
13961    if (defined (my $cwnfkcc = property_ref('Changes_When_NFKC_Casefolded')))
13962    {
13963        push @special_delta_properties, $cwnfkcc;
13964        $restricted_delta{$special_delta_properties[-1]} = ~ $DI;
13965    }
13966
13967    calculate_NChar();      # Non-character code points
13968    $NChar &= $unchanged;
13969
13970    # This may have to be updated from time-to-time to get the most accurate
13971    # results.
13972    my $default_BC_non_LtoR = Range_List->new(Initialize =>
13973                        # These came from the comments in v8.0 DBidiClass.txt
13974                                                        [ # AL
13975                                                            0x0600 .. 0x07BF,
13976                                                            0x08A0 .. 0x08FF,
13977                                                            0xFB50 .. 0xFDCF,
13978                                                            0xFDF0 .. 0xFDFF,
13979                                                            0xFE70 .. 0xFEFF,
13980                                                            0x1EE00 .. 0x1EEFF,
13981                                                           # R
13982                                                            0x0590 .. 0x05FF,
13983                                                            0x07C0 .. 0x089F,
13984                                                            0xFB1D .. 0xFB4F,
13985                                                            0x10800 .. 0x10FFF,
13986                                                            0x1E800 .. 0x1EDFF,
13987                                                            0x1EF00 .. 0x1EFFF,
13988                                                           # ET
13989                                                            0x20A0 .. 0x20CF,
13990                                                         ]
13991                                          );
13992    $default_BC_non_LtoR += $DI + $NChar;
13993    push @special_delta_properties, property_ref('BidiClass');
13994    $restricted_delta{$special_delta_properties[-1]} = ~ $default_BC_non_LtoR;
13995
13996    if (defined (my $eaw = property_ref('East_Asian_Width'))) {
13997
13998        my $default_EA_width_W = Range_List->new(Initialize =>
13999                                    # From comments in v8.0 EastAsianWidth.txt
14000                                                [
14001                                                    0x3400 .. 0x4DBF,
14002                                                    0x4E00 .. 0x9FFF,
14003                                                    0xF900 .. 0xFAFF,
14004                                                    0x20000 .. 0x2A6DF,
14005                                                    0x2A700 .. 0x2B73F,
14006                                                    0x2B740 .. 0x2B81F,
14007                                                    0x2B820 .. 0x2CEAF,
14008                                                    0x2F800 .. 0x2FA1F,
14009                                                    0x20000 .. 0x2FFFD,
14010                                                    0x30000 .. 0x3FFFD,
14011                                                ]
14012                                             );
14013        push @special_delta_properties, $eaw;
14014        $restricted_delta{$special_delta_properties[-1]}
14015                                                       = ~ $default_EA_width_W;
14016
14017        # Line break came along in the same release as East_Asian_Width, and
14018        # the non-grab-bag default set is a superset of the EAW one.
14019        if (defined (my $lb = property_ref('Line_Break'))) {
14020            my $default_LB_non_XX = Range_List->new(Initialize =>
14021                                        # From comments in v8.0 LineBreak.txt
14022                                                        [ 0x20A0 .. 0x20CF ]);
14023            $default_LB_non_XX += $default_EA_width_W;
14024            push @special_delta_properties, $lb;
14025            $restricted_delta{$special_delta_properties[-1]}
14026                                                        = ~ $default_LB_non_XX;
14027        }
14028    }
14029
14030    # Go through every property, skipping those we've already worked on, those
14031    # that are immutable, and the perl ones that will be calculated after this
14032    # routine has done its fixup.
14033    foreach my $property (property_ref('*')) {
14034        next if    $property == $perl     # Done later in the program
14035                || $property == $block    # Done just above
14036                || $property == $DI       # Done just above
14037                || $property == $NChar    # Done just above
14038
14039                   # The next two are invariant across Unicode versions
14040                || $property == property_ref('Pattern_Syntax')
14041                || $property == property_ref('Pattern_White_Space');
14042
14043        #  Find the grab-bag value.
14044        my $default_map = $property->default_map;
14045
14046        if (! $property->to_create_match_tables) {
14047
14048            # Here there aren't any match tables.  So far, all such properties
14049            # have a default map, and don't require special handling.  Just
14050            # change each newly assigned code point back to the default map,
14051            # as if they were unassigned.
14052            foreach my $range ($delta->ranges) {
14053                $property->add_map($range->start,
14054                                $range->end,
14055                                $default_map,
14056                                Replace => $UNCONDITIONALLY);
14057            }
14058        }
14059        else {  # Here there are match tables.  Find the one (if any) for the
14060                # grab-bag value that unassigned code points go to.
14061            my $default_table;
14062            if (defined $default_map) {
14063                $default_table = $property->table($default_map);
14064            }
14065
14066            # If some code points don't go back to the grab-bag when they
14067            # are considered unassigned, exclude them from the list that does
14068            # that.
14069            my $this_delta = $delta;
14070            my $this_unchanged = $unchanged;
14071            if (grep { $_ == $property } @special_delta_properties) {
14072                $this_delta = $delta & $restricted_delta{$property};
14073                $this_unchanged = ~ $this_delta;
14074            }
14075
14076            # Fix up each match table for this property.
14077            foreach my $table ($property->tables) {
14078                if (defined $default_table && $table == $default_table) {
14079
14080                    # The code points assigned after release X (the ones we
14081                    # are excluding in this routine) go back on to the default
14082                    # (grab-bag) table.  However, some of these tables don't
14083                    # actually exist, but are specified solely by the other
14084                    # tables.  (In a binary property, we don't need to
14085                    # actually have an 'N' table, as it's just the complement
14086                    # of the 'Y' table.)  Such tables will be locked, so just
14087                    # skip those.
14088                    $table += $this_delta unless $table->locked;
14089                }
14090                else {
14091
14092                    # Here the table is not for the default value.  We need to
14093                    # subtract the code points we are ignoring for this
14094                    # comparison (the deltas) from it.  But if the table
14095                    # started out with nothing, no need to exclude anything,
14096                    # and want to skip it here anyway, so it gets listed
14097                    # properly in the pod.
14098                    next if $table->is_empty;
14099
14100                    # Save the deltas for later, before we do the subtraction
14101                    my $deltas = $table & $this_delta;
14102
14103                    $table &= $this_unchanged;
14104
14105                    # Suppress the table if the subtraction left it with
14106                    # nothing in it
14107                    if ($table->is_empty) {
14108                        if ($property->type == $BINARY) {
14109                            push @tables_that_may_be_empty, $table->complete_name;
14110                        }
14111                        else {
14112                            $table->set_fate($SUPPRESSED, $after_first_version);
14113                        }
14114                    }
14115
14116                    # Now we add the removed code points to the property's
14117                    # map, as they should now map to the grab-bag default
14118                    # property (which they did in the first comparison
14119                    # version).  But we don't have to do this if the map is
14120                    # only for internal use.
14121                    if (defined $default_map && $property->to_output_map) {
14122
14123                        # The gc property has pseudo property values whose names
14124                        # have length 1.  These are the union of all the
14125                        # property values whose name is longer than 1 and
14126                        # whose first letter is all the same.  The replacement
14127                        # is done once for the longer-named tables.
14128                        next if $property == $gc && length $table->name == 1;
14129
14130                        foreach my $range ($deltas->ranges) {
14131                            $property->add_map($range->start,
14132                                            $range->end,
14133                                            $default_map,
14134                                            Replace => $UNCONDITIONALLY);
14135                        }
14136                    }
14137                }
14138            }
14139        }
14140    }
14141
14142    # The above code doesn't work on 'gc=C', as it is a superset of the default
14143    # ('Cn') table.  It's easiest to just special case it here.
14144    my $C = $gc->table('C');
14145    $C += $gc->table('Cn');
14146
14147    return;
14148}
14149
14150sub compile_perl() {
14151    # Create perl-defined tables.  Almost all are part of the pseudo-property
14152    # named 'perl' internally to this program.  Many of these are recommended
14153    # in UTS#18 "Unicode Regular Expressions", and their derivations are based
14154    # on those found there.
14155    # Almost all of these are equivalent to some Unicode property.
14156    # A number of these properties have equivalents restricted to the ASCII
14157    # range, with their names prefaced by 'Posix', to signify that these match
14158    # what the Posix standard says they should match.  A couple are
14159    # effectively this, but the name doesn't have 'Posix' in it because there
14160    # just isn't any Posix equivalent.  'XPosix' are the Posix tables extended
14161    # to the full Unicode range, by our guesses as to what is appropriate.
14162
14163    # 'All' is all code points.  As an error check, instead of just setting it
14164    # to be that, construct it to be the union of all the major categories
14165    $All = $perl->add_match_table('All',
14166      Description
14167        => "All code points, including those above Unicode.  Same as qr/./s",
14168      Matches_All => 1);
14169
14170    foreach my $major_table ($gc->tables) {
14171
14172        # Major categories are the ones with single letter names.
14173        next if length($major_table->name) != 1;
14174
14175        $All += $major_table;
14176    }
14177
14178    if ($All->max != $MAX_WORKING_CODEPOINT) {
14179        Carp::my_carp_bug("Generated highest code point ("
14180           . sprintf("%X", $All->max)
14181           . ") doesn't match expected value $MAX_WORKING_CODEPOINT_STRING.")
14182    }
14183    if ($All->range_count != 1 || $All->min != 0) {
14184     Carp::my_carp_bug("Generated table 'All' doesn't match all code points.")
14185    }
14186
14187    my $Any = $perl->add_match_table('Any',
14188                                    Description  => "All Unicode code points");
14189    $Any->add_range(0, $MAX_UNICODE_CODEPOINT);
14190    $Any->add_alias('Unicode');
14191
14192    calculate_Assigned();
14193
14194    my $ASCII = $perl->add_match_table('ASCII');
14195    if (defined $block) {   # This is equivalent to the block if have it.
14196        my $Unicode_ASCII = $block->table('Basic_Latin');
14197        if (defined $Unicode_ASCII && ! $Unicode_ASCII->is_empty) {
14198            $ASCII->set_equivalent_to($Unicode_ASCII, Related => 1);
14199        }
14200    }
14201
14202    # Very early releases didn't have blocks, so initialize ASCII ourselves if
14203    # necessary
14204    if ($ASCII->is_empty) {
14205        if (! NON_ASCII_PLATFORM) {
14206            $ASCII->add_range(0, 127);
14207        }
14208        else {
14209            for my $i (0 .. 127) {
14210                $ASCII->add_range(utf8::unicode_to_native($i),
14211                                  utf8::unicode_to_native($i));
14212            }
14213        }
14214    }
14215
14216    # Get the best available case definitions.  Early Unicode versions didn't
14217    # have Uppercase and Lowercase defined, so use the general category
14218    # instead for them, modified by hard-coding in the code points each is
14219    # missing.
14220    my $Lower = $perl->add_match_table('XPosixLower');
14221    my $Unicode_Lower = property_ref('Lowercase');
14222    if (defined $Unicode_Lower && ! $Unicode_Lower->is_empty) {
14223        $Lower->set_equivalent_to($Unicode_Lower->table('Y'), Related => 1);
14224
14225    }
14226    else {
14227        $Lower += $gc->table('Lowercase_Letter');
14228
14229        # There are quite a few code points in Lower, that aren't in gc=lc,
14230        # and not all are in all releases.
14231        my $temp = Range_List->new(Initialize => [
14232                                                utf8::unicode_to_native(0xAA),
14233                                                utf8::unicode_to_native(0xBA),
14234                                                0x02B0 .. 0x02B8,
14235                                                0x02C0 .. 0x02C1,
14236                                                0x02E0 .. 0x02E4,
14237                                                0x0345,
14238                                                0x037A,
14239                                                0x1D2C .. 0x1D6A,
14240                                                0x1D78,
14241                                                0x1D9B .. 0x1DBF,
14242                                                0x2071,
14243                                                0x207F,
14244                                                0x2090 .. 0x209C,
14245                                                0x2170 .. 0x217F,
14246                                                0x24D0 .. 0x24E9,
14247                                                0x2C7C .. 0x2C7D,
14248                                                0xA770,
14249                                                0xA7F8 .. 0xA7F9,
14250                                ]);
14251        $Lower += $temp & $Assigned;
14252    }
14253    my $Posix_Lower = $perl->add_match_table("PosixLower",
14254                            Initialize => $Lower & $ASCII,
14255                            );
14256
14257    my $Upper = $perl->add_match_table("XPosixUpper");
14258    my $Unicode_Upper = property_ref('Uppercase');
14259    if (defined $Unicode_Upper && ! $Unicode_Upper->is_empty) {
14260        $Upper->set_equivalent_to($Unicode_Upper->table('Y'), Related => 1);
14261    }
14262    else {
14263
14264        # Unlike Lower, there are only two ranges in Upper that aren't in
14265        # gc=Lu, and all code points were assigned in all releases.
14266        $Upper += $gc->table('Uppercase_Letter');
14267        $Upper->add_range(0x2160, 0x216F);  # Uppercase Roman numerals
14268        $Upper->add_range(0x24B6, 0x24CF);  # Circled Latin upper case letters
14269    }
14270    my $Posix_Upper = $perl->add_match_table("PosixUpper",
14271                            Initialize => $Upper & $ASCII,
14272                            );
14273
14274    # Earliest releases didn't have title case.  Initialize it to empty if not
14275    # otherwise present
14276    my $Title = $perl->add_match_table('Title', Full_Name => 'Titlecase',
14277                                       Description => '(= \p{Gc=Lt})');
14278    my $lt = $gc->table('Lt');
14279
14280    # Earlier versions of mktables had this related to $lt since they have
14281    # identical code points, but their caseless equivalents are not the same,
14282    # one being 'Cased' and the other being 'LC', and so now must be kept as
14283    # separate entities.
14284    if (defined $lt) {
14285        $Title += $lt;
14286    }
14287    else {
14288        push @tables_that_may_be_empty, $Title->complete_name;
14289    }
14290
14291    my $Unicode_Cased = property_ref('Cased');
14292    if (defined $Unicode_Cased) {
14293        my $yes = $Unicode_Cased->table('Y');
14294        my $no = $Unicode_Cased->table('N');
14295        $Title->set_caseless_equivalent($yes);
14296        if (defined $Unicode_Upper) {
14297            $Unicode_Upper->table('Y')->set_caseless_equivalent($yes);
14298            $Unicode_Upper->table('N')->set_caseless_equivalent($no);
14299        }
14300        $Upper->set_caseless_equivalent($yes);
14301        if (defined $Unicode_Lower) {
14302            $Unicode_Lower->table('Y')->set_caseless_equivalent($yes);
14303            $Unicode_Lower->table('N')->set_caseless_equivalent($no);
14304        }
14305        $Lower->set_caseless_equivalent($yes);
14306    }
14307    else {
14308        # If this Unicode version doesn't have Cased, set up the Perl
14309        # extension from first principles.  From Unicode 5.1: Definition D120:
14310        # A character C is defined to be cased if and only if C has the
14311        # Lowercase or Uppercase property or has a General_Category value of
14312        # Titlecase_Letter.
14313        my $cased = $perl->add_match_table('Cased',
14314                        Initialize => $Lower + $Upper + $Title,
14315                        Description => 'Uppercase or Lowercase or Titlecase',
14316                        );
14317        # $notcased is purely for the caseless equivalents below
14318        my $notcased = $perl->add_match_table('_Not_Cased',
14319                                Initialize => ~ $cased,
14320                                Fate => $INTERNAL_ONLY,
14321                                Description => 'All not-cased code points');
14322        $Title->set_caseless_equivalent($cased);
14323        if (defined $Unicode_Upper) {
14324            $Unicode_Upper->table('Y')->set_caseless_equivalent($cased);
14325            $Unicode_Upper->table('N')->set_caseless_equivalent($notcased);
14326        }
14327        $Upper->set_caseless_equivalent($cased);
14328        if (defined $Unicode_Lower) {
14329            $Unicode_Lower->table('Y')->set_caseless_equivalent($cased);
14330            $Unicode_Lower->table('N')->set_caseless_equivalent($notcased);
14331        }
14332        $Lower->set_caseless_equivalent($cased);
14333    }
14334
14335    # The remaining perl defined tables are mostly based on Unicode TR 18,
14336    # "Annex C: Compatibility Properties".  All of these have two versions,
14337    # one whose name generally begins with Posix that is posix-compliant, and
14338    # one that matches Unicode characters beyond the Posix, ASCII range
14339
14340    my $Alpha = $perl->add_match_table('XPosixAlpha');
14341
14342    # Alphabetic was not present in early releases
14343    my $Alphabetic = property_ref('Alphabetic');
14344    if (defined $Alphabetic && ! $Alphabetic->is_empty) {
14345        $Alpha->set_equivalent_to($Alphabetic->table('Y'), Related => 1);
14346    }
14347    else {
14348
14349        # The Alphabetic property doesn't exist for early releases, so
14350        # generate it.  The actual definition, in 5.2 terms is:
14351        #
14352        # gc=L + gc=Nl + Other_Alphabetic
14353        #
14354        # Other_Alphabetic is also not defined in these early releases, but it
14355        # contains one gc=So range plus most of gc=Mn and gc=Mc, so we add
14356        # those last two as well, then subtract the relatively few of them that
14357        # shouldn't have been added.  (The gc=So range is the circled capital
14358        # Latin characters.  Early releases mistakenly didn't also include the
14359        # lower-case versions of these characters, and so we don't either, to
14360        # maintain consistency with those releases that first had this
14361        # property.
14362        $Alpha->initialize($gc->table('Letter')
14363                           + pre_3_dot_1_Nl()
14364                           + $gc->table('Mn')
14365                           + $gc->table('Mc')
14366                        );
14367        $Alpha->add_range(0x24D0, 0x24E9);  # gc=So
14368        foreach my $range (     [ 0x0300, 0x0344 ],
14369                                [ 0x0346, 0x034E ],
14370                                [ 0x0360, 0x0362 ],
14371                                [ 0x0483, 0x0486 ],
14372                                [ 0x0591, 0x05AF ],
14373                                [ 0x06DF, 0x06E0 ],
14374                                [ 0x06EA, 0x06EC ],
14375                                [ 0x0740, 0x074A ],
14376                                0x093C,
14377                                0x094D,
14378                                [ 0x0951, 0x0954 ],
14379                                0x09BC,
14380                                0x09CD,
14381                                0x0A3C,
14382                                0x0A4D,
14383                                0x0ABC,
14384                                0x0ACD,
14385                                0x0B3C,
14386                                0x0B4D,
14387                                0x0BCD,
14388                                0x0C4D,
14389                                0x0CCD,
14390                                0x0D4D,
14391                                0x0DCA,
14392                                [ 0x0E47, 0x0E4C ],
14393                                0x0E4E,
14394                                [ 0x0EC8, 0x0ECC ],
14395                                [ 0x0F18, 0x0F19 ],
14396                                0x0F35,
14397                                0x0F37,
14398                                0x0F39,
14399                                [ 0x0F3E, 0x0F3F ],
14400                                [ 0x0F82, 0x0F84 ],
14401                                [ 0x0F86, 0x0F87 ],
14402                                0x0FC6,
14403                                0x1037,
14404                                0x1039,
14405                                [ 0x17C9, 0x17D3 ],
14406                                [ 0x20D0, 0x20DC ],
14407                                0x20E1,
14408                                [ 0x302A, 0x302F ],
14409                                [ 0x3099, 0x309A ],
14410                                [ 0xFE20, 0xFE23 ],
14411                                [ 0x1D165, 0x1D169 ],
14412                                [ 0x1D16D, 0x1D172 ],
14413                                [ 0x1D17B, 0x1D182 ],
14414                                [ 0x1D185, 0x1D18B ],
14415                                [ 0x1D1AA, 0x1D1AD ],
14416        ) {
14417            if (ref $range) {
14418                $Alpha->delete_range($range->[0], $range->[1]);
14419            }
14420            else {
14421                $Alpha->delete_range($range, $range);
14422            }
14423        }
14424        $Alpha->add_description('Alphabetic');
14425        $Alpha->add_alias('Alphabetic');
14426    }
14427    my $Posix_Alpha = $perl->add_match_table("PosixAlpha",
14428                            Initialize => $Alpha & $ASCII,
14429                            );
14430    $Posix_Upper->set_caseless_equivalent($Posix_Alpha);
14431    $Posix_Lower->set_caseless_equivalent($Posix_Alpha);
14432
14433    my $Alnum = $perl->add_match_table('Alnum', Full_Name => 'XPosixAlnum',
14434                        Description => 'Alphabetic and (decimal) Numeric',
14435                        Initialize => $Alpha + $gc->table('Decimal_Number'),
14436                        );
14437    $perl->add_match_table("PosixAlnum",
14438                            Initialize => $Alnum & $ASCII,
14439                            );
14440
14441    my $Word = $perl->add_match_table('Word', Full_Name => 'XPosixWord',
14442                                Description => '\w, including beyond ASCII;'
14443                                            . ' = \p{Alnum} + \pM + \p{Pc}'
14444                                            . ' + \p{Join_Control}',
14445                                Initialize => $Alnum + $gc->table('Mark'),
14446                                );
14447    my $Pc = $gc->table('Connector_Punctuation'); # 'Pc' Not in release 1
14448    if (defined $Pc) {
14449        $Word += $Pc;
14450    }
14451    else {
14452        $Word += ord('_');  # Make sure this is a $Word
14453    }
14454    my $JC = property_ref('Join_Control');  # Wasn't in release 1
14455    if (defined $JC) {
14456        $Word += $JC->table('Y');
14457    }
14458    else {
14459        $Word += 0x200C + 0x200D;
14460    }
14461
14462    # This is a Perl extension, so the name doesn't begin with Posix.
14463    my $PerlWord = $perl->add_match_table('PosixWord',
14464                    Description => '\w, restricted to ASCII',
14465                    Initialize => $Word & $ASCII,
14466                    );
14467    $PerlWord->add_alias('PerlWord');
14468
14469    my $Blank = $perl->add_match_table('Blank', Full_Name => 'XPosixBlank',
14470                                Description => '\h, Horizontal white space',
14471
14472                                # 200B is Zero Width Space which is for line
14473                                # break control, and was listed as
14474                                # Space_Separator in early releases
14475                                Initialize => $gc->table('Space_Separator')
14476                                            +   ord("\t")
14477                                            -   0x200B, # ZWSP
14478                                );
14479    $Blank->add_alias('HorizSpace');        # Another name for it.
14480    $perl->add_match_table("PosixBlank",
14481                            Initialize => $Blank & $ASCII,
14482                            );
14483
14484    my $VertSpace = $perl->add_match_table('VertSpace',
14485                            Description => '\v',
14486                            Initialize =>
14487                               $gc->table('Line_Separator')
14488                             + $gc->table('Paragraph_Separator')
14489                             + utf8::unicode_to_native(0x0A)  # LINE FEED
14490                             + utf8::unicode_to_native(0x0B)  # VERTICAL TAB
14491                             + ord("\f")
14492                             + utf8::unicode_to_native(0x0D)  # CARRIAGE RETURN
14493                             + utf8::unicode_to_native(0x85)  # NEL
14494                    );
14495    # No Posix equivalent for vertical space
14496
14497    my $Space = $perl->add_match_table('XPosixSpace',
14498                Description => '\s including beyond ASCII and vertical tab',
14499                Initialize => $Blank + $VertSpace,
14500    );
14501    $Space->add_alias('XPerlSpace');    # Pre-existing synonyms
14502    $Space->add_alias('SpacePerl');
14503    $Space->add_alias('Space') if $v_version lt v4.1.0;
14504
14505    my $Posix_space = $perl->add_match_table("PosixSpace",
14506                            Initialize => $Space & $ASCII,
14507                            );
14508    $Posix_space->add_alias('PerlSpace'); # A pre-existing synonym
14509
14510    my $Cntrl = $perl->add_match_table('Cntrl', Full_Name => 'XPosixCntrl',
14511                                        Description => 'Control characters');
14512    $Cntrl->set_equivalent_to($gc->table('Cc'), Related => 1);
14513    $perl->add_match_table("PosixCntrl",
14514                            Description => "ASCII control characters",
14515                            Definition =>  "ACK, BEL, BS, CAN, CR, DC1, DC2,"
14516                                         . " DC3, DC4, DEL, DLE, ENQ, EOM,"
14517                                         . " EOT, ESC, ETB, ETX, FF, FS, GS,"
14518                                         . " HT, LF, NAK, NUL, RS, SI, SO,"
14519                                         . " SOH, STX, SUB, SYN, US, VT",
14520                            Initialize => $Cntrl & $ASCII,
14521                            );
14522
14523    my $perl_surrogate = $perl->add_match_table('_Perl_Surrogate');
14524    my $Cs = $gc->table('Cs');
14525    if (defined $Cs && ! $Cs->is_empty) {
14526        $perl_surrogate += $Cs;
14527    }
14528    else {
14529        push @tables_that_may_be_empty, '_Perl_Surrogate';
14530    }
14531
14532    # $controls is a temporary used to construct Graph.
14533    my $controls = Range_List->new(Initialize => $gc->table('Unassigned')
14534                                                + $gc->table('Control')
14535                                                + $perl_surrogate);
14536
14537    # Graph is  ~space &  ~(Cc|Cs|Cn) = ~(space + $controls)
14538    my $Graph = $perl->add_match_table('Graph', Full_Name => 'XPosixGraph',
14539                        Description => 'Characters that are graphical',
14540                        Initialize => ~ ($Space + $controls),
14541                        );
14542    $perl->add_match_table("PosixGraph",
14543                            Initialize => $Graph & $ASCII,
14544                            );
14545
14546    $print = $perl->add_match_table('Print', Full_Name => 'XPosixPrint',
14547                        Description => 'Characters that are graphical plus space characters (but no controls)',
14548                        Initialize => $Blank + $Graph - $gc->table('Control'),
14549                        );
14550    $perl->add_match_table("PosixPrint",
14551                            Initialize => $print & $ASCII,
14552                            );
14553
14554    my $Punct = $perl->add_match_table('Punct');
14555    $Punct->set_equivalent_to($gc->table('Punctuation'), Related => 1);
14556
14557    # \p{punct} doesn't include the symbols, which posix does
14558    my $XPosixPunct = $perl->add_match_table('XPosixPunct',
14559                    Description => '\p{Punct} + ASCII-range \p{Symbol}',
14560                    Initialize => $gc->table('Punctuation')
14561                                + ($ASCII & $gc->table('Symbol')),
14562                                Perl_Extension => 1
14563        );
14564    $perl->add_match_table('PosixPunct', Perl_Extension => 1,
14565        Initialize => $ASCII & $XPosixPunct,
14566        );
14567
14568    my $Digit = $perl->add_match_table('Digit', Full_Name => 'XPosixDigit',
14569                            Description => '[0-9] + all other decimal digits');
14570    $Digit->set_equivalent_to($gc->table('Decimal_Number'), Related => 1);
14571    my $PosixDigit = $perl->add_match_table("PosixDigit",
14572                                            Initialize => $Digit & $ASCII,
14573                                            );
14574
14575    # Hex_Digit was not present in first release
14576    my $Xdigit = $perl->add_match_table('XDigit', Full_Name => 'XPosixXDigit');
14577    my $Hex = property_ref('Hex_Digit');
14578    if (defined $Hex && ! $Hex->is_empty) {
14579        $Xdigit->set_equivalent_to($Hex->table('Y'), Related => 1);
14580    }
14581    else {
14582        $Xdigit->initialize([ ord('0') .. ord('9'),
14583                              ord('A') .. ord('F'),
14584                              ord('a') .. ord('f'),
14585                              0xFF10..0xFF19, 0xFF21..0xFF26, 0xFF41..0xFF46]);
14586    }
14587
14588    # AHex was not present in early releases
14589    my $PosixXDigit = $perl->add_match_table('PosixXDigit');
14590    my $AHex = property_ref('ASCII_Hex_Digit');
14591    if (defined $AHex && ! $AHex->is_empty) {
14592        $PosixXDigit->set_equivalent_to($AHex->table('Y'), Related => 1);
14593    }
14594    else {
14595        $PosixXDigit->initialize($Xdigit & $ASCII);
14596        $PosixXDigit->add_alias('AHex');
14597        $PosixXDigit->add_alias('Ascii_Hex_Digit');
14598    }
14599
14600    my $any_folds = $perl->add_match_table("_Perl_Any_Folds",
14601                    Description => "Code points that particpate in some fold",
14602                    );
14603    my $loc_problem_folds = $perl->add_match_table(
14604               "_Perl_Problematic_Locale_Folds",
14605               Description =>
14606                   "Code points that are in some way problematic under locale",
14607    );
14608
14609    # This allows regexec.c to skip some work when appropriate.  Some of the
14610    # entries in _Perl_Problematic_Locale_Folds are multi-character folds,
14611    my $loc_problem_folds_start = $perl->add_match_table(
14612               "_Perl_Problematic_Locale_Foldeds_Start",
14613               Description =>
14614                   "The first character of every sequence in _Perl_Problematic_Locale_Folds",
14615    );
14616
14617    my $cf = property_ref('Case_Folding');
14618
14619    # Every character 0-255 is problematic because what each folds to depends
14620    # on the current locale
14621    $loc_problem_folds->add_range(0, 255);
14622    $loc_problem_folds->add_range(0x130, 0x131);    # These are problematic in
14623                                                    # Turkic locales
14624    $loc_problem_folds_start += $loc_problem_folds;
14625
14626    # Also problematic are anything these fold to outside the range.  Likely
14627    # forever the only thing folded to by these outside the 0-255 range is the
14628    # GREEK SMALL MU (from the MICRO SIGN), but it's easy to make the code
14629    # completely general, which should catch any unexpected changes or errors.
14630    # We look at each code point 0-255, and add its fold (including each part
14631    # of a multi-char fold) to the list.  See commit message
14632    # 31f05a37c4e9c37a7263491f2fc0237d836e1a80 for a more complete description
14633    # of the MU issue.
14634    foreach my $range ($loc_problem_folds->ranges) {
14635        foreach my $code_point ($range->start .. $range->end) {
14636            my $fold_range = $cf->containing_range($code_point);
14637            next unless defined $fold_range;
14638
14639            # Skip if folds to itself
14640            next if $fold_range->value eq $CODE_POINT;
14641
14642            my @hex_folds = split " ", $fold_range->value;
14643            my $start_cp = $hex_folds[0];
14644            next if $start_cp eq $CODE_POINT;
14645            $start_cp = hex $start_cp;
14646            foreach my $i (0 .. @hex_folds - 1) {
14647                my $cp = $hex_folds[$i];
14648                next if $cp eq $CODE_POINT;
14649                $cp = hex $cp;
14650                next unless $cp > 255;    # Already have the < 256 ones
14651
14652                $loc_problem_folds->add_range($cp, $cp);
14653                $loc_problem_folds_start->add_range($start_cp, $start_cp);
14654            }
14655        }
14656    }
14657
14658    my $folds_to_multi_char = $perl->add_match_table(
14659         "_Perl_Folds_To_Multi_Char",
14660         Description =>
14661              "Code points whose fold is a string of more than one character",
14662    );
14663    my $in_multi_fold = $perl->add_match_table(
14664               "_Perl_Is_In_Multi_Char_Fold",
14665               Description =>
14666                   "Code points that are in some multiple character fold",
14667    );
14668    if ($v_version lt v3.0.1) {
14669        push @tables_that_may_be_empty, '_Perl_Folds_To_Multi_Char',
14670                                        '_Perl_Is_In_Multi_Char_Fold',
14671                                        '_Perl_Non_Final_Folds';
14672    }
14673
14674    # Look through all the known folds to populate these tables.
14675    foreach my $range ($cf->ranges) {
14676        next if $range->value eq $CODE_POINT;
14677        my $start = $range->start;
14678        my $end = $range->end;
14679        $any_folds->add_range($start, $end);
14680
14681        my @hex_folds = split " ", $range->value;
14682        if (@hex_folds > 1) {   # Is multi-char fold
14683            $folds_to_multi_char->add_range($start, $end);
14684        }
14685
14686        my $found_locale_problematic = 0;
14687
14688        my $folded_count = @hex_folds;
14689        if ($folded_count > 3) {
14690            die Carp::my_carp("Maximum number of characters in a fold should be 3: Instead, it's  $folded_count for U+" . sprintf "%04X", $range->start);
14691        }
14692
14693        # Look at each of the folded-to characters...
14694        foreach my $i (1 .. $folded_count) {
14695            my $cp = hex $hex_folds[$i-1];
14696            $any_folds->add_range($cp, $cp);
14697
14698            # The fold is problematic if any of the folded-to characters is
14699            # already considered problematic.
14700            if ($loc_problem_folds->contains($cp)) {
14701                $loc_problem_folds->add_range($start, $end);
14702                $found_locale_problematic = 1;
14703            }
14704
14705            if ($folded_count > 1) {
14706                $in_multi_fold->add_range($cp, $cp);
14707            }
14708        }
14709
14710        # If this is a problematic fold, add to the start chars the
14711        # folding-from characters and first folded-to character.
14712        if ($found_locale_problematic) {
14713            $loc_problem_folds_start->add_range($start, $end);
14714            my $cp = hex $hex_folds[0];
14715            $loc_problem_folds_start->add_range($cp, $cp);
14716        }
14717    }
14718
14719    my $dt = property_ref('Decomposition_Type');
14720    $dt->add_match_table('Non_Canon', Full_Name => 'Non_Canonical',
14721        Initialize => ~ ($dt->table('None') + $dt->table('Canonical')),
14722        Perl_Extension => 1,
14723        Note => 'Union of all non-canonical decompositions',
14724        );
14725
14726    # For backward compatibility, Perl has its own definition for IDStart.
14727    # It is regular XID_Start plus the underscore, but all characters must be
14728    # Word characters as well
14729    my $XID_Start = property_ref('XID_Start');
14730    my $perl_xids = $perl->add_match_table('_Perl_IDStart',
14731                                            Perl_Extension => 1,
14732                                            Fate => $INTERNAL_ONLY,
14733                                            Initialize => ord('_')
14734                                            );
14735    if (defined $XID_Start
14736        || defined ($XID_Start = property_ref('ID_Start')))
14737    {
14738        $perl_xids += $XID_Start->table('Y');
14739    }
14740    else {
14741        # For Unicode versions that don't have the property, construct our own
14742        # from first principles.  The actual definition is:
14743        #     Letters
14744        #   + letter numbers (Nl)
14745        #   - Pattern_Syntax
14746        #   - Pattern_White_Space
14747        #   + stability extensions
14748        #   - NKFC modifications
14749        #
14750        # What we do in the code below is to include the identical code points
14751        # that are in the first release that had Unicode's version of this
14752        # property, essentially extrapolating backwards.  There were no
14753        # stability extensions until v4.1, so none are included; likewise in
14754        # no Unicode version so far do subtracting PatSyn and PatWS make any
14755        # difference, so those also are ignored.
14756        $perl_xids += $gc->table('Letter') + pre_3_dot_1_Nl();
14757
14758        # We do subtract the NFKC modifications that are in the first version
14759        # that had this property.  We don't bother to test if they are in the
14760        # version in question, because if they aren't, the operation is a
14761        # no-op.  The NKFC modifications are discussed in
14762        # http://www.unicode.org/reports/tr31/#NFKC_Modifications
14763        foreach my $range ( 0x037A,
14764                            0x0E33,
14765                            0x0EB3,
14766                            [ 0xFC5E, 0xFC63 ],
14767                            [ 0xFDFA, 0xFE70 ],
14768                            [ 0xFE72, 0xFE76 ],
14769                            0xFE78,
14770                            0xFE7A,
14771                            0xFE7C,
14772                            0xFE7E,
14773                            [ 0xFF9E, 0xFF9F ],
14774        ) {
14775            if (ref $range) {
14776                $perl_xids->delete_range($range->[0], $range->[1]);
14777            }
14778            else {
14779                $perl_xids->delete_range($range, $range);
14780            }
14781        }
14782    }
14783
14784    $perl_xids &= $Word;
14785
14786    my $perl_xidc = $perl->add_match_table('_Perl_IDCont',
14787                                        Perl_Extension => 1,
14788                                        Fate => $INTERNAL_ONLY);
14789    my $XIDC = property_ref('XID_Continue');
14790    if (defined $XIDC
14791        || defined ($XIDC = property_ref('ID_Continue')))
14792    {
14793        $perl_xidc += $XIDC->table('Y');
14794    }
14795    else {
14796        # Similarly, we construct our own XIDC if necessary for early Unicode
14797        # versions.  The definition is:
14798        #     everything in XIDS
14799        #   + Gc=Mn
14800        #   + Gc=Mc
14801        #   + Gc=Nd
14802        #   + Gc=Pc
14803        #   - Pattern_Syntax
14804        #   - Pattern_White_Space
14805        #   + stability extensions
14806        #   - NFKC modifications
14807        #
14808        # The same thing applies to this as with XIDS for the PatSyn, PatWS,
14809        # and stability extensions.  There is a somewhat different set of NFKC
14810        # mods to remove (and add in this case).  The ones below make this
14811        # have identical code points as in the first release that defined it.
14812        $perl_xidc += $perl_xids
14813                    + $gc->table('L')
14814                    + $gc->table('Mn')
14815                    + $gc->table('Mc')
14816                    + $gc->table('Nd')
14817                    + utf8::unicode_to_native(0xB7)
14818                    ;
14819        if (defined (my $pc = $gc->table('Pc'))) {
14820            $perl_xidc += $pc;
14821        }
14822        else {  # 1.1.5 didn't have Pc, but these should have been in it
14823            $perl_xidc += 0xFF3F;
14824            $perl_xidc->add_range(0x203F, 0x2040);
14825            $perl_xidc->add_range(0xFE33, 0xFE34);
14826            $perl_xidc->add_range(0xFE4D, 0xFE4F);
14827        }
14828
14829        # Subtract the NFKC mods
14830        foreach my $range ( 0x037A,
14831                            [ 0xFC5E, 0xFC63 ],
14832                            [ 0xFDFA, 0xFE1F ],
14833                            0xFE70,
14834                            [ 0xFE72, 0xFE76 ],
14835                            0xFE78,
14836                            0xFE7A,
14837                            0xFE7C,
14838                            0xFE7E,
14839        ) {
14840            if (ref $range) {
14841                $perl_xidc->delete_range($range->[0], $range->[1]);
14842            }
14843            else {
14844                $perl_xidc->delete_range($range, $range);
14845            }
14846        }
14847    }
14848
14849    $perl_xidc &= $Word;
14850
14851    my $charname_begin = $perl->add_match_table('_Perl_Charname_Begin',
14852                    Perl_Extension => 1,
14853                    Fate => $INTERNAL_ONLY,
14854                    Initialize => $gc->table('Letter') & $Alpha & $perl_xids,
14855                    );
14856
14857    my $charname_continue = $perl->add_match_table('_Perl_Charname_Continue',
14858                        Perl_Extension => 1,
14859                        Fate => $INTERNAL_ONLY,
14860                        Initialize => $perl_xidc
14861                                    + ord(" ")
14862                                    + ord("(")
14863                                    + ord(")")
14864                                    + ord("-")
14865                        );
14866
14867    my @composition = ('Name', 'Unicode_1_Name', '_Perl_Name_Alias');
14868
14869    if (@named_sequences) {
14870        push @composition, 'Named_Sequence';
14871        foreach my $sequence (@named_sequences) {
14872            $perl_charname->add_anomalous_entry($sequence);
14873        }
14874    }
14875
14876    my $alias_sentence = "";
14877    my %abbreviations;
14878    my $alias = property_ref('_Perl_Name_Alias');
14879    $perl_charname->set_proxy_for('_Perl_Name_Alias');
14880
14881    # Add each entry in _Perl_Name_Alias to Perl_Charnames.  Where these go
14882    # with respect to any existing entry depends on the entry type.
14883    # Corrections go before said entry, as they should be returned in
14884    # preference over the existing entry.  (A correction to a correction
14885    # should be later in the _Perl_Name_Alias table, so it will correctly
14886    # precede the erroneous correction in Perl_Charnames.)
14887    #
14888    # Abbreviations go after everything else, so they are saved temporarily in
14889    # a hash for later.
14890    #
14891    # Everything else is added afterwards, which preserves the input
14892    # ordering
14893
14894    foreach my $range ($alias->ranges) {
14895        next if $range->value eq "";
14896        my $code_point = $range->start;
14897        if ($code_point != $range->end) {
14898            Carp::my_carp_bug("Bad News.  Expecting only one code point in the range $range.  Just to keep going, using only the first code point;");
14899        }
14900        my ($value, $type) = split ': ', $range->value;
14901        my $replace_type;
14902        if ($type eq 'correction') {
14903            $replace_type = $MULTIPLE_BEFORE;
14904        }
14905        elsif ($type eq 'abbreviation') {
14906
14907            # Save for later
14908            $abbreviations{$value} = $code_point;
14909            next;
14910        }
14911        else {
14912            $replace_type = $MULTIPLE_AFTER;
14913        }
14914
14915        # Actually add; before or after current entry(ies) as determined
14916        # above.
14917
14918        $perl_charname->add_duplicate($code_point, $value, Replace => $replace_type);
14919    }
14920    $alias_sentence = <<END;
14921The _Perl_Name_Alias property adds duplicate code point entries that are
14922alternatives to the original name.  If an addition is a corrected
14923name, it will be physically first in the table.  The original (less correct,
14924but still valid) name will be next; then any alternatives, in no particular
14925order; and finally any abbreviations, again in no particular order.
14926END
14927
14928    # Now add the Unicode_1 names for the controls.  The Unicode_1 names had
14929    # precedence before 6.1, including the awful ones like "LINE FEED (LF)",
14930    # so should be first in the file; the other names have precedence starting
14931    # in 6.1,
14932    my $before_or_after = ($v_version lt v6.1.0)
14933                          ? $MULTIPLE_BEFORE
14934                          : $MULTIPLE_AFTER;
14935
14936    foreach my $range (property_ref('Unicode_1_Name')->ranges) {
14937        my $code_point = $range->start;
14938        my $unicode_1_value = $range->value;
14939        next if $unicode_1_value eq "";     # Skip if name doesn't exist.
14940
14941        if ($code_point != $range->end) {
14942            Carp::my_carp_bug("Bad News.  Expecting only one code point in the range $range.  Just to keep going, using only the first code point;");
14943        }
14944
14945        # To handle EBCDIC, we don't hard code in the code points of the
14946        # controls; instead realizing that all of them are below 256.
14947        last if $code_point > 255;
14948
14949        # We only add in the controls.
14950        next if $gc->value_of($code_point) ne 'Cc';
14951
14952        # We reject this Unicode1 name for later Perls, as it is used for
14953        # another code point
14954        next if $unicode_1_value eq 'BELL' && $^V ge v5.17.0;
14955
14956        # This won't add an exact duplicate.
14957        $perl_charname->add_duplicate($code_point, $unicode_1_value,
14958                                        Replace => $before_or_after);
14959    }
14960
14961    # Now that have everything added, add in abbreviations after
14962    # everything else.  Sort so results don't change between runs of this
14963    # program
14964    foreach my $value (sort keys %abbreviations) {
14965        $perl_charname->add_duplicate($abbreviations{$value}, $value,
14966                                        Replace => $MULTIPLE_AFTER);
14967    }
14968
14969    my $comment;
14970    if (@composition <= 2) { # Always at least 2
14971        $comment = join " and ", @composition;
14972    }
14973    else {
14974        $comment = join ", ", @composition[0 .. scalar @composition - 2];
14975        $comment .= ", and $composition[-1]";
14976    }
14977
14978    $perl_charname->add_comment(join_lines( <<END
14979This file is for charnames.pm.  It is the union of the $comment properties.
14980Unicode_1_Name entries are used only for nameless code points in the Name
14981property.
14982$alias_sentence
14983This file doesn't include the algorithmically determinable names.  For those,
14984use 'unicore/Name.pm'
14985END
14986    ));
14987    property_ref('Name')->add_comment(join_lines( <<END
14988This file doesn't include the algorithmically determinable names.  For those,
14989use 'unicore/Name.pm'
14990END
14991    ));
14992
14993    # Construct the Present_In property from the Age property.
14994    if (-e 'DAge.txt' && defined $age) {
14995        my $default_map = $age->default_map;
14996        my $in = Property->new('In',
14997                                Default_Map => $default_map,
14998                                Full_Name => "Present_In",
14999                                Perl_Extension => 1,
15000                                Type => $ENUM,
15001                                Initialize => $age,
15002                                );
15003        $in->add_comment(join_lines(<<END
15004THIS FILE SHOULD NOT BE USED FOR ANY PURPOSE.  The values in this file are the
15005same as for $age, and not for what $in really means.  This is because anything
15006defined in a given release should have multiple values: that release and all
15007higher ones.  But only one value per code point can be represented in a table
15008like this.
15009END
15010        ));
15011
15012        # The Age tables are named like 1.5, 2.0, 2.1, ....  Sort so that the
15013        # lowest numbered (earliest) come first, with the non-numeric one
15014        # last.
15015        my ($first_age, @rest_ages) = sort { ($a->name !~ /^[\d.]*$/)
15016                                            ? 1
15017                                            : ($b->name !~ /^[\d.]*$/)
15018                                                ? -1
15019                                                : $a->name <=> $b->name
15020                                            } $age->tables;
15021
15022        # The Present_In property is the cumulative age properties.  The first
15023        # one hence is identical to the first age one.
15024        my $previous_in = $in->add_match_table($first_age->name);
15025        $previous_in->set_equivalent_to($first_age, Related => 1);
15026
15027        my $description_start = "Code point's usage introduced in version ";
15028        $first_age->add_description($description_start . $first_age->name);
15029
15030        # To construct the accumulated values, for each of the age tables
15031        # starting with the 2nd earliest, merge the earliest with it, to get
15032        # all those code points existing in the 2nd earliest.  Repeat merging
15033        # the new 2nd earliest with the 3rd earliest to get all those existing
15034        # in the 3rd earliest, and so on.
15035        foreach my $current_age (@rest_ages) {
15036            next if $current_age->name !~ /^[\d.]*$/;   # Skip the non-numeric
15037
15038            my $current_in = $in->add_match_table(
15039                                    $current_age->name,
15040                                    Initialize => $current_age + $previous_in,
15041                                    Description => $description_start
15042                                                    . $current_age->name
15043                                                    . ' or earlier',
15044                                    );
15045            foreach my $alias ($current_age->aliases) {
15046                $current_in->add_alias($alias->name);
15047            }
15048            $previous_in = $current_in;
15049
15050            # Add clarifying material for the corresponding age file.  This is
15051            # in part because of the confusing and contradictory information
15052            # given in the Standard's documentation itself, as of 5.2.
15053            $current_age->add_description(
15054                            "Code point's usage was introduced in version "
15055                            . $current_age->name);
15056            $current_age->add_note("See also $in");
15057
15058        }
15059
15060        # And finally the code points whose usages have yet to be decided are
15061        # the same in both properties.  Note that permanently unassigned code
15062        # points actually have their usage assigned (as being permanently
15063        # unassigned), so that these tables are not the same as gc=cn.
15064        my $unassigned = $in->add_match_table($default_map);
15065        my $age_default = $age->table($default_map);
15066        $age_default->add_description(<<END
15067Code point's usage has not been assigned in any Unicode release thus far.
15068END
15069        );
15070        $unassigned->set_equivalent_to($age_default, Related => 1);
15071    }
15072
15073    my $patws = $perl->add_match_table('_Perl_PatWS',
15074                                       Perl_Extension => 1,
15075                                       Fate => $INTERNAL_ONLY);
15076    if (defined (my $off_patws = property_ref('Pattern_White_Space'))) {
15077        $patws->initialize($off_patws->table('Y'));
15078    }
15079    else {
15080        $patws->initialize([ ord("\t"),
15081                             ord("\n"),
15082                             utf8::unicode_to_native(0x0B), # VT
15083                             ord("\f"),
15084                             ord("\r"),
15085                             ord(" "),
15086                             utf8::unicode_to_native(0x85), # NEL
15087                             0x200E..0x200F,             # Left, Right marks
15088                             0x2028..0x2029              # Line, Paragraph seps
15089                           ] );
15090    }
15091
15092    # See L<perlfunc/quotemeta>
15093    my $quotemeta = $perl->add_match_table('_Perl_Quotemeta',
15094                                           Perl_Extension => 1,
15095                                           Fate => $INTERNAL_ONLY,
15096
15097                                           # Initialize to what's common in
15098                                           # all Unicode releases.
15099                                           Initialize =>
15100                                                  $gc->table('Control')
15101                                                + $Space
15102                                                + $patws
15103                                                + ((~ $Word) & $ASCII)
15104                           );
15105
15106    if (defined (my $patsyn = property_ref('Pattern_Syntax'))) {
15107        $quotemeta += $patsyn->table('Y');
15108    }
15109    else {
15110        $quotemeta += ((~ $Word) & Range->new(0, 255))
15111                    - utf8::unicode_to_native(0xA8)
15112                    - utf8::unicode_to_native(0xAF)
15113                    - utf8::unicode_to_native(0xB2)
15114                    - utf8::unicode_to_native(0xB3)
15115                    - utf8::unicode_to_native(0xB4)
15116                    - utf8::unicode_to_native(0xB7)
15117                    - utf8::unicode_to_native(0xB8)
15118                    - utf8::unicode_to_native(0xB9)
15119                    - utf8::unicode_to_native(0xBC)
15120                    - utf8::unicode_to_native(0xBD)
15121                    - utf8::unicode_to_native(0xBE);
15122        $quotemeta += [ # These are above-Latin1 patsyn; hence should be the
15123                        # same in all releases
15124                        0x2010 .. 0x2027,
15125                        0x2030 .. 0x203E,
15126                        0x2041 .. 0x2053,
15127                        0x2055 .. 0x205E,
15128                        0x2190 .. 0x245F,
15129                        0x2500 .. 0x2775,
15130                        0x2794 .. 0x2BFF,
15131                        0x2E00 .. 0x2E7F,
15132                        0x3001 .. 0x3003,
15133                        0x3008 .. 0x3020,
15134                        0x3030 .. 0x3030,
15135                        0xFD3E .. 0xFD3F,
15136                        0xFE45 .. 0xFE46
15137                      ];
15138    }
15139
15140    if (defined (my $di = property_ref('Default_Ignorable_Code_Point'))) {
15141        $quotemeta += $di->table('Y')
15142    }
15143    else {
15144        if ($v_version ge v2.0) {
15145            $quotemeta += $gc->table('Cf')
15146                       +  $gc->table('Cs');
15147
15148            # These are above the Unicode version 1 max
15149            $quotemeta->add_range(0xE0000, 0xE0FFF);
15150        }
15151        $quotemeta += $gc->table('Cc')
15152                    - $Space;
15153        my $temp = Range_List->new(Initialize => [ 0x180B .. 0x180D,
15154                                                   0x2060 .. 0x206F,
15155                                                   0xFE00 .. 0xFE0F,
15156                                                   0xFFF0 .. 0xFFFB,
15157                                                  ]);
15158        $temp->add_range(0xE0000, 0xE0FFF) if $v_version ge v2.0;
15159        $quotemeta += $temp;
15160    }
15161    calculate_DI();
15162    $quotemeta += $DI;
15163
15164    calculate_NChar();
15165
15166    # Finished creating all the perl properties.  All non-internal non-string
15167    # ones have a synonym of 'Is_' prefixed.  (Internal properties begin with
15168    # an underscore.)  These do not get a separate entry in the pod file
15169    foreach my $table ($perl->tables) {
15170        foreach my $alias ($table->aliases) {
15171            next if $alias->name =~ /^_/;
15172            $table->add_alias('Is_' . $alias->name,
15173                               Re_Pod_Entry => 0,
15174                               UCD => 0,
15175                               Status => $alias->status,
15176                               OK_as_Filename => 0);
15177        }
15178    }
15179
15180    # Perl tailors the WordBreak property so that \b{wb} doesn't split
15181    # adjacent spaces into separate words.  Unicode 11.0 moved in that
15182    # direction, but left TAB,  FIGURE SPACE (U+2007), and (ironically) NO
15183    # BREAK SPACE as breaking, so we retained the original Perl customization.
15184    # To do this, in the Perl copy of WB, simply replace the mappings of
15185    # horizontal space characters that otherwise would map to the default or
15186    # the 11.0 'WSegSpace' to instead map to our tailoring.
15187    my $perl_wb = property_ref('_Perl_WB');
15188    my $default = $perl_wb->default_map;
15189    for my $range ($Blank->ranges) {
15190        for my $i ($range->start .. $range->end) {
15191            my $value = $perl_wb->value_of($i);
15192
15193            next unless $value eq $default || $value eq 'WSegSpace';
15194            $perl_wb->add_map($i, $i, 'Perl_Tailored_HSpace',
15195                              Replace => $UNCONDITIONALLY);
15196        }
15197    }
15198
15199    # Also starting in Unicode 11.0, rules for some of the boundary types are
15200    # based on a non-UCD property (which we have read in if it exists).
15201    # Recall that these boundary properties partition the code points into
15202    # equivalence classes (represented as enums).
15203    #
15204    # The loop below goes through each code point that matches the non-UCD
15205    # property, and for each current equivalence class containing such a code
15206    # point, splits it so that those that are in both are now in a newly
15207    # created equivalence class whose name is a combination of the property
15208    # and the old class name, leaving unchanged everything that doesn't match
15209    # the non-UCD property.
15210    my $pictographic_emoji = property_ref('ExtPict');
15211    if (defined $pictographic_emoji) {
15212        foreach my $base_property (property_ref('GCB'),
15213                                   property_ref('WB'))
15214        {
15215            my $property = property_ref('_Perl_' . $base_property->name);
15216            foreach my $range ($pictographic_emoji->table('Y')->ranges) {
15217                foreach my $i ($range->start .. $range->end) {
15218                    my $current = $property->value_of($i);
15219                    $current = $property->table($current)->short_name;
15220                    $property->add_map($i, $i, 'ExtPict_' . $current,
15221                                       Replace => $UNCONDITIONALLY);
15222                }
15223            }
15224        }
15225    }
15226
15227    # Create a version of the LineBreak property with the mappings that are
15228    # omitted in the default algorithm remapped to what
15229    # http://www.unicode.org/reports/tr14 says they should be.
15230    #
15231    # Original 	   Resolved  General_Category
15232    # AI, SG, XX      AL      Any
15233    # SA              CM      Only Mn or Mc
15234    # SA              AL      Any except Mn and Mc
15235    # CJ              NS      Any
15236    #
15237    # All property values are also written out in their long form, as
15238    # regen/mk_invlist.pl expects that.  This also fixes occurrences of the
15239    # typo in early Unicode versions: 'inseperable'.
15240    my $perl_lb = property_ref('_Perl_LB');
15241    if (! defined $perl_lb) {
15242        $perl_lb = Property->new('_Perl_LB',
15243                                 Fate => $INTERNAL_ONLY,
15244                                 Perl_Extension => 1,
15245                                 Directory => $map_directory,
15246                                 Type => $STRING);
15247        my $lb = property_ref('Line_Break');
15248
15249        # Populate from $lb, but use full name and fix typo.
15250        foreach my $range ($lb->ranges) {
15251            my $full_name = $lb->table($range->value)->full_name;
15252            $full_name = 'Inseparable'
15253                                if standardize($full_name) eq 'inseperable';
15254            $perl_lb->add_map($range->start, $range->end, $full_name);
15255        }
15256    }
15257
15258    $perl_lb->set_default_map('Alphabetic', 'full_name');    # XX -> AL
15259    my $ea = property_ref('East_Asian_Width');
15260
15261    for my $range ($perl_lb->ranges) {
15262        my $value = standardize($range->value);
15263        if (   $value eq standardize('Unknown')
15264            || $value eq standardize('Ambiguous')
15265            || $value eq standardize('Surrogate'))
15266        {
15267            $perl_lb->add_map($range->start, $range->end, 'Alphabetic',
15268                              Replace => $UNCONDITIONALLY);
15269        }
15270        elsif ($value eq standardize('Conditional_Japanese_Starter')) {
15271            $perl_lb->add_map($range->start, $range->end, 'Nonstarter',
15272                              Replace => $UNCONDITIONALLY);
15273        }
15274        elsif ($value eq standardize('Complex_Context')) {
15275            for my $i ($range->start .. $range->end) {
15276                my $gc_val = $gc->value_of($i);
15277                if ($gc_val eq 'Mn' || $gc_val eq 'Mc') {
15278                    $perl_lb->add_map($i, $i, 'Combining_Mark',
15279                                      Replace => $UNCONDITIONALLY);
15280                }
15281                else {
15282                    $perl_lb->add_map($i, $i, 'Alphabetic',
15283                                      Replace => $UNCONDITIONALLY);
15284                }
15285            }
15286        }
15287        elsif (    defined $ea
15288               && (   $value eq standardize('Close_Parenthesis')
15289                   || $value eq standardize('Open_Punctuation')))
15290        {
15291            # Unicode 13 splits the OP and CP properties each into East Asian,
15292            # and non-.  We retain the (now somewhat misleading) names OP and
15293            # CP for the non-East Asian variety, as there are very few East
15294            # Asian ones.
15295            my $replace = ($value eq standardize('Open_Punctuation'))
15296                          ? 'East_Asian_OP'
15297                          : 'East_Asian_CP';
15298            for my $i ($range->start .. $range->end) {
15299                my $ea_val = $ea->value_of($i);
15300                if ($ea_val eq 'F' || $ea_val eq 'W' || $ea_val eq 'H') {
15301                    $perl_lb->add_map($i, $i, $replace,
15302                                                Replace => $UNCONDITIONALLY);
15303                }
15304            }
15305        }
15306    }
15307
15308    # This property is a modification of the scx property
15309    my $perl_scx = Property->new('_Perl_SCX',
15310                                 Fate => $INTERNAL_ONLY,
15311                                 Perl_Extension => 1,
15312                                 Directory => $map_directory,
15313                                 Type => $ENUM);
15314    my $source;
15315
15316    # Use scx if available; otherwise sc;  if neither is there (a very old
15317    # Unicode version, just say that everything is 'Common'
15318    if (defined $scx) {
15319        $source = $scx;
15320        $perl_scx->set_default_map('Unknown');
15321    }
15322    elsif (defined $script) {
15323        $source = $script;
15324
15325        # Early versions of 'sc', had everything be 'Common'
15326        if (defined $script->table('Unknown')) {
15327            $perl_scx->set_default_map('Unknown');
15328        }
15329        else {
15330            $perl_scx->set_default_map('Common');
15331        }
15332    } else {
15333        $perl_scx->add_match_table('Common');
15334        $perl_scx->add_map(0, $MAX_UNICODE_CODEPOINT, 'Common');
15335
15336        $perl_scx->add_match_table('Unknown');
15337        $perl_scx->set_default_map('Unknown');
15338    }
15339
15340    $perl_scx->_set_format($STRING_WHITE_SPACE_LIST);
15341    $perl_scx->set_pre_declared_maps(0); # PropValueAliases doesn't list these
15342
15343    if (defined $source) {
15344        $perl_scx->initialize($source);
15345
15346        # UTS 39 says that the scx property should be modified for these
15347        # countries where certain mixed scripts are commonly used.
15348        for my $range ($perl_scx->ranges) {
15349            my $value = $range->value;
15350            my $changed = $value =~ s/ ( \b Han i? \b ) /$1 Hanb Jpan Kore/xi;
15351             $changed |=  $value =~ s/ ( \b Hira (gana)? \b ) /$1 Jpan/xi;
15352             $changed |=  $value =~ s/ ( \b Kata (kana)? \b ) /$1 Jpan/xi;
15353             $changed |=  $value =~ s{ ( \b Katakana_or_Hiragana \b ) }
15354                                     {$1 Katakana Hiragana Jpan}xi;
15355             $changed |=  $value =~ s/ ( \b Hang (ul)? \b ) /$1 Kore/xi;
15356             $changed |=  $value =~ s/ ( \b Bopo (mofo)? \b ) /$1 Hanb/xi;
15357
15358            if ($changed) {
15359                $value = join " ", uniques split " ", $value;
15360                $range->set_value($value)
15361            }
15362        }
15363
15364        foreach my $table ($source->tables) {
15365            my $scx_table = $perl_scx->add_match_table($table->name,
15366                                    Full_Name => $table->full_name);
15367            foreach my $alias ($table->aliases) {
15368                $scx_table->add_alias($alias->name);
15369            }
15370        }
15371    }
15372
15373    # Here done with all the basic stuff.  Ready to populate the information
15374    # about each character if annotating them.
15375    if ($annotate) {
15376
15377        # See comments at its declaration
15378        $annotate_ranges = Range_Map->new;
15379
15380        # This separates out the non-characters from the other unassigneds, so
15381        # can give different annotations for each.
15382        $unassigned_sans_noncharacters = Range_List->new(
15383                                    Initialize => $gc->table('Unassigned'));
15384        $unassigned_sans_noncharacters &= (~ $NChar);
15385
15386        for (my $i = 0; $i <= $MAX_UNICODE_CODEPOINT + 1; $i++ ) {
15387            $i = populate_char_info($i);    # Note sets $i so may cause skips
15388
15389        }
15390    }
15391
15392    return;
15393}
15394
15395sub add_perl_synonyms() {
15396    # A number of Unicode tables have Perl synonyms that are expressed in
15397    # the single-form, \p{name}.  These are:
15398    #   All the binary property Y tables, so that \p{Name=Y} gets \p{Name} and
15399    #       \p{Is_Name} as synonyms
15400    #   \p{Script_Extensions=Value} gets \p{Value}, \p{Is_Value} as synonyms
15401    #   \p{General_Category=Value} gets \p{Value}, \p{Is_Value} as synonyms
15402    #   \p{Block=Value} gets \p{In_Value} as a synonym, and, if there is no
15403    #       conflict, \p{Value} and \p{Is_Value} as well
15404    #
15405    # This routine generates these synonyms, warning of any unexpected
15406    # conflicts.
15407
15408    # Construct the list of tables to get synonyms for.  Start with all the
15409    # binary and the General_Category ones.
15410    my @tables = grep { $_->type == $BINARY || $_->type == $FORCED_BINARY }
15411                                                            property_ref('*');
15412    push @tables, $gc->tables;
15413
15414    # If the version of Unicode includes the Script Extensions (preferably),
15415    # or Script property, add its tables
15416    if (defined $scx) {
15417        push @tables, $scx->tables;
15418    }
15419    else {
15420        push @tables, $script->tables if defined $script;
15421    }
15422
15423    # The Block tables are kept separate because they are treated differently.
15424    # And the earliest versions of Unicode didn't include them, so add only if
15425    # there are some.
15426    my @blocks;
15427    push @blocks, $block->tables if defined $block;
15428
15429    # Here, have the lists of tables constructed.  Process blocks last so that
15430    # if there are name collisions with them, blocks have lowest priority.
15431    # Should there ever be other collisions, manual intervention would be
15432    # required.  See the comments at the beginning of the program for a
15433    # possible way to handle those semi-automatically.
15434    foreach my $table (@tables,  @blocks) {
15435
15436        # For non-binary properties, the synonym is just the name of the
15437        # table, like Greek, but for binary properties the synonym is the name
15438        # of the property, and means the code points in its 'Y' table.
15439        my $nominal = $table;
15440        my $nominal_property = $nominal->property;
15441        my $actual;
15442        if (! $nominal->isa('Property')) {
15443            $actual = $table;
15444        }
15445        else {
15446
15447            # Here is a binary property.  Use the 'Y' table.  Verify that is
15448            # there
15449            my $yes = $nominal->table('Y');
15450            unless (defined $yes) {  # Must be defined, but is permissible to
15451                                     # be empty.
15452                Carp::my_carp_bug("Undefined $nominal, 'Y'.  Skipping.");
15453                next;
15454            }
15455            $actual = $yes;
15456        }
15457
15458        foreach my $alias ($nominal->aliases) {
15459
15460            # Attempt to create a table in the perl directory for the
15461            # candidate table, using whatever aliases in it that don't
15462            # conflict.  Also add non-conflicting aliases for all these
15463            # prefixed by 'Is_' (and/or 'In_' for Block property tables)
15464            PREFIX:
15465            foreach my $prefix ("", 'Is_', 'In_') {
15466
15467                # Only Block properties can have added 'In_' aliases.
15468                next if $prefix eq 'In_' and $nominal_property != $block;
15469
15470                my $proposed_name = $prefix . $alias->name;
15471
15472                # No Is_Is, In_In, nor combinations thereof
15473                trace "$proposed_name is a no-no" if main::DEBUG && $to_trace && $proposed_name =~ /^ I [ns] _I [ns] _/x;
15474                next if $proposed_name =~ /^ I [ns] _I [ns] _/x;
15475
15476                trace "Seeing if can add alias or table: 'perl=$proposed_name' based on $nominal" if main::DEBUG && $to_trace;
15477
15478                # Get a reference to any existing table in the perl
15479                # directory with the desired name.
15480                my $pre_existing = $perl->table($proposed_name);
15481
15482                if (! defined $pre_existing) {
15483
15484                    # No name collision, so OK to add the perl synonym.
15485
15486                    my $make_re_pod_entry;
15487                    my $ok_as_filename;
15488                    my $status = $alias->status;
15489                    if ($nominal_property == $block) {
15490
15491                        # For block properties, only the compound form is
15492                        # preferred for external use; the others are
15493                        # discouraged.  The pod file contains wild cards for
15494                        # the 'In' and 'Is' forms so no entries for those; and
15495                        # we don't want people using the name without any
15496                        # prefix, so discourage that.
15497                        if ($prefix eq "") {
15498                            $make_re_pod_entry = 1;
15499                            $status = $status || $DISCOURAGED;
15500                            $ok_as_filename = 0;
15501                        }
15502                        elsif ($prefix eq 'In_') {
15503                            $make_re_pod_entry = 0;
15504                            $status = $status || $DISCOURAGED;
15505                            $ok_as_filename = 1;
15506                        }
15507                        else {
15508                            $make_re_pod_entry = 0;
15509                            $status = $status || $DISCOURAGED;
15510                            $ok_as_filename = 0;
15511                        }
15512                    }
15513                    elsif ($prefix ne "") {
15514
15515                        # The 'Is' prefix is handled in the pod by a wild
15516                        # card, and we won't use it for an external name
15517                        $make_re_pod_entry = 0;
15518                        $status = $status || $NORMAL;
15519                        $ok_as_filename = 0;
15520                    }
15521                    else {
15522
15523                        # Here, is an empty prefix, non block.  This gets its
15524                        # own pod entry and can be used for an external name.
15525                        $make_re_pod_entry = 1;
15526                        $status = $status || $NORMAL;
15527                        $ok_as_filename = 1;
15528                    }
15529
15530                    # Here, there isn't a perl pre-existing table with the
15531                    # name.  Look through the list of equivalents of this
15532                    # table to see if one is a perl table.
15533                    foreach my $equivalent ($actual->leader->equivalents) {
15534                        next if $equivalent->property != $perl;
15535
15536                        # Here, have found a table for $perl.  Add this alias
15537                        # to it, and are done with this prefix.
15538                        $equivalent->add_alias($proposed_name,
15539                                        Re_Pod_Entry => $make_re_pod_entry,
15540
15541                                        # Currently don't output these in the
15542                                        # ucd pod, as are strongly discouraged
15543                                        # from being used
15544                                        UCD => 0,
15545
15546                                        Status => $status,
15547                                        OK_as_Filename => $ok_as_filename);
15548                        trace "adding alias perl=$proposed_name to $equivalent" if main::DEBUG && $to_trace;
15549                        next PREFIX;
15550                    }
15551
15552                    # Here, $perl doesn't already have a table that is a
15553                    # synonym for this property, add one.
15554                    my $added_table = $perl->add_match_table($proposed_name,
15555                                            Re_Pod_Entry => $make_re_pod_entry,
15556
15557                                            # See UCD comment just above
15558                                            UCD => 0,
15559
15560                                            Status => $status,
15561                                            OK_as_Filename => $ok_as_filename);
15562                    # And it will be related to the actual table, since it is
15563                    # based on it.
15564                    $added_table->set_equivalent_to($actual, Related => 1);
15565                    trace "added ", $perl->table($proposed_name) if main::DEBUG && $to_trace;
15566                    next;
15567                } # End of no pre-existing.
15568
15569                # Here, there is a pre-existing table that has the proposed
15570                # name.  We could be in trouble, but not if this is just a
15571                # synonym for another table that we have already made a child
15572                # of the pre-existing one.
15573                if ($pre_existing->is_set_equivalent_to($actual)) {
15574                    trace "$pre_existing is already equivalent to $actual; adding alias perl=$proposed_name to it" if main::DEBUG && $to_trace;
15575                    $pre_existing->add_alias($proposed_name);
15576                    next;
15577                }
15578
15579                # Here, there is a name collision, but it still could be OK if
15580                # the tables match the identical set of code points, in which
15581                # case, we can combine the names.  Compare each table's code
15582                # point list to see if they are identical.
15583                trace "Potential name conflict with $pre_existing having ", $pre_existing->count, " code points" if main::DEBUG && $to_trace;
15584                if ($pre_existing->matches_identically_to($actual)) {
15585
15586                    # Here, they do match identically.  Not a real conflict.
15587                    # Make the perl version a child of the Unicode one, except
15588                    # in the non-obvious case of where the perl name is
15589                    # already a synonym of another Unicode property.  (This is
15590                    # excluded by the test for it being its own parent.)  The
15591                    # reason for this exclusion is that then the two Unicode
15592                    # properties become related; and we don't really know if
15593                    # they are or not.  We generate documentation based on
15594                    # relatedness, and this would be misleading.  Code
15595                    # later executed in the process will cause the tables to
15596                    # be represented by a single file anyway, without making
15597                    # it look in the pod like they are necessarily related.
15598                    if ($pre_existing->parent == $pre_existing
15599                        && ($pre_existing->property == $perl
15600                            || $actual->property == $perl))
15601                    {
15602                        trace "Setting $pre_existing equivalent to $actual since one is \$perl, and match identical sets" if main::DEBUG && $to_trace;
15603                        $pre_existing->set_equivalent_to($actual, Related => 1);
15604                    }
15605                    elsif (main::DEBUG && $to_trace) {
15606                        trace "$pre_existing is equivalent to $actual since match identical sets, but not setting them equivalent, to preserve the separateness of the perl aliases";
15607                        trace $pre_existing->parent;
15608                    }
15609                    next PREFIX;
15610                }
15611
15612                # Here they didn't match identically, there is a real conflict
15613                # between our new name and a pre-existing property.
15614                $actual->add_conflicting($proposed_name, 'p', $pre_existing);
15615                $pre_existing->add_conflicting($nominal->full_name,
15616                                               'p',
15617                                               $actual);
15618
15619                # Don't output a warning for aliases for the block
15620                # properties (unless they start with 'In_') as it is
15621                # expected that there will be conflicts and the block
15622                # form loses.
15623                if ($verbosity >= $NORMAL_VERBOSITY
15624                    && ($actual->property != $block || $prefix eq 'In_'))
15625                {
15626                    print simple_fold(join_lines(<<END
15627There is already an alias named $proposed_name (from $pre_existing),
15628so not creating this alias for $actual
15629END
15630                    ), "", 4);
15631                }
15632
15633                # Keep track for documentation purposes.
15634                $has_In_conflicts++ if $prefix eq 'In_';
15635                $has_Is_conflicts++ if $prefix eq 'Is_';
15636            }
15637        }
15638    }
15639
15640    # There are some properties which have No and Yes (and N and Y) as
15641    # property values, but aren't binary, and could possibly be confused with
15642    # binary ones.  So create caveats for them.  There are tables that are
15643    # named 'No', and tables that are named 'N', but confusion is not likely
15644    # unless they are the same table.  For example, N meaning Number or
15645    # Neutral is not likely to cause confusion, so don't add caveats to things
15646    # like them.
15647    foreach my $property (grep { $_->type != $BINARY
15648                                 && $_->type != $FORCED_BINARY }
15649                                                            property_ref('*'))
15650    {
15651        my $yes = $property->table('Yes');
15652        if (defined $yes) {
15653            my $y = $property->table('Y');
15654            if (defined $y && $yes == $y) {
15655                foreach my $alias ($property->aliases) {
15656                    $yes->add_conflicting($alias->name);
15657                }
15658            }
15659        }
15660        my $no = $property->table('No');
15661        if (defined $no) {
15662            my $n = $property->table('N');
15663            if (defined $n && $no == $n) {
15664                foreach my $alias ($property->aliases) {
15665                    $no->add_conflicting($alias->name, 'P');
15666                }
15667            }
15668        }
15669    }
15670
15671    return;
15672}
15673
15674sub register_file_for_name($table, $directory_ref, $file) {
15675    # Given info about a table and a datafile that it should be associated
15676    # with, register that association
15677
15678    # $directory_ref    # Array of the directory path for the file
15679    # $file             # The file name in the final directory.
15680
15681    trace "table=$table, file=$file, directory=@$directory_ref, fate=", $table->fate if main::DEBUG && $to_trace;
15682
15683    if ($table->isa('Property')) {
15684        $table->set_file_path(@$directory_ref, $file);
15685        push @map_properties, $table;
15686
15687        # No swash means don't do the rest of this.
15688        return if $table->fate != $ORDINARY
15689                  && ! ($table->name =~ /^_/ && $table->fate == $INTERNAL_ONLY);
15690
15691        # Get the path to the file
15692        my @path = $table->file_path;
15693
15694        # Use just the file name if no subdirectory.
15695        shift @path if $path[0] eq File::Spec->curdir();
15696
15697        my $file = join '/', @path;
15698
15699        # Create a hash entry for Unicode::UCD to get the file that stores this
15700        # property's map table
15701        foreach my $alias ($table->aliases) {
15702            my $name = $alias->name;
15703            if ($name =~ /^_/) {
15704                $strict_property_to_file_of{lc $name} = $file;
15705            }
15706            else {
15707                $loose_property_to_file_of{standardize($name)} = $file;
15708            }
15709        }
15710
15711        # And a way for Unicode::UCD to find the proper key in the SwashInfo
15712        # hash for this property.
15713        $file_to_swash_name{$file} = "To" . $table->swash_name;
15714        return;
15715    }
15716
15717    # Do all of the work for all equivalent tables when called with the leader
15718    # table, so skip if isn't the leader.
15719    return if $table->leader != $table;
15720
15721    # If this is a complement of another file, use that other file instead,
15722    # with a ! prepended to it.
15723    my $complement;
15724    if (($complement = $table->complement) != 0) {
15725        my @directories = $complement->file_path;
15726
15727        # This assumes that the 0th element is something like 'lib',
15728        # the 1th element the property name (in its own directory), like
15729        # 'AHex', and the 2th element the file like 'Y' which will have a .pl
15730        # appended to it later.
15731        $directories[1] =~ s/^/!/;
15732        $file = pop @directories;
15733        $directory_ref =\@directories;
15734    }
15735
15736    # Join all the file path components together, using slashes.
15737    my $full_filename = join('/', @$directory_ref, $file);
15738
15739    # All go in the same subdirectory of unicore, or the special
15740    # pseudo-directory '#'
15741    if ($directory_ref->[0] !~ / ^ $matches_directory | \# $ /x) {
15742        Carp::my_carp("Unexpected directory in "
15743                .  join('/', @{$directory_ref}, $file));
15744    }
15745
15746    # For this table and all its equivalents ...
15747    foreach my $table ($table, $table->equivalents) {
15748
15749        # Associate it with its file internally.  Don't include the
15750        # $matches_directory first component
15751        $table->set_file_path(@$directory_ref, $file);
15752
15753        # No swash means don't do the rest of this.
15754        next if $table->isa('Map_Table') && $table->fate != $ORDINARY;
15755
15756        my $sub_filename = join('/', $directory_ref->[1, -1], $file);
15757
15758        my $property = $table->property;
15759        my $property_name = ($property == $perl)
15760                             ? ""  # 'perl' is never explicitly stated
15761                             : standardize($property->name) . '=';
15762
15763        my $is_default = 0; # Is this table the default one for the property?
15764
15765        # To calculate $is_default, we find if this table is the same as the
15766        # default one for the property.  But this is complicated by the
15767        # possibility that there is a master table for this one, and the
15768        # information is stored there instead of here.
15769        my $parent = $table->parent;
15770        my $leader_prop = $parent->property;
15771        my $default_map = $leader_prop->default_map;
15772        if (defined $default_map) {
15773            my $default_table = $leader_prop->table($default_map);
15774            $is_default = 1 if defined $default_table && $parent == $default_table;
15775        }
15776
15777        # Calculate the loose name for this table.  Mostly it's just its name,
15778        # standardized.  But in the case of Perl tables that are single-form
15779        # equivalents to Unicode properties, it is the latter's name.
15780        my $loose_table_name =
15781                        ($property != $perl || $leader_prop == $perl)
15782                        ? standardize($table->name)
15783                        : standardize($parent->name);
15784
15785        my $deprecated = ($table->status eq $DEPRECATED)
15786                         ? $table->status_info
15787                         : "";
15788        my $caseless_equivalent = $table->caseless_equivalent;
15789
15790        # And for each of the table's aliases...  This inner loop eventually
15791        # goes through all aliases in the UCD that we generate regex match
15792        # files for
15793        foreach my $alias ($table->aliases) {
15794            my $standard = UCD_name($table, $alias);
15795
15796            # Generate an entry in either the loose or strict hashes, which
15797            # will translate the property and alias names combination into the
15798            # file where the table for them is stored.
15799            if ($alias->loose_match) {
15800                if (exists $loose_to_file_of{$standard}) {
15801                    Carp::my_carp("Can't change file registered to $loose_to_file_of{$standard} to '$sub_filename'.");
15802                }
15803                else {
15804                    $loose_to_file_of{$standard} = $sub_filename;
15805                }
15806            }
15807            else {
15808                if (exists $stricter_to_file_of{$standard}) {
15809                    Carp::my_carp("Can't change file registered to $stricter_to_file_of{$standard} to '$sub_filename'.");
15810                }
15811                else {
15812                    $stricter_to_file_of{$standard} = $sub_filename;
15813
15814                    # Tightly coupled with how Unicode::UCD works, for a
15815                    # floating point number that is a whole number, get rid of
15816                    # the trailing decimal point and 0's, so that Unicode::UCD
15817                    # will work.  Also note that this assumes that such a
15818                    # number is matched strictly; so if that were to change,
15819                    # this would be wrong.
15820                    if ((my $integer_name = $alias->name)
15821                            =~ s/^ ( -? \d+ ) \.0+ $ /$1/x)
15822                    {
15823                        $stricter_to_file_of{$property_name . $integer_name}
15824                                                            = $sub_filename;
15825                    }
15826                }
15827            }
15828
15829            # For Unicode::UCD, create a mapping of the prop=value to the
15830            # canonical =value for that property.
15831            if ($standard =~ /=/) {
15832
15833                # This could happen if a strict name mapped into an existing
15834                # loose name.  In that event, the strict names would have to
15835                # be moved to a new hash.
15836                if (exists($loose_to_standard_value{$standard})) {
15837                    Carp::my_carp_bug("'$standard' conflicts with a pre-existing use.  Bad News.  Continuing anyway");
15838                }
15839                $loose_to_standard_value{$standard} = $loose_table_name;
15840            }
15841
15842            # Keep a list of the deprecated properties and their filenames
15843            if ($deprecated && $complement == 0) {
15844                $Unicode::UCD::why_deprecated{$sub_filename} = $deprecated;
15845            }
15846
15847            # And a substitute table, if any, for case-insensitive matching
15848            if ($caseless_equivalent != 0) {
15849                $caseless_equivalent_to{$standard} = $caseless_equivalent;
15850            }
15851
15852            # Add to defaults list if the table this alias belongs to is the
15853            # default one
15854            $loose_defaults{$standard} = 1 if $is_default;
15855        }
15856    }
15857
15858    return;
15859}
15860
15861{   # Closure
15862    my %base_names;  # Names already used for avoiding DOS 8.3 filesystem
15863                     # conflicts
15864    my %full_dir_name_of;   # Full length names of directories used.
15865
15866    sub construct_filename($name, $mutable, $directories_ref) {
15867        # Return a file name for a table, based on the table name, but perhaps
15868        # changed to get rid of non-portable characters in it, and to make
15869        # sure that it is unique on a file system that allows the names before
15870        # any period to be at most 8 characters (DOS).  While we're at it
15871        # check and complain if there are any directory conflicts.
15872
15873        # $name                 # The name to start with
15874        # $mutable              # Boolean: can it be changed?  If no, but
15875                                # yet it must be to work properly, a warning
15876                                # is given
15877        # $directories_ref      # A reference to an array containing the
15878                                # path to the file, with each element one path
15879                                # component.  This is used because the same
15880                                # name can be used in different directories.
15881
15882        my $warn = ! defined wantarray;  # If true, then if the name is
15883                                # changed, a warning is issued as well.
15884
15885        if (! defined $name) {
15886            Carp::my_carp("Undefined name in directory "
15887                          . File::Spec->join(@$directories_ref)
15888                          . ". '_' used");
15889            return '_';
15890        }
15891
15892        # Make sure that no directory names conflict with each other.  Look at
15893        # each directory in the input file's path.  If it is already in use,
15894        # assume it is correct, and is merely being re-used, but if we
15895        # truncate it to 8 characters, and find that there are two directories
15896        # that are the same for the first 8 characters, but differ after that,
15897        # then that is a problem.
15898        foreach my $directory (@$directories_ref) {
15899            my $short_dir = substr($directory, 0, 8);
15900            if (defined $full_dir_name_of{$short_dir}) {
15901                next if $full_dir_name_of{$short_dir} eq $directory;
15902                Carp::my_carp("Directory $directory conflicts with directory $full_dir_name_of{$short_dir}.  Bad News.  Continuing anyway");
15903            }
15904            else {
15905                $full_dir_name_of{$short_dir} = $directory;
15906            }
15907        }
15908
15909        my $path = join '/', @$directories_ref;
15910        $path .= '/' if $path;
15911
15912        # Remove interior underscores.
15913        (my $filename = $name) =~ s/ (?<=.) _ (?=.) //xg;
15914
15915        # Convert the dot in floating point numbers to an underscore
15916        $filename =~ s/\./_/ if $filename =~ / ^ \d+ \. \d+ $ /x;
15917
15918        my $suffix = "";
15919
15920        # Extract any suffix, delete any non-word character, and truncate to 3
15921        # after the dot
15922        if ($filename =~ m/ ( .*? ) ( \. .* ) /x) {
15923            $filename = $1;
15924            $suffix = $2;
15925            $suffix =~ s/\W+//g;
15926            substr($suffix, 4) = "" if length($suffix) > 4;
15927        }
15928
15929        # Change any non-word character outside the suffix into an underscore,
15930        # and truncate to 8.
15931        $filename =~ s/\W+/_/g;   # eg., "L&" -> "L_"
15932        substr($filename, 8) = "" if length($filename) > 8;
15933
15934        # Make sure the basename doesn't conflict with something we
15935        # might have already written. If we have, say,
15936        #     InGreekExtended1
15937        #     InGreekExtended2
15938        # they become
15939        #     InGreekE
15940        #     InGreek2
15941        my $warned = 0;
15942        while (my $num = $base_names{$path}{lc "$filename$suffix"}++) {
15943            $num++; # so basenames with numbers start with '2', which
15944                    # just looks more natural.
15945
15946            # Want to append $num, but if it'll make the basename longer
15947            # than 8 characters, pre-truncate $filename so that the result
15948            # is acceptable.
15949            my $delta = length($filename) + length($num) - 8;
15950            if ($delta > 0) {
15951                substr($filename, -$delta) = $num;
15952            }
15953            else {
15954                $filename .= $num;
15955            }
15956            if ($warn && ! $warned) {
15957                $warned = 1;
15958                Carp::my_carp("'$path$name' conflicts with another name on a filesystem with 8 significant characters (like DOS).  Proceeding anyway.");
15959            }
15960        }
15961
15962        return $filename if $mutable;
15963
15964        # If not changeable, must return the input name, but warn if needed to
15965        # change it beyond shortening it.
15966        if ($name ne $filename
15967            && substr($name, 0, length($filename)) ne $filename) {
15968            Carp::my_carp("'$path$name' had to be changed into '$filename'.  Bad News.  Proceeding anyway.");
15969        }
15970        return $name;
15971    }
15972}
15973
15974# The pod file contains a very large table.  Many of the lines in that table
15975# would exceed a typical output window's size, and so need to be wrapped with
15976# a hanging indent to make them look good.  The pod language is really
15977# insufficient here.  There is no general construct to do that in pod, so it
15978# is done here by beginning each such line with a space to cause the result to
15979# be output without formatting, and doing all the formatting here.  This leads
15980# to the result that if the eventual display window is too narrow it won't
15981# look good, and if the window is too wide, no advantage is taken of that
15982# extra width.  A further complication is that the output may be indented by
15983# the formatter so that there is less space than expected.  What I (khw) have
15984# done is to assume that that indent is a particular number of spaces based on
15985# what it is in my Linux system;  people can always resize their windows if
15986# necessary, but this is obviously less than desirable, but the best that can
15987# be expected.
15988my $automatic_pod_indent = 8;
15989
15990# Try to format so that uses fewest lines, but few long left column entries
15991# slide into the right column.  An experiment on 5.1 data yielded the
15992# following percentages that didn't cut into the other side along with the
15993# associated first-column widths
15994# 69% = 24
15995# 80% not too bad except for a few blocks
15996# 90% = 33; # , cuts 353/3053 lines from 37 = 12%
15997# 95% = 37;
15998my $indent_info_column = 27;    # 75% of lines didn't have overlap
15999
16000my $FILLER = 3;     # Length of initial boiler-plate columns in a pod line
16001                    # The 3 is because of:
16002                    #   1   for the leading space to tell the pod formatter to
16003                    #       output as-is
16004                    #   1   for the flag
16005                    #   1   for the space between the flag and the main data
16006
16007sub format_pod_line($first_column_width, $entry, $info, $status = "", $loose_match = 1 ) {
16008    # Take a pod line and return it, formatted properly
16009
16010    # $entry Contents of left column
16011    # $info Contents of right column
16012
16013    my $flags = "";
16014    $flags .= $STRICTER if ! $loose_match;
16015
16016    $flags .= $status if $status;
16017
16018    # There is a blank in the left column to cause the pod formatter to
16019    # output the line as-is.
16020    return sprintf " %-*s%-*s %s\n",
16021                    # The first * in the format is replaced by this, the -1 is
16022                    # to account for the leading blank.  There isn't a
16023                    # hard-coded blank after this to separate the flags from
16024                    # the rest of the line, so that in the unlikely event that
16025                    # multiple flags are shown on the same line, they both
16026                    # will get displayed at the expense of that separation,
16027                    # but since they are left justified, a blank will be
16028                    # inserted in the normal case.
16029                    $FILLER - 1,
16030                    $flags,
16031
16032                    # The other * in the format is replaced by this number to
16033                    # cause the first main column to right fill with blanks.
16034                    # The -1 is for the guaranteed blank following it.
16035                    $first_column_width - $FILLER - 1,
16036                    $entry,
16037                    $info;
16038}
16039
16040my @zero_match_tables;  # List of tables that have no matches in this release
16041
16042sub make_re_pod_entries($input_table) {
16043    # This generates the entries for the pod file for a given table.
16044    # Also done at this time are any children tables.  The output looks like:
16045    # \p{Common}              \p{Script=Common} (Short: \p{Zyyy}) (5178)
16046
16047    # Generate parent and all its children at the same time.
16048    return if $input_table->parent != $input_table;
16049
16050    my $property = $input_table->property;
16051    my $type = $property->type;
16052    my $full_name = $property->full_name;
16053
16054    my $count = $input_table->count;
16055    my $unicode_count;
16056    my $non_unicode_string;
16057    if ($count > $MAX_UNICODE_CODEPOINTS) {
16058        $unicode_count = $count - ($MAX_WORKING_CODEPOINT
16059                                    - $MAX_UNICODE_CODEPOINT);
16060        $non_unicode_string = " plus all above-Unicode code points";
16061    }
16062    else {
16063        $unicode_count = $count;
16064        $non_unicode_string = "";
16065    }
16066
16067    my $string_count = clarify_number($unicode_count) . $non_unicode_string;
16068
16069    my $definition = $input_table->calculate_table_definition;
16070    if ($definition) {
16071
16072        # Save the definition for later use.
16073        $input_table->set_definition($definition);
16074
16075        $definition = ": $definition";
16076    }
16077
16078    my $status = $input_table->status;
16079    my $status_info = $input_table->status_info;
16080    my $caseless_equivalent = $input_table->caseless_equivalent;
16081
16082    # Don't mention a placeholder equivalent as it isn't to be listed in the
16083    # pod
16084    $caseless_equivalent = 0 if $caseless_equivalent != 0
16085                                && $caseless_equivalent->fate > $ORDINARY;
16086
16087    my $entry_for_first_table; # The entry for the first table output.
16088                           # Almost certainly, it is the parent.
16089
16090    # For each related table (including itself), we will generate a pod entry
16091    # for each name each table goes by
16092    foreach my $table ($input_table, $input_table->children) {
16093
16094        # Unicode::UCD cannot deal with null string property values, so skip
16095        # any tables that have no non-null names.
16096        next if ! grep { $_->name ne "" } $table->aliases;
16097
16098        # First, gather all the info that applies to this table as a whole.
16099
16100        push @zero_match_tables, $table if $count == 0
16101                                            # Don't mention special tables
16102                                            # as being zero length
16103                                           && $table->fate == $ORDINARY;
16104
16105        my $table_property = $table->property;
16106
16107        # The short name has all the underscores removed, while the full name
16108        # retains them.  Later, we decide whether to output a short synonym
16109        # for the full one, we need to compare apples to apples, so we use the
16110        # short name's length including underscores.
16111        my $table_property_short_name_length;
16112        my $table_property_short_name
16113            = $table_property->short_name(\$table_property_short_name_length);
16114        my $table_property_full_name = $table_property->full_name;
16115
16116        # Get how much savings there is in the short name over the full one
16117        # (delta will always be <= 0)
16118        my $table_property_short_delta = $table_property_short_name_length
16119                                         - length($table_property_full_name);
16120        my @table_description = $table->description;
16121        my @table_note = $table->note;
16122
16123        # Generate an entry for each alias in this table.
16124        my $entry_for_first_alias;  # saves the first one encountered.
16125        foreach my $alias ($table->aliases) {
16126
16127            # Skip if not to go in pod.
16128            next unless $alias->make_re_pod_entry;
16129
16130            # Start gathering all the components for the entry
16131            my $name = $alias->name;
16132
16133            # Skip if name is empty, as can't be accessed by regexes.
16134            next if $name eq "";
16135
16136            my $entry;      # Holds the left column, may include extras
16137            my $entry_ref;  # To refer to the left column's contents from
16138                            # another entry; has no extras
16139
16140            # First the left column of the pod entry.  Tables for the $perl
16141            # property always use the single form.
16142            if ($table_property == $perl) {
16143                $entry = "\\p{$name}";
16144                $entry .= " \\p$name" if length $name == 1; # Show non-braced
16145                                                            # form too
16146                $entry_ref = "\\p{$name}";
16147            }
16148            else {    # Compound form.
16149
16150                # Only generate one entry for all the aliases that mean true
16151                # or false in binary properties.  Append a '*' to indicate
16152                # some are missing.  (The heading comment notes this.)
16153                my $rhs;
16154                if ($type == $BINARY) {
16155                    next if $name ne 'N' && $name ne 'Y';
16156                    $rhs = "$name*";
16157                }
16158                elsif ($type != $FORCED_BINARY) {
16159                    $rhs = $name;
16160                }
16161                else {
16162
16163                    # Forced binary properties require special handling.  It
16164                    # has two sets of tables, one set is true/false; and the
16165                    # other set is everything else.  Entries are generated for
16166                    # each set.  Use the Bidi_Mirrored property (which appears
16167                    # in all Unicode versions) to get a list of the aliases
16168                    # for the true/false tables.  Of these, only output the N
16169                    # and Y ones, the same as, a regular binary property.  And
16170                    # output all the rest, same as a non-binary property.
16171                    my $bm = property_ref("Bidi_Mirrored");
16172                    if ($name eq 'N' || $name eq 'Y') {
16173                        $rhs = "$name*";
16174                    } elsif (grep { $name eq $_->name } $bm->table("Y")->aliases,
16175                                                        $bm->table("N")->aliases)
16176                    {
16177                        next;
16178                    }
16179                    else {
16180                        $rhs = $name;
16181                    }
16182                }
16183
16184                # Colon-space is used to give a little more space to be easier
16185                # to read;
16186                $entry = "\\p{"
16187                        . $table_property_full_name
16188                        . ": $rhs}";
16189
16190                # But for the reference to this entry, which will go in the
16191                # right column, where space is at a premium, use equals
16192                # without a space
16193                $entry_ref = "\\p{" . $table_property_full_name . "=$name}";
16194            }
16195
16196            # Then the right (info) column.  This is stored as components of
16197            # an array for the moment, then joined into a string later.  For
16198            # non-internal only properties, begin the info with the entry for
16199            # the first table we encountered (if any), as things are ordered
16200            # so that that one is the most descriptive.  This leads to the
16201            # info column of an entry being a more descriptive version of the
16202            # name column
16203            my @info;
16204            if ($name =~ /^_/) {
16205                push @info,
16206                        '(For internal use by Perl, not necessarily stable)';
16207            }
16208            elsif ($entry_for_first_alias) {
16209                push @info, $entry_for_first_alias;
16210            }
16211
16212            # If this entry is equivalent to another, add that to the info,
16213            # using the first such table we encountered
16214            if ($entry_for_first_table) {
16215                if (@info) {
16216                    push @info, "(= $entry_for_first_table)";
16217                }
16218                else {
16219                    push @info, $entry_for_first_table;
16220                }
16221            }
16222
16223            # If the name is a large integer, add an equivalent with an
16224            # exponent for better readability
16225            if ($name =~ /^[+-]?[\d]+$/ && $name >= 10_000) {
16226                push @info, sprintf "(= %.1e)", $name
16227            }
16228
16229            my $parenthesized = "";
16230            if (! $entry_for_first_alias) {
16231
16232                # This is the first alias for the current table.  The alias
16233                # array is ordered so that this is the fullest, most
16234                # descriptive alias, so it gets the fullest info.  The other
16235                # aliases are mostly merely pointers to this one, using the
16236                # information already added above.
16237
16238                # Display any status message, but only on the parent table
16239                if ($status && ! $entry_for_first_table) {
16240                    push @info, $status_info;
16241                }
16242
16243                # Put out any descriptive info
16244                if (@table_description || @table_note) {
16245                    push @info, join "; ", @table_description, @table_note;
16246                }
16247
16248                # Look to see if there is a shorter name we can point people
16249                # at
16250                my $standard_name = standardize($name);
16251                my $short_name;
16252                my $proposed_short = $table->short_name;
16253                if (defined $proposed_short) {
16254                    my $standard_short = standardize($proposed_short);
16255
16256                    # If the short name is shorter than the standard one, or
16257                    # even if it's not, but the combination of it and its
16258                    # short property name (as in \p{prop=short} ($perl doesn't
16259                    # have this form)) saves at least two characters, then,
16260                    # cause it to be listed as a shorter synonym.
16261                    if (length $standard_short < length $standard_name
16262                        || ($table_property != $perl
16263                            && (length($standard_short)
16264                                - length($standard_name)
16265                                + $table_property_short_delta)  # (<= 0)
16266                                < -2))
16267                    {
16268                        $short_name = $proposed_short;
16269                        if ($table_property != $perl) {
16270                            $short_name = $table_property_short_name
16271                                          . "=$short_name";
16272                        }
16273                        $short_name = "\\p{$short_name}";
16274                    }
16275                }
16276
16277                # And if this is a compound form name, see if there is a
16278                # single form equivalent
16279                my $single_form;
16280                if ($table_property != $perl && $table_property != $block) {
16281
16282                    # Special case the binary N tables, so that will print
16283                    # \P{single}, but use the Y table values to populate
16284                    # 'single', as we haven't likewise populated the N table.
16285                    # For forced binary tables, we can't just look at the N
16286                    # table, but must see if this table is equivalent to the N
16287                    # one, as there are two equivalent beasts in these
16288                    # properties.
16289                    my $test_table;
16290                    my $p;
16291                    if (   ($type == $BINARY
16292                            && $input_table == $property->table('No'))
16293                        || ($type == $FORCED_BINARY
16294                            && $property->table('No')->
16295                                        is_set_equivalent_to($input_table)))
16296                    {
16297                        $test_table = $property->table('Yes');
16298                        $p = 'P';
16299                    }
16300                    else {
16301                        $test_table = $input_table;
16302                        $p = 'p';
16303                    }
16304
16305                    # Look for a single form amongst all the children.
16306                    foreach my $table ($test_table->children) {
16307                        next if $table->property != $perl;
16308                        my $proposed_name = $table->short_name;
16309                        next if ! defined $proposed_name;
16310
16311                        # Don't mention internal-only properties as a possible
16312                        # single form synonym
16313                        next if substr($proposed_name, 0, 1) eq '_';
16314
16315                        $proposed_name = "\\$p\{$proposed_name}";
16316                        if (! defined $single_form
16317                            || length($proposed_name) < length $single_form)
16318                        {
16319                            $single_form = $proposed_name;
16320
16321                            # The goal here is to find a single form; not the
16322                            # shortest possible one.  We've already found a
16323                            # short name.  So, stop at the first single form
16324                            # found, which is likely to be closer to the
16325                            # original.
16326                            last;
16327                        }
16328                    }
16329                }
16330
16331                # Output both short and single in the same parenthesized
16332                # expression, but with only one of 'Single', 'Short' if there
16333                # are both items.
16334                if ($short_name || $single_form || $table->conflicting) {
16335                    $parenthesized .= "Short: $short_name" if $short_name;
16336                    if ($short_name && $single_form) {
16337                        $parenthesized .= ', ';
16338                    }
16339                    elsif ($single_form) {
16340                        $parenthesized .= 'Single: ';
16341                    }
16342                    $parenthesized .= $single_form if $single_form;
16343                }
16344            }
16345
16346            if ($caseless_equivalent != 0) {
16347                $parenthesized .=  '; ' if $parenthesized ne "";
16348                $parenthesized .= "/i= " . $caseless_equivalent->complete_name;
16349            }
16350
16351
16352            # Warn if this property isn't the same as one that a
16353            # semi-casual user might expect.  The other components of this
16354            # parenthesized structure are calculated only for the first entry
16355            # for this table, but the conflicting is deemed important enough
16356            # to go on every entry.
16357            my $conflicting = join " NOR ", $table->conflicting;
16358            if ($conflicting) {
16359                $parenthesized .=  '; ' if $parenthesized ne "";
16360                $parenthesized .= "NOT $conflicting";
16361            }
16362
16363            push @info, "($parenthesized)" if $parenthesized;
16364
16365            if ($name =~ /_$/ && $alias->loose_match) {
16366                push @info, "Note the trailing '_' matters in spite of loose matching rules.";
16367            }
16368
16369            if ($table_property != $perl && $table->perl_extension) {
16370                push @info, '(Perl extension)';
16371            }
16372            my $definition = $table->definition // "";
16373            $definition = "" if $entry_for_first_alias;
16374            $definition = ": $definition" if $definition;
16375            push @info, "($string_count$definition)";
16376
16377            # Now, we have both the entry and info so add them to the
16378            # list of all the properties.
16379            push @match_properties,
16380                format_pod_line($indent_info_column,
16381                                $entry,
16382                                join( " ", @info),
16383                                $alias->status,
16384                                $alias->loose_match);
16385
16386            $entry_for_first_alias = $entry_ref unless $entry_for_first_alias;
16387        } # End of looping through the aliases for this table.
16388
16389        if (! $entry_for_first_table) {
16390            $entry_for_first_table = $entry_for_first_alias;
16391        }
16392    } # End of looping through all the related tables
16393    return;
16394}
16395
16396sub make_ucd_table_pod_entries($table) {
16397    # Generate the entries for the UCD section of the pod for $table.  This
16398    # also calculates if names are ambiguous, so has to be called even if the
16399    # pod is not being output
16400
16401    my $short_name = $table->name;
16402    my $standard_short_name = standardize($short_name);
16403    my $full_name = $table->full_name;
16404    my $standard_full_name = standardize($full_name);
16405
16406    my $full_info = "";     # Text of info column for full-name entries
16407    my $other_info = "";    # Text of info column for short-name entries
16408    my $short_info = "";    # Text of info column for other entries
16409    my $meaning = "";       # Synonym of this table
16410
16411    my $property = ($table->isa('Property'))
16412                   ? $table
16413                   : $table->parent->property;
16414
16415    my $perl_extension = $table->perl_extension;
16416    my $is_perl_extension_match_table_but_not_dollar_perl
16417                                                        = $property != $perl
16418                                                       && $perl_extension
16419                                                       && $property != $table;
16420
16421    # Get the more official name for perl extensions that aren't
16422    # stand-alone properties
16423    if ($is_perl_extension_match_table_but_not_dollar_perl) {
16424        if ($property->type == $BINARY) {
16425            $meaning = $property->full_name;
16426        }
16427        else {
16428            $meaning = $table->parent->complete_name;
16429        }
16430    }
16431
16432    # There are three types of info column.  One for the short name, one for
16433    # the full name, and one for everything else.  They mostly are the same,
16434    # so initialize in the same loop.
16435
16436    foreach my $info_ref (\$full_info, \$short_info, \$other_info) {
16437        if ($info_ref != \$full_info) {
16438
16439            # The non-full name columns include the full name
16440            $$info_ref .= $full_name;
16441        }
16442
16443
16444        if ($is_perl_extension_match_table_but_not_dollar_perl) {
16445
16446            # Add the synonymous name for the non-full name entries; and to
16447            # the full-name entry if it adds extra information
16448            if (   standardize($meaning) ne $standard_full_name
16449                || $info_ref == \$other_info
16450                || $info_ref == \$short_info)
16451            {
16452                my $parenthesized =  $info_ref != \$full_info;
16453                $$info_ref .= " " if $$info_ref && $parenthesized;
16454                $$info_ref .= "(=" if $parenthesized;
16455                $$info_ref .= "$meaning";
16456                $$info_ref .= ")" if $parenthesized;
16457                $$info_ref .= ".";
16458            }
16459        }
16460
16461        # And the full-name entry includes the short name, if shorter
16462        if ($info_ref == \$full_info
16463            && length $standard_short_name < length $standard_full_name)
16464        {
16465            $full_info =~ s/\.\Z//;
16466            $full_info .= "  " if $full_info;
16467            $full_info .= "(Short: $short_name)";
16468        }
16469
16470        if ($table->perl_extension) {
16471            $$info_ref =~ s/\.\Z//;
16472            $$info_ref .= ".  " if $$info_ref;
16473            $$info_ref .= "(Perl extension)";
16474        }
16475    }
16476
16477    my $definition;
16478    my $definition_table;
16479    my $type = $table->property->type;
16480    if ($type == $BINARY || $type == $FORCED_BINARY) {
16481        $definition_table = $table->property->table('Y');
16482    }
16483    elsif ($table->isa('Match_Table')) {
16484        $definition_table = $table;
16485    }
16486
16487    $definition = $definition_table->calculate_table_definition
16488                                            if defined $definition_table
16489                                                    && $definition_table != 0;
16490
16491    # Add any extra annotations to the full name entry
16492    foreach my $more_info ($table->description,
16493                            $definition,
16494                            $table->note,
16495                            $table->status_info)
16496    {
16497        next unless $more_info;
16498        $full_info =~ s/\.\Z//;
16499        $full_info .= ".  " if $full_info;
16500        $full_info .= $more_info;
16501    }
16502    if ($table->property->type == $FORCED_BINARY) {
16503        if ($full_info) {
16504            $full_info =~ s/\.\Z//;
16505            $full_info .= ".  ";
16506        }
16507        $full_info .= "This is a combination property which has both:"
16508                    . " 1) a map to various string values; and"
16509                    . " 2) a map to boolean Y/N, where 'Y' means the"
16510                    . " string value is non-empty.  Add the prefix 'is'"
16511                    . " to the prop_invmap() call to get the latter";
16512    }
16513
16514    # These keep track if have created full and short name pod entries for the
16515    # property
16516    my $done_full = 0;
16517    my $done_short = 0;
16518
16519    # Every possible name is kept track of, even those that aren't going to be
16520    # output.  This way we can be sure to find the ambiguities.
16521    foreach my $alias ($table->aliases) {
16522        my $name = $alias->name;
16523        my $standard = standardize($name);
16524        my $info;
16525        my $output_this = $alias->ucd;
16526
16527        # If the full and short names are the same, we want to output the full
16528        # one's entry, so it has priority.
16529        if ($standard eq $standard_full_name) {
16530            next if $done_full;
16531            $done_full = 1;
16532            $info = $full_info;
16533        }
16534        elsif ($standard eq $standard_short_name) {
16535            next if $done_short;
16536            $done_short = 1;
16537            next if $standard_short_name eq $standard_full_name;
16538            $info = $short_info;
16539        }
16540        else {
16541            $info = $other_info;
16542        }
16543
16544        $combination_property{$standard} = 1
16545                                  if $table->property->type == $FORCED_BINARY;
16546
16547        # Here, we have set up the two columns for this entry.  But if an
16548        # entry already exists for this name, we have to decide which one
16549        # we're going to later output.
16550        if (exists $ucd_pod{$standard}) {
16551
16552            # If the two entries refer to the same property, it's not going to
16553            # be ambiguous.  (Likely it's because the names when standardized
16554            # are the same.)  But that means if they are different properties,
16555            # there is ambiguity.
16556            if ($ucd_pod{$standard}->{'property'} != $property) {
16557
16558                # Here, we have an ambiguity.  This code assumes that one is
16559                # scheduled to be output and one not and that one is a perl
16560                # extension (which is not to be output) and the other isn't.
16561                # If those assumptions are wrong, things have to be rethought.
16562                if ($ucd_pod{$standard}{'output_this'} == $output_this
16563                    || $ucd_pod{$standard}{'perl_extension'} == $perl_extension
16564                    || $output_this == $perl_extension)
16565                {
16566                    Carp::my_carp("Bad news.  $property and $ucd_pod{$standard}->{'property'} have unexpected output status and perl-extension combinations.  Proceeding anyway.");
16567                }
16568
16569                # We modify the info column of the one being output to
16570                # indicate the ambiguity.  Set $which to point to that one's
16571                # info.
16572                my $which;
16573                if ($ucd_pod{$standard}{'output_this'}) {
16574                    $which = \$ucd_pod{$standard}->{'info'};
16575                }
16576                else {
16577                    $which = \$info;
16578                    $meaning = $ucd_pod{$standard}{'meaning'};
16579                }
16580
16581                chomp $$which;
16582                $$which =~ s/\.\Z//;
16583                $$which .= "; NOT '$standard' meaning '$meaning'";
16584
16585                $ambiguous_names{$standard} = 1;
16586            }
16587
16588            # Use the non-perl-extension variant
16589            next unless $ucd_pod{$standard}{'perl_extension'};
16590        }
16591
16592        # Store enough information about this entry that we can later look for
16593        # ambiguities, and output it properly.
16594        $ucd_pod{$standard} = { 'name' => $name,
16595                                'info' => $info,
16596                                'meaning' => $meaning,
16597                                'output_this' => $output_this,
16598                                'perl_extension' => $perl_extension,
16599                                'property' => $property,
16600                                'status' => $alias->status,
16601        };
16602    } # End of looping through all this table's aliases
16603
16604    return;
16605}
16606
16607sub pod_alphanumeric_sort {
16608    # Sort pod entries alphanumerically.
16609
16610    # The first few character columns are filler, plus the '\p{'; and get rid
16611    # of all the trailing stuff, starting with the trailing '}', so as to sort
16612    # on just 'Name=Value'
16613    (my $a = lc $a) =~ s/^ .*? \{ //x;
16614    $a =~ s/}.*//;
16615    (my $b = lc $b) =~ s/^ .*? \{ //x;
16616    $b =~ s/}.*//;
16617
16618    # Determine if the two operands are both internal only or both not.
16619    # Character 0 should be a '\'; 1 should be a p; 2 should be '{', so 3
16620    # should be the underscore that begins internal only
16621    my $a_is_internal = (substr($a, 0, 1) eq '_');
16622    my $b_is_internal = (substr($b, 0, 1) eq '_');
16623
16624    # Sort so the internals come last in the table instead of first (which the
16625    # leading underscore would otherwise indicate).
16626    if ($a_is_internal != $b_is_internal) {
16627        return 1 if $a_is_internal;
16628        return -1
16629    }
16630
16631    # Determine if the two operands are compound or not, and if so if are
16632    # "numeric" property values or not, like \p{Age: 3.0}.  But there are also
16633    # things like \p{Canonical_Combining_Class: CCC133} and \p{Age: V10_0},
16634    # all of which this considers numeric, and for sorting, looks just at the
16635    # numeric parts.  It can also be a rational like \p{Numeric Value=-1/2}.
16636    my $split_re = qr/
16637        ^ ( [^:=]+ ) # $1 is undef if not a compound form, otherwise is the
16638                     # property name
16639        [:=] \s*     # The syntax for the compound form
16640        (?:          # followed by ...
16641            (        # $2 gets defined if what follows is a "numeric"
16642                     # expression, which is ...
16643              ( -? \d+ (?: [.\/] \d+)?  # An integer, float, or rational
16644                                        # number, optionally signed
16645               | [[:alpha:]]{2,} \d+ $ ) # or something like CCC131.  Either
16646                                         # of these go into $3
16647             | ( V \d+ _ \d+ )           # or a Unicode's Age property version
16648                                         # number, into $4
16649            )
16650            | .* $    # If not "numeric", accept anything so that $1 gets
16651                      # defined if it is any compound form
16652        ) /ix;
16653    my ($a_initial, $a_numeric, $a_number, $a_version) = ($a =~ $split_re);
16654    my ($b_initial, $b_numeric, $b_number, $b_version) = ($b =~ $split_re);
16655
16656    # Sort alphabeticlly on the whole property name if either operand isn't
16657    # compound, or they differ.
16658    return $a cmp $b if   ! defined $a_initial
16659                       || ! defined $b_initial
16660                       || $a_initial ne $b_initial;
16661
16662    if (! defined $a_numeric) {
16663
16664        # If neither is numeric, use alpha sort
16665        return $a cmp $b if ! defined $b_numeric;
16666        return 1;  # Sort numeric ahead of alpha
16667    }
16668
16669    # Here $a is numeric
16670    return -1 if ! defined $b_numeric;  # Numeric sorts before alpha
16671
16672    # Here they are both numeric in the same property.
16673    # Convert version numbers into regular numbers
16674    if (defined $a_version) {
16675        ($a_number = $a_version) =~ s/^V//i;
16676        $a_number =~ s/_/./;
16677    }
16678    else {  # Otherwise get rid of the, e.g., CCC in CCC9 */
16679        $a_number =~ s/ ^ [[:alpha:]]+ //x;
16680    }
16681    if (defined $b_version) {
16682        ($b_number = $b_version) =~ s/^V//i;
16683        $b_number =~ s/_/./;
16684    }
16685    else {
16686        $b_number =~ s/ ^ [[:alpha:]]+ //x;
16687    }
16688
16689    # Convert rationals to floating for the comparison.
16690    $a_number = eval $a_number if $a_number =~ qr{/};
16691    $b_number = eval $b_number if $b_number =~ qr{/};
16692
16693    return $a_number <=> $b_number || $a cmp $b;
16694}
16695
16696sub make_pod () {
16697    # Create the .pod file.  This generates the various subsections and then
16698    # combines them in one big HERE document.
16699
16700    my $Is_flags_text = "If an entry has flag(s) at its beginning, like \"$DEPRECATED\", the \"Is_\" form has the same flag(s)";
16701
16702    return unless defined $pod_directory;
16703    print "Making pod file\n" if $verbosity >= $PROGRESS;
16704
16705    my $exception_message =
16706    '(Any exceptions are individually noted beginning with the word NOT.)';
16707    my @block_warning;
16708    if (-e 'Blocks.txt') {
16709
16710        # Add the line: '\p{In_*}    \p{Block: *}', with the warning message
16711        # if the global $has_In_conflicts indicates we have them.
16712        push @match_properties, format_pod_line($indent_info_column,
16713                                                '\p{In_*}',
16714                                                '\p{Block: *}'
16715                                                    . (($has_In_conflicts)
16716                                                      ? " $exception_message"
16717                                                      : ""),
16718                                                 $DISCOURAGED);
16719        @block_warning = << "END";
16720
16721In particular, matches in the Block property have single forms
16722defined by Perl that begin with C<"In_">, C<"Is_>, or even with no prefix at
16723all,  Like all B<DISCOURAGED> forms, these are not stable.  For example,
16724C<\\p{Block=Deseret}> can currently be written as C<\\p{In_Deseret}>,
16725C<\\p{Is_Deseret}>, or C<\\p{Deseret}>.  But, a new Unicode version may
16726come along that would force Perl to change the meaning of one or more of
16727these, and your program would no longer be correct.  Currently there are no
16728such conflicts with the form that begins C<"In_">, but there are many with the
16729other two shortcuts, and Unicode continues to define new properties that begin
16730with C<"In">, so it's quite possible that a conflict will occur in the future.
16731The compound form is guaranteed to not become obsolete, and its meaning is
16732clearer anyway.  See L<perlunicode/"Blocks"> for more information about this.
16733
16734User-defined properties must begin with "In" or "Is".  These override any
16735Unicode property of the same name.
16736END
16737    }
16738    my $text = $Is_flags_text;
16739    $text = "$exception_message $text" if $has_Is_conflicts;
16740
16741    # And the 'Is_ line';
16742    push @match_properties, format_pod_line($indent_info_column,
16743                                            '\p{Is_*}',
16744                                            "\\p{*} $text");
16745    push @match_properties, format_pod_line($indent_info_column,
16746            '\p{Name=*}',
16747            "Combination of Name and Name_Alias properties; has special"
16748          . " loose matching rules, for which see Unicode UAX #44");
16749    push @match_properties, format_pod_line($indent_info_column,
16750                                            '\p{Na=*}',
16751                                            '\p{Name=*}');
16752
16753    # Sort the properties array for output.  It is sorted alphabetically
16754    # except numerically for numeric properties, and only output unique lines.
16755    @match_properties = sort pod_alphanumeric_sort uniques @match_properties;
16756
16757    my $formatted_properties = simple_fold(\@match_properties,
16758                                        "",
16759                                        # indent succeeding lines by two extra
16760                                        # which looks better
16761                                        $indent_info_column + 2,
16762
16763                                        # shorten the line length by how much
16764                                        # the formatter indents, so the folded
16765                                        # line will fit in the space
16766                                        # presumably available
16767                                        $automatic_pod_indent);
16768    # Add column headings, indented to be a little more centered, but not
16769    # exactly
16770    $formatted_properties =  format_pod_line($indent_info_column,
16771                                                    '    NAME',
16772                                                    '           INFO')
16773                                    . "\n"
16774                                    . $formatted_properties;
16775
16776    # Generate pod documentation lines for the tables that match nothing
16777    my $zero_matches = "";
16778    if (@zero_match_tables) {
16779        @zero_match_tables = uniques(@zero_match_tables);
16780        $zero_matches = join "\n\n",
16781                        map { $_ = '=item \p{' . $_->complete_name . "}" }
16782                            sort { $a->complete_name cmp $b->complete_name }
16783                            @zero_match_tables;
16784
16785        $zero_matches = <<END;
16786
16787=head2 Legal C<\\p{}> and C<\\P{}> constructs that match no characters
16788
16789Unicode has some property-value pairs that currently don't match anything.
16790This happens generally either because they are obsolete, or they exist for
16791symmetry with other forms, but no language has yet been encoded that uses
16792them.  In this version of Unicode, the following match zero code points:
16793
16794=over 4
16795
16796$zero_matches
16797
16798=back
16799
16800END
16801    }
16802
16803    # Generate list of properties that we don't accept, grouped by the reasons
16804    # why.  This is so only put out the 'why' once, and then list all the
16805    # properties that have that reason under it.
16806
16807    my %why_list;   # The keys are the reasons; the values are lists of
16808                    # properties that have the key as their reason
16809
16810    # For each property, add it to the list that are suppressed for its reason
16811    # The sort will cause the alphabetically first properties to be added to
16812    # each list first, so each list will be sorted.
16813    foreach my $property (sort keys %why_suppressed) {
16814        next unless $why_suppressed{$property};
16815        push @{$why_list{$why_suppressed{$property}}}, $property;
16816    }
16817
16818    # For each reason (sorted by the first property that has that reason)...
16819    my @bad_re_properties;
16820    foreach my $why (sort { $why_list{$a}->[0] cmp $why_list{$b}->[0] }
16821                     keys %why_list)
16822    {
16823        # Add to the output, all the properties that have that reason.
16824        my $has_item = 0;   # Flag if actually output anything.
16825        foreach my $name (@{$why_list{$why}}) {
16826
16827            # Split compound names into $property and $table components
16828            my $property = $name;
16829            my $table;
16830            if ($property =~ / (.*) = (.*) /x) {
16831                $property = $1;
16832                $table = $2;
16833            }
16834
16835            # This release of Unicode may not have a property that is
16836            # suppressed, so don't reference a non-existent one.
16837            $property = property_ref($property);
16838            next if ! defined $property;
16839
16840            # And since this list is only for match tables, don't list the
16841            # ones that don't have match tables.
16842            next if ! $property->to_create_match_tables;
16843
16844            # Find any abbreviation, and turn it into a compound name if this
16845            # is a property=value pair.
16846            my $short_name = $property->name;
16847            $short_name .= '=' . $property->table($table)->name if $table;
16848
16849            # Start with an empty line.
16850            push @bad_re_properties, "\n\n" unless $has_item;
16851
16852            # And add the property as an item for the reason.
16853            push @bad_re_properties, "\n=item I<$name> ($short_name)\n";
16854            $has_item = 1;
16855        }
16856
16857        # And add the reason under the list of properties, if such a list
16858        # actually got generated.  Note that the header got added
16859        # unconditionally before.  But pod ignores extra blank lines, so no
16860        # harm.
16861        push @bad_re_properties, "\n$why\n" if $has_item;
16862
16863    } # End of looping through each reason.
16864
16865    if (! @bad_re_properties) {
16866        push @bad_re_properties,
16867                "*** This installation accepts ALL non-Unihan properties ***";
16868    }
16869    else {
16870        # Add =over only if non-empty to avoid an empty =over/=back section,
16871        # which is considered bad form.
16872        unshift @bad_re_properties, "\n=over 4\n";
16873        push @bad_re_properties, "\n=back\n";
16874    }
16875
16876    # Similarly, generate a list of files that we don't use, grouped by the
16877    # reasons why (Don't output if the reason is empty).  First, create a hash
16878    # whose keys are the reasons, and whose values are anonymous arrays of all
16879    # the files that share that reason.
16880    my %grouped_by_reason;
16881    foreach my $file (keys %skipped_files) {
16882        next unless $skipped_files{$file};
16883        push @{$grouped_by_reason{$skipped_files{$file}}}, $file;
16884    }
16885
16886    # Then, sort each group.
16887    foreach my $group (keys %grouped_by_reason) {
16888        @{$grouped_by_reason{$group}} = sort { lc $a cmp lc $b }
16889                                        @{$grouped_by_reason{$group}} ;
16890    }
16891
16892    # Finally, create the output text.  For each reason (sorted by the
16893    # alphabetically first file that has that reason)...
16894    my @unused_files;
16895    foreach my $reason (sort { lc $grouped_by_reason{$a}->[0]
16896                               cmp lc $grouped_by_reason{$b}->[0]
16897                              }
16898                         keys %grouped_by_reason)
16899    {
16900        # Add all the files that have that reason to the output.  Start
16901        # with an empty line.
16902        push @unused_files, "\n\n";
16903        push @unused_files, map { "\n=item F<$_> \n" }
16904                            @{$grouped_by_reason{$reason}};
16905        # And add the reason under the list of files
16906        push @unused_files, "\n$reason\n";
16907    }
16908
16909    # Similarly, create the output text for the UCD section of the pod
16910    my @ucd_pod;
16911    foreach my $key (keys %ucd_pod) {
16912        next unless $ucd_pod{$key}->{'output_this'};
16913        push @ucd_pod, format_pod_line($indent_info_column,
16914                                       $ucd_pod{$key}->{'name'},
16915                                       $ucd_pod{$key}->{'info'},
16916                                       $ucd_pod{$key}->{'status'},
16917                                      );
16918    }
16919
16920    # Sort alphabetically, and fold for output
16921    @ucd_pod = sort { lc substr($a, 2) cmp lc substr($b, 2) } @ucd_pod;
16922    my $ucd_pod = simple_fold(\@ucd_pod,
16923                           ' ',
16924                           $indent_info_column,
16925                           $automatic_pod_indent);
16926    $ucd_pod =  format_pod_line($indent_info_column, 'NAME', '  INFO')
16927                . "\n"
16928                . $ucd_pod;
16929    my $space_hex = sprintf("%02x", ord " ");
16930    local $" = "";
16931
16932    # Everything is ready to assemble.
16933    my @OUT = << "END";
16934=begin comment
16935
16936$HEADER
16937
16938To change this file, edit $0 instead.
16939
16940=end comment
16941
16942=head1 NAME
16943
16944$pod_file - Index of Unicode Version $unicode_version character properties in Perl
16945
16946=head1 DESCRIPTION
16947
16948This document provides information about the portion of the Unicode database
16949that deals with character properties, that is the portion that is defined on
16950single code points.  (L</Other information in the Unicode data base>
16951below briefly mentions other data that Unicode provides.)
16952
16953Perl can provide access to all non-provisional Unicode character properties,
16954though not all are enabled by default.  The omitted ones are the Unihan
16955properties (accessible via the CPAN module L<Unicode::Unihan>) and certain
16956deprecated or Unicode-internal properties.  (An installation may choose to
16957recompile Perl's tables to change this.  See L</Unicode character
16958properties that are NOT accepted by Perl>.)
16959
16960For most purposes, access to Unicode properties from the Perl core is through
16961regular expression matches, as described in the next section.
16962For some special purposes, and to access the properties that are not suitable
16963for regular expression matching, all the Unicode character properties that
16964Perl handles are accessible via the standard L<Unicode::UCD> module, as
16965described in the section L</Properties accessible through Unicode::UCD>.
16966
16967Perl also provides some additional extensions and short-cut synonyms
16968for Unicode properties.
16969
16970This document merely lists all available properties and does not attempt to
16971explain what each property really means.  There is a brief description of each
16972Perl extension; see L<perlunicode/Other Properties> for more information on
16973these.  There is some detail about Blocks, Scripts, General_Category,
16974and Bidi_Class in L<perlunicode>, but to find out about the intricacies of the
16975official Unicode properties, refer to the Unicode standard.  A good starting
16976place is L<$unicode_reference_url>.
16977
16978Note that you can define your own properties; see
16979L<perlunicode/"User-Defined Character Properties">.
16980
16981=head1 Properties accessible through C<\\p{}> and C<\\P{}>
16982
16983The Perl regular expression C<\\p{}> and C<\\P{}> constructs give access to
16984most of the Unicode character properties.  The table below shows all these
16985constructs, both single and compound forms.
16986
16987B<Compound forms> consist of two components, separated by an equals sign or a
16988colon.  The first component is the property name, and the second component is
16989the particular value of the property to match against, for example,
16990C<\\p{Script_Extensions: Greek}> and C<\\p{Script_Extensions=Greek}> both mean
16991to match characters whose Script_Extensions property value is Greek.
16992(C<Script_Extensions> is an improved version of the C<Script> property.)
16993
16994B<Single forms>, like C<\\p{Greek}>, are mostly Perl-defined shortcuts for
16995their equivalent compound forms.  The table shows these equivalences.  (In our
16996example, C<\\p{Greek}> is a just a shortcut for
16997C<\\p{Script_Extensions=Greek}>).  There are also a few Perl-defined single
16998forms that are not shortcuts for a compound form.  One such is C<\\p{Word}>.
16999These are also listed in the table.
17000
17001In parsing these constructs, Perl always ignores Upper/lower case differences
17002everywhere within the {braces}.  Thus C<\\p{Greek}> means the same thing as
17003C<\\p{greek}>.  But note that changing the case of the C<"p"> or C<"P"> before
17004the left brace completely changes the meaning of the construct, from "match"
17005(for C<\\p{}>) to "doesn't match" (for C<\\P{}>).  Casing in this document is
17006for improved legibility.
17007
17008Also, white space, hyphens, and underscores are normally ignored
17009everywhere between the {braces}, and hence can be freely added or removed
17010even if the C</x> modifier hasn't been specified on the regular expression.
17011But in the table below $a_bold_stricter at the beginning of an entry
17012means that tighter (stricter) rules are used for that entry:
17013
17014=over 4
17015
17016=over 4
17017
17018=item Single form (C<\\p{name}>) tighter rules:
17019
17020White space, hyphens, and underscores ARE significant
17021except for:
17022
17023=over 4
17024
17025=item * white space adjacent to a non-word character
17026
17027=item * underscores separating digits in numbers
17028
17029=back
17030
17031That means, for example, that you can freely add or remove white space
17032adjacent to (but within) the braces without affecting the meaning.
17033
17034=item Compound form (C<\\p{name=value}> or C<\\p{name:value}>) tighter rules:
17035
17036The tighter rules given above for the single form apply to everything to the
17037right of the colon or equals; the looser rules still apply to everything to
17038the left.
17039
17040That means, for example, that you can freely add or remove white space
17041adjacent to (but within) the braces and the colon or equal sign.
17042
17043=back
17044
17045=back
17046
17047Some properties are considered obsolete by Unicode, but still available.
17048There are several varieties of obsolescence:
17049
17050=over 4
17051
17052=over 4
17053
17054=item Stabilized
17055
17056A property may be stabilized.  Such a determination does not indicate
17057that the property should or should not be used; instead it is a declaration
17058that the property will not be maintained nor extended for newly encoded
17059characters.  Such properties are marked with $a_bold_stabilized in the
17060table.
17061
17062=item Deprecated
17063
17064A property may be deprecated, perhaps because its original intent
17065has been replaced by another property, or because its specification was
17066somehow defective.  This means that its use is strongly
17067discouraged, so much so that a warning will be issued if used, unless the
17068regular expression is in the scope of a C<S<no warnings 'deprecated'>>
17069statement.  $A_bold_deprecated flags each such entry in the table, and
17070the entry there for the longest, most descriptive version of the property will
17071give the reason it is deprecated, and perhaps advice.  Perl may issue such a
17072warning, even for properties that aren't officially deprecated by Unicode,
17073when there used to be characters or code points that were matched by them, but
17074no longer.  This is to warn you that your program may not work like it did on
17075earlier Unicode releases.
17076
17077A deprecated property may be made unavailable in a future Perl version, so it
17078is best to move away from them.
17079
17080A deprecated property may also be stabilized, but this fact is not shown.
17081
17082=item Obsolete
17083
17084Properties marked with $a_bold_obsolete in the table are considered (plain)
17085obsolete.  Generally this designation is given to properties that Unicode once
17086used for internal purposes (but not any longer).
17087
17088=item Discouraged
17089
17090This is not actually a Unicode-specified obsolescence, but applies to certain
17091Perl extensions that are present for backwards compatibility, but are
17092discouraged from being used.  These are not obsolete, but their meanings are
17093not stable.  Future Unicode versions could force any of these extensions to be
17094removed without warning, replaced by another property with the same name that
17095means something different.  $A_bold_discouraged flags each such entry in the
17096table.  Use the equivalent shown instead.
17097
17098@block_warning
17099
17100=back
17101
17102=back
17103
17104The table below has two columns.  The left column contains the C<\\p{}>
17105constructs to look up, possibly preceded by the flags mentioned above; and
17106the right column contains information about them, like a description, or
17107synonyms.  The table shows both the single and compound forms for each
17108property that has them.  If the left column is a short name for a property,
17109the right column will give its longer, more descriptive name; and if the left
17110column is the longest name, the right column will show any equivalent shortest
17111name, in both single and compound forms if applicable.
17112
17113If braces are not needed to specify a property (e.g., C<\\pL>), the left
17114column contains both forms, with and without braces.
17115
17116The right column will also caution you if a property means something different
17117than what might normally be expected.
17118
17119All single forms are Perl extensions; a few compound forms are as well, and
17120are noted as such.
17121
17122Numbers in (parentheses) indicate the total number of Unicode code points
17123matched by the property.  For the entries that give the longest, most
17124descriptive version of the property, the count is followed by a list of some
17125of the code points matched by it.  The list includes all the matched
17126characters in the 0-255 range, enclosed in the familiar [brackets] the same as
17127a regular expression bracketed character class.  Following that, the next few
17128higher matching ranges are also given.  To avoid visual ambiguity, the SPACE
17129character is represented as C<\\x$space_hex>.
17130
17131For emphasis, those properties that match no code points at all are listed as
17132well in a separate section following the table.
17133
17134Most properties match the same code points regardless of whether C<"/i">
17135case-insensitive matching is specified or not.  But a few properties are
17136affected.  These are shown with the notation S<C<(/i= I<other_property>)>>
17137in the second column.  Under case-insensitive matching they match the
17138same code pode points as the property I<other_property>.
17139
17140There is no description given for most non-Perl defined properties (See
17141L<$unicode_reference_url> for that).
17142
17143For compactness, 'B<*>' is used as a wildcard instead of showing all possible
17144combinations.  For example, entries like:
17145
17146 \\p{Gc: *}                                  \\p{General_Category: *}
17147
17148mean that 'Gc' is a synonym for 'General_Category', and anything that is valid
17149for the latter is also valid for the former.  Similarly,
17150
17151 \\p{Is_*}                                   \\p{*}
17152
17153means that if and only if, for example, C<\\p{Foo}> exists, then
17154C<\\p{Is_Foo}> and C<\\p{IsFoo}> are also valid and all mean the same thing.
17155And similarly, C<\\p{Foo=Bar}> means the same as C<\\p{Is_Foo=Bar}> and
17156C<\\p{IsFoo=Bar}>.  "*" here is restricted to something not beginning with an
17157underscore.
17158
17159Also, in binary properties, 'Yes', 'T', and 'True' are all synonyms for 'Y'.
17160And 'No', 'F', and 'False' are all synonyms for 'N'.  The table shows 'Y*' and
17161'N*' to indicate this, and doesn't have separate entries for the other
17162possibilities.  Note that not all properties which have values 'Yes' and 'No'
17163are binary, and they have all their values spelled out without using this wild
17164card, and a C<NOT> clause in their description that highlights their not being
17165binary.  These also require the compound form to match them, whereas true
17166binary properties have both single and compound forms available.
17167
17168Note that all non-essential underscores are removed in the display of the
17169short names below.
17170
17171B<Legend summary:>
17172
17173=over 4
17174
17175=item Z<>B<*> is a wild-card
17176
17177=item B<(\\d+)> in the info column gives the number of Unicode code points matched
17178by this property.
17179
17180=item B<$DEPRECATED> means this is deprecated.
17181
17182=item B<$OBSOLETE> means this is obsolete.
17183
17184=item B<$STABILIZED> means this is stabilized.
17185
17186=item B<$STRICTER> means tighter (stricter) name matching applies.
17187
17188=item B<$DISCOURAGED> means use of this form is discouraged, and may not be
17189stable.
17190
17191=back
17192
17193$formatted_properties
17194
17195$zero_matches
17196
17197=head1 Properties accessible through Unicode::UCD
17198
17199The value of any Unicode (not including Perl extensions) character
17200property mentioned above for any single code point is available through
17201L<Unicode::UCD/charprop()>.  L<Unicode::UCD/charprops_all()> returns the
17202values of all the Unicode properties for a given code point.
17203
17204Besides these, all the Unicode character properties mentioned above
17205(except for those marked as for internal use by Perl) are also
17206accessible by L<Unicode::UCD/prop_invlist()>.
17207
17208Due to their nature, not all Unicode character properties are suitable for
17209regular expression matches, nor C<prop_invlist()>.  The remaining
17210non-provisional, non-internal ones are accessible via
17211L<Unicode::UCD/prop_invmap()> (except for those that this Perl installation
17212hasn't included; see L<below for which those are|/Unicode character properties
17213that are NOT accepted by Perl>).
17214
17215For compatibility with other parts of Perl, all the single forms given in the
17216table in the L<section above|/Properties accessible through \\p{} and \\P{}>
17217are recognized.  BUT, there are some ambiguities between some Perl extensions
17218and the Unicode properties, all of which are silently resolved in favor of the
17219official Unicode property.  To avoid surprises, you should only use
17220C<prop_invmap()> for forms listed in the table below, which omits the
17221non-recommended ones.  The affected forms are the Perl single form equivalents
17222of Unicode properties, such as C<\\p{sc}> being a single-form equivalent of
17223C<\\p{gc=sc}>, which is treated by C<prop_invmap()> as the C<Script> property,
17224whose short name is C<sc>.  The table indicates the current ambiguities in the
17225INFO column, beginning with the word C<"NOT">.
17226
17227The standard Unicode properties listed below are documented in
17228L<$unicode_reference_url>; Perl_Decimal_Digit is documented in
17229L<Unicode::UCD/prop_invmap()>.  The other Perl extensions are in
17230L<perlunicode/Other Properties>;
17231
17232The first column in the table is a name for the property; the second column is
17233an alternative name, if any, plus possibly some annotations.  The alternative
17234name is the property's full name, unless that would simply repeat the first
17235column, in which case the second column indicates the property's short name
17236(if different).  The annotations are given only in the entry for the full
17237name.  The annotations for binary properties include a list of the first few
17238ranges that the property matches.  To avoid any ambiguity, the SPACE character
17239is represented as C<\\x$space_hex>.
17240
17241If a property is obsolete, etc, the entry will be flagged with the same
17242characters used in the table in the L<section above|/Properties accessible
17243through \\p{} and \\P{}>, like B<$DEPRECATED> or B<$STABILIZED>.
17244
17245$ucd_pod
17246
17247=head1 Properties accessible through other means
17248
17249Certain properties are accessible also via core function calls.  These are:
17250
17251 Lowercase_Mapping          lc() and lcfirst()
17252 Titlecase_Mapping          ucfirst()
17253 Uppercase_Mapping          uc()
17254
17255Also, Case_Folding is accessible through the C</i> modifier in regular
17256expressions, the C<\\F> transliteration escape, and the C<L<fc|perlfunc/fc>>
17257operator.
17258
17259Besides being able to say C<\\p{Name=...}>, the Name and Name_Aliases
17260properties are accessible through the C<\\N{}> interpolation in double-quoted
17261strings and regular expressions; and functions C<charnames::viacode()>,
17262C<charnames::vianame()>, and C<charnames::string_vianame()> (which require a
17263C<use charnames ();> to be specified.
17264
17265Finally, most properties related to decomposition are accessible via
17266L<Unicode::Normalize>.
17267
17268=head1 Unicode character properties that are NOT accepted by Perl
17269
17270Perl will generate an error for a few character properties in Unicode when
17271used in a regular expression.  The non-Unihan ones are listed below, with the
17272reasons they are not accepted, perhaps with work-arounds.  The short names for
17273the properties are listed enclosed in (parentheses).
17274As described after the list, an installation can change the defaults and choose
17275to accept any of these.  The list is machine generated based on the
17276choices made for the installation that generated this document.
17277
17278@bad_re_properties
17279
17280An installation can choose to allow any of these to be matched by downloading
17281the Unicode database from L<http://www.unicode.org/Public/> to
17282C<\$Config{privlib}>/F<unicore/> in the Perl source tree, changing the
17283controlling lists contained in the program
17284C<\$Config{privlib}>/F<unicore/mktables> and then re-compiling and installing.
17285(C<\%Config> is available from the Config module).
17286
17287Also, perl can be recompiled to operate on an earlier version of the Unicode
17288standard.  Further information is at
17289C<\$Config{privlib}>/F<unicore/README.perl>.
17290
17291=head1 Other information in the Unicode data base
17292
17293The Unicode data base is delivered in two different formats.  The XML version
17294is valid for more modern Unicode releases.  The other version is a collection
17295of files.  The two are intended to give equivalent information.  Perl uses the
17296older form; this allows you to recompile Perl to use early Unicode releases.
17297
17298The only non-character property that Perl currently supports is Named
17299Sequences, in which a sequence of code points
17300is given a name and generally treated as a single entity.  (Perl supports
17301these via the C<\\N{...}> double-quotish construct,
17302L<charnames/charnames::string_vianame(name)>, and L<Unicode::UCD/namedseq()>.
17303
17304Below is a list of the files in the Unicode data base that Perl doesn't
17305currently use, along with very brief descriptions of their purposes.
17306Some of the names of the files have been shortened from those that Unicode
17307uses, in order to allow them to be distinguishable from similarly named files
17308on file systems for which only the first 8 characters of a name are
17309significant.
17310
17311=over 4
17312
17313@unused_files
17314
17315=back
17316
17317=head1 SEE ALSO
17318
17319L<$unicode_reference_url>
17320
17321L<perlrecharclass>
17322
17323L<perlunicode>
17324
17325END
17326
17327    # And write it.  The 0 means no utf8.
17328    main::write([ $pod_directory, "$pod_file.pod" ], 0, \@OUT);
17329    return;
17330}
17331
17332sub make_Name_pm () {
17333    # Create and write Name.pm, which contains subroutines and data to use in
17334    # conjunction with Name.pl
17335
17336    # Maybe there's nothing to do.
17337    return unless $has_hangul_syllables || @code_points_ending_in_code_point;
17338
17339    my @name = <<END;
17340$HEADER
17341$INTERNAL_ONLY_HEADER
17342END
17343
17344    # Convert these structures to output format.
17345    my $code_points_ending_in_code_point =
17346        main::simple_dumper(\@code_points_ending_in_code_point,
17347                            ' ' x 8);
17348    my $names = main::simple_dumper(\%names_ending_in_code_point,
17349                                    ' ' x 8);
17350    my $loose_names = main::simple_dumper(\%loose_names_ending_in_code_point,
17351                                    ' ' x 8);
17352
17353    # Do the same with the Hangul names,
17354    my $jamo;
17355    my $jamo_l;
17356    my $jamo_v;
17357    my $jamo_t;
17358    my $jamo_re;
17359    if ($has_hangul_syllables) {
17360
17361        # Construct a regular expression of all the possible
17362        # combinations of the Hangul syllables.
17363        my @L_re;   # Leading consonants
17364        for my $i ($LBase .. $LBase + $LCount - 1) {
17365            push @L_re, $Jamo{$i}
17366        }
17367        my @V_re;   # Middle vowels
17368        for my $i ($VBase .. $VBase + $VCount - 1) {
17369            push @V_re, $Jamo{$i}
17370        }
17371        my @T_re;   # Trailing consonants
17372        for my $i ($TBase + 1 .. $TBase + $TCount - 1) {
17373            push @T_re, $Jamo{$i}
17374        }
17375
17376        # The whole re is made up of the L V T combination.
17377        $jamo_re = '('
17378                    . join ('|', sort @L_re)
17379                    . ')('
17380                    . join ('|', sort @V_re)
17381                    . ')('
17382                    . join ('|', sort @T_re)
17383                    . ')?';
17384
17385        # These hashes needed by the algorithm were generated
17386        # during reading of the Jamo.txt file
17387        $jamo = main::simple_dumper(\%Jamo, ' ' x 8);
17388        $jamo_l = main::simple_dumper(\%Jamo_L, ' ' x 8);
17389        $jamo_v = main::simple_dumper(\%Jamo_V, ' ' x 8);
17390        $jamo_t = main::simple_dumper(\%Jamo_T, ' ' x 8);
17391    }
17392
17393    push @name, <<END;
17394
17395package charnames;
17396
17397# This module contains machine-generated tables and code for the
17398# algorithmically-determinable Unicode character names.  The following
17399# routines can be used to translate between name and code point and vice versa
17400
17401{ # Closure
17402
17403    # Matches legal code point.  4-6 hex numbers, If there are 6, the first
17404    # two must be 10; if there are 5, the first must not be a 0.  Written this
17405    # way to decrease backtracking.  The first regex allows the code point to
17406    # be at the end of a word, but to work properly, the word shouldn't end
17407    # with a valid hex character.  The second one won't match a code point at
17408    # the end of a word, and doesn't have the run-on issue
17409    my \$run_on_code_point_re = qr/$run_on_code_point_re/;
17410    my \$code_point_re = qr/$code_point_re/;
17411
17412    # In the following hash, the keys are the bases of names which include
17413    # the code point in the name, like CJK UNIFIED IDEOGRAPH-4E01.  The value
17414    # of each key is another hash which is used to get the low and high ends
17415    # for each range of code points that apply to the name.
17416    my %names_ending_in_code_point = (
17417$names
17418    );
17419
17420    # The following hash is a copy of the previous one, except is for loose
17421    # matching, so each name has blanks and dashes squeezed out
17422    my %loose_names_ending_in_code_point = (
17423$loose_names
17424    );
17425
17426    # And the following array gives the inverse mapping from code points to
17427    # names.  Lowest code points are first
17428    \@code_points_ending_in_code_point = (
17429$code_points_ending_in_code_point
17430    );
17431
17432    # Is exportable, make read-only
17433    Internals::SvREADONLY(\@code_points_ending_in_code_point, 1);
17434END
17435    # Earlier releases didn't have Jamos.  No sense outputting
17436    # them unless will be used.
17437    if ($has_hangul_syllables) {
17438        push @name, <<END;
17439
17440    # Convert from code point to Jamo short name for use in composing Hangul
17441    # syllable names
17442    my %Jamo = (
17443$jamo
17444    );
17445
17446    # Leading consonant (can be null)
17447    my %Jamo_L = (
17448$jamo_l
17449    );
17450
17451    # Vowel
17452    my %Jamo_V = (
17453$jamo_v
17454    );
17455
17456    # Optional trailing consonant
17457    my %Jamo_T = (
17458$jamo_t
17459    );
17460
17461    # Computed re that splits up a Hangul name into LVT or LV syllables
17462    my \$syllable_re = qr/$jamo_re/;
17463
17464    my \$HANGUL_SYLLABLE = "HANGUL SYLLABLE ";
17465    my \$loose_HANGUL_SYLLABLE = "HANGULSYLLABLE";
17466
17467    # These constants names and values were taken from the Unicode standard,
17468    # version 5.1, section 3.12.  They are used in conjunction with Hangul
17469    # syllables
17470    my \$SBase = $SBase_string;
17471    my \$LBase = $LBase_string;
17472    my \$VBase = $VBase_string;
17473    my \$TBase = $TBase_string;
17474    my \$SCount = $SCount;
17475    my \$LCount = $LCount;
17476    my \$VCount = $VCount;
17477    my \$TCount = $TCount;
17478    my \$NCount = \$VCount * \$TCount;
17479END
17480    } # End of has Jamos
17481
17482    push @name, << 'END';
17483
17484    sub name_to_code_point_special {
17485        my ($name, $loose) = @_;
17486
17487        # Returns undef if not one of the specially handled names; otherwise
17488        # returns the code point equivalent to the input name
17489        # $loose is non-zero if to use loose matching, 'name' in that case
17490        # must be input as upper case with all blanks and dashes squeezed out.
17491END
17492    if ($has_hangul_syllables) {
17493        push @name, << 'END';
17494
17495        if ((! $loose && $name =~ s/$HANGUL_SYLLABLE//)
17496            || ($loose && $name =~ s/$loose_HANGUL_SYLLABLE//))
17497        {
17498            return if $name !~ qr/^$syllable_re$/;
17499            my $L = $Jamo_L{$1};
17500            my $V = $Jamo_V{$2};
17501            my $T = (defined $3) ? $Jamo_T{$3} : 0;
17502            return ($L * $VCount + $V) * $TCount + $T + $SBase;
17503        }
17504END
17505    }
17506    push @name, << 'END';
17507
17508        # Name must end in 'code_point' for this to handle.
17509        return if (($loose && $name !~ /^ (.*?) ($run_on_code_point_re) $/x)
17510                   || (! $loose && $name !~ /^ (.*) ($code_point_re) $/x));
17511
17512        my $base = $1;
17513        my $code_point = CORE::hex $2;
17514        my $names_ref;
17515
17516        if ($loose) {
17517            $names_ref = \%loose_names_ending_in_code_point;
17518        }
17519        else {
17520            return if $base !~ s/-$//;
17521            $names_ref = \%names_ending_in_code_point;
17522        }
17523
17524        # Name must be one of the ones which has the code point in it.
17525        return if ! $names_ref->{$base};
17526
17527        # Look through the list of ranges that apply to this name to see if
17528        # the code point is in one of them.
17529        for (my $i = 0; $i < scalar @{$names_ref->{$base}{'low'}}; $i++) {
17530            return if $names_ref->{$base}{'low'}->[$i] > $code_point;
17531            next if $names_ref->{$base}{'high'}->[$i] < $code_point;
17532
17533            # Here, the code point is in the range.
17534            return $code_point;
17535        }
17536
17537        # Here, looked like the name had a code point number in it, but
17538        # did not match one of the valid ones.
17539        return;
17540    }
17541
17542    sub code_point_to_name_special {
17543        my $code_point = shift;
17544
17545        # Returns the name of a code point if algorithmically determinable;
17546        # undef if not
17547END
17548    if ($has_hangul_syllables) {
17549        push @name, << 'END';
17550
17551        # If in the Hangul range, calculate the name based on Unicode's
17552        # algorithm
17553        if ($code_point >= $SBase && $code_point <= $SBase + $SCount -1) {
17554            use integer;
17555            my $SIndex = $code_point - $SBase;
17556            my $L = $LBase + $SIndex / $NCount;
17557            my $V = $VBase + ($SIndex % $NCount) / $TCount;
17558            my $T = $TBase + $SIndex % $TCount;
17559            $name = "$HANGUL_SYLLABLE$Jamo{$L}$Jamo{$V}";
17560            $name .= $Jamo{$T} if $T != $TBase;
17561            return $name;
17562        }
17563END
17564    }
17565    push @name, << 'END';
17566
17567        # Look through list of these code points for one in range.
17568        foreach my $hash (@code_points_ending_in_code_point) {
17569            return if $code_point < $hash->{'low'};
17570            if ($code_point <= $hash->{'high'}) {
17571                return sprintf("%s-%04X", $hash->{'name'}, $code_point);
17572            }
17573        }
17574        return;            # None found
17575    }
17576} # End closure
17577
175781;
17579END
17580
17581    main::write("Name.pm", 0, \@name);  # The 0 means no utf8.
17582    return;
17583}
17584
17585sub make_UCD () {
17586    # Create and write UCD.pl, which passes info about the tables to
17587    # Unicode::UCD
17588
17589    # Stringify structures for output
17590    my $loose_property_name_of
17591                           = simple_dumper(\%loose_property_name_of, ' ' x 4);
17592    chomp $loose_property_name_of;
17593
17594    my $strict_property_name_of
17595                           = simple_dumper(\%strict_property_name_of, ' ' x 4);
17596    chomp $strict_property_name_of;
17597
17598    my $stricter_to_file_of = simple_dumper(\%stricter_to_file_of, ' ' x 4);
17599    chomp $stricter_to_file_of;
17600
17601    my $inline_definitions = simple_dumper(\@inline_definitions, " " x 4);
17602    chomp $inline_definitions;
17603
17604    my $loose_to_file_of = simple_dumper(\%loose_to_file_of, ' ' x 4);
17605    chomp $loose_to_file_of;
17606
17607    my $nv_floating_to_rational
17608                           = simple_dumper(\%nv_floating_to_rational, ' ' x 4);
17609    chomp $nv_floating_to_rational;
17610
17611    my $why_deprecated = simple_dumper(\%Unicode::UCD::why_deprecated, ' ' x 4);
17612    chomp $why_deprecated;
17613
17614    # We set the key to the file when we associated files with tables, but we
17615    # couldn't do the same for the value then, as we might not have the file
17616    # for the alternate table figured out at that time.
17617    foreach my $cased (keys %caseless_equivalent_to) {
17618        my @path = $caseless_equivalent_to{$cased}->file_path;
17619        my $path;
17620        if ($path[0] eq "#") {  # Pseudo-directory '#'
17621            $path = join '/', @path;
17622        }
17623        else {  # Gets rid of lib/
17624            $path = join '/', @path[1, -1];
17625        }
17626        $caseless_equivalent_to{$cased} = $path;
17627    }
17628    my $caseless_equivalent_to
17629                           = simple_dumper(\%caseless_equivalent_to, ' ' x 4);
17630    chomp $caseless_equivalent_to;
17631
17632    my $loose_property_to_file_of
17633                        = simple_dumper(\%loose_property_to_file_of, ' ' x 4);
17634    chomp $loose_property_to_file_of;
17635
17636    my $strict_property_to_file_of
17637                        = simple_dumper(\%strict_property_to_file_of, ' ' x 4);
17638    chomp $strict_property_to_file_of;
17639
17640    my $file_to_swash_name = simple_dumper(\%file_to_swash_name, ' ' x 4);
17641    chomp $file_to_swash_name;
17642
17643    # Create a mapping from each alias of Perl single-form extensions to all
17644    # its equivalent aliases, for quick look-up.
17645    my %perlprop_to_aliases;
17646    foreach my $table ($perl->tables) {
17647
17648        # First create the list of the aliases of each extension
17649        my @aliases_list;    # List of legal aliases for this extension
17650
17651        my $table_name = $table->name;
17652        my $standard_table_name = standardize($table_name);
17653        my $table_full_name = $table->full_name;
17654        my $standard_table_full_name = standardize($table_full_name);
17655
17656        # Make sure that the list has both the short and full names
17657        push @aliases_list, $table_name, $table_full_name;
17658
17659        my $found_ucd = 0;  # ? Did we actually get an alias that should be
17660                            # output for this table
17661
17662        # Go through all the aliases (including the two just added), and add
17663        # any new unique ones to the list
17664        foreach my $alias ($table->aliases) {
17665
17666            # Skip non-legal names
17667            next unless $alias->ok_as_filename;
17668            next unless $alias->ucd;
17669
17670            $found_ucd = 1;     # have at least one legal name
17671
17672            my $name = $alias->name;
17673            my $standard = standardize($name);
17674
17675            # Don't repeat a name that is equivalent to one already on the
17676            # list
17677            next if $standard eq $standard_table_name;
17678            next if $standard eq $standard_table_full_name;
17679
17680            push @aliases_list, $name;
17681        }
17682
17683        # If there were no legal names, don't output anything.
17684        next unless $found_ucd;
17685
17686        # To conserve memory in the program reading these in, omit full names
17687        # that are identical to the short name, when those are the only two
17688        # aliases for the property.
17689        if (@aliases_list == 2 && $aliases_list[0] eq $aliases_list[1]) {
17690            pop @aliases_list;
17691        }
17692
17693        # Here, @aliases_list is the list of all the aliases that this
17694        # extension legally has.  Now can create a map to it from each legal
17695        # standardized alias
17696        foreach my $alias ($table->aliases) {
17697            next unless $alias->ucd;
17698            next unless $alias->ok_as_filename;
17699            push @{$perlprop_to_aliases{standardize($alias->name)}},
17700                 uniques @aliases_list;
17701        }
17702    }
17703
17704    # Make a list of all combinations of properties/values that are suppressed.
17705    my @suppressed;
17706    if (! $debug_skip) {    # This tends to fail in this debug mode
17707        foreach my $property_name (keys %why_suppressed) {
17708
17709            # Just the value
17710            my $value_name = $1 if $property_name =~ s/ = ( .* ) //x;
17711
17712            # The hash may contain properties not in this release of Unicode
17713            next unless defined (my $property = property_ref($property_name));
17714
17715            # Find all combinations
17716            foreach my $prop_alias ($property->aliases) {
17717                my $prop_alias_name = standardize($prop_alias->name);
17718
17719                # If no =value, there's just one combination possible for this
17720                if (! $value_name) {
17721
17722                    # The property may be suppressed, but there may be a proxy
17723                    # for it, so it shouldn't be listed as suppressed
17724                    next if $prop_alias->ucd;
17725                    push @suppressed, $prop_alias_name;
17726                }
17727                else {  # Otherwise
17728                    foreach my $value_alias
17729                                    ($property->table($value_name)->aliases)
17730                    {
17731                        next if $value_alias->ucd;
17732
17733                        push @suppressed, "$prop_alias_name="
17734                                        .  standardize($value_alias->name);
17735                    }
17736                }
17737            }
17738        }
17739    }
17740    @suppressed = sort @suppressed; # So doesn't change between runs of this
17741                                    # program
17742
17743    # Convert the structure below (designed for Name.pm) to a form that UCD
17744    # wants, so it doesn't have to modify it at all; i.e. so that it includes
17745    # an element for the Hangul syllables in the appropriate place, and
17746    # otherwise changes the name to include the "-<code point>" suffix.
17747    my @algorithm_names;
17748    my $done_hangul = $v_version lt v2.0.0;  # Hanguls as we know them came
17749                                             # along in this version
17750    # Copy it linearly.
17751    for my $i (0 .. @code_points_ending_in_code_point - 1) {
17752
17753        # Insert the hanguls in the correct place.
17754        if (! $done_hangul
17755            && $code_points_ending_in_code_point[$i]->{'low'} > $SBase)
17756        {
17757            $done_hangul = 1;
17758            push @algorithm_names, { low => $SBase,
17759                                     high => $SBase + $SCount - 1,
17760                                     name => '<hangul syllable>',
17761                                    };
17762        }
17763
17764        # Copy the current entry, modified.
17765        push @algorithm_names, {
17766            low => $code_points_ending_in_code_point[$i]->{'low'},
17767            high => $code_points_ending_in_code_point[$i]->{'high'},
17768            name =>
17769               "$code_points_ending_in_code_point[$i]->{'name'}-<code point>",
17770        };
17771    }
17772
17773    # Serialize these structures for output.
17774    my $loose_to_standard_value
17775                          = simple_dumper(\%loose_to_standard_value, ' ' x 4);
17776    chomp $loose_to_standard_value;
17777
17778    my $string_property_loose_to_name
17779                    = simple_dumper(\%string_property_loose_to_name, ' ' x 4);
17780    chomp $string_property_loose_to_name;
17781
17782    my $perlprop_to_aliases = simple_dumper(\%perlprop_to_aliases, ' ' x 4);
17783    chomp $perlprop_to_aliases;
17784
17785    my $prop_aliases = simple_dumper(\%prop_aliases, ' ' x 4);
17786    chomp $prop_aliases;
17787
17788    my $prop_value_aliases = simple_dumper(\%prop_value_aliases, ' ' x 4);
17789    chomp $prop_value_aliases;
17790
17791    my $suppressed = (@suppressed) ? simple_dumper(\@suppressed, ' ' x 4) : "";
17792    chomp $suppressed;
17793
17794    my $algorithm_names = simple_dumper(\@algorithm_names, ' ' x 4);
17795    chomp $algorithm_names;
17796
17797    my $ambiguous_names = simple_dumper(\%ambiguous_names, ' ' x 4);
17798    chomp $ambiguous_names;
17799
17800    my $combination_property = simple_dumper(\%combination_property, ' ' x 4);
17801    chomp $combination_property;
17802
17803    my $loose_defaults = simple_dumper(\%loose_defaults, ' ' x 4);
17804    chomp $loose_defaults;
17805
17806    my @ucd = <<END;
17807$HEADER
17808$INTERNAL_ONLY_HEADER
17809
17810# This file is for the use of Unicode::UCD
17811
17812# Highest legal Unicode code point
17813\$Unicode::UCD::MAX_UNICODE_CODEPOINT = 0x$MAX_UNICODE_CODEPOINT_STRING;
17814
17815# Hangul syllables
17816\$Unicode::UCD::HANGUL_BEGIN = $SBase_string;
17817\$Unicode::UCD::HANGUL_COUNT = $SCount;
17818
17819# Maps Unicode (not Perl single-form extensions) property names in loose
17820# standard form to their corresponding standard names
17821\%Unicode::UCD::loose_property_name_of = (
17822$loose_property_name_of
17823);
17824
17825# Same, but strict names
17826\%Unicode::UCD::strict_property_name_of = (
17827$strict_property_name_of
17828);
17829
17830# Gives the definitions (in the form of inversion lists) for those properties
17831# whose definitions aren't kept in files
17832\@Unicode::UCD::inline_definitions = (
17833$inline_definitions
17834);
17835
17836# Maps property, table to file for those using stricter matching.  For paths
17837# whose directory is '#', the file is in the form of a numeric index into
17838# \@inline_definitions
17839\%Unicode::UCD::stricter_to_file_of = (
17840$stricter_to_file_of
17841);
17842
17843# Maps property, table to file for those using loose matching.  For paths
17844# whose directory is '#', the file is in the form of a numeric index into
17845# \@inline_definitions
17846\%Unicode::UCD::loose_to_file_of = (
17847$loose_to_file_of
17848);
17849
17850# Maps floating point to fractional form
17851\%Unicode::UCD::nv_floating_to_rational = (
17852$nv_floating_to_rational
17853);
17854
17855# If a %e floating point number doesn't have this number of digits in it after
17856# the decimal point to get this close to a fraction, it isn't considered to be
17857# that fraction even if all the digits it does have match.
17858\$Unicode::UCD::e_precision = $E_FLOAT_PRECISION;
17859
17860# Deprecated tables to generate a warning for.  The key is the file containing
17861# the table, so as to avoid duplication, as many property names can map to the
17862# file, but we only need one entry for all of them.
17863\%Unicode::UCD::why_deprecated = (
17864$why_deprecated
17865);
17866
17867# A few properties have different behavior under /i matching.  This maps
17868# those to substitute files to use under /i.
17869\%Unicode::UCD::caseless_equivalent = (
17870$caseless_equivalent_to
17871);
17872
17873# Property names to mapping files
17874\%Unicode::UCD::loose_property_to_file_of = (
17875$loose_property_to_file_of
17876);
17877
17878# Property names to mapping files
17879\%Unicode::UCD::strict_property_to_file_of = (
17880$strict_property_to_file_of
17881);
17882
17883# Files to the swash names within them.
17884\%Unicode::UCD::file_to_swash_name = (
17885$file_to_swash_name
17886);
17887
17888# Keys are all the possible "prop=value" combinations, in loose form; values
17889# are the standard loose name for the 'value' part of the key
17890\%Unicode::UCD::loose_to_standard_value = (
17891$loose_to_standard_value
17892);
17893
17894# String property loose names to standard loose name
17895\%Unicode::UCD::string_property_loose_to_name = (
17896$string_property_loose_to_name
17897);
17898
17899# Keys are Perl extensions in loose form; values are each one's list of
17900# aliases
17901\%Unicode::UCD::loose_perlprop_to_name = (
17902$perlprop_to_aliases
17903);
17904
17905# Keys are standard property name; values are each one's aliases
17906\%Unicode::UCD::prop_aliases = (
17907$prop_aliases
17908);
17909
17910# Keys of top level are standard property name; values are keys to another
17911# hash,  Each one is one of the property's values, in standard form.  The
17912# values are that prop-val's aliases.  If only one specified, the short and
17913# long alias are identical.
17914\%Unicode::UCD::prop_value_aliases = (
17915$prop_value_aliases
17916);
17917
17918# Ordered (by code point ordinal) list of the ranges of code points whose
17919# names are algorithmically determined.  Each range entry is an anonymous hash
17920# of the start and end points and a template for the names within it.
17921\@Unicode::UCD::algorithmic_named_code_points = (
17922$algorithm_names
17923);
17924
17925# The properties that as-is have two meanings, and which must be disambiguated
17926\%Unicode::UCD::ambiguous_names = (
17927$ambiguous_names
17928);
17929
17930# Keys are the prop-val combinations which are the default values for the
17931# given property, expressed in standard loose form
17932\%Unicode::UCD::loose_defaults = (
17933$loose_defaults
17934);
17935
17936# The properties that are combinations, in that they have both a map table and
17937# a match table.  This is actually for UCD.t, so it knows how to test for
17938# these.
17939\%Unicode::UCD::combination_property = (
17940$combination_property
17941);
17942
17943# All combinations of names that are suppressed.
17944# This is actually for UCD.t, so it knows which properties shouldn't have
17945# entries.  If it got any bigger, would probably want to put it in its own
17946# file to use memory only when it was needed, in testing.
17947\@Unicode::UCD::suppressed_properties = (
17948$suppressed
17949);
17950
179511;
17952END
17953
17954    main::write("UCD.pl", 0, \@ucd);  # The 0 means no utf8.
17955    return;
17956}
17957
17958sub write_all_tables() {
17959    # Write out all the tables generated by this program to files, as well as
17960    # the supporting data structures, pod file, and .t file.
17961
17962    my @writables;              # List of tables that actually get written
17963    my %match_tables_to_write;  # Used to collapse identical match tables
17964                                # into one file.  Each key is a hash function
17965                                # result to partition tables into buckets.
17966                                # Each value is an array of the tables that
17967                                # fit in the bucket.
17968
17969    # For each property ...
17970    # (sort so that if there is an immutable file name, it has precedence, so
17971    # some other property can't come in and take over its file name.  (We
17972    # don't care if both defined, as they had better be different anyway.)
17973    # The property named 'Perl' needs to be first (it doesn't have any
17974    # immutable file name) because empty properties are defined in terms of
17975    # its table named 'All' under the -annotate option.)   We also sort by
17976    # the property's name.  This is just for repeatability of the outputs
17977    # between runs of this program, but does not affect correctness.
17978    PROPERTY:
17979    foreach my $property ($perl,
17980                          sort { return -1 if defined $a->file;
17981                                 return 1 if defined $b->file;
17982                                 return $a->name cmp $b->name;
17983                                } grep { $_ != $perl } property_ref('*'))
17984    {
17985        my $type = $property->type;
17986
17987        # And for each table for that property, starting with the mapping
17988        # table for it ...
17989        TABLE:
17990        foreach my $table($property,
17991
17992                        # and all the match tables for it (if any), sorted so
17993                        # the ones with the shortest associated file name come
17994                        # first.  The length sorting prevents problems of a
17995                        # longer file taking a name that might have to be used
17996                        # by a shorter one.  The alphabetic sorting prevents
17997                        # differences between releases
17998                        sort {  my $ext_a = $a->external_name;
17999                                return 1 if ! defined $ext_a;
18000                                my $ext_b = $b->external_name;
18001                                return -1 if ! defined $ext_b;
18002
18003                                # But return the non-complement table before
18004                                # the complement one, as the latter is defined
18005                                # in terms of the former, and needs to have
18006                                # the information for the former available.
18007                                return 1 if $a->complement != 0;
18008                                return -1 if $b->complement != 0;
18009
18010                                # Similarly, return a subservient table after
18011                                # a leader
18012                                return 1 if $a->leader != $a;
18013                                return -1 if $b->leader != $b;
18014
18015                                my $cmp = length $ext_a <=> length $ext_b;
18016
18017                                # Return result if lengths not equal
18018                                return $cmp if $cmp;
18019
18020                                # Alphabetic if lengths equal
18021                                return $ext_a cmp $ext_b
18022                        } $property->tables
18023                    )
18024        {
18025
18026            # Here we have a table associated with a property.  It could be
18027            # the map table (done first for each property), or one of the
18028            # other tables.  Determine which type.
18029            my $is_property = $table->isa('Property');
18030
18031            my $name = $table->name;
18032            my $complete_name = $table->complete_name;
18033
18034            # See if should suppress the table if is empty, but warn if it
18035            # contains something.
18036            my $suppress_if_empty_warn_if_not
18037                    = $why_suppress_if_empty_warn_if_not{$complete_name} || 0;
18038
18039            # Calculate if this table should have any code points associated
18040            # with it or not.
18041            my $expected_empty =
18042
18043                # $perl should be empty
18044                ($is_property && ($table == $perl))
18045
18046                # Match tables in properties we skipped populating should be
18047                # empty
18048                || (! $is_property && ! $property->to_create_match_tables)
18049
18050                # Tables and properties that are expected to have no code
18051                # points should be empty
18052                || $suppress_if_empty_warn_if_not
18053            ;
18054
18055            # Set a boolean if this table is the complement of an empty binary
18056            # table
18057            my $is_complement_of_empty_binary =
18058                $type == $BINARY &&
18059                (($table == $property->table('Y')
18060                    && $property->table('N')->is_empty)
18061                || ($table == $property->table('N')
18062                    && $property->table('Y')->is_empty));
18063
18064            if ($table->is_empty) {
18065
18066                if ($suppress_if_empty_warn_if_not) {
18067                    $table->set_fate($SUPPRESSED,
18068                                     $suppress_if_empty_warn_if_not);
18069                }
18070
18071                # Suppress (by skipping them) expected empty tables.
18072                next TABLE if $expected_empty;
18073
18074                # And setup to later output a warning for those that aren't
18075                # known to be allowed to be empty.  Don't do the warning if
18076                # this table is a child of another one to avoid duplicating
18077                # the warning that should come from the parent one.
18078                if (($table == $property || $table->parent == $table)
18079                    && $table->fate != $SUPPRESSED
18080                    && $table->fate != $MAP_PROXIED
18081                    && ! grep { $complete_name =~ /^$_$/ }
18082                                                    @tables_that_may_be_empty)
18083                {
18084                    push @unhandled_properties, "$table";
18085                }
18086
18087                # The old way of expressing an empty match list was to
18088                # complement the list that matches everything.  The new way is
18089                # to create an empty inversion list, but this doesn't work for
18090                # annotating, so use the old way then.
18091                $table->set_complement($All) if $annotate
18092                                                && $table != $property;
18093            }
18094            elsif ($expected_empty) {
18095                my $because = "";
18096                if ($suppress_if_empty_warn_if_not) {
18097                    $because = " because $suppress_if_empty_warn_if_not";
18098                }
18099
18100                Carp::my_carp("Not expecting property $table$because.  Generating file for it anyway.");
18101            }
18102
18103            # Some tables should match everything
18104            my $expected_full =
18105                ($table->fate == $SUPPRESSED)
18106                ? 0
18107                : ($is_property)
18108                  ? # All these types of map tables will be full because
18109                    # they will have been populated with defaults
18110                    ($type == $ENUM)
18111
18112                  : # A match table should match everything if its method
18113                    # shows it should
18114                    ($table->matches_all
18115
18116                    # The complement of an empty binary table will match
18117                    # everything
18118                    || $is_complement_of_empty_binary
18119                    )
18120            ;
18121
18122            my $count = $table->count;
18123            if ($expected_full) {
18124                if ($count != $MAX_WORKING_CODEPOINTS) {
18125                    Carp::my_carp("$table matches only "
18126                    . clarify_number($count)
18127                    . " Unicode code points but should match "
18128                    . clarify_number($MAX_WORKING_CODEPOINTS)
18129                    . " (off by "
18130                    .  clarify_number(abs($MAX_WORKING_CODEPOINTS - $count))
18131                    . ").  Proceeding anyway.");
18132                }
18133
18134                # Here is expected to be full.  If it is because it is the
18135                # complement of an (empty) binary table that is to be
18136                # suppressed, then suppress this one as well.
18137                if ($is_complement_of_empty_binary) {
18138                    my $opposing_name = ($name eq 'Y') ? 'N' : 'Y';
18139                    my $opposing = $property->table($opposing_name);
18140                    my $opposing_status = $opposing->status;
18141                    if ($opposing_status) {
18142                        $table->set_status($opposing_status,
18143                                           $opposing->status_info);
18144                    }
18145                }
18146            }
18147            elsif ($count == $MAX_UNICODE_CODEPOINTS
18148                   && $name ne "Any"
18149                   && ($table == $property || $table->leader == $table)
18150                   && $table->property->status ne $NORMAL)
18151            {
18152                    Carp::my_carp("$table unexpectedly matches all Unicode code points.  Proceeding anyway.");
18153            }
18154
18155            if ($table->fate >= $SUPPRESSED) {
18156                if (! $is_property) {
18157                    my @children = $table->children;
18158                    foreach my $child (@children) {
18159                        if ($child->fate < $SUPPRESSED) {
18160                            Carp::my_carp_bug("'$table' is suppressed and has a child '$child' which isn't");
18161                        }
18162                    }
18163                }
18164                next TABLE;
18165
18166            }
18167
18168            if (! $is_property) {
18169
18170                make_ucd_table_pod_entries($table) if $table->property == $perl;
18171
18172                # Several things need to be done just once for each related
18173                # group of match tables.  Do them on the parent.
18174                if ($table->parent == $table) {
18175
18176                    # Add an entry in the pod file for the table; it also does
18177                    # the children.
18178                    make_re_pod_entries($table) if defined $pod_directory;
18179
18180                    # See if the table matches identical code points with
18181                    # something that has already been processed and is ready
18182                    # for output.  In that case, no need to have two files
18183                    # with the same code points in them.  We use the table's
18184                    # hash() method to store these in buckets, so that it is
18185                    # quite likely that if two tables are in the same bucket
18186                    # they will be identical, so don't have to compare tables
18187                    # frequently.  The tables have to have the same status to
18188                    # share a file, so add this to the bucket hash.  (The
18189                    # reason for this latter is that UCD.pm associates a
18190                    # status with a file.) We don't check tables that are
18191                    # inverses of others, as it would lead to some coding
18192                    # complications, and checking all the regular ones should
18193                    # find everything.
18194                    if ($table->complement == 0) {
18195                        my $hash = $table->hash . ';' . $table->status;
18196
18197                        # Look at each table that is in the same bucket as
18198                        # this one would be.
18199                        foreach my $comparison
18200                                            (@{$match_tables_to_write{$hash}})
18201                        {
18202                            # If the table doesn't point back to this one, we
18203                            # see if it matches identically
18204                            if (   $comparison->leader != $table
18205                                && $table->matches_identically_to($comparison))
18206                            {
18207                                $table->set_equivalent_to($comparison,
18208                                                                Related => 0);
18209                                next TABLE;
18210                            }
18211                        }
18212
18213                        # Here, not equivalent, add this table to the bucket.
18214                        push @{$match_tables_to_write{$hash}}, $table;
18215                    }
18216                }
18217            }
18218            else {
18219
18220                # Here is the property itself.
18221                # Don't write out or make references to the $perl property
18222                next if $table == $perl;
18223
18224                make_ucd_table_pod_entries($table);
18225
18226                # There is a mapping stored of the various synonyms to the
18227                # standardized name of the property for Unicode::UCD.
18228                # Also, the pod file contains entries of the form:
18229                # \p{alias: *}         \p{full: *}
18230                # rather than show every possible combination of things.
18231
18232                my @property_aliases = $property->aliases;
18233
18234                my $full_property_name = $property->full_name;
18235                my $property_name = $property->name;
18236                my $standard_property_name = standardize($property_name);
18237                my $standard_property_full_name
18238                                        = standardize($full_property_name);
18239
18240                # We also create for Unicode::UCD a list of aliases for
18241                # the property.  The list starts with the property name;
18242                # then its full name.  Legacy properties are not listed in
18243                # Unicode::UCD.
18244                my @property_list;
18245                my @standard_list;
18246                if ( $property->fate <= $MAP_PROXIED) {
18247                    @property_list = ($property_name, $full_property_name);
18248                    @standard_list = ($standard_property_name,
18249                                        $standard_property_full_name);
18250                }
18251
18252                # For each synonym ...
18253                for my $i (0 .. @property_aliases - 1)  {
18254                    my $alias = $property_aliases[$i];
18255                    my $alias_name = $alias->name;
18256                    my $alias_standard = standardize($alias_name);
18257
18258
18259                    # Add other aliases to the list of property aliases
18260                    if ($property->fate <= $MAP_PROXIED
18261                        && ! grep { $alias_standard eq $_ } @standard_list)
18262                    {
18263                        push @property_list, $alias_name;
18264                        push @standard_list, $alias_standard;
18265                    }
18266
18267                    # For Unicode::UCD, set the mapping of the alias to the
18268                    # property
18269                    if ($type == $STRING) {
18270                        if ($property->fate <= $MAP_PROXIED) {
18271                            $string_property_loose_to_name{$alias_standard}
18272                                            = $standard_property_name;
18273                        }
18274                    }
18275                    else {
18276                        my $hash_ref = ($alias_standard =~ /^_/)
18277                                       ? \%strict_property_name_of
18278                                       : \%loose_property_name_of;
18279                        if (exists $hash_ref->{$alias_standard}) {
18280                            Carp::my_carp("There already is a property with the same standard name as $alias_name: $hash_ref->{$alias_standard}.  Old name is retained");
18281                        }
18282                        else {
18283                            $hash_ref->{$alias_standard}
18284                                                = $standard_property_name;
18285                        }
18286
18287                        # Now for the re pod entry for this alias.  Skip if not
18288                        # outputting a pod; skip the first one, which is the
18289                        # full name so won't have an entry like: '\p{full: *}
18290                        # \p{full: *}', and skip if don't want an entry for
18291                        # this one.
18292                        next if $i == 0
18293                                || ! defined $pod_directory
18294                                || ! $alias->make_re_pod_entry;
18295
18296                        my $rhs = "\\p{$full_property_name: *}";
18297                        if ($property != $perl && $table->perl_extension) {
18298                            $rhs .= ' (Perl extension)';
18299                        }
18300                        push @match_properties,
18301                            format_pod_line($indent_info_column,
18302                                        '\p{' . $alias->name . ': *}',
18303                                        $rhs,
18304                                        $alias->status);
18305                    }
18306                }
18307
18308                # The list of all possible names is attached to each alias, so
18309                # lookup is easy
18310                if (@property_list) {
18311                    push @{$prop_aliases{$standard_list[0]}}, @property_list;
18312                }
18313
18314                if ($property->fate <= $MAP_PROXIED) {
18315
18316                    # Similarly, we create for Unicode::UCD a list of
18317                    # property-value aliases.
18318
18319                    # Look at each table in the property...
18320                    foreach my $table ($property->tables) {
18321                        my @values_list;
18322                        my $table_full_name = $table->full_name;
18323                        my $standard_table_full_name
18324                                              = standardize($table_full_name);
18325                        my $table_name = $table->name;
18326                        my $standard_table_name = standardize($table_name);
18327
18328                        # The list starts with the table name and its full
18329                        # name.
18330                        push @values_list, $table_name, $table_full_name;
18331
18332                        # We add to the table each unique alias that isn't
18333                        # discouraged from use.
18334                        foreach my $alias ($table->aliases) {
18335                            next if $alias->status
18336                                 && $alias->status eq $DISCOURAGED;
18337                            my $name = $alias->name;
18338                            my $standard = standardize($name);
18339                            next if $standard eq $standard_table_name;
18340                            next if $standard eq $standard_table_full_name;
18341                            push @values_list, $name;
18342                        }
18343
18344                        # Here @values_list is a list of all the aliases for
18345                        # the table.  That is, all the property-values given
18346                        # by this table.  By agreement with Unicode::UCD,
18347                        # if the name and full name are identical, and there
18348                        # are no other names, drop the duplicate entry to save
18349                        # memory.
18350                        if (@values_list == 2
18351                            && $values_list[0] eq $values_list[1])
18352                        {
18353                            pop @values_list
18354                        }
18355
18356                        # To save memory, unlike the similar list for property
18357                        # aliases above, only the standard forms have the list.
18358                        # This forces an extra step of converting from input
18359                        # name to standard name, but the savings are
18360                        # considerable.  (There is only marginal savings if we
18361                        # did this with the property aliases.)
18362                        push @{$prop_value_aliases{$standard_property_name}{$standard_table_name}}, @values_list;
18363                    }
18364                }
18365
18366                # Don't write out a mapping file if not desired.
18367                next if ! $property->to_output_map;
18368            }
18369
18370            # Here, we know we want to write out the table, but don't do it
18371            # yet because there may be other tables that come along and will
18372            # want to share the file, and the file's comments will change to
18373            # mention them.  So save for later.
18374            push @writables, $table;
18375
18376        } # End of looping through the property and all its tables.
18377    } # End of looping through all properties.
18378
18379    # Now have all the tables that will have files written for them.  Do it.
18380    foreach my $table (@writables) {
18381        my @directory;
18382        my $filename;
18383        my $property = $table->property;
18384        my $is_property = ($table == $property);
18385
18386        # For very short tables, instead of writing them out to actual files,
18387        # we in-line their inversion list definitions into UCD.pm.  The
18388        # definition replaces the file name, and the special pseudo-directory
18389        # '#' is used to signal this.  This significantly cuts down the number
18390        # of files written at little extra cost to the hashes in UCD.pm.
18391        # And it means, no run-time files to read to get the definitions.
18392        if (! $is_property
18393            && ! $annotate  # For annotation, we want to explicitly show
18394                            # everything, so keep in files
18395            && $table->ranges <= 3)
18396        {
18397            my @ranges = $table->ranges;
18398            my $count = @ranges;
18399            if ($count == 0) {  # 0th index reserved for 0-length lists
18400                $filename = 0;
18401            }
18402            elsif ($table->leader != $table) {
18403
18404                # Here, is a table that is equivalent to another; code
18405                # in register_file_for_name() causes its leader's definition
18406                # to be used
18407
18408                next;
18409            }
18410            else {  # No equivalent table so far.
18411
18412                # Build up its definition range-by-range.
18413                my $definition = "";
18414                while (defined (my $range = shift @ranges)) {
18415                    my $end = $range->end;
18416                    if ($end < $MAX_WORKING_CODEPOINT) {
18417                        $count++;
18418                        $end = "\n" . ($end + 1);
18419                    }
18420                    else {  # Extends to infinity, hence no 'end'
18421                        $end = "";
18422                    }
18423                    $definition .= "\n" . $range->start . $end;
18424                }
18425                $definition = "V$count" . $definition;
18426                $filename = @inline_definitions;
18427                push @inline_definitions, $definition;
18428            }
18429            @directory = "#";
18430            register_file_for_name($table, \@directory, $filename);
18431            next;
18432        }
18433
18434        if (! $is_property) {
18435            # Match tables for the property go in lib/$subdirectory, which is
18436            # the property's name.  Don't use the standard file name for this,
18437            # as may get an unfamiliar alias
18438            @directory = ($matches_directory, ($property->match_subdir)
18439                                              ? $property->match_subdir
18440                                              : $property->external_name);
18441        }
18442        else {
18443
18444            @directory = $table->directory;
18445            $filename = $table->file;
18446        }
18447
18448        # Use specified filename if available, or default to property's
18449        # shortest name.  We need an 8.3 safe filename (which means "an 8
18450        # safe" filename, since after the dot is only 'pl', which is < 3)
18451        # The 2nd parameter is if the filename shouldn't be changed, and
18452        # it shouldn't iff there is a hard-coded name for this table.
18453        $filename = construct_filename(
18454                                $filename || $table->external_name,
18455                                ! $filename,    # mutable if no filename
18456                                \@directory);
18457
18458        register_file_for_name($table, \@directory, $filename);
18459
18460        # Only need to write one file when shared by more than one
18461        # property
18462        next if ! $is_property
18463                && ($table->leader != $table || $table->complement != 0);
18464
18465        # Construct a nice comment to add to the file
18466        $table->set_final_comment;
18467
18468        $table->write;
18469    }
18470
18471
18472    # Write out the pod file
18473    make_pod;
18474
18475    # And Name.pm, UCD.pl
18476    make_Name_pm;
18477    make_UCD;
18478
18479    make_property_test_script() if $make_test_script;
18480    make_normalization_test_script() if $make_norm_test_script;
18481    return;
18482}
18483
18484my @white_space_separators = ( # This used only for making the test script.
18485                            "",
18486                            ' ',
18487                            "\t",
18488                            '   '
18489                        );
18490
18491sub generate_separator($lhs) {
18492    # This used only for making the test script.  It generates the colon or
18493    # equal separator between the property and property value, with random
18494    # white space surrounding the separator
18495
18496    return "" if $lhs eq "";  # No separator if there's only one (the r) side
18497
18498    # Choose space before and after randomly
18499    my $spaces_before =$white_space_separators[rand(@white_space_separators)];
18500    my $spaces_after = $white_space_separators[rand(@white_space_separators)];
18501
18502    # And return the whole complex, half the time using a colon, half the
18503    # equals
18504    return $spaces_before
18505            . (rand() < 0.5) ? '=' : ':'
18506            . $spaces_after;
18507}
18508
18509sub generate_tests($lhs, $rhs, $valid_code, $invalid_code, $warning) {
18510    # This used only for making the test script.  It generates test cases that
18511    # are expected to compile successfully in perl.  Note that the LHS and
18512    # RHS are assumed to already be as randomized as the caller wants.
18513
18514    # $lhs          # The property: what's to the left of the colon
18515                    #  or equals separator
18516    # $rhs          # The property value; what's to the right
18517    # $valid_code   # A code point that's known to be in the
18518                        # table given by LHS=RHS; undef if table is
18519                        # empty
18520    # $invalid_code # A code point known to not be in the table;
18521                    # undef if the table is all code points
18522    # $warning
18523
18524    # Get the colon or equal
18525    my $separator = generate_separator($lhs);
18526
18527    # The whole 'property=value'
18528    my $name = "$lhs$separator$rhs";
18529
18530    my @output;
18531    # Create a complete set of tests, with complements.
18532    if (defined $valid_code) {
18533        push @output, <<"EOC"
18534Expect(1, $valid_code, '\\p{$name}', $warning);
18535Expect(0, $valid_code, '\\p{^$name}', $warning);
18536Expect(0, $valid_code, '\\P{$name}', $warning);
18537Expect(1, $valid_code, '\\P{^$name}', $warning);
18538EOC
18539    }
18540    if (defined $invalid_code) {
18541        push @output, <<"EOC"
18542Expect(0, $invalid_code, '\\p{$name}', $warning);
18543Expect(1, $invalid_code, '\\p{^$name}', $warning);
18544Expect(1, $invalid_code, '\\P{$name}', $warning);
18545Expect(0, $invalid_code, '\\P{^$name}', $warning);
18546EOC
18547    }
18548    return @output;
18549}
18550
18551sub generate_wildcard_tests($lhs, $rhs, $valid_code, $invalid_code, $warning) {
18552    # This used only for making the test script.  It generates wildcardl
18553    # matching test cases that are expected to compile successfully in perl.
18554
18555    # $lhs           # The property: what's to the left of the
18556                     # or equals separator
18557    # $rhs           # The property value; what's to the right
18558    # $valid_code    # A code point that's known to be in the
18559                     # table given by LHS=RHS; undef if table is
18560                     # empty
18561    # $invalid_code  # A code point known to not be in the table;
18562                     # undef if the table is all code points
18563    # $warning
18564
18565    return if $lhs eq "";
18566    return if $lhs =~ / ^ Is_ /x;   # These are not currently supported
18567
18568    # Generate a standardized pattern, with colon being the delimitter
18569    my $wildcard = "$lhs=:\\A$rhs\\z:";
18570
18571    my @output;
18572    push @output, "Expect(1, $valid_code, '\\p{$wildcard}', $warning);"
18573                                                        if defined $valid_code;
18574    push @output, "Expect(0, $invalid_code, '\\p{$wildcard}', $warning);"
18575                                                      if defined $invalid_code;
18576    return @output;
18577}
18578
18579sub generate_error($lhs, $rhs, $already_in_error=0) {
18580    # This used only for making the test script.  It generates test cases that
18581    # are expected to not only not match, but to be syntax or similar errors
18582
18583    # $lhs                # The property: what's to the left of the
18584                          # colon or equals separator
18585    # $rhs                # The property value; what's to the right
18586    # $already_in_error   # Boolean; if true it's known that the
18587                          # unmodified LHS and RHS will cause an error.
18588                          # This routine should not force another one
18589    # Get the colon or equal
18590    my $separator = generate_separator($lhs);
18591
18592    # Since this is an error only, don't bother to randomly decide whether to
18593    # put the error on the left or right side; and assume that the RHS is
18594    # loosely matched, again for convenience rather than rigor.
18595    $rhs = randomize_loose_name($rhs, 'ERROR') unless $already_in_error;
18596
18597    my $property = $lhs . $separator . $rhs;
18598
18599    return <<"EOC";
18600Error('\\p{$property}');
18601Error('\\P{$property}');
18602EOC
18603}
18604
18605# These are used only for making the test script
18606# XXX Maybe should also have a bad strict seps, which includes underscore.
18607
18608my @good_loose_seps = (
18609            " ",
18610            "-",
18611            "\t",
18612            "",
18613            "_",
18614           );
18615my @bad_loose_seps = (
18616           "/a/",
18617           ':=',
18618          );
18619
18620sub randomize_stricter_name($name) {
18621    # This used only for making the test script.  Take the input name and
18622    # return a randomized, but valid version of it under the stricter matching
18623    # rules.
18624
18625    # If the name looks like a number (integer, floating, or rational), do
18626    # some extra work
18627    if ($name =~ qr{ ^ ( -? ) (\d+ ( ( [./] ) \d+ )? ) $ }x) {
18628        my $sign = $1;
18629        my $number = $2;
18630        my $separator = $3;
18631
18632        # If there isn't a sign, part of the time add a plus
18633        # Note: Not testing having any denominator having a minus sign
18634        if (! $sign) {
18635            $sign = '+' if rand() <= .3;
18636        }
18637
18638        # And add 0 or more leading zeros.
18639        $name = $sign . ('0' x int rand(10)) . $number;
18640
18641        if (defined $separator) {
18642            my $extra_zeros = '0' x int rand(10);
18643
18644            if ($separator eq '.') {
18645
18646                # Similarly, add 0 or more trailing zeros after a decimal
18647                # point
18648                $name .= $extra_zeros;
18649            }
18650            else {
18651
18652                # Or, leading zeros before the denominator
18653                $name =~ s,/,/$extra_zeros,;
18654            }
18655        }
18656    }
18657
18658    # For legibility of the test, only change the case of whole sections at a
18659    # time.  To do this, first split into sections.  The split returns the
18660    # delimiters
18661    my @sections;
18662    for my $section (split / ( [ - + \s _ . ]+ ) /x, $name) {
18663        trace $section if main::DEBUG && $to_trace;
18664
18665        if (length $section > 1 && $section !~ /\D/) {
18666
18667            # If the section is a sequence of digits, about half the time
18668            # randomly add underscores between some of them.
18669            if (rand() > .5) {
18670
18671                # Figure out how many underscores to add.  max is 1 less than
18672                # the number of digits.  (But add 1 at the end to make sure
18673                # result isn't 0, and compensate earlier by subtracting 2
18674                # instead of 1)
18675                my $num_underscores = int rand(length($section) - 2) + 1;
18676
18677                # And add them evenly throughout, for convenience, not rigor
18678                use integer;
18679                my $spacing = (length($section) - 1)/ $num_underscores;
18680                my $temp = $section;
18681                $section = "";
18682                for my $i (1 .. $num_underscores) {
18683                    $section .= substr($temp, 0, $spacing, "") . '_';
18684                }
18685                $section .= $temp;
18686            }
18687            push @sections, $section;
18688        }
18689        else {
18690
18691            # Here not a sequence of digits.  Change the case of the section
18692            # randomly
18693            my $switch = int rand(4);
18694            if ($switch == 0) {
18695                push @sections, uc $section;
18696            }
18697            elsif ($switch == 1) {
18698                push @sections, lc $section;
18699            }
18700            elsif ($switch == 2) {
18701                push @sections, ucfirst $section;
18702            }
18703            else {
18704                push @sections, $section;
18705            }
18706        }
18707    }
18708    trace "returning", join "", @sections if main::DEBUG && $to_trace;
18709    return join "", @sections;
18710}
18711
18712sub randomize_loose_name($name, $want_error=0) {
18713    # This used only for making the test script
18714
18715    $name = randomize_stricter_name($name);
18716
18717    my @parts;
18718    push @parts, $good_loose_seps[rand(@good_loose_seps)];
18719
18720    # Preserve trailing ones for the sake of not stripping the underscore from
18721    # 'L_'
18722    for my $part (split /[-\s_]+ (?= . )/, $name) {
18723        if (@parts) {
18724            if ($want_error and rand() < 0.3) {
18725                push @parts, $bad_loose_seps[rand(@bad_loose_seps)];
18726                $want_error = 0;
18727            }
18728            else {
18729                push @parts, $good_loose_seps[rand(@good_loose_seps)];
18730            }
18731        }
18732        push @parts, $part;
18733    }
18734    my $new = join("", @parts);
18735    trace "$name => $new" if main::DEBUG && $to_trace;
18736
18737    if ($want_error) {
18738        if (rand() >= 0.5) {
18739            $new .= $bad_loose_seps[rand(@bad_loose_seps)];
18740        }
18741        else {
18742            $new = $bad_loose_seps[rand(@bad_loose_seps)] . $new;
18743        }
18744    }
18745    return $new;
18746}
18747
18748# Used to make sure don't generate duplicate test cases.
18749my %test_generated;
18750
18751sub make_property_test_script() {
18752    # This used only for making the test script
18753    # this written directly -- it's huge.
18754
18755    print "Making test script\n" if $verbosity >= $PROGRESS;
18756
18757    # This uses randomness to test different possibilities without testing all
18758    # possibilities.  To ensure repeatability, set the seed to 0.  But if
18759    # tests are added, it will perturb all later ones in the .t file
18760    srand 0;
18761
18762    $t_path = 'TestProp.pl' unless defined $t_path; # the traditional name
18763
18764    # Create a list of what the %f representation is for each rational number.
18765    # This will be used below.
18766    my @valid_base_floats = '0.0';
18767    foreach my $e_representation (keys %nv_floating_to_rational) {
18768        push @valid_base_floats,
18769                            eval $nv_floating_to_rational{$e_representation};
18770    }
18771
18772    # It doesn't matter whether the elements of this array contain single lines
18773    # or multiple lines. main::write doesn't count the lines.
18774    my @output;
18775
18776    push @output, <<'EOF_CODE';
18777Error('\p{Script=InGreek}');    # Bug #69018
18778Test_GCB("1100 $nobreak 1161");  # Bug #70940
18779Expect(0, 0x2028, '\p{Print}', ""); # Bug # 71722
18780Expect(0, 0x2029, '\p{Print}', ""); # Bug # 71722
18781Expect(1, 0xFF10, '\p{XDigit}', ""); # Bug # 71726
18782Error('\p{InKana}');    # 'Kana' is not a block so InKana shouldn't compile
18783
18784# Make sure this gets tested; it was not part of the official test suite at
18785# the time this was added.  Note that this is as it would appear in the
18786# official suite, and gets modified to check for the perl tailoring by
18787# Test_WB()
18788Test_WB("$breakable 0020 $breakable 0020 $breakable 0308 $breakable");
18789Test_LB("$nobreak 200B $nobreak 0020 $nobreak 0020 $breakable 2060 $breakable");
18790Expect(1, ord(" "), '\p{gc=:(?aa)s:}', "");     # /aa is valid
18791Expect(1, ord(" "), '\p{gc=:(?-s)s:}', "");     # /-s is valid
18792EOF_CODE
18793
18794    # Sort these so get results in same order on different runs of this
18795    # program
18796    foreach my $property (sort { $a->has_dependency <=> $b->has_dependency
18797                                    or
18798                                 lc $a->name cmp lc $b->name
18799                               } property_ref('*'))
18800    {
18801        # Non-binary properties should not match \p{};  Test all for that.
18802        if ($property->type != $BINARY && $property->type != $FORCED_BINARY) {
18803            my @property_aliases = grep { $_->status ne $INTERNAL_ALIAS }
18804                                                            $property->aliases;
18805            foreach my $property_alias ($property->aliases) {
18806                my $name = standardize($property_alias->name);
18807
18808                # But some names are ambiguous, meaning a binary property with
18809                # the same name when used in \p{}, and a different
18810                # (non-binary) property in other contexts.
18811                next if grep { $name eq $_ } keys %ambiguous_names;
18812
18813                push @output, <<"EOF_CODE";
18814Error('\\p{$name}');
18815Error('\\P{$name}');
18816EOF_CODE
18817            }
18818        }
18819        foreach my $table (sort { $a->has_dependency <=> $b->has_dependency
18820                                    or
18821                                  lc $a->name cmp lc $b->name
18822                                } $property->tables)
18823        {
18824
18825            # Find code points that match, and don't match this table.
18826            my $valid = $table->get_valid_code_point;
18827            my $invalid = $table->get_invalid_code_point;
18828            my $warning = ($table->status eq $DEPRECATED)
18829                            ? "'deprecated'"
18830                            : '""';
18831
18832            # Test each possible combination of the property's aliases with
18833            # the table's.  If this gets to be too many, could do what is done
18834            # in the set_final_comment() for Tables
18835            my @table_aliases = grep { $_->status ne $INTERNAL_ALIAS } $table->aliases;
18836            next unless @table_aliases;
18837            my @property_aliases = grep { $_->status ne $INTERNAL_ALIAS } $table->property->aliases;
18838            next unless @property_aliases;
18839
18840            # Every property can be optionally be prefixed by 'Is_', so test
18841            # that those work, by creating such a new alias for each
18842            # pre-existing one.
18843            push @property_aliases, map { Alias->new("Is_" . $_->name,
18844                                                    $_->loose_match,
18845                                                    $_->make_re_pod_entry,
18846                                                    $_->ok_as_filename,
18847                                                    $_->status,
18848                                                    $_->ucd,
18849                                                    )
18850                                         } @property_aliases;
18851            my $max = max(scalar @table_aliases, scalar @property_aliases);
18852            for my $j (0 .. $max - 1) {
18853
18854                # The current alias for property is the next one on the list,
18855                # or if beyond the end, start over.  Similarly for table
18856                my $property_name
18857                            = $property_aliases[$j % @property_aliases]->name;
18858
18859                $property_name = "" if $table->property == $perl;
18860                my $table_alias = $table_aliases[$j % @table_aliases];
18861                my $table_name = $table_alias->name;
18862                my $loose_match = $table_alias->loose_match;
18863
18864                # If the table doesn't have a file, any test for it is
18865                # already guaranteed to be in error
18866                my $already_error = ! $table->file_path;
18867
18868                # A table that begins with these could actually be a
18869                # user-defined property, so won't be compile time errors, as
18870                # the definitions of those can be deferred until runtime
18871                next if $already_error && $table_name =~ / ^ I[ns] /x;
18872
18873                # Generate error cases for this alias.
18874                push @output, generate_error($property_name,
18875                                             $table_name,
18876                                             $already_error);
18877
18878                # If the table is guaranteed to always generate an error,
18879                # quit now without generating success cases.
18880                next if $already_error;
18881
18882                # Now for the success cases.  First, wildcard matching, as it
18883                # shouldn't have any randomization.
18884                if ($table_alias->status eq $NORMAL) {
18885                    push @output, generate_wildcard_tests($property_name,
18886                                                          $table_name,
18887                                                          $valid,
18888                                                          $invalid,
18889                                                          $warning,
18890                                                         );
18891                }
18892                my $random;
18893                if ($loose_match) {
18894
18895                    # For loose matching, create an extra test case for the
18896                    # standard name.
18897                    my $standard = standardize($table_name);
18898
18899                    # $test_name should be a unique combination for each test
18900                    # case; used just to avoid duplicate tests
18901                    my $test_name = "$property_name=$standard";
18902
18903                    # Don't output duplicate test cases.
18904                    if (! exists $test_generated{$test_name}) {
18905                        $test_generated{$test_name} = 1;
18906                        push @output, generate_tests($property_name,
18907                                                     $standard,
18908                                                     $valid,
18909                                                     $invalid,
18910                                                     $warning,
18911                                                 );
18912                        if ($table_alias->status eq $NORMAL) {
18913                            push @output, generate_wildcard_tests(
18914                                                     $property_name,
18915                                                     $standard,
18916                                                     $valid,
18917                                                     $invalid,
18918                                                     $warning,
18919                                                 );
18920                        }
18921                    }
18922                    $random = randomize_loose_name($table_name)
18923                }
18924                else { # Stricter match
18925                    $random = randomize_stricter_name($table_name);
18926                }
18927
18928                # Now for the main test case for this alias.
18929                my $test_name = "$property_name=$random";
18930                if (! exists $test_generated{$test_name}) {
18931                    $test_generated{$test_name} = 1;
18932                    push @output, generate_tests($property_name,
18933                                                 $random,
18934                                                 $valid,
18935                                                 $invalid,
18936                                                 $warning,
18937                                             );
18938
18939                    if ($property->name eq 'nv') {
18940                        if ($table_name !~ qr{/}) {
18941                            push @output, generate_tests($property_name,
18942                                                sprintf("%.15e", $table_name),
18943                                                $valid,
18944                                                $invalid,
18945                                                $warning,
18946                                            );
18947                    }
18948                    else {
18949                        # If the name is a rational number, add tests for a
18950                        # non-reduced form, and for a floating point equivalent.
18951
18952                        # 60 is a number divisible by a bunch of things
18953                        my ($numerator, $denominator) = $table_name
18954                                                        =~ m! (.+) / (.+) !x;
18955                        $numerator *= 60;
18956                        $denominator *= 60;
18957                        push @output, generate_tests($property_name,
18958                                                    "$numerator/$denominator",
18959                                                    $valid,
18960                                                    $invalid,
18961                                                    $warning,
18962                                    );
18963
18964                        # Calculate the float, and the %e representation
18965                        my $float = eval $table_name;
18966                        my $e_representation = sprintf("%.*e",
18967                                                $E_FLOAT_PRECISION, $float);
18968                        # Parse that
18969                        my ($non_zeros, $zeros, $exponent_sign, $exponent)
18970                           = $e_representation
18971                               =~ / -? [1-9] \. (\d*?) (0*) e ([+-]) (\d+) /x;
18972                        my $min_e_precision;
18973                        my $min_f_precision;
18974
18975                        if ($exponent_sign eq '+' && $exponent != 0) {
18976                            Carp::my_carp_bug("Not yet equipped to handle"
18977                                            . " positive exponents");
18978                            return;
18979                        }
18980                        else {
18981                            # We're trying to find the minimum precision that
18982                            # is needed to indicate this particular rational
18983                            # for the given $E_FLOAT_PRECISION.  For %e, any
18984                            # trailing zeros, like 1.500e-02 aren't needed, so
18985                            # the correct value is how many non-trailing zeros
18986                            # there are after the decimal point.
18987                            $min_e_precision = length $non_zeros;
18988
18989                            # For %f, like .01500, we want at least
18990                            # $E_FLOAT_PRECISION digits, but any trailing
18991                            # zeros aren't needed, so we can subtract the
18992                            # length of those.  But we also need to include
18993                            # the zeros after the decimal point, but before
18994                            # the first significant digit.
18995                            $min_f_precision = $E_FLOAT_PRECISION
18996                                             + $exponent
18997                                             - length $zeros;
18998                        }
18999
19000                        # Make tests for each possible precision from 1 to
19001                        # just past the worst case.
19002                        my $upper_limit = ($min_e_precision > $min_f_precision)
19003                                           ? $min_e_precision
19004                                           : $min_f_precision;
19005
19006                        for my $i (1 .. $upper_limit + 1) {
19007                            for my $format ("e", "f") {
19008                                my $this_table
19009                                          = sprintf("%.*$format", $i, $float);
19010
19011                                # If we don't have enough precision digits,
19012                                # make a fail test; otherwise a pass test.
19013                                my $pass = ($format eq "e")
19014                                            ? $i >= $min_e_precision
19015                                            : $i >= $min_f_precision;
19016                                if ($pass) {
19017                                    push @output, generate_tests($property_name,
19018                                                                $this_table,
19019                                                                $valid,
19020                                                                $invalid,
19021                                                                $warning,
19022                                                );
19023                                }
19024                                elsif (   $format eq "e"
19025
19026                                          # Here we would fail, but in the %f
19027                                          # case, the representation at this
19028                                          # precision could actually be a
19029                                          # valid one for some other rational
19030                                       || ! grep { $this_table
19031                                                            =~ / ^ $_ 0* $ /x }
19032                                                            @valid_base_floats)
19033                                {
19034                                    push @output,
19035                                        generate_error($property_name,
19036                                                       $this_table,
19037                                                       1   # 1 => already an
19038                                                           # error
19039                                                );
19040                                }
19041                            }
19042                        }
19043                    }
19044                    }
19045                }
19046            }
19047            $table->DESTROY();
19048        }
19049        $property->DESTROY();
19050    }
19051
19052    # Make any test of the boundary (break) properties TODO if the code
19053    # doesn't match the version being compiled
19054    my $TODO_FAILING_BREAKS = ($version_of_mk_invlist_bounds ne $v_version)
19055                             ? "\nsub TODO_FAILING_BREAKS { 1 }\n"
19056                             : "\nsub TODO_FAILING_BREAKS { 0 }\n";
19057
19058    @output= map {
19059        map s/^/    /mgr,
19060        map "$_;\n",
19061        split /;\n/, $_
19062    } @output;
19063
19064    # Cause there to be 'if' statements to only execute a portion of this
19065    # long-running test each time, so that we can have a bunch of .t's running
19066    # in parallel
19067    my $chunks = 10     # Number of test files
19068               - 1      # For GCB & SB
19069               - 1      # For WB
19070               - 4;     # LB split into this many files
19071    my @output_chunked;
19072    my $chunk_count=0;
19073    my $chunk_size= int(@output / $chunks) + 1;
19074    while (@output) {
19075        $chunk_count++;
19076        my @chunk= splice @output, 0, $chunk_size;
19077        push @output_chunked,
19078            "if (!\$::TESTCHUNK or \$::TESTCHUNK == $chunk_count) {\n",
19079                @chunk,
19080            "}\n";
19081    }
19082
19083    $chunk_count++;
19084    push @output_chunked,
19085        "if (!\$::TESTCHUNK or \$::TESTCHUNK == $chunk_count) {\n",
19086            (map {"    Test_GCB('$_');\n"} @backslash_X_tests),
19087            (map {"    Test_SB('$_');\n"} @SB_tests),
19088        "}\n";
19089
19090
19091    $chunk_size= int(@LB_tests / 4) + 1;
19092    @LB_tests = map {"    Test_LB('$_');\n"} @LB_tests;
19093    while (@LB_tests) {
19094        $chunk_count++;
19095        my @chunk= splice @LB_tests, 0, $chunk_size;
19096        push @output_chunked,
19097            "if (!\$::TESTCHUNK or \$::TESTCHUNK == $chunk_count) {\n",
19098                @chunk,
19099            "}\n";
19100    }
19101
19102    $chunk_count++;
19103    push @output_chunked,
19104        "if (!\$::TESTCHUNK or \$::TESTCHUNK == $chunk_count) {\n",
19105            (map {"    Test_WB('$_');\n"} @WB_tests),
19106        "}\n";
19107
19108    &write($t_path,
19109           0,           # Not utf8;
19110           [$HEADER,
19111            $TODO_FAILING_BREAKS,
19112            <DATA>,
19113            @output_chunked,
19114            "Finished();\n",
19115           ]);
19116
19117    return;
19118}
19119
19120sub make_normalization_test_script() {
19121    print "Making normalization test script\n" if $verbosity >= $PROGRESS;
19122
19123    my $n_path = 'TestNorm.pl';
19124
19125    unshift @normalization_tests, <<'END';
19126use utf8;
19127use Test::More;
19128
19129sub ord_string {    # Convert packed ords to printable string
19130    use charnames ();
19131    return "'" . join("", map { '\N{' . charnames::viacode($_) . '}' }
19132                                                unpack "U*", shift) .  "'";
19133    #return "'" . join(" ", map { sprintf "%04X", $_ } unpack "U*", shift) .  "'";
19134}
19135
19136sub Test_N {
19137    my ($source, $nfc, $nfd, $nfkc, $nfkd) = @_;
19138    my $display_source = ord_string($source);
19139    my $display_nfc = ord_string($nfc);
19140    my $display_nfd = ord_string($nfd);
19141    my $display_nfkc = ord_string($nfkc);
19142    my $display_nfkd = ord_string($nfkd);
19143
19144    use Unicode::Normalize;
19145    #    NFC
19146    #      nfc ==  toNFC(source) ==  toNFC(nfc) ==  toNFC(nfd)
19147    #      nfkc ==  toNFC(nfkc) ==  toNFC(nfkd)
19148    #
19149    #    NFD
19150    #      nfd ==  toNFD(source) ==  toNFD(nfc) ==  toNFD(nfd)
19151    #      nfkd ==  toNFD(nfkc) ==  toNFD(nfkd)
19152    #
19153    #    NFKC
19154    #      nfkc == toNFKC(source) == toNFKC(nfc) == toNFKC(nfd) ==
19155    #      toNFKC(nfkc) == toNFKC(nfkd)
19156    #
19157    #    NFKD
19158    #      nfkd == toNFKD(source) == toNFKD(nfc) == toNFKD(nfd) ==
19159    #      toNFKD(nfkc) == toNFKD(nfkd)
19160
19161    is(NFC($source), $nfc, "NFC($display_source) eq $display_nfc");
19162    is(NFC($nfc), $nfc, "NFC($display_nfc) eq $display_nfc");
19163    is(NFC($nfd), $nfc, "NFC($display_nfd) eq $display_nfc");
19164    is(NFC($nfkc), $nfkc, "NFC($display_nfkc) eq $display_nfkc");
19165    is(NFC($nfkd), $nfkc, "NFC($display_nfkd) eq $display_nfkc");
19166
19167    is(NFD($source), $nfd, "NFD($display_source) eq $display_nfd");
19168    is(NFD($nfc), $nfd, "NFD($display_nfc) eq $display_nfd");
19169    is(NFD($nfd), $nfd, "NFD($display_nfd) eq $display_nfd");
19170    is(NFD($nfkc), $nfkd, "NFD($display_nfkc) eq $display_nfkd");
19171    is(NFD($nfkd), $nfkd, "NFD($display_nfkd) eq $display_nfkd");
19172
19173    is(NFKC($source), $nfkc, "NFKC($display_source) eq $display_nfkc");
19174    is(NFKC($nfc), $nfkc, "NFKC($display_nfc) eq $display_nfkc");
19175    is(NFKC($nfd), $nfkc, "NFKC($display_nfd) eq $display_nfkc");
19176    is(NFKC($nfkc), $nfkc, "NFKC($display_nfkc) eq $display_nfkc");
19177    is(NFKC($nfkd), $nfkc, "NFKC($display_nfkd) eq $display_nfkc");
19178
19179    is(NFKD($source), $nfkd, "NFKD($display_source) eq $display_nfkd");
19180    is(NFKD($nfc), $nfkd, "NFKD($display_nfc) eq $display_nfkd");
19181    is(NFKD($nfd), $nfkd, "NFKD($display_nfd) eq $display_nfkd");
19182    is(NFKD($nfkc), $nfkd, "NFKD($display_nfkc) eq $display_nfkd");
19183    is(NFKD($nfkd), $nfkd, "NFKD($display_nfkd) eq $display_nfkd");
19184}
19185END
19186
19187    &write($n_path,
19188           1,           # Is utf8;
19189           [
19190            @normalization_tests,
19191            'done_testing();'
19192            ]);
19193    return;
19194}
19195
19196# Skip reasons, so will be exact same text and hence the files with each
19197# reason will get grouped together in perluniprops.
19198my $Documentation = "Documentation";
19199my $Indic_Skip
19200            = "Provisional; for the analysis and processing of Indic scripts";
19201my $Validation = "Validation Tests";
19202my $Validation_Documentation = "Documentation of validation Tests";
19203
19204# This is a list of the input files and how to handle them.  The files are
19205# processed in their order in this list.  Some reordering is possible if
19206# desired, but the PropertyAliases and PropValueAliases files should be first,
19207# and the extracted before the others (as data in an extracted file can be
19208# over-ridden by the non-extracted.  Some other files depend on data derived
19209# from an earlier file, like UnicodeData requires data from Jamo, and the case
19210# changing and folding requires data from Unicode.  Mostly, it is safest to
19211# order by first version releases in (except the Jamo).
19212#
19213# The version strings allow the program to know whether to expect a file or
19214# not, but if a file exists in the directory, it will be processed, even if it
19215# is in a version earlier than expected, so you can copy files from a later
19216# release into an earlier release's directory.
19217my @input_file_objects = (
19218    Input_file->new('PropertyAliases.txt', v3.2,
19219                    Handler => \&process_PropertyAliases,
19220                    Early => [ \&substitute_PropertyAliases ],
19221                    Required_Even_in_Debug_Skip => 1,
19222                   ),
19223    Input_file->new(undef, v0,  # No file associated with this
19224                    Progress_Message => 'Finishing property setup',
19225                    Handler => \&finish_property_setup,
19226                   ),
19227    Input_file->new('PropValueAliases.txt', v3.2,
19228                     Handler => \&process_PropValueAliases,
19229                     Early => [ \&substitute_PropValueAliases ],
19230                     Has_Missings_Defaults => $NOT_IGNORED,
19231                     Required_Even_in_Debug_Skip => 1,
19232                    ),
19233    Input_file->new("${EXTRACTED}DGeneralCategory.txt", v3.1.0,
19234                    Property => 'General_Category',
19235                   ),
19236    Input_file->new("${EXTRACTED}DCombiningClass.txt", v3.1.0,
19237                    Property => 'Canonical_Combining_Class',
19238                    Has_Missings_Defaults => $NOT_IGNORED,
19239                   ),
19240    Input_file->new("${EXTRACTED}DNumType.txt", v3.1.0,
19241                    Property => 'Numeric_Type',
19242                    Has_Missings_Defaults => $NOT_IGNORED,
19243                   ),
19244    Input_file->new("${EXTRACTED}DEastAsianWidth.txt", v3.1.0,
19245                    Property => 'East_Asian_Width',
19246                    Has_Missings_Defaults => $NOT_IGNORED,
19247                   ),
19248    Input_file->new("${EXTRACTED}DLineBreak.txt", v3.1.0,
19249                    Property => 'Line_Break',
19250                    Has_Missings_Defaults => $NOT_IGNORED,
19251                   ),
19252    Input_file->new("${EXTRACTED}DBidiClass.txt", v3.1.1,
19253                    Property => 'Bidi_Class',
19254                    Has_Missings_Defaults => $NOT_IGNORED,
19255                   ),
19256    Input_file->new("${EXTRACTED}DDecompositionType.txt", v3.1.0,
19257                    Property => 'Decomposition_Type',
19258                    Has_Missings_Defaults => $NOT_IGNORED,
19259                   ),
19260    Input_file->new("${EXTRACTED}DBinaryProperties.txt", v3.1.0),
19261    Input_file->new("${EXTRACTED}DNumValues.txt", v3.1.0,
19262                    Property => 'Numeric_Value',
19263                    Each_Line_Handler => \&filter_numeric_value_line,
19264                    Has_Missings_Defaults => $NOT_IGNORED,
19265                   ),
19266    Input_file->new("${EXTRACTED}DJoinGroup.txt", v3.1.0,
19267                    Property => 'Joining_Group',
19268                    Has_Missings_Defaults => $NOT_IGNORED,
19269                   ),
19270
19271    Input_file->new("${EXTRACTED}DJoinType.txt", v3.1.0,
19272                    Property => 'Joining_Type',
19273                    Has_Missings_Defaults => $NOT_IGNORED,
19274                   ),
19275    Input_file->new("${EXTRACTED}DName.txt", v10.0.0,
19276                    Skip => 'This file adds no new information not already'
19277                          . ' present in other files',
19278                    # And it's unnecessary programmer work to handle this new
19279                    # format.  Previous Derived files actually had bug fixes
19280                    # in them that were useful, but that should not be the
19281                    # case here.
19282                   ),
19283    Input_file->new('Jamo.txt', v2.0.0,
19284                    Property => 'Jamo_Short_Name',
19285                    Each_Line_Handler => \&filter_jamo_line,
19286                   ),
19287    Input_file->new('UnicodeData.txt', v1.1.5,
19288                    Pre_Handler => \&setup_UnicodeData,
19289
19290                    # We clean up this file for some early versions.
19291                    Each_Line_Handler => [ (($v_version lt v2.0.0 )
19292                                            ? \&filter_v1_ucd
19293                                            : ($v_version eq v2.1.5)
19294                                                ? \&filter_v2_1_5_ucd
19295
19296                                                # And for 5.14 Perls with 6.0,
19297                                                # have to also make changes
19298                                                : ($v_version ge v6.0.0
19299                                                   && $^V lt v5.17.0)
19300                                                    ? \&filter_v6_ucd
19301                                                    : undef),
19302
19303                                            # Early versions did not have the
19304                                            # proper Unicode_1 names for the
19305                                            # controls
19306                                            (($v_version lt v3.0.0)
19307                                            ? \&filter_early_U1_names
19308                                            : undef),
19309
19310                                            # Early versions did not correctly
19311                                            # use the later method for giving
19312                                            # decimal digit values
19313                                            (($v_version le v3.2.0)
19314                                            ? \&filter_bad_Nd_ucd
19315                                            : undef),
19316
19317                                            # And the main filter
19318                                            \&filter_UnicodeData_line,
19319                                         ],
19320                    EOF_Handler => \&EOF_UnicodeData,
19321                   ),
19322    Input_file->new('CJKXREF.TXT', v1.1.5,
19323                    Withdrawn => v2.0.0,
19324                    Skip => 'Gives the mapping of CJK code points '
19325                          . 'between Unicode and various other standards',
19326                   ),
19327    Input_file->new('ArabicShaping.txt', v2.0.0,
19328                    Each_Line_Handler =>
19329                        ($v_version lt 4.1.0)
19330                                    ? \&filter_old_style_arabic_shaping
19331                                    : undef,
19332                    # The first field after the range is a "schematic name"
19333                    # not used by Perl
19334                    Properties => [ '<ignored>', 'Joining_Type', 'Joining_Group' ],
19335                    Has_Missings_Defaults => $NOT_IGNORED,
19336                   ),
19337    Input_file->new('Blocks.txt', v2.0.0,
19338                    Property => 'Block',
19339                    Has_Missings_Defaults => $NOT_IGNORED,
19340                    Each_Line_Handler => \&filter_blocks_lines
19341                   ),
19342    Input_file->new('Index.txt', v2.0.0,
19343                    Skip => 'Alphabetical index of Unicode characters',
19344                   ),
19345    Input_file->new('NamesList.txt', v2.0.0,
19346                    Skip => 'Annotated list of characters',
19347                   ),
19348    Input_file->new('PropList.txt', v2.0.0,
19349                    Each_Line_Handler => (($v_version lt v3.1.0)
19350                                            ? \&filter_old_style_proplist
19351                                            : undef),
19352                   ),
19353    Input_file->new('Props.txt', v2.0.0,
19354                    Withdrawn => v3.0.0,
19355                    Skip => 'A subset of F<PropList.txt> (which is used instead)',
19356                   ),
19357    Input_file->new('ReadMe.txt', v2.0.0,
19358                    Skip => $Documentation,
19359                   ),
19360    Input_file->new('Unihan.txt', v2.0.0,
19361                    Withdrawn => v5.2.0,
19362                    Construction_Time_Handler => \&construct_unihan,
19363                    Pre_Handler => \&setup_unihan,
19364                    Optional => [ "",
19365                                  'Unicode_Radical_Stroke'
19366                                ],
19367                    Each_Line_Handler => \&filter_unihan_line,
19368                   ),
19369    Input_file->new('SpecialCasing.txt', v2.1.8,
19370                    Each_Line_Handler => ($v_version eq 2.1.8)
19371                                         ? \&filter_2_1_8_special_casing_line
19372                                         : \&filter_special_casing_line,
19373                    Pre_Handler => \&setup_special_casing,
19374                    Has_Missings_Defaults => $IGNORED,
19375                   ),
19376    Input_file->new(
19377                    'LineBreak.txt', v3.0.0,
19378                    Has_Missings_Defaults => $NOT_IGNORED,
19379                    Property => 'Line_Break',
19380                    # Early versions had problematic syntax
19381                    Each_Line_Handler => ($v_version ge v3.1.0)
19382                                          ? undef
19383                                          : ($v_version lt v3.0.0)
19384                                            ? \&filter_substitute_lb
19385                                            : \&filter_early_ea_lb,
19386                    # Must use long names for property values see comments at
19387                    # sub filter_substitute_lb
19388                    Early => [ "LBsubst.txt", '_Perl_LB', 'Alphabetic',
19389                               'Alphabetic', # default to this because XX ->
19390                                             # AL
19391
19392                               # Don't use _Perl_LB as a synonym for
19393                               # Line_Break in later perls, as it is tailored
19394                               # and isn't the same as Line_Break
19395                               'ONLY_EARLY' ],
19396                   ),
19397    Input_file->new('EastAsianWidth.txt', v3.0.0,
19398                    Property => 'East_Asian_Width',
19399                    Has_Missings_Defaults => $NOT_IGNORED,
19400                    # Early versions had problematic syntax
19401                    Each_Line_Handler => (($v_version lt v3.1.0)
19402                                        ? \&filter_early_ea_lb
19403                                        : undef),
19404                   ),
19405    Input_file->new('CompositionExclusions.txt', v3.0.0,
19406                    Property => 'Composition_Exclusion',
19407                   ),
19408    Input_file->new('UnicodeData.html', v3.0.0,
19409                    Withdrawn => v4.0.1,
19410                    Skip => $Documentation,
19411                   ),
19412    Input_file->new('BidiMirroring.txt', v3.0.1,
19413                    Property => 'Bidi_Mirroring_Glyph',
19414                    Has_Missings_Defaults => ($v_version lt v6.2.0)
19415                                              ? $NO_DEFAULTS
19416                                              # Is <none> which doesn't mean
19417                                              # anything to us, we will use the
19418                                              # null string
19419                                              : $IGNORED,
19420                   ),
19421    Input_file->new('NamesList.html', v3.0.0,
19422                    Skip => 'Describes the format and contents of '
19423                          . 'F<NamesList.txt>',
19424                   ),
19425    Input_file->new('UnicodeCharacterDatabase.html', v3.0.0,
19426                    Withdrawn => v5.1,
19427                    Skip => $Documentation,
19428                   ),
19429    Input_file->new('CaseFolding.txt', v3.0.1,
19430                    Pre_Handler => \&setup_case_folding,
19431                    Each_Line_Handler =>
19432                        [ ($v_version lt v3.1.0)
19433                                 ? \&filter_old_style_case_folding
19434                                 : undef,
19435                           \&filter_case_folding_line
19436                        ],
19437                    Has_Missings_Defaults => $IGNORED,
19438                   ),
19439    Input_file->new("NormTest.txt", v3.0.1,
19440                     Handler => \&process_NormalizationsTest,
19441                     Skip => ($make_norm_test_script) ? 0 : $Validation,
19442                   ),
19443    Input_file->new('DCoreProperties.txt', v3.1.0,
19444                    # 5.2 changed this file
19445                    Has_Missings_Defaults => (($v_version ge v5.2.0)
19446                                            ? $NOT_IGNORED
19447                                            : $NO_DEFAULTS),
19448                   ),
19449    Input_file->new('DProperties.html', v3.1.0,
19450                    Withdrawn => v3.2.0,
19451                    Skip => $Documentation,
19452                   ),
19453    Input_file->new('PropList.html', v3.1.0,
19454                    Withdrawn => v5.1,
19455                    Skip => $Documentation,
19456                   ),
19457    Input_file->new('Scripts.txt', v3.1.0,
19458                    Property => 'Script',
19459                    Each_Line_Handler => (($v_version le v4.0.0)
19460                                          ? \&filter_all_caps_script_names
19461                                          : undef),
19462                    Has_Missings_Defaults => $NOT_IGNORED,
19463                   ),
19464    Input_file->new('DNormalizationProps.txt', v3.1.0,
19465                    Has_Missings_Defaults => $NOT_IGNORED,
19466                    Each_Line_Handler => (($v_version lt v4.0.1)
19467                                      ? \&filter_old_style_normalization_lines
19468                                      : undef),
19469                   ),
19470    Input_file->new('DerivedProperties.html', v3.1.1,
19471                    Withdrawn => v5.1,
19472                    Skip => $Documentation,
19473                   ),
19474    Input_file->new('DAge.txt', v3.2.0,
19475                    Has_Missings_Defaults => $NOT_IGNORED,
19476                    Property => 'Age'
19477                   ),
19478    Input_file->new('HangulSyllableType.txt', v4.0,
19479                    Has_Missings_Defaults => $NOT_IGNORED,
19480                    Early => [ \&generate_hst, 'Hangul_Syllable_Type' ],
19481                    Property => 'Hangul_Syllable_Type'
19482                   ),
19483    Input_file->new('NormalizationCorrections.txt', v3.2.0,
19484                     # This documents the cumulative fixes to erroneous
19485                     # normalizations in earlier Unicode versions.  Its main
19486                     # purpose is so that someone running on an earlier
19487                     # version can use this file to override what got
19488                     # published in that earlier release.  It would be easy
19489                     # for mktables to handle this file.  But all the
19490                     # corrections in it should already be in the other files
19491                     # for the release it is.  To get it to actually mean
19492                     # something useful, someone would have to be using an
19493                     # earlier Unicode release, and copy it into the directory
19494                     # for that release and recompile.  So far there has been
19495                     # no demand to do that, so this hasn't been implemented.
19496                    Skip => 'Documentation of corrections already '
19497                          . 'incorporated into the Unicode data base',
19498                   ),
19499    Input_file->new('StandardizedVariants.html', v3.2.0,
19500                    Skip => 'Obsoleted as of Unicode 9.0, but previously '
19501                          . 'provided a visual display of the standard '
19502                          . 'variant sequences derived from '
19503                          . 'F<StandardizedVariants.txt>.',
19504                        # I don't know why the html came earlier than the
19505                        # .txt, but both are skipped anyway, so it doesn't
19506                        # matter.
19507                   ),
19508    Input_file->new('StandardizedVariants.txt', v4.0.0,
19509                    Skip => 'Certain glyph variations for character display '
19510                          . 'are standardized.  This lists the non-Unihan '
19511                          . 'ones; the Unihan ones are also not used by '
19512                          . 'Perl, and are in a separate Unicode data base '
19513                          . 'L<http://www.unicode.org/ivd>',
19514                   ),
19515    Input_file->new('UCD.html', v4.0.0,
19516                    Withdrawn => v5.2,
19517                    Skip => $Documentation,
19518                   ),
19519    Input_file->new("$AUXILIARY/WordBreakProperty.txt", v4.1.0,
19520                    Early => [ "WBsubst.txt", '_Perl_WB', 'ALetter' ],
19521                    Property => 'Word_Break',
19522                    Has_Missings_Defaults => $NOT_IGNORED,
19523                   ),
19524    Input_file->new("$AUXILIARY/GraphemeBreakProperty.txt", v4.1,
19525                    Early => [ \&generate_GCB, '_Perl_GCB' ],
19526                    Property => 'Grapheme_Cluster_Break',
19527                    Has_Missings_Defaults => $NOT_IGNORED,
19528                   ),
19529    Input_file->new("$AUXILIARY/GCBTest.txt", v4.1.0,
19530                    Handler => \&process_GCB_test,
19531                    retain_trailing_comments => 1,
19532                   ),
19533    Input_file->new("$AUXILIARY/GraphemeBreakTest.html", v4.1.0,
19534                    Skip => $Validation_Documentation,
19535                   ),
19536    Input_file->new("$AUXILIARY/SBTest.txt", v4.1.0,
19537                    Handler => \&process_SB_test,
19538                    retain_trailing_comments => 1,
19539                   ),
19540    Input_file->new("$AUXILIARY/SentenceBreakTest.html", v4.1.0,
19541                    Skip => $Validation_Documentation,
19542                   ),
19543    Input_file->new("$AUXILIARY/WBTest.txt", v4.1.0,
19544                    Handler => \&process_WB_test,
19545                    retain_trailing_comments => 1,
19546                   ),
19547    Input_file->new("$AUXILIARY/WordBreakTest.html", v4.1.0,
19548                    Skip => $Validation_Documentation,
19549                   ),
19550    Input_file->new("$AUXILIARY/SentenceBreakProperty.txt", v4.1.0,
19551                    Property => 'Sentence_Break',
19552                    Early => [ "SBsubst.txt", '_Perl_SB', 'OLetter' ],
19553                    Has_Missings_Defaults => $NOT_IGNORED,
19554                   ),
19555    Input_file->new('NamedSequences.txt', v4.1.0,
19556                    Handler => \&process_NamedSequences
19557                   ),
19558    Input_file->new('Unihan.html', v4.1.0,
19559                    Withdrawn => v5.2,
19560                    Skip => $Documentation,
19561                   ),
19562    Input_file->new('NameAliases.txt', v5.0,
19563                    Property => 'Name_Alias',
19564                    Each_Line_Handler => ($v_version le v6.0.0)
19565                                   ? \&filter_early_version_name_alias_line
19566                                   : \&filter_later_version_name_alias_line,
19567                   ),
19568        # NameAliases.txt came along in v5.0.  The above constructor handles
19569        # this.  But until 6.1, it was lacking some information needed by core
19570        # perl.  The constructor below handles that.  It is either a kludge or
19571        # clever, depending on your point of view.  The 'Withdrawn' parameter
19572        # indicates not to use it at all starting in 6.1 (so the above
19573        # constructor applies), and the 'v6.1' parameter indicates to use the
19574        # Early parameter before 6.1.  Therefore 'Early" is always used,
19575        # yielding the internal-only property '_Perl_Name_Alias', which it
19576        # gets from a NameAliases.txt from 6.1 or later stored in
19577        # N_Asubst.txt.  In combination with the above constructor,
19578        # 'Name_Alias' is publicly accessible starting with v5.0, and the
19579        # better 6.1 version is accessible to perl core in all releases.
19580    Input_file->new("NameAliases.txt", v6.1,
19581                    Withdrawn => v6.1,
19582                    Early => [ "N_Asubst.txt", '_Perl_Name_Alias', "" ],
19583                    Property => 'Name_Alias',
19584                    EOF_Handler => \&fixup_early_perl_name_alias,
19585                    Each_Line_Handler =>
19586                                       \&filter_later_version_name_alias_line,
19587                   ),
19588    Input_file->new('NamedSqProv.txt', v5.0.0,
19589                    Skip => 'Named sequences proposed for inclusion in a '
19590                          . 'later version of the Unicode Standard; if you '
19591                          . 'need them now, you can append this file to '
19592                          . 'F<NamedSequences.txt> and recompile perl',
19593                   ),
19594    Input_file->new("$AUXILIARY/LBTest.txt", v5.1.0,
19595                    Handler => \&process_LB_test,
19596                    retain_trailing_comments => 1,
19597                   ),
19598    Input_file->new("$AUXILIARY/LineBreakTest.html", v5.1.0,
19599                    Skip => $Validation_Documentation,
19600                   ),
19601    Input_file->new("BidiTest.txt", v5.2.0,
19602                    Skip => $Validation,
19603                   ),
19604    Input_file->new('UnihanIndicesDictionary.txt', v5.2.0,
19605                    Optional => "",
19606                    Each_Line_Handler => \&filter_unihan_line,
19607                   ),
19608    Input_file->new('UnihanDataDictionaryLike.txt', v5.2.0,
19609                    Optional => "",
19610                    Each_Line_Handler => \&filter_unihan_line,
19611                   ),
19612    Input_file->new('UnihanIRGSources.txt', v5.2.0,
19613                    Optional => [ "",
19614                                  'kCompatibilityVariant',
19615                                  'kIICore',
19616                                  'kIRG_GSource',
19617                                  'kIRG_HSource',
19618                                  'kIRG_JSource',
19619                                  'kIRG_KPSource',
19620                                  'kIRG_MSource',
19621                                  'kIRG_KSource',
19622                                  'kIRG_SSource',
19623                                  'kIRG_TSource',
19624                                  'kIRG_USource',
19625                                  'kIRG_UKSource',
19626                                  'kIRG_VSource',
19627                               ],
19628                    Pre_Handler => \&setup_unihan,
19629                    Each_Line_Handler => \&filter_unihan_line,
19630                   ),
19631    Input_file->new('UnihanNumericValues.txt', v5.2.0,
19632                    Optional => [ "",
19633                                  'kAccountingNumeric',
19634                                  'kOtherNumeric',
19635                                  'kPrimaryNumeric',
19636                                ],
19637                    Each_Line_Handler => \&filter_unihan_line,
19638                   ),
19639    Input_file->new('UnihanOtherMappings.txt', v5.2.0,
19640                    Optional => "",
19641                    Each_Line_Handler => \&filter_unihan_line,
19642                   ),
19643    Input_file->new('UnihanRadicalStrokeCounts.txt', v5.2.0,
19644                    Optional => [ "",
19645                                  'Unicode_Radical_Stroke'
19646                                ],
19647                    Each_Line_Handler => \&filter_unihan_line,
19648                   ),
19649    Input_file->new('UnihanReadings.txt', v5.2.0,
19650                    Optional => "",
19651                    Each_Line_Handler => \&filter_unihan_line,
19652                   ),
19653    Input_file->new('UnihanVariants.txt', v5.2.0,
19654                    Optional => "",
19655                    Each_Line_Handler => \&filter_unihan_line,
19656                   ),
19657    Input_file->new('CJKRadicals.txt', v5.2.0,
19658                    Skip => 'Maps the kRSUnicode property values to '
19659                          . 'corresponding code points',
19660                   ),
19661    Input_file->new('EmojiSources.txt', v6.0.0,
19662                    Skip => 'Maps certain Unicode code points to their '
19663                          . 'legacy Japanese cell-phone values',
19664                   ),
19665    # This file is actually not usable as-is until 6.1.0, because the property
19666    # is provisional, so its name is missing from PropertyAliases.txt until
19667    # that release, so that further work would have to be done to get it to
19668    # work properly
19669    Input_file->new('ScriptExtensions.txt', v6.0.0,
19670                    Property => 'Script_Extensions',
19671                    Early => [ sub {} ], # Doesn't do anything but ensures
19672                                         # that this isn't skipped for early
19673                                         # versions
19674                    Pre_Handler => \&setup_script_extensions,
19675                    Each_Line_Handler => \&filter_script_extensions_line,
19676                    Has_Missings_Defaults => (($v_version le v6.0.0)
19677                                            ? $NO_DEFAULTS
19678                                            : $IGNORED),
19679                   ),
19680    # These two Indic files are actually not usable as-is until 6.1.0,
19681    # because they are provisional, so their property values are missing from
19682    # PropValueAliases.txt until that release, so that further work would have
19683    # to be done to get them to work properly.
19684    Input_file->new('IndicMatraCategory.txt', v6.0.0,
19685                    Withdrawn => v8.0.0,
19686                    Property => 'Indic_Matra_Category',
19687                    Has_Missings_Defaults => $NOT_IGNORED,
19688                    Skip => $Indic_Skip,
19689                   ),
19690    Input_file->new('IndicSyllabicCategory.txt', v6.0.0,
19691                    Property => 'Indic_Syllabic_Category',
19692                    Has_Missings_Defaults => $NOT_IGNORED,
19693                    Skip => (($v_version lt v8.0.0)
19694                              ? $Indic_Skip
19695                              : 0),
19696                   ),
19697    Input_file->new('USourceData.txt', v6.2.0,
19698                    Skip => 'Documentation of status and cross reference of '
19699                          . 'proposals for encoding by Unicode of Unihan '
19700                          . 'characters',
19701                   ),
19702    Input_file->new('USourceGlyphs.pdf', v6.2.0,
19703                    Skip => 'Pictures of the characters in F<USourceData.txt>',
19704                   ),
19705    Input_file->new('BidiBrackets.txt', v6.3.0,
19706                    Properties => [ 'Bidi_Paired_Bracket',
19707                                    'Bidi_Paired_Bracket_Type'
19708                                  ],
19709                    Has_Missings_Defaults => $NO_DEFAULTS,
19710                   ),
19711    Input_file->new("BidiCharacterTest.txt", v6.3.0,
19712                    Skip => $Validation,
19713                   ),
19714    Input_file->new('IndicPositionalCategory.txt', v8.0.0,
19715                    Property => 'Indic_Positional_Category',
19716                    Has_Missings_Defaults => $NOT_IGNORED,
19717                   ),
19718    Input_file->new('TangutSources.txt', v9.0.0,
19719                    Skip => 'Specifies source mappings for Tangut ideographs'
19720                          . ' and components. This data file also includes'
19721                          . ' informative radical-stroke values that are used'
19722                          . ' internally by Unicode',
19723                   ),
19724    Input_file->new('VerticalOrientation.txt', v10.0.0,
19725                    Property => 'Vertical_Orientation',
19726                    Has_Missings_Defaults => $NOT_IGNORED,
19727                   ),
19728    Input_file->new('NushuSources.txt', v10.0.0,
19729                    Skip => 'Specifies source material for Nushu characters',
19730                   ),
19731    Input_file->new('EquivalentUnifiedIdeograph.txt', v11.0.0,
19732                    Property => 'Equivalent_Unified_Ideograph',
19733                    Has_Missings_Defaults => $NOT_IGNORED,
19734                   ),
19735    Input_file->new('EmojiData.txt', v11.0.0,
19736                    # Is in UAX #51 and not the UCD, so must be updated
19737                    # separately, and the first line edited to indicate the
19738                    # UCD release we're pretending it to be in.  The UTC says
19739                    # this is a transitional state, and in fact was moved to
19740                    # the UCD in 13.0
19741                    Withdrawn => v13.0.0,
19742                    Pre_Handler => \&setup_emojidata,
19743                    Has_Missings_Defaults => $NOT_IGNORED,
19744                    Each_Line_Handler => \&filter_emojidata_line,
19745                    UCD => 0,
19746                   ),
19747    Input_file->new("$EMOJI/emoji.txt", v13.0.0,
19748                    Has_Missings_Defaults => $NOT_IGNORED,
19749                    UCD => 0,
19750                   ),
19751    Input_file->new("$EMOJI/ReadMe.txt", v13.0.0,
19752                    Skip => $Documentation,
19753                    UCD => 0,
19754                   ),
19755    Input_file->new('IdStatus.txt', v13.0.0,
19756                    Pre_Handler => \&setup_IdStatus,
19757                    Property => 'Identifier_Status',
19758                    UCD => 0,
19759                   ),
19760    Input_file->new('IdType.txt', v13.0.0,
19761                    Pre_Handler => \&setup_IdType,
19762                    Each_Line_Handler => \&filter_IdType_line,
19763                    Property => 'Identifier_Type',
19764                    UCD => 0,
19765                   ),
19766);
19767
19768# End of all the preliminaries.
19769# Do it...
19770
19771if (@missing_early_files) {
19772    print simple_fold(join_lines(<<END
19773
19774The compilation cannot be completed because one or more required input files,
19775listed below, are missing.  This is because you are compiling Unicode version
19776$unicode_version, which predates the existence of these file(s).  To fully
19777function, perl needs the data that these files would have contained if they
19778had been in this release.  To work around this, create copies of later
19779versions of the missing files in the directory containing '$0'.  (Perl will
19780make the necessary adjustments to the data to compensate for it not being the
19781same version as is being compiled.)  The files are available from unicode.org,
19782via either ftp or http.  If using http, they will be under
19783www.unicode.org/versions/.  Below are listed the source file name of each
19784missing file, the Unicode version to copy it from, and the name to store it
19785as.  (Note that the listed source file name may not be exactly the one that
19786Unicode calls it.  If you don't find it, you can look it up in 'README.perl'
19787to get the correct name.)
19788END
19789    ));
19790    print simple_fold(join_lines("\n$_")) for @missing_early_files;
19791    exit 2;
19792}
19793
19794if ($compare_versions) {
19795    Carp::my_carp(<<END
19796Warning.  \$compare_versions is set.  Output is not suitable for production
19797END
19798    );
19799}
19800
19801# Put into %potential_files a list of all the files in the directory structure
19802# that could be inputs to this program
19803File::Find::find({
19804    wanted=>sub {
19805        return unless / \. ( txt | htm l? ) $ /xi;  # Some platforms change the
19806                                                    # name's case
19807        my $full = lc(File::Spec->rel2abs($_));
19808        $potential_files{$full} = 1;
19809        return;
19810    }
19811}, File::Spec->curdir());
19812
19813my @mktables_list_output_files;
19814my $old_start_time = 0;
19815my $old_options = "";
19816
19817if (! -e $file_list) {
19818    print "'$file_list' doesn't exist, so forcing rebuild.\n" if $verbosity >= $VERBOSE;
19819    $write_unchanged_files = 1;
19820} elsif ($write_unchanged_files) {
19821    print "Not checking file list '$file_list'.\n" if $verbosity >= $VERBOSE;
19822}
19823else {
19824    print "Reading file list '$file_list'\n" if $verbosity >= $VERBOSE;
19825    my $file_handle;
19826    if (! open $file_handle, "<", $file_list) {
19827        Carp::my_carp("Failed to open '$file_list'; turning on -globlist option instead: $!");
19828        $glob_list = 1;
19829    }
19830    else {
19831        my @input;
19832
19833        # Read and parse mktables.lst, placing the results from the first part
19834        # into @input, and the second part into @mktables_list_output_files
19835        for my $list ( \@input, \@mktables_list_output_files ) {
19836            while (<$file_handle>) {
19837                s/^ \s+ | \s+ $//xg;
19838                if (/^ \s* \# \s* Autogenerated\ starting\ on\ (\d+)/x) {
19839                    $old_start_time = $1;
19840                    next;
19841                }
19842                if (/^ \s* \# \s* From\ options\ (.+) /x) {
19843                    $old_options = $1;
19844                    next;
19845                }
19846                next if /^ \s* (?: \# .* )? $/x;
19847                last if /^ =+ $/x;
19848                my ( $file ) = split /\t/;
19849                push @$list, $file;
19850            }
19851            @$list = uniques(@$list);
19852            next;
19853        }
19854
19855        # Look through all the input files
19856        foreach my $input (@input) {
19857            next if $input eq 'version'; # Already have checked this.
19858
19859            # Ignore if doesn't exist.  The checking about whether we care or
19860            # not is done via the Input_file object.
19861            next if ! file_exists($input);
19862
19863            # The paths are stored with relative names, and with '/' as the
19864            # delimiter; convert to absolute on this machine
19865            my $full = lc(File::Spec->rel2abs(internal_file_to_platform($input)));
19866            $potential_files{lc $full} = 1;
19867        }
19868    }
19869
19870    close $file_handle;
19871}
19872
19873if ($glob_list) {
19874
19875    # Here wants to process all .txt files in the directory structure.
19876    # Convert them to full path names.  They are stored in the platform's
19877    # relative style
19878    my @known_files;
19879    foreach my $object (@input_file_objects) {
19880        my $file = $object->file;
19881        next unless defined $file;
19882        push @known_files, File::Spec->rel2abs($file);
19883    }
19884
19885    my @unknown_input_files;
19886    foreach my $file (keys %potential_files) {  # The keys are stored in lc
19887        next if grep { $file eq lc($_) } @known_files;
19888
19889        # Here, the file is unknown to us.  Get relative path name
19890        $file = File::Spec->abs2rel($file);
19891        push @unknown_input_files, $file;
19892
19893        # What will happen is we create a data structure for it, and add it to
19894        # the list of input files to process.  First get the subdirectories
19895        # into an array
19896        my (undef, $directories, undef) = File::Spec->splitpath($file);
19897        $directories =~ s;/$;;;     # Can have extraneous trailing '/'
19898        my @directories = File::Spec->splitdir($directories);
19899
19900        # If the file isn't extracted (meaning none of the directories is the
19901        # extracted one), just add it to the end of the list of inputs.
19902        if (! grep { $EXTRACTED_DIR eq $_ } @directories) {
19903            push @input_file_objects, Input_file->new($file, v0);
19904        }
19905        else {
19906
19907            # Here, the file is extracted.  It needs to go ahead of most other
19908            # processing.  Search for the first input file that isn't a
19909            # special required property (that is, find one whose first_release
19910            # is non-0), and isn't extracted.  Also, the Age property file is
19911            # processed before the extracted ones, just in case
19912            # $compare_versions is set.
19913            for (my $i = 0; $i < @input_file_objects; $i++) {
19914                if ($input_file_objects[$i]->first_released ne v0
19915                    && lc($input_file_objects[$i]->file) ne 'dage.txt'
19916                    && $input_file_objects[$i]->file !~ /$EXTRACTED_DIR/i)
19917                {
19918                    splice @input_file_objects, $i, 0,
19919                                                Input_file->new($file, v0);
19920                    last;
19921                }
19922            }
19923
19924        }
19925    }
19926    if (@unknown_input_files) {
19927        print STDERR simple_fold(join_lines(<<END
19928
19929The following files are unknown as to how to handle.  Assuming they are
19930typical property files.  You'll know by later error messages if it worked or
19931not:
19932END
19933        ) . " " . join(", ", @unknown_input_files) . "\n\n");
19934    }
19935} # End of looking through directory structure for more .txt files.
19936
19937# Create the list of input files from the objects we have defined, plus
19938# version
19939my @input_files = qw(version Makefile);
19940foreach my $object (@input_file_objects) {
19941    my $file = $object->file;
19942    next if ! defined $file;    # Not all objects have files
19943    next if defined $object->skip;;
19944    push @input_files,  $file;
19945}
19946
19947if ( $verbosity >= $VERBOSE ) {
19948    print "Expecting ".scalar( @input_files )." input files. ",
19949         "Checking ".scalar( @mktables_list_output_files )." output files.\n";
19950}
19951
19952# We set $most_recent to be the most recently changed input file, including
19953# this program itself (done much earlier in this file)
19954foreach my $in (@input_files) {
19955    next unless -e $in;        # Keep going even if missing a file
19956    my $mod_time = (stat $in)[9];
19957    $most_recent = $mod_time if $mod_time > $most_recent;
19958
19959    # See that the input files have distinct names, to warn someone if they
19960    # are adding a new one
19961    if ($make_list) {
19962        my ($volume, $directories, $file ) = File::Spec->splitpath($in);
19963        $directories =~ s;/$;;;     # Can have extraneous trailing '/'
19964        my @directories = File::Spec->splitdir($directories);
19965        construct_filename($file, 'mutable', \@directories);
19966    }
19967}
19968
19969# We use 'Makefile' just to see if it has changed since the last time we
19970# rebuilt.  Now discard it.
19971@input_files = grep { $_ ne 'Makefile' } @input_files;
19972
19973my $rebuild = $write_unchanged_files    # Rebuild: if unconditional rebuild
19974              || ! scalar @mktables_list_output_files  # or if no outputs known
19975              || $old_start_time < $most_recent        # or out-of-date
19976              || $old_options ne $command_line_arguments; # or with different
19977                                                          # options
19978
19979# Now we check to see if any output files are older than youngest, if
19980# they are, we need to continue on, otherwise we can presumably bail.
19981if (! $rebuild) {
19982    foreach my $out (@mktables_list_output_files) {
19983        if ( ! file_exists($out)) {
19984            print "'$out' is missing.\n" if $verbosity >= $VERBOSE;
19985            $rebuild = 1;
19986            last;
19987         }
19988        #local $to_trace = 1 if main::DEBUG;
19989        trace $most_recent, (stat $out)[9] if main::DEBUG && $to_trace;
19990        if ( (stat $out)[9] <= $most_recent ) {
19991            #trace "$out:  most recent mod time: ", (stat $out)[9], ", youngest: $most_recent\n" if main::DEBUG && $to_trace;
19992            print "'$out' is too old.\n" if $verbosity >= $VERBOSE;
19993            $rebuild = 1;
19994            last;
19995        }
19996    }
19997}
19998if (! $rebuild) {
19999    print "$0: Files seem to be ok, not bothering to rebuild.  Add '-w' option to force build\n";
20000    exit(0);
20001}
20002print "$0: Must rebuild tables.\n" if $verbosity >= $VERBOSE;
20003
20004# Ready to do the major processing.  First create the perl pseudo-property.
20005$perl = Property->new('perl', Type => $NON_STRING, Perl_Extension => 1);
20006
20007# Process each input file
20008foreach my $file (@input_file_objects) {
20009    $file->run;
20010}
20011
20012# Finish the table generation.
20013
20014print "Finishing processing Unicode properties\n" if $verbosity >= $PROGRESS;
20015finish_Unicode();
20016
20017# For the very specialized case of comparing two Unicode versions...
20018if (DEBUG && $compare_versions) {
20019    handle_compare_versions();
20020}
20021
20022print "Compiling Perl properties\n" if $verbosity >= $PROGRESS;
20023compile_perl();
20024
20025print "Creating Perl synonyms\n" if $verbosity >= $PROGRESS;
20026add_perl_synonyms();
20027
20028print "Writing tables\n" if $verbosity >= $PROGRESS;
20029write_all_tables();
20030
20031# Write mktables.lst
20032if ( $file_list and $make_list ) {
20033
20034    print "Updating '$file_list'\n" if $verbosity >= $PROGRESS;
20035    foreach my $file (@input_files, @files_actually_output) {
20036        my (undef, $directories, $basefile) = File::Spec->splitpath($file);
20037        my @directories = grep length, File::Spec->splitdir($directories);
20038        $file = join '/', @directories, $basefile;
20039    }
20040
20041    my $ofh;
20042    if (! open $ofh,">",$file_list) {
20043        Carp::my_carp("Can't write to '$file_list'.  Skipping: $!");
20044        return
20045    }
20046    else {
20047        my $localtime = localtime $start_time;
20048        print $ofh <<"END";
20049#
20050# $file_list -- File list for $0.
20051#
20052#   Autogenerated starting on $start_time ($localtime)
20053#   From options $command_line_arguments
20054#
20055# - First section is input files
20056#   ($0 itself is not listed but is automatically considered an input)
20057# - Section separator is /^=+\$/
20058# - Second section is a list of output files.
20059# - Lines matching /^\\s*#/ are treated as comments
20060#   which along with blank lines are ignored.
20061#
20062
20063# Input files:
20064
20065END
20066        print $ofh "$_\n" for sort(@input_files);
20067        print $ofh "\n=================================\n# Output files:\n\n";
20068        print $ofh "$_\n" for sort @files_actually_output;
20069        print $ofh "\n# ",scalar(@input_files)," input files\n",
20070                "# ",scalar(@files_actually_output)+1," output files\n\n",
20071                "# End list\n";
20072        close $ofh
20073            or Carp::my_carp("Failed to close $ofh: $!");
20074
20075        print "Filelist has ",scalar(@input_files)," input files and ",
20076            scalar(@files_actually_output)+1," output files\n"
20077            if $verbosity >= $VERBOSE;
20078    }
20079}
20080
20081# Output these warnings unless -q explicitly specified.
20082if ($verbosity >= $NORMAL_VERBOSITY && ! $debug_skip) {
20083    if (@unhandled_properties) {
20084        print "\nProperties and tables that unexpectedly have no code points\n";
20085        foreach my $property (sort @unhandled_properties) {
20086            print $property, "\n";
20087        }
20088    }
20089
20090    if (%potential_files) {
20091        print "\nInput files that are not considered:\n";
20092        foreach my $file (sort keys %potential_files) {
20093            print File::Spec->abs2rel($file), "\n";
20094        }
20095    }
20096    print "\nAll done\n" if $verbosity >= $VERBOSE;
20097}
20098
20099if ($version_of_mk_invlist_bounds lt $v_version) {
20100    Carp::my_carp("WARNING: \\b{} algorithms (regen/mk_invlist.pl) need"
20101                . " to be checked and possibly updated to Unicode"
20102                . " $string_version.  Failing tests will be marked TODO");
20103}
20104
20105exit(0);
20106
20107# TRAILING CODE IS USED BY make_property_test_script()
20108__DATA__
20109
20110use strict;
20111use warnings;
20112
20113use feature 'signatures';
20114
20115no warnings 'experimental::signatures';
20116no warnings 'experimental::uniprop_wildcards';
20117
20118# Test qr/\X/ and the \p{} regular expression constructs.  This file is
20119# constructed by mktables from the tables it generates, so if mktables is
20120# buggy, this won't necessarily catch those bugs.  Tests are generated for all
20121# feasible properties; a few aren't currently feasible; see
20122# is_code_point_usable() in mktables for details.
20123
20124# Standard test packages are not used because this manipulates SIG_WARN.  It
20125# exits 0 if every non-skipped test succeeded; -1 if any failed.
20126
20127my $Tests = 0;
20128my $Fails = 0;
20129
20130# loc_tools.pl requires this function to be defined
20131sub ok($pass, @msg) {
20132    print "not " unless $pass;
20133    print "ok ";
20134    print ++$Tests;
20135    print " - ", join "", @msg if @msg;
20136    print "\n";
20137}
20138
20139sub Expect($expected, $ord, $regex, $warning_type='') {
20140    my $line   = (caller)[2];
20141
20142    # Convert the code point to hex form
20143    my $string = sprintf "\"\\x{%04X}\"", $ord;
20144
20145    my @tests = "";
20146
20147    # The first time through, use all warnings.  If the input should generate
20148    # a warning, add another time through with them turned off
20149    push @tests, "no warnings '$warning_type';" if $warning_type;
20150
20151    foreach my $no_warnings (@tests) {
20152
20153        # Store any warning messages instead of outputting them
20154        local $SIG{__WARN__} = $SIG{__WARN__};
20155        my $warning_message;
20156        $SIG{__WARN__} = sub { $warning_message = $_[0] };
20157
20158        $Tests++;
20159
20160        # A string eval is needed because of the 'no warnings'.
20161        # Assumes no parentheses in the regular expression
20162        my $result = eval "$no_warnings
20163                            my \$RegObj = qr($regex);
20164                            $string =~ \$RegObj ? 1 : 0";
20165        if (not defined $result) {
20166            print "not ok $Tests - couldn't compile /$regex/; line $line: $@\n";
20167            $Fails++;
20168        }
20169        elsif ($result ^ $expected) {
20170            print "not ok $Tests - expected $expected but got $result for $string =~ qr/$regex/; line $line\n";
20171            $Fails++;
20172        }
20173        elsif ($warning_message) {
20174            if (! $warning_type || ($warning_type && $no_warnings)) {
20175                print "not ok $Tests - for qr/$regex/ did not expect warning message '$warning_message'; line $line\n";
20176                $Fails++;
20177            }
20178            else {
20179                print "ok $Tests - expected and got a warning message for qr/$regex/; line $line\n";
20180            }
20181        }
20182        elsif ($warning_type && ! $no_warnings) {
20183            print "not ok $Tests - for qr/$regex/ expected a $warning_type warning message, but got none; line $line\n";
20184            $Fails++;
20185        }
20186        else {
20187            print "ok $Tests - got $result for $string =~ qr/$regex/; line $line\n";
20188        }
20189    }
20190    return;
20191}
20192
20193sub Error($regex) {
20194    $Tests++;
20195    if (eval { 'x' =~ qr/$regex/; 1 }) {
20196        $Fails++;
20197        my $line = (caller)[2];
20198        print "not ok $Tests - re compiled ok, but expected error for qr/$regex/; line $line: $@\n";
20199    }
20200    else {
20201        my $line = (caller)[2];
20202        print "ok $Tests - got and expected error for qr/$regex/; line $line\n";
20203    }
20204    return;
20205}
20206
20207# Break test files (e.g. GCBTest.txt) character that break allowed here
20208my $breakable_utf8 = my $breakable = chr(utf8::unicode_to_native(0xF7));
20209utf8::upgrade($breakable_utf8);
20210
20211# Break test files (e.g. GCBTest.txt) character that indicates can't break
20212# here
20213my $nobreak_utf8 = my $nobreak = chr(utf8::unicode_to_native(0xD7));
20214utf8::upgrade($nobreak_utf8);
20215
20216my $are_ctype_locales_available;
20217my $utf8_locale;
20218chdir 't' if -d 't';
20219eval { require "./loc_tools.pl" };
20220if (defined &locales_enabled) {
20221    $are_ctype_locales_available = locales_enabled('LC_CTYPE');
20222    if ($are_ctype_locales_available) {
20223        $utf8_locale = &find_utf8_ctype_locale;
20224    }
20225}
20226
20227# Eval'd so can run on versions earlier than the property is available in
20228my $WB_Extend_or_Format_re = eval 'qr/[\p{WB=Extend}\p{WB=Format}\p{WB=ZWJ}]/';
20229if (! defined $WB_Extend_or_Format_re) {
20230    $WB_Extend_or_Format_re = eval 'qr/[\p{WB=Extend}\p{WB=Format}]/';
20231}
20232
20233sub _test_break($template, $break_type) {
20234    # Test various break property matches.  The 2nd parameter gives the
20235    # property name.  The input is a line from auxiliary/*Test.txt for the
20236    # given property.  Each such line is a sequence of Unicode (not native)
20237    # code points given by their hex numbers, separated by the two characters
20238    # defined just before this subroutine that indicate that either there can
20239    # or cannot be a break between the adjacent code points.  All these are
20240    # tested.
20241    #
20242    # For the gcb property extra tests are made.  if there isn't a break, that
20243    # means the sequence forms an extended grapheme cluster, which means that
20244    # \X should match the whole thing.  If there is a break, \X should stop
20245    # there.  This is all converted by this routine into a match: $string =~
20246    # /(\X)/, Each \X should match the next cluster; and that is what is
20247    # checked.
20248
20249    my $line   = (caller 1)[2];   # Line number
20250    my $comment = "";
20251
20252    if ($template =~ / ( .*? ) \s* \# (.*) /x) {
20253        $template = $1;
20254        $comment = $2;
20255
20256        # Replace leading spaces with a single one.
20257        $comment =~ s/ ^ \s* / # /x;
20258    }
20259
20260    # The line contains characters above the ASCII range, but in Latin1.  It
20261    # may or may not be in utf8, and if it is, it may or may not know it.  So,
20262    # convert these characters to 8 bits.  If knows is in utf8, simply
20263    # downgrade.
20264    if (utf8::is_utf8($template)) {
20265        utf8::downgrade($template);
20266    } else {
20267
20268        # Otherwise, if it is in utf8, but doesn't know it, the next lines
20269        # convert the two problematic characters to their 8-bit equivalents.
20270        # If it isn't in utf8, they don't harm anything.
20271        use bytes;
20272        $template =~ s/$nobreak_utf8/$nobreak/g;
20273        $template =~ s/$breakable_utf8/$breakable/g;
20274    }
20275
20276    # Perl customizes wb.  So change the official tests accordingly
20277    if ($break_type eq 'wb' && $WB_Extend_or_Format_re) {
20278
20279        # Split into elements that alternate between code point and
20280        # break/no-break
20281        my @line = split / +/, $template;
20282
20283        # Look at each code point and its following one
20284        for (my $i = 1; $i <  @line - 1 - 1; $i+=2) {
20285
20286            # The customization only involves changing some breaks to
20287            # non-breaks.
20288            next if $line[$i+1] =~ /$nobreak/;
20289
20290            my $lhs = chr utf8::unicode_to_native(hex $line[$i]);
20291            my $rhs = chr utf8::unicode_to_native(hex $line[$i+2]);
20292
20293            # And it only affects adjacent space characters.
20294            next if $lhs !~ /\s/u;
20295
20296            # But, we want to make sure to test spaces followed by a Extend
20297            # or Format.
20298            next if $rhs !~ /\s|$WB_Extend_or_Format_re/;
20299
20300            # To test the customization, add some white-space before this to
20301            # create a span.  The $lhs white space may or may not be bound to
20302            # that span, and also with the $rhs.  If the $rhs is a binding
20303            # character, the $lhs is bound to it and not to the span, unless
20304            # $lhs is vertical space.  In all other cases, the $lhs is bound
20305            # to the span.  If the $rhs is white space, it is bound to the
20306            # $lhs
20307            my $bound;
20308            my $span;
20309            if ($rhs =~ /$WB_Extend_or_Format_re/) {
20310                if ($lhs =~ /\v/) {
20311                    $bound = $breakable;
20312                    $span = $nobreak;
20313                }
20314                else {
20315                    $bound = $nobreak;
20316                    $span = $breakable;
20317                }
20318            }
20319            else {
20320                $span = $nobreak;
20321                $bound = $nobreak;
20322            }
20323
20324            splice @line, $i, 0, ( '0020', $nobreak, '0020', $span);
20325            $i += 4;
20326            $line[$i+1] = $bound;
20327        }
20328        $template = join " ", @line;
20329    }
20330
20331    # The input is just the break/no-break symbols and sequences of Unicode
20332    # code points as hex digits separated by spaces for legibility. e.g.:
20333    # ÷ 0020 × 0308 ÷ 0020 ÷
20334    # Convert to native \x format
20335    $template =~ s/ \s* ( [[:xdigit:]]+ ) \s* /sprintf("\\x{%02X}", utf8::unicode_to_native(hex $1))/gex;
20336    $template =~ s/ \s* //gx;   # Probably the line above removed all spaces;
20337                                # but be sure
20338
20339    # Make a copy of the input with the symbols replaced by \b{} and \B{} as
20340    # appropriate
20341    my $break_pattern = $template =~ s/ $breakable /\\b{$break_type}/grx;
20342    $break_pattern =~ s/ $nobreak /\\B{$break_type}/gx;
20343
20344    my $display_string = $template =~ s/[$breakable$nobreak]//gr;
20345    my $string = eval "\"$display_string\"";
20346
20347    # The remaining massaging of the input is for the \X tests.  Get rid of
20348    # the leading and trailing breakables
20349    $template =~ s/^ \s* $breakable \s* //x;
20350    $template =~ s/ \s* $breakable \s* $ //x;
20351
20352    # Delete no-breaks
20353    $template =~ s/ \s* $nobreak \s* //xg;
20354
20355    # Split the input into segments that are breakable between them.
20356    my @should_display = split /\s*$breakable\s*/, $template;
20357    my @should_match = map { eval "\"$_\"" } @should_display;
20358
20359    # If a string can be represented in both non-ut8 and utf8, test both cases
20360    my $display_upgrade = "";
20361    UPGRADE:
20362    for my $to_upgrade (0 .. 1) {
20363
20364        if ($to_upgrade) {
20365
20366            # If already in utf8, would just be a repeat
20367            next UPGRADE if utf8::is_utf8($string);
20368
20369            utf8::upgrade($string);
20370            $display_upgrade = " (utf8-upgraded)";
20371        }
20372
20373        my @modifiers = qw(a aa d u i);
20374        if ($are_ctype_locales_available) {
20375            push @modifiers, "l$utf8_locale" if defined $utf8_locale;
20376
20377            # The /l modifier has C after it to indicate the locale to try
20378            push @modifiers, "lC";
20379        }
20380
20381        # Test for each of the regex modifiers.
20382        for my $modifier (@modifiers) {
20383            my $display_locale = "";
20384
20385            # For /l, set the locale to what it says to.
20386            if ($modifier =~ / ^ l (.*) /x) {
20387                my $locale = $1;
20388                $display_locale = "(locale = $locale)";
20389                POSIX::setlocale(&POSIX::LC_CTYPE, $locale);
20390                $modifier = 'l';
20391            }
20392
20393            no warnings qw(locale regexp surrogate);
20394            my $pattern = "(?$modifier:$break_pattern)";
20395
20396            # Actually do the test
20397            my $matched_text;
20398            my $matched = $string =~ qr/$pattern/;
20399            if ($matched) {
20400                $matched_text = "matched";
20401            }
20402            else {
20403                $matched_text = "failed to match";
20404                print "not ";
20405
20406                if (TODO_FAILING_BREAKS) {
20407                    $comment = " # $comment" unless $comment =~ / ^ \s* \# /x;
20408                    $comment =~ s/#/# TODO/;
20409                }
20410            }
20411            print "ok ", ++$Tests, " - \"$display_string\" $matched_text /$pattern/$display_upgrade; line $line $display_locale$comment\n";
20412
20413            # Only print the comment on the first use of this line
20414            $comment = "";
20415
20416            # Repeat with the first \B{} in the pattern.  This makes sure the
20417            # code in regexec.c:find_byclass() for \B gets executed
20418            if ($pattern =~ / ( .*? : ) .* ( \\B\{ .* ) /x) {
20419                my $B_pattern = "$1$2";
20420                $matched = $string =~ qr/$B_pattern/;
20421                print "not " unless $matched;
20422                $matched_text = ($matched) ? "matched" : "failed to match";
20423                print "ok ", ++$Tests, " - \"$display_string\" $matched_text /$B_pattern/$display_upgrade; line $line $display_locale";
20424                print " # TODO" if TODO_FAILING_BREAKS && ! $matched;
20425                print "\n";
20426            }
20427        }
20428
20429        next if $break_type ne 'gcb';
20430
20431        # Finally, do the \X match.
20432        my @matches = $string =~ /(\X)/g;
20433
20434        # Look through each matched cluster to verify that it matches what we
20435        # expect.
20436        my $min = (@matches < @should_match) ? @matches : @should_match;
20437        for my $i (0 .. $min - 1) {
20438            $Tests++;
20439            if ($matches[$i] eq $should_match[$i]) {
20440                print "ok $Tests - ";
20441                if ($i == 0) {
20442                    print "In \"$display_string\" =~ /(\\X)/g, \\X #1";
20443                } else {
20444                    print "And \\X #", $i + 1,
20445                }
20446                print " correctly matched $should_display[$i]; line $line\n";
20447            } else {
20448                $matches[$i] = join("", map { sprintf "\\x{%04X}", ord $_ }
20449                                                    split "", $matches[$i]);
20450                print "not ok $Tests -";
20451                print " # TODO" if TODO_FAILING_BREAKS;
20452                print " In \"$display_string\" =~ /(\\X)/g, \\X #",
20453                    $i + 1,
20454                    " should have matched $should_display[$i]",
20455                    " but instead matched $matches[$i]",
20456                    ".  Abandoning rest of line $line\n";
20457                next UPGRADE;
20458            }
20459        }
20460
20461        # And the number of matches should equal the number of expected matches.
20462        $Tests++;
20463        if (@matches == @should_match) {
20464            print "ok $Tests - Nothing was left over; line $line\n";
20465        } else {
20466            print "not ok $Tests - There were ", scalar @should_match, " \\X matches expected, but got ", scalar @matches, " instead; line $line";
20467            print " # TODO" if TODO_FAILING_BREAKS;
20468            print "\n";
20469        }
20470    }
20471
20472    return;
20473}
20474
20475sub Test_GCB($t) {
20476    _test_break($t, 'gcb');
20477}
20478
20479sub Test_LB($t) {
20480    _test_break($t, 'lb');
20481}
20482
20483sub Test_SB($t) {
20484    _test_break($t, 'sb');
20485}
20486
20487sub Test_WB($t) {
20488    _test_break($t, 'wb');
20489}
20490
20491sub Finished() {
20492    print "1..$Tests\n";
20493    exit($Fails ? -1 : 0);
20494}
20495
20496