1#!/usr/bin/perl -w
2
3# !!!!!!!!!!!!!!       IF YOU MODIFY THIS FILE       !!!!!!!!!!!!!!!!!!!!!!!!!
4# Any files created or read by this program should be listed in 'mktables.lst'
5# Use -makelist to regenerate it.
6
7# There was an attempt when this was first rewritten to make it 5.8
8# compatible, but that has now been abandoned, and newer constructs are used
9# as convenient.
10
11# NOTE: this script can run quite slowly in older/slower systems.
12# It can also consume a lot of memory (128 MB or more), you may need
13# to raise your process resource limits (e.g. in bash, "ulimit -a"
14# to inspect, and "ulimit -d ..." or "ulimit -m ..." to set)
15
16my $start_time;
17BEGIN { # Get the time the script started running; do it at compilation to
18        # get it as close as possible
19    $start_time= time;
20}
21
22require 5.010_001;
23use strict;
24use warnings;
25use Carp;
26use Config;
27use File::Find;
28use File::Path;
29use File::Spec;
30use Text::Tabs;
31use re "/aa";
32
33use feature 'state';
34use feature 'signatures';
35no warnings 'experimental::signatures';
36
37sub DEBUG () { 0 }  # Set to 0 for production; 1 for development
38my $debugging_build = $Config{"ccflags"} =~ /-DDEBUGGING/;
39
40sub NON_ASCII_PLATFORM { ord("A") != 65 }
41
42# When a new version of Unicode is published, unfortunately the algorithms for
43# dealing with various bounds, like \b{gcb}, \b{lb} may have to be updated
44# manually.  The changes may or may not be backward compatible with older
45# releases.  The code is in regen/mk_invlist.pl and regexec.c.  Make the
46# changes, then come back here and set the variable below to what version the
47# code is expecting.  If a newer version of Unicode is being compiled than
48# expected, a warning will be generated.  If an older version is being
49# compiled, any bounds tests that fail in the generated test file (-maketest
50# option) will be marked as TODO.
51my $version_of_mk_invlist_bounds = v14.0.0;
52
53##########################################################################
54#
55# mktables -- create the runtime Perl Unicode files (lib/unicore/.../*.pl),
56# from the Unicode database files (lib/unicore/.../*.txt),  It also generates
57# a pod file and .t files, depending on option parameters.
58#
59# The structure of this file is:
60#   First these introductory comments; then
61#   code needed for everywhere, such as debugging stuff; then
62#   code to handle input parameters; then
63#   data structures likely to be of external interest (some of which depend on
64#       the input parameters, so follows them; then
65#   more data structures and subroutine and package (class) definitions; then
66#   the small actual loop to process the input files and finish up; then
67#   a __DATA__ section, for the .t tests
68#
69# This program works on all releases of Unicode so far.  The outputs have been
70# scrutinized most intently for release 5.1.  The others have been checked for
71# somewhat more than just sanity.  It can handle all non-provisional Unicode
72# character properties in those releases.
73#
74# This program is mostly about Unicode character (or code point) properties.
75# A property describes some attribute or quality of a code point, like if it
76# is lowercase or not, its name, what version of Unicode it was first defined
77# in, or what its uppercase equivalent is.  Unicode deals with these disparate
78# possibilities by making all properties into mappings from each code point
79# into some corresponding value.  In the case of it being lowercase or not,
80# the mapping is either to 'Y' or 'N' (or various synonyms thereof).  Each
81# property maps each Unicode code point to a single value, called a "property
82# value".  (Some more recently defined properties, map a code point to a set
83# of values.)
84#
85# When using a property in a regular expression, what is desired isn't the
86# mapping of the code point to its property's value, but the reverse (or the
87# mathematical "inverse relation"): starting with the property value, "Does a
88# code point map to it?"  These are written in a "compound" form:
89# \p{property=value}, e.g., \p{category=punctuation}.  This program generates
90# files containing the lists of code points that map to each such regular
91# expression property value, one file per list
92#
93# There is also a single form shortcut that Perl adds for many of the commonly
94# used properties.  This happens for all binary properties, plus script,
95# general_category, and block properties.
96#
97# Thus the outputs of this program are files.  There are map files, mostly in
98# the 'To' directory; and there are list files for use in regular expression
99# matching, all in subdirectories of the 'lib' directory, with each
100# subdirectory being named for the property that the lists in it are for.
101# Bookkeeping, test, and documentation files are also generated.
102
103my $matches_directory = 'lib';   # Where match (\p{}) files go.
104my $map_directory = 'To';        # Where map files go.
105
106# DATA STRUCTURES
107#
108# The major data structures of this program are Property, of course, but also
109# Table.  There are two kinds of tables, very similar to each other.
110# "Match_Table" is the data structure giving the list of code points that have
111# a particular property value, mentioned above.  There is also a "Map_Table"
112# data structure which gives the property's mapping from code point to value.
113# There are two structures because the match tables need to be combined in
114# various ways, such as constructing unions, intersections, complements, etc.,
115# and the map ones don't.  And there would be problems, perhaps subtle, if
116# a map table were inadvertently operated on in some of those ways.
117# The use of separate classes with operations defined on one but not the other
118# prevents accidentally confusing the two.
119#
120# At the heart of each table's data structure is a "Range_List", which is just
121# an ordered list of "Ranges", plus ancillary information, and methods to
122# operate on them.  A Range is a compact way to store property information.
123# Each range has a starting code point, an ending code point, and a value that
124# is meant to apply to all the code points between the two end points,
125# inclusive.  For a map table, this value is the property value for those
126# code points.  Two such ranges could be written like this:
127#   0x41 .. 0x5A, 'Upper',
128#   0x61 .. 0x7A, 'Lower'
129#
130# Each range also has a type used as a convenience to classify the values.
131# Most ranges in this program will be Type 0, or normal, but there are some
132# ranges that have a non-zero type.  These are used only in map tables, and
133# are for mappings that don't fit into the normal scheme of things.  Mappings
134# that require a hash entry to communicate with utf8.c are one example;
135# another example is mappings for charnames.pm to use which indicate a name
136# that is algorithmically determinable from its code point (and the reverse).
137# These are used to significantly compact these tables, instead of listing
138# each one of the tens of thousands individually.
139#
140# In a match table, the value of a range is irrelevant (and hence the type as
141# well, which will always be 0), and arbitrarily set to the empty string.
142# Using the example above, there would be two match tables for those two
143# entries, one named Upper would contain the 0x41..0x5A range, and the other
144# named Lower would contain 0x61..0x7A.
145#
146# Actually, there are two types of range lists, "Range_Map" is the one
147# associated with map tables, and "Range_List" with match tables.
148# Again, this is so that methods can be defined on one and not the others so
149# as to prevent operating on them in incorrect ways.
150#
151# Eventually, most tables are written out to files to be read by Unicode::UCD.
152# All tables could in theory be written, but some are suppressed because there
153# is no current practical use for them.  It is easy to change which get
154# written by changing various lists that are near the top of the actual code
155# in this file.  The table data structures contain enough ancillary
156# information to allow them to be treated as separate entities for writing,
157# such as the path to each one's file.  There is a heading in each map table
158# that gives the format of its entries, and what the map is for all the code
159# points missing from it.  (This allows tables to be more compact.)
160#
161# The Property data structure contains one or more tables.  All properties
162# contain a map table (except the $perl property which is a
163# pseudo-property containing only match tables), and any properties that
164# are usable in regular expression matches also contain various matching
165# tables, one for each value the property can have.  A binary property can
166# have two values, True and False (or Y and N, which are preferred by Unicode
167# terminology).  Thus each of these properties will have a map table that
168# takes every code point and maps it to Y or N (but having ranges cuts the
169# number of entries in that table way down), and two match tables, one
170# which has a list of all the code points that map to Y, and one for all the
171# code points that map to N.  (For each binary property, a third table is also
172# generated for the pseudo Perl property.  It contains the identical code
173# points as the Y table, but can be written in regular expressions, not in the
174# compound form, but in a "single" form like \p{IsUppercase}.)  Many
175# properties are binary, but some properties have several possible values,
176# some have many, and properties like Name have a different value for every
177# named code point.  Those will not, unless the controlling lists are changed,
178# have their match tables written out.  But all the ones which can be used in
179# regular expression \p{} and \P{} constructs will.  Prior to 5.14, generally
180# a property would have either its map table or its match tables written but
181# not both.  Again, what gets written is controlled by lists which can easily
182# be changed.  Starting in 5.14, advantage was taken of this, and all the map
183# tables needed to reconstruct the Unicode db are now written out, while
184# suppressing the Unicode .txt files that contain the data.  Our tables are
185# much more compact than the .txt files, so a significant space savings was
186# achieved.  Also, tables are not written out that are trivially derivable
187# from tables that do get written.  So, there typically is no file containing
188# the code points not matched by a binary property (the table for \P{} versus
189# lowercase \p{}), since you just need to invert the True table to get the
190# False table.
191
192# Properties have a 'Type', like 'binary', or 'string', or 'enum' depending on
193# how many match tables there are and the content of the maps.  This 'Type' is
194# different than a range 'Type', so don't get confused by the two concepts
195# having the same name.
196#
197# For information about the Unicode properties, see Unicode's UAX44 document:
198
199my $unicode_reference_url = 'http://www.unicode.org/reports/tr44/';
200
201# As stated earlier, this program will work on any release of Unicode so far.
202# Most obvious problems in earlier data have NOT been corrected except when
203# necessary to make Perl or this program work reasonably, and to keep out
204# potential security issues.  For example, no folding information was given in
205# early releases, so this program substitutes lower case instead, just so that
206# a regular expression with the /i option will do something that actually
207# gives the right results in many cases.  There are also a couple other
208# corrections for version 1.1.5, commented at the point they are made.  As an
209# example of corrections that weren't made (but could be) is this statement
210# from DerivedAge.txt: "The supplementary private use code points and the
211# non-character code points were assigned in version 2.0, but not specifically
212# listed in the UCD until versions 3.0 and 3.1 respectively."  (To be precise
213# it was 3.0.1 not 3.0.0)  More information on Unicode version glitches is
214# further down in these introductory comments.
215#
216# This program works on all non-provisional properties as of the current
217# Unicode release, though the files for some are suppressed for various
218# reasons.  You can change which are output by changing lists in this program.
219#
220# The old version of mktables emphasized the term "Fuzzy" to mean Unicode's
221# loose matchings rules (from Unicode TR18):
222#
223#    The recommended names for UCD properties and property values are in
224#    PropertyAliases.txt [Prop] and PropertyValueAliases.txt
225#    [PropValue]. There are both abbreviated names and longer, more
226#    descriptive names. It is strongly recommended that both names be
227#    recognized, and that loose matching of property names be used,
228#    whereby the case distinctions, whitespace, hyphens, and underbar
229#    are ignored.
230#
231# The program still allows Fuzzy to override its determination of if loose
232# matching should be used, but it isn't currently used, as it is no longer
233# needed; the calculations it makes are good enough.
234#
235# SUMMARY OF HOW IT WORKS:
236#
237#   Process arguments
238#
239#   A list is constructed containing each input file that is to be processed
240#
241#   Each file on the list is processed in a loop, using the associated handler
242#   code for each:
243#        The PropertyAliases.txt and PropValueAliases.txt files are processed
244#            first.  These files name the properties and property values.
245#            Objects are created of all the property and property value names
246#            that the rest of the input should expect, including all synonyms.
247#        The other input files give mappings from properties to property
248#           values.  That is, they list code points and say what the mapping
249#           is under the given property.  Some files give the mappings for
250#           just one property; and some for many.  This program goes through
251#           each file and populates the properties and their map tables from
252#           them.  Some properties are listed in more than one file, and
253#           Unicode has set up a precedence as to which has priority if there
254#           is a conflict.  Thus the order of processing matters, and this
255#           program handles the conflict possibility by processing the
256#           overriding input files last, so that if necessary they replace
257#           earlier values.
258#        After this is all done, the program creates the property mappings not
259#            furnished by Unicode, but derivable from what it does give.
260#        The tables of code points that match each property value in each
261#            property that is accessible by regular expressions are created.
262#        The Perl-defined properties are created and populated.  Many of these
263#            require data determined from the earlier steps
264#        Any Perl-defined synonyms are created, and name clashes between Perl
265#            and Unicode are reconciled and warned about.
266#        All the properties are written to files
267#        Any other files are written, and final warnings issued.
268#
269# For clarity, a number of operators have been overloaded to work on tables:
270#   ~ means invert (take all characters not in the set).  The more
271#       conventional '!' is not used because of the possibility of confusing
272#       it with the actual boolean operation.
273#   + means union
274#   - means subtraction
275#   & means intersection
276# The precedence of these is the order listed.  Parentheses should be
277# copiously used.  These are not a general scheme.  The operations aren't
278# defined for a number of things, deliberately, to avoid getting into trouble.
279# Operations are done on references and affect the underlying structures, so
280# that the copy constructors for them have been overloaded to not return a new
281# clone, but the input object itself.
282#
283# The bool operator is deliberately not overloaded to avoid confusion with
284# "should it mean if the object merely exists, or also is non-empty?".
285#
286# WHY CERTAIN DESIGN DECISIONS WERE MADE
287#
288# This program needs to be able to run under miniperl.  Therefore, it uses a
289# minimum of other modules, and hence implements some things itself that could
290# be gotten from CPAN
291#
292# This program uses inputs published by the Unicode Consortium.  These can
293# change incompatibly between releases without the Perl maintainers realizing
294# it.  Therefore this program is now designed to try to flag these.  It looks
295# at the directories where the inputs are, and flags any unrecognized files.
296# It keeps track of all the properties in the files it handles, and flags any
297# that it doesn't know how to handle.  It also flags any input lines that
298# don't match the expected syntax, among other checks.
299#
300# It is also designed so if a new input file matches one of the known
301# templates, one hopefully just needs to add it to a list to have it
302# processed.
303#
304# As mentioned earlier, some properties are given in more than one file.  In
305# particular, the files in the extracted directory are supposedly just
306# reformattings of the others.  But they contain information not easily
307# derivable from the other files, including results for Unihan (which isn't
308# usually available to this program) and for unassigned code points.  They
309# also have historically had errors or been incomplete.  In an attempt to
310# create the best possible data, this program thus processes them first to
311# glean information missing from the other files; then processes those other
312# files to override any errors in the extracted ones.  Much of the design was
313# driven by this need to store things and then possibly override them.
314#
315# It tries to keep fatal errors to a minimum, to generate something usable for
316# testing purposes.  It always looks for files that could be inputs, and will
317# warn about any that it doesn't know how to handle (the -q option suppresses
318# the warning).
319#
320# Why is there more than one type of range?
321#   This simplified things.  There are some very specialized code points that
322#   have to be handled specially for output, such as Hangul syllable names.
323#   By creating a range type (done late in the development process), it
324#   allowed this to be stored with the range, and overridden by other input.
325#   Originally these were stored in another data structure, and it became a
326#   mess trying to decide if a second file that was for the same property was
327#   overriding the earlier one or not.
328#
329# Why are there two kinds of tables, match and map?
330#   (And there is a base class shared by the two as well.)  As stated above,
331#   they actually are for different things.  Development proceeded much more
332#   smoothly when I (khw) realized the distinction.  Map tables are used to
333#   give the property value for every code point (actually every code point
334#   that doesn't map to a default value).  Match tables are used for regular
335#   expression matches, and are essentially the inverse mapping.  Separating
336#   the two allows more specialized methods, and error checks so that one
337#   can't just take the intersection of two map tables, for example, as that
338#   is nonsensical.
339#
340# What about 'fate' and 'status'.  The concept of a table's fate was created
341#   late when it became clear that something more was needed.  The difference
342#   between this and 'status' is unclean, and could be improved if someone
343#   wanted to spend the effort.
344#
345# DEBUGGING
346#
347# This program is written so it will run under miniperl.  Occasionally changes
348# will cause an error where the backtrace doesn't work well under miniperl.
349# To diagnose the problem, you can instead run it under regular perl, if you
350# have one compiled.
351#
352# There is a good trace facility.  To enable it, first sub DEBUG must be set
353# to return true.  Then a line like
354#
355# local $to_trace = 1 if main::DEBUG;
356#
357# can be added to enable tracing in its lexical scope (plus dynamic) or until
358# you insert another line:
359#
360# local $to_trace = 0 if main::DEBUG;
361#
362# To actually trace, use a line like "trace $a, @b, %c, ...;
363#
364# Some of the more complex subroutines already have trace statements in them.
365# Permanent trace statements should be like:
366#
367# trace ... if main::DEBUG && $to_trace;
368#
369# main::stack_trace() will display what its name implies
370#
371# If there is just one or a few files that you're debugging, you can easily
372# cause most everything else to be skipped.  Change the line
373#
374# my $debug_skip = 0;
375#
376# to 1, and every file whose object is in @input_file_objects and doesn't have
377# a, 'non_skip => 1,' in its constructor will be skipped.  However, skipping
378# Jamo.txt or UnicodeData.txt will likely cause fatal errors.
379#
380# To compare the output tables, it may be useful to specify the -annotate
381# flag.  (As of this writing, this can't be done on a clean workspace, due to
382# requirements in Text::Tabs used in this option; so first run mktables
383# without this option.)  This option adds comment lines to each table, one for
384# each non-algorithmically named character giving, currently its code point,
385# name, and graphic representation if printable (and you have a font that
386# knows about it).  This makes it easier to see what the particular code
387# points are in each output table.  Non-named code points are annotated with a
388# description of their status, and contiguous ones with the same description
389# will be output as a range rather than individually.  Algorithmically named
390# characters are also output as ranges, except when there are just a few
391# contiguous ones.
392#
393# FUTURE ISSUES
394#
395# The program would break if Unicode were to change its names so that
396# interior white space, underscores, or dashes differences were significant
397# within property and property value names.
398#
399# It might be easier to use the xml versions of the UCD if this program ever
400# would need heavy revision, and the ability to handle old versions was not
401# required.
402#
403# There is the potential for name collisions, in that Perl has chosen names
404# that Unicode could decide it also likes.  There have been such collisions in
405# the past, with mostly Perl deciding to adopt the Unicode definition of the
406# name.  However in the 5.2 Unicode beta testing, there were a number of such
407# collisions, which were withdrawn before the final release, because of Perl's
408# and other's protests.  These all involved new properties which began with
409# 'Is'.  Based on the protests, Unicode is unlikely to try that again.  Also,
410# many of the Perl-defined synonyms, like Any, Word, etc, are listed in a
411# Unicode document, so they are unlikely to be used by Unicode for another
412# purpose.  However, they might try something beginning with 'In', or use any
413# of the other Perl-defined properties.  This program will warn you of name
414# collisions, and refuse to generate tables with them, but manual intervention
415# will be required in this event.  One scheme that could be implemented, if
416# necessary, would be to have this program generate another file, or add a
417# field to mktables.lst that gives the date of first definition of a property.
418# Each new release of Unicode would use that file as a basis for the next
419# iteration.  And the Perl synonym addition code could sort based on the age
420# of the property, so older properties get priority, and newer ones that clash
421# would be refused; hence existing code would not be impacted, and some other
422# synonym would have to be used for the new property.  This is ugly, and
423# manual intervention would certainly be easier to do in the short run; lets
424# hope it never comes to this.
425#
426# A NOTE ON UNIHAN
427#
428# This program can generate tables from the Unihan database.  But that DB
429# isn't normally available, so it is marked as optional.  Prior to version
430# 5.2, this database was in a single file, Unihan.txt.  In 5.2 the database
431# was split into 8 different files, all beginning with the letters 'Unihan'.
432# If you plunk those files down into the directory mktables ($0) is in, this
433# program will read them and automatically create tables for the properties
434# from it that are listed in PropertyAliases.txt and PropValueAliases.txt,
435# plus any you add to the @cjk_properties array and the @cjk_property_values
436# array, being sure to add necessary '# @missings' lines to the latter.  For
437# Unicode versions earlier than 5.2, most of the Unihan properties are not
438# listed at all in PropertyAliases nor PropValueAliases.  This program assumes
439# for these early releases that you want the properties that are specified in
440# the 5.2 release.
441#
442# You may need to adjust the entries to suit your purposes.  setup_unihan(),
443# and filter_unihan_line() are the functions where this is done.  This program
444# already does some adjusting to make the lines look more like the rest of the
445# Unicode DB;  You can see what that is in filter_unihan_line()
446#
447# There is a bug in the 3.2 data file in which some values for the
448# kPrimaryNumeric property have commas and an unexpected comment.  A filter
449# could be added to correct these; or for a particular installation, the
450# Unihan.txt file could be edited to fix them.
451#
452# HOW TO ADD A FILE TO BE PROCESSED
453#
454# A new file from Unicode needs to have an object constructed for it in
455# @input_file_objects, probably at the end or at the end of the extracted
456# ones.  The program should warn you if its name will clash with others on
457# restrictive file systems, like DOS.  If so, figure out a better name, and
458# add lines to the README.perl file giving that.  If the file is a character
459# property, it should be in the format that Unicode has implicitly
460# standardized for such files for the more recently introduced ones.
461# If so, the Input_file constructor for @input_file_objects can just be the
462# file name and release it first appeared in.  If not, then it should be
463# possible to construct an each_line_handler() to massage the line into the
464# standardized form.
465#
466# For non-character properties, more code will be needed.  You can look at
467# the existing entries for clues.
468#
469# UNICODE VERSIONS NOTES
470#
471# The Unicode UCD has had a number of errors in it over the versions.  And
472# these remain, by policy, in the standard for that version.  Therefore it is
473# risky to correct them, because code may be expecting the error.  So this
474# program doesn't generally make changes, unless the error breaks the Perl
475# core.  As an example, some versions of 2.1.x Jamo.txt have the wrong value
476# for U+1105, which causes real problems for the algorithms for Jamo
477# calculations, so it is changed here.
478#
479# But it isn't so clear cut as to what to do about concepts that are
480# introduced in a later release; should they extend back to earlier releases
481# where the concept just didn't exist?  It was easier to do this than to not,
482# so that's what was done.  For example, the default value for code points not
483# in the files for various properties was probably undefined until changed by
484# some version.  No_Block for blocks is such an example.  This program will
485# assign No_Block even in Unicode versions that didn't have it.  This has the
486# benefit that code being written doesn't have to special case earlier
487# versions; and the detriment that it doesn't match the Standard precisely for
488# the affected versions.
489#
490# Here are some observations about some of the issues in early versions:
491#
492# Prior to version 3.0, there were 3 character decompositions.  These are not
493# handled by Unicode::Normalize, nor will it compile when presented a version
494# that has them.  However, you can trivially get it to compile by simply
495# ignoring those decompositions, by changing the croak to a carp.  At the time
496# of this writing, the line (in dist/Unicode-Normalize/Normalize.pm or
497# dist/Unicode-Normalize/mkheader) reads
498#
499#   croak("Weird Canonical Decomposition of U+$h");
500#
501# Simply comment it out.  It will compile, but will not know about any three
502# character decompositions.
503
504# The number of code points in \p{alpha=True} halved in 2.1.9.  It turns out
505# that the reason is that the CJK block starting at 4E00 was removed from
506# PropList, and was not put back in until 3.1.0.  The Perl extension (the
507# single property name \p{alpha}) has the correct values.  But the compound
508# form is simply not generated until 3.1, as it can be argued that prior to
509# this release, this was not an official property.  The comments for
510# filter_old_style_proplist() give more details.
511#
512# Unicode introduced the synonym Space for White_Space in 4.1.  Perl has
513# always had a \p{Space}.  In release 3.2 only, they are not synonymous.  The
514# reason is that 3.2 introduced U+205F=medium math space, which was not
515# classed as white space, but Perl figured out that it should have been. 4.0
516# reclassified it correctly.
517#
518# Another change between 3.2 and 4.0 is the CCC property value ATBL.  In 3.2
519# this was erroneously a synonym for 202 (it should be 200).  In 4.0, ATB
520# became 202, and ATBL was left with no code points, as all the ones that
521# mapped to 202 stayed mapped to 202.  Thus if your program used the numeric
522# name for the class, it would not have been affected, but if it used the
523# mnemonic, it would have been.
524#
525# \p{Script=Hrkt} (Katakana_Or_Hiragana) came in 4.0.1.  Before that, code
526# points which eventually came to have this script property value, instead
527# mapped to "Unknown".  But in the next release all these code points were
528# moved to \p{sc=common} instead.
529
530# The tests furnished  by Unicode for testing WordBreak and SentenceBreak
531# generate errors in 5.0 and earlier.
532#
533# The default for missing code points for BidiClass is complicated.  Starting
534# in 3.1.1, the derived file DBidiClass.txt handles this, but this program
535# tries to do the best it can for earlier releases.  It is done in
536# process_PropertyAliases()
537#
538# In version 2.1.2, the entry in UnicodeData.txt:
539#   0275;LATIN SMALL LETTER BARRED O;Ll;0;L;;;;;N;;;;019F;
540# should instead be
541#   0275;LATIN SMALL LETTER BARRED O;Ll;0;L;;;;;N;;;019F;;019F
542# Without this change, there are casing problems for this character.
543#
544# Search for $string_compare_versions to see how to compare changes to
545# properties between Unicode versions
546#
547##############################################################################
548
549my $UNDEF = ':UNDEF:';  # String to print out for undefined values in tracing
550                        # and errors
551my $MAX_LINE_WIDTH = 78;
552
553# Debugging aid to skip most files so as to not be distracted by them when
554# concentrating on the ones being debugged.  Add
555# non_skip => 1,
556# to the constructor for those files you want processed when you set this.
557# Files with a first version number of 0 are special: they are always
558# processed regardless of the state of this flag.  Generally, Jamo.txt and
559# UnicodeData.txt must not be skipped if you want this program to not die
560# before normal completion.
561my $debug_skip = 0;
562
563
564# Normally these are suppressed.
565my $write_Unicode_deprecated_tables = 0;
566
567# Set to 1 to enable tracing.
568our $to_trace = 0;
569
570{ # Closure for trace: debugging aid
571    my $print_caller = 1;        # ? Include calling subroutine name
572    my $main_with_colon = 'main::';
573    my $main_colon_length = length($main_with_colon);
574
575    sub trace {
576        return unless $to_trace;        # Do nothing if global flag not set
577
578        my @input = @_;
579
580        local $DB::trace = 0;
581        $DB::trace = 0;          # Quiet 'used only once' message
582
583        my $line_number;
584
585        # Loop looking up the stack to get the first non-trace caller
586        my $caller_line;
587        my $caller_name;
588        my $i = 0;
589        do {
590            $line_number = $caller_line;
591            (my $pkg, my $file, $caller_line, my $caller) = caller $i++;
592            $caller = $main_with_colon unless defined $caller;
593
594            $caller_name = $caller;
595
596            # get rid of pkg
597            $caller_name =~ s/.*:://;
598            if (substr($caller_name, 0, $main_colon_length)
599                eq $main_with_colon)
600            {
601                $caller_name = substr($caller_name, $main_colon_length);
602            }
603
604        } until ($caller_name ne 'trace');
605
606        # If the stack was empty, we were called from the top level
607        $caller_name = 'main' if ($caller_name eq ""
608                                    || $caller_name eq 'trace');
609
610        my $output = "";
611        #print STDERR __LINE__, ": ", join ", ", @input, "\n";
612        foreach my $string (@input) {
613            if (ref $string eq 'ARRAY' || ref $string eq 'HASH') {
614                $output .= simple_dumper($string);
615            }
616            else {
617                $string = "$string" if ref $string;
618                $string = $UNDEF unless defined $string;
619                chomp $string;
620                $string = '""' if $string eq "";
621                $output .= " " if $output ne ""
622                                && $string ne ""
623                                && substr($output, -1, 1) ne " "
624                                && substr($string, 0, 1) ne " ";
625                $output .= $string;
626            }
627        }
628
629        print STDERR sprintf "%4d: ", $line_number if defined $line_number;
630        print STDERR "$caller_name: " if $print_caller;
631        print STDERR $output, "\n";
632        return;
633    }
634}
635
636sub stack_trace() {
637    local $to_trace = 1 if main::DEBUG;
638    my $line = (caller(0))[2];
639    my $i = 1;
640
641    # Accumulate the stack trace
642    while (1) {
643        my ($pkg, $file, $caller_line, $caller) = caller $i++;
644
645        last unless defined $caller;
646
647        trace "called from $caller() at line $line";
648        $line = $caller_line;
649    }
650}
651
652# This is for a rarely used development feature that allows you to compare two
653# versions of the Unicode standard without having to deal with changes caused
654# by the code points introduced in the later version.  You probably also want
655# to use the -annotate option when using this.  Run this program on a unicore
656# containing the starting release you want to compare.  Save that output
657# structure.  Then, switching to a unicore with the ending release, change the
658# "" in the $string_compare_versions definition just below to a string
659# containing a SINGLE dotted Unicode release number (e.g. "2.1") corresponding
660# to the starting release.  This program will then compile, but throw away all
661# code points introduced after the starting release.  Finally use a diff tool
662# to compare the two directory structures.  They include only the code points
663# common to both releases, and you can see the changes caused just by the
664# underlying release semantic changes.  For versions earlier than 3.2, you
665# must copy a version of DAge.txt into the directory.
666my $string_compare_versions = DEBUG && "";
667my $compare_versions = DEBUG
668                       && $string_compare_versions
669                       && pack "C*", split /\./, $string_compare_versions;
670
671sub uniques {
672    # Returns non-duplicated input values.  From "Perl Best Practices:
673    # Encapsulated Cleverness".  p. 455 in first edition.
674
675    my %seen;
676    # Arguably this breaks encapsulation, if the goal is to permit multiple
677    # distinct objects to stringify to the same value, and be interchangeable.
678    # However, for this program, no two objects stringify identically, and all
679    # lists passed to this function are either objects or strings. So this
680    # doesn't affect correctness, but it does give a couple of percent speedup.
681    no overloading;
682    return grep { ! $seen{$_}++ } @_;
683}
684
685$0 = File::Spec->canonpath($0);
686
687my $make_test_script = 0;      # ? Should we output a test script
688my $make_norm_test_script = 0; # ? Should we output a normalization test script
689my $write_unchanged_files = 0; # ? Should we update the output files even if
690                               #    we don't think they have changed
691my $use_directory = "";        # ? Should we chdir somewhere.
692my $pod_directory;             # input directory to store the pod file.
693my $pod_file = 'perluniprops';
694my $t_path;                     # Path to the .t test file
695my $file_list = 'mktables.lst'; # File to store input and output file names.
696                               # This is used to speed up the build, by not
697                               # executing the main body of the program if
698                               # nothing on the list has changed since the
699                               # previous build
700my $make_list = 1;             # ? Should we write $file_list.  Set to always
701                               # make a list so that when the release manager
702                               # is preparing a release, they won't have to do
703                               # special things
704my $glob_list = 0;             # ? Should we try to include unknown .txt files
705                               # in the input.
706my $output_range_counts = $debugging_build;   # ? Should we include the number
707                                              # of code points in ranges in
708                                              # the output
709my $annotate = 0;              # ? Should character names be in the output
710
711# Verbosity levels; 0 is quiet
712my $NORMAL_VERBOSITY = 1;
713my $PROGRESS = 2;
714my $VERBOSE = 3;
715
716my $verbosity = $NORMAL_VERBOSITY;
717
718# Stored in mktables.lst so that if this program is called with different
719# options, will regenerate even if the files otherwise look like they're
720# up-to-date.
721my $command_line_arguments = join " ", @ARGV;
722
723# Process arguments
724while (@ARGV) {
725    my $arg = shift @ARGV;
726    if ($arg eq '-v') {
727        $verbosity = $VERBOSE;
728    }
729    elsif ($arg eq '-p') {
730        $verbosity = $PROGRESS;
731        $| = 1;     # Flush buffers as we go.
732    }
733    elsif ($arg eq '-q') {
734        $verbosity = 0;
735    }
736    elsif ($arg eq '-w') {
737        # update the files even if they haven't changed
738        $write_unchanged_files = 1;
739    }
740    elsif ($arg eq '-check') {
741        my $this = shift @ARGV;
742        my $ok = shift @ARGV;
743        if ($this ne $ok) {
744            print "Skipping as check params are not the same.\n";
745            exit(0);
746        }
747    }
748    elsif ($arg eq '-P' && defined ($pod_directory = shift)) {
749        -d $pod_directory or croak "Directory '$pod_directory' doesn't exist";
750    }
751    elsif ($arg eq '-maketest' || ($arg eq '-T' && defined ($t_path = shift)))
752    {
753        $make_test_script = 1;
754    }
755    elsif ($arg eq '-makenormtest')
756    {
757        $make_norm_test_script = 1;
758    }
759    elsif ($arg eq '-makelist') {
760        $make_list = 1;
761    }
762    elsif ($arg eq '-C' && defined ($use_directory = shift)) {
763        -d $use_directory or croak "Unknown directory '$use_directory'";
764    }
765    elsif ($arg eq '-L') {
766
767        # Existence not tested until have chdir'd
768        $file_list = shift;
769    }
770    elsif ($arg eq '-globlist') {
771        $glob_list = 1;
772    }
773    elsif ($arg eq '-c') {
774        $output_range_counts = ! $output_range_counts
775    }
776    elsif ($arg eq '-annotate') {
777        $annotate = 1;
778        $debugging_build = 1;
779        $output_range_counts = 1;
780    }
781    else {
782        my $with_c = 'with';
783        $with_c .= 'out' if $output_range_counts;   # Complements the state
784        croak <<END;
785usage: $0 [-c|-p|-q|-v|-w] [-C dir] [-L filelist] [ -P pod_dir ]
786          [ -T test_file_path ] [-globlist] [-makelist] [-maketest]
787          [-check A B ]
788  -c          : Output comments $with_c number of code points in ranges
789  -q          : Quiet Mode: Only output serious warnings.
790  -p          : Set verbosity level to normal plus show progress.
791  -v          : Set Verbosity level high:  Show progress and non-serious
792                warnings
793  -w          : Write files regardless
794  -C dir      : Change to this directory before proceeding. All relative paths
795                except those specified by the -P and -T options will be done
796                with respect to this directory.
797  -P dir      : Output $pod_file file to directory 'dir'.
798  -T path     : Create a test script as 'path'; overrides -maketest
799  -L filelist : Use alternate 'filelist' instead of standard one
800  -globlist   : Take as input all non-Test *.txt files in current and sub
801                directories
802  -maketest   : Make test script 'TestProp.pl' in current (or -C directory),
803                overrides -T
804  -makelist   : Rewrite the file list $file_list based on current setup
805  -annotate   : Output an annotation for each character in the table files;
806                useful for debugging mktables, looking at diffs; but is slow
807                and memory intensive
808  -check A B  : Executes $0 only if A and B are the same
809END
810    }
811}
812
813# Stores the most-recently changed file.  If none have changed, can skip the
814# build
815my $most_recent = (stat $0)[9];   # Do this before the chdir!
816
817# Change directories now, because need to read 'version' early.
818if ($use_directory) {
819    if ($pod_directory && ! File::Spec->file_name_is_absolute($pod_directory)) {
820        $pod_directory = File::Spec->rel2abs($pod_directory);
821    }
822    if ($t_path && ! File::Spec->file_name_is_absolute($t_path)) {
823        $t_path = File::Spec->rel2abs($t_path);
824    }
825    chdir $use_directory or croak "Failed to chdir to '$use_directory':$!";
826    if ($pod_directory && File::Spec->file_name_is_absolute($pod_directory)) {
827        $pod_directory = File::Spec->abs2rel($pod_directory);
828    }
829    if ($t_path && File::Spec->file_name_is_absolute($t_path)) {
830        $t_path = File::Spec->abs2rel($t_path);
831    }
832}
833
834# Get Unicode version into regular and v-string.  This is done now because
835# various tables below get populated based on it.  These tables are populated
836# here to be near the top of the file, and so easily seeable by those needing
837# to modify things.
838open my $VERSION, "<", "version"
839                    or croak "$0: can't open required file 'version': $!\n";
840my $string_version = <$VERSION>;
841close $VERSION;
842chomp $string_version;
843my $v_version = pack "C*", split /\./, $string_version;        # v string
844
845my $unicode_version = ($compare_versions)
846                      ? (  "$string_compare_versions (using "
847                         . "$string_version rules)")
848                      : $string_version;
849
850# The following are the complete names of properties with property values that
851# are known to not match any code points in some versions of Unicode, but that
852# may change in the future so they should be matchable, hence an empty file is
853# generated for them.
854my @tables_that_may_be_empty;
855push @tables_that_may_be_empty, 'Joining_Type=Left_Joining'
856                                                    if $v_version lt v6.3.0;
857push @tables_that_may_be_empty, 'Script=Common' if $v_version le v4.0.1;
858push @tables_that_may_be_empty, 'Title' if $v_version lt v2.0.0;
859push @tables_that_may_be_empty, 'Script=Katakana_Or_Hiragana'
860                                                    if $v_version ge v4.1.0;
861push @tables_that_may_be_empty, 'Script_Extensions=Katakana_Or_Hiragana'
862                                                    if $v_version ge v6.0.0;
863push @tables_that_may_be_empty, 'Grapheme_Cluster_Break=Prepend'
864                                                    if $v_version ge v6.1.0;
865push @tables_that_may_be_empty, 'Canonical_Combining_Class=CCC133'
866                                                    if $v_version ge v6.2.0;
867
868# The lists below are hashes, so the key is the item in the list, and the
869# value is the reason why it is in the list.  This makes generation of
870# documentation easier.
871
872my %why_suppressed;  # No file generated for these.
873
874# Files aren't generated for empty extraneous properties.  This is arguable.
875# Extraneous properties generally come about because a property is no longer
876# used in a newer version of Unicode.  If we generated a file without code
877# points, programs that used to work on that property will still execute
878# without errors.  It just won't ever match (or will always match, with \P{}).
879# This means that the logic is now likely wrong.  I (khw) think its better to
880# find this out by getting an error message.  Just move them to the table
881# above to change this behavior
882my %why_suppress_if_empty_warn_if_not = (
883
884   # It is the only property that has ever officially been removed from the
885   # Standard.  The database never contained any code points for it.
886   'Special_Case_Condition' => 'Obsolete',
887
888   # Apparently never official, but there were code points in some versions of
889   # old-style PropList.txt
890   'Non_Break' => 'Obsolete',
891);
892
893# These would normally go in the warn table just above, but they were changed
894# a long time before this program was written, so warnings about them are
895# moot.
896if ($v_version gt v3.2.0) {
897    push @tables_that_may_be_empty,
898                                'Canonical_Combining_Class=Attached_Below_Left'
899}
900
901# Obsoleted
902if ($v_version ge v11.0.0) {
903    push @tables_that_may_be_empty, qw(
904                                       Grapheme_Cluster_Break=E_Base
905                                       Grapheme_Cluster_Break=E_Base_GAZ
906                                       Grapheme_Cluster_Break=E_Modifier
907                                       Grapheme_Cluster_Break=Glue_After_Zwj
908                                       Word_Break=E_Base
909                                       Word_Break=E_Base_GAZ
910                                       Word_Break=E_Modifier
911                                       Word_Break=Glue_After_Zwj);
912}
913
914# Enum values for to_output_map() method in the Map_Table package. (0 is don't
915# output)
916my $EXTERNAL_MAP = 1;
917my $INTERNAL_MAP = 2;
918my $OUTPUT_ADJUSTED = 3;
919
920# To override computed values for writing the map tables for these properties.
921# The default for enum map tables is to write them out, so that the Unicode
922# .txt files can be removed, but all the data to compute any property value
923# for any code point is available in a more compact form.
924my %global_to_output_map = (
925    # Needed by UCD.pm, but don't want to publicize that it exists, so won't
926    # get stuck supporting it if things change.  Since it is a STRING
927    # property, it normally would be listed in the pod, but INTERNAL_MAP
928    # suppresses that.
929    Unicode_1_Name => $INTERNAL_MAP,
930
931    Present_In => 0,                # Suppress, as easily computed from Age
932    Block => (NON_ASCII_PLATFORM) ? 1 : 0,  # Suppress, as Blocks.txt is
933                                            # retained, but needed for
934                                            # non-ASCII
935
936    # Suppress, as mapping can be found instead from the
937    # Perl_Decomposition_Mapping file
938    Decomposition_Type => 0,
939);
940
941# There are several types of obsolete properties defined by Unicode.  These
942# must be hand-edited for every new Unicode release.
943my %why_deprecated;  # Generates a deprecated warning message if used.
944my %why_stabilized;  # Documentation only
945my %why_obsolete;    # Documentation only
946
947{   # Closure
948    my $simple = 'Perl uses the more complete version';
949    my $unihan = 'Unihan properties are by default not enabled in the Perl core.';
950
951    my $other_properties = 'other properties';
952    my $contributory = "Used by Unicode internally for generating $other_properties and not intended to be used stand-alone";
953    my $why_no_expand  = "Deprecated by Unicode.  These are characters that expand to more than one character in the specified normalization form, but whether they actually take up more bytes or not depends on the encoding being used.  For example, a UTF-8 encoded character may expand to a different number of bytes than a UTF-32 encoded character.";
954
955    %why_deprecated = (
956        'Grapheme_Link' => 'Duplicates ccc=vr (Canonical_Combining_Class=Virama)',
957        'Jamo_Short_Name' => $contributory,
958        'Line_Break=Surrogate' => 'Surrogates should never appear in well-formed text, and therefore shouldn\'t be the basis for line breaking',
959        'Other_Alphabetic' => $contributory,
960        'Other_Default_Ignorable_Code_Point' => $contributory,
961        'Other_Grapheme_Extend' => $contributory,
962        'Other_ID_Continue' => $contributory,
963        'Other_ID_Start' => $contributory,
964        'Other_Lowercase' => $contributory,
965        'Other_Math' => $contributory,
966        'Other_Uppercase' => $contributory,
967        'Expands_On_NFC' => $why_no_expand,
968        'Expands_On_NFD' => $why_no_expand,
969        'Expands_On_NFKC' => $why_no_expand,
970        'Expands_On_NFKD' => $why_no_expand,
971    );
972
973    %why_suppressed = (
974        # There is a lib/unicore/Decomposition.pl (used by Normalize.pm) which
975        # contains the same information, but without the algorithmically
976        # determinable Hangul syllables'.  This file is not published, so it's
977        # existence is not noted in the comment.
978        'Decomposition_Mapping' => 'Accessible via Unicode::Normalize or prop_invmap() or charprop() in Unicode::UCD::',
979
980        # Don't suppress ISO_Comment, as otherwise special handling is needed
981        # to differentiate between it and gc=c, which can be written as 'isc',
982        # which is the same characters as ISO_Comment's short name.
983
984        'Name' => "Accessible via \\N{...} or 'use charnames;' or charprop() or prop_invmap() in Unicode::UCD::",
985
986        'Simple_Case_Folding' => "$simple.  Can access this through casefold(), charprop(), or prop_invmap() in Unicode::UCD",
987        'Simple_Lowercase_Mapping' => "$simple.  Can access this through charinfo(), charprop(), or prop_invmap() in Unicode::UCD",
988        'Simple_Titlecase_Mapping' => "$simple.  Can access this through charinfo(), charprop(), or prop_invmap() in Unicode::UCD",
989        'Simple_Uppercase_Mapping' => "$simple.  Can access this through charinfo(), charprop(), or prop_invmap() in Unicode::UCD",
990
991        FC_NFKC_Closure => 'Deprecated by Unicode, and supplanted in usage by NFKC_Casefold; otherwise not useful',
992    );
993
994    foreach my $property (
995
996            # The following are suppressed because they were made contributory
997            # or deprecated by Unicode before Perl ever thought about
998            # supporting them.
999            'Jamo_Short_Name',
1000            'Grapheme_Link',
1001            'Expands_On_NFC',
1002            'Expands_On_NFD',
1003            'Expands_On_NFKC',
1004            'Expands_On_NFKD',
1005
1006            # The following are suppressed because they have been marked
1007            # as deprecated for a sufficient amount of time
1008            'Other_Alphabetic',
1009            'Other_Default_Ignorable_Code_Point',
1010            'Other_Grapheme_Extend',
1011            'Other_ID_Continue',
1012            'Other_ID_Start',
1013            'Other_Lowercase',
1014            'Other_Math',
1015            'Other_Uppercase',
1016    ) {
1017        $why_suppressed{$property} = $why_deprecated{$property};
1018    }
1019
1020    # Customize the message for all the 'Other_' properties
1021    foreach my $property (keys %why_deprecated) {
1022        next if (my $main_property = $property) !~ s/^Other_//;
1023        $why_deprecated{$property} =~ s/$other_properties/the $main_property property (which should be used instead)/;
1024    }
1025}
1026
1027if ($write_Unicode_deprecated_tables) {
1028    foreach my $property (keys %why_suppressed) {
1029        delete $why_suppressed{$property} if $property =~
1030                                                    / ^ Other | Grapheme /x;
1031    }
1032}
1033
1034if ($v_version ge 4.0.0) {
1035    $why_stabilized{'Hyphen'} = 'Use the Line_Break property instead; see www.unicode.org/reports/tr14';
1036    if ($v_version ge 6.0.0) {
1037        $why_deprecated{'Hyphen'} = 'Supplanted by Line_Break property values; see www.unicode.org/reports/tr14';
1038    }
1039}
1040if ($v_version ge 5.2.0 && $v_version lt 6.0.0) {
1041    $why_obsolete{'ISO_Comment'} = 'Code points for it have been removed';
1042    if ($v_version ge 6.0.0) {
1043        $why_deprecated{'ISO_Comment'} = 'No longer needed for Unicode\'s internal chart generation; otherwise not useful, and code points for it have been removed';
1044    }
1045}
1046
1047# Probably obsolete forever
1048if ($v_version ge v4.1.0) {
1049    $why_suppressed{'Script=Katakana_Or_Hiragana'} = 'Obsolete.  All code points previously matched by this have been moved to "Script=Common".';
1050}
1051if ($v_version ge v6.0.0) {
1052    $why_suppressed{'Script=Katakana_Or_Hiragana'} .= '  Consider instead using "Script_Extensions=Katakana" or "Script_Extensions=Hiragana" (or both)';
1053    $why_suppressed{'Script_Extensions=Katakana_Or_Hiragana'} = 'All code points that would be matched by this are matched by either "Script_Extensions=Katakana" or "Script_Extensions=Hiragana"';
1054}
1055
1056# This program can create files for enumerated-like properties, such as
1057# 'Numeric_Type'.  This file would be the same format as for a string
1058# property, with a mapping from code point to its value, so you could look up,
1059# for example, the script a code point is in.  But no one so far wants this
1060# mapping, or they have found another way to get it since this is a new
1061# feature.  So no file is generated except if it is in this list.
1062my @output_mapped_properties = split "\n", <<END;
1063END
1064
1065# If you want more Unihan properties than the default, you need to add them to
1066# these arrays.  Depending on the property type, @missing lines might have to
1067# be added to the second array.  A sample entry would be (including the '#'):
1068# @missing: 0000..10FFFF; cjkAccountingNumeric; NaN
1069my @cjk_properties = split "\n", <<'END';
1070END
1071my @cjk_property_values = split "\n", <<'END';
1072END
1073
1074# The input files don't list every code point.  Those not listed are to be
1075# defaulted to some value.  Below are hard-coded what those values are for
1076# non-binary properties as of 5.1.  Starting in 5.0, there are
1077# machine-parsable comment lines in the files that give the defaults; so this
1078# list shouldn't have to be extended.  The claim is that all missing entries
1079# for binary properties will default to 'N'.  Unicode tried to change that in
1080# 5.2, but the beta period produced enough protest that they backed off.
1081#
1082# The defaults for the fields that appear in UnicodeData.txt in this hash must
1083# be in the form that it expects.  The others may be synonyms.
1084my $CODE_POINT = '<code point>';
1085my %default_mapping = (
1086    Age => "Unassigned",
1087    # Bidi_Class => Complicated; set in code
1088    Bidi_Mirroring_Glyph => "",
1089    Block => 'No_Block',
1090    Canonical_Combining_Class => 0,
1091    Case_Folding => $CODE_POINT,
1092    Decomposition_Mapping => $CODE_POINT,
1093    Decomposition_Type => 'None',
1094    East_Asian_Width => "Neutral",
1095    FC_NFKC_Closure => $CODE_POINT,
1096    General_Category => ($v_version le 6.3.0) ? 'Cn' : 'Unassigned',
1097    Grapheme_Cluster_Break => 'Other',
1098    Hangul_Syllable_Type => 'NA',
1099    ISO_Comment => "",
1100    Jamo_Short_Name => "",
1101    Joining_Group => "No_Joining_Group",
1102    # Joining_Type => Complicated; set in code
1103    kIICore => 'N',   #                       Is converted to binary
1104    #Line_Break => Complicated; set in code
1105    Lowercase_Mapping => $CODE_POINT,
1106    Name => "",
1107    Name_Alias => "",
1108    NFC_QC => 'Yes',
1109    NFD_QC => 'Yes',
1110    NFKC_QC => 'Yes',
1111    NFKD_QC => 'Yes',
1112    Numeric_Type => 'None',
1113    Numeric_Value => 'NaN',
1114    Script => ($v_version le 4.1.0) ? 'Common' : 'Unknown',
1115    Sentence_Break => 'Other',
1116    Simple_Case_Folding => $CODE_POINT,
1117    Simple_Lowercase_Mapping => $CODE_POINT,
1118    Simple_Titlecase_Mapping => $CODE_POINT,
1119    Simple_Uppercase_Mapping => $CODE_POINT,
1120    Titlecase_Mapping => $CODE_POINT,
1121    Unicode_1_Name => "",
1122    Unicode_Radical_Stroke => "",
1123    Uppercase_Mapping => $CODE_POINT,
1124    Word_Break => 'Other',
1125);
1126
1127### End of externally interesting definitions, except for @input_file_objects
1128
1129my $HEADER=<<"EOF";
1130# !!!!!!!   DO NOT EDIT THIS FILE   !!!!!!!
1131# This file is machine-generated by $0 from the Unicode
1132# database, Version $unicode_version.  Any changes made here will be lost!
1133EOF
1134
1135my $INTERNAL_ONLY_HEADER = <<"EOF";
1136
1137# !!!!!!!   INTERNAL PERL USE ONLY   !!!!!!!
1138# This file is for internal use by core Perl only.  The format and even the
1139# name or existence of this file are subject to change without notice.  Don't
1140# use it directly.  Use Unicode::UCD to access the Unicode character data
1141# base.
1142EOF
1143
1144my $DEVELOPMENT_ONLY=<<"EOF";
1145# !!!!!!!   DEVELOPMENT USE ONLY   !!!!!!!
1146# This file contains information artificially constrained to code points
1147# present in Unicode release $string_compare_versions.
1148# IT CANNOT BE RELIED ON.  It is for use during development only and should
1149# not be used for production.
1150
1151EOF
1152
1153my $MAX_UNICODE_CODEPOINT_STRING = ($v_version ge v2.0.0)
1154                                   ? "10FFFF"
1155                                   : "FFFF";
1156my $MAX_UNICODE_CODEPOINT = hex $MAX_UNICODE_CODEPOINT_STRING;
1157my $MAX_UNICODE_CODEPOINTS = $MAX_UNICODE_CODEPOINT + 1;
1158
1159# We work with above-Unicode code points, up to IV_MAX, but we may want to use
1160# sentinels above that number.  Therefore for internal use, we use a much
1161# smaller number, translating it to IV_MAX only for output.  The exact number
1162# is immaterial (all above-Unicode code points are treated exactly the same),
1163# but the algorithm requires it to be at least
1164# 2 * $MAX_UNICODE_CODEPOINTS + 1
1165my $MAX_WORKING_CODEPOINTS= $MAX_UNICODE_CODEPOINT * 8;
1166my $MAX_WORKING_CODEPOINT = $MAX_WORKING_CODEPOINTS - 1;
1167my $MAX_WORKING_CODEPOINT_STRING = sprintf("%X", $MAX_WORKING_CODEPOINT);
1168
1169my $MAX_PLATFORM_CODEPOINT = ~0 >> 1;
1170
1171# Matches legal code point.  4-6 hex numbers, If there are 6, the first
1172# two must be 10; if there are 5, the first must not be a 0.  Written this way
1173# to decrease backtracking.  The first regex allows the code point to be at
1174# the end of a word, but to work properly, the word shouldn't end with a valid
1175# hex character.  The second one won't match a code point at the end of a
1176# word, and doesn't have the run-on issue
1177my $run_on_code_point_re =
1178            qr/ (?: 10[0-9A-F]{4} | [1-9A-F][0-9A-F]{4} | [0-9A-F]{4} ) \b/x;
1179my $code_point_re = qr/\b$run_on_code_point_re/;
1180
1181# This matches the beginning of the line in the Unicode DB files that give the
1182# defaults for code points not listed (i.e., missing) in the file.  The code
1183# depends on this ending with a semi-colon, so it can assume it is a valid
1184# field when the line is split() by semi-colons
1185my $missing_defaults_prefix = qr/^#\s+\@missing:\s+0000\.\.10FFFF\s*;/;
1186
1187# Property types.  Unicode has more types, but these are sufficient for our
1188# purposes.
1189my $UNKNOWN = -1;   # initialized to illegal value
1190my $NON_STRING = 1; # Either binary or enum
1191my $BINARY = 2;
1192my $FORCED_BINARY = 3; # Not a binary property, but, besides its normal
1193                       # tables, additional true and false tables are
1194                       # generated so that false is anything matching the
1195                       # default value, and true is everything else.
1196my $ENUM = 4;       # Include catalog
1197my $STRING = 5;     # Anything else: string or misc
1198
1199# Some input files have lines that give default values for code points not
1200# contained in the file.  Sometimes these should be ignored.
1201my $NO_DEFAULTS = 0;        # Must evaluate to false
1202my $NOT_IGNORED = 1;
1203my $IGNORED = 2;
1204
1205# Range types.  Each range has a type.  Most ranges are type 0, for normal,
1206# and will appear in the main body of the tables in the output files, but
1207# there are other types of ranges as well, listed below, that are specially
1208# handled.   There are pseudo-types as well that will never be stored as a
1209# type, but will affect the calculation of the type.
1210
1211# 0 is for normal, non-specials
1212my $MULTI_CP = 1;           # Sequence of more than code point
1213my $HANGUL_SYLLABLE = 2;
1214my $CP_IN_NAME = 3;         # The NAME contains the code point appended to it.
1215my $NULL = 4;               # The map is to the null string; utf8.c can't
1216                            # handle these, nor is there an accepted syntax
1217                            # for them in \p{} constructs
1218my $COMPUTE_NO_MULTI_CP = 5; # Pseudo-type; means that ranges that would
1219                             # otherwise be $MULTI_CP type are instead type 0
1220
1221# process_generic_property_file() can accept certain overrides in its input.
1222# Each of these must begin AND end with $CMD_DELIM.
1223my $CMD_DELIM = "\a";
1224my $REPLACE_CMD = 'replace';    # Override the Replace
1225my $MAP_TYPE_CMD = 'map_type';  # Override the Type
1226
1227my $NO = 0;
1228my $YES = 1;
1229
1230# Values for the Replace argument to add_range.
1231# $NO                      # Don't replace; add only the code points not
1232                           # already present.
1233my $IF_NOT_EQUIVALENT = 1; # Replace only under certain conditions; details in
1234                           # the comments at the subroutine definition.
1235my $UNCONDITIONALLY = 2;   # Replace without conditions.
1236my $MULTIPLE_BEFORE = 4;   # Don't replace, but add a duplicate record if
1237                           # already there
1238my $MULTIPLE_AFTER = 5;    # Don't replace, but add a duplicate record if
1239                           # already there
1240my $CROAK = 6;             # Die with an error if is already there
1241
1242# Flags to give property statuses.  The phrases are to remind maintainers that
1243# if the flag is changed, the indefinite article referring to it in the
1244# documentation may need to be as well.
1245my $NORMAL = "";
1246my $DEPRECATED = 'D';
1247my $a_bold_deprecated = "a 'B<$DEPRECATED>'";
1248my $A_bold_deprecated = "A 'B<$DEPRECATED>'";
1249my $DISCOURAGED = 'X';
1250my $a_bold_discouraged = "an 'B<$DISCOURAGED>'";
1251my $A_bold_discouraged = "An 'B<$DISCOURAGED>'";
1252my $STRICTER = 'T';
1253my $a_bold_stricter = "a 'B<$STRICTER>'";
1254my $A_bold_stricter = "A 'B<$STRICTER>'";
1255my $STABILIZED = 'S';
1256my $a_bold_stabilized = "an 'B<$STABILIZED>'";
1257my $A_bold_stabilized = "An 'B<$STABILIZED>'";
1258my $OBSOLETE = 'O';
1259my $a_bold_obsolete = "an 'B<$OBSOLETE>'";
1260my $A_bold_obsolete = "An 'B<$OBSOLETE>'";
1261
1262# Aliases can also have an extra status:
1263my $INTERNAL_ALIAS = 'P';
1264
1265my %status_past_participles = (
1266    $DISCOURAGED => 'discouraged',
1267    $STABILIZED => 'stabilized',
1268    $OBSOLETE => 'obsolete',
1269    $DEPRECATED => 'deprecated',
1270    $INTERNAL_ALIAS => 'reserved for Perl core internal use only',
1271);
1272
1273# Table fates.  These are somewhat ordered, so that fates < $MAP_PROXIED should be
1274# externally documented.
1275my $ORDINARY = 0;       # The normal fate.
1276my $MAP_PROXIED = 1;    # The map table for the property isn't written out,
1277                        # but there is a file written that can be used to
1278                        # reconstruct this table
1279my $INTERNAL_ONLY = 2;  # The file for this table is written out, but it is
1280                        # for Perl's internal use only
1281my $SUPPRESSED = 3;     # The file for this table is not written out, and as a
1282                        # result, we don't bother to do many computations on
1283                        # it.
1284my $PLACEHOLDER = 4;    # Like $SUPPRESSED, but we go through all the
1285                        # computations anyway, as the values are needed for
1286                        # things to work.  This happens when we have Perl
1287                        # extensions that depend on Unicode tables that
1288                        # wouldn't normally be in a given Unicode version.
1289
1290# The format of the values of the tables:
1291my $EMPTY_FORMAT = "";
1292my $BINARY_FORMAT = 'b';
1293my $DECIMAL_FORMAT = 'd';
1294my $FLOAT_FORMAT = 'f';
1295my $INTEGER_FORMAT = 'i';
1296my $HEX_FORMAT = 'x';
1297my $RATIONAL_FORMAT = 'r';
1298my $STRING_FORMAT = 's';
1299my $ADJUST_FORMAT = 'a';
1300my $HEX_ADJUST_FORMAT = 'ax';
1301my $DECOMP_STRING_FORMAT = 'c';
1302my $STRING_WHITE_SPACE_LIST = 'sw';
1303
1304my %map_table_formats = (
1305    $BINARY_FORMAT => 'binary',
1306    $DECIMAL_FORMAT => 'single decimal digit',
1307    $FLOAT_FORMAT => 'floating point number',
1308    $INTEGER_FORMAT => 'integer',
1309    $HEX_FORMAT => 'non-negative hex whole number; a code point',
1310    $RATIONAL_FORMAT => 'rational: an integer or a fraction',
1311    $STRING_FORMAT => 'string',
1312    $ADJUST_FORMAT => 'some entries need adjustment',
1313    $HEX_ADJUST_FORMAT => 'mapped value in hex; some entries need adjustment',
1314    $DECOMP_STRING_FORMAT => 'Perl\'s internal (Normalize.pm) decomposition mapping',
1315    $STRING_WHITE_SPACE_LIST => 'string, but some elements are interpreted as a list; white space occurs only as list item separators'
1316);
1317
1318# Unicode didn't put such derived files in a separate directory at first.
1319my $EXTRACTED_DIR = (-d 'extracted') ? 'extracted' : "";
1320my $EXTRACTED = ($EXTRACTED_DIR) ? "$EXTRACTED_DIR/" : "";
1321my $AUXILIARY = 'auxiliary';
1322my $EMOJI = 'emoji';
1323
1324# Hashes and arrays that will eventually go into UCD.pl for the use of UCD.pm
1325my %loose_to_file_of;       # loosely maps table names to their respective
1326                            # files
1327my %stricter_to_file_of;    # same; but for stricter mapping.
1328my %loose_property_to_file_of; # Maps a loose property name to its map file
1329my %strict_property_to_file_of; # Same, but strict
1330my @inline_definitions = "V0"; # Each element gives a definition of a unique
1331                            # inversion list.  When a definition is inlined,
1332                            # its value in the hash it's in (one of the two
1333                            # defined just above) will include an index into
1334                            # this array.  The 0th element is initialized to
1335                            # the definition for a zero length inversion list
1336my %file_to_swash_name;     # Maps the file name to its corresponding key name
1337                            # in the hash %Unicode::UCD::SwashInfo
1338my %nv_floating_to_rational; # maps numeric values floating point numbers to
1339                             # their rational equivalent
1340my %loose_property_name_of; # Loosely maps (non_string) property names to
1341                            # standard form
1342my %strict_property_name_of; # Strictly maps (non_string) property names to
1343                            # standard form
1344my %string_property_loose_to_name; # Same, for string properties.
1345my %loose_defaults;         # keys are of form "prop=value", where 'prop' is
1346                            # the property name in standard loose form, and
1347                            # 'value' is the default value for that property,
1348                            # also in standard loose form.
1349my %loose_to_standard_value; # loosely maps table names to the canonical
1350                            # alias for them
1351my %ambiguous_names;        # keys are alias names (in standard form) that
1352                            # have more than one possible meaning.
1353my %combination_property;   # keys are alias names (in standard form) that
1354                            # have both a map table, and a binary one that
1355                            # yields true for all non-null maps.
1356my %prop_aliases;           # Keys are standard property name; values are each
1357                            # one's aliases
1358my %prop_value_aliases;     # Keys of top level are standard property name;
1359                            # values are keys to another hash,  Each one is
1360                            # one of the property's values, in standard form.
1361                            # The values are that prop-val's aliases.
1362my %skipped_files;          # List of files that we skip
1363my %ucd_pod;    # Holds entries that will go into the UCD section of the pod
1364
1365# Most properties are immune to caseless matching, otherwise you would get
1366# nonsensical results, as properties are a function of a code point, not
1367# everything that is caselessly equivalent to that code point.  For example,
1368# Changes_When_Case_Folded('s') should be false, whereas caselessly it would
1369# be true because 's' and 'S' are equivalent caselessly.  However,
1370# traditionally, [:upper:] and [:lower:] are equivalent caselessly, so we
1371# extend that concept to those very few properties that are like this.  Each
1372# such property will match the full range caselessly.  They are hard-coded in
1373# the program; it's not worth trying to make it general as it's extremely
1374# unlikely that they will ever change.
1375my %caseless_equivalent_to;
1376
1377# This is the range of characters that were in Release 1 of Unicode, and
1378# removed in Release 2 (replaced with the current Hangul syllables starting at
1379# U+AC00).  The range was reused starting in Release 3 for other purposes.
1380my $FIRST_REMOVED_HANGUL_SYLLABLE = 0x3400;
1381my $FINAL_REMOVED_HANGUL_SYLLABLE = 0x4DFF;
1382
1383# These constants names and values were taken from the Unicode standard,
1384# version 5.1, section 3.12.  They are used in conjunction with Hangul
1385# syllables.  The '_string' versions are so generated tables can retain the
1386# hex format, which is the more familiar value
1387my $SBase_string = "0xAC00";
1388my $SBase = CORE::hex $SBase_string;
1389my $LBase_string = "0x1100";
1390my $LBase = CORE::hex $LBase_string;
1391my $VBase_string = "0x1161";
1392my $VBase = CORE::hex $VBase_string;
1393my $TBase_string = "0x11A7";
1394my $TBase = CORE::hex $TBase_string;
1395my $SCount = 11172;
1396my $LCount = 19;
1397my $VCount = 21;
1398my $TCount = 28;
1399my $NCount = $VCount * $TCount;
1400
1401# For Hangul syllables;  These store the numbers from Jamo.txt in conjunction
1402# with the above published constants.
1403my %Jamo;
1404my %Jamo_L;     # Leading consonants
1405my %Jamo_V;     # Vowels
1406my %Jamo_T;     # Trailing consonants
1407
1408# For code points whose name contains its ordinal as a '-ABCD' suffix.
1409# The key is the base name of the code point, and the value is an
1410# array giving all the ranges that use this base name.  Each range
1411# is actually a hash giving the 'low' and 'high' values of it.
1412my %names_ending_in_code_point;
1413my %loose_names_ending_in_code_point;   # Same as above, but has blanks, dashes
1414                                        # removed from the names
1415# Inverse mapping.  The list of ranges that have these kinds of
1416# names.  Each element contains the low, high, and base names in an
1417# anonymous hash.
1418my @code_points_ending_in_code_point;
1419
1420# To hold Unicode's normalization test suite
1421my @normalization_tests;
1422
1423# Boolean: does this Unicode version have the hangul syllables, and are we
1424# writing out a table for them?
1425my $has_hangul_syllables = 0;
1426
1427# Does this Unicode version have code points whose names end in their
1428# respective code points, and are we writing out a table for them?  0 for no;
1429# otherwise points to first property that a table is needed for them, so that
1430# if multiple tables are needed, we don't create duplicates
1431my $needing_code_points_ending_in_code_point = 0;
1432
1433my @backslash_X_tests;     # List of tests read in for testing \X
1434my @LB_tests;              # List of tests read in for testing \b{lb}
1435my @SB_tests;              # List of tests read in for testing \b{sb}
1436my @WB_tests;              # List of tests read in for testing \b{wb}
1437my @unhandled_properties;  # Will contain a list of properties found in
1438                           # the input that we didn't process.
1439my @match_properties;      # Properties that have match tables, to be
1440                           # listed in the pod
1441my @map_properties;        # Properties that get map files written
1442my @named_sequences;       # NamedSequences.txt contents.
1443my %potential_files;       # Generated list of all .txt files in the directory
1444                           # structure so we can warn if something is being
1445                           # ignored.
1446my @missing_early_files;   # Generated list of absent files that we need to
1447                           # proceed in compiling this early Unicode version
1448my @files_actually_output; # List of files we generated.
1449my @more_Names;            # Some code point names are compound; this is used
1450                           # to store the extra components of them.
1451my $E_FLOAT_PRECISION = 2; # The minimum number of digits after the decimal
1452                           # point of a normalized floating point number
1453                           # needed to match before we consider it equivalent
1454                           # to a candidate rational
1455
1456# These store references to certain commonly used property objects
1457my $age;
1458my $ccc;
1459my $gc;
1460my $perl;
1461my $block;
1462my $perl_charname;
1463my $print;
1464my $All;
1465my $Assigned;   # All assigned characters in this Unicode release
1466my $DI;         # Default_Ignorable_Code_Point property
1467my $NChar;      # Noncharacter_Code_Point property
1468my $script;
1469my $scx;        # Script_Extensions property
1470my $idt;        # Identifier_Type property
1471
1472# Are there conflicting names because of beginning with 'In_', or 'Is_'
1473my $has_In_conflicts = 0;
1474my $has_Is_conflicts = 0;
1475
1476sub internal_file_to_platform ($file=undef) {
1477    # Convert our file paths which have '/' separators to those of the
1478    # platform.
1479
1480    return undef unless defined $file;
1481
1482    return File::Spec->join(split '/', $file);
1483}
1484
1485sub file_exists ($file=undef) {   # platform independent '-e'.  This program internally
1486                        # uses slash as a path separator.
1487    return 0 unless defined $file;
1488    return -e internal_file_to_platform($file);
1489}
1490
1491sub objaddr($addr) {
1492    # Returns the address of the blessed input object.
1493    # It doesn't check for blessedness because that would do a string eval
1494    # every call, and the program is structured so that this is never called
1495    # for a non-blessed object.
1496
1497    no overloading; # If overloaded, numifying below won't work.
1498
1499    # Numifying a ref gives its address.
1500    return pack 'J', $addr;
1501}
1502
1503# These are used only if $annotate is true.
1504# The entire range of Unicode characters is examined to populate these
1505# after all the input has been processed.  But most can be skipped, as they
1506# have the same descriptive phrases, such as being unassigned
1507my @viacode;            # Contains the 1 million character names
1508my @age;                # And their ages ("" if none)
1509my @printable;          # boolean: And are those characters printable?
1510my @annotate_char_type; # Contains a type of those characters, specifically
1511                        # for the purposes of annotation.
1512my $annotate_ranges;    # A map of ranges of code points that have the same
1513                        # name for the purposes of annotation.  They map to the
1514                        # upper edge of the range, so that the end point can
1515                        # be immediately found.  This is used to skip ahead to
1516                        # the end of a range, and avoid processing each
1517                        # individual code point in it.
1518my $unassigned_sans_noncharacters; # A Range_List of the unassigned
1519                                   # characters, but excluding those which are
1520                                   # also noncharacter code points
1521
1522# The annotation types are an extension of the regular range types, though
1523# some of the latter are folded into one.  Make the new types negative to
1524# avoid conflicting with the regular types
1525my $SURROGATE_TYPE = -1;
1526my $UNASSIGNED_TYPE = -2;
1527my $PRIVATE_USE_TYPE = -3;
1528my $NONCHARACTER_TYPE = -4;
1529my $CONTROL_TYPE = -5;
1530my $ABOVE_UNICODE_TYPE = -6;
1531my $UNKNOWN_TYPE = -7;  # Used only if there is a bug in this program
1532
1533sub populate_char_info ($i) {
1534    # Used only with the $annotate option.  Populates the arrays with the
1535    # input code point's info that are needed for outputting more detailed
1536    # comments.  If calling context wants a return, it is the end point of
1537    # any contiguous range of characters that share essentially the same info
1538
1539    $viacode[$i] = $perl_charname->value_of($i) || "";
1540    $age[$i] = (defined $age)
1541               ? (($age->value_of($i) =~ / ^ \d+ \. \d+ $ /x)
1542                  ? $age->value_of($i)
1543                  : "")
1544               : "";
1545
1546    # A character is generally printable if Unicode says it is,
1547    # but below we make sure that most Unicode general category 'C' types
1548    # aren't.
1549    $printable[$i] = $print->contains($i);
1550
1551    # But the characters in this range were removed in v2.0 and replaced by
1552    # different ones later.  Modern fonts will be for the replacement
1553    # characters, so suppress printing them.
1554    if (($v_version lt v2.0
1555         || ($compare_versions && $compare_versions lt v2.0))
1556        && (   $i >= $FIRST_REMOVED_HANGUL_SYLLABLE
1557            && $i <= $FINAL_REMOVED_HANGUL_SYLLABLE))
1558    {
1559        $printable[$i] = 0;
1560    }
1561
1562    $annotate_char_type[$i] = $perl_charname->type_of($i) || 0;
1563
1564    # Only these two regular types are treated specially for annotations
1565    # purposes
1566    $annotate_char_type[$i] = 0 if $annotate_char_type[$i] != $CP_IN_NAME
1567                                && $annotate_char_type[$i] != $HANGUL_SYLLABLE;
1568
1569    # Give a generic name to all code points that don't have a real name.
1570    # We output ranges, if applicable, for these.  Also calculate the end
1571    # point of the range.
1572    my $end;
1573    if (! $viacode[$i]) {
1574        if ($i > $MAX_UNICODE_CODEPOINT) {
1575            $viacode[$i] = 'Above-Unicode';
1576            $annotate_char_type[$i] = $ABOVE_UNICODE_TYPE;
1577            $printable[$i] = 0;
1578            $end = $MAX_WORKING_CODEPOINT;
1579        }
1580        elsif ($gc-> table('Private_use')->contains($i)) {
1581            $viacode[$i] = 'Private Use';
1582            $annotate_char_type[$i] = $PRIVATE_USE_TYPE;
1583            $printable[$i] = 0;
1584            $end = $gc->table('Private_Use')->containing_range($i)->end;
1585        }
1586        elsif ($NChar->contains($i)) {
1587            $viacode[$i] = 'Noncharacter';
1588            $annotate_char_type[$i] = $NONCHARACTER_TYPE;
1589            $printable[$i] = 0;
1590            $end = $NChar->containing_range($i)->end;
1591        }
1592        elsif ($gc-> table('Control')->contains($i)) {
1593            my $name_ref = property_ref('Name_Alias');
1594            $name_ref = property_ref('Unicode_1_Name') if ! defined $name_ref;
1595            $viacode[$i] = (defined $name_ref)
1596                           ? $name_ref->value_of($i)
1597                           : 'Control';
1598            $annotate_char_type[$i] = $CONTROL_TYPE;
1599            $printable[$i] = 0;
1600        }
1601        elsif ($gc-> table('Unassigned')->contains($i)) {
1602            $annotate_char_type[$i] = $UNASSIGNED_TYPE;
1603            $printable[$i] = 0;
1604            $viacode[$i] = 'Unassigned';
1605
1606            if (defined $block) { # No blocks in earliest releases
1607                $viacode[$i] .= ', block=' . $block-> value_of($i);
1608                $end = $gc-> table('Unassigned')->containing_range($i)->end;
1609
1610                # Because we name the unassigned by the blocks they are in, it
1611                # can't go past the end of that block, and it also can't go
1612                # past the unassigned range it is in.  The special table makes
1613                # sure that the non-characters, which are unassigned, are
1614                # separated out.
1615                $end = min($block->containing_range($i)->end,
1616                           $unassigned_sans_noncharacters->
1617                                                    containing_range($i)->end);
1618            }
1619            else {
1620                $end = $i + 1;
1621                while ($unassigned_sans_noncharacters->contains($end)) {
1622                    $end++;
1623                }
1624                $end--;
1625            }
1626        }
1627        elsif ($perl->table('_Perl_Surrogate')->contains($i)) {
1628            $viacode[$i] = 'Surrogate';
1629            $annotate_char_type[$i] = $SURROGATE_TYPE;
1630            $printable[$i] = 0;
1631            $end = $gc->table('Surrogate')->containing_range($i)->end;
1632        }
1633        else {
1634            Carp::my_carp_bug("Can't figure out how to annotate "
1635                              . sprintf("U+%04X", $i)
1636                              . ".  Proceeding anyway.");
1637            $viacode[$i] = 'UNKNOWN';
1638            $annotate_char_type[$i] = $UNKNOWN_TYPE;
1639            $printable[$i] = 0;
1640        }
1641    }
1642
1643    # Here, has a name, but if it's one in which the code point number is
1644    # appended to the name, do that.
1645    elsif ($annotate_char_type[$i] == $CP_IN_NAME) {
1646        $viacode[$i] .= sprintf("-%04X", $i);
1647
1648        my $limit = $perl_charname->containing_range($i)->end;
1649        if (defined $age) {
1650            # Do all these as groups of the same age, instead of individually,
1651            # because their names are so meaningless, and there are typically
1652            # large quantities of them.
1653            $end = $i + 1;
1654            while ($end <= $limit && $age->value_of($end) == $age[$i]) {
1655                $end++;
1656            }
1657            $end--;
1658        }
1659        else {
1660            $end = $limit;
1661        }
1662    }
1663
1664    # And here, has a name, but if it's a hangul syllable one, replace it with
1665    # the correct name from the Unicode algorithm
1666    elsif ($annotate_char_type[$i] == $HANGUL_SYLLABLE) {
1667        use integer;
1668        my $SIndex = $i - $SBase;
1669        my $L = $LBase + $SIndex / $NCount;
1670        my $V = $VBase + ($SIndex % $NCount) / $TCount;
1671        my $T = $TBase + $SIndex % $TCount;
1672        $viacode[$i] = "HANGUL SYLLABLE $Jamo{$L}$Jamo{$V}";
1673        $viacode[$i] .= $Jamo{$T} if $T != $TBase;
1674        $end = $perl_charname->containing_range($i)->end;
1675    }
1676
1677    return if ! defined wantarray;
1678    return $i if ! defined $end;    # If not a range, return the input
1679
1680    # Save this whole range so can find the end point quickly
1681    $annotate_ranges->add_map($i, $end, $end);
1682
1683    return $end;
1684}
1685
1686sub max($a, $b) {
1687    return $a >= $b ? $a : $b;
1688}
1689
1690sub min($a, $b) {
1691    return $a <= $b ? $a : $b;
1692}
1693
1694sub clarify_number ($number) {
1695    # This returns the input number with underscores inserted every 3 digits
1696    # in large (5 digits or more) numbers.  Input must be entirely digits, not
1697    # checked.
1698
1699    my $pos = length($number) - 3;
1700    return $number if $pos <= 1;
1701    while ($pos > 0) {
1702        substr($number, $pos, 0) = '_';
1703        $pos -= 3;
1704    }
1705    return $number;
1706}
1707
1708sub clarify_code_point_count ($number) {
1709    # This is like clarify_number(), but the input is assumed to be a count of
1710    # code points, rather than a generic number.
1711
1712    my $append = "";
1713
1714    if ($number > $MAX_UNICODE_CODEPOINTS) {
1715        $number -= ($MAX_WORKING_CODEPOINTS - $MAX_UNICODE_CODEPOINTS);
1716        return "All above-Unicode code points" if $number == 0;
1717        $append = " + all above-Unicode code points";
1718    }
1719    return clarify_number($number) . $append;
1720}
1721
1722package Carp;
1723
1724# These routines give a uniform treatment of messages in this program.  They
1725# are placed in the Carp package to cause the stack trace to not include them,
1726# although an alternative would be to use another package and set @CARP_NOT
1727# for it.
1728
1729our $Verbose = 1 if main::DEBUG;  # Useful info when debugging
1730
1731# This is a work-around suggested by Nicholas Clark to fix a problem with Carp
1732# and overload trying to load Scalar:Util under miniperl.  See
1733# http://www.xray.mpe.mpg.de/mailing-lists/perl5-porters/2009-11/msg01057.html
1734undef $overload::VERSION;
1735
1736sub my_carp($message="", $nofold=0) {
1737
1738    if ($message) {
1739        $message = main::join_lines($message);
1740        $message =~ s/^$0: *//;     # Remove initial program name
1741        $message =~ s/[.;,]+$//;    # Remove certain ending punctuation
1742        $message = "\n$0: $message;";
1743
1744        # Fold the message with program name, semi-colon end punctuation
1745        # (which looks good with the message that carp appends to it), and a
1746        # hanging indent for continuation lines.
1747        $message = main::simple_fold($message, "", 4) unless $nofold;
1748        $message =~ s/\n$//;        # Remove the trailing nl so what carp
1749                                    # appends is to the same line
1750    }
1751
1752    return $message if defined wantarray;   # If a caller just wants the msg
1753
1754    carp $message;
1755    return;
1756}
1757
1758sub my_carp_bug($message="") {
1759    # This is called when it is clear that the problem is caused by a bug in
1760    # this program.
1761    $message =~ s/^$0: *//;
1762    $message = my_carp("Bug in $0.  Please report it by running perlbug or if that is unavailable, by sending email to perbug\@perl.org:\n$message");
1763    carp $message;
1764    return;
1765}
1766
1767sub carp_too_few_args($args_ref, $count) {
1768    my_carp_bug("Need at least $count arguments to "
1769        . (caller 1)[3]
1770        . ".  Instead got: '"
1771        . join ', ', @$args_ref
1772        . "'.  No action taken.");
1773    return;
1774}
1775
1776sub carp_extra_args($args_ref) {
1777    unless (ref $args_ref) {
1778        my_carp_bug("Argument to 'carp_extra_args' ($args_ref) must be a ref.  Not checking arguments.");
1779        return;
1780    }
1781    my ($package, $file, $line) = caller;
1782    my $subroutine = (caller 1)[3];
1783
1784    my $list;
1785    if (ref $args_ref eq 'HASH') {
1786        foreach my $key (keys %$args_ref) {
1787            $args_ref->{$key} = $UNDEF unless defined $args_ref->{$key};
1788        }
1789        $list = join ', ', each %{$args_ref};
1790    }
1791    elsif (ref $args_ref eq 'ARRAY') {
1792        foreach my $arg (@$args_ref) {
1793            $arg = $UNDEF unless defined $arg;
1794        }
1795        $list = join ', ', @$args_ref;
1796    }
1797    else {
1798        my_carp_bug("Can't cope with ref "
1799                . ref($args_ref)
1800                . " . argument to 'carp_extra_args'.  Not checking arguments.");
1801        return;
1802    }
1803
1804    my_carp_bug("Unrecognized parameters in options: '$list' to $subroutine.  Skipped.");
1805    return;
1806}
1807
1808package main;
1809
1810{ # Closure
1811
1812    # This program uses the inside-out method for objects, as recommended in
1813    # "Perl Best Practices".  (This is the best solution still, since this has
1814    # to run under miniperl.)  This closure aids in generating those.  There
1815    # are two routines.  setup_package() is called once per package to set
1816    # things up, and then set_access() is called for each hash representing a
1817    # field in the object.  These routines arrange for the object to be
1818    # properly destroyed when no longer used, and for standard accessor
1819    # functions to be generated.  If you need more complex accessors, just
1820    # write your own and leave those accesses out of the call to set_access().
1821    # More details below.
1822
1823    my %constructor_fields; # fields that are to be used in constructors; see
1824                            # below
1825
1826    # The values of this hash will be the package names as keys to other
1827    # hashes containing the name of each field in the package as keys, and
1828    # references to their respective hashes as values.
1829    my %package_fields;
1830
1831    sub setup_package {
1832        # Sets up the package, creating standard DESTROY and dump methods
1833        # (unless already defined).  The dump method is used in debugging by
1834        # simple_dumper().
1835        # The optional parameters are:
1836        #   a)  a reference to a hash, that gets populated by later
1837        #       set_access() calls with one of the accesses being
1838        #       'constructor'.  The caller can then refer to this, but it is
1839        #       not otherwise used by these two routines.
1840        #   b)  a reference to a callback routine to call during destruction
1841        #       of the object, before any fields are actually destroyed
1842
1843        my %args = @_;
1844        my $constructor_ref = delete $args{'Constructor_Fields'};
1845        my $destroy_callback = delete $args{'Destroy_Callback'};
1846        Carp::carp_extra_args(\@_) if main::DEBUG && %args;
1847
1848        my %fields;
1849        my $package = (caller)[0];
1850
1851        $package_fields{$package} = \%fields;
1852        $constructor_fields{$package} = $constructor_ref;
1853
1854        unless ($package->can('DESTROY')) {
1855            my $destroy_name = "${package}::DESTROY";
1856            no strict "refs";
1857
1858            # Use typeglob to give the anonymous subroutine the name we want
1859            *$destroy_name = sub {
1860                my $self = shift;
1861                my $addr = do { no overloading; pack 'J', $self; };
1862
1863                $self->$destroy_callback if $destroy_callback;
1864                foreach my $field (keys %{$package_fields{$package}}) {
1865                    #print STDERR __LINE__, ": Destroying ", ref $self, " ", sprintf("%04X", $addr), ": ", $field, "\n";
1866                    delete $package_fields{$package}{$field}{$addr};
1867                }
1868                return;
1869            }
1870        }
1871
1872        unless ($package->can('dump')) {
1873            my $dump_name = "${package}::dump";
1874            no strict "refs";
1875            *$dump_name = sub {
1876                my $self = shift;
1877                return dump_inside_out($self, $package_fields{$package}, @_);
1878            }
1879        }
1880        return;
1881    }
1882
1883    sub set_access($name, $field, @accessors) {
1884        # Arrange for the input field to be garbage collected when no longer
1885        # needed.  Also, creates standard accessor functions for the field
1886        # based on the optional parameters-- none if none of these parameters:
1887        #   'addable'    creates an 'add_NAME()' accessor function.
1888        #   'readable' or 'readable_array'   creates a 'NAME()' accessor
1889        #                function.
1890        #   'settable'   creates a 'set_NAME()' accessor function.
1891        #   'constructor' doesn't create an accessor function, but adds the
1892        #                field to the hash that was previously passed to
1893        #                setup_package();
1894        # Any of the accesses can be abbreviated down, so that 'a', 'ad',
1895        # 'add' etc. all mean 'addable'.
1896        # The read accessor function will work on both array and scalar
1897        # values.  If another accessor in the parameter list is 'a', the read
1898        # access assumes an array.  You can also force it to be array access
1899        # by specifying 'readable_array' instead of 'readable'
1900        #
1901        # A sort-of 'protected' access can be set-up by preceding the addable,
1902        # readable or settable with some initial portion of 'protected_' (but,
1903        # the underscore is required), like 'p_a', 'pro_set', etc.  The
1904        # "protection" is only by convention.  All that happens is that the
1905        # accessor functions' names begin with an underscore.  So instead of
1906        # calling set_foo, the call is _set_foo.  (Real protection could be
1907        # accomplished by having a new subroutine, end_package, called at the
1908        # end of each package, and then storing the __LINE__ ranges and
1909        # checking them on every accessor.  But that is way overkill.)
1910
1911        # We create anonymous subroutines as the accessors and then use
1912        # typeglobs to assign them to the proper package and name
1913
1914        # $name 	Name of the field
1915        # $field 	Reference to the inside-out hash containing the
1916		# 			field
1917
1918        my $package = (caller)[0];
1919
1920        if (! exists $package_fields{$package}) {
1921            croak "$0: Must call 'setup_package' before 'set_access'";
1922        }
1923
1924        # Stash the field so DESTROY can get it.
1925        $package_fields{$package}{$name} = $field;
1926
1927        # Remaining arguments are the accessors.  For each...
1928        foreach my $access (@accessors) {
1929            my $access = lc $access;
1930
1931            my $protected = "";
1932
1933            # Match the input as far as it goes.
1934            if ($access =~ /^(p[^_]*)_/) {
1935                $protected = $1;
1936                if (substr('protected_', 0, length $protected)
1937                    eq $protected)
1938                {
1939
1940                    # Add 1 for the underscore not included in $protected
1941                    $access = substr($access, length($protected) + 1);
1942                    $protected = '_';
1943                }
1944                else {
1945                    $protected = "";
1946                }
1947            }
1948
1949            if (substr('addable', 0, length $access) eq $access) {
1950                my $subname = "${package}::${protected}add_$name";
1951                no strict "refs";
1952
1953                # add_ accessor.  Don't add if already there, which we
1954                # determine using 'eq' for scalars and '==' otherwise.
1955                *$subname = sub ($self, $value) {
1956                    use strict "refs";
1957                    my $addr = do { no overloading; pack 'J', $self; };
1958                    if (ref $value) {
1959                        return if grep { $value == $_ } @{$field->{$addr}};
1960                    }
1961                    else {
1962                        return if grep { $value eq $_ } @{$field->{$addr}};
1963                    }
1964                    push @{$field->{$addr}}, $value;
1965                    return;
1966                }
1967            }
1968            elsif (substr('constructor', 0, length $access) eq $access) {
1969                if ($protected) {
1970                    Carp::my_carp_bug("Can't set-up 'protected' constructors")
1971                }
1972                else {
1973                    $constructor_fields{$package}{$name} = $field;
1974                }
1975            }
1976            elsif (substr('readable_array', 0, length $access) eq $access) {
1977
1978                # Here has read access.  If one of the other parameters for
1979                # access is array, or this one specifies array (by being more
1980                # than just 'readable_'), then create a subroutine that
1981                # assumes the data is an array.  Otherwise just a scalar
1982                my $subname = "${package}::${protected}$name";
1983                if (grep { /^a/i } @_
1984                    or length($access) > length('readable_'))
1985                {
1986                    no strict "refs";
1987                    *$subname = sub ($_addr) {
1988                        use strict "refs";
1989                        my $addr = do { no overloading; pack 'J', $_addr; };
1990                        if (ref $field->{$addr} ne 'ARRAY') {
1991                            my $type = ref $field->{$addr};
1992                            $type = 'scalar' unless $type;
1993                            Carp::my_carp_bug("Trying to read $name as an array when it is a $type.  Big problems.");
1994                            return;
1995                        }
1996                        return scalar @{$field->{$addr}} unless wantarray;
1997
1998                        # Make a copy; had problems with caller modifying the
1999                        # original otherwise
2000                        my @return = @{$field->{$addr}};
2001                        return @return;
2002                    }
2003                }
2004                else {
2005
2006                    # Here not an array value, a simpler function.
2007                    no strict "refs";
2008                    *$subname = sub ($addr) {
2009                        use strict "refs";
2010                        no overloading;
2011                        return $field->{pack 'J', $addr};
2012                    }
2013                }
2014            }
2015            elsif (substr('settable', 0, length $access) eq $access) {
2016                my $subname = "${package}::${protected}set_$name";
2017                no strict "refs";
2018                *$subname = sub ($self, $value) {
2019                    use strict "refs";
2020                    # $self is $_[0]; $value is $_[1]
2021                    no overloading;
2022                    $field->{pack 'J', $self} = $value;
2023                    return;
2024                }
2025            }
2026            else {
2027                Carp::my_carp_bug("Unknown accessor type $access.  No accessor set.");
2028            }
2029        }
2030        return;
2031    }
2032}
2033
2034package Input_file;
2035
2036# All input files use this object, which stores various attributes about them,
2037# and provides for convenient, uniform handling.  The run method wraps the
2038# processing.  It handles all the bookkeeping of opening, reading, and closing
2039# the file, returning only significant input lines.
2040#
2041# Each object gets a handler which processes the body of the file, and is
2042# called by run().  All character property files must use the generic,
2043# default handler, which has code scrubbed to handle things you might not
2044# expect, including automatic EBCDIC handling.  For files that don't deal with
2045# mapping code points to a property value, such as test files,
2046# PropertyAliases, PropValueAliases, and named sequences, you can override the
2047# handler to be a custom one.  Such a handler should basically be a
2048# while(next_line()) {...} loop.
2049#
2050# You can also set up handlers to
2051#   0) call during object construction time, after everything else is done
2052#   1) call before the first line is read, for pre processing
2053#   2) call to adjust each line of the input before the main handler gets
2054#      them.  This can be automatically generated, if appropriately simple
2055#      enough, by specifying a Properties parameter in the constructor.
2056#   3) call upon EOF before the main handler exits its loop
2057#   4) call at the end, for post processing
2058#
2059# $_ is used to store the input line, and is to be filtered by the
2060# each_line_handler()s.  So, if the format of the line is not in the desired
2061# format for the main handler, these are used to do that adjusting.  They can
2062# be stacked (by enclosing them in an [ anonymous array ] in the constructor,
2063# so the $_ output of one is used as the input to the next.  The EOF handler
2064# is also stackable, but none of the others are, but could easily be changed
2065# to be so.
2066#
2067# Some properties are used by the Perl core but aren't defined until later
2068# Unicode releases.  The perl interpreter would have problems working when
2069# compiled with an earlier Unicode version that doesn't have them, so we need
2070# to define them somehow for those releases.  The 'Early' constructor
2071# parameter can be used to automatically handle this.  It is essentially
2072# ignored if the Unicode version being compiled has a data file for this
2073# property.  Either code to execute or a file to read can be specified.
2074# Details are at the %early definition.
2075#
2076# Most of the handlers can call insert_lines() or insert_adjusted_lines()
2077# which insert the parameters as lines to be processed before the next input
2078# file line is read.  This allows the EOF handler(s) to flush buffers, for
2079# example.  The difference between the two routines is that the lines inserted
2080# by insert_lines() are subjected to the each_line_handler()s.  (So if you
2081# called it from such a handler, you would get infinite recursion without some
2082# mechanism to prevent that.)  Lines inserted by insert_adjusted_lines() go
2083# directly to the main handler without any adjustments.  If the
2084# post-processing handler calls any of these, there will be no effect.  Some
2085# error checking for these conditions could be added, but it hasn't been done.
2086#
2087# carp_bad_line() should be called to warn of bad input lines, which clears $_
2088# to prevent further processing of the line.  This routine will output the
2089# message as a warning once, and then keep a count of the lines that have the
2090# same message, and output that count at the end of the file's processing.
2091# This keeps the number of messages down to a manageable amount.
2092#
2093# get_missings() should be called to retrieve any @missing input lines.
2094# Messages will be raised if this isn't done if the options aren't to ignore
2095# missings.
2096
2097sub trace { return main::trace(@_); }
2098
2099{ # Closure
2100    # Keep track of fields that are to be put into the constructor.
2101    my %constructor_fields;
2102
2103    main::setup_package(Constructor_Fields => \%constructor_fields);
2104
2105    my %file; # Input file name, required
2106    main::set_access('file', \%file, qw{ c r });
2107
2108    my %first_released; # Unicode version file was first released in, required
2109    main::set_access('first_released', \%first_released, qw{ c r });
2110
2111    my %handler;    # Subroutine to process the input file, defaults to
2112                    # 'process_generic_property_file'
2113    main::set_access('handler', \%handler, qw{ c });
2114
2115    my %property;
2116    # name of property this file is for.  defaults to none, meaning not
2117    # applicable, or is otherwise determinable, for example, from each line.
2118    main::set_access('property', \%property, qw{ c r });
2119
2120    my %optional;
2121    # This is either an unsigned number, or a list of property names.  In the
2122    # former case, if it is non-zero, it means the file is optional, so if the
2123    # file is absent, no warning about that is output.  In the latter case, it
2124    # is a list of properties that the file (exclusively) defines.  If the
2125    # file is present, tables for those properties will be produced; if
2126    # absent, none will, even if they are listed elsewhere (namely
2127    # PropertyAliases.txt and PropValueAliases.txt) as being in this release,
2128    # and no warnings will be raised about them not being available.  (And no
2129    # warning about the file itself will be raised.)
2130    main::set_access('optional', \%optional, qw{ c readable_array } );
2131
2132    my %non_skip;
2133    # This is used for debugging, to skip processing of all but a few input
2134    # files.  Add 'non_skip => 1' to the constructor for those files you want
2135    # processed when you set the $debug_skip global.
2136    main::set_access('non_skip', \%non_skip, 'c');
2137
2138    my %skip;
2139    # This is used to skip processing of this input file (semi-) permanently.
2140    # The value should be the reason the file is being skipped.  It is used
2141    # for files that we aren't planning to process anytime soon, but want to
2142    # allow to be in the directory and be checked for their names not
2143    # conflicting with any other files on a DOS 8.3 name filesystem, but to
2144    # not otherwise be processed, and to not raise a warning about not being
2145    # handled.  In the constructor call, any value that evaluates to a numeric
2146    # 0 or undef means don't skip.  Any other value is a string giving the
2147    # reason it is being skipped, and this will appear in generated pod.
2148    # However, an empty string reason will suppress the pod entry.
2149    # Internally, calls that evaluate to numeric 0 are changed into undef to
2150    # distinguish them from an empty string call.
2151    main::set_access('skip', \%skip, 'c', 'r');
2152
2153    my %each_line_handler;
2154    # list of subroutines to look at and filter each non-comment line in the
2155    # file.  defaults to none.  The subroutines are called in order, each is
2156    # to adjust $_ for the next one, and the final one adjusts it for
2157    # 'handler'
2158    main::set_access('each_line_handler', \%each_line_handler, 'c');
2159
2160    my %retain_trailing_comments;
2161    # This is used to not discard the comments that end data lines.  This
2162    # would be used only for files with non-typical syntax, and most code here
2163    # assumes that comments have been stripped, so special handlers would have
2164    # to be written.  It is assumed that the code will use these in
2165    # single-quoted contexts, and so any "'" marks in the comment will be
2166    # prefixed by a backslash.
2167    main::set_access('retain_trailing_comments', \%retain_trailing_comments, 'c');
2168
2169    my %properties; # Optional ordered list of the properties that occur in each
2170    # meaningful line of the input file.  If present, an appropriate
2171    # each_line_handler() is automatically generated and pushed onto the stack
2172    # of such handlers.  This is useful when a file contains multiple
2173    # properties per line, but no other special considerations are necessary.
2174    # The special value "<ignored>" means to discard the corresponding input
2175    # field.
2176    # Any @missing lines in the file should also match this syntax; no such
2177    # files exist as of 6.3.  But if it happens in a future release, the code
2178    # could be expanded to properly parse them.
2179    main::set_access('properties', \%properties, qw{ c r });
2180
2181    my %has_missings_defaults;
2182    # ? Are there lines in the file giving default values for code points
2183    # missing from it?.  Defaults to NO_DEFAULTS.  Otherwise NOT_IGNORED is
2184    # the norm, but IGNORED means it has such lines, but the handler doesn't
2185    # use them.  Having these three states allows us to catch changes to the
2186    # UCD that this program should track.  XXX This could be expanded to
2187    # specify the syntax for such lines, like %properties above.
2188    main::set_access('has_missings_defaults',
2189                                        \%has_missings_defaults, qw{ c r });
2190
2191    my %construction_time_handler;
2192    # Subroutine to call at the end of the new method.  If undef, no such
2193    # handler is called.
2194    main::set_access('construction_time_handler',
2195                                        \%construction_time_handler, qw{ c });
2196
2197    my %pre_handler;
2198    # Subroutine to call before doing anything else in the file.  If undef, no
2199    # such handler is called.
2200    main::set_access('pre_handler', \%pre_handler, qw{ c });
2201
2202    my %eof_handler;
2203    # Subroutines to call upon getting an EOF on the input file, but before
2204    # that is returned to the main handler.  This is to allow buffers to be
2205    # flushed.  The handler is expected to call insert_lines() or
2206    # insert_adjusted() with the buffered material
2207    main::set_access('eof_handler', \%eof_handler, qw{ c });
2208
2209    my %post_handler;
2210    # Subroutine to call after all the lines of the file are read in and
2211    # processed.  If undef, no such handler is called.  Note that this cannot
2212    # add lines to be processed; instead use eof_handler
2213    main::set_access('post_handler', \%post_handler, qw{ c });
2214
2215    my %progress_message;
2216    # Message to print to display progress in lieu of the standard one
2217    main::set_access('progress_message', \%progress_message, qw{ c });
2218
2219    my %handle;
2220    # cache open file handle, internal.  Is undef if file hasn't been
2221    # processed at all, empty if has;
2222    main::set_access('handle', \%handle);
2223
2224    my %added_lines;
2225    # cache of lines added virtually to the file, internal
2226    main::set_access('added_lines', \%added_lines);
2227
2228    my %remapped_lines;
2229    # cache of lines added virtually to the file, internal
2230    main::set_access('remapped_lines', \%remapped_lines);
2231
2232    my %errors;
2233    # cache of errors found, internal
2234    main::set_access('errors', \%errors);
2235
2236    my %missings;
2237    # storage of '@missing' defaults lines
2238    main::set_access('missings', \%missings);
2239
2240    my %early;
2241    # Used for properties that must be defined (for Perl's purposes) on
2242    # versions of Unicode earlier than Unicode itself defines them.  The
2243    # parameter is an array (it would be better to be a hash, but not worth
2244    # bothering about due to its rare use).
2245    #
2246    # The first element is either a code reference to call when in a release
2247    # earlier than the Unicode file is available in, or it is an alternate
2248    # file to use instead of the non-existent one.  This file must have been
2249    # plunked down in the same directory as mktables.  Should you be compiling
2250    # on a release that needs such a file, mktables will abort the
2251    # compilation, and tell you where to get the necessary file(s), and what
2252    # name(s) to use to store them as.
2253    # In the case of specifying an alternate file, the array must contain two
2254    # further elements:
2255    #
2256    # [1] is the name of the property that will be generated by this file.
2257    # The class automatically takes the input file and excludes any code
2258    # points in it that were not assigned in the Unicode version being
2259    # compiled.  It then uses this result to define the property in the given
2260    # version.  Since the property doesn't actually exist in the Unicode
2261    # version being compiled, this should be a name accessible only by core
2262    # perl.  If it is the same name as the regular property, the constructor
2263    # will mark the output table as a $PLACEHOLDER so that it doesn't actually
2264    # get output, and so will be unusable by non-core code.  Otherwise it gets
2265    # marked as $INTERNAL_ONLY.
2266    #
2267    # [2] is a property value to assign (only when compiling Unicode 1.1.5) to
2268    # the Hangul syllables in that release (which were ripped out in version
2269    # 2) for the given property .  (Hence it is ignored except when compiling
2270    # version 1.  You only get one value that applies to all of them, which
2271    # may not be the actual reality, but probably nobody cares anyway for
2272    # these obsolete characters.)
2273    #
2274    # [3] if present is the default value for the property to assign for code
2275    # points not given in the input.  If not present, the default from the
2276    # normal property is used
2277    #
2278    # [-1] If there is an extra final element that is the string 'ONLY_EARLY'.
2279    # it means to not add the name in [1] as an alias to the property name
2280    # used for these.  Normally, when compiling Unicode versions that don't
2281    # invoke the early handling, the name is added as a synonym.
2282    #
2283    # Not all files can be handled in the above way, and so the code ref
2284    # alternative is available.  It can do whatever it needs to.  The other
2285    # array elements are optional in this case, and the code is free to use or
2286    # ignore them if they are present.
2287    #
2288    # Internally, the constructor unshifts a 0 or 1 onto this array to
2289    # indicate if an early alternative is actually being used or not.  This
2290    # makes for easier testing later on.
2291    main::set_access('early', \%early, 'c');
2292
2293    my %only_early;
2294    main::set_access('only_early', \%only_early, 'c');
2295
2296    my %required_even_in_debug_skip;
2297    # debug_skip is used to speed up compilation during debugging by skipping
2298    # processing files that are not needed for the task at hand.  However,
2299    # some files pretty much can never be skipped, and this is used to specify
2300    # that this is one of them.  In order to skip this file, the call to the
2301    # constructor must be edited to comment out this parameter.
2302    main::set_access('required_even_in_debug_skip',
2303                     \%required_even_in_debug_skip, 'c');
2304
2305    my %withdrawn;
2306    # Some files get removed from the Unicode DB.  This is a version object
2307    # giving the first release without this file.
2308    main::set_access('withdrawn', \%withdrawn, 'c');
2309
2310    my %ucd;
2311    # Some files are not actually part of the Unicode Character Database.
2312    # These typically have a different way of indicating their version
2313    main::set_access('ucd', \%ucd, 'c');
2314
2315    my %in_this_release;
2316    # Calculated value from %first_released and %withdrawn.  Are we compiling
2317    # a Unicode release which includes this file?
2318    main::set_access('in_this_release', \%in_this_release);
2319
2320    sub _next_line;
2321    sub _next_line_with_remapped_range;
2322
2323    sub new {
2324        my $class = shift;
2325
2326        my $self = bless \do{ my $anonymous_scalar }, $class;
2327        my $addr = do { no overloading; pack 'J', $self; };
2328
2329        # Set defaults
2330        $handler{$addr} = \&main::process_generic_property_file;
2331        $retain_trailing_comments{$addr} = 0;
2332        $non_skip{$addr} = 0;
2333        $skip{$addr} = undef;
2334        $has_missings_defaults{$addr} = $NO_DEFAULTS;
2335        $handle{$addr} = undef;
2336        $added_lines{$addr} = [ ];
2337        $remapped_lines{$addr} = [ ];
2338        $each_line_handler{$addr} = [ ];
2339        $eof_handler{$addr} = [ ];
2340        $errors{$addr} = { };
2341        $missings{$addr} = [ ];
2342        $early{$addr} = [ ];
2343        $optional{$addr} = [ ];
2344        $ucd{$addr} = 1;
2345
2346        # Two positional parameters.
2347        return Carp::carp_too_few_args(\@_, 2) if main::DEBUG && @_ < 2;
2348        $file{$addr} = main::internal_file_to_platform(shift);
2349        $first_released{$addr} = shift;
2350
2351        # The rest of the arguments are key => value pairs
2352        # %constructor_fields has been set up earlier to list all possible
2353        # ones.  Either set or push, depending on how the default has been set
2354        # up just above.
2355        my %args = @_;
2356        foreach my $key (keys %args) {
2357            my $argument = $args{$key};
2358
2359            # Note that the fields are the lower case of the constructor keys
2360            my $hash = $constructor_fields{lc $key};
2361            if (! defined $hash) {
2362                Carp::my_carp_bug("Unrecognized parameters '$key => $argument' to new() for $self.  Skipped");
2363                next;
2364            }
2365            if (ref $hash->{$addr} eq 'ARRAY') {
2366                if (ref $argument eq 'ARRAY') {
2367                    foreach my $argument (@{$argument}) {
2368                        next if ! defined $argument;
2369                        push @{$hash->{$addr}}, $argument;
2370                    }
2371                }
2372                else {
2373                    push @{$hash->{$addr}}, $argument if defined $argument;
2374                }
2375            }
2376            else {
2377                $hash->{$addr} = $argument;
2378            }
2379            delete $args{$key};
2380        };
2381
2382        $non_skip{$addr} = 1 if $required_even_in_debug_skip{$addr};
2383
2384        # Convert 0 (meaning don't skip) to undef
2385        undef $skip{$addr} unless $skip{$addr};
2386
2387        # Handle the case where this file is optional
2388        my $pod_message_for_non_existent_optional = "";
2389        if ($optional{$addr}->@*) {
2390
2391            # First element is the pod message
2392            $pod_message_for_non_existent_optional
2393                                                = shift $optional{$addr}->@*;
2394            # Convert a 0 'Optional' argument to an empty list to make later
2395            # code more concise.
2396            if (   $optional{$addr}->@*
2397                && $optional{$addr}->@* == 1
2398                && $optional{$addr}[0] ne ""
2399                && $optional{$addr}[0] !~ /\D/
2400                && $optional{$addr}[0] == 0)
2401            {
2402                $optional{$addr} = [ ];
2403            }
2404            else {  # But if the only element doesn't evaluate to 0, make sure
2405                    # that this file is indeed considered optional below.
2406                unshift $optional{$addr}->@*, 1;
2407            }
2408        }
2409
2410        my $progress;
2411        my $function_instead_of_file = 0;
2412
2413        if ($early{$addr}->@* && $early{$addr}[-1] eq 'ONLY_EARLY') {
2414            $only_early{$addr} = 1;
2415            pop $early{$addr}->@*;
2416        }
2417
2418        # If we are compiling a Unicode release earlier than the file became
2419        # available, the constructor may have supplied a substitute
2420        if ($first_released{$addr} gt $v_version && $early{$addr}->@*) {
2421
2422            # Yes, we have a substitute, that we will use; mark it so
2423            unshift $early{$addr}->@*, 1;
2424
2425            # See the definition of %early for what the array elements mean.
2426            # Note that we have just unshifted onto the array, so the numbers
2427            # below are +1 of those in the %early description.
2428            # If we have a property this defines, create a table and default
2429            # map for it now (at essentially compile time), so that it will be
2430            # available for the whole of run time.  (We will want to add this
2431            # name as an alias when we are using the official property name;
2432            # but this must be deferred until run(), because at construction
2433            # time the official names have yet to be defined.)
2434            if ($early{$addr}[2]) {
2435                my $fate = ($property{$addr}
2436                            && $property{$addr} eq $early{$addr}[2])
2437                          ? $PLACEHOLDER
2438                          : $INTERNAL_ONLY;
2439                my $prop_object = Property->new($early{$addr}[2],
2440                                                Fate => $fate,
2441                                                Perl_Extension => 1,
2442                                                );
2443
2444                # If not specified by the constructor, use the default mapping
2445                # for the regular property for this substitute one.
2446                if ($early{$addr}[4]) {
2447                    $prop_object->set_default_map($early{$addr}[4]);
2448                }
2449                elsif (    defined $property{$addr}
2450                       &&  defined $default_mapping{$property{$addr}})
2451                {
2452                    $prop_object
2453                        ->set_default_map($default_mapping{$property{$addr}});
2454                }
2455            }
2456
2457            if (ref $early{$addr}[1] eq 'CODE') {
2458                $function_instead_of_file = 1;
2459
2460                # If the first element of the array is a code ref, the others
2461                # are optional.
2462                $handler{$addr} = $early{$addr}[1];
2463                $property{$addr} = $early{$addr}[2]
2464                                                if defined $early{$addr}[2];
2465                $progress = "substitute $file{$addr}";
2466
2467                undef $file{$addr};
2468            }
2469            else {  # Specifying a substitute file
2470
2471                if (! main::file_exists($early{$addr}[1])) {
2472
2473                    # If we don't see the substitute file, generate an error
2474                    # message giving the needed things, and add it to the list
2475                    # of such to output before actual processing happens
2476                    # (hence the user finds out all of them in one run).
2477                    # Instead of creating a general method for NameAliases,
2478                    # hard-code it here, as there is unlikely to ever be a
2479                    # second one which needs special handling.
2480                    my $string_version = ($file{$addr} eq "NameAliases.txt")
2481                                    ? 'at least 6.1 (the later, the better)'
2482                                    : sprintf "%vd", $first_released{$addr};
2483                    push @missing_early_files, <<END;
2484'$file{$addr}' version $string_version should be copied to '$early{$addr}[1]'.
2485END
2486                    ;
2487                    return;
2488                }
2489                $progress = $early{$addr}[1];
2490                $progress .= ", substituting for $file{$addr}" if $file{$addr};
2491                $file{$addr} = $early{$addr}[1];
2492                $property{$addr} = $early{$addr}[2];
2493
2494                # Ignore code points not in the version being compiled
2495                push $each_line_handler{$addr}->@*, \&_exclude_unassigned;
2496
2497                if (   $v_version lt v2.0        # Hanguls in this release ...
2498                    && defined $early{$addr}[3]) # ... need special treatment
2499                {
2500                    push $eof_handler{$addr}->@*, \&_fixup_obsolete_hanguls;
2501                }
2502            }
2503
2504            # And this substitute is valid for all releases.
2505            $first_released{$addr} = v0;
2506        }
2507        else {  # Normal behavior
2508            $progress = $file{$addr};
2509            unshift $early{$addr}->@*, 0; # No substitute
2510        }
2511
2512        my $file = $file{$addr};
2513        $progress_message{$addr} = "Processing $progress"
2514                                            unless $progress_message{$addr};
2515
2516        # A file should be there if it is within the window of versions for
2517        # which Unicode supplies it
2518        if ($withdrawn{$addr} && $withdrawn{$addr} le $v_version) {
2519            $in_this_release{$addr} = 0;
2520            $skip{$addr} = "";
2521        }
2522        else {
2523            $in_this_release{$addr} = $first_released{$addr} le $v_version;
2524
2525            # Check that the file for this object (possibly using a substitute
2526            # for early releases) exists or we have a function alternative
2527            if (   ! $function_instead_of_file
2528                && ! main::file_exists($file))
2529            {
2530                # Here there is nothing available for this release.  This is
2531                # fine if we aren't expecting anything in this release.
2532                if (! $in_this_release{$addr}) {
2533                    $skip{$addr} = "";  # Don't remark since we expected
2534                                        # nothing and got nothing
2535                }
2536                elsif ($optional{$addr}->@*) {
2537
2538                    # Here the file is optional in this release; Use the
2539                    # passed in text to document this case in the pod.
2540                    $skip{$addr} = $pod_message_for_non_existent_optional;
2541                }
2542                elsif (   $in_this_release{$addr}
2543                       && ! defined $skip{$addr}
2544                       && defined $file)
2545                { # Doesn't exist but should.
2546                    $skip{$addr} = "'$file' not found.  Possibly Big problems";
2547                    Carp::my_carp($skip{$addr});
2548                }
2549            }
2550            elsif ($debug_skip && ! defined $skip{$addr} && ! $non_skip{$addr})
2551            {
2552
2553                # The file exists; if not skipped for another reason, and we are
2554                # skipping most everything during debugging builds, use that as
2555                # the skip reason.
2556                $skip{$addr} = '$debug_skip is on'
2557            }
2558        }
2559
2560        if (   ! $debug_skip
2561            && $non_skip{$addr}
2562            && ! $required_even_in_debug_skip{$addr}
2563            && $verbosity)
2564        {
2565            print "Warning: " . __PACKAGE__ . " constructor for $file has useless 'non_skip' in it\n";
2566        }
2567
2568        # Here, we have figured out if we will be skipping this file or not.
2569        # If so, we add any single property it defines to any passed in
2570        # optional property list.  These will be dealt with at run time.
2571        if (defined $skip{$addr}) {
2572            if ($property{$addr}) {
2573                push $optional{$addr}->@*, $property{$addr};
2574            }
2575        } # Otherwise, are going to process the file.
2576        elsif ($property{$addr}) {
2577
2578            # If the file has a property defined in the constructor for it, it
2579            # means that the property is not listed in the file's entries.  So
2580            # add a handler (to the list of line handlers) to insert the
2581            # property name into the lines, to provide a uniform interface to
2582            # the final processing subroutine.
2583            push @{$each_line_handler{$addr}}, \&_insert_property_into_line;
2584        }
2585        elsif ($properties{$addr}) {
2586
2587            # Similarly, there may be more than one property represented on
2588            # each line, with no clue but the constructor input what those
2589            # might be.  Add a handler for each line in the input so that it
2590            # creates a separate input line for each property in those input
2591            # lines, thus making them suitable to handle generically.
2592
2593            push @{$each_line_handler{$addr}},
2594                 sub {
2595                    my $file = shift;
2596                    Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2597                    my @fields = split /\s*;\s*/, $_, -1;
2598
2599                    if (@fields - 1 > @{$properties{$addr}}) {
2600                        $file->carp_bad_line('Extra fields');
2601                        $_ = "";
2602                        return;
2603                    }
2604                    my $range = shift @fields;  # 0th element is always the
2605                                                # range
2606
2607                    # The next fields in the input line correspond
2608                    # respectively to the stored properties.
2609                    for my $i (0 ..  @{$properties{$addr}} - 1) {
2610                        my $property_name = $properties{$addr}[$i];
2611                        next if $property_name eq '<ignored>';
2612                        $file->insert_adjusted_lines(
2613                              "$range; $property_name; $fields[$i]");
2614                    }
2615                    $_ = "";
2616
2617                    return;
2618                };
2619        }
2620
2621        {   # On non-ascii platforms, we use a special pre-handler
2622            no strict;
2623            no warnings 'once';
2624            *next_line = (main::NON_ASCII_PLATFORM)
2625                         ? *_next_line_with_remapped_range
2626                         : *_next_line;
2627        }
2628
2629        &{$construction_time_handler{$addr}}($self)
2630                                        if $construction_time_handler{$addr};
2631
2632        return $self;
2633    }
2634
2635
2636    use overload
2637        fallback => 0,
2638        qw("") => "_operator_stringify",
2639        "." => \&main::_operator_dot,
2640        ".=" => \&main::_operator_dot_equal,
2641    ;
2642
2643    sub _operator_stringify($self, $other="", $reversed=0) {
2644        return __PACKAGE__ . " object for " . $self->file;
2645    }
2646
2647    sub run($self) {
2648        # Process the input object $self.  This opens and closes the file and
2649        # calls all the handlers for it.  Currently,  this can only be called
2650        # once per file, as it destroy's the EOF handlers
2651
2652        # flag to make sure extracted files are processed early
2653        state $seen_non_extracted = 0;
2654
2655        my $addr = do { no overloading; pack 'J', $self; };
2656
2657        my $file = $file{$addr};
2658
2659        if (! $file) {
2660            $handle{$addr} = 'pretend_is_open';
2661        }
2662        else {
2663            if ($seen_non_extracted) {
2664                if ($file =~ /$EXTRACTED/i) # Some platforms may change the
2665                                            # case of the file's name
2666                {
2667                    Carp::my_carp_bug(main::join_lines(<<END
2668$file should be processed just after the 'Prop...Alias' files, and before
2669anything not in the $EXTRACTED_DIR directory.  Proceeding, but the results may
2670have subtle problems
2671END
2672                    ));
2673                }
2674            }
2675            elsif ($EXTRACTED_DIR
2676
2677                    # We only do this check for generic property files
2678                    && $handler{$addr} == \&main::process_generic_property_file
2679
2680                    && $file !~ /$EXTRACTED/i)
2681            {
2682                # We don't set this (by the 'if' above) if we have no
2683                # extracted directory, so if running on an early version,
2684                # this test won't work.  Not worth worrying about.
2685                $seen_non_extracted = 1;
2686            }
2687
2688            # Mark the file as having being processed, and warn if it
2689            # isn't a file we are expecting.  As we process the files,
2690            # they are deleted from the hash, so any that remain at the
2691            # end of the program are files that we didn't process.
2692            my $fkey = File::Spec->rel2abs($file);
2693            my $exists = delete $potential_files{lc($fkey)};
2694
2695            Carp::my_carp("Was not expecting '$file'.")
2696                                    if $exists && ! $in_this_release{$addr};
2697
2698            # If there is special handling for compiling Unicode releases
2699            # earlier than the first one in which Unicode defines this
2700            # property ...
2701            if ($early{$addr}->@* > 1) {
2702
2703                # Mark as processed any substitute file that would be used in
2704                # such a release
2705                $fkey = File::Spec->rel2abs($early{$addr}[1]);
2706                delete $potential_files{lc($fkey)};
2707
2708                # As commented in the constructor code, when using the
2709                # official property, we still have to allow the publicly
2710                # inaccessible early name so that the core code which uses it
2711                # will work regardless.
2712                if (   ! $only_early{$addr}
2713                    && ! $early{$addr}[0]
2714                    && $early{$addr}->@* > 2)
2715                {
2716                    my $early_property_name = $early{$addr}[2];
2717                    if ($property{$addr} ne $early_property_name) {
2718                        main::property_ref($property{$addr})
2719                                            ->add_alias($early_property_name);
2720                    }
2721                }
2722            }
2723
2724            # We may be skipping this file ...
2725            if (defined $skip{$addr}) {
2726
2727                # If the file isn't supposed to be in this release, there is
2728                # nothing to do
2729                if ($in_this_release{$addr}) {
2730
2731                    # But otherwise, we may print a message
2732                    if ($debug_skip) {
2733                        print STDERR "Skipping input file '$file'",
2734                                     " because '$skip{$addr}'\n";
2735                    }
2736
2737                    # And add it to the list of skipped files, which is later
2738                    # used to make the pod
2739                    $skipped_files{$file} = $skip{$addr};
2740
2741                    # The 'optional' list contains properties that are also to
2742                    # be skipped along with the file.  (There may also be
2743                    # digits which are just placeholders to make sure it isn't
2744                    # an empty list
2745                    foreach my $property ($optional{$addr}->@*) {
2746                        next unless $property =~ /\D/;
2747                        my $prop_object = main::property_ref($property);
2748                        next unless defined $prop_object;
2749                        $prop_object->set_fate($SUPPRESSED, $skip{$addr});
2750                    }
2751                }
2752
2753                return;
2754            }
2755
2756            # Here, we are going to process the file.  Open it, converting the
2757            # slashes used in this program into the proper form for the OS
2758            my $file_handle;
2759            if (not open $file_handle, "<", $file) {
2760                Carp::my_carp("Can't open $file.  Skipping: $!");
2761                return;
2762            }
2763            $handle{$addr} = $file_handle; # Cache the open file handle
2764
2765            # If possible, make sure that the file is the correct version.
2766            # (This data isn't available on early Unicode releases or in
2767            # UnicodeData.txt.)  We don't do this check if we are using a
2768            # substitute file instead of the official one (though the code
2769            # could be extended to do so).
2770            if ($in_this_release{$addr}
2771                && ! $early{$addr}[0]
2772                && lc($file) ne 'unicodedata.txt')
2773            {
2774                my $this_version;
2775
2776                if ($file !~ /^Unihan/i) {
2777
2778                    # The non-Unihan files started getting version numbers in
2779                    # 3.2, but some files in 4.0 are unchanged from 3.2, and
2780                    # marked as 3.2.  4.0.1 is the first version where there
2781                    # are no files marked as being from less than 4.0, though
2782                    # some are marked as 4.0.  In versions after that, the
2783                    # numbers are correct.
2784                    if ($v_version ge v4.0.1) {
2785                        $_ = <$file_handle>;    # The version number is in the
2786                                                # very first line if it is a
2787                                                # UCD file; otherwise, it
2788                                                # might be
2789                        goto valid_version if $_ =~ / - $string_version \. /x;
2790                        chomp;
2791                        if ($ucd{$addr}) {
2792                            $_ =~ s/^#\s*//;
2793
2794                            # 4.0.1 had some valid files that weren't updated.
2795                            goto valid_version
2796                                    if $v_version eq v4.0.1 && $_ =~ /4\.0\.0/;
2797                            $this_version = $_;
2798                            goto wrong_version;
2799                        }
2800                        else {
2801                            my $BOM = "\x{FEFF}";
2802                            utf8::encode($BOM);
2803                            my $BOM_re = qr/ ^ (?:$BOM)? /x;
2804
2805                            while ($_ =~ s/$BOM_re//) { # BOM; seems to be on
2806                                                        # many lines in some files!!
2807                                $_ = <$file_handle>;
2808                                chomp;
2809                                if ($_ =~ /^# Version: (.*)/) {
2810                                    $this_version = $1;
2811                                    goto valid_version
2812                                        if $this_version eq $string_version;
2813                                    goto valid_version
2814                                        if "$this_version.0" eq $string_version;
2815                                    goto wrong_version;
2816                                }
2817                            }
2818                            goto no_version;
2819                        }
2820                    }
2821                }
2822                elsif ($v_version ge v6.0.0) { # Unihan
2823
2824                    # Unihan files didn't get accurate version numbers until
2825                    # 6.0.  The version is somewhere in the first comment
2826                    # block
2827                    while (<$file_handle>) {
2828                        goto no_version if $_ !~ /^#/;
2829                        chomp;
2830                        $_ =~ s/^#\s*//;
2831                        next if $_ !~ / version: /x;
2832                        goto valid_version if $_ =~ /$string_version/;
2833                        goto wrong_version;
2834                    }
2835                    goto no_version;
2836                }
2837                else {  # Old Unihan; have to assume is valid
2838                    goto valid_version;
2839                }
2840
2841              wrong_version:
2842                die Carp::my_carp("File '$file' is version "
2843                                . "'$this_version'.  It should be "
2844                                . "version $string_version");
2845              no_version:
2846                Carp::my_carp_bug("Could not find the expected "
2847                                . "version info in file '$file'");
2848            }
2849        }
2850
2851      valid_version:
2852        print "$progress_message{$addr}\n" if $verbosity >= $PROGRESS;
2853
2854        # Call any special handler for before the file.
2855        &{$pre_handler{$addr}}($self) if $pre_handler{$addr};
2856
2857        # Then the main handler
2858        &{$handler{$addr}}($self);
2859
2860        # Then any special post-file handler.
2861        &{$post_handler{$addr}}($self) if $post_handler{$addr};
2862
2863        # If any errors have been accumulated, output the counts (as the first
2864        # error message in each class was output when it was encountered).
2865        if ($errors{$addr}) {
2866            my $total = 0;
2867            my $types = 0;
2868            foreach my $error (keys %{$errors{$addr}}) {
2869                $total += $errors{$addr}->{$error};
2870                delete $errors{$addr}->{$error};
2871                $types++;
2872            }
2873            if ($total > 1) {
2874                my $message
2875                        = "A total of $total lines had errors in $file.  ";
2876
2877                $message .= ($types == 1)
2878                            ? '(Only the first one was displayed.)'
2879                            : '(Only the first of each type was displayed.)';
2880                Carp::my_carp($message);
2881            }
2882        }
2883
2884        if (@{$missings{$addr}}) {
2885            Carp::my_carp_bug("Handler for $file didn't look at all the \@missing lines.  Generated tables likely are wrong");
2886        }
2887
2888        # If a real file handle, close it.
2889        close $handle{$addr} or Carp::my_carp("Can't close $file: $!") if
2890                                                        ref $handle{$addr};
2891        $handle{$addr} = "";   # Uses empty to indicate that has already seen
2892                               # the file, as opposed to undef
2893        return;
2894    }
2895
2896    sub _next_line($self) {
2897        # Sets $_ to be the next logical input line, if any.  Returns non-zero
2898        # if such a line exists.  'logical' means that any lines that have
2899        # been added via insert_lines() will be returned in $_ before the file
2900        # is read again.
2901
2902        my $addr = do { no overloading; pack 'J', $self; };
2903
2904        # Here the file is open (or if the handle is not a ref, is an open
2905        # 'virtual' file).  Get the next line; any inserted lines get priority
2906        # over the file itself.
2907        my $adjusted;
2908
2909        LINE:
2910        while (1) { # Loop until find non-comment, non-empty line
2911            #local $to_trace = 1 if main::DEBUG;
2912            my $inserted_ref = shift @{$added_lines{$addr}};
2913            if (defined $inserted_ref) {
2914                ($adjusted, $_) = @{$inserted_ref};
2915                trace $adjusted, $_ if main::DEBUG && $to_trace;
2916                return 1 if $adjusted;
2917            }
2918            else {
2919                last if ! ref $handle{$addr}; # Don't read unless is real file
2920                last if ! defined ($_ = readline $handle{$addr});
2921            }
2922            chomp;
2923            trace $_ if main::DEBUG && $to_trace;
2924
2925            # See if this line is the comment line that defines what property
2926            # value that code points that are not listed in the file should
2927            # have.  The format or existence of these lines is not guaranteed
2928            # by Unicode since they are comments, but the documentation says
2929            # that this was added for machine-readability, so probably won't
2930            # change.  This works starting in Unicode Version 5.0.  They look
2931            # like:
2932            #
2933            # @missing: 0000..10FFFF; Not_Reordered
2934            # @missing: 0000..10FFFF; Decomposition_Mapping; <code point>
2935            # @missing: 0000..10FFFF; ; NaN
2936            #
2937            # Save the line for a later get_missings() call.
2938            if (/$missing_defaults_prefix/) {
2939                if ($has_missings_defaults{$addr} == $NO_DEFAULTS) {
2940                    $self->carp_bad_line("Unexpected \@missing line.  Assuming no missing entries");
2941                }
2942                elsif ($has_missings_defaults{$addr} == $NOT_IGNORED) {
2943                    my @defaults = split /\s* ; \s*/x, $_;
2944
2945                    # The first field is the @missing, which ends in a
2946                    # semi-colon, so can safely shift.
2947                    shift @defaults;
2948
2949                    # Some of these lines may have empty field placeholders
2950                    # which get in the way.  An example is:
2951                    # @missing: 0000..10FFFF; ; NaN
2952                    # Remove them.  Process starting from the top so the
2953                    # splice doesn't affect things still to be looked at.
2954                    for (my $i = @defaults - 1; $i >= 0; $i--) {
2955                        next if $defaults[$i] ne "";
2956                        splice @defaults, $i, 1;
2957                    }
2958
2959                    # What's left should be just the property (maybe) and the
2960                    # default.  Having only one element means it doesn't have
2961                    # the property.
2962                    my $default;
2963                    my $property;
2964                    if (@defaults >= 1) {
2965                        if (@defaults == 1) {
2966                            $default = $defaults[0];
2967                        }
2968                        else {
2969                            $property = $defaults[0];
2970                            $default = $defaults[1];
2971                        }
2972                    }
2973
2974                    if (@defaults < 1
2975                        || @defaults > 2
2976                        || ($default =~ /^</
2977                            && $default !~ /^<code *point>$/i
2978                            && $default !~ /^<none>$/i
2979                            && $default !~ /^<script>$/i))
2980                    {
2981                        $self->carp_bad_line("Unrecognized \@missing line: $_.  Assuming no missing entries");
2982                    }
2983                    else {
2984
2985                        # If the property is missing from the line, it should
2986                        # be the one for the whole file
2987                        $property = $property{$addr} if ! defined $property;
2988
2989                        # Change <none> to the null string, which is what it
2990                        # really means.  If the default is the code point
2991                        # itself, set it to <code point>, which is what
2992                        # Unicode uses (but sometimes they've forgotten the
2993                        # space)
2994                        if ($default =~ /^<none>$/i) {
2995                            $default = "";
2996                        }
2997                        elsif ($default =~ /^<code *point>$/i) {
2998                            $default = $CODE_POINT;
2999                        }
3000                        elsif ($default =~ /^<script>$/i) {
3001
3002                            # Special case this one.  Currently is from
3003                            # ScriptExtensions.txt, and means for all unlisted
3004                            # code points, use their Script property values.
3005                            # For the code points not listed in that file, the
3006                            # default value is 'Unknown'.
3007                            $default = "Unknown";
3008                        }
3009
3010                        # Store them as a sub-arrays with both components.
3011                        push @{$missings{$addr}}, [ $default, $property ];
3012                    }
3013                }
3014
3015                # There is nothing for the caller to process on this comment
3016                # line.
3017                next;
3018            }
3019
3020            # Unless to keep, remove comments.  If to keep, ignore
3021            # comment-only lines
3022            if ($retain_trailing_comments{$addr}) {
3023                next if / ^ \s* \# /x;
3024
3025                # But escape any single quotes (done in both the comment and
3026                # non-comment portion; this could be a bug someday, but not
3027                # likely)
3028                s/'/\\'/g;
3029            }
3030            else {
3031                s/#.*//;
3032            }
3033
3034            # Remove trailing space, and skip this line if the result is empty
3035            s/\s+$//;
3036            next if /^$/;
3037
3038            # Call any handlers for this line, and skip further processing of
3039            # the line if the handler sets the line to null.
3040            foreach my $sub_ref (@{$each_line_handler{$addr}}) {
3041                &{$sub_ref}($self);
3042                next LINE if /^$/;
3043            }
3044
3045            # Here the line is ok.  return success.
3046            return 1;
3047        } # End of looping through lines.
3048
3049        # If there are EOF handlers, call each (only once) and if it generates
3050        # more lines to process go back in the loop to handle them.
3051        while ($eof_handler{$addr}->@*) {
3052            &{$eof_handler{$addr}[0]}($self);
3053            shift $eof_handler{$addr}->@*;   # Currently only get one shot at it.
3054            goto LINE if $added_lines{$addr};
3055        }
3056
3057        # Return failure -- no more lines.
3058        return 0;
3059
3060    }
3061
3062    sub _next_line_with_remapped_range($self) {
3063        # like _next_line(), but for use on non-ASCII platforms.  It sets $_
3064        # to be the next logical input line, if any.  Returns non-zero if such
3065        # a line exists.  'logical' means that any lines that have been added
3066        # via insert_lines() will be returned in $_ before the file is read
3067        # again.
3068        #
3069        # The difference from _next_line() is that this remaps the Unicode
3070        # code points in the input to those of the native platform.  Each
3071        # input line contains a single code point, or a single contiguous
3072        # range of them  This routine splits each range into its individual
3073        # code points and caches them.  It returns the cached values,
3074        # translated into their native equivalents, one at a time, for each
3075        # call, before reading the next line.  Since native values can only be
3076        # a single byte wide, no translation is needed for code points above
3077        # 0xFF, and ranges that are entirely above that number are not split.
3078        # If an input line contains the range 254-1000, it would be split into
3079        # three elements: 254, 255, and 256-1000.  (The downstream table
3080        # insertion code will sort and coalesce the individual code points
3081        # into appropriate ranges.)
3082
3083        my $addr = do { no overloading; pack 'J', $self; };
3084
3085        while (1) {
3086
3087            # Look in cache before reading the next line.  Return any cached
3088            # value, translated
3089            my $inserted = shift @{$remapped_lines{$addr}};
3090            if (defined $inserted) {
3091                trace $inserted if main::DEBUG && $to_trace;
3092                $_ = $inserted =~ s/^ ( \d+ ) /sprintf("%04X", utf8::unicode_to_native($1))/xer;
3093                trace $_ if main::DEBUG && $to_trace;
3094                return 1;
3095            }
3096
3097            # Get the next line.
3098            return 0 unless _next_line($self);
3099
3100            # If there is a special handler for it, return the line,
3101            # untranslated.  This should happen only for files that are
3102            # special, not being code-point related, such as property names.
3103            return 1 if $handler{$addr}
3104                                    != \&main::process_generic_property_file;
3105
3106            my ($range, $property_name, $map, @remainder)
3107                = split /\s*;\s*/, $_, -1; # -1 => retain trailing null fields
3108
3109            if (@remainder
3110                || ! defined $property_name
3111                || $range !~ /^ ($code_point_re) (?:\.\. ($code_point_re) )? $/x)
3112            {
3113                Carp::my_carp_bug("Unrecognized input line '$_'.  Ignored");
3114            }
3115
3116            my $low = hex $1;
3117            my $high = (defined $2) ? hex $2 : $low;
3118
3119            # If the input maps the range to another code point, remap the
3120            # target if it is between 0 and 255.
3121            my $tail;
3122            if (defined $map) {
3123                $map =~ s/\b 00 ( [0-9A-F]{2} ) \b/sprintf("%04X", utf8::unicode_to_native(hex $1))/gxe;
3124                $tail = "$property_name; $map";
3125                $_ = "$range; $tail";
3126            }
3127            else {
3128                $tail = $property_name;
3129            }
3130
3131            # If entire range is above 255, just return it, unchanged (except
3132            # any mapped-to code point, already changed above)
3133            return 1 if $low > 255;
3134
3135            # Cache an entry for every code point < 255.  For those in the
3136            # range above 255, return a dummy entry for just that portion of
3137            # the range.  Note that this will be out-of-order, but that is not
3138            # a problem.
3139            foreach my $code_point ($low .. $high) {
3140                if ($code_point > 255) {
3141                    $_ = sprintf "%04X..%04X; $tail", $code_point, $high;
3142                    return 1;
3143                }
3144                push @{$remapped_lines{$addr}}, "$code_point; $tail";
3145            }
3146        } # End of looping through lines.
3147
3148        # NOTREACHED
3149    }
3150
3151#   Not currently used, not fully tested.
3152#    sub peek {
3153#        # Non-destructive lookahead one non-adjusted, non-comment, non-blank
3154#        # record.  Not callable from an each_line_handler(), nor does it call
3155#        # an each_line_handler() on the line.
3156#
3157#        my $self = shift;
3158#        my $addr = do { no overloading; pack 'J', $self; };
3159#
3160#        foreach my $inserted_ref (@{$added_lines{$addr}}) {
3161#            my ($adjusted, $line) = @{$inserted_ref};
3162#            next if $adjusted;
3163#
3164#            # Remove comments and trailing space, and return a non-empty
3165#            # resulting line
3166#            $line =~ s/#.*//;
3167#            $line =~ s/\s+$//;
3168#            return $line if $line ne "";
3169#        }
3170#
3171#        return if ! ref $handle{$addr}; # Don't read unless is real file
3172#        while (1) { # Loop until find non-comment, non-empty line
3173#            local $to_trace = 1 if main::DEBUG;
3174#            trace $_ if main::DEBUG && $to_trace;
3175#            return if ! defined (my $line = readline $handle{$addr});
3176#            chomp $line;
3177#            push @{$added_lines{$addr}}, [ 0, $line ];
3178#
3179#            $line =~ s/#.*//;
3180#            $line =~ s/\s+$//;
3181#            return $line if $line ne "";
3182#        }
3183#
3184#        return;
3185#    }
3186
3187
3188    sub insert_lines($self, @lines) {
3189        # Lines can be inserted so that it looks like they were in the input
3190        # file at the place it was when this routine is called.  See also
3191        # insert_adjusted_lines().  Lines inserted via this routine go through
3192        # any each_line_handler()
3193
3194        # Each inserted line is an array, with the first element being 0 to
3195        # indicate that this line hasn't been adjusted, and needs to be
3196        # processed.
3197        no overloading;
3198        push @{$added_lines{pack 'J', $self}}, map { [ 0, $_ ] } @lines;
3199        return;
3200    }
3201
3202    sub insert_adjusted_lines($self, @lines) {
3203        # Lines can be inserted so that it looks like they were in the input
3204        # file at the place it was when this routine is called.  See also
3205        # insert_lines().  Lines inserted via this routine are already fully
3206        # adjusted, ready to be processed; each_line_handler()s handlers will
3207        # not be called.  This means this is not a completely general
3208        # facility, as only the last each_line_handler on the stack should
3209        # call this.  It could be made more general, by passing to each of the
3210        # line_handlers their position on the stack, which they would pass on
3211        # to this routine, and that would replace the boolean first element in
3212        # the anonymous array pushed here, so that the next_line routine could
3213        # use that to call only those handlers whose index is after it on the
3214        # stack.  But this is overkill for what is needed now.
3215
3216        trace $_[0] if main::DEBUG && $to_trace;
3217
3218        # Each inserted line is an array, with the first element being 1 to
3219        # indicate that this line has been adjusted
3220        no overloading;
3221        push @{$added_lines{pack 'J', $self}}, map { [ 1, $_ ] } @lines;
3222        return;
3223    }
3224
3225    sub get_missings($self) {
3226        # Returns the stored up @missings lines' values, and clears the list.
3227        # The values are in an array, consisting of the default in the first
3228        # element, and the property in the 2nd.  However, since these lines
3229        # can be stacked up, the return is an array of all these arrays.
3230
3231        my $addr = do { no overloading; pack 'J', $self; };
3232
3233        # If not accepting a list return, just return the first one.
3234        return shift @{$missings{$addr}} unless wantarray;
3235
3236        my @return = @{$missings{$addr}};
3237        undef @{$missings{$addr}};
3238        return @return;
3239    }
3240
3241    sub _exclude_unassigned($self) {
3242
3243        # Takes the range in $_ and excludes code points that aren't assigned
3244        # in this release
3245
3246        state $skip_inserted_count = 0;
3247
3248        # Ignore recursive calls.
3249        if ($skip_inserted_count) {
3250            $skip_inserted_count--;
3251            return;
3252        }
3253
3254        # Find what code points are assigned in this release
3255        main::calculate_Assigned() if ! defined $Assigned;
3256
3257        my $addr = do { no overloading; pack 'J', $self; };
3258
3259        my ($range, @remainder)
3260            = split /\s*;\s*/, $_, -1; # -1 => retain trailing null fields
3261
3262        # Examine the range.
3263        if ($range =~ /^ ($code_point_re) (?:\.\. ($code_point_re) )? $/x)
3264        {
3265            my $low = hex $1;
3266            my $high = (defined $2) ? hex $2 : $low;
3267
3268            # Split the range into subranges of just those code points in it
3269            # that are assigned.
3270            my @ranges = (Range_List->new(Initialize
3271                              => Range->new($low, $high)) & $Assigned)->ranges;
3272
3273            # Do nothing if nothing in the original range is assigned in this
3274            # release; handle normally if everything is in this release.
3275            if (! @ranges) {
3276                $_ = "";
3277            }
3278            elsif (@ranges != 1) {
3279
3280                # Here, some code points in the original range aren't in this
3281                # release; @ranges gives the ones that are.  Create fake input
3282                # lines for each of the ranges, and set things up so that when
3283                # this routine is called on that fake input, it will do
3284                # nothing.
3285                $skip_inserted_count = @ranges;
3286                my $remainder = join ";", @remainder;
3287                for my $range (@ranges) {
3288                    $self->insert_lines(sprintf("%04X..%04X;%s",
3289                                    $range->start, $range->end, $remainder));
3290                }
3291                $_ = "";    # The original range is now defunct.
3292            }
3293        }
3294
3295        return;
3296    }
3297
3298    sub _fixup_obsolete_hanguls($self) {
3299
3300        # This is called only when compiling Unicode version 1.  All Unicode
3301        # data for subsequent releases assumes that the code points that were
3302        # Hangul syllables in this release only are something else, so if
3303        # using such data, we have to override it
3304
3305        my $addr = do { no overloading; pack 'J', $self; };
3306
3307        my $object = main::property_ref($property{$addr});
3308        $object->add_map($FIRST_REMOVED_HANGUL_SYLLABLE,
3309                         $FINAL_REMOVED_HANGUL_SYLLABLE,
3310                         $early{$addr}[3],  # Passed-in value for these
3311                         Replace => $UNCONDITIONALLY);
3312    }
3313
3314    sub _insert_property_into_line($self) {
3315        # Add a property field to $_, if this file requires it.
3316
3317        my $addr = do { no overloading; pack 'J', $self; };
3318        my $property = $property{$addr};
3319
3320        $_ =~ s/(;|$)/; $property$1/;
3321        return;
3322    }
3323
3324    sub carp_bad_line($self, $message="") {
3325        # Output consistent error messages, using either a generic one, or the
3326        # one given by the optional parameter.  To avoid gazillions of the
3327        # same message in case the syntax of a  file is way off, this routine
3328        # only outputs the first instance of each message, incrementing a
3329        # count so the totals can be output at the end of the file.
3330
3331        my $addr = do { no overloading; pack 'J', $self; };
3332
3333        $message = 'Unexpected line' unless $message;
3334
3335        # No trailing punctuation so as to fit with our addenda.
3336        $message =~ s/[.:;,]$//;
3337
3338        # If haven't seen this exact message before, output it now.  Otherwise
3339        # increment the count of how many times it has occurred
3340        unless ($errors{$addr}->{$message}) {
3341            Carp::my_carp("$message in '$_' in "
3342                            . $file{$addr}
3343                            . " at line $..  Skipping this line;");
3344            $errors{$addr}->{$message} = 1;
3345        }
3346        else {
3347            $errors{$addr}->{$message}++;
3348        }
3349
3350        # Clear the line to prevent any further (meaningful) processing of it.
3351        $_ = "";
3352
3353        return;
3354    }
3355} # End closure
3356
3357package Multi_Default;
3358
3359# Certain properties in early versions of Unicode had more than one possible
3360# default for code points missing from the files.  In these cases, one
3361# default applies to everything left over after all the others are applied,
3362# and for each of the others, there is a description of which class of code
3363# points applies to it.  This object helps implement this by storing the
3364# defaults, and for all but that final default, an eval string that generates
3365# the class that it applies to.
3366
3367use strict;
3368use warnings;
3369
3370use feature 'signatures';
3371no warnings 'experimental::signatures';
3372
3373{   # Closure
3374
3375    main::setup_package();
3376
3377    my %class_defaults;
3378    # The defaults structure for the classes
3379    main::set_access('class_defaults', \%class_defaults);
3380
3381    my %other_default;
3382    # The default that applies to everything left over.
3383    main::set_access('other_default', \%other_default, 'r');
3384
3385
3386    sub new {
3387        # The constructor is called with default => eval pairs, terminated by
3388        # the left-over default. e.g.
3389        # Multi_Default->new(
3390        #        'T' => '$gc->table("Mn") + $gc->table("Cf") - 0x200C
3391        #               -  0x200D',
3392        #        'R' => 'some other expression that evaluates to code points',
3393        #        .
3394        #        .
3395        #        .
3396        #        'U'));
3397        # It is best to leave the final value be the one that matches the
3398        # above-Unicode code points.
3399
3400        my $class = shift;
3401
3402        my $self = bless \do{my $anonymous_scalar}, $class;
3403        my $addr = do { no overloading; pack 'J', $self; };
3404
3405        while (@_ > 1) {
3406            my $default = shift;
3407            my $eval = shift;
3408            $class_defaults{$addr}->{$default} = $eval;
3409        }
3410
3411        $other_default{$addr} = shift;
3412
3413        return $self;
3414    }
3415
3416    sub get_next_defaults($self) {
3417        # Iterates and returns the next class of defaults.
3418
3419        my $addr = do { no overloading; pack 'J', $self; };
3420
3421        return each %{$class_defaults{$addr}};
3422    }
3423}
3424
3425package Alias;
3426
3427# An alias is one of the names that a table goes by.  This class defines them
3428# including some attributes.  Everything is currently setup in the
3429# constructor.
3430
3431use strict;
3432use warnings;
3433
3434use feature 'signatures';
3435no warnings 'experimental::signatures';
3436
3437
3438{   # Closure
3439
3440    main::setup_package();
3441
3442    my %name;
3443    main::set_access('name', \%name, 'r');
3444
3445    my %loose_match;
3446    # Should this name match loosely or not.
3447    main::set_access('loose_match', \%loose_match, 'r');
3448
3449    my %make_re_pod_entry;
3450    # Some aliases should not get their own entries in the re section of the
3451    # pod, because they are covered by a wild-card, and some we want to
3452    # discourage use of.  Binary
3453    main::set_access('make_re_pod_entry', \%make_re_pod_entry, 'r', 's');
3454
3455    my %ucd;
3456    # Is this documented to be accessible via Unicode::UCD
3457    main::set_access('ucd', \%ucd, 'r', 's');
3458
3459    my %status;
3460    # Aliases have a status, like deprecated, or even suppressed (which means
3461    # they don't appear in documentation).  Enum
3462    main::set_access('status', \%status, 'r');
3463
3464    my %ok_as_filename;
3465    # Similarly, some aliases should not be considered as usable ones for
3466    # external use, such as file names, or we don't want documentation to
3467    # recommend them.  Boolean
3468    main::set_access('ok_as_filename', \%ok_as_filename, 'r');
3469
3470    sub new {
3471        my $class = shift;
3472
3473        my $self = bless \do { my $anonymous_scalar }, $class;
3474        my $addr = do { no overloading; pack 'J', $self; };
3475
3476        $name{$addr} = shift;
3477        $loose_match{$addr} = shift;
3478        $make_re_pod_entry{$addr} = shift;
3479        $ok_as_filename{$addr} = shift;
3480        $status{$addr} = shift;
3481        $ucd{$addr} = shift;
3482
3483        Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3484
3485        # Null names are never ok externally
3486        $ok_as_filename{$addr} = 0 if $name{$addr} eq "";
3487
3488        return $self;
3489    }
3490}
3491
3492package Range;
3493
3494# A range is the basic unit for storing code points, and is described in the
3495# comments at the beginning of the program.  Each range has a starting code
3496# point; an ending code point (not less than the starting one); a value
3497# that applies to every code point in between the two end-points, inclusive;
3498# and an enum type that applies to the value.  The type is for the user's
3499# convenience, and has no meaning here, except that a non-zero type is
3500# considered to not obey the normal Unicode rules for having standard forms.
3501#
3502# The same structure is used for both map and match tables, even though in the
3503# latter, the value (and hence type) is irrelevant and could be used as a
3504# comment.  In map tables, the value is what all the code points in the range
3505# map to.  Type 0 values have the standardized version of the value stored as
3506# well, so as to not have to recalculate it a lot.
3507
3508use strict;
3509use warnings;
3510
3511use feature 'signatures';
3512no warnings 'experimental::signatures';
3513
3514sub trace { return main::trace(@_); }
3515
3516{   # Closure
3517
3518    main::setup_package();
3519
3520    my %start;
3521    main::set_access('start', \%start, 'r', 's');
3522
3523    my %end;
3524    main::set_access('end', \%end, 'r', 's');
3525
3526    my %value;
3527    main::set_access('value', \%value, 'r', 's');
3528
3529    my %type;
3530    main::set_access('type', \%type, 'r');
3531
3532    my %standard_form;
3533    # The value in internal standard form.  Defined only if the type is 0.
3534    main::set_access('standard_form', \%standard_form);
3535
3536    # Note that if these fields change, the dump() method should as well
3537
3538    sub new($class, $_addr, $_end, @_args) {
3539        my $self = bless \do { my $anonymous_scalar }, $class;
3540        my $addr = do { no overloading; pack 'J', $self; };
3541
3542        $start{$addr} = $_addr;
3543        $end{$addr}   = $_end;
3544
3545        my %args = @_args;
3546
3547        my $value = delete $args{'Value'};  # Can be 0
3548        $value = "" unless defined $value;
3549        $value{$addr} = $value;
3550
3551        $type{$addr} = delete $args{'Type'} || 0;
3552
3553        Carp::carp_extra_args(\%args) if main::DEBUG && %args;
3554
3555        return $self;
3556    }
3557
3558    use overload
3559        fallback => 0,
3560        qw("") => "_operator_stringify",
3561        "." => \&main::_operator_dot,
3562        ".=" => \&main::_operator_dot_equal,
3563    ;
3564
3565    sub _operator_stringify($self, $other="", $reversed=0) {
3566        my $addr = do { no overloading; pack 'J', $self; };
3567
3568        # Output it like '0041..0065 (value)'
3569        my $return = sprintf("%04X", $start{$addr})
3570                        .  '..'
3571                        . sprintf("%04X", $end{$addr});
3572        my $value = $value{$addr};
3573        my $type = $type{$addr};
3574        $return .= ' (';
3575        $return .= "$value";
3576        $return .= ", Type=$type" if $type != 0;
3577        $return .= ')';
3578
3579        return $return;
3580    }
3581
3582    sub standard_form($self) {
3583        # Calculate the standard form only if needed, and cache the result.
3584        # The standard form is the value itself if the type is special.
3585        # This represents a considerable CPU and memory saving - at the time
3586        # of writing there are 368676 non-special objects, but the standard
3587        # form is only requested for 22047 of them - ie about 6%.
3588
3589        my $addr = do { no overloading; pack 'J', $self; };
3590
3591        return $standard_form{$addr} if defined $standard_form{$addr};
3592
3593        my $value = $value{$addr};
3594        return $value if $type{$addr};
3595        return $standard_form{$addr} = main::standardize($value);
3596    }
3597
3598    sub dump($self, $indent) {
3599        # Human, not machine readable.  For machine readable, comment out this
3600        # entire routine and let the standard one take effect.
3601        my $addr = do { no overloading; pack 'J', $self; };
3602
3603        my $return = $indent
3604                    . sprintf("%04X", $start{$addr})
3605                    . '..'
3606                    . sprintf("%04X", $end{$addr})
3607                    . " '$value{$addr}';";
3608        if (! defined $standard_form{$addr}) {
3609            $return .= "(type=$type{$addr})";
3610        }
3611        elsif ($standard_form{$addr} ne $value{$addr}) {
3612            $return .= "(standard '$standard_form{$addr}')";
3613        }
3614        return $return;
3615    }
3616} # End closure
3617
3618package _Range_List_Base;
3619
3620use strict;
3621use warnings;
3622
3623use feature 'signatures';
3624no warnings 'experimental::signatures';
3625
3626# Base class for range lists.  A range list is simply an ordered list of
3627# ranges, so that the ranges with the lowest starting numbers are first in it.
3628#
3629# When a new range is added that is adjacent to an existing range that has the
3630# same value and type, it merges with it to form a larger range.
3631#
3632# Ranges generally do not overlap, except that there can be multiple entries
3633# of single code point ranges.  This is because of NameAliases.txt.
3634#
3635# In this program, there is a standard value such that if two different
3636# values, have the same standard value, they are considered equivalent.  This
3637# value was chosen so that it gives correct results on Unicode data
3638
3639# There are a number of methods to manipulate range lists, and some operators
3640# are overloaded to handle them.
3641
3642sub trace { return main::trace(@_); }
3643
3644{ # Closure
3645
3646    our $addr;
3647
3648    # Max is initialized to a negative value that isn't adjacent to 0, for
3649    # simpler tests
3650    my $max_init = -2;
3651
3652    main::setup_package();
3653
3654    my %ranges;
3655    # The list of ranges
3656    main::set_access('ranges', \%ranges, 'readable_array');
3657
3658    my %max;
3659    # The highest code point in the list.  This was originally a method, but
3660    # actual measurements said it was used a lot.
3661    main::set_access('max', \%max, 'r');
3662
3663    my %each_range_iterator;
3664    # Iterator position for each_range()
3665    main::set_access('each_range_iterator', \%each_range_iterator);
3666
3667    my %owner_name_of;
3668    # Name of parent this is attached to, if any.  Solely for better error
3669    # messages.
3670    main::set_access('owner_name_of', \%owner_name_of, 'p_r');
3671
3672    my %_search_ranges_cache;
3673    # A cache of the previous result from _search_ranges(), for better
3674    # performance
3675    main::set_access('_search_ranges_cache', \%_search_ranges_cache);
3676
3677    sub new {
3678        my $class = shift;
3679        my %args = @_;
3680
3681        # Optional initialization data for the range list.
3682        my $initialize = delete $args{'Initialize'};
3683
3684        my $self;
3685
3686        # Use _union() to initialize.  _union() returns an object of this
3687        # class, which means that it will call this constructor recursively.
3688        # But it won't have this $initialize parameter so that it won't
3689        # infinitely loop on this.
3690        return _union($class, $initialize, %args) if defined $initialize;
3691
3692        $self = bless \do { my $anonymous_scalar }, $class;
3693        my $addr = do { no overloading; pack 'J', $self; };
3694
3695        # Optional parent object, only for debug info.
3696        $owner_name_of{$addr} = delete $args{'Owner'};
3697        $owner_name_of{$addr} = "" if ! defined $owner_name_of{$addr};
3698
3699        # Stringify, in case it is an object.
3700        $owner_name_of{$addr} = "$owner_name_of{$addr}";
3701
3702        # This is used only for error messages, and so a colon is added
3703        $owner_name_of{$addr} .= ": " if $owner_name_of{$addr} ne "";
3704
3705        Carp::carp_extra_args(\%args) if main::DEBUG && %args;
3706
3707        $max{$addr} = $max_init;
3708
3709        $_search_ranges_cache{$addr} = 0;
3710        $ranges{$addr} = [];
3711
3712        return $self;
3713    }
3714
3715    use overload
3716        fallback => 0,
3717        qw("") => "_operator_stringify",
3718        "." => \&main::_operator_dot,
3719        ".=" => \&main::_operator_dot_equal,
3720    ;
3721
3722    sub _operator_stringify($self, $other="", $reversed=0) {
3723        my $addr = do { no overloading; pack 'J', $self; };
3724
3725        return "Range_List attached to '$owner_name_of{$addr}'"
3726                                                if $owner_name_of{$addr};
3727        return "anonymous Range_List " . \$self;
3728    }
3729
3730    sub _union {
3731        # Returns the union of the input code points.  It can be called as
3732        # either a constructor or a method.  If called as a method, the result
3733        # will be a new() instance of the calling object, containing the union
3734        # of that object with the other parameter's code points;  if called as
3735        # a constructor, the first parameter gives the class that the new object
3736        # should be, and the second parameter gives the code points to go into
3737        # it.
3738        # In either case, there are two parameters looked at by this routine;
3739        # any additional parameters are passed to the new() constructor.
3740        #
3741        # The code points can come in the form of some object that contains
3742        # ranges, and has a conventionally named method to access them; or
3743        # they can be an array of individual code points (as integers); or
3744        # just a single code point.
3745        #
3746        # If they are ranges, this routine doesn't make any effort to preserve
3747        # the range values and types of one input over the other.  Therefore
3748        # this base class should not allow _union to be called from other than
3749        # initialization code, so as to prevent two tables from being added
3750        # together where the range values matter.  The general form of this
3751        # routine therefore belongs in a derived class, but it was moved here
3752        # to avoid duplication of code.  The failure to overload this in this
3753        # class keeps it safe.
3754        #
3755        # It does make the effort during initialization to accept tables with
3756        # multiple values for the same code point, and to preserve the order
3757        # of these.  If there is only one input range or range set, it doesn't
3758        # sort (as it should already be sorted to the desired order), and will
3759        # accept multiple values per code point.  Otherwise it will merge
3760        # multiple values into a single one.
3761
3762        my $self;
3763        my @args;   # Arguments to pass to the constructor
3764
3765        my $class = shift;
3766
3767        # If a method call, will start the union with the object itself, and
3768        # the class of the new object will be the same as self.
3769        if (ref $class) {
3770            $self = $class;
3771            $class = ref $self;
3772            push @args, $self;
3773        }
3774
3775        # Add the other required parameter.
3776        push @args, shift;
3777        # Rest of parameters are passed on to the constructor
3778
3779        # Accumulate all records from both lists.
3780        my @records;
3781        my $input_count = 0;
3782        for my $arg (@args) {
3783            #local $to_trace = 0 if main::DEBUG;
3784            trace "argument = $arg" if main::DEBUG && $to_trace;
3785            if (! defined $arg) {
3786                my $message = "";
3787                if (defined $self) {
3788                    no overloading;
3789                    $message .= $owner_name_of{pack 'J', $self};
3790                }
3791                Carp::my_carp_bug($message . "Undefined argument to _union.  No union done.");
3792                return;
3793            }
3794
3795            $arg = [ $arg ] if ! ref $arg;
3796            my $type = ref $arg;
3797            if ($type eq 'ARRAY') {
3798                foreach my $element (@$arg) {
3799                    push @records, Range->new($element, $element);
3800                    $input_count++;
3801                }
3802            }
3803            elsif ($arg->isa('Range')) {
3804                push @records, $arg;
3805                $input_count++;
3806            }
3807            elsif ($arg->can('ranges')) {
3808                push @records, $arg->ranges;
3809                $input_count++;
3810            }
3811            else {
3812                my $message = "";
3813                if (defined $self) {
3814                    no overloading;
3815                    $message .= $owner_name_of{pack 'J', $self};
3816                }
3817                Carp::my_carp_bug($message . "Cannot take the union of a $type.  No union done.");
3818                return;
3819            }
3820        }
3821
3822        # Sort with the range containing the lowest ordinal first, but if
3823        # two ranges start at the same code point, sort with the bigger range
3824        # of the two first, because it takes fewer cycles.
3825        if ($input_count > 1) {
3826            @records = sort { ($a->start <=> $b->start)
3827                                      or
3828                                    # if b is shorter than a, b->end will be
3829                                    # less than a->end, and we want to select
3830                                    # a, so want to return -1
3831                                    ($b->end <=> $a->end)
3832                                   } @records;
3833        }
3834
3835        my $new = $class->new(@_);
3836
3837        # Fold in records so long as they add new information.
3838        for my $set (@records) {
3839            my $start = $set->start;
3840            my $end   = $set->end;
3841            my $value = $set->value;
3842            my $type  = $set->type;
3843            if ($start > $new->max) {
3844                $new->_add_delete('+', $start, $end, $value, Type => $type);
3845            }
3846            elsif ($end > $new->max) {
3847                $new->_add_delete('+', $new->max +1, $end, $value,
3848                                                                Type => $type);
3849            }
3850            elsif ($input_count == 1) {
3851                # Here, overlaps existing range, but is from a single input,
3852                # so preserve the multiple values from that input.
3853                $new->_add_delete('+', $start, $end, $value, Type => $type,
3854                                                Replace => $MULTIPLE_AFTER);
3855            }
3856        }
3857
3858        return $new;
3859    }
3860
3861    sub range_count($self) {        # Return the number of ranges in the range list
3862        no overloading;
3863        return scalar @{$ranges{pack 'J', $self}};
3864    }
3865
3866    sub min($self) {
3867        # Returns the minimum code point currently in the range list, or if
3868        # the range list is empty, 2 beyond the max possible.  This is a
3869        # method because used so rarely, that not worth saving between calls,
3870        # and having to worry about changing it as ranges are added and
3871        # deleted.
3872
3873        my $addr = do { no overloading; pack 'J', $self; };
3874
3875        # If the range list is empty, return a large value that isn't adjacent
3876        # to any that could be in the range list, for simpler tests
3877        return $MAX_WORKING_CODEPOINT + 2 unless scalar @{$ranges{$addr}};
3878        return $ranges{$addr}->[0]->start;
3879    }
3880
3881    sub contains($self, $codepoint) {
3882        # Boolean: Is argument in the range list?  If so returns $i such that:
3883        #   range[$i]->end < $codepoint <= range[$i+1]->end
3884        # which is one beyond what you want; this is so that the 0th range
3885        # doesn't return false
3886
3887        my $i = $self->_search_ranges($codepoint);
3888        return 0 unless defined $i;
3889
3890        # The search returns $i, such that
3891        #   range[$i-1]->end < $codepoint <= range[$i]->end
3892        # So is in the table if and only iff it is at least the start position
3893        # of range $i.
3894        no overloading;
3895        return 0 if $ranges{pack 'J', $self}->[$i]->start > $codepoint;
3896        return $i + 1;
3897    }
3898
3899    sub containing_range($self, $codepoint) {
3900        # Returns the range object that contains the code point, undef if none
3901        my $i = $self->contains($codepoint);
3902        return unless $i;
3903
3904        # contains() returns 1 beyond where we should look
3905        no overloading;
3906        return $ranges{pack 'J', $self}->[$i-1];
3907    }
3908
3909    sub value_of($self, $codepoint) {
3910        # Returns the value associated with the code point, undef if none
3911        my $range = $self->containing_range($codepoint);
3912        return unless defined $range;
3913
3914        return $range->value;
3915    }
3916
3917    sub type_of($self, $codepoint) {
3918        # Returns the type of the range containing the code point, undef if
3919        # the code point is not in the table
3920        my $range = $self->containing_range($codepoint);
3921        return unless defined $range;
3922
3923        return $range->type;
3924    }
3925
3926    sub _search_ranges($self, $code_point) {
3927        # Find the range in the list which contains a code point, or where it
3928        # should go if were to add it.  That is, it returns $i, such that:
3929        #   range[$i-1]->end < $codepoint <= range[$i]->end
3930        # Returns undef if no such $i is possible (e.g. at end of table), or
3931        # if there is an error.
3932        my $addr = do { no overloading; pack 'J', $self; };
3933
3934        return if $code_point > $max{$addr};
3935        my $r = $ranges{$addr};                # The current list of ranges
3936        my $range_list_size = scalar @$r;
3937        my $i;
3938
3939        use integer;        # want integer division
3940
3941        # Use the cached result as the starting guess for this one, because,
3942        # an experiment on 5.1 showed that 90% of the time the cache was the
3943        # same as the result on the next call (and 7% it was one less).
3944        $i = $_search_ranges_cache{$addr};
3945        $i = 0 if $i >= $range_list_size;   # Reset if no longer valid (prob.
3946                                            # from an intervening deletion
3947        #local $to_trace = 1 if main::DEBUG;
3948        trace "previous \$i is still valid: $i" if main::DEBUG && $to_trace && $code_point <= $r->[$i]->end && ($i == 0 || $r->[$i-1]->end < $code_point);
3949        return $i if $code_point <= $r->[$i]->end
3950                     && ($i == 0 || $r->[$i-1]->end < $code_point);
3951
3952        # Here the cache doesn't yield the correct $i.  Try adding 1.
3953        if ($i < $range_list_size - 1
3954            && $r->[$i]->end < $code_point &&
3955            $code_point <= $r->[$i+1]->end)
3956        {
3957            $i++;
3958            trace "next \$i is correct: $i" if main::DEBUG && $to_trace;
3959            $_search_ranges_cache{$addr} = $i;
3960            return $i;
3961        }
3962
3963        # Here, adding 1 also didn't work.  We do a binary search to
3964        # find the correct position, starting with current $i
3965        my $lower = 0;
3966        my $upper = $range_list_size - 1;
3967        while (1) {
3968            trace "top of loop i=$i:", sprintf("%04X", $r->[$lower]->start), "[$lower] .. ", sprintf("%04X", $r->[$i]->start), "[$i] .. ", sprintf("%04X", $r->[$upper]->start), "[$upper]" if main::DEBUG && $to_trace;
3969
3970            if ($code_point <= $r->[$i]->end) {
3971
3972                # Here we have met the upper constraint.  We can quit if we
3973                # also meet the lower one.
3974                last if $i == 0 || $r->[$i-1]->end < $code_point;
3975
3976                $upper = $i;        # Still too high.
3977
3978            }
3979            else {
3980
3981                # Here, $r[$i]->end < $code_point, so look higher up.
3982                $lower = $i;
3983            }
3984
3985            # Split search domain in half to try again.
3986            my $temp = ($upper + $lower) / 2;
3987
3988            # No point in continuing unless $i changes for next time
3989            # in the loop.
3990            if ($temp == $i) {
3991
3992                # We can't reach the highest element because of the averaging.
3993                # So if one below the upper edge, force it there and try one
3994                # more time.
3995                if ($i == $range_list_size - 2) {
3996
3997                    trace "Forcing to upper edge" if main::DEBUG && $to_trace;
3998                    $i = $range_list_size - 1;
3999
4000                    # Change $lower as well so if fails next time through,
4001                    # taking the average will yield the same $i, and we will
4002                    # quit with the error message just below.
4003                    $lower = $i;
4004                    next;
4005                }
4006                Carp::my_carp_bug("$owner_name_of{$addr}Can't find where the range ought to go.  No action taken.");
4007                return;
4008            }
4009            $i = $temp;
4010        } # End of while loop
4011
4012        if (main::DEBUG && $to_trace) {
4013            trace 'i-1=[', $i-1, ']', $r->[$i-1] if $i;
4014            trace "i=  [ $i ]", $r->[$i];
4015            trace 'i+1=[', $i+1, ']', $r->[$i+1] if $i < $range_list_size - 1;
4016        }
4017
4018        # Here we have found the offset.  Cache it as a starting point for the
4019        # next call.
4020        $_search_ranges_cache{$addr} = $i;
4021        return $i;
4022    }
4023
4024    sub _add_delete {
4025        # Add, replace or delete ranges to or from a list.  The $type
4026        # parameter gives which:
4027        #   '+' => insert or replace a range, returning a list of any changed
4028        #          ranges.
4029        #   '-' => delete a range, returning a list of any deleted ranges.
4030        #
4031        # The next three parameters give respectively the start, end, and
4032        # value associated with the range.  'value' should be null unless the
4033        # operation is '+';
4034        #
4035        # The range list is kept sorted so that the range with the lowest
4036        # starting position is first in the list, and generally, adjacent
4037        # ranges with the same values are merged into a single larger one (see
4038        # exceptions below).
4039        #
4040        # There are more parameters; all are key => value pairs:
4041        #   Type    gives the type of the value.  It is only valid for '+'.
4042        #           All ranges have types; if this parameter is omitted, 0 is
4043        #           assumed.  Ranges with type 0 are assumed to obey the
4044        #           Unicode rules for casing, etc; ranges with other types are
4045        #           not.  Otherwise, the type is arbitrary, for the caller's
4046        #           convenience, and looked at only by this routine to keep
4047        #           adjacent ranges of different types from being merged into
4048        #           a single larger range, and when Replace =>
4049        #           $IF_NOT_EQUIVALENT is specified (see just below).
4050        #   Replace  determines what to do if the range list already contains
4051        #            ranges which coincide with all or portions of the input
4052        #            range.  It is only valid for '+':
4053        #       => $NO            means that the new value is not to replace
4054        #                         any existing ones, but any empty gaps of the
4055        #                         range list coinciding with the input range
4056        #                         will be filled in with the new value.
4057        #       => $UNCONDITIONALLY  means to replace the existing values with
4058        #                         this one unconditionally.  However, if the
4059        #                         new and old values are identical, the
4060        #                         replacement is skipped to save cycles
4061        #       => $IF_NOT_EQUIVALENT means to replace the existing values
4062        #          (the default)  with this one if they are not equivalent.
4063        #                         Ranges are equivalent if their types are the
4064        #                         same, and they are the same string; or if
4065        #                         both are type 0 ranges, if their Unicode
4066        #                         standard forms are identical.  In this last
4067        #                         case, the routine chooses the more "modern"
4068        #                         one to use.  This is because some of the
4069        #                         older files are formatted with values that
4070        #                         are, for example, ALL CAPs, whereas the
4071        #                         derived files have a more modern style,
4072        #                         which looks better.  By looking for this
4073        #                         style when the pre-existing and replacement
4074        #                         standard forms are the same, we can move to
4075        #                         the modern style
4076        #       => $MULTIPLE_BEFORE means that if this range duplicates an
4077        #                         existing one, but has a different value,
4078        #                         don't replace the existing one, but insert
4079        #                         this one so that the same range can occur
4080        #                         multiple times.  They are stored LIFO, so
4081        #                         that the final one inserted is the first one
4082        #                         returned in an ordered search of the table.
4083        #                         If this is an exact duplicate, including the
4084        #                         value, the original will be moved to be
4085        #                         first, before any other duplicate ranges
4086        #                         with different values.
4087        #       => $MULTIPLE_AFTER is like $MULTIPLE_BEFORE, but is stored
4088        #                         FIFO, so that this one is inserted after all
4089        #                         others that currently exist.  If this is an
4090        #                         exact duplicate, including value, of an
4091        #                         existing range, this one is discarded
4092        #                         (leaving the existing one in its original,
4093        #                         higher priority position
4094        #       => $CROAK         Die with an error if is already there
4095        #       => anything else  is the same as => $IF_NOT_EQUIVALENT
4096        #
4097        # "same value" means identical for non-type-0 ranges, and it means
4098        # having the same standard forms for type-0 ranges.
4099
4100        return Carp::carp_too_few_args(\@_, 5) if main::DEBUG && @_ < 5;
4101
4102        my $self = shift;
4103        my $operation = shift;   # '+' for add/replace; '-' for delete;
4104        my $start = shift;
4105        my $end   = shift;
4106        my $value = shift;
4107
4108        my %args = @_;
4109
4110        $value = "" if not defined $value;        # warning: $value can be "0"
4111
4112        my $replace = delete $args{'Replace'};
4113        $replace = $IF_NOT_EQUIVALENT unless defined $replace;
4114
4115        my $type = delete $args{'Type'};
4116        $type = 0 unless defined $type;
4117
4118        Carp::carp_extra_args(\%args) if main::DEBUG && %args;
4119
4120        my $addr = do { no overloading; pack 'J', $self; };
4121
4122        if ($operation ne '+' && $operation ne '-') {
4123            Carp::my_carp_bug("$owner_name_of{$addr}First parameter to _add_delete must be '+' or '-'.  No action taken.");
4124            return;
4125        }
4126        unless (defined $start && defined $end) {
4127            Carp::my_carp_bug("$owner_name_of{$addr}Undefined start and/or end to _add_delete.  No action taken.");
4128            return;
4129        }
4130        unless ($end >= $start) {
4131            Carp::my_carp_bug("$owner_name_of{$addr}End of range (" . sprintf("%04X", $end) . ") must not be before start (" . sprintf("%04X", $start) . ").  No action taken.");
4132            return;
4133        }
4134        #local $to_trace = 1 if main::DEBUG;
4135
4136        if ($operation eq '-') {
4137            if ($replace != $IF_NOT_EQUIVALENT) {
4138                Carp::my_carp_bug("$owner_name_of{$addr}Replace => \$IF_NOT_EQUIVALENT is required when deleting a range from a range list.  Assuming Replace => \$IF_NOT_EQUIVALENT.");
4139                $replace = $IF_NOT_EQUIVALENT;
4140            }
4141            if ($type) {
4142                Carp::my_carp_bug("$owner_name_of{$addr}Type => 0 is required when deleting a range from a range list.  Assuming Type => 0.");
4143                $type = 0;
4144            }
4145            if ($value ne "") {
4146                Carp::my_carp_bug("$owner_name_of{$addr}Value => \"\" is required when deleting a range from a range list.  Assuming Value => \"\".");
4147                $value = "";
4148            }
4149        }
4150
4151        my $r = $ranges{$addr};               # The current list of ranges
4152        my $range_list_size = scalar @$r;     # And its size
4153        my $max = $max{$addr};                # The current high code point in
4154                                              # the list of ranges
4155
4156        # Do a special case requiring fewer machine cycles when the new range
4157        # starts after the current highest point.  The Unicode input data is
4158        # structured so this is common.
4159        if ($start > $max) {
4160
4161            trace "$owner_name_of{$addr} $operation", sprintf("%04X..%04X (%s) type=%d; prev max=%04X", $start, $end, $value, $type, $max) if main::DEBUG && $to_trace;
4162            return if $operation eq '-'; # Deleting a non-existing range is a
4163                                         # no-op
4164
4165            # If the new range doesn't logically extend the current final one
4166            # in the range list, create a new range at the end of the range
4167            # list.  (max cleverly is initialized to a negative number not
4168            # adjacent to 0 if the range list is empty, so even adding a range
4169            # to an empty range list starting at 0 will have this 'if'
4170            # succeed.)
4171            if ($start > $max + 1        # non-adjacent means can't extend.
4172                || @{$r}[-1]->value ne $value # values differ, can't extend.
4173                || @{$r}[-1]->type != $type # types differ, can't extend.
4174            ) {
4175                push @$r, Range->new($start, $end,
4176                                     Value => $value,
4177                                     Type => $type);
4178            }
4179            else {
4180
4181                # Here, the new range starts just after the current highest in
4182                # the range list, and they have the same type and value.
4183                # Extend the existing range to incorporate the new one.
4184                @{$r}[-1]->set_end($end);
4185            }
4186
4187            # This becomes the new maximum.
4188            $max{$addr} = $end;
4189
4190            return;
4191        }
4192        #local $to_trace = 0 if main::DEBUG;
4193
4194        trace "$owner_name_of{$addr} $operation", sprintf("%04X", $start) . '..' . sprintf("%04X", $end) . " ($value) replace=$replace" if main::DEBUG && $to_trace;
4195
4196        # Here, the input range isn't after the whole rest of the range list.
4197        # Most likely 'splice' will be needed.  The rest of the routine finds
4198        # the needed splice parameters, and if necessary, does the splice.
4199        # First, find the offset parameter needed by the splice function for
4200        # the input range.  Note that the input range may span multiple
4201        # existing ones, but we'll worry about that later.  For now, just find
4202        # the beginning.  If the input range is to be inserted starting in a
4203        # position not currently in the range list, it must (obviously) come
4204        # just after the range below it, and just before the range above it.
4205        # Slightly less obviously, it will occupy the position currently
4206        # occupied by the range that is to come after it.  More formally, we
4207        # are looking for the position, $i, in the array of ranges, such that:
4208        #
4209        # r[$i-1]->start <= r[$i-1]->end < $start < r[$i]->start <= r[$i]->end
4210        #
4211        # (The ordered relationships within existing ranges are also shown in
4212        # the equation above).  However, if the start of the input range is
4213        # within an existing range, the splice offset should point to that
4214        # existing range's position in the list; that is $i satisfies a
4215        # somewhat different equation, namely:
4216        #
4217        #r[$i-1]->start <= r[$i-1]->end < r[$i]->start <= $start <= r[$i]->end
4218        #
4219        # More briefly, $start can come before or after r[$i]->start, and at
4220        # this point, we don't know which it will be.  However, these
4221        # two equations share these constraints:
4222        #
4223        #   r[$i-1]->end < $start <= r[$i]->end
4224        #
4225        # And that is good enough to find $i.
4226
4227        my $i = $self->_search_ranges($start);
4228        if (! defined $i) {
4229            Carp::my_carp_bug("Searching $self for range beginning with $start unexpectedly returned undefined.  Operation '$operation' not performed");
4230            return;
4231        }
4232
4233        # The search function returns $i such that:
4234        #
4235        # r[$i-1]->end < $start <= r[$i]->end
4236        #
4237        # That means that $i points to the first range in the range list
4238        # that could possibly be affected by this operation.  We still don't
4239        # know if the start of the input range is within r[$i], or if it
4240        # points to empty space between r[$i-1] and r[$i].
4241        trace "[$i] is the beginning splice point.  Existing range there is ", $r->[$i] if main::DEBUG && $to_trace;
4242
4243        # Special case the insertion of data that is not to replace any
4244        # existing data.
4245        if ($replace == $NO) {  # If $NO, has to be operation '+'
4246            #local $to_trace = 1 if main::DEBUG;
4247            trace "Doesn't replace" if main::DEBUG && $to_trace;
4248
4249            # Here, the new range is to take effect only on those code points
4250            # that aren't already in an existing range.  This can be done by
4251            # looking through the existing range list and finding the gaps in
4252            # the ranges that this new range affects, and then calling this
4253            # function recursively on each of those gaps, leaving untouched
4254            # anything already in the list.  Gather up a list of the changed
4255            # gaps first so that changes to the internal state as new ranges
4256            # are added won't be a problem.
4257            my @gap_list;
4258
4259            # First, if the starting point of the input range is outside an
4260            # existing one, there is a gap from there to the beginning of the
4261            # existing range -- add a span to fill the part that this new
4262            # range occupies
4263            if ($start < $r->[$i]->start) {
4264                push @gap_list, Range->new($start,
4265                                           main::min($end,
4266                                                     $r->[$i]->start - 1),
4267                                           Type => $type);
4268                trace "gap before $r->[$i] [$i], will add", $gap_list[-1] if main::DEBUG && $to_trace;
4269            }
4270
4271            # Then look through the range list for other gaps until we reach
4272            # the highest range affected by the input one.
4273            my $j;
4274            for ($j = $i+1; $j < $range_list_size; $j++) {
4275                trace "j=[$j]", $r->[$j] if main::DEBUG && $to_trace;
4276                last if $end < $r->[$j]->start;
4277
4278                # If there is a gap between when this range starts and the
4279                # previous one ends, add a span to fill it.  Note that just
4280                # because there are two ranges doesn't mean there is a
4281                # non-zero gap between them.  It could be that they have
4282                # different values or types
4283                if ($r->[$j-1]->end + 1 != $r->[$j]->start) {
4284                    push @gap_list,
4285                        Range->new($r->[$j-1]->end + 1,
4286                                   $r->[$j]->start - 1,
4287                                   Type => $type);
4288                    trace "gap between $r->[$j-1] and $r->[$j] [$j], will add: $gap_list[-1]" if main::DEBUG && $to_trace;
4289                }
4290            }
4291
4292            # Here, we have either found an existing range in the range list,
4293            # beyond the area affected by the input one, or we fell off the
4294            # end of the loop because the input range affects the whole rest
4295            # of the range list.  In either case, $j is 1 higher than the
4296            # highest affected range.  If $j == $i, it means that there are no
4297            # affected ranges, that the entire insertion is in the gap between
4298            # r[$i-1], and r[$i], which we already have taken care of before
4299            # the loop.
4300            # On the other hand, if there are affected ranges, it might be
4301            # that there is a gap that needs filling after the final such
4302            # range to the end of the input range
4303            if ($r->[$j-1]->end < $end) {
4304                    push @gap_list, Range->new(main::max($start,
4305                                                         $r->[$j-1]->end + 1),
4306                                               $end,
4307                                               Type => $type);
4308                    trace "gap after $r->[$j-1], will add $gap_list[-1]" if main::DEBUG && $to_trace;
4309            }
4310
4311            # Call recursively to fill in all the gaps.
4312            foreach my $gap (@gap_list) {
4313                $self->_add_delete($operation,
4314                                   $gap->start,
4315                                   $gap->end,
4316                                   $value,
4317                                   Type => $type);
4318            }
4319
4320            return;
4321        }
4322
4323        # Here, we have taken care of the case where $replace is $NO.
4324        # Remember that here, r[$i-1]->end < $start <= r[$i]->end
4325        # If inserting a multiple record, this is where it goes, before the
4326        # first (if any) existing one if inserting LIFO.  (If this is to go
4327        # afterwards, FIFO, we below move the pointer to there.)  These imply
4328        # an insertion, and no change to any existing ranges.  Note that $i
4329        # can be -1 if this new range doesn't actually duplicate any existing,
4330        # and comes at the beginning of the list.
4331        if ($replace == $MULTIPLE_BEFORE || $replace == $MULTIPLE_AFTER) {
4332
4333            if ($start != $end) {
4334                Carp::my_carp_bug("$owner_name_of{$addr}Can't cope with adding a multiple record when the range ($start..$end) contains more than one code point.  No action taken.");
4335                return;
4336            }
4337
4338            # If the new code point is within a current range ...
4339            if ($end >= $r->[$i]->start) {
4340
4341                # Don't add an exact duplicate, as it isn't really a multiple
4342                my $existing_value = $r->[$i]->value;
4343                my $existing_type = $r->[$i]->type;
4344                return if $value eq $existing_value && $type eq $existing_type;
4345
4346                # If the multiple value is part of an existing range, we want
4347                # to split up that range, so that only the single code point
4348                # is affected.  To do this, we first call ourselves
4349                # recursively to delete that code point from the table, having
4350                # preserved its current data above.  Then we call ourselves
4351                # recursively again to add the new multiple, which we know by
4352                # the test just above is different than the current code
4353                # point's value, so it will become a range containing a single
4354                # code point: just itself.  Finally, we add back in the
4355                # pre-existing code point, which will again be a single code
4356                # point range.  Because 'i' likely will have changed as a
4357                # result of these operations, we can't just continue on, but
4358                # do this operation recursively as well.  If we are inserting
4359                # LIFO, the pre-existing code point needs to go after the new
4360                # one, so use MULTIPLE_AFTER; and vice versa.
4361                if ($r->[$i]->start != $r->[$i]->end) {
4362                    $self->_add_delete('-', $start, $end, "");
4363                    $self->_add_delete('+', $start, $end, $value, Type => $type);
4364                    return $self->_add_delete('+',
4365                            $start, $end,
4366                            $existing_value,
4367                            Type => $existing_type,
4368                            Replace => ($replace == $MULTIPLE_BEFORE)
4369                                       ? $MULTIPLE_AFTER
4370                                       : $MULTIPLE_BEFORE);
4371                }
4372            }
4373
4374            # If to place this new record after, move to beyond all existing
4375            # ones; but don't add this one if identical to any of them, as it
4376            # isn't really a multiple.  This leaves the original order, so
4377            # that the current request is ignored.  The reasoning is that the
4378            # previous request that wanted this record to have high priority
4379            # should have precedence.
4380            if ($replace == $MULTIPLE_AFTER) {
4381                while ($i < @$r && $r->[$i]->start == $start) {
4382                    return if $value eq $r->[$i]->value
4383                              && $type eq $r->[$i]->type;
4384                    $i++;
4385                }
4386            }
4387            else {
4388                # If instead we are to place this new record before any
4389                # existing ones, remove any identical ones that come after it.
4390                # This changes the existing order so that the new one is
4391                # first, as is being requested.
4392                for (my $j = $i + 1;
4393                     $j < @$r && $r->[$j]->start == $start;
4394                     $j++)
4395                {
4396                    if ($value eq $r->[$j]->value && $type eq $r->[$j]->type) {
4397                        splice @$r, $j, 1;
4398                        last;   # There should only be one instance, so no
4399                                # need to keep looking
4400                    }
4401                }
4402            }
4403
4404            trace "Adding multiple record at $i with $start..$end, $value" if main::DEBUG && $to_trace;
4405            my @return = splice @$r,
4406                                $i,
4407                                0,
4408                                Range->new($start,
4409                                           $end,
4410                                           Value => $value,
4411                                           Type => $type);
4412            if (main::DEBUG && $to_trace) {
4413                trace "After splice:";
4414                trace 'i-2=[', $i-2, ']', $r->[$i-2] if $i >= 2;
4415                trace 'i-1=[', $i-1, ']', $r->[$i-1] if $i >= 1;
4416                trace "i  =[", $i, "]", $r->[$i] if $i >= 0;
4417                trace 'i+1=[', $i+1, ']', $r->[$i+1] if $i < @$r - 1;
4418                trace 'i+2=[', $i+2, ']', $r->[$i+2] if $i < @$r - 2;
4419                trace 'i+3=[', $i+3, ']', $r->[$i+3] if $i < @$r - 3;
4420            }
4421            return @return;
4422        }
4423
4424        # Here, we have taken care of $NO and $MULTIPLE_foo replaces.  This
4425        # leaves delete, insert, and replace either unconditionally or if not
4426        # equivalent.  $i still points to the first potential affected range.
4427        # Now find the highest range affected, which will determine the length
4428        # parameter to splice.  (The input range can span multiple existing
4429        # ones.)  If this isn't a deletion, while we are looking through the
4430        # range list, see also if this is a replacement rather than a clean
4431        # insertion; that is if it will change the values of at least one
4432        # existing range.  Start off assuming it is an insert, until find it
4433        # isn't.
4434        my $clean_insert = $operation eq '+';
4435        my $j;        # This will point to the highest affected range
4436
4437        # For non-zero types, the standard form is the value itself;
4438        my $standard_form = ($type) ? $value : main::standardize($value);
4439
4440        for ($j = $i; $j < $range_list_size; $j++) {
4441            trace "Looking for highest affected range; the one at $j is ", $r->[$j] if main::DEBUG && $to_trace;
4442
4443            # If find a range that it doesn't overlap into, we can stop
4444            # searching
4445            last if $end < $r->[$j]->start;
4446
4447            # Here, overlaps the range at $j.  If the values don't match,
4448            # and so far we think this is a clean insertion, it becomes a
4449            # non-clean insertion, i.e., a 'change' or 'replace' instead.
4450            if ($clean_insert) {
4451                if ($r->[$j]->standard_form ne $standard_form) {
4452                    $clean_insert = 0;
4453                    if ($replace == $CROAK) {
4454                        main::croak("The range to add "
4455                        . sprintf("%04X", $start)
4456                        . '-'
4457                        . sprintf("%04X", $end)
4458                        . " with value '$value' overlaps an existing range $r->[$j]");
4459                    }
4460                }
4461                else {
4462
4463                    # Here, the two values are essentially the same.  If the
4464                    # two are actually identical, replacing wouldn't change
4465                    # anything so skip it.
4466                    my $pre_existing = $r->[$j]->value;
4467                    if ($pre_existing ne $value) {
4468
4469                        # Here the new and old standardized values are the
4470                        # same, but the non-standardized values aren't.  If
4471                        # replacing unconditionally, then replace
4472                        if( $replace == $UNCONDITIONALLY) {
4473                            $clean_insert = 0;
4474                        }
4475                        else {
4476
4477                            # Here, are replacing conditionally.  Decide to
4478                            # replace or not based on which appears to look
4479                            # the "nicest".  If one is mixed case and the
4480                            # other isn't, choose the mixed case one.
4481                            my $new_mixed = $value =~ /[A-Z]/
4482                                            && $value =~ /[a-z]/;
4483                            my $old_mixed = $pre_existing =~ /[A-Z]/
4484                                            && $pre_existing =~ /[a-z]/;
4485
4486                            if ($old_mixed != $new_mixed) {
4487                                $clean_insert = 0 if $new_mixed;
4488                                if (main::DEBUG && $to_trace) {
4489                                    if ($clean_insert) {
4490                                        trace "Retaining $pre_existing over $value";
4491                                    }
4492                                    else {
4493                                        trace "Replacing $pre_existing with $value";
4494                                    }
4495                                }
4496                            }
4497                            else {
4498
4499                                # Here casing wasn't different between the two.
4500                                # If one has hyphens or underscores and the
4501                                # other doesn't, choose the one with the
4502                                # punctuation.
4503                                my $new_punct = $value =~ /[-_]/;
4504                                my $old_punct = $pre_existing =~ /[-_]/;
4505
4506                                if ($old_punct != $new_punct) {
4507                                    $clean_insert = 0 if $new_punct;
4508                                    if (main::DEBUG && $to_trace) {
4509                                        if ($clean_insert) {
4510                                            trace "Retaining $pre_existing over $value";
4511                                        }
4512                                        else {
4513                                            trace "Replacing $pre_existing with $value";
4514                                        }
4515                                    }
4516                                }   # else existing one is just as "good";
4517                                    # retain it to save cycles.
4518                            }
4519                        }
4520                    }
4521                }
4522            }
4523        } # End of loop looking for highest affected range.
4524
4525        # Here, $j points to one beyond the highest range that this insertion
4526        # affects (hence to beyond the range list if that range is the final
4527        # one in the range list).
4528
4529        # The splice length is all the affected ranges.  Get it before
4530        # subtracting, for efficiency, so we don't have to later add 1.
4531        my $length = $j - $i;
4532
4533        $j--;        # $j now points to the highest affected range.
4534        trace "Final affected range is $j: $r->[$j]" if main::DEBUG && $to_trace;
4535
4536        # Here, have taken care of $NO and $MULTIPLE_foo replaces.
4537        # $j points to the highest affected range.  But it can be < $i or even
4538        # -1.  These happen only if the insertion is entirely in the gap
4539        # between r[$i-1] and r[$i].  Here's why: j < i means that the j loop
4540        # above exited first time through with $end < $r->[$i]->start.  (And
4541        # then we subtracted one from j)  This implies also that $start <
4542        # $r->[$i]->start, but we know from above that $r->[$i-1]->end <
4543        # $start, so the entire input range is in the gap.
4544        if ($j < $i) {
4545
4546            # Here the entire input range is in the gap before $i.
4547
4548            if (main::DEBUG && $to_trace) {
4549                if ($i) {
4550                    trace "Entire range is between $r->[$i-1] and $r->[$i]";
4551                }
4552                else {
4553                    trace "Entire range is before $r->[$i]";
4554                }
4555            }
4556            return if $operation ne '+'; # Deletion of a non-existent range is
4557                                         # a no-op
4558        }
4559        else {
4560
4561            # Here part of the input range is not in the gap before $i.  Thus,
4562            # there is at least one affected one, and $j points to the highest
4563            # such one.
4564
4565            # At this point, here is the situation:
4566            # This is not an insertion of a multiple, nor of tentative ($NO)
4567            # data.
4568            #   $i  points to the first element in the current range list that
4569            #            may be affected by this operation.  In fact, we know
4570            #            that the range at $i is affected because we are in
4571            #            the else branch of this 'if'
4572            #   $j  points to the highest affected range.
4573            # In other words,
4574            #   r[$i-1]->end < $start <= r[$i]->end
4575            # And:
4576            #   r[$i-1]->end < $start <= $end < r[$j+1]->start
4577            #
4578            # Also:
4579            #   $clean_insert is a boolean which is set true if and only if
4580            #        this is a "clean insertion", i.e., not a change nor a
4581            #        deletion (multiple was handled above).
4582
4583            # We now have enough information to decide if this call is a no-op
4584            # or not.  It is a no-op if this is an insertion of already
4585            # existing data.  To be so, it must be contained entirely in one
4586            # range.
4587
4588            if (main::DEBUG && $to_trace && $clean_insert
4589                                         && $start >= $r->[$i]->start
4590                                         && $end   <= $r->[$i]->end)
4591            {
4592                    trace "no-op";
4593            }
4594            return if $clean_insert
4595                      && $start >= $r->[$i]->start
4596                      && $end   <= $r->[$i]->end;
4597        }
4598
4599        # Here, we know that some action will have to be taken.  We have
4600        # calculated the offset and length (though adjustments may be needed)
4601        # for the splice.  Now start constructing the replacement list.
4602        my @replacement;
4603        my $splice_start = $i;
4604
4605        my $extends_below;
4606        my $extends_above;
4607
4608        # See if should extend any adjacent ranges.
4609        if ($operation eq '-') { # Don't extend deletions
4610            $extends_below = $extends_above = 0;
4611        }
4612        else {  # Here, should extend any adjacent ranges.  See if there are
4613                # any.
4614            $extends_below = ($i > 0
4615                            # can't extend unless adjacent
4616                            && $r->[$i-1]->end == $start -1
4617                            # can't extend unless are same standard value
4618                            && $r->[$i-1]->standard_form eq $standard_form
4619                            # can't extend unless share type
4620                            && $r->[$i-1]->type == $type);
4621            $extends_above = ($j+1 < $range_list_size
4622                            && $r->[$j+1]->start == $end +1
4623                            && $r->[$j+1]->standard_form eq $standard_form
4624                            && $r->[$j+1]->type == $type);
4625        }
4626        if ($extends_below && $extends_above) { # Adds to both
4627            $splice_start--;     # start replace at element below
4628            $length += 2;        # will replace on both sides
4629            trace "Extends both below and above ranges" if main::DEBUG && $to_trace;
4630
4631            # The result will fill in any gap, replacing both sides, and
4632            # create one large range.
4633            @replacement = Range->new($r->[$i-1]->start,
4634                                      $r->[$j+1]->end,
4635                                      Value => $value,
4636                                      Type => $type);
4637        }
4638        else {
4639
4640            # Here we know that the result won't just be the conglomeration of
4641            # a new range with both its adjacent neighbors.  But it could
4642            # extend one of them.
4643
4644            if ($extends_below) {
4645
4646                # Here the new element adds to the one below, but not to the
4647                # one above.  If inserting, and only to that one range,  can
4648                # just change its ending to include the new one.
4649                if ($length == 0 && $clean_insert) {
4650                    $r->[$i-1]->set_end($end);
4651                    trace "inserted range extends range to below so it is now $r->[$i-1]" if main::DEBUG && $to_trace;
4652                    return;
4653                }
4654                else {
4655                    trace "Changing inserted range to start at ", sprintf("%04X",  $r->[$i-1]->start), " instead of ", sprintf("%04X", $start) if main::DEBUG && $to_trace;
4656                    $splice_start--;        # start replace at element below
4657                    $length++;              # will replace the element below
4658                    $start = $r->[$i-1]->start;
4659                }
4660            }
4661            elsif ($extends_above) {
4662
4663                # Here the new element adds to the one above, but not below.
4664                # Mirror the code above
4665                if ($length == 0 && $clean_insert) {
4666                    $r->[$j+1]->set_start($start);
4667                    trace "inserted range extends range to above so it is now $r->[$j+1]" if main::DEBUG && $to_trace;
4668                    return;
4669                }
4670                else {
4671                    trace "Changing inserted range to end at ", sprintf("%04X",  $r->[$j+1]->end), " instead of ", sprintf("%04X", $end) if main::DEBUG && $to_trace;
4672                    $length++;        # will replace the element above
4673                    $end = $r->[$j+1]->end;
4674                }
4675            }
4676
4677            trace "Range at $i is $r->[$i]" if main::DEBUG && $to_trace;
4678
4679            # Finally, here we know there will have to be a splice.
4680            # If the change or delete affects only the highest portion of the
4681            # first affected range, the range will have to be split.  The
4682            # splice will remove the whole range, but will replace it by a new
4683            # range containing just the unaffected part.  So, in this case,
4684            # add to the replacement list just this unaffected portion.
4685            if (! $extends_below
4686                && $start > $r->[$i]->start && $start <= $r->[$i]->end)
4687            {
4688                push @replacement,
4689                    Range->new($r->[$i]->start,
4690                               $start - 1,
4691                               Value => $r->[$i]->value,
4692                               Type => $r->[$i]->type);
4693            }
4694
4695            # In the case of an insert or change, but not a delete, we have to
4696            # put in the new stuff;  this comes next.
4697            if ($operation eq '+') {
4698                push @replacement, Range->new($start,
4699                                              $end,
4700                                              Value => $value,
4701                                              Type => $type);
4702            }
4703
4704            trace "Range at $j is $r->[$j]" if main::DEBUG && $to_trace && $j != $i;
4705            #trace "$end >=", $r->[$j]->start, " && $end <", $r->[$j]->end if main::DEBUG && $to_trace;
4706
4707            # And finally, if we're changing or deleting only a portion of the
4708            # highest affected range, it must be split, as the lowest one was.
4709            if (! $extends_above
4710                && $j >= 0  # Remember that j can be -1 if before first
4711                            # current element
4712                && $end >= $r->[$j]->start
4713                && $end < $r->[$j]->end)
4714            {
4715                push @replacement,
4716                    Range->new($end + 1,
4717                               $r->[$j]->end,
4718                               Value => $r->[$j]->value,
4719                               Type => $r->[$j]->type);
4720            }
4721        }
4722
4723        # And do the splice, as calculated above
4724        if (main::DEBUG && $to_trace) {
4725            trace "replacing $length element(s) at $i with ";
4726            foreach my $replacement (@replacement) {
4727                trace "    $replacement";
4728            }
4729            trace "Before splice:";
4730            trace 'i-2=[', $i-2, ']', $r->[$i-2] if $i >= 2;
4731            trace 'i-1=[', $i-1, ']', $r->[$i-1] if $i >= 1;
4732            trace "i  =[", $i, "]", $r->[$i];
4733            trace 'i+1=[', $i+1, ']', $r->[$i+1] if $i < @$r - 1;
4734            trace 'i+2=[', $i+2, ']', $r->[$i+2] if $i < @$r - 2;
4735        }
4736
4737        my @return = splice @$r, $splice_start, $length, @replacement;
4738
4739        if (main::DEBUG && $to_trace) {
4740            trace "After splice:";
4741            trace 'i-2=[', $i-2, ']', $r->[$i-2] if $i >= 2;
4742            trace 'i-1=[', $i-1, ']', $r->[$i-1] if $i >= 1;
4743            trace "i  =[", $i, "]", $r->[$i];
4744            trace 'i+1=[', $i+1, ']', $r->[$i+1] if $i < @$r - 1;
4745            trace 'i+2=[', $i+2, ']', $r->[$i+2] if $i < @$r - 2;
4746            trace "removed ", @return if @return;
4747        }
4748
4749        # An actual deletion could have changed the maximum in the list.
4750        # There was no deletion if the splice didn't return something, but
4751        # otherwise recalculate it.  This is done too rarely to worry about
4752        # performance.
4753        if ($operation eq '-' && @return) {
4754            if (@$r) {
4755                $max{$addr} = $r->[-1]->end;
4756            }
4757            else {  # Now empty
4758                $max{$addr} = $max_init;
4759            }
4760        }
4761        return @return;
4762    }
4763
4764    sub reset_each_range($self) {  # reset the iterator for each_range();
4765        no overloading;
4766        undef $each_range_iterator{pack 'J', $self};
4767        return;
4768    }
4769
4770    sub each_range($self) {
4771        # Iterate over each range in a range list.  Results are undefined if
4772        # the range list is changed during the iteration.
4773        my $addr = do { no overloading; pack 'J', $self; };
4774
4775        return if $self->is_empty;
4776
4777        $each_range_iterator{$addr} = -1
4778                                if ! defined $each_range_iterator{$addr};
4779        $each_range_iterator{$addr}++;
4780        return $ranges{$addr}->[$each_range_iterator{$addr}]
4781                        if $each_range_iterator{$addr} < @{$ranges{$addr}};
4782        undef $each_range_iterator{$addr};
4783        return;
4784    }
4785
4786    sub count($self) {        # Returns count of code points in range list
4787        my $addr = do { no overloading; pack 'J', $self; };
4788
4789        my $count = 0;
4790        foreach my $range (@{$ranges{$addr}}) {
4791            $count += $range->end - $range->start + 1;
4792        }
4793        return $count;
4794    }
4795
4796    sub delete_range($self, $start, $end) {    # Delete a range
4797        return $self->_add_delete('-', $start, $end, "");
4798    }
4799
4800    sub is_empty($self) { # Returns boolean as to if a range list is empty
4801        no overloading;
4802        return scalar @{$ranges{pack 'J', $self}} == 0;
4803    }
4804
4805    sub hash($self) {
4806        # Quickly returns a scalar suitable for separating tables into
4807        # buckets, i.e. it is a hash function of the contents of a table, so
4808        # there are relatively few conflicts.
4809        my $addr = do { no overloading; pack 'J', $self; };
4810
4811        # These are quickly computable.  Return looks like 'min..max;count'
4812        return $self->min . "..$max{$addr};" . scalar @{$ranges{$addr}};
4813    }
4814} # End closure for _Range_List_Base
4815
4816package Range_List;
4817use parent '-norequire', '_Range_List_Base';
4818
4819use warnings;
4820use strict;
4821
4822use feature 'signatures';
4823no warnings 'experimental::signatures';
4824
4825# A Range_List is a range list for match tables; i.e. the range values are
4826# not significant.  Thus a number of operations can be safely added to it,
4827# such as inversion, intersection.  Note that union is also an unsafe
4828# operation when range values are cared about, and that method is in the base
4829# class, not here.  But things are set up so that that method is callable only
4830# during initialization.  Only in this derived class, is there an operation
4831# that combines two tables.  A Range_Map can thus be used to initialize a
4832# Range_List, and its mappings will be in the list, but are not significant to
4833# this class.
4834
4835sub trace { return main::trace(@_); }
4836
4837{ # Closure
4838
4839    use overload
4840        fallback => 0,
4841        '+' => sub { my $self = shift;
4842                    my $other = shift;
4843
4844                    return $self->_union($other)
4845                },
4846        '+=' => sub { my $self = shift;
4847                    my $other = shift;
4848                    my $reversed = shift;
4849
4850                    if ($reversed) {
4851                        Carp::my_carp_bug("Bad news.  Can't cope with '"
4852                        . ref($other)
4853                        . ' += '
4854                        . ref($self)
4855                        . "'.  undef returned.");
4856                        return;
4857                    }
4858
4859                    return $self->_union($other)
4860                },
4861        '&' => sub { my $self = shift;
4862                    my $other = shift;
4863
4864                    return $self->_intersect($other, 0);
4865                },
4866        '&=' => sub { my $self = shift;
4867                    my $other = shift;
4868                    my $reversed = shift;
4869
4870                    if ($reversed) {
4871                        Carp::my_carp_bug("Bad news.  Can't cope with '"
4872                        . ref($other)
4873                        . ' &= '
4874                        . ref($self)
4875                        . "'.  undef returned.");
4876                        return;
4877                    }
4878
4879                    return $self->_intersect($other, 0);
4880                },
4881        '~' => "_invert",
4882        '-' => "_subtract",
4883    ;
4884
4885    sub _invert($self, @) {
4886        # Returns a new Range_List that gives all code points not in $self.
4887        my $new = Range_List->new;
4888
4889        # Go through each range in the table, finding the gaps between them
4890        my $max = -1;   # Set so no gap before range beginning at 0
4891        for my $range ($self->ranges) {
4892            my $start = $range->start;
4893            my $end   = $range->end;
4894
4895            # If there is a gap before this range, the inverse will contain
4896            # that gap.
4897            if ($start > $max + 1) {
4898                $new->add_range($max + 1, $start - 1);
4899            }
4900            $max = $end;
4901        }
4902
4903        # And finally, add the gap from the end of the table to the max
4904        # possible code point
4905        if ($max < $MAX_WORKING_CODEPOINT) {
4906            $new->add_range($max + 1, $MAX_WORKING_CODEPOINT);
4907        }
4908        return $new;
4909    }
4910
4911    sub _subtract($self, $other, $reversed=0) {
4912        # Returns a new Range_List with the argument deleted from it.  The
4913        # argument can be a single code point, a range, or something that has
4914        # a range, with the _range_list() method on it returning them
4915
4916        if ($reversed) {
4917            Carp::my_carp_bug("Bad news.  Can't cope with '"
4918            . ref($other)
4919            . ' - '
4920            . ref($self)
4921            . "'.  undef returned.");
4922            return;
4923        }
4924
4925        my $new = Range_List->new(Initialize => $self);
4926
4927        if (! ref $other) { # Single code point
4928            $new->delete_range($other, $other);
4929        }
4930        elsif ($other->isa('Range')) {
4931            $new->delete_range($other->start, $other->end);
4932        }
4933        elsif ($other->can('_range_list')) {
4934            foreach my $range ($other->_range_list->ranges) {
4935                $new->delete_range($range->start, $range->end);
4936            }
4937        }
4938        else {
4939            Carp::my_carp_bug("Can't cope with a "
4940                        . ref($other)
4941                        . " argument to '-'.  Subtraction ignored."
4942                        );
4943            return $self;
4944        }
4945
4946        return $new;
4947    }
4948
4949    sub _intersect($a_object, $b_object, $check_if_overlapping=0) {
4950        # Returns either a boolean giving whether the two inputs' range lists
4951        # intersect (overlap), or a new Range_List containing the intersection
4952        # of the two lists.  The optional final parameter being true indicates
4953        # to do the check instead of the intersection.
4954
4955        if (! defined $b_object) {
4956            my $message = "";
4957            $message .= $a_object->_owner_name_of if defined $a_object;
4958            Carp::my_carp_bug($message .= "Called with undefined value.  Intersection not done.");
4959            return;
4960        }
4961
4962        # a & b = !(!a | !b), or in our terminology = ~ ( ~a + -b )
4963        # Thus the intersection could be much more simply be written:
4964        #   return ~(~$a_object + ~$b_object);
4965        # But, this is slower, and when taking the inverse of a large
4966        # range_size_1 table, back when such tables were always stored that
4967        # way, it became prohibitively slow, hence the code was changed to the
4968        # below
4969
4970        if ($b_object->isa('Range')) {
4971            $b_object = Range_List->new(Initialize => $b_object,
4972                                        Owner => $a_object->_owner_name_of);
4973        }
4974        $b_object = $b_object->_range_list if $b_object->can('_range_list');
4975
4976        my @a_ranges = $a_object->ranges;
4977        my @b_ranges = $b_object->ranges;
4978
4979        #local $to_trace = 1 if main::DEBUG;
4980        trace "intersecting $a_object with ", scalar @a_ranges, "ranges and $b_object with", scalar @b_ranges, " ranges" if main::DEBUG && $to_trace;
4981
4982        # Start with the first range in each list
4983        my $a_i = 0;
4984        my $range_a = $a_ranges[$a_i];
4985        my $b_i = 0;
4986        my $range_b = $b_ranges[$b_i];
4987
4988        my $new = __PACKAGE__->new(Owner => $a_object->_owner_name_of)
4989                                                if ! $check_if_overlapping;
4990
4991        # If either list is empty, there is no intersection and no overlap
4992        if (! defined $range_a || ! defined $range_b) {
4993            return $check_if_overlapping ? 0 : $new;
4994        }
4995        trace "range_a[$a_i]=$range_a; range_b[$b_i]=$range_b" if main::DEBUG && $to_trace;
4996
4997        # Otherwise, must calculate the intersection/overlap.  Start with the
4998        # very first code point in each list
4999        my $a = $range_a->start;
5000        my $b = $range_b->start;
5001
5002        # Loop through all the ranges of each list; in each iteration, $a and
5003        # $b are the current code points in their respective lists
5004        while (1) {
5005
5006            # If $a and $b are the same code point, ...
5007            if ($a == $b) {
5008
5009                # it means the lists overlap.  If just checking for overlap
5010                # know the answer now,
5011                return 1 if $check_if_overlapping;
5012
5013                # The intersection includes this code point plus anything else
5014                # common to both current ranges.
5015                my $start = $a;
5016                my $end = main::min($range_a->end, $range_b->end);
5017                if (! $check_if_overlapping) {
5018                    trace "adding intersection range ", sprintf("%04X", $start) . ".." . sprintf("%04X", $end) if main::DEBUG && $to_trace;
5019                    $new->add_range($start, $end);
5020                }
5021
5022                # Skip ahead to the end of the current intersect
5023                $a = $b = $end;
5024
5025                # If the current intersect ends at the end of either range (as
5026                # it must for at least one of them), the next possible one
5027                # will be the beginning code point in it's list's next range.
5028                if ($a == $range_a->end) {
5029                    $range_a = $a_ranges[++$a_i];
5030                    last unless defined $range_a;
5031                    $a = $range_a->start;
5032                }
5033                if ($b == $range_b->end) {
5034                    $range_b = $b_ranges[++$b_i];
5035                    last unless defined $range_b;
5036                    $b = $range_b->start;
5037                }
5038
5039                trace "range_a[$a_i]=$range_a; range_b[$b_i]=$range_b" if main::DEBUG && $to_trace;
5040            }
5041            elsif ($a < $b) {
5042
5043                # Not equal, but if the range containing $a encompasses $b,
5044                # change $a to be the middle of the range where it does equal
5045                # $b, so the next iteration will get the intersection
5046                if ($range_a->end >= $b) {
5047                    $a = $b;
5048                }
5049                else {
5050
5051                    # Here, the current range containing $a is entirely below
5052                    # $b.  Go try to find a range that could contain $b.
5053                    $a_i = $a_object->_search_ranges($b);
5054
5055                    # If no range found, quit.
5056                    last unless defined $a_i;
5057
5058                    # The search returns $a_i, such that
5059                    #   range_a[$a_i-1]->end < $b <= range_a[$a_i]->end
5060                    # Set $a to the beginning of this new range, and repeat.
5061                    $range_a = $a_ranges[$a_i];
5062                    $a = $range_a->start;
5063                }
5064            }
5065            else { # Here, $b < $a.
5066
5067                # Mirror image code to the leg just above
5068                if ($range_b->end >= $a) {
5069                    $b = $a;
5070                }
5071                else {
5072                    $b_i = $b_object->_search_ranges($a);
5073                    last unless defined $b_i;
5074                    $range_b = $b_ranges[$b_i];
5075                    $b = $range_b->start;
5076                }
5077            }
5078        } # End of looping through ranges.
5079
5080        # Intersection fully computed, or now know that there is no overlap
5081        return $check_if_overlapping ? 0 : $new;
5082    }
5083
5084    sub overlaps($self, $other) {
5085        # Returns boolean giving whether the two arguments overlap somewhere
5086        return $self->_intersect($other, 1);
5087    }
5088
5089    sub add_range($self, $start, $end) {
5090        # Add a range to the list.
5091        return $self->_add_delete('+', $start, $end, "");
5092    }
5093
5094    sub matches_identically_to($self, $other) {
5095        # Return a boolean as to whether or not two Range_Lists match identical
5096        # sets of code points.
5097        # These are ordered in increasing real time to figure out (at least
5098        # until a patch changes that and doesn't change this)
5099        return 0 if $self->max != $other->max;
5100        return 0 if $self->min != $other->min;
5101        return 0 if $self->range_count != $other->range_count;
5102        return 0 if $self->count != $other->count;
5103
5104        # Here they could be identical because all the tests above passed.
5105        # The loop below is somewhat simpler since we know they have the same
5106        # number of elements.  Compare range by range, until reach the end or
5107        # find something that differs.
5108        my @a_ranges = $self->ranges;
5109        my @b_ranges = $other->ranges;
5110        for my $i (0 .. @a_ranges - 1) {
5111            my $a = $a_ranges[$i];
5112            my $b = $b_ranges[$i];
5113            trace "self $a; other $b" if main::DEBUG && $to_trace;
5114            return 0 if ! defined $b
5115                        || $a->start != $b->start
5116                        || $a->end != $b->end;
5117        }
5118        return 1;
5119    }
5120
5121    sub is_code_point_usable($code, $try_hard) {
5122        # This used only for making the test script.  See if the input
5123        # proposed trial code point is one that Perl will handle.  If second
5124        # parameter is 0, it won't select some code points for various
5125        # reasons, noted below.
5126        return 0 if $code < 0;                # Never use a negative
5127
5128        # shun null.  I'm (khw) not sure why this was done, but NULL would be
5129        # the character very frequently used.
5130        return $try_hard if $code == 0x0000;
5131
5132        # shun non-character code points.
5133        return $try_hard if $code >= 0xFDD0 && $code <= 0xFDEF;
5134        return $try_hard if ($code & 0xFFFE) == 0xFFFE; # includes FFFF
5135
5136        return $try_hard if $code > $MAX_UNICODE_CODEPOINT;   # keep in range
5137        return $try_hard if $code >= 0xD800 && $code <= 0xDFFF; # no surrogate
5138
5139        return 1;
5140    }
5141
5142    sub get_valid_code_point($self) {
5143        # Return a code point that's part of the range list.  Returns nothing
5144        # if the table is empty or we can't find a suitable code point.  This
5145        # used only for making the test script.
5146        my $addr = do { no overloading; pack 'J', $self; };
5147
5148        # On first pass, don't choose less desirable code points; if no good
5149        # one is found, repeat, allowing a less desirable one to be selected.
5150        for my $try_hard (0, 1) {
5151
5152            # Look through all the ranges for a usable code point.
5153            for my $set (reverse $self->ranges) {
5154
5155                # Try the edge cases first, starting with the end point of the
5156                # range.
5157                my $end = $set->end;
5158                return $end if is_code_point_usable($end, $try_hard);
5159                $end = $MAX_UNICODE_CODEPOINT + 1 if $end > $MAX_UNICODE_CODEPOINT;
5160
5161                # End point didn't, work.  Start at the beginning and try
5162                # every one until find one that does work.
5163                for my $trial ($set->start .. $end - 1) {
5164                    return $trial if is_code_point_usable($trial, $try_hard);
5165                }
5166            }
5167        }
5168        return ();  # If none found, give up.
5169    }
5170
5171    sub get_invalid_code_point($self) {
5172        # Return a code point that's not part of the table.  Returns nothing
5173        # if the table covers all code points or a suitable code point can't
5174        # be found.  This used only for making the test script.
5175
5176        # Just find a valid code point of the inverse, if any.
5177        return Range_List->new(Initialize => ~ $self)->get_valid_code_point;
5178    }
5179} # end closure for Range_List
5180
5181package Range_Map;
5182use parent '-norequire', '_Range_List_Base';
5183
5184use strict;
5185use warnings;
5186
5187use feature 'signatures';
5188no warnings 'experimental::signatures';
5189
5190# A Range_Map is a range list in which the range values (called maps) are
5191# significant, and hence shouldn't be manipulated by our other code, which
5192# could be ambiguous or lose things.  For example, in taking the union of two
5193# lists, which share code points, but which have differing values, which one
5194# has precedence in the union?
5195# It turns out that these operations aren't really necessary for map tables,
5196# and so this class was created to make sure they aren't accidentally
5197# applied to them.
5198
5199{ # Closure
5200
5201    sub add_map($self, @add) {
5202        # Add a range containing a mapping value to the list
5203        return $self->_add_delete('+', @add);
5204    }
5205
5206    sub replace_map($self, @list) {
5207        # Replace a range
5208        return $self->_add_delete('+', @list, Replace => $UNCONDITIONALLY);
5209    }
5210
5211    sub add_duplicate {
5212        # Adds entry to a range list which can duplicate an existing entry
5213
5214        my $self = shift;
5215        my $code_point = shift;
5216        my $value = shift;
5217        my %args = @_;
5218        my $replace = delete $args{'Replace'} // $MULTIPLE_BEFORE;
5219        Carp::carp_extra_args(\%args) if main::DEBUG && %args;
5220
5221        return $self->add_map($code_point, $code_point,
5222                                $value, Replace => $replace);
5223    }
5224} # End of closure for package Range_Map
5225
5226package _Base_Table;
5227
5228use strict;
5229use warnings;
5230
5231use feature 'signatures';
5232no warnings 'experimental::signatures';
5233
5234# A table is the basic data structure that gets written out into a file for
5235# use by the Perl core.  This is the abstract base class implementing the
5236# common elements from the derived ones.  A list of the methods to be
5237# furnished by an implementing class is just after the constructor.
5238
5239sub standardize { return main::standardize($_[0]); }
5240sub trace { return main::trace(@_); }
5241
5242{ # Closure
5243
5244    main::setup_package();
5245
5246    my %range_list;
5247    # Object containing the ranges of the table.
5248    main::set_access('range_list', \%range_list, 'p_r', 'p_s');
5249
5250    my %full_name;
5251    # The full table name.
5252    main::set_access('full_name', \%full_name, 'r');
5253
5254    my %name;
5255    # The table name, almost always shorter
5256    main::set_access('name', \%name, 'r');
5257
5258    my %short_name;
5259    # The shortest of all the aliases for this table, with underscores removed
5260    main::set_access('short_name', \%short_name);
5261
5262    my %nominal_short_name_length;
5263    # The length of short_name before removing underscores
5264    main::set_access('nominal_short_name_length',
5265                    \%nominal_short_name_length);
5266
5267    my %complete_name;
5268    # The complete name, including property.
5269    main::set_access('complete_name', \%complete_name, 'r');
5270
5271    my %property;
5272    # Parent property this table is attached to.
5273    main::set_access('property', \%property, 'r');
5274
5275    my %aliases;
5276    # Ordered list of alias objects of the table's name.  The first ones in
5277    # the list are output first in comments
5278    main::set_access('aliases', \%aliases, 'readable_array');
5279
5280    my %comment;
5281    # A comment associated with the table for human readers of the files
5282    main::set_access('comment', \%comment, 's');
5283
5284    my %description;
5285    # A comment giving a short description of the table's meaning for human
5286    # readers of the files.
5287    main::set_access('description', \%description, 'readable_array');
5288
5289    my %note;
5290    # A comment giving a short note about the table for human readers of the
5291    # files.
5292    main::set_access('note', \%note, 'readable_array');
5293
5294    my %fate;
5295    # Enum; there are a number of possibilities for what happens to this
5296    # table: it could be normal, or suppressed, or not for external use.  See
5297    # values at definition for $SUPPRESSED.
5298    main::set_access('fate', \%fate, 'r');
5299
5300    my %find_table_from_alias;
5301    # The parent property passes this pointer to a hash which this class adds
5302    # all its aliases to, so that the parent can quickly take an alias and
5303    # find this table.
5304    main::set_access('find_table_from_alias', \%find_table_from_alias, 'p_r');
5305
5306    my %locked;
5307    # After this table is made equivalent to another one; we shouldn't go
5308    # changing the contents because that could mean it's no longer equivalent
5309    main::set_access('locked', \%locked, 'r');
5310
5311    my %file_path;
5312    # This gives the final path to the file containing the table.  Each
5313    # directory in the path is an element in the array
5314    main::set_access('file_path', \%file_path, 'readable_array');
5315
5316    my %status;
5317    # What is the table's status, normal, $OBSOLETE, etc.  Enum
5318    main::set_access('status', \%status, 'r');
5319
5320    my %status_info;
5321    # A comment about its being obsolete, or whatever non normal status it has
5322    main::set_access('status_info', \%status_info, 'r');
5323
5324    my %caseless_equivalent;
5325    # The table this is equivalent to under /i matching, if any.
5326    main::set_access('caseless_equivalent', \%caseless_equivalent, 'r', 's');
5327
5328    my %range_size_1;
5329    # Is the table to be output with each range only a single code point?
5330    # This is done to avoid breaking existing code that may have come to rely
5331    # on this behavior in previous versions of this program.)
5332    main::set_access('range_size_1', \%range_size_1, 'r', 's');
5333
5334    my %perl_extension;
5335    # A boolean set iff this table is a Perl extension to the Unicode
5336    # standard.
5337    main::set_access('perl_extension', \%perl_extension, 'r');
5338
5339    my %output_range_counts;
5340    # A boolean set iff this table is to have comments written in the
5341    # output file that contain the number of code points in the range.
5342    # The constructor can override the global flag of the same name.
5343    main::set_access('output_range_counts', \%output_range_counts, 'r');
5344
5345    my %write_as_invlist;
5346    # A boolean set iff the output file for this table is to be in the form of
5347    # an inversion list/map.
5348    main::set_access('write_as_invlist', \%write_as_invlist, 'r');
5349
5350    my %format;
5351    # The format of the entries of the table.  This is calculated from the
5352    # data in the table (or passed in the constructor).  This is an enum e.g.,
5353    # $STRING_FORMAT.  It is marked protected as it should not be generally
5354    # used to override calculations.
5355    main::set_access('format', \%format, 'r', 'p_s');
5356
5357    my %has_dependency;
5358    # A boolean that gives whether some other table in this property is
5359    # defined as the complement of this table.  This is a crude, but currently
5360    # sufficient, mechanism to make this table not get destroyed before what
5361    # is dependent on it is.  Other dependencies could be added, so the name
5362    # was chosen to reflect a more general situation than actually is
5363    # currently the case.
5364    main::set_access('has_dependency', \%has_dependency, 'r', 's');
5365
5366    sub new {
5367        # All arguments are key => value pairs, which you can see below, most
5368        # of which match fields documented above.  Otherwise: Re_Pod_Entry,
5369        # OK_as_Filename, and Fuzzy apply to the names of the table, and are
5370        # documented in the Alias package
5371
5372        return Carp::carp_too_few_args(\@_, 2) if main::DEBUG && @_ < 2;
5373
5374        my $class = shift;
5375
5376        my $self = bless \do { my $anonymous_scalar }, $class;
5377        my $addr = do { no overloading; pack 'J', $self; };
5378
5379        my %args = @_;
5380
5381        $name{$addr} = delete $args{'Name'};
5382        $find_table_from_alias{$addr} = delete $args{'_Alias_Hash'};
5383        $full_name{$addr} = delete $args{'Full_Name'};
5384        my $complete_name = $complete_name{$addr}
5385                          = delete $args{'Complete_Name'};
5386        $format{$addr} = delete $args{'Format'};
5387        $output_range_counts{$addr} = delete $args{'Output_Range_Counts'};
5388        $property{$addr} = delete $args{'_Property'};
5389        $range_list{$addr} = delete $args{'_Range_List'};
5390        $status{$addr} = delete $args{'Status'} || $NORMAL;
5391        $status_info{$addr} = delete $args{'_Status_Info'} || "";
5392        $range_size_1{$addr} = delete $args{'Range_Size_1'} || 0;
5393        $caseless_equivalent{$addr} = delete $args{'Caseless_Equivalent'} || 0;
5394        $fate{$addr} = delete $args{'Fate'} || $ORDINARY;
5395        $write_as_invlist{$addr} = delete $args{'Write_As_Invlist'};# No default
5396        my $ucd = delete $args{'UCD'};
5397
5398        my $description = delete $args{'Description'};
5399        my $ok_as_filename = delete $args{'OK_as_Filename'};
5400        my $loose_match = delete $args{'Fuzzy'};
5401        my $note = delete $args{'Note'};
5402        my $make_re_pod_entry = delete $args{'Re_Pod_Entry'};
5403        my $perl_extension = delete $args{'Perl_Extension'};
5404        my $suppression_reason = delete $args{'Suppression_Reason'};
5405
5406        # Shouldn't have any left over
5407        Carp::carp_extra_args(\%args) if main::DEBUG && %args;
5408
5409        # Can't use || above because conceivably the name could be 0, and
5410        # can't use // operator in case this program gets used in Perl 5.8
5411        $full_name{$addr} = $name{$addr} if ! defined $full_name{$addr};
5412        $output_range_counts{$addr} = $output_range_counts if
5413                                        ! defined $output_range_counts{$addr};
5414
5415        $aliases{$addr} = [ ];
5416        $comment{$addr} = [ ];
5417        $description{$addr} = [ ];
5418        $note{$addr} = [ ];
5419        $file_path{$addr} = [ ];
5420        $locked{$addr} = "";
5421        $has_dependency{$addr} = 0;
5422
5423        push @{$description{$addr}}, $description if $description;
5424        push @{$note{$addr}}, $note if $note;
5425
5426        if ($fate{$addr} == $PLACEHOLDER) {
5427
5428            # A placeholder table doesn't get documented, is a perl extension,
5429            # and quite likely will be empty
5430            $make_re_pod_entry = 0 if ! defined $make_re_pod_entry;
5431            $perl_extension = 1 if ! defined $perl_extension;
5432            $ucd = 0 if ! defined $ucd;
5433            push @tables_that_may_be_empty, $complete_name{$addr};
5434            $self->add_comment(<<END);
5435This is a placeholder because it is not in Version $string_version of Unicode,
5436but is needed by the Perl core to work gracefully.  Because it is not in this
5437version of Unicode, it will not be listed in $pod_file.pod
5438END
5439        }
5440        elsif (exists $why_suppressed{$complete_name}
5441                # Don't suppress if overridden
5442                && ! grep { $_ eq $complete_name{$addr} }
5443                                                    @output_mapped_properties)
5444        {
5445            $fate{$addr} = $SUPPRESSED;
5446        }
5447        elsif ($fate{$addr} == $SUPPRESSED) {
5448            Carp::my_carp_bug("Need reason for suppressing") unless $suppression_reason;
5449            # Though currently unused
5450        }
5451        elsif ($suppression_reason) {
5452            Carp::my_carp_bug("A reason was given for suppressing, but not suppressed");
5453        }
5454
5455        # If hasn't set its status already, see if it is on one of the
5456        # lists of properties or tables that have particular statuses; if
5457        # not, is normal.  The lists are prioritized so the most serious
5458        # ones are checked first
5459        if (! $status{$addr}) {
5460            if (exists $why_deprecated{$complete_name}) {
5461                $status{$addr} = $DEPRECATED;
5462            }
5463            elsif (exists $why_stabilized{$complete_name}) {
5464                $status{$addr} = $STABILIZED;
5465            }
5466            elsif (exists $why_obsolete{$complete_name}) {
5467                $status{$addr} = $OBSOLETE;
5468            }
5469
5470            # Existence above doesn't necessarily mean there is a message
5471            # associated with it.  Use the most serious message.
5472            if ($status{$addr}) {
5473                if ($why_deprecated{$complete_name}) {
5474                    $status_info{$addr}
5475                                = $why_deprecated{$complete_name};
5476                }
5477                elsif ($why_stabilized{$complete_name}) {
5478                    $status_info{$addr}
5479                                = $why_stabilized{$complete_name};
5480                }
5481                elsif ($why_obsolete{$complete_name}) {
5482                    $status_info{$addr}
5483                                = $why_obsolete{$complete_name};
5484                }
5485            }
5486        }
5487
5488        $perl_extension{$addr} = $perl_extension || 0;
5489
5490        # Don't list a property by default that is internal only
5491        if ($fate{$addr} > $MAP_PROXIED) {
5492            $make_re_pod_entry = 0 if ! defined $make_re_pod_entry;
5493            $ucd = 0 if ! defined $ucd;
5494        }
5495        else {
5496            $ucd = 1 if ! defined $ucd;
5497        }
5498
5499        # By convention what typically gets printed only or first is what's
5500        # first in the list, so put the full name there for good output
5501        # clarity.  Other routines rely on the full name being first on the
5502        # list
5503        $self->add_alias($full_name{$addr},
5504                            OK_as_Filename => $ok_as_filename,
5505                            Fuzzy => $loose_match,
5506                            Re_Pod_Entry => $make_re_pod_entry,
5507                            Status => $status{$addr},
5508                            UCD => $ucd,
5509                            );
5510
5511        # Then comes the other name, if meaningfully different.
5512        if (standardize($full_name{$addr}) ne standardize($name{$addr})) {
5513            $self->add_alias($name{$addr},
5514                            OK_as_Filename => $ok_as_filename,
5515                            Fuzzy => $loose_match,
5516                            Re_Pod_Entry => $make_re_pod_entry,
5517                            Status => $status{$addr},
5518                            UCD => $ucd,
5519                            );
5520        }
5521
5522        return $self;
5523    }
5524
5525    # Here are the methods that are required to be defined by any derived
5526    # class
5527    for my $sub (qw(
5528                    handle_special_range
5529                    append_to_body
5530                    pre_body
5531                ))
5532                # write() knows how to write out normal ranges, but it calls
5533                # handle_special_range() when it encounters a non-normal one.
5534                # append_to_body() is called by it after it has handled all
5535                # ranges to add anything after the main portion of the table.
5536                # And finally, pre_body() is called after all this to build up
5537                # anything that should appear before the main portion of the
5538                # table.  Doing it this way allows things in the middle to
5539                # affect what should appear before the main portion of the
5540                # table.
5541    {
5542        no strict "refs";
5543        *$sub = sub {
5544            Carp::my_carp_bug( __LINE__
5545                              . ": Must create method '$sub()' for "
5546                              . ref shift);
5547            return;
5548        }
5549    }
5550
5551    use overload
5552        fallback => 0,
5553        "." => \&main::_operator_dot,
5554        ".=" => \&main::_operator_dot_equal,
5555        '!=' => \&main::_operator_not_equal,
5556        '==' => \&main::_operator_equal,
5557    ;
5558
5559    sub ranges {
5560        # Returns the array of ranges associated with this table.
5561
5562        no overloading;
5563        return $range_list{pack 'J', shift}->ranges;
5564    }
5565
5566    sub add_alias {
5567        # Add a synonym for this table.
5568
5569        return Carp::carp_too_few_args(\@_, 3) if main::DEBUG && @_ < 3;
5570
5571        my $self = shift;
5572        my $name = shift;       # The name to add.
5573        my $pointer = shift;    # What the alias hash should point to.  For
5574                                # map tables, this is the parent property;
5575                                # for match tables, it is the table itself.
5576
5577        my %args = @_;
5578        my $loose_match = delete $args{'Fuzzy'};
5579
5580        my $ok_as_filename = delete $args{'OK_as_Filename'};
5581        $ok_as_filename = 1 unless defined $ok_as_filename;
5582
5583        # An internal name does not get documented, unless overridden by the
5584        # input; same for making tests for it.
5585        my $status = delete $args{'Status'} || (($name =~ /^_/)
5586                                                ? $INTERNAL_ALIAS
5587                                                : $NORMAL);
5588        my $make_re_pod_entry = delete $args{'Re_Pod_Entry'}
5589                                            // (($status ne $INTERNAL_ALIAS)
5590                                               ? (($name =~ /^_/) ? $NO : $YES)
5591                                               : $NO);
5592        my $ucd = delete $args{'UCD'} // (($name =~ /^_/) ? 0 : 1);
5593
5594        Carp::carp_extra_args(\%args) if main::DEBUG && %args;
5595
5596        # Capitalize the first letter of the alias unless it is one of the CJK
5597        # ones which specifically begins with a lower 'k'.  Do this because
5598        # Unicode has varied whether they capitalize first letters or not, and
5599        # have later changed their minds and capitalized them, but not the
5600        # other way around.  So do it always and avoid changes from release to
5601        # release
5602        $name = ucfirst($name) unless $name =~ /^k[A-Z]/;
5603
5604        my $addr = do { no overloading; pack 'J', $self; };
5605
5606        # Figure out if should be loosely matched if not already specified.
5607        if (! defined $loose_match) {
5608
5609            # Is a loose_match if isn't null, and doesn't begin with an
5610            # underscore and isn't just a number
5611            if ($name ne ""
5612                && substr($name, 0, 1) ne '_'
5613                && $name !~ qr{^[0-9_.+-/]+$})
5614            {
5615                $loose_match = 1;
5616            }
5617            else {
5618                $loose_match = 0;
5619            }
5620        }
5621
5622        # If this alias has already been defined, do nothing.
5623        return if defined $find_table_from_alias{$addr}->{$name};
5624
5625        # That includes if it is standardly equivalent to an existing alias,
5626        # in which case, add this name to the list, so won't have to search
5627        # for it again.
5628        my $standard_name = main::standardize($name);
5629        if (defined $find_table_from_alias{$addr}->{$standard_name}) {
5630            $find_table_from_alias{$addr}->{$name}
5631                        = $find_table_from_alias{$addr}->{$standard_name};
5632            return;
5633        }
5634
5635        # Set the index hash for this alias for future quick reference.
5636        $find_table_from_alias{$addr}->{$name} = $pointer;
5637        $find_table_from_alias{$addr}->{$standard_name} = $pointer;
5638        local $to_trace = 0 if main::DEBUG;
5639        trace "adding alias $name to $pointer" if main::DEBUG && $to_trace;
5640        trace "adding alias $standard_name to $pointer" if main::DEBUG && $to_trace;
5641
5642
5643        # Put the new alias at the end of the list of aliases unless the final
5644        # element begins with an underscore (meaning it is for internal perl
5645        # use) or is all numeric, in which case, put the new one before that
5646        # one.  This floats any all-numeric or underscore-beginning aliases to
5647        # the end.  This is done so that they are listed last in output lists,
5648        # to encourage the user to use a better name (either more descriptive
5649        # or not an internal-only one) instead.  This ordering is relied on
5650        # implicitly elsewhere in this program, like in short_name()
5651        my $list = $aliases{$addr};
5652        my $insert_position = (@$list == 0
5653                                || (substr($list->[-1]->name, 0, 1) ne '_'
5654                                    && $list->[-1]->name =~ /\D/))
5655                            ? @$list
5656                            : @$list - 1;
5657        splice @$list,
5658                $insert_position,
5659                0,
5660                Alias->new($name, $loose_match, $make_re_pod_entry,
5661                           $ok_as_filename, $status, $ucd);
5662
5663        # This name may be shorter than any existing ones, so clear the cache
5664        # of the shortest, so will have to be recalculated.
5665        no overloading;
5666        undef $short_name{pack 'J', $self};
5667        return;
5668    }
5669
5670    sub short_name($self, $nominal_length_ptr=undef) {
5671        # Returns a name suitable for use as the base part of a file name.
5672        # That is, shorter wins.  It can return undef if there is no suitable
5673        # name.  The name has all non-essential underscores removed.
5674
5675        # The optional second parameter is a reference to a scalar in which
5676        # this routine will store the length the returned name had before the
5677        # underscores were removed, or undef if the return is undef.
5678
5679        # The shortest name can change if new aliases are added.  So using
5680        # this should be deferred until after all these are added.  The code
5681        # that does that should clear this one's cache.
5682        # Any name with alphabetics is preferred over an all numeric one, even
5683        # if longer.
5684
5685        my $addr = do { no overloading; pack 'J', $self; };
5686
5687        # For efficiency, don't recalculate, but this means that adding new
5688        # aliases could change what the shortest is, so the code that does
5689        # that needs to undef this.
5690        if (defined $short_name{$addr}) {
5691            if ($nominal_length_ptr) {
5692                $$nominal_length_ptr = $nominal_short_name_length{$addr};
5693            }
5694            return $short_name{$addr};
5695        }
5696
5697        # Look at each alias
5698        my $is_last_resort = 0;
5699        my $deprecated_or_discouraged
5700                                = qr/ ^ (?: $DEPRECATED | $DISCOURAGED ) $/x;
5701        foreach my $alias ($self->aliases()) {
5702
5703            # Don't use an alias that isn't ok to use for an external name.
5704            next if ! $alias->ok_as_filename;
5705
5706            my $name = main::Standardize($alias->name);
5707            trace $self, $name if main::DEBUG && $to_trace;
5708
5709            # Take the first one, or any non-deprecated non-discouraged one
5710            # over one that is, or a shorter one that isn't numeric.  This
5711            # relies on numeric aliases always being last in the array
5712            # returned by aliases().  Any alpha one will have precedence.
5713            if (   ! defined $short_name{$addr}
5714                || (   $is_last_resort
5715                    && $alias->status !~ $deprecated_or_discouraged)
5716                || ($name =~ /\D/
5717                    && length($name) < length($short_name{$addr})))
5718            {
5719                # Remove interior underscores.
5720                ($short_name{$addr} = $name) =~ s/ (?<= . ) _ (?= . ) //xg;
5721
5722                $nominal_short_name_length{$addr} = length $name;
5723                $is_last_resort = $alias->status =~ $deprecated_or_discouraged;
5724            }
5725        }
5726
5727        # If the short name isn't a nice one, perhaps an equivalent table has
5728        # a better one.
5729        if (   $self->can('children')
5730            && (   ! defined $short_name{$addr}
5731                || $short_name{$addr} eq ""
5732                || $short_name{$addr} eq "_"))
5733        {
5734            my $return;
5735            foreach my $follower ($self->children) {    # All equivalents
5736                my $follower_name = $follower->short_name;
5737                next unless defined $follower_name;
5738
5739                # Anything (except undefined) is better than underscore or
5740                # empty
5741                if (! defined $return || $return eq "_") {
5742                    $return = $follower_name;
5743                    next;
5744                }
5745
5746                # If the new follower name isn't "_" and is shorter than the
5747                # current best one, prefer the new one.
5748                next if $follower_name eq "_";
5749                next if length $follower_name > length $return;
5750                $return = $follower_name;
5751            }
5752            $short_name{$addr} = $return if defined $return;
5753        }
5754
5755        # If no suitable external name return undef
5756        if (! defined $short_name{$addr}) {
5757            $$nominal_length_ptr = undef if $nominal_length_ptr;
5758            return;
5759        }
5760
5761        # Don't allow a null short name.
5762        if ($short_name{$addr} eq "") {
5763            $short_name{$addr} = '_';
5764            $nominal_short_name_length{$addr} = 1;
5765        }
5766
5767        trace $self, $short_name{$addr} if main::DEBUG && $to_trace;
5768
5769        if ($nominal_length_ptr) {
5770            $$nominal_length_ptr = $nominal_short_name_length{$addr};
5771        }
5772        return $short_name{$addr};
5773    }
5774
5775    sub external_name($self) {
5776        # Returns the external name that this table should be known by.  This
5777        # is usually the short_name, but not if the short_name is undefined,
5778        # in which case the external_name is arbitrarily set to the
5779        # underscore.
5780
5781        my $short = $self->short_name;
5782        return $short if defined $short;
5783
5784        return '_';
5785    }
5786
5787    sub add_description($self, $description) { # Adds the parameter as a short description.
5788        no overloading;
5789        push @{$description{pack 'J', $self}}, $description;
5790
5791        return;
5792    }
5793
5794    sub add_note($self, $note) { # Adds the parameter as a short note.
5795        no overloading;
5796        push @{$note{pack 'J', $self}}, $note;
5797
5798        return;
5799    }
5800
5801    sub add_comment($self, $comment) { # Adds the parameter as a comment.
5802
5803        return unless $debugging_build;
5804
5805        chomp $comment;
5806
5807        no overloading;
5808        push @{$comment{pack 'J', $self}}, $comment;
5809
5810        return;
5811    }
5812
5813    sub comment($self) {
5814        # Return the current comment for this table.  If called in list
5815        # context, returns the array of comments.  In scalar, returns a string
5816        # of each element joined together with a period ending each.
5817
5818        my $addr = do { no overloading; pack 'J', $self; };
5819        my @list = @{$comment{$addr}};
5820        return @list if wantarray;
5821        my $return = "";
5822        foreach my $sentence (@list) {
5823            $return .= '.  ' if $return;
5824            $return .= $sentence;
5825            $return =~ s/\.$//;
5826        }
5827        $return .= '.' if $return;
5828        return $return;
5829    }
5830
5831    sub initialize($self, $initialization) {
5832        # Initialize the table with the argument which is any valid
5833        # initialization for range lists.
5834
5835        my $addr = do { no overloading; pack 'J', $self; };
5836
5837        # Replace the current range list with a new one of the same exact
5838        # type.
5839        my $class = ref $range_list{$addr};
5840        $range_list{$addr} = $class->new(Owner => $self,
5841                                        Initialize => $initialization);
5842        return;
5843
5844    }
5845
5846    sub header($self) {
5847        # The header that is output for the table in the file it is written
5848        # in.
5849        my $return = "";
5850        $return .= $DEVELOPMENT_ONLY if $compare_versions;
5851        $return .= $HEADER;
5852        return $return;
5853    }
5854
5855    sub merge_single_annotation_line ($output, $annotation, $annotation_column) {
5856
5857        # This appends an annotation comment, $annotation, to $output,
5858        # starting in or after column $annotation_column, removing any
5859        # pre-existing comment from $output.
5860
5861        $annotation =~ s/^ \s* \# \  //x;
5862        $output =~ s/ \s* ( \# \N* )? \n //x;
5863        $output = Text::Tabs::expand($output);
5864
5865        my $spaces = $annotation_column - length $output;
5866        $spaces = 2 if $spaces < 0;  # Have 2 blanks before the comment
5867
5868        $output = sprintf "%s%*s# %s",
5869                            $output,
5870                            $spaces,
5871                            " ",
5872                            $annotation;
5873        return Text::Tabs::unexpand $output;
5874    }
5875
5876    sub write($self, $use_adjustments=0, $suppress_value=0) {
5877        # Write a representation of the table to its file.  It calls several
5878        # functions furnished by sub-classes of this abstract base class to
5879        # handle non-normal ranges, to add stuff before the table, and at its
5880        # end.  If the table is to be written so that adjustments are
5881        # required, this does that conversion.
5882
5883
5884        # $use_adjustments ? output in adjusted format or not
5885        # $suppress_value Optional, if the value associated with
5886        # a range equals this one, don't write
5887        # the range
5888
5889        my $addr = do { no overloading; pack 'J', $self; };
5890        my $write_as_invlist = $write_as_invlist{$addr};
5891
5892        # Start with the header
5893        my @HEADER = $self->header;
5894
5895        # Then the comments
5896        push @HEADER, "\n", main::simple_fold($comment{$addr}, '# '), "\n"
5897                                                        if $comment{$addr};
5898
5899        # Things discovered processing the main body of the document may
5900        # affect what gets output before it, therefore pre_body() isn't called
5901        # until after all other processing of the table is done.
5902
5903        # The main body looks like a 'here' document.  If there are comments,
5904        # get rid of them when processing it.
5905        my @OUT;
5906        if ($annotate || $output_range_counts) {
5907            # Use the line below in Perls that don't have /r
5908            #push @OUT, 'return join "\n",  map { s/\s*#.*//mg; $_ } split "\n", <<\'END\';' . "\n";
5909            push @OUT, "return <<'END' =~ s/\\s*#.*//mgr;\n";
5910        } else {
5911            push @OUT, "return <<'END';\n";
5912        }
5913
5914        if ($range_list{$addr}->is_empty) {
5915
5916            # This is a kludge for empty tables to silence a warning in
5917            # utf8.c, which can't really deal with empty tables, but it can
5918            # deal with a table that matches nothing, as the inverse of 'All'
5919            # does.
5920            push @OUT, "!Unicode::UCD::All\n";
5921        }
5922        elsif ($self->name eq 'N'
5923
5924               # To save disk space and table cache space, avoid putting out
5925               # binary N tables, but instead create a file which just inverts
5926               # the Y table.  Since the file will still exist and occupy a
5927               # certain number of blocks, might as well output the whole
5928               # thing if it all will fit in one block.   The number of
5929               # ranges below is an approximate number for that.
5930               && ($self->property->type == $BINARY
5931                   || $self->property->type == $FORCED_BINARY)
5932               # && $self->property->tables == 2  Can't do this because the
5933               #        non-binary properties, like NFDQC aren't specifiable
5934               #        by the notation
5935               && $range_list{$addr}->ranges > 15
5936               && ! $annotate)  # Under --annotate, want to see everything
5937        {
5938            push @OUT, "!Unicode::UCD::" . $self->property->name . "\n";
5939        }
5940        else {
5941            my $range_size_1 = $range_size_1{$addr};
5942
5943            # To make it more readable, use a minimum indentation
5944            my $comment_indent;
5945
5946            # These are used only in $annotate option
5947            my $format;         # e.g. $HEX_ADJUST_FORMAT
5948            my $include_name;   # ? Include the character's name in the
5949                                # annotation?
5950            my $include_cp;     # ? Include its code point
5951
5952            if (! $annotate) {
5953                $comment_indent = ($self->isa('Map_Table'))
5954                                  ? 24
5955                                  : ($write_as_invlist)
5956                                    ? 8
5957                                    : 16;
5958            }
5959            else {
5960                $format = $self->format;
5961
5962                # The name of the character is output only for tables that
5963                # don't already include the name in the output.
5964                my $property = $self->property;
5965                $include_name =
5966                    !  ($property == $perl_charname
5967                        || $property == main::property_ref('Unicode_1_Name')
5968                        || $property == main::property_ref('Name')
5969                        || $property == main::property_ref('Name_Alias')
5970                       );
5971
5972                # Don't include the code point in the annotation where all
5973                # lines are a single code point, so it can be easily found in
5974                # the first column
5975                $include_cp = ! $range_size_1;
5976
5977                if (! $self->isa('Map_Table')) {
5978                    $comment_indent = ($write_as_invlist) ? 8 : 16;
5979                }
5980                else {
5981                    $comment_indent = 16;
5982
5983                    # There are just a few short ranges in this table, so no
5984                    # need to include the code point in the annotation.
5985                    $include_cp = 0 if $format eq $DECOMP_STRING_FORMAT;
5986
5987                    # We're trying to get this to look good, as the whole
5988                    # point is to make human-readable tables.  It is easier to
5989                    # read if almost all the annotation comments begin in the
5990                    # same column.  Map tables have varying width maps, so can
5991                    # create a jagged comment appearance.  This code does a
5992                    # preliminary pass through these tables looking for the
5993                    # maximum width map in each, and causing the comments to
5994                    # begin just to the right of that.  However, if the
5995                    # comments begin too far to the right of most lines, it's
5996                    # hard to line them up horizontally with their real data.
5997                    # Therefore we ignore the longest outliers
5998                    my $ignore_longest_X_percent = 2;  # Discard longest X%
5999
6000                    # Each key in this hash is a width of at least one of the
6001                    # maps in the table.  Its value is how many lines have
6002                    # that width.
6003                    my %widths;
6004
6005                    # We won't space things further left than one tab stop
6006                    # after the rest of the line; initializing it to that
6007                    # number saves some work.
6008                    my $max_map_width = 8;
6009
6010                    # Fill in the %widths hash
6011                    my $total = 0;
6012                    for my $set ($range_list{$addr}->ranges) {
6013                        my $value = $set->value;
6014
6015                        # These range types don't appear in the main table
6016                        next if $set->type == 0
6017                                && defined $suppress_value
6018                                && $value eq $suppress_value;
6019                        next if $set->type == $MULTI_CP
6020                                || $set->type == $NULL;
6021
6022                        # Include 2 spaces before the beginning of the
6023                        # comment
6024                        my $this_width = length($value) + 2;
6025
6026                        # Ranges of the remaining non-zero types usually
6027                        # occupy just one line (maybe occasionally two, but
6028                        # this doesn't have to be dead accurate).  This is
6029                        # because these ranges are like "unassigned code
6030                        # points"
6031                        my $count = ($set->type != 0)
6032                                    ? 1
6033                                    : $set->end - $set->start + 1;
6034                        $widths{$this_width} += $count;
6035                        $total += $count;
6036                        $max_map_width = $this_width
6037                                            if $max_map_width < $this_width;
6038                    }
6039
6040                    # If the widest map gives us less than two tab stops
6041                    # worth, just take it as-is.
6042                    if ($max_map_width > 16) {
6043
6044                        # Otherwise go through %widths until we have included
6045                        # the desired percentage of lines in the whole table.
6046                        my $running_total = 0;
6047                        foreach my $width (sort { $a <=> $b } keys %widths)
6048                        {
6049                            $running_total += $widths{$width};
6050                            use integer;
6051                            if ($running_total * 100 / $total
6052                                            >= 100 - $ignore_longest_X_percent)
6053                            {
6054                                $max_map_width = $width;
6055                                last;
6056                            }
6057                        }
6058                    }
6059                    $comment_indent += $max_map_width;
6060                }
6061            }
6062
6063            # Values for previous time through the loop.  Initialize to
6064            # something that won't be adjacent to the first iteration;
6065            # only $previous_end matters for that.
6066            my $previous_start;
6067            my $previous_end = -2;
6068            my $previous_value;
6069
6070            # Values for next time through the portion of the loop that splits
6071            # the range.  0 in $next_start means there is no remaining portion
6072            # to deal with.
6073            my $next_start = 0;
6074            my $next_end;
6075            my $next_value;
6076            my $offset = 0;
6077            my $invlist_count = 0;
6078
6079            my $output_value_in_hex = $self->isa('Map_Table')
6080                                && ($self->format eq $HEX_ADJUST_FORMAT
6081                                    || $self->to_output_map == $EXTERNAL_MAP);
6082            # Use leading zeroes just for files whose format should not be
6083            # changed from what it has been.  Otherwise, they just take up
6084            # space and time to process.
6085            my $hex_format = ($self->isa('Map_Table')
6086                              && $self->to_output_map == $EXTERNAL_MAP)
6087                             ? "%04X"
6088                             : "%X";
6089
6090            # The values for some of these tables are stored in mktables as
6091            # hex strings.  Normally, these are just output as strings without
6092            # change, but when we are doing adjustments, we have to operate on
6093            # these numerically, so we convert those to decimal to do that,
6094            # and back to hex for output
6095            my $convert_map_to_from_hex = 0;
6096            my $output_map_in_hex = 0;
6097            if ($self->isa('Map_Table')) {
6098                $convert_map_to_from_hex
6099                   = ($use_adjustments && $self->format eq $HEX_ADJUST_FORMAT)
6100                      || ($annotate && $self->format eq $HEX_FORMAT);
6101                $output_map_in_hex = $convert_map_to_from_hex
6102                                 || $self->format eq $HEX_FORMAT;
6103            }
6104
6105            # To store any annotations about the characters.
6106            my @annotation;
6107
6108            # Output each range as part of the here document.
6109            RANGE:
6110            for my $set ($range_list{$addr}->ranges) {
6111                if ($set->type != 0) {
6112                    $self->handle_special_range($set);
6113                    next RANGE;
6114                }
6115                my $start = $set->start;
6116                my $end   = $set->end;
6117                my $value  = $set->value;
6118
6119                # Don't output ranges whose value is the one to suppress
6120                next RANGE if defined $suppress_value
6121                              && $value eq $suppress_value;
6122
6123                $value = CORE::hex $value if $convert_map_to_from_hex;
6124
6125
6126                {   # This bare block encloses the scope where we may need to
6127                    # 'redo' to.  Consider a table that is to be written out
6128                    # using single item ranges.  This is given in the
6129                    # $range_size_1 boolean.  To accomplish this, we split the
6130                    # range each time through the loop into two portions, the
6131                    # first item, and the rest.  We handle that first item
6132                    # this time in the loop, and 'redo' to repeat the process
6133                    # for the rest of the range.
6134                    #
6135                    # We may also have to do it, with other special handling,
6136                    # if the table has adjustments.  Consider the table that
6137                    # contains the lowercasing maps.  mktables stores the
6138                    # ASCII range ones as 26 ranges:
6139                    #       ord('A') => ord('a'), .. ord('Z') => ord('z')
6140                    # For compactness, the table that gets written has this as
6141                    # just one range
6142                    #       ( ord('A') .. ord('Z') ) => ord('a')
6143                    # and the software that reads the tables is smart enough
6144                    # to "connect the dots".  This change is accomplished in
6145                    # this loop by looking to see if the current iteration
6146                    # fits the paradigm of the previous iteration, and if so,
6147                    # we merge them by replacing the final output item with
6148                    # the merged data.  Repeated 25 times, this gets A-Z.  But
6149                    # we also have to make sure we don't screw up cases where
6150                    # we have internally stored
6151                    #       ( 0x1C4 .. 0x1C6 ) => 0x1C5
6152                    # This single internal range has to be output as 3 ranges,
6153                    # which is done by splitting, like we do for $range_size_1
6154                    # tables.  (There are very few of such ranges that need to
6155                    # be split, so the gain of doing the combining of other
6156                    # ranges far outweighs the splitting of these.)  The
6157                    # values to use for the redo at the end of this block are
6158                    # set up just below in the scalars whose names begin with
6159                    # '$next_'.
6160
6161                    if (($use_adjustments || $range_size_1) && $end != $start)
6162                    {
6163                        $next_start = $start + 1;
6164                        $next_end = $end;
6165                        $next_value = $value;
6166                        $end = $start;
6167                    }
6168
6169                    if ($use_adjustments && ! $range_size_1) {
6170
6171                        # If this range is adjacent to the previous one, and
6172                        # the values in each are integers that are also
6173                        # adjacent (differ by 1), then this range really
6174                        # extends the previous one that is already in element
6175                        # $OUT[-1].  So we pop that element, and pretend that
6176                        # the range starts with whatever it started with.
6177                        # $offset is incremented by 1 each time so that it
6178                        # gives the current offset from the first element in
6179                        # the accumulating range, and we keep in $value the
6180                        # value of that first element.
6181                        if ($start == $previous_end + 1
6182                            && $value =~ /^ -? \d+ $/xa
6183                            && $previous_value =~ /^ -? \d+ $/xa
6184                            && ($value == ($previous_value + ++$offset)))
6185                        {
6186                            pop @OUT;
6187                            $start = $previous_start;
6188                            $value = $previous_value;
6189                        }
6190                        else {
6191                            $offset = 0;
6192                            if (@annotation == 1) {
6193                                $OUT[-1] = merge_single_annotation_line(
6194                                    $OUT[-1], $annotation[0], $comment_indent);
6195                            }
6196                            else {
6197                                push @OUT, @annotation;
6198                            }
6199                        }
6200                        undef @annotation;
6201
6202                        # Save the current values for the next time through
6203                        # the loop.
6204                        $previous_start = $start;
6205                        $previous_end = $end;
6206                        $previous_value = $value;
6207                    }
6208
6209                    if ($write_as_invlist) {
6210                        if (   $previous_end > 0
6211                            && $output_range_counts{$addr})
6212                        {
6213                            my $complement_count = $start - $previous_end - 1;
6214                            if ($complement_count > 1) {
6215                                $OUT[-1] = merge_single_annotation_line(
6216                                    $OUT[-1],
6217                                       "#"
6218                                     . (" " x 17)
6219                                     . "["
6220                                     .  main::clarify_code_point_count(
6221                                                            $complement_count)
6222                                      . "] in complement\n",
6223                                    $comment_indent);
6224                            }
6225                        }
6226
6227                        # Inversion list format has a single number per line,
6228                        # the starting code point of a range that matches the
6229                        # property
6230                        push @OUT, $start, "\n";
6231                        $invlist_count++;
6232
6233                        # Add a comment with the size of the range, if
6234                        # requested.
6235                        if ($output_range_counts{$addr}) {
6236                            $OUT[-1] = merge_single_annotation_line(
6237                                    $OUT[-1],
6238                                    "# ["
6239                                      . main::clarify_code_point_count($end - $start + 1)
6240                                      . "]\n",
6241                                    $comment_indent);
6242                        }
6243                    }
6244                    elsif ($start != $end) { # If there is a range
6245                        if ($end == $MAX_WORKING_CODEPOINT) {
6246                            push @OUT, sprintf "$hex_format\t$hex_format",
6247                                                $start,
6248                                                $MAX_PLATFORM_CODEPOINT;
6249                        }
6250                        else {
6251                            push @OUT, sprintf "$hex_format\t$hex_format",
6252                                                $start,       $end;
6253                        }
6254                        if (length $value) {
6255                            if ($convert_map_to_from_hex) {
6256                                $OUT[-1] .= sprintf "\t$hex_format\n", $value;
6257                            }
6258                            else {
6259                                $OUT[-1] .= "\t$value\n";
6260                            }
6261                        }
6262
6263                        # Add a comment with the size of the range, if
6264                        # requested.
6265                        if ($output_range_counts{$addr}) {
6266                            $OUT[-1] = merge_single_annotation_line(
6267                                    $OUT[-1],
6268                                    "# ["
6269                                      . main::clarify_code_point_count($end - $start + 1)
6270                                      . "]\n",
6271                                    $comment_indent);
6272                        }
6273                    }
6274                    else { # Here to output a single code point per line.
6275
6276                        # Use any passed in subroutine to output.
6277                        if (ref $range_size_1 eq 'CODE') {
6278                            for my $i ($start .. $end) {
6279                                push @OUT, &{$range_size_1}($i, $value);
6280                            }
6281                        }
6282                        else {
6283
6284                            # Here, caller is ok with default output.
6285                            for (my $i = $start; $i <= $end; $i++) {
6286                                if ($convert_map_to_from_hex) {
6287                                    push @OUT,
6288                                        sprintf "$hex_format\t\t$hex_format\n",
6289                                                 $i,            $value;
6290                                }
6291                                else {
6292                                    push @OUT, sprintf $hex_format, $i;
6293                                    $OUT[-1] .= "\t\t$value" if $value ne "";
6294                                    $OUT[-1] .= "\n";
6295                                }
6296                            }
6297                        }
6298                    }
6299
6300                    if ($annotate) {
6301                        for (my $i = $start; $i <= $end; $i++) {
6302                            my $annotation = "";
6303
6304                            # Get character information if don't have it already
6305                            main::populate_char_info($i)
6306                                                     if ! defined $viacode[$i];
6307                            my $type = $annotate_char_type[$i];
6308
6309                            # Figure out if should output the next code points
6310                            # as part of a range or not.  If this is not in an
6311                            # annotation range, then won't output as a range,
6312                            # so returns $i.  Otherwise use the end of the
6313                            # annotation range, but no further than the
6314                            # maximum possible end point of the loop.
6315                            my $range_end =
6316                                        $range_size_1
6317                                        ? $start
6318                                        : main::min(
6319                                          $annotate_ranges->value_of($i) || $i,
6320                                          $end);
6321
6322                            # Use a range if it is a range, and either is one
6323                            # of the special annotation ranges, or the range
6324                            # is at most 3 long.  This last case causes the
6325                            # algorithmically named code points to be output
6326                            # individually in spans of at most 3, as they are
6327                            # the ones whose $type is > 0.
6328                            if ($range_end != $i
6329                                && ( $type < 0 || $range_end - $i > 2))
6330                            {
6331                                # Here is to output a range.  We don't allow a
6332                                # caller-specified output format--just use the
6333                                # standard one.
6334                                my $range_name = $viacode[$i];
6335
6336                                # For the code points which end in their hex
6337                                # value, we eliminate that from the output
6338                                # annotation, and capitalize only the first
6339                                # letter of each word.
6340                                if ($type == $CP_IN_NAME) {
6341                                    my $hex = sprintf $hex_format, $i;
6342                                    $range_name =~ s/-$hex$//;
6343                                    my @words = split " ", $range_name;
6344                                    for my $word (@words) {
6345                                        $word =
6346                                          ucfirst(lc($word)) if $word ne 'CJK';
6347                                    }
6348                                    $range_name = join " ", @words;
6349                                }
6350                                elsif ($type == $HANGUL_SYLLABLE) {
6351                                    $range_name = "Hangul Syllable";
6352                                }
6353
6354                                # If the annotation would just repeat what's
6355                                # already being output as the range, skip it.
6356                                # (When an inversion list is being written, it
6357                                # isn't a repeat, as that always is in
6358                                # decimal)
6359                                if (   $write_as_invlist
6360                                    || $i != $start
6361                                    || $range_end < $end)
6362                                {
6363                                    if ($range_end < $MAX_WORKING_CODEPOINT)
6364                                    {
6365                                        $annotation = sprintf "%04X..%04X",
6366                                                              $i,   $range_end;
6367                                    }
6368                                    else {
6369                                        $annotation = sprintf "%04X..INFINITY",
6370                                                               $i;
6371                                    }
6372                                }
6373                                else { # Indent if not displaying code points
6374                                    $annotation = " " x 4;
6375                                }
6376
6377                                if ($range_name) {
6378                                    $annotation .= " $age[$i]" if $age[$i];
6379                                    $annotation .= " $range_name";
6380                                }
6381
6382                                # Include the number of code points in the
6383                                # range
6384                                my $count =
6385                                    main::clarify_code_point_count($range_end - $i + 1);
6386                                $annotation .= " [$count]\n";
6387
6388                                # Skip to the end of the range
6389                                $i = $range_end;
6390                            }
6391                            else { # Not in a range.
6392                                my $comment = "";
6393
6394                                # When outputting the names of each character,
6395                                # use the character itself if printable
6396                                $comment .= "'" . main::display_chr($i) . "' "
6397                                                            if $printable[$i];
6398
6399                                my $output_value = $value;
6400
6401                                # Determine the annotation
6402                                if ($format eq $DECOMP_STRING_FORMAT) {
6403
6404                                    # This is very specialized, with the type
6405                                    # of decomposition beginning the line
6406                                    # enclosed in <...>, and the code points
6407                                    # that the code point decomposes to
6408                                    # separated by blanks.  Create two
6409                                    # strings, one of the printable
6410                                    # characters, and one of their official
6411                                    # names.
6412                                    (my $map = $output_value)
6413                                                    =~ s/ \ * < .*? > \ +//x;
6414                                    my $tostr = "";
6415                                    my $to_name = "";
6416                                    my $to_chr = "";
6417                                    foreach my $to (split " ", $map) {
6418                                        $to = CORE::hex $to;
6419                                        $to_name .= " + " if $to_name;
6420                                        $to_chr .= main::display_chr($to);
6421                                        main::populate_char_info($to)
6422                                                    if ! defined $viacode[$to];
6423                                        $to_name .=  $viacode[$to];
6424                                    }
6425
6426                                    $comment .=
6427                                    "=> '$to_chr'; $viacode[$i] => $to_name";
6428                                }
6429                                else {
6430                                    $output_value += $i - $start
6431                                                   if $use_adjustments
6432                                                      # Don't try to adjust a
6433                                                      # non-integer
6434                                                   && $output_value !~ /[-\D]/;
6435
6436                                    if ($output_map_in_hex) {
6437                                        main::populate_char_info($output_value)
6438                                          if ! defined $viacode[$output_value];
6439                                        $comment .= " => '"
6440                                        . main::display_chr($output_value)
6441                                        . "'; " if $printable[$output_value];
6442                                    }
6443                                    if ($include_name && $viacode[$i]) {
6444                                        $comment .= " " if $comment;
6445                                        $comment .= $viacode[$i];
6446                                    }
6447                                    if ($output_map_in_hex) {
6448                                        $comment .=
6449                                                " => $viacode[$output_value]"
6450                                                    if $viacode[$output_value];
6451                                        $output_value = sprintf($hex_format,
6452                                                                $output_value);
6453                                    }
6454                                }
6455
6456                                if ($include_cp) {
6457                                    $annotation = sprintf "%04X %s", $i, $age[$i];
6458                                    if ($use_adjustments) {
6459                                        $annotation .= " => $output_value";
6460                                    }
6461                                }
6462
6463                                if ($comment ne "") {
6464                                    $annotation .= " " if $annotation ne "";
6465                                    $annotation .= $comment;
6466                                }
6467                                $annotation .= "\n" if $annotation ne "";
6468                            }
6469
6470                            if ($annotation ne "") {
6471                                push @annotation, (" " x $comment_indent)
6472                                                  .  "# $annotation";
6473                            }
6474                        }
6475
6476                        # If not adjusting, we don't have to go through the
6477                        # loop again to know that the annotation comes next
6478                        # in the output.
6479                        if (! $use_adjustments) {
6480                            if (@annotation == 1) {
6481                                $OUT[-1] = merge_single_annotation_line(
6482                                    $OUT[-1], $annotation[0], $comment_indent);
6483                            }
6484                            else {
6485                                push @OUT, map { Text::Tabs::unexpand $_ }
6486                                               @annotation;
6487                            }
6488                            undef @annotation;
6489                        }
6490                    }
6491
6492                    # Add the beginning of the range that doesn't match the
6493                    # property, except if the just added match range extends
6494                    # to infinity.  We do this after any annotations for the
6495                    # match range.
6496                    if ($write_as_invlist && $end < $MAX_WORKING_CODEPOINT) {
6497                        push @OUT, $end + 1, "\n";
6498                        $invlist_count++;
6499                    }
6500
6501                    # If we split the range, set up so the next time through
6502                    # we get the remainder, and redo.
6503                    if ($next_start) {
6504                        $start = $next_start;
6505                        $end = $next_end;
6506                        $value = $next_value;
6507                        $next_start = 0;
6508                        redo;
6509                    }
6510                } # End of redo block
6511            } # End of loop through all the table's ranges
6512
6513            push @OUT, @annotation; # Add orphaned annotation, if any
6514
6515            splice @OUT, 1, 0, "V$invlist_count\n" if $invlist_count;
6516        }
6517
6518        # Add anything that goes after the main body, but within the here
6519        # document,
6520        my $append_to_body = $self->append_to_body;
6521        push @OUT, $append_to_body if $append_to_body;
6522
6523        # And finish the here document.
6524        push @OUT, "END\n";
6525
6526        # Done with the main portion of the body.  Can now figure out what
6527        # should appear before it in the file.
6528        my $pre_body = $self->pre_body;
6529        push @HEADER, $pre_body, "\n" if $pre_body;
6530
6531        # All these files should have a .pl suffix added to them.
6532        my @file_with_pl = @{$file_path{$addr}};
6533        $file_with_pl[-1] .= '.pl';
6534
6535        main::write(\@file_with_pl,
6536                    $annotate,      # utf8 iff annotating
6537                    \@HEADER,
6538                    \@OUT);
6539        return;
6540    }
6541
6542    sub set_status($self, $status, $info) {    # Set the table's status
6543        # status The status enum value
6544        # info Any message associated with it.
6545        my $addr = do { no overloading; pack 'J', $self; };
6546
6547        $status{$addr} = $status;
6548        $status_info{$addr} = $info;
6549        return;
6550    }
6551
6552    sub set_fate($self, $fate, $reason=undef) {  # Set the fate of a table
6553        my $addr = do { no overloading; pack 'J', $self; };
6554
6555        return if $fate{$addr} == $fate;    # If no-op
6556
6557        # Can only change the ordinary fate, except if going to $MAP_PROXIED
6558        return if $fate{$addr} != $ORDINARY && $fate != $MAP_PROXIED;
6559
6560        $fate{$addr} = $fate;
6561
6562        # Don't document anything to do with a non-normal fated table
6563        if ($fate != $ORDINARY) {
6564            my $put_in_pod = ($fate == $MAP_PROXIED) ? 1 : 0;
6565            foreach my $alias ($self->aliases) {
6566                $alias->set_ucd($put_in_pod);
6567
6568                # MAP_PROXIED doesn't affect the match tables
6569                next if $fate == $MAP_PROXIED;
6570                $alias->set_make_re_pod_entry($put_in_pod);
6571            }
6572        }
6573
6574        # Save the reason for suppression for output
6575        if ($fate >= $SUPPRESSED) {
6576            $reason = "" unless defined $reason;
6577            $why_suppressed{$complete_name{$addr}} = $reason;
6578        }
6579
6580        return;
6581    }
6582
6583    sub lock($self) {
6584        # Don't allow changes to the table from now on.  This stores a stack
6585        # trace of where it was called, so that later attempts to modify it
6586        # can immediately show where it got locked.
6587        my $addr = do { no overloading; pack 'J', $self; };
6588
6589        $locked{$addr} = "";
6590
6591        my $line = (caller(0))[2];
6592        my $i = 1;
6593
6594        # Accumulate the stack trace
6595        while (1) {
6596            my ($pkg, $file, $caller_line, $caller) = caller $i++;
6597
6598            last unless defined $caller;
6599
6600            $locked{$addr} .= "    called from $caller() at line $line\n";
6601            $line = $caller_line;
6602        }
6603        $locked{$addr} .= "    called from main at line $line\n";
6604
6605        return;
6606    }
6607
6608    sub carp_if_locked($self) {
6609        # Return whether a table is locked or not, and, by the way, complain
6610        # if is locked
6611        my $addr = do { no overloading; pack 'J', $self; };
6612
6613        return 0 if ! $locked{$addr};
6614        Carp::my_carp_bug("Can't modify a locked table. Stack trace of locking:\n$locked{$addr}\n\n");
6615        return 1;
6616    }
6617
6618    sub set_file_path($self, @path) { # Set the final directory path for this table
6619        no overloading;
6620        @{$file_path{pack 'J', $self}} = @path;
6621        return
6622    }
6623
6624    # Accessors for the range list stored in this table.  First for
6625    # unconditional
6626    for my $sub (qw(
6627                    containing_range
6628                    contains
6629                    count
6630                    each_range
6631                    hash
6632                    is_empty
6633                    matches_identically_to
6634                    max
6635                    min
6636                    range_count
6637                    reset_each_range
6638                    type_of
6639                    value_of
6640                ))
6641    {
6642        no strict "refs";
6643        *$sub = sub {
6644            use strict "refs";
6645            my $self = shift;
6646            return $self->_range_list->$sub(@_);
6647        }
6648    }
6649
6650    # Then for ones that should fail if locked
6651    for my $sub (qw(
6652                    delete_range
6653                ))
6654    {
6655        no strict "refs";
6656        *$sub = sub {
6657            use strict "refs";
6658            my $self = shift;
6659
6660            return if $self->carp_if_locked;
6661            no overloading;
6662            return $self->_range_list->$sub(@_);
6663        }
6664    }
6665
6666} # End closure
6667
6668package Map_Table;
6669use parent '-norequire', '_Base_Table';
6670
6671# A Map Table is a table that contains the mappings from code points to
6672# values.  There are two weird cases:
6673# 1) Anomalous entries are ones that aren't maps of ranges of code points, but
6674#    are written in the table's file at the end of the table nonetheless.  It
6675#    requires specially constructed code to handle these; utf8.c can not read
6676#    these in, so they should not go in $map_directory.  As of this writing,
6677#    the only case that these happen is for named sequences used in
6678#    charnames.pm.   But this code doesn't enforce any syntax on these, so
6679#    something else could come along that uses it.
6680# 2) Specials are anything that doesn't fit syntactically into the body of the
6681#    table.  The ranges for these have a map type of non-zero.  The code below
6682#    knows about and handles each possible type.   In most cases, these are
6683#    written as part of the header.
6684#
6685# A map table deliberately can't be manipulated at will unlike match tables.
6686# This is because of the ambiguities having to do with what to do with
6687# overlapping code points.  And there just isn't a need for those things;
6688# what one wants to do is just query, add, replace, or delete mappings, plus
6689# write the final result.
6690# However, there is a method to get the list of possible ranges that aren't in
6691# this table to use for defaulting missing code point mappings.  And,
6692# map_add_or_replace_non_nulls() does allow one to add another table to this
6693# one, but it is clearly very specialized, and defined that the other's
6694# non-null values replace this one's if there is any overlap.
6695
6696sub trace { return main::trace(@_); }
6697
6698{ # Closure
6699
6700    main::setup_package();
6701
6702    my %default_map;
6703    # Many input files omit some entries; this gives what the mapping for the
6704    # missing entries should be
6705    main::set_access('default_map', \%default_map, 'r');
6706
6707    my %anomalous_entries;
6708    # Things that go in the body of the table which don't fit the normal
6709    # scheme of things, like having a range.  Not much can be done with these
6710    # once there except to output them.  This was created to handle named
6711    # sequences.
6712    main::set_access('anomalous_entry', \%anomalous_entries, 'a');
6713    main::set_access('anomalous_entries',       # Append singular, read plural
6714                    \%anomalous_entries,
6715                    'readable_array');
6716    my %to_output_map;
6717    # Enum as to whether or not to write out this map table, and how:
6718    #   0               don't output
6719    #   $EXTERNAL_MAP   means its existence is noted in the documentation, and
6720    #                   it should not be removed nor its format changed.  This
6721    #                   is done for those files that have traditionally been
6722    #                   output.
6723    #   $INTERNAL_MAP   means Perl reserves the right to do anything it wants
6724    #                   with this file
6725    #   $OUTPUT_ADJUSTED means that it is an $INTERNAL_MAP, and instead of
6726    #                   outputting the actual mappings as-is, we adjust things
6727    #                   to create a much more compact table. Only those few
6728    #                   tables where the mapping is convertible at least to an
6729    #                   integer and compacting makes a big difference should
6730    #                   have this.  Hence, the default is to not do this
6731    #                   unless the table's default mapping is to $CODE_POINT,
6732    #                   and the range size is not 1.
6733    main::set_access('to_output_map', \%to_output_map, 's');
6734
6735    sub new {
6736        my $class = shift;
6737        my $name = shift;
6738
6739        my %args = @_;
6740
6741        # Optional initialization data for the table.
6742        my $initialize = delete $args{'Initialize'};
6743
6744        my $default_map = delete $args{'Default_Map'};
6745        my $property = delete $args{'_Property'};
6746        my $full_name = delete $args{'Full_Name'};
6747        my $to_output_map = delete $args{'To_Output_Map'};
6748
6749        # Rest of parameters passed on
6750
6751        my $range_list = Range_Map->new(Owner => $property);
6752
6753        my $self = $class->SUPER::new(
6754                                    Name => $name,
6755                                    Complete_Name =>  $full_name,
6756                                    Full_Name => $full_name,
6757                                    _Property => $property,
6758                                    _Range_List => $range_list,
6759                                    Write_As_Invlist => 0,
6760                                    %args);
6761
6762        my $addr = do { no overloading; pack 'J', $self; };
6763
6764        $anomalous_entries{$addr} = [];
6765        $default_map{$addr} = $default_map;
6766        $to_output_map{$addr} = $to_output_map;
6767
6768        $self->initialize($initialize) if defined $initialize;
6769
6770        return $self;
6771    }
6772
6773    use overload
6774        fallback => 0,
6775        qw("") => "_operator_stringify",
6776    ;
6777
6778    sub _operator_stringify($self, $other="", $reversed=0) {
6779
6780        my $name = $self->property->full_name;
6781        $name = '""' if $name eq "";
6782        return "Map table for Property '$name'";
6783    }
6784
6785    sub add_alias {
6786        # Add a synonym for this table (which means the property itself)
6787        my $self = shift;
6788        my $name = shift;
6789        # Rest of parameters passed on.
6790
6791        $self->SUPER::add_alias($name, $self->property, @_);
6792        return;
6793    }
6794
6795    sub add_map {
6796        # Add a range of code points to the list of specially-handled code
6797        # points.  0 is assumed if the type of special is not passed
6798        # in.
6799
6800        my $self = shift;
6801        my $lower = shift;
6802        my $upper = shift;
6803        my $string = shift;
6804        my %args = @_;
6805
6806        my $type = delete $args{'Type'} || 0;
6807        # Rest of parameters passed on
6808
6809        # Can't change the table if locked.
6810        return if $self->carp_if_locked;
6811
6812        my $addr = do { no overloading; pack 'J', $self; };
6813
6814        $self->_range_list->add_map($lower, $upper,
6815                                    $string,
6816                                    @_,
6817                                    Type => $type);
6818        return;
6819    }
6820
6821    sub append_to_body($self) {
6822        # Adds to the written HERE document of the table's body any anomalous
6823        # entries in the table..
6824        my $addr = do { no overloading; pack 'J', $self; };
6825
6826        return "" unless @{$anomalous_entries{$addr}};
6827        return join("\n", @{$anomalous_entries{$addr}}) . "\n";
6828    }
6829
6830    sub map_add_or_replace_non_nulls($self, $other) {
6831        # This adds the mappings in the table $other to $self.  Non-null
6832        # mappings from $other override those in $self.  It essentially merges
6833        # the two tables, with the second having priority except for null
6834        # mappings.
6835        return if $self->carp_if_locked;
6836
6837        if (! $other->isa(__PACKAGE__)) {
6838            Carp::my_carp_bug("$other should be a "
6839                        . __PACKAGE__
6840                        . ".  Not a '"
6841                        . ref($other)
6842                        . "'.  Not added;");
6843            return;
6844        }
6845
6846        my $addr = do { no overloading; pack 'J', $self; };
6847        my $other_addr = do { no overloading; pack 'J', $other; };
6848
6849        local $to_trace = 0 if main::DEBUG;
6850
6851        my $self_range_list = $self->_range_list;
6852        my $other_range_list = $other->_range_list;
6853        foreach my $range ($other_range_list->ranges) {
6854            my $value = $range->value;
6855            next if $value eq "";
6856            $self_range_list->_add_delete('+',
6857                                          $range->start,
6858                                          $range->end,
6859                                          $value,
6860                                          Type => $range->type,
6861                                          Replace => $UNCONDITIONALLY);
6862        }
6863
6864        return;
6865    }
6866
6867    sub set_default_map($self, $map, $use_full_name=0) {
6868        # Define what code points that are missing from the input files should
6869        # map to.  The optional second parameter 'full_name' indicates to
6870        # force using the full name of the map instead of its standard name.
6871        if ($use_full_name && $use_full_name ne 'full_name') {
6872            Carp::my_carp_bug("Second parameter to set_default_map() if"
6873                            . " present, must be 'full_name'");
6874        }
6875
6876        my $addr = do { no overloading; pack 'J', $self; };
6877
6878        # Convert the input to the standard equivalent, if any (won't have any
6879        # for $STRING properties)
6880        my $standard = $self->property->table($map);
6881        if (defined $standard) {
6882            $map = ($use_full_name)
6883                   ? $standard->full_name
6884                   : $standard->name;
6885        }
6886
6887        # Warn if there already is a non-equivalent default map for this
6888        # property.  Note that a default map can be a ref, which means that
6889        # what it actually means is delayed until later in the program, and it
6890        # IS permissible to override it here without a message.
6891        my $default_map = $default_map{$addr};
6892        if (defined $default_map
6893            && ! ref($default_map)
6894            && $default_map ne $map
6895            && main::Standardize($map) ne $default_map)
6896        {
6897            my $property = $self->property;
6898            my $map_table = $property->table($map);
6899            my $default_table = $property->table($default_map);
6900            if (defined $map_table
6901                && defined $default_table
6902                && $map_table != $default_table)
6903            {
6904                Carp::my_carp("Changing the default mapping for "
6905                            . $property
6906                            . " from $default_map to $map'");
6907            }
6908        }
6909
6910        $default_map{$addr} = $map;
6911
6912        # Don't also create any missing table for this map at this point,
6913        # because if we did, it could get done before the main table add is
6914        # done for PropValueAliases.txt; instead the caller will have to make
6915        # sure it exists, if desired.
6916        return;
6917    }
6918
6919    sub to_output_map($self) {
6920        # Returns boolean: should we write this map table?
6921        my $addr = do { no overloading; pack 'J', $self; };
6922
6923        # If overridden, use that
6924        return $to_output_map{$addr} if defined $to_output_map{$addr};
6925
6926        my $full_name = $self->full_name;
6927        return $global_to_output_map{$full_name}
6928                                if defined $global_to_output_map{$full_name};
6929
6930        # If table says to output, do so; if says to suppress it, do so.
6931        my $fate = $self->fate;
6932        return $INTERNAL_MAP if $fate == $INTERNAL_ONLY;
6933        return $EXTERNAL_MAP if grep { $_ eq $full_name } @output_mapped_properties;
6934        return 0 if $fate == $SUPPRESSED || $fate == $MAP_PROXIED;
6935
6936        my $type = $self->property->type;
6937
6938        # Don't want to output binary map tables even for debugging.
6939        return 0 if $type == $BINARY;
6940
6941        # But do want to output string ones.  All the ones that remain to
6942        # be dealt with (i.e. which haven't explicitly been set to external)
6943        # are for internal Perl use only.  The default for those that map to
6944        # $CODE_POINT and haven't been restricted to a single element range
6945        # is to use the adjusted form.
6946        if ($type == $STRING) {
6947            return $INTERNAL_MAP if $self->range_size_1
6948                                    || $default_map{$addr} ne $CODE_POINT;
6949            return $OUTPUT_ADJUSTED;
6950        }
6951
6952        # Otherwise is an $ENUM, do output it, for Perl's purposes
6953        return $INTERNAL_MAP;
6954    }
6955
6956    sub inverse_list($self) {
6957        # Returns a Range_List that is gaps of the current table.  That is,
6958        # the inversion
6959        my $current = Range_List->new(Initialize => $self->_range_list,
6960                                Owner => $self->property);
6961        return ~ $current;
6962    }
6963
6964    sub header($self) {
6965        my $return = $self->SUPER::header();
6966
6967        if ($self->to_output_map >= $INTERNAL_MAP) {
6968            $return .= $INTERNAL_ONLY_HEADER;
6969        }
6970        else {
6971            # Other properties have fixed formats.
6972            my $property_name = $self->property->full_name;
6973
6974            $return .= <<END;
6975
6976# !!!!!!!   IT IS DEPRECATED TO USE THIS FILE   !!!!!!!
6977
6978# This file is for internal use by core Perl only.  It is retained for
6979# backwards compatibility with applications that may have come to rely on it,
6980# but its format and even its name or existence are subject to change without
6981# notice in a future Perl version.  Don't use it directly.  Instead, its
6982# contents are now retrievable through a stable API in the Unicode::UCD
6983# module: Unicode::UCD::prop_invmap('$property_name') (Values for individual
6984# code points can be retrieved via Unicode::UCD::charprop());
6985END
6986        }
6987        return $return;
6988    }
6989
6990    sub set_final_comment($self) {
6991        # Just before output, create the comment that heads the file
6992        # containing this table.
6993
6994        return unless $debugging_build;
6995
6996        # No sense generating a comment if aren't going to write it out.
6997        return if ! $self->to_output_map;
6998
6999        my $addr = do { no overloading; pack 'J', $self; };
7000
7001        my $property = $self->property;
7002
7003        # Get all the possible names for this property.  Don't use any that
7004        # aren't ok for use in a file name, etc.  This is perhaps causing that
7005        # flag to do double duty, and may have to be changed in the future to
7006        # have our own flag for just this purpose; but it works now to exclude
7007        # Perl generated synonyms from the lists for properties, where the
7008        # name is always the proper Unicode one.
7009        my @property_aliases = grep { $_->ok_as_filename } $self->aliases;
7010
7011        my $count = $self->count;
7012        my $default_map = $default_map{$addr};
7013
7014        # The ranges that map to the default aren't output, so subtract that
7015        # to get those actually output.  A property with matching tables
7016        # already has the information calculated.
7017        if ($property->type != $STRING && $property->type != $FORCED_BINARY) {
7018            $count -= $property->table($default_map)->count;
7019        }
7020        elsif (defined $default_map) {
7021
7022            # But for $STRING properties, must calculate now.  Subtract the
7023            # count from each range that maps to the default.
7024            foreach my $range ($self->_range_list->ranges) {
7025                if ($range->value eq $default_map) {
7026                    $count -= $range->end +1 - $range->start;
7027                }
7028            }
7029
7030        }
7031
7032        # Get a  string version of $count with underscores in large numbers,
7033        # for clarity.
7034        my $string_count = main::clarify_code_point_count($count);
7035
7036        my $code_points = ($count == 1)
7037                        ? 'single code point'
7038                        : "$string_count code points";
7039
7040        my $mapping;
7041        my $these_mappings;
7042        my $are;
7043        if (@property_aliases <= 1) {
7044            $mapping = 'mapping';
7045            $these_mappings = 'this mapping';
7046            $are = 'is'
7047        }
7048        else {
7049            $mapping = 'synonymous mappings';
7050            $these_mappings = 'these mappings';
7051            $are = 'are'
7052        }
7053        my $cp;
7054        if ($count >= $MAX_UNICODE_CODEPOINTS) {
7055            $cp = "any code point in Unicode Version $string_version";
7056        }
7057        else {
7058            my $map_to;
7059            if ($default_map eq "") {
7060                $map_to = 'the empty string';
7061            }
7062            elsif ($default_map eq $CODE_POINT) {
7063                $map_to = "itself";
7064            }
7065            else {
7066                $map_to = "'$default_map'";
7067            }
7068            if ($count == 1) {
7069                $cp = "the single code point";
7070            }
7071            else {
7072                $cp = "one of the $code_points";
7073            }
7074            $cp .= " in Unicode Version $unicode_version for which the mapping is not to $map_to";
7075        }
7076
7077        my $comment = "";
7078
7079        my $status = $self->status;
7080        if ($status ne $NORMAL) {
7081            my $warn = uc $status_past_participles{$status};
7082            $comment .= <<END;
7083
7084!!!!!!!   $warn !!!!!!!!!!!!!!!!!!!
7085 All property or property=value combinations contained in this file are $warn.
7086 See $unicode_reference_url for what this means.
7087
7088END
7089        }
7090        $comment .= "This file returns the $mapping:\n";
7091
7092        my $ucd_accessible_name = "";
7093        my $has_underscore_name = 0;
7094        my $full_name = $self->property->full_name;
7095        for my $i (0 .. @property_aliases - 1) {
7096            my $name = $property_aliases[$i]->name;
7097            $has_underscore_name = 1 if $name =~ /^_/;
7098            $comment .= sprintf("%-8s%s\n", " ", $name . '(cp)');
7099            if ($property_aliases[$i]->ucd) {
7100                if ($name eq $full_name) {
7101                    $ucd_accessible_name = $full_name;
7102                }
7103                elsif (! $ucd_accessible_name) {
7104                    $ucd_accessible_name = $name;
7105                }
7106            }
7107        }
7108        $comment .= "\nwhere 'cp' is $cp.";
7109        if ($ucd_accessible_name) {
7110            $comment .= "  Note that $these_mappings";
7111            if ($has_underscore_name) {
7112                $comment .= " (except for the one(s) that begin with an underscore)";
7113            }
7114            $comment .= " $are accessible via the functions prop_invmap('$full_name') or charprop() in Unicode::UCD";
7115
7116        }
7117
7118        # And append any commentary already set from the actual property.
7119        $comment .= "\n\n" . $self->comment if $self->comment;
7120        if ($self->description) {
7121            $comment .= "\n\n" . join " ", $self->description;
7122        }
7123        if ($self->note) {
7124            $comment .= "\n\n" . join " ", $self->note;
7125        }
7126        $comment .= "\n";
7127
7128        if (! $self->perl_extension) {
7129            $comment .= <<END;
7130
7131For information about what this property really means, see:
7132$unicode_reference_url
7133END
7134        }
7135
7136        if ($count) {        # Format differs for empty table
7137                $comment.= "\nThe format of the ";
7138            if ($self->range_size_1) {
7139                $comment.= <<END;
7140main body of lines of this file is: CODE_POINT\\t\\tMAPPING where CODE_POINT
7141is in hex; MAPPING is what CODE_POINT maps to.
7142END
7143            }
7144            else {
7145
7146                # There are tables which end up only having one element per
7147                # range, but it is not worth keeping track of for making just
7148                # this comment a little better.
7149                $comment .= <<END;
7150non-comment portions of the main body of lines of this file is:
7151START\\tSTOP\\tMAPPING where START is the starting code point of the
7152range, in hex; STOP is the ending point, or if omitted, the range has just one
7153code point; MAPPING is what each code point between START and STOP maps to.
7154END
7155                if ($self->output_range_counts) {
7156                    $comment .= <<END;
7157Numbers in comments in [brackets] indicate how many code points are in the
7158range (omitted when the range is a single code point or if the mapping is to
7159the null string).
7160END
7161                }
7162            }
7163        }
7164        $self->set_comment(main::join_lines($comment));
7165        return;
7166    }
7167
7168    my %swash_keys; # Makes sure don't duplicate swash names.
7169
7170    # The remaining variables are temporaries used while writing each table,
7171    # to output special ranges.
7172    my @multi_code_point_maps;  # Map is to more than one code point.
7173
7174    sub handle_special_range($self, $range) {
7175        # Called in the middle of write when it finds a range it doesn't know
7176        # how to handle.
7177
7178        my $addr = do { no overloading; pack 'J', $self; };
7179
7180        my $type = $range->type;
7181
7182        my $low = $range->start;
7183        my $high = $range->end;
7184        my $map = $range->value;
7185
7186        # No need to output the range if it maps to the default.
7187        return if $map eq $default_map{$addr};
7188
7189        my $property = $self->property;
7190
7191        # Switch based on the map type...
7192        if ($type == $HANGUL_SYLLABLE) {
7193
7194            # These are entirely algorithmically determinable based on
7195            # some constants furnished by Unicode; for now, just set a
7196            # flag to indicate that have them.  After everything is figured
7197            # out, we will output the code that does the algorithm.  (Don't
7198            # output them if not needed because we are suppressing this
7199            # property.)
7200            $has_hangul_syllables = 1 if $property->to_output_map;
7201        }
7202        elsif ($type == $CP_IN_NAME) {
7203
7204            # Code points whose name ends in their code point are also
7205            # algorithmically determinable, but need information about the map
7206            # to do so.  Both the map and its inverse are stored in data
7207            # structures output in the file.  They are stored in the mean time
7208            # in global lists The lists will be written out later into Name.pm,
7209            # which is created only if needed.  In order to prevent duplicates
7210            # in the list, only add to them for one property, should multiple
7211            # ones need them.
7212            if ($needing_code_points_ending_in_code_point == 0) {
7213                $needing_code_points_ending_in_code_point = $property;
7214            }
7215            if ($property == $needing_code_points_ending_in_code_point) {
7216                push @{$names_ending_in_code_point{$map}->{'low'}}, $low;
7217                push @{$names_ending_in_code_point{$map}->{'high'}}, $high;
7218
7219                my $squeezed = $map =~ s/[-\s]+//gr;
7220                push @{$loose_names_ending_in_code_point{$squeezed}->{'low'}},
7221                                                                          $low;
7222                push @{$loose_names_ending_in_code_point{$squeezed}->{'high'}},
7223                                                                         $high;
7224
7225                # Calculate the set of legal characters in names of this
7226                # series.  It includes every character in the name prefix.
7227                my %legal;
7228                $legal{$_} = 1 for split //, $map;
7229
7230                # Plus the hex code point chars, blank, and minus.  Also \n
7231                # can show up as being required due to anchoring
7232                for my $i ('0' .. '9', 'A' .. 'F', '-', ' ', "\n") {
7233                    $legal{$i} = 1;
7234                }
7235                my $legal = join "", sort { $a cmp $b } keys %legal;
7236
7237                # The legal chars can be used in match optimizations
7238                push @code_points_ending_in_code_point, { low => $low,
7239                                                        high => $high,
7240                                                        name => $map,
7241                                                        legal => $legal,
7242                                                        };
7243            }
7244        }
7245        elsif ($range->type == $MULTI_CP || $range->type == $NULL) {
7246
7247            # Multi-code point maps and null string maps have an entry
7248            # for each code point in the range.  They use the same
7249            # output format.
7250            for my $code_point ($low .. $high) {
7251
7252                # The pack() below can't cope with surrogates.  XXX This may
7253                # no longer be true
7254                if ($code_point >= 0xD800 && $code_point <= 0xDFFF) {
7255                    Carp::my_carp("Surrogate code point '$code_point' in mapping to '$map' in $self.  No map created");
7256                    next;
7257                }
7258
7259                # Generate the hash entries for these in the form that
7260                # utf8.c understands.
7261                my $tostr = "";
7262                my $to_name = "";
7263                my $to_chr = "";
7264                foreach my $to (split " ", $map) {
7265                    if ($to !~ /^$code_point_re$/) {
7266                        Carp::my_carp("Illegal code point '$to' in mapping '$map' from $code_point in $self.  No map created");
7267                        next;
7268                    }
7269                    $tostr .= sprintf "\\x{%s}", $to;
7270                    $to = CORE::hex $to;
7271                    if ($annotate) {
7272                        $to_name .= " + " if $to_name;
7273                        $to_chr .= main::display_chr($to);
7274                        main::populate_char_info($to)
7275                                            if ! defined $viacode[$to];
7276                        $to_name .=  $viacode[$to];
7277                    }
7278                }
7279
7280                # The unpack yields a list of the bytes that comprise the
7281                # UTF-8 of $code_point, which are each placed in \xZZ format
7282                # and output in the %s to map to $tostr, so the result looks
7283                # like:
7284                # "\xC4\xB0" => "\x{0069}\x{0307}",
7285                my $utf8 = sprintf(qq["%s" => "$tostr",],
7286                        join("", map { sprintf "\\x%02X", $_ }
7287                            unpack("U0C*", chr $code_point)));
7288
7289                # Add a comment so that a human reader can more easily
7290                # see what's going on.
7291                push @multi_code_point_maps,
7292                        sprintf("%-45s # U+%04X", $utf8, $code_point);
7293                if (! $annotate) {
7294                    $multi_code_point_maps[-1] .= " => $map";
7295                }
7296                else {
7297                    main::populate_char_info($code_point)
7298                                    if ! defined $viacode[$code_point];
7299                    $multi_code_point_maps[-1] .= " '"
7300                        . main::display_chr($code_point)
7301                        . "' => '$to_chr'; $viacode[$code_point] => $to_name";
7302                }
7303            }
7304        }
7305        else {
7306            Carp::my_carp("Unrecognized map type '$range->type' in '$range' in $self.  Not written");
7307        }
7308
7309        return;
7310    }
7311
7312    sub pre_body($self) {
7313        # Returns the string that should be output in the file before the main
7314        # body of this table.  It isn't called until the main body is
7315        # calculated, saving a pass.  The string includes some hash entries
7316        # identifying the format of the body, and what the single value should
7317        # be for all ranges missing from it.  It also includes any code points
7318        # which have map_types that don't go in the main table.
7319
7320        my $addr = do { no overloading; pack 'J', $self; };
7321
7322        my $name = $self->property->swash_name;
7323
7324        # Currently there is nothing in the pre_body unless a swash is being
7325        # generated.
7326        return unless defined $name;
7327
7328        if (defined $swash_keys{$name}) {
7329            Carp::my_carp(main::join_lines(<<END
7330Already created a swash name '$name' for $swash_keys{$name}.  This means that
7331the same name desired for $self shouldn't be used.  Bad News.  This must be
7332fixed before production use, but proceeding anyway
7333END
7334            ));
7335        }
7336        $swash_keys{$name} = "$self";
7337
7338        my $pre_body = "";
7339
7340        # Here we assume we were called after have gone through the whole
7341        # file.  If we actually generated anything for each map type, add its
7342        # respective header and trailer
7343        my $specials_name = "";
7344        if (@multi_code_point_maps) {
7345            $specials_name = "Unicode::UCD::ToSpec$name";
7346            $pre_body .= <<END;
7347
7348# Some code points require special handling because their mappings are each to
7349# multiple code points.  These do not appear in the main body, but are defined
7350# in the hash below.
7351
7352# Each key is the string of N bytes that together make up the UTF-8 encoding
7353# for the code point.  (i.e. the same as looking at the code point's UTF-8
7354# under "use bytes").  Each value is the UTF-8 of the translation, for speed.
7355\%$specials_name = (
7356END
7357            $pre_body .= join("\n", @multi_code_point_maps) . "\n);\n";
7358        }
7359
7360        my $format = $self->format;
7361
7362        my $return = "";
7363
7364        my $output_adjusted = ($self->to_output_map == $OUTPUT_ADJUSTED);
7365        if ($output_adjusted) {
7366            if ($specials_name) {
7367                $return .= <<END;
7368# The mappings in the non-hash portion of this file must be modified to get the
7369# correct values by adding the code point ordinal number to each one that is
7370# numeric.
7371END
7372            }
7373            else {
7374                $return .= <<END;
7375# The mappings must be modified to get the correct values by adding the code
7376# point ordinal number to each one that is numeric.
7377END
7378            }
7379        }
7380
7381        $return .= <<END;
7382
7383# The name this table is to be known by, with the format of the mappings in
7384# the main body of the table, and what all code points missing from this file
7385# map to.
7386\$Unicode::UCD::SwashInfo{'To$name'}{'format'} = '$format'; # $map_table_formats{$format}
7387END
7388        if ($specials_name) {
7389            $return .= <<END;
7390\$Unicode::UCD::SwashInfo{'To$name'}{'specials_name'} = '$specials_name'; # Name of hash of special mappings
7391END
7392        }
7393        my $default_map = $default_map{$addr};
7394
7395        # For $CODE_POINT default maps and using adjustments, instead the default
7396        # becomes zero.
7397        $return .= "\$Unicode::UCD::SwashInfo{'To$name'}{'missing'} = '"
7398                .  (($output_adjusted && $default_map eq $CODE_POINT)
7399                   ? "0"
7400                   : $default_map)
7401                . "';";
7402
7403        if ($default_map eq $CODE_POINT) {
7404            $return .= ' # code point maps to itself';
7405        }
7406        elsif ($default_map eq "") {
7407            $return .= ' # code point maps to the empty string';
7408        }
7409        $return .= "\n";
7410
7411        $return .= $pre_body;
7412
7413        return $return;
7414    }
7415
7416    sub write($self) {
7417        # Write the table to the file.
7418
7419        my $addr = do { no overloading; pack 'J', $self; };
7420
7421        # Clear the temporaries
7422        undef @multi_code_point_maps;
7423
7424        # Calculate the format of the table if not already done.
7425        my $format = $self->format;
7426        my $type = $self->property->type;
7427        my $default_map = $self->default_map;
7428        if (! defined $format) {
7429            if ($type == $BINARY) {
7430
7431                # Don't bother checking the values, because we elsewhere
7432                # verify that a binary table has only 2 values.
7433                $format = $BINARY_FORMAT;
7434            }
7435            else {
7436                my @ranges = $self->_range_list->ranges;
7437
7438                # default an empty table based on its type and default map
7439                if (! @ranges) {
7440
7441                    # But it turns out that the only one we can say is a
7442                    # non-string (besides binary, handled above) is when the
7443                    # table is a string and the default map is to a code point
7444                    if ($type == $STRING && $default_map eq $CODE_POINT) {
7445                        $format = $HEX_FORMAT;
7446                    }
7447                    else {
7448                        $format = $STRING_FORMAT;
7449                    }
7450                }
7451                else {
7452
7453                    # Start with the most restrictive format, and as we find
7454                    # something that doesn't fit with that, change to the next
7455                    # most restrictive, and so on.
7456                    $format = $DECIMAL_FORMAT;
7457                    foreach my $range (@ranges) {
7458                        next if $range->type != 0;  # Non-normal ranges don't
7459                                                    # affect the main body
7460                        my $map = $range->value;
7461                        if ($map ne $default_map) {
7462                            last if $format eq $STRING_FORMAT;  # already at
7463                                                                # least
7464                                                                # restrictive
7465                            $format = $INTEGER_FORMAT
7466                                                if $format eq $DECIMAL_FORMAT
7467                                                    && $map !~ / ^ [0-9] $ /x;
7468                            $format = $FLOAT_FORMAT
7469                                            if $format eq $INTEGER_FORMAT
7470                                                && $map !~ / ^ -? [0-9]+ $ /x;
7471                            $format = $RATIONAL_FORMAT
7472                                if $format eq $FLOAT_FORMAT
7473                                    && $map !~ / ^ -? [0-9]+ \. [0-9]* $ /x;
7474                            $format = $HEX_FORMAT
7475                                if ($format eq $RATIONAL_FORMAT
7476                                       && $map !~
7477                                           m/ ^ -? [0-9]+ ( \/ [0-9]+ )? $ /x)
7478                                        # Assume a leading zero means hex,
7479                                        # even if all digits are 0-9
7480                                    || ($format eq $INTEGER_FORMAT
7481                                        && $map =~ /^0[0-9A-F]/);
7482                            $format = $STRING_FORMAT if $format eq $HEX_FORMAT
7483                                                       && $map =~ /[^0-9A-F]/;
7484                        }
7485                    }
7486                }
7487            }
7488        } # end of calculating format
7489
7490        if ($default_map eq $CODE_POINT
7491            && $format ne $HEX_FORMAT
7492            && ! defined $self->format)    # manual settings are always
7493                                           # considered ok
7494        {
7495            Carp::my_carp_bug("Expecting hex format for mapping table for $self, instead got '$format'")
7496        }
7497
7498        # If the output is to be adjusted, the format of the table that gets
7499        # output is actually 'a' or 'ax' instead of whatever it is stored
7500        # internally as.
7501        my $output_adjusted = ($self->to_output_map == $OUTPUT_ADJUSTED);
7502        if ($output_adjusted) {
7503            if ($default_map eq $CODE_POINT) {
7504                $format = $HEX_ADJUST_FORMAT;
7505            }
7506            else {
7507                $format = $ADJUST_FORMAT;
7508            }
7509        }
7510
7511        $self->_set_format($format);
7512
7513        return $self->SUPER::write(
7514            $output_adjusted,
7515            $default_map);   # don't write defaulteds
7516    }
7517
7518    # Accessors for the underlying list that should fail if locked.
7519    for my $sub (qw(
7520                    add_duplicate
7521                    replace_map
7522                ))
7523    {
7524        no strict "refs";
7525        *$sub = sub {
7526            use strict "refs";
7527            my $self = shift;
7528
7529            return if $self->carp_if_locked;
7530            return $self->_range_list->$sub(@_);
7531        }
7532    }
7533} # End closure for Map_Table
7534
7535package Match_Table;
7536use parent '-norequire', '_Base_Table';
7537
7538# A Match table is one which is a list of all the code points that have
7539# the same property and property value, for use in \p{property=value}
7540# constructs in regular expressions.  It adds very little data to the base
7541# structure, but many methods, as these lists can be combined in many ways to
7542# form new ones.
7543# There are only a few concepts added:
7544# 1) Equivalents and Relatedness.
7545#    Two tables can match the identical code points, but have different names.
7546#    This always happens when there is a perl single form extension
7547#    \p{IsProperty} for the Unicode compound form \P{Property=True}.  The two
7548#    tables are set to be related, with the Perl extension being a child, and
7549#    the Unicode property being the parent.
7550#
7551#    It may be that two tables match the identical code points and we don't
7552#    know if they are related or not.  This happens most frequently when the
7553#    Block and Script properties have the exact range.  But note that a
7554#    revision to Unicode could add new code points to the script, which would
7555#    now have to be in a different block (as the block was filled, or there
7556#    would have been 'Unknown' script code points in it and they wouldn't have
7557#    been identical).  So we can't rely on any two properties from Unicode
7558#    always matching the same code points from release to release, and thus
7559#    these tables are considered coincidentally equivalent--not related.  When
7560#    two tables are unrelated but equivalent, one is arbitrarily chosen as the
7561#    'leader', and the others are 'equivalents'.  This concept is useful
7562#    to minimize the number of tables written out.  Only one file is used for
7563#    any identical set of code points, with entries in UCD.pl mapping all
7564#    the involved tables to it.
7565#
7566#    Related tables will always be identical; we set them up to be so.  Thus
7567#    if the Unicode one is deprecated, the Perl one will be too.  Not so for
7568#    unrelated tables.  Relatedness makes generating the documentation easier.
7569#
7570# 2) Complement.
7571#    Like equivalents, two tables may be the inverses of each other, the
7572#    intersection between them is null, and the union is every Unicode code
7573#    point.  The two tables that occupy a binary property are necessarily like
7574#    this.  By specifying one table as the complement of another, we can avoid
7575#    storing it on disk (using the other table and performing a fast
7576#    transform), and some memory and calculations.
7577#
7578# 3) Conflicting.  It may be that there will eventually be name clashes, with
7579#    the same name meaning different things.  For a while, there actually were
7580#    conflicts, but they have so far been resolved by changing Perl's or
7581#    Unicode's definitions to match the other, but when this code was written,
7582#    it wasn't clear that that was what was going to happen.  (Unicode changed
7583#    because of protests during their beta period.)  Name clashes are warned
7584#    about during compilation, and the documentation.  The generated tables
7585#    are sane, free of name clashes, because the code suppresses the Perl
7586#    version.  But manual intervention to decide what the actual behavior
7587#    should be may be required should this happen.  The introductory comments
7588#    have more to say about this.
7589#
7590# 4) Definition.  This is a string for human consumption that specifies the
7591#    code points that this table matches.  This is used only for the generated
7592#    pod file.  It may be specified explicitly, or automatically computed.
7593#    Only the first portion of complicated definitions is computed and
7594#    displayed.
7595
7596sub standardize { return main::standardize($_[0]); }
7597sub trace { return main::trace(@_); }
7598
7599
7600{ # Closure
7601
7602    main::setup_package();
7603
7604    my %leader;
7605    # The leader table of this one; initially $self.
7606    main::set_access('leader', \%leader, 'r');
7607
7608    my %equivalents;
7609    # An array of any tables that have this one as their leader
7610    main::set_access('equivalents', \%equivalents, 'readable_array');
7611
7612    my %parent;
7613    # The parent table to this one, initially $self.  This allows us to
7614    # distinguish between equivalent tables that are related (for which this
7615    # is set to), and those which may not be, but share the same output file
7616    # because they match the exact same set of code points in the current
7617    # Unicode release.
7618    main::set_access('parent', \%parent, 'r');
7619
7620    my %children;
7621    # An array of any tables that have this one as their parent
7622    main::set_access('children', \%children, 'readable_array');
7623
7624    my %conflicting;
7625    # Array of any tables that would have the same name as this one with
7626    # a different meaning.  This is used for the generated documentation.
7627    main::set_access('conflicting', \%conflicting, 'readable_array');
7628
7629    my %matches_all;
7630    # Set in the constructor for tables that are expected to match all code
7631    # points.
7632    main::set_access('matches_all', \%matches_all, 'r');
7633
7634    my %complement;
7635    # Points to the complement that this table is expressed in terms of; 0 if
7636    # none.
7637    main::set_access('complement', \%complement, 'r');
7638
7639    my %definition;
7640    # Human readable string of the first few ranges of code points matched by
7641    # this table
7642    main::set_access('definition', \%definition, 'r', 's');
7643
7644    sub new {
7645        my $class = shift;
7646
7647        my %args = @_;
7648
7649        # The property for which this table is a listing of property values.
7650        my $property = delete $args{'_Property'};
7651
7652        my $name = delete $args{'Name'};
7653        my $full_name = delete $args{'Full_Name'};
7654        $full_name = $name if ! defined $full_name;
7655
7656        # Optional
7657        my $initialize = delete $args{'Initialize'};
7658        my $matches_all = delete $args{'Matches_All'} || 0;
7659        my $format = delete $args{'Format'};
7660        my $definition = delete $args{'Definition'} // "";
7661        # Rest of parameters passed on.
7662
7663        my $range_list = Range_List->new(Initialize => $initialize,
7664                                         Owner => $property);
7665
7666        my $complete = $full_name;
7667        $complete = '""' if $complete eq "";  # A null name shouldn't happen,
7668                                              # but this helps debug if it
7669                                              # does
7670        # The complete name for a match table includes it's property in a
7671        # compound form 'property=table', except if the property is the
7672        # pseudo-property, perl, in which case it is just the single form,
7673        # 'table' (If you change the '=' must also change the ':' in lots of
7674        # places in this program that assume an equal sign)
7675        $complete = $property->full_name . "=$complete" if $property != $perl;
7676
7677        my $self = $class->SUPER::new(%args,
7678                                      Name => $name,
7679                                      Complete_Name => $complete,
7680                                      Full_Name => $full_name,
7681                                      _Property => $property,
7682                                      _Range_List => $range_list,
7683                                      Format => $EMPTY_FORMAT,
7684                                      Write_As_Invlist => 1,
7685                                      );
7686        my $addr = do { no overloading; pack 'J', $self; };
7687
7688        $conflicting{$addr} = [ ];
7689        $equivalents{$addr} = [ ];
7690        $children{$addr} = [ ];
7691        $matches_all{$addr} = $matches_all;
7692        $leader{$addr} = $self;
7693        $parent{$addr} = $self;
7694        $complement{$addr} = 0;
7695        $definition{$addr} = $definition;
7696
7697        if (defined $format && $format ne $EMPTY_FORMAT) {
7698            Carp::my_carp_bug("'Format' must be '$EMPTY_FORMAT' in a match table instead of '$format'.  Using '$EMPTY_FORMAT'");
7699        }
7700
7701        return $self;
7702    }
7703
7704    # See this program's beginning comment block about overloading these.
7705    use overload
7706        fallback => 0,
7707        qw("") => "_operator_stringify",
7708        '=' => sub {
7709                    my $self = shift;
7710
7711                    return if $self->carp_if_locked;
7712                    return $self;
7713                },
7714
7715        '+' => sub {
7716                        my $self = shift;
7717                        my $other = shift;
7718
7719                        return $self->_range_list + $other;
7720                    },
7721        '&' => sub {
7722                        my $self = shift;
7723                        my $other = shift;
7724
7725                        return $self->_range_list & $other;
7726                    },
7727        '+=' => sub {
7728                        my $self = shift;
7729                        my $other = shift;
7730                        my $reversed = shift;
7731
7732                        if ($reversed) {
7733                            Carp::my_carp_bug("Bad news.  Can't cope with '"
7734                            . ref($other)
7735                            . ' += '
7736                            . ref($self)
7737                            . "'.  undef returned.");
7738                            return;
7739                        }
7740
7741                        return if $self->carp_if_locked;
7742
7743                        my $addr = do { no overloading; pack 'J', $self; };
7744
7745                        if (ref $other) {
7746
7747                            # Change the range list of this table to be the
7748                            # union of the two.
7749                            $self->_set_range_list($self->_range_list
7750                                                    + $other);
7751                        }
7752                        else {    # $other is just a simple value
7753                            $self->add_range($other, $other);
7754                        }
7755                        return $self;
7756                    },
7757        '&=' => sub {
7758                        my $self = shift;
7759                        my $other = shift;
7760                        my $reversed = shift;
7761
7762                        if ($reversed) {
7763                            Carp::my_carp_bug("Bad news.  Can't cope with '"
7764                            . ref($other)
7765                            . ' &= '
7766                            . ref($self)
7767                            . "'.  undef returned.");
7768                            return;
7769                        }
7770
7771                        return if $self->carp_if_locked;
7772                        $self->_set_range_list($self->_range_list & $other);
7773                        return $self;
7774                    },
7775        '-' => sub { my $self = shift;
7776                    my $other = shift;
7777                    my $reversed = shift;
7778                    if ($reversed) {
7779                        Carp::my_carp_bug("Bad news.  Can't cope with '"
7780                        . ref($other)
7781                        . ' - '
7782                        . ref($self)
7783                        . "'.  undef returned.");
7784                        return;
7785                    }
7786
7787                    return $self->_range_list - $other;
7788                },
7789        '~' => sub { my $self = shift;
7790                    return ~ $self->_range_list;
7791                },
7792    ;
7793
7794    sub _operator_stringify($self, $other="", $reversed=0) {
7795
7796        my $name = $self->complete_name;
7797        return "Table '$name'";
7798    }
7799
7800    sub _range_list {
7801        # Returns the range list associated with this table, which will be the
7802        # complement's if it has one.
7803
7804        my $self = shift;
7805        my $complement = $self->complement;
7806
7807        # In order to avoid re-complementing on each access, only do the
7808        # complement the first time, and store the result in this table's
7809        # range list to use henceforth.  However, this wouldn't work if the
7810        # controlling (complement) table changed after we do this, so lock it.
7811        # Currently, the value of the complement isn't needed until after it
7812        # is fully constructed, so this works.  If this were to change, the
7813        # each_range iteration functionality would no longer work on this
7814        # complement.
7815        if ($complement != 0 && $self->SUPER::_range_list->count == 0) {
7816            $self->_set_range_list($self->SUPER::_range_list
7817                                + ~ $complement->_range_list);
7818            $complement->lock;
7819        }
7820
7821        return $self->SUPER::_range_list;
7822    }
7823
7824    sub add_alias {
7825        # Add a synonym for this table.  See the comments in the base class
7826
7827        my $self = shift;
7828        my $name = shift;
7829        # Rest of parameters passed on.
7830
7831        $self->SUPER::add_alias($name, $self, @_);
7832        return;
7833    }
7834
7835    sub add_conflicting {
7836        # Add the name of some other object to the list of ones that name
7837        # clash with this match table.
7838
7839        my $self = shift;
7840        my $conflicting_name = shift;   # The name of the conflicting object
7841        my $p = shift || 'p';           # Optional, is this a \p{} or \P{} ?
7842        my $conflicting_object = shift; # Optional, the conflicting object
7843                                        # itself.  This is used to
7844                                        # disambiguate the text if the input
7845                                        # name is identical to any of the
7846                                        # aliases $self is known by.
7847                                        # Sometimes the conflicting object is
7848                                        # merely hypothetical, so this has to
7849                                        # be an optional parameter.
7850        Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7851
7852        my $addr = do { no overloading; pack 'J', $self; };
7853
7854        # Check if the conflicting name is exactly the same as any existing
7855        # alias in this table (as long as there is a real object there to
7856        # disambiguate with).
7857        if (defined $conflicting_object) {
7858            foreach my $alias ($self->aliases) {
7859                if (standardize($alias->name) eq standardize($conflicting_name)) {
7860
7861                    # Here, there is an exact match.  This results in
7862                    # ambiguous comments, so disambiguate by changing the
7863                    # conflicting name to its object's complete equivalent.
7864                    $conflicting_name = $conflicting_object->complete_name;
7865                    last;
7866                }
7867            }
7868        }
7869
7870        # Convert to the \p{...} final name
7871        $conflicting_name = "\\$p" . "{$conflicting_name}";
7872
7873        # Only add once
7874        return if grep { $conflicting_name eq $_ } @{$conflicting{$addr}};
7875
7876        push @{$conflicting{$addr}}, $conflicting_name;
7877
7878        return;
7879    }
7880
7881    sub is_set_equivalent_to($self, $other=undef) {
7882        # Return boolean of whether or not the other object is a table of this
7883        # type and has been marked equivalent to this one.
7884
7885        return 0 if ! defined $other; # Can happen for incomplete early
7886                                      # releases
7887        unless ($other->isa(__PACKAGE__)) {
7888            my $ref_other = ref $other;
7889            my $ref_self = ref $self;
7890            Carp::my_carp_bug("Argument to 'is_set_equivalent_to' must be another $ref_self, not a '$ref_other'.  $other not set equivalent to $self.");
7891            return 0;
7892        }
7893
7894        # Two tables are equivalent if they have the same leader.
7895        no overloading;
7896        return $leader{pack 'J', $self} == $leader{pack 'J', $other};
7897        return;
7898    }
7899
7900    sub set_equivalent_to {
7901        # Set $self equivalent to the parameter table.
7902        # The required Related => 'x' parameter is a boolean indicating
7903        # whether these tables are related or not.  If related, $other becomes
7904        # the 'parent' of $self; if unrelated it becomes the 'leader'
7905        #
7906        # Related tables share all characteristics except names; equivalents
7907        # not quite so many.
7908        # If they are related, one must be a perl extension.  This is because
7909        # we can't guarantee that Unicode won't change one or the other in a
7910        # later release even if they are identical now.
7911
7912        my $self = shift;
7913        my $other = shift;
7914
7915        my %args = @_;
7916        my $related = delete $args{'Related'};
7917
7918        Carp::carp_extra_args(\%args) if main::DEBUG && %args;
7919
7920        return if ! defined $other;     # Keep on going; happens in some early
7921                                        # Unicode releases.
7922
7923        if (! defined $related) {
7924            Carp::my_carp_bug("set_equivalent_to must have 'Related => [01] parameter.  Assuming $self is not related to $other");
7925            $related = 0;
7926        }
7927
7928        # If already are equivalent, no need to re-do it;  if subroutine
7929        # returns null, it found an error, also do nothing
7930        my $are_equivalent = $self->is_set_equivalent_to($other);
7931        return if ! defined $are_equivalent || $are_equivalent;
7932
7933        my $addr = do { no overloading; pack 'J', $self; };
7934        my $current_leader = ($related) ? $parent{$addr} : $leader{$addr};
7935
7936        if ($related) {
7937            if ($current_leader->perl_extension) {
7938                if ($other->perl_extension) {
7939                    Carp::my_carp_bug("Use add_alias() to set two Perl tables '$self' and '$other', equivalent.");
7940                    return;
7941                }
7942            } elsif ($self->property != $other->property    # Depending on
7943                                                            # situation, might
7944                                                            # be better to use
7945                                                            # add_alias()
7946                                                            # instead for same
7947                                                            # property
7948                     && ! $other->perl_extension
7949
7950                         # We allow the sc and scx properties to be marked as
7951                         # related.  They are in fact related, and this allows
7952                         # the pod to show that better.  This test isn't valid
7953                         # if this is an early Unicode release without the scx
7954                         # property (having that also implies the sc property
7955                         # exists, so don't have to test for no 'sc')
7956                     && (   ! defined $scx
7957                         && ! (   (   $self->property == $script
7958                                   || $self->property == $scx)
7959                               && (   $self->property == $script
7960                                   || $self->property == $scx))))
7961            {
7962                Carp::my_carp_bug("set_equivalent_to should have 'Related => 0 for equivalencing two Unicode properties.  Assuming $self is not related to $other");
7963                $related = 0;
7964            }
7965        }
7966
7967        if (! $self->is_empty && ! $self->matches_identically_to($other)) {
7968            Carp::my_carp_bug("$self should be empty or match identically to $other.  Not setting equivalent");
7969            return;
7970        }
7971
7972        my $leader = do { no overloading; pack 'J', $current_leader; };
7973        my $other_addr = do { no overloading; pack 'J', $other; };
7974
7975        # Any tables that are equivalent to or children of this table must now
7976        # instead be equivalent to or (children) to the new leader (parent),
7977        # still equivalent.  The equivalency includes their matches_all info,
7978        # and for related tables, their fate and status.
7979        # All related tables are of necessity equivalent, but the converse
7980        # isn't necessarily true
7981        my $status = $other->status;
7982        my $status_info = $other->status_info;
7983        my $fate = $other->fate;
7984        my $matches_all = $matches_all{other_addr};
7985        my $caseless_equivalent = $other->caseless_equivalent;
7986        foreach my $table ($current_leader, @{$equivalents{$leader}}) {
7987            next if $table == $other;
7988            trace "setting $other to be the leader of $table, status=$status" if main::DEBUG && $to_trace;
7989
7990            my $table_addr = do { no overloading; pack 'J', $table; };
7991            $leader{$table_addr} = $other;
7992            $matches_all{$table_addr} = $matches_all;
7993            $self->_set_range_list($other->_range_list);
7994            push @{$equivalents{$other_addr}}, $table;
7995            if ($related) {
7996                $parent{$table_addr} = $other;
7997                push @{$children{$other_addr}}, $table;
7998                $table->set_status($status, $status_info);
7999
8000                # This reason currently doesn't get exposed outside; otherwise
8001                # would have to look up the parent's reason and use it instead.
8002                $table->set_fate($fate, "Parent's fate");
8003
8004                $self->set_caseless_equivalent($caseless_equivalent);
8005            }
8006        }
8007
8008        # Now that we've declared these to be equivalent, any changes to one
8009        # of the tables would invalidate that equivalency.
8010        $self->lock;
8011        $other->lock;
8012        return;
8013    }
8014
8015    sub set_complement($self, $other) {
8016        # Set $self to be the complement of the parameter table.  $self is
8017        # locked, as what it contains should all come from the other table.
8018
8019        if ($other->complement != 0) {
8020            Carp::my_carp_bug("Can't set $self to be the complement of $other, which itself is the complement of " . $other->complement);
8021            return;
8022        }
8023        my $addr = do { no overloading; pack 'J', $self; };
8024        $complement{$addr} = $other;
8025
8026        # Be sure the other property knows we are depending on them; or the
8027        # other table if it is one in the current property.
8028        if ($self->property != $other->property) {
8029            $other->property->set_has_dependency(1);
8030        }
8031        else {
8032            $other->set_has_dependency(1);
8033        }
8034        $self->lock;
8035        return;
8036    }
8037
8038    sub add_range($self, @range) { # Add a range to the list for this table.
8039        # Rest of parameters passed on
8040
8041        return if $self->carp_if_locked;
8042        return $self->_range_list->add_range(@range);
8043    }
8044
8045    sub header($self) {
8046        # All match tables are to be used only by the Perl core.
8047        return $self->SUPER::header() . $INTERNAL_ONLY_HEADER;
8048    }
8049
8050    sub pre_body {  # Does nothing for match tables.
8051        return
8052    }
8053
8054    sub append_to_body {  # Does nothing for match tables.
8055        return
8056    }
8057
8058    sub set_fate($self, $fate, $reason=undef) {
8059        $self->SUPER::set_fate($fate, $reason);
8060
8061        # All children share this fate
8062        foreach my $child ($self->children) {
8063            $child->set_fate($fate, $reason);
8064        }
8065        return;
8066    }
8067
8068    sub calculate_table_definition
8069    {
8070        # Returns a human-readable string showing some or all of the code
8071        # points matched by this table.  The string will include a
8072        # bracketed-character class for all characters matched in the 00-FF
8073        # range, and the first few ranges matched beyond that.
8074        my $max_ranges = 6;
8075
8076        my $self = shift;
8077        my $definition = $self->definition || "";
8078
8079        # Skip this if already have a definition.
8080        return $definition if $definition;
8081
8082        my $lows_string = "";   # The string representation of the 0-FF
8083                                # characters
8084        my $string_range = "";  # The string rep. of the above FF ranges
8085        my $range_count = 0;    # How many ranges in $string_rage
8086
8087        my @lows_invlist;       # The inversion list of the 0-FF code points
8088        my $first_non_control = ord(" ");   # Everything below this is a
8089                                            # control, on ASCII or EBCDIC
8090        my $max_table_code_point = $self->max;
8091
8092        # On ASCII platforms, the range 80-FF contains no printables.
8093        my $highest_printable = ((main::NON_ASCII_PLATFORM) ? 255 : 126);
8094
8095
8096        # Look through the first few ranges matched by this table.
8097        $self->reset_each_range;    # Defensive programming
8098        while (defined (my $range = $self->each_range())) {
8099            my $start = $range->start;
8100            my $end = $range->end;
8101
8102            # Accumulate an inversion list of the 00-FF code points
8103            if ($start < 256 && ($start > 0 || $end < 256)) {
8104                push @lows_invlist, $start;
8105                push @lows_invlist, 1 + (($end < 256) ? $end : 255);
8106
8107                # Get next range if there are more ranges below 256
8108                next if $end < 256 && $end < $max_table_code_point;
8109
8110                # If the range straddles the 255/256 boundary, we split it
8111                # there.  We already added above the low portion to the
8112                # inversion list
8113                $start = 256 if $end > 256;
8114            }
8115
8116            # Here, @lows_invlist contains the code points below 256, and
8117            # there is no other range, or the current one starts at or above
8118            # 256.  Generate the [char class] for the 0-255 ones.
8119            while (@lows_invlist) {
8120
8121                # If this range (necessarily the first one, by the way) starts
8122                # at 0 ...
8123                if ($lows_invlist[0] == 0) {
8124
8125                    # If it ends within the block of controls, that means that
8126                    # some controls are in it and some aren't.  Since Unicode
8127                    # properties pretty much only know about a few of the
8128                    # controls, like \n, \t, this means that its one of them
8129                    # that isn't in the range.  Complement the inversion list
8130                    # which will likely cause these to be output using their
8131                    # mnemonics, hence being clearer.
8132                    if ($lows_invlist[1] < $first_non_control) {
8133                        $lows_string .= '^';
8134                        shift @lows_invlist;
8135                        push @lows_invlist, 256;
8136                    }
8137                    elsif ($lows_invlist[1] <= $highest_printable) {
8138
8139                        # Here, it extends into the printables block.  Split
8140                        # into two ranges so that the controls are separate.
8141                        $lows_string .= sprintf "\\x00-\\x%02x",
8142                                                    $first_non_control - 1;
8143                        $lows_invlist[0] = $first_non_control;
8144                    }
8145                }
8146
8147                # If the range completely contains the printables, don't
8148                # individually spell out the printables.
8149                if (    $lows_invlist[0] <= $first_non_control
8150                    && $lows_invlist[1] > $highest_printable)
8151                {
8152                    $lows_string .= sprintf "\\x%02x-\\x%02x",
8153                                        $lows_invlist[0], $lows_invlist[1] - 1;
8154                    shift @lows_invlist;
8155                    shift @lows_invlist;
8156                    next;
8157                }
8158
8159                # Here, the range may include some but not all printables.
8160                # Look at each one individually
8161                foreach my $ord (shift @lows_invlist .. shift(@lows_invlist) - 1) {
8162                    my $char = chr $ord;
8163
8164                    # If there is already something in the list, an
8165                    # alphanumeric char could be the next in sequence.  If so,
8166                    # we start or extend a range.  That is, we could have so
8167                    # far something like 'a-c', and the next char is a 'd', so
8168                    # we change it to 'a-d'.  We use native_to_unicode()
8169                    # because a-z on EBCDIC means 26 chars, and excludes the
8170                    # gap ones.
8171                    if ($lows_string ne "" && $char =~ /[[:alnum:]]/) {
8172                        my $prev = substr($lows_string, -1);
8173                        if (   $prev !~ /[[:alnum:]]/
8174                            ||   utf8::native_to_unicode(ord $prev) + 1
8175                              != utf8::native_to_unicode(ord $char))
8176                        {
8177                            # Not extending the range
8178                            $lows_string .= $char;
8179                        }
8180                        elsif (   length $lows_string > 1
8181                               && substr($lows_string, -2, 1) eq '-')
8182                        {
8183                            # We had a sequence like '-c' and the current
8184                            # character is 'd'.  Extend the range.
8185                            substr($lows_string, -1, 1) = $char;
8186                        }
8187                        else {
8188                            # We had something like 'd' and this is 'e'.
8189                            # Start a range.
8190                            $lows_string .= "-$char";
8191                        }
8192                    }
8193                    elsif ($char =~ /[[:graph:]]/) {
8194
8195                        # We output a graphic char as-is, preceded by a
8196                        # backslash if it is a metacharacter
8197                        $lows_string .= '\\'
8198                                if $char =~ /[\\\^\$\@\%\|()\[\]\{\}\-\/"']/;
8199                        $lows_string .= $char;
8200                    } # Otherwise use mnemonic for any that have them
8201                    elsif ($char =~ /[\a]/) {
8202                        $lows_string .= '\a';
8203                    }
8204                    elsif ($char =~ /[\b]/) {
8205                        $lows_string .= '\b';
8206                    }
8207                    elsif ($char eq "\e") {
8208                        $lows_string .= '\e';
8209                    }
8210                    elsif ($char eq "\f") {
8211                        $lows_string .= '\f';
8212                    }
8213                    elsif ($char eq "\cK") {
8214                        $lows_string .= '\cK';
8215                    }
8216                    elsif ($char eq "\n") {
8217                        $lows_string .= '\n';
8218                    }
8219                    elsif ($char eq "\r") {
8220                        $lows_string .= '\r';
8221                    }
8222                    elsif ($char eq "\t") {
8223                        $lows_string .= '\t';
8224                    }
8225                    else {
8226
8227                        # Here is a non-graphic without a mnemonic.  We use \x
8228                        # notation.  But if the ordinal of this is one above
8229                        # the previous, create or extend the range
8230                        my $hex_representation = sprintf("%02x", ord $char);
8231                        if (   length $lows_string >= 4
8232                            && substr($lows_string, -4, 2) eq '\\x'
8233                            && hex(substr($lows_string, -2)) + 1 == ord $char)
8234                        {
8235                            if (       length $lows_string >= 5
8236                                &&     substr($lows_string, -5, 1) eq '-'
8237                                && (   length $lows_string == 5
8238                                    || substr($lows_string, -6, 1) ne '\\'))
8239                            {
8240                                substr($lows_string, -2) = $hex_representation;
8241                            }
8242                            else {
8243                                $lows_string .= '-\\x' . $hex_representation;
8244                            }
8245                        }
8246                        else {
8247                            $lows_string .= '\\x' . $hex_representation;
8248                        }
8249                    }
8250                }
8251            }
8252
8253            # Done with assembling the string of all lows.  If there are only
8254            # lows in the property, are completely done.
8255            if ($max_table_code_point < 256) {
8256                $self->reset_each_range;
8257                last;
8258            }
8259
8260            # Otherwise, quit if reached max number of non-lows ranges.  If
8261            # there are lows, count them as one unit towards the maximum.
8262            $range_count++;
8263            if ($range_count > (($lows_string eq "") ? $max_ranges : $max_ranges - 1)) {
8264                $string_range .= " ...";
8265                $self->reset_each_range;
8266                last;
8267            }
8268
8269            # Otherwise add this range.
8270            $string_range .= ", " if $string_range ne "";
8271            if ($start == $end) {
8272                $string_range .= sprintf("U+%04X", $start);
8273            }
8274            elsif ($end >= $MAX_WORKING_CODEPOINT)  {
8275                $string_range .= sprintf("U+%04X..infinity", $start);
8276            }
8277            else  {
8278                $string_range .= sprintf("U+%04X..%04X",
8279                                        $start, $end);
8280            }
8281        }
8282
8283        # Done with all the ranges we're going to look at.  Assemble the
8284        # definition from the lows + non-lows.
8285
8286        if ($lows_string ne "" || $string_range ne "") {
8287            if ($lows_string ne "") {
8288                $definition .= "[$lows_string]";
8289                $definition .= ", " if $string_range;
8290            }
8291            $definition .= $string_range;
8292        }
8293
8294        return $definition;
8295    }
8296
8297    sub write($self) {
8298        return $self->SUPER::write(0); # No adjustments
8299    }
8300
8301    # $leader - Should only be called on the leader table of an equivalent group
8302    sub set_final_comment($leader) {
8303        # This creates a comment for the file that is to hold the match table
8304        # $self.  It is somewhat convoluted to make the English read nicely,
8305        # but, heh, it's just a comment.
8306        # This should be called only with the leader match table of all the
8307        # ones that share the same file.  It lists all such tables, ordered so
8308        # that related ones are together.
8309
8310        return unless $debugging_build;
8311
8312        my $addr = do { no overloading; pack 'J', $leader; };
8313
8314        if ($leader{$addr} != $leader) {
8315            Carp::my_carp_bug(<<END
8316set_final_comment() must be called on a leader table, which $leader is not.
8317It is equivalent to $leader{$addr}.  No comment created
8318END
8319            );
8320            return;
8321        }
8322
8323        # Get the number of code points matched by each of the tables in this
8324        # file, and add underscores for clarity.
8325        my $count = $leader->count;
8326        my $unicode_count;
8327        my $non_unicode_string;
8328        if ($count > $MAX_UNICODE_CODEPOINTS) {
8329            $unicode_count = $count - ($MAX_WORKING_CODEPOINT
8330                                       - $MAX_UNICODE_CODEPOINT);
8331            $non_unicode_string = "All above-Unicode code points match as well, and are also returned";
8332        }
8333        else {
8334            $unicode_count = $count;
8335            $non_unicode_string = "";
8336        }
8337        my $string_count = main::clarify_code_point_count($unicode_count);
8338
8339        my $loose_count = 0;        # how many aliases loosely matched
8340        my $compound_name = "";     # ? Are any names compound?, and if so, an
8341                                    # example
8342        my $properties_with_compound_names = 0;    # count of these
8343
8344
8345        my %flags;              # The status flags used in the file
8346        my $total_entries = 0;  # number of entries written in the comment
8347        my $matches_comment = ""; # The portion of the comment about the
8348                                  # \p{}'s
8349        my @global_comments;    # List of all the tables' comments that are
8350                                # there before this routine was called.
8351        my $has_ucd_alias = 0;  # If there is an alias that is accessible via
8352                                # Unicode::UCD.  If not, then don't say it is
8353                                # in the comment
8354
8355        # Get list of all the parent tables that are equivalent to this one
8356        # (including itself).
8357        my @parents = grep { $parent{main::objaddr $_} == $_ }
8358                            main::uniques($leader, @{$equivalents{$addr}});
8359        my $has_unrelated = (@parents >= 2);  # boolean, ? are there unrelated
8360                                              # tables
8361        for my $parent (@parents) {
8362
8363            my $property = $parent->property;
8364
8365            # Special case 'N' tables in properties with two match tables when
8366            # the other is a 'Y' one.  These are likely to be binary tables,
8367            # but not necessarily.  In either case, \P{} will match the
8368            # complement of \p{}, and so if something is a synonym of \p, the
8369            # complement of that something will be the synonym of \P.  This
8370            # would be true of any property with just two match tables, not
8371            # just those whose values are Y and N; but that would require a
8372            # little extra work, and there are none such so far in Unicode.
8373            my $perl_p = 'p';        # which is it?  \p{} or \P{}
8374            my @yes_perl_synonyms;   # list of any synonyms for the 'Y' table
8375
8376            if (scalar $property->tables == 2
8377                && $parent == $property->table('N')
8378                && defined (my $yes = $property->table('Y')))
8379            {
8380                my $yes_addr = do { no overloading; pack 'J', $yes; };
8381                @yes_perl_synonyms
8382                    = grep { $_->property == $perl }
8383                                    main::uniques($yes,
8384                                                $parent{$yes_addr},
8385                                                $parent{$yes_addr}->children);
8386
8387                # But these synonyms are \P{} ,not \p{}
8388                $perl_p = 'P';
8389            }
8390
8391            my @description;        # Will hold the table description
8392            my @note;               # Will hold the table notes.
8393            my @conflicting;        # Will hold the table conflicts.
8394
8395            # Look at the parent, any yes synonyms, and all the children
8396            my $parent_addr = do { no overloading; pack 'J', $parent; };
8397            for my $table ($parent,
8398                           @yes_perl_synonyms,
8399                           @{$children{$parent_addr}})
8400            {
8401                my $table_addr = do { no overloading; pack 'J', $table; };
8402                my $table_property = $table->property;
8403
8404                # Tables are separated by a blank line to create a grouping.
8405                $matches_comment .= "\n" if $matches_comment;
8406
8407                # The table is named based on the property and value
8408                # combination it is for, like script=greek.  But there may be
8409                # a number of synonyms for each side, like 'sc' for 'script',
8410                # and 'grek' for 'greek'.  Any combination of these is a valid
8411                # name for this table.  In this case, there are three more,
8412                # 'sc=grek', 'sc=greek', and 'script='grek'.  Rather than
8413                # listing all possible combinations in the comment, we make
8414                # sure that each synonym occurs at least once, and add
8415                # commentary that the other combinations are possible.
8416                # Because regular expressions don't recognize things like
8417                # \p{jsn=}, only look at non-null right-hand-sides
8418                my @property_aliases = grep { $_->status ne $INTERNAL_ALIAS } $table_property->aliases;
8419                my @table_aliases = grep { $_->name ne "" } $table->aliases;
8420
8421                # The alias lists above are already ordered in the order we
8422                # want to output them.  To ensure that each synonym is listed,
8423                # we must use the max of the two numbers.  But if there are no
8424                # legal synonyms (nothing in @table_aliases), then we don't
8425                # list anything.
8426                my $listed_combos = (@table_aliases)
8427                                    ?  main::max(scalar @table_aliases,
8428                                                 scalar @property_aliases)
8429                                    : 0;
8430                trace "$listed_combos, tables=", scalar @table_aliases, "; property names=", scalar @property_aliases if main::DEBUG;
8431
8432                my $property_had_compound_name = 0;
8433
8434                for my $i (0 .. $listed_combos - 1) {
8435                    $total_entries++;
8436
8437                    # The current alias for the property is the next one on
8438                    # the list, or if beyond the end, start over.  Similarly
8439                    # for the table (\p{prop=table})
8440                    my $property_alias = $property_aliases
8441                                            [$i % @property_aliases]->name;
8442                    my $table_alias_object = $table_aliases
8443                                                        [$i % @table_aliases];
8444                    my $table_alias = $table_alias_object->name;
8445                    my $loose_match = $table_alias_object->loose_match;
8446                    $has_ucd_alias |= $table_alias_object->ucd;
8447
8448                    if ($table_alias !~ /\D/) { # Clarify large numbers.
8449                        $table_alias = main::clarify_number($table_alias)
8450                    }
8451
8452                    # Add a comment for this alias combination
8453                    my $current_match_comment;
8454                    if ($table_property == $perl) {
8455                        $current_match_comment = "\\$perl_p"
8456                                                    . "{$table_alias}";
8457                    }
8458                    else {
8459                        $current_match_comment
8460                                        = "\\p{$property_alias=$table_alias}";
8461                        $property_had_compound_name = 1;
8462                    }
8463
8464                    # Flag any abnormal status for this table.
8465                    my $flag = $property->status
8466                                || $table->status
8467                                || $table_alias_object->status;
8468                    if ($flag && $flag ne $PLACEHOLDER) {
8469                        $flags{$flag} = $status_past_participles{$flag};
8470                    }
8471
8472                    $loose_count++;
8473
8474                    # Pretty up the comment.  Note the \b; it says don't make
8475                    # this line a continuation.
8476                    $matches_comment .= sprintf("\b%-1s%-s%s\n",
8477                                        $flag,
8478                                        " " x 7,
8479                                        $current_match_comment);
8480                } # End of generating the entries for this table.
8481
8482                # Save these for output after this group of related tables.
8483                push @description, $table->description;
8484                push @note, $table->note;
8485                push @conflicting, $table->conflicting;
8486
8487                # And this for output after all the tables.
8488                push @global_comments, $table->comment;
8489
8490                # Compute an alternate compound name using the final property
8491                # synonym and the first table synonym with a colon instead of
8492                # the equal sign used elsewhere.
8493                if ($property_had_compound_name) {
8494                    $properties_with_compound_names ++;
8495                    if (! $compound_name || @property_aliases > 1) {
8496                        $compound_name = $property_aliases[-1]->name
8497                                        . ': '
8498                                        . $table_aliases[0]->name;
8499                    }
8500                }
8501            } # End of looping through all children of this table
8502
8503            # Here have assembled in $matches_comment all the related tables
8504            # to the current parent (preceded by the same info for all the
8505            # previous parents).  Put out information that applies to all of
8506            # the current family.
8507            if (@conflicting) {
8508
8509                # But output the conflicting information now, as it applies to
8510                # just this table.
8511                my $conflicting = join ", ", @conflicting;
8512                if ($conflicting) {
8513                    $matches_comment .= <<END;
8514
8515    Note that contrary to what you might expect, the above is NOT the same as
8516END
8517                    $matches_comment .= "any of: " if @conflicting > 1;
8518                    $matches_comment .= "$conflicting\n";
8519                }
8520            }
8521            if (@description) {
8522                $matches_comment .= "\n    Meaning: "
8523                                    . join('; ', @description)
8524                                    . "\n";
8525            }
8526            if (@note) {
8527                $matches_comment .= "\n    Note: "
8528                                    . join("\n    ", @note)
8529                                    . "\n";
8530            }
8531        } # End of looping through all tables
8532
8533        $matches_comment .= "\n$non_unicode_string\n" if $non_unicode_string;
8534
8535
8536        my $code_points;
8537        my $match;
8538        my $any_of_these;
8539        if ($unicode_count == 1) {
8540            $match = 'matches';
8541            $code_points = 'single code point';
8542        }
8543        else {
8544            $match = 'match';
8545            $code_points = "$string_count code points";
8546        }
8547
8548        my $synonyms;
8549        my $entries;
8550        if ($total_entries == 1) {
8551            $synonyms = "";
8552            $entries = 'entry';
8553            $any_of_these = 'this'
8554        }
8555        else {
8556            $synonyms = " any of the following regular expression constructs";
8557            $entries = 'entries';
8558            $any_of_these = 'any of these'
8559        }
8560
8561        my $comment = "";
8562        if ($has_ucd_alias) {
8563            $comment .= "Use Unicode::UCD::prop_invlist() to access the contents of this file.\n\n";
8564        }
8565        if ($has_unrelated) {
8566            $comment .= <<END;
8567This file is for tables that are not necessarily related:  To conserve
8568resources, every table that matches the identical set of code points in this
8569version of Unicode uses this file.  Each one is listed in a separate group
8570below.  It could be that the tables will match the same set of code points in
8571other Unicode releases, or it could be purely coincidence that they happen to
8572be the same in Unicode $unicode_version, and hence may not in other versions.
8573
8574END
8575        }
8576
8577        if (%flags) {
8578            foreach my $flag (sort keys %flags) {
8579                $comment .= <<END;
8580'$flag' below means that this form is $flags{$flag}.
8581END
8582                if ($flag eq $INTERNAL_ALIAS) {
8583                    $comment .= "DO NOT USE!!!";
8584                }
8585                else {
8586                    $comment .= "Consult $pod_file.pod";
8587                }
8588                $comment .= "\n";
8589            }
8590            $comment .= "\n";
8591        }
8592
8593        if ($total_entries == 0) {
8594            Carp::my_carp("No regular expression construct can match $leader, as all names for it are the null string.  Creating file anyway.");
8595            $comment .= <<END;
8596This file returns the $code_points in Unicode Version
8597$unicode_version for
8598$leader, but it is inaccessible through Perl regular expressions, as
8599"\\p{prop=}" is not recognized.
8600END
8601
8602        } else {
8603            $comment .= <<END;
8604This file returns the $code_points in Unicode Version
8605$unicode_version that
8606$match$synonyms:
8607
8608$matches_comment
8609$pod_file.pod should be consulted for the syntax rules for $any_of_these,
8610including if adding or subtracting white space, underscore, and hyphen
8611characters matters or doesn't matter, and other permissible syntactic
8612variants.  Upper/lower case distinctions never matter.
8613END
8614
8615        }
8616        if ($compound_name) {
8617            $comment .= <<END;
8618
8619A colon can be substituted for the equals sign, and
8620END
8621            if ($properties_with_compound_names > 1) {
8622                $comment .= <<END;
8623within each group above,
8624END
8625            }
8626            $compound_name = sprintf("%-8s\\p{%s}", " ", $compound_name);
8627
8628            # Note the \b below, it says don't make that line a continuation.
8629            $comment .= <<END;
8630anything to the left of the equals (or colon) can be combined with anything to
8631the right.  Thus, for example,
8632$compound_name
8633\bis also valid.
8634END
8635        }
8636
8637        # And append any comment(s) from the actual tables.  They are all
8638        # gathered here, so may not read all that well.
8639        if (@global_comments) {
8640            $comment .= "\n" . join("\n\n", @global_comments) . "\n";
8641        }
8642
8643        if ($count) {   # The format differs if no code points, and needs no
8644                        # explanation in that case
8645            if ($leader->write_as_invlist) {
8646                $comment.= <<END;
8647
8648The first data line of this file begins with the letter V to indicate it is in
8649inversion list format.  The number following the V gives the number of lines
8650remaining.  Each of those remaining lines is a single number representing the
8651starting code point of a range which goes up to but not including the number
8652on the next line; The 0th, 2nd, 4th... ranges are for code points that match
8653the property; the 1st, 3rd, 5th... are ranges of code points that don't match
8654the property.  The final line's range extends to the platform's infinity.
8655END
8656            }
8657            else {
8658                $comment.= <<END;
8659The format of the lines of this file is:
8660START\\tSTOP\\twhere START is the starting code point of the range, in hex;
8661STOP is the ending point, or if omitted, the range has just one code point.
8662END
8663            }
8664            if ($leader->output_range_counts) {
8665                $comment .= <<END;
8666Numbers in comments in [brackets] indicate how many code points are in the
8667range.
8668END
8669            }
8670        }
8671
8672        $leader->set_comment(main::join_lines($comment));
8673        return;
8674    }
8675
8676    # Accessors for the underlying list
8677    for my $sub (qw(
8678                    get_valid_code_point
8679                    get_invalid_code_point
8680                ))
8681    {
8682        no strict "refs";
8683        *$sub = sub {
8684            use strict "refs";
8685            my $self = shift;
8686
8687            return $self->_range_list->$sub(@_);
8688        }
8689    }
8690} # End closure for Match_Table
8691
8692package Property;
8693
8694# The Property class represents a Unicode property, or the $perl
8695# pseudo-property.  It contains a map table initialized empty at construction
8696# time, and for properties accessible through regular expressions, various
8697# match tables, created through the add_match_table() method, and referenced
8698# by the table('NAME') or tables() methods, the latter returning a list of all
8699# of the match tables.  Otherwise table operations implicitly are for the map
8700# table.
8701#
8702# Most of the data in the property is actually about its map table, so it
8703# mostly just uses that table's accessors for most methods.  The two could
8704# have been combined into one object, but for clarity because of their
8705# differing semantics, they have been kept separate.  It could be argued that
8706# the 'file' and 'directory' fields should be kept with the map table.
8707#
8708# Each property has a type.  This can be set in the constructor, or in the
8709# set_type accessor, but mostly it is figured out by the data.  Every property
8710# starts with unknown type, overridden by a parameter to the constructor, or
8711# as match tables are added, or ranges added to the map table, the data is
8712# inspected, and the type changed.  After the table is mostly or entirely
8713# filled, compute_type() should be called to finalize they analysis.
8714#
8715# There are very few operations defined.  One can safely remove a range from
8716# the map table, and property_add_or_replace_non_nulls() adds the maps from another
8717# table to this one, replacing any in the intersection of the two.
8718
8719sub standardize { return main::standardize($_[0]); }
8720sub trace { return main::trace(@_) if main::DEBUG && $to_trace }
8721
8722{   # Closure
8723
8724    # This hash will contain as keys, all the aliases of all properties, and
8725    # as values, pointers to their respective property objects.  This allows
8726    # quick look-up of a property from any of its names.
8727    my %alias_to_property_of;
8728
8729    sub dump_alias_to_property_of {
8730        # For debugging
8731
8732        print "\n", main::simple_dumper (\%alias_to_property_of), "\n";
8733        return;
8734    }
8735
8736    sub property_ref($name) {
8737        # This is a package subroutine, not called as a method.
8738        # If the single parameter is a literal '*' it returns a list of all
8739        # defined properties.
8740        # Otherwise, the single parameter is a name, and it returns a pointer
8741        # to the corresponding property object, or undef if none.
8742        #
8743        # Properties can have several different names.  The 'standard' form of
8744        # each of them is stored in %alias_to_property_of as they are defined.
8745        # But it's possible that this subroutine will be called with some
8746        # variant, so if the initial lookup fails, it is repeated with the
8747        # standardized form of the input name.  If found, besides returning the
8748        # result, the input name is added to the list so future calls won't
8749        # have to do the conversion again.
8750
8751        if (! defined $name) {
8752            Carp::my_carp_bug("Undefined input property.  No action taken.");
8753            return;
8754        }
8755
8756        return main::uniques(values %alias_to_property_of) if $name eq '*';
8757
8758        # Return cached result if have it.
8759        my $result = $alias_to_property_of{$name};
8760        return $result if defined $result;
8761
8762        # Convert the input to standard form.
8763        my $standard_name = standardize($name);
8764
8765        $result = $alias_to_property_of{$standard_name};
8766        return unless defined $result;        # Don't cache undefs
8767
8768        # Cache the result before returning it.
8769        $alias_to_property_of{$name} = $result;
8770        return $result;
8771    }
8772
8773
8774    main::setup_package();
8775
8776    my %map;
8777    # A pointer to the map table object for this property
8778    main::set_access('map', \%map);
8779
8780    my %full_name;
8781    # The property's full name.  This is a duplicate of the copy kept in the
8782    # map table, but is needed because stringify needs it during
8783    # construction of the map table, and then would have a chicken before egg
8784    # problem.
8785    main::set_access('full_name', \%full_name, 'r');
8786
8787    my %table_ref;
8788    # This hash will contain as keys, all the aliases of any match tables
8789    # attached to this property, and as values, the pointers to their
8790    # respective tables.  This allows quick look-up of a table from any of its
8791    # names.
8792    main::set_access('table_ref', \%table_ref);
8793
8794    my %type;
8795    # The type of the property, $ENUM, $BINARY, etc
8796    main::set_access('type', \%type, 'r');
8797
8798    my %file;
8799    # The filename where the map table will go (if actually written).
8800    # Normally defaulted, but can be overridden.
8801    main::set_access('file', \%file, 'r', 's');
8802
8803    my %directory;
8804    # The directory where the map table will go (if actually written).
8805    # Normally defaulted, but can be overridden.
8806    main::set_access('directory', \%directory, 's');
8807
8808    my %pseudo_map_type;
8809    # This is used to affect the calculation of the map types for all the
8810    # ranges in the table.  It should be set to one of the values that signify
8811    # to alter the calculation.
8812    main::set_access('pseudo_map_type', \%pseudo_map_type, 'r');
8813
8814    my %has_only_code_point_maps;
8815    # A boolean used to help in computing the type of data in the map table.
8816    main::set_access('has_only_code_point_maps', \%has_only_code_point_maps);
8817
8818    my %unique_maps;
8819    # A list of the first few distinct mappings this property has.  This is
8820    # used to disambiguate between binary and enum property types, so don't
8821    # have to keep more than three.
8822    main::set_access('unique_maps', \%unique_maps);
8823
8824    my %pre_declared_maps;
8825    # A boolean that gives whether the input data should declare all the
8826    # tables used, or not.  If the former, unknown ones raise a warning.
8827    main::set_access('pre_declared_maps',
8828                                    \%pre_declared_maps, 'r', 's');
8829
8830    my %match_subdir;
8831    # For properties whose shortest names are too long for a DOS 8.3
8832    # filesystem to distinguish between, this is used to manually give short
8833    # names for the directory name immediately under $match_tables that the
8834    # match tables for this property should be placed in.
8835    main::set_access('match_subdir', \%match_subdir, 'r');
8836
8837    my %has_dependency;
8838    # A boolean that gives whether some table somewhere is defined as the
8839    # complement of a table in this property.  This is a crude, but currently
8840    # sufficient, mechanism to make this property not get destroyed before
8841    # what is dependent on it is.  Other dependencies could be added, so the
8842    # name was chosen to reflect a more general situation than actually is
8843    # currently the case.
8844    main::set_access('has_dependency', \%has_dependency, 'r', 's');
8845
8846    sub new {
8847        # The only required parameter is the positionally first, name.  All
8848        # other parameters are key => value pairs.  See the documentation just
8849        # above for the meanings of the ones not passed directly on to the map
8850        # table constructor.
8851
8852        my $class = shift;
8853        my $name = shift || "";
8854
8855        my $self = property_ref($name);
8856        if (defined $self) {
8857            my $options_string = join ", ", @_;
8858            $options_string = ".  Ignoring options $options_string" if $options_string;
8859            Carp::my_carp("$self is already in use.  Using existing one$options_string;");
8860            return $self;
8861        }
8862
8863        my %args = @_;
8864
8865        $self = bless \do { my $anonymous_scalar }, $class;
8866        my $addr = do { no overloading; pack 'J', $self; };
8867
8868        $directory{$addr} = delete $args{'Directory'};
8869        $file{$addr} = delete $args{'File'};
8870        $full_name{$addr} = delete $args{'Full_Name'} || $name;
8871        $type{$addr} = delete $args{'Type'} || $UNKNOWN;
8872        $pseudo_map_type{$addr} = delete $args{'Map_Type'};
8873        $pre_declared_maps{$addr} = delete $args{'Pre_Declared_Maps'}
8874                                    # Starting in this release, property
8875                                    # values should be defined for all
8876                                    # properties, except those overriding this
8877                                    // $v_version ge v5.1.0;
8878        $match_subdir{$addr} = delete $args{'Match_SubDir'};
8879
8880        # Rest of parameters passed on.
8881
8882        $has_only_code_point_maps{$addr} = 1;
8883        $table_ref{$addr} = { };
8884        $unique_maps{$addr} = { };
8885        $has_dependency{$addr} = 0;
8886
8887        $map{$addr} = Map_Table->new($name,
8888                                    Full_Name => $full_name{$addr},
8889                                    _Alias_Hash => \%alias_to_property_of,
8890                                    _Property => $self,
8891                                    %args);
8892        return $self;
8893    }
8894
8895    # See this program's beginning comment block about overloading the copy
8896    # constructor.  Few operations are defined on properties, but a couple are
8897    # useful.  It is safe to take the inverse of a property, and to remove a
8898    # single code point from it.
8899    use overload
8900        fallback => 0,
8901        qw("") => "_operator_stringify",
8902        "." => \&main::_operator_dot,
8903        ".=" => \&main::_operator_dot_equal,
8904        '==' => \&main::_operator_equal,
8905        '!=' => \&main::_operator_not_equal,
8906        '=' => sub { return shift },
8907        '-=' => "_minus_and_equal",
8908    ;
8909
8910    sub _operator_stringify($self, $other="", $reversed=0) {
8911        return "Property '" .  shift->full_name . "'";
8912    }
8913
8914    sub _minus_and_equal($self, $other, $reversed=0) {
8915        # Remove a single code point from the map table of a property.
8916        if (ref $other) {
8917            Carp::my_carp_bug("Bad news.  Can't cope with a "
8918                        . ref($other)
8919                        . " argument to '-='.  Subtraction ignored.");
8920            return $self;
8921        }
8922        elsif ($reversed) {   # Shouldn't happen in a -=, but just in case
8923            Carp::my_carp_bug("Bad news.  Can't cope with subtracting a "
8924            . ref $self
8925            . " from a non-object.  undef returned.");
8926            return;
8927        }
8928        else {
8929            no overloading;
8930            $map{pack 'J', $self}->delete_range($other, $other);
8931        }
8932        return $self;
8933    }
8934
8935    sub add_match_table {
8936        # Add a new match table for this property, with name given by the
8937        # parameter.  It returns a pointer to the table.
8938
8939        my $self = shift;
8940        my $name = shift;
8941        my %args = @_;
8942
8943        my $addr = do { no overloading; pack 'J', $self; };
8944
8945        my $table = $table_ref{$addr}{$name};
8946        my $standard_name = main::standardize($name);
8947        if (defined $table
8948            || (defined ($table = $table_ref{$addr}{$standard_name})))
8949        {
8950            Carp::my_carp("Table '$name' in $self is already in use.  Using existing one");
8951            $table_ref{$addr}{$name} = $table;
8952            return $table;
8953        }
8954        else {
8955
8956            # See if this is a perl extension, if not passed in.
8957            my $perl_extension = delete $args{'Perl_Extension'};
8958            $perl_extension
8959                        = $self->perl_extension if ! defined $perl_extension;
8960
8961            my $fate;
8962            my $suppression_reason = "";
8963            if ($self->name =~ /^_/) {
8964                $fate = $SUPPRESSED;
8965                $suppression_reason = "Parent property is internal only";
8966            }
8967            elsif ($self->fate >= $SUPPRESSED) {
8968                $fate = $self->fate;
8969                $suppression_reason = $why_suppressed{$self->complete_name};
8970
8971            }
8972            elsif ($name =~ /^_/) {
8973                $fate = $INTERNAL_ONLY;
8974            }
8975            $table = Match_Table->new(
8976                                Name => $name,
8977                                Perl_Extension => $perl_extension,
8978                                _Alias_Hash => $table_ref{$addr},
8979                                _Property => $self,
8980                                Fate => $fate,
8981                                Suppression_Reason => $suppression_reason,
8982                                Status => $self->status,
8983                                _Status_Info => $self->status_info,
8984                                %args);
8985            return unless defined $table;
8986        }
8987
8988        # Save the names for quick look up
8989        $table_ref{$addr}{$standard_name} = $table;
8990        $table_ref{$addr}{$name} = $table;
8991
8992        # Perhaps we can figure out the type of this property based on the
8993        # fact of adding this match table.  First, string properties don't
8994        # have match tables; second, a binary property can't have 3 match
8995        # tables
8996        if ($type{$addr} == $UNKNOWN) {
8997            $type{$addr} = $NON_STRING;
8998        }
8999        elsif ($type{$addr} == $STRING) {
9000            Carp::my_carp("$self Added a match table '$name' to a string property '$self'.  Changed it to a non-string property.  Bad News.");
9001            $type{$addr} = $NON_STRING;
9002        }
9003        elsif ($type{$addr} != $ENUM && $type{$addr} != $FORCED_BINARY) {
9004            if (scalar main::uniques(values %{$table_ref{$addr}}) > 2) {
9005                if ($type{$addr} == $BINARY) {
9006                    Carp::my_carp("$self now has more than 2 tables (with the addition of '$name'), and so is no longer binary.  Changing its type to 'enum'.  Bad News.");
9007                }
9008                $type{$addr} = $ENUM;
9009            }
9010        }
9011
9012        return $table;
9013    }
9014
9015    sub delete_match_table($self, $table_to_remove) {
9016        # Delete the table referred to by $2 from the property $1.
9017        my $addr = do { no overloading; pack 'J', $self; };
9018
9019        # Remove all names that refer to it.
9020        foreach my $key (keys %{$table_ref{$addr}}) {
9021            delete $table_ref{$addr}{$key}
9022                                if $table_ref{$addr}{$key} == $table_to_remove;
9023        }
9024
9025        $table_to_remove->DESTROY;
9026        return;
9027    }
9028
9029    sub table($self, $name) {
9030        # Return a pointer to the match table (with name given by the
9031        # parameter) associated with this property; undef if none.
9032        my $addr = do { no overloading; pack 'J', $self; };
9033
9034        return $table_ref{$addr}{$name} if defined $table_ref{$addr}{$name};
9035
9036        # If quick look-up failed, try again using the standard form of the
9037        # input name.  If that succeeds, cache the result before returning so
9038        # won't have to standardize this input name again.
9039        my $standard_name = main::standardize($name);
9040        return unless defined $table_ref{$addr}{$standard_name};
9041
9042        $table_ref{$addr}{$name} = $table_ref{$addr}{$standard_name};
9043        return $table_ref{$addr}{$name};
9044    }
9045
9046    sub tables {
9047        # Return a list of pointers to all the match tables attached to this
9048        # property
9049
9050        no overloading;
9051        return main::uniques(values %{$table_ref{pack 'J', shift}});
9052    }
9053
9054    sub directory {
9055        # Returns the directory the map table for this property should be
9056        # output in.  If a specific directory has been specified, that has
9057        # priority;  'undef' is returned if the type isn't defined;
9058        # or $map_directory for everything else.
9059
9060        my $addr = do { no overloading; pack 'J', shift; };
9061
9062        return $directory{$addr} if defined $directory{$addr};
9063        return undef if $type{$addr} == $UNKNOWN;
9064        return $map_directory;
9065    }
9066
9067    sub swash_name($self) {
9068        # Return the name that is used to both:
9069        #   1)  Name the file that the map table is written to.
9070        #   2)  The name of swash related stuff inside that file.
9071        # The reason for this is that the Perl core historically has used
9072        # certain names that aren't the same as the Unicode property names.
9073        # To continue using these, $file is hard-coded in this file for those,
9074        # but otherwise the standard name is used.  This is different from the
9075        # external_name, so that the rest of the files, like in lib can use
9076        # the standard name always, without regard to historical precedent.
9077        my $addr = do { no overloading; pack 'J', $self; };
9078
9079        # Swash names are used only on either
9080        # 1) regular or internal-only map tables
9081        # 2) otherwise there should be no access to the
9082        #    property map table from other parts of Perl.
9083        return if $map{$addr}->fate != $ORDINARY
9084                  && ! ($map{$addr}->name =~ /^_/
9085                        && $map{$addr}->fate == $INTERNAL_ONLY);
9086
9087        return $file{$addr} if defined $file{$addr};
9088        return $map{$addr}->external_name;
9089    }
9090
9091    sub to_create_match_tables($self) {
9092        # Returns a boolean as to whether or not match tables should be
9093        # created for this property.
9094
9095        # The whole point of this pseudo property is match tables.
9096        return 1 if $self == $perl;
9097
9098        my $addr = do { no overloading; pack 'J', $self; };
9099
9100        # Don't generate tables of code points that match the property values
9101        # of a string property.  Such a list would most likely have many
9102        # property values, each with just one or very few code points mapping
9103        # to it.
9104        return 0 if $type{$addr} == $STRING;
9105
9106        # Otherwise, do.
9107        return 1;
9108    }
9109
9110    sub property_add_or_replace_non_nulls($self, $other) {
9111        # This adds the mappings in the property $other to $self.  Non-null
9112        # mappings from $other override those in $self.  It essentially merges
9113        # the two properties, with the second having priority except for null
9114        # mappings.
9115
9116        if (! $other->isa(__PACKAGE__)) {
9117            Carp::my_carp_bug("$other should be a "
9118                            . __PACKAGE__
9119                            . ".  Not a '"
9120                            . ref($other)
9121                            . "'.  Not added;");
9122            return;
9123        }
9124
9125        no overloading;
9126        return $map{pack 'J', $self}->map_add_or_replace_non_nulls($map{pack 'J', $other});
9127    }
9128
9129    sub set_proxy_for {
9130        # Certain tables are not generally written out to files, but
9131        # Unicode::UCD has the intelligence to know that the file for $self
9132        # can be used to reconstruct those tables.  This routine just changes
9133        # things so that UCD pod entries for those suppressed tables are
9134        # generated, so the fact that a proxy is used is invisible to the
9135        # user.
9136
9137        my $self = shift;
9138
9139        foreach my $property_name (@_) {
9140            my $ref = property_ref($property_name);
9141            next if $ref->to_output_map;
9142            $ref->set_fate($MAP_PROXIED);
9143        }
9144    }
9145
9146    sub set_type($self, $type) {
9147        # Set the type of the property.  Mostly this is figured out by the
9148        # data in the table.  But this is used to set it explicitly.  The
9149        # reason it is not a standard accessor is that when setting a binary
9150        # property, we need to make sure that all the true/false aliases are
9151        # present, as they were omitted in early Unicode releases.
9152
9153        if ($type != $ENUM
9154            && $type != $BINARY
9155            && $type != $FORCED_BINARY
9156            && $type != $STRING)
9157        {
9158            Carp::my_carp("Unrecognized type '$type'.  Type not set");
9159            return;
9160        }
9161
9162        { no overloading; $type{pack 'J', $self} = $type; }
9163        return if $type != $BINARY && $type != $FORCED_BINARY;
9164
9165        my $yes = $self->table('Y');
9166        $yes = $self->table('Yes') if ! defined $yes;
9167        $yes = $self->add_match_table('Y', Full_Name => 'Yes')
9168                                                            if ! defined $yes;
9169
9170        # Add aliases in order wanted, duplicates will be ignored.  We use a
9171        # binary property present in all releases for its ordered lists of
9172        # true/false aliases.  Note, that could run into problems in
9173        # outputting things in that we don't distinguish between the name and
9174        # full name of these.  Hopefully, if the table was already created
9175        # before this code is executed, it was done with these set properly.
9176        my $bm = property_ref("Bidi_Mirrored");
9177        foreach my $alias ($bm->table("Y")->aliases) {
9178            $yes->add_alias($alias->name);
9179        }
9180        my $no = $self->table('N');
9181        $no = $self->table('No') if ! defined $no;
9182        $no = $self->add_match_table('N', Full_Name => 'No') if ! defined $no;
9183        foreach my $alias ($bm->table("N")->aliases) {
9184            $no->add_alias($alias->name);
9185        }
9186
9187        return;
9188    }
9189
9190    sub add_map {
9191        # Add a map to the property's map table.  This also keeps
9192        # track of the maps so that the property type can be determined from
9193        # its data.
9194
9195        my $self = shift;
9196        my $start = shift;  # First code point in range
9197        my $end = shift;    # Final code point in range
9198        my $map = shift;    # What the range maps to.
9199        # Rest of parameters passed on.
9200
9201        my $addr = do { no overloading; pack 'J', $self; };
9202
9203        # If haven't the type of the property, gather information to figure it
9204        # out.
9205        if ($type{$addr} == $UNKNOWN) {
9206
9207            # If the map contains an interior blank or dash, or most other
9208            # nonword characters, it will be a string property.  This
9209            # heuristic may actually miss some string properties.  If so, they
9210            # may need to have explicit set_types called for them.  This
9211            # happens in the Unihan properties.
9212            if ($map =~ / (?<= . ) [ -] (?= . ) /x
9213                || $map =~ / [^\w.\/\ -]  /x)
9214            {
9215                $self->set_type($STRING);
9216
9217                # $unique_maps is used for disambiguating between ENUM and
9218                # BINARY later; since we know the property is not going to be
9219                # one of those, no point in keeping the data around
9220                undef $unique_maps{$addr};
9221            }
9222            else {
9223
9224                # Not necessarily a string.  The final decision has to be
9225                # deferred until all the data are in.  We keep track of if all
9226                # the values are code points for that eventual decision.
9227                $has_only_code_point_maps{$addr} &=
9228                                            $map =~ / ^ $code_point_re $/x;
9229
9230                # For the purposes of disambiguating between binary and other
9231                # enumerations at the end, we keep track of the first three
9232                # distinct property values.  Once we get to three, we know
9233                # it's not going to be binary, so no need to track more.
9234                if (scalar keys %{$unique_maps{$addr}} < 3) {
9235                    $unique_maps{$addr}{main::standardize($map)} = 1;
9236                }
9237            }
9238        }
9239
9240        # Add the mapping by calling our map table's method
9241        return $map{$addr}->add_map($start, $end, $map, @_);
9242    }
9243
9244    sub compute_type($self) {
9245        # Compute the type of the property: $ENUM, $STRING, or $BINARY.  This
9246        # should be called after the property is mostly filled with its maps.
9247        # We have been keeping track of what the property values have been,
9248        # and now have the necessary information to figure out the type.
9249
9250        my $addr = do { no overloading; pack 'J', $self; };
9251
9252        my $type = $type{$addr};
9253
9254        # If already have figured these out, no need to do so again, but we do
9255        # a double check on ENUMS to make sure that a string property hasn't
9256        # improperly been classified as an ENUM, so continue on with those.
9257        return if $type == $STRING
9258                  || $type == $BINARY
9259                  || $type == $FORCED_BINARY;
9260
9261        # If every map is to a code point, is a string property.
9262        if ($type == $UNKNOWN
9263            && ($has_only_code_point_maps{$addr}
9264                || (defined $map{$addr}->default_map
9265                    && $map{$addr}->default_map eq "")))
9266        {
9267            $self->set_type($STRING);
9268        }
9269        else {
9270
9271            # Otherwise, it is to some sort of enumeration.  (The case where
9272            # it is a Unicode miscellaneous property, and treated like a
9273            # string in this program is handled in add_map()).  Distinguish
9274            # between binary and some other enumeration type.  Of course, if
9275            # there are more than two values, it's not binary.  But more
9276            # subtle is the test that the default mapping is defined means it
9277            # isn't binary.  This in fact may change in the future if Unicode
9278            # changes the way its data is structured.  But so far, no binary
9279            # properties ever have @missing lines for them, so the default map
9280            # isn't defined for them.  The few properties that are two-valued
9281            # and aren't considered binary have the default map defined
9282            # starting in Unicode 5.0, when the @missing lines appeared; and
9283            # this program has special code to put in a default map for them
9284            # for earlier than 5.0 releases.
9285            if ($type == $ENUM
9286                || scalar keys %{$unique_maps{$addr}} > 2
9287                || defined $self->default_map)
9288            {
9289                my $tables = $self->tables;
9290                my $count = $self->count;
9291                if ($verbosity && $tables > 500 && $tables/$count > .1) {
9292                    Carp::my_carp_bug("It appears that $self should be a \$STRING property, not an \$ENUM because it has too many match tables: $tables\n");
9293                }
9294                $self->set_type($ENUM);
9295            }
9296            else {
9297                $self->set_type($BINARY);
9298            }
9299        }
9300        undef $unique_maps{$addr};  # Garbage collect
9301        return;
9302    }
9303
9304    # $reaons - Ignored unless suppressing
9305    sub set_fate($self, $fate, $reason=undef) {
9306        my $addr = do { no overloading; pack 'J', $self; };
9307        if ($fate >= $SUPPRESSED) {
9308            $why_suppressed{$self->complete_name} = $reason;
9309        }
9310
9311        # Each table shares the property's fate, except that MAP_PROXIED
9312        # doesn't affect match tables
9313        $map{$addr}->set_fate($fate, $reason);
9314        if ($fate != $MAP_PROXIED) {
9315            foreach my $table ($map{$addr}, $self->tables) {
9316                $table->set_fate($fate, $reason);
9317            }
9318        }
9319        return;
9320    }
9321
9322
9323    # Most of the accessors for a property actually apply to its map table.
9324    # Setup up accessor functions for those, referring to %map
9325    for my $sub (qw(
9326                    add_alias
9327                    add_anomalous_entry
9328                    add_comment
9329                    add_conflicting
9330                    add_description
9331                    add_duplicate
9332                    add_note
9333                    aliases
9334                    comment
9335                    complete_name
9336                    containing_range
9337                    count
9338                    default_map
9339                    definition
9340                    delete_range
9341                    description
9342                    each_range
9343                    external_name
9344                    fate
9345                    file_path
9346                    format
9347                    initialize
9348                    inverse_list
9349                    is_empty
9350                    name
9351                    note
9352                    perl_extension
9353                    property
9354                    range_count
9355                    ranges
9356                    range_size_1
9357                    replace_map
9358                    reset_each_range
9359                    set_comment
9360                    set_default_map
9361                    set_file_path
9362                    set_final_comment
9363                    _set_format
9364                    set_range_size_1
9365                    set_status
9366                    set_to_output_map
9367                    short_name
9368                    status
9369                    status_info
9370                    to_output_map
9371                    type_of
9372                    value_of
9373                    write
9374                ))
9375                    # 'property' above is for symmetry, so that one can take
9376                    # the property of a property and get itself, and so don't
9377                    # have to distinguish between properties and tables in
9378                    # calling code
9379    {
9380        no strict "refs";
9381        *$sub = sub {
9382            use strict "refs";
9383            my $self = shift;
9384            no overloading;
9385            return $map{pack 'J', $self}->$sub(@_);
9386        }
9387    }
9388
9389
9390} # End closure
9391
9392package main;
9393
9394sub display_chr {
9395    # Converts an ordinal printable character value to a displayable string,
9396    # using a dotted circle to hold combining characters.
9397
9398    my $ord = shift;
9399    my $chr = chr $ord;
9400    return $chr if $ccc->table(0)->contains($ord);
9401    return "\x{25CC}$chr";
9402}
9403
9404sub join_lines($input) {
9405    # Returns lines of the input joined together, so that they can be folded
9406    # properly.
9407    # This causes continuation lines to be joined together into one long line
9408    # for folding.  A continuation line is any line that doesn't begin with a
9409    # space or "\b" (the latter is stripped from the output).  This is so
9410    # lines can be in a HERE document so as to fit nicely in the terminal
9411    # width, but be joined together in one long line, and then folded with
9412    # indents, '#' prefixes, etc, properly handled.
9413    # A blank separates the joined lines except if there is a break; an extra
9414    # blank is inserted after a period ending a line.
9415
9416    # Initialize the return with the first line.
9417    my ($return, @lines) = split "\n", $input;
9418
9419    # If the first line is null, it was an empty line, add the \n back in
9420    $return = "\n" if $return eq "";
9421
9422    # Now join the remainder of the physical lines.
9423    for my $line (@lines) {
9424
9425        # An empty line means wanted a blank line, so add two \n's to get that
9426        # effect, and go to the next line.
9427        if (length $line == 0) {
9428            $return .= "\n\n";
9429            next;
9430        }
9431
9432        # Look at the last character of what we have so far.
9433        my $previous_char = substr($return, -1, 1);
9434
9435        # And at the next char to be output.
9436        my $next_char = substr($line, 0, 1);
9437
9438        if ($previous_char ne "\n") {
9439
9440            # Here didn't end wth a nl.  If the next char a blank or \b, it
9441            # means that here there is a break anyway.  So add a nl to the
9442            # output.
9443            if ($next_char eq " " || $next_char eq "\b") {
9444                $previous_char = "\n";
9445                $return .= $previous_char;
9446            }
9447
9448            # Add an extra space after periods.
9449            $return .= " " if $previous_char eq '.';
9450        }
9451
9452        # Here $previous_char is still the latest character to be output.  If
9453        # it isn't a nl, it means that the next line is to be a continuation
9454        # line, with a blank inserted between them.
9455        $return .= " " if $previous_char ne "\n";
9456
9457        # Get rid of any \b
9458        substr($line, 0, 1) = "" if $next_char eq "\b";
9459
9460        # And append this next line.
9461        $return .= $line;
9462    }
9463
9464    return $return;
9465}
9466
9467sub simple_fold( $line, $prefix="", $hanging_indent=0, $right_margin=0) {
9468    # Returns a string of the input (string or an array of strings) folded
9469    # into multiple-lines each of no more than $MAX_LINE_WIDTH characters plus
9470    # a \n
9471    # This is tailored for the kind of text written by this program,
9472    # especially the pod file, which can have very long names with
9473    # underscores in the middle, or words like AbcDefgHij....  We allow
9474    # breaking in the middle of such constructs if the line won't fit
9475    # otherwise.  The break in such cases will come either just after an
9476    # underscore, or just before one of the Capital letters.
9477
9478    local $to_trace = 0 if main::DEBUG;
9479
9480    # $prefix Optional string to prepend to each output line
9481    # $hanging_indent Optional number of spaces to indent
9482	# continuation lines
9483    # $right_margin  Optional number of spaces to narrow the
9484    # total width by.
9485
9486    # The space available doesn't include what's automatically prepended
9487    # to each line, or what's reserved on the right.
9488    my $max = $MAX_LINE_WIDTH - length($prefix) - $right_margin;
9489    # XXX Instead of using the 'nofold' perhaps better to look up the stack
9490
9491    if (DEBUG && $hanging_indent >= $max) {
9492        Carp::my_carp("Too large a hanging indent ($hanging_indent); must be < $max.  Using 0", 'nofold');
9493        $hanging_indent = 0;
9494    }
9495
9496    # First, split into the current physical lines.
9497    my @line;
9498    if (ref $line) {        # Better be an array, because not bothering to
9499                            # test
9500        foreach my $line (@{$line}) {
9501            push @line, split /\n/, $line;
9502        }
9503    }
9504    else {
9505        @line = split /\n/, $line;
9506    }
9507
9508    #local $to_trace = 1 if main::DEBUG;
9509    trace "", join(" ", @line), "\n" if main::DEBUG && $to_trace;
9510
9511    # Look at each current physical line.
9512    for (my $i = 0; $i < @line; $i++) {
9513        Carp::my_carp("Tabs don't work well.", 'nofold') if $line[$i] =~ /\t/;
9514        #local $to_trace = 1 if main::DEBUG;
9515        trace "i=$i: $line[$i]\n" if main::DEBUG && $to_trace;
9516
9517        # Remove prefix, because will be added back anyway, don't want
9518        # doubled prefix
9519        $line[$i] =~ s/^$prefix//;
9520
9521        # Remove trailing space
9522        $line[$i] =~ s/\s+\Z//;
9523
9524        # If the line is too long, fold it.
9525        if (length $line[$i] > $max) {
9526            my $remainder;
9527
9528            # Here needs to fold.  Save the leading space in the line for
9529            # later.
9530            $line[$i] =~ /^ ( \s* )/x;
9531            my $leading_space = $1;
9532            trace "line length", length $line[$i], "; lead length", length($leading_space) if main::DEBUG && $to_trace;
9533
9534            # If character at final permissible position is white space,
9535            # fold there, which will delete that white space
9536            if (substr($line[$i], $max - 1, 1) =~ /\s/) {
9537                $remainder = substr($line[$i], $max);
9538                $line[$i] = substr($line[$i], 0, $max - 1);
9539            }
9540            else {
9541
9542                # Otherwise fold at an acceptable break char closest to
9543                # the max length.  Look at just the maximal initial
9544                # segment of the line
9545                my $segment = substr($line[$i], 0, $max - 1);
9546                if ($segment =~
9547                    /^ ( .{$hanging_indent}   # Don't look before the
9548                                              #  indent.
9549                        \ *                   # Don't look in leading
9550                                              #  blanks past the indent
9551                            [^ ] .*           # Find the right-most
9552                        (?:                   #  acceptable break:
9553                            [ \s = ]          # space or equal
9554                            | - (?! [.0-9] )  # or non-unary minus.
9555                            | [^\\[(] (?= \\ )# break before single backslash
9556                                              #  not immediately after opening
9557                                              #  punctuation
9558                        )                     # $1 includes the character
9559                    )/x)
9560                {
9561                    # Split into the initial part that fits, and remaining
9562                    # part of the input
9563                    $remainder = substr($line[$i], length $1);
9564                    $line[$i] = $1;
9565                    trace $line[$i] if DEBUG && $to_trace;
9566                    trace $remainder if DEBUG && $to_trace;
9567                }
9568
9569                # If didn't find a good breaking spot, see if there is a
9570                # not-so-good breaking spot.  These are just after
9571                # underscores or where the case changes from lower to
9572                # upper.  Use \a as a soft hyphen, but give up
9573                # and don't break the line if there is actually a \a
9574                # already in the input.  We use an ascii character for the
9575                # soft-hyphen to avoid any attempt by miniperl to try to
9576                # access the files that this program is creating.
9577                elsif ($segment !~ /\a/
9578                       && ($segment =~ s/_/_\a/g
9579                       || $segment =~ s/ ( (?!\\) [a-z] ) (?= [A-Z] )/$1\a/xg))
9580                {
9581                    # Here were able to find at least one place to insert
9582                    # our substitute soft hyphen.  Find the right-most one
9583                    # and replace it by a real hyphen.
9584                    trace $segment if DEBUG && $to_trace;
9585                    substr($segment,
9586                            rindex($segment, "\a"),
9587                            1) = '-';
9588
9589                    # Then remove the soft hyphen substitutes.
9590                    $segment =~ s/\a//g;
9591                    trace $segment if DEBUG && $to_trace;
9592
9593                    # And split into the initial part that fits, and
9594                    # remainder of the line
9595                    my $pos = rindex($segment, '-');
9596                    $remainder = substr($line[$i], $pos);
9597                    trace $remainder if DEBUG && $to_trace;
9598                    $line[$i] = substr($segment, 0, $pos + 1);
9599                }
9600            }
9601
9602            # Here we know if we can fold or not.  If we can, $remainder
9603            # is what remains to be processed in the next iteration.
9604            if (defined $remainder) {
9605                trace "folded='$line[$i]'" if main::DEBUG && $to_trace;
9606
9607                # Insert the folded remainder of the line as a new element
9608                # of the array.  (It may still be too long, but we will
9609                # deal with that next time through the loop.)  Omit any
9610                # leading space in the remainder.
9611                $remainder =~ s/^\s+//;
9612                trace "remainder='$remainder'" if main::DEBUG && $to_trace;
9613
9614                # But then indent by whichever is larger of:
9615                # 1) the leading space on the input line;
9616                # 2) the hanging indent.
9617                # This preserves indentation in the original line.
9618                my $lead = ($leading_space)
9619                            ? length $leading_space
9620                            : $hanging_indent;
9621                $lead = max($lead, $hanging_indent);
9622                splice @line, $i+1, 0, (" " x $lead) . $remainder;
9623            }
9624        }
9625
9626        # Ready to output the line. Get rid of any trailing space
9627        # And prefix by the required $prefix passed in.
9628        $line[$i] =~ s/\s+$//;
9629        $line[$i] = "$prefix$line[$i]\n";
9630    } # End of looping through all the lines.
9631
9632    return join "", @line;
9633}
9634
9635sub property_ref {  # Returns a reference to a property object.
9636    return Property::property_ref(@_);
9637}
9638
9639sub force_unlink ($filename) {
9640    return unless file_exists($filename);
9641    return if CORE::unlink($filename);
9642
9643    # We might need write permission
9644    chmod 0777, $filename;
9645    CORE::unlink($filename) or Carp::my_carp("Couldn't unlink $filename.  Proceeding anyway: $!");
9646    return;
9647}
9648
9649sub write ($file, $use_utf8, @lines) {
9650    # Given a filename and references to arrays of lines, write the lines of
9651    # each array to the file
9652    # Filename can be given as an arrayref of directory names
9653
9654    # Get into a single string if an array, and get rid of, in Unix terms, any
9655    # leading '.'
9656    $file= File::Spec->join(@$file) if ref $file eq 'ARRAY';
9657    $file = File::Spec->canonpath($file);
9658
9659    # If has directories, make sure that they all exist
9660    (undef, my $directories, undef) = File::Spec->splitpath($file);
9661    File::Path::mkpath($directories) if $directories && ! -d $directories;
9662
9663    push @files_actually_output, $file;
9664
9665    force_unlink ($file);
9666
9667    my $OUT;
9668    if (not open $OUT, ">", $file) {
9669        Carp::my_carp("can't open $file for output.  Skipping this file: $!");
9670        return;
9671    }
9672
9673    binmode $OUT, ":utf8" if $use_utf8;
9674
9675    foreach my $lines_ref (@lines) {
9676        unless (@$lines_ref) {
9677            Carp::my_carp("An array of lines for writing to file '$file' is empty; writing it anyway;");
9678        }
9679
9680        print $OUT @$lines_ref or die Carp::my_carp("write to '$file' failed: $!");
9681    }
9682    close $OUT or die Carp::my_carp("close '$file' failed: $!");
9683
9684    print "$file written.\n" if $verbosity >= $VERBOSE;
9685
9686    return;
9687}
9688
9689
9690sub Standardize($name=undef) {
9691    # This converts the input name string into a standardized equivalent to
9692    # use internally.
9693
9694    unless (defined $name) {
9695      Carp::my_carp_bug("Standardize() called with undef.  Returning undef.");
9696      return;
9697    }
9698
9699    # Remove any leading or trailing white space
9700    $name =~ s/^\s+//g;
9701    $name =~ s/\s+$//g;
9702
9703    # Convert interior white space and hyphens into underscores.
9704    $name =~ s/ (?<= .) [ -]+ (.) /_$1/xg;
9705
9706    # Capitalize the letter following an underscore, and convert a sequence of
9707    # multiple underscores to a single one
9708    $name =~ s/ (?<= .) _+ (.) /_\u$1/xg;
9709
9710    # And capitalize the first letter, but not for the special cjk ones.
9711    $name = ucfirst($name) unless $name =~ /^k[A-Z]/;
9712    return $name;
9713}
9714
9715sub standardize ($str=undef) {
9716    # Returns a lower-cased standardized name, without underscores.  This form
9717    # is chosen so that it can distinguish between any real versus superficial
9718    # Unicode name differences.  It relies on the fact that Unicode doesn't
9719    # have interior underscores, white space, nor dashes in any
9720    # stricter-matched name.  It should not be used on Unicode code point
9721    # names (the Name property), as they mostly, but not always follow these
9722    # rules.
9723
9724    my $name = Standardize($str);
9725    return if !defined $name;
9726
9727    $name =~ s/ (?<= .) _ (?= . ) //xg;
9728    return lc $name;
9729}
9730
9731sub UCD_name ($table, $alias) {
9732    # Returns the name that Unicode::UCD will use to find a table.  XXX
9733    # perhaps this function should be placed somewhere, like UCD.pm so that
9734    # Unicode::UCD can use it directly without duplicating code that can get
9735    # out-of sync.
9736
9737    my $property = $table->property;
9738    $property = ($property == $perl)
9739                ? ""                # 'perl' is never explicitly stated
9740                : standardize($property->name) . '=';
9741    if ($alias->loose_match) {
9742        return $property . standardize($alias->name);
9743    }
9744    else {
9745        return lc ($property . $alias->name);
9746    }
9747
9748    return;
9749}
9750
9751{   # Closure
9752
9753    my $indent_increment = " " x ( $debugging_build ? 2 : 0);
9754    %main::already_output = ();
9755
9756    $main::simple_dumper_nesting = 0;
9757
9758    sub simple_dumper( $item, $indent = "" ) {
9759        # Like Simple Data::Dumper. Good enough for our needs. We can't use
9760        # the real thing as we have to run under miniperl.
9761
9762        # It is designed so that on input it is at the beginning of a line,
9763        # and the final thing output in any call is a trailing ",\n".
9764
9765        $indent = "" if ! $debugging_build;
9766
9767        # nesting level is localized, so that as the call stack pops, it goes
9768        # back to the prior value.
9769        local $main::simple_dumper_nesting = $main::simple_dumper_nesting;
9770        local %main::already_output = %main::already_output;
9771        $main::simple_dumper_nesting++;
9772        #print STDERR __LINE__, ": $main::simple_dumper_nesting: $indent$item\n";
9773
9774        # Determine the indent for recursive calls.
9775        my $next_indent = $indent . $indent_increment;
9776
9777        my $output;
9778        if (! ref $item) {
9779
9780            # Dump of scalar: just output it in quotes if not a number.  To do
9781            # so we must escape certain characters, and therefore need to
9782            # operate on a copy to avoid changing the original
9783            my $copy = $item;
9784            $copy = $UNDEF unless defined $copy;
9785
9786            # Quote non-integers (integers also have optional leading '-')
9787            if ($copy eq "" || $copy !~ /^ -? \d+ $/x) {
9788
9789                # Escape apostrophe and backslash
9790                $copy =~ s/ ( ['\\] ) /\\$1/xg;
9791                $copy = "'$copy'";
9792            }
9793            $output = "$indent$copy,\n";
9794        }
9795        else {
9796
9797            # Keep track of cycles in the input, and refuse to infinitely loop
9798            my $addr = do { no overloading; pack 'J', $item; };
9799            if (defined $main::already_output{$addr}) {
9800                return "${indent}ALREADY OUTPUT: $item\n";
9801            }
9802            $main::already_output{$addr} = $item;
9803
9804            if (ref $item eq 'ARRAY') {
9805                my $using_brackets;
9806                $output = $indent;
9807                if ($main::simple_dumper_nesting > 1) {
9808                    $output .= '[';
9809                    $using_brackets = 1;
9810                }
9811                else {
9812                    $using_brackets = 0;
9813                }
9814
9815                # If the array is empty, put the closing bracket on the same
9816                # line.  Otherwise, recursively add each array element
9817                if (@$item == 0) {
9818                    $output .= " ";
9819                }
9820                else {
9821                    $output .= "\n";
9822                    for (my $i = 0; $i < @$item; $i++) {
9823
9824                        # Indent array elements one level
9825                        $output .= &simple_dumper($item->[$i], $next_indent);
9826                        next if ! $debugging_build;
9827                        $output =~ s/\n$//;      # Remove any trailing nl so
9828                        $output .= " # [$i]\n";  # as to add a comment giving
9829                                                 # the array index
9830                    }
9831                    $output .= $indent;     # Indent closing ']' to orig level
9832                }
9833                $output .= ']' if $using_brackets;
9834                $output .= ",\n";
9835            }
9836            elsif (ref $item eq 'HASH') {
9837                my $is_first_line;
9838                my $using_braces;
9839                my $body_indent;
9840
9841                # No surrounding braces at top level
9842                $output .= $indent;
9843                if ($main::simple_dumper_nesting > 1) {
9844                    $output .= "{\n";
9845                    $is_first_line = 0;
9846                    $body_indent = $next_indent;
9847                    $next_indent .= $indent_increment;
9848                    $using_braces = 1;
9849                }
9850                else {
9851                    $is_first_line = 1;
9852                    $body_indent = $indent;
9853                    $using_braces = 0;
9854                }
9855
9856                # Output hashes sorted alphabetically instead of apparently
9857                # random.  Use caseless alphabetic sort
9858                foreach my $key (sort { lc $a cmp lc $b } keys %$item)
9859                {
9860                    if ($is_first_line) {
9861                        $is_first_line = 0;
9862                    }
9863                    else {
9864                        $output .= "$body_indent";
9865                    }
9866
9867                    # The key must be a scalar, but this recursive call quotes
9868                    # it
9869                    $output .= &simple_dumper($key);
9870
9871                    # And change the trailing comma and nl to the hash fat
9872                    # comma for clarity, and so the value can be on the same
9873                    # line
9874                    $output =~ s/,\n$/ => /;
9875
9876                    # Recursively call to get the value's dump.
9877                    my $next = &simple_dumper($item->{$key}, $next_indent);
9878
9879                    # If the value is all on one line, remove its indent, so
9880                    # will follow the => immediately.  If it takes more than
9881                    # one line, start it on a new line.
9882                    if ($next !~ /\n.*\n/) {
9883                        $next =~ s/^ *//;
9884                    }
9885                    else {
9886                        $output .= "\n";
9887                    }
9888                    $output .= $next;
9889                }
9890
9891                $output .= "$indent},\n" if $using_braces;
9892            }
9893            elsif (ref $item eq 'CODE' || ref $item eq 'GLOB') {
9894                $output = $indent . ref($item) . "\n";
9895                # XXX see if blessed
9896            }
9897            elsif ($item->can('dump')) {
9898
9899                # By convention in this program, objects furnish a 'dump'
9900                # method.  Since not doing any output at this level, just pass
9901                # on the input indent
9902                $output = $item->dump($indent);
9903            }
9904            else {
9905                Carp::my_carp("Can't cope with dumping a " . ref($item) . ".  Skipping.");
9906            }
9907        }
9908        return $output;
9909    }
9910}
9911
9912sub dump_inside_out( $object, $fields_ref ) {
9913    # Dump inside-out hashes in an object's state by converting them to a
9914    # regular hash and then calling simple_dumper on that.
9915
9916    my $addr = do { no overloading; pack 'J', $object; };
9917
9918    my %hash;
9919    foreach my $key (keys %$fields_ref) {
9920        $hash{$key} = $fields_ref->{$key}{$addr};
9921    }
9922
9923    return simple_dumper(\%hash, @_);
9924}
9925
9926sub _operator_dot($self, $other="", $reversed=0) {
9927    # Overloaded '.' method that is common to all packages.  It uses the
9928    # package's stringify method.
9929
9930    foreach my $which (\$self, \$other) {
9931        next unless ref $$which;
9932        if ($$which->can('_operator_stringify')) {
9933            $$which = $$which->_operator_stringify;
9934        }
9935        else {
9936            my $ref = ref $$which;
9937            my $addr = do { no overloading; pack 'J', $$which; };
9938            $$which = "$ref ($addr)";
9939        }
9940    }
9941    return ($reversed)
9942            ? "$other$self"
9943            : "$self$other";
9944}
9945
9946sub _operator_dot_equal($self, $other="", $reversed=0) {
9947    # Overloaded '.=' method that is common to all packages.
9948
9949    if ($reversed) {
9950        return $other .= "$self";
9951    }
9952    else {
9953        return "$self" . "$other";
9954    }
9955}
9956
9957sub _operator_equal($self, $other, @) {
9958    # Generic overloaded '==' routine.  To be equal, they must be the exact
9959    # same object
9960
9961    return 0 unless defined $other;
9962    return 0 unless ref $other;
9963    no overloading;
9964    return $self == $other;
9965}
9966
9967sub _operator_not_equal($self, $other, @) {
9968    return ! _operator_equal($self, $other);
9969}
9970
9971sub substitute_PropertyAliases($file_object) {
9972    # Deal with early releases that don't have the crucial PropertyAliases.txt
9973    # file.
9974
9975    $file_object->insert_lines(get_old_property_aliases());
9976
9977    process_PropertyAliases($file_object);
9978}
9979
9980
9981sub process_PropertyAliases($file) {
9982    # This reads in the PropertyAliases.txt file, which contains almost all
9983    # the character properties in Unicode and their equivalent aliases:
9984    # scf       ; Simple_Case_Folding         ; sfc
9985    #
9986    # Field 0 is the preferred short name for the property.
9987    # Field 1 is the full name.
9988    # Any succeeding ones are other accepted names.
9989
9990    # Add any cjk properties that may have been defined.
9991    $file->insert_lines(@cjk_properties);
9992
9993    while ($file->next_line) {
9994
9995        my @data = split /\s*;\s*/;
9996
9997        my $full = $data[1];
9998
9999        # This line is defective in early Perls.  The property in Unihan.txt
10000        # is kRSUnicode.
10001        if ($full eq 'Unicode_Radical_Stroke' && @data < 3) {
10002            push @data, qw(cjkRSUnicode kRSUnicode);
10003        }
10004
10005        my $this = Property->new($data[0], Full_Name => $full);
10006
10007        $this->set_fate($SUPPRESSED, $why_suppressed{$full})
10008                                                    if $why_suppressed{$full};
10009
10010        # Start looking for more aliases after these two.
10011        for my $i (2 .. @data - 1) {
10012            $this->add_alias($data[$i]);
10013        }
10014
10015    }
10016
10017    my $scf = property_ref("Simple_Case_Folding");
10018    $scf->add_alias("scf");
10019    $scf->add_alias("sfc");
10020
10021    return;
10022}
10023
10024sub finish_property_setup($file) {
10025    # Finishes setting up after PropertyAliases.
10026
10027    # This entry was missing from this file in earlier Unicode versions
10028    if (-e 'Jamo.txt' && ! defined property_ref('JSN')) {
10029        Property->new('JSN', Full_Name => 'Jamo_Short_Name');
10030    }
10031
10032    # These are used so much, that we set globals for them.
10033    $gc = property_ref('General_Category');
10034    $block = property_ref('Block');
10035    $script = property_ref('Script');
10036    $age = property_ref('Age');
10037
10038    # Perl adds this alias.
10039    $gc->add_alias('Category');
10040
10041    # Unicode::Normalize expects this file with this name and directory.
10042    $ccc = property_ref('Canonical_Combining_Class');
10043    if (defined $ccc) {
10044        $ccc->set_file('CombiningClass');
10045        $ccc->set_directory(File::Spec->curdir());
10046    }
10047
10048    # These two properties aren't actually used in the core, but unfortunately
10049    # the names just above that are in the core interfere with these, so
10050    # choose different names.  These aren't a problem unless the map tables
10051    # for these files get written out.
10052    my $lowercase = property_ref('Lowercase');
10053    $lowercase->set_file('IsLower') if defined $lowercase;
10054    my $uppercase = property_ref('Uppercase');
10055    $uppercase->set_file('IsUpper') if defined $uppercase;
10056
10057    # Set up the hard-coded default mappings, but only on properties defined
10058    # for this release
10059    foreach my $property (keys %default_mapping) {
10060        my $property_object = property_ref($property);
10061        next if ! defined $property_object;
10062        my $default_map = $default_mapping{$property};
10063        $property_object->set_default_map($default_map);
10064
10065        # A map of <code point> implies the property is string.
10066        if ($property_object->type == $UNKNOWN
10067            && $default_map eq $CODE_POINT)
10068        {
10069            $property_object->set_type($STRING);
10070        }
10071    }
10072
10073    # The following use the Multi_Default class to create objects for
10074    # defaults.
10075
10076    # Bidi class has a complicated default, but the derived file takes care of
10077    # the complications, leaving just 'L'.
10078    if (file_exists("${EXTRACTED}DBidiClass.txt")) {
10079        property_ref('Bidi_Class')->set_default_map('L');
10080    }
10081    else {
10082        my $default;
10083
10084        # The derived file was introduced in 3.1.1.  The values below are
10085        # taken from table 3-8, TUS 3.0
10086        my $default_R =
10087            'my $default = Range_List->new;
10088             $default->add_range(0x0590, 0x05FF);
10089             $default->add_range(0xFB1D, 0xFB4F);'
10090        ;
10091
10092        # The defaults apply only to unassigned characters
10093        $default_R .= '$gc->table("Unassigned") & $default;';
10094
10095        if ($v_version lt v3.0.0) {
10096            $default = Multi_Default->new(R => $default_R, 'L');
10097        }
10098        else {
10099
10100            # AL apparently not introduced until 3.0:  TUS 2.x references are
10101            # not on-line to check it out
10102            my $default_AL =
10103                'my $default = Range_List->new;
10104                 $default->add_range(0x0600, 0x07BF);
10105                 $default->add_range(0xFB50, 0xFDFF);
10106                 $default->add_range(0xFE70, 0xFEFF);'
10107            ;
10108
10109            # Non-character code points introduced in this release; aren't AL
10110            if ($v_version ge 3.1.0) {
10111                $default_AL .= '$default->delete_range(0xFDD0, 0xFDEF);';
10112            }
10113            $default_AL .= '$gc->table("Unassigned") & $default';
10114            $default = Multi_Default->new(AL => $default_AL,
10115                                          R => $default_R,
10116                                          'L');
10117        }
10118        property_ref('Bidi_Class')->set_default_map($default);
10119    }
10120
10121    # Joining type has a complicated default, but the derived file takes care
10122    # of the complications, leaving just 'U' (or Non_Joining), except the file
10123    # is bad in 3.1.0
10124    if (file_exists("${EXTRACTED}DJoinType.txt") || -e 'ArabicShaping.txt') {
10125        if (file_exists("${EXTRACTED}DJoinType.txt") && $v_version ne 3.1.0) {
10126            property_ref('Joining_Type')->set_default_map('Non_Joining');
10127        }
10128        else {
10129
10130            # Otherwise, there are not one, but two possibilities for the
10131            # missing defaults: T and U.
10132            # The missing defaults that evaluate to T are given by:
10133            # T = Mn + Cf - ZWNJ - ZWJ
10134            # where Mn and Cf are the general category values. In other words,
10135            # any non-spacing mark or any format control character, except
10136            # U+200C ZERO WIDTH NON-JOINER (joining type U) and U+200D ZERO
10137            # WIDTH JOINER (joining type C).
10138            my $default = Multi_Default->new(
10139               'T' => '$gc->table("Mn") + $gc->table("Cf") - 0x200C - 0x200D',
10140               'Non_Joining');
10141            property_ref('Joining_Type')->set_default_map($default);
10142        }
10143    }
10144
10145    # Line break has a complicated default in early releases. It is 'Unknown'
10146    # for non-assigned code points; 'AL' for assigned.
10147    if (file_exists("${EXTRACTED}DLineBreak.txt") || -e 'LineBreak.txt') {
10148        my $lb = property_ref('Line_Break');
10149        if (file_exists("${EXTRACTED}DLineBreak.txt")) {
10150            $lb->set_default_map('Unknown');
10151        }
10152        else {
10153            my $default = Multi_Default->new('AL' => '~ $gc->table("Cn")',
10154                                             'Unknown',
10155                                            );
10156            $lb->set_default_map($default);
10157        }
10158    }
10159
10160    # For backwards compatibility with applications that may read the mapping
10161    # file directly (it was documented in 5.12 and 5.14 as being thusly
10162    # usable), keep it from being adjusted.  (range_size_1 is
10163    # used to force the traditional format.)
10164    if (defined (my $nfkc_cf = property_ref('NFKC_Casefold'))) {
10165        $nfkc_cf->set_to_output_map($EXTERNAL_MAP);
10166        $nfkc_cf->set_range_size_1(1);
10167    }
10168    if (defined (my $bmg = property_ref('Bidi_Mirroring_Glyph'))) {
10169        $bmg->set_to_output_map($EXTERNAL_MAP);
10170        $bmg->set_range_size_1(1);
10171    }
10172
10173    property_ref('Numeric_Value')->set_to_output_map($OUTPUT_ADJUSTED);
10174
10175    return;
10176}
10177
10178sub get_old_property_aliases() {
10179    # Returns what would be in PropertyAliases.txt if it existed in very old
10180    # versions of Unicode.  It was derived from the one in 3.2, and pared
10181    # down based on the data that was actually in the older releases.
10182    # An attempt was made to use the existence of files to mean inclusion or
10183    # not of various aliases, but if this was not sufficient, using version
10184    # numbers was resorted to.
10185
10186    my @return;
10187
10188    # These are to be used in all versions (though some are constructed by
10189    # this program if missing)
10190    push @return, split /\n/, <<'END';
10191bc        ; Bidi_Class
10192Bidi_M    ; Bidi_Mirrored
10193cf        ; Case_Folding
10194ccc       ; Canonical_Combining_Class
10195dm        ; Decomposition_Mapping
10196dt        ; Decomposition_Type
10197gc        ; General_Category
10198isc       ; ISO_Comment
10199lc        ; Lowercase_Mapping
10200na        ; Name
10201na1       ; Unicode_1_Name
10202nt        ; Numeric_Type
10203nv        ; Numeric_Value
10204scf       ; Simple_Case_Folding
10205slc       ; Simple_Lowercase_Mapping
10206stc       ; Simple_Titlecase_Mapping
10207suc       ; Simple_Uppercase_Mapping
10208tc        ; Titlecase_Mapping
10209uc        ; Uppercase_Mapping
10210END
10211
10212    if (-e 'Blocks.txt') {
10213        push @return, "blk       ; Block\n";
10214    }
10215    if (-e 'ArabicShaping.txt') {
10216        push @return, split /\n/, <<'END';
10217jg        ; Joining_Group
10218jt        ; Joining_Type
10219END
10220    }
10221    if (-e 'PropList.txt') {
10222
10223        # This first set is in the original old-style proplist.
10224        push @return, split /\n/, <<'END';
10225Bidi_C    ; Bidi_Control
10226Dash      ; Dash
10227Dia       ; Diacritic
10228Ext       ; Extender
10229Hex       ; Hex_Digit
10230Hyphen    ; Hyphen
10231IDC       ; ID_Continue
10232Ideo      ; Ideographic
10233Join_C    ; Join_Control
10234Math      ; Math
10235QMark     ; Quotation_Mark
10236Term      ; Terminal_Punctuation
10237WSpace    ; White_Space
10238END
10239        # The next sets were added later
10240        if ($v_version ge v3.0.0) {
10241            push @return, split /\n/, <<'END';
10242Upper     ; Uppercase
10243Lower     ; Lowercase
10244END
10245        }
10246        if ($v_version ge v3.0.1) {
10247            push @return, split /\n/, <<'END';
10248NChar     ; Noncharacter_Code_Point
10249END
10250        }
10251        # The next sets were added in the new-style
10252        if ($v_version ge v3.1.0) {
10253            push @return, split /\n/, <<'END';
10254OAlpha    ; Other_Alphabetic
10255OLower    ; Other_Lowercase
10256OMath     ; Other_Math
10257OUpper    ; Other_Uppercase
10258END
10259        }
10260        if ($v_version ge v3.1.1) {
10261            push @return, "AHex      ; ASCII_Hex_Digit\n";
10262        }
10263    }
10264    if (-e 'EastAsianWidth.txt') {
10265        push @return, "ea        ; East_Asian_Width\n";
10266    }
10267    if (-e 'CompositionExclusions.txt') {
10268        push @return, "CE        ; Composition_Exclusion\n";
10269    }
10270    if (-e 'LineBreak.txt') {
10271        push @return, "lb        ; Line_Break\n";
10272    }
10273    if (-e 'BidiMirroring.txt') {
10274        push @return, "bmg       ; Bidi_Mirroring_Glyph\n";
10275    }
10276    if (-e 'Scripts.txt') {
10277        push @return, "sc        ; Script\n";
10278    }
10279    if (-e 'DNormalizationProps.txt') {
10280        push @return, split /\n/, <<'END';
10281Comp_Ex   ; Full_Composition_Exclusion
10282FC_NFKC   ; FC_NFKC_Closure
10283NFC_QC    ; NFC_Quick_Check
10284NFD_QC    ; NFD_Quick_Check
10285NFKC_QC   ; NFKC_Quick_Check
10286NFKD_QC   ; NFKD_Quick_Check
10287XO_NFC    ; Expands_On_NFC
10288XO_NFD    ; Expands_On_NFD
10289XO_NFKC   ; Expands_On_NFKC
10290XO_NFKD   ; Expands_On_NFKD
10291END
10292    }
10293    if (-e 'DCoreProperties.txt') {
10294        push @return, split /\n/, <<'END';
10295Alpha     ; Alphabetic
10296IDS       ; ID_Start
10297XIDC      ; XID_Continue
10298XIDS      ; XID_Start
10299END
10300        # These can also appear in some versions of PropList.txt
10301        push @return, "Lower     ; Lowercase\n"
10302                                    unless grep { $_ =~ /^Lower\b/} @return;
10303        push @return, "Upper     ; Uppercase\n"
10304                                    unless grep { $_ =~ /^Upper\b/} @return;
10305    }
10306
10307    # This flag requires the DAge.txt file to be copied into the directory.
10308    if (DEBUG && $compare_versions) {
10309        push @return, 'age       ; Age';
10310    }
10311
10312    return @return;
10313}
10314
10315sub substitute_PropValueAliases($file_object) {
10316    # Deal with early releases that don't have the crucial
10317    # PropValueAliases.txt file.
10318
10319    $file_object->insert_lines(get_old_property_value_aliases());
10320
10321    process_PropValueAliases($file_object);
10322}
10323
10324sub process_PropValueAliases($file) {
10325    # This file contains values that properties look like:
10326    # bc ; AL        ; Arabic_Letter
10327    # blk; n/a       ; Greek_And_Coptic                 ; Greek
10328    #
10329    # Field 0 is the property.
10330    # Field 1 is the short name of a property value or 'n/a' if no
10331    #                short name exists;
10332    # Field 2 is the full property value name;
10333    # Any other fields are more synonyms for the property value.
10334    # Purely numeric property values are omitted from the file; as are some
10335    # others, fewer and fewer in later releases
10336
10337    # Entries for the ccc property have an extra field before the
10338    # abbreviation:
10339    # ccc;   0; NR   ; Not_Reordered
10340    # It is the numeric value that the names are synonyms for.
10341
10342    # There are comment entries for values missing from this file:
10343    # # @missing: 0000..10FFFF; ISO_Comment; <none>
10344    # # @missing: 0000..10FFFF; Lowercase_Mapping; <code point>
10345
10346    if ($v_version lt 4.0.0) {
10347        $file->insert_lines(split /\n/, <<'END'
10348Hangul_Syllable_Type; L                                ; Leading_Jamo
10349Hangul_Syllable_Type; LV                               ; LV_Syllable
10350Hangul_Syllable_Type; LVT                              ; LVT_Syllable
10351Hangul_Syllable_Type; NA                               ; Not_Applicable
10352Hangul_Syllable_Type; T                                ; Trailing_Jamo
10353Hangul_Syllable_Type; V                                ; Vowel_Jamo
10354END
10355        );
10356    }
10357    if ($v_version lt 4.1.0) {
10358        $file->insert_lines(split /\n/, <<'END'
10359_Perl_GCB; CN                               ; Control
10360_Perl_GCB; CR                               ; CR
10361_Perl_GCB; EX                               ; Extend
10362_Perl_GCB; L                                ; L
10363_Perl_GCB; LF                               ; LF
10364_Perl_GCB; LV                               ; LV
10365_Perl_GCB; LVT                              ; LVT
10366_Perl_GCB; T                                ; T
10367_Perl_GCB; V                                ; V
10368_Perl_GCB; XX                               ; Other
10369END
10370        );
10371    }
10372
10373    # Add any explicit cjk values
10374    $file->insert_lines(@cjk_property_values);
10375
10376    # This line is used only for testing the code that checks for name
10377    # conflicts.  There is a script Inherited, and when this line is executed
10378    # it causes there to be a name conflict with the 'Inherited' that this
10379    # program generates for this block property value
10380    #$file->insert_lines('blk; n/a; Herited');
10381
10382    # Process each line of the file ...
10383    while ($file->next_line) {
10384
10385        # Fix typo in input file
10386        s/CCC133/CCC132/g if $v_version eq v6.1.0;
10387
10388        my ($property, @data) = split /\s*;\s*/;
10389
10390        # The ccc property has an extra field at the beginning, which is the
10391        # numeric value.  Move it to be after the other two, mnemonic, fields,
10392        # so that those will be used as the property value's names, and the
10393        # number will be an extra alias.  (Rightmost splice removes field 1-2,
10394        # returning them in a slice; left splice inserts that before anything,
10395        # thus shifting the former field 0 to after them.)
10396        splice (@data, 0, 0, splice(@data, 1, 2)) if $property eq 'ccc';
10397
10398        if ($v_version le v5.0.0 && $property eq 'blk' && $data[1] =~ /-/) {
10399            my $new_style = $data[1] =~ s/-/_/gr;
10400            splice @data, 1, 0, $new_style;
10401        }
10402
10403        # Field 0 is a short name unless "n/a"; field 1 is the full name.  If
10404        # there is no short name, use the full one in element 1
10405        if ($data[0] eq "n/a") {
10406            $data[0] = $data[1];
10407        }
10408        elsif ($data[0] ne $data[1]
10409               && standardize($data[0]) eq standardize($data[1])
10410               && $data[1] !~ /[[:upper:]]/)
10411        {
10412            # Also, there is a bug in the file in which "n/a" is omitted, and
10413            # the two fields are identical except for case, and the full name
10414            # is all lower case.  Copy the "short" name unto the full one to
10415            # give it some upper case.
10416
10417            $data[1] = $data[0];
10418        }
10419
10420        # Earlier releases had the pseudo property 'qc' that should expand to
10421        # the ones that replace it below.
10422        if ($property eq 'qc') {
10423            if (lc $data[0] eq 'y') {
10424                $file->insert_lines('NFC_QC; Y      ; Yes',
10425                                    'NFD_QC; Y      ; Yes',
10426                                    'NFKC_QC; Y     ; Yes',
10427                                    'NFKD_QC; Y     ; Yes',
10428                                    );
10429            }
10430            elsif (lc $data[0] eq 'n') {
10431                $file->insert_lines('NFC_QC; N      ; No',
10432                                    'NFD_QC; N      ; No',
10433                                    'NFKC_QC; N     ; No',
10434                                    'NFKD_QC; N     ; No',
10435                                    );
10436            }
10437            elsif (lc $data[0] eq 'm') {
10438                $file->insert_lines('NFC_QC; M      ; Maybe',
10439                                    'NFKC_QC; M     ; Maybe',
10440                                    );
10441            }
10442            else {
10443                $file->carp_bad_line("qc followed by unexpected '$data[0]");
10444            }
10445            next;
10446        }
10447
10448        # The first field is the short name, 2nd is the full one.
10449        my $property_object = property_ref($property);
10450        my $table = $property_object->add_match_table($data[0],
10451                                                Full_Name => $data[1]);
10452
10453        # Start looking for more aliases after these two.
10454        for my $i (2 .. @data - 1) {
10455            $table->add_alias($data[$i]);
10456        }
10457    } # End of looping through the file
10458
10459    # As noted in the comments early in the program, it generates tables for
10460    # the default values for all releases, even those for which the concept
10461    # didn't exist at the time.  Here we add those if missing.
10462    if (defined $age && ! defined $age->table('Unassigned')) {
10463        $age->add_match_table('Unassigned');
10464    }
10465    $block->add_match_table('No_Block') if -e 'Blocks.txt'
10466                                    && ! defined $block->table('No_Block');
10467
10468
10469    # Now set the default mappings of the properties from the file.  This is
10470    # done after the loop because a number of properties have only @missings
10471    # entries in the file, and may not show up until the end.
10472    my @defaults = $file->get_missings;
10473    foreach my $default_ref (@defaults) {
10474        my $default = $default_ref->[0];
10475        my $property = property_ref($default_ref->[1]);
10476        $property->set_default_map($default);
10477    }
10478    return;
10479}
10480
10481sub get_old_property_value_aliases () {
10482    # Returns what would be in PropValueAliases.txt if it existed in very old
10483    # versions of Unicode.  It was derived from the one in 3.2, and pared
10484    # down.  An attempt was made to use the existence of files to mean
10485    # inclusion or not of various aliases, but if this was not sufficient,
10486    # using version numbers was resorted to.
10487
10488    my @return = split /\n/, <<'END';
10489bc ; AN        ; Arabic_Number
10490bc ; B         ; Paragraph_Separator
10491bc ; CS        ; Common_Separator
10492bc ; EN        ; European_Number
10493bc ; ES        ; European_Separator
10494bc ; ET        ; European_Terminator
10495bc ; L         ; Left_To_Right
10496bc ; ON        ; Other_Neutral
10497bc ; R         ; Right_To_Left
10498bc ; WS        ; White_Space
10499
10500Bidi_M; N; No; F; False
10501Bidi_M; Y; Yes; T; True
10502
10503# The standard combining classes are very much different in v1, so only use
10504# ones that look right (not checked thoroughly)
10505ccc;   0; NR   ; Not_Reordered
10506ccc;   1; OV   ; Overlay
10507ccc;   7; NK   ; Nukta
10508ccc;   8; KV   ; Kana_Voicing
10509ccc;   9; VR   ; Virama
10510ccc; 202; ATBL ; Attached_Below_Left
10511ccc; 216; ATAR ; Attached_Above_Right
10512ccc; 218; BL   ; Below_Left
10513ccc; 220; B    ; Below
10514ccc; 222; BR   ; Below_Right
10515ccc; 224; L    ; Left
10516ccc; 228; AL   ; Above_Left
10517ccc; 230; A    ; Above
10518ccc; 232; AR   ; Above_Right
10519ccc; 234; DA   ; Double_Above
10520
10521dt ; can       ; canonical
10522dt ; enc       ; circle
10523dt ; fin       ; final
10524dt ; font      ; font
10525dt ; fra       ; fraction
10526dt ; init      ; initial
10527dt ; iso       ; isolated
10528dt ; med       ; medial
10529dt ; n/a       ; none
10530dt ; nb        ; noBreak
10531dt ; sqr       ; square
10532dt ; sub       ; sub
10533dt ; sup       ; super
10534
10535gc ; C         ; Other                            # Cc | Cf | Cn | Co | Cs
10536gc ; Cc        ; Control
10537gc ; Cn        ; Unassigned
10538gc ; Co        ; Private_Use
10539gc ; L         ; Letter                           # Ll | Lm | Lo | Lt | Lu
10540gc ; LC        ; Cased_Letter                     # Ll | Lt | Lu
10541gc ; Ll        ; Lowercase_Letter
10542gc ; Lm        ; Modifier_Letter
10543gc ; Lo        ; Other_Letter
10544gc ; Lu        ; Uppercase_Letter
10545gc ; M         ; Mark                             # Mc | Me | Mn
10546gc ; Mc        ; Spacing_Mark
10547gc ; Mn        ; Nonspacing_Mark
10548gc ; N         ; Number                           # Nd | Nl | No
10549gc ; Nd        ; Decimal_Number
10550gc ; No        ; Other_Number
10551gc ; P         ; Punctuation                      # Pc | Pd | Pe | Pf | Pi | Po | Ps
10552gc ; Pd        ; Dash_Punctuation
10553gc ; Pe        ; Close_Punctuation
10554gc ; Po        ; Other_Punctuation
10555gc ; Ps        ; Open_Punctuation
10556gc ; S         ; Symbol                           # Sc | Sk | Sm | So
10557gc ; Sc        ; Currency_Symbol
10558gc ; Sm        ; Math_Symbol
10559gc ; So        ; Other_Symbol
10560gc ; Z         ; Separator                        # Zl | Zp | Zs
10561gc ; Zl        ; Line_Separator
10562gc ; Zp        ; Paragraph_Separator
10563gc ; Zs        ; Space_Separator
10564
10565nt ; de        ; Decimal
10566nt ; di        ; Digit
10567nt ; n/a       ; None
10568nt ; nu        ; Numeric
10569END
10570
10571    if (-e 'ArabicShaping.txt') {
10572        push @return, split /\n/, <<'END';
10573jg ; n/a       ; AIN
10574jg ; n/a       ; ALEF
10575jg ; n/a       ; DAL
10576jg ; n/a       ; GAF
10577jg ; n/a       ; LAM
10578jg ; n/a       ; MEEM
10579jg ; n/a       ; NO_JOINING_GROUP
10580jg ; n/a       ; NOON
10581jg ; n/a       ; QAF
10582jg ; n/a       ; SAD
10583jg ; n/a       ; SEEN
10584jg ; n/a       ; TAH
10585jg ; n/a       ; WAW
10586
10587jt ; C         ; Join_Causing
10588jt ; D         ; Dual_Joining
10589jt ; L         ; Left_Joining
10590jt ; R         ; Right_Joining
10591jt ; U         ; Non_Joining
10592jt ; T         ; Transparent
10593END
10594        if ($v_version ge v3.0.0) {
10595            push @return, split /\n/, <<'END';
10596jg ; n/a       ; ALAPH
10597jg ; n/a       ; BEH
10598jg ; n/a       ; BETH
10599jg ; n/a       ; DALATH_RISH
10600jg ; n/a       ; E
10601jg ; n/a       ; FEH
10602jg ; n/a       ; FINAL_SEMKATH
10603jg ; n/a       ; GAMAL
10604jg ; n/a       ; HAH
10605jg ; n/a       ; HAMZA_ON_HEH_GOAL
10606jg ; n/a       ; HE
10607jg ; n/a       ; HEH
10608jg ; n/a       ; HEH_GOAL
10609jg ; n/a       ; HETH
10610jg ; n/a       ; KAF
10611jg ; n/a       ; KAPH
10612jg ; n/a       ; KNOTTED_HEH
10613jg ; n/a       ; LAMADH
10614jg ; n/a       ; MIM
10615jg ; n/a       ; NUN
10616jg ; n/a       ; PE
10617jg ; n/a       ; QAPH
10618jg ; n/a       ; REH
10619jg ; n/a       ; REVERSED_PE
10620jg ; n/a       ; SADHE
10621jg ; n/a       ; SEMKATH
10622jg ; n/a       ; SHIN
10623jg ; n/a       ; SWASH_KAF
10624jg ; n/a       ; TAW
10625jg ; n/a       ; TEH_MARBUTA
10626jg ; n/a       ; TETH
10627jg ; n/a       ; YEH
10628jg ; n/a       ; YEH_BARREE
10629jg ; n/a       ; YEH_WITH_TAIL
10630jg ; n/a       ; YUDH
10631jg ; n/a       ; YUDH_HE
10632jg ; n/a       ; ZAIN
10633END
10634        }
10635    }
10636
10637
10638    if (-e 'EastAsianWidth.txt') {
10639        push @return, split /\n/, <<'END';
10640ea ; A         ; Ambiguous
10641ea ; F         ; Fullwidth
10642ea ; H         ; Halfwidth
10643ea ; N         ; Neutral
10644ea ; Na        ; Narrow
10645ea ; W         ; Wide
10646END
10647    }
10648
10649    if (-e 'LineBreak.txt' || -e 'LBsubst.txt') {
10650        my @lb = split /\n/, <<'END';
10651lb ; AI        ; Ambiguous
10652lb ; AL        ; Alphabetic
10653lb ; B2        ; Break_Both
10654lb ; BA        ; Break_After
10655lb ; BB        ; Break_Before
10656lb ; BK        ; Mandatory_Break
10657lb ; CB        ; Contingent_Break
10658lb ; CL        ; Close_Punctuation
10659lb ; CM        ; Combining_Mark
10660lb ; CR        ; Carriage_Return
10661lb ; EX        ; Exclamation
10662lb ; GL        ; Glue
10663lb ; HY        ; Hyphen
10664lb ; ID        ; Ideographic
10665lb ; IN        ; Inseperable
10666lb ; IS        ; Infix_Numeric
10667lb ; LF        ; Line_Feed
10668lb ; NS        ; Nonstarter
10669lb ; NU        ; Numeric
10670lb ; OP        ; Open_Punctuation
10671lb ; PO        ; Postfix_Numeric
10672lb ; PR        ; Prefix_Numeric
10673lb ; QU        ; Quotation
10674lb ; SA        ; Complex_Context
10675lb ; SG        ; Surrogate
10676lb ; SP        ; Space
10677lb ; SY        ; Break_Symbols
10678lb ; XX        ; Unknown
10679lb ; ZW        ; ZWSpace
10680END
10681        # If this Unicode version predates the lb property, we use our
10682        # substitute one
10683        if (-e 'LBsubst.txt') {
10684            $_ = s/^lb/_Perl_LB/r for @lb;
10685        }
10686        push @return, @lb;
10687    }
10688
10689    if (-e 'DNormalizationProps.txt') {
10690        push @return, split /\n/, <<'END';
10691qc ; M         ; Maybe
10692qc ; N         ; No
10693qc ; Y         ; Yes
10694END
10695    }
10696
10697    if (-e 'Scripts.txt') {
10698        push @return, split /\n/, <<'END';
10699sc ; Arab      ; Arabic
10700sc ; Armn      ; Armenian
10701sc ; Beng      ; Bengali
10702sc ; Bopo      ; Bopomofo
10703sc ; Cans      ; Canadian_Aboriginal
10704sc ; Cher      ; Cherokee
10705sc ; Cyrl      ; Cyrillic
10706sc ; Deva      ; Devanagari
10707sc ; Dsrt      ; Deseret
10708sc ; Ethi      ; Ethiopic
10709sc ; Geor      ; Georgian
10710sc ; Goth      ; Gothic
10711sc ; Grek      ; Greek
10712sc ; Gujr      ; Gujarati
10713sc ; Guru      ; Gurmukhi
10714sc ; Hang      ; Hangul
10715sc ; Hani      ; Han
10716sc ; Hebr      ; Hebrew
10717sc ; Hira      ; Hiragana
10718sc ; Ital      ; Old_Italic
10719sc ; Kana      ; Katakana
10720sc ; Khmr      ; Khmer
10721sc ; Knda      ; Kannada
10722sc ; Laoo      ; Lao
10723sc ; Latn      ; Latin
10724sc ; Mlym      ; Malayalam
10725sc ; Mong      ; Mongolian
10726sc ; Mymr      ; Myanmar
10727sc ; Ogam      ; Ogham
10728sc ; Orya      ; Oriya
10729sc ; Qaai      ; Inherited
10730sc ; Runr      ; Runic
10731sc ; Sinh      ; Sinhala
10732sc ; Syrc      ; Syriac
10733sc ; Taml      ; Tamil
10734sc ; Telu      ; Telugu
10735sc ; Thaa      ; Thaana
10736sc ; Thai      ; Thai
10737sc ; Tibt      ; Tibetan
10738sc ; Yiii      ; Yi
10739sc ; Zyyy      ; Common
10740END
10741    }
10742
10743    if ($v_version ge v2.0.0) {
10744        push @return, split /\n/, <<'END';
10745dt ; com       ; compat
10746dt ; nar       ; narrow
10747dt ; sml       ; small
10748dt ; vert      ; vertical
10749dt ; wide      ; wide
10750
10751gc ; Cf        ; Format
10752gc ; Cs        ; Surrogate
10753gc ; Lt        ; Titlecase_Letter
10754gc ; Me        ; Enclosing_Mark
10755gc ; Nl        ; Letter_Number
10756gc ; Pc        ; Connector_Punctuation
10757gc ; Sk        ; Modifier_Symbol
10758END
10759    }
10760    if ($v_version ge v2.1.2) {
10761        push @return, "bc ; S         ; Segment_Separator\n";
10762    }
10763    if ($v_version ge v2.1.5) {
10764        push @return, split /\n/, <<'END';
10765gc ; Pf        ; Final_Punctuation
10766gc ; Pi        ; Initial_Punctuation
10767END
10768    }
10769    if ($v_version ge v2.1.8) {
10770        push @return, "ccc; 240; IS   ; Iota_Subscript\n";
10771    }
10772
10773    if ($v_version ge v3.0.0) {
10774        push @return, split /\n/, <<'END';
10775bc ; AL        ; Arabic_Letter
10776bc ; BN        ; Boundary_Neutral
10777bc ; LRE       ; Left_To_Right_Embedding
10778bc ; LRO       ; Left_To_Right_Override
10779bc ; NSM       ; Nonspacing_Mark
10780bc ; PDF       ; Pop_Directional_Format
10781bc ; RLE       ; Right_To_Left_Embedding
10782bc ; RLO       ; Right_To_Left_Override
10783
10784ccc; 233; DB   ; Double_Below
10785END
10786    }
10787
10788    if ($v_version ge v3.1.0) {
10789        push @return, "ccc; 226; R    ; Right\n";
10790    }
10791
10792    return @return;
10793}
10794
10795sub process_NormalizationsTest($file) {
10796
10797    # Each line looks like:
10798    #      source code point; NFC; NFD; NFKC; NFKD
10799    # e.g.
10800    #       1E0A;1E0A;0044 0307;1E0A;0044 0307;
10801
10802    # Process each line of the file ...
10803    while ($file->next_line) {
10804
10805        next if /^@/;
10806
10807        my ($c1, $c2, $c3, $c4, $c5) = split /\s*;\s*/;
10808
10809        foreach my $var (\$c1, \$c2, \$c3, \$c4, \$c5) {
10810            $$var = pack "U0U*", map { hex } split " ", $$var;
10811            $$var =~ s/(\\)/$1$1/g;
10812        }
10813
10814        push @normalization_tests,
10815                "Test_N(q$c1, q$c2, q$c3, q$c4, q$c5);\n";
10816    } # End of looping through the file
10817}
10818
10819sub output_perl_charnames_line ($a, $b) {
10820
10821    # Output the entries in Perl_charnames specially, using 5 digits instead
10822    # of four.  This makes the entries a constant length, and simplifies
10823    # charnames.pm which this table is for.  Unicode can have 6 digit
10824    # ordinals, but they are all private use or noncharacters which do not
10825    # have names, so won't be in this table.
10826
10827    return sprintf "%05X\n%s\n\n", $_[0], $_[1];
10828}
10829
10830{ # Closure
10831
10832    # These are constants to the $property_info hash in this subroutine, to
10833    # avoid using a quoted-string which might have a typo.
10834    my $TYPE  = 'type';
10835    my $DEFAULT_MAP = 'default_map';
10836    my $DEFAULT_TABLE = 'default_table';
10837    my $PSEUDO_MAP_TYPE = 'pseudo_map_type';
10838    my $MISSINGS = 'missings';
10839
10840    sub process_generic_property_file($file) {
10841        # This processes a file containing property mappings and puts them
10842        # into internal map tables.  It should be used to handle any property
10843        # files that have mappings from a code point or range thereof to
10844        # something else.  This means almost all the UCD .txt files.
10845        # each_line_handlers() should be set to adjust the lines of these
10846        # files, if necessary, to what this routine understands:
10847        #
10848        # 0374          ; NFD_QC; N
10849        # 003C..003E    ; Math
10850        #
10851        # the fields are: "codepoint-range ; property; map"
10852        #
10853        # meaning the codepoints in the range all have the value 'map' under
10854        # 'property'.
10855        # Beginning and trailing white space in each field are not significant.
10856        # Note there is not a trailing semi-colon in the above.  A trailing
10857        # semi-colon means the map is a null-string.  An omitted map, as
10858        # opposed to a null-string, is assumed to be 'Y', based on Unicode
10859        # table syntax.  (This could have been hidden from this routine by
10860        # doing it in the $file object, but that would require parsing of the
10861        # line there, so would have to parse it twice, or change the interface
10862        # to pass this an array.  So not done.)
10863        #
10864        # The map field may begin with a sequence of commands that apply to
10865        # this range.  Each such command begins and ends with $CMD_DELIM.
10866        # These are used to indicate, for example, that the mapping for a
10867        # range has a non-default type.
10868        #
10869        # This loops through the file, calling its next_line() method, and
10870        # then taking the map and adding it to the property's table.
10871        # Complications arise because any number of properties can be in the
10872        # file, in any order, interspersed in any way.  The first time a
10873        # property is seen, it gets information about that property and
10874        # caches it for quick retrieval later.  It also normalizes the maps
10875        # so that only one of many synonyms is stored.  The Unicode input
10876        # files do use some multiple synonyms.
10877
10878        my %property_info;               # To keep track of what properties
10879                                         # have already had entries in the
10880                                         # current file, and info about each,
10881                                         # so don't have to recompute.
10882        my $property_name;               # property currently being worked on
10883        my $property_type;               # and its type
10884        my $previous_property_name = ""; # name from last time through loop
10885        my $property_object;             # pointer to the current property's
10886                                         # object
10887        my $property_addr;               # the address of that object
10888        my $default_map;                 # the string that code points missing
10889                                         # from the file map to
10890        my $default_table;               # For non-string properties, a
10891                                         # reference to the match table that
10892                                         # will contain the list of code
10893                                         # points that map to $default_map.
10894
10895        # Get the next real non-comment line
10896        LINE:
10897        while ($file->next_line) {
10898
10899            # Default replacement type; means that if parts of the range have
10900            # already been stored in our tables, the new map overrides them if
10901            # they differ more than cosmetically
10902            my $replace = $IF_NOT_EQUIVALENT;
10903            my $map_type;            # Default type for the map of this range
10904
10905            #local $to_trace = 1 if main::DEBUG;
10906            trace $_ if main::DEBUG && $to_trace;
10907
10908            # Split the line into components
10909            my ($range, $property_name, $map, @remainder)
10910                = split /\s*;\s*/, $_, -1; # -1 => retain trailing null fields
10911
10912            # If more or less on the line than we are expecting, warn and skip
10913            # the line
10914            if (@remainder) {
10915                $file->carp_bad_line('Extra fields');
10916                next LINE;
10917            }
10918            elsif ( ! defined $property_name) {
10919                $file->carp_bad_line('Missing property');
10920                next LINE;
10921            }
10922
10923            # Examine the range.
10924            if ($range !~ /^ ($code_point_re) (?:\.\. ($code_point_re) )? $/x)
10925            {
10926                $file->carp_bad_line("Range '$range' not of the form 'CP1' or 'CP1..CP2' (where CP1,2 are code points in hex)");
10927                next LINE;
10928            }
10929            my $low = hex $1;
10930            my $high = (defined $2) ? hex $2 : $low;
10931
10932            # If changing to a new property, get the things constant per
10933            # property
10934            if ($previous_property_name ne $property_name) {
10935
10936                $property_object = property_ref($property_name);
10937                if (! defined $property_object) {
10938                    $file->carp_bad_line("Unexpected property '$property_name'.  Skipped");
10939                    next LINE;
10940                }
10941                { no overloading; $property_addr = pack 'J', $property_object; }
10942
10943                # Defer changing names until have a line that is acceptable
10944                # (the 'next' statement above means is unacceptable)
10945                $previous_property_name = $property_name;
10946
10947                # If not the first time for this property, retrieve info about
10948                # it from the cache
10949                if (defined ($property_info{$property_addr}{$TYPE})) {
10950                    $property_type = $property_info{$property_addr}{$TYPE};
10951                    $default_map = $property_info{$property_addr}{$DEFAULT_MAP};
10952                    $map_type
10953                        = $property_info{$property_addr}{$PSEUDO_MAP_TYPE};
10954                    $default_table
10955                            = $property_info{$property_addr}{$DEFAULT_TABLE};
10956                }
10957                else {
10958
10959                    # Here, is the first time for this property.  Set up the
10960                    # cache.
10961                    $property_type = $property_info{$property_addr}{$TYPE}
10962                                   = $property_object->type;
10963                    $map_type
10964                        = $property_info{$property_addr}{$PSEUDO_MAP_TYPE}
10965                        = $property_object->pseudo_map_type;
10966
10967                    # The Unicode files are set up so that if the map is not
10968                    # defined, it is a binary property
10969                    if (! defined $map && $property_type != $BINARY) {
10970                        if ($property_type != $UNKNOWN
10971                            && $property_type != $NON_STRING)
10972                        {
10973                            $file->carp_bad_line("No mapping defined on a non-binary property.  Using 'Y' for the map");
10974                        }
10975                        else {
10976                            $property_object->set_type($BINARY);
10977                            $property_type
10978                                = $property_info{$property_addr}{$TYPE}
10979                                = $BINARY;
10980                        }
10981                    }
10982
10983                    # Get any @missings default for this property.  This
10984                    # should precede the first entry for the property in the
10985                    # input file, and is located in a comment that has been
10986                    # stored by the Input_file class until we access it here.
10987                    # It's possible that there is more than one such line
10988                    # waiting for us; collect them all, and parse
10989                    my @missings_list = $file->get_missings
10990                                            if $file->has_missings_defaults;
10991                    foreach my $default_ref (@missings_list) {
10992                        my $default = $default_ref->[0];
10993                        my $addr = do { no overloading; pack 'J', property_ref($default_ref->[1]); };
10994
10995                        # For string properties, the default is just what the
10996                        # file says, but non-string properties should already
10997                        # have set up a table for the default property value;
10998                        # use the table for these, so can resolve synonyms
10999                        # later to a single standard one.
11000                        if ($property_type == $STRING
11001                            || $property_type == $UNKNOWN)
11002                        {
11003                            $property_info{$addr}{$MISSINGS} = $default;
11004                        }
11005                        else {
11006                            $property_info{$addr}{$MISSINGS}
11007                                        = $property_object->table($default);
11008                        }
11009                    }
11010
11011                    # Finished storing all the @missings defaults in the input
11012                    # file so far.  Get the one for the current property.
11013                    my $missings = $property_info{$property_addr}{$MISSINGS};
11014
11015                    # But we likely have separately stored what the default
11016                    # should be.  (This is to accommodate versions of the
11017                    # standard where the @missings lines are absent or
11018                    # incomplete.)  Hopefully the two will match.  But check
11019                    # it out.
11020                    $default_map = $property_object->default_map;
11021
11022                    # If the map is a ref, it means that the default won't be
11023                    # processed until later, so undef it, so next few lines
11024                    # will redefine it to something that nothing will match
11025                    undef $default_map if ref $default_map;
11026
11027                    # Create a $default_map if don't have one; maybe a dummy
11028                    # that won't match anything.
11029                    if (! defined $default_map) {
11030
11031                        # Use any @missings line in the file.
11032                        if (defined $missings) {
11033                            if (ref $missings) {
11034                                $default_map = $missings->full_name;
11035                                $default_table = $missings;
11036                            }
11037                            else {
11038                                $default_map = $missings;
11039                            }
11040
11041                            # And store it with the property for outside use.
11042                            $property_object->set_default_map($default_map);
11043                        }
11044                        else {
11045
11046                            # Neither an @missings nor a default map.  Create
11047                            # a dummy one, so won't have to test definedness
11048                            # in the main loop.
11049                            $default_map = '_Perl This will never be in a file
11050                                            from Unicode';
11051                        }
11052                    }
11053
11054                    # Here, we have $default_map defined, possibly in terms of
11055                    # $missings, but maybe not, and possibly is a dummy one.
11056                    if (defined $missings) {
11057
11058                        # Make sure there is no conflict between the two.
11059                        # $missings has priority.
11060                        if (ref $missings) {
11061                            $default_table
11062                                        = $property_object->table($default_map);
11063                            if (! defined $default_table
11064                                || $default_table != $missings)
11065                            {
11066                                if (! defined $default_table) {
11067                                    $default_table = $UNDEF;
11068                                }
11069                                $file->carp_bad_line(<<END
11070The \@missings line for $property_name in $file says that missings default to
11071$missings, but we expect it to be $default_table.  $missings used.
11072END
11073                                );
11074                                $default_table = $missings;
11075                                $default_map = $missings->full_name;
11076                            }
11077                            $property_info{$property_addr}{$DEFAULT_TABLE}
11078                                                        = $default_table;
11079                        }
11080                        elsif ($default_map ne $missings) {
11081                            $file->carp_bad_line(<<END
11082The \@missings line for $property_name in $file says that missings default to
11083$missings, but we expect it to be $default_map.  $missings used.
11084END
11085                            );
11086                            $default_map = $missings;
11087                        }
11088                    }
11089
11090                    $property_info{$property_addr}{$DEFAULT_MAP}
11091                                                    = $default_map;
11092
11093                    # If haven't done so already, find the table corresponding
11094                    # to this map for non-string properties.
11095                    if (! defined $default_table
11096                        && $property_type != $STRING
11097                        && $property_type != $UNKNOWN)
11098                    {
11099                        $default_table = $property_info{$property_addr}
11100                                                        {$DEFAULT_TABLE}
11101                                    = $property_object->table($default_map);
11102                    }
11103                } # End of is first time for this property
11104            } # End of switching properties.
11105
11106            # Ready to process the line.
11107            # The Unicode files are set up so that if the map is not defined,
11108            # it is a binary property with value 'Y'
11109            if (! defined $map) {
11110                $map = 'Y';
11111            }
11112            else {
11113
11114                # If the map begins with a special command to us (enclosed in
11115                # delimiters), extract the command(s).
11116                while ($map =~ s/ ^ $CMD_DELIM (.*?) $CMD_DELIM //x) {
11117                    my $command = $1;
11118                    if ($command =~  / ^ $REPLACE_CMD= (.*) /x) {
11119                        $replace = $1;
11120                    }
11121                    elsif ($command =~  / ^ $MAP_TYPE_CMD= (.*) /x) {
11122                        $map_type = $1;
11123                    }
11124                    else {
11125                        $file->carp_bad_line("Unknown command line: '$1'");
11126                        next LINE;
11127                    }
11128                }
11129            }
11130
11131            if ($default_map eq $CODE_POINT && $map =~ / ^ $code_point_re $/x)
11132            {
11133
11134                # Here, we have a map to a particular code point, and the
11135                # default map is to a code point itself.  If the range
11136                # includes the particular code point, change that portion of
11137                # the range to the default.  This makes sure that in the final
11138                # table only the non-defaults are listed.
11139                my $decimal_map = hex $map;
11140                if ($low <= $decimal_map && $decimal_map <= $high) {
11141
11142                    # If the range includes stuff before or after the map
11143                    # we're changing, split it and process the split-off parts
11144                    # later.
11145                    if ($low < $decimal_map) {
11146                        $file->insert_adjusted_lines(
11147                                            sprintf("%04X..%04X; %s; %s",
11148                                                    $low,
11149                                                    $decimal_map - 1,
11150                                                    $property_name,
11151                                                    $map));
11152                    }
11153                    if ($high > $decimal_map) {
11154                        $file->insert_adjusted_lines(
11155                                            sprintf("%04X..%04X; %s; %s",
11156                                                    $decimal_map + 1,
11157                                                    $high,
11158                                                    $property_name,
11159                                                    $map));
11160                    }
11161                    $low = $high = $decimal_map;
11162                    $map = $CODE_POINT;
11163                }
11164            }
11165
11166            # If we can tell that this is a synonym for the default map, use
11167            # the default one instead.
11168            if ($property_type != $STRING
11169                && $property_type != $UNKNOWN)
11170            {
11171                my $table = $property_object->table($map);
11172                if (defined $table && $table == $default_table) {
11173                    $map = $default_map;
11174                }
11175            }
11176
11177            # And figure out the map type if not known.
11178            if (! defined $map_type || $map_type == $COMPUTE_NO_MULTI_CP) {
11179                if ($map eq "") {   # Nulls are always $NULL map type
11180                    $map_type = $NULL;
11181                } # Otherwise, non-strings, and those that don't allow
11182                  # $MULTI_CP, and those that aren't multiple code points are
11183                  # 0
11184                elsif
11185                   (($property_type != $STRING && $property_type != $UNKNOWN)
11186                   || (defined $map_type && $map_type == $COMPUTE_NO_MULTI_CP)
11187                   || $map !~ /^ $code_point_re ( \  $code_point_re )+ $ /x)
11188                {
11189                    $map_type = 0;
11190                }
11191                else {
11192                    $map_type = $MULTI_CP;
11193                }
11194            }
11195
11196            $property_object->add_map($low, $high,
11197                                        $map,
11198                                        Type => $map_type,
11199                                        Replace => $replace);
11200        } # End of loop through file's lines
11201
11202        return;
11203    }
11204}
11205
11206{ # Closure for UnicodeData.txt handling
11207
11208    # This file was the first one in the UCD; its design leads to some
11209    # awkwardness in processing.  Here is a sample line:
11210    # 0041;LATIN CAPITAL LETTER A;Lu;0;L;;;;;N;;;;0061;
11211    # The fields in order are:
11212    my $i = 0;            # The code point is in field 0, and is shifted off.
11213    my $CHARNAME = $i++;  # character name (e.g. "LATIN CAPITAL LETTER A")
11214    my $CATEGORY = $i++;  # category (e.g. "Lu")
11215    my $CCC = $i++;       # Canonical combining class (e.g. "230")
11216    my $BIDI = $i++;      # directional class (e.g. "L")
11217    my $PERL_DECOMPOSITION = $i++;  # decomposition mapping
11218    my $PERL_DECIMAL_DIGIT = $i++;   # decimal digit value
11219    my $NUMERIC_TYPE_OTHER_DIGIT = $i++; # digit value, like a superscript
11220                                         # Dual-use in this program; see below
11221    my $NUMERIC = $i++;   # numeric value
11222    my $MIRRORED = $i++;  # ? mirrored
11223    my $UNICODE_1_NAME = $i++; # name in Unicode 1.0
11224    my $COMMENT = $i++;   # iso comment
11225    my $UPPER = $i++;     # simple uppercase mapping
11226    my $LOWER = $i++;     # simple lowercase mapping
11227    my $TITLE = $i++;     # simple titlecase mapping
11228    my $input_field_count = $i;
11229
11230    # This routine in addition outputs these extra fields:
11231
11232    my $DECOMP_TYPE = $i++; # Decomposition type
11233
11234    # These fields are modifications of ones above, and are usually
11235    # suppressed; they must come last, as for speed, the loop upper bound is
11236    # normally set to ignore them
11237    my $NAME = $i++;        # This is the strict name field, not the one that
11238                            # charnames uses.
11239    my $DECOMP_MAP = $i++;  # Strict decomposition mapping; not the one used
11240                            # by Unicode::Normalize
11241    my $last_field = $i - 1;
11242
11243    # All these are read into an array for each line, with the indices defined
11244    # above.  The empty fields in the example line above indicate that the
11245    # value is defaulted.  The handler called for each line of the input
11246    # changes these to their defaults.
11247
11248    # Here are the official names of the properties, in a parallel array:
11249    my @field_names;
11250    $field_names[$BIDI] = 'Bidi_Class';
11251    $field_names[$CATEGORY] = 'General_Category';
11252    $field_names[$CCC] = 'Canonical_Combining_Class';
11253    $field_names[$CHARNAME] = 'Perl_Charnames';
11254    $field_names[$COMMENT] = 'ISO_Comment';
11255    $field_names[$DECOMP_MAP] = 'Decomposition_Mapping';
11256    $field_names[$DECOMP_TYPE] = 'Decomposition_Type';
11257    $field_names[$LOWER] = 'Lowercase_Mapping';
11258    $field_names[$MIRRORED] = 'Bidi_Mirrored';
11259    $field_names[$NAME] = 'Name';
11260    $field_names[$NUMERIC] = 'Numeric_Value';
11261    $field_names[$NUMERIC_TYPE_OTHER_DIGIT] = 'Numeric_Type';
11262    $field_names[$PERL_DECIMAL_DIGIT] = 'Perl_Decimal_Digit';
11263    $field_names[$PERL_DECOMPOSITION] = 'Perl_Decomposition_Mapping';
11264    $field_names[$TITLE] = 'Titlecase_Mapping';
11265    $field_names[$UNICODE_1_NAME] = 'Unicode_1_Name';
11266    $field_names[$UPPER] = 'Uppercase_Mapping';
11267
11268    # Some of these need a little more explanation:
11269    # The $PERL_DECIMAL_DIGIT field does not lead to an official Unicode
11270    #   property, but is used in calculating the Numeric_Type.  Perl however,
11271    #   creates a file from this field, so a Perl property is created from it.
11272    # Similarly, the Other_Digit field is used only for calculating the
11273    #   Numeric_Type, and so it can be safely re-used as the place to store
11274    #   the value for Numeric_Type; hence it is referred to as
11275    #   $NUMERIC_TYPE_OTHER_DIGIT.
11276    # The input field named $PERL_DECOMPOSITION is a combination of both the
11277    #   decomposition mapping and its type.  Perl creates a file containing
11278    #   exactly this field, so it is used for that.  The two properties are
11279    #   separated into two extra output fields, $DECOMP_MAP and $DECOMP_TYPE.
11280    #   $DECOMP_MAP is usually suppressed (unless the lists are changed to
11281    #   output it), as Perl doesn't use it directly.
11282    # The input field named here $CHARNAME is used to construct the
11283    #   Perl_Charnames property, which is a combination of the Name property
11284    #   (which the input field contains), and the Unicode_1_Name property, and
11285    #   others from other files.  Since, the strict Name property is not used
11286    #   by Perl, this field is used for the table that Perl does use.  The
11287    #   strict Name property table is usually suppressed (unless the lists are
11288    #   changed to output it), so it is accumulated in a separate field,
11289    #   $NAME, which to save time is discarded unless the table is actually to
11290    #   be output
11291
11292    # This file is processed like most in this program.  Control is passed to
11293    # process_generic_property_file() which calls filter_UnicodeData_line()
11294    # for each input line.  This filter converts the input into line(s) that
11295    # process_generic_property_file() understands.  There is also a setup
11296    # routine called before any of the file is processed, and a handler for
11297    # EOF processing, all in this closure.
11298
11299    # A huge speed-up occurred at the cost of some added complexity when these
11300    # routines were altered to buffer the outputs into ranges.  Almost all the
11301    # lines of the input file apply to just one code point, and for most
11302    # properties, the map for the next code point up is the same as the
11303    # current one.  So instead of creating a line for each property for each
11304    # input line, filter_UnicodeData_line() remembers what the previous map
11305    # of a property was, and doesn't generate a line to pass on until it has
11306    # to, as when the map changes; and that passed-on line encompasses the
11307    # whole contiguous range of code points that have the same map for that
11308    # property.  This means a slight amount of extra setup, and having to
11309    # flush these buffers on EOF, testing if the maps have changed, plus
11310    # remembering state information in the closure.  But it means a lot less
11311    # real time in not having to change the data base for each property on
11312    # each line.
11313
11314    # Another complication is that there are already a few ranges designated
11315    # in the input.  There are two lines for each, with the same maps except
11316    # the code point and name on each line.  This was actually the hardest
11317    # thing to design around.  The code points in those ranges may actually
11318    # have real maps not given by these two lines.  These maps will either
11319    # be algorithmically determinable, or be in the extracted files furnished
11320    # with the UCD.  In the event of conflicts between these extracted files,
11321    # and this one, Unicode says that this one prevails.  But it shouldn't
11322    # prevail for conflicts that occur in these ranges.  The data from the
11323    # extracted files prevails in those cases.  So, this program is structured
11324    # so that those files are processed first, storing maps.  Then the other
11325    # files are processed, generally overwriting what the extracted files
11326    # stored.  But just the range lines in this input file are processed
11327    # without overwriting.  This is accomplished by adding a special string to
11328    # the lines output to tell process_generic_property_file() to turn off the
11329    # overwriting for just this one line.
11330    # A similar mechanism is used to tell it that the map is of a non-default
11331    # type.
11332
11333    sub setup_UnicodeData($file) { # Called before any lines of the input are read
11334
11335        # Create a new property specially located that is a combination of
11336        # various Name properties: Name, Unicode_1_Name, Named Sequences, and
11337        # _Perl_Name_Alias properties.  (The final one duplicates elements of the
11338        # first, and starting in v6.1, is the same as the 'Name_Alias
11339        # property.)  A comment for the new property will later be constructed
11340        # based on the actual properties present and used
11341        $perl_charname = Property->new('Perl_Charnames',
11342                       Default_Map => "",
11343                       Directory => File::Spec->curdir(),
11344                       File => 'Name',
11345                       Fate => $INTERNAL_ONLY,
11346                       Perl_Extension => 1,
11347                       Range_Size_1 => \&output_perl_charnames_line,
11348                       Type => $STRING,
11349                       );
11350        $perl_charname->set_proxy_for('Name');
11351
11352        my $Perl_decomp = Property->new('Perl_Decomposition_Mapping',
11353                                        Directory => File::Spec->curdir(),
11354                                        File => 'Decomposition',
11355                                        Format => $DECOMP_STRING_FORMAT,
11356                                        Fate => $INTERNAL_ONLY,
11357                                        Perl_Extension => 1,
11358                                        Default_Map => $CODE_POINT,
11359
11360                                        # normalize.pm can't cope with these
11361                                        Output_Range_Counts => 0,
11362
11363                                        # This is a specially formatted table
11364                                        # explicitly for normalize.pm, which
11365                                        # is expecting a particular format,
11366                                        # which means that mappings containing
11367                                        # multiple code points are in the main
11368                                        # body of the table
11369                                        Map_Type => $COMPUTE_NO_MULTI_CP,
11370                                        Type => $STRING,
11371                                        To_Output_Map => $INTERNAL_MAP,
11372                                        );
11373        $Perl_decomp->set_proxy_for('Decomposition_Mapping', 'Decomposition_Type');
11374        $Perl_decomp->add_comment(join_lines(<<END
11375This mapping is a combination of the Unicode 'Decomposition_Type' and
11376'Decomposition_Mapping' properties, formatted for use by normalize.pm.  It is
11377identical to the official Unicode 'Decomposition_Mapping' property except for
11378two things:
11379 1) It omits the algorithmically determinable Hangul syllable decompositions,
11380which normalize.pm handles algorithmically.
11381 2) It contains the decomposition type as well.  Non-canonical decompositions
11382begin with a word in angle brackets, like <super>, which denotes the
11383compatible decomposition type.  If the map does not begin with the <angle
11384brackets>, the decomposition is canonical.
11385END
11386        ));
11387
11388        my $Decimal_Digit = Property->new("Perl_Decimal_Digit",
11389                                        Default_Map => "",
11390                                        Perl_Extension => 1,
11391                                        Directory => $map_directory,
11392                                        Type => $STRING,
11393                                        To_Output_Map => $OUTPUT_ADJUSTED,
11394                                        );
11395        $Decimal_Digit->add_comment(join_lines(<<END
11396This file gives the mapping of all code points which represent a single
11397decimal digit [0-9] to their respective digits, but it has ranges of 10 code
11398points, and the mapping of each non-initial element of each range is actually
11399not to "0", but to the offset that element has from its corresponding DIGIT 0.
11400These code points are those that have Numeric_Type=Decimal; not special
11401things, like subscripts nor Roman numerals.
11402END
11403        ));
11404
11405        # These properties are not used for generating anything else, and are
11406        # usually not output.  By making them last in the list, we can just
11407        # change the high end of the loop downwards to avoid the work of
11408        # generating a table(s) that is/are just going to get thrown away.
11409        if (! property_ref('Decomposition_Mapping')->to_output_map
11410            && ! property_ref('Name')->to_output_map)
11411        {
11412            $last_field = min($NAME, $DECOMP_MAP) - 1;
11413        } elsif (property_ref('Decomposition_Mapping')->to_output_map) {
11414            $last_field = $DECOMP_MAP;
11415        } elsif (property_ref('Name')->to_output_map) {
11416            $last_field = $NAME;
11417        }
11418        return;
11419    }
11420
11421    my $first_time = 1;                 # ? Is this the first line of the file
11422    my $in_range = 0;                   # ? Are we in one of the file's ranges
11423    my $previous_cp;                    # hex code point of previous line
11424    my $decimal_previous_cp = -1;       # And its decimal equivalent
11425    my @start;                          # For each field, the current starting
11426                                        # code point in hex for the range
11427                                        # being accumulated.
11428    my @fields;                         # The input fields;
11429    my @previous_fields;                # And those from the previous call
11430
11431    sub filter_UnicodeData_line($file) {
11432        # Handle a single input line from UnicodeData.txt; see comments above
11433        # Conceptually this takes a single line from the file containing N
11434        # properties, and converts it into N lines with one property per line,
11435        # which is what the final handler expects.  But there are
11436        # complications due to the quirkiness of the input file, and to save
11437        # time, it accumulates ranges where the property values don't change
11438        # and only emits lines when necessary.  This is about an order of
11439        # magnitude fewer lines emitted.
11440
11441        # $_ contains the input line.
11442        # -1 in split means retain trailing null fields
11443        (my $cp, @fields) = split /\s*;\s*/, $_, -1;
11444
11445        #local $to_trace = 1 if main::DEBUG;
11446        trace $cp, @fields , $input_field_count if main::DEBUG && $to_trace;
11447        if (@fields > $input_field_count) {
11448            $file->carp_bad_line('Extra fields');
11449            $_ = "";
11450            return;
11451        }
11452
11453        my $decimal_cp = hex $cp;
11454
11455        # We have to output all the buffered ranges when the next code point
11456        # is not exactly one after the previous one, which means there is a
11457        # gap in the ranges.
11458        my $force_output = ($decimal_cp != $decimal_previous_cp + 1);
11459
11460        # The decomposition mapping field requires special handling.  It looks
11461        # like either:
11462        #
11463        # <compat> 0032 0020
11464        # 0041 0300
11465        #
11466        # The decomposition type is enclosed in <brackets>; if missing, it
11467        # means the type is canonical.  There are two decomposition mapping
11468        # tables: the one for use by Perl's normalize.pm has a special format
11469        # which is this field intact; the other, for general use is of
11470        # standard format.  In either case we have to find the decomposition
11471        # type.  Empty fields have None as their type, and map to the code
11472        # point itself
11473        if ($fields[$PERL_DECOMPOSITION] eq "") {
11474            $fields[$DECOMP_TYPE] = 'None';
11475            $fields[$DECOMP_MAP] = $fields[$PERL_DECOMPOSITION] = $CODE_POINT;
11476        }
11477        else {
11478            ($fields[$DECOMP_TYPE], my $map) = $fields[$PERL_DECOMPOSITION]
11479                                            =~ / < ( .+? ) > \s* ( .+ ) /x;
11480            if (! defined $fields[$DECOMP_TYPE]) {
11481                $fields[$DECOMP_TYPE] = 'Canonical';
11482                $fields[$DECOMP_MAP] = $fields[$PERL_DECOMPOSITION];
11483            }
11484            else {
11485                $fields[$DECOMP_MAP] = $map;
11486            }
11487        }
11488
11489        # The 3 numeric fields also require special handling.  The 2 digit
11490        # fields must be either empty or match the number field.  This means
11491        # that if it is empty, they must be as well, and the numeric type is
11492        # None, and the numeric value is 'Nan'.
11493        # The decimal digit field must be empty or match the other digit
11494        # field.  If the decimal digit field is non-empty, the code point is
11495        # a decimal digit, and the other two fields will have the same value.
11496        # If it is empty, but the other digit field is non-empty, the code
11497        # point is an 'other digit', and the number field will have the same
11498        # value as the other digit field.  If the other digit field is empty,
11499        # but the number field is non-empty, the code point is a generic
11500        # numeric type.
11501        if ($fields[$NUMERIC] eq "") {
11502            if ($fields[$PERL_DECIMAL_DIGIT] ne ""
11503                || $fields[$NUMERIC_TYPE_OTHER_DIGIT] ne ""
11504            ) {
11505                $file->carp_bad_line("Numeric values inconsistent.  Trying to process anyway");
11506            }
11507            $fields[$NUMERIC_TYPE_OTHER_DIGIT] = 'None';
11508            $fields[$NUMERIC] = 'NaN';
11509        }
11510        else {
11511            $file->carp_bad_line("'$fields[$NUMERIC]' should be a whole or rational number.  Processing as if it were") if $fields[$NUMERIC] !~ qr{ ^ -? \d+ ( / \d+ )? $ }x;
11512            if ($fields[$PERL_DECIMAL_DIGIT] ne "") {
11513                $file->carp_bad_line("$fields[$PERL_DECIMAL_DIGIT] should equal $fields[$NUMERIC].  Processing anyway") if $fields[$PERL_DECIMAL_DIGIT] != $fields[$NUMERIC];
11514                $file->carp_bad_line("$fields[$PERL_DECIMAL_DIGIT] should be empty since the general category ($fields[$CATEGORY]) isn't 'Nd'.  Processing as Decimal") if $fields[$CATEGORY] ne "Nd";
11515                $fields[$NUMERIC_TYPE_OTHER_DIGIT] = 'Decimal';
11516            }
11517            elsif ($fields[$NUMERIC_TYPE_OTHER_DIGIT] ne "") {
11518                $file->carp_bad_line("$fields[$NUMERIC_TYPE_OTHER_DIGIT] should equal $fields[$NUMERIC].  Processing anyway") if $fields[$NUMERIC_TYPE_OTHER_DIGIT] != $fields[$NUMERIC];
11519                $fields[$NUMERIC_TYPE_OTHER_DIGIT] = 'Digit';
11520            }
11521            else {
11522                $fields[$NUMERIC_TYPE_OTHER_DIGIT] = 'Numeric';
11523
11524                # Rationals require extra effort.
11525                if ($fields[$NUMERIC] =~ qr{/}) {
11526                    reduce_fraction(\$fields[$NUMERIC]);
11527                    register_fraction($fields[$NUMERIC])
11528                }
11529            }
11530        }
11531
11532        # For the properties that have empty fields in the file, and which
11533        # mean something different from empty, change them to that default.
11534        # Certain fields just haven't been empty so far in any Unicode
11535        # version, so don't look at those, namely $MIRRORED, $BIDI, $CCC,
11536        # $CATEGORY.  This leaves just the two fields, and so we hard-code in
11537        # the defaults; which are very unlikely to ever change.
11538        $fields[$UPPER] = $CODE_POINT if $fields[$UPPER] eq "";
11539        $fields[$LOWER] = $CODE_POINT if $fields[$LOWER] eq "";
11540
11541        # UAX44 says that if title is empty, it is the same as whatever upper
11542        # is,
11543        $fields[$TITLE] = $fields[$UPPER] if $fields[$TITLE] eq "";
11544
11545        # There are a few pairs of lines like:
11546        #   AC00;<Hangul Syllable, First>;Lo;0;L;;;;;N;;;;;
11547        #   D7A3;<Hangul Syllable, Last>;Lo;0;L;;;;;N;;;;;
11548        # that define ranges.  These should be processed after the fields are
11549        # adjusted above, as they may override some of them; but mostly what
11550        # is left is to possibly adjust the $CHARNAME field.  The names of all the
11551        # paired lines start with a '<', but this is also true of '<control>,
11552        # which isn't one of these special ones.
11553        if ($fields[$CHARNAME] eq '<control>') {
11554
11555            # Some code points in this file have the pseudo-name
11556            # '<control>', but the official name for such ones is the null
11557            # string.
11558            $fields[$NAME] = $fields[$CHARNAME] = "";
11559
11560            # We had better not be in between range lines.
11561            if ($in_range) {
11562                $file->carp_bad_line("Expecting a closing range line, not a $fields[$CHARNAME]'.  Trying anyway");
11563                $in_range = 0;
11564            }
11565        }
11566        elsif (substr($fields[$CHARNAME], 0, 1) ne '<') {
11567
11568            # Here is a non-range line.  We had better not be in between range
11569            # lines.
11570            if ($in_range) {
11571                $file->carp_bad_line("Expecting a closing range line, not a $fields[$CHARNAME]'.  Trying anyway");
11572                $in_range = 0;
11573            }
11574            if ($fields[$CHARNAME] =~ s/- $cp $//x) {
11575
11576                # These are code points whose names end in their code points,
11577                # which means the names are algorithmically derivable from the
11578                # code points.  To shorten the output Name file, the algorithm
11579                # for deriving these is placed in the file instead of each
11580                # code point, so they have map type $CP_IN_NAME
11581                $fields[$CHARNAME] = $CMD_DELIM
11582                                 . $MAP_TYPE_CMD
11583                                 . '='
11584                                 . $CP_IN_NAME
11585                                 . $CMD_DELIM
11586                                 . $fields[$CHARNAME];
11587            }
11588            $fields[$NAME] = $fields[$CHARNAME];
11589        }
11590        elsif ($fields[$CHARNAME] =~ /^<(.+), First>$/) {
11591            $fields[$CHARNAME] = $fields[$NAME] = $1;
11592
11593            # Here we are at the beginning of a range pair.
11594            if ($in_range) {
11595                $file->carp_bad_line("Expecting a closing range line, not a beginning one, $fields[$CHARNAME]'.  Trying anyway");
11596            }
11597            $in_range = 1;
11598
11599            # Because the properties in the range do not overwrite any already
11600            # in the db, we must flush the buffers of what's already there, so
11601            # they get handled in the normal scheme.
11602            $force_output = 1;
11603
11604        }
11605        elsif ($fields[$CHARNAME] !~ s/^<(.+), Last>$/$1/) {
11606            $file->carp_bad_line("Unexpected name starting with '<' $fields[$CHARNAME].  Ignoring this line.");
11607            $_ = "";
11608            return;
11609        }
11610        else { # Here, we are at the last line of a range pair.
11611
11612            if (! $in_range) {
11613                $file->carp_bad_line("Unexpected end of range $fields[$CHARNAME] when not in one.  Ignoring this line.");
11614                $_ = "";
11615                return;
11616            }
11617            $in_range = 0;
11618
11619            $fields[$NAME] = $fields[$CHARNAME];
11620
11621            # Check that the input is valid: that the closing of the range is
11622            # the same as the beginning.
11623            foreach my $i (0 .. $last_field) {
11624                next if $fields[$i] eq $previous_fields[$i];
11625                $file->carp_bad_line("Expecting '$fields[$i]' to be the same as '$previous_fields[$i]'.  Bad News.  Trying anyway");
11626            }
11627
11628            # The processing differs depending on the type of range,
11629            # determined by its $CHARNAME
11630            if ($fields[$CHARNAME] =~ /^Hangul Syllable/) {
11631
11632                # Check that the data looks right.
11633                if ($decimal_previous_cp != $SBase) {
11634                    $file->carp_bad_line("Unexpected Hangul syllable start = $previous_cp.  Bad News.  Results will be wrong");
11635                }
11636                if ($decimal_cp != $SBase + $SCount - 1) {
11637                    $file->carp_bad_line("Unexpected Hangul syllable end = $cp.  Bad News.  Results will be wrong");
11638                }
11639
11640                # The Hangul syllable range has a somewhat complicated name
11641                # generation algorithm.  Each code point in it has a canonical
11642                # decomposition also computable by an algorithm.  The
11643                # perl decomposition map table built from these is used only
11644                # by normalize.pm, which has the algorithm built in it, so the
11645                # decomposition maps are not needed, and are large, so are
11646                # omitted from it.  If the full decomposition map table is to
11647                # be output, the decompositions are generated for it, in the
11648                # EOF handling code for this input file.
11649
11650                $previous_fields[$DECOMP_TYPE] = 'Canonical';
11651
11652                # This range is stored in our internal structure with its
11653                # own map type, different from all others.
11654                $previous_fields[$CHARNAME] = $previous_fields[$NAME]
11655                                        = $CMD_DELIM
11656                                          . $MAP_TYPE_CMD
11657                                          . '='
11658                                          . $HANGUL_SYLLABLE
11659                                          . $CMD_DELIM
11660                                          . $fields[$CHARNAME];
11661            }
11662            elsif ($fields[$CATEGORY] eq 'Lo') {    # Is a letter
11663
11664                # All the CJK ranges like this have the name given as a
11665                # special case in the next code line.  And for the others, we
11666                # hope that Unicode continues to use the correct name in
11667                # future releases, so we don't have to make further special
11668                # cases.
11669                my $name = ($fields[$CHARNAME] =~ /^CJK/)
11670                           ? 'CJK UNIFIED IDEOGRAPH'
11671                           : uc $fields[$CHARNAME];
11672
11673                # The name for these contains the code point itself, and all
11674                # are defined to have the same base name, regardless of what
11675                # is in the file.  They are stored in our internal structure
11676                # with a map type of $CP_IN_NAME
11677                $previous_fields[$CHARNAME] = $previous_fields[$NAME]
11678                                        = $CMD_DELIM
11679                                           . $MAP_TYPE_CMD
11680                                           . '='
11681                                           . $CP_IN_NAME
11682                                           . $CMD_DELIM
11683                                           . $name;
11684
11685            }
11686            elsif ($fields[$CATEGORY] eq 'Co'
11687                     || $fields[$CATEGORY] eq 'Cs')
11688            {
11689                # The names of all the code points in these ranges are set to
11690                # null, as there are no names for the private use and
11691                # surrogate code points.
11692
11693                $previous_fields[$CHARNAME] = $previous_fields[$NAME] = "";
11694            }
11695            else {
11696                $file->carp_bad_line("Unexpected code point range $fields[$CHARNAME] because category is $fields[$CATEGORY].  Attempting to process it.");
11697            }
11698
11699            # The first line of the range caused everything else to be output,
11700            # and then its values were stored as the beginning values for the
11701            # next set of ranges, which this one ends.  Now, for each value,
11702            # add a command to tell the handler that these values should not
11703            # replace any existing ones in our database.
11704            foreach my $i (0 .. $last_field) {
11705                $previous_fields[$i] = $CMD_DELIM
11706                                        . $REPLACE_CMD
11707                                        . '='
11708                                        . $NO
11709                                        . $CMD_DELIM
11710                                        . $previous_fields[$i];
11711            }
11712
11713            # And change things so it looks like the entire range has been
11714            # gone through with this being the final part of it.  Adding the
11715            # command above to each field will cause this range to be flushed
11716            # during the next iteration, as it guaranteed that the stored
11717            # field won't match whatever value the next one has.
11718            $previous_cp = $cp;
11719            $decimal_previous_cp = $decimal_cp;
11720
11721            # We are now set up for the next iteration; so skip the remaining
11722            # code in this subroutine that does the same thing, but doesn't
11723            # know about these ranges.
11724            $_ = "";
11725
11726            return;
11727        }
11728
11729        # On the very first line, we fake it so the code below thinks there is
11730        # nothing to output, and initialize so that when it does get output it
11731        # uses the first line's values for the lowest part of the range.
11732        # (One could avoid this by using peek(), but then one would need to
11733        # know the adjustments done above and do the same ones in the setup
11734        # routine; not worth it)
11735        if ($first_time) {
11736            $first_time = 0;
11737            @previous_fields = @fields;
11738            @start = ($cp) x scalar @fields;
11739            $decimal_previous_cp = $decimal_cp - 1;
11740        }
11741
11742        # For each field, output the stored up ranges that this code point
11743        # doesn't fit in.  Earlier we figured out if all ranges should be
11744        # terminated because of changing the replace or map type styles, or if
11745        # there is a gap between this new code point and the previous one, and
11746        # that is stored in $force_output.  But even if those aren't true, we
11747        # need to output the range if this new code point's value for the
11748        # given property doesn't match the stored range's.
11749        #local $to_trace = 1 if main::DEBUG;
11750        foreach my $i (0 .. $last_field) {
11751            my $field = $fields[$i];
11752            if ($force_output || $field ne $previous_fields[$i]) {
11753
11754                # Flush the buffer of stored values.
11755                $file->insert_adjusted_lines("$start[$i]..$previous_cp; $field_names[$i]; $previous_fields[$i]");
11756
11757                # Start a new range with this code point and its value
11758                $start[$i] = $cp;
11759                $previous_fields[$i] = $field;
11760            }
11761        }
11762
11763        # Set the values for the next time.
11764        $previous_cp = $cp;
11765        $decimal_previous_cp = $decimal_cp;
11766
11767        # The input line has generated whatever adjusted lines are needed, and
11768        # should not be looked at further.
11769        $_ = "";
11770        return;
11771    }
11772
11773    sub EOF_UnicodeData($file) {
11774        # Called upon EOF to flush the buffers, and create the Hangul
11775        # decomposition mappings if needed.
11776
11777        # Flush the buffers.
11778        foreach my $i (0 .. $last_field) {
11779            $file->insert_adjusted_lines("$start[$i]..$previous_cp; $field_names[$i]; $previous_fields[$i]");
11780        }
11781
11782        if (-e 'Jamo.txt') {
11783
11784            # The algorithm is published by Unicode, based on values in
11785            # Jamo.txt, (which should have been processed before this
11786            # subroutine), and the results left in %Jamo
11787            unless (%Jamo) {
11788                Carp::my_carp_bug("Jamo.txt should be processed before Unicode.txt.  Hangul syllables not generated.");
11789                return;
11790            }
11791
11792            # If the full decomposition map table is being output, insert
11793            # into it the Hangul syllable mappings.  This is to avoid having
11794            # to publish a subroutine in it to compute them.  (which would
11795            # essentially be this code.)  This uses the algorithm published by
11796            # Unicode.  (No hangul syllables in version 1)
11797            if ($v_version ge v2.0.0
11798                && property_ref('Decomposition_Mapping')->to_output_map) {
11799                for (my $S = $SBase; $S < $SBase + $SCount; $S++) {
11800                    use integer;
11801                    my $SIndex = $S - $SBase;
11802                    my $L = $LBase + $SIndex / $NCount;
11803                    my $V = $VBase + ($SIndex % $NCount) / $TCount;
11804                    my $T = $TBase + $SIndex % $TCount;
11805
11806                    trace "L=$L, V=$V, T=$T" if main::DEBUG && $to_trace;
11807                    my $decomposition = sprintf("%04X %04X", $L, $V);
11808                    $decomposition .= sprintf(" %04X", $T) if $T != $TBase;
11809                    $file->insert_adjusted_lines(
11810                                sprintf("%04X; Decomposition_Mapping; %s",
11811                                        $S,
11812                                        $decomposition));
11813                }
11814            }
11815        }
11816
11817        return;
11818    }
11819
11820    sub filter_v1_ucd($file) {
11821        # Fix UCD lines in version 1.  This is probably overkill, but this
11822        # fixes some glaring errors in Version 1 UnicodeData.txt.  That file:
11823        # 1)    had many Hangul (U+3400 - U+4DFF) code points that were later
11824        #       removed.  This program retains them
11825        # 2)    didn't include ranges, which it should have, and which are now
11826        #       added in @corrected_lines below.  It was hand populated by
11827        #       taking the data from Version 2, verified by analyzing
11828        #       DAge.txt.
11829        # 3)    There is a syntax error in the entry for U+09F8 which could
11830        #       cause problems for Unicode::UCD, and so is changed.  It's
11831        #       numeric value was simply a minus sign, without any number.
11832        #       (Eventually Unicode changed the code point to non-numeric.)
11833        # 4)    The decomposition types often don't match later versions
11834        #       exactly, and the whole syntax of that field is different; so
11835        #       the syntax is changed as well as the types to their later
11836        #       terminology.  Otherwise normalize.pm would be very unhappy
11837        # 5)    Many ccc classes are different.  These are left intact.
11838        # 6)    U+FF10..U+FF19 are missing their numeric values in all three
11839        #       fields.  These are unchanged because it doesn't really cause
11840        #       problems for Perl.
11841        # 7)    A number of code points, such as controls, don't have their
11842        #       Unicode Version 1 Names in this file.  These are added.
11843        # 8)    A number of Symbols were marked as Lm.  This changes those in
11844        #       the Latin1 range, so that regexes work.
11845        # 9)    The odd characters U+03DB .. U+03E1 weren't encoded but are
11846        #       referred to by their lc equivalents.  Not fixed.
11847
11848        my @corrected_lines = split /\n/, <<'END';
118494E00;<CJK Ideograph, First>;Lo;0;L;;;;;N;;;;;
118509FA5;<CJK Ideograph, Last>;Lo;0;L;;;;;N;;;;;
11851E000;<Private Use, First>;Co;0;L;;;;;N;;;;;
11852F8FF;<Private Use, Last>;Co;0;L;;;;;N;;;;;
11853F900;<CJK Compatibility Ideograph, First>;Lo;0;L;;;;;N;;;;;
11854FA2D;<CJK Compatibility Ideograph, Last>;Lo;0;L;;;;;N;;;;;
11855END
11856
11857        #local $to_trace = 1 if main::DEBUG;
11858        trace $_ if main::DEBUG && $to_trace;
11859
11860        # -1 => retain trailing null fields
11861        my ($code_point, @fields) = split /\s*;\s*/, $_, -1;
11862
11863        # At the first place that is wrong in the input, insert all the
11864        # corrections, replacing the wrong line.
11865        if ($code_point eq '4E00') {
11866            my @copy = @corrected_lines;
11867            $_ = shift @copy;
11868            ($code_point, @fields) = split /\s*;\s*/, $_, -1;
11869
11870            $file->insert_lines(@copy);
11871        }
11872        elsif ($code_point =~ /^00/ && $fields[$CATEGORY] eq 'Lm') {
11873
11874            # There are no Lm characters in Latin1; these should be 'Sk', but
11875            # there isn't that in V1.
11876            $fields[$CATEGORY] = 'So';
11877        }
11878
11879        if ($fields[$NUMERIC] eq '-') {
11880            $fields[$NUMERIC] = '-1';  # This is what 2.0 made it.
11881        }
11882
11883        if  ($fields[$PERL_DECOMPOSITION] ne "") {
11884
11885            # Several entries have this change to superscript 2 or 3 in the
11886            # middle.  Convert these to the modern version, which is to use
11887            # the actual U+00B2 and U+00B3 (the superscript forms) instead.
11888            # So 'HHHH HHHH <+sup> 0033 <-sup> HHHH' becomes
11889            # 'HHHH HHHH 00B3 HHHH'.
11890            # It turns out that all of these that don't have another
11891            # decomposition defined at the beginning of the line have the
11892            # <square> decomposition in later releases.
11893            if ($code_point ne '00B2' && $code_point ne '00B3') {
11894                if  ($fields[$PERL_DECOMPOSITION]
11895                                    =~ s/<\+sup> 003([23]) <-sup>/00B$1/)
11896                {
11897                    if (substr($fields[$PERL_DECOMPOSITION], 0, 1) ne '<') {
11898                        $fields[$PERL_DECOMPOSITION] = '<square> '
11899                        . $fields[$PERL_DECOMPOSITION];
11900                    }
11901                }
11902            }
11903
11904            # If is like '<+circled> 0052 <-circled>', convert to
11905            # '<circled> 0052'
11906            $fields[$PERL_DECOMPOSITION] =~
11907                            s/ < \+ ( .*? ) > \s* (.*?) \s* <-\1> /<$1> $2/xg;
11908
11909            # Convert '<join> HHHH HHHH <join>' to '<medial> HHHH HHHH', etc.
11910            $fields[$PERL_DECOMPOSITION] =~
11911                            s/ <join> \s* (.*?) \s* <no-join> /<final> $1/x
11912            or $fields[$PERL_DECOMPOSITION] =~
11913                            s/ <join> \s* (.*?) \s* <join> /<medial> $1/x
11914            or $fields[$PERL_DECOMPOSITION] =~
11915                            s/ <no-join> \s* (.*?) \s* <join> /<initial> $1/x
11916            or $fields[$PERL_DECOMPOSITION] =~
11917                        s/ <no-join> \s* (.*?) \s* <no-join> /<isolated> $1/x;
11918
11919            # Convert '<break> HHHH HHHH <break>' to '<break> HHHH', etc.
11920            $fields[$PERL_DECOMPOSITION] =~
11921                    s/ <(break|no-break)> \s* (.*?) \s* <\1> /<$1> $2/x;
11922
11923            # Change names to modern form.
11924            $fields[$PERL_DECOMPOSITION] =~ s/<font variant>/<font>/g;
11925            $fields[$PERL_DECOMPOSITION] =~ s/<no-break>/<noBreak>/g;
11926            $fields[$PERL_DECOMPOSITION] =~ s/<circled>/<circle>/g;
11927            $fields[$PERL_DECOMPOSITION] =~ s/<break>/<fraction>/g;
11928
11929            # One entry has weird braces
11930            $fields[$PERL_DECOMPOSITION] =~ s/[{}]//g;
11931
11932            # One entry at U+2116 has an extra <sup>
11933            $fields[$PERL_DECOMPOSITION] =~ s/( < .*? > .* ) < .*? > \ * /$1/x;
11934        }
11935
11936        $_ = join ';', $code_point, @fields;
11937        trace $_ if main::DEBUG && $to_trace;
11938        return;
11939    }
11940
11941    sub filter_bad_Nd_ucd {
11942        # Early versions specified a value in the decimal digit field even
11943        # though the code point wasn't a decimal digit.  Clear the field in
11944        # that situation, so that the main code doesn't think it is a decimal
11945        # digit.
11946
11947        my ($code_point, @fields) = split /\s*;\s*/, $_, -1;
11948        if ($fields[$PERL_DECIMAL_DIGIT] ne "" && $fields[$CATEGORY] ne 'Nd') {
11949            $fields[$PERL_DECIMAL_DIGIT] = "";
11950            $_ = join ';', $code_point, @fields;
11951        }
11952        return;
11953    }
11954
11955    my @U1_control_names = split /\n/, <<'END';
11956NULL
11957START OF HEADING
11958START OF TEXT
11959END OF TEXT
11960END OF TRANSMISSION
11961ENQUIRY
11962ACKNOWLEDGE
11963BELL
11964BACKSPACE
11965HORIZONTAL TABULATION
11966LINE FEED
11967VERTICAL TABULATION
11968FORM FEED
11969CARRIAGE RETURN
11970SHIFT OUT
11971SHIFT IN
11972DATA LINK ESCAPE
11973DEVICE CONTROL ONE
11974DEVICE CONTROL TWO
11975DEVICE CONTROL THREE
11976DEVICE CONTROL FOUR
11977NEGATIVE ACKNOWLEDGE
11978SYNCHRONOUS IDLE
11979END OF TRANSMISSION BLOCK
11980CANCEL
11981END OF MEDIUM
11982SUBSTITUTE
11983ESCAPE
11984FILE SEPARATOR
11985GROUP SEPARATOR
11986RECORD SEPARATOR
11987UNIT SEPARATOR
11988DELETE
11989BREAK PERMITTED HERE
11990NO BREAK HERE
11991INDEX
11992NEXT LINE
11993START OF SELECTED AREA
11994END OF SELECTED AREA
11995CHARACTER TABULATION SET
11996CHARACTER TABULATION WITH JUSTIFICATION
11997LINE TABULATION SET
11998PARTIAL LINE DOWN
11999PARTIAL LINE UP
12000REVERSE LINE FEED
12001SINGLE SHIFT TWO
12002SINGLE SHIFT THREE
12003DEVICE CONTROL STRING
12004PRIVATE USE ONE
12005PRIVATE USE TWO
12006SET TRANSMIT STATE
12007CANCEL CHARACTER
12008MESSAGE WAITING
12009START OF GUARDED AREA
12010END OF GUARDED AREA
12011START OF STRING
12012SINGLE CHARACTER INTRODUCER
12013CONTROL SEQUENCE INTRODUCER
12014STRING TERMINATOR
12015OPERATING SYSTEM COMMAND
12016PRIVACY MESSAGE
12017APPLICATION PROGRAM COMMAND
12018END
12019
12020    sub filter_early_U1_names {
12021        # Very early versions did not have the Unicode_1_name field specified.
12022        # They differed in which ones were present; make sure a U1 name
12023        # exists, so that Unicode::UCD::charinfo will work
12024
12025        my ($code_point, @fields) = split /\s*;\s*/, $_, -1;
12026
12027
12028        # @U1_control names above are entirely positional, so we pull them out
12029        # in the exact order required, with gaps for the ones that don't have
12030        # names.
12031        if ($code_point =~ /^00[01]/
12032            || $code_point eq '007F'
12033            || $code_point =~ /^008[2-9A-F]/
12034            || $code_point =~ /^009[0-8A-F]/)
12035        {
12036            my $u1_name = shift @U1_control_names;
12037            $fields[$UNICODE_1_NAME] = $u1_name unless $fields[$UNICODE_1_NAME];
12038            $_ = join ';', $code_point, @fields;
12039        }
12040        return;
12041    }
12042
12043    sub filter_v2_1_5_ucd {
12044        # A dozen entries in this 2.1.5 file had the mirrored and numeric
12045        # columns swapped;  These all had mirrored be 'N'.  So if the numeric
12046        # column appears to be N, swap it back.
12047
12048        my ($code_point, @fields) = split /\s*;\s*/, $_, -1;
12049        if ($fields[$NUMERIC] eq 'N') {
12050            $fields[$NUMERIC] = $fields[$MIRRORED];
12051            $fields[$MIRRORED] = 'N';
12052            $_ = join ';', $code_point, @fields;
12053        }
12054        return;
12055    }
12056
12057    sub filter_v6_ucd {
12058
12059        # Unicode 6.0 co-opted the name BELL for U+1F514, but until 5.17,
12060        # it wasn't accepted, to allow for some deprecation cycles.  This
12061        # function is not called after 5.16
12062
12063        return if $_ !~ /^(?:0007|1F514|070F);/;
12064
12065        my ($code_point, @fields) = split /\s*;\s*/, $_, -1;
12066        if ($code_point eq '0007') {
12067            $fields[$CHARNAME] = "";
12068        }
12069        elsif ($code_point eq '070F') { # Unicode Corrigendum #8; see
12070                            # http://www.unicode.org/versions/corrigendum8.html
12071            $fields[$BIDI] = "AL";
12072        }
12073        elsif ($^V lt v5.18.0) { # For 5.18 will convert to use Unicode's name
12074            $fields[$CHARNAME] = "";
12075        }
12076
12077        $_ = join ';', $code_point, @fields;
12078
12079        return;
12080    }
12081} # End closure for UnicodeData
12082
12083sub process_GCB_test($file) {
12084
12085    while ($file->next_line) {
12086        push @backslash_X_tests, $_;
12087    }
12088
12089    return;
12090}
12091
12092sub process_LB_test($file) {
12093
12094    while ($file->next_line) {
12095        push @LB_tests, $_;
12096    }
12097
12098    return;
12099}
12100
12101sub process_SB_test($file) {
12102
12103    while ($file->next_line) {
12104        push @SB_tests, $_;
12105    }
12106
12107    return;
12108}
12109
12110sub process_WB_test($file) {
12111
12112    while ($file->next_line) {
12113        push @WB_tests, $_;
12114    }
12115
12116    return;
12117}
12118
12119sub process_NamedSequences($file) {
12120    # NamedSequences.txt entries are just added to an array.  Because these
12121    # don't look like the other tables, they have their own handler.
12122    # An example:
12123    # LATIN CAPITAL LETTER A WITH MACRON AND GRAVE;0100 0300
12124    #
12125    # This just adds the sequence to an array for later handling
12126
12127    while ($file->next_line) {
12128        my ($name, $sequence, @remainder) = split /\s*;\s*/, $_, -1;
12129        if (@remainder) {
12130            $file->carp_bad_line(
12131                "Doesn't look like 'KHMER VOWEL SIGN OM;17BB 17C6'");
12132            next;
12133        }
12134
12135        # Code points need to be 5 digits long like the other entries in
12136        # Name.pl, for regcomp.c parsing; and the ones below 0x0100 need to be
12137        # converted to native
12138        $sequence = join " ", map { sprintf("%05X",
12139                                    utf8::unicode_to_native(hex $_))
12140                                  } split / /, $sequence;
12141        push @named_sequences, "$sequence\n$name\n";
12142    }
12143    return;
12144}
12145
12146{ # Closure
12147
12148    my $first_range;
12149
12150    sub  filter_early_ea_lb {
12151        # Fixes early EastAsianWidth.txt and LineBreak.txt files.  These had a
12152        # third field be the name of the code point, which can be ignored in
12153        # most cases.  But it can be meaningful if it marks a range:
12154        # 33FE;W;IDEOGRAPHIC TELEGRAPH SYMBOL FOR DAY THIRTY-ONE
12155        # 3400;W;<CJK Ideograph Extension A, First>
12156        #
12157        # We need to see the First in the example above to know it's a range.
12158        # They did not use the later range syntaxes.  This routine changes it
12159        # to use the modern syntax.
12160        # $1 is the Input_file object.
12161
12162        my @fields = split /\s*;\s*/;
12163        if ($fields[2] =~ /^<.*, First>/) {
12164            $first_range = $fields[0];
12165            $_ = "";
12166        }
12167        elsif ($fields[2] =~ /^<.*, Last>/) {
12168            $_ = $_ = "$first_range..$fields[0]; $fields[1]";
12169        }
12170        else {
12171            undef $first_range;
12172            $_ = "$fields[0]; $fields[1]";
12173        }
12174
12175        return;
12176    }
12177}
12178
12179sub filter_substitute_lb {
12180    # Used on Unicodes that predate the LB property, where there is a
12181    # substitute file.  This just does the regular ea_lb handling for such
12182    # files, and then substitutes the long property value name for the short
12183    # one that comes with the file.  (The other break files have the long
12184    # names in them, so this is the odd one out.)  The reason for doing this
12185    # kludge is that regen/mk_invlists.pl is expecting the long name.  This
12186    # also fixes the typo 'Inseperable' that leads to problems.
12187
12188    filter_early_ea_lb;
12189    return unless $_;
12190
12191    my @fields = split /\s*;\s*/;
12192    $fields[1] = property_ref('_Perl_LB')->table($fields[1])->full_name;
12193    $fields[1] = 'Inseparable' if lc $fields[1] eq 'inseperable';
12194    $_ = join '; ', @fields;
12195}
12196
12197sub filter_old_style_arabic_shaping {
12198    # Early versions used a different term for the later one.
12199
12200    my @fields = split /\s*;\s*/;
12201    $fields[3] =~ s/<no shaping>/No_Joining_Group/;
12202    $fields[3] =~ s/\s+/_/g;                # Change spaces to underscores
12203    $_ = join ';', @fields;
12204    return;
12205}
12206
12207{ # Closure
12208    my $lc; # Table for lowercase mapping
12209    my $tc;
12210    my $uc;
12211    my %special_casing_code_points;
12212
12213    sub setup_special_casing($file) {
12214        # SpecialCasing.txt contains the non-simple case change mappings.  The
12215        # simple ones are in UnicodeData.txt, which should already have been
12216        # read in to the full property data structures, so as to initialize
12217        # these with the simple ones.  Then the SpecialCasing.txt entries
12218        # add or overwrite the ones which have different full mappings.
12219
12220        # This routine sees if the simple mappings are to be output, and if
12221        # so, copies what has already been put into the full mapping tables,
12222        # while they still contain only the simple mappings.
12223
12224        # The reason it is done this way is that the simple mappings are
12225        # probably not going to be output, so it saves work to initialize the
12226        # full tables with the simple mappings, and then overwrite those
12227        # relatively few entries in them that have different full mappings,
12228        # and thus skip the simple mapping tables altogether.
12229
12230        $lc = property_ref('lc');
12231        $tc = property_ref('tc');
12232        $uc = property_ref('uc');
12233
12234        # For each of the case change mappings...
12235        foreach my $full_casing_table ($lc, $tc, $uc) {
12236            my $full_casing_name = $full_casing_table->name;
12237            my $full_casing_full_name = $full_casing_table->full_name;
12238            unless (defined $full_casing_table
12239                    && ! $full_casing_table->is_empty)
12240            {
12241                Carp::my_carp_bug("Need to process UnicodeData before SpecialCasing.  Only special casing will be generated.");
12242            }
12243
12244            $full_casing_table->add_comment(join_lines( <<END
12245This file includes both the simple and full case changing maps.  The simple
12246ones are in the main body of the table below, and the full ones adding to or
12247overriding them are in the hash.
12248END
12249            ));
12250
12251            # The simple version's name in each mapping merely has an 's' in
12252            # front of the full one's
12253            my $simple_name = 's' . $full_casing_name;
12254            my $simple = property_ref($simple_name);
12255            $simple->initialize($full_casing_table) if $simple->to_output_map();
12256        }
12257
12258        return;
12259    }
12260
12261    sub filter_2_1_8_special_casing_line {
12262
12263        # This version had duplicate entries in this file.  Delete all but the
12264        # first one
12265        my @fields = split /\s*;\s*/, $_, -1; # -1 => retain trailing null
12266                                              # fields
12267        if (exists $special_casing_code_points{$fields[0]}) {
12268            $_ = "";
12269            return;
12270        }
12271
12272        $special_casing_code_points{$fields[0]} = 1;
12273        filter_special_casing_line(@_);
12274    }
12275
12276    sub filter_special_casing_line($file) {
12277        # Change the format of $_ from SpecialCasing.txt into something that
12278        # the generic handler understands.  Each input line contains three
12279        # case mappings.  This will generate three lines to pass to the
12280        # generic handler for each of those.
12281
12282        # The input syntax (after stripping comments and trailing white space
12283        # is like one of the following (with the final two being entries that
12284        # we ignore):
12285        # 00DF; 00DF; 0053 0073; 0053 0053; # LATIN SMALL LETTER SHARP S
12286        # 03A3; 03C2; 03A3; 03A3; Final_Sigma;
12287        # 0307; ; 0307; 0307; tr After_I; # COMBINING DOT ABOVE
12288        # Note the trailing semi-colon, unlike many of the input files.  That
12289        # means that there will be an extra null field generated by the split
12290
12291        my @fields = split /\s*;\s*/, $_, -1; # -1 => retain trailing null
12292                                              # fields
12293
12294        # field #4 is when this mapping is conditional.  If any of these get
12295        # implemented, it would be by hard-coding in the casing functions in
12296        # the Perl core, not through tables.  But if there is a new condition
12297        # we don't know about, output a warning.  We know about all the
12298        # conditions through 6.0
12299        if ($fields[4] ne "") {
12300            my @conditions = split ' ', $fields[4];
12301            if ($conditions[0] ne 'tr'  # We know that these languages have
12302                                        # conditions, and some are multiple
12303                && $conditions[0] ne 'az'
12304                && $conditions[0] ne 'lt'
12305
12306                # And, we know about a single condition Final_Sigma, but
12307                # nothing else.
12308                && ($v_version gt v5.2.0
12309                    && (@conditions > 1 || $conditions[0] ne 'Final_Sigma')))
12310            {
12311                $file->carp_bad_line("Unknown condition '$fields[4]'.  You should inspect it and either add code to handle it, or add to list of those that are to ignore");
12312            }
12313            elsif ($conditions[0] ne 'Final_Sigma') {
12314
12315                    # Don't print out a message for Final_Sigma, because we
12316                    # have hard-coded handling for it.  (But the standard
12317                    # could change what the rule should be, but it wouldn't
12318                    # show up here anyway.
12319
12320                    print "# SKIPPING Special Casing: $_\n"
12321                                                    if $verbosity >= $VERBOSE;
12322            }
12323            $_ = "";
12324            return;
12325        }
12326        elsif (@fields > 6 || (@fields == 6 && $fields[5] ne "" )) {
12327            $file->carp_bad_line('Extra fields');
12328            $_ = "";
12329            return;
12330        }
12331
12332        my $decimal_code_point = hex $fields[0];
12333
12334        # Loop to handle each of the three mappings in the input line, in
12335        # order, with $i indicating the current field number.
12336        my $i = 0;
12337        for my $object ($lc, $tc, $uc) {
12338            $i++;   # First time through, $i = 0 ... 3rd time = 3
12339
12340            my $value = $object->value_of($decimal_code_point);
12341            $value = ($value eq $CODE_POINT)
12342                      ? $decimal_code_point
12343                      : hex $value;
12344
12345            # If this isn't a multi-character mapping, it should already have
12346            # been read in.
12347            if ($fields[$i] !~ / /) {
12348                if ($value != hex $fields[$i]) {
12349                    Carp::my_carp("Bad news. UnicodeData.txt thinks "
12350                                  . $object->name
12351                                  . "(0x$fields[0]) is $value"
12352                                  . " and SpecialCasing.txt thinks it is "
12353                                  . hex($fields[$i])
12354                                  . ".  Good luck.  Retaining UnicodeData value, and proceeding anyway.");
12355                }
12356            }
12357            else {
12358
12359                # The mapping is additional, beyond the simple mapping.
12360                $file->insert_adjusted_lines("$fields[0]; "
12361                                             . $object->name
12362                                            . "; "
12363                                            . $CMD_DELIM
12364                                            . "$REPLACE_CMD=$MULTIPLE_BEFORE"
12365                                            . $CMD_DELIM
12366                                            . $fields[$i]);
12367            }
12368        }
12369
12370        # Everything has been handled by the insert_adjusted_lines()
12371        $_ = "";
12372
12373        return;
12374    }
12375}
12376
12377sub filter_old_style_case_folding($file) {
12378    # This transforms $_ containing the case folding style of 3.0.1, to 3.1
12379    # and later style.  Different letters were used in the earlier.
12380
12381    my @fields = split /\s*;\s*/;
12382
12383    if ($fields[1] eq 'L') {
12384        $fields[1] = 'C';             # L => C always
12385    }
12386    elsif ($fields[1] eq 'E') {
12387        if ($fields[2] =~ / /) {      # E => C if one code point; F otherwise
12388            $fields[1] = 'F'
12389        }
12390        else {
12391            $fields[1] = 'C'
12392        }
12393    }
12394    else {
12395        $file->carp_bad_line("Expecting L or E in second field");
12396        $_ = "";
12397        return;
12398    }
12399    $_ = join("; ", @fields) . ';';
12400    return;
12401}
12402
12403{ # Closure for case folding
12404
12405    # Create the map for simple only if are going to output it, for otherwise
12406    # it takes no part in anything we do.
12407    my $to_output_simple;
12408
12409    sub setup_case_folding {
12410        # Read in the case foldings in CaseFolding.txt.  This handles both
12411        # simple and full case folding.
12412
12413        $to_output_simple
12414                        = property_ref('Simple_Case_Folding')->to_output_map;
12415
12416        if (! $to_output_simple) {
12417            property_ref('Case_Folding')->set_proxy_for('Simple_Case_Folding');
12418        }
12419
12420        # If we ever wanted to show that these tables were combined, a new
12421        # property method could be created, like set_combined_props()
12422        property_ref('Case_Folding')->add_comment(join_lines( <<END
12423This file includes both the simple and full case folding maps.  The simple
12424ones are in the main body of the table below, and the full ones adding to or
12425overriding them are in the hash.
12426END
12427        ));
12428        return;
12429    }
12430
12431    sub filter_case_folding_line($file) {
12432        # Called for each line in CaseFolding.txt
12433        # Input lines look like:
12434        # 0041; C; 0061; # LATIN CAPITAL LETTER A
12435        # 00DF; F; 0073 0073; # LATIN SMALL LETTER SHARP S
12436        # 1E9E; S; 00DF; # LATIN CAPITAL LETTER SHARP S
12437        #
12438        # 'C' means that folding is the same for both simple and full
12439        # 'F' that it is only for full folding
12440        # 'S' that it is only for simple folding
12441        # 'T' is locale-dependent, and ignored
12442        # 'I' is a type of 'F' used in some early releases.
12443        # Note the trailing semi-colon, unlike many of the input files.  That
12444        # means that there will be an extra null field generated by the split
12445        # below, which we ignore and hence is not an error.
12446
12447        my ($range, $type, $map, @remainder) = split /\s*;\s*/, $_, -1;
12448        if (@remainder > 1 || (@remainder == 1 && $remainder[0] ne "" )) {
12449            $file->carp_bad_line('Extra fields');
12450            $_ = "";
12451            return;
12452        }
12453
12454        if ($type =~ / ^ [IT] $/x) {   # Skip Turkic case folding, is locale dependent
12455            $_ = "";
12456            return;
12457        }
12458
12459        # C: complete, F: full, or I: dotted uppercase I -> dotless lowercase
12460        # I are all full foldings; S is single-char.  For S, there is always
12461        # an F entry, so we must allow multiple values for the same code
12462        # point.  Fortunately this table doesn't need further manipulation
12463        # which would preclude using multiple-values.  The S is now included
12464        # so that _swash_inversion_hash() is able to construct closures
12465        # without having to worry about F mappings.
12466        if ($type eq 'C' || $type eq 'F' || $type eq 'I' || $type eq 'S') {
12467            $_ = "$range; Case_Folding; "
12468                 . "$CMD_DELIM$REPLACE_CMD=$MULTIPLE_BEFORE$CMD_DELIM$map";
12469        }
12470        else {
12471            $_ = "";
12472            $file->carp_bad_line('Expecting C F I S or T in second field');
12473        }
12474
12475        # C and S are simple foldings, but simple case folding is not needed
12476        # unless we explicitly want its map table output.
12477        if ($to_output_simple && $type eq 'C' || $type eq 'S') {
12478            $file->insert_adjusted_lines("$range; Simple_Case_Folding; $map");
12479        }
12480
12481        return;
12482    }
12483
12484} # End case fold closure
12485
12486sub filter_jamo_line {
12487    # Filter Jamo.txt lines.  This routine mainly is used to populate hashes
12488    # from this file that is used in generating the Name property for Jamo
12489    # code points.  But, it also is used to convert early versions' syntax
12490    # into the modern form.  Here are two examples:
12491    # 1100; G   # HANGUL CHOSEONG KIYEOK            # Modern syntax
12492    # U+1100; G; HANGUL CHOSEONG KIYEOK             # 2.0 syntax
12493    #
12494    # The input is $_, the output is $_ filtered.
12495
12496    my @fields = split /\s*;\s*/, $_, -1;  # -1 => retain trailing null fields
12497
12498    # Let the caller handle unexpected input.  In earlier versions, there was
12499    # a third field which is supposed to be a comment, but did not have a '#'
12500    # before it.
12501    return if @fields > (($v_version gt v3.0.0) ? 2 : 3);
12502
12503    $fields[0] =~ s/^U\+//;     # Also, early versions had this extraneous
12504                                # beginning.
12505
12506    # Some 2.1 versions had this wrong.  Causes havoc with the algorithm.
12507    $fields[1] = 'R' if $fields[0] eq '1105';
12508
12509    # Add to structure so can generate Names from it.
12510    my $cp = hex $fields[0];
12511    my $short_name = $fields[1];
12512    $Jamo{$cp} = $short_name;
12513    if ($cp <= $LBase + $LCount) {
12514        $Jamo_L{$short_name} = $cp - $LBase;
12515    }
12516    elsif ($cp <= $VBase + $VCount) {
12517        $Jamo_V{$short_name} = $cp - $VBase;
12518    }
12519    elsif ($cp <= $TBase + $TCount) {
12520        $Jamo_T{$short_name} = $cp - $TBase;
12521    }
12522    else {
12523        Carp::my_carp_bug("Unexpected Jamo code point in $_");
12524    }
12525
12526
12527    # Reassemble using just the first two fields to look like a typical
12528    # property file line
12529    $_ = "$fields[0]; $fields[1]";
12530
12531    return;
12532}
12533
12534sub register_fraction($rational) {
12535    # This registers the input rational number so that it can be passed on to
12536    # Unicode::UCD, both in rational and floating forms.
12537
12538    my $floating = eval $rational;
12539
12540    my @floats = sprintf "%.*e", $E_FLOAT_PRECISION, $floating;
12541
12542    # See if the denominator is a power of 2.
12543    $rational =~ m!.*/(.*)!;
12544    my $denominator = $1;
12545    if (defined $denominator && (($denominator & ($denominator - 1)) == 0)) {
12546
12547        # Here the denominator is a power of 2.  This means it has an exact
12548        # representation in binary, so rounding could go either way.  It turns
12549        # out that Windows doesn't necessarily round towards even, so output
12550        # an extra entry.  This happens when the final digit we output is even
12551        # and the next digits would be 50* to the precision of the machine.
12552        my $extra_digit_float = sprintf "%e", $floating;
12553        my $q = $E_FLOAT_PRECISION - 1;
12554        if ($extra_digit_float =~ / ( .* \. \d{$q} )
12555                                    ( [02468] ) 5 0* ( e .*)
12556                                  /ix)
12557        {
12558            push @floats, $1 . ($2 + 1) . $3;
12559        }
12560    }
12561
12562    foreach my $float (@floats) {
12563        # Strip off any leading zeros beyond 2 digits to make it C99
12564        # compliant.  (Windows has 3 digit exponents, contrary to C99)
12565        $float =~ s/ ( .* e [-+] ) 0* ( \d{2,}? ) /$1$2/x;
12566
12567        if (   defined $nv_floating_to_rational{$float}
12568            && $nv_floating_to_rational{$float} ne $rational)
12569        {
12570            die Carp::my_carp_bug("Both '$rational' and"
12571                            . " '$nv_floating_to_rational{$float}' evaluate to"
12572                            . " the same floating point number."
12573                            . "  \$E_FLOAT_PRECISION must be increased");
12574        }
12575        $nv_floating_to_rational{$float} = $rational;
12576    }
12577    return;
12578}
12579
12580sub gcd($a, $b) {   # Greatest-common-divisor; from
12581                # http://en.wikipedia.org/wiki/Euclidean_algorithm
12582    use integer;
12583
12584    while ($b != 0) {
12585       my $temp = $b;
12586       $b = $a % $b;
12587       $a = $temp;
12588    }
12589    return $a;
12590}
12591
12592sub reduce_fraction($fraction_ref) {
12593    # Reduce a fraction to lowest terms.  The Unicode data may be reducible,
12594    # hence this is needed.  The argument is a reference to the
12595    # string denoting the fraction, which must be of the form:
12596    if ($$fraction_ref !~ / ^ (-?) (\d+) \/ (\d+) $ /ax) {
12597        Carp::my_carp_bug("Non-fraction input '$$fraction_ref'.  Unchanged");
12598        return;
12599    }
12600
12601    my $sign = $1;
12602    my $numerator = $2;
12603    my $denominator = $3;
12604
12605    use integer;
12606
12607    # Find greatest common divisor
12608    my $gcd = gcd($numerator, $denominator);
12609
12610    # And reduce using the gcd.
12611    if ($gcd != 1) {
12612        $numerator    /= $gcd;
12613        $denominator  /= $gcd;
12614        $$fraction_ref = "$sign$numerator/$denominator";
12615    }
12616
12617    return;
12618}
12619
12620sub filter_numeric_value_line($file) {
12621    # DNumValues contains lines of a different syntax than the typical
12622    # property file:
12623    # 0F33          ; -0.5 ; ; -1/2 # No       TIBETAN DIGIT HALF ZERO
12624    #
12625    # This routine transforms $_ containing the anomalous syntax to the
12626    # typical, by filtering out the extra columns, and convert early version
12627    # decimal numbers to strings that look like rational numbers.
12628
12629    # Starting in 5.1, there is a rational field.  Just use that, omitting the
12630    # extra columns.  Otherwise convert the decimal number in the second field
12631    # to a rational, and omit extraneous columns.
12632    my @fields = split /\s*;\s*/, $_, -1;
12633    my $rational;
12634
12635    if ($v_version ge v5.1.0) {
12636        if (@fields != 4) {
12637            $file->carp_bad_line('Not 4 semi-colon separated fields');
12638            $_ = "";
12639            return;
12640        }
12641        reduce_fraction(\$fields[3]) if $fields[3] =~ qr{/};
12642        $rational = $fields[3];
12643
12644        $_ = join '; ', @fields[ 0, 3 ];
12645    }
12646    else {
12647
12648        # Here, is an older Unicode file, which has decimal numbers instead of
12649        # rationals in it.  Use the fraction to calculate the denominator and
12650        # convert to rational.
12651
12652        if (@fields != 2 && @fields != 3) {
12653            $file->carp_bad_line('Not 2 or 3 semi-colon separated fields');
12654            $_ = "";
12655            return;
12656        }
12657
12658        my $codepoints = $fields[0];
12659        my $decimal = $fields[1];
12660        if ($decimal =~ s/\.0+$//) {
12661
12662            # Anything ending with a decimal followed by nothing but 0's is an
12663            # integer
12664            $_ = "$codepoints; $decimal";
12665            $rational = $decimal;
12666        }
12667        else {
12668
12669            my $denominator;
12670            if ($decimal =~ /\.50*$/) {
12671                $denominator = 2;
12672            }
12673
12674            # Here have the hardcoded repeating decimals in the fraction, and
12675            # the denominator they imply.  There were only a few denominators
12676            # in the older Unicode versions of this file which this code
12677            # handles, so it is easy to convert them.
12678
12679            # The 4 is because of a round-off error in the Unicode 3.2 files
12680            elsif ($decimal =~ /\.33*[34]$/ || $decimal =~ /\.6+7$/) {
12681                $denominator = 3;
12682            }
12683            elsif ($decimal =~ /\.[27]50*$/) {
12684                $denominator = 4;
12685            }
12686            elsif ($decimal =~ /\.[2468]0*$/) {
12687                $denominator = 5;
12688            }
12689            elsif ($decimal =~ /\.16+7$/ || $decimal =~ /\.83+$/) {
12690                $denominator = 6;
12691            }
12692            elsif ($decimal =~ /\.(12|37|62|87)50*$/) {
12693                $denominator = 8;
12694            }
12695            if ($denominator) {
12696                my $sign = ($decimal < 0) ? "-" : "";
12697                my $numerator = int((abs($decimal) * $denominator) + .5);
12698                $rational = "$sign$numerator/$denominator";
12699                $_ = "$codepoints; $rational";
12700            }
12701            else {
12702                $file->carp_bad_line("Can't cope with number '$decimal'.");
12703                $_ = "";
12704                return;
12705            }
12706        }
12707    }
12708
12709    register_fraction($rational) if $rational =~ qr{/};
12710    return;
12711}
12712
12713{ # Closure
12714    my %unihan_properties;
12715
12716    sub construct_unihan($file_object) {
12717
12718        return unless file_exists($file_object->file);
12719
12720        if ($v_version lt v4.0.0) {
12721            push @cjk_properties, 'URS ; Unicode_Radical_Stroke';
12722            push @cjk_property_values, split "\n", <<'END';
12723# @missing: 0000..10FFFF; Unicode_Radical_Stroke; <none>
12724END
12725        }
12726
12727        if ($v_version ge v3.0.0) {
12728            push @cjk_properties, split "\n", <<'END';
12729cjkIRG_GSource; kIRG_GSource
12730cjkIRG_JSource; kIRG_JSource
12731cjkIRG_KSource; kIRG_KSource
12732cjkIRG_TSource; kIRG_TSource
12733cjkIRG_VSource; kIRG_VSource
12734END
12735        push @cjk_property_values, split "\n", <<'END';
12736# @missing: 0000..10FFFF; cjkIRG_GSource; <none>
12737# @missing: 0000..10FFFF; cjkIRG_JSource; <none>
12738# @missing: 0000..10FFFF; cjkIRG_KSource; <none>
12739# @missing: 0000..10FFFF; cjkIRG_TSource; <none>
12740# @missing: 0000..10FFFF; cjkIRG_VSource; <none>
12741END
12742        }
12743        if ($v_version ge v3.1.0) {
12744            push @cjk_properties, 'cjkIRG_HSource; kIRG_HSource';
12745            push @cjk_property_values, '# @missing: 0000..10FFFF; cjkIRG_HSource; <none>';
12746        }
12747        if ($v_version ge v3.1.1) {
12748            push @cjk_properties, 'cjkIRG_KPSource; kIRG_KPSource';
12749            push @cjk_property_values, '# @missing: 0000..10FFFF; cjkIRG_KPSource; <none>';
12750        }
12751        if ($v_version ge v3.2.0) {
12752            push @cjk_properties, split "\n", <<'END';
12753cjkAccountingNumeric; kAccountingNumeric
12754cjkCompatibilityVariant; kCompatibilityVariant
12755cjkOtherNumeric; kOtherNumeric
12756cjkPrimaryNumeric; kPrimaryNumeric
12757END
12758            push @cjk_property_values, split "\n", <<'END';
12759# @missing: 0000..10FFFF; cjkAccountingNumeric; NaN
12760# @missing: 0000..10FFFF; cjkCompatibilityVariant; <code point>
12761# @missing: 0000..10FFFF; cjkOtherNumeric; NaN
12762# @missing: 0000..10FFFF; cjkPrimaryNumeric; NaN
12763END
12764        }
12765        if ($v_version gt v4.0.0) {
12766            push @cjk_properties, 'cjkIRG_USource; kIRG_USource';
12767            push @cjk_property_values, '# @missing: 0000..10FFFF; cjkIRG_USource; <none>';
12768        }
12769
12770        if ($v_version ge v4.1.0) {
12771            push @cjk_properties, 'cjkIICore ; kIICore';
12772            push @cjk_property_values, '# @missing: 0000..10FFFF; cjkIICore; <none>';
12773        }
12774    }
12775
12776    sub setup_unihan {
12777        # Do any special setup for Unihan properties.
12778
12779        # This property gives the wrong computed type, so override.
12780        my $usource = property_ref('kIRG_USource');
12781        $usource->set_type($STRING) if defined $usource;
12782
12783        # This property is to be considered binary (it says so in
12784        # http://www.unicode.org/reports/tr38/)
12785        my $iicore = property_ref('kIICore');
12786        if (defined $iicore) {
12787            $iicore->set_type($FORCED_BINARY);
12788            $iicore->table("Y")->add_note("Matches any code point which has a non-null value for this property; see unicode.org UAX #38.");
12789
12790            # Unicode doesn't include the maps for this property, so don't
12791            # warn that they are missing.
12792            $iicore->set_pre_declared_maps(0);
12793            $iicore->add_comment(join_lines( <<END
12794This property contains string values, but any non-empty ones are considered to
12795be 'core', so Perl creates tables for both: 1) its string values, plus 2)
12796tables so that \\p{kIICore} matches any code point which has a non-empty
12797value for this property.
12798END
12799            ));
12800        }
12801
12802        return;
12803    }
12804
12805    sub filter_unihan_line {
12806        # Change unihan db lines to look like the others in the db.  Here is
12807        # an input sample:
12808        #   U+341C        kCangjie        IEKN
12809
12810        # Tabs are used instead of semi-colons to separate fields; therefore
12811        # they may have semi-colons embedded in them.  Change these to periods
12812        # so won't screw up the rest of the code.
12813        s/;/./g;
12814
12815        # Remove lines that don't look like ones we accept.
12816        if ($_ !~ /^ [^\t]* \t ( [^\t]* ) /x) {
12817            $_ = "";
12818            return;
12819        }
12820
12821        # Extract the property, and save a reference to its object.
12822        my $property = $1;
12823        if (! exists $unihan_properties{$property}) {
12824            $unihan_properties{$property} = property_ref($property);
12825        }
12826
12827        # Don't do anything unless the property is one we're handling, which
12828        # we determine by seeing if there is an object defined for it or not
12829        if (! defined $unihan_properties{$property}) {
12830            $_ = "";
12831            return;
12832        }
12833
12834        # Convert the tab separators to our standard semi-colons, and convert
12835        # the U+HHHH notation to the rest of the standard's HHHH
12836        s/\t/;/g;
12837        s/\b U \+ (?= $code_point_re )//xg;
12838
12839        #local $to_trace = 1 if main::DEBUG;
12840        trace $_ if main::DEBUG && $to_trace;
12841
12842        return;
12843    }
12844}
12845
12846sub filter_blocks_lines($file) {
12847    # In the Blocks.txt file, the names of the blocks don't quite match the
12848    # names given in PropertyValueAliases.txt, so this changes them so they
12849    # do match:  Blanks and hyphens are changed into underscores.  Also makes
12850    # early release versions look like later ones
12851    #
12852    # $_ is transformed to the correct value.
12853
12854    if ($v_version lt v3.2.0) {
12855        if (/FEFF.*Specials/) { # Bug in old versions: line wrongly inserted
12856            $_ = "";
12857            return;
12858        }
12859
12860        # Old versions used a different syntax to mark the range.
12861        $_ =~ s/;\s+/../ if $v_version lt v3.1.0;
12862    }
12863
12864    my @fields = split /\s*;\s*/, $_, -1;
12865    if (@fields != 2) {
12866        $file->carp_bad_line("Expecting exactly two fields");
12867        $_ = "";
12868        return;
12869    }
12870
12871    # Change hyphens and blanks in the block name field only
12872    $fields[1] =~ s/[ -]/_/g;
12873    $fields[1] =~ s/_ ( [a-z] ) /_\u$1/xg;   # Capitalize first letter of word
12874
12875    $_ = join("; ", @fields);
12876    return;
12877}
12878
12879{ # Closure
12880    my $current_property;
12881
12882    sub filter_old_style_proplist {
12883        # PropList.txt has been in Unicode since version 2.0.  Until 3.1, it
12884        # was in a completely different syntax.  Ken Whistler of Unicode says
12885        # that it was something he used as an aid for his own purposes, but
12886        # was never an official part of the standard.  Many of the properties
12887        # in it were incorporated into the later PropList.txt, but some were
12888        # not.  This program uses this early file to generate property tables
12889        # that are otherwise not accessible in the early UCD's.  It does this
12890        # for the ones that eventually became official, and don't appear to be
12891        # too different in their contents from the later official version, and
12892        # throws away the rest.  It could be argued that the ones it generates
12893        # were probably not really official at that time, so should be
12894        # ignored.  You can easily modify things to skip all of them by
12895        # changing this function to just set $_ to "", and return; and to skip
12896        # certain of them by simply removing their declarations from
12897        # get_old_property_aliases().
12898        #
12899        # Here is a list of all the ones that are thrown away:
12900        #   Alphabetic                   The definitions for this are very
12901        #                                defective, so better to not mislead
12902        #                                people into thinking it works.
12903        #                                Instead the Perl extension of the
12904        #                                same name is constructed from first
12905        #                                principles.
12906        #   Bidi=*                       duplicates UnicodeData.txt
12907        #   Combining                    never made into official property;
12908        #                                is \P{ccc=0}
12909        #   Composite                    never made into official property.
12910        #   Currency Symbol              duplicates UnicodeData.txt: gc=sc
12911        #   Decimal Digit                duplicates UnicodeData.txt: gc=nd
12912        #   Delimiter                    never made into official property;
12913        #                                removed in 3.0.1
12914        #   Format Control               never made into official property;
12915        #                                similar to gc=cf
12916        #   High Surrogate               duplicates Blocks.txt
12917        #   Ignorable Control            never made into official property;
12918        #                                similar to di=y
12919        #   ISO Control                  duplicates UnicodeData.txt: gc=cc
12920        #   Left of Pair                 never made into official property;
12921        #   Line Separator               duplicates UnicodeData.txt: gc=zl
12922        #   Low Surrogate                duplicates Blocks.txt
12923        #   Non-break                    was actually listed as a property
12924        #                                in 3.2, but without any code
12925        #                                points.  Unicode denies that this
12926        #                                was ever an official property
12927        #   Non-spacing                  duplicate UnicodeData.txt: gc=mn
12928        #   Numeric                      duplicates UnicodeData.txt: gc=cc
12929        #   Paired Punctuation           never made into official property;
12930        #                                appears to be gc=ps + gc=pe
12931        #   Paragraph Separator          duplicates UnicodeData.txt: gc=cc
12932        #   Private Use                  duplicates UnicodeData.txt: gc=co
12933        #   Private Use High Surrogate   duplicates Blocks.txt
12934        #   Punctuation                  duplicates UnicodeData.txt: gc=p
12935        #   Space                        different definition than eventual
12936        #                                one.
12937        #   Titlecase                    duplicates UnicodeData.txt: gc=lt
12938        #   Unassigned Code Value        duplicates UnicodeData.txt: gc=cn
12939        #   Zero-width                   never made into official property;
12940        #                                subset of gc=cf
12941        # Most of the properties have the same names in this file as in later
12942        # versions, but a couple do not.
12943        #
12944        # This subroutine filters $_, converting it from the old style into
12945        # the new style.  Here's a sample of the old-style
12946        #
12947        #   *******************************************
12948        #
12949        #   Property dump for: 0x100000A0 (Join Control)
12950        #
12951        #   200C..200D  (2 chars)
12952        #
12953        # In the example, the property is "Join Control".  It is kept in this
12954        # closure between calls to the subroutine.  The numbers beginning with
12955        # 0x were internal to Ken's program that generated this file.
12956
12957        # If this line contains the property name, extract it.
12958        if (/^Property dump for: [^(]*\((.*)\)/) {
12959            $_ = $1;
12960
12961            # Convert white space to underscores.
12962            s/ /_/g;
12963
12964            # Convert the few properties that don't have the same name as
12965            # their modern counterparts
12966            s/Identifier_Part/ID_Continue/
12967            or s/Not_a_Character/NChar/;
12968
12969            # If the name matches an existing property, use it.
12970            if (defined property_ref($_)) {
12971                trace "new property=", $_ if main::DEBUG && $to_trace;
12972                $current_property = $_;
12973            }
12974            else {        # Otherwise discard it
12975                trace "rejected property=", $_ if main::DEBUG && $to_trace;
12976                undef $current_property;
12977            }
12978            $_ = "";    # The property is saved for the next lines of the
12979                        # file, but this defining line is of no further use,
12980                        # so clear it so that the caller won't process it
12981                        # further.
12982        }
12983        elsif (! defined $current_property || $_ !~ /^$code_point_re/) {
12984
12985            # Here, the input line isn't a header defining a property for the
12986            # following section, and either we aren't in such a section, or
12987            # the line doesn't look like one that defines the code points in
12988            # such a section.  Ignore this line.
12989            $_ = "";
12990        }
12991        else {
12992
12993            # Here, we have a line defining the code points for the current
12994            # stashed property.  Anything starting with the first blank is
12995            # extraneous.  Otherwise, it should look like a normal range to
12996            # the caller.  Append the property name so that it looks just like
12997            # a modern PropList entry.
12998
12999            $_ =~ s/\s.*//;
13000            $_ .= "; $current_property";
13001        }
13002        trace $_ if main::DEBUG && $to_trace;
13003        return;
13004    }
13005} # End closure for old style proplist
13006
13007sub filter_old_style_normalization_lines {
13008    # For early releases of Unicode, the lines were like:
13009    #        74..2A76    ; NFKD_NO
13010    # For later releases this became:
13011    #        74..2A76    ; NFKD_QC; N
13012    # Filter $_ to look like those in later releases.
13013    # Similarly for MAYBEs
13014
13015    s/ _NO \b /_QC; N/x || s/ _MAYBE \b /_QC; M/x;
13016
13017    # Also, the property FC_NFKC was abbreviated to FNC
13018    s/FNC/FC_NFKC/;
13019    return;
13020}
13021
13022sub setup_script_extensions {
13023    # The Script_Extensions property starts out with a clone of the Script
13024    # property.
13025
13026    $scx = property_ref("Script_Extensions");
13027    return unless defined $scx;
13028
13029    $scx->_set_format($STRING_WHITE_SPACE_LIST);
13030    $scx->initialize($script);
13031    $scx->set_default_map($script->default_map);
13032    $scx->set_pre_declared_maps(0);     # PropValueAliases doesn't list these
13033    $scx->add_comment(join_lines( <<END
13034The values for code points that appear in one script are just the same as for
13035the 'Script' property.  Likewise the values for those that appear in many
13036scripts are either 'Common' or 'Inherited', same as with 'Script'.  But the
13037values of code points that appear in a few scripts are a space separated list
13038of those scripts.
13039END
13040    ));
13041
13042    # Initialize scx's tables and the aliases for them to be the same as sc's
13043    foreach my $table ($script->tables) {
13044        my $scx_table = $scx->add_match_table($table->name,
13045                                Full_Name => $table->full_name);
13046        foreach my $alias ($table->aliases) {
13047            $scx_table->add_alias($alias->name);
13048        }
13049    }
13050}
13051
13052sub  filter_script_extensions_line {
13053    # The Scripts file comes with the full name for the scripts; the
13054    # ScriptExtensions, with the short name.  The final mapping file is a
13055    # combination of these, and without adjustment, would have inconsistent
13056    # entries.  This filters the latter file to convert to full names.
13057    # Entries look like this:
13058    # 064B..0655    ; Arab Syrc # Mn  [11] ARABIC FATHATAN..ARABIC HAMZA BELOW
13059
13060    my @fields = split /\s*;\s*/;
13061
13062    # This script was erroneously omitted in this Unicode version.
13063    $fields[1] .= ' Takr' if $v_version eq v6.1.0 && $fields[0] =~ /^0964/;
13064
13065    my @full_names;
13066    foreach my $short_name (split " ", $fields[1]) {
13067        push @full_names, $script->table($short_name)->full_name;
13068    }
13069    $fields[1] = join " ", @full_names;
13070    $_ = join "; ", @fields;
13071
13072    return;
13073}
13074
13075sub setup_emojidata {
13076    my $prop_ref = Property->new('ExtPict',
13077                                 Full_Name => 'Extended_Pictographic',
13078    );
13079    $prop_ref->set_fate($PLACEHOLDER,
13080                        "Not part of the Unicode Character Database");
13081}
13082
13083sub filter_emojidata_line {
13084    # We only are interested in this single property from this non-UCD data
13085    # file, and we turn it into a Perl property, so that it isn't accessible
13086    # to the users
13087
13088    $_ = "" unless /\bExtended_Pictographic\b/;
13089
13090    return;
13091}
13092
13093sub setup_IdStatus {
13094    my $ids = Property->new('Identifier_Status',
13095                            Match_SubDir => 'IdStatus',
13096                            Default_Map => 'Restricted',
13097                           );
13098    $ids->add_match_table('Allowed');
13099}
13100
13101sub setup_IdType {
13102    $idt = Property->new('Identifier_Type',
13103                            Match_SubDir => 'IdType',
13104                            Default_Map => 'Not_Character',
13105                            Format => $STRING_WHITE_SPACE_LIST,
13106                           );
13107}
13108
13109sub  filter_IdType_line {
13110
13111    # Some code points have more than one type, separated by spaces on the
13112    # input.  For now, we just add everything as a property value.  Later when
13113    # we look for properties with format $STRING_WHITE_SPACE_LIST, we resolve
13114    # things
13115
13116    my @fields = split /\s*;\s*/;
13117    my $types = $fields[1];
13118    $idt->add_match_table($types) unless defined $idt->table($types);
13119
13120    return;
13121}
13122
13123sub generate_hst($file) {
13124
13125    # Populates the Hangul Syllable Type property from first principles
13126
13127    # These few ranges are hard-coded in.
13128    $file->insert_lines(split /\n/, <<'END'
131291100..1159    ; L
13130115F          ; L
131311160..11A2    ; V
1313211A8..11F9    ; T
13133END
13134);
13135
13136    # The Hangul syllables in version 1 are at different code points than
13137    # those that came along starting in version 2, and have different names;
13138    # they comprise about 60% of the code points of the later version.
13139    # From my (khw) research on them (see <558493EB.4000807@att.net>), the
13140    # initial set is a subset of the later version, with different English
13141    # transliterations.  I did not see an easy mapping between them.  The
13142    # later set includes essentially all possibilities, even ones that aren't
13143    # in modern use (if they ever were), and over 96% of the new ones are type
13144    # LVT.  Mathematically, the early set must also contain a preponderance of
13145    # LVT values.  In lieu of doing nothing, we just set them all to LVT, and
13146    # expect that this will be right most of the time, which is better than
13147    # not being right at all.
13148    if ($v_version lt v2.0.0) {
13149        my $property = property_ref($file->property);
13150        $file->insert_lines(sprintf("%04X..%04X; LVT\n",
13151                                    $FIRST_REMOVED_HANGUL_SYLLABLE,
13152                                    $FINAL_REMOVED_HANGUL_SYLLABLE));
13153        push @tables_that_may_be_empty, $property->table('LV')->complete_name;
13154        return;
13155    }
13156
13157    # The algorithmically derived syllables are almost all LVT ones, so
13158    # initialize the whole range with that.
13159    $file->insert_lines(sprintf "%04X..%04X; LVT\n",
13160                        $SBase, $SBase + $SCount -1);
13161
13162    # Those ones that aren't LVT are LV, and they occur at intervals of
13163    # $TCount code points, starting with the first code point, at $SBase.
13164    for (my $i = $SBase; $i < $SBase + $SCount; $i += $TCount) {
13165        $file->insert_lines(sprintf "%04X..%04X; LV\n", $i, $i);
13166    }
13167
13168    return;
13169}
13170
13171sub generate_GCB($file) {
13172
13173    # Populates the Grapheme Cluster Break property from first principles
13174
13175    # All these definitions are from
13176    # http://www.unicode.org/reports/tr29/tr29-3.html with confirmation
13177    # from http://www.unicode.org/reports/tr29/tr29-4.html
13178
13179    foreach my $range ($gc->ranges) {
13180
13181        # Extend includes gc=Me and gc=Mn, while Control includes gc=Cc
13182        # and gc=Cf
13183        if ($range->value =~ / ^ M [en] $ /x) {
13184            $file->insert_lines(sprintf "%04X..%04X; Extend",
13185                                $range->start,  $range->end);
13186        }
13187        elsif ($range->value =~ / ^ C [cf] $ /x) {
13188            $file->insert_lines(sprintf "%04X..%04X; Control",
13189                                $range->start,  $range->end);
13190        }
13191    }
13192    $file->insert_lines("2028; Control"); # Line Separator
13193    $file->insert_lines("2029; Control"); # Paragraph Separator
13194
13195    $file->insert_lines("000D; CR");
13196    $file->insert_lines("000A; LF");
13197
13198    # Also from http://www.unicode.org/reports/tr29/tr29-3.html.
13199    foreach my $code_point ( qw{
13200                                09BE 09D7 0B3E 0B57 0BBE 0BD7 0CC2 0CD5 0CD6
13201                                0D3E 0D57 0DCF 0DDF FF9E FF9F 1D165 1D16E 1D16F
13202                                }
13203    ) {
13204        my $category = $gc->value_of(hex $code_point);
13205        next if ! defined $category || $category eq 'Cn'; # But not if
13206                                                          # unassigned in this
13207                                                          # release
13208        $file->insert_lines("$code_point; Extend");
13209    }
13210
13211    my $hst = property_ref('Hangul_Syllable_Type');
13212    if ($hst->count > 0) {
13213        foreach my $range ($hst->ranges) {
13214            $file->insert_lines(sprintf "%04X..%04X; %s",
13215                                    $range->start, $range->end, $range->value);
13216        }
13217    }
13218    else {
13219        generate_hst($file);
13220    }
13221
13222    main::process_generic_property_file($file);
13223}
13224
13225
13226sub fixup_early_perl_name_alias($file) {
13227
13228    # Different versions of Unicode have varying support for the name synonyms
13229    # below.  Just include everything.  As of 6.1, all these are correct in
13230    # the Unicode-supplied file.
13231
13232    # ALERT did not come along until 6.0, at which point it became preferred
13233    # over BELL.  By inserting it last in early releases, BELL is preferred
13234    # over it; and vice-vers in 6.0
13235    my $type_for_bell = ($v_version lt v6.0.0)
13236               ? 'correction'
13237               : 'alternate';
13238    $file->insert_lines(split /\n/, <<END
132390007;BELL; $type_for_bell
13240000A;LINE FEED (LF);alternate
13241000C;FORM FEED (FF);alternate
13242000D;CARRIAGE RETURN (CR);alternate
132430085;NEXT LINE (NEL);alternate
13244END
13245
13246    );
13247
13248    # One might think that the 'Unicode_1_Name' field, could work for most
13249    # of the above names, but sadly that field varies depending on the
13250    # release.  Version 1.1.5 had no names for any of the controls; Version
13251    # 2.0 introduced names for the C0 controls, and 3.0 introduced C1 names.
13252    # 3.0.1 removed the name INDEX; and 3.2 changed some names:
13253    #   changed to parenthesized versions like "NEXT LINE" to
13254    #       "NEXT LINE (NEL)";
13255    #   changed PARTIAL LINE DOWN to PARTIAL LINE FORWARD
13256    #   changed PARTIAL LINE UP to PARTIAL LINE BACKWARD;;
13257    #   changed e.g. FILE SEPARATOR to INFORMATION SEPARATOR FOUR
13258    #
13259    # All these are present in the 6.1 NameAliases.txt
13260
13261    return;
13262}
13263
13264sub filter_later_version_name_alias_line {
13265
13266    # This file has an extra entry per line for the alias type.  This is
13267    # handled by creating a compound entry: "$alias: $type";  First, split
13268    # the line into components.
13269    my ($range, $alias, $type, @remainder)
13270        = split /\s*;\s*/, $_, -1; # -1 => retain trailing null fields
13271
13272    # This file contains multiple entries for some components, so tell the
13273    # downstream code to allow this in our internal tables; the
13274    # $MULTIPLE_AFTER preserves the input ordering.
13275    $_ = join ";", $range, $CMD_DELIM
13276                           . $REPLACE_CMD
13277                           . '='
13278                           . $MULTIPLE_AFTER
13279                           . $CMD_DELIM
13280                           . "$alias: $type",
13281                   @remainder;
13282    return;
13283}
13284
13285sub filter_early_version_name_alias_line {
13286
13287    # Early versions did not have the trailing alias type field; implicitly it
13288    # was 'correction'.
13289    $_ .= "; correction";
13290
13291    filter_later_version_name_alias_line;
13292    return;
13293}
13294
13295sub filter_all_caps_script_names {
13296
13297    # Some early Unicode releases had the script names in all CAPS.  This
13298    # converts them to just the first letter of each word being capital.
13299
13300    my ($range, $script, @remainder)
13301        = split /\s*;\s*/, $_, -1; # -1 => retain trailing null fields
13302    my @words = split /[_-]/, $script;
13303    for my $word (@words) {
13304        $word =
13305            ucfirst(lc($word)) if $word ne 'CJK';
13306    }
13307    $script = join "_", @words;
13308    $_ = join ";", $range, $script, @remainder;
13309}
13310
13311sub finish_Unicode() {
13312    # This routine should be called after all the Unicode files have been read
13313    # in.  It:
13314    # 1) Creates properties that are missing from the version of Unicode being
13315    #    compiled, and which, for whatever reason, are needed for the Perl
13316    #    core to function properly.  These are minimally populated as
13317    #    necessary.
13318    # 2) Adds the mappings for code points missing from the files which have
13319    #    defaults specified for them.
13320    # 3) At this point all mappings are known, so it computes the type of
13321    #    each property whose type hasn't been determined yet.
13322    # 4) Calculates all the regular expression match tables based on the
13323    #    mappings.
13324    # 5) Calculates and adds the tables which are defined by Unicode, but
13325    #    which aren't derived by them, and certain derived tables that Perl
13326    #    uses.
13327
13328    # Folding information was introduced later into Unicode data.  To get
13329    # Perl's case ignore (/i) to work at all in releases that don't have
13330    # folding, use the best available alternative, which is lower casing.
13331    my $fold = property_ref('Case_Folding');
13332    if ($fold->is_empty) {
13333        $fold->initialize(property_ref('Lowercase_Mapping'));
13334        $fold->add_note(join_lines(<<END
13335WARNING: This table uses lower case as a substitute for missing fold
13336information
13337END
13338        ));
13339    }
13340
13341    # Multiple-character mapping was introduced later into Unicode data, so it
13342    # is by default the simple version.  If to output the simple versions and
13343    # not present, just use the regular (which in these Unicode versions is
13344    # the simple as well).
13345    foreach my $map (qw {   Uppercase_Mapping
13346                            Lowercase_Mapping
13347                            Titlecase_Mapping
13348                            Case_Folding
13349                        } )
13350    {
13351        my $comment = <<END;
13352
13353Note that although the Perl core uses this file, it has the standard values
13354for code points from U+0000 to U+00FF compiled in, so changing this table will
13355not change the core's behavior with respect to these code points.  Use
13356Unicode::Casing to override this table.
13357END
13358        if ($map eq 'Case_Folding') {
13359            $comment .= <<END;
13360(/i regex matching is not overridable except by using a custom regex engine)
13361END
13362        }
13363        property_ref($map)->add_comment(join_lines($comment));
13364        my $simple = property_ref("Simple_$map");
13365        next if ! $simple->is_empty;
13366        if ($simple->to_output_map) {
13367            $simple->initialize(property_ref($map));
13368        }
13369        else {
13370            property_ref($map)->set_proxy_for($simple->name);
13371        }
13372    }
13373
13374    # For each property, fill in any missing mappings, and calculate the re
13375    # match tables.  If a property has more than one missing mapping, the
13376    # default is a reference to a data structure, and may require data from
13377    # other properties to resolve.  The sort is used to cause these to be
13378    # processed last, after all the other properties have been calculated.
13379    # (Fortunately, the missing properties so far don't depend on each other.)
13380    foreach my $property
13381        (sort { (defined $a->default_map && ref $a->default_map) ? 1 : -1 }
13382        property_ref('*'))
13383    {
13384        # $perl has been defined, but isn't one of the Unicode properties that
13385        # need to be finished up.
13386        next if $property == $perl;
13387
13388        # Nor do we need to do anything with properties that aren't going to
13389        # be output.
13390        next if $property->fate == $SUPPRESSED;
13391
13392        # Handle the properties that have more than one possible default
13393        if (ref $property->default_map) {
13394            my $default_map = $property->default_map;
13395
13396            # These properties have stored in the default_map:
13397            # One or more of:
13398            #   1)  A default map which applies to all code points in a
13399            #       certain class
13400            #   2)  an expression which will evaluate to the list of code
13401            #       points in that class
13402            # And
13403            #   3) the default map which applies to every other missing code
13404            #      point.
13405            #
13406            # Go through each list.
13407            while (my ($default, $eval) = $default_map->get_next_defaults) {
13408
13409                # Get the class list, and intersect it with all the so-far
13410                # unspecified code points yielding all the code points
13411                # in the class that haven't been specified.
13412                my $list = eval $eval;
13413                if ($@) {
13414                    Carp::my_carp("Can't set some defaults for missing code points for $property because eval '$eval' failed with '$@'");
13415                    last;
13416                }
13417
13418                # Narrow down the list to just those code points we don't have
13419                # maps for yet.
13420                $list = $list & $property->inverse_list;
13421
13422                # Add mappings to the property for each code point in the list
13423                foreach my $range ($list->ranges) {
13424                    $property->add_map($range->start, $range->end, $default,
13425                    Replace => $CROAK);
13426                }
13427            }
13428
13429            # All remaining code points have the other mapping.  Set that up
13430            # so the normal single-default mapping code will work on them
13431            $property->set_default_map($default_map->other_default);
13432
13433            # And fall through to do that
13434        }
13435
13436        # We should have enough data now to compute the type of the property.
13437        my $property_name = $property->name;
13438        $property->compute_type;
13439        my $property_type = $property->type;
13440
13441        next if ! $property->to_create_match_tables;
13442
13443        # Here want to create match tables for this property
13444
13445        # The Unicode db always (so far, and they claim into the future) have
13446        # the default for missing entries in binary properties be 'N' (unless
13447        # there is a '@missing' line that specifies otherwise)
13448        if (! defined $property->default_map) {
13449            if ($property_type == $BINARY) {
13450                $property->set_default_map('N');
13451            }
13452            elsif ($property_type == $ENUM) {
13453                Carp::my_carp("Property '$property_name doesn't have a default mapping.  Using a fake one");
13454                $property->set_default_map('XXX This makes sure there is a default map');
13455            }
13456        }
13457
13458        # Add any remaining code points to the mapping, using the default for
13459        # missing code points.
13460        my $default_table;
13461        my $default_map = $property->default_map;
13462        if ($property_type == $FORCED_BINARY) {
13463
13464            # A forced binary property creates a 'Y' table that matches all
13465            # non-default values.  The actual string values are also written out
13466            # as a map table.  (The default value will almost certainly be the
13467            # empty string, so the pod glosses over the distinction, and just
13468            # talks about empty vs non-empty.)
13469            my $yes = $property->table("Y");
13470            foreach my $range ($property->ranges) {
13471                next if $range->value eq $default_map;
13472                $yes->add_range($range->start, $range->end);
13473            }
13474            $property->table("N")->set_complement($yes);
13475        }
13476        else {
13477            if (defined $default_map) {
13478
13479                # Make sure there is a match table for the default
13480                if (! defined ($default_table = $property->table($default_map)))
13481                {
13482                    $default_table = $property->add_match_table($default_map);
13483                }
13484
13485                # And, if the property is binary, the default table will just
13486                # be the complement of the other table.
13487                if ($property_type == $BINARY) {
13488                    my $non_default_table;
13489
13490                    # Find the non-default table.
13491                    for my $table ($property->tables) {
13492                        if ($table == $default_table) {
13493                            if ($v_version le v5.0.0) {
13494                                $table->add_alias($_) for qw(N No F False);
13495                            }
13496                            next;
13497                        } elsif ($v_version le v5.0.0) {
13498                            $table->add_alias($_) for qw(Y Yes T True);
13499                        }
13500                        $non_default_table = $table;
13501                    }
13502                    $default_table->set_complement($non_default_table);
13503                }
13504                else {
13505
13506                    # This fills in any missing values with the default.  It's
13507                    # not necessary to do this with binary properties, as the
13508                    # default is defined completely in terms of the Y table.
13509                    $property->add_map(0, $MAX_WORKING_CODEPOINT,
13510                                    $default_map, Replace => $NO);
13511                }
13512            }
13513
13514            # Have all we need to populate the match tables.
13515            my $maps_should_be_defined = $property->pre_declared_maps;
13516            foreach my $range ($property->ranges) {
13517                my $map = $range->value;
13518                my $table = $property->table($map);
13519                if (! defined $table) {
13520
13521                    # Integral and rational property values are not
13522                    # necessarily defined in PropValueAliases, but whether all
13523                    # the other ones should be depends on the property.
13524                    if ($maps_should_be_defined
13525                        && $map !~ /^ -? \d+ ( \/ \d+ )? $/x)
13526                    {
13527                        Carp::my_carp("Table '$property_name=$map' should "
13528                                    . "have been defined.  Defining it now.")
13529                    }
13530                    $table = $property->add_match_table($map);
13531                }
13532
13533                next if $table->complement != 0; # Don't need to populate these
13534                $table->add_range($range->start, $range->end);
13535            }
13536        }
13537
13538        # For Perl 5.6 compatibility, all properties matchable in regexes can
13539        # have an optional 'Is_' prefix.  This is now done in Unicode::UCD.
13540        # But warn if this creates a conflict with a (new) Unicode property
13541        # name, although it appears that Unicode has made a decision never to
13542        # begin a property name with 'Is_', so this shouldn't happen.
13543        foreach my $alias ($property->aliases) {
13544            my $Is_name = 'Is_' . $alias->name;
13545            if (defined (my $pre_existing = property_ref($Is_name))) {
13546                Carp::my_carp(<<END
13547There is already an alias named $Is_name (from " . $pre_existing . "), so
13548creating one for $property won't work.  This is bad news.  If it is not too
13549late, get Unicode to back off.  Otherwise go back to the old scheme (findable
13550from the git blame log for this area of the code that suppressed individual
13551aliases that conflict with the new Unicode names.  Proceeding anyway.
13552END
13553                );
13554            }
13555        } # End of loop through aliases for this property
13556
13557
13558        # Properties that have sets of values for some characters are now
13559        # converted.  For example, the Script_Extensions property started out
13560        # as a clone of the Script property.  But processing its data file
13561        # caused some elements to be replaced with different data.  (These
13562        # elements were for the Common and Inherited properties.)  This data
13563        # is a qw() list of all the scripts that the code points in the given
13564        # range are in.  An example line is:
13565        #
13566        # 060C          ; Arab Syrc Thaa # Po       ARABIC COMMA
13567        #
13568        # Code executed earlier has created a new match table named "Arab Syrc
13569        # Thaa" which contains 060C.  (The cloned table started out with this
13570        # code point mapping to "Common".)  Now we add 060C to each of the
13571        # Arab, Syrc, and Thaa match tables.  Then we delete the now spurious
13572        # "Arab Syrc Thaa" match table.  This is repeated for all these tables
13573        # and ranges.  The map data is retained in the map table for
13574        # reference, but the spurious match tables are deleted.
13575        my $format = $property->format;
13576        if (defined $format && $format eq $STRING_WHITE_SPACE_LIST) {
13577            foreach my $table ($property->tables) {
13578
13579                # Space separates the entries which should go in multiple
13580                # tables
13581                next unless $table->name =~ /\s/;
13582
13583                # The list of the entries, hence the names of the tables that
13584                # everything in this combo table should be added to.
13585                my @list = split /\s+/, $table->name;
13586
13587                # Add the entries from the combo table to each individual
13588                # table
13589                foreach my $individual (@list) {
13590                    my $existing_table = $property->table($individual);
13591
13592                    # This should only be necessary if this particular entry
13593                    # occurs only in combo with others.
13594                    $existing_table = $property->add_match_table($individual)
13595                                                unless defined $existing_table;
13596                    $existing_table += $table;
13597                }
13598                $property->delete_match_table($table);
13599            }
13600        }
13601    } # End of loop through all Unicode properties.
13602
13603    # Fill in the mappings that Unicode doesn't completely furnish.  First the
13604    # single letter major general categories.  If Unicode were to start
13605    # delivering the values, this would be redundant, but better that than to
13606    # try to figure out if should skip and not get it right.  Ths could happen
13607    # if a new major category were to be introduced, and the hard-coded test
13608    # wouldn't know about it.
13609    # This routine depends on the standard names for the general categories
13610    # being what it thinks they are, like 'Cn'.  The major categories are the
13611    # union of all the general category tables which have the same first
13612    # letters. eg. L = Lu + Lt + Ll + Lo + Lm
13613    foreach my $minor_table ($gc->tables) {
13614        my $minor_name = $minor_table->name;
13615        next if length $minor_name == 1;
13616        if (length $minor_name != 2) {
13617            Carp::my_carp_bug("Unexpected general category '$minor_name'.  Skipped.");
13618            next;
13619        }
13620
13621        my $major_name = uc(substr($minor_name, 0, 1));
13622        my $major_table = $gc->table($major_name);
13623        $major_table += $minor_table;
13624    }
13625
13626    # LC is Ll, Lu, and Lt.  (used to be L& or L_, but PropValueAliases.txt
13627    # defines it as LC)
13628    my $LC = $gc->table('LC');
13629    $LC->add_alias('L_', Status => $DISCOURAGED);   # For backwards...
13630    $LC->add_alias('L&', Status => $DISCOURAGED);   # compatibility.
13631
13632
13633    if ($LC->is_empty) { # Assume if not empty that Unicode has started to
13634                         # deliver the correct values in it
13635        $LC->initialize($gc->table('Ll') + $gc->table('Lu'));
13636
13637        # Lt not in release 1.
13638        if (defined $gc->table('Lt')) {
13639            $LC += $gc->table('Lt');
13640            $gc->table('Lt')->set_caseless_equivalent($LC);
13641        }
13642    }
13643    $LC->add_description('[\p{Ll}\p{Lu}\p{Lt}]');
13644
13645    $gc->table('Ll')->set_caseless_equivalent($LC);
13646    $gc->table('Lu')->set_caseless_equivalent($LC);
13647
13648    # Make sure this assumption in perl core code is valid in this Unicode
13649    # release, with known exceptions
13650    foreach my $range (property_ref('Numeric-Type')->table('Decimal')->ranges) {
13651        next if $range->end - $range->start == 9;
13652        next if $range->start == 0x1D7CE;   # This whole range was added in 3.1
13653        next if $range->end == 0x19DA && $v_version eq v5.2.0;
13654        next if $range->end - $range->start < 9 && $v_version le 4.0.0;
13655        Carp::my_carp("Range $range unexpectedly doesn't contain 10"
13656                    . " decimal digits.  Code in regcomp.c assumes it does,"
13657                    . " and will have to be fixed.  Proceeding anyway.");
13658    }
13659
13660    # Mark the scx table as the parent of the corresponding sc table for those
13661    # which are identical.  This causes the pod for the script table to refer
13662    # to the corresponding scx one.  This is done after everything, so as to
13663    # wait until the tables are stabilized before checking for equivalency.
13664    if (defined $scx) {
13665        if (defined $pod_directory) {
13666            foreach my $table ($scx->tables) {
13667                my $plain_sc_equiv = $script->table($table->name);
13668                if ($table->matches_identically_to($plain_sc_equiv)) {
13669                    $plain_sc_equiv->set_equivalent_to($table, Related => 1);
13670                }
13671            }
13672        }
13673    }
13674
13675    return;
13676}
13677
13678sub pre_3_dot_1_Nl () {
13679
13680    # Return a range list for gc=nl for Unicode versions prior to 3.1, which
13681    # is when Unicode's became fully usable.  These code points were
13682    # determined by inspection and experimentation.  gc=nl is important for
13683    # certain Perl-extension properties that should be available in all
13684    # releases.
13685
13686    my $Nl = Range_List->new();
13687    if (defined (my $official = $gc->table('Nl'))) {
13688        $Nl += $official;
13689    }
13690    else {
13691        $Nl->add_range(0x2160, 0x2182);
13692        $Nl->add_range(0x3007, 0x3007);
13693        $Nl->add_range(0x3021, 0x3029);
13694    }
13695    $Nl->add_range(0xFE20, 0xFE23);
13696    $Nl->add_range(0x16EE, 0x16F0) if $v_version ge v3.0.0; # 3.0 was when
13697                                                            # these were added
13698    return $Nl;
13699}
13700
13701sub calculate_Assigned() {  # Set $Assigned to the gc != Cn code points; may be
13702                            # called before the Cn's are completely filled.
13703                            # Works on Unicodes earlier than ones that
13704                            # explicitly specify Cn.
13705    return if defined $Assigned;
13706
13707    if (! defined $gc || $gc->is_empty()) {
13708        Carp::my_carp_bug("calculate_Assigned() called before $gc is populated");
13709    }
13710
13711    $Assigned = $perl->add_match_table('Assigned',
13712                                Description  => "All assigned code points",
13713                                );
13714    while (defined (my $range = $gc->each_range())) {
13715        my $standard_value = standardize($range->value);
13716        next if $standard_value eq 'cn' || $standard_value eq 'unassigned';
13717        $Assigned->add_range($range->start, $range->end);
13718    }
13719}
13720
13721sub calculate_DI() {    # Set $DI to a Range_List equivalent to the
13722                        # Default_Ignorable_Code_Point property.  Works on
13723                        # Unicodes earlier than ones that explicitly specify
13724                        # DI.
13725    return if defined $DI;
13726
13727    if (defined (my $di = property_ref('Default_Ignorable_Code_Point'))) {
13728        $DI = $di->table('Y');
13729    }
13730    else {
13731        $DI = Range_List->new(Initialize => [ 0x180B .. 0x180D,
13732                                              0x2060 .. 0x206F,
13733                                              0xFE00 .. 0xFE0F,
13734                                              0xFFF0 .. 0xFFFB,
13735                                            ]);
13736        if ($v_version ge v2.0) {
13737            $DI += $gc->table('Cf')
13738                +  $gc->table('Cs');
13739
13740            # These are above the Unicode version 1 max
13741            $DI->add_range(0xE0000, 0xE0FFF);
13742        }
13743        $DI += $gc->table('Cc')
13744             - ord("\t")
13745             - utf8::unicode_to_native(0x0A)  # LINE FEED
13746             - utf8::unicode_to_native(0x0B)  # VERTICAL TAB
13747             - ord("\f")
13748             - utf8::unicode_to_native(0x0D)  # CARRIAGE RETURN
13749             - utf8::unicode_to_native(0x85); # NEL
13750    }
13751}
13752
13753sub calculate_NChar() {  # Create a Perl extension match table which is the
13754                         # same as the Noncharacter_Code_Point property, and
13755                         # set $NChar to point to it.  Works on Unicodes
13756                         # earlier than ones that explicitly specify NChar
13757    return if defined $NChar;
13758
13759    $NChar = $perl->add_match_table('_Perl_Nchar',
13760                                    Perl_Extension => 1,
13761                                    Fate => $INTERNAL_ONLY);
13762    if (defined (my $off_nchar = property_ref('NChar'))) {
13763        $NChar->initialize($off_nchar->table('Y'));
13764    }
13765    else {
13766        $NChar->initialize([ 0xFFFE .. 0xFFFF ]);
13767        if ($v_version ge v2.0) {   # First release with these nchars
13768            for (my $i = 0x1FFFE; $i <= 0x10FFFE; $i += 0x10000) {
13769                $NChar += [ $i .. $i+1 ];
13770            }
13771        }
13772    }
13773}
13774
13775sub handle_compare_versions () {
13776    # This fixes things up for the $compare_versions capability, where we
13777    # compare Unicode version X with version Y (with Y > X), and we are
13778    # running it on the Unicode Data for version Y.
13779    #
13780    # It works by calculating the code points whose meaning has been specified
13781    # after release X, by using the Age property.  The complement of this set
13782    # is the set of code points whose meaning is unchanged between the
13783    # releases.  This is the set the program restricts itself to.  It includes
13784    # everything whose meaning has been specified by the time version X came
13785    # along, plus those still unassigned by the time of version Y.  (We will
13786    # continue to use the word 'assigned' to mean 'meaning has been
13787    # specified', as it's shorter and is accurate in all cases except the
13788    # Noncharacter code points.)
13789    #
13790    # This function is run after all the properties specified by Unicode have
13791    # been calculated for release Y.  This makes sure we get all the nuances
13792    # of Y's rules.  (It is done before the Perl extensions are calculated, as
13793    # those are based entirely on the Unicode ones.)  But doing it after the
13794    # Unicode table calculations means we have to fix up the Unicode tables.
13795    # We do this by subtracting the code points that have been assigned since
13796    # X (which is actually done by ANDing each table of assigned code points
13797    # with the set of unchanged code points).  Most Unicode properties are of
13798    # the form such that all unassigned code points have a default, grab-bag,
13799    # property value which is changed when the code point gets assigned.  For
13800    # these, we just remove the changed code points from the table for the
13801    # latter property value, and add them back in to the grab-bag one.  A few
13802    # other properties are not entirely of this form and have values for some
13803    # or all unassigned code points that are not the grab-bag one.  These have
13804    # to be handled specially, and are hard-coded in to this routine based on
13805    # manual inspection of the Unicode character database.  A list of the
13806    # outlier code points is made for each of these properties, and those
13807    # outliers are excluded from adding and removing from tables.
13808    #
13809    # Note that there are glitches when comparing against Unicode 1.1, as some
13810    # Hangul syllables in it were later ripped out and eventually replaced
13811    # with other things.
13812
13813    print "Fixing up for version comparison\n" if $verbosity >= $PROGRESS;
13814
13815    my $after_first_version = "All matching code points were added after "
13816                            . "Unicode $string_compare_versions";
13817
13818    # Calculate the delta as those code points that have been newly assigned
13819    # since the first compare version.
13820    my $delta = Range_List->new();
13821    foreach my $table ($age->tables) {
13822        use version;
13823        next if $table == $age->table('Unassigned');
13824        next if version->parse($table->name)
13825             le version->parse($string_compare_versions);
13826        $delta += $table;
13827    }
13828    if ($delta->is_empty) {
13829        die ("No changes; perhaps you need a 'DAge.txt' file?");
13830    }
13831
13832    my $unchanged = ~ $delta;
13833
13834    calculate_Assigned() if ! defined $Assigned;
13835    $Assigned &= $unchanged;
13836
13837    # $Assigned now contains the code points that were assigned as of Unicode
13838    # version X.
13839
13840    # A block is all or nothing.  If nothing is assigned in it, it all goes
13841    # back to the No_Block pool; but if even one code point is assigned, the
13842    # block is retained.
13843    my $no_block = $block->table('No_Block');
13844    foreach my $this_block ($block->tables) {
13845        next if     $this_block == $no_block
13846                ||  ! ($this_block & $Assigned)->is_empty;
13847        $this_block->set_fate($SUPPRESSED, $after_first_version);
13848        foreach my $range ($this_block->ranges) {
13849            $block->replace_map($range->start, $range->end, 'No_Block')
13850        }
13851        $no_block += $this_block;
13852    }
13853
13854    my @special_delta_properties;   # List of properties that have to be
13855                                    # handled specially.
13856    my %restricted_delta;           # Keys are the entries in
13857                                    # @special_delta_properties;  values
13858                                    # are the range list of the code points
13859                                    # that behave normally when they get
13860                                    # assigned.
13861
13862    # In the next three properties, the Default Ignorable code points are
13863    # outliers.
13864    calculate_DI();
13865    $DI &= $unchanged;
13866
13867    push @special_delta_properties, property_ref('_Perl_GCB');
13868    $restricted_delta{$special_delta_properties[-1]} = ~ $DI;
13869
13870    if (defined (my $cwnfkcc = property_ref('Changes_When_NFKC_Casefolded')))
13871    {
13872        push @special_delta_properties, $cwnfkcc;
13873        $restricted_delta{$special_delta_properties[-1]} = ~ $DI;
13874    }
13875
13876    calculate_NChar();      # Non-character code points
13877    $NChar &= $unchanged;
13878
13879    # This may have to be updated from time-to-time to get the most accurate
13880    # results.
13881    my $default_BC_non_LtoR = Range_List->new(Initialize =>
13882                        # These came from the comments in v8.0 DBidiClass.txt
13883                                                        [ # AL
13884                                                            0x0600 .. 0x07BF,
13885                                                            0x08A0 .. 0x08FF,
13886                                                            0xFB50 .. 0xFDCF,
13887                                                            0xFDF0 .. 0xFDFF,
13888                                                            0xFE70 .. 0xFEFF,
13889                                                            0x1EE00 .. 0x1EEFF,
13890                                                           # R
13891                                                            0x0590 .. 0x05FF,
13892                                                            0x07C0 .. 0x089F,
13893                                                            0xFB1D .. 0xFB4F,
13894                                                            0x10800 .. 0x10FFF,
13895                                                            0x1E800 .. 0x1EDFF,
13896                                                            0x1EF00 .. 0x1EFFF,
13897                                                           # ET
13898                                                            0x20A0 .. 0x20CF,
13899                                                         ]
13900                                          );
13901    $default_BC_non_LtoR += $DI + $NChar;
13902    push @special_delta_properties, property_ref('BidiClass');
13903    $restricted_delta{$special_delta_properties[-1]} = ~ $default_BC_non_LtoR;
13904
13905    if (defined (my $eaw = property_ref('East_Asian_Width'))) {
13906
13907        my $default_EA_width_W = Range_List->new(Initialize =>
13908                                    # From comments in v8.0 EastAsianWidth.txt
13909                                                [
13910                                                    0x3400 .. 0x4DBF,
13911                                                    0x4E00 .. 0x9FFF,
13912                                                    0xF900 .. 0xFAFF,
13913                                                    0x20000 .. 0x2A6DF,
13914                                                    0x2A700 .. 0x2B73F,
13915                                                    0x2B740 .. 0x2B81F,
13916                                                    0x2B820 .. 0x2CEAF,
13917                                                    0x2F800 .. 0x2FA1F,
13918                                                    0x20000 .. 0x2FFFD,
13919                                                    0x30000 .. 0x3FFFD,
13920                                                ]
13921                                             );
13922        push @special_delta_properties, $eaw;
13923        $restricted_delta{$special_delta_properties[-1]}
13924                                                       = ~ $default_EA_width_W;
13925
13926        # Line break came along in the same release as East_Asian_Width, and
13927        # the non-grab-bag default set is a superset of the EAW one.
13928        if (defined (my $lb = property_ref('Line_Break'))) {
13929            my $default_LB_non_XX = Range_List->new(Initialize =>
13930                                        # From comments in v8.0 LineBreak.txt
13931                                                        [ 0x20A0 .. 0x20CF ]);
13932            $default_LB_non_XX += $default_EA_width_W;
13933            push @special_delta_properties, $lb;
13934            $restricted_delta{$special_delta_properties[-1]}
13935                                                        = ~ $default_LB_non_XX;
13936        }
13937    }
13938
13939    # Go through every property, skipping those we've already worked on, those
13940    # that are immutable, and the perl ones that will be calculated after this
13941    # routine has done its fixup.
13942    foreach my $property (property_ref('*')) {
13943        next if    $property == $perl     # Done later in the program
13944                || $property == $block    # Done just above
13945                || $property == $DI       # Done just above
13946                || $property == $NChar    # Done just above
13947
13948                   # The next two are invariant across Unicode versions
13949                || $property == property_ref('Pattern_Syntax')
13950                || $property == property_ref('Pattern_White_Space');
13951
13952        #  Find the grab-bag value.
13953        my $default_map = $property->default_map;
13954
13955        if (! $property->to_create_match_tables) {
13956
13957            # Here there aren't any match tables.  So far, all such properties
13958            # have a default map, and don't require special handling.  Just
13959            # change each newly assigned code point back to the default map,
13960            # as if they were unassigned.
13961            foreach my $range ($delta->ranges) {
13962                $property->add_map($range->start,
13963                                $range->end,
13964                                $default_map,
13965                                Replace => $UNCONDITIONALLY);
13966            }
13967        }
13968        else {  # Here there are match tables.  Find the one (if any) for the
13969                # grab-bag value that unassigned code points go to.
13970            my $default_table;
13971            if (defined $default_map) {
13972                $default_table = $property->table($default_map);
13973            }
13974
13975            # If some code points don't go back to the grab-bag when they
13976            # are considered unassigned, exclude them from the list that does
13977            # that.
13978            my $this_delta = $delta;
13979            my $this_unchanged = $unchanged;
13980            if (grep { $_ == $property } @special_delta_properties) {
13981                $this_delta = $delta & $restricted_delta{$property};
13982                $this_unchanged = ~ $this_delta;
13983            }
13984
13985            # Fix up each match table for this property.
13986            foreach my $table ($property->tables) {
13987                if (defined $default_table && $table == $default_table) {
13988
13989                    # The code points assigned after release X (the ones we
13990                    # are excluding in this routine) go back on to the default
13991                    # (grab-bag) table.  However, some of these tables don't
13992                    # actually exist, but are specified solely by the other
13993                    # tables.  (In a binary property, we don't need to
13994                    # actually have an 'N' table, as it's just the complement
13995                    # of the 'Y' table.)  Such tables will be locked, so just
13996                    # skip those.
13997                    $table += $this_delta unless $table->locked;
13998                }
13999                else {
14000
14001                    # Here the table is not for the default value.  We need to
14002                    # subtract the code points we are ignoring for this
14003                    # comparison (the deltas) from it.  But if the table
14004                    # started out with nothing, no need to exclude anything,
14005                    # and want to skip it here anyway, so it gets listed
14006                    # properly in the pod.
14007                    next if $table->is_empty;
14008
14009                    # Save the deltas for later, before we do the subtraction
14010                    my $deltas = $table & $this_delta;
14011
14012                    $table &= $this_unchanged;
14013
14014                    # Suppress the table if the subtraction left it with
14015                    # nothing in it
14016                    if ($table->is_empty) {
14017                        if ($property->type == $BINARY) {
14018                            push @tables_that_may_be_empty, $table->complete_name;
14019                        }
14020                        else {
14021                            $table->set_fate($SUPPRESSED, $after_first_version);
14022                        }
14023                    }
14024
14025                    # Now we add the removed code points to the property's
14026                    # map, as they should now map to the grab-bag default
14027                    # property (which they did in the first comparison
14028                    # version).  But we don't have to do this if the map is
14029                    # only for internal use.
14030                    if (defined $default_map && $property->to_output_map) {
14031
14032                        # The gc property has pseudo property values whose names
14033                        # have length 1.  These are the union of all the
14034                        # property values whose name is longer than 1 and
14035                        # whose first letter is all the same.  The replacement
14036                        # is done once for the longer-named tables.
14037                        next if $property == $gc && length $table->name == 1;
14038
14039                        foreach my $range ($deltas->ranges) {
14040                            $property->add_map($range->start,
14041                                            $range->end,
14042                                            $default_map,
14043                                            Replace => $UNCONDITIONALLY);
14044                        }
14045                    }
14046                }
14047            }
14048        }
14049    }
14050
14051    # The above code doesn't work on 'gc=C', as it is a superset of the default
14052    # ('Cn') table.  It's easiest to just special case it here.
14053    my $C = $gc->table('C');
14054    $C += $gc->table('Cn');
14055
14056    return;
14057}
14058
14059sub compile_perl() {
14060    # Create perl-defined tables.  Almost all are part of the pseudo-property
14061    # named 'perl' internally to this program.  Many of these are recommended
14062    # in UTS#18 "Unicode Regular Expressions", and their derivations are based
14063    # on those found there.
14064    # Almost all of these are equivalent to some Unicode property.
14065    # A number of these properties have equivalents restricted to the ASCII
14066    # range, with their names prefaced by 'Posix', to signify that these match
14067    # what the Posix standard says they should match.  A couple are
14068    # effectively this, but the name doesn't have 'Posix' in it because there
14069    # just isn't any Posix equivalent.  'XPosix' are the Posix tables extended
14070    # to the full Unicode range, by our guesses as to what is appropriate.
14071
14072    # 'All' is all code points.  As an error check, instead of just setting it
14073    # to be that, construct it to be the union of all the major categories
14074    $All = $perl->add_match_table('All',
14075      Description
14076        => "All code points, including those above Unicode.  Same as qr/./s",
14077      Matches_All => 1);
14078
14079    foreach my $major_table ($gc->tables) {
14080
14081        # Major categories are the ones with single letter names.
14082        next if length($major_table->name) != 1;
14083
14084        $All += $major_table;
14085    }
14086
14087    if ($All->max != $MAX_WORKING_CODEPOINT) {
14088        Carp::my_carp_bug("Generated highest code point ("
14089           . sprintf("%X", $All->max)
14090           . ") doesn't match expected value $MAX_WORKING_CODEPOINT_STRING.")
14091    }
14092    if ($All->range_count != 1 || $All->min != 0) {
14093     Carp::my_carp_bug("Generated table 'All' doesn't match all code points.")
14094    }
14095
14096    my $Any = $perl->add_match_table('Any',
14097                                    Description  => "All Unicode code points");
14098    $Any->add_range(0, $MAX_UNICODE_CODEPOINT);
14099    $Any->add_alias('Unicode');
14100
14101    calculate_Assigned();
14102
14103    my $ASCII = $perl->add_match_table('ASCII');
14104    if (defined $block) {   # This is equivalent to the block if have it.
14105        my $Unicode_ASCII = $block->table('Basic_Latin');
14106        if (defined $Unicode_ASCII && ! $Unicode_ASCII->is_empty) {
14107            $ASCII->set_equivalent_to($Unicode_ASCII, Related => 1);
14108        }
14109    }
14110
14111    # Very early releases didn't have blocks, so initialize ASCII ourselves if
14112    # necessary
14113    if ($ASCII->is_empty) {
14114        if (! NON_ASCII_PLATFORM) {
14115            $ASCII->add_range(0, 127);
14116        }
14117        else {
14118            for my $i (0 .. 127) {
14119                $ASCII->add_range(utf8::unicode_to_native($i),
14120                                  utf8::unicode_to_native($i));
14121            }
14122        }
14123    }
14124
14125    # Get the best available case definitions.  Early Unicode versions didn't
14126    # have Uppercase and Lowercase defined, so use the general category
14127    # instead for them, modified by hard-coding in the code points each is
14128    # missing.
14129    my $Lower = $perl->add_match_table('XPosixLower');
14130    my $Unicode_Lower = property_ref('Lowercase');
14131    if (defined $Unicode_Lower && ! $Unicode_Lower->is_empty) {
14132        $Lower->set_equivalent_to($Unicode_Lower->table('Y'), Related => 1);
14133
14134    }
14135    else {
14136        $Lower += $gc->table('Lowercase_Letter');
14137
14138        # There are quite a few code points in Lower, that aren't in gc=lc,
14139        # and not all are in all releases.
14140        my $temp = Range_List->new(Initialize => [
14141                                                utf8::unicode_to_native(0xAA),
14142                                                utf8::unicode_to_native(0xBA),
14143                                                0x02B0 .. 0x02B8,
14144                                                0x02C0 .. 0x02C1,
14145                                                0x02E0 .. 0x02E4,
14146                                                0x0345,
14147                                                0x037A,
14148                                                0x1D2C .. 0x1D6A,
14149                                                0x1D78,
14150                                                0x1D9B .. 0x1DBF,
14151                                                0x2071,
14152                                                0x207F,
14153                                                0x2090 .. 0x209C,
14154                                                0x2170 .. 0x217F,
14155                                                0x24D0 .. 0x24E9,
14156                                                0x2C7C .. 0x2C7D,
14157                                                0xA770,
14158                                                0xA7F8 .. 0xA7F9,
14159                                ]);
14160        $Lower += $temp & $Assigned;
14161    }
14162    my $Posix_Lower = $perl->add_match_table("PosixLower",
14163                            Initialize => $Lower & $ASCII,
14164                            );
14165
14166    my $Upper = $perl->add_match_table("XPosixUpper");
14167    my $Unicode_Upper = property_ref('Uppercase');
14168    if (defined $Unicode_Upper && ! $Unicode_Upper->is_empty) {
14169        $Upper->set_equivalent_to($Unicode_Upper->table('Y'), Related => 1);
14170    }
14171    else {
14172
14173        # Unlike Lower, there are only two ranges in Upper that aren't in
14174        # gc=Lu, and all code points were assigned in all releases.
14175        $Upper += $gc->table('Uppercase_Letter');
14176        $Upper->add_range(0x2160, 0x216F);  # Uppercase Roman numerals
14177        $Upper->add_range(0x24B6, 0x24CF);  # Circled Latin upper case letters
14178    }
14179    my $Posix_Upper = $perl->add_match_table("PosixUpper",
14180                            Initialize => $Upper & $ASCII,
14181                            );
14182
14183    # Earliest releases didn't have title case.  Initialize it to empty if not
14184    # otherwise present
14185    my $Title = $perl->add_match_table('Title', Full_Name => 'Titlecase',
14186                                       Description => '(= \p{Gc=Lt})');
14187    my $lt = $gc->table('Lt');
14188
14189    # Earlier versions of mktables had this related to $lt since they have
14190    # identical code points, but their caseless equivalents are not the same,
14191    # one being 'Cased' and the other being 'LC', and so now must be kept as
14192    # separate entities.
14193    if (defined $lt) {
14194        $Title += $lt;
14195    }
14196    else {
14197        push @tables_that_may_be_empty, $Title->complete_name;
14198    }
14199
14200    my $Unicode_Cased = property_ref('Cased');
14201    if (defined $Unicode_Cased) {
14202        my $yes = $Unicode_Cased->table('Y');
14203        my $no = $Unicode_Cased->table('N');
14204        $Title->set_caseless_equivalent($yes);
14205        if (defined $Unicode_Upper) {
14206            $Unicode_Upper->table('Y')->set_caseless_equivalent($yes);
14207            $Unicode_Upper->table('N')->set_caseless_equivalent($no);
14208        }
14209        $Upper->set_caseless_equivalent($yes);
14210        if (defined $Unicode_Lower) {
14211            $Unicode_Lower->table('Y')->set_caseless_equivalent($yes);
14212            $Unicode_Lower->table('N')->set_caseless_equivalent($no);
14213        }
14214        $Lower->set_caseless_equivalent($yes);
14215    }
14216    else {
14217        # If this Unicode version doesn't have Cased, set up the Perl
14218        # extension from first principles.  From Unicode 5.1: Definition D120:
14219        # A character C is defined to be cased if and only if C has the
14220        # Lowercase or Uppercase property or has a General_Category value of
14221        # Titlecase_Letter.
14222        my $cased = $perl->add_match_table('Cased',
14223                        Initialize => $Lower + $Upper + $Title,
14224                        Description => 'Uppercase or Lowercase or Titlecase',
14225                        );
14226        # $notcased is purely for the caseless equivalents below
14227        my $notcased = $perl->add_match_table('_Not_Cased',
14228                                Initialize => ~ $cased,
14229                                Fate => $INTERNAL_ONLY,
14230                                Description => 'All not-cased code points');
14231        $Title->set_caseless_equivalent($cased);
14232        if (defined $Unicode_Upper) {
14233            $Unicode_Upper->table('Y')->set_caseless_equivalent($cased);
14234            $Unicode_Upper->table('N')->set_caseless_equivalent($notcased);
14235        }
14236        $Upper->set_caseless_equivalent($cased);
14237        if (defined $Unicode_Lower) {
14238            $Unicode_Lower->table('Y')->set_caseless_equivalent($cased);
14239            $Unicode_Lower->table('N')->set_caseless_equivalent($notcased);
14240        }
14241        $Lower->set_caseless_equivalent($cased);
14242    }
14243
14244    # The remaining perl defined tables are mostly based on Unicode TR 18,
14245    # "Annex C: Compatibility Properties".  All of these have two versions,
14246    # one whose name generally begins with Posix that is posix-compliant, and
14247    # one that matches Unicode characters beyond the Posix, ASCII range
14248
14249    my $Alpha = $perl->add_match_table('XPosixAlpha');
14250
14251    # Alphabetic was not present in early releases
14252    my $Alphabetic = property_ref('Alphabetic');
14253    if (defined $Alphabetic && ! $Alphabetic->is_empty) {
14254        $Alpha->set_equivalent_to($Alphabetic->table('Y'), Related => 1);
14255    }
14256    else {
14257
14258        # The Alphabetic property doesn't exist for early releases, so
14259        # generate it.  The actual definition, in 5.2 terms is:
14260        #
14261        # gc=L + gc=Nl + Other_Alphabetic
14262        #
14263        # Other_Alphabetic is also not defined in these early releases, but it
14264        # contains one gc=So range plus most of gc=Mn and gc=Mc, so we add
14265        # those last two as well, then subtract the relatively few of them that
14266        # shouldn't have been added.  (The gc=So range is the circled capital
14267        # Latin characters.  Early releases mistakenly didn't also include the
14268        # lower-case versions of these characters, and so we don't either, to
14269        # maintain consistency with those releases that first had this
14270        # property.
14271        $Alpha->initialize($gc->table('Letter')
14272                           + pre_3_dot_1_Nl()
14273                           + $gc->table('Mn')
14274                           + $gc->table('Mc')
14275                        );
14276        $Alpha->add_range(0x24D0, 0x24E9);  # gc=So
14277        foreach my $range (     [ 0x0300, 0x0344 ],
14278                                [ 0x0346, 0x034E ],
14279                                [ 0x0360, 0x0362 ],
14280                                [ 0x0483, 0x0486 ],
14281                                [ 0x0591, 0x05AF ],
14282                                [ 0x06DF, 0x06E0 ],
14283                                [ 0x06EA, 0x06EC ],
14284                                [ 0x0740, 0x074A ],
14285                                0x093C,
14286                                0x094D,
14287                                [ 0x0951, 0x0954 ],
14288                                0x09BC,
14289                                0x09CD,
14290                                0x0A3C,
14291                                0x0A4D,
14292                                0x0ABC,
14293                                0x0ACD,
14294                                0x0B3C,
14295                                0x0B4D,
14296                                0x0BCD,
14297                                0x0C4D,
14298                                0x0CCD,
14299                                0x0D4D,
14300                                0x0DCA,
14301                                [ 0x0E47, 0x0E4C ],
14302                                0x0E4E,
14303                                [ 0x0EC8, 0x0ECC ],
14304                                [ 0x0F18, 0x0F19 ],
14305                                0x0F35,
14306                                0x0F37,
14307                                0x0F39,
14308                                [ 0x0F3E, 0x0F3F ],
14309                                [ 0x0F82, 0x0F84 ],
14310                                [ 0x0F86, 0x0F87 ],
14311                                0x0FC6,
14312                                0x1037,
14313                                0x1039,
14314                                [ 0x17C9, 0x17D3 ],
14315                                [ 0x20D0, 0x20DC ],
14316                                0x20E1,
14317                                [ 0x302A, 0x302F ],
14318                                [ 0x3099, 0x309A ],
14319                                [ 0xFE20, 0xFE23 ],
14320                                [ 0x1D165, 0x1D169 ],
14321                                [ 0x1D16D, 0x1D172 ],
14322                                [ 0x1D17B, 0x1D182 ],
14323                                [ 0x1D185, 0x1D18B ],
14324                                [ 0x1D1AA, 0x1D1AD ],
14325        ) {
14326            if (ref $range) {
14327                $Alpha->delete_range($range->[0], $range->[1]);
14328            }
14329            else {
14330                $Alpha->delete_range($range, $range);
14331            }
14332        }
14333        $Alpha->add_description('Alphabetic');
14334        $Alpha->add_alias('Alphabetic');
14335    }
14336    my $Posix_Alpha = $perl->add_match_table("PosixAlpha",
14337                            Initialize => $Alpha & $ASCII,
14338                            );
14339    $Posix_Upper->set_caseless_equivalent($Posix_Alpha);
14340    $Posix_Lower->set_caseless_equivalent($Posix_Alpha);
14341
14342    my $Alnum = $perl->add_match_table('Alnum', Full_Name => 'XPosixAlnum',
14343                        Description => 'Alphabetic and (decimal) Numeric',
14344                        Initialize => $Alpha + $gc->table('Decimal_Number'),
14345                        );
14346    $perl->add_match_table("PosixAlnum",
14347                            Initialize => $Alnum & $ASCII,
14348                            );
14349
14350    my $Word = $perl->add_match_table('Word', Full_Name => 'XPosixWord',
14351                                Description => '\w, including beyond ASCII;'
14352                                            . ' = \p{Alnum} + \pM + \p{Pc}'
14353                                            . ' + \p{Join_Control}',
14354                                Initialize => $Alnum + $gc->table('Mark'),
14355                                );
14356    my $Pc = $gc->table('Connector_Punctuation'); # 'Pc' Not in release 1
14357    if (defined $Pc) {
14358        $Word += $Pc;
14359    }
14360    else {
14361        $Word += ord('_');  # Make sure this is a $Word
14362    }
14363    my $JC = property_ref('Join_Control');  # Wasn't in release 1
14364    if (defined $JC) {
14365        $Word += $JC->table('Y');
14366    }
14367    else {
14368        $Word += 0x200C + 0x200D;
14369    }
14370
14371    # This is a Perl extension, so the name doesn't begin with Posix.
14372    my $PerlWord = $perl->add_match_table('PosixWord',
14373                    Description => '\w, restricted to ASCII',
14374                    Initialize => $Word & $ASCII,
14375                    );
14376    $PerlWord->add_alias('PerlWord');
14377
14378    my $Blank = $perl->add_match_table('Blank', Full_Name => 'XPosixBlank',
14379                                Description => '\h, Horizontal white space',
14380
14381                                # 200B is Zero Width Space which is for line
14382                                # break control, and was listed as
14383                                # Space_Separator in early releases
14384                                Initialize => $gc->table('Space_Separator')
14385                                            +   ord("\t")
14386                                            -   0x200B, # ZWSP
14387                                );
14388    $Blank->add_alias('HorizSpace');        # Another name for it.
14389    $perl->add_match_table("PosixBlank",
14390                            Initialize => $Blank & $ASCII,
14391                            );
14392
14393    my $VertSpace = $perl->add_match_table('VertSpace',
14394                            Description => '\v',
14395                            Initialize =>
14396                               $gc->table('Line_Separator')
14397                             + $gc->table('Paragraph_Separator')
14398                             + utf8::unicode_to_native(0x0A)  # LINE FEED
14399                             + utf8::unicode_to_native(0x0B)  # VERTICAL TAB
14400                             + ord("\f")
14401                             + utf8::unicode_to_native(0x0D)  # CARRIAGE RETURN
14402                             + utf8::unicode_to_native(0x85)  # NEL
14403                    );
14404    # No Posix equivalent for vertical space
14405
14406    my $Space = $perl->add_match_table('XPosixSpace',
14407                Description => '\s including beyond ASCII and vertical tab',
14408                Initialize => $Blank + $VertSpace,
14409    );
14410    $Space->add_alias('XPerlSpace');    # Pre-existing synonyms
14411    $Space->add_alias('SpacePerl');
14412    $Space->add_alias('Space') if $v_version lt v4.1.0;
14413
14414    my $Posix_space = $perl->add_match_table("PosixSpace",
14415                            Initialize => $Space & $ASCII,
14416                            );
14417    $Posix_space->add_alias('PerlSpace'); # A pre-existing synonym
14418
14419    my $Cntrl = $perl->add_match_table('Cntrl', Full_Name => 'XPosixCntrl',
14420                                        Description => 'Control characters');
14421    $Cntrl->set_equivalent_to($gc->table('Cc'), Related => 1);
14422    $perl->add_match_table("PosixCntrl",
14423                            Description => "ASCII control characters",
14424                            Definition =>  "ACK, BEL, BS, CAN, CR, DC1, DC2,"
14425                                         . " DC3, DC4, DEL, DLE, ENQ, EOM,"
14426                                         . " EOT, ESC, ETB, ETX, FF, FS, GS,"
14427                                         . " HT, LF, NAK, NUL, RS, SI, SO,"
14428                                         . " SOH, STX, SUB, SYN, US, VT",
14429                            Initialize => $Cntrl & $ASCII,
14430                            );
14431
14432    my $perl_surrogate = $perl->add_match_table('_Perl_Surrogate');
14433    my $Cs = $gc->table('Cs');
14434    if (defined $Cs && ! $Cs->is_empty) {
14435        $perl_surrogate += $Cs;
14436    }
14437    else {
14438        push @tables_that_may_be_empty, '_Perl_Surrogate';
14439    }
14440
14441    # $controls is a temporary used to construct Graph.
14442    my $controls = Range_List->new(Initialize => $gc->table('Unassigned')
14443                                                + $gc->table('Control')
14444                                                + $perl_surrogate);
14445
14446    # Graph is  ~space &  ~(Cc|Cs|Cn) = ~(space + $controls)
14447    my $Graph = $perl->add_match_table('Graph', Full_Name => 'XPosixGraph',
14448                        Description => 'Characters that are graphical',
14449                        Initialize => ~ ($Space + $controls),
14450                        );
14451    $perl->add_match_table("PosixGraph",
14452                            Initialize => $Graph & $ASCII,
14453                            );
14454
14455    $print = $perl->add_match_table('Print', Full_Name => 'XPosixPrint',
14456                        Description => 'Characters that are graphical plus space characters (but no controls)',
14457                        Initialize => $Blank + $Graph - $gc->table('Control'),
14458                        );
14459    $perl->add_match_table("PosixPrint",
14460                            Initialize => $print & $ASCII,
14461                            );
14462
14463    my $Punct = $perl->add_match_table('Punct');
14464    $Punct->set_equivalent_to($gc->table('Punctuation'), Related => 1);
14465
14466    # \p{punct} doesn't include the symbols, which posix does
14467    my $XPosixPunct = $perl->add_match_table('XPosixPunct',
14468                    Description => '\p{Punct} + ASCII-range \p{Symbol}',
14469                    Initialize => $gc->table('Punctuation')
14470                                + ($ASCII & $gc->table('Symbol')),
14471                                Perl_Extension => 1
14472        );
14473    $perl->add_match_table('PosixPunct', Perl_Extension => 1,
14474        Initialize => $ASCII & $XPosixPunct,
14475        );
14476
14477    my $Digit = $perl->add_match_table('Digit', Full_Name => 'XPosixDigit',
14478                            Description => '[0-9] + all other decimal digits');
14479    $Digit->set_equivalent_to($gc->table('Decimal_Number'), Related => 1);
14480    my $PosixDigit = $perl->add_match_table("PosixDigit",
14481                                            Initialize => $Digit & $ASCII,
14482                                            );
14483
14484    # Hex_Digit was not present in first release
14485    my $Xdigit = $perl->add_match_table('XDigit', Full_Name => 'XPosixXDigit');
14486    my $Hex = property_ref('Hex_Digit');
14487    if (defined $Hex && ! $Hex->is_empty) {
14488        $Xdigit->set_equivalent_to($Hex->table('Y'), Related => 1);
14489    }
14490    else {
14491        $Xdigit->initialize([ ord('0') .. ord('9'),
14492                              ord('A') .. ord('F'),
14493                              ord('a') .. ord('f'),
14494                              0xFF10..0xFF19, 0xFF21..0xFF26, 0xFF41..0xFF46]);
14495    }
14496
14497    # AHex was not present in early releases
14498    my $PosixXDigit = $perl->add_match_table('PosixXDigit');
14499    my $AHex = property_ref('ASCII_Hex_Digit');
14500    if (defined $AHex && ! $AHex->is_empty) {
14501        $PosixXDigit->set_equivalent_to($AHex->table('Y'), Related => 1);
14502    }
14503    else {
14504        $PosixXDigit->initialize($Xdigit & $ASCII);
14505        $PosixXDigit->add_alias('AHex');
14506        $PosixXDigit->add_alias('Ascii_Hex_Digit');
14507    }
14508
14509    my $any_folds = $perl->add_match_table("_Perl_Any_Folds",
14510                    Description => "Code points that particpate in some fold",
14511                    );
14512    my $loc_problem_folds = $perl->add_match_table(
14513               "_Perl_Problematic_Locale_Folds",
14514               Description =>
14515                   "Code points that are in some way problematic under locale",
14516    );
14517
14518    # This allows regexec.c to skip some work when appropriate.  Some of the
14519    # entries in _Perl_Problematic_Locale_Folds are multi-character folds,
14520    my $loc_problem_folds_start = $perl->add_match_table(
14521               "_Perl_Problematic_Locale_Foldeds_Start",
14522               Description =>
14523                   "The first character of every sequence in _Perl_Problematic_Locale_Folds",
14524    );
14525
14526    my $cf = property_ref('Case_Folding');
14527
14528    # Every character 0-255 is problematic because what each folds to depends
14529    # on the current locale
14530    $loc_problem_folds->add_range(0, 255);
14531    $loc_problem_folds->add_range(0x130, 0x131);    # These are problematic in
14532                                                    # Turkic locales
14533    $loc_problem_folds_start += $loc_problem_folds;
14534
14535    # Also problematic are anything these fold to outside the range.  Likely
14536    # forever the only thing folded to by these outside the 0-255 range is the
14537    # GREEK SMALL MU (from the MICRO SIGN), but it's easy to make the code
14538    # completely general, which should catch any unexpected changes or errors.
14539    # We look at each code point 0-255, and add its fold (including each part
14540    # of a multi-char fold) to the list.  See commit message
14541    # 31f05a37c4e9c37a7263491f2fc0237d836e1a80 for a more complete description
14542    # of the MU issue.
14543    foreach my $range ($loc_problem_folds->ranges) {
14544        foreach my $code_point ($range->start .. $range->end) {
14545            my $fold_range = $cf->containing_range($code_point);
14546            next unless defined $fold_range;
14547
14548            # Skip if folds to itself
14549            next if $fold_range->value eq $CODE_POINT;
14550
14551            my @hex_folds = split " ", $fold_range->value;
14552            my $start_cp = $hex_folds[0];
14553            next if $start_cp eq $CODE_POINT;
14554            $start_cp = hex $start_cp;
14555            foreach my $i (0 .. @hex_folds - 1) {
14556                my $cp = $hex_folds[$i];
14557                next if $cp eq $CODE_POINT;
14558                $cp = hex $cp;
14559                next unless $cp > 255;    # Already have the < 256 ones
14560
14561                $loc_problem_folds->add_range($cp, $cp);
14562                $loc_problem_folds_start->add_range($start_cp, $start_cp);
14563            }
14564        }
14565    }
14566
14567    my $folds_to_multi_char = $perl->add_match_table(
14568         "_Perl_Folds_To_Multi_Char",
14569         Description =>
14570              "Code points whose fold is a string of more than one character",
14571    );
14572    my $in_multi_fold = $perl->add_match_table(
14573               "_Perl_Is_In_Multi_Char_Fold",
14574               Description =>
14575                   "Code points that are in some multiple character fold",
14576    );
14577    if ($v_version lt v3.0.1) {
14578        push @tables_that_may_be_empty, '_Perl_Folds_To_Multi_Char',
14579                                        '_Perl_Is_In_Multi_Char_Fold',
14580                                        '_Perl_Non_Final_Folds';
14581    }
14582
14583    # Look through all the known folds to populate these tables.
14584    foreach my $range ($cf->ranges) {
14585        next if $range->value eq $CODE_POINT;
14586        my $start = $range->start;
14587        my $end = $range->end;
14588        $any_folds->add_range($start, $end);
14589
14590        my @hex_folds = split " ", $range->value;
14591        if (@hex_folds > 1) {   # Is multi-char fold
14592            $folds_to_multi_char->add_range($start, $end);
14593        }
14594
14595        my $found_locale_problematic = 0;
14596
14597        my $folded_count = @hex_folds;
14598        if ($folded_count > 3) {
14599            die Carp::my_carp("Maximum number of characters in a fold should be 3: Instead, it's  $folded_count for U+" . sprintf "%04X", $range->start);
14600        }
14601
14602        # Look at each of the folded-to characters...
14603        foreach my $i (1 .. $folded_count) {
14604            my $cp = hex $hex_folds[$i-1];
14605            $any_folds->add_range($cp, $cp);
14606
14607            # The fold is problematic if any of the folded-to characters is
14608            # already considered problematic.
14609            if ($loc_problem_folds->contains($cp)) {
14610                $loc_problem_folds->add_range($start, $end);
14611                $found_locale_problematic = 1;
14612            }
14613
14614            if ($folded_count > 1) {
14615                $in_multi_fold->add_range($cp, $cp);
14616            }
14617        }
14618
14619        # If this is a problematic fold, add to the start chars the
14620        # folding-from characters and first folded-to character.
14621        if ($found_locale_problematic) {
14622            $loc_problem_folds_start->add_range($start, $end);
14623            my $cp = hex $hex_folds[0];
14624            $loc_problem_folds_start->add_range($cp, $cp);
14625        }
14626    }
14627
14628    my $dt = property_ref('Decomposition_Type');
14629    $dt->add_match_table('Non_Canon', Full_Name => 'Non_Canonical',
14630        Initialize => ~ ($dt->table('None') + $dt->table('Canonical')),
14631        Perl_Extension => 1,
14632        Note => 'Union of all non-canonical decompositions',
14633        );
14634
14635    # For backward compatibility, Perl has its own definition for IDStart.
14636    # It is regular XID_Start plus the underscore, but all characters must be
14637    # Word characters as well
14638    my $XID_Start = property_ref('XID_Start');
14639    my $perl_xids = $perl->add_match_table('_Perl_IDStart',
14640                                            Perl_Extension => 1,
14641                                            Fate => $INTERNAL_ONLY,
14642                                            Initialize => ord('_')
14643                                            );
14644    if (defined $XID_Start
14645        || defined ($XID_Start = property_ref('ID_Start')))
14646    {
14647        $perl_xids += $XID_Start->table('Y');
14648    }
14649    else {
14650        # For Unicode versions that don't have the property, construct our own
14651        # from first principles.  The actual definition is:
14652        #     Letters
14653        #   + letter numbers (Nl)
14654        #   - Pattern_Syntax
14655        #   - Pattern_White_Space
14656        #   + stability extensions
14657        #   - NKFC modifications
14658        #
14659        # What we do in the code below is to include the identical code points
14660        # that are in the first release that had Unicode's version of this
14661        # property, essentially extrapolating backwards.  There were no
14662        # stability extensions until v4.1, so none are included; likewise in
14663        # no Unicode version so far do subtracting PatSyn and PatWS make any
14664        # difference, so those also are ignored.
14665        $perl_xids += $gc->table('Letter') + pre_3_dot_1_Nl();
14666
14667        # We do subtract the NFKC modifications that are in the first version
14668        # that had this property.  We don't bother to test if they are in the
14669        # version in question, because if they aren't, the operation is a
14670        # no-op.  The NKFC modifications are discussed in
14671        # http://www.unicode.org/reports/tr31/#NFKC_Modifications
14672        foreach my $range ( 0x037A,
14673                            0x0E33,
14674                            0x0EB3,
14675                            [ 0xFC5E, 0xFC63 ],
14676                            [ 0xFDFA, 0xFE70 ],
14677                            [ 0xFE72, 0xFE76 ],
14678                            0xFE78,
14679                            0xFE7A,
14680                            0xFE7C,
14681                            0xFE7E,
14682                            [ 0xFF9E, 0xFF9F ],
14683        ) {
14684            if (ref $range) {
14685                $perl_xids->delete_range($range->[0], $range->[1]);
14686            }
14687            else {
14688                $perl_xids->delete_range($range, $range);
14689            }
14690        }
14691    }
14692
14693    $perl_xids &= $Word;
14694
14695    my $perl_xidc = $perl->add_match_table('_Perl_IDCont',
14696                                        Perl_Extension => 1,
14697                                        Fate => $INTERNAL_ONLY);
14698    my $XIDC = property_ref('XID_Continue');
14699    if (defined $XIDC
14700        || defined ($XIDC = property_ref('ID_Continue')))
14701    {
14702        $perl_xidc += $XIDC->table('Y');
14703    }
14704    else {
14705        # Similarly, we construct our own XIDC if necessary for early Unicode
14706        # versions.  The definition is:
14707        #     everything in XIDS
14708        #   + Gc=Mn
14709        #   + Gc=Mc
14710        #   + Gc=Nd
14711        #   + Gc=Pc
14712        #   - Pattern_Syntax
14713        #   - Pattern_White_Space
14714        #   + stability extensions
14715        #   - NFKC modifications
14716        #
14717        # The same thing applies to this as with XIDS for the PatSyn, PatWS,
14718        # and stability extensions.  There is a somewhat different set of NFKC
14719        # mods to remove (and add in this case).  The ones below make this
14720        # have identical code points as in the first release that defined it.
14721        $perl_xidc += $perl_xids
14722                    + $gc->table('L')
14723                    + $gc->table('Mn')
14724                    + $gc->table('Mc')
14725                    + $gc->table('Nd')
14726                    + utf8::unicode_to_native(0xB7)
14727                    ;
14728        if (defined (my $pc = $gc->table('Pc'))) {
14729            $perl_xidc += $pc;
14730        }
14731        else {  # 1.1.5 didn't have Pc, but these should have been in it
14732            $perl_xidc += 0xFF3F;
14733            $perl_xidc->add_range(0x203F, 0x2040);
14734            $perl_xidc->add_range(0xFE33, 0xFE34);
14735            $perl_xidc->add_range(0xFE4D, 0xFE4F);
14736        }
14737
14738        # Subtract the NFKC mods
14739        foreach my $range ( 0x037A,
14740                            [ 0xFC5E, 0xFC63 ],
14741                            [ 0xFDFA, 0xFE1F ],
14742                            0xFE70,
14743                            [ 0xFE72, 0xFE76 ],
14744                            0xFE78,
14745                            0xFE7A,
14746                            0xFE7C,
14747                            0xFE7E,
14748        ) {
14749            if (ref $range) {
14750                $perl_xidc->delete_range($range->[0], $range->[1]);
14751            }
14752            else {
14753                $perl_xidc->delete_range($range, $range);
14754            }
14755        }
14756    }
14757
14758    $perl_xidc &= $Word;
14759
14760    my $charname_begin = $perl->add_match_table('_Perl_Charname_Begin',
14761                    Perl_Extension => 1,
14762                    Fate => $INTERNAL_ONLY,
14763                    Initialize => $gc->table('Letter') & $Alpha & $perl_xids,
14764                    );
14765
14766    my $charname_continue = $perl->add_match_table('_Perl_Charname_Continue',
14767                        Perl_Extension => 1,
14768                        Fate => $INTERNAL_ONLY,
14769                        Initialize => $perl_xidc
14770                                    + ord(" ")
14771                                    + ord("(")
14772                                    + ord(")")
14773                                    + ord("-")
14774                        );
14775
14776    my @composition = ('Name', 'Unicode_1_Name', '_Perl_Name_Alias');
14777
14778    if (@named_sequences) {
14779        push @composition, 'Named_Sequence';
14780        foreach my $sequence (@named_sequences) {
14781            $perl_charname->add_anomalous_entry($sequence);
14782        }
14783    }
14784
14785    my $alias_sentence = "";
14786    my %abbreviations;
14787    my $alias = property_ref('_Perl_Name_Alias');
14788    $perl_charname->set_proxy_for('_Perl_Name_Alias');
14789
14790    # Add each entry in _Perl_Name_Alias to Perl_Charnames.  Where these go
14791    # with respect to any existing entry depends on the entry type.
14792    # Corrections go before said entry, as they should be returned in
14793    # preference over the existing entry.  (A correction to a correction
14794    # should be later in the _Perl_Name_Alias table, so it will correctly
14795    # precede the erroneous correction in Perl_Charnames.)
14796    #
14797    # Abbreviations go after everything else, so they are saved temporarily in
14798    # a hash for later.
14799    #
14800    # Everything else is added afterwards, which preserves the input
14801    # ordering
14802
14803    foreach my $range ($alias->ranges) {
14804        next if $range->value eq "";
14805        my $code_point = $range->start;
14806        if ($code_point != $range->end) {
14807            Carp::my_carp_bug("Bad News.  Expecting only one code point in the range $range.  Just to keep going, using only the first code point;");
14808        }
14809        my ($value, $type) = split ': ', $range->value;
14810        my $replace_type;
14811        if ($type eq 'correction') {
14812            $replace_type = $MULTIPLE_BEFORE;
14813        }
14814        elsif ($type eq 'abbreviation') {
14815
14816            # Save for later
14817            $abbreviations{$value} = $code_point;
14818            next;
14819        }
14820        else {
14821            $replace_type = $MULTIPLE_AFTER;
14822        }
14823
14824        # Actually add; before or after current entry(ies) as determined
14825        # above.
14826
14827        $perl_charname->add_duplicate($code_point, $value, Replace => $replace_type);
14828    }
14829    $alias_sentence = <<END;
14830The _Perl_Name_Alias property adds duplicate code point entries that are
14831alternatives to the original name.  If an addition is a corrected
14832name, it will be physically first in the table.  The original (less correct,
14833but still valid) name will be next; then any alternatives, in no particular
14834order; and finally any abbreviations, again in no particular order.
14835END
14836
14837    # Now add the Unicode_1 names for the controls.  The Unicode_1 names had
14838    # precedence before 6.1, including the awful ones like "LINE FEED (LF)",
14839    # so should be first in the file; the other names have precedence starting
14840    # in 6.1,
14841    my $before_or_after = ($v_version lt v6.1.0)
14842                          ? $MULTIPLE_BEFORE
14843                          : $MULTIPLE_AFTER;
14844
14845    foreach my $range (property_ref('Unicode_1_Name')->ranges) {
14846        my $code_point = $range->start;
14847        my $unicode_1_value = $range->value;
14848        next if $unicode_1_value eq "";     # Skip if name doesn't exist.
14849
14850        if ($code_point != $range->end) {
14851            Carp::my_carp_bug("Bad News.  Expecting only one code point in the range $range.  Just to keep going, using only the first code point;");
14852        }
14853
14854        # To handle EBCDIC, we don't hard code in the code points of the
14855        # controls; instead realizing that all of them are below 256.
14856        last if $code_point > 255;
14857
14858        # We only add in the controls.
14859        next if $gc->value_of($code_point) ne 'Cc';
14860
14861        # We reject this Unicode1 name for later Perls, as it is used for
14862        # another code point
14863        next if $unicode_1_value eq 'BELL' && $^V ge v5.17.0;
14864
14865        # This won't add an exact duplicate.
14866        $perl_charname->add_duplicate($code_point, $unicode_1_value,
14867                                        Replace => $before_or_after);
14868    }
14869
14870    # Now that have everything added, add in abbreviations after
14871    # everything else.  Sort so results don't change between runs of this
14872    # program
14873    foreach my $value (sort keys %abbreviations) {
14874        $perl_charname->add_duplicate($abbreviations{$value}, $value,
14875                                        Replace => $MULTIPLE_AFTER);
14876    }
14877
14878    my $comment;
14879    if (@composition <= 2) { # Always at least 2
14880        $comment = join " and ", @composition;
14881    }
14882    else {
14883        $comment = join ", ", @composition[0 .. scalar @composition - 2];
14884        $comment .= ", and $composition[-1]";
14885    }
14886
14887    $perl_charname->add_comment(join_lines( <<END
14888This file is for charnames.pm.  It is the union of the $comment properties.
14889Unicode_1_Name entries are used only for nameless code points in the Name
14890property.
14891$alias_sentence
14892This file doesn't include the algorithmically determinable names.  For those,
14893use 'unicore/Name.pm'
14894END
14895    ));
14896    property_ref('Name')->add_comment(join_lines( <<END
14897This file doesn't include the algorithmically determinable names.  For those,
14898use 'unicore/Name.pm'
14899END
14900    ));
14901
14902    # Construct the Present_In property from the Age property.
14903    if (-e 'DAge.txt' && defined $age) {
14904        my $default_map = $age->default_map;
14905        my $in = Property->new('In',
14906                                Default_Map => $default_map,
14907                                Full_Name => "Present_In",
14908                                Perl_Extension => 1,
14909                                Type => $ENUM,
14910                                Initialize => $age,
14911                                );
14912        $in->add_comment(join_lines(<<END
14913THIS FILE SHOULD NOT BE USED FOR ANY PURPOSE.  The values in this file are the
14914same as for $age, and not for what $in really means.  This is because anything
14915defined in a given release should have multiple values: that release and all
14916higher ones.  But only one value per code point can be represented in a table
14917like this.
14918END
14919        ));
14920
14921        # The Age tables are named like 1.5, 2.0, 2.1, ....  Sort so that the
14922        # lowest numbered (earliest) come first, with the non-numeric one
14923        # last.
14924        my ($first_age, @rest_ages) = sort { ($a->name !~ /^[\d.]*$/)
14925                                            ? 1
14926                                            : ($b->name !~ /^[\d.]*$/)
14927                                                ? -1
14928                                                : $a->name <=> $b->name
14929                                            } $age->tables;
14930
14931        # The Present_In property is the cumulative age properties.  The first
14932        # one hence is identical to the first age one.
14933        my $previous_in = $in->add_match_table($first_age->name);
14934        $previous_in->set_equivalent_to($first_age, Related => 1);
14935
14936        my $description_start = "Code point's usage introduced in version ";
14937        $first_age->add_description($description_start . $first_age->name);
14938
14939        # To construct the accumulated values, for each of the age tables
14940        # starting with the 2nd earliest, merge the earliest with it, to get
14941        # all those code points existing in the 2nd earliest.  Repeat merging
14942        # the new 2nd earliest with the 3rd earliest to get all those existing
14943        # in the 3rd earliest, and so on.
14944        foreach my $current_age (@rest_ages) {
14945            next if $current_age->name !~ /^[\d.]*$/;   # Skip the non-numeric
14946
14947            my $current_in = $in->add_match_table(
14948                                    $current_age->name,
14949                                    Initialize => $current_age + $previous_in,
14950                                    Description => $description_start
14951                                                    . $current_age->name
14952                                                    . ' or earlier',
14953                                    );
14954            foreach my $alias ($current_age->aliases) {
14955                $current_in->add_alias($alias->name);
14956            }
14957            $previous_in = $current_in;
14958
14959            # Add clarifying material for the corresponding age file.  This is
14960            # in part because of the confusing and contradictory information
14961            # given in the Standard's documentation itself, as of 5.2.
14962            $current_age->add_description(
14963                            "Code point's usage was introduced in version "
14964                            . $current_age->name);
14965            $current_age->add_note("See also $in");
14966
14967        }
14968
14969        # And finally the code points whose usages have yet to be decided are
14970        # the same in both properties.  Note that permanently unassigned code
14971        # points actually have their usage assigned (as being permanently
14972        # unassigned), so that these tables are not the same as gc=cn.
14973        my $unassigned = $in->add_match_table($default_map);
14974        my $age_default = $age->table($default_map);
14975        $age_default->add_description(<<END
14976Code point's usage has not been assigned in any Unicode release thus far.
14977END
14978        );
14979        $unassigned->set_equivalent_to($age_default, Related => 1);
14980    }
14981
14982    my $patws = $perl->add_match_table('_Perl_PatWS',
14983                                       Perl_Extension => 1,
14984                                       Fate => $INTERNAL_ONLY);
14985    if (defined (my $off_patws = property_ref('Pattern_White_Space'))) {
14986        $patws->initialize($off_patws->table('Y'));
14987    }
14988    else {
14989        $patws->initialize([ ord("\t"),
14990                             ord("\n"),
14991                             utf8::unicode_to_native(0x0B), # VT
14992                             ord("\f"),
14993                             ord("\r"),
14994                             ord(" "),
14995                             utf8::unicode_to_native(0x85), # NEL
14996                             0x200E..0x200F,             # Left, Right marks
14997                             0x2028..0x2029              # Line, Paragraph seps
14998                           ] );
14999    }
15000
15001    # See L<perlfunc/quotemeta>
15002    my $quotemeta = $perl->add_match_table('_Perl_Quotemeta',
15003                                           Perl_Extension => 1,
15004                                           Fate => $INTERNAL_ONLY,
15005
15006                                           # Initialize to what's common in
15007                                           # all Unicode releases.
15008                                           Initialize =>
15009                                                  $gc->table('Control')
15010                                                + $Space
15011                                                + $patws
15012                                                + ((~ $Word) & $ASCII)
15013                           );
15014
15015    if (defined (my $patsyn = property_ref('Pattern_Syntax'))) {
15016        $quotemeta += $patsyn->table('Y');
15017    }
15018    else {
15019        $quotemeta += ((~ $Word) & Range->new(0, 255))
15020                    - utf8::unicode_to_native(0xA8)
15021                    - utf8::unicode_to_native(0xAF)
15022                    - utf8::unicode_to_native(0xB2)
15023                    - utf8::unicode_to_native(0xB3)
15024                    - utf8::unicode_to_native(0xB4)
15025                    - utf8::unicode_to_native(0xB7)
15026                    - utf8::unicode_to_native(0xB8)
15027                    - utf8::unicode_to_native(0xB9)
15028                    - utf8::unicode_to_native(0xBC)
15029                    - utf8::unicode_to_native(0xBD)
15030                    - utf8::unicode_to_native(0xBE);
15031        $quotemeta += [ # These are above-Latin1 patsyn; hence should be the
15032                        # same in all releases
15033                        0x2010 .. 0x2027,
15034                        0x2030 .. 0x203E,
15035                        0x2041 .. 0x2053,
15036                        0x2055 .. 0x205E,
15037                        0x2190 .. 0x245F,
15038                        0x2500 .. 0x2775,
15039                        0x2794 .. 0x2BFF,
15040                        0x2E00 .. 0x2E7F,
15041                        0x3001 .. 0x3003,
15042                        0x3008 .. 0x3020,
15043                        0x3030 .. 0x3030,
15044                        0xFD3E .. 0xFD3F,
15045                        0xFE45 .. 0xFE46
15046                      ];
15047    }
15048
15049    if (defined (my $di = property_ref('Default_Ignorable_Code_Point'))) {
15050        $quotemeta += $di->table('Y')
15051    }
15052    else {
15053        if ($v_version ge v2.0) {
15054            $quotemeta += $gc->table('Cf')
15055                       +  $gc->table('Cs');
15056
15057            # These are above the Unicode version 1 max
15058            $quotemeta->add_range(0xE0000, 0xE0FFF);
15059        }
15060        $quotemeta += $gc->table('Cc')
15061                    - $Space;
15062        my $temp = Range_List->new(Initialize => [ 0x180B .. 0x180D,
15063                                                   0x2060 .. 0x206F,
15064                                                   0xFE00 .. 0xFE0F,
15065                                                   0xFFF0 .. 0xFFFB,
15066                                                  ]);
15067        $temp->add_range(0xE0000, 0xE0FFF) if $v_version ge v2.0;
15068        $quotemeta += $temp;
15069    }
15070    calculate_DI();
15071    $quotemeta += $DI;
15072
15073    calculate_NChar();
15074
15075    # Finished creating all the perl properties.  All non-internal non-string
15076    # ones have a synonym of 'Is_' prefixed.  (Internal properties begin with
15077    # an underscore.)  These do not get a separate entry in the pod file
15078    foreach my $table ($perl->tables) {
15079        foreach my $alias ($table->aliases) {
15080            next if $alias->name =~ /^_/;
15081            $table->add_alias('Is_' . $alias->name,
15082                               Re_Pod_Entry => 0,
15083                               UCD => 0,
15084                               Status => $alias->status,
15085                               OK_as_Filename => 0);
15086        }
15087    }
15088
15089    # Perl tailors the WordBreak property so that \b{wb} doesn't split
15090    # adjacent spaces into separate words.  Unicode 11.0 moved in that
15091    # direction, but left TAB,  FIGURE SPACE (U+2007), and (ironically) NO
15092    # BREAK SPACE as breaking, so we retained the original Perl customization.
15093    # To do this, in the Perl copy of WB, simply replace the mappings of
15094    # horizontal space characters that otherwise would map to the default or
15095    # the 11.0 'WSegSpace' to instead map to our tailoring.
15096    my $perl_wb = property_ref('_Perl_WB');
15097    my $default = $perl_wb->default_map;
15098    for my $range ($Blank->ranges) {
15099        for my $i ($range->start .. $range->end) {
15100            my $value = $perl_wb->value_of($i);
15101
15102            next unless $value eq $default || $value eq 'WSegSpace';
15103            $perl_wb->add_map($i, $i, 'Perl_Tailored_HSpace',
15104                              Replace => $UNCONDITIONALLY);
15105        }
15106    }
15107
15108    # Also starting in Unicode 11.0, rules for some of the boundary types are
15109    # based on a non-UCD property (which we have read in if it exists).
15110    # Recall that these boundary properties partition the code points into
15111    # equivalence classes (represented as enums).
15112    #
15113    # The loop below goes through each code point that matches the non-UCD
15114    # property, and for each current equivalence class containing such a code
15115    # point, splits it so that those that are in both are now in a newly
15116    # created equivalence class whose name is a combination of the property
15117    # and the old class name, leaving unchanged everything that doesn't match
15118    # the non-UCD property.
15119    my $ep = property_ref('ExtPict');
15120    $ep = $ep->table('Y') if defined $ep;
15121    if (defined $ep) {
15122        foreach my $base_property (property_ref('GCB'),
15123                                   property_ref('WB'))
15124        {
15125            my $property = property_ref('_Perl_' . $base_property->name);
15126            foreach my $range ($ep->ranges) {
15127                foreach my $i ($range->start .. $range->end) {
15128                    my $current = $property->value_of($i);
15129                    $current = $property->table($current)->short_name;
15130                    $property->add_map($i, $i, 'ExtPict_' . $current,
15131                                       Replace => $UNCONDITIONALLY);
15132                }
15133            }
15134        }
15135    }
15136
15137    # Create a version of the LineBreak property with the mappings that are
15138    # omitted in the default algorithm remapped to what
15139    # http://www.unicode.org/reports/tr14 says they should be.
15140    #
15141    # First, create a plain copy, but with all property values written out in
15142    # their long form, as regen/mk_invlist.pl expects that, and also fix
15143    # occurrences of the typo in early Unicode versions: 'inseperable'.
15144    my $perl_lb = property_ref('_Perl_LB');
15145    if (! defined $perl_lb) {
15146        $perl_lb = Property->new('_Perl_LB',
15147                                 Fate => $INTERNAL_ONLY,
15148                                 Perl_Extension => 1,
15149                                 Directory => $map_directory,
15150                                 Type => $STRING);
15151        my $lb = property_ref('Line_Break');
15152
15153        # Populate from $lb, but use full name and fix typo.
15154        foreach my $range ($lb->ranges) {
15155            my $full_name = $lb->table($range->value)->full_name;
15156            $full_name = 'Inseparable'
15157                                if standardize($full_name) eq 'inseperable';
15158            $perl_lb->add_map($range->start, $range->end, $full_name);
15159        }
15160    }
15161
15162    # What tr14 says is this:
15163
15164    # Original 	   Resolved  General_Category
15165    # AI, SG, XX      AL      Any
15166    # SA              CM      Only Mn or Mc
15167    # SA              AL      Any except Mn and Mc
15168    # CJ              NS      Any
15169
15170    $perl_lb->set_default_map('Alphabetic', 'full_name');    # XX -> AL
15171
15172    my $ea = property_ref('East_Asian_Width');
15173    my $Cn_EP;
15174    $Cn_EP = $ep & $gc->table('Unassigned') if defined $ep;
15175
15176    for my $range ($perl_lb->ranges) {
15177        my $value = standardize($range->value);
15178        if (   $value eq standardize('Unknown')
15179            || $value eq standardize('Ambiguous')
15180            || $value eq standardize('Surrogate'))
15181        {
15182            $perl_lb->add_map($range->start, $range->end, 'Alphabetic',
15183                              Replace => $UNCONDITIONALLY);
15184        }
15185        elsif ($value eq standardize('Conditional_Japanese_Starter')) {
15186            $perl_lb->add_map($range->start, $range->end, 'Nonstarter',
15187                              Replace => $UNCONDITIONALLY);
15188        }
15189        elsif ($value eq standardize('Complex_Context')) {
15190            for my $i ($range->start .. $range->end) {
15191                my $gc_val = $gc->value_of($i);
15192                if ($gc_val eq 'Mn' || $gc_val eq 'Mc') {
15193                    $perl_lb->add_map($i, $i, 'Combining_Mark',
15194                                      Replace => $UNCONDITIONALLY);
15195                }
15196                else {
15197                    $perl_lb->add_map($i, $i, 'Alphabetic',
15198                                      Replace => $UNCONDITIONALLY);
15199                }
15200            }
15201        }
15202        elsif (defined $ep && $value eq standardize('Ideographic')) {
15203
15204            # Unicode 14 adds a rule to not break lines before any potential
15205            # EBase,  They say that any unassigned code point that is ExtPict,
15206            # is potentially an EBase.  In 14.0, all such ones are in the
15207            # ExtPict=ID category.  We must split that category for the
15208            # pairwise rule table to work.
15209            for my $i ($range->start .. $range->end) {
15210                if ($Cn_EP->contains($i)) {
15211                    $perl_lb->add_map($i, $i,
15212                                'Unassigned_Extended_Pictographic_Ideographic',
15213                                Replace => $UNCONDITIONALLY);
15214                }
15215            }
15216        }
15217        elsif (    defined $ea
15218               && (   $value eq standardize('Close_Parenthesis')
15219                   || $value eq standardize('Open_Punctuation')))
15220        {
15221            # Unicode 13 splits the OP and CP properties each into East Asian,
15222            # and non-.  We retain the (now somewhat misleading) names OP and
15223            # CP for the non-East Asian variety, as there are very few East
15224            # Asian ones.
15225            my $replace = ($value eq standardize('Open_Punctuation'))
15226                          ? 'East_Asian_OP'
15227                          : 'East_Asian_CP';
15228            for my $i ($range->start .. $range->end) {
15229                my $ea_val = $ea->value_of($i);
15230                if ($ea_val eq 'F' || $ea_val eq 'W' || $ea_val eq 'H') {
15231                    $perl_lb->add_map($i, $i, $replace,
15232                                                Replace => $UNCONDITIONALLY);
15233                }
15234            }
15235        }
15236    }
15237
15238    # This property is a modification of the scx property
15239    my $perl_scx = Property->new('_Perl_SCX',
15240                                 Fate => $INTERNAL_ONLY,
15241                                 Perl_Extension => 1,
15242                                 Directory => $map_directory,
15243                                 Type => $ENUM);
15244    my $source;
15245
15246    # Use scx if available; otherwise sc;  if neither is there (a very old
15247    # Unicode version, just say that everything is 'Common'
15248    if (defined $scx) {
15249        $source = $scx;
15250        $perl_scx->set_default_map('Unknown');
15251    }
15252    elsif (defined $script) {
15253        $source = $script;
15254
15255        # Early versions of 'sc', had everything be 'Common'
15256        if (defined $script->table('Unknown')) {
15257            $perl_scx->set_default_map('Unknown');
15258        }
15259        else {
15260            $perl_scx->set_default_map('Common');
15261        }
15262    } else {
15263        $perl_scx->add_match_table('Common');
15264        $perl_scx->add_map(0, $MAX_UNICODE_CODEPOINT, 'Common');
15265
15266        $perl_scx->add_match_table('Unknown');
15267        $perl_scx->set_default_map('Unknown');
15268    }
15269
15270    $perl_scx->_set_format($STRING_WHITE_SPACE_LIST);
15271    $perl_scx->set_pre_declared_maps(0); # PropValueAliases doesn't list these
15272
15273    if (defined $source) {
15274        $perl_scx->initialize($source);
15275
15276        # UTS 39 says that the scx property should be modified for these
15277        # countries where certain mixed scripts are commonly used.
15278        for my $range ($perl_scx->ranges) {
15279            my $value = $range->value;
15280            my $changed = $value =~ s/ ( \b Han i? \b ) /$1 Hanb Jpan Kore/xi;
15281             $changed |=  $value =~ s/ ( \b Hira (gana)? \b ) /$1 Jpan/xi;
15282             $changed |=  $value =~ s/ ( \b Kata (kana)? \b ) /$1 Jpan/xi;
15283             $changed |=  $value =~ s{ ( \b Katakana_or_Hiragana \b ) }
15284                                     {$1 Katakana Hiragana Jpan}xi;
15285             $changed |=  $value =~ s/ ( \b Hang (ul)? \b ) /$1 Kore/xi;
15286             $changed |=  $value =~ s/ ( \b Bopo (mofo)? \b ) /$1 Hanb/xi;
15287
15288            if ($changed) {
15289                $value = join " ", uniques split " ", $value;
15290                $range->set_value($value)
15291            }
15292        }
15293
15294        foreach my $table ($source->tables) {
15295            my $scx_table = $perl_scx->add_match_table($table->name,
15296                                    Full_Name => $table->full_name);
15297            foreach my $alias ($table->aliases) {
15298                $scx_table->add_alias($alias->name);
15299            }
15300        }
15301    }
15302
15303    # Here done with all the basic stuff.  Ready to populate the information
15304    # about each character if annotating them.
15305    if ($annotate) {
15306
15307        # See comments at its declaration
15308        $annotate_ranges = Range_Map->new;
15309
15310        # This separates out the non-characters from the other unassigneds, so
15311        # can give different annotations for each.
15312        $unassigned_sans_noncharacters = Range_List->new(
15313                                    Initialize => $gc->table('Unassigned'));
15314        $unassigned_sans_noncharacters &= (~ $NChar);
15315
15316        for (my $i = 0; $i <= $MAX_UNICODE_CODEPOINT + 1; $i++ ) {
15317            $i = populate_char_info($i);    # Note sets $i so may cause skips
15318
15319        }
15320    }
15321
15322    return;
15323}
15324
15325sub add_perl_synonyms() {
15326    # A number of Unicode tables have Perl synonyms that are expressed in
15327    # the single-form, \p{name}.  These are:
15328    #   All the binary property Y tables, so that \p{Name=Y} gets \p{Name} and
15329    #       \p{Is_Name} as synonyms
15330    #   \p{Script_Extensions=Value} gets \p{Value}, \p{Is_Value} as synonyms
15331    #   \p{General_Category=Value} gets \p{Value}, \p{Is_Value} as synonyms
15332    #   \p{Block=Value} gets \p{In_Value} as a synonym, and, if there is no
15333    #       conflict, \p{Value} and \p{Is_Value} as well
15334    #
15335    # This routine generates these synonyms, warning of any unexpected
15336    # conflicts.
15337
15338    # Construct the list of tables to get synonyms for.  Start with all the
15339    # binary and the General_Category ones.
15340    my @tables = grep { $_->type == $BINARY || $_->type == $FORCED_BINARY }
15341                                                            property_ref('*');
15342    push @tables, $gc->tables;
15343
15344    # If the version of Unicode includes the Script Extensions (preferably),
15345    # or Script property, add its tables
15346    if (defined $scx) {
15347        push @tables, $scx->tables;
15348    }
15349    else {
15350        push @tables, $script->tables if defined $script;
15351    }
15352
15353    # The Block tables are kept separate because they are treated differently.
15354    # And the earliest versions of Unicode didn't include them, so add only if
15355    # there are some.
15356    my @blocks;
15357    push @blocks, $block->tables if defined $block;
15358
15359    # Here, have the lists of tables constructed.  Process blocks last so that
15360    # if there are name collisions with them, blocks have lowest priority.
15361    # Should there ever be other collisions, manual intervention would be
15362    # required.  See the comments at the beginning of the program for a
15363    # possible way to handle those semi-automatically.
15364    foreach my $table (@tables,  @blocks) {
15365
15366        # For non-binary properties, the synonym is just the name of the
15367        # table, like Greek, but for binary properties the synonym is the name
15368        # of the property, and means the code points in its 'Y' table.
15369        my $nominal = $table;
15370        my $nominal_property = $nominal->property;
15371        my $actual;
15372        if (! $nominal->isa('Property')) {
15373            $actual = $table;
15374        }
15375        else {
15376
15377            # Here is a binary property.  Use the 'Y' table.  Verify that is
15378            # there
15379            my $yes = $nominal->table('Y');
15380            unless (defined $yes) {  # Must be defined, but is permissible to
15381                                     # be empty.
15382                Carp::my_carp_bug("Undefined $nominal, 'Y'.  Skipping.");
15383                next;
15384            }
15385            $actual = $yes;
15386        }
15387
15388        foreach my $alias ($nominal->aliases) {
15389
15390            # Attempt to create a table in the perl directory for the
15391            # candidate table, using whatever aliases in it that don't
15392            # conflict.  Also add non-conflicting aliases for all these
15393            # prefixed by 'Is_' (and/or 'In_' for Block property tables)
15394            PREFIX:
15395            foreach my $prefix ("", 'Is_', 'In_') {
15396
15397                # Only Block properties can have added 'In_' aliases.
15398                next if $prefix eq 'In_' and $nominal_property != $block;
15399
15400                my $proposed_name = $prefix . $alias->name;
15401
15402                # No Is_Is, In_In, nor combinations thereof
15403                trace "$proposed_name is a no-no" if main::DEBUG && $to_trace && $proposed_name =~ /^ I [ns] _I [ns] _/x;
15404                next if $proposed_name =~ /^ I [ns] _I [ns] _/x;
15405
15406                trace "Seeing if can add alias or table: 'perl=$proposed_name' based on $nominal" if main::DEBUG && $to_trace;
15407
15408                # Get a reference to any existing table in the perl
15409                # directory with the desired name.
15410                my $pre_existing = $perl->table($proposed_name);
15411
15412                if (! defined $pre_existing) {
15413
15414                    # No name collision, so OK to add the perl synonym.
15415
15416                    my $make_re_pod_entry;
15417                    my $ok_as_filename;
15418                    my $status = $alias->status;
15419                    if ($nominal_property == $block) {
15420
15421                        # For block properties, only the compound form is
15422                        # preferred for external use; the others are
15423                        # discouraged.  The pod file contains wild cards for
15424                        # the 'In' and 'Is' forms so no entries for those; and
15425                        # we don't want people using the name without any
15426                        # prefix, so discourage that.
15427                        if ($prefix eq "") {
15428                            $make_re_pod_entry = 1;
15429                            $status = $status || $DISCOURAGED;
15430                            $ok_as_filename = 0;
15431                        }
15432                        elsif ($prefix eq 'In_') {
15433                            $make_re_pod_entry = 0;
15434                            $status = $status || $DISCOURAGED;
15435                            $ok_as_filename = 1;
15436                        }
15437                        else {
15438                            $make_re_pod_entry = 0;
15439                            $status = $status || $DISCOURAGED;
15440                            $ok_as_filename = 0;
15441                        }
15442                    }
15443                    elsif ($prefix ne "") {
15444
15445                        # The 'Is' prefix is handled in the pod by a wild
15446                        # card, and we won't use it for an external name
15447                        $make_re_pod_entry = 0;
15448                        $status = $status || $NORMAL;
15449                        $ok_as_filename = 0;
15450                    }
15451                    else {
15452
15453                        # Here, is an empty prefix, non block.  This gets its
15454                        # own pod entry and can be used for an external name.
15455                        $make_re_pod_entry = 1;
15456                        $status = $status || $NORMAL;
15457                        $ok_as_filename = 1;
15458                    }
15459
15460                    # Here, there isn't a perl pre-existing table with the
15461                    # name.  Look through the list of equivalents of this
15462                    # table to see if one is a perl table.
15463                    foreach my $equivalent ($actual->leader->equivalents) {
15464                        next if $equivalent->property != $perl;
15465
15466                        # Here, have found a table for $perl.  Add this alias
15467                        # to it, and are done with this prefix.
15468                        $equivalent->add_alias($proposed_name,
15469                                        Re_Pod_Entry => $make_re_pod_entry,
15470
15471                                        # Currently don't output these in the
15472                                        # ucd pod, as are strongly discouraged
15473                                        # from being used
15474                                        UCD => 0,
15475
15476                                        Status => $status,
15477                                        OK_as_Filename => $ok_as_filename);
15478                        trace "adding alias perl=$proposed_name to $equivalent" if main::DEBUG && $to_trace;
15479                        next PREFIX;
15480                    }
15481
15482                    # Here, $perl doesn't already have a table that is a
15483                    # synonym for this property, add one.
15484                    my $added_table = $perl->add_match_table($proposed_name,
15485                                            Re_Pod_Entry => $make_re_pod_entry,
15486
15487                                            # See UCD comment just above
15488                                            UCD => 0,
15489
15490                                            Status => $status,
15491                                            OK_as_Filename => $ok_as_filename);
15492                    # And it will be related to the actual table, since it is
15493                    # based on it.
15494                    $added_table->set_equivalent_to($actual, Related => 1);
15495                    trace "added ", $perl->table($proposed_name) if main::DEBUG && $to_trace;
15496                    next;
15497                } # End of no pre-existing.
15498
15499                # Here, there is a pre-existing table that has the proposed
15500                # name.  We could be in trouble, but not if this is just a
15501                # synonym for another table that we have already made a child
15502                # of the pre-existing one.
15503                if ($pre_existing->is_set_equivalent_to($actual)) {
15504                    trace "$pre_existing is already equivalent to $actual; adding alias perl=$proposed_name to it" if main::DEBUG && $to_trace;
15505                    $pre_existing->add_alias($proposed_name);
15506                    next;
15507                }
15508
15509                # Here, there is a name collision, but it still could be OK if
15510                # the tables match the identical set of code points, in which
15511                # case, we can combine the names.  Compare each table's code
15512                # point list to see if they are identical.
15513                trace "Potential name conflict with $pre_existing having ", $pre_existing->count, " code points" if main::DEBUG && $to_trace;
15514                if ($pre_existing->matches_identically_to($actual)) {
15515
15516                    # Here, they do match identically.  Not a real conflict.
15517                    # Make the perl version a child of the Unicode one, except
15518                    # in the non-obvious case of where the perl name is
15519                    # already a synonym of another Unicode property.  (This is
15520                    # excluded by the test for it being its own parent.)  The
15521                    # reason for this exclusion is that then the two Unicode
15522                    # properties become related; and we don't really know if
15523                    # they are or not.  We generate documentation based on
15524                    # relatedness, and this would be misleading.  Code
15525                    # later executed in the process will cause the tables to
15526                    # be represented by a single file anyway, without making
15527                    # it look in the pod like they are necessarily related.
15528                    if ($pre_existing->parent == $pre_existing
15529                        && ($pre_existing->property == $perl
15530                            || $actual->property == $perl))
15531                    {
15532                        trace "Setting $pre_existing equivalent to $actual since one is \$perl, and match identical sets" if main::DEBUG && $to_trace;
15533                        $pre_existing->set_equivalent_to($actual, Related => 1);
15534                    }
15535                    elsif (main::DEBUG && $to_trace) {
15536                        trace "$pre_existing is equivalent to $actual since match identical sets, but not setting them equivalent, to preserve the separateness of the perl aliases";
15537                        trace $pre_existing->parent;
15538                    }
15539                    next PREFIX;
15540                }
15541
15542                # Here they didn't match identically, there is a real conflict
15543                # between our new name and a pre-existing property.
15544                $actual->add_conflicting($proposed_name, 'p', $pre_existing);
15545                $pre_existing->add_conflicting($nominal->full_name,
15546                                               'p',
15547                                               $actual);
15548
15549                # Don't output a warning for aliases for the block
15550                # properties (unless they start with 'In_') as it is
15551                # expected that there will be conflicts and the block
15552                # form loses.
15553                if ($verbosity >= $NORMAL_VERBOSITY
15554                    && ($actual->property != $block || $prefix eq 'In_'))
15555                {
15556                    print simple_fold(join_lines(<<END
15557There is already an alias named $proposed_name (from $pre_existing),
15558so not creating this alias for $actual
15559END
15560                    ), "", 4);
15561                }
15562
15563                # Keep track for documentation purposes.
15564                $has_In_conflicts++ if $prefix eq 'In_';
15565                $has_Is_conflicts++ if $prefix eq 'Is_';
15566            }
15567        }
15568    }
15569
15570    # There are some properties which have No and Yes (and N and Y) as
15571    # property values, but aren't binary, and could possibly be confused with
15572    # binary ones.  So create caveats for them.  There are tables that are
15573    # named 'No', and tables that are named 'N', but confusion is not likely
15574    # unless they are the same table.  For example, N meaning Number or
15575    # Neutral is not likely to cause confusion, so don't add caveats to things
15576    # like them.
15577    foreach my $property (grep { $_->type != $BINARY
15578                                 && $_->type != $FORCED_BINARY }
15579                                                            property_ref('*'))
15580    {
15581        my $yes = $property->table('Yes');
15582        if (defined $yes) {
15583            my $y = $property->table('Y');
15584            if (defined $y && $yes == $y) {
15585                foreach my $alias ($property->aliases) {
15586                    $yes->add_conflicting($alias->name);
15587                }
15588            }
15589        }
15590        my $no = $property->table('No');
15591        if (defined $no) {
15592            my $n = $property->table('N');
15593            if (defined $n && $no == $n) {
15594                foreach my $alias ($property->aliases) {
15595                    $no->add_conflicting($alias->name, 'P');
15596                }
15597            }
15598        }
15599    }
15600
15601    return;
15602}
15603
15604sub register_file_for_name($table, $directory_ref, $file) {
15605    # Given info about a table and a datafile that it should be associated
15606    # with, register that association
15607
15608    # $directory_ref    # Array of the directory path for the file
15609    # $file             # The file name in the final directory.
15610
15611    trace "table=$table, file=$file, directory=@$directory_ref, fate=", $table->fate if main::DEBUG && $to_trace;
15612
15613    if ($table->isa('Property')) {
15614        $table->set_file_path(@$directory_ref, $file);
15615        push @map_properties, $table;
15616
15617        # No swash means don't do the rest of this.
15618        return if $table->fate != $ORDINARY
15619                  && ! ($table->name =~ /^_/ && $table->fate == $INTERNAL_ONLY);
15620
15621        # Get the path to the file
15622        my @path = $table->file_path;
15623
15624        # Use just the file name if no subdirectory.
15625        shift @path if $path[0] eq File::Spec->curdir();
15626
15627        my $file = join '/', @path;
15628
15629        # Create a hash entry for Unicode::UCD to get the file that stores this
15630        # property's map table
15631        foreach my $alias ($table->aliases) {
15632            my $name = $alias->name;
15633            if ($name =~ /^_/) {
15634                $strict_property_to_file_of{lc $name} = $file;
15635            }
15636            else {
15637                $loose_property_to_file_of{standardize($name)} = $file;
15638            }
15639        }
15640
15641        # And a way for Unicode::UCD to find the proper key in the SwashInfo
15642        # hash for this property.
15643        $file_to_swash_name{$file} = "To" . $table->swash_name;
15644        return;
15645    }
15646
15647    # Do all of the work for all equivalent tables when called with the leader
15648    # table, so skip if isn't the leader.
15649    return if $table->leader != $table;
15650
15651    # If this is a complement of another file, use that other file instead,
15652    # with a ! prepended to it.
15653    my $complement;
15654    if (($complement = $table->complement) != 0) {
15655        my @directories = $complement->file_path;
15656
15657        # This assumes that the 0th element is something like 'lib',
15658        # the 1th element the property name (in its own directory), like
15659        # 'AHex', and the 2th element the file like 'Y' which will have a .pl
15660        # appended to it later.
15661        $directories[1] =~ s/^/!/;
15662        $file = pop @directories;
15663        $directory_ref =\@directories;
15664    }
15665
15666    # Join all the file path components together, using slashes.
15667    my $full_filename = join('/', @$directory_ref, $file);
15668
15669    # All go in the same subdirectory of unicore, or the special
15670    # pseudo-directory '#'
15671    if ($directory_ref->[0] !~ / ^ $matches_directory | \# $ /x) {
15672        Carp::my_carp("Unexpected directory in "
15673                .  join('/', @{$directory_ref}, $file));
15674    }
15675
15676    # For this table and all its equivalents ...
15677    foreach my $table ($table, $table->equivalents) {
15678
15679        # Associate it with its file internally.  Don't include the
15680        # $matches_directory first component
15681        $table->set_file_path(@$directory_ref, $file);
15682
15683        # No swash means don't do the rest of this.
15684        next if $table->isa('Map_Table') && $table->fate != $ORDINARY;
15685
15686        my $sub_filename = join('/', $directory_ref->[1, -1], $file);
15687
15688        my $property = $table->property;
15689        my $property_name = ($property == $perl)
15690                             ? ""  # 'perl' is never explicitly stated
15691                             : standardize($property->name) . '=';
15692
15693        my $is_default = 0; # Is this table the default one for the property?
15694
15695        # To calculate $is_default, we find if this table is the same as the
15696        # default one for the property.  But this is complicated by the
15697        # possibility that there is a master table for this one, and the
15698        # information is stored there instead of here.
15699        my $parent = $table->parent;
15700        my $leader_prop = $parent->property;
15701        my $default_map = $leader_prop->default_map;
15702        if (defined $default_map) {
15703            my $default_table = $leader_prop->table($default_map);
15704            $is_default = 1 if defined $default_table && $parent == $default_table;
15705        }
15706
15707        # Calculate the loose name for this table.  Mostly it's just its name,
15708        # standardized.  But in the case of Perl tables that are single-form
15709        # equivalents to Unicode properties, it is the latter's name.
15710        my $loose_table_name =
15711                        ($property != $perl || $leader_prop == $perl)
15712                        ? standardize($table->name)
15713                        : standardize($parent->name);
15714
15715        my $deprecated = ($table->status eq $DEPRECATED)
15716                         ? $table->status_info
15717                         : "";
15718        my $caseless_equivalent = $table->caseless_equivalent;
15719
15720        # And for each of the table's aliases...  This inner loop eventually
15721        # goes through all aliases in the UCD that we generate regex match
15722        # files for
15723        foreach my $alias ($table->aliases) {
15724            my $standard = UCD_name($table, $alias);
15725
15726            # Generate an entry in either the loose or strict hashes, which
15727            # will translate the property and alias names combination into the
15728            # file where the table for them is stored.
15729            if ($alias->loose_match) {
15730                if (exists $loose_to_file_of{$standard}) {
15731                    Carp::my_carp("Can't change file registered to $loose_to_file_of{$standard} to '$sub_filename'.");
15732                }
15733                else {
15734                    $loose_to_file_of{$standard} = $sub_filename;
15735                }
15736            }
15737            else {
15738                if (exists $stricter_to_file_of{$standard}) {
15739                    Carp::my_carp("Can't change file registered to $stricter_to_file_of{$standard} to '$sub_filename'.");
15740                }
15741                else {
15742                    $stricter_to_file_of{$standard} = $sub_filename;
15743
15744                    # Tightly coupled with how Unicode::UCD works, for a
15745                    # floating point number that is a whole number, get rid of
15746                    # the trailing decimal point and 0's, so that Unicode::UCD
15747                    # will work.  Also note that this assumes that such a
15748                    # number is matched strictly; so if that were to change,
15749                    # this would be wrong.
15750                    if ((my $integer_name = $alias->name)
15751                            =~ s/^ ( -? \d+ ) \.0+ $ /$1/x)
15752                    {
15753                        $stricter_to_file_of{$property_name . $integer_name}
15754                                                            = $sub_filename;
15755                    }
15756                }
15757            }
15758
15759            # For Unicode::UCD, create a mapping of the prop=value to the
15760            # canonical =value for that property.
15761            if ($standard =~ /=/) {
15762
15763                # This could happen if a strict name mapped into an existing
15764                # loose name.  In that event, the strict names would have to
15765                # be moved to a new hash.
15766                if (exists($loose_to_standard_value{$standard})) {
15767                    Carp::my_carp_bug("'$standard' conflicts with a pre-existing use.  Bad News.  Continuing anyway");
15768                }
15769                $loose_to_standard_value{$standard} = $loose_table_name;
15770            }
15771
15772            # Keep a list of the deprecated properties and their filenames
15773            if ($deprecated && $complement == 0) {
15774                $Unicode::UCD::why_deprecated{$sub_filename} = $deprecated;
15775            }
15776
15777            # And a substitute table, if any, for case-insensitive matching
15778            if ($caseless_equivalent != 0) {
15779                $caseless_equivalent_to{$standard} = $caseless_equivalent;
15780            }
15781
15782            # Add to defaults list if the table this alias belongs to is the
15783            # default one
15784            $loose_defaults{$standard} = 1 if $is_default;
15785        }
15786    }
15787
15788    return;
15789}
15790
15791{   # Closure
15792    my %base_names;  # Names already used for avoiding DOS 8.3 filesystem
15793                     # conflicts
15794    my %full_dir_name_of;   # Full length names of directories used.
15795
15796    sub construct_filename($name, $mutable, $directories_ref) {
15797        # Return a file name for a table, based on the table name, but perhaps
15798        # changed to get rid of non-portable characters in it, and to make
15799        # sure that it is unique on a file system that allows the names before
15800        # any period to be at most 8 characters (DOS).  While we're at it
15801        # check and complain if there are any directory conflicts.
15802
15803        # $name                 # The name to start with
15804        # $mutable              # Boolean: can it be changed?  If no, but
15805                                # yet it must be to work properly, a warning
15806                                # is given
15807        # $directories_ref      # A reference to an array containing the
15808                                # path to the file, with each element one path
15809                                # component.  This is used because the same
15810                                # name can be used in different directories.
15811
15812        my $warn = ! defined wantarray;  # If true, then if the name is
15813                                # changed, a warning is issued as well.
15814
15815        if (! defined $name) {
15816            Carp::my_carp("Undefined name in directory "
15817                          . File::Spec->join(@$directories_ref)
15818                          . ". '_' used");
15819            return '_';
15820        }
15821
15822        # Make sure that no directory names conflict with each other.  Look at
15823        # each directory in the input file's path.  If it is already in use,
15824        # assume it is correct, and is merely being re-used, but if we
15825        # truncate it to 8 characters, and find that there are two directories
15826        # that are the same for the first 8 characters, but differ after that,
15827        # then that is a problem.
15828        foreach my $directory (@$directories_ref) {
15829            my $short_dir = substr($directory, 0, 8);
15830            if (defined $full_dir_name_of{$short_dir}) {
15831                next if $full_dir_name_of{$short_dir} eq $directory;
15832                Carp::my_carp("Directory $directory conflicts with directory $full_dir_name_of{$short_dir}.  Bad News.  Continuing anyway");
15833            }
15834            else {
15835                $full_dir_name_of{$short_dir} = $directory;
15836            }
15837        }
15838
15839        my $path = join '/', @$directories_ref;
15840        $path .= '/' if $path;
15841
15842        # Remove interior underscores.
15843        (my $filename = $name) =~ s/ (?<=.) _ (?=.) //xg;
15844
15845        # Convert the dot in floating point numbers to an underscore
15846        $filename =~ s/\./_/ if $filename =~ / ^ \d+ \. \d+ $ /x;
15847
15848        my $suffix = "";
15849
15850        # Extract any suffix, delete any non-word character, and truncate to 3
15851        # after the dot
15852        if ($filename =~ m/ ( .*? ) ( \. .* ) /x) {
15853            $filename = $1;
15854            $suffix = $2;
15855            $suffix =~ s/\W+//g;
15856            substr($suffix, 4) = "" if length($suffix) > 4;
15857        }
15858
15859        # Change any non-word character outside the suffix into an underscore,
15860        # and truncate to 8.
15861        $filename =~ s/\W+/_/g;   # eg., "L&" -> "L_"
15862        substr($filename, 8) = "" if length($filename) > 8;
15863
15864        # Make sure the basename doesn't conflict with something we
15865        # might have already written. If we have, say,
15866        #     InGreekExtended1
15867        #     InGreekExtended2
15868        # they become
15869        #     InGreekE
15870        #     InGreek2
15871        my $warned = 0;
15872        while (my $num = $base_names{$path}{lc "$filename$suffix"}++) {
15873            $num++; # so basenames with numbers start with '2', which
15874                    # just looks more natural.
15875
15876            # Want to append $num, but if it'll make the basename longer
15877            # than 8 characters, pre-truncate $filename so that the result
15878            # is acceptable.
15879            my $delta = length($filename) + length($num) - 8;
15880            if ($delta > 0) {
15881                substr($filename, -$delta) = $num;
15882            }
15883            else {
15884                $filename .= $num;
15885            }
15886            if ($warn && ! $warned) {
15887                $warned = 1;
15888                Carp::my_carp("'$path$name' conflicts with another name on a filesystem with 8 significant characters (like DOS).  Proceeding anyway.");
15889            }
15890        }
15891
15892        return $filename if $mutable;
15893
15894        # If not changeable, must return the input name, but warn if needed to
15895        # change it beyond shortening it.
15896        if ($name ne $filename
15897            && substr($name, 0, length($filename)) ne $filename) {
15898            Carp::my_carp("'$path$name' had to be changed into '$filename'.  Bad News.  Proceeding anyway.");
15899        }
15900        return $name;
15901    }
15902}
15903
15904# The pod file contains a very large table.  Many of the lines in that table
15905# would exceed a typical output window's size, and so need to be wrapped with
15906# a hanging indent to make them look good.  The pod language is really
15907# insufficient here.  There is no general construct to do that in pod, so it
15908# is done here by beginning each such line with a space to cause the result to
15909# be output without formatting, and doing all the formatting here.  This leads
15910# to the result that if the eventual display window is too narrow it won't
15911# look good, and if the window is too wide, no advantage is taken of that
15912# extra width.  A further complication is that the output may be indented by
15913# the formatter so that there is less space than expected.  What I (khw) have
15914# done is to assume that that indent is a particular number of spaces based on
15915# what it is in my Linux system;  people can always resize their windows if
15916# necessary, but this is obviously less than desirable, but the best that can
15917# be expected.
15918my $automatic_pod_indent = 8;
15919
15920# Try to format so that uses fewest lines, but few long left column entries
15921# slide into the right column.  An experiment on 5.1 data yielded the
15922# following percentages that didn't cut into the other side along with the
15923# associated first-column widths
15924# 69% = 24
15925# 80% not too bad except for a few blocks
15926# 90% = 33; # , cuts 353/3053 lines from 37 = 12%
15927# 95% = 37;
15928my $indent_info_column = 27;    # 75% of lines didn't have overlap
15929
15930my $FILLER = 3;     # Length of initial boiler-plate columns in a pod line
15931                    # The 3 is because of:
15932                    #   1   for the leading space to tell the pod formatter to
15933                    #       output as-is
15934                    #   1   for the flag
15935                    #   1   for the space between the flag and the main data
15936
15937sub format_pod_line($first_column_width, $entry, $info, $status = "", $loose_match = 1 ) {
15938    # Take a pod line and return it, formatted properly
15939
15940    # $entry Contents of left column
15941    # $info Contents of right column
15942
15943    my $flags = "";
15944    $flags .= $STRICTER if ! $loose_match;
15945
15946    $flags .= $status if $status;
15947
15948    # There is a blank in the left column to cause the pod formatter to
15949    # output the line as-is.
15950    return sprintf " %-*s%-*s %s\n",
15951                    # The first * in the format is replaced by this, the -1 is
15952                    # to account for the leading blank.  There isn't a
15953                    # hard-coded blank after this to separate the flags from
15954                    # the rest of the line, so that in the unlikely event that
15955                    # multiple flags are shown on the same line, they both
15956                    # will get displayed at the expense of that separation,
15957                    # but since they are left justified, a blank will be
15958                    # inserted in the normal case.
15959                    $FILLER - 1,
15960                    $flags,
15961
15962                    # The other * in the format is replaced by this number to
15963                    # cause the first main column to right fill with blanks.
15964                    # The -1 is for the guaranteed blank following it.
15965                    $first_column_width - $FILLER - 1,
15966                    $entry,
15967                    $info;
15968}
15969
15970my @zero_match_tables;  # List of tables that have no matches in this release
15971
15972sub make_re_pod_entries($input_table) {
15973    # This generates the entries for the pod file for a given table.
15974    # Also done at this time are any children tables.  The output looks like:
15975    # \p{Common}              \p{Script=Common} (Short: \p{Zyyy}) (5178)
15976
15977    # Generate parent and all its children at the same time.
15978    return if $input_table->parent != $input_table;
15979
15980    my $property = $input_table->property;
15981    my $type = $property->type;
15982    my $full_name = $property->full_name;
15983
15984    my $count = $input_table->count;
15985    my $unicode_count;
15986    my $non_unicode_string;
15987    if ($count > $MAX_UNICODE_CODEPOINTS) {
15988        $unicode_count = $count - ($MAX_WORKING_CODEPOINT
15989                                    - $MAX_UNICODE_CODEPOINT);
15990        $non_unicode_string = " plus all above-Unicode code points";
15991    }
15992    else {
15993        $unicode_count = $count;
15994        $non_unicode_string = "";
15995    }
15996
15997    my $string_count = clarify_number($unicode_count) . $non_unicode_string;
15998
15999    my $definition = $input_table->calculate_table_definition;
16000    if ($definition) {
16001
16002        # Save the definition for later use.
16003        $input_table->set_definition($definition);
16004
16005        $definition = ": $definition";
16006    }
16007
16008    my $status = $input_table->status;
16009    my $status_info = $input_table->status_info;
16010    my $caseless_equivalent = $input_table->caseless_equivalent;
16011
16012    # Don't mention a placeholder equivalent as it isn't to be listed in the
16013    # pod
16014    $caseless_equivalent = 0 if $caseless_equivalent != 0
16015                                && $caseless_equivalent->fate > $ORDINARY;
16016
16017    my $entry_for_first_table; # The entry for the first table output.
16018                           # Almost certainly, it is the parent.
16019
16020    # For each related table (including itself), we will generate a pod entry
16021    # for each name each table goes by
16022    foreach my $table ($input_table, $input_table->children) {
16023
16024        # Unicode::UCD cannot deal with null string property values, so skip
16025        # any tables that have no non-null names.
16026        next if ! grep { $_->name ne "" } $table->aliases;
16027
16028        # First, gather all the info that applies to this table as a whole.
16029
16030        push @zero_match_tables, $table if $count == 0
16031                                            # Don't mention special tables
16032                                            # as being zero length
16033                                           && $table->fate == $ORDINARY;
16034
16035        my $table_property = $table->property;
16036
16037        # The short name has all the underscores removed, while the full name
16038        # retains them.  Later, we decide whether to output a short synonym
16039        # for the full one, we need to compare apples to apples, so we use the
16040        # short name's length including underscores.
16041        my $table_property_short_name_length;
16042        my $table_property_short_name
16043            = $table_property->short_name(\$table_property_short_name_length);
16044        my $table_property_full_name = $table_property->full_name;
16045
16046        # Get how much savings there is in the short name over the full one
16047        # (delta will always be <= 0)
16048        my $table_property_short_delta = $table_property_short_name_length
16049                                         - length($table_property_full_name);
16050        my @table_description = $table->description;
16051        my @table_note = $table->note;
16052
16053        # Generate an entry for each alias in this table.
16054        my $entry_for_first_alias;  # saves the first one encountered.
16055        foreach my $alias ($table->aliases) {
16056
16057            # Skip if not to go in pod.
16058            next unless $alias->make_re_pod_entry;
16059
16060            # Start gathering all the components for the entry
16061            my $name = $alias->name;
16062
16063            # Skip if name is empty, as can't be accessed by regexes.
16064            next if $name eq "";
16065
16066            my $entry;      # Holds the left column, may include extras
16067            my $entry_ref;  # To refer to the left column's contents from
16068                            # another entry; has no extras
16069
16070            # First the left column of the pod entry.  Tables for the $perl
16071            # property always use the single form.
16072            if ($table_property == $perl) {
16073                $entry = "\\p{$name}";
16074                $entry .= " \\p$name" if length $name == 1; # Show non-braced
16075                                                            # form too
16076                $entry_ref = "\\p{$name}";
16077            }
16078            else {    # Compound form.
16079
16080                # Only generate one entry for all the aliases that mean true
16081                # or false in binary properties.  Append a '*' to indicate
16082                # some are missing.  (The heading comment notes this.)
16083                my $rhs;
16084                if ($type == $BINARY) {
16085                    next if $name ne 'N' && $name ne 'Y';
16086                    $rhs = "$name*";
16087                }
16088                elsif ($type != $FORCED_BINARY) {
16089                    $rhs = $name;
16090                }
16091                else {
16092
16093                    # Forced binary properties require special handling.  It
16094                    # has two sets of tables, one set is true/false; and the
16095                    # other set is everything else.  Entries are generated for
16096                    # each set.  Use the Bidi_Mirrored property (which appears
16097                    # in all Unicode versions) to get a list of the aliases
16098                    # for the true/false tables.  Of these, only output the N
16099                    # and Y ones, the same as, a regular binary property.  And
16100                    # output all the rest, same as a non-binary property.
16101                    my $bm = property_ref("Bidi_Mirrored");
16102                    if ($name eq 'N' || $name eq 'Y') {
16103                        $rhs = "$name*";
16104                    } elsif (grep { $name eq $_->name } $bm->table("Y")->aliases,
16105                                                        $bm->table("N")->aliases)
16106                    {
16107                        next;
16108                    }
16109                    else {
16110                        $rhs = $name;
16111                    }
16112                }
16113
16114                # Colon-space is used to give a little more space to be easier
16115                # to read;
16116                $entry = "\\p{"
16117                        . $table_property_full_name
16118                        . ": $rhs}";
16119
16120                # But for the reference to this entry, which will go in the
16121                # right column, where space is at a premium, use equals
16122                # without a space
16123                $entry_ref = "\\p{" . $table_property_full_name . "=$name}";
16124            }
16125
16126            # Then the right (info) column.  This is stored as components of
16127            # an array for the moment, then joined into a string later.  For
16128            # non-internal only properties, begin the info with the entry for
16129            # the first table we encountered (if any), as things are ordered
16130            # so that that one is the most descriptive.  This leads to the
16131            # info column of an entry being a more descriptive version of the
16132            # name column
16133            my @info;
16134            if ($name =~ /^_/) {
16135                push @info,
16136                        '(For internal use by Perl, not necessarily stable)';
16137            }
16138            elsif ($entry_for_first_alias) {
16139                push @info, $entry_for_first_alias;
16140            }
16141
16142            # If this entry is equivalent to another, add that to the info,
16143            # using the first such table we encountered
16144            if ($entry_for_first_table) {
16145                if (@info) {
16146                    push @info, "(= $entry_for_first_table)";
16147                }
16148                else {
16149                    push @info, $entry_for_first_table;
16150                }
16151            }
16152
16153            # If the name is a large integer, add an equivalent with an
16154            # exponent for better readability
16155            if ($name =~ /^[+-]?[\d]+$/ && $name >= 10_000) {
16156                push @info, sprintf "(= %.1e)", $name
16157            }
16158
16159            my $parenthesized = "";
16160            if (! $entry_for_first_alias) {
16161
16162                # This is the first alias for the current table.  The alias
16163                # array is ordered so that this is the fullest, most
16164                # descriptive alias, so it gets the fullest info.  The other
16165                # aliases are mostly merely pointers to this one, using the
16166                # information already added above.
16167
16168                # Display any status message, but only on the parent table
16169                if ($status && ! $entry_for_first_table) {
16170                    push @info, $status_info;
16171                }
16172
16173                # Put out any descriptive info
16174                if (@table_description || @table_note) {
16175                    push @info, join "; ", @table_description, @table_note;
16176                }
16177
16178                # Look to see if there is a shorter name we can point people
16179                # at
16180                my $standard_name = standardize($name);
16181                my $short_name;
16182                my $proposed_short = $table->short_name;
16183                if (defined $proposed_short) {
16184                    my $standard_short = standardize($proposed_short);
16185
16186                    # If the short name is shorter than the standard one, or
16187                    # even if it's not, but the combination of it and its
16188                    # short property name (as in \p{prop=short} ($perl doesn't
16189                    # have this form)) saves at least two characters, then,
16190                    # cause it to be listed as a shorter synonym.
16191                    if (length $standard_short < length $standard_name
16192                        || ($table_property != $perl
16193                            && (length($standard_short)
16194                                - length($standard_name)
16195                                + $table_property_short_delta)  # (<= 0)
16196                                < -2))
16197                    {
16198                        $short_name = $proposed_short;
16199                        if ($table_property != $perl) {
16200                            $short_name = $table_property_short_name
16201                                          . "=$short_name";
16202                        }
16203                        $short_name = "\\p{$short_name}";
16204                    }
16205                }
16206
16207                # And if this is a compound form name, see if there is a
16208                # single form equivalent
16209                my $single_form;
16210                if ($table_property != $perl && $table_property != $block) {
16211
16212                    # Special case the binary N tables, so that will print
16213                    # \P{single}, but use the Y table values to populate
16214                    # 'single', as we haven't likewise populated the N table.
16215                    # For forced binary tables, we can't just look at the N
16216                    # table, but must see if this table is equivalent to the N
16217                    # one, as there are two equivalent beasts in these
16218                    # properties.
16219                    my $test_table;
16220                    my $p;
16221                    if (   ($type == $BINARY
16222                            && $input_table == $property->table('No'))
16223                        || ($type == $FORCED_BINARY
16224                            && $property->table('No')->
16225                                        is_set_equivalent_to($input_table)))
16226                    {
16227                        $test_table = $property->table('Yes');
16228                        $p = 'P';
16229                    }
16230                    else {
16231                        $test_table = $input_table;
16232                        $p = 'p';
16233                    }
16234
16235                    # Look for a single form amongst all the children.
16236                    foreach my $table ($test_table->children) {
16237                        next if $table->property != $perl;
16238                        my $proposed_name = $table->short_name;
16239                        next if ! defined $proposed_name;
16240
16241                        # Don't mention internal-only properties as a possible
16242                        # single form synonym
16243                        next if substr($proposed_name, 0, 1) eq '_';
16244
16245                        $proposed_name = "\\$p\{$proposed_name}";
16246                        if (! defined $single_form
16247                            || length($proposed_name) < length $single_form)
16248                        {
16249                            $single_form = $proposed_name;
16250
16251                            # The goal here is to find a single form; not the
16252                            # shortest possible one.  We've already found a
16253                            # short name.  So, stop at the first single form
16254                            # found, which is likely to be closer to the
16255                            # original.
16256                            last;
16257                        }
16258                    }
16259                }
16260
16261                # Output both short and single in the same parenthesized
16262                # expression, but with only one of 'Single', 'Short' if there
16263                # are both items.
16264                if ($short_name || $single_form || $table->conflicting) {
16265                    $parenthesized .= "Short: $short_name" if $short_name;
16266                    if ($short_name && $single_form) {
16267                        $parenthesized .= ', ';
16268                    }
16269                    elsif ($single_form) {
16270                        $parenthesized .= 'Single: ';
16271                    }
16272                    $parenthesized .= $single_form if $single_form;
16273                }
16274            }
16275
16276            if ($caseless_equivalent != 0) {
16277                $parenthesized .=  '; ' if $parenthesized ne "";
16278                $parenthesized .= "/i= " . $caseless_equivalent->complete_name;
16279            }
16280
16281
16282            # Warn if this property isn't the same as one that a
16283            # semi-casual user might expect.  The other components of this
16284            # parenthesized structure are calculated only for the first entry
16285            # for this table, but the conflicting is deemed important enough
16286            # to go on every entry.
16287            my $conflicting = join " NOR ", $table->conflicting;
16288            if ($conflicting) {
16289                $parenthesized .=  '; ' if $parenthesized ne "";
16290                $parenthesized .= "NOT $conflicting";
16291            }
16292
16293            push @info, "($parenthesized)" if $parenthesized;
16294
16295            if ($name =~ /_$/ && $alias->loose_match) {
16296                push @info, "Note the trailing '_' matters in spite of loose matching rules.";
16297            }
16298
16299            if ($table_property != $perl && $table->perl_extension) {
16300                push @info, '(Perl extension)';
16301            }
16302            my $definition = $table->definition // "";
16303            $definition = "" if $entry_for_first_alias;
16304            $definition = ": $definition" if $definition;
16305            push @info, "($string_count$definition)";
16306
16307            # Now, we have both the entry and info so add them to the
16308            # list of all the properties.
16309            push @match_properties,
16310                format_pod_line($indent_info_column,
16311                                $entry,
16312                                join( " ", @info),
16313                                $alias->status,
16314                                $alias->loose_match);
16315
16316            $entry_for_first_alias = $entry_ref unless $entry_for_first_alias;
16317        } # End of looping through the aliases for this table.
16318
16319        if (! $entry_for_first_table) {
16320            $entry_for_first_table = $entry_for_first_alias;
16321        }
16322    } # End of looping through all the related tables
16323    return;
16324}
16325
16326sub make_ucd_table_pod_entries($table) {
16327    # Generate the entries for the UCD section of the pod for $table.  This
16328    # also calculates if names are ambiguous, so has to be called even if the
16329    # pod is not being output
16330
16331    my $short_name = $table->name;
16332    my $standard_short_name = standardize($short_name);
16333    my $full_name = $table->full_name;
16334    my $standard_full_name = standardize($full_name);
16335
16336    my $full_info = "";     # Text of info column for full-name entries
16337    my $other_info = "";    # Text of info column for short-name entries
16338    my $short_info = "";    # Text of info column for other entries
16339    my $meaning = "";       # Synonym of this table
16340
16341    my $property = ($table->isa('Property'))
16342                   ? $table
16343                   : $table->parent->property;
16344
16345    my $perl_extension = $table->perl_extension;
16346    my $is_perl_extension_match_table_but_not_dollar_perl
16347                                                        = $property != $perl
16348                                                       && $perl_extension
16349                                                       && $property != $table;
16350
16351    # Get the more official name for perl extensions that aren't
16352    # stand-alone properties
16353    if ($is_perl_extension_match_table_but_not_dollar_perl) {
16354        if ($property->type == $BINARY) {
16355            $meaning = $property->full_name;
16356        }
16357        else {
16358            $meaning = $table->parent->complete_name;
16359        }
16360    }
16361
16362    # There are three types of info column.  One for the short name, one for
16363    # the full name, and one for everything else.  They mostly are the same,
16364    # so initialize in the same loop.
16365
16366    foreach my $info_ref (\$full_info, \$short_info, \$other_info) {
16367        if ($info_ref != \$full_info) {
16368
16369            # The non-full name columns include the full name
16370            $$info_ref .= $full_name;
16371        }
16372
16373
16374        if ($is_perl_extension_match_table_but_not_dollar_perl) {
16375
16376            # Add the synonymous name for the non-full name entries; and to
16377            # the full-name entry if it adds extra information
16378            if (   standardize($meaning) ne $standard_full_name
16379                || $info_ref == \$other_info
16380                || $info_ref == \$short_info)
16381            {
16382                my $parenthesized =  $info_ref != \$full_info;
16383                $$info_ref .= " " if $$info_ref && $parenthesized;
16384                $$info_ref .= "(=" if $parenthesized;
16385                $$info_ref .= "$meaning";
16386                $$info_ref .= ")" if $parenthesized;
16387                $$info_ref .= ".";
16388            }
16389        }
16390
16391        # And the full-name entry includes the short name, if shorter
16392        if ($info_ref == \$full_info
16393            && length $standard_short_name < length $standard_full_name)
16394        {
16395            $full_info =~ s/\.\Z//;
16396            $full_info .= "  " if $full_info;
16397            $full_info .= "(Short: $short_name)";
16398        }
16399
16400        if ($table->perl_extension) {
16401            $$info_ref =~ s/\.\Z//;
16402            $$info_ref .= ".  " if $$info_ref;
16403            $$info_ref .= "(Perl extension)";
16404        }
16405    }
16406
16407    my $definition;
16408    my $definition_table;
16409    my $type = $table->property->type;
16410    if ($type == $BINARY || $type == $FORCED_BINARY) {
16411        $definition_table = $table->property->table('Y');
16412    }
16413    elsif ($table->isa('Match_Table')) {
16414        $definition_table = $table;
16415    }
16416
16417    $definition = $definition_table->calculate_table_definition
16418                                            if defined $definition_table
16419                                                    && $definition_table != 0;
16420
16421    # Add any extra annotations to the full name entry
16422    foreach my $more_info ($table->description,
16423                            $definition,
16424                            $table->note,
16425                            $table->status_info)
16426    {
16427        next unless $more_info;
16428        $full_info =~ s/\.\Z//;
16429        $full_info .= ".  " if $full_info;
16430        $full_info .= $more_info;
16431    }
16432    if ($table->property->type == $FORCED_BINARY) {
16433        if ($full_info) {
16434            $full_info =~ s/\.\Z//;
16435            $full_info .= ".  ";
16436        }
16437        $full_info .= "This is a combination property which has both:"
16438                    . " 1) a map to various string values; and"
16439                    . " 2) a map to boolean Y/N, where 'Y' means the"
16440                    . " string value is non-empty.  Add the prefix 'is'"
16441                    . " to the prop_invmap() call to get the latter";
16442    }
16443
16444    # These keep track if have created full and short name pod entries for the
16445    # property
16446    my $done_full = 0;
16447    my $done_short = 0;
16448
16449    # Every possible name is kept track of, even those that aren't going to be
16450    # output.  This way we can be sure to find the ambiguities.
16451    foreach my $alias ($table->aliases) {
16452        my $name = $alias->name;
16453        my $standard = standardize($name);
16454        my $info;
16455        my $output_this = $alias->ucd;
16456
16457        # If the full and short names are the same, we want to output the full
16458        # one's entry, so it has priority.
16459        if ($standard eq $standard_full_name) {
16460            next if $done_full;
16461            $done_full = 1;
16462            $info = $full_info;
16463        }
16464        elsif ($standard eq $standard_short_name) {
16465            next if $done_short;
16466            $done_short = 1;
16467            next if $standard_short_name eq $standard_full_name;
16468            $info = $short_info;
16469        }
16470        else {
16471            $info = $other_info;
16472        }
16473
16474        $combination_property{$standard} = 1
16475                                  if $table->property->type == $FORCED_BINARY;
16476
16477        # Here, we have set up the two columns for this entry.  But if an
16478        # entry already exists for this name, we have to decide which one
16479        # we're going to later output.
16480        if (exists $ucd_pod{$standard}) {
16481
16482            # If the two entries refer to the same property, it's not going to
16483            # be ambiguous.  (Likely it's because the names when standardized
16484            # are the same.)  But that means if they are different properties,
16485            # there is ambiguity.
16486            if ($ucd_pod{$standard}->{'property'} != $property) {
16487
16488                # Here, we have an ambiguity.  This code assumes that one is
16489                # scheduled to be output and one not and that one is a perl
16490                # extension (which is not to be output) and the other isn't.
16491                # If those assumptions are wrong, things have to be rethought.
16492                if ($ucd_pod{$standard}{'output_this'} == $output_this
16493                    || $ucd_pod{$standard}{'perl_extension'} == $perl_extension
16494                    || $output_this == $perl_extension)
16495                {
16496                    Carp::my_carp("Bad news.  $property and $ucd_pod{$standard}->{'property'} have unexpected output status and perl-extension combinations.  Proceeding anyway.");
16497                }
16498
16499                # We modify the info column of the one being output to
16500                # indicate the ambiguity.  Set $which to point to that one's
16501                # info.
16502                my $which;
16503                if ($ucd_pod{$standard}{'output_this'}) {
16504                    $which = \$ucd_pod{$standard}->{'info'};
16505                }
16506                else {
16507                    $which = \$info;
16508                    $meaning = $ucd_pod{$standard}{'meaning'};
16509                }
16510
16511                chomp $$which;
16512                $$which =~ s/\.\Z//;
16513                $$which .= "; NOT '$standard' meaning '$meaning'";
16514
16515                $ambiguous_names{$standard} = 1;
16516            }
16517
16518            # Use the non-perl-extension variant
16519            next unless $ucd_pod{$standard}{'perl_extension'};
16520        }
16521
16522        # Store enough information about this entry that we can later look for
16523        # ambiguities, and output it properly.
16524        $ucd_pod{$standard} = { 'name' => $name,
16525                                'info' => $info,
16526                                'meaning' => $meaning,
16527                                'output_this' => $output_this,
16528                                'perl_extension' => $perl_extension,
16529                                'property' => $property,
16530                                'status' => $alias->status,
16531        };
16532    } # End of looping through all this table's aliases
16533
16534    return;
16535}
16536
16537sub pod_alphanumeric_sort {
16538    # Sort pod entries alphanumerically.
16539
16540    # The first few character columns are filler, plus the '\p{'; and get rid
16541    # of all the trailing stuff, starting with the trailing '}', so as to sort
16542    # on just 'Name=Value'
16543    (my $a = lc $a) =~ s/^ .*? \{ //x;
16544    $a =~ s/}.*//;
16545    (my $b = lc $b) =~ s/^ .*? \{ //x;
16546    $b =~ s/}.*//;
16547
16548    # Determine if the two operands are both internal only or both not.
16549    # Character 0 should be a '\'; 1 should be a p; 2 should be '{', so 3
16550    # should be the underscore that begins internal only
16551    my $a_is_internal = (substr($a, 0, 1) eq '_');
16552    my $b_is_internal = (substr($b, 0, 1) eq '_');
16553
16554    # Sort so the internals come last in the table instead of first (which the
16555    # leading underscore would otherwise indicate).
16556    if ($a_is_internal != $b_is_internal) {
16557        return 1 if $a_is_internal;
16558        return -1
16559    }
16560
16561    # Determine if the two operands are compound or not, and if so if are
16562    # "numeric" property values or not, like \p{Age: 3.0}.  But there are also
16563    # things like \p{Canonical_Combining_Class: CCC133} and \p{Age: V10_0},
16564    # all of which this considers numeric, and for sorting, looks just at the
16565    # numeric parts.  It can also be a rational like \p{Numeric Value=-1/2}.
16566    my $split_re = qr/
16567        ^ ( [^:=]+ ) # $1 is undef if not a compound form, otherwise is the
16568                     # property name
16569        [:=] \s*     # The syntax for the compound form
16570        (?:          # followed by ...
16571            (        # $2 gets defined if what follows is a "numeric"
16572                     # expression, which is ...
16573              ( -? \d+ (?: [.\/] \d+)?  # An integer, float, or rational
16574                                        # number, optionally signed
16575               | [[:alpha:]]{2,} \d+ $ ) # or something like CCC131.  Either
16576                                         # of these go into $3
16577             | ( V \d+ _ \d+ )           # or a Unicode's Age property version
16578                                         # number, into $4
16579            )
16580            | .* $    # If not "numeric", accept anything so that $1 gets
16581                      # defined if it is any compound form
16582        ) /ix;
16583    my ($a_initial, $a_numeric, $a_number, $a_version) = ($a =~ $split_re);
16584    my ($b_initial, $b_numeric, $b_number, $b_version) = ($b =~ $split_re);
16585
16586    # Sort alphabeticlly on the whole property name if either operand isn't
16587    # compound, or they differ.
16588    return $a cmp $b if   ! defined $a_initial
16589                       || ! defined $b_initial
16590                       || $a_initial ne $b_initial;
16591
16592    if (! defined $a_numeric) {
16593
16594        # If neither is numeric, use alpha sort
16595        return $a cmp $b if ! defined $b_numeric;
16596        return 1;  # Sort numeric ahead of alpha
16597    }
16598
16599    # Here $a is numeric
16600    return -1 if ! defined $b_numeric;  # Numeric sorts before alpha
16601
16602    # Here they are both numeric in the same property.
16603    # Convert version numbers into regular numbers
16604    if (defined $a_version) {
16605        ($a_number = $a_version) =~ s/^V//i;
16606        $a_number =~ s/_/./;
16607    }
16608    else {  # Otherwise get rid of the, e.g., CCC in CCC9 */
16609        $a_number =~ s/ ^ [[:alpha:]]+ //x;
16610    }
16611    if (defined $b_version) {
16612        ($b_number = $b_version) =~ s/^V//i;
16613        $b_number =~ s/_/./;
16614    }
16615    else {
16616        $b_number =~ s/ ^ [[:alpha:]]+ //x;
16617    }
16618
16619    # Convert rationals to floating for the comparison.
16620    $a_number = eval $a_number if $a_number =~ qr{/};
16621    $b_number = eval $b_number if $b_number =~ qr{/};
16622
16623    return $a_number <=> $b_number || $a cmp $b;
16624}
16625
16626sub make_pod () {
16627    # Create the .pod file.  This generates the various subsections and then
16628    # combines them in one big HERE document.
16629
16630    my $Is_flags_text = "If an entry has flag(s) at its beginning, like \"$DEPRECATED\", the \"Is_\" form has the same flag(s)";
16631
16632    return unless defined $pod_directory;
16633    print "Making pod file\n" if $verbosity >= $PROGRESS;
16634
16635    my $exception_message =
16636    '(Any exceptions are individually noted beginning with the word NOT.)';
16637    my @block_warning;
16638    if (-e 'Blocks.txt') {
16639
16640        # Add the line: '\p{In_*}    \p{Block: *}', with the warning message
16641        # if the global $has_In_conflicts indicates we have them.
16642        push @match_properties, format_pod_line($indent_info_column,
16643                                                '\p{In_*}',
16644                                                '\p{Block: *}'
16645                                                    . (($has_In_conflicts)
16646                                                      ? " $exception_message"
16647                                                      : ""),
16648                                                 $DISCOURAGED);
16649        @block_warning = << "END";
16650
16651In particular, matches in the Block property have single forms
16652defined by Perl that begin with C<"In_">, C<"Is_>, or even with no prefix at
16653all,  Like all B<DISCOURAGED> forms, these are not stable.  For example,
16654C<\\p{Block=Deseret}> can currently be written as C<\\p{In_Deseret}>,
16655C<\\p{Is_Deseret}>, or C<\\p{Deseret}>.  But, a new Unicode version may
16656come along that would force Perl to change the meaning of one or more of
16657these, and your program would no longer be correct.  Currently there are no
16658such conflicts with the form that begins C<"In_">, but there are many with the
16659other two shortcuts, and Unicode continues to define new properties that begin
16660with C<"In">, so it's quite possible that a conflict will occur in the future.
16661The compound form is guaranteed to not become obsolete, and its meaning is
16662clearer anyway.  See L<perlunicode/"Blocks"> for more information about this.
16663
16664User-defined properties must begin with "In" or "Is".  These override any
16665Unicode property of the same name.
16666END
16667    }
16668    my $text = $Is_flags_text;
16669    $text = "$exception_message $text" if $has_Is_conflicts;
16670
16671    # And the 'Is_ line';
16672    push @match_properties, format_pod_line($indent_info_column,
16673                                            '\p{Is_*}',
16674                                            "\\p{*} $text");
16675    push @match_properties, format_pod_line($indent_info_column,
16676            '\p{Name=*}',
16677            "Combination of Name and Name_Alias properties; has special"
16678          . " loose matching rules, for which see Unicode UAX #44");
16679    push @match_properties, format_pod_line($indent_info_column,
16680                                            '\p{Na=*}',
16681                                            '\p{Name=*}');
16682
16683    # Sort the properties array for output.  It is sorted alphabetically
16684    # except numerically for numeric properties, and only output unique lines.
16685    @match_properties = sort pod_alphanumeric_sort uniques @match_properties;
16686
16687    my $formatted_properties = simple_fold(\@match_properties,
16688                                        "",
16689                                        # indent succeeding lines by two extra
16690                                        # which looks better
16691                                        $indent_info_column + 2,
16692
16693                                        # shorten the line length by how much
16694                                        # the formatter indents, so the folded
16695                                        # line will fit in the space
16696                                        # presumably available
16697                                        $automatic_pod_indent);
16698    # Add column headings, indented to be a little more centered, but not
16699    # exactly
16700    $formatted_properties =  format_pod_line($indent_info_column,
16701                                                    '    NAME',
16702                                                    '           INFO')
16703                                    . "\n"
16704                                    . $formatted_properties;
16705
16706    # Generate pod documentation lines for the tables that match nothing
16707    my $zero_matches = "";
16708    if (@zero_match_tables) {
16709        @zero_match_tables = uniques(@zero_match_tables);
16710        $zero_matches = join "\n\n",
16711                        map { $_ = '=item \p{' . $_->complete_name . "}" }
16712                            sort { $a->complete_name cmp $b->complete_name }
16713                            @zero_match_tables;
16714
16715        $zero_matches = <<END;
16716
16717=head2 Legal C<\\p{}> and C<\\P{}> constructs that match no characters
16718
16719Unicode has some property-value pairs that currently don't match anything.
16720This happens generally either because they are obsolete, or they exist for
16721symmetry with other forms, but no language has yet been encoded that uses
16722them.  In this version of Unicode, the following match zero code points:
16723
16724=over 4
16725
16726$zero_matches
16727
16728=back
16729
16730END
16731    }
16732
16733    # Generate list of properties that we don't accept, grouped by the reasons
16734    # why.  This is so only put out the 'why' once, and then list all the
16735    # properties that have that reason under it.
16736
16737    my %why_list;   # The keys are the reasons; the values are lists of
16738                    # properties that have the key as their reason
16739
16740    # For each property, add it to the list that are suppressed for its reason
16741    # The sort will cause the alphabetically first properties to be added to
16742    # each list first, so each list will be sorted.
16743    foreach my $property (sort keys %why_suppressed) {
16744        next unless $why_suppressed{$property};
16745        push @{$why_list{$why_suppressed{$property}}}, $property;
16746    }
16747
16748    # For each reason (sorted by the first property that has that reason)...
16749    my @bad_re_properties;
16750    foreach my $why (sort { $why_list{$a}->[0] cmp $why_list{$b}->[0] }
16751                     keys %why_list)
16752    {
16753        # Add to the output, all the properties that have that reason.
16754        my $has_item = 0;   # Flag if actually output anything.
16755        foreach my $name (@{$why_list{$why}}) {
16756
16757            # Split compound names into $property and $table components
16758            my $property = $name;
16759            my $table;
16760            if ($property =~ / (.*) = (.*) /x) {
16761                $property = $1;
16762                $table = $2;
16763            }
16764
16765            # This release of Unicode may not have a property that is
16766            # suppressed, so don't reference a non-existent one.
16767            $property = property_ref($property);
16768            next if ! defined $property;
16769
16770            # And since this list is only for match tables, don't list the
16771            # ones that don't have match tables.
16772            next if ! $property->to_create_match_tables;
16773
16774            # Find any abbreviation, and turn it into a compound name if this
16775            # is a property=value pair.
16776            my $short_name = $property->name;
16777            $short_name .= '=' . $property->table($table)->name if $table;
16778
16779            # Start with an empty line.
16780            push @bad_re_properties, "\n\n" unless $has_item;
16781
16782            # And add the property as an item for the reason.
16783            push @bad_re_properties, "\n=item I<$name> ($short_name)\n";
16784            $has_item = 1;
16785        }
16786
16787        # And add the reason under the list of properties, if such a list
16788        # actually got generated.  Note that the header got added
16789        # unconditionally before.  But pod ignores extra blank lines, so no
16790        # harm.
16791        push @bad_re_properties, "\n$why\n" if $has_item;
16792
16793    } # End of looping through each reason.
16794
16795    if (! @bad_re_properties) {
16796        push @bad_re_properties,
16797                "*** This installation accepts ALL non-Unihan properties ***";
16798    }
16799    else {
16800        # Add =over only if non-empty to avoid an empty =over/=back section,
16801        # which is considered bad form.
16802        unshift @bad_re_properties, "\n=over 4\n";
16803        push @bad_re_properties, "\n=back\n";
16804    }
16805
16806    # Similarly, generate a list of files that we don't use, grouped by the
16807    # reasons why (Don't output if the reason is empty).  First, create a hash
16808    # whose keys are the reasons, and whose values are anonymous arrays of all
16809    # the files that share that reason.
16810    my %grouped_by_reason;
16811    foreach my $file (keys %skipped_files) {
16812        next unless $skipped_files{$file};
16813        push @{$grouped_by_reason{$skipped_files{$file}}}, $file;
16814    }
16815
16816    # Then, sort each group.
16817    foreach my $group (keys %grouped_by_reason) {
16818        @{$grouped_by_reason{$group}} = sort { lc $a cmp lc $b }
16819                                        @{$grouped_by_reason{$group}} ;
16820    }
16821
16822    # Finally, create the output text.  For each reason (sorted by the
16823    # alphabetically first file that has that reason)...
16824    my @unused_files;
16825    foreach my $reason (sort { lc $grouped_by_reason{$a}->[0]
16826                               cmp lc $grouped_by_reason{$b}->[0]
16827                              }
16828                         keys %grouped_by_reason)
16829    {
16830        # Add all the files that have that reason to the output.  Start
16831        # with an empty line.
16832        push @unused_files, "\n\n";
16833        push @unused_files, map { "\n=item F<$_> \n" }
16834                            @{$grouped_by_reason{$reason}};
16835        # And add the reason under the list of files
16836        push @unused_files, "\n$reason\n";
16837    }
16838
16839    # Similarly, create the output text for the UCD section of the pod
16840    my @ucd_pod;
16841    foreach my $key (keys %ucd_pod) {
16842        next unless $ucd_pod{$key}->{'output_this'};
16843        push @ucd_pod, format_pod_line($indent_info_column,
16844                                       $ucd_pod{$key}->{'name'},
16845                                       $ucd_pod{$key}->{'info'},
16846                                       $ucd_pod{$key}->{'status'},
16847                                      );
16848    }
16849
16850    # Sort alphabetically, and fold for output
16851    @ucd_pod = sort { lc substr($a, 2) cmp lc substr($b, 2) } @ucd_pod;
16852    my $ucd_pod = simple_fold(\@ucd_pod,
16853                           ' ',
16854                           $indent_info_column,
16855                           $automatic_pod_indent);
16856    $ucd_pod =  format_pod_line($indent_info_column, 'NAME', '  INFO')
16857                . "\n"
16858                . $ucd_pod;
16859    my $space_hex = sprintf("%02x", ord " ");
16860    local $" = "";
16861
16862    # Everything is ready to assemble.
16863    my @OUT = << "END";
16864=begin comment
16865
16866$HEADER
16867
16868To change this file, edit $0 instead.
16869
16870=end comment
16871
16872=head1 NAME
16873
16874$pod_file - Index of Unicode Version $unicode_version character properties in Perl
16875
16876=head1 DESCRIPTION
16877
16878This document provides information about the portion of the Unicode database
16879that deals with character properties, that is the portion that is defined on
16880single code points.  (L</Other information in the Unicode data base>
16881below briefly mentions other data that Unicode provides.)
16882
16883Perl can provide access to all non-provisional Unicode character properties,
16884though not all are enabled by default.  The omitted ones are the Unihan
16885properties and certain
16886deprecated or Unicode-internal properties.  (An installation may choose to
16887recompile Perl's tables to change this.  See L</Unicode character
16888properties that are NOT accepted by Perl>.)
16889
16890For most purposes, access to Unicode properties from the Perl core is through
16891regular expression matches, as described in the next section.
16892For some special purposes, and to access the properties that are not suitable
16893for regular expression matching, all the Unicode character properties that
16894Perl handles are accessible via the standard L<Unicode::UCD> module, as
16895described in the section L</Properties accessible through Unicode::UCD>.
16896
16897Perl also provides some additional extensions and short-cut synonyms
16898for Unicode properties.
16899
16900This document merely lists all available properties and does not attempt to
16901explain what each property really means.  There is a brief description of each
16902Perl extension; see L<perlunicode/Other Properties> for more information on
16903these.  There is some detail about Blocks, Scripts, General_Category,
16904and Bidi_Class in L<perlunicode>, but to find out about the intricacies of the
16905official Unicode properties, refer to the Unicode standard.  A good starting
16906place is L<$unicode_reference_url>.
16907
16908Note that you can define your own properties; see
16909L<perlunicode/"User-Defined Character Properties">.
16910
16911=head1 Properties accessible through C<\\p{}> and C<\\P{}>
16912
16913The Perl regular expression C<\\p{}> and C<\\P{}> constructs give access to
16914most of the Unicode character properties.  The table below shows all these
16915constructs, both single and compound forms.
16916
16917B<Compound forms> consist of two components, separated by an equals sign or a
16918colon.  The first component is the property name, and the second component is
16919the particular value of the property to match against, for example,
16920C<\\p{Script_Extensions: Greek}> and C<\\p{Script_Extensions=Greek}> both mean
16921to match characters whose Script_Extensions property value is Greek.
16922(C<Script_Extensions> is an improved version of the C<Script> property.)
16923
16924B<Single forms>, like C<\\p{Greek}>, are mostly Perl-defined shortcuts for
16925their equivalent compound forms.  The table shows these equivalences.  (In our
16926example, C<\\p{Greek}> is a just a shortcut for
16927C<\\p{Script_Extensions=Greek}>).  There are also a few Perl-defined single
16928forms that are not shortcuts for a compound form.  One such is C<\\p{Word}>.
16929These are also listed in the table.
16930
16931In parsing these constructs, Perl always ignores Upper/lower case differences
16932everywhere within the {braces}.  Thus C<\\p{Greek}> means the same thing as
16933C<\\p{greek}>.  But note that changing the case of the C<"p"> or C<"P"> before
16934the left brace completely changes the meaning of the construct, from "match"
16935(for C<\\p{}>) to "doesn't match" (for C<\\P{}>).  Casing in this document is
16936for improved legibility.
16937
16938Also, white space, hyphens, and underscores are normally ignored
16939everywhere between the {braces}, and hence can be freely added or removed
16940even if the C</x> modifier hasn't been specified on the regular expression.
16941But in the table below $a_bold_stricter at the beginning of an entry
16942means that tighter (stricter) rules are used for that entry:
16943
16944=over 4
16945
16946=over 4
16947
16948=item Single form (C<\\p{name}>) tighter rules:
16949
16950White space, hyphens, and underscores ARE significant
16951except for:
16952
16953=over 4
16954
16955=item * white space adjacent to a non-word character
16956
16957=item * underscores separating digits in numbers
16958
16959=back
16960
16961That means, for example, that you can freely add or remove white space
16962adjacent to (but within) the braces without affecting the meaning.
16963
16964=item Compound form (C<\\p{name=value}> or C<\\p{name:value}>) tighter rules:
16965
16966The tighter rules given above for the single form apply to everything to the
16967right of the colon or equals; the looser rules still apply to everything to
16968the left.
16969
16970That means, for example, that you can freely add or remove white space
16971adjacent to (but within) the braces and the colon or equal sign.
16972
16973=back
16974
16975=back
16976
16977Some properties are considered obsolete by Unicode, but still available.
16978There are several varieties of obsolescence:
16979
16980=over 4
16981
16982=over 4
16983
16984=item Stabilized
16985
16986A property may be stabilized.  Such a determination does not indicate
16987that the property should or should not be used; instead it is a declaration
16988that the property will not be maintained nor extended for newly encoded
16989characters.  Such properties are marked with $a_bold_stabilized in the
16990table.
16991
16992=item Deprecated
16993
16994A property may be deprecated, perhaps because its original intent
16995has been replaced by another property, or because its specification was
16996somehow defective.  This means that its use is strongly
16997discouraged, so much so that a warning will be issued if used, unless the
16998regular expression is in the scope of a C<S<no warnings 'deprecated'>>
16999statement.  $A_bold_deprecated flags each such entry in the table, and
17000the entry there for the longest, most descriptive version of the property will
17001give the reason it is deprecated, and perhaps advice.  Perl may issue such a
17002warning, even for properties that aren't officially deprecated by Unicode,
17003when there used to be characters or code points that were matched by them, but
17004no longer.  This is to warn you that your program may not work like it did on
17005earlier Unicode releases.
17006
17007A deprecated property may be made unavailable in a future Perl version, so it
17008is best to move away from them.
17009
17010A deprecated property may also be stabilized, but this fact is not shown.
17011
17012=item Obsolete
17013
17014Properties marked with $a_bold_obsolete in the table are considered (plain)
17015obsolete.  Generally this designation is given to properties that Unicode once
17016used for internal purposes (but not any longer).
17017
17018=item Discouraged
17019
17020This is not actually a Unicode-specified obsolescence, but applies to certain
17021Perl extensions that are present for backwards compatibility, but are
17022discouraged from being used.  These are not obsolete, but their meanings are
17023not stable.  Future Unicode versions could force any of these extensions to be
17024removed without warning, replaced by another property with the same name that
17025means something different.  $A_bold_discouraged flags each such entry in the
17026table.  Use the equivalent shown instead.
17027
17028@block_warning
17029
17030=back
17031
17032=back
17033
17034The table below has two columns.  The left column contains the C<\\p{}>
17035constructs to look up, possibly preceded by the flags mentioned above; and
17036the right column contains information about them, like a description, or
17037synonyms.  The table shows both the single and compound forms for each
17038property that has them.  If the left column is a short name for a property,
17039the right column will give its longer, more descriptive name; and if the left
17040column is the longest name, the right column will show any equivalent shortest
17041name, in both single and compound forms if applicable.
17042
17043If braces are not needed to specify a property (e.g., C<\\pL>), the left
17044column contains both forms, with and without braces.
17045
17046The right column will also caution you if a property means something different
17047than what might normally be expected.
17048
17049All single forms are Perl extensions; a few compound forms are as well, and
17050are noted as such.
17051
17052Numbers in (parentheses) indicate the total number of Unicode code points
17053matched by the property.  For the entries that give the longest, most
17054descriptive version of the property, the count is followed by a list of some
17055of the code points matched by it.  The list includes all the matched
17056characters in the 0-255 range, enclosed in the familiar [brackets] the same as
17057a regular expression bracketed character class.  Following that, the next few
17058higher matching ranges are also given.  To avoid visual ambiguity, the SPACE
17059character is represented as C<\\x$space_hex>.
17060
17061For emphasis, those properties that match no code points at all are listed as
17062well in a separate section following the table.
17063
17064Most properties match the same code points regardless of whether C<"/i">
17065case-insensitive matching is specified or not.  But a few properties are
17066affected.  These are shown with the notation S<C<(/i= I<other_property>)>>
17067in the second column.  Under case-insensitive matching they match the
17068same code pode points as the property I<other_property>.
17069
17070There is no description given for most non-Perl defined properties (See
17071L<$unicode_reference_url> for that).
17072
17073For compactness, 'B<*>' is used as a wildcard instead of showing all possible
17074combinations.  For example, entries like:
17075
17076 \\p{Gc: *}                                  \\p{General_Category: *}
17077
17078mean that 'Gc' is a synonym for 'General_Category', and anything that is valid
17079for the latter is also valid for the former.  Similarly,
17080
17081 \\p{Is_*}                                   \\p{*}
17082
17083means that if and only if, for example, C<\\p{Foo}> exists, then
17084C<\\p{Is_Foo}> and C<\\p{IsFoo}> are also valid and all mean the same thing.
17085And similarly, C<\\p{Foo=Bar}> means the same as C<\\p{Is_Foo=Bar}> and
17086C<\\p{IsFoo=Bar}>.  "*" here is restricted to something not beginning with an
17087underscore.
17088
17089Also, in binary properties, 'Yes', 'T', and 'True' are all synonyms for 'Y'.
17090And 'No', 'F', and 'False' are all synonyms for 'N'.  The table shows 'Y*' and
17091'N*' to indicate this, and doesn't have separate entries for the other
17092possibilities.  Note that not all properties which have values 'Yes' and 'No'
17093are binary, and they have all their values spelled out without using this wild
17094card, and a C<NOT> clause in their description that highlights their not being
17095binary.  These also require the compound form to match them, whereas true
17096binary properties have both single and compound forms available.
17097
17098Note that all non-essential underscores are removed in the display of the
17099short names below.
17100
17101B<Legend summary:>
17102
17103=over 4
17104
17105=item Z<>B<*> is a wild-card
17106
17107=item B<(\\d+)> in the info column gives the number of Unicode code points matched
17108by this property.
17109
17110=item B<$DEPRECATED> means this is deprecated.
17111
17112=item B<$OBSOLETE> means this is obsolete.
17113
17114=item B<$STABILIZED> means this is stabilized.
17115
17116=item B<$STRICTER> means tighter (stricter) name matching applies.
17117
17118=item B<$DISCOURAGED> means use of this form is discouraged, and may not be
17119stable.
17120
17121=back
17122
17123$formatted_properties
17124
17125$zero_matches
17126
17127=head1 Properties accessible through Unicode::UCD
17128
17129The value of any Unicode (not including Perl extensions) character
17130property mentioned above for any single code point is available through
17131L<Unicode::UCD/charprop()>.  L<Unicode::UCD/charprops_all()> returns the
17132values of all the Unicode properties for a given code point.
17133
17134Besides these, all the Unicode character properties mentioned above
17135(except for those marked as for internal use by Perl) are also
17136accessible by L<Unicode::UCD/prop_invlist()>.
17137
17138Due to their nature, not all Unicode character properties are suitable for
17139regular expression matches, nor C<prop_invlist()>.  The remaining
17140non-provisional, non-internal ones are accessible via
17141L<Unicode::UCD/prop_invmap()> (except for those that this Perl installation
17142hasn't included; see L<below for which those are|/Unicode character properties
17143that are NOT accepted by Perl>).
17144
17145For compatibility with other parts of Perl, all the single forms given in the
17146table in the L<section above|/Properties accessible through \\p{} and \\P{}>
17147are recognized.  BUT, there are some ambiguities between some Perl extensions
17148and the Unicode properties, all of which are silently resolved in favor of the
17149official Unicode property.  To avoid surprises, you should only use
17150C<prop_invmap()> for forms listed in the table below, which omits the
17151non-recommended ones.  The affected forms are the Perl single form equivalents
17152of Unicode properties, such as C<\\p{sc}> being a single-form equivalent of
17153C<\\p{gc=sc}>, which is treated by C<prop_invmap()> as the C<Script> property,
17154whose short name is C<sc>.  The table indicates the current ambiguities in the
17155INFO column, beginning with the word C<"NOT">.
17156
17157The standard Unicode properties listed below are documented in
17158L<$unicode_reference_url>; Perl_Decimal_Digit is documented in
17159L<Unicode::UCD/prop_invmap()>.  The other Perl extensions are in
17160L<perlunicode/Other Properties>;
17161
17162The first column in the table is a name for the property; the second column is
17163an alternative name, if any, plus possibly some annotations.  The alternative
17164name is the property's full name, unless that would simply repeat the first
17165column, in which case the second column indicates the property's short name
17166(if different).  The annotations are given only in the entry for the full
17167name.  The annotations for binary properties include a list of the first few
17168ranges that the property matches.  To avoid any ambiguity, the SPACE character
17169is represented as C<\\x$space_hex>.
17170
17171If a property is obsolete, etc, the entry will be flagged with the same
17172characters used in the table in the L<section above|/Properties accessible
17173through \\p{} and \\P{}>, like B<$DEPRECATED> or B<$STABILIZED>.
17174
17175$ucd_pod
17176
17177=head1 Properties accessible through other means
17178
17179Certain properties are accessible also via core function calls.  These are:
17180
17181 Lowercase_Mapping          lc() and lcfirst()
17182 Titlecase_Mapping          ucfirst()
17183 Uppercase_Mapping          uc()
17184
17185Also, Case_Folding is accessible through the C</i> modifier in regular
17186expressions, the C<\\F> transliteration escape, and the C<L<fc|perlfunc/fc>>
17187operator.
17188
17189Besides being able to say C<\\p{Name=...}>, the Name and Name_Aliases
17190properties are accessible through the C<\\N{}> interpolation in double-quoted
17191strings and regular expressions; and functions C<charnames::viacode()>,
17192C<charnames::vianame()>, and C<charnames::string_vianame()> (which require a
17193C<use charnames ();> to be specified.
17194
17195Finally, most properties related to decomposition are accessible via
17196L<Unicode::Normalize>.
17197
17198=head1 Unicode character properties that are NOT accepted by Perl
17199
17200Perl will generate an error for a few character properties in Unicode when
17201used in a regular expression.  The non-Unihan ones are listed below, with the
17202reasons they are not accepted, perhaps with work-arounds.  The short names for
17203the properties are listed enclosed in (parentheses).
17204As described after the list, an installation can change the defaults and choose
17205to accept any of these.  The list is machine generated based on the
17206choices made for the installation that generated this document.
17207
17208@bad_re_properties
17209
17210An installation can choose to allow any of these to be matched by downloading
17211the Unicode database from L<http://www.unicode.org/Public/> to
17212C<\$Config{privlib}>/F<unicore/> in the Perl source tree, changing the
17213controlling lists contained in the program
17214C<\$Config{privlib}>/F<unicore/mktables> and then re-compiling and installing.
17215(C<\%Config> is available from the Config module).
17216
17217Also, perl can be recompiled to operate on an earlier version of the Unicode
17218standard.  Further information is at
17219C<\$Config{privlib}>/F<unicore/README.perl>.
17220
17221=head1 Other information in the Unicode data base
17222
17223The Unicode data base is delivered in two different formats.  The XML version
17224is valid for more modern Unicode releases.  The other version is a collection
17225of files.  The two are intended to give equivalent information.  Perl uses the
17226older form; this allows you to recompile Perl to use early Unicode releases.
17227
17228The only non-character property that Perl currently supports is Named
17229Sequences, in which a sequence of code points
17230is given a name and generally treated as a single entity.  (Perl supports
17231these via the C<\\N{...}> double-quotish construct,
17232L<charnames/charnames::string_vianame(name)>, and L<Unicode::UCD/namedseq()>.
17233
17234Below is a list of the files in the Unicode data base that Perl doesn't
17235currently use, along with very brief descriptions of their purposes.
17236Some of the names of the files have been shortened from those that Unicode
17237uses, in order to allow them to be distinguishable from similarly named files
17238on file systems for which only the first 8 characters of a name are
17239significant.
17240
17241=over 4
17242
17243@unused_files
17244
17245=back
17246
17247=head1 SEE ALSO
17248
17249L<$unicode_reference_url>
17250
17251L<perlrecharclass>
17252
17253L<perlunicode>
17254
17255END
17256
17257    # And write it.  The 0 means no utf8.
17258    main::write([ $pod_directory, "$pod_file.pod" ], 0, \@OUT);
17259    return;
17260}
17261
17262sub make_Name_pm () {
17263    # Create and write Name.pm, which contains subroutines and data to use in
17264    # conjunction with Name.pl
17265
17266    # Maybe there's nothing to do.
17267    return unless $has_hangul_syllables || @code_points_ending_in_code_point;
17268
17269    my @name = <<END;
17270$HEADER
17271$INTERNAL_ONLY_HEADER
17272
17273=head1 NAME -- Internal generated file for use by charnames
17274
17275=cut
17276
17277END
17278
17279    # Convert these structures to output format.
17280    my $code_points_ending_in_code_point =
17281        main::simple_dumper(\@code_points_ending_in_code_point,
17282                            ' ' x 8);
17283    my $names = main::simple_dumper(\%names_ending_in_code_point,
17284                                    ' ' x 8);
17285    my $loose_names = main::simple_dumper(\%loose_names_ending_in_code_point,
17286                                    ' ' x 8);
17287
17288    # Do the same with the Hangul names,
17289    my $jamo;
17290    my $jamo_l;
17291    my $jamo_v;
17292    my $jamo_t;
17293    my $jamo_re;
17294    if ($has_hangul_syllables) {
17295
17296        # Construct a regular expression of all the possible
17297        # combinations of the Hangul syllables.
17298        my @L_re;   # Leading consonants
17299        for my $i ($LBase .. $LBase + $LCount - 1) {
17300            push @L_re, $Jamo{$i}
17301        }
17302        my @V_re;   # Middle vowels
17303        for my $i ($VBase .. $VBase + $VCount - 1) {
17304            push @V_re, $Jamo{$i}
17305        }
17306        my @T_re;   # Trailing consonants
17307        for my $i ($TBase + 1 .. $TBase + $TCount - 1) {
17308            push @T_re, $Jamo{$i}
17309        }
17310
17311        # The whole re is made up of the L V T combination.
17312        $jamo_re = '('
17313                    . join ('|', sort @L_re)
17314                    . ')('
17315                    . join ('|', sort @V_re)
17316                    . ')('
17317                    . join ('|', sort @T_re)
17318                    . ')?';
17319
17320        # These hashes needed by the algorithm were generated
17321        # during reading of the Jamo.txt file
17322        $jamo = main::simple_dumper(\%Jamo, ' ' x 8);
17323        $jamo_l = main::simple_dumper(\%Jamo_L, ' ' x 8);
17324        $jamo_v = main::simple_dumper(\%Jamo_V, ' ' x 8);
17325        $jamo_t = main::simple_dumper(\%Jamo_T, ' ' x 8);
17326    }
17327
17328    push @name, <<END;
17329
17330package charnames;
17331
17332# This module contains machine-generated tables and code for the
17333# algorithmically-determinable Unicode character names.  The following
17334# routines can be used to translate between name and code point and vice versa
17335
17336{ # Closure
17337
17338    # Matches legal code point.  4-6 hex numbers, If there are 6, the first
17339    # two must be 10; if there are 5, the first must not be a 0.  Written this
17340    # way to decrease backtracking.  The first regex allows the code point to
17341    # be at the end of a word, but to work properly, the word shouldn't end
17342    # with a valid hex character.  The second one won't match a code point at
17343    # the end of a word, and doesn't have the run-on issue
17344    my \$run_on_code_point_re = qr/$run_on_code_point_re/;
17345    my \$code_point_re = qr/$code_point_re/;
17346
17347    # In the following hash, the keys are the bases of names which include
17348    # the code point in the name, like CJK UNIFIED IDEOGRAPH-4E01.  The value
17349    # of each key is another hash which is used to get the low and high ends
17350    # for each range of code points that apply to the name.
17351    my %names_ending_in_code_point = (
17352$names
17353    );
17354
17355    # The following hash is a copy of the previous one, except is for loose
17356    # matching, so each name has blanks and dashes squeezed out
17357    my %loose_names_ending_in_code_point = (
17358$loose_names
17359    );
17360
17361    # And the following array gives the inverse mapping from code points to
17362    # names.  Lowest code points are first
17363    \@code_points_ending_in_code_point = (
17364$code_points_ending_in_code_point
17365    );
17366
17367    # Is exportable, make read-only
17368    Internals::SvREADONLY(\@code_points_ending_in_code_point, 1);
17369END
17370    # Earlier releases didn't have Jamos.  No sense outputting
17371    # them unless will be used.
17372    if ($has_hangul_syllables) {
17373        push @name, <<END;
17374
17375    # Convert from code point to Jamo short name for use in composing Hangul
17376    # syllable names
17377    my %Jamo = (
17378$jamo
17379    );
17380
17381    # Leading consonant (can be null)
17382    my %Jamo_L = (
17383$jamo_l
17384    );
17385
17386    # Vowel
17387    my %Jamo_V = (
17388$jamo_v
17389    );
17390
17391    # Optional trailing consonant
17392    my %Jamo_T = (
17393$jamo_t
17394    );
17395
17396    # Computed re that splits up a Hangul name into LVT or LV syllables
17397    my \$syllable_re = qr/$jamo_re/;
17398
17399    my \$HANGUL_SYLLABLE = "HANGUL SYLLABLE ";
17400    my \$loose_HANGUL_SYLLABLE = "HANGULSYLLABLE";
17401
17402    # These constants names and values were taken from the Unicode standard,
17403    # version 5.1, section 3.12.  They are used in conjunction with Hangul
17404    # syllables
17405    my \$SBase = $SBase_string;
17406    my \$LBase = $LBase_string;
17407    my \$VBase = $VBase_string;
17408    my \$TBase = $TBase_string;
17409    my \$SCount = $SCount;
17410    my \$LCount = $LCount;
17411    my \$VCount = $VCount;
17412    my \$TCount = $TCount;
17413    my \$NCount = \$VCount * \$TCount;
17414END
17415    } # End of has Jamos
17416
17417    push @name, << 'END';
17418
17419    sub name_to_code_point_special {
17420        my ($name, $loose) = @_;
17421
17422        # Returns undef if not one of the specially handled names; otherwise
17423        # returns the code point equivalent to the input name
17424        # $loose is non-zero if to use loose matching, 'name' in that case
17425        # must be input as upper case with all blanks and dashes squeezed out.
17426END
17427    if ($has_hangul_syllables) {
17428        push @name, << 'END';
17429
17430        if ((! $loose && $name =~ s/$HANGUL_SYLLABLE//)
17431            || ($loose && $name =~ s/$loose_HANGUL_SYLLABLE//))
17432        {
17433            return if $name !~ qr/^$syllable_re$/;
17434            my $L = $Jamo_L{$1};
17435            my $V = $Jamo_V{$2};
17436            my $T = (defined $3) ? $Jamo_T{$3} : 0;
17437            return ($L * $VCount + $V) * $TCount + $T + $SBase;
17438        }
17439END
17440    }
17441    push @name, << 'END';
17442
17443        # Name must end in 'code_point' for this to handle.
17444        return if (($loose && $name !~ /^ (.*?) ($run_on_code_point_re) $/x)
17445                   || (! $loose && $name !~ /^ (.*) ($code_point_re) $/x));
17446
17447        my $base = $1;
17448        my $code_point = CORE::hex $2;
17449        my $names_ref;
17450
17451        if ($loose) {
17452            $names_ref = \%loose_names_ending_in_code_point;
17453        }
17454        else {
17455            return if $base !~ s/-$//;
17456            $names_ref = \%names_ending_in_code_point;
17457        }
17458
17459        # Name must be one of the ones which has the code point in it.
17460        return if ! $names_ref->{$base};
17461
17462        # Look through the list of ranges that apply to this name to see if
17463        # the code point is in one of them.
17464        for (my $i = 0; $i < scalar @{$names_ref->{$base}{'low'}}; $i++) {
17465            return if $names_ref->{$base}{'low'}->[$i] > $code_point;
17466            next if $names_ref->{$base}{'high'}->[$i] < $code_point;
17467
17468            # Here, the code point is in the range.
17469            return $code_point;
17470        }
17471
17472        # Here, looked like the name had a code point number in it, but
17473        # did not match one of the valid ones.
17474        return;
17475    }
17476
17477    sub code_point_to_name_special {
17478        my $code_point = shift;
17479
17480        # Returns the name of a code point if algorithmically determinable;
17481        # undef if not
17482END
17483    if ($has_hangul_syllables) {
17484        push @name, << 'END';
17485
17486        # If in the Hangul range, calculate the name based on Unicode's
17487        # algorithm
17488        if ($code_point >= $SBase && $code_point <= $SBase + $SCount -1) {
17489            use integer;
17490            my $SIndex = $code_point - $SBase;
17491            my $L = $LBase + $SIndex / $NCount;
17492            my $V = $VBase + ($SIndex % $NCount) / $TCount;
17493            my $T = $TBase + $SIndex % $TCount;
17494            $name = "$HANGUL_SYLLABLE$Jamo{$L}$Jamo{$V}";
17495            $name .= $Jamo{$T} if $T != $TBase;
17496            return $name;
17497        }
17498END
17499    }
17500    push @name, << 'END';
17501
17502        # Look through list of these code points for one in range.
17503        foreach my $hash (@code_points_ending_in_code_point) {
17504            return if $code_point < $hash->{'low'};
17505            if ($code_point <= $hash->{'high'}) {
17506                return sprintf("%s-%04X", $hash->{'name'}, $code_point);
17507            }
17508        }
17509        return;            # None found
17510    }
17511} # End closure
17512
175131;
17514END
17515
17516    main::write("Name.pm", 0, \@name);  # The 0 means no utf8.
17517    return;
17518}
17519
17520sub make_UCD () {
17521    # Create and write UCD.pl, which passes info about the tables to
17522    # Unicode::UCD
17523
17524    # Stringify structures for output
17525    my $loose_property_name_of
17526                           = simple_dumper(\%loose_property_name_of, ' ' x 4);
17527    chomp $loose_property_name_of;
17528
17529    my $strict_property_name_of
17530                           = simple_dumper(\%strict_property_name_of, ' ' x 4);
17531    chomp $strict_property_name_of;
17532
17533    my $stricter_to_file_of = simple_dumper(\%stricter_to_file_of, ' ' x 4);
17534    chomp $stricter_to_file_of;
17535
17536    my $inline_definitions = simple_dumper(\@inline_definitions, " " x 4);
17537    chomp $inline_definitions;
17538
17539    my $loose_to_file_of = simple_dumper(\%loose_to_file_of, ' ' x 4);
17540    chomp $loose_to_file_of;
17541
17542    my $nv_floating_to_rational
17543                           = simple_dumper(\%nv_floating_to_rational, ' ' x 4);
17544    chomp $nv_floating_to_rational;
17545
17546    my $why_deprecated = simple_dumper(\%Unicode::UCD::why_deprecated, ' ' x 4);
17547    chomp $why_deprecated;
17548
17549    # We set the key to the file when we associated files with tables, but we
17550    # couldn't do the same for the value then, as we might not have the file
17551    # for the alternate table figured out at that time.
17552    foreach my $cased (keys %caseless_equivalent_to) {
17553        my @path = $caseless_equivalent_to{$cased}->file_path;
17554        my $path;
17555        if ($path[0] eq "#") {  # Pseudo-directory '#'
17556            $path = join '/', @path;
17557        }
17558        else {  # Gets rid of lib/
17559            $path = join '/', @path[1, -1];
17560        }
17561        $caseless_equivalent_to{$cased} = $path;
17562    }
17563    my $caseless_equivalent_to
17564                           = simple_dumper(\%caseless_equivalent_to, ' ' x 4);
17565    chomp $caseless_equivalent_to;
17566
17567    my $loose_property_to_file_of
17568                        = simple_dumper(\%loose_property_to_file_of, ' ' x 4);
17569    chomp $loose_property_to_file_of;
17570
17571    my $strict_property_to_file_of
17572                        = simple_dumper(\%strict_property_to_file_of, ' ' x 4);
17573    chomp $strict_property_to_file_of;
17574
17575    my $file_to_swash_name = simple_dumper(\%file_to_swash_name, ' ' x 4);
17576    chomp $file_to_swash_name;
17577
17578    # Create a mapping from each alias of Perl single-form extensions to all
17579    # its equivalent aliases, for quick look-up.
17580    my %perlprop_to_aliases;
17581    foreach my $table ($perl->tables) {
17582
17583        # First create the list of the aliases of each extension
17584        my @aliases_list;    # List of legal aliases for this extension
17585
17586        my $table_name = $table->name;
17587        my $standard_table_name = standardize($table_name);
17588        my $table_full_name = $table->full_name;
17589        my $standard_table_full_name = standardize($table_full_name);
17590
17591        # Make sure that the list has both the short and full names
17592        push @aliases_list, $table_name, $table_full_name;
17593
17594        my $found_ucd = 0;  # ? Did we actually get an alias that should be
17595                            # output for this table
17596
17597        # Go through all the aliases (including the two just added), and add
17598        # any new unique ones to the list
17599        foreach my $alias ($table->aliases) {
17600
17601            # Skip non-legal names
17602            next unless $alias->ok_as_filename;
17603            next unless $alias->ucd;
17604
17605            $found_ucd = 1;     # have at least one legal name
17606
17607            my $name = $alias->name;
17608            my $standard = standardize($name);
17609
17610            # Don't repeat a name that is equivalent to one already on the
17611            # list
17612            next if $standard eq $standard_table_name;
17613            next if $standard eq $standard_table_full_name;
17614
17615            push @aliases_list, $name;
17616        }
17617
17618        # If there were no legal names, don't output anything.
17619        next unless $found_ucd;
17620
17621        # To conserve memory in the program reading these in, omit full names
17622        # that are identical to the short name, when those are the only two
17623        # aliases for the property.
17624        if (@aliases_list == 2 && $aliases_list[0] eq $aliases_list[1]) {
17625            pop @aliases_list;
17626        }
17627
17628        # Here, @aliases_list is the list of all the aliases that this
17629        # extension legally has.  Now can create a map to it from each legal
17630        # standardized alias
17631        foreach my $alias ($table->aliases) {
17632            next unless $alias->ucd;
17633            next unless $alias->ok_as_filename;
17634            push @{$perlprop_to_aliases{standardize($alias->name)}},
17635                 uniques @aliases_list;
17636        }
17637    }
17638
17639    # Make a list of all combinations of properties/values that are suppressed.
17640    my @suppressed;
17641    if (! $debug_skip) {    # This tends to fail in this debug mode
17642        foreach my $property_name (keys %why_suppressed) {
17643
17644            # Just the value
17645            my $value_name = $1 if $property_name =~ s/ = ( .* ) //x;
17646
17647            # The hash may contain properties not in this release of Unicode
17648            next unless defined (my $property = property_ref($property_name));
17649
17650            # Find all combinations
17651            foreach my $prop_alias ($property->aliases) {
17652                my $prop_alias_name = standardize($prop_alias->name);
17653
17654                # If no =value, there's just one combination possible for this
17655                if (! $value_name) {
17656
17657                    # The property may be suppressed, but there may be a proxy
17658                    # for it, so it shouldn't be listed as suppressed
17659                    next if $prop_alias->ucd;
17660                    push @suppressed, $prop_alias_name;
17661                }
17662                else {  # Otherwise
17663                    foreach my $value_alias
17664                                    ($property->table($value_name)->aliases)
17665                    {
17666                        next if $value_alias->ucd;
17667
17668                        push @suppressed, "$prop_alias_name="
17669                                        .  standardize($value_alias->name);
17670                    }
17671                }
17672            }
17673        }
17674    }
17675    @suppressed = sort @suppressed; # So doesn't change between runs of this
17676                                    # program
17677
17678    # Convert the structure below (designed for Name.pm) to a form that UCD
17679    # wants, so it doesn't have to modify it at all; i.e. so that it includes
17680    # an element for the Hangul syllables in the appropriate place, and
17681    # otherwise changes the name to include the "-<code point>" suffix.
17682    my @algorithm_names;
17683    my $done_hangul = $v_version lt v2.0.0;  # Hanguls as we know them came
17684                                             # along in this version
17685    # Copy it linearly.
17686    for my $i (0 .. @code_points_ending_in_code_point - 1) {
17687
17688        # Insert the hanguls in the correct place.
17689        if (! $done_hangul
17690            && $code_points_ending_in_code_point[$i]->{'low'} > $SBase)
17691        {
17692            $done_hangul = 1;
17693            push @algorithm_names, { low => $SBase,
17694                                     high => $SBase + $SCount - 1,
17695                                     name => '<hangul syllable>',
17696                                    };
17697        }
17698
17699        # Copy the current entry, modified.
17700        push @algorithm_names, {
17701            low => $code_points_ending_in_code_point[$i]->{'low'},
17702            high => $code_points_ending_in_code_point[$i]->{'high'},
17703            name =>
17704               "$code_points_ending_in_code_point[$i]->{'name'}-<code point>",
17705        };
17706    }
17707
17708    # Serialize these structures for output.
17709    my $loose_to_standard_value
17710                          = simple_dumper(\%loose_to_standard_value, ' ' x 4);
17711    chomp $loose_to_standard_value;
17712
17713    my $string_property_loose_to_name
17714                    = simple_dumper(\%string_property_loose_to_name, ' ' x 4);
17715    chomp $string_property_loose_to_name;
17716
17717    my $perlprop_to_aliases = simple_dumper(\%perlprop_to_aliases, ' ' x 4);
17718    chomp $perlprop_to_aliases;
17719
17720    my $prop_aliases = simple_dumper(\%prop_aliases, ' ' x 4);
17721    chomp $prop_aliases;
17722
17723    my $prop_value_aliases = simple_dumper(\%prop_value_aliases, ' ' x 4);
17724    chomp $prop_value_aliases;
17725
17726    my $suppressed = (@suppressed) ? simple_dumper(\@suppressed, ' ' x 4) : "";
17727    chomp $suppressed;
17728
17729    my $algorithm_names = simple_dumper(\@algorithm_names, ' ' x 4);
17730    chomp $algorithm_names;
17731
17732    my $ambiguous_names = simple_dumper(\%ambiguous_names, ' ' x 4);
17733    chomp $ambiguous_names;
17734
17735    my $combination_property = simple_dumper(\%combination_property, ' ' x 4);
17736    chomp $combination_property;
17737
17738    my $loose_defaults = simple_dumper(\%loose_defaults, ' ' x 4);
17739    chomp $loose_defaults;
17740
17741    my @ucd = <<END;
17742$HEADER
17743$INTERNAL_ONLY_HEADER
17744
17745# This file is for the use of Unicode::UCD
17746
17747# Highest legal Unicode code point
17748\$Unicode::UCD::MAX_UNICODE_CODEPOINT = 0x$MAX_UNICODE_CODEPOINT_STRING;
17749
17750# Hangul syllables
17751\$Unicode::UCD::HANGUL_BEGIN = $SBase_string;
17752\$Unicode::UCD::HANGUL_COUNT = $SCount;
17753
17754# Maps Unicode (not Perl single-form extensions) property names in loose
17755# standard form to their corresponding standard names
17756\%Unicode::UCD::loose_property_name_of = (
17757$loose_property_name_of
17758);
17759
17760# Same, but strict names
17761\%Unicode::UCD::strict_property_name_of = (
17762$strict_property_name_of
17763);
17764
17765# Gives the definitions (in the form of inversion lists) for those properties
17766# whose definitions aren't kept in files
17767\@Unicode::UCD::inline_definitions = (
17768$inline_definitions
17769);
17770
17771# Maps property, table to file for those using stricter matching.  For paths
17772# whose directory is '#', the file is in the form of a numeric index into
17773# \@inline_definitions
17774\%Unicode::UCD::stricter_to_file_of = (
17775$stricter_to_file_of
17776);
17777
17778# Maps property, table to file for those using loose matching.  For paths
17779# whose directory is '#', the file is in the form of a numeric index into
17780# \@inline_definitions
17781\%Unicode::UCD::loose_to_file_of = (
17782$loose_to_file_of
17783);
17784
17785# Maps floating point to fractional form
17786\%Unicode::UCD::nv_floating_to_rational = (
17787$nv_floating_to_rational
17788);
17789
17790# If a %e floating point number doesn't have this number of digits in it after
17791# the decimal point to get this close to a fraction, it isn't considered to be
17792# that fraction even if all the digits it does have match.
17793\$Unicode::UCD::e_precision = $E_FLOAT_PRECISION;
17794
17795# Deprecated tables to generate a warning for.  The key is the file containing
17796# the table, so as to avoid duplication, as many property names can map to the
17797# file, but we only need one entry for all of them.
17798\%Unicode::UCD::why_deprecated = (
17799$why_deprecated
17800);
17801
17802# A few properties have different behavior under /i matching.  This maps
17803# those to substitute files to use under /i.
17804\%Unicode::UCD::caseless_equivalent = (
17805$caseless_equivalent_to
17806);
17807
17808# Property names to mapping files
17809\%Unicode::UCD::loose_property_to_file_of = (
17810$loose_property_to_file_of
17811);
17812
17813# Property names to mapping files
17814\%Unicode::UCD::strict_property_to_file_of = (
17815$strict_property_to_file_of
17816);
17817
17818# Files to the swash names within them.
17819\%Unicode::UCD::file_to_swash_name = (
17820$file_to_swash_name
17821);
17822
17823# Keys are all the possible "prop=value" combinations, in loose form; values
17824# are the standard loose name for the 'value' part of the key
17825\%Unicode::UCD::loose_to_standard_value = (
17826$loose_to_standard_value
17827);
17828
17829# String property loose names to standard loose name
17830\%Unicode::UCD::string_property_loose_to_name = (
17831$string_property_loose_to_name
17832);
17833
17834# Keys are Perl extensions in loose form; values are each one's list of
17835# aliases
17836\%Unicode::UCD::loose_perlprop_to_name = (
17837$perlprop_to_aliases
17838);
17839
17840# Keys are standard property name; values are each one's aliases
17841\%Unicode::UCD::prop_aliases = (
17842$prop_aliases
17843);
17844
17845# Keys of top level are standard property name; values are keys to another
17846# hash,  Each one is one of the property's values, in standard form.  The
17847# values are that prop-val's aliases.  If only one specified, the short and
17848# long alias are identical.
17849\%Unicode::UCD::prop_value_aliases = (
17850$prop_value_aliases
17851);
17852
17853# Ordered (by code point ordinal) list of the ranges of code points whose
17854# names are algorithmically determined.  Each range entry is an anonymous hash
17855# of the start and end points and a template for the names within it.
17856\@Unicode::UCD::algorithmic_named_code_points = (
17857$algorithm_names
17858);
17859
17860# The properties that as-is have two meanings, and which must be disambiguated
17861\%Unicode::UCD::ambiguous_names = (
17862$ambiguous_names
17863);
17864
17865# Keys are the prop-val combinations which are the default values for the
17866# given property, expressed in standard loose form
17867\%Unicode::UCD::loose_defaults = (
17868$loose_defaults
17869);
17870
17871# The properties that are combinations, in that they have both a map table and
17872# a match table.  This is actually for UCD.t, so it knows how to test for
17873# these.
17874\%Unicode::UCD::combination_property = (
17875$combination_property
17876);
17877
17878# All combinations of names that are suppressed.
17879# This is actually for UCD.t, so it knows which properties shouldn't have
17880# entries.  If it got any bigger, would probably want to put it in its own
17881# file to use memory only when it was needed, in testing.
17882\@Unicode::UCD::suppressed_properties = (
17883$suppressed
17884);
17885
178861;
17887END
17888
17889    main::write("UCD.pl", 0, \@ucd);  # The 0 means no utf8.
17890    return;
17891}
17892
17893sub write_all_tables() {
17894    # Write out all the tables generated by this program to files, as well as
17895    # the supporting data structures, pod file, and .t file.
17896
17897    my @writables;              # List of tables that actually get written
17898    my %match_tables_to_write;  # Used to collapse identical match tables
17899                                # into one file.  Each key is a hash function
17900                                # result to partition tables into buckets.
17901                                # Each value is an array of the tables that
17902                                # fit in the bucket.
17903
17904    # For each property ...
17905    # (sort so that if there is an immutable file name, it has precedence, so
17906    # some other property can't come in and take over its file name.  (We
17907    # don't care if both defined, as they had better be different anyway.)
17908    # The property named 'Perl' needs to be first (it doesn't have any
17909    # immutable file name) because empty properties are defined in terms of
17910    # its table named 'All' under the -annotate option.)   We also sort by
17911    # the property's name.  This is just for repeatability of the outputs
17912    # between runs of this program, but does not affect correctness.
17913    PROPERTY:
17914    foreach my $property ($perl,
17915                          sort { return -1 if defined $a->file;
17916                                 return 1 if defined $b->file;
17917                                 return $a->name cmp $b->name;
17918                                } grep { $_ != $perl } property_ref('*'))
17919    {
17920        my $type = $property->type;
17921
17922        # And for each table for that property, starting with the mapping
17923        # table for it ...
17924        TABLE:
17925        foreach my $table($property,
17926
17927                        # and all the match tables for it (if any), sorted so
17928                        # the ones with the shortest associated file name come
17929                        # first.  The length sorting prevents problems of a
17930                        # longer file taking a name that might have to be used
17931                        # by a shorter one.  The alphabetic sorting prevents
17932                        # differences between releases
17933                        sort {  my $ext_a = $a->external_name;
17934                                return 1 if ! defined $ext_a;
17935                                my $ext_b = $b->external_name;
17936                                return -1 if ! defined $ext_b;
17937
17938                                # But return the non-complement table before
17939                                # the complement one, as the latter is defined
17940                                # in terms of the former, and needs to have
17941                                # the information for the former available.
17942                                return 1 if $a->complement != 0;
17943                                return -1 if $b->complement != 0;
17944
17945                                # Similarly, return a subservient table after
17946                                # a leader
17947                                return 1 if $a->leader != $a;
17948                                return -1 if $b->leader != $b;
17949
17950                                my $cmp = length $ext_a <=> length $ext_b;
17951
17952                                # Return result if lengths not equal
17953                                return $cmp if $cmp;
17954
17955                                # Alphabetic if lengths equal
17956                                return $ext_a cmp $ext_b
17957                        } $property->tables
17958                    )
17959        {
17960
17961            # Here we have a table associated with a property.  It could be
17962            # the map table (done first for each property), or one of the
17963            # other tables.  Determine which type.
17964            my $is_property = $table->isa('Property');
17965
17966            my $name = $table->name;
17967            my $complete_name = $table->complete_name;
17968
17969            # See if should suppress the table if is empty, but warn if it
17970            # contains something.
17971            my $suppress_if_empty_warn_if_not
17972                    = $why_suppress_if_empty_warn_if_not{$complete_name} || 0;
17973
17974            # Calculate if this table should have any code points associated
17975            # with it or not.
17976            my $expected_empty =
17977
17978                # $perl should be empty
17979                ($is_property && ($table == $perl))
17980
17981                # Match tables in properties we skipped populating should be
17982                # empty
17983                || (! $is_property && ! $property->to_create_match_tables)
17984
17985                # Tables and properties that are expected to have no code
17986                # points should be empty
17987                || $suppress_if_empty_warn_if_not
17988            ;
17989
17990            # Set a boolean if this table is the complement of an empty binary
17991            # table
17992            my $is_complement_of_empty_binary =
17993                $type == $BINARY &&
17994                (($table == $property->table('Y')
17995                    && $property->table('N')->is_empty)
17996                || ($table == $property->table('N')
17997                    && $property->table('Y')->is_empty));
17998
17999            if ($table->is_empty) {
18000
18001                if ($suppress_if_empty_warn_if_not) {
18002                    $table->set_fate($SUPPRESSED,
18003                                     $suppress_if_empty_warn_if_not);
18004                }
18005
18006                # Suppress (by skipping them) expected empty tables.
18007                next TABLE if $expected_empty;
18008
18009                # And setup to later output a warning for those that aren't
18010                # known to be allowed to be empty.  Don't do the warning if
18011                # this table is a child of another one to avoid duplicating
18012                # the warning that should come from the parent one.
18013                if (($table == $property || $table->parent == $table)
18014                    && $table->fate != $SUPPRESSED
18015                    && $table->fate != $MAP_PROXIED
18016                    && ! grep { $complete_name =~ /^$_$/ }
18017                                                    @tables_that_may_be_empty)
18018                {
18019                    push @unhandled_properties, "$table";
18020                }
18021
18022                # The old way of expressing an empty match list was to
18023                # complement the list that matches everything.  The new way is
18024                # to create an empty inversion list, but this doesn't work for
18025                # annotating, so use the old way then.
18026                $table->set_complement($All) if $annotate
18027                                                && $table != $property;
18028            }
18029            elsif ($expected_empty) {
18030                my $because = "";
18031                if ($suppress_if_empty_warn_if_not) {
18032                    $because = " because $suppress_if_empty_warn_if_not";
18033                }
18034
18035                Carp::my_carp("Not expecting property $table$because.  Generating file for it anyway.");
18036            }
18037
18038            # Some tables should match everything
18039            my $expected_full =
18040                ($table->fate == $SUPPRESSED)
18041                ? 0
18042                : ($is_property)
18043                  ? # All these types of map tables will be full because
18044                    # they will have been populated with defaults
18045                    ($type == $ENUM)
18046
18047                  : # A match table should match everything if its method
18048                    # shows it should
18049                    ($table->matches_all
18050
18051                    # The complement of an empty binary table will match
18052                    # everything
18053                    || $is_complement_of_empty_binary
18054                    )
18055            ;
18056
18057            my $count = $table->count;
18058            if ($expected_full) {
18059                if ($count != $MAX_WORKING_CODEPOINTS) {
18060                    Carp::my_carp("$table matches only "
18061                    . clarify_number($count)
18062                    . " Unicode code points but should match "
18063                    . clarify_number($MAX_WORKING_CODEPOINTS)
18064                    . " (off by "
18065                    .  clarify_number(abs($MAX_WORKING_CODEPOINTS - $count))
18066                    . ").  Proceeding anyway.");
18067                }
18068
18069                # Here is expected to be full.  If it is because it is the
18070                # complement of an (empty) binary table that is to be
18071                # suppressed, then suppress this one as well.
18072                if ($is_complement_of_empty_binary) {
18073                    my $opposing_name = ($name eq 'Y') ? 'N' : 'Y';
18074                    my $opposing = $property->table($opposing_name);
18075                    my $opposing_status = $opposing->status;
18076                    if ($opposing_status) {
18077                        $table->set_status($opposing_status,
18078                                           $opposing->status_info);
18079                    }
18080                }
18081            }
18082            elsif ($count == $MAX_UNICODE_CODEPOINTS
18083                   && $name ne "Any"
18084                   && ($table == $property || $table->leader == $table)
18085                   && $table->property->status ne $NORMAL)
18086            {
18087                    Carp::my_carp("$table unexpectedly matches all Unicode code points.  Proceeding anyway.");
18088            }
18089
18090            if ($table->fate >= $SUPPRESSED) {
18091                if (! $is_property) {
18092                    my @children = $table->children;
18093                    foreach my $child (@children) {
18094                        if ($child->fate < $SUPPRESSED) {
18095                            Carp::my_carp_bug("'$table' is suppressed and has a child '$child' which isn't");
18096                        }
18097                    }
18098                }
18099                next TABLE;
18100
18101            }
18102
18103            if (! $is_property) {
18104
18105                make_ucd_table_pod_entries($table) if $table->property == $perl;
18106
18107                # Several things need to be done just once for each related
18108                # group of match tables.  Do them on the parent.
18109                if ($table->parent == $table) {
18110
18111                    # Add an entry in the pod file for the table; it also does
18112                    # the children.
18113                    make_re_pod_entries($table) if defined $pod_directory;
18114
18115                    # See if the table matches identical code points with
18116                    # something that has already been processed and is ready
18117                    # for output.  In that case, no need to have two files
18118                    # with the same code points in them.  We use the table's
18119                    # hash() method to store these in buckets, so that it is
18120                    # quite likely that if two tables are in the same bucket
18121                    # they will be identical, so don't have to compare tables
18122                    # frequently.  The tables have to have the same status to
18123                    # share a file, so add this to the bucket hash.  (The
18124                    # reason for this latter is that UCD.pm associates a
18125                    # status with a file.) We don't check tables that are
18126                    # inverses of others, as it would lead to some coding
18127                    # complications, and checking all the regular ones should
18128                    # find everything.
18129                    if ($table->complement == 0) {
18130                        my $hash = $table->hash . ';' . $table->status;
18131
18132                        # Look at each table that is in the same bucket as
18133                        # this one would be.
18134                        foreach my $comparison
18135                                            (@{$match_tables_to_write{$hash}})
18136                        {
18137                            # If the table doesn't point back to this one, we
18138                            # see if it matches identically
18139                            if (   $comparison->leader != $table
18140                                && $table->matches_identically_to($comparison))
18141                            {
18142                                $table->set_equivalent_to($comparison,
18143                                                                Related => 0);
18144                                next TABLE;
18145                            }
18146                        }
18147
18148                        # Here, not equivalent, add this table to the bucket.
18149                        push @{$match_tables_to_write{$hash}}, $table;
18150                    }
18151                }
18152            }
18153            else {
18154
18155                # Here is the property itself.
18156                # Don't write out or make references to the $perl property
18157                next if $table == $perl;
18158
18159                make_ucd_table_pod_entries($table);
18160
18161                # There is a mapping stored of the various synonyms to the
18162                # standardized name of the property for Unicode::UCD.
18163                # Also, the pod file contains entries of the form:
18164                # \p{alias: *}         \p{full: *}
18165                # rather than show every possible combination of things.
18166
18167                my @property_aliases = $property->aliases;
18168
18169                my $full_property_name = $property->full_name;
18170                my $property_name = $property->name;
18171                my $standard_property_name = standardize($property_name);
18172                my $standard_property_full_name
18173                                        = standardize($full_property_name);
18174
18175                # We also create for Unicode::UCD a list of aliases for
18176                # the property.  The list starts with the property name;
18177                # then its full name.
18178                my @property_list;
18179                my @standard_list;
18180                if ( $property->fate <= $MAP_PROXIED) {
18181                    @property_list = ($property_name, $full_property_name);
18182                    @standard_list = ($standard_property_name,
18183                                        $standard_property_full_name);
18184                }
18185
18186                # For each synonym ...
18187                for my $i (0 .. @property_aliases - 1)  {
18188                    my $alias = $property_aliases[$i];
18189                    my $alias_name = $alias->name;
18190                    my $alias_standard = standardize($alias_name);
18191
18192
18193                    # Add other aliases to the list of property aliases
18194                    if ($property->fate <= $MAP_PROXIED
18195                        && ! grep { $alias_standard eq $_ } @standard_list)
18196                    {
18197                        push @property_list, $alias_name;
18198                        push @standard_list, $alias_standard;
18199                    }
18200
18201                    # For Unicode::UCD, set the mapping of the alias to the
18202                    # property
18203                    if ($type == $STRING) {
18204                        if ($property->fate <= $MAP_PROXIED) {
18205                            $string_property_loose_to_name{$alias_standard}
18206                                            = $standard_property_name;
18207                        }
18208                    }
18209                    else {
18210                        my $hash_ref = ($alias_standard =~ /^_/)
18211                                       ? \%strict_property_name_of
18212                                       : \%loose_property_name_of;
18213                        if (exists $hash_ref->{$alias_standard}) {
18214                            Carp::my_carp("There already is a property with the same standard name as $alias_name: $hash_ref->{$alias_standard}.  Old name is retained");
18215                        }
18216                        else {
18217                            $hash_ref->{$alias_standard}
18218                                                = $standard_property_name;
18219                        }
18220
18221                        # Now for the re pod entry for this alias.  Skip if not
18222                        # outputting a pod; skip the first one, which is the
18223                        # full name so won't have an entry like: '\p{full: *}
18224                        # \p{full: *}', and skip if don't want an entry for
18225                        # this one.
18226                        next if $i == 0
18227                                || ! defined $pod_directory
18228                                || ! $alias->make_re_pod_entry;
18229
18230                        my $rhs = "\\p{$full_property_name: *}";
18231                        if ($property != $perl && $table->perl_extension) {
18232                            $rhs .= ' (Perl extension)';
18233                        }
18234                        push @match_properties,
18235                            format_pod_line($indent_info_column,
18236                                        '\p{' . $alias->name . ': *}',
18237                                        $rhs,
18238                                        $alias->status);
18239                    }
18240                }
18241
18242                # The list of all possible names is attached to each alias, so
18243                # lookup is easy
18244                if (@property_list) {
18245                    push @{$prop_aliases{$standard_list[0]}}, @property_list;
18246                }
18247
18248                if ($property->fate <= $MAP_PROXIED) {
18249
18250                    # Similarly, we create for Unicode::UCD a list of
18251                    # property-value aliases.
18252
18253                    # Look at each table in the property...
18254                    foreach my $table ($property->tables) {
18255                        my @values_list;
18256                        my $table_full_name = $table->full_name;
18257                        my $standard_table_full_name
18258                                              = standardize($table_full_name);
18259                        my $table_name = $table->name;
18260                        my $standard_table_name = standardize($table_name);
18261
18262                        # The list starts with the table name and its full
18263                        # name.
18264                        push @values_list, $table_name, $table_full_name;
18265
18266                        # We add to the table each unique alias that isn't
18267                        # discouraged from use.
18268                        foreach my $alias ($table->aliases) {
18269                            next if $alias->status
18270                                 && $alias->status eq $DISCOURAGED;
18271                            my $name = $alias->name;
18272                            my $standard = standardize($name);
18273                            next if $standard eq $standard_table_name;
18274                            next if $standard eq $standard_table_full_name;
18275                            push @values_list, $name;
18276                        }
18277
18278                        # Here @values_list is a list of all the aliases for
18279                        # the table.  That is, all the property-values given
18280                        # by this table.  By agreement with Unicode::UCD,
18281                        # if the name and full name are identical, and there
18282                        # are no other names, drop the duplicate entry to save
18283                        # memory.
18284                        if (@values_list == 2
18285                            && $values_list[0] eq $values_list[1])
18286                        {
18287                            pop @values_list
18288                        }
18289
18290                        # To save memory, unlike the similar list for property
18291                        # aliases above, only the standard forms have the list.
18292                        # This forces an extra step of converting from input
18293                        # name to standard name, but the savings are
18294                        # considerable.  (There is only marginal savings if we
18295                        # did this with the property aliases.)
18296                        push @{$prop_value_aliases{$standard_property_name}{$standard_table_name}}, @values_list;
18297                    }
18298                }
18299
18300                # Don't write out a mapping file if not desired.
18301                next if ! $property->to_output_map;
18302            }
18303
18304            # Here, we know we want to write out the table, but don't do it
18305            # yet because there may be other tables that come along and will
18306            # want to share the file, and the file's comments will change to
18307            # mention them.  So save for later.
18308            push @writables, $table;
18309
18310        } # End of looping through the property and all its tables.
18311    } # End of looping through all properties.
18312
18313    # Now have all the tables that will have files written for them.  Do it.
18314    foreach my $table (@writables) {
18315        my @directory;
18316        my $filename;
18317        my $property = $table->property;
18318        my $is_property = ($table == $property);
18319
18320        # For very short tables, instead of writing them out to actual files,
18321        # we in-line their inversion list definitions into UCD.pm.  The
18322        # definition replaces the file name, and the special pseudo-directory
18323        # '#' is used to signal this.  This significantly cuts down the number
18324        # of files written at little extra cost to the hashes in UCD.pm.
18325        # And it means, no run-time files to read to get the definitions.
18326        if (! $is_property
18327            && ! $annotate  # For annotation, we want to explicitly show
18328                            # everything, so keep in files
18329            && $table->ranges <= 3)
18330        {
18331            my @ranges = $table->ranges;
18332            my $count = @ranges;
18333            if ($count == 0) {  # 0th index reserved for 0-length lists
18334                $filename = 0;
18335            }
18336            elsif ($table->leader != $table) {
18337
18338                # Here, is a table that is equivalent to another; code
18339                # in register_file_for_name() causes its leader's definition
18340                # to be used
18341
18342                next;
18343            }
18344            else {  # No equivalent table so far.
18345
18346                # Build up its definition range-by-range.
18347                my $definition = "";
18348                while (defined (my $range = shift @ranges)) {
18349                    my $end = $range->end;
18350                    if ($end < $MAX_WORKING_CODEPOINT) {
18351                        $count++;
18352                        $end = "\n" . ($end + 1);
18353                    }
18354                    else {  # Extends to infinity, hence no 'end'
18355                        $end = "";
18356                    }
18357                    $definition .= "\n" . $range->start . $end;
18358                }
18359                $definition = "V$count" . $definition;
18360                $filename = @inline_definitions;
18361                push @inline_definitions, $definition;
18362            }
18363            @directory = "#";
18364            register_file_for_name($table, \@directory, $filename);
18365            next;
18366        }
18367
18368        if (! $is_property) {
18369            # Match tables for the property go in lib/$subdirectory, which is
18370            # the property's name.  Don't use the standard file name for this,
18371            # as may get an unfamiliar alias
18372            @directory = ($matches_directory, ($property->match_subdir)
18373                                              ? $property->match_subdir
18374                                              : $property->external_name);
18375        }
18376        else {
18377
18378            @directory = $table->directory;
18379            $filename = $table->file;
18380        }
18381
18382        # Use specified filename if available, or default to property's
18383        # shortest name.  We need an 8.3 safe filename (which means "an 8
18384        # safe" filename, since after the dot is only 'pl', which is < 3)
18385        # The 2nd parameter is if the filename shouldn't be changed, and
18386        # it shouldn't iff there is a hard-coded name for this table.
18387        $filename = construct_filename(
18388                                $filename || $table->external_name,
18389                                ! $filename,    # mutable if no filename
18390                                \@directory);
18391
18392        register_file_for_name($table, \@directory, $filename);
18393
18394        # Only need to write one file when shared by more than one
18395        # property
18396        next if ! $is_property
18397                && ($table->leader != $table || $table->complement != 0);
18398
18399        # Construct a nice comment to add to the file
18400        $table->set_final_comment;
18401
18402        $table->write;
18403    }
18404
18405
18406    # Write out the pod file
18407    make_pod;
18408
18409    # And Name.pm, UCD.pl
18410    make_Name_pm;
18411    make_UCD;
18412
18413    make_property_test_script() if $make_test_script;
18414    make_normalization_test_script() if $make_norm_test_script;
18415    return;
18416}
18417
18418my @white_space_separators = ( # This used only for making the test script.
18419                            "",
18420                            ' ',
18421                            "\t",
18422                            '   '
18423                        );
18424
18425sub generate_separator($lhs) {
18426    # This used only for making the test script.  It generates the colon or
18427    # equal separator between the property and property value, with random
18428    # white space surrounding the separator
18429
18430    return "" if $lhs eq "";  # No separator if there's only one (the r) side
18431
18432    # Choose space before and after randomly
18433    my $spaces_before =$white_space_separators[rand(@white_space_separators)];
18434    my $spaces_after = $white_space_separators[rand(@white_space_separators)];
18435
18436    # And return the whole complex, half the time using a colon, half the
18437    # equals
18438    return $spaces_before
18439            . (rand() < 0.5) ? '=' : ':'
18440            . $spaces_after;
18441}
18442
18443sub generate_tests($lhs, $rhs, $valid_code, $invalid_code, $warning) {
18444    # This used only for making the test script.  It generates test cases that
18445    # are expected to compile successfully in perl.  Note that the LHS and
18446    # RHS are assumed to already be as randomized as the caller wants.
18447
18448    # $lhs          # The property: what's to the left of the colon
18449                    #  or equals separator
18450    # $rhs          # The property value; what's to the right
18451    # $valid_code   # A code point that's known to be in the
18452                        # table given by LHS=RHS; undef if table is
18453                        # empty
18454    # $invalid_code # A code point known to not be in the table;
18455                    # undef if the table is all code points
18456    # $warning
18457
18458    # Get the colon or equal
18459    my $separator = generate_separator($lhs);
18460
18461    # The whole 'property=value'
18462    my $name = "$lhs$separator$rhs";
18463
18464    my @output;
18465    # Create a complete set of tests, with complements.
18466    if (defined $valid_code) {
18467        push @output, <<"EOC"
18468Expect(1, $valid_code, '\\p{$name}', $warning);
18469Expect(0, $valid_code, '\\p{^$name}', $warning);
18470Expect(0, $valid_code, '\\P{$name}', $warning);
18471Expect(1, $valid_code, '\\P{^$name}', $warning);
18472EOC
18473    }
18474    if (defined $invalid_code) {
18475        push @output, <<"EOC"
18476Expect(0, $invalid_code, '\\p{$name}', $warning);
18477Expect(1, $invalid_code, '\\p{^$name}', $warning);
18478Expect(1, $invalid_code, '\\P{$name}', $warning);
18479Expect(0, $invalid_code, '\\P{^$name}', $warning);
18480EOC
18481    }
18482    return @output;
18483}
18484
18485sub generate_wildcard_tests($lhs, $rhs, $valid_code, $invalid_code, $warning) {
18486    # This used only for making the test script.  It generates wildcardl
18487    # matching test cases that are expected to compile successfully in perl.
18488
18489    # $lhs           # The property: what's to the left of the
18490                     # or equals separator
18491    # $rhs           # The property value; what's to the right
18492    # $valid_code    # A code point that's known to be in the
18493                     # table given by LHS=RHS; undef if table is
18494                     # empty
18495    # $invalid_code  # A code point known to not be in the table;
18496                     # undef if the table is all code points
18497    # $warning
18498
18499    return if $lhs eq "";
18500    return if $lhs =~ / ^ Is_ /x;   # These are not currently supported
18501
18502    # Generate a standardized pattern, with colon being the delimitter
18503    my $wildcard = "$lhs=:\\A$rhs\\z:";
18504
18505    my @output;
18506    push @output, "Expect(1, $valid_code, '\\p{$wildcard}', $warning);"
18507                                                        if defined $valid_code;
18508    push @output, "Expect(0, $invalid_code, '\\p{$wildcard}', $warning);"
18509                                                      if defined $invalid_code;
18510    return @output;
18511}
18512
18513sub generate_error($lhs, $rhs, $already_in_error=0) {
18514    # This used only for making the test script.  It generates test cases that
18515    # are expected to not only not match, but to be syntax or similar errors
18516
18517    # $lhs                # The property: what's to the left of the
18518                          # colon or equals separator
18519    # $rhs                # The property value; what's to the right
18520    # $already_in_error   # Boolean; if true it's known that the
18521                          # unmodified LHS and RHS will cause an error.
18522                          # This routine should not force another one
18523    # Get the colon or equal
18524    my $separator = generate_separator($lhs);
18525
18526    # Since this is an error only, don't bother to randomly decide whether to
18527    # put the error on the left or right side; and assume that the RHS is
18528    # loosely matched, again for convenience rather than rigor.
18529    $rhs = randomize_loose_name($rhs, 'ERROR') unless $already_in_error;
18530
18531    my $property = $lhs . $separator . $rhs;
18532
18533    return <<"EOC";
18534Error('\\p{$property}');
18535Error('\\P{$property}');
18536EOC
18537}
18538
18539# These are used only for making the test script
18540# XXX Maybe should also have a bad strict seps, which includes underscore.
18541
18542my @good_loose_seps = (
18543            " ",
18544            "-",
18545            "\t",
18546            "",
18547            "_",
18548           );
18549my @bad_loose_seps = (
18550           "/a/",
18551           ':=',
18552          );
18553
18554sub randomize_stricter_name($name) {
18555    # This used only for making the test script.  Take the input name and
18556    # return a randomized, but valid version of it under the stricter matching
18557    # rules.
18558
18559    # If the name looks like a number (integer, floating, or rational), do
18560    # some extra work
18561    if ($name =~ qr{ ^ ( -? ) (\d+ ( ( [./] ) \d+ )? ) $ }x) {
18562        my $sign = $1;
18563        my $number = $2;
18564        my $separator = $3;
18565
18566        # If there isn't a sign, part of the time add a plus
18567        # Note: Not testing having any denominator having a minus sign
18568        if (! $sign) {
18569            $sign = '+' if rand() <= .3;
18570        }
18571
18572        # And add 0 or more leading zeros.
18573        $name = $sign . ('0' x int rand(10)) . $number;
18574
18575        if (defined $separator) {
18576            my $extra_zeros = '0' x int rand(10);
18577
18578            if ($separator eq '.') {
18579
18580                # Similarly, add 0 or more trailing zeros after a decimal
18581                # point
18582                $name .= $extra_zeros;
18583            }
18584            else {
18585
18586                # Or, leading zeros before the denominator
18587                $name =~ s,/,/$extra_zeros,;
18588            }
18589        }
18590    }
18591
18592    # For legibility of the test, only change the case of whole sections at a
18593    # time.  To do this, first split into sections.  The split returns the
18594    # delimiters
18595    my @sections;
18596    for my $section (split / ( [ - + \s _ . ]+ ) /x, $name) {
18597        trace $section if main::DEBUG && $to_trace;
18598
18599        if (length $section > 1 && $section !~ /\D/) {
18600
18601            # If the section is a sequence of digits, about half the time
18602            # randomly add underscores between some of them.
18603            if (rand() > .5) {
18604
18605                # Figure out how many underscores to add.  max is 1 less than
18606                # the number of digits.  (But add 1 at the end to make sure
18607                # result isn't 0, and compensate earlier by subtracting 2
18608                # instead of 1)
18609                my $num_underscores = int rand(length($section) - 2) + 1;
18610
18611                # And add them evenly throughout, for convenience, not rigor
18612                use integer;
18613                my $spacing = (length($section) - 1)/ $num_underscores;
18614                my $temp = $section;
18615                $section = "";
18616                for my $i (1 .. $num_underscores) {
18617                    $section .= substr($temp, 0, $spacing, "") . '_';
18618                }
18619                $section .= $temp;
18620            }
18621            push @sections, $section;
18622        }
18623        else {
18624
18625            # Here not a sequence of digits.  Change the case of the section
18626            # randomly
18627            my $switch = int rand(4);
18628            if ($switch == 0) {
18629                push @sections, uc $section;
18630            }
18631            elsif ($switch == 1) {
18632                push @sections, lc $section;
18633            }
18634            elsif ($switch == 2) {
18635                push @sections, ucfirst $section;
18636            }
18637            else {
18638                push @sections, $section;
18639            }
18640        }
18641    }
18642    trace "returning", join "", @sections if main::DEBUG && $to_trace;
18643    return join "", @sections;
18644}
18645
18646sub randomize_loose_name($name, $want_error=0) {
18647    # This used only for making the test script
18648
18649    $name = randomize_stricter_name($name);
18650
18651    my @parts;
18652    push @parts, $good_loose_seps[rand(@good_loose_seps)];
18653
18654    # Preserve trailing ones for the sake of not stripping the underscore from
18655    # 'L_'
18656    for my $part (split /[-\s_]+ (?= . )/, $name) {
18657        if (@parts) {
18658            if ($want_error and rand() < 0.3) {
18659                push @parts, $bad_loose_seps[rand(@bad_loose_seps)];
18660                $want_error = 0;
18661            }
18662            else {
18663                push @parts, $good_loose_seps[rand(@good_loose_seps)];
18664            }
18665        }
18666        push @parts, $part;
18667    }
18668    my $new = join("", @parts);
18669    trace "$name => $new" if main::DEBUG && $to_trace;
18670
18671    if ($want_error) {
18672        if (rand() >= 0.5) {
18673            $new .= $bad_loose_seps[rand(@bad_loose_seps)];
18674        }
18675        else {
18676            $new = $bad_loose_seps[rand(@bad_loose_seps)] . $new;
18677        }
18678    }
18679    return $new;
18680}
18681
18682# Used to make sure don't generate duplicate test cases.
18683my %test_generated;
18684
18685sub make_property_test_script() {
18686    # This used only for making the test script
18687    # this written directly -- it's huge.
18688
18689    print "Making test script\n" if $verbosity >= $PROGRESS;
18690
18691    # This uses randomness to test different possibilities without testing all
18692    # possibilities.  To ensure repeatability, set the seed to 0.  But if
18693    # tests are added, it will perturb all later ones in the .t file
18694    srand 0;
18695
18696    $t_path = 'TestProp.pl' unless defined $t_path; # the traditional name
18697
18698    # Create a list of what the %f representation is for each rational number.
18699    # This will be used below.
18700    my @valid_base_floats = '0.0';
18701    foreach my $e_representation (keys %nv_floating_to_rational) {
18702        push @valid_base_floats,
18703                            eval $nv_floating_to_rational{$e_representation};
18704    }
18705
18706    # It doesn't matter whether the elements of this array contain single lines
18707    # or multiple lines. main::write doesn't count the lines.
18708    my @output;
18709
18710    push @output, <<'EOF_CODE';
18711Error('\p{Script=InGreek}');    # Bug #69018
18712Test_GCB("1100 $nobreak 1161");  # Bug #70940
18713Expect(0, 0x2028, '\p{Print}', ""); # Bug # 71722
18714Expect(0, 0x2029, '\p{Print}', ""); # Bug # 71722
18715Expect(1, 0xFF10, '\p{XDigit}', ""); # Bug # 71726
18716Error('\p{InKana}');    # 'Kana' is not a block so InKana shouldn't compile
18717
18718# Make sure this gets tested; it was not part of the official test suite at
18719# the time this was added.  Note that this is as it would appear in the
18720# official suite, and gets modified to check for the perl tailoring by
18721# Test_WB()
18722Test_WB("$breakable 0020 $breakable 0020 $breakable 0308 $breakable");
18723Test_LB("$nobreak 200B $nobreak 0020 $nobreak 0020 $breakable 2060 $breakable");
18724Expect(1, ord(" "), '\p{gc=:(?aa)s:}', "");     # /aa is valid
18725Expect(1, ord(" "), '\p{gc=:(?-s)s:}', "");     # /-s is valid
18726EOF_CODE
18727
18728    # Sort these so get results in same order on different runs of this
18729    # program
18730    foreach my $property (sort { $a->has_dependency <=> $b->has_dependency
18731                                    or
18732                                 lc $a->name cmp lc $b->name
18733                               } property_ref('*'))
18734    {
18735        # Non-binary properties should not match \p{};  Test all for that.
18736        if ($property->type != $BINARY && $property->type != $FORCED_BINARY) {
18737            my @property_aliases = grep { $_->status ne $INTERNAL_ALIAS }
18738                                                            $property->aliases;
18739            foreach my $property_alias ($property->aliases) {
18740                my $name = standardize($property_alias->name);
18741
18742                # But some names are ambiguous, meaning a binary property with
18743                # the same name when used in \p{}, and a different
18744                # (non-binary) property in other contexts.
18745                next if grep { $name eq $_ } keys %ambiguous_names;
18746
18747                push @output, <<"EOF_CODE";
18748Error('\\p{$name}');
18749Error('\\P{$name}');
18750EOF_CODE
18751            }
18752        }
18753        foreach my $table (sort { $a->has_dependency <=> $b->has_dependency
18754                                    or
18755                                  lc $a->name cmp lc $b->name
18756                                } $property->tables)
18757        {
18758
18759            # Find code points that match, and don't match this table.
18760            my $valid = $table->get_valid_code_point;
18761            my $invalid = $table->get_invalid_code_point;
18762            my $warning = ($table->status eq $DEPRECATED)
18763                            ? "'deprecated'"
18764                            : '""';
18765
18766            # Test each possible combination of the property's aliases with
18767            # the table's.  If this gets to be too many, could do what is done
18768            # in the set_final_comment() for Tables
18769            my @table_aliases = grep { $_->status ne $INTERNAL_ALIAS } $table->aliases;
18770            next unless @table_aliases;
18771            my @property_aliases = grep { $_->status ne $INTERNAL_ALIAS } $table->property->aliases;
18772            next unless @property_aliases;
18773
18774            # Every property can be optionally be prefixed by 'Is_', so test
18775            # that those work, by creating such a new alias for each
18776            # pre-existing one.
18777            push @property_aliases, map { Alias->new("Is_" . $_->name,
18778                                                    $_->loose_match,
18779                                                    $_->make_re_pod_entry,
18780                                                    $_->ok_as_filename,
18781                                                    $_->status,
18782                                                    $_->ucd,
18783                                                    )
18784                                         } @property_aliases;
18785            my $max = max(scalar @table_aliases, scalar @property_aliases);
18786            for my $j (0 .. $max - 1) {
18787
18788                # The current alias for property is the next one on the list,
18789                # or if beyond the end, start over.  Similarly for table
18790                my $property_name
18791                            = $property_aliases[$j % @property_aliases]->name;
18792
18793                $property_name = "" if $table->property == $perl;
18794                my $table_alias = $table_aliases[$j % @table_aliases];
18795                my $table_name = $table_alias->name;
18796                my $loose_match = $table_alias->loose_match;
18797
18798                # If the table doesn't have a file, any test for it is
18799                # already guaranteed to be in error
18800                my $already_error = ! $table->file_path;
18801
18802                # A table that begins with these could actually be a
18803                # user-defined property, so won't be compile time errors, as
18804                # the definitions of those can be deferred until runtime
18805                next if $already_error && $table_name =~ / ^ I[ns] /x;
18806
18807                # Generate error cases for this alias.
18808                push @output, generate_error($property_name,
18809                                             $table_name,
18810                                             $already_error);
18811
18812                # If the table is guaranteed to always generate an error,
18813                # quit now without generating success cases.
18814                next if $already_error;
18815
18816                # Now for the success cases.  First, wildcard matching, as it
18817                # shouldn't have any randomization.
18818                if ($table_alias->status eq $NORMAL) {
18819                    push @output, generate_wildcard_tests($property_name,
18820                                                          $table_name,
18821                                                          $valid,
18822                                                          $invalid,
18823                                                          $warning,
18824                                                         );
18825                }
18826                my $random;
18827                if ($loose_match) {
18828
18829                    # For loose matching, create an extra test case for the
18830                    # standard name.
18831                    my $standard = standardize($table_name);
18832
18833                    # $test_name should be a unique combination for each test
18834                    # case; used just to avoid duplicate tests
18835                    my $test_name = "$property_name=$standard";
18836
18837                    # Don't output duplicate test cases.
18838                    if (! exists $test_generated{$test_name}) {
18839                        $test_generated{$test_name} = 1;
18840                        push @output, generate_tests($property_name,
18841                                                     $standard,
18842                                                     $valid,
18843                                                     $invalid,
18844                                                     $warning,
18845                                                 );
18846                        if ($table_alias->status eq $NORMAL) {
18847                            push @output, generate_wildcard_tests(
18848                                                     $property_name,
18849                                                     $standard,
18850                                                     $valid,
18851                                                     $invalid,
18852                                                     $warning,
18853                                                 );
18854                        }
18855                    }
18856                    $random = randomize_loose_name($table_name)
18857                }
18858                else { # Stricter match
18859                    $random = randomize_stricter_name($table_name);
18860                }
18861
18862                # Now for the main test case for this alias.
18863                my $test_name = "$property_name=$random";
18864                if (! exists $test_generated{$test_name}) {
18865                    $test_generated{$test_name} = 1;
18866                    push @output, generate_tests($property_name,
18867                                                 $random,
18868                                                 $valid,
18869                                                 $invalid,
18870                                                 $warning,
18871                                             );
18872
18873                    if ($property->name eq 'nv') {
18874                        if ($table_name !~ qr{/}) {
18875                            push @output, generate_tests($property_name,
18876                                                sprintf("%.15e", $table_name),
18877                                                $valid,
18878                                                $invalid,
18879                                                $warning,
18880                                            );
18881                    }
18882                    else {
18883                        # If the name is a rational number, add tests for a
18884                        # non-reduced form, and for a floating point equivalent.
18885
18886                        # 60 is a number divisible by a bunch of things
18887                        my ($numerator, $denominator) = $table_name
18888                                                        =~ m! (.+) / (.+) !x;
18889                        $numerator *= 60;
18890                        $denominator *= 60;
18891                        push @output, generate_tests($property_name,
18892                                                    "$numerator/$denominator",
18893                                                    $valid,
18894                                                    $invalid,
18895                                                    $warning,
18896                                    );
18897
18898                        # Calculate the float, and the %e representation
18899                        my $float = eval $table_name;
18900                        my $e_representation = sprintf("%.*e",
18901                                                $E_FLOAT_PRECISION, $float);
18902                        # Parse that
18903                        my ($non_zeros, $zeros, $exponent_sign, $exponent)
18904                           = $e_representation
18905                               =~ / -? [1-9] \. (\d*?) (0*) e ([+-]) (\d+) /x;
18906                        my $min_e_precision;
18907                        my $min_f_precision;
18908
18909                        if ($exponent_sign eq '+' && $exponent != 0) {
18910                            Carp::my_carp_bug("Not yet equipped to handle"
18911                                            . " positive exponents");
18912                            return;
18913                        }
18914                        else {
18915                            # We're trying to find the minimum precision that
18916                            # is needed to indicate this particular rational
18917                            # for the given $E_FLOAT_PRECISION.  For %e, any
18918                            # trailing zeros, like 1.500e-02 aren't needed, so
18919                            # the correct value is how many non-trailing zeros
18920                            # there are after the decimal point.
18921                            $min_e_precision = length $non_zeros;
18922
18923                            # For %f, like .01500, we want at least
18924                            # $E_FLOAT_PRECISION digits, but any trailing
18925                            # zeros aren't needed, so we can subtract the
18926                            # length of those.  But we also need to include
18927                            # the zeros after the decimal point, but before
18928                            # the first significant digit.
18929                            $min_f_precision = $E_FLOAT_PRECISION
18930                                             + $exponent
18931                                             - length $zeros;
18932                        }
18933
18934                        # Make tests for each possible precision from 1 to
18935                        # just past the worst case.
18936                        my $upper_limit = ($min_e_precision > $min_f_precision)
18937                                           ? $min_e_precision
18938                                           : $min_f_precision;
18939
18940                        for my $i (1 .. $upper_limit + 1) {
18941                            for my $format ("e", "f") {
18942                                my $this_table
18943                                          = sprintf("%.*$format", $i, $float);
18944
18945                                # If we don't have enough precision digits,
18946                                # make a fail test; otherwise a pass test.
18947                                my $pass = ($format eq "e")
18948                                            ? $i >= $min_e_precision
18949                                            : $i >= $min_f_precision;
18950                                if ($pass) {
18951                                    push @output, generate_tests($property_name,
18952                                                                $this_table,
18953                                                                $valid,
18954                                                                $invalid,
18955                                                                $warning,
18956                                                );
18957                                }
18958                                elsif (   $format eq "e"
18959
18960                                          # Here we would fail, but in the %f
18961                                          # case, the representation at this
18962                                          # precision could actually be a
18963                                          # valid one for some other rational
18964                                       || ! grep { $this_table
18965                                                            =~ / ^ $_ 0* $ /x }
18966                                                            @valid_base_floats)
18967                                {
18968                                    push @output,
18969                                        generate_error($property_name,
18970                                                       $this_table,
18971                                                       1   # 1 => already an
18972                                                           # error
18973                                                );
18974                                }
18975                            }
18976                        }
18977                    }
18978                    }
18979                }
18980            }
18981            $table->DESTROY();
18982        }
18983        $property->DESTROY();
18984    }
18985
18986    # Make any test of the boundary (break) properties TODO if the code
18987    # doesn't match the version being compiled
18988    my $TODO_FAILING_BREAKS = ($version_of_mk_invlist_bounds ne $v_version)
18989                             ? "\nsub TODO_FAILING_BREAKS { 1 }\n"
18990                             : "\nsub TODO_FAILING_BREAKS { 0 }\n";
18991
18992    @output= map {
18993        map s/^/    /mgr,
18994        map "$_;\n",
18995        split /;\n/, $_
18996    } @output;
18997
18998    # Cause there to be 'if' statements to only execute a portion of this
18999    # long-running test each time, so that we can have a bunch of .t's running
19000    # in parallel
19001    my $chunks = 10     # Number of test files
19002               - 1      # For GCB & SB
19003               - 1      # For WB
19004               - 4;     # LB split into this many files
19005    my @output_chunked;
19006    my $chunk_count=0;
19007    my $chunk_size= int(@output / $chunks) + 1;
19008    while (@output) {
19009        $chunk_count++;
19010        my @chunk= splice @output, 0, $chunk_size;
19011        push @output_chunked,
19012            "if (!\$::TESTCHUNK or \$::TESTCHUNK == $chunk_count) {\n",
19013                @chunk,
19014            "}\n";
19015    }
19016
19017    $chunk_count++;
19018    push @output_chunked,
19019        "if (!\$::TESTCHUNK or \$::TESTCHUNK == $chunk_count) {\n",
19020            (map {"    Test_GCB('$_');\n"} @backslash_X_tests),
19021            (map {"    Test_SB('$_');\n"} @SB_tests),
19022        "}\n";
19023
19024
19025    $chunk_size= int(@LB_tests / 4) + 1;
19026    @LB_tests = map {"    Test_LB('$_');\n"} @LB_tests;
19027    while (@LB_tests) {
19028        $chunk_count++;
19029        my @chunk= splice @LB_tests, 0, $chunk_size;
19030        push @output_chunked,
19031            "if (!\$::TESTCHUNK or \$::TESTCHUNK == $chunk_count) {\n",
19032                @chunk,
19033            "}\n";
19034    }
19035
19036    $chunk_count++;
19037    push @output_chunked,
19038        "if (!\$::TESTCHUNK or \$::TESTCHUNK == $chunk_count) {\n",
19039            (map {"    Test_WB('$_');\n"} @WB_tests),
19040        "}\n";
19041
19042    &write($t_path,
19043           0,           # Not utf8;
19044           [$HEADER,
19045            $TODO_FAILING_BREAKS,
19046            <DATA>,
19047            @output_chunked,
19048            "Finished();\n",
19049           ]);
19050
19051    return;
19052}
19053
19054sub make_normalization_test_script() {
19055    print "Making normalization test script\n" if $verbosity >= $PROGRESS;
19056
19057    my $n_path = 'TestNorm.pl';
19058
19059    unshift @normalization_tests, <<'END';
19060use utf8;
19061use Test::More;
19062
19063sub ord_string {    # Convert packed ords to printable string
19064    use charnames ();
19065    return "'" . join("", map { '\N{' . charnames::viacode($_) . '}' }
19066                                                unpack "U*", shift) .  "'";
19067    #return "'" . join(" ", map { sprintf "%04X", $_ } unpack "U*", shift) .  "'";
19068}
19069
19070sub Test_N {
19071    my ($source, $nfc, $nfd, $nfkc, $nfkd) = @_;
19072    my $display_source = ord_string($source);
19073    my $display_nfc = ord_string($nfc);
19074    my $display_nfd = ord_string($nfd);
19075    my $display_nfkc = ord_string($nfkc);
19076    my $display_nfkd = ord_string($nfkd);
19077
19078    use Unicode::Normalize;
19079    #    NFC
19080    #      nfc ==  toNFC(source) ==  toNFC(nfc) ==  toNFC(nfd)
19081    #      nfkc ==  toNFC(nfkc) ==  toNFC(nfkd)
19082    #
19083    #    NFD
19084    #      nfd ==  toNFD(source) ==  toNFD(nfc) ==  toNFD(nfd)
19085    #      nfkd ==  toNFD(nfkc) ==  toNFD(nfkd)
19086    #
19087    #    NFKC
19088    #      nfkc == toNFKC(source) == toNFKC(nfc) == toNFKC(nfd) ==
19089    #      toNFKC(nfkc) == toNFKC(nfkd)
19090    #
19091    #    NFKD
19092    #      nfkd == toNFKD(source) == toNFKD(nfc) == toNFKD(nfd) ==
19093    #      toNFKD(nfkc) == toNFKD(nfkd)
19094
19095    is(NFC($source), $nfc, "NFC($display_source) eq $display_nfc");
19096    is(NFC($nfc), $nfc, "NFC($display_nfc) eq $display_nfc");
19097    is(NFC($nfd), $nfc, "NFC($display_nfd) eq $display_nfc");
19098    is(NFC($nfkc), $nfkc, "NFC($display_nfkc) eq $display_nfkc");
19099    is(NFC($nfkd), $nfkc, "NFC($display_nfkd) eq $display_nfkc");
19100
19101    is(NFD($source), $nfd, "NFD($display_source) eq $display_nfd");
19102    is(NFD($nfc), $nfd, "NFD($display_nfc) eq $display_nfd");
19103    is(NFD($nfd), $nfd, "NFD($display_nfd) eq $display_nfd");
19104    is(NFD($nfkc), $nfkd, "NFD($display_nfkc) eq $display_nfkd");
19105    is(NFD($nfkd), $nfkd, "NFD($display_nfkd) eq $display_nfkd");
19106
19107    is(NFKC($source), $nfkc, "NFKC($display_source) eq $display_nfkc");
19108    is(NFKC($nfc), $nfkc, "NFKC($display_nfc) eq $display_nfkc");
19109    is(NFKC($nfd), $nfkc, "NFKC($display_nfd) eq $display_nfkc");
19110    is(NFKC($nfkc), $nfkc, "NFKC($display_nfkc) eq $display_nfkc");
19111    is(NFKC($nfkd), $nfkc, "NFKC($display_nfkd) eq $display_nfkc");
19112
19113    is(NFKD($source), $nfkd, "NFKD($display_source) eq $display_nfkd");
19114    is(NFKD($nfc), $nfkd, "NFKD($display_nfc) eq $display_nfkd");
19115    is(NFKD($nfd), $nfkd, "NFKD($display_nfd) eq $display_nfkd");
19116    is(NFKD($nfkc), $nfkd, "NFKD($display_nfkc) eq $display_nfkd");
19117    is(NFKD($nfkd), $nfkd, "NFKD($display_nfkd) eq $display_nfkd");
19118}
19119END
19120
19121    &write($n_path,
19122           1,           # Is utf8;
19123           [
19124            @normalization_tests,
19125            'done_testing();'
19126            ]);
19127    return;
19128}
19129
19130# Skip reasons, so will be exact same text and hence the files with each
19131# reason will get grouped together in perluniprops.
19132my $Documentation = "Documentation";
19133my $Indic_Skip
19134            = "Provisional; for the analysis and processing of Indic scripts";
19135my $Validation = "Validation Tests";
19136my $Validation_Documentation = "Documentation of validation Tests";
19137
19138# This is a list of the input files and how to handle them.  The files are
19139# processed in their order in this list.  Some reordering is possible if
19140# desired, but the PropertyAliases and PropValueAliases files should be first,
19141# and the extracted before the others (as data in an extracted file can be
19142# over-ridden by the non-extracted.  Some other files depend on data derived
19143# from an earlier file, like UnicodeData requires data from Jamo, and the case
19144# changing and folding requires data from Unicode.  Mostly, it is safest to
19145# order by first version releases in (except the Jamo).
19146#
19147# The version strings allow the program to know whether to expect a file or
19148# not, but if a file exists in the directory, it will be processed, even if it
19149# is in a version earlier than expected, so you can copy files from a later
19150# release into an earlier release's directory.
19151my @input_file_objects = (
19152    Input_file->new('PropertyAliases.txt', v3.2,
19153                    Handler => \&process_PropertyAliases,
19154                    Early => [ \&substitute_PropertyAliases ],
19155                    Required_Even_in_Debug_Skip => 1,
19156                   ),
19157    Input_file->new(undef, v0,  # No file associated with this
19158                    Progress_Message => 'Finishing property setup',
19159                    Handler => \&finish_property_setup,
19160                   ),
19161    Input_file->new('PropValueAliases.txt', v3.2,
19162                     Handler => \&process_PropValueAliases,
19163                     Early => [ \&substitute_PropValueAliases ],
19164                     Has_Missings_Defaults => $NOT_IGNORED,
19165                     Required_Even_in_Debug_Skip => 1,
19166                    ),
19167    Input_file->new("${EXTRACTED}DGeneralCategory.txt", v3.1.0,
19168                    Property => 'General_Category',
19169                   ),
19170    Input_file->new("${EXTRACTED}DCombiningClass.txt", v3.1.0,
19171                    Property => 'Canonical_Combining_Class',
19172                    Has_Missings_Defaults => $NOT_IGNORED,
19173                   ),
19174    Input_file->new("${EXTRACTED}DNumType.txt", v3.1.0,
19175                    Property => 'Numeric_Type',
19176                    Has_Missings_Defaults => $NOT_IGNORED,
19177                   ),
19178    Input_file->new("${EXTRACTED}DEastAsianWidth.txt", v3.1.0,
19179                    Property => 'East_Asian_Width',
19180                    Has_Missings_Defaults => $NOT_IGNORED,
19181                   ),
19182    Input_file->new("${EXTRACTED}DLineBreak.txt", v3.1.0,
19183                    Property => 'Line_Break',
19184                    Has_Missings_Defaults => $NOT_IGNORED,
19185                   ),
19186    Input_file->new("${EXTRACTED}DBidiClass.txt", v3.1.1,
19187                    Property => 'Bidi_Class',
19188                    Has_Missings_Defaults => $NOT_IGNORED,
19189                   ),
19190    Input_file->new("${EXTRACTED}DDecompositionType.txt", v3.1.0,
19191                    Property => 'Decomposition_Type',
19192                    Has_Missings_Defaults => $NOT_IGNORED,
19193                   ),
19194    Input_file->new("${EXTRACTED}DBinaryProperties.txt", v3.1.0),
19195    Input_file->new("${EXTRACTED}DNumValues.txt", v3.1.0,
19196                    Property => 'Numeric_Value',
19197                    Each_Line_Handler => \&filter_numeric_value_line,
19198                    Has_Missings_Defaults => $NOT_IGNORED,
19199                   ),
19200    Input_file->new("${EXTRACTED}DJoinGroup.txt", v3.1.0,
19201                    Property => 'Joining_Group',
19202                    Has_Missings_Defaults => $NOT_IGNORED,
19203                   ),
19204
19205    Input_file->new("${EXTRACTED}DJoinType.txt", v3.1.0,
19206                    Property => 'Joining_Type',
19207                    Has_Missings_Defaults => $NOT_IGNORED,
19208                   ),
19209    Input_file->new("${EXTRACTED}DName.txt", v10.0.0,
19210                    Skip => 'This file adds no new information not already'
19211                          . ' present in other files',
19212                    # And it's unnecessary programmer work to handle this new
19213                    # format.  Previous Derived files actually had bug fixes
19214                    # in them that were useful, but that should not be the
19215                    # case here.
19216                   ),
19217    Input_file->new('Jamo.txt', v2.0.0,
19218                    Property => 'Jamo_Short_Name',
19219                    Each_Line_Handler => \&filter_jamo_line,
19220                   ),
19221    Input_file->new('UnicodeData.txt', v1.1.5,
19222                    Pre_Handler => \&setup_UnicodeData,
19223
19224                    # We clean up this file for some early versions.
19225                    Each_Line_Handler => [ (($v_version lt v2.0.0 )
19226                                            ? \&filter_v1_ucd
19227                                            : ($v_version eq v2.1.5)
19228                                                ? \&filter_v2_1_5_ucd
19229
19230                                                # And for 5.14 Perls with 6.0,
19231                                                # have to also make changes
19232                                                : ($v_version ge v6.0.0
19233                                                   && $^V lt v5.17.0)
19234                                                    ? \&filter_v6_ucd
19235                                                    : undef),
19236
19237                                            # Early versions did not have the
19238                                            # proper Unicode_1 names for the
19239                                            # controls
19240                                            (($v_version lt v3.0.0)
19241                                            ? \&filter_early_U1_names
19242                                            : undef),
19243
19244                                            # Early versions did not correctly
19245                                            # use the later method for giving
19246                                            # decimal digit values
19247                                            (($v_version le v3.2.0)
19248                                            ? \&filter_bad_Nd_ucd
19249                                            : undef),
19250
19251                                            # And the main filter
19252                                            \&filter_UnicodeData_line,
19253                                         ],
19254                    EOF_Handler => \&EOF_UnicodeData,
19255                   ),
19256    Input_file->new('CJKXREF.TXT', v1.1.5,
19257                    Withdrawn => v2.0.0,
19258                    Skip => 'Gives the mapping of CJK code points '
19259                          . 'between Unicode and various other standards',
19260                   ),
19261    Input_file->new('ArabicShaping.txt', v2.0.0,
19262                    Each_Line_Handler =>
19263                        ($v_version lt 4.1.0)
19264                                    ? \&filter_old_style_arabic_shaping
19265                                    : undef,
19266                    # The first field after the range is a "schematic name"
19267                    # not used by Perl
19268                    Properties => [ '<ignored>', 'Joining_Type', 'Joining_Group' ],
19269                    Has_Missings_Defaults => $NOT_IGNORED,
19270                   ),
19271    Input_file->new('Blocks.txt', v2.0.0,
19272                    Property => 'Block',
19273                    Has_Missings_Defaults => $NOT_IGNORED,
19274                    Each_Line_Handler => \&filter_blocks_lines
19275                   ),
19276    Input_file->new('Index.txt', v2.0.0,
19277                    Skip => 'Alphabetical index of Unicode characters',
19278                   ),
19279    Input_file->new('NamesList.txt', v2.0.0,
19280                    Skip => 'Annotated list of characters',
19281                   ),
19282    Input_file->new('PropList.txt', v2.0.0,
19283                    Each_Line_Handler => (($v_version lt v3.1.0)
19284                                            ? \&filter_old_style_proplist
19285                                            : undef),
19286                   ),
19287    Input_file->new('Props.txt', v2.0.0,
19288                    Withdrawn => v3.0.0,
19289                    Skip => 'A subset of F<PropList.txt> (which is used instead)',
19290                   ),
19291    Input_file->new('ReadMe.txt', v2.0.0,
19292                    Skip => $Documentation,
19293                   ),
19294    Input_file->new('Unihan.txt', v2.0.0,
19295                    Withdrawn => v5.2.0,
19296                    Construction_Time_Handler => \&construct_unihan,
19297                    Pre_Handler => \&setup_unihan,
19298                    Optional => [ "",
19299                                  'Unicode_Radical_Stroke'
19300                                ],
19301                    Each_Line_Handler => \&filter_unihan_line,
19302                   ),
19303    Input_file->new('SpecialCasing.txt', v2.1.8,
19304                    Each_Line_Handler => ($v_version eq 2.1.8)
19305                                         ? \&filter_2_1_8_special_casing_line
19306                                         : \&filter_special_casing_line,
19307                    Pre_Handler => \&setup_special_casing,
19308                    Has_Missings_Defaults => $IGNORED,
19309                   ),
19310    Input_file->new(
19311                    'LineBreak.txt', v3.0.0,
19312                    Has_Missings_Defaults => $NOT_IGNORED,
19313                    Property => 'Line_Break',
19314                    # Early versions had problematic syntax
19315                    Each_Line_Handler => ($v_version ge v3.1.0)
19316                                          ? undef
19317                                          : ($v_version lt v3.0.0)
19318                                            ? \&filter_substitute_lb
19319                                            : \&filter_early_ea_lb,
19320                    # Must use long names for property values see comments at
19321                    # sub filter_substitute_lb
19322                    Early => [ "LBsubst.txt", '_Perl_LB', 'Alphabetic',
19323                               'Alphabetic', # default to this because XX ->
19324                                             # AL
19325
19326                               # Don't use _Perl_LB as a synonym for
19327                               # Line_Break in later perls, as it is tailored
19328                               # and isn't the same as Line_Break
19329                               'ONLY_EARLY' ],
19330                   ),
19331    Input_file->new('EastAsianWidth.txt', v3.0.0,
19332                    Property => 'East_Asian_Width',
19333                    Has_Missings_Defaults => $NOT_IGNORED,
19334                    # Early versions had problematic syntax
19335                    Each_Line_Handler => (($v_version lt v3.1.0)
19336                                        ? \&filter_early_ea_lb
19337                                        : undef),
19338                   ),
19339    Input_file->new('CompositionExclusions.txt', v3.0.0,
19340                    Property => 'Composition_Exclusion',
19341                   ),
19342    Input_file->new('UnicodeData.html', v3.0.0,
19343                    Withdrawn => v4.0.1,
19344                    Skip => $Documentation,
19345                   ),
19346    Input_file->new('BidiMirroring.txt', v3.0.1,
19347                    Property => 'Bidi_Mirroring_Glyph',
19348                    Has_Missings_Defaults => ($v_version lt v6.2.0)
19349                                              ? $NO_DEFAULTS
19350                                              # Is <none> which doesn't mean
19351                                              # anything to us, we will use the
19352                                              # null string
19353                                              : $IGNORED,
19354                   ),
19355    Input_file->new('NamesList.html', v3.0.0,
19356                    Skip => 'Describes the format and contents of '
19357                          . 'F<NamesList.txt>',
19358                   ),
19359    Input_file->new('UnicodeCharacterDatabase.html', v3.0.0,
19360                    Withdrawn => v5.1,
19361                    Skip => $Documentation,
19362                   ),
19363    Input_file->new('CaseFolding.txt', v3.0.1,
19364                    Pre_Handler => \&setup_case_folding,
19365                    Each_Line_Handler =>
19366                        [ ($v_version lt v3.1.0)
19367                                 ? \&filter_old_style_case_folding
19368                                 : undef,
19369                           \&filter_case_folding_line
19370                        ],
19371                    Has_Missings_Defaults => $IGNORED,
19372                   ),
19373    Input_file->new("NormTest.txt", v3.0.1,
19374                     Handler => \&process_NormalizationsTest,
19375                     Skip => ($make_norm_test_script) ? 0 : $Validation,
19376                   ),
19377    Input_file->new('DCoreProperties.txt', v3.1.0,
19378                    # 5.2 changed this file
19379                    Has_Missings_Defaults => (($v_version ge v5.2.0)
19380                                            ? $NOT_IGNORED
19381                                            : $NO_DEFAULTS),
19382                   ),
19383    Input_file->new('DProperties.html', v3.1.0,
19384                    Withdrawn => v3.2.0,
19385                    Skip => $Documentation,
19386                   ),
19387    Input_file->new('PropList.html', v3.1.0,
19388                    Withdrawn => v5.1,
19389                    Skip => $Documentation,
19390                   ),
19391    Input_file->new('Scripts.txt', v3.1.0,
19392                    Property => 'Script',
19393                    Each_Line_Handler => (($v_version le v4.0.0)
19394                                          ? \&filter_all_caps_script_names
19395                                          : undef),
19396                    Has_Missings_Defaults => $NOT_IGNORED,
19397                   ),
19398    Input_file->new('DNormalizationProps.txt', v3.1.0,
19399                    Has_Missings_Defaults => $NOT_IGNORED,
19400                    Each_Line_Handler => (($v_version lt v4.0.1)
19401                                      ? \&filter_old_style_normalization_lines
19402                                      : undef),
19403                   ),
19404    Input_file->new('DerivedProperties.html', v3.1.1,
19405                    Withdrawn => v5.1,
19406                    Skip => $Documentation,
19407                   ),
19408    Input_file->new('DAge.txt', v3.2.0,
19409                    Has_Missings_Defaults => $NOT_IGNORED,
19410                    Property => 'Age'
19411                   ),
19412    Input_file->new('HangulSyllableType.txt', v4.0,
19413                    Has_Missings_Defaults => $NOT_IGNORED,
19414                    Early => [ \&generate_hst, 'Hangul_Syllable_Type' ],
19415                    Property => 'Hangul_Syllable_Type'
19416                   ),
19417    Input_file->new('NormalizationCorrections.txt', v3.2.0,
19418                     # This documents the cumulative fixes to erroneous
19419                     # normalizations in earlier Unicode versions.  Its main
19420                     # purpose is so that someone running on an earlier
19421                     # version can use this file to override what got
19422                     # published in that earlier release.  It would be easy
19423                     # for mktables to handle this file.  But all the
19424                     # corrections in it should already be in the other files
19425                     # for the release it is.  To get it to actually mean
19426                     # something useful, someone would have to be using an
19427                     # earlier Unicode release, and copy it into the directory
19428                     # for that release and recompile.  So far there has been
19429                     # no demand to do that, so this hasn't been implemented.
19430                    Skip => 'Documentation of corrections already '
19431                          . 'incorporated into the Unicode data base',
19432                   ),
19433    Input_file->new('StandardizedVariants.html', v3.2.0,
19434                    Skip => 'Obsoleted as of Unicode 9.0, but previously '
19435                          . 'provided a visual display of the standard '
19436                          . 'variant sequences derived from '
19437                          . 'F<StandardizedVariants.txt>.',
19438                        # I don't know why the html came earlier than the
19439                        # .txt, but both are skipped anyway, so it doesn't
19440                        # matter.
19441                   ),
19442    Input_file->new('StandardizedVariants.txt', v4.0.0,
19443                    Skip => 'Certain glyph variations for character display '
19444                          . 'are standardized.  This lists the non-Unihan '
19445                          . 'ones; the Unihan ones are also not used by '
19446                          . 'Perl, and are in a separate Unicode data base '
19447                          . 'L<http://www.unicode.org/ivd>',
19448                   ),
19449    Input_file->new('UCD.html', v4.0.0,
19450                    Withdrawn => v5.2,
19451                    Skip => $Documentation,
19452                   ),
19453    Input_file->new("$AUXILIARY/WordBreakProperty.txt", v4.1.0,
19454                    Early => [ "WBsubst.txt", '_Perl_WB', 'ALetter' ],
19455                    Property => 'Word_Break',
19456                    Has_Missings_Defaults => $NOT_IGNORED,
19457                   ),
19458    Input_file->new("$AUXILIARY/GraphemeBreakProperty.txt", v4.1,
19459                    Early => [ \&generate_GCB, '_Perl_GCB' ],
19460                    Property => 'Grapheme_Cluster_Break',
19461                    Has_Missings_Defaults => $NOT_IGNORED,
19462                   ),
19463    Input_file->new("$AUXILIARY/GCBTest.txt", v4.1.0,
19464                    Handler => \&process_GCB_test,
19465                    retain_trailing_comments => 1,
19466                   ),
19467    Input_file->new("$AUXILIARY/GraphemeBreakTest.html", v4.1.0,
19468                    Skip => $Validation_Documentation,
19469                   ),
19470    Input_file->new("$AUXILIARY/SBTest.txt", v4.1.0,
19471                    Handler => \&process_SB_test,
19472                    retain_trailing_comments => 1,
19473                   ),
19474    Input_file->new("$AUXILIARY/SentenceBreakTest.html", v4.1.0,
19475                    Skip => $Validation_Documentation,
19476                   ),
19477    Input_file->new("$AUXILIARY/WBTest.txt", v4.1.0,
19478                    Handler => \&process_WB_test,
19479                    retain_trailing_comments => 1,
19480                   ),
19481    Input_file->new("$AUXILIARY/WordBreakTest.html", v4.1.0,
19482                    Skip => $Validation_Documentation,
19483                   ),
19484    Input_file->new("$AUXILIARY/SentenceBreakProperty.txt", v4.1.0,
19485                    Property => 'Sentence_Break',
19486                    Early => [ "SBsubst.txt", '_Perl_SB', 'OLetter' ],
19487                    Has_Missings_Defaults => $NOT_IGNORED,
19488                   ),
19489    Input_file->new('NamedSequences.txt', v4.1.0,
19490                    Handler => \&process_NamedSequences
19491                   ),
19492    Input_file->new('Unihan.html', v4.1.0,
19493                    Withdrawn => v5.2,
19494                    Skip => $Documentation,
19495                   ),
19496    Input_file->new('NameAliases.txt', v5.0,
19497                    Property => 'Name_Alias',
19498                    Each_Line_Handler => ($v_version le v6.0.0)
19499                                   ? \&filter_early_version_name_alias_line
19500                                   : \&filter_later_version_name_alias_line,
19501                   ),
19502        # NameAliases.txt came along in v5.0.  The above constructor handles
19503        # this.  But until 6.1, it was lacking some information needed by core
19504        # perl.  The constructor below handles that.  It is either a kludge or
19505        # clever, depending on your point of view.  The 'Withdrawn' parameter
19506        # indicates not to use it at all starting in 6.1 (so the above
19507        # constructor applies), and the 'v6.1' parameter indicates to use the
19508        # Early parameter before 6.1.  Therefore 'Early" is always used,
19509        # yielding the internal-only property '_Perl_Name_Alias', which it
19510        # gets from a NameAliases.txt from 6.1 or later stored in
19511        # N_Asubst.txt.  In combination with the above constructor,
19512        # 'Name_Alias' is publicly accessible starting with v5.0, and the
19513        # better 6.1 version is accessible to perl core in all releases.
19514    Input_file->new("NameAliases.txt", v6.1,
19515                    Withdrawn => v6.1,
19516                    Early => [ "N_Asubst.txt", '_Perl_Name_Alias', "" ],
19517                    Property => 'Name_Alias',
19518                    EOF_Handler => \&fixup_early_perl_name_alias,
19519                    Each_Line_Handler =>
19520                                       \&filter_later_version_name_alias_line,
19521                   ),
19522    Input_file->new('NamedSqProv.txt', v5.0.0,
19523                    Skip => 'Named sequences proposed for inclusion in a '
19524                          . 'later version of the Unicode Standard; if you '
19525                          . 'need them now, you can append this file to '
19526                          . 'F<NamedSequences.txt> and recompile perl',
19527                   ),
19528    Input_file->new("$AUXILIARY/LBTest.txt", v5.1.0,
19529                    Handler => \&process_LB_test,
19530                    retain_trailing_comments => 1,
19531                   ),
19532    Input_file->new("$AUXILIARY/LineBreakTest.html", v5.1.0,
19533                    Skip => $Validation_Documentation,
19534                   ),
19535    Input_file->new("BidiTest.txt", v5.2.0,
19536                    Skip => $Validation,
19537                   ),
19538    Input_file->new('UnihanIndicesDictionary.txt', v5.2.0,
19539                    Optional => "",
19540                    Each_Line_Handler => \&filter_unihan_line,
19541                   ),
19542    Input_file->new('UnihanDataDictionaryLike.txt', v5.2.0,
19543                    Optional => "",
19544                    Each_Line_Handler => \&filter_unihan_line,
19545                   ),
19546    Input_file->new('UnihanIRGSources.txt', v5.2.0,
19547                    Optional => [ "",
19548                                  'kCompatibilityVariant',
19549                                  'kIICore',
19550                                  'kIRG_GSource',
19551                                  'kIRG_HSource',
19552                                  'kIRG_JSource',
19553                                  'kIRG_KPSource',
19554                                  'kIRG_MSource',
19555                                  'kIRG_KSource',
19556                                  'kIRG_SSource',
19557                                  'kIRG_TSource',
19558                                  'kIRG_USource',
19559                                  'kIRG_UKSource',
19560                                  'kIRG_VSource',
19561                               ],
19562                    Pre_Handler => \&setup_unihan,
19563                    Each_Line_Handler => \&filter_unihan_line,
19564                   ),
19565    Input_file->new('UnihanNumericValues.txt', v5.2.0,
19566                    Optional => [ "",
19567                                  'kAccountingNumeric',
19568                                  'kOtherNumeric',
19569                                  'kPrimaryNumeric',
19570                                ],
19571                    Each_Line_Handler => \&filter_unihan_line,
19572                   ),
19573    Input_file->new('UnihanOtherMappings.txt', v5.2.0,
19574                    Optional => "",
19575                    Each_Line_Handler => \&filter_unihan_line,
19576                   ),
19577    Input_file->new('UnihanRadicalStrokeCounts.txt', v5.2.0,
19578                    Optional => [ "",
19579                                  'Unicode_Radical_Stroke'
19580                                ],
19581                    Each_Line_Handler => \&filter_unihan_line,
19582                   ),
19583    Input_file->new('UnihanReadings.txt', v5.2.0,
19584                    Optional => "",
19585                    Each_Line_Handler => \&filter_unihan_line,
19586                   ),
19587    Input_file->new('UnihanVariants.txt', v5.2.0,
19588                    Optional => "",
19589                    Each_Line_Handler => \&filter_unihan_line,
19590                   ),
19591    Input_file->new('CJKRadicals.txt', v5.2.0,
19592                    Skip => 'Maps the kRSUnicode property values to '
19593                          . 'corresponding code points',
19594                   ),
19595    Input_file->new('EmojiSources.txt', v6.0.0,
19596                    Skip => 'Maps certain Unicode code points to their '
19597                          . 'legacy Japanese cell-phone values',
19598                   ),
19599    # This file is actually not usable as-is until 6.1.0, because the property
19600    # is provisional, so its name is missing from PropertyAliases.txt until
19601    # that release, so that further work would have to be done to get it to
19602    # work properly
19603    Input_file->new('ScriptExtensions.txt', v6.0.0,
19604                    Property => 'Script_Extensions',
19605                    Early => [ sub {} ], # Doesn't do anything but ensures
19606                                         # that this isn't skipped for early
19607                                         # versions
19608                    Pre_Handler => \&setup_script_extensions,
19609                    Each_Line_Handler => \&filter_script_extensions_line,
19610                    Has_Missings_Defaults => (($v_version le v6.0.0)
19611                                            ? $NO_DEFAULTS
19612                                            : $IGNORED),
19613                   ),
19614    # These two Indic files are actually not usable as-is until 6.1.0,
19615    # because they are provisional, so their property values are missing from
19616    # PropValueAliases.txt until that release, so that further work would have
19617    # to be done to get them to work properly.
19618    Input_file->new('IndicMatraCategory.txt', v6.0.0,
19619                    Withdrawn => v8.0.0,
19620                    Property => 'Indic_Matra_Category',
19621                    Has_Missings_Defaults => $NOT_IGNORED,
19622                    Skip => $Indic_Skip,
19623                   ),
19624    Input_file->new('IndicSyllabicCategory.txt', v6.0.0,
19625                    Property => 'Indic_Syllabic_Category',
19626                    Has_Missings_Defaults => $NOT_IGNORED,
19627                    Skip => (($v_version lt v8.0.0)
19628                              ? $Indic_Skip
19629                              : 0),
19630                   ),
19631    Input_file->new('USourceData.txt', v6.2.0,
19632                    Skip => 'Documentation of status and cross reference of '
19633                          . 'proposals for encoding by Unicode of Unihan '
19634                          . 'characters',
19635                   ),
19636    Input_file->new('USourceGlyphs.pdf', v6.2.0,
19637                    Skip => 'Pictures of the characters in F<USourceData.txt>',
19638                   ),
19639    Input_file->new('BidiBrackets.txt', v6.3.0,
19640                    Properties => [ 'Bidi_Paired_Bracket',
19641                                    'Bidi_Paired_Bracket_Type'
19642                                  ],
19643                    Has_Missings_Defaults => $NO_DEFAULTS,
19644                   ),
19645    Input_file->new("BidiCharacterTest.txt", v6.3.0,
19646                    Skip => $Validation,
19647                   ),
19648    Input_file->new('IndicPositionalCategory.txt', v8.0.0,
19649                    Property => 'Indic_Positional_Category',
19650                    Has_Missings_Defaults => $NOT_IGNORED,
19651                   ),
19652    Input_file->new('TangutSources.txt', v9.0.0,
19653                    Skip => 'Specifies source mappings for Tangut ideographs'
19654                          . ' and components. This data file also includes'
19655                          . ' informative radical-stroke values that are used'
19656                          . ' internally by Unicode',
19657                   ),
19658    Input_file->new('VerticalOrientation.txt', v10.0.0,
19659                    Property => 'Vertical_Orientation',
19660                    Has_Missings_Defaults => $NOT_IGNORED,
19661                   ),
19662    Input_file->new('NushuSources.txt', v10.0.0,
19663                    Skip => 'Specifies source material for Nushu characters',
19664                   ),
19665    Input_file->new('EquivalentUnifiedIdeograph.txt', v11.0.0,
19666                    Property => 'Equivalent_Unified_Ideograph',
19667                    Has_Missings_Defaults => $NOT_IGNORED,
19668                   ),
19669    Input_file->new('EmojiData.txt', v11.0.0,
19670                    # Is in UAX #51 and not the UCD, so must be updated
19671                    # separately, and the first line edited to indicate the
19672                    # UCD release we're pretending it to be in.  The UTC says
19673                    # this is a transitional state, and in fact was moved to
19674                    # the UCD in 13.0
19675                    Withdrawn => v13.0.0,
19676                    Pre_Handler => \&setup_emojidata,
19677                    Has_Missings_Defaults => $NOT_IGNORED,
19678                    Each_Line_Handler => \&filter_emojidata_line,
19679                    UCD => 0,
19680                   ),
19681    Input_file->new("$EMOJI/emoji.txt", v13.0.0,
19682                    Has_Missings_Defaults => $NOT_IGNORED,
19683                    UCD => 0,
19684                   ),
19685    Input_file->new("$EMOJI/ReadMe.txt", v13.0.0,
19686                    Skip => $Documentation,
19687                    UCD => 0,
19688                   ),
19689    Input_file->new('IdStatus.txt', v13.0.0,
19690                    Pre_Handler => \&setup_IdStatus,
19691                    Property => 'Identifier_Status',
19692                    UCD => 0,
19693                   ),
19694    Input_file->new('IdType.txt', v13.0.0,
19695                    Pre_Handler => \&setup_IdType,
19696                    Each_Line_Handler => \&filter_IdType_line,
19697                    Property => 'Identifier_Type',
19698                    UCD => 0,
19699                   ),
19700);
19701
19702# End of all the preliminaries.
19703# Do it...
19704
19705if (@missing_early_files) {
19706    print simple_fold(join_lines(<<END
19707
19708The compilation cannot be completed because one or more required input files,
19709listed below, are missing.  This is because you are compiling Unicode version
19710$unicode_version, which predates the existence of these file(s).  To fully
19711function, perl needs the data that these files would have contained if they
19712had been in this release.  To work around this, create copies of later
19713versions of the missing files in the directory containing '$0'.  (Perl will
19714make the necessary adjustments to the data to compensate for it not being the
19715same version as is being compiled.)  The files are available from unicode.org,
19716via either ftp or http.  If using http, they will be under
19717www.unicode.org/versions/.  Below are listed the source file name of each
19718missing file, the Unicode version to copy it from, and the name to store it
19719as.  (Note that the listed source file name may not be exactly the one that
19720Unicode calls it.  If you don't find it, you can look it up in 'README.perl'
19721to get the correct name.)
19722END
19723    ));
19724    print simple_fold(join_lines("\n$_")) for @missing_early_files;
19725    exit 2;
19726}
19727
19728if ($compare_versions) {
19729    Carp::my_carp(<<END
19730Warning.  \$compare_versions is set.  Output is not suitable for production
19731END
19732    );
19733}
19734
19735# Put into %potential_files a list of all the files in the directory structure
19736# that could be inputs to this program
19737File::Find::find({
19738    wanted=>sub {
19739        return unless / \. ( txt | htm l? ) $ /xi;  # Some platforms change the
19740                                                    # name's case
19741        my $full = lc(File::Spec->rel2abs($_));
19742        $potential_files{$full} = 1;
19743        return;
19744    }
19745}, File::Spec->curdir());
19746
19747my @mktables_list_output_files;
19748my $old_start_time = 0;
19749my $old_options = "";
19750
19751if (! -e $file_list) {
19752    print "'$file_list' doesn't exist, so forcing rebuild.\n" if $verbosity >= $VERBOSE;
19753    $write_unchanged_files = 1;
19754} elsif ($write_unchanged_files) {
19755    print "Not checking file list '$file_list'.\n" if $verbosity >= $VERBOSE;
19756}
19757else {
19758    print "Reading file list '$file_list'\n" if $verbosity >= $VERBOSE;
19759    my $file_handle;
19760    if (! open $file_handle, "<", $file_list) {
19761        Carp::my_carp("Failed to open '$file_list'; turning on -globlist option instead: $!");
19762        $glob_list = 1;
19763    }
19764    else {
19765        my @input;
19766
19767        # Read and parse mktables.lst, placing the results from the first part
19768        # into @input, and the second part into @mktables_list_output_files
19769        for my $list ( \@input, \@mktables_list_output_files ) {
19770            while (<$file_handle>) {
19771                s/^ \s+ | \s+ $//xg;
19772                if (/^ \s* \# \s* Autogenerated\ starting\ on\ (\d+)/x) {
19773                    $old_start_time = $1;
19774                    next;
19775                }
19776                if (/^ \s* \# \s* From\ options\ (.+) /x) {
19777                    $old_options = $1;
19778                    next;
19779                }
19780                next if /^ \s* (?: \# .* )? $/x;
19781                last if /^ =+ $/x;
19782                my ( $file ) = split /\t/;
19783                push @$list, $file;
19784            }
19785            @$list = uniques(@$list);
19786            next;
19787        }
19788
19789        # Look through all the input files
19790        foreach my $input (@input) {
19791            next if $input eq 'version'; # Already have checked this.
19792
19793            # Ignore if doesn't exist.  The checking about whether we care or
19794            # not is done via the Input_file object.
19795            next if ! file_exists($input);
19796
19797            # The paths are stored with relative names, and with '/' as the
19798            # delimiter; convert to absolute on this machine
19799            my $full = lc(File::Spec->rel2abs(internal_file_to_platform($input)));
19800            $potential_files{lc $full} = 1;
19801        }
19802    }
19803
19804    close $file_handle;
19805}
19806
19807if ($glob_list) {
19808
19809    # Here wants to process all .txt files in the directory structure.
19810    # Convert them to full path names.  They are stored in the platform's
19811    # relative style
19812    my @known_files;
19813    foreach my $object (@input_file_objects) {
19814        my $file = $object->file;
19815        next unless defined $file;
19816        push @known_files, File::Spec->rel2abs($file);
19817    }
19818
19819    my @unknown_input_files;
19820    foreach my $file (keys %potential_files) {  # The keys are stored in lc
19821        next if grep { $file eq lc($_) } @known_files;
19822
19823        # Here, the file is unknown to us.  Get relative path name
19824        $file = File::Spec->abs2rel($file);
19825        push @unknown_input_files, $file;
19826
19827        # What will happen is we create a data structure for it, and add it to
19828        # the list of input files to process.  First get the subdirectories
19829        # into an array
19830        my (undef, $directories, undef) = File::Spec->splitpath($file);
19831        $directories =~ s;/$;;;     # Can have extraneous trailing '/'
19832        my @directories = File::Spec->splitdir($directories);
19833
19834        # If the file isn't extracted (meaning none of the directories is the
19835        # extracted one), just add it to the end of the list of inputs.
19836        if (! grep { $EXTRACTED_DIR eq $_ } @directories) {
19837            push @input_file_objects, Input_file->new($file, v0);
19838        }
19839        else {
19840
19841            # Here, the file is extracted.  It needs to go ahead of most other
19842            # processing.  Search for the first input file that isn't a
19843            # special required property (that is, find one whose first_release
19844            # is non-0), and isn't extracted.  Also, the Age property file is
19845            # processed before the extracted ones, just in case
19846            # $compare_versions is set.
19847            for (my $i = 0; $i < @input_file_objects; $i++) {
19848                if ($input_file_objects[$i]->first_released ne v0
19849                    && lc($input_file_objects[$i]->file) ne 'dage.txt'
19850                    && $input_file_objects[$i]->file !~ /$EXTRACTED_DIR/i)
19851                {
19852                    splice @input_file_objects, $i, 0,
19853                                                Input_file->new($file, v0);
19854                    last;
19855                }
19856            }
19857
19858        }
19859    }
19860    if (@unknown_input_files) {
19861        print STDERR simple_fold(join_lines(<<END
19862
19863The following files are unknown as to how to handle.  Assuming they are
19864typical property files.  You'll know by later error messages if it worked or
19865not:
19866END
19867        ) . " " . join(", ", @unknown_input_files) . "\n\n");
19868    }
19869} # End of looking through directory structure for more .txt files.
19870
19871# Create the list of input files from the objects we have defined, plus
19872# version
19873my @input_files = qw(version Makefile);
19874foreach my $object (@input_file_objects) {
19875    my $file = $object->file;
19876    next if ! defined $file;    # Not all objects have files
19877    next if defined $object->skip;;
19878    push @input_files,  $file;
19879}
19880
19881if ( $verbosity >= $VERBOSE ) {
19882    print "Expecting ".scalar( @input_files )." input files. ",
19883         "Checking ".scalar( @mktables_list_output_files )." output files.\n";
19884}
19885
19886# We set $most_recent to be the most recently changed input file, including
19887# this program itself (done much earlier in this file)
19888foreach my $in (@input_files) {
19889    next unless -e $in;        # Keep going even if missing a file
19890    my $mod_time = (stat $in)[9];
19891    $most_recent = $mod_time if $mod_time > $most_recent;
19892
19893    # See that the input files have distinct names, to warn someone if they
19894    # are adding a new one
19895    if ($make_list) {
19896        my ($volume, $directories, $file ) = File::Spec->splitpath($in);
19897        $directories =~ s;/$;;;     # Can have extraneous trailing '/'
19898        my @directories = File::Spec->splitdir($directories);
19899        construct_filename($file, 'mutable', \@directories);
19900    }
19901}
19902
19903# We use 'Makefile' just to see if it has changed since the last time we
19904# rebuilt.  Now discard it.
19905@input_files = grep { $_ ne 'Makefile' } @input_files;
19906
19907my $rebuild = $write_unchanged_files    # Rebuild: if unconditional rebuild
19908              || ! scalar @mktables_list_output_files  # or if no outputs known
19909              || $old_start_time < $most_recent        # or out-of-date
19910              || $old_options ne $command_line_arguments; # or with different
19911                                                          # options
19912
19913# Now we check to see if any output files are older than youngest, if
19914# they are, we need to continue on, otherwise we can presumably bail.
19915if (! $rebuild) {
19916    foreach my $out (@mktables_list_output_files) {
19917        if ( ! file_exists($out)) {
19918            print "'$out' is missing.\n" if $verbosity >= $VERBOSE;
19919            $rebuild = 1;
19920            last;
19921         }
19922        #local $to_trace = 1 if main::DEBUG;
19923        trace $most_recent, (stat $out)[9] if main::DEBUG && $to_trace;
19924        if ( (stat $out)[9] <= $most_recent ) {
19925            #trace "$out:  most recent mod time: ", (stat $out)[9], ", youngest: $most_recent\n" if main::DEBUG && $to_trace;
19926            print "'$out' is too old.\n" if $verbosity >= $VERBOSE;
19927            $rebuild = 1;
19928            last;
19929        }
19930    }
19931}
19932if (! $rebuild) {
19933    print "$0: Files seem to be ok, not bothering to rebuild.  Add '-w' option to force build\n";
19934    exit(0);
19935}
19936print "$0: Must rebuild tables.\n" if $verbosity >= $VERBOSE;
19937
19938# Ready to do the major processing.  First create the perl pseudo-property.
19939$perl = Property->new('perl', Type => $NON_STRING, Perl_Extension => 1);
19940
19941# Process each input file
19942foreach my $file (@input_file_objects) {
19943    $file->run;
19944}
19945
19946# Finish the table generation.
19947
19948print "Finishing processing Unicode properties\n" if $verbosity >= $PROGRESS;
19949finish_Unicode();
19950
19951# For the very specialized case of comparing two Unicode versions...
19952if (DEBUG && $compare_versions) {
19953    handle_compare_versions();
19954}
19955
19956print "Compiling Perl properties\n" if $verbosity >= $PROGRESS;
19957compile_perl();
19958
19959print "Creating Perl synonyms\n" if $verbosity >= $PROGRESS;
19960add_perl_synonyms();
19961
19962print "Writing tables\n" if $verbosity >= $PROGRESS;
19963write_all_tables();
19964
19965# Write mktables.lst
19966if ( $file_list and $make_list ) {
19967
19968    print "Updating '$file_list'\n" if $verbosity >= $PROGRESS;
19969    foreach my $file (@input_files, @files_actually_output) {
19970        my (undef, $directories, $basefile) = File::Spec->splitpath($file);
19971        my @directories = grep length, File::Spec->splitdir($directories);
19972        $file = join '/', @directories, $basefile;
19973    }
19974
19975    my $ofh;
19976    if (! open $ofh,">",$file_list) {
19977        Carp::my_carp("Can't write to '$file_list'.  Skipping: $!");
19978        return
19979    }
19980    else {
19981        my $localtime = localtime $start_time;
19982        print $ofh <<"END";
19983#
19984# $file_list -- File list for $0.
19985#
19986#   Autogenerated starting on $start_time ($localtime)
19987#   From options $command_line_arguments
19988#
19989# - First section is input files
19990#   ($0 itself is not listed but is automatically considered an input)
19991# - Section separator is /^=+\$/
19992# - Second section is a list of output files.
19993# - Lines matching /^\\s*#/ are treated as comments
19994#   which along with blank lines are ignored.
19995#
19996
19997# Input files:
19998
19999END
20000        print $ofh "$_\n" for sort(@input_files);
20001        print $ofh "\n=================================\n# Output files:\n\n";
20002        print $ofh "$_\n" for sort @files_actually_output;
20003        print $ofh "\n# ",scalar(@input_files)," input files\n",
20004                "# ",scalar(@files_actually_output)+1," output files\n\n",
20005                "# End list\n";
20006        close $ofh
20007            or Carp::my_carp("Failed to close $ofh: $!");
20008
20009        print "Filelist has ",scalar(@input_files)," input files and ",
20010            scalar(@files_actually_output)+1," output files\n"
20011            if $verbosity >= $VERBOSE;
20012    }
20013}
20014
20015# Output these warnings unless -q explicitly specified.
20016if ($verbosity >= $NORMAL_VERBOSITY && ! $debug_skip) {
20017    if (@unhandled_properties) {
20018        print "\nProperties and tables that unexpectedly have no code points\n";
20019        foreach my $property (sort @unhandled_properties) {
20020            print $property, "\n";
20021        }
20022    }
20023
20024    if (%potential_files) {
20025        print "\nInput files that are not considered:\n";
20026        foreach my $file (sort keys %potential_files) {
20027            print File::Spec->abs2rel($file), "\n";
20028        }
20029    }
20030    print "\nAll done\n" if $verbosity >= $VERBOSE;
20031}
20032
20033if ($version_of_mk_invlist_bounds lt $v_version) {
20034    Carp::my_carp("WARNING: \\b{} algorithms (regen/mk_invlist.pl) need"
20035                . " to be checked and possibly updated to Unicode"
20036                . " $string_version.  Failing tests will be marked TODO");
20037}
20038
20039exit(0);
20040
20041# TRAILING CODE IS USED BY make_property_test_script()
20042__DATA__
20043
20044use strict;
20045use warnings;
20046
20047use feature 'signatures';
20048
20049no warnings 'experimental::signatures';
20050no warnings 'experimental::uniprop_wildcards';
20051
20052# Test qr/\X/ and the \p{} regular expression constructs.  This file is
20053# constructed by mktables from the tables it generates, so if mktables is
20054# buggy, this won't necessarily catch those bugs.  Tests are generated for all
20055# feasible properties; a few aren't currently feasible; see
20056# is_code_point_usable() in mktables for details.
20057
20058# Standard test packages are not used because this manipulates SIG_WARN.  It
20059# exits 0 if every non-skipped test succeeded; -1 if any failed.
20060
20061my $Tests = 0;
20062my $Fails = 0;
20063
20064# loc_tools.pl requires this function to be defined
20065sub ok($pass, @msg) {
20066    print "not " unless $pass;
20067    print "ok ";
20068    print ++$Tests;
20069    print " - ", join "", @msg if @msg;
20070    print "\n";
20071}
20072
20073sub Expect($expected, $ord, $regex, $warning_type='') {
20074    my $line   = (caller)[2];
20075
20076    # Convert the code point to hex form
20077    my $string = sprintf "\"\\x{%04X}\"", $ord;
20078
20079    my @tests = "";
20080
20081    # The first time through, use all warnings.  If the input should generate
20082    # a warning, add another time through with them turned off
20083    push @tests, "no warnings '$warning_type';" if $warning_type;
20084
20085    foreach my $no_warnings (@tests) {
20086
20087        # Store any warning messages instead of outputting them
20088        local $SIG{__WARN__} = $SIG{__WARN__};
20089        my $warning_message;
20090        $SIG{__WARN__} = sub { $warning_message = $_[0] };
20091
20092        $Tests++;
20093
20094        # A string eval is needed because of the 'no warnings'.
20095        # Assumes no parentheses in the regular expression
20096        my $result = eval "$no_warnings
20097                            my \$RegObj = qr($regex);
20098                            $string =~ \$RegObj ? 1 : 0";
20099        if (not defined $result) {
20100            print "not ok $Tests - couldn't compile /$regex/; line $line: $@\n";
20101            $Fails++;
20102        }
20103        elsif ($result ^ $expected) {
20104            print "not ok $Tests - expected $expected but got $result for $string =~ qr/$regex/; line $line\n";
20105            $Fails++;
20106        }
20107        elsif ($warning_message) {
20108            if (! $warning_type || ($warning_type && $no_warnings)) {
20109                print "not ok $Tests - for qr/$regex/ did not expect warning message '$warning_message'; line $line\n";
20110                $Fails++;
20111            }
20112            else {
20113                print "ok $Tests - expected and got a warning message for qr/$regex/; line $line\n";
20114            }
20115        }
20116        elsif ($warning_type && ! $no_warnings) {
20117            print "not ok $Tests - for qr/$regex/ expected a $warning_type warning message, but got none; line $line\n";
20118            $Fails++;
20119        }
20120        else {
20121            print "ok $Tests - got $result for $string =~ qr/$regex/; line $line\n";
20122        }
20123    }
20124    return;
20125}
20126
20127sub Error($regex) {
20128    $Tests++;
20129    if (eval { 'x' =~ qr/$regex/; 1 }) {
20130        $Fails++;
20131        my $line = (caller)[2];
20132        print "not ok $Tests - re compiled ok, but expected error for qr/$regex/; line $line: $@\n";
20133    }
20134    else {
20135        my $line = (caller)[2];
20136        print "ok $Tests - got and expected error for qr/$regex/; line $line\n";
20137    }
20138    return;
20139}
20140
20141# Break test files (e.g. GCBTest.txt) character that break allowed here
20142my $breakable_utf8 = my $breakable = chr(utf8::unicode_to_native(0xF7));
20143utf8::upgrade($breakable_utf8);
20144
20145# Break test files (e.g. GCBTest.txt) character that indicates can't break
20146# here
20147my $nobreak_utf8 = my $nobreak = chr(utf8::unicode_to_native(0xD7));
20148utf8::upgrade($nobreak_utf8);
20149
20150my $are_ctype_locales_available;
20151my $utf8_locale;
20152chdir 't' if -d 't';
20153eval { require "./loc_tools.pl" };
20154if (defined &locales_enabled) {
20155    $are_ctype_locales_available = locales_enabled('LC_CTYPE');
20156    if ($are_ctype_locales_available) {
20157        $utf8_locale = &find_utf8_ctype_locale;
20158    }
20159}
20160
20161# Eval'd so can run on versions earlier than the property is available in
20162my $WB_Extend_or_Format_re = eval 'qr/[\p{WB=Extend}\p{WB=Format}\p{WB=ZWJ}]/';
20163if (! defined $WB_Extend_or_Format_re) {
20164    $WB_Extend_or_Format_re = eval 'qr/[\p{WB=Extend}\p{WB=Format}]/';
20165}
20166
20167sub _test_break($template, $break_type) {
20168    # Test various break property matches.  The 2nd parameter gives the
20169    # property name.  The input is a line from auxiliary/*Test.txt for the
20170    # given property.  Each such line is a sequence of Unicode (not native)
20171    # code points given by their hex numbers, separated by the two characters
20172    # defined just before this subroutine that indicate that either there can
20173    # or cannot be a break between the adjacent code points.  All these are
20174    # tested.
20175    #
20176    # For the gcb property extra tests are made.  if there isn't a break, that
20177    # means the sequence forms an extended grapheme cluster, which means that
20178    # \X should match the whole thing.  If there is a break, \X should stop
20179    # there.  This is all converted by this routine into a match: $string =~
20180    # /(\X)/, Each \X should match the next cluster; and that is what is
20181    # checked.
20182
20183    my $line   = (caller 1)[2];   # Line number
20184    my $comment = "";
20185
20186    if ($template =~ / ( .*? ) \s* \# (.*) /x) {
20187        $template = $1;
20188        $comment = $2;
20189
20190        # Replace leading spaces with a single one.
20191        $comment =~ s/ ^ \s* / # /x;
20192    }
20193
20194    # The line contains characters above the ASCII range, but in Latin1.  It
20195    # may or may not be in utf8, and if it is, it may or may not know it.  So,
20196    # convert these characters to 8 bits.  If knows is in utf8, simply
20197    # downgrade.
20198    if (utf8::is_utf8($template)) {
20199        utf8::downgrade($template);
20200    } else {
20201
20202        # Otherwise, if it is in utf8, but doesn't know it, the next lines
20203        # convert the two problematic characters to their 8-bit equivalents.
20204        # If it isn't in utf8, they don't harm anything.
20205        use bytes;
20206        $template =~ s/$nobreak_utf8/$nobreak/g;
20207        $template =~ s/$breakable_utf8/$breakable/g;
20208    }
20209
20210    # Perl customizes wb.  So change the official tests accordingly
20211    if ($break_type eq 'wb' && $WB_Extend_or_Format_re) {
20212
20213        # Split into elements that alternate between code point and
20214        # break/no-break
20215        my @line = split / +/, $template;
20216
20217        # Look at each code point and its following one
20218        for (my $i = 1; $i <  @line - 1 - 1; $i+=2) {
20219
20220            # The customization only involves changing some breaks to
20221            # non-breaks.
20222            next if $line[$i+1] =~ /$nobreak/;
20223
20224            my $lhs = chr utf8::unicode_to_native(hex $line[$i]);
20225            my $rhs = chr utf8::unicode_to_native(hex $line[$i+2]);
20226
20227            # And it only affects adjacent space characters.
20228            next if $lhs !~ /\s/u;
20229
20230            # But, we want to make sure to test spaces followed by a Extend
20231            # or Format.
20232            next if $rhs !~ /\s|$WB_Extend_or_Format_re/;
20233
20234            # To test the customization, add some white-space before this to
20235            # create a span.  The $lhs white space may or may not be bound to
20236            # that span, and also with the $rhs.  If the $rhs is a binding
20237            # character, the $lhs is bound to it and not to the span, unless
20238            # $lhs is vertical space.  In all other cases, the $lhs is bound
20239            # to the span.  If the $rhs is white space, it is bound to the
20240            # $lhs
20241            my $bound;
20242            my $span;
20243            if ($rhs =~ /$WB_Extend_or_Format_re/) {
20244                if ($lhs =~ /\v/) {
20245                    $bound = $breakable;
20246                    $span = $nobreak;
20247                }
20248                else {
20249                    $bound = $nobreak;
20250                    $span = $breakable;
20251                }
20252            }
20253            else {
20254                $span = $nobreak;
20255                $bound = $nobreak;
20256            }
20257
20258            splice @line, $i, 0, ( '0020', $nobreak, '0020', $span);
20259            $i += 4;
20260            $line[$i+1] = $bound;
20261        }
20262        $template = join " ", @line;
20263    }
20264
20265    # The input is just the break/no-break symbols and sequences of Unicode
20266    # code points as hex digits separated by spaces for legibility. e.g.:
20267    # ÷ 0020 × 0308 ÷ 0020 ÷
20268    # Convert to native \x format
20269    $template =~ s/ \s* ( [[:xdigit:]]+ ) \s* /sprintf("\\x{%02X}", utf8::unicode_to_native(hex $1))/gex;
20270    $template =~ s/ \s* //gx;   # Probably the line above removed all spaces;
20271                                # but be sure
20272
20273    # Make a copy of the input with the symbols replaced by \b{} and \B{} as
20274    # appropriate
20275    my $break_pattern = $template =~ s/ $breakable /\\b{$break_type}/grx;
20276    $break_pattern =~ s/ $nobreak /\\B{$break_type}/gx;
20277
20278    my $display_string = $template =~ s/[$breakable$nobreak]//gr;
20279    my $string = eval "\"$display_string\"";
20280
20281    # The remaining massaging of the input is for the \X tests.  Get rid of
20282    # the leading and trailing breakables
20283    $template =~ s/^ \s* $breakable \s* //x;
20284    $template =~ s/ \s* $breakable \s* $ //x;
20285
20286    # Delete no-breaks
20287    $template =~ s/ \s* $nobreak \s* //xg;
20288
20289    # Split the input into segments that are breakable between them.
20290    my @should_display = split /\s*$breakable\s*/, $template;
20291    my @should_match = map { eval "\"$_\"" } @should_display;
20292
20293    # If a string can be represented in both non-ut8 and utf8, test both cases
20294    my $display_upgrade = "";
20295    UPGRADE:
20296    for my $to_upgrade (0 .. 1) {
20297
20298        if ($to_upgrade) {
20299
20300            # If already in utf8, would just be a repeat
20301            next UPGRADE if utf8::is_utf8($string);
20302
20303            utf8::upgrade($string);
20304            $display_upgrade = " (utf8-upgraded)";
20305        }
20306
20307        my @modifiers = qw(a aa d u i);
20308        if ($are_ctype_locales_available) {
20309            push @modifiers, "l$utf8_locale" if defined $utf8_locale;
20310
20311            # The /l modifier has C after it to indicate the locale to try
20312            push @modifiers, "lC";
20313        }
20314
20315        # Test for each of the regex modifiers.
20316        for my $modifier (@modifiers) {
20317            my $display_locale = "";
20318
20319            # For /l, set the locale to what it says to.
20320            if ($modifier =~ / ^ l (.*) /x) {
20321                my $locale = $1;
20322                $display_locale = "(locale = $locale)";
20323                POSIX::setlocale(&POSIX::LC_CTYPE, $locale);
20324                $modifier = 'l';
20325            }
20326
20327            no warnings qw(locale regexp surrogate);
20328            my $pattern = "(?$modifier:$break_pattern)";
20329
20330            # Actually do the test
20331            my $matched_text;
20332            my $matched = $string =~ qr/$pattern/;
20333            if ($matched) {
20334                $matched_text = "matched";
20335            }
20336            else {
20337                $matched_text = "failed to match";
20338                print "not ";
20339
20340                if (TODO_FAILING_BREAKS) {
20341                    $comment = " # $comment" unless $comment =~ / ^ \s* \# /x;
20342                    $comment =~ s/#/# TODO/;
20343                }
20344            }
20345            print "ok ", ++$Tests, " - \"$display_string\" $matched_text /$pattern/$display_upgrade; line $line $display_locale$comment\n";
20346
20347            # Only print the comment on the first use of this line
20348            $comment = "";
20349
20350            # Repeat with the first \B{} in the pattern.  This makes sure the
20351            # code in regexec.c:find_byclass() for \B gets executed
20352            if ($pattern =~ / ( .*? : ) .* ( \\B\{ .* ) /x) {
20353                my $B_pattern = "$1$2";
20354                $matched = $string =~ qr/$B_pattern/;
20355                print "not " unless $matched;
20356                $matched_text = ($matched) ? "matched" : "failed to match";
20357                print "ok ", ++$Tests, " - \"$display_string\" $matched_text /$B_pattern/$display_upgrade; line $line $display_locale";
20358                print " # TODO" if TODO_FAILING_BREAKS && ! $matched;
20359                print "\n";
20360            }
20361        }
20362
20363        next if $break_type ne 'gcb';
20364
20365        # Finally, do the \X match.
20366        my @matches = $string =~ /(\X)/g;
20367
20368        # Look through each matched cluster to verify that it matches what we
20369        # expect.
20370        my $min = (@matches < @should_match) ? @matches : @should_match;
20371        for my $i (0 .. $min - 1) {
20372            $Tests++;
20373            if ($matches[$i] eq $should_match[$i]) {
20374                print "ok $Tests - ";
20375                if ($i == 0) {
20376                    print "In \"$display_string\" =~ /(\\X)/g, \\X #1";
20377                } else {
20378                    print "And \\X #", $i + 1,
20379                }
20380                print " correctly matched $should_display[$i]; line $line\n";
20381            } else {
20382                $matches[$i] = join("", map { sprintf "\\x{%04X}", ord $_ }
20383                                                    split "", $matches[$i]);
20384                print "not ok $Tests -";
20385                print " # TODO" if TODO_FAILING_BREAKS;
20386                print " In \"$display_string\" =~ /(\\X)/g, \\X #",
20387                    $i + 1,
20388                    " should have matched $should_display[$i]",
20389                    " but instead matched $matches[$i]",
20390                    ".  Abandoning rest of line $line\n";
20391                next UPGRADE;
20392            }
20393        }
20394
20395        # And the number of matches should equal the number of expected matches.
20396        $Tests++;
20397        if (@matches == @should_match) {
20398            print "ok $Tests - Nothing was left over; line $line\n";
20399        } else {
20400            print "not ok $Tests - There were ", scalar @should_match, " \\X matches expected, but got ", scalar @matches, " instead; line $line";
20401            print " # TODO" if TODO_FAILING_BREAKS;
20402            print "\n";
20403        }
20404    }
20405
20406    return;
20407}
20408
20409sub Test_GCB($t) {
20410    _test_break($t, 'gcb');
20411}
20412
20413sub Test_LB($t) {
20414    _test_break($t, 'lb');
20415}
20416
20417sub Test_SB($t) {
20418    _test_break($t, 'sb');
20419}
20420
20421sub Test_WB($t) {
20422    _test_break($t, 'wb');
20423}
20424
20425sub Finished() {
20426    print "1..$Tests\n";
20427    exit($Fails ? -1 : 0);
20428}
20429
20430