1#!/usr/bin/perl -w 2 3# !!!!!!!!!!!!!! IF YOU MODIFY THIS FILE !!!!!!!!!!!!!!!!!!!!!!!!! 4# Any files created or read by this program should be listed in 'mktables.lst' 5# Use -makelist to regenerate it. 6 7# There was an attempt when this was first rewritten to make it 5.8 8# compatible, but that has now been abandoned, and newer constructs are used 9# as convenient. 10 11# NOTE: this script can run quite slowly in older/slower systems. 12# It can also consume a lot of memory (128 MB or more), you may need 13# to raise your process resource limits (e.g. in bash, "ulimit -a" 14# to inspect, and "ulimit -d ..." or "ulimit -m ..." to set) 15 16my $start_time; 17BEGIN { # Get the time the script started running; do it at compilation to 18 # get it as close as possible 19 $start_time= time; 20} 21 22require 5.010_001; 23use strict; 24use warnings; 25use Carp; 26use Config; 27use File::Find; 28use File::Path; 29use File::Spec; 30use Text::Tabs; 31use re "/aa"; 32 33use feature 'state'; 34use feature 'signatures'; 35no warnings 'experimental::signatures'; 36 37sub DEBUG () { 0 } # Set to 0 for production; 1 for development 38my $debugging_build = $Config{"ccflags"} =~ /-DDEBUGGING/; 39 40sub NON_ASCII_PLATFORM { ord("A") != 65 } 41 42# When a new version of Unicode is published, unfortunately the algorithms for 43# dealing with various bounds, like \b{gcb}, \b{lb} may have to be updated 44# manually. The changes may or may not be backward compatible with older 45# releases. The code is in regen/mk_invlist.pl and regexec.c. Make the 46# changes, then come back here and set the variable below to what version the 47# code is expecting. If a newer version of Unicode is being compiled than 48# expected, a warning will be generated. If an older version is being 49# compiled, any bounds tests that fail in the generated test file (-maketest 50# option) will be marked as TODO. 51my $version_of_mk_invlist_bounds = v13.0.0; 52 53########################################################################## 54# 55# mktables -- create the runtime Perl Unicode files (lib/unicore/.../*.pl), 56# from the Unicode database files (lib/unicore/.../*.txt), It also generates 57# a pod file and .t files, depending on option parameters. 58# 59# The structure of this file is: 60# First these introductory comments; then 61# code needed for everywhere, such as debugging stuff; then 62# code to handle input parameters; then 63# data structures likely to be of external interest (some of which depend on 64# the input parameters, so follows them; then 65# more data structures and subroutine and package (class) definitions; then 66# the small actual loop to process the input files and finish up; then 67# a __DATA__ section, for the .t tests 68# 69# This program works on all releases of Unicode so far. The outputs have been 70# scrutinized most intently for release 5.1. The others have been checked for 71# somewhat more than just sanity. It can handle all non-provisional Unicode 72# character properties in those releases. 73# 74# This program is mostly about Unicode character (or code point) properties. 75# A property describes some attribute or quality of a code point, like if it 76# is lowercase or not, its name, what version of Unicode it was first defined 77# in, or what its uppercase equivalent is. Unicode deals with these disparate 78# possibilities by making all properties into mappings from each code point 79# into some corresponding value. In the case of it being lowercase or not, 80# the mapping is either to 'Y' or 'N' (or various synonyms thereof). Each 81# property maps each Unicode code point to a single value, called a "property 82# value". (Some more recently defined properties, map a code point to a set 83# of values.) 84# 85# When using a property in a regular expression, what is desired isn't the 86# mapping of the code point to its property's value, but the reverse (or the 87# mathematical "inverse relation"): starting with the property value, "Does a 88# code point map to it?" These are written in a "compound" form: 89# \p{property=value}, e.g., \p{category=punctuation}. This program generates 90# files containing the lists of code points that map to each such regular 91# expression property value, one file per list 92# 93# There is also a single form shortcut that Perl adds for many of the commonly 94# used properties. This happens for all binary properties, plus script, 95# general_category, and block properties. 96# 97# Thus the outputs of this program are files. There are map files, mostly in 98# the 'To' directory; and there are list files for use in regular expression 99# matching, all in subdirectories of the 'lib' directory, with each 100# subdirectory being named for the property that the lists in it are for. 101# Bookkeeping, test, and documentation files are also generated. 102 103my $matches_directory = 'lib'; # Where match (\p{}) files go. 104my $map_directory = 'To'; # Where map files go. 105 106# DATA STRUCTURES 107# 108# The major data structures of this program are Property, of course, but also 109# Table. There are two kinds of tables, very similar to each other. 110# "Match_Table" is the data structure giving the list of code points that have 111# a particular property value, mentioned above. There is also a "Map_Table" 112# data structure which gives the property's mapping from code point to value. 113# There are two structures because the match tables need to be combined in 114# various ways, such as constructing unions, intersections, complements, etc., 115# and the map ones don't. And there would be problems, perhaps subtle, if 116# a map table were inadvertently operated on in some of those ways. 117# The use of separate classes with operations defined on one but not the other 118# prevents accidentally confusing the two. 119# 120# At the heart of each table's data structure is a "Range_List", which is just 121# an ordered list of "Ranges", plus ancillary information, and methods to 122# operate on them. A Range is a compact way to store property information. 123# Each range has a starting code point, an ending code point, and a value that 124# is meant to apply to all the code points between the two end points, 125# inclusive. For a map table, this value is the property value for those 126# code points. Two such ranges could be written like this: 127# 0x41 .. 0x5A, 'Upper', 128# 0x61 .. 0x7A, 'Lower' 129# 130# Each range also has a type used as a convenience to classify the values. 131# Most ranges in this program will be Type 0, or normal, but there are some 132# ranges that have a non-zero type. These are used only in map tables, and 133# are for mappings that don't fit into the normal scheme of things. Mappings 134# that require a hash entry to communicate with utf8.c are one example; 135# another example is mappings for charnames.pm to use which indicate a name 136# that is algorithmically determinable from its code point (and the reverse). 137# These are used to significantly compact these tables, instead of listing 138# each one of the tens of thousands individually. 139# 140# In a match table, the value of a range is irrelevant (and hence the type as 141# well, which will always be 0), and arbitrarily set to the empty string. 142# Using the example above, there would be two match tables for those two 143# entries, one named Upper would contain the 0x41..0x5A range, and the other 144# named Lower would contain 0x61..0x7A. 145# 146# Actually, there are two types of range lists, "Range_Map" is the one 147# associated with map tables, and "Range_List" with match tables. 148# Again, this is so that methods can be defined on one and not the others so 149# as to prevent operating on them in incorrect ways. 150# 151# Eventually, most tables are written out to files to be read by Unicode::UCD. 152# All tables could in theory be written, but some are suppressed because there 153# is no current practical use for them. It is easy to change which get 154# written by changing various lists that are near the top of the actual code 155# in this file. The table data structures contain enough ancillary 156# information to allow them to be treated as separate entities for writing, 157# such as the path to each one's file. There is a heading in each map table 158# that gives the format of its entries, and what the map is for all the code 159# points missing from it. (This allows tables to be more compact.) 160# 161# The Property data structure contains one or more tables. All properties 162# contain a map table (except the $perl property which is a 163# pseudo-property containing only match tables), and any properties that 164# are usable in regular expression matches also contain various matching 165# tables, one for each value the property can have. A binary property can 166# have two values, True and False (or Y and N, which are preferred by Unicode 167# terminology). Thus each of these properties will have a map table that 168# takes every code point and maps it to Y or N (but having ranges cuts the 169# number of entries in that table way down), and two match tables, one 170# which has a list of all the code points that map to Y, and one for all the 171# code points that map to N. (For each binary property, a third table is also 172# generated for the pseudo Perl property. It contains the identical code 173# points as the Y table, but can be written in regular expressions, not in the 174# compound form, but in a "single" form like \p{IsUppercase}.) Many 175# properties are binary, but some properties have several possible values, 176# some have many, and properties like Name have a different value for every 177# named code point. Those will not, unless the controlling lists are changed, 178# have their match tables written out. But all the ones which can be used in 179# regular expression \p{} and \P{} constructs will. Prior to 5.14, generally 180# a property would have either its map table or its match tables written but 181# not both. Again, what gets written is controlled by lists which can easily 182# be changed. Starting in 5.14, advantage was taken of this, and all the map 183# tables needed to reconstruct the Unicode db are now written out, while 184# suppressing the Unicode .txt files that contain the data. Our tables are 185# much more compact than the .txt files, so a significant space savings was 186# achieved. Also, tables are not written out that are trivially derivable 187# from tables that do get written. So, there typically is no file containing 188# the code points not matched by a binary property (the table for \P{} versus 189# lowercase \p{}), since you just need to invert the True table to get the 190# False table. 191 192# Properties have a 'Type', like 'binary', or 'string', or 'enum' depending on 193# how many match tables there are and the content of the maps. This 'Type' is 194# different than a range 'Type', so don't get confused by the two concepts 195# having the same name. 196# 197# For information about the Unicode properties, see Unicode's UAX44 document: 198 199my $unicode_reference_url = 'http://www.unicode.org/reports/tr44/'; 200 201# As stated earlier, this program will work on any release of Unicode so far. 202# Most obvious problems in earlier data have NOT been corrected except when 203# necessary to make Perl or this program work reasonably, and to keep out 204# potential security issues. For example, no folding information was given in 205# early releases, so this program substitutes lower case instead, just so that 206# a regular expression with the /i option will do something that actually 207# gives the right results in many cases. There are also a couple other 208# corrections for version 1.1.5, commented at the point they are made. As an 209# example of corrections that weren't made (but could be) is this statement 210# from DerivedAge.txt: "The supplementary private use code points and the 211# non-character code points were assigned in version 2.0, but not specifically 212# listed in the UCD until versions 3.0 and 3.1 respectively." (To be precise 213# it was 3.0.1 not 3.0.0) More information on Unicode version glitches is 214# further down in these introductory comments. 215# 216# This program works on all non-provisional properties as of the current 217# Unicode release, though the files for some are suppressed for various 218# reasons. You can change which are output by changing lists in this program. 219# 220# The old version of mktables emphasized the term "Fuzzy" to mean Unicode's 221# loose matchings rules (from Unicode TR18): 222# 223# The recommended names for UCD properties and property values are in 224# PropertyAliases.txt [Prop] and PropertyValueAliases.txt 225# [PropValue]. There are both abbreviated names and longer, more 226# descriptive names. It is strongly recommended that both names be 227# recognized, and that loose matching of property names be used, 228# whereby the case distinctions, whitespace, hyphens, and underbar 229# are ignored. 230# 231# The program still allows Fuzzy to override its determination of if loose 232# matching should be used, but it isn't currently used, as it is no longer 233# needed; the calculations it makes are good enough. 234# 235# SUMMARY OF HOW IT WORKS: 236# 237# Process arguments 238# 239# A list is constructed containing each input file that is to be processed 240# 241# Each file on the list is processed in a loop, using the associated handler 242# code for each: 243# The PropertyAliases.txt and PropValueAliases.txt files are processed 244# first. These files name the properties and property values. 245# Objects are created of all the property and property value names 246# that the rest of the input should expect, including all synonyms. 247# The other input files give mappings from properties to property 248# values. That is, they list code points and say what the mapping 249# is under the given property. Some files give the mappings for 250# just one property; and some for many. This program goes through 251# each file and populates the properties and their map tables from 252# them. Some properties are listed in more than one file, and 253# Unicode has set up a precedence as to which has priority if there 254# is a conflict. Thus the order of processing matters, and this 255# program handles the conflict possibility by processing the 256# overriding input files last, so that if necessary they replace 257# earlier values. 258# After this is all done, the program creates the property mappings not 259# furnished by Unicode, but derivable from what it does give. 260# The tables of code points that match each property value in each 261# property that is accessible by regular expressions are created. 262# The Perl-defined properties are created and populated. Many of these 263# require data determined from the earlier steps 264# Any Perl-defined synonyms are created, and name clashes between Perl 265# and Unicode are reconciled and warned about. 266# All the properties are written to files 267# Any other files are written, and final warnings issued. 268# 269# For clarity, a number of operators have been overloaded to work on tables: 270# ~ means invert (take all characters not in the set). The more 271# conventional '!' is not used because of the possibility of confusing 272# it with the actual boolean operation. 273# + means union 274# - means subtraction 275# & means intersection 276# The precedence of these is the order listed. Parentheses should be 277# copiously used. These are not a general scheme. The operations aren't 278# defined for a number of things, deliberately, to avoid getting into trouble. 279# Operations are done on references and affect the underlying structures, so 280# that the copy constructors for them have been overloaded to not return a new 281# clone, but the input object itself. 282# 283# The bool operator is deliberately not overloaded to avoid confusion with 284# "should it mean if the object merely exists, or also is non-empty?". 285# 286# WHY CERTAIN DESIGN DECISIONS WERE MADE 287# 288# This program needs to be able to run under miniperl. Therefore, it uses a 289# minimum of other modules, and hence implements some things itself that could 290# be gotten from CPAN 291# 292# This program uses inputs published by the Unicode Consortium. These can 293# change incompatibly between releases without the Perl maintainers realizing 294# it. Therefore this program is now designed to try to flag these. It looks 295# at the directories where the inputs are, and flags any unrecognized files. 296# It keeps track of all the properties in the files it handles, and flags any 297# that it doesn't know how to handle. It also flags any input lines that 298# don't match the expected syntax, among other checks. 299# 300# It is also designed so if a new input file matches one of the known 301# templates, one hopefully just needs to add it to a list to have it 302# processed. 303# 304# As mentioned earlier, some properties are given in more than one file. In 305# particular, the files in the extracted directory are supposedly just 306# reformattings of the others. But they contain information not easily 307# derivable from the other files, including results for Unihan (which isn't 308# usually available to this program) and for unassigned code points. They 309# also have historically had errors or been incomplete. In an attempt to 310# create the best possible data, this program thus processes them first to 311# glean information missing from the other files; then processes those other 312# files to override any errors in the extracted ones. Much of the design was 313# driven by this need to store things and then possibly override them. 314# 315# It tries to keep fatal errors to a minimum, to generate something usable for 316# testing purposes. It always looks for files that could be inputs, and will 317# warn about any that it doesn't know how to handle (the -q option suppresses 318# the warning). 319# 320# Why is there more than one type of range? 321# This simplified things. There are some very specialized code points that 322# have to be handled specially for output, such as Hangul syllable names. 323# By creating a range type (done late in the development process), it 324# allowed this to be stored with the range, and overridden by other input. 325# Originally these were stored in another data structure, and it became a 326# mess trying to decide if a second file that was for the same property was 327# overriding the earlier one or not. 328# 329# Why are there two kinds of tables, match and map? 330# (And there is a base class shared by the two as well.) As stated above, 331# they actually are for different things. Development proceeded much more 332# smoothly when I (khw) realized the distinction. Map tables are used to 333# give the property value for every code point (actually every code point 334# that doesn't map to a default value). Match tables are used for regular 335# expression matches, and are essentially the inverse mapping. Separating 336# the two allows more specialized methods, and error checks so that one 337# can't just take the intersection of two map tables, for example, as that 338# is nonsensical. 339# 340# What about 'fate' and 'status'. The concept of a table's fate was created 341# late when it became clear that something more was needed. The difference 342# between this and 'status' is unclean, and could be improved if someone 343# wanted to spend the effort. 344# 345# DEBUGGING 346# 347# This program is written so it will run under miniperl. Occasionally changes 348# will cause an error where the backtrace doesn't work well under miniperl. 349# To diagnose the problem, you can instead run it under regular perl, if you 350# have one compiled. 351# 352# There is a good trace facility. To enable it, first sub DEBUG must be set 353# to return true. Then a line like 354# 355# local $to_trace = 1 if main::DEBUG; 356# 357# can be added to enable tracing in its lexical scope (plus dynamic) or until 358# you insert another line: 359# 360# local $to_trace = 0 if main::DEBUG; 361# 362# To actually trace, use a line like "trace $a, @b, %c, ...; 363# 364# Some of the more complex subroutines already have trace statements in them. 365# Permanent trace statements should be like: 366# 367# trace ... if main::DEBUG && $to_trace; 368# 369# main::stack_trace() will display what its name implies 370# 371# If there is just one or a few files that you're debugging, you can easily 372# cause most everything else to be skipped. Change the line 373# 374# my $debug_skip = 0; 375# 376# to 1, and every file whose object is in @input_file_objects and doesn't have 377# a, 'non_skip => 1,' in its constructor will be skipped. However, skipping 378# Jamo.txt or UnicodeData.txt will likely cause fatal errors. 379# 380# To compare the output tables, it may be useful to specify the -annotate 381# flag. (As of this writing, this can't be done on a clean workspace, due to 382# requirements in Text::Tabs used in this option; so first run mktables 383# without this option.) This option adds comment lines to each table, one for 384# each non-algorithmically named character giving, currently its code point, 385# name, and graphic representation if printable (and you have a font that 386# knows about it). This makes it easier to see what the particular code 387# points are in each output table. Non-named code points are annotated with a 388# description of their status, and contiguous ones with the same description 389# will be output as a range rather than individually. Algorithmically named 390# characters are also output as ranges, except when there are just a few 391# contiguous ones. 392# 393# FUTURE ISSUES 394# 395# The program would break if Unicode were to change its names so that 396# interior white space, underscores, or dashes differences were significant 397# within property and property value names. 398# 399# It might be easier to use the xml versions of the UCD if this program ever 400# would need heavy revision, and the ability to handle old versions was not 401# required. 402# 403# There is the potential for name collisions, in that Perl has chosen names 404# that Unicode could decide it also likes. There have been such collisions in 405# the past, with mostly Perl deciding to adopt the Unicode definition of the 406# name. However in the 5.2 Unicode beta testing, there were a number of such 407# collisions, which were withdrawn before the final release, because of Perl's 408# and other's protests. These all involved new properties which began with 409# 'Is'. Based on the protests, Unicode is unlikely to try that again. Also, 410# many of the Perl-defined synonyms, like Any, Word, etc, are listed in a 411# Unicode document, so they are unlikely to be used by Unicode for another 412# purpose. However, they might try something beginning with 'In', or use any 413# of the other Perl-defined properties. This program will warn you of name 414# collisions, and refuse to generate tables with them, but manual intervention 415# will be required in this event. One scheme that could be implemented, if 416# necessary, would be to have this program generate another file, or add a 417# field to mktables.lst that gives the date of first definition of a property. 418# Each new release of Unicode would use that file as a basis for the next 419# iteration. And the Perl synonym addition code could sort based on the age 420# of the property, so older properties get priority, and newer ones that clash 421# would be refused; hence existing code would not be impacted, and some other 422# synonym would have to be used for the new property. This is ugly, and 423# manual intervention would certainly be easier to do in the short run; lets 424# hope it never comes to this. 425# 426# A NOTE ON UNIHAN 427# 428# This program can generate tables from the Unihan database. But that DB 429# isn't normally available, so it is marked as optional. Prior to version 430# 5.2, this database was in a single file, Unihan.txt. In 5.2 the database 431# was split into 8 different files, all beginning with the letters 'Unihan'. 432# If you plunk those files down into the directory mktables ($0) is in, this 433# program will read them and automatically create tables for the properties 434# from it that are listed in PropertyAliases.txt and PropValueAliases.txt, 435# plus any you add to the @cjk_properties array and the @cjk_property_values 436# array, being sure to add necessary '# @missings' lines to the latter. For 437# Unicode versions earlier than 5.2, most of the Unihan properties are not 438# listed at all in PropertyAliases nor PropValueAliases. This program assumes 439# for these early releases that you want the properties that are specified in 440# the 5.2 release. 441# 442# You may need to adjust the entries to suit your purposes. setup_unihan(), 443# and filter_unihan_line() are the functions where this is done. This program 444# already does some adjusting to make the lines look more like the rest of the 445# Unicode DB; You can see what that is in filter_unihan_line() 446# 447# There is a bug in the 3.2 data file in which some values for the 448# kPrimaryNumeric property have commas and an unexpected comment. A filter 449# could be added to correct these; or for a particular installation, the 450# Unihan.txt file could be edited to fix them. 451# 452# HOW TO ADD A FILE TO BE PROCESSED 453# 454# A new file from Unicode needs to have an object constructed for it in 455# @input_file_objects, probably at the end or at the end of the extracted 456# ones. The program should warn you if its name will clash with others on 457# restrictive file systems, like DOS. If so, figure out a better name, and 458# add lines to the README.perl file giving that. If the file is a character 459# property, it should be in the format that Unicode has implicitly 460# standardized for such files for the more recently introduced ones. 461# If so, the Input_file constructor for @input_file_objects can just be the 462# file name and release it first appeared in. If not, then it should be 463# possible to construct an each_line_handler() to massage the line into the 464# standardized form. 465# 466# For non-character properties, more code will be needed. You can look at 467# the existing entries for clues. 468# 469# UNICODE VERSIONS NOTES 470# 471# The Unicode UCD has had a number of errors in it over the versions. And 472# these remain, by policy, in the standard for that version. Therefore it is 473# risky to correct them, because code may be expecting the error. So this 474# program doesn't generally make changes, unless the error breaks the Perl 475# core. As an example, some versions of 2.1.x Jamo.txt have the wrong value 476# for U+1105, which causes real problems for the algorithms for Jamo 477# calculations, so it is changed here. 478# 479# But it isn't so clear cut as to what to do about concepts that are 480# introduced in a later release; should they extend back to earlier releases 481# where the concept just didn't exist? It was easier to do this than to not, 482# so that's what was done. For example, the default value for code points not 483# in the files for various properties was probably undefined until changed by 484# some version. No_Block for blocks is such an example. This program will 485# assign No_Block even in Unicode versions that didn't have it. This has the 486# benefit that code being written doesn't have to special case earlier 487# versions; and the detriment that it doesn't match the Standard precisely for 488# the affected versions. 489# 490# Here are some observations about some of the issues in early versions: 491# 492# Prior to version 3.0, there were 3 character decompositions. These are not 493# handled by Unicode::Normalize, nor will it compile when presented a version 494# that has them. However, you can trivially get it to compile by simply 495# ignoring those decompositions, by changing the croak to a carp. At the time 496# of this writing, the line (in dist/Unicode-Normalize/Normalize.pm or 497# dist/Unicode-Normalize/mkheader) reads 498# 499# croak("Weird Canonical Decomposition of U+$h"); 500# 501# Simply comment it out. It will compile, but will not know about any three 502# character decompositions. 503 504# The number of code points in \p{alpha=True} halved in 2.1.9. It turns out 505# that the reason is that the CJK block starting at 4E00 was removed from 506# PropList, and was not put back in until 3.1.0. The Perl extension (the 507# single property name \p{alpha}) has the correct values. But the compound 508# form is simply not generated until 3.1, as it can be argued that prior to 509# this release, this was not an official property. The comments for 510# filter_old_style_proplist() give more details. 511# 512# Unicode introduced the synonym Space for White_Space in 4.1. Perl has 513# always had a \p{Space}. In release 3.2 only, they are not synonymous. The 514# reason is that 3.2 introduced U+205F=medium math space, which was not 515# classed as white space, but Perl figured out that it should have been. 4.0 516# reclassified it correctly. 517# 518# Another change between 3.2 and 4.0 is the CCC property value ATBL. In 3.2 519# this was erroneously a synonym for 202 (it should be 200). In 4.0, ATB 520# became 202, and ATBL was left with no code points, as all the ones that 521# mapped to 202 stayed mapped to 202. Thus if your program used the numeric 522# name for the class, it would not have been affected, but if it used the 523# mnemonic, it would have been. 524# 525# \p{Script=Hrkt} (Katakana_Or_Hiragana) came in 4.0.1. Before that, code 526# points which eventually came to have this script property value, instead 527# mapped to "Unknown". But in the next release all these code points were 528# moved to \p{sc=common} instead. 529 530# The tests furnished by Unicode for testing WordBreak and SentenceBreak 531# generate errors in 5.0 and earlier. 532# 533# The default for missing code points for BidiClass is complicated. Starting 534# in 3.1.1, the derived file DBidiClass.txt handles this, but this program 535# tries to do the best it can for earlier releases. It is done in 536# process_PropertyAliases() 537# 538# In version 2.1.2, the entry in UnicodeData.txt: 539# 0275;LATIN SMALL LETTER BARRED O;Ll;0;L;;;;;N;;;;019F; 540# should instead be 541# 0275;LATIN SMALL LETTER BARRED O;Ll;0;L;;;;;N;;;019F;;019F 542# Without this change, there are casing problems for this character. 543# 544# Search for $string_compare_versions to see how to compare changes to 545# properties between Unicode versions 546# 547############################################################################## 548 549my $UNDEF = ':UNDEF:'; # String to print out for undefined values in tracing 550 # and errors 551my $MAX_LINE_WIDTH = 78; 552 553# Debugging aid to skip most files so as to not be distracted by them when 554# concentrating on the ones being debugged. Add 555# non_skip => 1, 556# to the constructor for those files you want processed when you set this. 557# Files with a first version number of 0 are special: they are always 558# processed regardless of the state of this flag. Generally, Jamo.txt and 559# UnicodeData.txt must not be skipped if you want this program to not die 560# before normal completion. 561my $debug_skip = 0; 562 563 564# Normally these are suppressed. 565my $write_Unicode_deprecated_tables = 0; 566 567# Set to 1 to enable tracing. 568our $to_trace = 0; 569 570{ # Closure for trace: debugging aid 571 my $print_caller = 1; # ? Include calling subroutine name 572 my $main_with_colon = 'main::'; 573 my $main_colon_length = length($main_with_colon); 574 575 sub trace { 576 return unless $to_trace; # Do nothing if global flag not set 577 578 my @input = @_; 579 580 local $DB::trace = 0; 581 $DB::trace = 0; # Quiet 'used only once' message 582 583 my $line_number; 584 585 # Loop looking up the stack to get the first non-trace caller 586 my $caller_line; 587 my $caller_name; 588 my $i = 0; 589 do { 590 $line_number = $caller_line; 591 (my $pkg, my $file, $caller_line, my $caller) = caller $i++; 592 $caller = $main_with_colon unless defined $caller; 593 594 $caller_name = $caller; 595 596 # get rid of pkg 597 $caller_name =~ s/.*:://; 598 if (substr($caller_name, 0, $main_colon_length) 599 eq $main_with_colon) 600 { 601 $caller_name = substr($caller_name, $main_colon_length); 602 } 603 604 } until ($caller_name ne 'trace'); 605 606 # If the stack was empty, we were called from the top level 607 $caller_name = 'main' if ($caller_name eq "" 608 || $caller_name eq 'trace'); 609 610 my $output = ""; 611 #print STDERR __LINE__, ": ", join ", ", @input, "\n"; 612 foreach my $string (@input) { 613 if (ref $string eq 'ARRAY' || ref $string eq 'HASH') { 614 $output .= simple_dumper($string); 615 } 616 else { 617 $string = "$string" if ref $string; 618 $string = $UNDEF unless defined $string; 619 chomp $string; 620 $string = '""' if $string eq ""; 621 $output .= " " if $output ne "" 622 && $string ne "" 623 && substr($output, -1, 1) ne " " 624 && substr($string, 0, 1) ne " "; 625 $output .= $string; 626 } 627 } 628 629 print STDERR sprintf "%4d: ", $line_number if defined $line_number; 630 print STDERR "$caller_name: " if $print_caller; 631 print STDERR $output, "\n"; 632 return; 633 } 634} 635 636sub stack_trace() { 637 local $to_trace = 1 if main::DEBUG; 638 my $line = (caller(0))[2]; 639 my $i = 1; 640 641 # Accumulate the stack trace 642 while (1) { 643 my ($pkg, $file, $caller_line, $caller) = caller $i++; 644 645 last unless defined $caller; 646 647 trace "called from $caller() at line $line"; 648 $line = $caller_line; 649 } 650} 651 652# This is for a rarely used development feature that allows you to compare two 653# versions of the Unicode standard without having to deal with changes caused 654# by the code points introduced in the later version. You probably also want 655# to use the -annotate option when using this. Run this program on a unicore 656# containing the starting release you want to compare. Save that output 657# structure. Then, switching to a unicore with the ending release, change the 658# "" in the $string_compare_versions definition just below to a string 659# containing a SINGLE dotted Unicode release number (e.g. "2.1") corresponding 660# to the starting release. This program will then compile, but throw away all 661# code points introduced after the starting release. Finally use a diff tool 662# to compare the two directory structures. They include only the code points 663# common to both releases, and you can see the changes caused just by the 664# underlying release semantic changes. For versions earlier than 3.2, you 665# must copy a version of DAge.txt into the directory. 666my $string_compare_versions = DEBUG && ""; 667my $compare_versions = DEBUG 668 && $string_compare_versions 669 && pack "C*", split /\./, $string_compare_versions; 670 671sub uniques { 672 # Returns non-duplicated input values. From "Perl Best Practices: 673 # Encapsulated Cleverness". p. 455 in first edition. 674 675 my %seen; 676 # Arguably this breaks encapsulation, if the goal is to permit multiple 677 # distinct objects to stringify to the same value, and be interchangeable. 678 # However, for this program, no two objects stringify identically, and all 679 # lists passed to this function are either objects or strings. So this 680 # doesn't affect correctness, but it does give a couple of percent speedup. 681 no overloading; 682 return grep { ! $seen{$_}++ } @_; 683} 684 685$0 = File::Spec->canonpath($0); 686 687my $make_test_script = 0; # ? Should we output a test script 688my $make_norm_test_script = 0; # ? Should we output a normalization test script 689my $write_unchanged_files = 0; # ? Should we update the output files even if 690 # we don't think they have changed 691my $use_directory = ""; # ? Should we chdir somewhere. 692my $pod_directory; # input directory to store the pod file. 693my $pod_file = 'perluniprops'; 694my $t_path; # Path to the .t test file 695my $file_list = 'mktables.lst'; # File to store input and output file names. 696 # This is used to speed up the build, by not 697 # executing the main body of the program if 698 # nothing on the list has changed since the 699 # previous build 700my $make_list = 1; # ? Should we write $file_list. Set to always 701 # make a list so that when the pumpking is 702 # preparing a release, s/he won't have to do 703 # special things 704my $glob_list = 0; # ? Should we try to include unknown .txt files 705 # in the input. 706my $output_range_counts = $debugging_build; # ? Should we include the number 707 # of code points in ranges in 708 # the output 709my $annotate = 0; # ? Should character names be in the output 710 711# Verbosity levels; 0 is quiet 712my $NORMAL_VERBOSITY = 1; 713my $PROGRESS = 2; 714my $VERBOSE = 3; 715 716my $verbosity = $NORMAL_VERBOSITY; 717 718# Stored in mktables.lst so that if this program is called with different 719# options, will regenerate even if the files otherwise look like they're 720# up-to-date. 721my $command_line_arguments = join " ", @ARGV; 722 723# Process arguments 724while (@ARGV) { 725 my $arg = shift @ARGV; 726 if ($arg eq '-v') { 727 $verbosity = $VERBOSE; 728 } 729 elsif ($arg eq '-p') { 730 $verbosity = $PROGRESS; 731 $| = 1; # Flush buffers as we go. 732 } 733 elsif ($arg eq '-q') { 734 $verbosity = 0; 735 } 736 elsif ($arg eq '-w') { 737 # update the files even if they haven't changed 738 $write_unchanged_files = 1; 739 } 740 elsif ($arg eq '-check') { 741 my $this = shift @ARGV; 742 my $ok = shift @ARGV; 743 if ($this ne $ok) { 744 print "Skipping as check params are not the same.\n"; 745 exit(0); 746 } 747 } 748 elsif ($arg eq '-P' && defined ($pod_directory = shift)) { 749 -d $pod_directory or croak "Directory '$pod_directory' doesn't exist"; 750 } 751 elsif ($arg eq '-maketest' || ($arg eq '-T' && defined ($t_path = shift))) 752 { 753 $make_test_script = 1; 754 } 755 elsif ($arg eq '-makenormtest') 756 { 757 $make_norm_test_script = 1; 758 } 759 elsif ($arg eq '-makelist') { 760 $make_list = 1; 761 } 762 elsif ($arg eq '-C' && defined ($use_directory = shift)) { 763 -d $use_directory or croak "Unknown directory '$use_directory'"; 764 } 765 elsif ($arg eq '-L') { 766 767 # Existence not tested until have chdir'd 768 $file_list = shift; 769 } 770 elsif ($arg eq '-globlist') { 771 $glob_list = 1; 772 } 773 elsif ($arg eq '-c') { 774 $output_range_counts = ! $output_range_counts 775 } 776 elsif ($arg eq '-annotate') { 777 $annotate = 1; 778 $debugging_build = 1; 779 $output_range_counts = 1; 780 } 781 else { 782 my $with_c = 'with'; 783 $with_c .= 'out' if $output_range_counts; # Complements the state 784 croak <<END; 785usage: $0 [-c|-p|-q|-v|-w] [-C dir] [-L filelist] [ -P pod_dir ] 786 [ -T test_file_path ] [-globlist] [-makelist] [-maketest] 787 [-check A B ] 788 -c : Output comments $with_c number of code points in ranges 789 -q : Quiet Mode: Only output serious warnings. 790 -p : Set verbosity level to normal plus show progress. 791 -v : Set Verbosity level high: Show progress and non-serious 792 warnings 793 -w : Write files regardless 794 -C dir : Change to this directory before proceeding. All relative paths 795 except those specified by the -P and -T options will be done 796 with respect to this directory. 797 -P dir : Output $pod_file file to directory 'dir'. 798 -T path : Create a test script as 'path'; overrides -maketest 799 -L filelist : Use alternate 'filelist' instead of standard one 800 -globlist : Take as input all non-Test *.txt files in current and sub 801 directories 802 -maketest : Make test script 'TestProp.pl' in current (or -C directory), 803 overrides -T 804 -makelist : Rewrite the file list $file_list based on current setup 805 -annotate : Output an annotation for each character in the table files; 806 useful for debugging mktables, looking at diffs; but is slow 807 and memory intensive 808 -check A B : Executes $0 only if A and B are the same 809END 810 } 811} 812 813# Stores the most-recently changed file. If none have changed, can skip the 814# build 815my $most_recent = (stat $0)[9]; # Do this before the chdir! 816 817# Change directories now, because need to read 'version' early. 818if ($use_directory) { 819 if ($pod_directory && ! File::Spec->file_name_is_absolute($pod_directory)) { 820 $pod_directory = File::Spec->rel2abs($pod_directory); 821 } 822 if ($t_path && ! File::Spec->file_name_is_absolute($t_path)) { 823 $t_path = File::Spec->rel2abs($t_path); 824 } 825 chdir $use_directory or croak "Failed to chdir to '$use_directory':$!"; 826 if ($pod_directory && File::Spec->file_name_is_absolute($pod_directory)) { 827 $pod_directory = File::Spec->abs2rel($pod_directory); 828 } 829 if ($t_path && File::Spec->file_name_is_absolute($t_path)) { 830 $t_path = File::Spec->abs2rel($t_path); 831 } 832} 833 834# Get Unicode version into regular and v-string. This is done now because 835# various tables below get populated based on it. These tables are populated 836# here to be near the top of the file, and so easily seeable by those needing 837# to modify things. 838open my $VERSION, "<", "version" 839 or croak "$0: can't open required file 'version': $!\n"; 840my $string_version = <$VERSION>; 841close $VERSION; 842chomp $string_version; 843my $v_version = pack "C*", split /\./, $string_version; # v string 844 845my $unicode_version = ($compare_versions) 846 ? ( "$string_compare_versions (using " 847 . "$string_version rules)") 848 : $string_version; 849 850# The following are the complete names of properties with property values that 851# are known to not match any code points in some versions of Unicode, but that 852# may change in the future so they should be matchable, hence an empty file is 853# generated for them. 854my @tables_that_may_be_empty; 855push @tables_that_may_be_empty, 'Joining_Type=Left_Joining' 856 if $v_version lt v6.3.0; 857push @tables_that_may_be_empty, 'Script=Common' if $v_version le v4.0.1; 858push @tables_that_may_be_empty, 'Title' if $v_version lt v2.0.0; 859push @tables_that_may_be_empty, 'Script=Katakana_Or_Hiragana' 860 if $v_version ge v4.1.0; 861push @tables_that_may_be_empty, 'Script_Extensions=Katakana_Or_Hiragana' 862 if $v_version ge v6.0.0; 863push @tables_that_may_be_empty, 'Grapheme_Cluster_Break=Prepend' 864 if $v_version ge v6.1.0; 865push @tables_that_may_be_empty, 'Canonical_Combining_Class=CCC133' 866 if $v_version ge v6.2.0; 867 868# The lists below are hashes, so the key is the item in the list, and the 869# value is the reason why it is in the list. This makes generation of 870# documentation easier. 871 872my %why_suppressed; # No file generated for these. 873 874# Files aren't generated for empty extraneous properties. This is arguable. 875# Extraneous properties generally come about because a property is no longer 876# used in a newer version of Unicode. If we generated a file without code 877# points, programs that used to work on that property will still execute 878# without errors. It just won't ever match (or will always match, with \P{}). 879# This means that the logic is now likely wrong. I (khw) think its better to 880# find this out by getting an error message. Just move them to the table 881# above to change this behavior 882my %why_suppress_if_empty_warn_if_not = ( 883 884 # It is the only property that has ever officially been removed from the 885 # Standard. The database never contained any code points for it. 886 'Special_Case_Condition' => 'Obsolete', 887 888 # Apparently never official, but there were code points in some versions of 889 # old-style PropList.txt 890 'Non_Break' => 'Obsolete', 891); 892 893# These would normally go in the warn table just above, but they were changed 894# a long time before this program was written, so warnings about them are 895# moot. 896if ($v_version gt v3.2.0) { 897 push @tables_that_may_be_empty, 898 'Canonical_Combining_Class=Attached_Below_Left' 899} 900 901# Obsoleted 902if ($v_version ge v11.0.0) { 903 push @tables_that_may_be_empty, qw( 904 Grapheme_Cluster_Break=E_Base 905 Grapheme_Cluster_Break=E_Base_GAZ 906 Grapheme_Cluster_Break=E_Modifier 907 Grapheme_Cluster_Break=Glue_After_Zwj 908 Word_Break=E_Base 909 Word_Break=E_Base_GAZ 910 Word_Break=E_Modifier 911 Word_Break=Glue_After_Zwj); 912} 913 914# Enum values for to_output_map() method in the Map_Table package. (0 is don't 915# output) 916my $EXTERNAL_MAP = 1; 917my $INTERNAL_MAP = 2; 918my $OUTPUT_ADJUSTED = 3; 919 920# To override computed values for writing the map tables for these properties. 921# The default for enum map tables is to write them out, so that the Unicode 922# .txt files can be removed, but all the data to compute any property value 923# for any code point is available in a more compact form. 924my %global_to_output_map = ( 925 # Needed by UCD.pm, but don't want to publicize that it exists, so won't 926 # get stuck supporting it if things change. Since it is a STRING 927 # property, it normally would be listed in the pod, but INTERNAL_MAP 928 # suppresses that. 929 Unicode_1_Name => $INTERNAL_MAP, 930 931 Present_In => 0, # Suppress, as easily computed from Age 932 Block => (NON_ASCII_PLATFORM) ? 1 : 0, # Suppress, as Blocks.txt is 933 # retained, but needed for 934 # non-ASCII 935 936 # Suppress, as mapping can be found instead from the 937 # Perl_Decomposition_Mapping file 938 Decomposition_Type => 0, 939); 940 941# There are several types of obsolete properties defined by Unicode. These 942# must be hand-edited for every new Unicode release. 943my %why_deprecated; # Generates a deprecated warning message if used. 944my %why_stabilized; # Documentation only 945my %why_obsolete; # Documentation only 946 947{ # Closure 948 my $simple = 'Perl uses the more complete version'; 949 my $unihan = 'Unihan properties are by default not enabled in the Perl core. Instead use CPAN: Unicode::Unihan'; 950 951 my $other_properties = 'other properties'; 952 my $contributory = "Used by Unicode internally for generating $other_properties and not intended to be used stand-alone"; 953 my $why_no_expand = "Deprecated by Unicode. These are characters that expand to more than one character in the specified normalization form, but whether they actually take up more bytes or not depends on the encoding being used. For example, a UTF-8 encoded character may expand to a different number of bytes than a UTF-32 encoded character."; 954 955 %why_deprecated = ( 956 'Grapheme_Link' => 'Duplicates ccc=vr (Canonical_Combining_Class=Virama)', 957 'Jamo_Short_Name' => $contributory, 958 'Line_Break=Surrogate' => 'Surrogates should never appear in well-formed text, and therefore shouldn\'t be the basis for line breaking', 959 'Other_Alphabetic' => $contributory, 960 'Other_Default_Ignorable_Code_Point' => $contributory, 961 'Other_Grapheme_Extend' => $contributory, 962 'Other_ID_Continue' => $contributory, 963 'Other_ID_Start' => $contributory, 964 'Other_Lowercase' => $contributory, 965 'Other_Math' => $contributory, 966 'Other_Uppercase' => $contributory, 967 'Expands_On_NFC' => $why_no_expand, 968 'Expands_On_NFD' => $why_no_expand, 969 'Expands_On_NFKC' => $why_no_expand, 970 'Expands_On_NFKD' => $why_no_expand, 971 ); 972 973 %why_suppressed = ( 974 # There is a lib/unicore/Decomposition.pl (used by Normalize.pm) which 975 # contains the same information, but without the algorithmically 976 # determinable Hangul syllables'. This file is not published, so it's 977 # existence is not noted in the comment. 978 'Decomposition_Mapping' => 'Accessible via Unicode::Normalize or prop_invmap() or charprop() in Unicode::UCD::', 979 980 # Don't suppress ISO_Comment, as otherwise special handling is needed 981 # to differentiate between it and gc=c, which can be written as 'isc', 982 # which is the same characters as ISO_Comment's short name. 983 984 'Name' => "Accessible via \\N{...} or 'use charnames;' or charprop() or prop_invmap() in Unicode::UCD::", 985 986 'Simple_Case_Folding' => "$simple. Can access this through casefold(), charprop(), or prop_invmap() in Unicode::UCD", 987 'Simple_Lowercase_Mapping' => "$simple. Can access this through charinfo(), charprop(), or prop_invmap() in Unicode::UCD", 988 'Simple_Titlecase_Mapping' => "$simple. Can access this through charinfo(), charprop(), or prop_invmap() in Unicode::UCD", 989 'Simple_Uppercase_Mapping' => "$simple. Can access this through charinfo(), charprop(), or prop_invmap() in Unicode::UCD", 990 991 FC_NFKC_Closure => 'Deprecated by Unicode, and supplanted in usage by NFKC_Casefold; otherwise not useful', 992 ); 993 994 foreach my $property ( 995 996 # The following are suppressed because they were made contributory 997 # or deprecated by Unicode before Perl ever thought about 998 # supporting them. 999 'Jamo_Short_Name', 1000 'Grapheme_Link', 1001 'Expands_On_NFC', 1002 'Expands_On_NFD', 1003 'Expands_On_NFKC', 1004 'Expands_On_NFKD', 1005 1006 # The following are suppressed because they have been marked 1007 # as deprecated for a sufficient amount of time 1008 'Other_Alphabetic', 1009 'Other_Default_Ignorable_Code_Point', 1010 'Other_Grapheme_Extend', 1011 'Other_ID_Continue', 1012 'Other_ID_Start', 1013 'Other_Lowercase', 1014 'Other_Math', 1015 'Other_Uppercase', 1016 ) { 1017 $why_suppressed{$property} = $why_deprecated{$property}; 1018 } 1019 1020 # Customize the message for all the 'Other_' properties 1021 foreach my $property (keys %why_deprecated) { 1022 next if (my $main_property = $property) !~ s/^Other_//; 1023 $why_deprecated{$property} =~ s/$other_properties/the $main_property property (which should be used instead)/; 1024 } 1025} 1026 1027if ($write_Unicode_deprecated_tables) { 1028 foreach my $property (keys %why_suppressed) { 1029 delete $why_suppressed{$property} if $property =~ 1030 / ^ Other | Grapheme /x; 1031 } 1032} 1033 1034if ($v_version ge 4.0.0) { 1035 $why_stabilized{'Hyphen'} = 'Use the Line_Break property instead; see www.unicode.org/reports/tr14'; 1036 if ($v_version ge 6.0.0) { 1037 $why_deprecated{'Hyphen'} = 'Supplanted by Line_Break property values; see www.unicode.org/reports/tr14'; 1038 } 1039} 1040if ($v_version ge 5.2.0 && $v_version lt 6.0.0) { 1041 $why_obsolete{'ISO_Comment'} = 'Code points for it have been removed'; 1042 if ($v_version ge 6.0.0) { 1043 $why_deprecated{'ISO_Comment'} = 'No longer needed for Unicode\'s internal chart generation; otherwise not useful, and code points for it have been removed'; 1044 } 1045} 1046 1047# Probably obsolete forever 1048if ($v_version ge v4.1.0) { 1049 $why_suppressed{'Script=Katakana_Or_Hiragana'} = 'Obsolete. All code points previously matched by this have been moved to "Script=Common".'; 1050} 1051if ($v_version ge v6.0.0) { 1052 $why_suppressed{'Script=Katakana_Or_Hiragana'} .= ' Consider instead using "Script_Extensions=Katakana" or "Script_Extensions=Hiragana" (or both)'; 1053 $why_suppressed{'Script_Extensions=Katakana_Or_Hiragana'} = 'All code points that would be matched by this are matched by either "Script_Extensions=Katakana" or "Script_Extensions=Hiragana"'; 1054} 1055 1056# This program can create files for enumerated-like properties, such as 1057# 'Numeric_Type'. This file would be the same format as for a string 1058# property, with a mapping from code point to its value, so you could look up, 1059# for example, the script a code point is in. But no one so far wants this 1060# mapping, or they have found another way to get it since this is a new 1061# feature. So no file is generated except if it is in this list. 1062my @output_mapped_properties = split "\n", <<END; 1063END 1064 1065# If you want more Unihan properties than the default, you need to add them to 1066# these arrays. Depending on the property type, @missing lines might have to 1067# be added to the second array. A sample entry would be (including the '#'): 1068# @missing: 0000..10FFFF; cjkAccountingNumeric; NaN 1069my @cjk_properties = split "\n", <<'END'; 1070END 1071my @cjk_property_values = split "\n", <<'END'; 1072END 1073 1074# The input files don't list every code point. Those not listed are to be 1075# defaulted to some value. Below are hard-coded what those values are for 1076# non-binary properties as of 5.1. Starting in 5.0, there are 1077# machine-parsable comment lines in the files that give the defaults; so this 1078# list shouldn't have to be extended. The claim is that all missing entries 1079# for binary properties will default to 'N'. Unicode tried to change that in 1080# 5.2, but the beta period produced enough protest that they backed off. 1081# 1082# The defaults for the fields that appear in UnicodeData.txt in this hash must 1083# be in the form that it expects. The others may be synonyms. 1084my $CODE_POINT = '<code point>'; 1085my %default_mapping = ( 1086 Age => "Unassigned", 1087 # Bidi_Class => Complicated; set in code 1088 Bidi_Mirroring_Glyph => "", 1089 Block => 'No_Block', 1090 Canonical_Combining_Class => 0, 1091 Case_Folding => $CODE_POINT, 1092 Decomposition_Mapping => $CODE_POINT, 1093 Decomposition_Type => 'None', 1094 East_Asian_Width => "Neutral", 1095 FC_NFKC_Closure => $CODE_POINT, 1096 General_Category => ($v_version le 6.3.0) ? 'Cn' : 'Unassigned', 1097 Grapheme_Cluster_Break => 'Other', 1098 Hangul_Syllable_Type => 'NA', 1099 ISO_Comment => "", 1100 Jamo_Short_Name => "", 1101 Joining_Group => "No_Joining_Group", 1102 # Joining_Type => Complicated; set in code 1103 kIICore => 'N', # Is converted to binary 1104 #Line_Break => Complicated; set in code 1105 Lowercase_Mapping => $CODE_POINT, 1106 Name => "", 1107 Name_Alias => "", 1108 NFC_QC => 'Yes', 1109 NFD_QC => 'Yes', 1110 NFKC_QC => 'Yes', 1111 NFKD_QC => 'Yes', 1112 Numeric_Type => 'None', 1113 Numeric_Value => 'NaN', 1114 Script => ($v_version le 4.1.0) ? 'Common' : 'Unknown', 1115 Sentence_Break => 'Other', 1116 Simple_Case_Folding => $CODE_POINT, 1117 Simple_Lowercase_Mapping => $CODE_POINT, 1118 Simple_Titlecase_Mapping => $CODE_POINT, 1119 Simple_Uppercase_Mapping => $CODE_POINT, 1120 Titlecase_Mapping => $CODE_POINT, 1121 Unicode_1_Name => "", 1122 Unicode_Radical_Stroke => "", 1123 Uppercase_Mapping => $CODE_POINT, 1124 Word_Break => 'Other', 1125); 1126 1127### End of externally interesting definitions, except for @input_file_objects 1128 1129my $HEADER=<<"EOF"; 1130# !!!!!!! DO NOT EDIT THIS FILE !!!!!!! 1131# This file is machine-generated by $0 from the Unicode 1132# database, Version $unicode_version. Any changes made here will be lost! 1133EOF 1134 1135my $INTERNAL_ONLY_HEADER = <<"EOF"; 1136 1137# !!!!!!! INTERNAL PERL USE ONLY !!!!!!! 1138# This file is for internal use by core Perl only. The format and even the 1139# name or existence of this file are subject to change without notice. Don't 1140# use it directly. Use Unicode::UCD to access the Unicode character data 1141# base. 1142EOF 1143 1144my $DEVELOPMENT_ONLY=<<"EOF"; 1145# !!!!!!! DEVELOPMENT USE ONLY !!!!!!! 1146# This file contains information artificially constrained to code points 1147# present in Unicode release $string_compare_versions. 1148# IT CANNOT BE RELIED ON. It is for use during development only and should 1149# not be used for production. 1150 1151EOF 1152 1153my $MAX_UNICODE_CODEPOINT_STRING = ($v_version ge v2.0.0) 1154 ? "10FFFF" 1155 : "FFFF"; 1156my $MAX_UNICODE_CODEPOINT = hex $MAX_UNICODE_CODEPOINT_STRING; 1157my $MAX_UNICODE_CODEPOINTS = $MAX_UNICODE_CODEPOINT + 1; 1158 1159# We work with above-Unicode code points, up to IV_MAX, but we may want to use 1160# sentinels above that number. Therefore for internal use, we use a much 1161# smaller number, translating it to IV_MAX only for output. The exact number 1162# is immaterial (all above-Unicode code points are treated exactly the same), 1163# but the algorithm requires it to be at least 1164# 2 * $MAX_UNICODE_CODEPOINTS + 1 1165my $MAX_WORKING_CODEPOINTS= $MAX_UNICODE_CODEPOINT * 8; 1166my $MAX_WORKING_CODEPOINT = $MAX_WORKING_CODEPOINTS - 1; 1167my $MAX_WORKING_CODEPOINT_STRING = sprintf("%X", $MAX_WORKING_CODEPOINT); 1168 1169my $MAX_PLATFORM_CODEPOINT = ~0 >> 1; 1170 1171# Matches legal code point. 4-6 hex numbers, If there are 6, the first 1172# two must be 10; if there are 5, the first must not be a 0. Written this way 1173# to decrease backtracking. The first regex allows the code point to be at 1174# the end of a word, but to work properly, the word shouldn't end with a valid 1175# hex character. The second one won't match a code point at the end of a 1176# word, and doesn't have the run-on issue 1177my $run_on_code_point_re = 1178 qr/ (?: 10[0-9A-F]{4} | [1-9A-F][0-9A-F]{4} | [0-9A-F]{4} ) \b/x; 1179my $code_point_re = qr/\b$run_on_code_point_re/; 1180 1181# This matches the beginning of the line in the Unicode DB files that give the 1182# defaults for code points not listed (i.e., missing) in the file. The code 1183# depends on this ending with a semi-colon, so it can assume it is a valid 1184# field when the line is split() by semi-colons 1185my $missing_defaults_prefix = qr/^#\s+\@missing:\s+0000\.\.10FFFF\s*;/; 1186 1187# Property types. Unicode has more types, but these are sufficient for our 1188# purposes. 1189my $UNKNOWN = -1; # initialized to illegal value 1190my $NON_STRING = 1; # Either binary or enum 1191my $BINARY = 2; 1192my $FORCED_BINARY = 3; # Not a binary property, but, besides its normal 1193 # tables, additional true and false tables are 1194 # generated so that false is anything matching the 1195 # default value, and true is everything else. 1196my $ENUM = 4; # Include catalog 1197my $STRING = 5; # Anything else: string or misc 1198 1199# Some input files have lines that give default values for code points not 1200# contained in the file. Sometimes these should be ignored. 1201my $NO_DEFAULTS = 0; # Must evaluate to false 1202my $NOT_IGNORED = 1; 1203my $IGNORED = 2; 1204 1205# Range types. Each range has a type. Most ranges are type 0, for normal, 1206# and will appear in the main body of the tables in the output files, but 1207# there are other types of ranges as well, listed below, that are specially 1208# handled. There are pseudo-types as well that will never be stored as a 1209# type, but will affect the calculation of the type. 1210 1211# 0 is for normal, non-specials 1212my $MULTI_CP = 1; # Sequence of more than code point 1213my $HANGUL_SYLLABLE = 2; 1214my $CP_IN_NAME = 3; # The NAME contains the code point appended to it. 1215my $NULL = 4; # The map is to the null string; utf8.c can't 1216 # handle these, nor is there an accepted syntax 1217 # for them in \p{} constructs 1218my $COMPUTE_NO_MULTI_CP = 5; # Pseudo-type; means that ranges that would 1219 # otherwise be $MULTI_CP type are instead type 0 1220 1221# process_generic_property_file() can accept certain overrides in its input. 1222# Each of these must begin AND end with $CMD_DELIM. 1223my $CMD_DELIM = "\a"; 1224my $REPLACE_CMD = 'replace'; # Override the Replace 1225my $MAP_TYPE_CMD = 'map_type'; # Override the Type 1226 1227my $NO = 0; 1228my $YES = 1; 1229 1230# Values for the Replace argument to add_range. 1231# $NO # Don't replace; add only the code points not 1232 # already present. 1233my $IF_NOT_EQUIVALENT = 1; # Replace only under certain conditions; details in 1234 # the comments at the subroutine definition. 1235my $UNCONDITIONALLY = 2; # Replace without conditions. 1236my $MULTIPLE_BEFORE = 4; # Don't replace, but add a duplicate record if 1237 # already there 1238my $MULTIPLE_AFTER = 5; # Don't replace, but add a duplicate record if 1239 # already there 1240my $CROAK = 6; # Die with an error if is already there 1241 1242# Flags to give property statuses. The phrases are to remind maintainers that 1243# if the flag is changed, the indefinite article referring to it in the 1244# documentation may need to be as well. 1245my $NORMAL = ""; 1246my $DEPRECATED = 'D'; 1247my $a_bold_deprecated = "a 'B<$DEPRECATED>'"; 1248my $A_bold_deprecated = "A 'B<$DEPRECATED>'"; 1249my $DISCOURAGED = 'X'; 1250my $a_bold_discouraged = "an 'B<$DISCOURAGED>'"; 1251my $A_bold_discouraged = "An 'B<$DISCOURAGED>'"; 1252my $STRICTER = 'T'; 1253my $a_bold_stricter = "a 'B<$STRICTER>'"; 1254my $A_bold_stricter = "A 'B<$STRICTER>'"; 1255my $STABILIZED = 'S'; 1256my $a_bold_stabilized = "an 'B<$STABILIZED>'"; 1257my $A_bold_stabilized = "An 'B<$STABILIZED>'"; 1258my $OBSOLETE = 'O'; 1259my $a_bold_obsolete = "an 'B<$OBSOLETE>'"; 1260my $A_bold_obsolete = "An 'B<$OBSOLETE>'"; 1261 1262# Aliases can also have an extra status: 1263my $INTERNAL_ALIAS = 'P'; 1264 1265my %status_past_participles = ( 1266 $DISCOURAGED => 'discouraged', 1267 $STABILIZED => 'stabilized', 1268 $OBSOLETE => 'obsolete', 1269 $DEPRECATED => 'deprecated', 1270 $INTERNAL_ALIAS => 'reserved for Perl core internal use only', 1271); 1272 1273# Table fates. These are somewhat ordered, so that fates < $MAP_PROXIED should be 1274# externally documented. 1275my $ORDINARY = 0; # The normal fate. 1276my $MAP_PROXIED = 1; # The map table for the property isn't written out, 1277 # but there is a file written that can be used to 1278 # reconstruct this table 1279my $INTERNAL_ONLY = 2; # The file for this table is written out, but it is 1280 # for Perl's internal use only 1281my $LEGACY_ONLY = 3; # Like $INTERNAL_ONLY, but not actually used by Perl. 1282 # Is for backwards compatibility for applications that 1283 # read the file directly, so it's format is 1284 # unchangeable. 1285my $SUPPRESSED = 4; # The file for this table is not written out, and as a 1286 # result, we don't bother to do many computations on 1287 # it. 1288my $PLACEHOLDER = 5; # Like $SUPPRESSED, but we go through all the 1289 # computations anyway, as the values are needed for 1290 # things to work. This happens when we have Perl 1291 # extensions that depend on Unicode tables that 1292 # wouldn't normally be in a given Unicode version. 1293 1294# The format of the values of the tables: 1295my $EMPTY_FORMAT = ""; 1296my $BINARY_FORMAT = 'b'; 1297my $DECIMAL_FORMAT = 'd'; 1298my $FLOAT_FORMAT = 'f'; 1299my $INTEGER_FORMAT = 'i'; 1300my $HEX_FORMAT = 'x'; 1301my $RATIONAL_FORMAT = 'r'; 1302my $STRING_FORMAT = 's'; 1303my $ADJUST_FORMAT = 'a'; 1304my $HEX_ADJUST_FORMAT = 'ax'; 1305my $DECOMP_STRING_FORMAT = 'c'; 1306my $STRING_WHITE_SPACE_LIST = 'sw'; 1307 1308my %map_table_formats = ( 1309 $BINARY_FORMAT => 'binary', 1310 $DECIMAL_FORMAT => 'single decimal digit', 1311 $FLOAT_FORMAT => 'floating point number', 1312 $INTEGER_FORMAT => 'integer', 1313 $HEX_FORMAT => 'non-negative hex whole number; a code point', 1314 $RATIONAL_FORMAT => 'rational: an integer or a fraction', 1315 $STRING_FORMAT => 'string', 1316 $ADJUST_FORMAT => 'some entries need adjustment', 1317 $HEX_ADJUST_FORMAT => 'mapped value in hex; some entries need adjustment', 1318 $DECOMP_STRING_FORMAT => 'Perl\'s internal (Normalize.pm) decomposition mapping', 1319 $STRING_WHITE_SPACE_LIST => 'string, but some elements are interpreted as a list; white space occurs only as list item separators' 1320); 1321 1322# Unicode didn't put such derived files in a separate directory at first. 1323my $EXTRACTED_DIR = (-d 'extracted') ? 'extracted' : ""; 1324my $EXTRACTED = ($EXTRACTED_DIR) ? "$EXTRACTED_DIR/" : ""; 1325my $AUXILIARY = 'auxiliary'; 1326my $EMOJI = 'emoji'; 1327 1328# Hashes and arrays that will eventually go into UCD.pl for the use of UCD.pm 1329my %loose_to_file_of; # loosely maps table names to their respective 1330 # files 1331my %stricter_to_file_of; # same; but for stricter mapping. 1332my %loose_property_to_file_of; # Maps a loose property name to its map file 1333my %strict_property_to_file_of; # Same, but strict 1334my @inline_definitions = "V0"; # Each element gives a definition of a unique 1335 # inversion list. When a definition is inlined, 1336 # its value in the hash it's in (one of the two 1337 # defined just above) will include an index into 1338 # this array. The 0th element is initialized to 1339 # the definition for a zero length inversion list 1340my %file_to_swash_name; # Maps the file name to its corresponding key name 1341 # in the hash %Unicode::UCD::SwashInfo 1342my %nv_floating_to_rational; # maps numeric values floating point numbers to 1343 # their rational equivalent 1344my %loose_property_name_of; # Loosely maps (non_string) property names to 1345 # standard form 1346my %strict_property_name_of; # Strictly maps (non_string) property names to 1347 # standard form 1348my %string_property_loose_to_name; # Same, for string properties. 1349my %loose_defaults; # keys are of form "prop=value", where 'prop' is 1350 # the property name in standard loose form, and 1351 # 'value' is the default value for that property, 1352 # also in standard loose form. 1353my %loose_to_standard_value; # loosely maps table names to the canonical 1354 # alias for them 1355my %ambiguous_names; # keys are alias names (in standard form) that 1356 # have more than one possible meaning. 1357my %combination_property; # keys are alias names (in standard form) that 1358 # have both a map table, and a binary one that 1359 # yields true for all non-null maps. 1360my %prop_aliases; # Keys are standard property name; values are each 1361 # one's aliases 1362my %prop_value_aliases; # Keys of top level are standard property name; 1363 # values are keys to another hash, Each one is 1364 # one of the property's values, in standard form. 1365 # The values are that prop-val's aliases. 1366my %skipped_files; # List of files that we skip 1367my %ucd_pod; # Holds entries that will go into the UCD section of the pod 1368 1369# Most properties are immune to caseless matching, otherwise you would get 1370# nonsensical results, as properties are a function of a code point, not 1371# everything that is caselessly equivalent to that code point. For example, 1372# Changes_When_Case_Folded('s') should be false, whereas caselessly it would 1373# be true because 's' and 'S' are equivalent caselessly. However, 1374# traditionally, [:upper:] and [:lower:] are equivalent caselessly, so we 1375# extend that concept to those very few properties that are like this. Each 1376# such property will match the full range caselessly. They are hard-coded in 1377# the program; it's not worth trying to make it general as it's extremely 1378# unlikely that they will ever change. 1379my %caseless_equivalent_to; 1380 1381# This is the range of characters that were in Release 1 of Unicode, and 1382# removed in Release 2 (replaced with the current Hangul syllables starting at 1383# U+AC00). The range was reused starting in Release 3 for other purposes. 1384my $FIRST_REMOVED_HANGUL_SYLLABLE = 0x3400; 1385my $FINAL_REMOVED_HANGUL_SYLLABLE = 0x4DFF; 1386 1387# These constants names and values were taken from the Unicode standard, 1388# version 5.1, section 3.12. They are used in conjunction with Hangul 1389# syllables. The '_string' versions are so generated tables can retain the 1390# hex format, which is the more familiar value 1391my $SBase_string = "0xAC00"; 1392my $SBase = CORE::hex $SBase_string; 1393my $LBase_string = "0x1100"; 1394my $LBase = CORE::hex $LBase_string; 1395my $VBase_string = "0x1161"; 1396my $VBase = CORE::hex $VBase_string; 1397my $TBase_string = "0x11A7"; 1398my $TBase = CORE::hex $TBase_string; 1399my $SCount = 11172; 1400my $LCount = 19; 1401my $VCount = 21; 1402my $TCount = 28; 1403my $NCount = $VCount * $TCount; 1404 1405# For Hangul syllables; These store the numbers from Jamo.txt in conjunction 1406# with the above published constants. 1407my %Jamo; 1408my %Jamo_L; # Leading consonants 1409my %Jamo_V; # Vowels 1410my %Jamo_T; # Trailing consonants 1411 1412# For code points whose name contains its ordinal as a '-ABCD' suffix. 1413# The key is the base name of the code point, and the value is an 1414# array giving all the ranges that use this base name. Each range 1415# is actually a hash giving the 'low' and 'high' values of it. 1416my %names_ending_in_code_point; 1417my %loose_names_ending_in_code_point; # Same as above, but has blanks, dashes 1418 # removed from the names 1419# Inverse mapping. The list of ranges that have these kinds of 1420# names. Each element contains the low, high, and base names in an 1421# anonymous hash. 1422my @code_points_ending_in_code_point; 1423 1424# To hold Unicode's normalization test suite 1425my @normalization_tests; 1426 1427# Boolean: does this Unicode version have the hangul syllables, and are we 1428# writing out a table for them? 1429my $has_hangul_syllables = 0; 1430 1431# Does this Unicode version have code points whose names end in their 1432# respective code points, and are we writing out a table for them? 0 for no; 1433# otherwise points to first property that a table is needed for them, so that 1434# if multiple tables are needed, we don't create duplicates 1435my $needing_code_points_ending_in_code_point = 0; 1436 1437my @backslash_X_tests; # List of tests read in for testing \X 1438my @LB_tests; # List of tests read in for testing \b{lb} 1439my @SB_tests; # List of tests read in for testing \b{sb} 1440my @WB_tests; # List of tests read in for testing \b{wb} 1441my @unhandled_properties; # Will contain a list of properties found in 1442 # the input that we didn't process. 1443my @match_properties; # Properties that have match tables, to be 1444 # listed in the pod 1445my @map_properties; # Properties that get map files written 1446my @named_sequences; # NamedSequences.txt contents. 1447my %potential_files; # Generated list of all .txt files in the directory 1448 # structure so we can warn if something is being 1449 # ignored. 1450my @missing_early_files; # Generated list of absent files that we need to 1451 # proceed in compiling this early Unicode version 1452my @files_actually_output; # List of files we generated. 1453my @more_Names; # Some code point names are compound; this is used 1454 # to store the extra components of them. 1455my $E_FLOAT_PRECISION = 2; # The minimum number of digits after the decimal 1456 # point of a normalized floating point number 1457 # needed to match before we consider it equivalent 1458 # to a candidate rational 1459 1460# These store references to certain commonly used property objects 1461my $age; 1462my $ccc; 1463my $gc; 1464my $perl; 1465my $block; 1466my $perl_charname; 1467my $print; 1468my $All; 1469my $Assigned; # All assigned characters in this Unicode release 1470my $DI; # Default_Ignorable_Code_Point property 1471my $NChar; # Noncharacter_Code_Point property 1472my $script; 1473my $scx; # Script_Extensions property 1474my $idt; # Identifier_Type property 1475 1476# Are there conflicting names because of beginning with 'In_', or 'Is_' 1477my $has_In_conflicts = 0; 1478my $has_Is_conflicts = 0; 1479 1480sub internal_file_to_platform ($file=undef) { 1481 # Convert our file paths which have '/' separators to those of the 1482 # platform. 1483 1484 return undef unless defined $file; 1485 1486 return File::Spec->join(split '/', $file); 1487} 1488 1489sub file_exists ($file=undef) { # platform independent '-e'. This program internally 1490 # uses slash as a path separator. 1491 return 0 unless defined $file; 1492 return -e internal_file_to_platform($file); 1493} 1494 1495sub objaddr($addr) { 1496 # Returns the address of the blessed input object. 1497 # It doesn't check for blessedness because that would do a string eval 1498 # every call, and the program is structured so that this is never called 1499 # for a non-blessed object. 1500 1501 no overloading; # If overloaded, numifying below won't work. 1502 1503 # Numifying a ref gives its address. 1504 return pack 'J', $addr; 1505} 1506 1507# These are used only if $annotate is true. 1508# The entire range of Unicode characters is examined to populate these 1509# after all the input has been processed. But most can be skipped, as they 1510# have the same descriptive phrases, such as being unassigned 1511my @viacode; # Contains the 1 million character names 1512my @age; # And their ages ("" if none) 1513my @printable; # boolean: And are those characters printable? 1514my @annotate_char_type; # Contains a type of those characters, specifically 1515 # for the purposes of annotation. 1516my $annotate_ranges; # A map of ranges of code points that have the same 1517 # name for the purposes of annotation. They map to the 1518 # upper edge of the range, so that the end point can 1519 # be immediately found. This is used to skip ahead to 1520 # the end of a range, and avoid processing each 1521 # individual code point in it. 1522my $unassigned_sans_noncharacters; # A Range_List of the unassigned 1523 # characters, but excluding those which are 1524 # also noncharacter code points 1525 1526# The annotation types are an extension of the regular range types, though 1527# some of the latter are folded into one. Make the new types negative to 1528# avoid conflicting with the regular types 1529my $SURROGATE_TYPE = -1; 1530my $UNASSIGNED_TYPE = -2; 1531my $PRIVATE_USE_TYPE = -3; 1532my $NONCHARACTER_TYPE = -4; 1533my $CONTROL_TYPE = -5; 1534my $ABOVE_UNICODE_TYPE = -6; 1535my $UNKNOWN_TYPE = -7; # Used only if there is a bug in this program 1536 1537sub populate_char_info ($i) { 1538 # Used only with the $annotate option. Populates the arrays with the 1539 # input code point's info that are needed for outputting more detailed 1540 # comments. If calling context wants a return, it is the end point of 1541 # any contiguous range of characters that share essentially the same info 1542 1543 $viacode[$i] = $perl_charname->value_of($i) || ""; 1544 $age[$i] = (defined $age) 1545 ? (($age->value_of($i) =~ / ^ \d+ \. \d+ $ /x) 1546 ? $age->value_of($i) 1547 : "") 1548 : ""; 1549 1550 # A character is generally printable if Unicode says it is, 1551 # but below we make sure that most Unicode general category 'C' types 1552 # aren't. 1553 $printable[$i] = $print->contains($i); 1554 1555 # But the characters in this range were removed in v2.0 and replaced by 1556 # different ones later. Modern fonts will be for the replacement 1557 # characters, so suppress printing them. 1558 if (($v_version lt v2.0 1559 || ($compare_versions && $compare_versions lt v2.0)) 1560 && ( $i >= $FIRST_REMOVED_HANGUL_SYLLABLE 1561 && $i <= $FINAL_REMOVED_HANGUL_SYLLABLE)) 1562 { 1563 $printable[$i] = 0; 1564 } 1565 1566 $annotate_char_type[$i] = $perl_charname->type_of($i) || 0; 1567 1568 # Only these two regular types are treated specially for annotations 1569 # purposes 1570 $annotate_char_type[$i] = 0 if $annotate_char_type[$i] != $CP_IN_NAME 1571 && $annotate_char_type[$i] != $HANGUL_SYLLABLE; 1572 1573 # Give a generic name to all code points that don't have a real name. 1574 # We output ranges, if applicable, for these. Also calculate the end 1575 # point of the range. 1576 my $end; 1577 if (! $viacode[$i]) { 1578 if ($i > $MAX_UNICODE_CODEPOINT) { 1579 $viacode[$i] = 'Above-Unicode'; 1580 $annotate_char_type[$i] = $ABOVE_UNICODE_TYPE; 1581 $printable[$i] = 0; 1582 $end = $MAX_WORKING_CODEPOINT; 1583 } 1584 elsif ($gc-> table('Private_use')->contains($i)) { 1585 $viacode[$i] = 'Private Use'; 1586 $annotate_char_type[$i] = $PRIVATE_USE_TYPE; 1587 $printable[$i] = 0; 1588 $end = $gc->table('Private_Use')->containing_range($i)->end; 1589 } 1590 elsif ($NChar->contains($i)) { 1591 $viacode[$i] = 'Noncharacter'; 1592 $annotate_char_type[$i] = $NONCHARACTER_TYPE; 1593 $printable[$i] = 0; 1594 $end = $NChar->containing_range($i)->end; 1595 } 1596 elsif ($gc-> table('Control')->contains($i)) { 1597 my $name_ref = property_ref('Name_Alias'); 1598 $name_ref = property_ref('Unicode_1_Name') if ! defined $name_ref; 1599 $viacode[$i] = (defined $name_ref) 1600 ? $name_ref->value_of($i) 1601 : 'Control'; 1602 $annotate_char_type[$i] = $CONTROL_TYPE; 1603 $printable[$i] = 0; 1604 } 1605 elsif ($gc-> table('Unassigned')->contains($i)) { 1606 $annotate_char_type[$i] = $UNASSIGNED_TYPE; 1607 $printable[$i] = 0; 1608 $viacode[$i] = 'Unassigned'; 1609 1610 if (defined $block) { # No blocks in earliest releases 1611 $viacode[$i] .= ', block=' . $block-> value_of($i); 1612 $end = $gc-> table('Unassigned')->containing_range($i)->end; 1613 1614 # Because we name the unassigned by the blocks they are in, it 1615 # can't go past the end of that block, and it also can't go 1616 # past the unassigned range it is in. The special table makes 1617 # sure that the non-characters, which are unassigned, are 1618 # separated out. 1619 $end = min($block->containing_range($i)->end, 1620 $unassigned_sans_noncharacters-> 1621 containing_range($i)->end); 1622 } 1623 else { 1624 $end = $i + 1; 1625 while ($unassigned_sans_noncharacters->contains($end)) { 1626 $end++; 1627 } 1628 $end--; 1629 } 1630 } 1631 elsif ($perl->table('_Perl_Surrogate')->contains($i)) { 1632 $viacode[$i] = 'Surrogate'; 1633 $annotate_char_type[$i] = $SURROGATE_TYPE; 1634 $printable[$i] = 0; 1635 $end = $gc->table('Surrogate')->containing_range($i)->end; 1636 } 1637 else { 1638 Carp::my_carp_bug("Can't figure out how to annotate " 1639 . sprintf("U+%04X", $i) 1640 . ". Proceeding anyway."); 1641 $viacode[$i] = 'UNKNOWN'; 1642 $annotate_char_type[$i] = $UNKNOWN_TYPE; 1643 $printable[$i] = 0; 1644 } 1645 } 1646 1647 # Here, has a name, but if it's one in which the code point number is 1648 # appended to the name, do that. 1649 elsif ($annotate_char_type[$i] == $CP_IN_NAME) { 1650 $viacode[$i] .= sprintf("-%04X", $i); 1651 1652 my $limit = $perl_charname->containing_range($i)->end; 1653 if (defined $age) { 1654 # Do all these as groups of the same age, instead of individually, 1655 # because their names are so meaningless, and there are typically 1656 # large quantities of them. 1657 $end = $i + 1; 1658 while ($end <= $limit && $age->value_of($end) == $age[$i]) { 1659 $end++; 1660 } 1661 $end--; 1662 } 1663 else { 1664 $end = $limit; 1665 } 1666 } 1667 1668 # And here, has a name, but if it's a hangul syllable one, replace it with 1669 # the correct name from the Unicode algorithm 1670 elsif ($annotate_char_type[$i] == $HANGUL_SYLLABLE) { 1671 use integer; 1672 my $SIndex = $i - $SBase; 1673 my $L = $LBase + $SIndex / $NCount; 1674 my $V = $VBase + ($SIndex % $NCount) / $TCount; 1675 my $T = $TBase + $SIndex % $TCount; 1676 $viacode[$i] = "HANGUL SYLLABLE $Jamo{$L}$Jamo{$V}"; 1677 $viacode[$i] .= $Jamo{$T} if $T != $TBase; 1678 $end = $perl_charname->containing_range($i)->end; 1679 } 1680 1681 return if ! defined wantarray; 1682 return $i if ! defined $end; # If not a range, return the input 1683 1684 # Save this whole range so can find the end point quickly 1685 $annotate_ranges->add_map($i, $end, $end); 1686 1687 return $end; 1688} 1689 1690sub max($a, $b) { 1691 return $a >= $b ? $a : $b; 1692} 1693 1694sub min($a, $b) { 1695 return $a <= $b ? $a : $b; 1696} 1697 1698sub clarify_number ($number) { 1699 # This returns the input number with underscores inserted every 3 digits 1700 # in large (5 digits or more) numbers. Input must be entirely digits, not 1701 # checked. 1702 1703 my $pos = length($number) - 3; 1704 return $number if $pos <= 1; 1705 while ($pos > 0) { 1706 substr($number, $pos, 0) = '_'; 1707 $pos -= 3; 1708 } 1709 return $number; 1710} 1711 1712sub clarify_code_point_count ($number) { 1713 # This is like clarify_number(), but the input is assumed to be a count of 1714 # code points, rather than a generic number. 1715 1716 my $append = ""; 1717 1718 if ($number > $MAX_UNICODE_CODEPOINTS) { 1719 $number -= ($MAX_WORKING_CODEPOINTS - $MAX_UNICODE_CODEPOINTS); 1720 return "All above-Unicode code points" if $number == 0; 1721 $append = " + all above-Unicode code points"; 1722 } 1723 return clarify_number($number) . $append; 1724} 1725 1726package Carp; 1727 1728# These routines give a uniform treatment of messages in this program. They 1729# are placed in the Carp package to cause the stack trace to not include them, 1730# although an alternative would be to use another package and set @CARP_NOT 1731# for it. 1732 1733our $Verbose = 1 if main::DEBUG; # Useful info when debugging 1734 1735# This is a work-around suggested by Nicholas Clark to fix a problem with Carp 1736# and overload trying to load Scalar:Util under miniperl. See 1737# http://www.xray.mpe.mpg.de/mailing-lists/perl5-porters/2009-11/msg01057.html 1738undef $overload::VERSION; 1739 1740sub my_carp($message="", $nofold=0) { 1741 1742 if ($message) { 1743 $message = main::join_lines($message); 1744 $message =~ s/^$0: *//; # Remove initial program name 1745 $message =~ s/[.;,]+$//; # Remove certain ending punctuation 1746 $message = "\n$0: $message;"; 1747 1748 # Fold the message with program name, semi-colon end punctuation 1749 # (which looks good with the message that carp appends to it), and a 1750 # hanging indent for continuation lines. 1751 $message = main::simple_fold($message, "", 4) unless $nofold; 1752 $message =~ s/\n$//; # Remove the trailing nl so what carp 1753 # appends is to the same line 1754 } 1755 1756 return $message if defined wantarray; # If a caller just wants the msg 1757 1758 carp $message; 1759 return; 1760} 1761 1762sub my_carp_bug($message="") { 1763 # This is called when it is clear that the problem is caused by a bug in 1764 # this program. 1765 $message =~ s/^$0: *//; 1766 $message = my_carp("Bug in $0. Please report it by running perlbug or if that is unavailable, by sending email to perbug\@perl.org:\n$message"); 1767 carp $message; 1768 return; 1769} 1770 1771sub carp_too_few_args($args_ref, $count) { 1772 my_carp_bug("Need at least $count arguments to " 1773 . (caller 1)[3] 1774 . ". Instead got: '" 1775 . join ', ', @$args_ref 1776 . "'. No action taken."); 1777 return; 1778} 1779 1780sub carp_extra_args($args_ref) { 1781 unless (ref $args_ref) { 1782 my_carp_bug("Argument to 'carp_extra_args' ($args_ref) must be a ref. Not checking arguments."); 1783 return; 1784 } 1785 my ($package, $file, $line) = caller; 1786 my $subroutine = (caller 1)[3]; 1787 1788 my $list; 1789 if (ref $args_ref eq 'HASH') { 1790 foreach my $key (keys %$args_ref) { 1791 $args_ref->{$key} = $UNDEF unless defined $args_ref->{$key}; 1792 } 1793 $list = join ', ', each %{$args_ref}; 1794 } 1795 elsif (ref $args_ref eq 'ARRAY') { 1796 foreach my $arg (@$args_ref) { 1797 $arg = $UNDEF unless defined $arg; 1798 } 1799 $list = join ', ', @$args_ref; 1800 } 1801 else { 1802 my_carp_bug("Can't cope with ref " 1803 . ref($args_ref) 1804 . " . argument to 'carp_extra_args'. Not checking arguments."); 1805 return; 1806 } 1807 1808 my_carp_bug("Unrecognized parameters in options: '$list' to $subroutine. Skipped."); 1809 return; 1810} 1811 1812package main; 1813 1814{ # Closure 1815 1816 # This program uses the inside-out method for objects, as recommended in 1817 # "Perl Best Practices". (This is the best solution still, since this has 1818 # to run under miniperl.) This closure aids in generating those. There 1819 # are two routines. setup_package() is called once per package to set 1820 # things up, and then set_access() is called for each hash representing a 1821 # field in the object. These routines arrange for the object to be 1822 # properly destroyed when no longer used, and for standard accessor 1823 # functions to be generated. If you need more complex accessors, just 1824 # write your own and leave those accesses out of the call to set_access(). 1825 # More details below. 1826 1827 my %constructor_fields; # fields that are to be used in constructors; see 1828 # below 1829 1830 # The values of this hash will be the package names as keys to other 1831 # hashes containing the name of each field in the package as keys, and 1832 # references to their respective hashes as values. 1833 my %package_fields; 1834 1835 sub setup_package { 1836 # Sets up the package, creating standard DESTROY and dump methods 1837 # (unless already defined). The dump method is used in debugging by 1838 # simple_dumper(). 1839 # The optional parameters are: 1840 # a) a reference to a hash, that gets populated by later 1841 # set_access() calls with one of the accesses being 1842 # 'constructor'. The caller can then refer to this, but it is 1843 # not otherwise used by these two routines. 1844 # b) a reference to a callback routine to call during destruction 1845 # of the object, before any fields are actually destroyed 1846 1847 my %args = @_; 1848 my $constructor_ref = delete $args{'Constructor_Fields'}; 1849 my $destroy_callback = delete $args{'Destroy_Callback'}; 1850 Carp::carp_extra_args(\@_) if main::DEBUG && %args; 1851 1852 my %fields; 1853 my $package = (caller)[0]; 1854 1855 $package_fields{$package} = \%fields; 1856 $constructor_fields{$package} = $constructor_ref; 1857 1858 unless ($package->can('DESTROY')) { 1859 my $destroy_name = "${package}::DESTROY"; 1860 no strict "refs"; 1861 1862 # Use typeglob to give the anonymous subroutine the name we want 1863 *$destroy_name = sub { 1864 my $self = shift; 1865 my $addr = do { no overloading; pack 'J', $self; }; 1866 1867 $self->$destroy_callback if $destroy_callback; 1868 foreach my $field (keys %{$package_fields{$package}}) { 1869 #print STDERR __LINE__, ": Destroying ", ref $self, " ", sprintf("%04X", $addr), ": ", $field, "\n"; 1870 delete $package_fields{$package}{$field}{$addr}; 1871 } 1872 return; 1873 } 1874 } 1875 1876 unless ($package->can('dump')) { 1877 my $dump_name = "${package}::dump"; 1878 no strict "refs"; 1879 *$dump_name = sub { 1880 my $self = shift; 1881 return dump_inside_out($self, $package_fields{$package}, @_); 1882 } 1883 } 1884 return; 1885 } 1886 1887 sub set_access($name, $field, @accessors) { 1888 # Arrange for the input field to be garbage collected when no longer 1889 # needed. Also, creates standard accessor functions for the field 1890 # based on the optional parameters-- none if none of these parameters: 1891 # 'addable' creates an 'add_NAME()' accessor function. 1892 # 'readable' or 'readable_array' creates a 'NAME()' accessor 1893 # function. 1894 # 'settable' creates a 'set_NAME()' accessor function. 1895 # 'constructor' doesn't create an accessor function, but adds the 1896 # field to the hash that was previously passed to 1897 # setup_package(); 1898 # Any of the accesses can be abbreviated down, so that 'a', 'ad', 1899 # 'add' etc. all mean 'addable'. 1900 # The read accessor function will work on both array and scalar 1901 # values. If another accessor in the parameter list is 'a', the read 1902 # access assumes an array. You can also force it to be array access 1903 # by specifying 'readable_array' instead of 'readable' 1904 # 1905 # A sort-of 'protected' access can be set-up by preceding the addable, 1906 # readable or settable with some initial portion of 'protected_' (but, 1907 # the underscore is required), like 'p_a', 'pro_set', etc. The 1908 # "protection" is only by convention. All that happens is that the 1909 # accessor functions' names begin with an underscore. So instead of 1910 # calling set_foo, the call is _set_foo. (Real protection could be 1911 # accomplished by having a new subroutine, end_package, called at the 1912 # end of each package, and then storing the __LINE__ ranges and 1913 # checking them on every accessor. But that is way overkill.) 1914 1915 # We create anonymous subroutines as the accessors and then use 1916 # typeglobs to assign them to the proper package and name 1917 1918 # $name Name of the field 1919 # $field Reference to the inside-out hash containing the 1920 # field 1921 1922 my $package = (caller)[0]; 1923 1924 if (! exists $package_fields{$package}) { 1925 croak "$0: Must call 'setup_package' before 'set_access'"; 1926 } 1927 1928 # Stash the field so DESTROY can get it. 1929 $package_fields{$package}{$name} = $field; 1930 1931 # Remaining arguments are the accessors. For each... 1932 foreach my $access (@accessors) { 1933 my $access = lc $access; 1934 1935 my $protected = ""; 1936 1937 # Match the input as far as it goes. 1938 if ($access =~ /^(p[^_]*)_/) { 1939 $protected = $1; 1940 if (substr('protected_', 0, length $protected) 1941 eq $protected) 1942 { 1943 1944 # Add 1 for the underscore not included in $protected 1945 $access = substr($access, length($protected) + 1); 1946 $protected = '_'; 1947 } 1948 else { 1949 $protected = ""; 1950 } 1951 } 1952 1953 if (substr('addable', 0, length $access) eq $access) { 1954 my $subname = "${package}::${protected}add_$name"; 1955 no strict "refs"; 1956 1957 # add_ accessor. Don't add if already there, which we 1958 # determine using 'eq' for scalars and '==' otherwise. 1959 *$subname = sub ($self, $value) { 1960 use strict "refs"; 1961 my $addr = do { no overloading; pack 'J', $self; }; 1962 if (ref $value) { 1963 return if grep { $value == $_ } @{$field->{$addr}}; 1964 } 1965 else { 1966 return if grep { $value eq $_ } @{$field->{$addr}}; 1967 } 1968 push @{$field->{$addr}}, $value; 1969 return; 1970 } 1971 } 1972 elsif (substr('constructor', 0, length $access) eq $access) { 1973 if ($protected) { 1974 Carp::my_carp_bug("Can't set-up 'protected' constructors") 1975 } 1976 else { 1977 $constructor_fields{$package}{$name} = $field; 1978 } 1979 } 1980 elsif (substr('readable_array', 0, length $access) eq $access) { 1981 1982 # Here has read access. If one of the other parameters for 1983 # access is array, or this one specifies array (by being more 1984 # than just 'readable_'), then create a subroutine that 1985 # assumes the data is an array. Otherwise just a scalar 1986 my $subname = "${package}::${protected}$name"; 1987 if (grep { /^a/i } @_ 1988 or length($access) > length('readable_')) 1989 { 1990 no strict "refs"; 1991 *$subname = sub ($_addr) { 1992 use strict "refs"; 1993 my $addr = do { no overloading; pack 'J', $_addr; }; 1994 if (ref $field->{$addr} ne 'ARRAY') { 1995 my $type = ref $field->{$addr}; 1996 $type = 'scalar' unless $type; 1997 Carp::my_carp_bug("Trying to read $name as an array when it is a $type. Big problems."); 1998 return; 1999 } 2000 return scalar @{$field->{$addr}} unless wantarray; 2001 2002 # Make a copy; had problems with caller modifying the 2003 # original otherwise 2004 my @return = @{$field->{$addr}}; 2005 return @return; 2006 } 2007 } 2008 else { 2009 2010 # Here not an array value, a simpler function. 2011 no strict "refs"; 2012 *$subname = sub ($addr) { 2013 use strict "refs"; 2014 no overloading; 2015 return $field->{pack 'J', $addr}; 2016 } 2017 } 2018 } 2019 elsif (substr('settable', 0, length $access) eq $access) { 2020 my $subname = "${package}::${protected}set_$name"; 2021 no strict "refs"; 2022 *$subname = sub ($self, $value) { 2023 use strict "refs"; 2024 # $self is $_[0]; $value is $_[1] 2025 no overloading; 2026 $field->{pack 'J', $self} = $value; 2027 return; 2028 } 2029 } 2030 else { 2031 Carp::my_carp_bug("Unknown accessor type $access. No accessor set."); 2032 } 2033 } 2034 return; 2035 } 2036} 2037 2038package Input_file; 2039 2040# All input files use this object, which stores various attributes about them, 2041# and provides for convenient, uniform handling. The run method wraps the 2042# processing. It handles all the bookkeeping of opening, reading, and closing 2043# the file, returning only significant input lines. 2044# 2045# Each object gets a handler which processes the body of the file, and is 2046# called by run(). All character property files must use the generic, 2047# default handler, which has code scrubbed to handle things you might not 2048# expect, including automatic EBCDIC handling. For files that don't deal with 2049# mapping code points to a property value, such as test files, 2050# PropertyAliases, PropValueAliases, and named sequences, you can override the 2051# handler to be a custom one. Such a handler should basically be a 2052# while(next_line()) {...} loop. 2053# 2054# You can also set up handlers to 2055# 0) call during object construction time, after everything else is done 2056# 1) call before the first line is read, for pre processing 2057# 2) call to adjust each line of the input before the main handler gets 2058# them. This can be automatically generated, if appropriately simple 2059# enough, by specifying a Properties parameter in the constructor. 2060# 3) call upon EOF before the main handler exits its loop 2061# 4) call at the end, for post processing 2062# 2063# $_ is used to store the input line, and is to be filtered by the 2064# each_line_handler()s. So, if the format of the line is not in the desired 2065# format for the main handler, these are used to do that adjusting. They can 2066# be stacked (by enclosing them in an [ anonymous array ] in the constructor, 2067# so the $_ output of one is used as the input to the next. The EOF handler 2068# is also stackable, but none of the others are, but could easily be changed 2069# to be so. 2070# 2071# Some properties are used by the Perl core but aren't defined until later 2072# Unicode releases. The perl interpreter would have problems working when 2073# compiled with an earlier Unicode version that doesn't have them, so we need 2074# to define them somehow for those releases. The 'Early' constructor 2075# parameter can be used to automatically handle this. It is essentially 2076# ignored if the Unicode version being compiled has a data file for this 2077# property. Either code to execute or a file to read can be specified. 2078# Details are at the %early definition. 2079# 2080# Most of the handlers can call insert_lines() or insert_adjusted_lines() 2081# which insert the parameters as lines to be processed before the next input 2082# file line is read. This allows the EOF handler(s) to flush buffers, for 2083# example. The difference between the two routines is that the lines inserted 2084# by insert_lines() are subjected to the each_line_handler()s. (So if you 2085# called it from such a handler, you would get infinite recursion without some 2086# mechanism to prevent that.) Lines inserted by insert_adjusted_lines() go 2087# directly to the main handler without any adjustments. If the 2088# post-processing handler calls any of these, there will be no effect. Some 2089# error checking for these conditions could be added, but it hasn't been done. 2090# 2091# carp_bad_line() should be called to warn of bad input lines, which clears $_ 2092# to prevent further processing of the line. This routine will output the 2093# message as a warning once, and then keep a count of the lines that have the 2094# same message, and output that count at the end of the file's processing. 2095# This keeps the number of messages down to a manageable amount. 2096# 2097# get_missings() should be called to retrieve any @missing input lines. 2098# Messages will be raised if this isn't done if the options aren't to ignore 2099# missings. 2100 2101sub trace { return main::trace(@_); } 2102 2103{ # Closure 2104 # Keep track of fields that are to be put into the constructor. 2105 my %constructor_fields; 2106 2107 main::setup_package(Constructor_Fields => \%constructor_fields); 2108 2109 my %file; # Input file name, required 2110 main::set_access('file', \%file, qw{ c r }); 2111 2112 my %first_released; # Unicode version file was first released in, required 2113 main::set_access('first_released', \%first_released, qw{ c r }); 2114 2115 my %handler; # Subroutine to process the input file, defaults to 2116 # 'process_generic_property_file' 2117 main::set_access('handler', \%handler, qw{ c }); 2118 2119 my %property; 2120 # name of property this file is for. defaults to none, meaning not 2121 # applicable, or is otherwise determinable, for example, from each line. 2122 main::set_access('property', \%property, qw{ c r }); 2123 2124 my %optional; 2125 # This is either an unsigned number, or a list of property names. In the 2126 # former case, if it is non-zero, it means the file is optional, so if the 2127 # file is absent, no warning about that is output. In the latter case, it 2128 # is a list of properties that the file (exclusively) defines. If the 2129 # file is present, tables for those properties will be produced; if 2130 # absent, none will, even if they are listed elsewhere (namely 2131 # PropertyAliases.txt and PropValueAliases.txt) as being in this release, 2132 # and no warnings will be raised about them not being available. (And no 2133 # warning about the file itself will be raised.) 2134 main::set_access('optional', \%optional, qw{ c readable_array } ); 2135 2136 my %non_skip; 2137 # This is used for debugging, to skip processing of all but a few input 2138 # files. Add 'non_skip => 1' to the constructor for those files you want 2139 # processed when you set the $debug_skip global. 2140 main::set_access('non_skip', \%non_skip, 'c'); 2141 2142 my %skip; 2143 # This is used to skip processing of this input file (semi-) permanently. 2144 # The value should be the reason the file is being skipped. It is used 2145 # for files that we aren't planning to process anytime soon, but want to 2146 # allow to be in the directory and be checked for their names not 2147 # conflicting with any other files on a DOS 8.3 name filesystem, but to 2148 # not otherwise be processed, and to not raise a warning about not being 2149 # handled. In the constructor call, any value that evaluates to a numeric 2150 # 0 or undef means don't skip. Any other value is a string giving the 2151 # reason it is being skipped, and this will appear in generated pod. 2152 # However, an empty string reason will suppress the pod entry. 2153 # Internally, calls that evaluate to numeric 0 are changed into undef to 2154 # distinguish them from an empty string call. 2155 main::set_access('skip', \%skip, 'c', 'r'); 2156 2157 my %each_line_handler; 2158 # list of subroutines to look at and filter each non-comment line in the 2159 # file. defaults to none. The subroutines are called in order, each is 2160 # to adjust $_ for the next one, and the final one adjusts it for 2161 # 'handler' 2162 main::set_access('each_line_handler', \%each_line_handler, 'c'); 2163 2164 my %retain_trailing_comments; 2165 # This is used to not discard the comments that end data lines. This 2166 # would be used only for files with non-typical syntax, and most code here 2167 # assumes that comments have been stripped, so special handlers would have 2168 # to be written. It is assumed that the code will use these in 2169 # single-quoted contexts, and so any "'" marks in the comment will be 2170 # prefixed by a backslash. 2171 main::set_access('retain_trailing_comments', \%retain_trailing_comments, 'c'); 2172 2173 my %properties; # Optional ordered list of the properties that occur in each 2174 # meaningful line of the input file. If present, an appropriate 2175 # each_line_handler() is automatically generated and pushed onto the stack 2176 # of such handlers. This is useful when a file contains multiple 2177 # properties per line, but no other special considerations are necessary. 2178 # The special value "<ignored>" means to discard the corresponding input 2179 # field. 2180 # Any @missing lines in the file should also match this syntax; no such 2181 # files exist as of 6.3. But if it happens in a future release, the code 2182 # could be expanded to properly parse them. 2183 main::set_access('properties', \%properties, qw{ c r }); 2184 2185 my %has_missings_defaults; 2186 # ? Are there lines in the file giving default values for code points 2187 # missing from it?. Defaults to NO_DEFAULTS. Otherwise NOT_IGNORED is 2188 # the norm, but IGNORED means it has such lines, but the handler doesn't 2189 # use them. Having these three states allows us to catch changes to the 2190 # UCD that this program should track. XXX This could be expanded to 2191 # specify the syntax for such lines, like %properties above. 2192 main::set_access('has_missings_defaults', 2193 \%has_missings_defaults, qw{ c r }); 2194 2195 my %construction_time_handler; 2196 # Subroutine to call at the end of the new method. If undef, no such 2197 # handler is called. 2198 main::set_access('construction_time_handler', 2199 \%construction_time_handler, qw{ c }); 2200 2201 my %pre_handler; 2202 # Subroutine to call before doing anything else in the file. If undef, no 2203 # such handler is called. 2204 main::set_access('pre_handler', \%pre_handler, qw{ c }); 2205 2206 my %eof_handler; 2207 # Subroutines to call upon getting an EOF on the input file, but before 2208 # that is returned to the main handler. This is to allow buffers to be 2209 # flushed. The handler is expected to call insert_lines() or 2210 # insert_adjusted() with the buffered material 2211 main::set_access('eof_handler', \%eof_handler, qw{ c }); 2212 2213 my %post_handler; 2214 # Subroutine to call after all the lines of the file are read in and 2215 # processed. If undef, no such handler is called. Note that this cannot 2216 # add lines to be processed; instead use eof_handler 2217 main::set_access('post_handler', \%post_handler, qw{ c }); 2218 2219 my %progress_message; 2220 # Message to print to display progress in lieu of the standard one 2221 main::set_access('progress_message', \%progress_message, qw{ c }); 2222 2223 my %handle; 2224 # cache open file handle, internal. Is undef if file hasn't been 2225 # processed at all, empty if has; 2226 main::set_access('handle', \%handle); 2227 2228 my %added_lines; 2229 # cache of lines added virtually to the file, internal 2230 main::set_access('added_lines', \%added_lines); 2231 2232 my %remapped_lines; 2233 # cache of lines added virtually to the file, internal 2234 main::set_access('remapped_lines', \%remapped_lines); 2235 2236 my %errors; 2237 # cache of errors found, internal 2238 main::set_access('errors', \%errors); 2239 2240 my %missings; 2241 # storage of '@missing' defaults lines 2242 main::set_access('missings', \%missings); 2243 2244 my %early; 2245 # Used for properties that must be defined (for Perl's purposes) on 2246 # versions of Unicode earlier than Unicode itself defines them. The 2247 # parameter is an array (it would be better to be a hash, but not worth 2248 # bothering about due to its rare use). 2249 # 2250 # The first element is either a code reference to call when in a release 2251 # earlier than the Unicode file is available in, or it is an alternate 2252 # file to use instead of the non-existent one. This file must have been 2253 # plunked down in the same directory as mktables. Should you be compiling 2254 # on a release that needs such a file, mktables will abort the 2255 # compilation, and tell you where to get the necessary file(s), and what 2256 # name(s) to use to store them as. 2257 # In the case of specifying an alternate file, the array must contain two 2258 # further elements: 2259 # 2260 # [1] is the name of the property that will be generated by this file. 2261 # The class automatically takes the input file and excludes any code 2262 # points in it that were not assigned in the Unicode version being 2263 # compiled. It then uses this result to define the property in the given 2264 # version. Since the property doesn't actually exist in the Unicode 2265 # version being compiled, this should be a name accessible only by core 2266 # perl. If it is the same name as the regular property, the constructor 2267 # will mark the output table as a $PLACEHOLDER so that it doesn't actually 2268 # get output, and so will be unusable by non-core code. Otherwise it gets 2269 # marked as $INTERNAL_ONLY. 2270 # 2271 # [2] is a property value to assign (only when compiling Unicode 1.1.5) to 2272 # the Hangul syllables in that release (which were ripped out in version 2273 # 2) for the given property . (Hence it is ignored except when compiling 2274 # version 1. You only get one value that applies to all of them, which 2275 # may not be the actual reality, but probably nobody cares anyway for 2276 # these obsolete characters.) 2277 # 2278 # [3] if present is the default value for the property to assign for code 2279 # points not given in the input. If not present, the default from the 2280 # normal property is used 2281 # 2282 # [-1] If there is an extra final element that is the string 'ONLY_EARLY'. 2283 # it means to not add the name in [1] as an alias to the property name 2284 # used for these. Normally, when compiling Unicode versions that don't 2285 # invoke the early handling, the name is added as a synonym. 2286 # 2287 # Not all files can be handled in the above way, and so the code ref 2288 # alternative is available. It can do whatever it needs to. The other 2289 # array elements are optional in this case, and the code is free to use or 2290 # ignore them if they are present. 2291 # 2292 # Internally, the constructor unshifts a 0 or 1 onto this array to 2293 # indicate if an early alternative is actually being used or not. This 2294 # makes for easier testing later on. 2295 main::set_access('early', \%early, 'c'); 2296 2297 my %only_early; 2298 main::set_access('only_early', \%only_early, 'c'); 2299 2300 my %required_even_in_debug_skip; 2301 # debug_skip is used to speed up compilation during debugging by skipping 2302 # processing files that are not needed for the task at hand. However, 2303 # some files pretty much can never be skipped, and this is used to specify 2304 # that this is one of them. In order to skip this file, the call to the 2305 # constructor must be edited to comment out this parameter. 2306 main::set_access('required_even_in_debug_skip', 2307 \%required_even_in_debug_skip, 'c'); 2308 2309 my %withdrawn; 2310 # Some files get removed from the Unicode DB. This is a version object 2311 # giving the first release without this file. 2312 main::set_access('withdrawn', \%withdrawn, 'c'); 2313 2314 my %ucd; 2315 # Some files are not actually part of the Unicode Character Database. 2316 # These typically have a different way of indicating their version 2317 main::set_access('ucd', \%ucd, 'c'); 2318 2319 my %in_this_release; 2320 # Calculated value from %first_released and %withdrawn. Are we compiling 2321 # a Unicode release which includes this file? 2322 main::set_access('in_this_release', \%in_this_release); 2323 2324 sub _next_line; 2325 sub _next_line_with_remapped_range; 2326 2327 sub new { 2328 my $class = shift; 2329 2330 my $self = bless \do{ my $anonymous_scalar }, $class; 2331 my $addr = do { no overloading; pack 'J', $self; }; 2332 2333 # Set defaults 2334 $handler{$addr} = \&main::process_generic_property_file; 2335 $retain_trailing_comments{$addr} = 0; 2336 $non_skip{$addr} = 0; 2337 $skip{$addr} = undef; 2338 $has_missings_defaults{$addr} = $NO_DEFAULTS; 2339 $handle{$addr} = undef; 2340 $added_lines{$addr} = [ ]; 2341 $remapped_lines{$addr} = [ ]; 2342 $each_line_handler{$addr} = [ ]; 2343 $eof_handler{$addr} = [ ]; 2344 $errors{$addr} = { }; 2345 $missings{$addr} = [ ]; 2346 $early{$addr} = [ ]; 2347 $optional{$addr} = [ ]; 2348 $ucd{$addr} = 1; 2349 2350 # Two positional parameters. 2351 return Carp::carp_too_few_args(\@_, 2) if main::DEBUG && @_ < 2; 2352 $file{$addr} = main::internal_file_to_platform(shift); 2353 $first_released{$addr} = shift; 2354 2355 # The rest of the arguments are key => value pairs 2356 # %constructor_fields has been set up earlier to list all possible 2357 # ones. Either set or push, depending on how the default has been set 2358 # up just above. 2359 my %args = @_; 2360 foreach my $key (keys %args) { 2361 my $argument = $args{$key}; 2362 2363 # Note that the fields are the lower case of the constructor keys 2364 my $hash = $constructor_fields{lc $key}; 2365 if (! defined $hash) { 2366 Carp::my_carp_bug("Unrecognized parameters '$key => $argument' to new() for $self. Skipped"); 2367 next; 2368 } 2369 if (ref $hash->{$addr} eq 'ARRAY') { 2370 if (ref $argument eq 'ARRAY') { 2371 foreach my $argument (@{$argument}) { 2372 next if ! defined $argument; 2373 push @{$hash->{$addr}}, $argument; 2374 } 2375 } 2376 else { 2377 push @{$hash->{$addr}}, $argument if defined $argument; 2378 } 2379 } 2380 else { 2381 $hash->{$addr} = $argument; 2382 } 2383 delete $args{$key}; 2384 }; 2385 2386 $non_skip{$addr} = 1 if $required_even_in_debug_skip{$addr}; 2387 2388 # Convert 0 (meaning don't skip) to undef 2389 undef $skip{$addr} unless $skip{$addr}; 2390 2391 # Handle the case where this file is optional 2392 my $pod_message_for_non_existent_optional = ""; 2393 if ($optional{$addr}->@*) { 2394 2395 # First element is the pod message 2396 $pod_message_for_non_existent_optional 2397 = shift $optional{$addr}->@*; 2398 # Convert a 0 'Optional' argument to an empty list to make later 2399 # code more concise. 2400 if ( $optional{$addr}->@* 2401 && $optional{$addr}->@* == 1 2402 && $optional{$addr}[0] ne "" 2403 && $optional{$addr}[0] !~ /\D/ 2404 && $optional{$addr}[0] == 0) 2405 { 2406 $optional{$addr} = [ ]; 2407 } 2408 else { # But if the only element doesn't evaluate to 0, make sure 2409 # that this file is indeed considered optional below. 2410 unshift $optional{$addr}->@*, 1; 2411 } 2412 } 2413 2414 my $progress; 2415 my $function_instead_of_file = 0; 2416 2417 if ($early{$addr}->@* && $early{$addr}[-1] eq 'ONLY_EARLY') { 2418 $only_early{$addr} = 1; 2419 pop $early{$addr}->@*; 2420 } 2421 2422 # If we are compiling a Unicode release earlier than the file became 2423 # available, the constructor may have supplied a substitute 2424 if ($first_released{$addr} gt $v_version && $early{$addr}->@*) { 2425 2426 # Yes, we have a substitute, that we will use; mark it so 2427 unshift $early{$addr}->@*, 1; 2428 2429 # See the definition of %early for what the array elements mean. 2430 # Note that we have just unshifted onto the array, so the numbers 2431 # below are +1 of those in the %early description. 2432 # If we have a property this defines, create a table and default 2433 # map for it now (at essentially compile time), so that it will be 2434 # available for the whole of run time. (We will want to add this 2435 # name as an alias when we are using the official property name; 2436 # but this must be deferred until run(), because at construction 2437 # time the official names have yet to be defined.) 2438 if ($early{$addr}[2]) { 2439 my $fate = ($property{$addr} 2440 && $property{$addr} eq $early{$addr}[2]) 2441 ? $PLACEHOLDER 2442 : $INTERNAL_ONLY; 2443 my $prop_object = Property->new($early{$addr}[2], 2444 Fate => $fate, 2445 Perl_Extension => 1, 2446 ); 2447 2448 # If not specified by the constructor, use the default mapping 2449 # for the regular property for this substitute one. 2450 if ($early{$addr}[4]) { 2451 $prop_object->set_default_map($early{$addr}[4]); 2452 } 2453 elsif ( defined $property{$addr} 2454 && defined $default_mapping{$property{$addr}}) 2455 { 2456 $prop_object 2457 ->set_default_map($default_mapping{$property{$addr}}); 2458 } 2459 } 2460 2461 if (ref $early{$addr}[1] eq 'CODE') { 2462 $function_instead_of_file = 1; 2463 2464 # If the first element of the array is a code ref, the others 2465 # are optional. 2466 $handler{$addr} = $early{$addr}[1]; 2467 $property{$addr} = $early{$addr}[2] 2468 if defined $early{$addr}[2]; 2469 $progress = "substitute $file{$addr}"; 2470 2471 undef $file{$addr}; 2472 } 2473 else { # Specifying a substitute file 2474 2475 if (! main::file_exists($early{$addr}[1])) { 2476 2477 # If we don't see the substitute file, generate an error 2478 # message giving the needed things, and add it to the list 2479 # of such to output before actual processing happens 2480 # (hence the user finds out all of them in one run). 2481 # Instead of creating a general method for NameAliases, 2482 # hard-code it here, as there is unlikely to ever be a 2483 # second one which needs special handling. 2484 my $string_version = ($file{$addr} eq "NameAliases.txt") 2485 ? 'at least 6.1 (the later, the better)' 2486 : sprintf "%vd", $first_released{$addr}; 2487 push @missing_early_files, <<END; 2488'$file{$addr}' version $string_version should be copied to '$early{$addr}[1]'. 2489END 2490 ; 2491 return; 2492 } 2493 $progress = $early{$addr}[1]; 2494 $progress .= ", substituting for $file{$addr}" if $file{$addr}; 2495 $file{$addr} = $early{$addr}[1]; 2496 $property{$addr} = $early{$addr}[2]; 2497 2498 # Ignore code points not in the version being compiled 2499 push $each_line_handler{$addr}->@*, \&_exclude_unassigned; 2500 2501 if ( $v_version lt v2.0 # Hanguls in this release ... 2502 && defined $early{$addr}[3]) # ... need special treatment 2503 { 2504 push $eof_handler{$addr}->@*, \&_fixup_obsolete_hanguls; 2505 } 2506 } 2507 2508 # And this substitute is valid for all releases. 2509 $first_released{$addr} = v0; 2510 } 2511 else { # Normal behavior 2512 $progress = $file{$addr}; 2513 unshift $early{$addr}->@*, 0; # No substitute 2514 } 2515 2516 my $file = $file{$addr}; 2517 $progress_message{$addr} = "Processing $progress" 2518 unless $progress_message{$addr}; 2519 2520 # A file should be there if it is within the window of versions for 2521 # which Unicode supplies it 2522 if ($withdrawn{$addr} && $withdrawn{$addr} le $v_version) { 2523 $in_this_release{$addr} = 0; 2524 $skip{$addr} = ""; 2525 } 2526 else { 2527 $in_this_release{$addr} = $first_released{$addr} le $v_version; 2528 2529 # Check that the file for this object (possibly using a substitute 2530 # for early releases) exists or we have a function alternative 2531 if ( ! $function_instead_of_file 2532 && ! main::file_exists($file)) 2533 { 2534 # Here there is nothing available for this release. This is 2535 # fine if we aren't expecting anything in this release. 2536 if (! $in_this_release{$addr}) { 2537 $skip{$addr} = ""; # Don't remark since we expected 2538 # nothing and got nothing 2539 } 2540 elsif ($optional{$addr}->@*) { 2541 2542 # Here the file is optional in this release; Use the 2543 # passed in text to document this case in the pod. 2544 $skip{$addr} = $pod_message_for_non_existent_optional; 2545 } 2546 elsif ( $in_this_release{$addr} 2547 && ! defined $skip{$addr} 2548 && defined $file) 2549 { # Doesn't exist but should. 2550 $skip{$addr} = "'$file' not found. Possibly Big problems"; 2551 Carp::my_carp($skip{$addr}); 2552 } 2553 } 2554 elsif ($debug_skip && ! defined $skip{$addr} && ! $non_skip{$addr}) 2555 { 2556 2557 # The file exists; if not skipped for another reason, and we are 2558 # skipping most everything during debugging builds, use that as 2559 # the skip reason. 2560 $skip{$addr} = '$debug_skip is on' 2561 } 2562 } 2563 2564 if ( ! $debug_skip 2565 && $non_skip{$addr} 2566 && ! $required_even_in_debug_skip{$addr} 2567 && $verbosity) 2568 { 2569 print "Warning: " . __PACKAGE__ . " constructor for $file has useless 'non_skip' in it\n"; 2570 } 2571 2572 # Here, we have figured out if we will be skipping this file or not. 2573 # If so, we add any single property it defines to any passed in 2574 # optional property list. These will be dealt with at run time. 2575 if (defined $skip{$addr}) { 2576 if ($property{$addr}) { 2577 push $optional{$addr}->@*, $property{$addr}; 2578 } 2579 } # Otherwise, are going to process the file. 2580 elsif ($property{$addr}) { 2581 2582 # If the file has a property defined in the constructor for it, it 2583 # means that the property is not listed in the file's entries. So 2584 # add a handler (to the list of line handlers) to insert the 2585 # property name into the lines, to provide a uniform interface to 2586 # the final processing subroutine. 2587 push @{$each_line_handler{$addr}}, \&_insert_property_into_line; 2588 } 2589 elsif ($properties{$addr}) { 2590 2591 # Similarly, there may be more than one property represented on 2592 # each line, with no clue but the constructor input what those 2593 # might be. Add a handler for each line in the input so that it 2594 # creates a separate input line for each property in those input 2595 # lines, thus making them suitable to handle generically. 2596 2597 push @{$each_line_handler{$addr}}, 2598 sub { 2599 my $file = shift; 2600 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 2601 my @fields = split /\s*;\s*/, $_, -1; 2602 2603 if (@fields - 1 > @{$properties{$addr}}) { 2604 $file->carp_bad_line('Extra fields'); 2605 $_ = ""; 2606 return; 2607 } 2608 my $range = shift @fields; # 0th element is always the 2609 # range 2610 2611 # The next fields in the input line correspond 2612 # respectively to the stored properties. 2613 for my $i (0 .. @{$properties{$addr}} - 1) { 2614 my $property_name = $properties{$addr}[$i]; 2615 next if $property_name eq '<ignored>'; 2616 $file->insert_adjusted_lines( 2617 "$range; $property_name; $fields[$i]"); 2618 } 2619 $_ = ""; 2620 2621 return; 2622 }; 2623 } 2624 2625 { # On non-ascii platforms, we use a special pre-handler 2626 no strict; 2627 no warnings 'once'; 2628 *next_line = (main::NON_ASCII_PLATFORM) 2629 ? *_next_line_with_remapped_range 2630 : *_next_line; 2631 } 2632 2633 &{$construction_time_handler{$addr}}($self) 2634 if $construction_time_handler{$addr}; 2635 2636 return $self; 2637 } 2638 2639 2640 use overload 2641 fallback => 0, 2642 qw("") => "_operator_stringify", 2643 "." => \&main::_operator_dot, 2644 ".=" => \&main::_operator_dot_equal, 2645 ; 2646 2647 sub _operator_stringify($self) { 2648 return __PACKAGE__ . " object for " . $self->file; 2649 } 2650 2651 sub run($self) { 2652 # Process the input object $self. This opens and closes the file and 2653 # calls all the handlers for it. Currently, this can only be called 2654 # once per file, as it destroy's the EOF handlers 2655 2656 # flag to make sure extracted files are processed early 2657 state $seen_non_extracted = 0; 2658 2659 my $addr = do { no overloading; pack 'J', $self; }; 2660 2661 my $file = $file{$addr}; 2662 2663 if (! $file) { 2664 $handle{$addr} = 'pretend_is_open'; 2665 } 2666 else { 2667 if ($seen_non_extracted) { 2668 if ($file =~ /$EXTRACTED/i) # Some platforms may change the 2669 # case of the file's name 2670 { 2671 Carp::my_carp_bug(main::join_lines(<<END 2672$file should be processed just after the 'Prop...Alias' files, and before 2673anything not in the $EXTRACTED_DIR directory. Proceeding, but the results may 2674have subtle problems 2675END 2676 )); 2677 } 2678 } 2679 elsif ($EXTRACTED_DIR 2680 2681 # We only do this check for generic property files 2682 && $handler{$addr} == \&main::process_generic_property_file 2683 2684 && $file !~ /$EXTRACTED/i) 2685 { 2686 # We don't set this (by the 'if' above) if we have no 2687 # extracted directory, so if running on an early version, 2688 # this test won't work. Not worth worrying about. 2689 $seen_non_extracted = 1; 2690 } 2691 2692 # Mark the file as having being processed, and warn if it 2693 # isn't a file we are expecting. As we process the files, 2694 # they are deleted from the hash, so any that remain at the 2695 # end of the program are files that we didn't process. 2696 my $fkey = File::Spec->rel2abs($file); 2697 my $exists = delete $potential_files{lc($fkey)}; 2698 2699 Carp::my_carp("Was not expecting '$file'.") 2700 if $exists && ! $in_this_release{$addr}; 2701 2702 # If there is special handling for compiling Unicode releases 2703 # earlier than the first one in which Unicode defines this 2704 # property ... 2705 if ($early{$addr}->@* > 1) { 2706 2707 # Mark as processed any substitute file that would be used in 2708 # such a release 2709 $fkey = File::Spec->rel2abs($early{$addr}[1]); 2710 delete $potential_files{lc($fkey)}; 2711 2712 # As commented in the constructor code, when using the 2713 # official property, we still have to allow the publicly 2714 # inaccessible early name so that the core code which uses it 2715 # will work regardless. 2716 if ( ! $only_early{$addr} 2717 && ! $early{$addr}[0] 2718 && $early{$addr}->@* > 2) 2719 { 2720 my $early_property_name = $early{$addr}[2]; 2721 if ($property{$addr} ne $early_property_name) { 2722 main::property_ref($property{$addr}) 2723 ->add_alias($early_property_name); 2724 } 2725 } 2726 } 2727 2728 # We may be skipping this file ... 2729 if (defined $skip{$addr}) { 2730 2731 # If the file isn't supposed to be in this release, there is 2732 # nothing to do 2733 if ($in_this_release{$addr}) { 2734 2735 # But otherwise, we may print a message 2736 if ($debug_skip) { 2737 print STDERR "Skipping input file '$file'", 2738 " because '$skip{$addr}'\n"; 2739 } 2740 2741 # And add it to the list of skipped files, which is later 2742 # used to make the pod 2743 $skipped_files{$file} = $skip{$addr}; 2744 2745 # The 'optional' list contains properties that are also to 2746 # be skipped along with the file. (There may also be 2747 # digits which are just placeholders to make sure it isn't 2748 # an empty list 2749 foreach my $property ($optional{$addr}->@*) { 2750 next unless $property =~ /\D/; 2751 my $prop_object = main::property_ref($property); 2752 next unless defined $prop_object; 2753 $prop_object->set_fate($SUPPRESSED, $skip{$addr}); 2754 } 2755 } 2756 2757 return; 2758 } 2759 2760 # Here, we are going to process the file. Open it, converting the 2761 # slashes used in this program into the proper form for the OS 2762 my $file_handle; 2763 if (not open $file_handle, "<", $file) { 2764 Carp::my_carp("Can't open $file. Skipping: $!"); 2765 return; 2766 } 2767 $handle{$addr} = $file_handle; # Cache the open file handle 2768 2769 # If possible, make sure that the file is the correct version. 2770 # (This data isn't available on early Unicode releases or in 2771 # UnicodeData.txt.) We don't do this check if we are using a 2772 # substitute file instead of the official one (though the code 2773 # could be extended to do so). 2774 if ($in_this_release{$addr} 2775 && ! $early{$addr}[0] 2776 && lc($file) ne 'unicodedata.txt') 2777 { 2778 my $this_version; 2779 2780 if ($file !~ /^Unihan/i) { 2781 2782 # The non-Unihan files started getting version numbers in 2783 # 3.2, but some files in 4.0 are unchanged from 3.2, and 2784 # marked as 3.2. 4.0.1 is the first version where there 2785 # are no files marked as being from less than 4.0, though 2786 # some are marked as 4.0. In versions after that, the 2787 # numbers are correct. 2788 if ($v_version ge v4.0.1) { 2789 $_ = <$file_handle>; # The version number is in the 2790 # very first line if it is a 2791 # UCD file; otherwise, it 2792 # might be 2793 goto valid_version if $_ =~ / - $string_version \. /x; 2794 chomp; 2795 if ($ucd{$addr}) { 2796 $_ =~ s/^#\s*//; 2797 2798 # 4.0.1 had some valid files that weren't updated. 2799 goto valid_version 2800 if $v_version eq v4.0.1 && $_ =~ /4\.0\.0/; 2801 $this_version = $_; 2802 goto wrong_version; 2803 } 2804 else { 2805 my $BOM = "\x{FEFF}"; 2806 utf8::encode($BOM); 2807 my $BOM_re = qr/ ^ (?:$BOM)? /x; 2808 2809 while ($_ =~ s/$BOM_re//) { # BOM; seems to be on 2810 # many lines in some files!! 2811 $_ = <$file_handle>; 2812 chomp; 2813 if ($_ =~ /^# Version: (.*)/) { 2814 $this_version = $1; 2815 goto valid_version 2816 if $this_version eq $string_version; 2817 goto valid_version 2818 if "$this_version.0" eq $string_version; 2819 goto wrong_version; 2820 } 2821 } 2822 goto no_version; 2823 } 2824 } 2825 } 2826 elsif ($v_version ge v6.0.0) { # Unihan 2827 2828 # Unihan files didn't get accurate version numbers until 2829 # 6.0. The version is somewhere in the first comment 2830 # block 2831 while (<$file_handle>) { 2832 goto no_version if $_ !~ /^#/; 2833 chomp; 2834 $_ =~ s/^#\s*//; 2835 next if $_ !~ / version: /x; 2836 goto valid_version if $_ =~ /$string_version/; 2837 goto wrong_version; 2838 } 2839 goto no_version; 2840 } 2841 else { # Old Unihan; have to assume is valid 2842 goto valid_version; 2843 } 2844 2845 wrong_version: 2846 die Carp::my_carp("File '$file' is version " 2847 . "'$this_version'. It should be " 2848 . "version $string_version"); 2849 no_version: 2850 Carp::my_carp_bug("Could not find the expected " 2851 . "version info in file '$file'"); 2852 } 2853 } 2854 2855 valid_version: 2856 print "$progress_message{$addr}\n" if $verbosity >= $PROGRESS; 2857 2858 # Call any special handler for before the file. 2859 &{$pre_handler{$addr}}($self) if $pre_handler{$addr}; 2860 2861 # Then the main handler 2862 &{$handler{$addr}}($self); 2863 2864 # Then any special post-file handler. 2865 &{$post_handler{$addr}}($self) if $post_handler{$addr}; 2866 2867 # If any errors have been accumulated, output the counts (as the first 2868 # error message in each class was output when it was encountered). 2869 if ($errors{$addr}) { 2870 my $total = 0; 2871 my $types = 0; 2872 foreach my $error (keys %{$errors{$addr}}) { 2873 $total += $errors{$addr}->{$error}; 2874 delete $errors{$addr}->{$error}; 2875 $types++; 2876 } 2877 if ($total > 1) { 2878 my $message 2879 = "A total of $total lines had errors in $file. "; 2880 2881 $message .= ($types == 1) 2882 ? '(Only the first one was displayed.)' 2883 : '(Only the first of each type was displayed.)'; 2884 Carp::my_carp($message); 2885 } 2886 } 2887 2888 if (@{$missings{$addr}}) { 2889 Carp::my_carp_bug("Handler for $file didn't look at all the \@missing lines. Generated tables likely are wrong"); 2890 } 2891 2892 # If a real file handle, close it. 2893 close $handle{$addr} or Carp::my_carp("Can't close $file: $!") if 2894 ref $handle{$addr}; 2895 $handle{$addr} = ""; # Uses empty to indicate that has already seen 2896 # the file, as opposed to undef 2897 return; 2898 } 2899 2900 sub _next_line($self) { 2901 # Sets $_ to be the next logical input line, if any. Returns non-zero 2902 # if such a line exists. 'logical' means that any lines that have 2903 # been added via insert_lines() will be returned in $_ before the file 2904 # is read again. 2905 2906 my $addr = do { no overloading; pack 'J', $self; }; 2907 2908 # Here the file is open (or if the handle is not a ref, is an open 2909 # 'virtual' file). Get the next line; any inserted lines get priority 2910 # over the file itself. 2911 my $adjusted; 2912 2913 LINE: 2914 while (1) { # Loop until find non-comment, non-empty line 2915 #local $to_trace = 1 if main::DEBUG; 2916 my $inserted_ref = shift @{$added_lines{$addr}}; 2917 if (defined $inserted_ref) { 2918 ($adjusted, $_) = @{$inserted_ref}; 2919 trace $adjusted, $_ if main::DEBUG && $to_trace; 2920 return 1 if $adjusted; 2921 } 2922 else { 2923 last if ! ref $handle{$addr}; # Don't read unless is real file 2924 last if ! defined ($_ = readline $handle{$addr}); 2925 } 2926 chomp; 2927 trace $_ if main::DEBUG && $to_trace; 2928 2929 # See if this line is the comment line that defines what property 2930 # value that code points that are not listed in the file should 2931 # have. The format or existence of these lines is not guaranteed 2932 # by Unicode since they are comments, but the documentation says 2933 # that this was added for machine-readability, so probably won't 2934 # change. This works starting in Unicode Version 5.0. They look 2935 # like: 2936 # 2937 # @missing: 0000..10FFFF; Not_Reordered 2938 # @missing: 0000..10FFFF; Decomposition_Mapping; <code point> 2939 # @missing: 0000..10FFFF; ; NaN 2940 # 2941 # Save the line for a later get_missings() call. 2942 if (/$missing_defaults_prefix/) { 2943 if ($has_missings_defaults{$addr} == $NO_DEFAULTS) { 2944 $self->carp_bad_line("Unexpected \@missing line. Assuming no missing entries"); 2945 } 2946 elsif ($has_missings_defaults{$addr} == $NOT_IGNORED) { 2947 my @defaults = split /\s* ; \s*/x, $_; 2948 2949 # The first field is the @missing, which ends in a 2950 # semi-colon, so can safely shift. 2951 shift @defaults; 2952 2953 # Some of these lines may have empty field placeholders 2954 # which get in the way. An example is: 2955 # @missing: 0000..10FFFF; ; NaN 2956 # Remove them. Process starting from the top so the 2957 # splice doesn't affect things still to be looked at. 2958 for (my $i = @defaults - 1; $i >= 0; $i--) { 2959 next if $defaults[$i] ne ""; 2960 splice @defaults, $i, 1; 2961 } 2962 2963 # What's left should be just the property (maybe) and the 2964 # default. Having only one element means it doesn't have 2965 # the property. 2966 my $default; 2967 my $property; 2968 if (@defaults >= 1) { 2969 if (@defaults == 1) { 2970 $default = $defaults[0]; 2971 } 2972 else { 2973 $property = $defaults[0]; 2974 $default = $defaults[1]; 2975 } 2976 } 2977 2978 if (@defaults < 1 2979 || @defaults > 2 2980 || ($default =~ /^</ 2981 && $default !~ /^<code *point>$/i 2982 && $default !~ /^<none>$/i 2983 && $default !~ /^<script>$/i)) 2984 { 2985 $self->carp_bad_line("Unrecognized \@missing line: $_. Assuming no missing entries"); 2986 } 2987 else { 2988 2989 # If the property is missing from the line, it should 2990 # be the one for the whole file 2991 $property = $property{$addr} if ! defined $property; 2992 2993 # Change <none> to the null string, which is what it 2994 # really means. If the default is the code point 2995 # itself, set it to <code point>, which is what 2996 # Unicode uses (but sometimes they've forgotten the 2997 # space) 2998 if ($default =~ /^<none>$/i) { 2999 $default = ""; 3000 } 3001 elsif ($default =~ /^<code *point>$/i) { 3002 $default = $CODE_POINT; 3003 } 3004 elsif ($default =~ /^<script>$/i) { 3005 3006 # Special case this one. Currently is from 3007 # ScriptExtensions.txt, and means for all unlisted 3008 # code points, use their Script property values. 3009 # For the code points not listed in that file, the 3010 # default value is 'Unknown'. 3011 $default = "Unknown"; 3012 } 3013 3014 # Store them as a sub-arrays with both components. 3015 push @{$missings{$addr}}, [ $default, $property ]; 3016 } 3017 } 3018 3019 # There is nothing for the caller to process on this comment 3020 # line. 3021 next; 3022 } 3023 3024 # Unless to keep, remove comments. If to keep, ignore 3025 # comment-only lines 3026 if ($retain_trailing_comments{$addr}) { 3027 next if / ^ \s* \# /x; 3028 3029 # But escape any single quotes (done in both the comment and 3030 # non-comment portion; this could be a bug someday, but not 3031 # likely) 3032 s/'/\\'/g; 3033 } 3034 else { 3035 s/#.*//; 3036 } 3037 3038 # Remove trailing space, and skip this line if the result is empty 3039 s/\s+$//; 3040 next if /^$/; 3041 3042 # Call any handlers for this line, and skip further processing of 3043 # the line if the handler sets the line to null. 3044 foreach my $sub_ref (@{$each_line_handler{$addr}}) { 3045 &{$sub_ref}($self); 3046 next LINE if /^$/; 3047 } 3048 3049 # Here the line is ok. return success. 3050 return 1; 3051 } # End of looping through lines. 3052 3053 # If there are EOF handlers, call each (only once) and if it generates 3054 # more lines to process go back in the loop to handle them. 3055 while ($eof_handler{$addr}->@*) { 3056 &{$eof_handler{$addr}[0]}($self); 3057 shift $eof_handler{$addr}->@*; # Currently only get one shot at it. 3058 goto LINE if $added_lines{$addr}; 3059 } 3060 3061 # Return failure -- no more lines. 3062 return 0; 3063 3064 } 3065 3066 sub _next_line_with_remapped_range($self) { 3067 # like _next_line(), but for use on non-ASCII platforms. It sets $_ 3068 # to be the next logical input line, if any. Returns non-zero if such 3069 # a line exists. 'logical' means that any lines that have been added 3070 # via insert_lines() will be returned in $_ before the file is read 3071 # again. 3072 # 3073 # The difference from _next_line() is that this remaps the Unicode 3074 # code points in the input to those of the native platform. Each 3075 # input line contains a single code point, or a single contiguous 3076 # range of them This routine splits each range into its individual 3077 # code points and caches them. It returns the cached values, 3078 # translated into their native equivalents, one at a time, for each 3079 # call, before reading the next line. Since native values can only be 3080 # a single byte wide, no translation is needed for code points above 3081 # 0xFF, and ranges that are entirely above that number are not split. 3082 # If an input line contains the range 254-1000, it would be split into 3083 # three elements: 254, 255, and 256-1000. (The downstream table 3084 # insertion code will sort and coalesce the individual code points 3085 # into appropriate ranges.) 3086 3087 my $addr = do { no overloading; pack 'J', $self; }; 3088 3089 while (1) { 3090 3091 # Look in cache before reading the next line. Return any cached 3092 # value, translated 3093 my $inserted = shift @{$remapped_lines{$addr}}; 3094 if (defined $inserted) { 3095 trace $inserted if main::DEBUG && $to_trace; 3096 $_ = $inserted =~ s/^ ( \d+ ) /sprintf("%04X", utf8::unicode_to_native($1))/xer; 3097 trace $_ if main::DEBUG && $to_trace; 3098 return 1; 3099 } 3100 3101 # Get the next line. 3102 return 0 unless _next_line($self); 3103 3104 # If there is a special handler for it, return the line, 3105 # untranslated. This should happen only for files that are 3106 # special, not being code-point related, such as property names. 3107 return 1 if $handler{$addr} 3108 != \&main::process_generic_property_file; 3109 3110 my ($range, $property_name, $map, @remainder) 3111 = split /\s*;\s*/, $_, -1; # -1 => retain trailing null fields 3112 3113 if (@remainder 3114 || ! defined $property_name 3115 || $range !~ /^ ($code_point_re) (?:\.\. ($code_point_re) )? $/x) 3116 { 3117 Carp::my_carp_bug("Unrecognized input line '$_'. Ignored"); 3118 } 3119 3120 my $low = hex $1; 3121 my $high = (defined $2) ? hex $2 : $low; 3122 3123 # If the input maps the range to another code point, remap the 3124 # target if it is between 0 and 255. 3125 my $tail; 3126 if (defined $map) { 3127 $map =~ s/\b 00 ( [0-9A-F]{2} ) \b/sprintf("%04X", utf8::unicode_to_native(hex $1))/gxe; 3128 $tail = "$property_name; $map"; 3129 $_ = "$range; $tail"; 3130 } 3131 else { 3132 $tail = $property_name; 3133 } 3134 3135 # If entire range is above 255, just return it, unchanged (except 3136 # any mapped-to code point, already changed above) 3137 return 1 if $low > 255; 3138 3139 # Cache an entry for every code point < 255. For those in the 3140 # range above 255, return a dummy entry for just that portion of 3141 # the range. Note that this will be out-of-order, but that is not 3142 # a problem. 3143 foreach my $code_point ($low .. $high) { 3144 if ($code_point > 255) { 3145 $_ = sprintf "%04X..%04X; $tail", $code_point, $high; 3146 return 1; 3147 } 3148 push @{$remapped_lines{$addr}}, "$code_point; $tail"; 3149 } 3150 } # End of looping through lines. 3151 3152 # NOTREACHED 3153 } 3154 3155# Not currently used, not fully tested. 3156# sub peek { 3157# # Non-destructive lookahead one non-adjusted, non-comment, non-blank 3158# # record. Not callable from an each_line_handler(), nor does it call 3159# # an each_line_handler() on the line. 3160# 3161# my $self = shift; 3162# my $addr = do { no overloading; pack 'J', $self; }; 3163# 3164# foreach my $inserted_ref (@{$added_lines{$addr}}) { 3165# my ($adjusted, $line) = @{$inserted_ref}; 3166# next if $adjusted; 3167# 3168# # Remove comments and trailing space, and return a non-empty 3169# # resulting line 3170# $line =~ s/#.*//; 3171# $line =~ s/\s+$//; 3172# return $line if $line ne ""; 3173# } 3174# 3175# return if ! ref $handle{$addr}; # Don't read unless is real file 3176# while (1) { # Loop until find non-comment, non-empty line 3177# local $to_trace = 1 if main::DEBUG; 3178# trace $_ if main::DEBUG && $to_trace; 3179# return if ! defined (my $line = readline $handle{$addr}); 3180# chomp $line; 3181# push @{$added_lines{$addr}}, [ 0, $line ]; 3182# 3183# $line =~ s/#.*//; 3184# $line =~ s/\s+$//; 3185# return $line if $line ne ""; 3186# } 3187# 3188# return; 3189# } 3190 3191 3192 sub insert_lines($self, @lines) { 3193 # Lines can be inserted so that it looks like they were in the input 3194 # file at the place it was when this routine is called. See also 3195 # insert_adjusted_lines(). Lines inserted via this routine go through 3196 # any each_line_handler() 3197 3198 # Each inserted line is an array, with the first element being 0 to 3199 # indicate that this line hasn't been adjusted, and needs to be 3200 # processed. 3201 no overloading; 3202 push @{$added_lines{pack 'J', $self}}, map { [ 0, $_ ] } @lines; 3203 return; 3204 } 3205 3206 sub insert_adjusted_lines($self, @lines) { 3207 # Lines can be inserted so that it looks like they were in the input 3208 # file at the place it was when this routine is called. See also 3209 # insert_lines(). Lines inserted via this routine are already fully 3210 # adjusted, ready to be processed; each_line_handler()s handlers will 3211 # not be called. This means this is not a completely general 3212 # facility, as only the last each_line_handler on the stack should 3213 # call this. It could be made more general, by passing to each of the 3214 # line_handlers their position on the stack, which they would pass on 3215 # to this routine, and that would replace the boolean first element in 3216 # the anonymous array pushed here, so that the next_line routine could 3217 # use that to call only those handlers whose index is after it on the 3218 # stack. But this is overkill for what is needed now. 3219 3220 trace $_[0] if main::DEBUG && $to_trace; 3221 3222 # Each inserted line is an array, with the first element being 1 to 3223 # indicate that this line has been adjusted 3224 no overloading; 3225 push @{$added_lines{pack 'J', $self}}, map { [ 1, $_ ] } @lines; 3226 return; 3227 } 3228 3229 sub get_missings($self) { 3230 # Returns the stored up @missings lines' values, and clears the list. 3231 # The values are in an array, consisting of the default in the first 3232 # element, and the property in the 2nd. However, since these lines 3233 # can be stacked up, the return is an array of all these arrays. 3234 3235 my $addr = do { no overloading; pack 'J', $self; }; 3236 3237 # If not accepting a list return, just return the first one. 3238 return shift @{$missings{$addr}} unless wantarray; 3239 3240 my @return = @{$missings{$addr}}; 3241 undef @{$missings{$addr}}; 3242 return @return; 3243 } 3244 3245 sub _exclude_unassigned($self) { 3246 3247 # Takes the range in $_ and excludes code points that aren't assigned 3248 # in this release 3249 3250 state $skip_inserted_count = 0; 3251 3252 # Ignore recursive calls. 3253 if ($skip_inserted_count) { 3254 $skip_inserted_count--; 3255 return; 3256 } 3257 3258 # Find what code points are assigned in this release 3259 main::calculate_Assigned() if ! defined $Assigned; 3260 3261 my $addr = do { no overloading; pack 'J', $self; }; 3262 3263 my ($range, @remainder) 3264 = split /\s*;\s*/, $_, -1; # -1 => retain trailing null fields 3265 3266 # Examine the range. 3267 if ($range =~ /^ ($code_point_re) (?:\.\. ($code_point_re) )? $/x) 3268 { 3269 my $low = hex $1; 3270 my $high = (defined $2) ? hex $2 : $low; 3271 3272 # Split the range into subranges of just those code points in it 3273 # that are assigned. 3274 my @ranges = (Range_List->new(Initialize 3275 => Range->new($low, $high)) & $Assigned)->ranges; 3276 3277 # Do nothing if nothing in the original range is assigned in this 3278 # release; handle normally if everything is in this release. 3279 if (! @ranges) { 3280 $_ = ""; 3281 } 3282 elsif (@ranges != 1) { 3283 3284 # Here, some code points in the original range aren't in this 3285 # release; @ranges gives the ones that are. Create fake input 3286 # lines for each of the ranges, and set things up so that when 3287 # this routine is called on that fake input, it will do 3288 # nothing. 3289 $skip_inserted_count = @ranges; 3290 my $remainder = join ";", @remainder; 3291 for my $range (@ranges) { 3292 $self->insert_lines(sprintf("%04X..%04X;%s", 3293 $range->start, $range->end, $remainder)); 3294 } 3295 $_ = ""; # The original range is now defunct. 3296 } 3297 } 3298 3299 return; 3300 } 3301 3302 sub _fixup_obsolete_hanguls($self) { 3303 3304 # This is called only when compiling Unicode version 1. All Unicode 3305 # data for subsequent releases assumes that the code points that were 3306 # Hangul syllables in this release only are something else, so if 3307 # using such data, we have to override it 3308 3309 my $addr = do { no overloading; pack 'J', $self; }; 3310 3311 my $object = main::property_ref($property{$addr}); 3312 $object->add_map($FIRST_REMOVED_HANGUL_SYLLABLE, 3313 $FINAL_REMOVED_HANGUL_SYLLABLE, 3314 $early{$addr}[3], # Passed-in value for these 3315 Replace => $UNCONDITIONALLY); 3316 } 3317 3318 sub _insert_property_into_line($self) { 3319 # Add a property field to $_, if this file requires it. 3320 3321 my $addr = do { no overloading; pack 'J', $self; }; 3322 my $property = $property{$addr}; 3323 3324 $_ =~ s/(;|$)/; $property$1/; 3325 return; 3326 } 3327 3328 sub carp_bad_line($self, $message="") { 3329 # Output consistent error messages, using either a generic one, or the 3330 # one given by the optional parameter. To avoid gazillions of the 3331 # same message in case the syntax of a file is way off, this routine 3332 # only outputs the first instance of each message, incrementing a 3333 # count so the totals can be output at the end of the file. 3334 3335 my $addr = do { no overloading; pack 'J', $self; }; 3336 3337 $message = 'Unexpected line' unless $message; 3338 3339 # No trailing punctuation so as to fit with our addenda. 3340 $message =~ s/[.:;,]$//; 3341 3342 # If haven't seen this exact message before, output it now. Otherwise 3343 # increment the count of how many times it has occurred 3344 unless ($errors{$addr}->{$message}) { 3345 Carp::my_carp("$message in '$_' in " 3346 . $file{$addr} 3347 . " at line $.. Skipping this line;"); 3348 $errors{$addr}->{$message} = 1; 3349 } 3350 else { 3351 $errors{$addr}->{$message}++; 3352 } 3353 3354 # Clear the line to prevent any further (meaningful) processing of it. 3355 $_ = ""; 3356 3357 return; 3358 } 3359} # End closure 3360 3361package Multi_Default; 3362 3363# Certain properties in early versions of Unicode had more than one possible 3364# default for code points missing from the files. In these cases, one 3365# default applies to everything left over after all the others are applied, 3366# and for each of the others, there is a description of which class of code 3367# points applies to it. This object helps implement this by storing the 3368# defaults, and for all but that final default, an eval string that generates 3369# the class that it applies to. 3370 3371use strict; 3372use warnings; 3373 3374use feature 'signatures'; 3375no warnings 'experimental::signatures'; 3376 3377{ # Closure 3378 3379 main::setup_package(); 3380 3381 my %class_defaults; 3382 # The defaults structure for the classes 3383 main::set_access('class_defaults', \%class_defaults); 3384 3385 my %other_default; 3386 # The default that applies to everything left over. 3387 main::set_access('other_default', \%other_default, 'r'); 3388 3389 3390 sub new { 3391 # The constructor is called with default => eval pairs, terminated by 3392 # the left-over default. e.g. 3393 # Multi_Default->new( 3394 # 'T' => '$gc->table("Mn") + $gc->table("Cf") - 0x200C 3395 # - 0x200D', 3396 # 'R' => 'some other expression that evaluates to code points', 3397 # . 3398 # . 3399 # . 3400 # 'U')); 3401 # It is best to leave the final value be the one that matches the 3402 # above-Unicode code points. 3403 3404 my $class = shift; 3405 3406 my $self = bless \do{my $anonymous_scalar}, $class; 3407 my $addr = do { no overloading; pack 'J', $self; }; 3408 3409 while (@_ > 1) { 3410 my $default = shift; 3411 my $eval = shift; 3412 $class_defaults{$addr}->{$default} = $eval; 3413 } 3414 3415 $other_default{$addr} = shift; 3416 3417 return $self; 3418 } 3419 3420 sub get_next_defaults($self) { 3421 # Iterates and returns the next class of defaults. 3422 3423 my $addr = do { no overloading; pack 'J', $self; }; 3424 3425 return each %{$class_defaults{$addr}}; 3426 } 3427} 3428 3429package Alias; 3430 3431# An alias is one of the names that a table goes by. This class defines them 3432# including some attributes. Everything is currently setup in the 3433# constructor. 3434 3435use strict; 3436use warnings; 3437 3438use feature 'signatures'; 3439no warnings 'experimental::signatures'; 3440 3441 3442{ # Closure 3443 3444 main::setup_package(); 3445 3446 my %name; 3447 main::set_access('name', \%name, 'r'); 3448 3449 my %loose_match; 3450 # Should this name match loosely or not. 3451 main::set_access('loose_match', \%loose_match, 'r'); 3452 3453 my %make_re_pod_entry; 3454 # Some aliases should not get their own entries in the re section of the 3455 # pod, because they are covered by a wild-card, and some we want to 3456 # discourage use of. Binary 3457 main::set_access('make_re_pod_entry', \%make_re_pod_entry, 'r', 's'); 3458 3459 my %ucd; 3460 # Is this documented to be accessible via Unicode::UCD 3461 main::set_access('ucd', \%ucd, 'r', 's'); 3462 3463 my %status; 3464 # Aliases have a status, like deprecated, or even suppressed (which means 3465 # they don't appear in documentation). Enum 3466 main::set_access('status', \%status, 'r'); 3467 3468 my %ok_as_filename; 3469 # Similarly, some aliases should not be considered as usable ones for 3470 # external use, such as file names, or we don't want documentation to 3471 # recommend them. Boolean 3472 main::set_access('ok_as_filename', \%ok_as_filename, 'r'); 3473 3474 sub new { 3475 my $class = shift; 3476 3477 my $self = bless \do { my $anonymous_scalar }, $class; 3478 my $addr = do { no overloading; pack 'J', $self; }; 3479 3480 $name{$addr} = shift; 3481 $loose_match{$addr} = shift; 3482 $make_re_pod_entry{$addr} = shift; 3483 $ok_as_filename{$addr} = shift; 3484 $status{$addr} = shift; 3485 $ucd{$addr} = shift; 3486 3487 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 3488 3489 # Null names are never ok externally 3490 $ok_as_filename{$addr} = 0 if $name{$addr} eq ""; 3491 3492 return $self; 3493 } 3494} 3495 3496package Range; 3497 3498# A range is the basic unit for storing code points, and is described in the 3499# comments at the beginning of the program. Each range has a starting code 3500# point; an ending code point (not less than the starting one); a value 3501# that applies to every code point in between the two end-points, inclusive; 3502# and an enum type that applies to the value. The type is for the user's 3503# convenience, and has no meaning here, except that a non-zero type is 3504# considered to not obey the normal Unicode rules for having standard forms. 3505# 3506# The same structure is used for both map and match tables, even though in the 3507# latter, the value (and hence type) is irrelevant and could be used as a 3508# comment. In map tables, the value is what all the code points in the range 3509# map to. Type 0 values have the standardized version of the value stored as 3510# well, so as to not have to recalculate it a lot. 3511 3512use strict; 3513use warnings; 3514 3515use feature 'signatures'; 3516no warnings 'experimental::signatures'; 3517 3518sub trace { return main::trace(@_); } 3519 3520{ # Closure 3521 3522 main::setup_package(); 3523 3524 my %start; 3525 main::set_access('start', \%start, 'r', 's'); 3526 3527 my %end; 3528 main::set_access('end', \%end, 'r', 's'); 3529 3530 my %value; 3531 main::set_access('value', \%value, 'r', 's'); 3532 3533 my %type; 3534 main::set_access('type', \%type, 'r'); 3535 3536 my %standard_form; 3537 # The value in internal standard form. Defined only if the type is 0. 3538 main::set_access('standard_form', \%standard_form); 3539 3540 # Note that if these fields change, the dump() method should as well 3541 3542 sub new($class, $_addr, $_end, @_args) { 3543 my $self = bless \do { my $anonymous_scalar }, $class; 3544 my $addr = do { no overloading; pack 'J', $self; }; 3545 3546 $start{$addr} = $_addr; 3547 $end{$addr} = $_end; 3548 3549 my %args = @_args; 3550 3551 my $value = delete $args{'Value'}; # Can be 0 3552 $value = "" unless defined $value; 3553 $value{$addr} = $value; 3554 3555 $type{$addr} = delete $args{'Type'} || 0; 3556 3557 Carp::carp_extra_args(\%args) if main::DEBUG && %args; 3558 3559 return $self; 3560 } 3561 3562 use overload 3563 fallback => 0, 3564 qw("") => "_operator_stringify", 3565 "." => \&main::_operator_dot, 3566 ".=" => \&main::_operator_dot_equal, 3567 ; 3568 3569 sub _operator_stringify($self) { 3570 my $addr = do { no overloading; pack 'J', $self; }; 3571 3572 # Output it like '0041..0065 (value)' 3573 my $return = sprintf("%04X", $start{$addr}) 3574 . '..' 3575 . sprintf("%04X", $end{$addr}); 3576 my $value = $value{$addr}; 3577 my $type = $type{$addr}; 3578 $return .= ' ('; 3579 $return .= "$value"; 3580 $return .= ", Type=$type" if $type != 0; 3581 $return .= ')'; 3582 3583 return $return; 3584 } 3585 3586 sub standard_form($self) { 3587 # Calculate the standard form only if needed, and cache the result. 3588 # The standard form is the value itself if the type is special. 3589 # This represents a considerable CPU and memory saving - at the time 3590 # of writing there are 368676 non-special objects, but the standard 3591 # form is only requested for 22047 of them - ie about 6%. 3592 3593 my $addr = do { no overloading; pack 'J', $self; }; 3594 3595 return $standard_form{$addr} if defined $standard_form{$addr}; 3596 3597 my $value = $value{$addr}; 3598 return $value if $type{$addr}; 3599 return $standard_form{$addr} = main::standardize($value); 3600 } 3601 3602 sub dump($self, $indent) { 3603 # Human, not machine readable. For machine readable, comment out this 3604 # entire routine and let the standard one take effect. 3605 my $addr = do { no overloading; pack 'J', $self; }; 3606 3607 my $return = $indent 3608 . sprintf("%04X", $start{$addr}) 3609 . '..' 3610 . sprintf("%04X", $end{$addr}) 3611 . " '$value{$addr}';"; 3612 if (! defined $standard_form{$addr}) { 3613 $return .= "(type=$type{$addr})"; 3614 } 3615 elsif ($standard_form{$addr} ne $value{$addr}) { 3616 $return .= "(standard '$standard_form{$addr}')"; 3617 } 3618 return $return; 3619 } 3620} # End closure 3621 3622package _Range_List_Base; 3623 3624use strict; 3625use warnings; 3626 3627use feature 'signatures'; 3628no warnings 'experimental::signatures'; 3629 3630# Base class for range lists. A range list is simply an ordered list of 3631# ranges, so that the ranges with the lowest starting numbers are first in it. 3632# 3633# When a new range is added that is adjacent to an existing range that has the 3634# same value and type, it merges with it to form a larger range. 3635# 3636# Ranges generally do not overlap, except that there can be multiple entries 3637# of single code point ranges. This is because of NameAliases.txt. 3638# 3639# In this program, there is a standard value such that if two different 3640# values, have the same standard value, they are considered equivalent. This 3641# value was chosen so that it gives correct results on Unicode data 3642 3643# There are a number of methods to manipulate range lists, and some operators 3644# are overloaded to handle them. 3645 3646sub trace { return main::trace(@_); } 3647 3648{ # Closure 3649 3650 our $addr; 3651 3652 # Max is initialized to a negative value that isn't adjacent to 0, for 3653 # simpler tests 3654 my $max_init = -2; 3655 3656 main::setup_package(); 3657 3658 my %ranges; 3659 # The list of ranges 3660 main::set_access('ranges', \%ranges, 'readable_array'); 3661 3662 my %max; 3663 # The highest code point in the list. This was originally a method, but 3664 # actual measurements said it was used a lot. 3665 main::set_access('max', \%max, 'r'); 3666 3667 my %each_range_iterator; 3668 # Iterator position for each_range() 3669 main::set_access('each_range_iterator', \%each_range_iterator); 3670 3671 my %owner_name_of; 3672 # Name of parent this is attached to, if any. Solely for better error 3673 # messages. 3674 main::set_access('owner_name_of', \%owner_name_of, 'p_r'); 3675 3676 my %_search_ranges_cache; 3677 # A cache of the previous result from _search_ranges(), for better 3678 # performance 3679 main::set_access('_search_ranges_cache', \%_search_ranges_cache); 3680 3681 sub new { 3682 my $class = shift; 3683 my %args = @_; 3684 3685 # Optional initialization data for the range list. 3686 my $initialize = delete $args{'Initialize'}; 3687 3688 my $self; 3689 3690 # Use _union() to initialize. _union() returns an object of this 3691 # class, which means that it will call this constructor recursively. 3692 # But it won't have this $initialize parameter so that it won't 3693 # infinitely loop on this. 3694 return _union($class, $initialize, %args) if defined $initialize; 3695 3696 $self = bless \do { my $anonymous_scalar }, $class; 3697 my $addr = do { no overloading; pack 'J', $self; }; 3698 3699 # Optional parent object, only for debug info. 3700 $owner_name_of{$addr} = delete $args{'Owner'}; 3701 $owner_name_of{$addr} = "" if ! defined $owner_name_of{$addr}; 3702 3703 # Stringify, in case it is an object. 3704 $owner_name_of{$addr} = "$owner_name_of{$addr}"; 3705 3706 # This is used only for error messages, and so a colon is added 3707 $owner_name_of{$addr} .= ": " if $owner_name_of{$addr} ne ""; 3708 3709 Carp::carp_extra_args(\%args) if main::DEBUG && %args; 3710 3711 $max{$addr} = $max_init; 3712 3713 $_search_ranges_cache{$addr} = 0; 3714 $ranges{$addr} = []; 3715 3716 return $self; 3717 } 3718 3719 use overload 3720 fallback => 0, 3721 qw("") => "_operator_stringify", 3722 "." => \&main::_operator_dot, 3723 ".=" => \&main::_operator_dot_equal, 3724 ; 3725 3726 sub _operator_stringify($self) { 3727 my $addr = do { no overloading; pack 'J', $self; }; 3728 3729 return "Range_List attached to '$owner_name_of{$addr}'" 3730 if $owner_name_of{$addr}; 3731 return "anonymous Range_List " . \$self; 3732 } 3733 3734 sub _union { 3735 # Returns the union of the input code points. It can be called as 3736 # either a constructor or a method. If called as a method, the result 3737 # will be a new() instance of the calling object, containing the union 3738 # of that object with the other parameter's code points; if called as 3739 # a constructor, the first parameter gives the class that the new object 3740 # should be, and the second parameter gives the code points to go into 3741 # it. 3742 # In either case, there are two parameters looked at by this routine; 3743 # any additional parameters are passed to the new() constructor. 3744 # 3745 # The code points can come in the form of some object that contains 3746 # ranges, and has a conventionally named method to access them; or 3747 # they can be an array of individual code points (as integers); or 3748 # just a single code point. 3749 # 3750 # If they are ranges, this routine doesn't make any effort to preserve 3751 # the range values and types of one input over the other. Therefore 3752 # this base class should not allow _union to be called from other than 3753 # initialization code, so as to prevent two tables from being added 3754 # together where the range values matter. The general form of this 3755 # routine therefore belongs in a derived class, but it was moved here 3756 # to avoid duplication of code. The failure to overload this in this 3757 # class keeps it safe. 3758 # 3759 # It does make the effort during initialization to accept tables with 3760 # multiple values for the same code point, and to preserve the order 3761 # of these. If there is only one input range or range set, it doesn't 3762 # sort (as it should already be sorted to the desired order), and will 3763 # accept multiple values per code point. Otherwise it will merge 3764 # multiple values into a single one. 3765 3766 my $self; 3767 my @args; # Arguments to pass to the constructor 3768 3769 my $class = shift; 3770 3771 # If a method call, will start the union with the object itself, and 3772 # the class of the new object will be the same as self. 3773 if (ref $class) { 3774 $self = $class; 3775 $class = ref $self; 3776 push @args, $self; 3777 } 3778 3779 # Add the other required parameter. 3780 push @args, shift; 3781 # Rest of parameters are passed on to the constructor 3782 3783 # Accumulate all records from both lists. 3784 my @records; 3785 my $input_count = 0; 3786 for my $arg (@args) { 3787 #local $to_trace = 0 if main::DEBUG; 3788 trace "argument = $arg" if main::DEBUG && $to_trace; 3789 if (! defined $arg) { 3790 my $message = ""; 3791 if (defined $self) { 3792 no overloading; 3793 $message .= $owner_name_of{pack 'J', $self}; 3794 } 3795 Carp::my_carp_bug($message . "Undefined argument to _union. No union done."); 3796 return; 3797 } 3798 3799 $arg = [ $arg ] if ! ref $arg; 3800 my $type = ref $arg; 3801 if ($type eq 'ARRAY') { 3802 foreach my $element (@$arg) { 3803 push @records, Range->new($element, $element); 3804 $input_count++; 3805 } 3806 } 3807 elsif ($arg->isa('Range')) { 3808 push @records, $arg; 3809 $input_count++; 3810 } 3811 elsif ($arg->can('ranges')) { 3812 push @records, $arg->ranges; 3813 $input_count++; 3814 } 3815 else { 3816 my $message = ""; 3817 if (defined $self) { 3818 no overloading; 3819 $message .= $owner_name_of{pack 'J', $self}; 3820 } 3821 Carp::my_carp_bug($message . "Cannot take the union of a $type. No union done."); 3822 return; 3823 } 3824 } 3825 3826 # Sort with the range containing the lowest ordinal first, but if 3827 # two ranges start at the same code point, sort with the bigger range 3828 # of the two first, because it takes fewer cycles. 3829 if ($input_count > 1) { 3830 @records = sort { ($a->start <=> $b->start) 3831 or 3832 # if b is shorter than a, b->end will be 3833 # less than a->end, and we want to select 3834 # a, so want to return -1 3835 ($b->end <=> $a->end) 3836 } @records; 3837 } 3838 3839 my $new = $class->new(@_); 3840 3841 # Fold in records so long as they add new information. 3842 for my $set (@records) { 3843 my $start = $set->start; 3844 my $end = $set->end; 3845 my $value = $set->value; 3846 my $type = $set->type; 3847 if ($start > $new->max) { 3848 $new->_add_delete('+', $start, $end, $value, Type => $type); 3849 } 3850 elsif ($end > $new->max) { 3851 $new->_add_delete('+', $new->max +1, $end, $value, 3852 Type => $type); 3853 } 3854 elsif ($input_count == 1) { 3855 # Here, overlaps existing range, but is from a single input, 3856 # so preserve the multiple values from that input. 3857 $new->_add_delete('+', $start, $end, $value, Type => $type, 3858 Replace => $MULTIPLE_AFTER); 3859 } 3860 } 3861 3862 return $new; 3863 } 3864 3865 sub range_count($self) { # Return the number of ranges in the range list 3866 no overloading; 3867 return scalar @{$ranges{pack 'J', $self}}; 3868 } 3869 3870 sub min($self) { 3871 # Returns the minimum code point currently in the range list, or if 3872 # the range list is empty, 2 beyond the max possible. This is a 3873 # method because used so rarely, that not worth saving between calls, 3874 # and having to worry about changing it as ranges are added and 3875 # deleted. 3876 3877 my $addr = do { no overloading; pack 'J', $self; }; 3878 3879 # If the range list is empty, return a large value that isn't adjacent 3880 # to any that could be in the range list, for simpler tests 3881 return $MAX_WORKING_CODEPOINT + 2 unless scalar @{$ranges{$addr}}; 3882 return $ranges{$addr}->[0]->start; 3883 } 3884 3885 sub contains($self, $codepoint) { 3886 # Boolean: Is argument in the range list? If so returns $i such that: 3887 # range[$i]->end < $codepoint <= range[$i+1]->end 3888 # which is one beyond what you want; this is so that the 0th range 3889 # doesn't return false 3890 3891 my $i = $self->_search_ranges($codepoint); 3892 return 0 unless defined $i; 3893 3894 # The search returns $i, such that 3895 # range[$i-1]->end < $codepoint <= range[$i]->end 3896 # So is in the table if and only iff it is at least the start position 3897 # of range $i. 3898 no overloading; 3899 return 0 if $ranges{pack 'J', $self}->[$i]->start > $codepoint; 3900 return $i + 1; 3901 } 3902 3903 sub containing_range($self, $codepoint) { 3904 # Returns the range object that contains the code point, undef if none 3905 my $i = $self->contains($codepoint); 3906 return unless $i; 3907 3908 # contains() returns 1 beyond where we should look 3909 no overloading; 3910 return $ranges{pack 'J', $self}->[$i-1]; 3911 } 3912 3913 sub value_of($self, $codepoint) { 3914 # Returns the value associated with the code point, undef if none 3915 my $range = $self->containing_range($codepoint); 3916 return unless defined $range; 3917 3918 return $range->value; 3919 } 3920 3921 sub type_of($self, $codepoint) { 3922 # Returns the type of the range containing the code point, undef if 3923 # the code point is not in the table 3924 my $range = $self->containing_range($codepoint); 3925 return unless defined $range; 3926 3927 return $range->type; 3928 } 3929 3930 sub _search_ranges($self, $code_point) { 3931 # Find the range in the list which contains a code point, or where it 3932 # should go if were to add it. That is, it returns $i, such that: 3933 # range[$i-1]->end < $codepoint <= range[$i]->end 3934 # Returns undef if no such $i is possible (e.g. at end of table), or 3935 # if there is an error. 3936 my $addr = do { no overloading; pack 'J', $self; }; 3937 3938 return if $code_point > $max{$addr}; 3939 my $r = $ranges{$addr}; # The current list of ranges 3940 my $range_list_size = scalar @$r; 3941 my $i; 3942 3943 use integer; # want integer division 3944 3945 # Use the cached result as the starting guess for this one, because, 3946 # an experiment on 5.1 showed that 90% of the time the cache was the 3947 # same as the result on the next call (and 7% it was one less). 3948 $i = $_search_ranges_cache{$addr}; 3949 $i = 0 if $i >= $range_list_size; # Reset if no longer valid (prob. 3950 # from an intervening deletion 3951 #local $to_trace = 1 if main::DEBUG; 3952 trace "previous \$i is still valid: $i" if main::DEBUG && $to_trace && $code_point <= $r->[$i]->end && ($i == 0 || $r->[$i-1]->end < $code_point); 3953 return $i if $code_point <= $r->[$i]->end 3954 && ($i == 0 || $r->[$i-1]->end < $code_point); 3955 3956 # Here the cache doesn't yield the correct $i. Try adding 1. 3957 if ($i < $range_list_size - 1 3958 && $r->[$i]->end < $code_point && 3959 $code_point <= $r->[$i+1]->end) 3960 { 3961 $i++; 3962 trace "next \$i is correct: $i" if main::DEBUG && $to_trace; 3963 $_search_ranges_cache{$addr} = $i; 3964 return $i; 3965 } 3966 3967 # Here, adding 1 also didn't work. We do a binary search to 3968 # find the correct position, starting with current $i 3969 my $lower = 0; 3970 my $upper = $range_list_size - 1; 3971 while (1) { 3972 trace "top of loop i=$i:", sprintf("%04X", $r->[$lower]->start), "[$lower] .. ", sprintf("%04X", $r->[$i]->start), "[$i] .. ", sprintf("%04X", $r->[$upper]->start), "[$upper]" if main::DEBUG && $to_trace; 3973 3974 if ($code_point <= $r->[$i]->end) { 3975 3976 # Here we have met the upper constraint. We can quit if we 3977 # also meet the lower one. 3978 last if $i == 0 || $r->[$i-1]->end < $code_point; 3979 3980 $upper = $i; # Still too high. 3981 3982 } 3983 else { 3984 3985 # Here, $r[$i]->end < $code_point, so look higher up. 3986 $lower = $i; 3987 } 3988 3989 # Split search domain in half to try again. 3990 my $temp = ($upper + $lower) / 2; 3991 3992 # No point in continuing unless $i changes for next time 3993 # in the loop. 3994 if ($temp == $i) { 3995 3996 # We can't reach the highest element because of the averaging. 3997 # So if one below the upper edge, force it there and try one 3998 # more time. 3999 if ($i == $range_list_size - 2) { 4000 4001 trace "Forcing to upper edge" if main::DEBUG && $to_trace; 4002 $i = $range_list_size - 1; 4003 4004 # Change $lower as well so if fails next time through, 4005 # taking the average will yield the same $i, and we will 4006 # quit with the error message just below. 4007 $lower = $i; 4008 next; 4009 } 4010 Carp::my_carp_bug("$owner_name_of{$addr}Can't find where the range ought to go. No action taken."); 4011 return; 4012 } 4013 $i = $temp; 4014 } # End of while loop 4015 4016 if (main::DEBUG && $to_trace) { 4017 trace 'i-1=[', $i-1, ']', $r->[$i-1] if $i; 4018 trace "i= [ $i ]", $r->[$i]; 4019 trace 'i+1=[', $i+1, ']', $r->[$i+1] if $i < $range_list_size - 1; 4020 } 4021 4022 # Here we have found the offset. Cache it as a starting point for the 4023 # next call. 4024 $_search_ranges_cache{$addr} = $i; 4025 return $i; 4026 } 4027 4028 sub _add_delete { 4029 # Add, replace or delete ranges to or from a list. The $type 4030 # parameter gives which: 4031 # '+' => insert or replace a range, returning a list of any changed 4032 # ranges. 4033 # '-' => delete a range, returning a list of any deleted ranges. 4034 # 4035 # The next three parameters give respectively the start, end, and 4036 # value associated with the range. 'value' should be null unless the 4037 # operation is '+'; 4038 # 4039 # The range list is kept sorted so that the range with the lowest 4040 # starting position is first in the list, and generally, adjacent 4041 # ranges with the same values are merged into a single larger one (see 4042 # exceptions below). 4043 # 4044 # There are more parameters; all are key => value pairs: 4045 # Type gives the type of the value. It is only valid for '+'. 4046 # All ranges have types; if this parameter is omitted, 0 is 4047 # assumed. Ranges with type 0 are assumed to obey the 4048 # Unicode rules for casing, etc; ranges with other types are 4049 # not. Otherwise, the type is arbitrary, for the caller's 4050 # convenience, and looked at only by this routine to keep 4051 # adjacent ranges of different types from being merged into 4052 # a single larger range, and when Replace => 4053 # $IF_NOT_EQUIVALENT is specified (see just below). 4054 # Replace determines what to do if the range list already contains 4055 # ranges which coincide with all or portions of the input 4056 # range. It is only valid for '+': 4057 # => $NO means that the new value is not to replace 4058 # any existing ones, but any empty gaps of the 4059 # range list coinciding with the input range 4060 # will be filled in with the new value. 4061 # => $UNCONDITIONALLY means to replace the existing values with 4062 # this one unconditionally. However, if the 4063 # new and old values are identical, the 4064 # replacement is skipped to save cycles 4065 # => $IF_NOT_EQUIVALENT means to replace the existing values 4066 # (the default) with this one if they are not equivalent. 4067 # Ranges are equivalent if their types are the 4068 # same, and they are the same string; or if 4069 # both are type 0 ranges, if their Unicode 4070 # standard forms are identical. In this last 4071 # case, the routine chooses the more "modern" 4072 # one to use. This is because some of the 4073 # older files are formatted with values that 4074 # are, for example, ALL CAPs, whereas the 4075 # derived files have a more modern style, 4076 # which looks better. By looking for this 4077 # style when the pre-existing and replacement 4078 # standard forms are the same, we can move to 4079 # the modern style 4080 # => $MULTIPLE_BEFORE means that if this range duplicates an 4081 # existing one, but has a different value, 4082 # don't replace the existing one, but insert 4083 # this one so that the same range can occur 4084 # multiple times. They are stored LIFO, so 4085 # that the final one inserted is the first one 4086 # returned in an ordered search of the table. 4087 # If this is an exact duplicate, including the 4088 # value, the original will be moved to be 4089 # first, before any other duplicate ranges 4090 # with different values. 4091 # => $MULTIPLE_AFTER is like $MULTIPLE_BEFORE, but is stored 4092 # FIFO, so that this one is inserted after all 4093 # others that currently exist. If this is an 4094 # exact duplicate, including value, of an 4095 # existing range, this one is discarded 4096 # (leaving the existing one in its original, 4097 # higher priority position 4098 # => $CROAK Die with an error if is already there 4099 # => anything else is the same as => $IF_NOT_EQUIVALENT 4100 # 4101 # "same value" means identical for non-type-0 ranges, and it means 4102 # having the same standard forms for type-0 ranges. 4103 4104 return Carp::carp_too_few_args(\@_, 5) if main::DEBUG && @_ < 5; 4105 4106 my $self = shift; 4107 my $operation = shift; # '+' for add/replace; '-' for delete; 4108 my $start = shift; 4109 my $end = shift; 4110 my $value = shift; 4111 4112 my %args = @_; 4113 4114 $value = "" if not defined $value; # warning: $value can be "0" 4115 4116 my $replace = delete $args{'Replace'}; 4117 $replace = $IF_NOT_EQUIVALENT unless defined $replace; 4118 4119 my $type = delete $args{'Type'}; 4120 $type = 0 unless defined $type; 4121 4122 Carp::carp_extra_args(\%args) if main::DEBUG && %args; 4123 4124 my $addr = do { no overloading; pack 'J', $self; }; 4125 4126 if ($operation ne '+' && $operation ne '-') { 4127 Carp::my_carp_bug("$owner_name_of{$addr}First parameter to _add_delete must be '+' or '-'. No action taken."); 4128 return; 4129 } 4130 unless (defined $start && defined $end) { 4131 Carp::my_carp_bug("$owner_name_of{$addr}Undefined start and/or end to _add_delete. No action taken."); 4132 return; 4133 } 4134 unless ($end >= $start) { 4135 Carp::my_carp_bug("$owner_name_of{$addr}End of range (" . sprintf("%04X", $end) . ") must not be before start (" . sprintf("%04X", $start) . "). No action taken."); 4136 return; 4137 } 4138 #local $to_trace = 1 if main::DEBUG; 4139 4140 if ($operation eq '-') { 4141 if ($replace != $IF_NOT_EQUIVALENT) { 4142 Carp::my_carp_bug("$owner_name_of{$addr}Replace => \$IF_NOT_EQUIVALENT is required when deleting a range from a range list. Assuming Replace => \$IF_NOT_EQUIVALENT."); 4143 $replace = $IF_NOT_EQUIVALENT; 4144 } 4145 if ($type) { 4146 Carp::my_carp_bug("$owner_name_of{$addr}Type => 0 is required when deleting a range from a range list. Assuming Type => 0."); 4147 $type = 0; 4148 } 4149 if ($value ne "") { 4150 Carp::my_carp_bug("$owner_name_of{$addr}Value => \"\" is required when deleting a range from a range list. Assuming Value => \"\"."); 4151 $value = ""; 4152 } 4153 } 4154 4155 my $r = $ranges{$addr}; # The current list of ranges 4156 my $range_list_size = scalar @$r; # And its size 4157 my $max = $max{$addr}; # The current high code point in 4158 # the list of ranges 4159 4160 # Do a special case requiring fewer machine cycles when the new range 4161 # starts after the current highest point. The Unicode input data is 4162 # structured so this is common. 4163 if ($start > $max) { 4164 4165 trace "$owner_name_of{$addr} $operation", sprintf("%04X..%04X (%s) type=%d; prev max=%04X", $start, $end, $value, $type, $max) if main::DEBUG && $to_trace; 4166 return if $operation eq '-'; # Deleting a non-existing range is a 4167 # no-op 4168 4169 # If the new range doesn't logically extend the current final one 4170 # in the range list, create a new range at the end of the range 4171 # list. (max cleverly is initialized to a negative number not 4172 # adjacent to 0 if the range list is empty, so even adding a range 4173 # to an empty range list starting at 0 will have this 'if' 4174 # succeed.) 4175 if ($start > $max + 1 # non-adjacent means can't extend. 4176 || @{$r}[-1]->value ne $value # values differ, can't extend. 4177 || @{$r}[-1]->type != $type # types differ, can't extend. 4178 ) { 4179 push @$r, Range->new($start, $end, 4180 Value => $value, 4181 Type => $type); 4182 } 4183 else { 4184 4185 # Here, the new range starts just after the current highest in 4186 # the range list, and they have the same type and value. 4187 # Extend the existing range to incorporate the new one. 4188 @{$r}[-1]->set_end($end); 4189 } 4190 4191 # This becomes the new maximum. 4192 $max{$addr} = $end; 4193 4194 return; 4195 } 4196 #local $to_trace = 0 if main::DEBUG; 4197 4198 trace "$owner_name_of{$addr} $operation", sprintf("%04X", $start) . '..' . sprintf("%04X", $end) . " ($value) replace=$replace" if main::DEBUG && $to_trace; 4199 4200 # Here, the input range isn't after the whole rest of the range list. 4201 # Most likely 'splice' will be needed. The rest of the routine finds 4202 # the needed splice parameters, and if necessary, does the splice. 4203 # First, find the offset parameter needed by the splice function for 4204 # the input range. Note that the input range may span multiple 4205 # existing ones, but we'll worry about that later. For now, just find 4206 # the beginning. If the input range is to be inserted starting in a 4207 # position not currently in the range list, it must (obviously) come 4208 # just after the range below it, and just before the range above it. 4209 # Slightly less obviously, it will occupy the position currently 4210 # occupied by the range that is to come after it. More formally, we 4211 # are looking for the position, $i, in the array of ranges, such that: 4212 # 4213 # r[$i-1]->start <= r[$i-1]->end < $start < r[$i]->start <= r[$i]->end 4214 # 4215 # (The ordered relationships within existing ranges are also shown in 4216 # the equation above). However, if the start of the input range is 4217 # within an existing range, the splice offset should point to that 4218 # existing range's position in the list; that is $i satisfies a 4219 # somewhat different equation, namely: 4220 # 4221 #r[$i-1]->start <= r[$i-1]->end < r[$i]->start <= $start <= r[$i]->end 4222 # 4223 # More briefly, $start can come before or after r[$i]->start, and at 4224 # this point, we don't know which it will be. However, these 4225 # two equations share these constraints: 4226 # 4227 # r[$i-1]->end < $start <= r[$i]->end 4228 # 4229 # And that is good enough to find $i. 4230 4231 my $i = $self->_search_ranges($start); 4232 if (! defined $i) { 4233 Carp::my_carp_bug("Searching $self for range beginning with $start unexpectedly returned undefined. Operation '$operation' not performed"); 4234 return; 4235 } 4236 4237 # The search function returns $i such that: 4238 # 4239 # r[$i-1]->end < $start <= r[$i]->end 4240 # 4241 # That means that $i points to the first range in the range list 4242 # that could possibly be affected by this operation. We still don't 4243 # know if the start of the input range is within r[$i], or if it 4244 # points to empty space between r[$i-1] and r[$i]. 4245 trace "[$i] is the beginning splice point. Existing range there is ", $r->[$i] if main::DEBUG && $to_trace; 4246 4247 # Special case the insertion of data that is not to replace any 4248 # existing data. 4249 if ($replace == $NO) { # If $NO, has to be operation '+' 4250 #local $to_trace = 1 if main::DEBUG; 4251 trace "Doesn't replace" if main::DEBUG && $to_trace; 4252 4253 # Here, the new range is to take effect only on those code points 4254 # that aren't already in an existing range. This can be done by 4255 # looking through the existing range list and finding the gaps in 4256 # the ranges that this new range affects, and then calling this 4257 # function recursively on each of those gaps, leaving untouched 4258 # anything already in the list. Gather up a list of the changed 4259 # gaps first so that changes to the internal state as new ranges 4260 # are added won't be a problem. 4261 my @gap_list; 4262 4263 # First, if the starting point of the input range is outside an 4264 # existing one, there is a gap from there to the beginning of the 4265 # existing range -- add a span to fill the part that this new 4266 # range occupies 4267 if ($start < $r->[$i]->start) { 4268 push @gap_list, Range->new($start, 4269 main::min($end, 4270 $r->[$i]->start - 1), 4271 Type => $type); 4272 trace "gap before $r->[$i] [$i], will add", $gap_list[-1] if main::DEBUG && $to_trace; 4273 } 4274 4275 # Then look through the range list for other gaps until we reach 4276 # the highest range affected by the input one. 4277 my $j; 4278 for ($j = $i+1; $j < $range_list_size; $j++) { 4279 trace "j=[$j]", $r->[$j] if main::DEBUG && $to_trace; 4280 last if $end < $r->[$j]->start; 4281 4282 # If there is a gap between when this range starts and the 4283 # previous one ends, add a span to fill it. Note that just 4284 # because there are two ranges doesn't mean there is a 4285 # non-zero gap between them. It could be that they have 4286 # different values or types 4287 if ($r->[$j-1]->end + 1 != $r->[$j]->start) { 4288 push @gap_list, 4289 Range->new($r->[$j-1]->end + 1, 4290 $r->[$j]->start - 1, 4291 Type => $type); 4292 trace "gap between $r->[$j-1] and $r->[$j] [$j], will add: $gap_list[-1]" if main::DEBUG && $to_trace; 4293 } 4294 } 4295 4296 # Here, we have either found an existing range in the range list, 4297 # beyond the area affected by the input one, or we fell off the 4298 # end of the loop because the input range affects the whole rest 4299 # of the range list. In either case, $j is 1 higher than the 4300 # highest affected range. If $j == $i, it means that there are no 4301 # affected ranges, that the entire insertion is in the gap between 4302 # r[$i-1], and r[$i], which we already have taken care of before 4303 # the loop. 4304 # On the other hand, if there are affected ranges, it might be 4305 # that there is a gap that needs filling after the final such 4306 # range to the end of the input range 4307 if ($r->[$j-1]->end < $end) { 4308 push @gap_list, Range->new(main::max($start, 4309 $r->[$j-1]->end + 1), 4310 $end, 4311 Type => $type); 4312 trace "gap after $r->[$j-1], will add $gap_list[-1]" if main::DEBUG && $to_trace; 4313 } 4314 4315 # Call recursively to fill in all the gaps. 4316 foreach my $gap (@gap_list) { 4317 $self->_add_delete($operation, 4318 $gap->start, 4319 $gap->end, 4320 $value, 4321 Type => $type); 4322 } 4323 4324 return; 4325 } 4326 4327 # Here, we have taken care of the case where $replace is $NO. 4328 # Remember that here, r[$i-1]->end < $start <= r[$i]->end 4329 # If inserting a multiple record, this is where it goes, before the 4330 # first (if any) existing one if inserting LIFO. (If this is to go 4331 # afterwards, FIFO, we below move the pointer to there.) These imply 4332 # an insertion, and no change to any existing ranges. Note that $i 4333 # can be -1 if this new range doesn't actually duplicate any existing, 4334 # and comes at the beginning of the list. 4335 if ($replace == $MULTIPLE_BEFORE || $replace == $MULTIPLE_AFTER) { 4336 4337 if ($start != $end) { 4338 Carp::my_carp_bug("$owner_name_of{$addr}Can't cope with adding a multiple record when the range ($start..$end) contains more than one code point. No action taken."); 4339 return; 4340 } 4341 4342 # If the new code point is within a current range ... 4343 if ($end >= $r->[$i]->start) { 4344 4345 # Don't add an exact duplicate, as it isn't really a multiple 4346 my $existing_value = $r->[$i]->value; 4347 my $existing_type = $r->[$i]->type; 4348 return if $value eq $existing_value && $type eq $existing_type; 4349 4350 # If the multiple value is part of an existing range, we want 4351 # to split up that range, so that only the single code point 4352 # is affected. To do this, we first call ourselves 4353 # recursively to delete that code point from the table, having 4354 # preserved its current data above. Then we call ourselves 4355 # recursively again to add the new multiple, which we know by 4356 # the test just above is different than the current code 4357 # point's value, so it will become a range containing a single 4358 # code point: just itself. Finally, we add back in the 4359 # pre-existing code point, which will again be a single code 4360 # point range. Because 'i' likely will have changed as a 4361 # result of these operations, we can't just continue on, but 4362 # do this operation recursively as well. If we are inserting 4363 # LIFO, the pre-existing code point needs to go after the new 4364 # one, so use MULTIPLE_AFTER; and vice versa. 4365 if ($r->[$i]->start != $r->[$i]->end) { 4366 $self->_add_delete('-', $start, $end, ""); 4367 $self->_add_delete('+', $start, $end, $value, Type => $type); 4368 return $self->_add_delete('+', 4369 $start, $end, 4370 $existing_value, 4371 Type => $existing_type, 4372 Replace => ($replace == $MULTIPLE_BEFORE) 4373 ? $MULTIPLE_AFTER 4374 : $MULTIPLE_BEFORE); 4375 } 4376 } 4377 4378 # If to place this new record after, move to beyond all existing 4379 # ones; but don't add this one if identical to any of them, as it 4380 # isn't really a multiple. This leaves the original order, so 4381 # that the current request is ignored. The reasoning is that the 4382 # previous request that wanted this record to have high priority 4383 # should have precedence. 4384 if ($replace == $MULTIPLE_AFTER) { 4385 while ($i < @$r && $r->[$i]->start == $start) { 4386 return if $value eq $r->[$i]->value 4387 && $type eq $r->[$i]->type; 4388 $i++; 4389 } 4390 } 4391 else { 4392 # If instead we are to place this new record before any 4393 # existing ones, remove any identical ones that come after it. 4394 # This changes the existing order so that the new one is 4395 # first, as is being requested. 4396 for (my $j = $i + 1; 4397 $j < @$r && $r->[$j]->start == $start; 4398 $j++) 4399 { 4400 if ($value eq $r->[$j]->value && $type eq $r->[$j]->type) { 4401 splice @$r, $j, 1; 4402 last; # There should only be one instance, so no 4403 # need to keep looking 4404 } 4405 } 4406 } 4407 4408 trace "Adding multiple record at $i with $start..$end, $value" if main::DEBUG && $to_trace; 4409 my @return = splice @$r, 4410 $i, 4411 0, 4412 Range->new($start, 4413 $end, 4414 Value => $value, 4415 Type => $type); 4416 if (main::DEBUG && $to_trace) { 4417 trace "After splice:"; 4418 trace 'i-2=[', $i-2, ']', $r->[$i-2] if $i >= 2; 4419 trace 'i-1=[', $i-1, ']', $r->[$i-1] if $i >= 1; 4420 trace "i =[", $i, "]", $r->[$i] if $i >= 0; 4421 trace 'i+1=[', $i+1, ']', $r->[$i+1] if $i < @$r - 1; 4422 trace 'i+2=[', $i+2, ']', $r->[$i+2] if $i < @$r - 2; 4423 trace 'i+3=[', $i+3, ']', $r->[$i+3] if $i < @$r - 3; 4424 } 4425 return @return; 4426 } 4427 4428 # Here, we have taken care of $NO and $MULTIPLE_foo replaces. This 4429 # leaves delete, insert, and replace either unconditionally or if not 4430 # equivalent. $i still points to the first potential affected range. 4431 # Now find the highest range affected, which will determine the length 4432 # parameter to splice. (The input range can span multiple existing 4433 # ones.) If this isn't a deletion, while we are looking through the 4434 # range list, see also if this is a replacement rather than a clean 4435 # insertion; that is if it will change the values of at least one 4436 # existing range. Start off assuming it is an insert, until find it 4437 # isn't. 4438 my $clean_insert = $operation eq '+'; 4439 my $j; # This will point to the highest affected range 4440 4441 # For non-zero types, the standard form is the value itself; 4442 my $standard_form = ($type) ? $value : main::standardize($value); 4443 4444 for ($j = $i; $j < $range_list_size; $j++) { 4445 trace "Looking for highest affected range; the one at $j is ", $r->[$j] if main::DEBUG && $to_trace; 4446 4447 # If find a range that it doesn't overlap into, we can stop 4448 # searching 4449 last if $end < $r->[$j]->start; 4450 4451 # Here, overlaps the range at $j. If the values don't match, 4452 # and so far we think this is a clean insertion, it becomes a 4453 # non-clean insertion, i.e., a 'change' or 'replace' instead. 4454 if ($clean_insert) { 4455 if ($r->[$j]->standard_form ne $standard_form) { 4456 $clean_insert = 0; 4457 if ($replace == $CROAK) { 4458 main::croak("The range to add " 4459 . sprintf("%04X", $start) 4460 . '-' 4461 . sprintf("%04X", $end) 4462 . " with value '$value' overlaps an existing range $r->[$j]"); 4463 } 4464 } 4465 else { 4466 4467 # Here, the two values are essentially the same. If the 4468 # two are actually identical, replacing wouldn't change 4469 # anything so skip it. 4470 my $pre_existing = $r->[$j]->value; 4471 if ($pre_existing ne $value) { 4472 4473 # Here the new and old standardized values are the 4474 # same, but the non-standardized values aren't. If 4475 # replacing unconditionally, then replace 4476 if( $replace == $UNCONDITIONALLY) { 4477 $clean_insert = 0; 4478 } 4479 else { 4480 4481 # Here, are replacing conditionally. Decide to 4482 # replace or not based on which appears to look 4483 # the "nicest". If one is mixed case and the 4484 # other isn't, choose the mixed case one. 4485 my $new_mixed = $value =~ /[A-Z]/ 4486 && $value =~ /[a-z]/; 4487 my $old_mixed = $pre_existing =~ /[A-Z]/ 4488 && $pre_existing =~ /[a-z]/; 4489 4490 if ($old_mixed != $new_mixed) { 4491 $clean_insert = 0 if $new_mixed; 4492 if (main::DEBUG && $to_trace) { 4493 if ($clean_insert) { 4494 trace "Retaining $pre_existing over $value"; 4495 } 4496 else { 4497 trace "Replacing $pre_existing with $value"; 4498 } 4499 } 4500 } 4501 else { 4502 4503 # Here casing wasn't different between the two. 4504 # If one has hyphens or underscores and the 4505 # other doesn't, choose the one with the 4506 # punctuation. 4507 my $new_punct = $value =~ /[-_]/; 4508 my $old_punct = $pre_existing =~ /[-_]/; 4509 4510 if ($old_punct != $new_punct) { 4511 $clean_insert = 0 if $new_punct; 4512 if (main::DEBUG && $to_trace) { 4513 if ($clean_insert) { 4514 trace "Retaining $pre_existing over $value"; 4515 } 4516 else { 4517 trace "Replacing $pre_existing with $value"; 4518 } 4519 } 4520 } # else existing one is just as "good"; 4521 # retain it to save cycles. 4522 } 4523 } 4524 } 4525 } 4526 } 4527 } # End of loop looking for highest affected range. 4528 4529 # Here, $j points to one beyond the highest range that this insertion 4530 # affects (hence to beyond the range list if that range is the final 4531 # one in the range list). 4532 4533 # The splice length is all the affected ranges. Get it before 4534 # subtracting, for efficiency, so we don't have to later add 1. 4535 my $length = $j - $i; 4536 4537 $j--; # $j now points to the highest affected range. 4538 trace "Final affected range is $j: $r->[$j]" if main::DEBUG && $to_trace; 4539 4540 # Here, have taken care of $NO and $MULTIPLE_foo replaces. 4541 # $j points to the highest affected range. But it can be < $i or even 4542 # -1. These happen only if the insertion is entirely in the gap 4543 # between r[$i-1] and r[$i]. Here's why: j < i means that the j loop 4544 # above exited first time through with $end < $r->[$i]->start. (And 4545 # then we subtracted one from j) This implies also that $start < 4546 # $r->[$i]->start, but we know from above that $r->[$i-1]->end < 4547 # $start, so the entire input range is in the gap. 4548 if ($j < $i) { 4549 4550 # Here the entire input range is in the gap before $i. 4551 4552 if (main::DEBUG && $to_trace) { 4553 if ($i) { 4554 trace "Entire range is between $r->[$i-1] and $r->[$i]"; 4555 } 4556 else { 4557 trace "Entire range is before $r->[$i]"; 4558 } 4559 } 4560 return if $operation ne '+'; # Deletion of a non-existent range is 4561 # a no-op 4562 } 4563 else { 4564 4565 # Here part of the input range is not in the gap before $i. Thus, 4566 # there is at least one affected one, and $j points to the highest 4567 # such one. 4568 4569 # At this point, here is the situation: 4570 # This is not an insertion of a multiple, nor of tentative ($NO) 4571 # data. 4572 # $i points to the first element in the current range list that 4573 # may be affected by this operation. In fact, we know 4574 # that the range at $i is affected because we are in 4575 # the else branch of this 'if' 4576 # $j points to the highest affected range. 4577 # In other words, 4578 # r[$i-1]->end < $start <= r[$i]->end 4579 # And: 4580 # r[$i-1]->end < $start <= $end < r[$j+1]->start 4581 # 4582 # Also: 4583 # $clean_insert is a boolean which is set true if and only if 4584 # this is a "clean insertion", i.e., not a change nor a 4585 # deletion (multiple was handled above). 4586 4587 # We now have enough information to decide if this call is a no-op 4588 # or not. It is a no-op if this is an insertion of already 4589 # existing data. To be so, it must be contained entirely in one 4590 # range. 4591 4592 if (main::DEBUG && $to_trace && $clean_insert 4593 && $start >= $r->[$i]->start 4594 && $end <= $r->[$i]->end) 4595 { 4596 trace "no-op"; 4597 } 4598 return if $clean_insert 4599 && $start >= $r->[$i]->start 4600 && $end <= $r->[$i]->end; 4601 } 4602 4603 # Here, we know that some action will have to be taken. We have 4604 # calculated the offset and length (though adjustments may be needed) 4605 # for the splice. Now start constructing the replacement list. 4606 my @replacement; 4607 my $splice_start = $i; 4608 4609 my $extends_below; 4610 my $extends_above; 4611 4612 # See if should extend any adjacent ranges. 4613 if ($operation eq '-') { # Don't extend deletions 4614 $extends_below = $extends_above = 0; 4615 } 4616 else { # Here, should extend any adjacent ranges. See if there are 4617 # any. 4618 $extends_below = ($i > 0 4619 # can't extend unless adjacent 4620 && $r->[$i-1]->end == $start -1 4621 # can't extend unless are same standard value 4622 && $r->[$i-1]->standard_form eq $standard_form 4623 # can't extend unless share type 4624 && $r->[$i-1]->type == $type); 4625 $extends_above = ($j+1 < $range_list_size 4626 && $r->[$j+1]->start == $end +1 4627 && $r->[$j+1]->standard_form eq $standard_form 4628 && $r->[$j+1]->type == $type); 4629 } 4630 if ($extends_below && $extends_above) { # Adds to both 4631 $splice_start--; # start replace at element below 4632 $length += 2; # will replace on both sides 4633 trace "Extends both below and above ranges" if main::DEBUG && $to_trace; 4634 4635 # The result will fill in any gap, replacing both sides, and 4636 # create one large range. 4637 @replacement = Range->new($r->[$i-1]->start, 4638 $r->[$j+1]->end, 4639 Value => $value, 4640 Type => $type); 4641 } 4642 else { 4643 4644 # Here we know that the result won't just be the conglomeration of 4645 # a new range with both its adjacent neighbors. But it could 4646 # extend one of them. 4647 4648 if ($extends_below) { 4649 4650 # Here the new element adds to the one below, but not to the 4651 # one above. If inserting, and only to that one range, can 4652 # just change its ending to include the new one. 4653 if ($length == 0 && $clean_insert) { 4654 $r->[$i-1]->set_end($end); 4655 trace "inserted range extends range to below so it is now $r->[$i-1]" if main::DEBUG && $to_trace; 4656 return; 4657 } 4658 else { 4659 trace "Changing inserted range to start at ", sprintf("%04X", $r->[$i-1]->start), " instead of ", sprintf("%04X", $start) if main::DEBUG && $to_trace; 4660 $splice_start--; # start replace at element below 4661 $length++; # will replace the element below 4662 $start = $r->[$i-1]->start; 4663 } 4664 } 4665 elsif ($extends_above) { 4666 4667 # Here the new element adds to the one above, but not below. 4668 # Mirror the code above 4669 if ($length == 0 && $clean_insert) { 4670 $r->[$j+1]->set_start($start); 4671 trace "inserted range extends range to above so it is now $r->[$j+1]" if main::DEBUG && $to_trace; 4672 return; 4673 } 4674 else { 4675 trace "Changing inserted range to end at ", sprintf("%04X", $r->[$j+1]->end), " instead of ", sprintf("%04X", $end) if main::DEBUG && $to_trace; 4676 $length++; # will replace the element above 4677 $end = $r->[$j+1]->end; 4678 } 4679 } 4680 4681 trace "Range at $i is $r->[$i]" if main::DEBUG && $to_trace; 4682 4683 # Finally, here we know there will have to be a splice. 4684 # If the change or delete affects only the highest portion of the 4685 # first affected range, the range will have to be split. The 4686 # splice will remove the whole range, but will replace it by a new 4687 # range containing just the unaffected part. So, in this case, 4688 # add to the replacement list just this unaffected portion. 4689 if (! $extends_below 4690 && $start > $r->[$i]->start && $start <= $r->[$i]->end) 4691 { 4692 push @replacement, 4693 Range->new($r->[$i]->start, 4694 $start - 1, 4695 Value => $r->[$i]->value, 4696 Type => $r->[$i]->type); 4697 } 4698 4699 # In the case of an insert or change, but not a delete, we have to 4700 # put in the new stuff; this comes next. 4701 if ($operation eq '+') { 4702 push @replacement, Range->new($start, 4703 $end, 4704 Value => $value, 4705 Type => $type); 4706 } 4707 4708 trace "Range at $j is $r->[$j]" if main::DEBUG && $to_trace && $j != $i; 4709 #trace "$end >=", $r->[$j]->start, " && $end <", $r->[$j]->end if main::DEBUG && $to_trace; 4710 4711 # And finally, if we're changing or deleting only a portion of the 4712 # highest affected range, it must be split, as the lowest one was. 4713 if (! $extends_above 4714 && $j >= 0 # Remember that j can be -1 if before first 4715 # current element 4716 && $end >= $r->[$j]->start 4717 && $end < $r->[$j]->end) 4718 { 4719 push @replacement, 4720 Range->new($end + 1, 4721 $r->[$j]->end, 4722 Value => $r->[$j]->value, 4723 Type => $r->[$j]->type); 4724 } 4725 } 4726 4727 # And do the splice, as calculated above 4728 if (main::DEBUG && $to_trace) { 4729 trace "replacing $length element(s) at $i with "; 4730 foreach my $replacement (@replacement) { 4731 trace " $replacement"; 4732 } 4733 trace "Before splice:"; 4734 trace 'i-2=[', $i-2, ']', $r->[$i-2] if $i >= 2; 4735 trace 'i-1=[', $i-1, ']', $r->[$i-1] if $i >= 1; 4736 trace "i =[", $i, "]", $r->[$i]; 4737 trace 'i+1=[', $i+1, ']', $r->[$i+1] if $i < @$r - 1; 4738 trace 'i+2=[', $i+2, ']', $r->[$i+2] if $i < @$r - 2; 4739 } 4740 4741 my @return = splice @$r, $splice_start, $length, @replacement; 4742 4743 if (main::DEBUG && $to_trace) { 4744 trace "After splice:"; 4745 trace 'i-2=[', $i-2, ']', $r->[$i-2] if $i >= 2; 4746 trace 'i-1=[', $i-1, ']', $r->[$i-1] if $i >= 1; 4747 trace "i =[", $i, "]", $r->[$i]; 4748 trace 'i+1=[', $i+1, ']', $r->[$i+1] if $i < @$r - 1; 4749 trace 'i+2=[', $i+2, ']', $r->[$i+2] if $i < @$r - 2; 4750 trace "removed ", @return if @return; 4751 } 4752 4753 # An actual deletion could have changed the maximum in the list. 4754 # There was no deletion if the splice didn't return something, but 4755 # otherwise recalculate it. This is done too rarely to worry about 4756 # performance. 4757 if ($operation eq '-' && @return) { 4758 if (@$r) { 4759 $max{$addr} = $r->[-1]->end; 4760 } 4761 else { # Now empty 4762 $max{$addr} = $max_init; 4763 } 4764 } 4765 return @return; 4766 } 4767 4768 sub reset_each_range($self) { # reset the iterator for each_range(); 4769 no overloading; 4770 undef $each_range_iterator{pack 'J', $self}; 4771 return; 4772 } 4773 4774 sub each_range($self) { 4775 # Iterate over each range in a range list. Results are undefined if 4776 # the range list is changed during the iteration. 4777 my $addr = do { no overloading; pack 'J', $self; }; 4778 4779 return if $self->is_empty; 4780 4781 $each_range_iterator{$addr} = -1 4782 if ! defined $each_range_iterator{$addr}; 4783 $each_range_iterator{$addr}++; 4784 return $ranges{$addr}->[$each_range_iterator{$addr}] 4785 if $each_range_iterator{$addr} < @{$ranges{$addr}}; 4786 undef $each_range_iterator{$addr}; 4787 return; 4788 } 4789 4790 sub count($self) { # Returns count of code points in range list 4791 my $addr = do { no overloading; pack 'J', $self; }; 4792 4793 my $count = 0; 4794 foreach my $range (@{$ranges{$addr}}) { 4795 $count += $range->end - $range->start + 1; 4796 } 4797 return $count; 4798 } 4799 4800 sub delete_range($self, $start, $end) { # Delete a range 4801 return $self->_add_delete('-', $start, $end, ""); 4802 } 4803 4804 sub is_empty($self) { # Returns boolean as to if a range list is empty 4805 no overloading; 4806 return scalar @{$ranges{pack 'J', $self}} == 0; 4807 } 4808 4809 sub hash($self) { 4810 # Quickly returns a scalar suitable for separating tables into 4811 # buckets, i.e. it is a hash function of the contents of a table, so 4812 # there are relatively few conflicts. 4813 my $addr = do { no overloading; pack 'J', $self; }; 4814 4815 # These are quickly computable. Return looks like 'min..max;count' 4816 return $self->min . "..$max{$addr};" . scalar @{$ranges{$addr}}; 4817 } 4818} # End closure for _Range_List_Base 4819 4820package Range_List; 4821use parent '-norequire', '_Range_List_Base'; 4822 4823use warnings; 4824use strict; 4825 4826use feature 'signatures'; 4827no warnings 'experimental::signatures'; 4828 4829# A Range_List is a range list for match tables; i.e. the range values are 4830# not significant. Thus a number of operations can be safely added to it, 4831# such as inversion, intersection. Note that union is also an unsafe 4832# operation when range values are cared about, and that method is in the base 4833# class, not here. But things are set up so that that method is callable only 4834# during initialization. Only in this derived class, is there an operation 4835# that combines two tables. A Range_Map can thus be used to initialize a 4836# Range_List, and its mappings will be in the list, but are not significant to 4837# this class. 4838 4839sub trace { return main::trace(@_); } 4840 4841{ # Closure 4842 4843 use overload 4844 fallback => 0, 4845 '+' => sub { my $self = shift; 4846 my $other = shift; 4847 4848 return $self->_union($other) 4849 }, 4850 '+=' => sub { my $self = shift; 4851 my $other = shift; 4852 my $reversed = shift; 4853 4854 if ($reversed) { 4855 Carp::my_carp_bug("Bad news. Can't cope with '" 4856 . ref($other) 4857 . ' += ' 4858 . ref($self) 4859 . "'. undef returned."); 4860 return; 4861 } 4862 4863 return $self->_union($other) 4864 }, 4865 '&' => sub { my $self = shift; 4866 my $other = shift; 4867 4868 return $self->_intersect($other, 0); 4869 }, 4870 '&=' => sub { my $self = shift; 4871 my $other = shift; 4872 my $reversed = shift; 4873 4874 if ($reversed) { 4875 Carp::my_carp_bug("Bad news. Can't cope with '" 4876 . ref($other) 4877 . ' &= ' 4878 . ref($self) 4879 . "'. undef returned."); 4880 return; 4881 } 4882 4883 return $self->_intersect($other, 0); 4884 }, 4885 '~' => "_invert", 4886 '-' => "_subtract", 4887 ; 4888 4889 sub _invert($self, @) { 4890 # Returns a new Range_List that gives all code points not in $self. 4891 my $new = Range_List->new; 4892 4893 # Go through each range in the table, finding the gaps between them 4894 my $max = -1; # Set so no gap before range beginning at 0 4895 for my $range ($self->ranges) { 4896 my $start = $range->start; 4897 my $end = $range->end; 4898 4899 # If there is a gap before this range, the inverse will contain 4900 # that gap. 4901 if ($start > $max + 1) { 4902 $new->add_range($max + 1, $start - 1); 4903 } 4904 $max = $end; 4905 } 4906 4907 # And finally, add the gap from the end of the table to the max 4908 # possible code point 4909 if ($max < $MAX_WORKING_CODEPOINT) { 4910 $new->add_range($max + 1, $MAX_WORKING_CODEPOINT); 4911 } 4912 return $new; 4913 } 4914 4915 sub _subtract($self, $other, $reversed=0) { 4916 # Returns a new Range_List with the argument deleted from it. The 4917 # argument can be a single code point, a range, or something that has 4918 # a range, with the _range_list() method on it returning them 4919 4920 if ($reversed) { 4921 Carp::my_carp_bug("Bad news. Can't cope with '" 4922 . ref($other) 4923 . ' - ' 4924 . ref($self) 4925 . "'. undef returned."); 4926 return; 4927 } 4928 4929 my $new = Range_List->new(Initialize => $self); 4930 4931 if (! ref $other) { # Single code point 4932 $new->delete_range($other, $other); 4933 } 4934 elsif ($other->isa('Range')) { 4935 $new->delete_range($other->start, $other->end); 4936 } 4937 elsif ($other->can('_range_list')) { 4938 foreach my $range ($other->_range_list->ranges) { 4939 $new->delete_range($range->start, $range->end); 4940 } 4941 } 4942 else { 4943 Carp::my_carp_bug("Can't cope with a " 4944 . ref($other) 4945 . " argument to '-'. Subtraction ignored." 4946 ); 4947 return $self; 4948 } 4949 4950 return $new; 4951 } 4952 4953 sub _intersect($a_object, $b_object, $check_if_overlapping=0) { 4954 # Returns either a boolean giving whether the two inputs' range lists 4955 # intersect (overlap), or a new Range_List containing the intersection 4956 # of the two lists. The optional final parameter being true indicates 4957 # to do the check instead of the intersection. 4958 4959 if (! defined $b_object) { 4960 my $message = ""; 4961 $message .= $a_object->_owner_name_of if defined $a_object; 4962 Carp::my_carp_bug($message .= "Called with undefined value. Intersection not done."); 4963 return; 4964 } 4965 4966 # a & b = !(!a | !b), or in our terminology = ~ ( ~a + -b ) 4967 # Thus the intersection could be much more simply be written: 4968 # return ~(~$a_object + ~$b_object); 4969 # But, this is slower, and when taking the inverse of a large 4970 # range_size_1 table, back when such tables were always stored that 4971 # way, it became prohibitively slow, hence the code was changed to the 4972 # below 4973 4974 if ($b_object->isa('Range')) { 4975 $b_object = Range_List->new(Initialize => $b_object, 4976 Owner => $a_object->_owner_name_of); 4977 } 4978 $b_object = $b_object->_range_list if $b_object->can('_range_list'); 4979 4980 my @a_ranges = $a_object->ranges; 4981 my @b_ranges = $b_object->ranges; 4982 4983 #local $to_trace = 1 if main::DEBUG; 4984 trace "intersecting $a_object with ", scalar @a_ranges, "ranges and $b_object with", scalar @b_ranges, " ranges" if main::DEBUG && $to_trace; 4985 4986 # Start with the first range in each list 4987 my $a_i = 0; 4988 my $range_a = $a_ranges[$a_i]; 4989 my $b_i = 0; 4990 my $range_b = $b_ranges[$b_i]; 4991 4992 my $new = __PACKAGE__->new(Owner => $a_object->_owner_name_of) 4993 if ! $check_if_overlapping; 4994 4995 # If either list is empty, there is no intersection and no overlap 4996 if (! defined $range_a || ! defined $range_b) { 4997 return $check_if_overlapping ? 0 : $new; 4998 } 4999 trace "range_a[$a_i]=$range_a; range_b[$b_i]=$range_b" if main::DEBUG && $to_trace; 5000 5001 # Otherwise, must calculate the intersection/overlap. Start with the 5002 # very first code point in each list 5003 my $a = $range_a->start; 5004 my $b = $range_b->start; 5005 5006 # Loop through all the ranges of each list; in each iteration, $a and 5007 # $b are the current code points in their respective lists 5008 while (1) { 5009 5010 # If $a and $b are the same code point, ... 5011 if ($a == $b) { 5012 5013 # it means the lists overlap. If just checking for overlap 5014 # know the answer now, 5015 return 1 if $check_if_overlapping; 5016 5017 # The intersection includes this code point plus anything else 5018 # common to both current ranges. 5019 my $start = $a; 5020 my $end = main::min($range_a->end, $range_b->end); 5021 if (! $check_if_overlapping) { 5022 trace "adding intersection range ", sprintf("%04X", $start) . ".." . sprintf("%04X", $end) if main::DEBUG && $to_trace; 5023 $new->add_range($start, $end); 5024 } 5025 5026 # Skip ahead to the end of the current intersect 5027 $a = $b = $end; 5028 5029 # If the current intersect ends at the end of either range (as 5030 # it must for at least one of them), the next possible one 5031 # will be the beginning code point in it's list's next range. 5032 if ($a == $range_a->end) { 5033 $range_a = $a_ranges[++$a_i]; 5034 last unless defined $range_a; 5035 $a = $range_a->start; 5036 } 5037 if ($b == $range_b->end) { 5038 $range_b = $b_ranges[++$b_i]; 5039 last unless defined $range_b; 5040 $b = $range_b->start; 5041 } 5042 5043 trace "range_a[$a_i]=$range_a; range_b[$b_i]=$range_b" if main::DEBUG && $to_trace; 5044 } 5045 elsif ($a < $b) { 5046 5047 # Not equal, but if the range containing $a encompasses $b, 5048 # change $a to be the middle of the range where it does equal 5049 # $b, so the next iteration will get the intersection 5050 if ($range_a->end >= $b) { 5051 $a = $b; 5052 } 5053 else { 5054 5055 # Here, the current range containing $a is entirely below 5056 # $b. Go try to find a range that could contain $b. 5057 $a_i = $a_object->_search_ranges($b); 5058 5059 # If no range found, quit. 5060 last unless defined $a_i; 5061 5062 # The search returns $a_i, such that 5063 # range_a[$a_i-1]->end < $b <= range_a[$a_i]->end 5064 # Set $a to the beginning of this new range, and repeat. 5065 $range_a = $a_ranges[$a_i]; 5066 $a = $range_a->start; 5067 } 5068 } 5069 else { # Here, $b < $a. 5070 5071 # Mirror image code to the leg just above 5072 if ($range_b->end >= $a) { 5073 $b = $a; 5074 } 5075 else { 5076 $b_i = $b_object->_search_ranges($a); 5077 last unless defined $b_i; 5078 $range_b = $b_ranges[$b_i]; 5079 $b = $range_b->start; 5080 } 5081 } 5082 } # End of looping through ranges. 5083 5084 # Intersection fully computed, or now know that there is no overlap 5085 return $check_if_overlapping ? 0 : $new; 5086 } 5087 5088 sub overlaps($self, $other) { 5089 # Returns boolean giving whether the two arguments overlap somewhere 5090 return $self->_intersect($other, 1); 5091 } 5092 5093 sub add_range($self, $start, $end) { 5094 # Add a range to the list. 5095 return $self->_add_delete('+', $start, $end, ""); 5096 } 5097 5098 sub matches_identically_to($self, $other) { 5099 # Return a boolean as to whether or not two Range_Lists match identical 5100 # sets of code points. 5101 # These are ordered in increasing real time to figure out (at least 5102 # until a patch changes that and doesn't change this) 5103 return 0 if $self->max != $other->max; 5104 return 0 if $self->min != $other->min; 5105 return 0 if $self->range_count != $other->range_count; 5106 return 0 if $self->count != $other->count; 5107 5108 # Here they could be identical because all the tests above passed. 5109 # The loop below is somewhat simpler since we know they have the same 5110 # number of elements. Compare range by range, until reach the end or 5111 # find something that differs. 5112 my @a_ranges = $self->ranges; 5113 my @b_ranges = $other->ranges; 5114 for my $i (0 .. @a_ranges - 1) { 5115 my $a = $a_ranges[$i]; 5116 my $b = $b_ranges[$i]; 5117 trace "self $a; other $b" if main::DEBUG && $to_trace; 5118 return 0 if ! defined $b 5119 || $a->start != $b->start 5120 || $a->end != $b->end; 5121 } 5122 return 1; 5123 } 5124 5125 sub is_code_point_usable($code, $try_hard) { 5126 # This used only for making the test script. See if the input 5127 # proposed trial code point is one that Perl will handle. If second 5128 # parameter is 0, it won't select some code points for various 5129 # reasons, noted below. 5130 return 0 if $code < 0; # Never use a negative 5131 5132 # shun null. I'm (khw) not sure why this was done, but NULL would be 5133 # the character very frequently used. 5134 return $try_hard if $code == 0x0000; 5135 5136 # shun non-character code points. 5137 return $try_hard if $code >= 0xFDD0 && $code <= 0xFDEF; 5138 return $try_hard if ($code & 0xFFFE) == 0xFFFE; # includes FFFF 5139 5140 return $try_hard if $code > $MAX_UNICODE_CODEPOINT; # keep in range 5141 return $try_hard if $code >= 0xD800 && $code <= 0xDFFF; # no surrogate 5142 5143 return 1; 5144 } 5145 5146 sub get_valid_code_point($self) { 5147 # Return a code point that's part of the range list. Returns nothing 5148 # if the table is empty or we can't find a suitable code point. This 5149 # used only for making the test script. 5150 my $addr = do { no overloading; pack 'J', $self; }; 5151 5152 # On first pass, don't choose less desirable code points; if no good 5153 # one is found, repeat, allowing a less desirable one to be selected. 5154 for my $try_hard (0, 1) { 5155 5156 # Look through all the ranges for a usable code point. 5157 for my $set (reverse $self->ranges) { 5158 5159 # Try the edge cases first, starting with the end point of the 5160 # range. 5161 my $end = $set->end; 5162 return $end if is_code_point_usable($end, $try_hard); 5163 $end = $MAX_UNICODE_CODEPOINT + 1 if $end > $MAX_UNICODE_CODEPOINT; 5164 5165 # End point didn't, work. Start at the beginning and try 5166 # every one until find one that does work. 5167 for my $trial ($set->start .. $end - 1) { 5168 return $trial if is_code_point_usable($trial, $try_hard); 5169 } 5170 } 5171 } 5172 return (); # If none found, give up. 5173 } 5174 5175 sub get_invalid_code_point($self) { 5176 # Return a code point that's not part of the table. Returns nothing 5177 # if the table covers all code points or a suitable code point can't 5178 # be found. This used only for making the test script. 5179 5180 # Just find a valid code point of the inverse, if any. 5181 return Range_List->new(Initialize => ~ $self)->get_valid_code_point; 5182 } 5183} # end closure for Range_List 5184 5185package Range_Map; 5186use parent '-norequire', '_Range_List_Base'; 5187 5188use strict; 5189use warnings; 5190 5191use feature 'signatures'; 5192no warnings 'experimental::signatures'; 5193 5194# A Range_Map is a range list in which the range values (called maps) are 5195# significant, and hence shouldn't be manipulated by our other code, which 5196# could be ambiguous or lose things. For example, in taking the union of two 5197# lists, which share code points, but which have differing values, which one 5198# has precedence in the union? 5199# It turns out that these operations aren't really necessary for map tables, 5200# and so this class was created to make sure they aren't accidentally 5201# applied to them. 5202 5203{ # Closure 5204 5205 sub add_map($self, @add) { 5206 # Add a range containing a mapping value to the list 5207 return $self->_add_delete('+', @add); 5208 } 5209 5210 sub replace_map($self, @list) { 5211 # Replace a range 5212 return $self->_add_delete('+', @list, Replace => $UNCONDITIONALLY); 5213 } 5214 5215 sub add_duplicate { 5216 # Adds entry to a range list which can duplicate an existing entry 5217 5218 my $self = shift; 5219 my $code_point = shift; 5220 my $value = shift; 5221 my %args = @_; 5222 my $replace = delete $args{'Replace'} // $MULTIPLE_BEFORE; 5223 Carp::carp_extra_args(\%args) if main::DEBUG && %args; 5224 5225 return $self->add_map($code_point, $code_point, 5226 $value, Replace => $replace); 5227 } 5228} # End of closure for package Range_Map 5229 5230package _Base_Table; 5231 5232use strict; 5233use warnings; 5234 5235use feature 'signatures'; 5236no warnings 'experimental::signatures'; 5237 5238# A table is the basic data structure that gets written out into a file for 5239# use by the Perl core. This is the abstract base class implementing the 5240# common elements from the derived ones. A list of the methods to be 5241# furnished by an implementing class is just after the constructor. 5242 5243sub standardize { return main::standardize($_[0]); } 5244sub trace { return main::trace(@_); } 5245 5246{ # Closure 5247 5248 main::setup_package(); 5249 5250 my %range_list; 5251 # Object containing the ranges of the table. 5252 main::set_access('range_list', \%range_list, 'p_r', 'p_s'); 5253 5254 my %full_name; 5255 # The full table name. 5256 main::set_access('full_name', \%full_name, 'r'); 5257 5258 my %name; 5259 # The table name, almost always shorter 5260 main::set_access('name', \%name, 'r'); 5261 5262 my %short_name; 5263 # The shortest of all the aliases for this table, with underscores removed 5264 main::set_access('short_name', \%short_name); 5265 5266 my %nominal_short_name_length; 5267 # The length of short_name before removing underscores 5268 main::set_access('nominal_short_name_length', 5269 \%nominal_short_name_length); 5270 5271 my %complete_name; 5272 # The complete name, including property. 5273 main::set_access('complete_name', \%complete_name, 'r'); 5274 5275 my %property; 5276 # Parent property this table is attached to. 5277 main::set_access('property', \%property, 'r'); 5278 5279 my %aliases; 5280 # Ordered list of alias objects of the table's name. The first ones in 5281 # the list are output first in comments 5282 main::set_access('aliases', \%aliases, 'readable_array'); 5283 5284 my %comment; 5285 # A comment associated with the table for human readers of the files 5286 main::set_access('comment', \%comment, 's'); 5287 5288 my %description; 5289 # A comment giving a short description of the table's meaning for human 5290 # readers of the files. 5291 main::set_access('description', \%description, 'readable_array'); 5292 5293 my %note; 5294 # A comment giving a short note about the table for human readers of the 5295 # files. 5296 main::set_access('note', \%note, 'readable_array'); 5297 5298 my %fate; 5299 # Enum; there are a number of possibilities for what happens to this 5300 # table: it could be normal, or suppressed, or not for external use. See 5301 # values at definition for $SUPPRESSED. 5302 main::set_access('fate', \%fate, 'r'); 5303 5304 my %find_table_from_alias; 5305 # The parent property passes this pointer to a hash which this class adds 5306 # all its aliases to, so that the parent can quickly take an alias and 5307 # find this table. 5308 main::set_access('find_table_from_alias', \%find_table_from_alias, 'p_r'); 5309 5310 my %locked; 5311 # After this table is made equivalent to another one; we shouldn't go 5312 # changing the contents because that could mean it's no longer equivalent 5313 main::set_access('locked', \%locked, 'r'); 5314 5315 my %file_path; 5316 # This gives the final path to the file containing the table. Each 5317 # directory in the path is an element in the array 5318 main::set_access('file_path', \%file_path, 'readable_array'); 5319 5320 my %status; 5321 # What is the table's status, normal, $OBSOLETE, etc. Enum 5322 main::set_access('status', \%status, 'r'); 5323 5324 my %status_info; 5325 # A comment about its being obsolete, or whatever non normal status it has 5326 main::set_access('status_info', \%status_info, 'r'); 5327 5328 my %caseless_equivalent; 5329 # The table this is equivalent to under /i matching, if any. 5330 main::set_access('caseless_equivalent', \%caseless_equivalent, 'r', 's'); 5331 5332 my %range_size_1; 5333 # Is the table to be output with each range only a single code point? 5334 # This is done to avoid breaking existing code that may have come to rely 5335 # on this behavior in previous versions of this program.) 5336 main::set_access('range_size_1', \%range_size_1, 'r', 's'); 5337 5338 my %perl_extension; 5339 # A boolean set iff this table is a Perl extension to the Unicode 5340 # standard. 5341 main::set_access('perl_extension', \%perl_extension, 'r'); 5342 5343 my %output_range_counts; 5344 # A boolean set iff this table is to have comments written in the 5345 # output file that contain the number of code points in the range. 5346 # The constructor can override the global flag of the same name. 5347 main::set_access('output_range_counts', \%output_range_counts, 'r'); 5348 5349 my %write_as_invlist; 5350 # A boolean set iff the output file for this table is to be in the form of 5351 # an inversion list/map. 5352 main::set_access('write_as_invlist', \%write_as_invlist, 'r'); 5353 5354 my %format; 5355 # The format of the entries of the table. This is calculated from the 5356 # data in the table (or passed in the constructor). This is an enum e.g., 5357 # $STRING_FORMAT. It is marked protected as it should not be generally 5358 # used to override calculations. 5359 main::set_access('format', \%format, 'r', 'p_s'); 5360 5361 my %has_dependency; 5362 # A boolean that gives whether some other table in this property is 5363 # defined as the complement of this table. This is a crude, but currently 5364 # sufficient, mechanism to make this table not get destroyed before what 5365 # is dependent on it is. Other dependencies could be added, so the name 5366 # was chosen to reflect a more general situation than actually is 5367 # currently the case. 5368 main::set_access('has_dependency', \%has_dependency, 'r', 's'); 5369 5370 sub new { 5371 # All arguments are key => value pairs, which you can see below, most 5372 # of which match fields documented above. Otherwise: Re_Pod_Entry, 5373 # OK_as_Filename, and Fuzzy apply to the names of the table, and are 5374 # documented in the Alias package 5375 5376 return Carp::carp_too_few_args(\@_, 2) if main::DEBUG && @_ < 2; 5377 5378 my $class = shift; 5379 5380 my $self = bless \do { my $anonymous_scalar }, $class; 5381 my $addr = do { no overloading; pack 'J', $self; }; 5382 5383 my %args = @_; 5384 5385 $name{$addr} = delete $args{'Name'}; 5386 $find_table_from_alias{$addr} = delete $args{'_Alias_Hash'}; 5387 $full_name{$addr} = delete $args{'Full_Name'}; 5388 my $complete_name = $complete_name{$addr} 5389 = delete $args{'Complete_Name'}; 5390 $format{$addr} = delete $args{'Format'}; 5391 $output_range_counts{$addr} = delete $args{'Output_Range_Counts'}; 5392 $property{$addr} = delete $args{'_Property'}; 5393 $range_list{$addr} = delete $args{'_Range_List'}; 5394 $status{$addr} = delete $args{'Status'} || $NORMAL; 5395 $status_info{$addr} = delete $args{'_Status_Info'} || ""; 5396 $range_size_1{$addr} = delete $args{'Range_Size_1'} || 0; 5397 $caseless_equivalent{$addr} = delete $args{'Caseless_Equivalent'} || 0; 5398 $fate{$addr} = delete $args{'Fate'} || $ORDINARY; 5399 $write_as_invlist{$addr} = delete $args{'Write_As_Invlist'};# No default 5400 my $ucd = delete $args{'UCD'}; 5401 5402 my $description = delete $args{'Description'}; 5403 my $ok_as_filename = delete $args{'OK_as_Filename'}; 5404 my $loose_match = delete $args{'Fuzzy'}; 5405 my $note = delete $args{'Note'}; 5406 my $make_re_pod_entry = delete $args{'Re_Pod_Entry'}; 5407 my $perl_extension = delete $args{'Perl_Extension'}; 5408 my $suppression_reason = delete $args{'Suppression_Reason'}; 5409 5410 # Shouldn't have any left over 5411 Carp::carp_extra_args(\%args) if main::DEBUG && %args; 5412 5413 # Can't use || above because conceivably the name could be 0, and 5414 # can't use // operator in case this program gets used in Perl 5.8 5415 $full_name{$addr} = $name{$addr} if ! defined $full_name{$addr}; 5416 $output_range_counts{$addr} = $output_range_counts if 5417 ! defined $output_range_counts{$addr}; 5418 5419 $aliases{$addr} = [ ]; 5420 $comment{$addr} = [ ]; 5421 $description{$addr} = [ ]; 5422 $note{$addr} = [ ]; 5423 $file_path{$addr} = [ ]; 5424 $locked{$addr} = ""; 5425 $has_dependency{$addr} = 0; 5426 5427 push @{$description{$addr}}, $description if $description; 5428 push @{$note{$addr}}, $note if $note; 5429 5430 if ($fate{$addr} == $PLACEHOLDER) { 5431 5432 # A placeholder table doesn't get documented, is a perl extension, 5433 # and quite likely will be empty 5434 $make_re_pod_entry = 0 if ! defined $make_re_pod_entry; 5435 $perl_extension = 1 if ! defined $perl_extension; 5436 $ucd = 0 if ! defined $ucd; 5437 push @tables_that_may_be_empty, $complete_name{$addr}; 5438 $self->add_comment(<<END); 5439This is a placeholder because it is not in Version $string_version of Unicode, 5440but is needed by the Perl core to work gracefully. Because it is not in this 5441version of Unicode, it will not be listed in $pod_file.pod 5442END 5443 } 5444 elsif (exists $why_suppressed{$complete_name} 5445 # Don't suppress if overridden 5446 && ! grep { $_ eq $complete_name{$addr} } 5447 @output_mapped_properties) 5448 { 5449 $fate{$addr} = $SUPPRESSED; 5450 } 5451 elsif ($fate{$addr} == $SUPPRESSED) { 5452 Carp::my_carp_bug("Need reason for suppressing") unless $suppression_reason; 5453 # Though currently unused 5454 } 5455 elsif ($suppression_reason) { 5456 Carp::my_carp_bug("A reason was given for suppressing, but not suppressed"); 5457 } 5458 5459 # If hasn't set its status already, see if it is on one of the 5460 # lists of properties or tables that have particular statuses; if 5461 # not, is normal. The lists are prioritized so the most serious 5462 # ones are checked first 5463 if (! $status{$addr}) { 5464 if (exists $why_deprecated{$complete_name}) { 5465 $status{$addr} = $DEPRECATED; 5466 } 5467 elsif (exists $why_stabilized{$complete_name}) { 5468 $status{$addr} = $STABILIZED; 5469 } 5470 elsif (exists $why_obsolete{$complete_name}) { 5471 $status{$addr} = $OBSOLETE; 5472 } 5473 5474 # Existence above doesn't necessarily mean there is a message 5475 # associated with it. Use the most serious message. 5476 if ($status{$addr}) { 5477 if ($why_deprecated{$complete_name}) { 5478 $status_info{$addr} 5479 = $why_deprecated{$complete_name}; 5480 } 5481 elsif ($why_stabilized{$complete_name}) { 5482 $status_info{$addr} 5483 = $why_stabilized{$complete_name}; 5484 } 5485 elsif ($why_obsolete{$complete_name}) { 5486 $status_info{$addr} 5487 = $why_obsolete{$complete_name}; 5488 } 5489 } 5490 } 5491 5492 $perl_extension{$addr} = $perl_extension || 0; 5493 5494 # Don't list a property by default that is internal only 5495 if ($fate{$addr} > $MAP_PROXIED) { 5496 $make_re_pod_entry = 0 if ! defined $make_re_pod_entry; 5497 $ucd = 0 if ! defined $ucd; 5498 } 5499 else { 5500 $ucd = 1 if ! defined $ucd; 5501 } 5502 5503 # By convention what typically gets printed only or first is what's 5504 # first in the list, so put the full name there for good output 5505 # clarity. Other routines rely on the full name being first on the 5506 # list 5507 $self->add_alias($full_name{$addr}, 5508 OK_as_Filename => $ok_as_filename, 5509 Fuzzy => $loose_match, 5510 Re_Pod_Entry => $make_re_pod_entry, 5511 Status => $status{$addr}, 5512 UCD => $ucd, 5513 ); 5514 5515 # Then comes the other name, if meaningfully different. 5516 if (standardize($full_name{$addr}) ne standardize($name{$addr})) { 5517 $self->add_alias($name{$addr}, 5518 OK_as_Filename => $ok_as_filename, 5519 Fuzzy => $loose_match, 5520 Re_Pod_Entry => $make_re_pod_entry, 5521 Status => $status{$addr}, 5522 UCD => $ucd, 5523 ); 5524 } 5525 5526 return $self; 5527 } 5528 5529 # Here are the methods that are required to be defined by any derived 5530 # class 5531 for my $sub (qw( 5532 handle_special_range 5533 append_to_body 5534 pre_body 5535 )) 5536 # write() knows how to write out normal ranges, but it calls 5537 # handle_special_range() when it encounters a non-normal one. 5538 # append_to_body() is called by it after it has handled all 5539 # ranges to add anything after the main portion of the table. 5540 # And finally, pre_body() is called after all this to build up 5541 # anything that should appear before the main portion of the 5542 # table. Doing it this way allows things in the middle to 5543 # affect what should appear before the main portion of the 5544 # table. 5545 { 5546 no strict "refs"; 5547 *$sub = sub { 5548 Carp::my_carp_bug( __LINE__ 5549 . ": Must create method '$sub()' for " 5550 . ref shift); 5551 return; 5552 } 5553 } 5554 5555 use overload 5556 fallback => 0, 5557 "." => \&main::_operator_dot, 5558 ".=" => \&main::_operator_dot_equal, 5559 '!=' => \&main::_operator_not_equal, 5560 '==' => \&main::_operator_equal, 5561 ; 5562 5563 sub ranges { 5564 # Returns the array of ranges associated with this table. 5565 5566 no overloading; 5567 return $range_list{pack 'J', shift}->ranges; 5568 } 5569 5570 sub add_alias { 5571 # Add a synonym for this table. 5572 5573 return Carp::carp_too_few_args(\@_, 3) if main::DEBUG && @_ < 3; 5574 5575 my $self = shift; 5576 my $name = shift; # The name to add. 5577 my $pointer = shift; # What the alias hash should point to. For 5578 # map tables, this is the parent property; 5579 # for match tables, it is the table itself. 5580 5581 my %args = @_; 5582 my $loose_match = delete $args{'Fuzzy'}; 5583 5584 my $ok_as_filename = delete $args{'OK_as_Filename'}; 5585 $ok_as_filename = 1 unless defined $ok_as_filename; 5586 5587 # An internal name does not get documented, unless overridden by the 5588 # input; same for making tests for it. 5589 my $status = delete $args{'Status'} || (($name =~ /^_/) 5590 ? $INTERNAL_ALIAS 5591 : $NORMAL); 5592 my $make_re_pod_entry = delete $args{'Re_Pod_Entry'} 5593 // (($status ne $INTERNAL_ALIAS) 5594 ? (($name =~ /^_/) ? $NO : $YES) 5595 : $NO); 5596 my $ucd = delete $args{'UCD'} // (($name =~ /^_/) ? 0 : 1); 5597 5598 Carp::carp_extra_args(\%args) if main::DEBUG && %args; 5599 5600 # Capitalize the first letter of the alias unless it is one of the CJK 5601 # ones which specifically begins with a lower 'k'. Do this because 5602 # Unicode has varied whether they capitalize first letters or not, and 5603 # have later changed their minds and capitalized them, but not the 5604 # other way around. So do it always and avoid changes from release to 5605 # release 5606 $name = ucfirst($name) unless $name =~ /^k[A-Z]/; 5607 5608 my $addr = do { no overloading; pack 'J', $self; }; 5609 5610 # Figure out if should be loosely matched if not already specified. 5611 if (! defined $loose_match) { 5612 5613 # Is a loose_match if isn't null, and doesn't begin with an 5614 # underscore and isn't just a number 5615 if ($name ne "" 5616 && substr($name, 0, 1) ne '_' 5617 && $name !~ qr{^[0-9_.+-/]+$}) 5618 { 5619 $loose_match = 1; 5620 } 5621 else { 5622 $loose_match = 0; 5623 } 5624 } 5625 5626 # If this alias has already been defined, do nothing. 5627 return if defined $find_table_from_alias{$addr}->{$name}; 5628 5629 # That includes if it is standardly equivalent to an existing alias, 5630 # in which case, add this name to the list, so won't have to search 5631 # for it again. 5632 my $standard_name = main::standardize($name); 5633 if (defined $find_table_from_alias{$addr}->{$standard_name}) { 5634 $find_table_from_alias{$addr}->{$name} 5635 = $find_table_from_alias{$addr}->{$standard_name}; 5636 return; 5637 } 5638 5639 # Set the index hash for this alias for future quick reference. 5640 $find_table_from_alias{$addr}->{$name} = $pointer; 5641 $find_table_from_alias{$addr}->{$standard_name} = $pointer; 5642 local $to_trace = 0 if main::DEBUG; 5643 trace "adding alias $name to $pointer" if main::DEBUG && $to_trace; 5644 trace "adding alias $standard_name to $pointer" if main::DEBUG && $to_trace; 5645 5646 5647 # Put the new alias at the end of the list of aliases unless the final 5648 # element begins with an underscore (meaning it is for internal perl 5649 # use) or is all numeric, in which case, put the new one before that 5650 # one. This floats any all-numeric or underscore-beginning aliases to 5651 # the end. This is done so that they are listed last in output lists, 5652 # to encourage the user to use a better name (either more descriptive 5653 # or not an internal-only one) instead. This ordering is relied on 5654 # implicitly elsewhere in this program, like in short_name() 5655 my $list = $aliases{$addr}; 5656 my $insert_position = (@$list == 0 5657 || (substr($list->[-1]->name, 0, 1) ne '_' 5658 && $list->[-1]->name =~ /\D/)) 5659 ? @$list 5660 : @$list - 1; 5661 splice @$list, 5662 $insert_position, 5663 0, 5664 Alias->new($name, $loose_match, $make_re_pod_entry, 5665 $ok_as_filename, $status, $ucd); 5666 5667 # This name may be shorter than any existing ones, so clear the cache 5668 # of the shortest, so will have to be recalculated. 5669 no overloading; 5670 undef $short_name{pack 'J', $self}; 5671 return; 5672 } 5673 5674 sub short_name($self, $nominal_length_ptr=undef) { 5675 # Returns a name suitable for use as the base part of a file name. 5676 # That is, shorter wins. It can return undef if there is no suitable 5677 # name. The name has all non-essential underscores removed. 5678 5679 # The optional second parameter is a reference to a scalar in which 5680 # this routine will store the length the returned name had before the 5681 # underscores were removed, or undef if the return is undef. 5682 5683 # The shortest name can change if new aliases are added. So using 5684 # this should be deferred until after all these are added. The code 5685 # that does that should clear this one's cache. 5686 # Any name with alphabetics is preferred over an all numeric one, even 5687 # if longer. 5688 5689 my $addr = do { no overloading; pack 'J', $self; }; 5690 5691 # For efficiency, don't recalculate, but this means that adding new 5692 # aliases could change what the shortest is, so the code that does 5693 # that needs to undef this. 5694 if (defined $short_name{$addr}) { 5695 if ($nominal_length_ptr) { 5696 $$nominal_length_ptr = $nominal_short_name_length{$addr}; 5697 } 5698 return $short_name{$addr}; 5699 } 5700 5701 # Look at each alias 5702 my $is_last_resort = 0; 5703 my $deprecated_or_discouraged 5704 = qr/ ^ (?: $DEPRECATED | $DISCOURAGED ) $/x; 5705 foreach my $alias ($self->aliases()) { 5706 5707 # Don't use an alias that isn't ok to use for an external name. 5708 next if ! $alias->ok_as_filename; 5709 5710 my $name = main::Standardize($alias->name); 5711 trace $self, $name if main::DEBUG && $to_trace; 5712 5713 # Take the first one, or any non-deprecated non-discouraged one 5714 # over one that is, or a shorter one that isn't numeric. This 5715 # relies on numeric aliases always being last in the array 5716 # returned by aliases(). Any alpha one will have precedence. 5717 if ( ! defined $short_name{$addr} 5718 || ( $is_last_resort 5719 && $alias->status !~ $deprecated_or_discouraged) 5720 || ($name =~ /\D/ 5721 && length($name) < length($short_name{$addr}))) 5722 { 5723 # Remove interior underscores. 5724 ($short_name{$addr} = $name) =~ s/ (?<= . ) _ (?= . ) //xg; 5725 5726 $nominal_short_name_length{$addr} = length $name; 5727 $is_last_resort = $alias->status =~ $deprecated_or_discouraged; 5728 } 5729 } 5730 5731 # If the short name isn't a nice one, perhaps an equivalent table has 5732 # a better one. 5733 if ( $self->can('children') 5734 && ( ! defined $short_name{$addr} 5735 || $short_name{$addr} eq "" 5736 || $short_name{$addr} eq "_")) 5737 { 5738 my $return; 5739 foreach my $follower ($self->children) { # All equivalents 5740 my $follower_name = $follower->short_name; 5741 next unless defined $follower_name; 5742 5743 # Anything (except undefined) is better than underscore or 5744 # empty 5745 if (! defined $return || $return eq "_") { 5746 $return = $follower_name; 5747 next; 5748 } 5749 5750 # If the new follower name isn't "_" and is shorter than the 5751 # current best one, prefer the new one. 5752 next if $follower_name eq "_"; 5753 next if length $follower_name > length $return; 5754 $return = $follower_name; 5755 } 5756 $short_name{$addr} = $return if defined $return; 5757 } 5758 5759 # If no suitable external name return undef 5760 if (! defined $short_name{$addr}) { 5761 $$nominal_length_ptr = undef if $nominal_length_ptr; 5762 return; 5763 } 5764 5765 # Don't allow a null short name. 5766 if ($short_name{$addr} eq "") { 5767 $short_name{$addr} = '_'; 5768 $nominal_short_name_length{$addr} = 1; 5769 } 5770 5771 trace $self, $short_name{$addr} if main::DEBUG && $to_trace; 5772 5773 if ($nominal_length_ptr) { 5774 $$nominal_length_ptr = $nominal_short_name_length{$addr}; 5775 } 5776 return $short_name{$addr}; 5777 } 5778 5779 sub external_name($self) { 5780 # Returns the external name that this table should be known by. This 5781 # is usually the short_name, but not if the short_name is undefined, 5782 # in which case the external_name is arbitrarily set to the 5783 # underscore. 5784 5785 my $short = $self->short_name; 5786 return $short if defined $short; 5787 5788 return '_'; 5789 } 5790 5791 sub add_description($self, $description) { # Adds the parameter as a short description. 5792 no overloading; 5793 push @{$description{pack 'J', $self}}, $description; 5794 5795 return; 5796 } 5797 5798 sub add_note($self, $note) { # Adds the parameter as a short note. 5799 no overloading; 5800 push @{$note{pack 'J', $self}}, $note; 5801 5802 return; 5803 } 5804 5805 sub add_comment($self, $comment) { # Adds the parameter as a comment. 5806 5807 return unless $debugging_build; 5808 5809 chomp $comment; 5810 5811 no overloading; 5812 push @{$comment{pack 'J', $self}}, $comment; 5813 5814 return; 5815 } 5816 5817 sub comment($self) { 5818 # Return the current comment for this table. If called in list 5819 # context, returns the array of comments. In scalar, returns a string 5820 # of each element joined together with a period ending each. 5821 5822 my $addr = do { no overloading; pack 'J', $self; }; 5823 my @list = @{$comment{$addr}}; 5824 return @list if wantarray; 5825 my $return = ""; 5826 foreach my $sentence (@list) { 5827 $return .= '. ' if $return; 5828 $return .= $sentence; 5829 $return =~ s/\.$//; 5830 } 5831 $return .= '.' if $return; 5832 return $return; 5833 } 5834 5835 sub initialize($self, $initialization) { 5836 # Initialize the table with the argument which is any valid 5837 # initialization for range lists. 5838 5839 my $addr = do { no overloading; pack 'J', $self; }; 5840 5841 # Replace the current range list with a new one of the same exact 5842 # type. 5843 my $class = ref $range_list{$addr}; 5844 $range_list{$addr} = $class->new(Owner => $self, 5845 Initialize => $initialization); 5846 return; 5847 5848 } 5849 5850 sub header($self) { 5851 # The header that is output for the table in the file it is written 5852 # in. 5853 my $return = ""; 5854 $return .= $DEVELOPMENT_ONLY if $compare_versions; 5855 $return .= $HEADER; 5856 return $return; 5857 } 5858 5859 sub merge_single_annotation_line ($output, $annotation, $annotation_column) { 5860 5861 # This appends an annotation comment, $annotation, to $output, 5862 # starting in or after column $annotation_column, removing any 5863 # pre-existing comment from $output. 5864 5865 $annotation =~ s/^ \s* \# \ //x; 5866 $output =~ s/ \s* ( \# \N* )? \n //x; 5867 $output = Text::Tabs::expand($output); 5868 5869 my $spaces = $annotation_column - length $output; 5870 $spaces = 2 if $spaces < 0; # Have 2 blanks before the comment 5871 5872 $output = sprintf "%s%*s# %s", 5873 $output, 5874 $spaces, 5875 " ", 5876 $annotation; 5877 return Text::Tabs::unexpand $output; 5878 } 5879 5880 sub write($self, $use_adjustments=0, $suppress_value=0) { 5881 # Write a representation of the table to its file. It calls several 5882 # functions furnished by sub-classes of this abstract base class to 5883 # handle non-normal ranges, to add stuff before the table, and at its 5884 # end. If the table is to be written so that adjustments are 5885 # required, this does that conversion. 5886 5887 5888 # $use_adjustments ? output in adjusted format or not 5889 # $suppress_value Optional, if the value associated with 5890 # a range equals this one, don't write 5891 # the range 5892 5893 my $addr = do { no overloading; pack 'J', $self; }; 5894 my $write_as_invlist = $write_as_invlist{$addr}; 5895 5896 # Start with the header 5897 my @HEADER = $self->header; 5898 5899 # Then the comments 5900 push @HEADER, "\n", main::simple_fold($comment{$addr}, '# '), "\n" 5901 if $comment{$addr}; 5902 5903 # Things discovered processing the main body of the document may 5904 # affect what gets output before it, therefore pre_body() isn't called 5905 # until after all other processing of the table is done. 5906 5907 # The main body looks like a 'here' document. If there are comments, 5908 # get rid of them when processing it. 5909 my @OUT; 5910 if ($annotate || $output_range_counts) { 5911 # Use the line below in Perls that don't have /r 5912 #push @OUT, 'return join "\n", map { s/\s*#.*//mg; $_ } split "\n", <<\'END\';' . "\n"; 5913 push @OUT, "return <<'END' =~ s/\\s*#.*//mgr;\n"; 5914 } else { 5915 push @OUT, "return <<'END';\n"; 5916 } 5917 5918 if ($range_list{$addr}->is_empty) { 5919 5920 # This is a kludge for empty tables to silence a warning in 5921 # utf8.c, which can't really deal with empty tables, but it can 5922 # deal with a table that matches nothing, as the inverse of 'All' 5923 # does. 5924 push @OUT, "!Unicode::UCD::All\n"; 5925 } 5926 elsif ($self->name eq 'N' 5927 5928 # To save disk space and table cache space, avoid putting out 5929 # binary N tables, but instead create a file which just inverts 5930 # the Y table. Since the file will still exist and occupy a 5931 # certain number of blocks, might as well output the whole 5932 # thing if it all will fit in one block. The number of 5933 # ranges below is an approximate number for that. 5934 && ($self->property->type == $BINARY 5935 || $self->property->type == $FORCED_BINARY) 5936 # && $self->property->tables == 2 Can't do this because the 5937 # non-binary properties, like NFDQC aren't specifiable 5938 # by the notation 5939 && $range_list{$addr}->ranges > 15 5940 && ! $annotate) # Under --annotate, want to see everything 5941 { 5942 push @OUT, "!Unicode::UCD::" . $self->property->name . "\n"; 5943 } 5944 else { 5945 my $range_size_1 = $range_size_1{$addr}; 5946 5947 # To make it more readable, use a minimum indentation 5948 my $comment_indent; 5949 5950 # These are used only in $annotate option 5951 my $format; # e.g. $HEX_ADJUST_FORMAT 5952 my $include_name; # ? Include the character's name in the 5953 # annotation? 5954 my $include_cp; # ? Include its code point 5955 5956 if (! $annotate) { 5957 $comment_indent = ($self->isa('Map_Table')) 5958 ? 24 5959 : ($write_as_invlist) 5960 ? 8 5961 : 16; 5962 } 5963 else { 5964 $format = $self->format; 5965 5966 # The name of the character is output only for tables that 5967 # don't already include the name in the output. 5968 my $property = $self->property; 5969 $include_name = 5970 ! ($property == $perl_charname 5971 || $property == main::property_ref('Unicode_1_Name') 5972 || $property == main::property_ref('Name') 5973 || $property == main::property_ref('Name_Alias') 5974 ); 5975 5976 # Don't include the code point in the annotation where all 5977 # lines are a single code point, so it can be easily found in 5978 # the first column 5979 $include_cp = ! $range_size_1; 5980 5981 if (! $self->isa('Map_Table')) { 5982 $comment_indent = ($write_as_invlist) ? 8 : 16; 5983 } 5984 else { 5985 $comment_indent = 16; 5986 5987 # There are just a few short ranges in this table, so no 5988 # need to include the code point in the annotation. 5989 $include_cp = 0 if $format eq $DECOMP_STRING_FORMAT; 5990 5991 # We're trying to get this to look good, as the whole 5992 # point is to make human-readable tables. It is easier to 5993 # read if almost all the annotation comments begin in the 5994 # same column. Map tables have varying width maps, so can 5995 # create a jagged comment appearance. This code does a 5996 # preliminary pass through these tables looking for the 5997 # maximum width map in each, and causing the comments to 5998 # begin just to the right of that. However, if the 5999 # comments begin too far to the right of most lines, it's 6000 # hard to line them up horizontally with their real data. 6001 # Therefore we ignore the longest outliers 6002 my $ignore_longest_X_percent = 2; # Discard longest X% 6003 6004 # Each key in this hash is a width of at least one of the 6005 # maps in the table. Its value is how many lines have 6006 # that width. 6007 my %widths; 6008 6009 # We won't space things further left than one tab stop 6010 # after the rest of the line; initializing it to that 6011 # number saves some work. 6012 my $max_map_width = 8; 6013 6014 # Fill in the %widths hash 6015 my $total = 0; 6016 for my $set ($range_list{$addr}->ranges) { 6017 my $value = $set->value; 6018 6019 # These range types don't appear in the main table 6020 next if $set->type == 0 6021 && defined $suppress_value 6022 && $value eq $suppress_value; 6023 next if $set->type == $MULTI_CP 6024 || $set->type == $NULL; 6025 6026 # Include 2 spaces before the beginning of the 6027 # comment 6028 my $this_width = length($value) + 2; 6029 6030 # Ranges of the remaining non-zero types usually 6031 # occupy just one line (maybe occasionally two, but 6032 # this doesn't have to be dead accurate). This is 6033 # because these ranges are like "unassigned code 6034 # points" 6035 my $count = ($set->type != 0) 6036 ? 1 6037 : $set->end - $set->start + 1; 6038 $widths{$this_width} += $count; 6039 $total += $count; 6040 $max_map_width = $this_width 6041 if $max_map_width < $this_width; 6042 } 6043 6044 # If the widest map gives us less than two tab stops 6045 # worth, just take it as-is. 6046 if ($max_map_width > 16) { 6047 6048 # Otherwise go through %widths until we have included 6049 # the desired percentage of lines in the whole table. 6050 my $running_total = 0; 6051 foreach my $width (sort { $a <=> $b } keys %widths) 6052 { 6053 $running_total += $widths{$width}; 6054 use integer; 6055 if ($running_total * 100 / $total 6056 >= 100 - $ignore_longest_X_percent) 6057 { 6058 $max_map_width = $width; 6059 last; 6060 } 6061 } 6062 } 6063 $comment_indent += $max_map_width; 6064 } 6065 } 6066 6067 # Values for previous time through the loop. Initialize to 6068 # something that won't be adjacent to the first iteration; 6069 # only $previous_end matters for that. 6070 my $previous_start; 6071 my $previous_end = -2; 6072 my $previous_value; 6073 6074 # Values for next time through the portion of the loop that splits 6075 # the range. 0 in $next_start means there is no remaining portion 6076 # to deal with. 6077 my $next_start = 0; 6078 my $next_end; 6079 my $next_value; 6080 my $offset = 0; 6081 my $invlist_count = 0; 6082 6083 my $output_value_in_hex = $self->isa('Map_Table') 6084 && ($self->format eq $HEX_ADJUST_FORMAT 6085 || $self->to_output_map == $EXTERNAL_MAP); 6086 # Use leading zeroes just for files whose format should not be 6087 # changed from what it has been. Otherwise, they just take up 6088 # space and time to process. 6089 my $hex_format = ($self->isa('Map_Table') 6090 && $self->to_output_map == $EXTERNAL_MAP) 6091 ? "%04X" 6092 : "%X"; 6093 6094 # The values for some of these tables are stored in mktables as 6095 # hex strings. Normally, these are just output as strings without 6096 # change, but when we are doing adjustments, we have to operate on 6097 # these numerically, so we convert those to decimal to do that, 6098 # and back to hex for output 6099 my $convert_map_to_from_hex = 0; 6100 my $output_map_in_hex = 0; 6101 if ($self->isa('Map_Table')) { 6102 $convert_map_to_from_hex 6103 = ($use_adjustments && $self->format eq $HEX_ADJUST_FORMAT) 6104 || ($annotate && $self->format eq $HEX_FORMAT); 6105 $output_map_in_hex = $convert_map_to_from_hex 6106 || $self->format eq $HEX_FORMAT; 6107 } 6108 6109 # To store any annotations about the characters. 6110 my @annotation; 6111 6112 # Output each range as part of the here document. 6113 RANGE: 6114 for my $set ($range_list{$addr}->ranges) { 6115 if ($set->type != 0) { 6116 $self->handle_special_range($set); 6117 next RANGE; 6118 } 6119 my $start = $set->start; 6120 my $end = $set->end; 6121 my $value = $set->value; 6122 6123 # Don't output ranges whose value is the one to suppress 6124 next RANGE if defined $suppress_value 6125 && $value eq $suppress_value; 6126 6127 $value = CORE::hex $value if $convert_map_to_from_hex; 6128 6129 6130 { # This bare block encloses the scope where we may need to 6131 # 'redo' to. Consider a table that is to be written out 6132 # using single item ranges. This is given in the 6133 # $range_size_1 boolean. To accomplish this, we split the 6134 # range each time through the loop into two portions, the 6135 # first item, and the rest. We handle that first item 6136 # this time in the loop, and 'redo' to repeat the process 6137 # for the rest of the range. 6138 # 6139 # We may also have to do it, with other special handling, 6140 # if the table has adjustments. Consider the table that 6141 # contains the lowercasing maps. mktables stores the 6142 # ASCII range ones as 26 ranges: 6143 # ord('A') => ord('a'), .. ord('Z') => ord('z') 6144 # For compactness, the table that gets written has this as 6145 # just one range 6146 # ( ord('A') .. ord('Z') ) => ord('a') 6147 # and the software that reads the tables is smart enough 6148 # to "connect the dots". This change is accomplished in 6149 # this loop by looking to see if the current iteration 6150 # fits the paradigm of the previous iteration, and if so, 6151 # we merge them by replacing the final output item with 6152 # the merged data. Repeated 25 times, this gets A-Z. But 6153 # we also have to make sure we don't screw up cases where 6154 # we have internally stored 6155 # ( 0x1C4 .. 0x1C6 ) => 0x1C5 6156 # This single internal range has to be output as 3 ranges, 6157 # which is done by splitting, like we do for $range_size_1 6158 # tables. (There are very few of such ranges that need to 6159 # be split, so the gain of doing the combining of other 6160 # ranges far outweighs the splitting of these.) The 6161 # values to use for the redo at the end of this block are 6162 # set up just below in the scalars whose names begin with 6163 # '$next_'. 6164 6165 if (($use_adjustments || $range_size_1) && $end != $start) 6166 { 6167 $next_start = $start + 1; 6168 $next_end = $end; 6169 $next_value = $value; 6170 $end = $start; 6171 } 6172 6173 if ($use_adjustments && ! $range_size_1) { 6174 6175 # If this range is adjacent to the previous one, and 6176 # the values in each are integers that are also 6177 # adjacent (differ by 1), then this range really 6178 # extends the previous one that is already in element 6179 # $OUT[-1]. So we pop that element, and pretend that 6180 # the range starts with whatever it started with. 6181 # $offset is incremented by 1 each time so that it 6182 # gives the current offset from the first element in 6183 # the accumulating range, and we keep in $value the 6184 # value of that first element. 6185 if ($start == $previous_end + 1 6186 && $value =~ /^ -? \d+ $/xa 6187 && $previous_value =~ /^ -? \d+ $/xa 6188 && ($value == ($previous_value + ++$offset))) 6189 { 6190 pop @OUT; 6191 $start = $previous_start; 6192 $value = $previous_value; 6193 } 6194 else { 6195 $offset = 0; 6196 if (@annotation == 1) { 6197 $OUT[-1] = merge_single_annotation_line( 6198 $OUT[-1], $annotation[0], $comment_indent); 6199 } 6200 else { 6201 push @OUT, @annotation; 6202 } 6203 } 6204 undef @annotation; 6205 6206 # Save the current values for the next time through 6207 # the loop. 6208 $previous_start = $start; 6209 $previous_end = $end; 6210 $previous_value = $value; 6211 } 6212 6213 if ($write_as_invlist) { 6214 if ( $previous_end > 0 6215 && $output_range_counts{$addr}) 6216 { 6217 my $complement_count = $start - $previous_end - 1; 6218 if ($complement_count > 1) { 6219 $OUT[-1] = merge_single_annotation_line( 6220 $OUT[-1], 6221 "#" 6222 . (" " x 17) 6223 . "[" 6224 . main::clarify_code_point_count( 6225 $complement_count) 6226 . "] in complement\n", 6227 $comment_indent); 6228 } 6229 } 6230 6231 # Inversion list format has a single number per line, 6232 # the starting code point of a range that matches the 6233 # property 6234 push @OUT, $start, "\n"; 6235 $invlist_count++; 6236 6237 # Add a comment with the size of the range, if 6238 # requested. 6239 if ($output_range_counts{$addr}) { 6240 $OUT[-1] = merge_single_annotation_line( 6241 $OUT[-1], 6242 "# [" 6243 . main::clarify_code_point_count($end - $start + 1) 6244 . "]\n", 6245 $comment_indent); 6246 } 6247 } 6248 elsif ($start != $end) { # If there is a range 6249 if ($end == $MAX_WORKING_CODEPOINT) { 6250 push @OUT, sprintf "$hex_format\t$hex_format", 6251 $start, 6252 $MAX_PLATFORM_CODEPOINT; 6253 } 6254 else { 6255 push @OUT, sprintf "$hex_format\t$hex_format", 6256 $start, $end; 6257 } 6258 if (length $value) { 6259 if ($convert_map_to_from_hex) { 6260 $OUT[-1] .= sprintf "\t$hex_format\n", $value; 6261 } 6262 else { 6263 $OUT[-1] .= "\t$value\n"; 6264 } 6265 } 6266 6267 # Add a comment with the size of the range, if 6268 # requested. 6269 if ($output_range_counts{$addr}) { 6270 $OUT[-1] = merge_single_annotation_line( 6271 $OUT[-1], 6272 "# [" 6273 . main::clarify_code_point_count($end - $start + 1) 6274 . "]\n", 6275 $comment_indent); 6276 } 6277 } 6278 else { # Here to output a single code point per line. 6279 6280 # Use any passed in subroutine to output. 6281 if (ref $range_size_1 eq 'CODE') { 6282 for my $i ($start .. $end) { 6283 push @OUT, &{$range_size_1}($i, $value); 6284 } 6285 } 6286 else { 6287 6288 # Here, caller is ok with default output. 6289 for (my $i = $start; $i <= $end; $i++) { 6290 if ($convert_map_to_from_hex) { 6291 push @OUT, 6292 sprintf "$hex_format\t\t$hex_format\n", 6293 $i, $value; 6294 } 6295 else { 6296 push @OUT, sprintf $hex_format, $i; 6297 $OUT[-1] .= "\t\t$value" if $value ne ""; 6298 $OUT[-1] .= "\n"; 6299 } 6300 } 6301 } 6302 } 6303 6304 if ($annotate) { 6305 for (my $i = $start; $i <= $end; $i++) { 6306 my $annotation = ""; 6307 6308 # Get character information if don't have it already 6309 main::populate_char_info($i) 6310 if ! defined $viacode[$i]; 6311 my $type = $annotate_char_type[$i]; 6312 6313 # Figure out if should output the next code points 6314 # as part of a range or not. If this is not in an 6315 # annotation range, then won't output as a range, 6316 # so returns $i. Otherwise use the end of the 6317 # annotation range, but no further than the 6318 # maximum possible end point of the loop. 6319 my $range_end = 6320 $range_size_1 6321 ? $start 6322 : main::min( 6323 $annotate_ranges->value_of($i) || $i, 6324 $end); 6325 6326 # Use a range if it is a range, and either is one 6327 # of the special annotation ranges, or the range 6328 # is at most 3 long. This last case causes the 6329 # algorithmically named code points to be output 6330 # individually in spans of at most 3, as they are 6331 # the ones whose $type is > 0. 6332 if ($range_end != $i 6333 && ( $type < 0 || $range_end - $i > 2)) 6334 { 6335 # Here is to output a range. We don't allow a 6336 # caller-specified output format--just use the 6337 # standard one. 6338 my $range_name = $viacode[$i]; 6339 6340 # For the code points which end in their hex 6341 # value, we eliminate that from the output 6342 # annotation, and capitalize only the first 6343 # letter of each word. 6344 if ($type == $CP_IN_NAME) { 6345 my $hex = sprintf $hex_format, $i; 6346 $range_name =~ s/-$hex$//; 6347 my @words = split " ", $range_name; 6348 for my $word (@words) { 6349 $word = 6350 ucfirst(lc($word)) if $word ne 'CJK'; 6351 } 6352 $range_name = join " ", @words; 6353 } 6354 elsif ($type == $HANGUL_SYLLABLE) { 6355 $range_name = "Hangul Syllable"; 6356 } 6357 6358 # If the annotation would just repeat what's 6359 # already being output as the range, skip it. 6360 # (When an inversion list is being written, it 6361 # isn't a repeat, as that always is in 6362 # decimal) 6363 if ( $write_as_invlist 6364 || $i != $start 6365 || $range_end < $end) 6366 { 6367 if ($range_end < $MAX_WORKING_CODEPOINT) 6368 { 6369 $annotation = sprintf "%04X..%04X", 6370 $i, $range_end; 6371 } 6372 else { 6373 $annotation = sprintf "%04X..INFINITY", 6374 $i; 6375 } 6376 } 6377 else { # Indent if not displaying code points 6378 $annotation = " " x 4; 6379 } 6380 6381 if ($range_name) { 6382 $annotation .= " $age[$i]" if $age[$i]; 6383 $annotation .= " $range_name"; 6384 } 6385 6386 # Include the number of code points in the 6387 # range 6388 my $count = 6389 main::clarify_code_point_count($range_end - $i + 1); 6390 $annotation .= " [$count]\n"; 6391 6392 # Skip to the end of the range 6393 $i = $range_end; 6394 } 6395 else { # Not in a range. 6396 my $comment = ""; 6397 6398 # When outputting the names of each character, 6399 # use the character itself if printable 6400 $comment .= "'" . main::display_chr($i) . "' " 6401 if $printable[$i]; 6402 6403 my $output_value = $value; 6404 6405 # Determine the annotation 6406 if ($format eq $DECOMP_STRING_FORMAT) { 6407 6408 # This is very specialized, with the type 6409 # of decomposition beginning the line 6410 # enclosed in <...>, and the code points 6411 # that the code point decomposes to 6412 # separated by blanks. Create two 6413 # strings, one of the printable 6414 # characters, and one of their official 6415 # names. 6416 (my $map = $output_value) 6417 =~ s/ \ * < .*? > \ +//x; 6418 my $tostr = ""; 6419 my $to_name = ""; 6420 my $to_chr = ""; 6421 foreach my $to (split " ", $map) { 6422 $to = CORE::hex $to; 6423 $to_name .= " + " if $to_name; 6424 $to_chr .= main::display_chr($to); 6425 main::populate_char_info($to) 6426 if ! defined $viacode[$to]; 6427 $to_name .= $viacode[$to]; 6428 } 6429 6430 $comment .= 6431 "=> '$to_chr'; $viacode[$i] => $to_name"; 6432 } 6433 else { 6434 $output_value += $i - $start 6435 if $use_adjustments 6436 # Don't try to adjust a 6437 # non-integer 6438 && $output_value !~ /[-\D]/; 6439 6440 if ($output_map_in_hex) { 6441 main::populate_char_info($output_value) 6442 if ! defined $viacode[$output_value]; 6443 $comment .= " => '" 6444 . main::display_chr($output_value) 6445 . "'; " if $printable[$output_value]; 6446 } 6447 if ($include_name && $viacode[$i]) { 6448 $comment .= " " if $comment; 6449 $comment .= $viacode[$i]; 6450 } 6451 if ($output_map_in_hex) { 6452 $comment .= 6453 " => $viacode[$output_value]" 6454 if $viacode[$output_value]; 6455 $output_value = sprintf($hex_format, 6456 $output_value); 6457 } 6458 } 6459 6460 if ($include_cp) { 6461 $annotation = sprintf "%04X %s", $i, $age[$i]; 6462 if ($use_adjustments) { 6463 $annotation .= " => $output_value"; 6464 } 6465 } 6466 6467 if ($comment ne "") { 6468 $annotation .= " " if $annotation ne ""; 6469 $annotation .= $comment; 6470 } 6471 $annotation .= "\n" if $annotation ne ""; 6472 } 6473 6474 if ($annotation ne "") { 6475 push @annotation, (" " x $comment_indent) 6476 . "# $annotation"; 6477 } 6478 } 6479 6480 # If not adjusting, we don't have to go through the 6481 # loop again to know that the annotation comes next 6482 # in the output. 6483 if (! $use_adjustments) { 6484 if (@annotation == 1) { 6485 $OUT[-1] = merge_single_annotation_line( 6486 $OUT[-1], $annotation[0], $comment_indent); 6487 } 6488 else { 6489 push @OUT, map { Text::Tabs::unexpand $_ } 6490 @annotation; 6491 } 6492 undef @annotation; 6493 } 6494 } 6495 6496 # Add the beginning of the range that doesn't match the 6497 # property, except if the just added match range extends 6498 # to infinity. We do this after any annotations for the 6499 # match range. 6500 if ($write_as_invlist && $end < $MAX_WORKING_CODEPOINT) { 6501 push @OUT, $end + 1, "\n"; 6502 $invlist_count++; 6503 } 6504 6505 # If we split the range, set up so the next time through 6506 # we get the remainder, and redo. 6507 if ($next_start) { 6508 $start = $next_start; 6509 $end = $next_end; 6510 $value = $next_value; 6511 $next_start = 0; 6512 redo; 6513 } 6514 } 6515 } # End of loop through all the table's ranges 6516 6517 push @OUT, @annotation; # Add orphaned annotation, if any 6518 6519 splice @OUT, 1, 0, "V$invlist_count\n" if $invlist_count; 6520 } 6521 6522 # Add anything that goes after the main body, but within the here 6523 # document, 6524 my $append_to_body = $self->append_to_body; 6525 push @OUT, $append_to_body if $append_to_body; 6526 6527 # And finish the here document. 6528 push @OUT, "END\n"; 6529 6530 # Done with the main portion of the body. Can now figure out what 6531 # should appear before it in the file. 6532 my $pre_body = $self->pre_body; 6533 push @HEADER, $pre_body, "\n" if $pre_body; 6534 6535 # All these files should have a .pl suffix added to them. 6536 my @file_with_pl = @{$file_path{$addr}}; 6537 $file_with_pl[-1] .= '.pl'; 6538 6539 main::write(\@file_with_pl, 6540 $annotate, # utf8 iff annotating 6541 \@HEADER, 6542 \@OUT); 6543 return; 6544 } 6545 6546 sub set_status($self, $status, $info) { # Set the table's status 6547 # status The status enum value 6548 # info Any message associated with it. 6549 my $addr = do { no overloading; pack 'J', $self; }; 6550 6551 $status{$addr} = $status; 6552 $status_info{$addr} = $info; 6553 return; 6554 } 6555 6556 sub set_fate($self, $fate, $reason=undef) { # Set the fate of a table 6557 my $addr = do { no overloading; pack 'J', $self; }; 6558 6559 return if $fate{$addr} == $fate; # If no-op 6560 6561 # Can only change the ordinary fate, except if going to $MAP_PROXIED 6562 return if $fate{$addr} != $ORDINARY && $fate != $MAP_PROXIED; 6563 6564 $fate{$addr} = $fate; 6565 6566 # Don't document anything to do with a non-normal fated table 6567 if ($fate != $ORDINARY) { 6568 my $put_in_pod = ($fate == $MAP_PROXIED) ? 1 : 0; 6569 foreach my $alias ($self->aliases) { 6570 $alias->set_ucd($put_in_pod); 6571 6572 # MAP_PROXIED doesn't affect the match tables 6573 next if $fate == $MAP_PROXIED; 6574 $alias->set_make_re_pod_entry($put_in_pod); 6575 } 6576 } 6577 6578 # Save the reason for suppression for output 6579 if ($fate >= $SUPPRESSED) { 6580 $reason = "" unless defined $reason; 6581 $why_suppressed{$complete_name{$addr}} = $reason; 6582 } 6583 6584 return; 6585 } 6586 6587 sub lock($self) { 6588 # Don't allow changes to the table from now on. This stores a stack 6589 # trace of where it was called, so that later attempts to modify it 6590 # can immediately show where it got locked. 6591 my $addr = do { no overloading; pack 'J', $self; }; 6592 6593 $locked{$addr} = ""; 6594 6595 my $line = (caller(0))[2]; 6596 my $i = 1; 6597 6598 # Accumulate the stack trace 6599 while (1) { 6600 my ($pkg, $file, $caller_line, $caller) = caller $i++; 6601 6602 last unless defined $caller; 6603 6604 $locked{$addr} .= " called from $caller() at line $line\n"; 6605 $line = $caller_line; 6606 } 6607 $locked{$addr} .= " called from main at line $line\n"; 6608 6609 return; 6610 } 6611 6612 sub carp_if_locked($self) { 6613 # Return whether a table is locked or not, and, by the way, complain 6614 # if is locked 6615 my $addr = do { no overloading; pack 'J', $self; }; 6616 6617 return 0 if ! $locked{$addr}; 6618 Carp::my_carp_bug("Can't modify a locked table. Stack trace of locking:\n$locked{$addr}\n\n"); 6619 return 1; 6620 } 6621 6622 sub set_file_path($self, @path) { # Set the final directory path for this table 6623 no overloading; 6624 @{$file_path{pack 'J', $self}} = @path; 6625 return 6626 } 6627 6628 # Accessors for the range list stored in this table. First for 6629 # unconditional 6630 for my $sub (qw( 6631 containing_range 6632 contains 6633 count 6634 each_range 6635 hash 6636 is_empty 6637 matches_identically_to 6638 max 6639 min 6640 range_count 6641 reset_each_range 6642 type_of 6643 value_of 6644 )) 6645 { 6646 no strict "refs"; 6647 *$sub = sub { 6648 use strict "refs"; 6649 my $self = shift; 6650 return $self->_range_list->$sub(@_); 6651 } 6652 } 6653 6654 # Then for ones that should fail if locked 6655 for my $sub (qw( 6656 delete_range 6657 )) 6658 { 6659 no strict "refs"; 6660 *$sub = sub { 6661 use strict "refs"; 6662 my $self = shift; 6663 6664 return if $self->carp_if_locked; 6665 no overloading; 6666 return $self->_range_list->$sub(@_); 6667 } 6668 } 6669 6670} # End closure 6671 6672package Map_Table; 6673use parent '-norequire', '_Base_Table'; 6674 6675# A Map Table is a table that contains the mappings from code points to 6676# values. There are two weird cases: 6677# 1) Anomalous entries are ones that aren't maps of ranges of code points, but 6678# are written in the table's file at the end of the table nonetheless. It 6679# requires specially constructed code to handle these; utf8.c can not read 6680# these in, so they should not go in $map_directory. As of this writing, 6681# the only case that these happen is for named sequences used in 6682# charnames.pm. But this code doesn't enforce any syntax on these, so 6683# something else could come along that uses it. 6684# 2) Specials are anything that doesn't fit syntactically into the body of the 6685# table. The ranges for these have a map type of non-zero. The code below 6686# knows about and handles each possible type. In most cases, these are 6687# written as part of the header. 6688# 6689# A map table deliberately can't be manipulated at will unlike match tables. 6690# This is because of the ambiguities having to do with what to do with 6691# overlapping code points. And there just isn't a need for those things; 6692# what one wants to do is just query, add, replace, or delete mappings, plus 6693# write the final result. 6694# However, there is a method to get the list of possible ranges that aren't in 6695# this table to use for defaulting missing code point mappings. And, 6696# map_add_or_replace_non_nulls() does allow one to add another table to this 6697# one, but it is clearly very specialized, and defined that the other's 6698# non-null values replace this one's if there is any overlap. 6699 6700sub trace { return main::trace(@_); } 6701 6702{ # Closure 6703 6704 main::setup_package(); 6705 6706 my %default_map; 6707 # Many input files omit some entries; this gives what the mapping for the 6708 # missing entries should be 6709 main::set_access('default_map', \%default_map, 'r'); 6710 6711 my %anomalous_entries; 6712 # Things that go in the body of the table which don't fit the normal 6713 # scheme of things, like having a range. Not much can be done with these 6714 # once there except to output them. This was created to handle named 6715 # sequences. 6716 main::set_access('anomalous_entry', \%anomalous_entries, 'a'); 6717 main::set_access('anomalous_entries', # Append singular, read plural 6718 \%anomalous_entries, 6719 'readable_array'); 6720 6721 my %replacement_property; 6722 # Certain files are unused by Perl itself, and are kept only for backwards 6723 # compatibility for programs that used them before Unicode::UCD existed. 6724 # These are termed legacy properties. At some point they may be removed, 6725 # but for now mark them as legacy. If non empty, this is the name of the 6726 # property to use instead (i.e., the modern equivalent). 6727 main::set_access('replacement_property', \%replacement_property, 'r'); 6728 6729 my %to_output_map; 6730 # Enum as to whether or not to write out this map table, and how: 6731 # 0 don't output 6732 # $EXTERNAL_MAP means its existence is noted in the documentation, and 6733 # it should not be removed nor its format changed. This 6734 # is done for those files that have traditionally been 6735 # output. Maps of legacy-only properties default to 6736 # this. 6737 # $INTERNAL_MAP means Perl reserves the right to do anything it wants 6738 # with this file 6739 # $OUTPUT_ADJUSTED means that it is an $INTERNAL_MAP, and instead of 6740 # outputting the actual mappings as-is, we adjust things 6741 # to create a much more compact table. Only those few 6742 # tables where the mapping is convertible at least to an 6743 # integer and compacting makes a big difference should 6744 # have this. Hence, the default is to not do this 6745 # unless the table's default mapping is to $CODE_POINT, 6746 # and the range size is not 1. 6747 main::set_access('to_output_map', \%to_output_map, 's'); 6748 6749 sub new { 6750 my $class = shift; 6751 my $name = shift; 6752 6753 my %args = @_; 6754 6755 # Optional initialization data for the table. 6756 my $initialize = delete $args{'Initialize'}; 6757 6758 my $default_map = delete $args{'Default_Map'}; 6759 my $property = delete $args{'_Property'}; 6760 my $full_name = delete $args{'Full_Name'}; 6761 my $replacement_property = delete $args{'Replacement_Property'} // ""; 6762 my $to_output_map = delete $args{'To_Output_Map'}; 6763 6764 # Rest of parameters passed on; legacy properties have several common 6765 # other attributes 6766 if ($replacement_property) { 6767 $args{"Fate"} = $LEGACY_ONLY; 6768 $args{"Range_Size_1"} = 1; 6769 $args{"Perl_Extension"} = 1; 6770 $args{"UCD"} = 0; 6771 } 6772 6773 my $range_list = Range_Map->new(Owner => $property); 6774 6775 my $self = $class->SUPER::new( 6776 Name => $name, 6777 Complete_Name => $full_name, 6778 Full_Name => $full_name, 6779 _Property => $property, 6780 _Range_List => $range_list, 6781 Write_As_Invlist => 0, 6782 %args); 6783 6784 my $addr = do { no overloading; pack 'J', $self; }; 6785 6786 $anomalous_entries{$addr} = []; 6787 $default_map{$addr} = $default_map; 6788 $replacement_property{$addr} = $replacement_property; 6789 $to_output_map = $EXTERNAL_MAP if ! defined $to_output_map 6790 && $replacement_property; 6791 $to_output_map{$addr} = $to_output_map; 6792 6793 $self->initialize($initialize) if defined $initialize; 6794 6795 return $self; 6796 } 6797 6798 use overload 6799 fallback => 0, 6800 qw("") => "_operator_stringify", 6801 ; 6802 6803 sub _operator_stringify { 6804 my $self = shift; 6805 6806 my $name = $self->property->full_name; 6807 $name = '""' if $name eq ""; 6808 return "Map table for Property '$name'"; 6809 } 6810 6811 sub add_alias { 6812 # Add a synonym for this table (which means the property itself) 6813 my $self = shift; 6814 my $name = shift; 6815 # Rest of parameters passed on. 6816 6817 $self->SUPER::add_alias($name, $self->property, @_); 6818 return; 6819 } 6820 6821 sub add_map { 6822 # Add a range of code points to the list of specially-handled code 6823 # points. $MULTI_CP is assumed if the type of special is not passed 6824 # in. 6825 6826 my $self = shift; 6827 my $lower = shift; 6828 my $upper = shift; 6829 my $string = shift; 6830 my %args = @_; 6831 6832 my $type = delete $args{'Type'} || 0; 6833 # Rest of parameters passed on 6834 6835 # Can't change the table if locked. 6836 return if $self->carp_if_locked; 6837 6838 my $addr = do { no overloading; pack 'J', $self; }; 6839 6840 $self->_range_list->add_map($lower, $upper, 6841 $string, 6842 @_, 6843 Type => $type); 6844 return; 6845 } 6846 6847 sub append_to_body($self) { 6848 # Adds to the written HERE document of the table's body any anomalous 6849 # entries in the table.. 6850 my $addr = do { no overloading; pack 'J', $self; }; 6851 6852 return "" unless @{$anomalous_entries{$addr}}; 6853 return join("\n", @{$anomalous_entries{$addr}}) . "\n"; 6854 } 6855 6856 sub map_add_or_replace_non_nulls($self, $other) { 6857 # This adds the mappings in the table $other to $self. Non-null 6858 # mappings from $other override those in $self. It essentially merges 6859 # the two tables, with the second having priority except for null 6860 # mappings. 6861 return if $self->carp_if_locked; 6862 6863 if (! $other->isa(__PACKAGE__)) { 6864 Carp::my_carp_bug("$other should be a " 6865 . __PACKAGE__ 6866 . ". Not a '" 6867 . ref($other) 6868 . "'. Not added;"); 6869 return; 6870 } 6871 6872 my $addr = do { no overloading; pack 'J', $self; }; 6873 my $other_addr = do { no overloading; pack 'J', $other; }; 6874 6875 local $to_trace = 0 if main::DEBUG; 6876 6877 my $self_range_list = $self->_range_list; 6878 my $other_range_list = $other->_range_list; 6879 foreach my $range ($other_range_list->ranges) { 6880 my $value = $range->value; 6881 next if $value eq ""; 6882 $self_range_list->_add_delete('+', 6883 $range->start, 6884 $range->end, 6885 $value, 6886 Type => $range->type, 6887 Replace => $UNCONDITIONALLY); 6888 } 6889 6890 return; 6891 } 6892 6893 sub set_default_map($self, $map, $use_full_name=0) { 6894 # Define what code points that are missing from the input files should 6895 # map to. The optional second parameter 'full_name' indicates to 6896 # force using the full name of the map instead of its standard name. 6897 if ($use_full_name && $use_full_name ne 'full_name') { 6898 Carp::my_carp_bug("Second parameter to set_default_map() if" 6899 . " present, must be 'full_name'"); 6900 } 6901 6902 my $addr = do { no overloading; pack 'J', $self; }; 6903 6904 # Convert the input to the standard equivalent, if any (won't have any 6905 # for $STRING properties) 6906 my $standard = $self->property->table($map); 6907 if (defined $standard) { 6908 $map = ($use_full_name) 6909 ? $standard->full_name 6910 : $standard->name; 6911 } 6912 6913 # Warn if there already is a non-equivalent default map for this 6914 # property. Note that a default map can be a ref, which means that 6915 # what it actually means is delayed until later in the program, and it 6916 # IS permissible to override it here without a message. 6917 my $default_map = $default_map{$addr}; 6918 if (defined $default_map 6919 && ! ref($default_map) 6920 && $default_map ne $map 6921 && main::Standardize($map) ne $default_map) 6922 { 6923 my $property = $self->property; 6924 my $map_table = $property->table($map); 6925 my $default_table = $property->table($default_map); 6926 if (defined $map_table 6927 && defined $default_table 6928 && $map_table != $default_table) 6929 { 6930 Carp::my_carp("Changing the default mapping for " 6931 . $property 6932 . " from $default_map to $map'"); 6933 } 6934 } 6935 6936 $default_map{$addr} = $map; 6937 6938 # Don't also create any missing table for this map at this point, 6939 # because if we did, it could get done before the main table add is 6940 # done for PropValueAliases.txt; instead the caller will have to make 6941 # sure it exists, if desired. 6942 return; 6943 } 6944 6945 sub to_output_map($self) { 6946 # Returns boolean: should we write this map table? 6947 my $addr = do { no overloading; pack 'J', $self; }; 6948 6949 # If overridden, use that 6950 return $to_output_map{$addr} if defined $to_output_map{$addr}; 6951 6952 my $full_name = $self->full_name; 6953 return $global_to_output_map{$full_name} 6954 if defined $global_to_output_map{$full_name}; 6955 6956 # If table says to output, do so; if says to suppress it, do so. 6957 my $fate = $self->fate; 6958 return $INTERNAL_MAP if $fate == $INTERNAL_ONLY; 6959 return $EXTERNAL_MAP if grep { $_ eq $full_name } @output_mapped_properties; 6960 return 0 if $fate == $SUPPRESSED || $fate == $MAP_PROXIED; 6961 6962 my $type = $self->property->type; 6963 6964 # Don't want to output binary map tables even for debugging. 6965 return 0 if $type == $BINARY; 6966 6967 # But do want to output string ones. All the ones that remain to 6968 # be dealt with (i.e. which haven't explicitly been set to external) 6969 # are for internal Perl use only. The default for those that map to 6970 # $CODE_POINT and haven't been restricted to a single element range 6971 # is to use the adjusted form. 6972 if ($type == $STRING) { 6973 return $INTERNAL_MAP if $self->range_size_1 6974 || $default_map{$addr} ne $CODE_POINT; 6975 return $OUTPUT_ADJUSTED; 6976 } 6977 6978 # Otherwise is an $ENUM, do output it, for Perl's purposes 6979 return $INTERNAL_MAP; 6980 } 6981 6982 sub inverse_list($self) { 6983 # Returns a Range_List that is gaps of the current table. That is, 6984 # the inversion 6985 my $current = Range_List->new(Initialize => $self->_range_list, 6986 Owner => $self->property); 6987 return ~ $current; 6988 } 6989 6990 sub header($self) { 6991 my $return = $self->SUPER::header(); 6992 6993 if ($self->to_output_map >= $INTERNAL_MAP) { 6994 $return .= $INTERNAL_ONLY_HEADER; 6995 } 6996 else { 6997 my $property_name = $self->property->replacement_property; 6998 6999 # The legacy-only properties were gotten above; but there are some 7000 # other properties whose files are in current use that have fixed 7001 # formats. 7002 $property_name = $self->property->full_name unless $property_name; 7003 7004 $return .= <<END; 7005 7006# !!!!!!! IT IS DEPRECATED TO USE THIS FILE !!!!!!! 7007 7008# This file is for internal use by core Perl only. It is retained for 7009# backwards compatibility with applications that may have come to rely on it, 7010# but its format and even its name or existence are subject to change without 7011# notice in a future Perl version. Don't use it directly. Instead, its 7012# contents are now retrievable through a stable API in the Unicode::UCD 7013# module: Unicode::UCD::prop_invmap('$property_name') (Values for individual 7014# code points can be retrieved via Unicode::UCD::charprop()); 7015END 7016 } 7017 return $return; 7018 } 7019 7020 sub set_final_comment($self) { 7021 # Just before output, create the comment that heads the file 7022 # containing this table. 7023 7024 return unless $debugging_build; 7025 7026 # No sense generating a comment if aren't going to write it out. 7027 return if ! $self->to_output_map; 7028 7029 my $addr = do { no overloading; pack 'J', $self; }; 7030 7031 my $property = $self->property; 7032 7033 # Get all the possible names for this property. Don't use any that 7034 # aren't ok for use in a file name, etc. This is perhaps causing that 7035 # flag to do double duty, and may have to be changed in the future to 7036 # have our own flag for just this purpose; but it works now to exclude 7037 # Perl generated synonyms from the lists for properties, where the 7038 # name is always the proper Unicode one. 7039 my @property_aliases = grep { $_->ok_as_filename } $self->aliases; 7040 7041 my $count = $self->count; 7042 my $default_map = $default_map{$addr}; 7043 7044 # The ranges that map to the default aren't output, so subtract that 7045 # to get those actually output. A property with matching tables 7046 # already has the information calculated. 7047 if ($property->type != $STRING && $property->type != $FORCED_BINARY) { 7048 $count -= $property->table($default_map)->count; 7049 } 7050 elsif (defined $default_map) { 7051 7052 # But for $STRING properties, must calculate now. Subtract the 7053 # count from each range that maps to the default. 7054 foreach my $range ($self->_range_list->ranges) { 7055 if ($range->value eq $default_map) { 7056 $count -= $range->end +1 - $range->start; 7057 } 7058 } 7059 7060 } 7061 7062 # Get a string version of $count with underscores in large numbers, 7063 # for clarity. 7064 my $string_count = main::clarify_code_point_count($count); 7065 7066 my $code_points = ($count == 1) 7067 ? 'single code point' 7068 : "$string_count code points"; 7069 7070 my $mapping; 7071 my $these_mappings; 7072 my $are; 7073 if (@property_aliases <= 1) { 7074 $mapping = 'mapping'; 7075 $these_mappings = 'this mapping'; 7076 $are = 'is' 7077 } 7078 else { 7079 $mapping = 'synonymous mappings'; 7080 $these_mappings = 'these mappings'; 7081 $are = 'are' 7082 } 7083 my $cp; 7084 if ($count >= $MAX_UNICODE_CODEPOINTS) { 7085 $cp = "any code point in Unicode Version $string_version"; 7086 } 7087 else { 7088 my $map_to; 7089 if ($default_map eq "") { 7090 $map_to = 'the null string'; 7091 } 7092 elsif ($default_map eq $CODE_POINT) { 7093 $map_to = "itself"; 7094 } 7095 else { 7096 $map_to = "'$default_map'"; 7097 } 7098 if ($count == 1) { 7099 $cp = "the single code point"; 7100 } 7101 else { 7102 $cp = "one of the $code_points"; 7103 } 7104 $cp .= " in Unicode Version $unicode_version for which the mapping is not to $map_to"; 7105 } 7106 7107 my $comment = ""; 7108 7109 my $status = $self->status; 7110 if ($status ne $NORMAL) { 7111 my $warn = uc $status_past_participles{$status}; 7112 $comment .= <<END; 7113 7114!!!!!!! $warn !!!!!!!!!!!!!!!!!!! 7115 All property or property=value combinations contained in this file are $warn. 7116 See $unicode_reference_url for what this means. 7117 7118END 7119 } 7120 $comment .= "This file returns the $mapping:\n"; 7121 7122 my $ucd_accessible_name = ""; 7123 my $has_underscore_name = 0; 7124 my $full_name = $self->property->full_name; 7125 for my $i (0 .. @property_aliases - 1) { 7126 my $name = $property_aliases[$i]->name; 7127 $has_underscore_name = 1 if $name =~ /^_/; 7128 $comment .= sprintf("%-8s%s\n", " ", $name . '(cp)'); 7129 if ($property_aliases[$i]->ucd) { 7130 if ($name eq $full_name) { 7131 $ucd_accessible_name = $full_name; 7132 } 7133 elsif (! $ucd_accessible_name) { 7134 $ucd_accessible_name = $name; 7135 } 7136 } 7137 } 7138 $comment .= "\nwhere 'cp' is $cp."; 7139 if ($ucd_accessible_name) { 7140 $comment .= " Note that $these_mappings"; 7141 if ($has_underscore_name) { 7142 $comment .= " (except for the one(s) that begin with an underscore)"; 7143 } 7144 $comment .= " $are accessible via the functions prop_invmap('$full_name') or charprop() in Unicode::UCD"; 7145 7146 } 7147 7148 # And append any commentary already set from the actual property. 7149 $comment .= "\n\n" . $self->comment if $self->comment; 7150 if ($self->description) { 7151 $comment .= "\n\n" . join " ", $self->description; 7152 } 7153 if ($self->note) { 7154 $comment .= "\n\n" . join " ", $self->note; 7155 } 7156 $comment .= "\n"; 7157 7158 if (! $self->perl_extension) { 7159 $comment .= <<END; 7160 7161For information about what this property really means, see: 7162$unicode_reference_url 7163END 7164 } 7165 7166 if ($count) { # Format differs for empty table 7167 $comment.= "\nThe format of the "; 7168 if ($self->range_size_1) { 7169 $comment.= <<END; 7170main body of lines of this file is: CODE_POINT\\t\\tMAPPING where CODE_POINT 7171is in hex; MAPPING is what CODE_POINT maps to. 7172END 7173 } 7174 else { 7175 7176 # There are tables which end up only having one element per 7177 # range, but it is not worth keeping track of for making just 7178 # this comment a little better. 7179 $comment .= <<END; 7180non-comment portions of the main body of lines of this file is: 7181START\\tSTOP\\tMAPPING where START is the starting code point of the 7182range, in hex; STOP is the ending point, or if omitted, the range has just one 7183code point; MAPPING is what each code point between START and STOP maps to. 7184END 7185 if ($self->output_range_counts) { 7186 $comment .= <<END; 7187Numbers in comments in [brackets] indicate how many code points are in the 7188range (omitted when the range is a single code point or if the mapping is to 7189the null string). 7190END 7191 } 7192 } 7193 } 7194 $self->set_comment(main::join_lines($comment)); 7195 return; 7196 } 7197 7198 my %swash_keys; # Makes sure don't duplicate swash names. 7199 7200 # The remaining variables are temporaries used while writing each table, 7201 # to output special ranges. 7202 my @multi_code_point_maps; # Map is to more than one code point. 7203 7204 sub handle_special_range($self, $range) { 7205 # Called in the middle of write when it finds a range it doesn't know 7206 # how to handle. 7207 7208 my $addr = do { no overloading; pack 'J', $self; }; 7209 7210 my $type = $range->type; 7211 7212 my $low = $range->start; 7213 my $high = $range->end; 7214 my $map = $range->value; 7215 7216 # No need to output the range if it maps to the default. 7217 return if $map eq $default_map{$addr}; 7218 7219 my $property = $self->property; 7220 7221 # Switch based on the map type... 7222 if ($type == $HANGUL_SYLLABLE) { 7223 7224 # These are entirely algorithmically determinable based on 7225 # some constants furnished by Unicode; for now, just set a 7226 # flag to indicate that have them. After everything is figured 7227 # out, we will output the code that does the algorithm. (Don't 7228 # output them if not needed because we are suppressing this 7229 # property.) 7230 $has_hangul_syllables = 1 if $property->to_output_map; 7231 } 7232 elsif ($type == $CP_IN_NAME) { 7233 7234 # Code points whose name ends in their code point are also 7235 # algorithmically determinable, but need information about the map 7236 # to do so. Both the map and its inverse are stored in data 7237 # structures output in the file. They are stored in the mean time 7238 # in global lists The lists will be written out later into Name.pm, 7239 # which is created only if needed. In order to prevent duplicates 7240 # in the list, only add to them for one property, should multiple 7241 # ones need them. 7242 if ($needing_code_points_ending_in_code_point == 0) { 7243 $needing_code_points_ending_in_code_point = $property; 7244 } 7245 if ($property == $needing_code_points_ending_in_code_point) { 7246 push @{$names_ending_in_code_point{$map}->{'low'}}, $low; 7247 push @{$names_ending_in_code_point{$map}->{'high'}}, $high; 7248 7249 my $squeezed = $map =~ s/[-\s]+//gr; 7250 push @{$loose_names_ending_in_code_point{$squeezed}->{'low'}}, 7251 $low; 7252 push @{$loose_names_ending_in_code_point{$squeezed}->{'high'}}, 7253 $high; 7254 7255 # Calculate the set of legal characters in names of this 7256 # series. It includes every character in the name prefix. 7257 my %legal; 7258 $legal{$_} = 1 for split //, $map; 7259 7260 # Plus the hex code point chars, blank, and minus. Also \n 7261 # can show up as being required due to anchoring 7262 for my $i ('0' .. '9', 'A' .. 'F', '-', ' ', "\n") { 7263 $legal{$i} = 1; 7264 } 7265 my $legal = join "", sort { $a cmp $b } keys %legal; 7266 7267 # The legal chars can be used in match optimizations 7268 push @code_points_ending_in_code_point, { low => $low, 7269 high => $high, 7270 name => $map, 7271 legal => $legal, 7272 }; 7273 } 7274 } 7275 elsif ($range->type == $MULTI_CP || $range->type == $NULL) { 7276 7277 # Multi-code point maps and null string maps have an entry 7278 # for each code point in the range. They use the same 7279 # output format. 7280 for my $code_point ($low .. $high) { 7281 7282 # The pack() below can't cope with surrogates. XXX This may 7283 # no longer be true 7284 if ($code_point >= 0xD800 && $code_point <= 0xDFFF) { 7285 Carp::my_carp("Surrogate code point '$code_point' in mapping to '$map' in $self. No map created"); 7286 next; 7287 } 7288 7289 # Generate the hash entries for these in the form that 7290 # utf8.c understands. 7291 my $tostr = ""; 7292 my $to_name = ""; 7293 my $to_chr = ""; 7294 foreach my $to (split " ", $map) { 7295 if ($to !~ /^$code_point_re$/) { 7296 Carp::my_carp("Illegal code point '$to' in mapping '$map' from $code_point in $self. No map created"); 7297 next; 7298 } 7299 $tostr .= sprintf "\\x{%s}", $to; 7300 $to = CORE::hex $to; 7301 if ($annotate) { 7302 $to_name .= " + " if $to_name; 7303 $to_chr .= main::display_chr($to); 7304 main::populate_char_info($to) 7305 if ! defined $viacode[$to]; 7306 $to_name .= $viacode[$to]; 7307 } 7308 } 7309 7310 # The unpack yields a list of the bytes that comprise the 7311 # UTF-8 of $code_point, which are each placed in \xZZ format 7312 # and output in the %s to map to $tostr, so the result looks 7313 # like: 7314 # "\xC4\xB0" => "\x{0069}\x{0307}", 7315 my $utf8 = sprintf(qq["%s" => "$tostr",], 7316 join("", map { sprintf "\\x%02X", $_ } 7317 unpack("U0C*", chr $code_point))); 7318 7319 # Add a comment so that a human reader can more easily 7320 # see what's going on. 7321 push @multi_code_point_maps, 7322 sprintf("%-45s # U+%04X", $utf8, $code_point); 7323 if (! $annotate) { 7324 $multi_code_point_maps[-1] .= " => $map"; 7325 } 7326 else { 7327 main::populate_char_info($code_point) 7328 if ! defined $viacode[$code_point]; 7329 $multi_code_point_maps[-1] .= " '" 7330 . main::display_chr($code_point) 7331 . "' => '$to_chr'; $viacode[$code_point] => $to_name"; 7332 } 7333 } 7334 } 7335 else { 7336 Carp::my_carp("Unrecognized map type '$range->type' in '$range' in $self. Not written"); 7337 } 7338 7339 return; 7340 } 7341 7342 sub pre_body($self) { 7343 # Returns the string that should be output in the file before the main 7344 # body of this table. It isn't called until the main body is 7345 # calculated, saving a pass. The string includes some hash entries 7346 # identifying the format of the body, and what the single value should 7347 # be for all ranges missing from it. It also includes any code points 7348 # which have map_types that don't go in the main table. 7349 7350 my $addr = do { no overloading; pack 'J', $self; }; 7351 7352 my $name = $self->property->swash_name; 7353 7354 # Currently there is nothing in the pre_body unless a swash is being 7355 # generated. 7356 return unless defined $name; 7357 7358 if (defined $swash_keys{$name}) { 7359 Carp::my_carp(main::join_lines(<<END 7360Already created a swash name '$name' for $swash_keys{$name}. This means that 7361the same name desired for $self shouldn't be used. Bad News. This must be 7362fixed before production use, but proceeding anyway 7363END 7364 )); 7365 } 7366 $swash_keys{$name} = "$self"; 7367 7368 my $pre_body = ""; 7369 7370 # Here we assume we were called after have gone through the whole 7371 # file. If we actually generated anything for each map type, add its 7372 # respective header and trailer 7373 my $specials_name = ""; 7374 if (@multi_code_point_maps) { 7375 $specials_name = "Unicode::UCD::ToSpec$name"; 7376 $pre_body .= <<END; 7377 7378# Some code points require special handling because their mappings are each to 7379# multiple code points. These do not appear in the main body, but are defined 7380# in the hash below. 7381 7382# Each key is the string of N bytes that together make up the UTF-8 encoding 7383# for the code point. (i.e. the same as looking at the code point's UTF-8 7384# under "use bytes"). Each value is the UTF-8 of the translation, for speed. 7385\%$specials_name = ( 7386END 7387 $pre_body .= join("\n", @multi_code_point_maps) . "\n);\n"; 7388 } 7389 7390 my $format = $self->format; 7391 7392 my $return = ""; 7393 7394 my $output_adjusted = ($self->to_output_map == $OUTPUT_ADJUSTED); 7395 if ($output_adjusted) { 7396 if ($specials_name) { 7397 $return .= <<END; 7398# The mappings in the non-hash portion of this file must be modified to get the 7399# correct values by adding the code point ordinal number to each one that is 7400# numeric. 7401END 7402 } 7403 else { 7404 $return .= <<END; 7405# The mappings must be modified to get the correct values by adding the code 7406# point ordinal number to each one that is numeric. 7407END 7408 } 7409 } 7410 7411 $return .= <<END; 7412 7413# The name this table is to be known by, with the format of the mappings in 7414# the main body of the table, and what all code points missing from this file 7415# map to. 7416\$Unicode::UCD::SwashInfo{'To$name'}{'format'} = '$format'; # $map_table_formats{$format} 7417END 7418 if ($specials_name) { 7419 $return .= <<END; 7420\$Unicode::UCD::SwashInfo{'To$name'}{'specials_name'} = '$specials_name'; # Name of hash of special mappings 7421END 7422 } 7423 my $default_map = $default_map{$addr}; 7424 7425 # For $CODE_POINT default maps and using adjustments, instead the default 7426 # becomes zero. 7427 $return .= "\$Unicode::UCD::SwashInfo{'To$name'}{'missing'} = '" 7428 . (($output_adjusted && $default_map eq $CODE_POINT) 7429 ? "0" 7430 : $default_map) 7431 . "';"; 7432 7433 if ($default_map eq $CODE_POINT) { 7434 $return .= ' # code point maps to itself'; 7435 } 7436 elsif ($default_map eq "") { 7437 $return .= ' # code point maps to the null string'; 7438 } 7439 $return .= "\n"; 7440 7441 $return .= $pre_body; 7442 7443 return $return; 7444 } 7445 7446 sub write($self) { 7447 # Write the table to the file. 7448 7449 my $addr = do { no overloading; pack 'J', $self; }; 7450 7451 # Clear the temporaries 7452 undef @multi_code_point_maps; 7453 7454 # Calculate the format of the table if not already done. 7455 my $format = $self->format; 7456 my $type = $self->property->type; 7457 my $default_map = $self->default_map; 7458 if (! defined $format) { 7459 if ($type == $BINARY) { 7460 7461 # Don't bother checking the values, because we elsewhere 7462 # verify that a binary table has only 2 values. 7463 $format = $BINARY_FORMAT; 7464 } 7465 else { 7466 my @ranges = $self->_range_list->ranges; 7467 7468 # default an empty table based on its type and default map 7469 if (! @ranges) { 7470 7471 # But it turns out that the only one we can say is a 7472 # non-string (besides binary, handled above) is when the 7473 # table is a string and the default map is to a code point 7474 if ($type == $STRING && $default_map eq $CODE_POINT) { 7475 $format = $HEX_FORMAT; 7476 } 7477 else { 7478 $format = $STRING_FORMAT; 7479 } 7480 } 7481 else { 7482 7483 # Start with the most restrictive format, and as we find 7484 # something that doesn't fit with that, change to the next 7485 # most restrictive, and so on. 7486 $format = $DECIMAL_FORMAT; 7487 foreach my $range (@ranges) { 7488 next if $range->type != 0; # Non-normal ranges don't 7489 # affect the main body 7490 my $map = $range->value; 7491 if ($map ne $default_map) { 7492 last if $format eq $STRING_FORMAT; # already at 7493 # least 7494 # restrictive 7495 $format = $INTEGER_FORMAT 7496 if $format eq $DECIMAL_FORMAT 7497 && $map !~ / ^ [0-9] $ /x; 7498 $format = $FLOAT_FORMAT 7499 if $format eq $INTEGER_FORMAT 7500 && $map !~ / ^ -? [0-9]+ $ /x; 7501 $format = $RATIONAL_FORMAT 7502 if $format eq $FLOAT_FORMAT 7503 && $map !~ / ^ -? [0-9]+ \. [0-9]* $ /x; 7504 $format = $HEX_FORMAT 7505 if ($format eq $RATIONAL_FORMAT 7506 && $map !~ 7507 m/ ^ -? [0-9]+ ( \/ [0-9]+ )? $ /x) 7508 # Assume a leading zero means hex, 7509 # even if all digits are 0-9 7510 || ($format eq $INTEGER_FORMAT 7511 && $map =~ /^0[0-9A-F]/); 7512 $format = $STRING_FORMAT if $format eq $HEX_FORMAT 7513 && $map =~ /[^0-9A-F]/; 7514 } 7515 } 7516 } 7517 } 7518 } # end of calculating format 7519 7520 if ($default_map eq $CODE_POINT 7521 && $format ne $HEX_FORMAT 7522 && ! defined $self->format) # manual settings are always 7523 # considered ok 7524 { 7525 Carp::my_carp_bug("Expecting hex format for mapping table for $self, instead got '$format'") 7526 } 7527 7528 # If the output is to be adjusted, the format of the table that gets 7529 # output is actually 'a' or 'ax' instead of whatever it is stored 7530 # internally as. 7531 my $output_adjusted = ($self->to_output_map == $OUTPUT_ADJUSTED); 7532 if ($output_adjusted) { 7533 if ($default_map eq $CODE_POINT) { 7534 $format = $HEX_ADJUST_FORMAT; 7535 } 7536 else { 7537 $format = $ADJUST_FORMAT; 7538 } 7539 } 7540 7541 $self->_set_format($format); 7542 7543 return $self->SUPER::write( 7544 $output_adjusted, 7545 $default_map); # don't write defaulteds 7546 } 7547 7548 # Accessors for the underlying list that should fail if locked. 7549 for my $sub (qw( 7550 add_duplicate 7551 replace_map 7552 )) 7553 { 7554 no strict "refs"; 7555 *$sub = sub { 7556 use strict "refs"; 7557 my $self = shift; 7558 7559 return if $self->carp_if_locked; 7560 return $self->_range_list->$sub(@_); 7561 } 7562 } 7563} # End closure for Map_Table 7564 7565package Match_Table; 7566use parent '-norequire', '_Base_Table'; 7567 7568# A Match table is one which is a list of all the code points that have 7569# the same property and property value, for use in \p{property=value} 7570# constructs in regular expressions. It adds very little data to the base 7571# structure, but many methods, as these lists can be combined in many ways to 7572# form new ones. 7573# There are only a few concepts added: 7574# 1) Equivalents and Relatedness. 7575# Two tables can match the identical code points, but have different names. 7576# This always happens when there is a perl single form extension 7577# \p{IsProperty} for the Unicode compound form \P{Property=True}. The two 7578# tables are set to be related, with the Perl extension being a child, and 7579# the Unicode property being the parent. 7580# 7581# It may be that two tables match the identical code points and we don't 7582# know if they are related or not. This happens most frequently when the 7583# Block and Script properties have the exact range. But note that a 7584# revision to Unicode could add new code points to the script, which would 7585# now have to be in a different block (as the block was filled, or there 7586# would have been 'Unknown' script code points in it and they wouldn't have 7587# been identical). So we can't rely on any two properties from Unicode 7588# always matching the same code points from release to release, and thus 7589# these tables are considered coincidentally equivalent--not related. When 7590# two tables are unrelated but equivalent, one is arbitrarily chosen as the 7591# 'leader', and the others are 'equivalents'. This concept is useful 7592# to minimize the number of tables written out. Only one file is used for 7593# any identical set of code points, with entries in UCD.pl mapping all 7594# the involved tables to it. 7595# 7596# Related tables will always be identical; we set them up to be so. Thus 7597# if the Unicode one is deprecated, the Perl one will be too. Not so for 7598# unrelated tables. Relatedness makes generating the documentation easier. 7599# 7600# 2) Complement. 7601# Like equivalents, two tables may be the inverses of each other, the 7602# intersection between them is null, and the union is every Unicode code 7603# point. The two tables that occupy a binary property are necessarily like 7604# this. By specifying one table as the complement of another, we can avoid 7605# storing it on disk (using the other table and performing a fast 7606# transform), and some memory and calculations. 7607# 7608# 3) Conflicting. It may be that there will eventually be name clashes, with 7609# the same name meaning different things. For a while, there actually were 7610# conflicts, but they have so far been resolved by changing Perl's or 7611# Unicode's definitions to match the other, but when this code was written, 7612# it wasn't clear that that was what was going to happen. (Unicode changed 7613# because of protests during their beta period.) Name clashes are warned 7614# about during compilation, and the documentation. The generated tables 7615# are sane, free of name clashes, because the code suppresses the Perl 7616# version. But manual intervention to decide what the actual behavior 7617# should be may be required should this happen. The introductory comments 7618# have more to say about this. 7619# 7620# 4) Definition. This is a string for human consumption that specifies the 7621# code points that this table matches. This is used only for the generated 7622# pod file. It may be specified explicitly, or automatically computed. 7623# Only the first portion of complicated definitions is computed and 7624# displayed. 7625 7626sub standardize { return main::standardize($_[0]); } 7627sub trace { return main::trace(@_); } 7628 7629 7630{ # Closure 7631 7632 main::setup_package(); 7633 7634 my %leader; 7635 # The leader table of this one; initially $self. 7636 main::set_access('leader', \%leader, 'r'); 7637 7638 my %equivalents; 7639 # An array of any tables that have this one as their leader 7640 main::set_access('equivalents', \%equivalents, 'readable_array'); 7641 7642 my %parent; 7643 # The parent table to this one, initially $self. This allows us to 7644 # distinguish between equivalent tables that are related (for which this 7645 # is set to), and those which may not be, but share the same output file 7646 # because they match the exact same set of code points in the current 7647 # Unicode release. 7648 main::set_access('parent', \%parent, 'r'); 7649 7650 my %children; 7651 # An array of any tables that have this one as their parent 7652 main::set_access('children', \%children, 'readable_array'); 7653 7654 my %conflicting; 7655 # Array of any tables that would have the same name as this one with 7656 # a different meaning. This is used for the generated documentation. 7657 main::set_access('conflicting', \%conflicting, 'readable_array'); 7658 7659 my %matches_all; 7660 # Set in the constructor for tables that are expected to match all code 7661 # points. 7662 main::set_access('matches_all', \%matches_all, 'r'); 7663 7664 my %complement; 7665 # Points to the complement that this table is expressed in terms of; 0 if 7666 # none. 7667 main::set_access('complement', \%complement, 'r'); 7668 7669 my %definition; 7670 # Human readable string of the first few ranges of code points matched by 7671 # this table 7672 main::set_access('definition', \%definition, 'r', 's'); 7673 7674 sub new { 7675 my $class = shift; 7676 7677 my %args = @_; 7678 7679 # The property for which this table is a listing of property values. 7680 my $property = delete $args{'_Property'}; 7681 7682 my $name = delete $args{'Name'}; 7683 my $full_name = delete $args{'Full_Name'}; 7684 $full_name = $name if ! defined $full_name; 7685 7686 # Optional 7687 my $initialize = delete $args{'Initialize'}; 7688 my $matches_all = delete $args{'Matches_All'} || 0; 7689 my $format = delete $args{'Format'}; 7690 my $definition = delete $args{'Definition'} // ""; 7691 # Rest of parameters passed on. 7692 7693 my $range_list = Range_List->new(Initialize => $initialize, 7694 Owner => $property); 7695 7696 my $complete = $full_name; 7697 $complete = '""' if $complete eq ""; # A null name shouldn't happen, 7698 # but this helps debug if it 7699 # does 7700 # The complete name for a match table includes it's property in a 7701 # compound form 'property=table', except if the property is the 7702 # pseudo-property, perl, in which case it is just the single form, 7703 # 'table' (If you change the '=' must also change the ':' in lots of 7704 # places in this program that assume an equal sign) 7705 $complete = $property->full_name . "=$complete" if $property != $perl; 7706 7707 my $self = $class->SUPER::new(%args, 7708 Name => $name, 7709 Complete_Name => $complete, 7710 Full_Name => $full_name, 7711 _Property => $property, 7712 _Range_List => $range_list, 7713 Format => $EMPTY_FORMAT, 7714 Write_As_Invlist => 1, 7715 ); 7716 my $addr = do { no overloading; pack 'J', $self; }; 7717 7718 $conflicting{$addr} = [ ]; 7719 $equivalents{$addr} = [ ]; 7720 $children{$addr} = [ ]; 7721 $matches_all{$addr} = $matches_all; 7722 $leader{$addr} = $self; 7723 $parent{$addr} = $self; 7724 $complement{$addr} = 0; 7725 $definition{$addr} = $definition; 7726 7727 if (defined $format && $format ne $EMPTY_FORMAT) { 7728 Carp::my_carp_bug("'Format' must be '$EMPTY_FORMAT' in a match table instead of '$format'. Using '$EMPTY_FORMAT'"); 7729 } 7730 7731 return $self; 7732 } 7733 7734 # See this program's beginning comment block about overloading these. 7735 use overload 7736 fallback => 0, 7737 qw("") => "_operator_stringify", 7738 '=' => sub { 7739 my $self = shift; 7740 7741 return if $self->carp_if_locked; 7742 return $self; 7743 }, 7744 7745 '+' => sub { 7746 my $self = shift; 7747 my $other = shift; 7748 7749 return $self->_range_list + $other; 7750 }, 7751 '&' => sub { 7752 my $self = shift; 7753 my $other = shift; 7754 7755 return $self->_range_list & $other; 7756 }, 7757 '+=' => sub { 7758 my $self = shift; 7759 my $other = shift; 7760 my $reversed = shift; 7761 7762 if ($reversed) { 7763 Carp::my_carp_bug("Bad news. Can't cope with '" 7764 . ref($other) 7765 . ' += ' 7766 . ref($self) 7767 . "'. undef returned."); 7768 return; 7769 } 7770 7771 return if $self->carp_if_locked; 7772 7773 my $addr = do { no overloading; pack 'J', $self; }; 7774 7775 if (ref $other) { 7776 7777 # Change the range list of this table to be the 7778 # union of the two. 7779 $self->_set_range_list($self->_range_list 7780 + $other); 7781 } 7782 else { # $other is just a simple value 7783 $self->add_range($other, $other); 7784 } 7785 return $self; 7786 }, 7787 '&=' => sub { 7788 my $self = shift; 7789 my $other = shift; 7790 my $reversed = shift; 7791 7792 if ($reversed) { 7793 Carp::my_carp_bug("Bad news. Can't cope with '" 7794 . ref($other) 7795 . ' &= ' 7796 . ref($self) 7797 . "'. undef returned."); 7798 return; 7799 } 7800 7801 return if $self->carp_if_locked; 7802 $self->_set_range_list($self->_range_list & $other); 7803 return $self; 7804 }, 7805 '-' => sub { my $self = shift; 7806 my $other = shift; 7807 my $reversed = shift; 7808 if ($reversed) { 7809 Carp::my_carp_bug("Bad news. Can't cope with '" 7810 . ref($other) 7811 . ' - ' 7812 . ref($self) 7813 . "'. undef returned."); 7814 return; 7815 } 7816 7817 return $self->_range_list - $other; 7818 }, 7819 '~' => sub { my $self = shift; 7820 return ~ $self->_range_list; 7821 }, 7822 ; 7823 7824 sub _operator_stringify { 7825 my $self = shift; 7826 7827 my $name = $self->complete_name; 7828 return "Table '$name'"; 7829 } 7830 7831 sub _range_list { 7832 # Returns the range list associated with this table, which will be the 7833 # complement's if it has one. 7834 7835 my $self = shift; 7836 my $complement = $self->complement; 7837 7838 # In order to avoid re-complementing on each access, only do the 7839 # complement the first time, and store the result in this table's 7840 # range list to use henceforth. However, this wouldn't work if the 7841 # controlling (complement) table changed after we do this, so lock it. 7842 # Currently, the value of the complement isn't needed until after it 7843 # is fully constructed, so this works. If this were to change, the 7844 # each_range iteration functionality would no longer work on this 7845 # complement. 7846 if ($complement != 0 && $self->SUPER::_range_list->count == 0) { 7847 $self->_set_range_list($self->SUPER::_range_list 7848 + ~ $complement->_range_list); 7849 $complement->lock; 7850 } 7851 7852 return $self->SUPER::_range_list; 7853 } 7854 7855 sub add_alias { 7856 # Add a synonym for this table. See the comments in the base class 7857 7858 my $self = shift; 7859 my $name = shift; 7860 # Rest of parameters passed on. 7861 7862 $self->SUPER::add_alias($name, $self, @_); 7863 return; 7864 } 7865 7866 sub add_conflicting { 7867 # Add the name of some other object to the list of ones that name 7868 # clash with this match table. 7869 7870 my $self = shift; 7871 my $conflicting_name = shift; # The name of the conflicting object 7872 my $p = shift || 'p'; # Optional, is this a \p{} or \P{} ? 7873 my $conflicting_object = shift; # Optional, the conflicting object 7874 # itself. This is used to 7875 # disambiguate the text if the input 7876 # name is identical to any of the 7877 # aliases $self is known by. 7878 # Sometimes the conflicting object is 7879 # merely hypothetical, so this has to 7880 # be an optional parameter. 7881 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 7882 7883 my $addr = do { no overloading; pack 'J', $self; }; 7884 7885 # Check if the conflicting name is exactly the same as any existing 7886 # alias in this table (as long as there is a real object there to 7887 # disambiguate with). 7888 if (defined $conflicting_object) { 7889 foreach my $alias ($self->aliases) { 7890 if (standardize($alias->name) eq standardize($conflicting_name)) { 7891 7892 # Here, there is an exact match. This results in 7893 # ambiguous comments, so disambiguate by changing the 7894 # conflicting name to its object's complete equivalent. 7895 $conflicting_name = $conflicting_object->complete_name; 7896 last; 7897 } 7898 } 7899 } 7900 7901 # Convert to the \p{...} final name 7902 $conflicting_name = "\\$p" . "{$conflicting_name}"; 7903 7904 # Only add once 7905 return if grep { $conflicting_name eq $_ } @{$conflicting{$addr}}; 7906 7907 push @{$conflicting{$addr}}, $conflicting_name; 7908 7909 return; 7910 } 7911 7912 sub is_set_equivalent_to($self, $other=undef) { 7913 # Return boolean of whether or not the other object is a table of this 7914 # type and has been marked equivalent to this one. 7915 7916 return 0 if ! defined $other; # Can happen for incomplete early 7917 # releases 7918 unless ($other->isa(__PACKAGE__)) { 7919 my $ref_other = ref $other; 7920 my $ref_self = ref $self; 7921 Carp::my_carp_bug("Argument to 'is_set_equivalent_to' must be another $ref_self, not a '$ref_other'. $other not set equivalent to $self."); 7922 return 0; 7923 } 7924 7925 # Two tables are equivalent if they have the same leader. 7926 no overloading; 7927 return $leader{pack 'J', $self} == $leader{pack 'J', $other}; 7928 return; 7929 } 7930 7931 sub set_equivalent_to { 7932 # Set $self equivalent to the parameter table. 7933 # The required Related => 'x' parameter is a boolean indicating 7934 # whether these tables are related or not. If related, $other becomes 7935 # the 'parent' of $self; if unrelated it becomes the 'leader' 7936 # 7937 # Related tables share all characteristics except names; equivalents 7938 # not quite so many. 7939 # If they are related, one must be a perl extension. This is because 7940 # we can't guarantee that Unicode won't change one or the other in a 7941 # later release even if they are identical now. 7942 7943 my $self = shift; 7944 my $other = shift; 7945 7946 my %args = @_; 7947 my $related = delete $args{'Related'}; 7948 7949 Carp::carp_extra_args(\%args) if main::DEBUG && %args; 7950 7951 return if ! defined $other; # Keep on going; happens in some early 7952 # Unicode releases. 7953 7954 if (! defined $related) { 7955 Carp::my_carp_bug("set_equivalent_to must have 'Related => [01] parameter. Assuming $self is not related to $other"); 7956 $related = 0; 7957 } 7958 7959 # If already are equivalent, no need to re-do it; if subroutine 7960 # returns null, it found an error, also do nothing 7961 my $are_equivalent = $self->is_set_equivalent_to($other); 7962 return if ! defined $are_equivalent || $are_equivalent; 7963 7964 my $addr = do { no overloading; pack 'J', $self; }; 7965 my $current_leader = ($related) ? $parent{$addr} : $leader{$addr}; 7966 7967 if ($related) { 7968 if ($current_leader->perl_extension) { 7969 if ($other->perl_extension) { 7970 Carp::my_carp_bug("Use add_alias() to set two Perl tables '$self' and '$other', equivalent."); 7971 return; 7972 } 7973 } elsif ($self->property != $other->property # Depending on 7974 # situation, might 7975 # be better to use 7976 # add_alias() 7977 # instead for same 7978 # property 7979 && ! $other->perl_extension 7980 7981 # We allow the sc and scx properties to be marked as 7982 # related. They are in fact related, and this allows 7983 # the pod to show that better. This test isn't valid 7984 # if this is an early Unicode release without the scx 7985 # property (having that also implies the sc property 7986 # exists, so don't have to test for no 'sc') 7987 && ( ! defined $scx 7988 && ! ( ( $self->property == $script 7989 || $self->property == $scx) 7990 && ( $self->property == $script 7991 || $self->property == $scx)))) 7992 { 7993 Carp::my_carp_bug("set_equivalent_to should have 'Related => 0 for equivalencing two Unicode properties. Assuming $self is not related to $other"); 7994 $related = 0; 7995 } 7996 } 7997 7998 if (! $self->is_empty && ! $self->matches_identically_to($other)) { 7999 Carp::my_carp_bug("$self should be empty or match identically to $other. Not setting equivalent"); 8000 return; 8001 } 8002 8003 my $leader = do { no overloading; pack 'J', $current_leader; }; 8004 my $other_addr = do { no overloading; pack 'J', $other; }; 8005 8006 # Any tables that are equivalent to or children of this table must now 8007 # instead be equivalent to or (children) to the new leader (parent), 8008 # still equivalent. The equivalency includes their matches_all info, 8009 # and for related tables, their fate and status. 8010 # All related tables are of necessity equivalent, but the converse 8011 # isn't necessarily true 8012 my $status = $other->status; 8013 my $status_info = $other->status_info; 8014 my $fate = $other->fate; 8015 my $matches_all = $matches_all{other_addr}; 8016 my $caseless_equivalent = $other->caseless_equivalent; 8017 foreach my $table ($current_leader, @{$equivalents{$leader}}) { 8018 next if $table == $other; 8019 trace "setting $other to be the leader of $table, status=$status" if main::DEBUG && $to_trace; 8020 8021 my $table_addr = do { no overloading; pack 'J', $table; }; 8022 $leader{$table_addr} = $other; 8023 $matches_all{$table_addr} = $matches_all; 8024 $self->_set_range_list($other->_range_list); 8025 push @{$equivalents{$other_addr}}, $table; 8026 if ($related) { 8027 $parent{$table_addr} = $other; 8028 push @{$children{$other_addr}}, $table; 8029 $table->set_status($status, $status_info); 8030 8031 # This reason currently doesn't get exposed outside; otherwise 8032 # would have to look up the parent's reason and use it instead. 8033 $table->set_fate($fate, "Parent's fate"); 8034 8035 $self->set_caseless_equivalent($caseless_equivalent); 8036 } 8037 } 8038 8039 # Now that we've declared these to be equivalent, any changes to one 8040 # of the tables would invalidate that equivalency. 8041 $self->lock; 8042 $other->lock; 8043 return; 8044 } 8045 8046 sub set_complement($self, $other) { 8047 # Set $self to be the complement of the parameter table. $self is 8048 # locked, as what it contains should all come from the other table. 8049 8050 if ($other->complement != 0) { 8051 Carp::my_carp_bug("Can't set $self to be the complement of $other, which itself is the complement of " . $other->complement); 8052 return; 8053 } 8054 my $addr = do { no overloading; pack 'J', $self; }; 8055 $complement{$addr} = $other; 8056 8057 # Be sure the other property knows we are depending on them; or the 8058 # other table if it is one in the current property. 8059 if ($self->property != $other->property) { 8060 $other->property->set_has_dependency(1); 8061 } 8062 else { 8063 $other->set_has_dependency(1); 8064 } 8065 $self->lock; 8066 return; 8067 } 8068 8069 sub add_range($self, @range) { # Add a range to the list for this table. 8070 # Rest of parameters passed on 8071 8072 return if $self->carp_if_locked; 8073 return $self->_range_list->add_range(@range); 8074 } 8075 8076 sub header($self) { 8077 # All match tables are to be used only by the Perl core. 8078 return $self->SUPER::header() . $INTERNAL_ONLY_HEADER; 8079 } 8080 8081 sub pre_body { # Does nothing for match tables. 8082 return 8083 } 8084 8085 sub append_to_body { # Does nothing for match tables. 8086 return 8087 } 8088 8089 sub set_fate($self, $fate, $reason=undef) { 8090 $self->SUPER::set_fate($fate, $reason); 8091 8092 # All children share this fate 8093 foreach my $child ($self->children) { 8094 $child->set_fate($fate, $reason); 8095 } 8096 return; 8097 } 8098 8099 sub calculate_table_definition 8100 { 8101 # Returns a human-readable string showing some or all of the code 8102 # points matched by this table. The string will include a 8103 # bracketed-character class for all characters matched in the 00-FF 8104 # range, and the first few ranges matched beyond that. 8105 my $max_ranges = 6; 8106 8107 my $self = shift; 8108 my $definition = $self->definition || ""; 8109 8110 # Skip this if already have a definition. 8111 return $definition if $definition; 8112 8113 my $lows_string = ""; # The string representation of the 0-FF 8114 # characters 8115 my $string_range = ""; # The string rep. of the above FF ranges 8116 my $range_count = 0; # How many ranges in $string_rage 8117 8118 my @lows_invlist; # The inversion list of the 0-FF code points 8119 my $first_non_control = ord(" "); # Everything below this is a 8120 # control, on ASCII or EBCDIC 8121 my $max_table_code_point = $self->max; 8122 8123 # On ASCII platforms, the range 80-FF contains no printables. 8124 my $highest_printable = ((main::NON_ASCII_PLATFORM) ? 255 : 126); 8125 8126 8127 # Look through the first few ranges matched by this table. 8128 $self->reset_each_range; # Defensive programming 8129 while (defined (my $range = $self->each_range())) { 8130 my $start = $range->start; 8131 my $end = $range->end; 8132 8133 # Accumulate an inversion list of the 00-FF code points 8134 if ($start < 256 && ($start > 0 || $end < 256)) { 8135 push @lows_invlist, $start; 8136 push @lows_invlist, 1 + (($end < 256) ? $end : 255); 8137 8138 # Get next range if there are more ranges below 256 8139 next if $end < 256 && $end < $max_table_code_point; 8140 8141 # If the range straddles the 255/256 boundary, we split it 8142 # there. We already added above the low portion to the 8143 # inversion list 8144 $start = 256 if $end > 256; 8145 } 8146 8147 # Here, @lows_invlist contains the code points below 256, and 8148 # there is no other range, or the current one starts at or above 8149 # 256. Generate the [char class] for the 0-255 ones. 8150 while (@lows_invlist) { 8151 8152 # If this range (necessarily the first one, by the way) starts 8153 # at 0 ... 8154 if ($lows_invlist[0] == 0) { 8155 8156 # If it ends within the block of controls, that means that 8157 # some controls are in it and some aren't. Since Unicode 8158 # properties pretty much only know about a few of the 8159 # controls, like \n, \t, this means that its one of them 8160 # that isn't in the range. Complement the inversion list 8161 # which will likely cause these to be output using their 8162 # mnemonics, hence being clearer. 8163 if ($lows_invlist[1] < $first_non_control) { 8164 $lows_string .= '^'; 8165 shift @lows_invlist; 8166 push @lows_invlist, 256; 8167 } 8168 elsif ($lows_invlist[1] <= $highest_printable) { 8169 8170 # Here, it extends into the printables block. Split 8171 # into two ranges so that the controls are separate. 8172 $lows_string .= sprintf "\\x00-\\x%02x", 8173 $first_non_control - 1; 8174 $lows_invlist[0] = $first_non_control; 8175 } 8176 } 8177 8178 # If the range completely contains the printables, don't 8179 # individually spell out the printables. 8180 if ( $lows_invlist[0] <= $first_non_control 8181 && $lows_invlist[1] > $highest_printable) 8182 { 8183 $lows_string .= sprintf "\\x%02x-\\x%02x", 8184 $lows_invlist[0], $lows_invlist[1] - 1; 8185 shift @lows_invlist; 8186 shift @lows_invlist; 8187 next; 8188 } 8189 8190 # Here, the range may include some but not all printables. 8191 # Look at each one individually 8192 foreach my $ord (shift @lows_invlist .. shift(@lows_invlist) - 1) { 8193 my $char = chr $ord; 8194 8195 # If there is already something in the list, an 8196 # alphanumeric char could be the next in sequence. If so, 8197 # we start or extend a range. That is, we could have so 8198 # far something like 'a-c', and the next char is a 'd', so 8199 # we change it to 'a-d'. We use native_to_unicode() 8200 # because a-z on EBCDIC means 26 chars, and excludes the 8201 # gap ones. 8202 if ($lows_string ne "" && $char =~ /[[:alnum:]]/) { 8203 my $prev = substr($lows_string, -1); 8204 if ( $prev !~ /[[:alnum:]]/ 8205 || utf8::native_to_unicode(ord $prev) + 1 8206 != utf8::native_to_unicode(ord $char)) 8207 { 8208 # Not extending the range 8209 $lows_string .= $char; 8210 } 8211 elsif ( length $lows_string > 1 8212 && substr($lows_string, -2, 1) eq '-') 8213 { 8214 # We had a sequence like '-c' and the current 8215 # character is 'd'. Extend the range. 8216 substr($lows_string, -1, 1) = $char; 8217 } 8218 else { 8219 # We had something like 'd' and this is 'e'. 8220 # Start a range. 8221 $lows_string .= "-$char"; 8222 } 8223 } 8224 elsif ($char =~ /[[:graph:]]/) { 8225 8226 # We output a graphic char as-is, preceded by a 8227 # backslash if it is a metacharacter 8228 $lows_string .= '\\' 8229 if $char =~ /[\\\^\$\@\%\|()\[\]\{\}\-\/"']/; 8230 $lows_string .= $char; 8231 } # Otherwise use mnemonic for any that have them 8232 elsif ($char =~ /[\a]/) { 8233 $lows_string .= '\a'; 8234 } 8235 elsif ($char =~ /[\b]/) { 8236 $lows_string .= '\b'; 8237 } 8238 elsif ($char eq "\e") { 8239 $lows_string .= '\e'; 8240 } 8241 elsif ($char eq "\f") { 8242 $lows_string .= '\f'; 8243 } 8244 elsif ($char eq "\cK") { 8245 $lows_string .= '\cK'; 8246 } 8247 elsif ($char eq "\n") { 8248 $lows_string .= '\n'; 8249 } 8250 elsif ($char eq "\r") { 8251 $lows_string .= '\r'; 8252 } 8253 elsif ($char eq "\t") { 8254 $lows_string .= '\t'; 8255 } 8256 else { 8257 8258 # Here is a non-graphic without a mnemonic. We use \x 8259 # notation. But if the ordinal of this is one above 8260 # the previous, create or extend the range 8261 my $hex_representation = sprintf("%02x", ord $char); 8262 if ( length $lows_string >= 4 8263 && substr($lows_string, -4, 2) eq '\\x' 8264 && hex(substr($lows_string, -2)) + 1 == ord $char) 8265 { 8266 if ( length $lows_string >= 5 8267 && substr($lows_string, -5, 1) eq '-' 8268 && ( length $lows_string == 5 8269 || substr($lows_string, -6, 1) ne '\\')) 8270 { 8271 substr($lows_string, -2) = $hex_representation; 8272 } 8273 else { 8274 $lows_string .= '-\\x' . $hex_representation; 8275 } 8276 } 8277 else { 8278 $lows_string .= '\\x' . $hex_representation; 8279 } 8280 } 8281 } 8282 } 8283 8284 # Done with assembling the string of all lows. If there are only 8285 # lows in the property, are completely done. 8286 if ($max_table_code_point < 256) { 8287 $self->reset_each_range; 8288 last; 8289 } 8290 8291 # Otherwise, quit if reached max number of non-lows ranges. If 8292 # there are lows, count them as one unit towards the maximum. 8293 $range_count++; 8294 if ($range_count > (($lows_string eq "") ? $max_ranges : $max_ranges - 1)) { 8295 $string_range .= " ..."; 8296 $self->reset_each_range; 8297 last; 8298 } 8299 8300 # Otherwise add this range. 8301 $string_range .= ", " if $string_range ne ""; 8302 if ($start == $end) { 8303 $string_range .= sprintf("U+%04X", $start); 8304 } 8305 elsif ($end >= $MAX_WORKING_CODEPOINT) { 8306 $string_range .= sprintf("U+%04X..infinity", $start); 8307 } 8308 else { 8309 $string_range .= sprintf("U+%04X..%04X", 8310 $start, $end); 8311 } 8312 } 8313 8314 # Done with all the ranges we're going to look at. Assemble the 8315 # definition from the lows + non-lows. 8316 8317 if ($lows_string ne "" || $string_range ne "") { 8318 if ($lows_string ne "") { 8319 $definition .= "[$lows_string]"; 8320 $definition .= ", " if $string_range; 8321 } 8322 $definition .= $string_range; 8323 } 8324 8325 return $definition; 8326 } 8327 8328 sub write($self) { 8329 return $self->SUPER::write(0); # No adjustments 8330 } 8331 8332 # $leader - Should only be called on the leader table of an equivalent group 8333 sub set_final_comment($leader) { 8334 # This creates a comment for the file that is to hold the match table 8335 # $self. It is somewhat convoluted to make the English read nicely, 8336 # but, heh, it's just a comment. 8337 # This should be called only with the leader match table of all the 8338 # ones that share the same file. It lists all such tables, ordered so 8339 # that related ones are together. 8340 8341 return unless $debugging_build; 8342 8343 my $addr = do { no overloading; pack 'J', $leader; }; 8344 8345 if ($leader{$addr} != $leader) { 8346 Carp::my_carp_bug(<<END 8347set_final_comment() must be called on a leader table, which $leader is not. 8348It is equivalent to $leader{$addr}. No comment created 8349END 8350 ); 8351 return; 8352 } 8353 8354 # Get the number of code points matched by each of the tables in this 8355 # file, and add underscores for clarity. 8356 my $count = $leader->count; 8357 my $unicode_count; 8358 my $non_unicode_string; 8359 if ($count > $MAX_UNICODE_CODEPOINTS) { 8360 $unicode_count = $count - ($MAX_WORKING_CODEPOINT 8361 - $MAX_UNICODE_CODEPOINT); 8362 $non_unicode_string = "All above-Unicode code points match as well, and are also returned"; 8363 } 8364 else { 8365 $unicode_count = $count; 8366 $non_unicode_string = ""; 8367 } 8368 my $string_count = main::clarify_code_point_count($unicode_count); 8369 8370 my $loose_count = 0; # how many aliases loosely matched 8371 my $compound_name = ""; # ? Are any names compound?, and if so, an 8372 # example 8373 my $properties_with_compound_names = 0; # count of these 8374 8375 8376 my %flags; # The status flags used in the file 8377 my $total_entries = 0; # number of entries written in the comment 8378 my $matches_comment = ""; # The portion of the comment about the 8379 # \p{}'s 8380 my @global_comments; # List of all the tables' comments that are 8381 # there before this routine was called. 8382 my $has_ucd_alias = 0; # If there is an alias that is accessible via 8383 # Unicode::UCD. If not, then don't say it is 8384 # in the comment 8385 8386 # Get list of all the parent tables that are equivalent to this one 8387 # (including itself). 8388 my @parents = grep { $parent{main::objaddr $_} == $_ } 8389 main::uniques($leader, @{$equivalents{$addr}}); 8390 my $has_unrelated = (@parents >= 2); # boolean, ? are there unrelated 8391 # tables 8392 for my $parent (@parents) { 8393 8394 my $property = $parent->property; 8395 8396 # Special case 'N' tables in properties with two match tables when 8397 # the other is a 'Y' one. These are likely to be binary tables, 8398 # but not necessarily. In either case, \P{} will match the 8399 # complement of \p{}, and so if something is a synonym of \p, the 8400 # complement of that something will be the synonym of \P. This 8401 # would be true of any property with just two match tables, not 8402 # just those whose values are Y and N; but that would require a 8403 # little extra work, and there are none such so far in Unicode. 8404 my $perl_p = 'p'; # which is it? \p{} or \P{} 8405 my @yes_perl_synonyms; # list of any synonyms for the 'Y' table 8406 8407 if (scalar $property->tables == 2 8408 && $parent == $property->table('N') 8409 && defined (my $yes = $property->table('Y'))) 8410 { 8411 my $yes_addr = do { no overloading; pack 'J', $yes; }; 8412 @yes_perl_synonyms 8413 = grep { $_->property == $perl } 8414 main::uniques($yes, 8415 $parent{$yes_addr}, 8416 $parent{$yes_addr}->children); 8417 8418 # But these synonyms are \P{} ,not \p{} 8419 $perl_p = 'P'; 8420 } 8421 8422 my @description; # Will hold the table description 8423 my @note; # Will hold the table notes. 8424 my @conflicting; # Will hold the table conflicts. 8425 8426 # Look at the parent, any yes synonyms, and all the children 8427 my $parent_addr = do { no overloading; pack 'J', $parent; }; 8428 for my $table ($parent, 8429 @yes_perl_synonyms, 8430 @{$children{$parent_addr}}) 8431 { 8432 my $table_addr = do { no overloading; pack 'J', $table; }; 8433 my $table_property = $table->property; 8434 8435 # Tables are separated by a blank line to create a grouping. 8436 $matches_comment .= "\n" if $matches_comment; 8437 8438 # The table is named based on the property and value 8439 # combination it is for, like script=greek. But there may be 8440 # a number of synonyms for each side, like 'sc' for 'script', 8441 # and 'grek' for 'greek'. Any combination of these is a valid 8442 # name for this table. In this case, there are three more, 8443 # 'sc=grek', 'sc=greek', and 'script='grek'. Rather than 8444 # listing all possible combinations in the comment, we make 8445 # sure that each synonym occurs at least once, and add 8446 # commentary that the other combinations are possible. 8447 # Because regular expressions don't recognize things like 8448 # \p{jsn=}, only look at non-null right-hand-sides 8449 my @property_aliases = grep { $_->status ne $INTERNAL_ALIAS } $table_property->aliases; 8450 my @table_aliases = grep { $_->name ne "" } $table->aliases; 8451 8452 # The alias lists above are already ordered in the order we 8453 # want to output them. To ensure that each synonym is listed, 8454 # we must use the max of the two numbers. But if there are no 8455 # legal synonyms (nothing in @table_aliases), then we don't 8456 # list anything. 8457 my $listed_combos = (@table_aliases) 8458 ? main::max(scalar @table_aliases, 8459 scalar @property_aliases) 8460 : 0; 8461 trace "$listed_combos, tables=", scalar @table_aliases, "; property names=", scalar @property_aliases if main::DEBUG; 8462 8463 my $property_had_compound_name = 0; 8464 8465 for my $i (0 .. $listed_combos - 1) { 8466 $total_entries++; 8467 8468 # The current alias for the property is the next one on 8469 # the list, or if beyond the end, start over. Similarly 8470 # for the table (\p{prop=table}) 8471 my $property_alias = $property_aliases 8472 [$i % @property_aliases]->name; 8473 my $table_alias_object = $table_aliases 8474 [$i % @table_aliases]; 8475 my $table_alias = $table_alias_object->name; 8476 my $loose_match = $table_alias_object->loose_match; 8477 $has_ucd_alias |= $table_alias_object->ucd; 8478 8479 if ($table_alias !~ /\D/) { # Clarify large numbers. 8480 $table_alias = main::clarify_number($table_alias) 8481 } 8482 8483 # Add a comment for this alias combination 8484 my $current_match_comment; 8485 if ($table_property == $perl) { 8486 $current_match_comment = "\\$perl_p" 8487 . "{$table_alias}"; 8488 } 8489 else { 8490 $current_match_comment 8491 = "\\p{$property_alias=$table_alias}"; 8492 $property_had_compound_name = 1; 8493 } 8494 8495 # Flag any abnormal status for this table. 8496 my $flag = $property->status 8497 || $table->status 8498 || $table_alias_object->status; 8499 if ($flag && $flag ne $PLACEHOLDER) { 8500 $flags{$flag} = $status_past_participles{$flag}; 8501 } 8502 8503 $loose_count++; 8504 8505 # Pretty up the comment. Note the \b; it says don't make 8506 # this line a continuation. 8507 $matches_comment .= sprintf("\b%-1s%-s%s\n", 8508 $flag, 8509 " " x 7, 8510 $current_match_comment); 8511 } # End of generating the entries for this table. 8512 8513 # Save these for output after this group of related tables. 8514 push @description, $table->description; 8515 push @note, $table->note; 8516 push @conflicting, $table->conflicting; 8517 8518 # And this for output after all the tables. 8519 push @global_comments, $table->comment; 8520 8521 # Compute an alternate compound name using the final property 8522 # synonym and the first table synonym with a colon instead of 8523 # the equal sign used elsewhere. 8524 if ($property_had_compound_name) { 8525 $properties_with_compound_names ++; 8526 if (! $compound_name || @property_aliases > 1) { 8527 $compound_name = $property_aliases[-1]->name 8528 . ': ' 8529 . $table_aliases[0]->name; 8530 } 8531 } 8532 } # End of looping through all children of this table 8533 8534 # Here have assembled in $matches_comment all the related tables 8535 # to the current parent (preceded by the same info for all the 8536 # previous parents). Put out information that applies to all of 8537 # the current family. 8538 if (@conflicting) { 8539 8540 # But output the conflicting information now, as it applies to 8541 # just this table. 8542 my $conflicting = join ", ", @conflicting; 8543 if ($conflicting) { 8544 $matches_comment .= <<END; 8545 8546 Note that contrary to what you might expect, the above is NOT the same as 8547END 8548 $matches_comment .= "any of: " if @conflicting > 1; 8549 $matches_comment .= "$conflicting\n"; 8550 } 8551 } 8552 if (@description) { 8553 $matches_comment .= "\n Meaning: " 8554 . join('; ', @description) 8555 . "\n"; 8556 } 8557 if (@note) { 8558 $matches_comment .= "\n Note: " 8559 . join("\n ", @note) 8560 . "\n"; 8561 } 8562 } # End of looping through all tables 8563 8564 $matches_comment .= "\n$non_unicode_string\n" if $non_unicode_string; 8565 8566 8567 my $code_points; 8568 my $match; 8569 my $any_of_these; 8570 if ($unicode_count == 1) { 8571 $match = 'matches'; 8572 $code_points = 'single code point'; 8573 } 8574 else { 8575 $match = 'match'; 8576 $code_points = "$string_count code points"; 8577 } 8578 8579 my $synonyms; 8580 my $entries; 8581 if ($total_entries == 1) { 8582 $synonyms = ""; 8583 $entries = 'entry'; 8584 $any_of_these = 'this' 8585 } 8586 else { 8587 $synonyms = " any of the following regular expression constructs"; 8588 $entries = 'entries'; 8589 $any_of_these = 'any of these' 8590 } 8591 8592 my $comment = ""; 8593 if ($has_ucd_alias) { 8594 $comment .= "Use Unicode::UCD::prop_invlist() to access the contents of this file.\n\n"; 8595 } 8596 if ($has_unrelated) { 8597 $comment .= <<END; 8598This file is for tables that are not necessarily related: To conserve 8599resources, every table that matches the identical set of code points in this 8600version of Unicode uses this file. Each one is listed in a separate group 8601below. It could be that the tables will match the same set of code points in 8602other Unicode releases, or it could be purely coincidence that they happen to 8603be the same in Unicode $unicode_version, and hence may not in other versions. 8604 8605END 8606 } 8607 8608 if (%flags) { 8609 foreach my $flag (sort keys %flags) { 8610 $comment .= <<END; 8611'$flag' below means that this form is $flags{$flag}. 8612END 8613 if ($flag eq $INTERNAL_ALIAS) { 8614 $comment .= "DO NOT USE!!!"; 8615 } 8616 else { 8617 $comment .= "Consult $pod_file.pod"; 8618 } 8619 $comment .= "\n"; 8620 } 8621 $comment .= "\n"; 8622 } 8623 8624 if ($total_entries == 0) { 8625 Carp::my_carp("No regular expression construct can match $leader, as all names for it are the null string. Creating file anyway."); 8626 $comment .= <<END; 8627This file returns the $code_points in Unicode Version 8628$unicode_version for 8629$leader, but it is inaccessible through Perl regular expressions, as 8630"\\p{prop=}" is not recognized. 8631END 8632 8633 } else { 8634 $comment .= <<END; 8635This file returns the $code_points in Unicode Version 8636$unicode_version that 8637$match$synonyms: 8638 8639$matches_comment 8640$pod_file.pod should be consulted for the syntax rules for $any_of_these, 8641including if adding or subtracting white space, underscore, and hyphen 8642characters matters or doesn't matter, and other permissible syntactic 8643variants. Upper/lower case distinctions never matter. 8644END 8645 8646 } 8647 if ($compound_name) { 8648 $comment .= <<END; 8649 8650A colon can be substituted for the equals sign, and 8651END 8652 if ($properties_with_compound_names > 1) { 8653 $comment .= <<END; 8654within each group above, 8655END 8656 } 8657 $compound_name = sprintf("%-8s\\p{%s}", " ", $compound_name); 8658 8659 # Note the \b below, it says don't make that line a continuation. 8660 $comment .= <<END; 8661anything to the left of the equals (or colon) can be combined with anything to 8662the right. Thus, for example, 8663$compound_name 8664\bis also valid. 8665END 8666 } 8667 8668 # And append any comment(s) from the actual tables. They are all 8669 # gathered here, so may not read all that well. 8670 if (@global_comments) { 8671 $comment .= "\n" . join("\n\n", @global_comments) . "\n"; 8672 } 8673 8674 if ($count) { # The format differs if no code points, and needs no 8675 # explanation in that case 8676 if ($leader->write_as_invlist) { 8677 $comment.= <<END; 8678 8679The first data line of this file begins with the letter V to indicate it is in 8680inversion list format. The number following the V gives the number of lines 8681remaining. Each of those remaining lines is a single number representing the 8682starting code point of a range which goes up to but not including the number 8683on the next line; The 0th, 2nd, 4th... ranges are for code points that match 8684the property; the 1st, 3rd, 5th... are ranges of code points that don't match 8685the property. The final line's range extends to the platform's infinity. 8686END 8687 } 8688 else { 8689 $comment.= <<END; 8690The format of the lines of this file is: 8691START\\tSTOP\\twhere START is the starting code point of the range, in hex; 8692STOP is the ending point, or if omitted, the range has just one code point. 8693END 8694 } 8695 if ($leader->output_range_counts) { 8696 $comment .= <<END; 8697Numbers in comments in [brackets] indicate how many code points are in the 8698range. 8699END 8700 } 8701 } 8702 8703 $leader->set_comment(main::join_lines($comment)); 8704 return; 8705 } 8706 8707 # Accessors for the underlying list 8708 for my $sub (qw( 8709 get_valid_code_point 8710 get_invalid_code_point 8711 )) 8712 { 8713 no strict "refs"; 8714 *$sub = sub { 8715 use strict "refs"; 8716 my $self = shift; 8717 8718 return $self->_range_list->$sub(@_); 8719 } 8720 } 8721} # End closure for Match_Table 8722 8723package Property; 8724 8725# The Property class represents a Unicode property, or the $perl 8726# pseudo-property. It contains a map table initialized empty at construction 8727# time, and for properties accessible through regular expressions, various 8728# match tables, created through the add_match_table() method, and referenced 8729# by the table('NAME') or tables() methods, the latter returning a list of all 8730# of the match tables. Otherwise table operations implicitly are for the map 8731# table. 8732# 8733# Most of the data in the property is actually about its map table, so it 8734# mostly just uses that table's accessors for most methods. The two could 8735# have been combined into one object, but for clarity because of their 8736# differing semantics, they have been kept separate. It could be argued that 8737# the 'file' and 'directory' fields should be kept with the map table. 8738# 8739# Each property has a type. This can be set in the constructor, or in the 8740# set_type accessor, but mostly it is figured out by the data. Every property 8741# starts with unknown type, overridden by a parameter to the constructor, or 8742# as match tables are added, or ranges added to the map table, the data is 8743# inspected, and the type changed. After the table is mostly or entirely 8744# filled, compute_type() should be called to finalize they analysis. 8745# 8746# There are very few operations defined. One can safely remove a range from 8747# the map table, and property_add_or_replace_non_nulls() adds the maps from another 8748# table to this one, replacing any in the intersection of the two. 8749 8750sub standardize { return main::standardize($_[0]); } 8751sub trace { return main::trace(@_) if main::DEBUG && $to_trace } 8752 8753{ # Closure 8754 8755 # This hash will contain as keys, all the aliases of all properties, and 8756 # as values, pointers to their respective property objects. This allows 8757 # quick look-up of a property from any of its names. 8758 my %alias_to_property_of; 8759 8760 sub dump_alias_to_property_of { 8761 # For debugging 8762 8763 print "\n", main::simple_dumper (\%alias_to_property_of), "\n"; 8764 return; 8765 } 8766 8767 sub property_ref($name) { 8768 # This is a package subroutine, not called as a method. 8769 # If the single parameter is a literal '*' it returns a list of all 8770 # defined properties. 8771 # Otherwise, the single parameter is a name, and it returns a pointer 8772 # to the corresponding property object, or undef if none. 8773 # 8774 # Properties can have several different names. The 'standard' form of 8775 # each of them is stored in %alias_to_property_of as they are defined. 8776 # But it's possible that this subroutine will be called with some 8777 # variant, so if the initial lookup fails, it is repeated with the 8778 # standardized form of the input name. If found, besides returning the 8779 # result, the input name is added to the list so future calls won't 8780 # have to do the conversion again. 8781 8782 if (! defined $name) { 8783 Carp::my_carp_bug("Undefined input property. No action taken."); 8784 return; 8785 } 8786 8787 return main::uniques(values %alias_to_property_of) if $name eq '*'; 8788 8789 # Return cached result if have it. 8790 my $result = $alias_to_property_of{$name}; 8791 return $result if defined $result; 8792 8793 # Convert the input to standard form. 8794 my $standard_name = standardize($name); 8795 8796 $result = $alias_to_property_of{$standard_name}; 8797 return unless defined $result; # Don't cache undefs 8798 8799 # Cache the result before returning it. 8800 $alias_to_property_of{$name} = $result; 8801 return $result; 8802 } 8803 8804 8805 main::setup_package(); 8806 8807 my %map; 8808 # A pointer to the map table object for this property 8809 main::set_access('map', \%map); 8810 8811 my %full_name; 8812 # The property's full name. This is a duplicate of the copy kept in the 8813 # map table, but is needed because stringify needs it during 8814 # construction of the map table, and then would have a chicken before egg 8815 # problem. 8816 main::set_access('full_name', \%full_name, 'r'); 8817 8818 my %table_ref; 8819 # This hash will contain as keys, all the aliases of any match tables 8820 # attached to this property, and as values, the pointers to their 8821 # respective tables. This allows quick look-up of a table from any of its 8822 # names. 8823 main::set_access('table_ref', \%table_ref); 8824 8825 my %type; 8826 # The type of the property, $ENUM, $BINARY, etc 8827 main::set_access('type', \%type, 'r'); 8828 8829 my %file; 8830 # The filename where the map table will go (if actually written). 8831 # Normally defaulted, but can be overridden. 8832 main::set_access('file', \%file, 'r', 's'); 8833 8834 my %directory; 8835 # The directory where the map table will go (if actually written). 8836 # Normally defaulted, but can be overridden. 8837 main::set_access('directory', \%directory, 's'); 8838 8839 my %pseudo_map_type; 8840 # This is used to affect the calculation of the map types for all the 8841 # ranges in the table. It should be set to one of the values that signify 8842 # to alter the calculation. 8843 main::set_access('pseudo_map_type', \%pseudo_map_type, 'r'); 8844 8845 my %has_only_code_point_maps; 8846 # A boolean used to help in computing the type of data in the map table. 8847 main::set_access('has_only_code_point_maps', \%has_only_code_point_maps); 8848 8849 my %unique_maps; 8850 # A list of the first few distinct mappings this property has. This is 8851 # used to disambiguate between binary and enum property types, so don't 8852 # have to keep more than three. 8853 main::set_access('unique_maps', \%unique_maps); 8854 8855 my %pre_declared_maps; 8856 # A boolean that gives whether the input data should declare all the 8857 # tables used, or not. If the former, unknown ones raise a warning. 8858 main::set_access('pre_declared_maps', 8859 \%pre_declared_maps, 'r', 's'); 8860 8861 my %match_subdir; 8862 # For properties whose shortest names are too long for a DOS 8.3 8863 # filesystem to distinguish between, this is used to manually give short 8864 # names for the directory name immediately under $match_tables that the 8865 # match tables for this property should be placed in. 8866 main::set_access('match_subdir', \%match_subdir, 'r'); 8867 8868 my %has_dependency; 8869 # A boolean that gives whether some table somewhere is defined as the 8870 # complement of a table in this property. This is a crude, but currently 8871 # sufficient, mechanism to make this property not get destroyed before 8872 # what is dependent on it is. Other dependencies could be added, so the 8873 # name was chosen to reflect a more general situation than actually is 8874 # currently the case. 8875 main::set_access('has_dependency', \%has_dependency, 'r', 's'); 8876 8877 sub new { 8878 # The only required parameter is the positionally first, name. All 8879 # other parameters are key => value pairs. See the documentation just 8880 # above for the meanings of the ones not passed directly on to the map 8881 # table constructor. 8882 8883 my $class = shift; 8884 my $name = shift || ""; 8885 8886 my $self = property_ref($name); 8887 if (defined $self) { 8888 my $options_string = join ", ", @_; 8889 $options_string = ". Ignoring options $options_string" if $options_string; 8890 Carp::my_carp("$self is already in use. Using existing one$options_string;"); 8891 return $self; 8892 } 8893 8894 my %args = @_; 8895 8896 $self = bless \do { my $anonymous_scalar }, $class; 8897 my $addr = do { no overloading; pack 'J', $self; }; 8898 8899 $directory{$addr} = delete $args{'Directory'}; 8900 $file{$addr} = delete $args{'File'}; 8901 $full_name{$addr} = delete $args{'Full_Name'} || $name; 8902 $type{$addr} = delete $args{'Type'} || $UNKNOWN; 8903 $pseudo_map_type{$addr} = delete $args{'Map_Type'}; 8904 $pre_declared_maps{$addr} = delete $args{'Pre_Declared_Maps'} 8905 # Starting in this release, property 8906 # values should be defined for all 8907 # properties, except those overriding this 8908 // $v_version ge v5.1.0; 8909 $match_subdir{$addr} = delete $args{'Match_SubDir'}; 8910 8911 # Rest of parameters passed on. 8912 8913 $has_only_code_point_maps{$addr} = 1; 8914 $table_ref{$addr} = { }; 8915 $unique_maps{$addr} = { }; 8916 $has_dependency{$addr} = 0; 8917 8918 $map{$addr} = Map_Table->new($name, 8919 Full_Name => $full_name{$addr}, 8920 _Alias_Hash => \%alias_to_property_of, 8921 _Property => $self, 8922 %args); 8923 return $self; 8924 } 8925 8926 # See this program's beginning comment block about overloading the copy 8927 # constructor. Few operations are defined on properties, but a couple are 8928 # useful. It is safe to take the inverse of a property, and to remove a 8929 # single code point from it. 8930 use overload 8931 fallback => 0, 8932 qw("") => "_operator_stringify", 8933 "." => \&main::_operator_dot, 8934 ".=" => \&main::_operator_dot_equal, 8935 '==' => \&main::_operator_equal, 8936 '!=' => \&main::_operator_not_equal, 8937 '=' => sub { return shift }, 8938 '-=' => "_minus_and_equal", 8939 ; 8940 8941 sub _operator_stringify { 8942 return "Property '" . shift->full_name . "'"; 8943 } 8944 8945 sub _minus_and_equal($self, $other, $reversed=0) { 8946 # Remove a single code point from the map table of a property. 8947 if (ref $other) { 8948 Carp::my_carp_bug("Bad news. Can't cope with a " 8949 . ref($other) 8950 . " argument to '-='. Subtraction ignored."); 8951 return $self; 8952 } 8953 elsif ($reversed) { # Shouldn't happen in a -=, but just in case 8954 Carp::my_carp_bug("Bad news. Can't cope with subtracting a " 8955 . ref $self 8956 . " from a non-object. undef returned."); 8957 return; 8958 } 8959 else { 8960 no overloading; 8961 $map{pack 'J', $self}->delete_range($other, $other); 8962 } 8963 return $self; 8964 } 8965 8966 sub add_match_table { 8967 # Add a new match table for this property, with name given by the 8968 # parameter. It returns a pointer to the table. 8969 8970 my $self = shift; 8971 my $name = shift; 8972 my %args = @_; 8973 8974 my $addr = do { no overloading; pack 'J', $self; }; 8975 8976 my $table = $table_ref{$addr}{$name}; 8977 my $standard_name = main::standardize($name); 8978 if (defined $table 8979 || (defined ($table = $table_ref{$addr}{$standard_name}))) 8980 { 8981 Carp::my_carp("Table '$name' in $self is already in use. Using existing one"); 8982 $table_ref{$addr}{$name} = $table; 8983 return $table; 8984 } 8985 else { 8986 8987 # See if this is a perl extension, if not passed in. 8988 my $perl_extension = delete $args{'Perl_Extension'}; 8989 $perl_extension 8990 = $self->perl_extension if ! defined $perl_extension; 8991 8992 my $fate; 8993 my $suppression_reason = ""; 8994 if ($self->name =~ /^_/) { 8995 $fate = $SUPPRESSED; 8996 $suppression_reason = "Parent property is internal only"; 8997 } 8998 elsif ($self->fate >= $SUPPRESSED) { 8999 $fate = $self->fate; 9000 $suppression_reason = $why_suppressed{$self->complete_name}; 9001 9002 } 9003 elsif ($name =~ /^_/) { 9004 $fate = $INTERNAL_ONLY; 9005 } 9006 $table = Match_Table->new( 9007 Name => $name, 9008 Perl_Extension => $perl_extension, 9009 _Alias_Hash => $table_ref{$addr}, 9010 _Property => $self, 9011 Fate => $fate, 9012 Suppression_Reason => $suppression_reason, 9013 Status => $self->status, 9014 _Status_Info => $self->status_info, 9015 %args); 9016 return unless defined $table; 9017 } 9018 9019 # Save the names for quick look up 9020 $table_ref{$addr}{$standard_name} = $table; 9021 $table_ref{$addr}{$name} = $table; 9022 9023 # Perhaps we can figure out the type of this property based on the 9024 # fact of adding this match table. First, string properties don't 9025 # have match tables; second, a binary property can't have 3 match 9026 # tables 9027 if ($type{$addr} == $UNKNOWN) { 9028 $type{$addr} = $NON_STRING; 9029 } 9030 elsif ($type{$addr} == $STRING) { 9031 Carp::my_carp("$self Added a match table '$name' to a string property '$self'. Changed it to a non-string property. Bad News."); 9032 $type{$addr} = $NON_STRING; 9033 } 9034 elsif ($type{$addr} != $ENUM && $type{$addr} != $FORCED_BINARY) { 9035 if (scalar main::uniques(values %{$table_ref{$addr}}) > 2) { 9036 if ($type{$addr} == $BINARY) { 9037 Carp::my_carp("$self now has more than 2 tables (with the addition of '$name'), and so is no longer binary. Changing its type to 'enum'. Bad News."); 9038 } 9039 $type{$addr} = $ENUM; 9040 } 9041 } 9042 9043 return $table; 9044 } 9045 9046 sub delete_match_table($self, $table_to_remove) { 9047 # Delete the table referred to by $2 from the property $1. 9048 my $addr = do { no overloading; pack 'J', $self; }; 9049 9050 # Remove all names that refer to it. 9051 foreach my $key (keys %{$table_ref{$addr}}) { 9052 delete $table_ref{$addr}{$key} 9053 if $table_ref{$addr}{$key} == $table_to_remove; 9054 } 9055 9056 $table_to_remove->DESTROY; 9057 return; 9058 } 9059 9060 sub table($self, $name) { 9061 # Return a pointer to the match table (with name given by the 9062 # parameter) associated with this property; undef if none. 9063 my $addr = do { no overloading; pack 'J', $self; }; 9064 9065 return $table_ref{$addr}{$name} if defined $table_ref{$addr}{$name}; 9066 9067 # If quick look-up failed, try again using the standard form of the 9068 # input name. If that succeeds, cache the result before returning so 9069 # won't have to standardize this input name again. 9070 my $standard_name = main::standardize($name); 9071 return unless defined $table_ref{$addr}{$standard_name}; 9072 9073 $table_ref{$addr}{$name} = $table_ref{$addr}{$standard_name}; 9074 return $table_ref{$addr}{$name}; 9075 } 9076 9077 sub tables { 9078 # Return a list of pointers to all the match tables attached to this 9079 # property 9080 9081 no overloading; 9082 return main::uniques(values %{$table_ref{pack 'J', shift}}); 9083 } 9084 9085 sub directory { 9086 # Returns the directory the map table for this property should be 9087 # output in. If a specific directory has been specified, that has 9088 # priority; 'undef' is returned if the type isn't defined; 9089 # or $map_directory for everything else. 9090 9091 my $addr = do { no overloading; pack 'J', shift; }; 9092 9093 return $directory{$addr} if defined $directory{$addr}; 9094 return undef if $type{$addr} == $UNKNOWN; 9095 return $map_directory; 9096 } 9097 9098 sub swash_name($self) { 9099 # Return the name that is used to both: 9100 # 1) Name the file that the map table is written to. 9101 # 2) The name of swash related stuff inside that file. 9102 # The reason for this is that the Perl core historically has used 9103 # certain names that aren't the same as the Unicode property names. 9104 # To continue using these, $file is hard-coded in this file for those, 9105 # but otherwise the standard name is used. This is different from the 9106 # external_name, so that the rest of the files, like in lib can use 9107 # the standard name always, without regard to historical precedent. 9108 my $addr = do { no overloading; pack 'J', $self; }; 9109 9110 # Swash names are used only on either 9111 # 1) legacy-only properties, because the formats for these are 9112 # unchangeable, and they have had these lines in them; or 9113 # 2) regular or internal-only map tables 9114 # 3) otherwise there should be no access to the 9115 # property map table from other parts of Perl. 9116 return if $map{$addr}->fate != $ORDINARY 9117 && $map{$addr}->fate != $LEGACY_ONLY 9118 && ! ($map{$addr}->name =~ /^_/ 9119 && $map{$addr}->fate == $INTERNAL_ONLY); 9120 9121 return $file{$addr} if defined $file{$addr}; 9122 return $map{$addr}->external_name; 9123 } 9124 9125 sub to_create_match_tables($self) { 9126 # Returns a boolean as to whether or not match tables should be 9127 # created for this property. 9128 9129 # The whole point of this pseudo property is match tables. 9130 return 1 if $self == $perl; 9131 9132 my $addr = do { no overloading; pack 'J', $self; }; 9133 9134 # Don't generate tables of code points that match the property values 9135 # of a string property. Such a list would most likely have many 9136 # property values, each with just one or very few code points mapping 9137 # to it. 9138 return 0 if $type{$addr} == $STRING; 9139 9140 # Otherwise, do. 9141 return 1; 9142 } 9143 9144 sub property_add_or_replace_non_nulls($self, $other) { 9145 # This adds the mappings in the property $other to $self. Non-null 9146 # mappings from $other override those in $self. It essentially merges 9147 # the two properties, with the second having priority except for null 9148 # mappings. 9149 9150 if (! $other->isa(__PACKAGE__)) { 9151 Carp::my_carp_bug("$other should be a " 9152 . __PACKAGE__ 9153 . ". Not a '" 9154 . ref($other) 9155 . "'. Not added;"); 9156 return; 9157 } 9158 9159 no overloading; 9160 return $map{pack 'J', $self}->map_add_or_replace_non_nulls($map{pack 'J', $other}); 9161 } 9162 9163 sub set_proxy_for { 9164 # Certain tables are not generally written out to files, but 9165 # Unicode::UCD has the intelligence to know that the file for $self 9166 # can be used to reconstruct those tables. This routine just changes 9167 # things so that UCD pod entries for those suppressed tables are 9168 # generated, so the fact that a proxy is used is invisible to the 9169 # user. 9170 9171 my $self = shift; 9172 9173 foreach my $property_name (@_) { 9174 my $ref = property_ref($property_name); 9175 next if $ref->to_output_map; 9176 $ref->set_fate($MAP_PROXIED); 9177 } 9178 } 9179 9180 sub set_type($self, $type) { 9181 # Set the type of the property. Mostly this is figured out by the 9182 # data in the table. But this is used to set it explicitly. The 9183 # reason it is not a standard accessor is that when setting a binary 9184 # property, we need to make sure that all the true/false aliases are 9185 # present, as they were omitted in early Unicode releases. 9186 9187 if ($type != $ENUM 9188 && $type != $BINARY 9189 && $type != $FORCED_BINARY 9190 && $type != $STRING) 9191 { 9192 Carp::my_carp("Unrecognized type '$type'. Type not set"); 9193 return; 9194 } 9195 9196 { no overloading; $type{pack 'J', $self} = $type; } 9197 return if $type != $BINARY && $type != $FORCED_BINARY; 9198 9199 my $yes = $self->table('Y'); 9200 $yes = $self->table('Yes') if ! defined $yes; 9201 $yes = $self->add_match_table('Y', Full_Name => 'Yes') 9202 if ! defined $yes; 9203 9204 # Add aliases in order wanted, duplicates will be ignored. We use a 9205 # binary property present in all releases for its ordered lists of 9206 # true/false aliases. Note, that could run into problems in 9207 # outputting things in that we don't distinguish between the name and 9208 # full name of these. Hopefully, if the table was already created 9209 # before this code is executed, it was done with these set properly. 9210 my $bm = property_ref("Bidi_Mirrored"); 9211 foreach my $alias ($bm->table("Y")->aliases) { 9212 $yes->add_alias($alias->name); 9213 } 9214 my $no = $self->table('N'); 9215 $no = $self->table('No') if ! defined $no; 9216 $no = $self->add_match_table('N', Full_Name => 'No') if ! defined $no; 9217 foreach my $alias ($bm->table("N")->aliases) { 9218 $no->add_alias($alias->name); 9219 } 9220 9221 return; 9222 } 9223 9224 sub add_map { 9225 # Add a map to the property's map table. This also keeps 9226 # track of the maps so that the property type can be determined from 9227 # its data. 9228 9229 my $self = shift; 9230 my $start = shift; # First code point in range 9231 my $end = shift; # Final code point in range 9232 my $map = shift; # What the range maps to. 9233 # Rest of parameters passed on. 9234 9235 my $addr = do { no overloading; pack 'J', $self; }; 9236 9237 # If haven't the type of the property, gather information to figure it 9238 # out. 9239 if ($type{$addr} == $UNKNOWN) { 9240 9241 # If the map contains an interior blank or dash, or most other 9242 # nonword characters, it will be a string property. This 9243 # heuristic may actually miss some string properties. If so, they 9244 # may need to have explicit set_types called for them. This 9245 # happens in the Unihan properties. 9246 if ($map =~ / (?<= . ) [ -] (?= . ) /x 9247 || $map =~ / [^\w.\/\ -] /x) 9248 { 9249 $self->set_type($STRING); 9250 9251 # $unique_maps is used for disambiguating between ENUM and 9252 # BINARY later; since we know the property is not going to be 9253 # one of those, no point in keeping the data around 9254 undef $unique_maps{$addr}; 9255 } 9256 else { 9257 9258 # Not necessarily a string. The final decision has to be 9259 # deferred until all the data are in. We keep track of if all 9260 # the values are code points for that eventual decision. 9261 $has_only_code_point_maps{$addr} &= 9262 $map =~ / ^ $code_point_re $/x; 9263 9264 # For the purposes of disambiguating between binary and other 9265 # enumerations at the end, we keep track of the first three 9266 # distinct property values. Once we get to three, we know 9267 # it's not going to be binary, so no need to track more. 9268 if (scalar keys %{$unique_maps{$addr}} < 3) { 9269 $unique_maps{$addr}{main::standardize($map)} = 1; 9270 } 9271 } 9272 } 9273 9274 # Add the mapping by calling our map table's method 9275 return $map{$addr}->add_map($start, $end, $map, @_); 9276 } 9277 9278 sub compute_type($self) { 9279 # Compute the type of the property: $ENUM, $STRING, or $BINARY. This 9280 # should be called after the property is mostly filled with its maps. 9281 # We have been keeping track of what the property values have been, 9282 # and now have the necessary information to figure out the type. 9283 9284 my $addr = do { no overloading; pack 'J', $self; }; 9285 9286 my $type = $type{$addr}; 9287 9288 # If already have figured these out, no need to do so again, but we do 9289 # a double check on ENUMS to make sure that a string property hasn't 9290 # improperly been classified as an ENUM, so continue on with those. 9291 return if $type == $STRING 9292 || $type == $BINARY 9293 || $type == $FORCED_BINARY; 9294 9295 # If every map is to a code point, is a string property. 9296 if ($type == $UNKNOWN 9297 && ($has_only_code_point_maps{$addr} 9298 || (defined $map{$addr}->default_map 9299 && $map{$addr}->default_map eq ""))) 9300 { 9301 $self->set_type($STRING); 9302 } 9303 else { 9304 9305 # Otherwise, it is to some sort of enumeration. (The case where 9306 # it is a Unicode miscellaneous property, and treated like a 9307 # string in this program is handled in add_map()). Distinguish 9308 # between binary and some other enumeration type. Of course, if 9309 # there are more than two values, it's not binary. But more 9310 # subtle is the test that the default mapping is defined means it 9311 # isn't binary. This in fact may change in the future if Unicode 9312 # changes the way its data is structured. But so far, no binary 9313 # properties ever have @missing lines for them, so the default map 9314 # isn't defined for them. The few properties that are two-valued 9315 # and aren't considered binary have the default map defined 9316 # starting in Unicode 5.0, when the @missing lines appeared; and 9317 # this program has special code to put in a default map for them 9318 # for earlier than 5.0 releases. 9319 if ($type == $ENUM 9320 || scalar keys %{$unique_maps{$addr}} > 2 9321 || defined $self->default_map) 9322 { 9323 my $tables = $self->tables; 9324 my $count = $self->count; 9325 if ($verbosity && $tables > 500 && $tables/$count > .1) { 9326 Carp::my_carp_bug("It appears that $self should be a \$STRING property, not an \$ENUM because it has too many match tables: $tables\n"); 9327 } 9328 $self->set_type($ENUM); 9329 } 9330 else { 9331 $self->set_type($BINARY); 9332 } 9333 } 9334 undef $unique_maps{$addr}; # Garbage collect 9335 return; 9336 } 9337 9338 # $reaons - Ignored unless suppressing 9339 sub set_fate($self, $fate, $reason=undef) { 9340 my $addr = do { no overloading; pack 'J', $self; }; 9341 if ($fate >= $SUPPRESSED) { 9342 $why_suppressed{$self->complete_name} = $reason; 9343 } 9344 9345 # Each table shares the property's fate, except that MAP_PROXIED 9346 # doesn't affect match tables 9347 $map{$addr}->set_fate($fate, $reason); 9348 if ($fate != $MAP_PROXIED) { 9349 foreach my $table ($map{$addr}, $self->tables) { 9350 $table->set_fate($fate, $reason); 9351 } 9352 } 9353 return; 9354 } 9355 9356 9357 # Most of the accessors for a property actually apply to its map table. 9358 # Setup up accessor functions for those, referring to %map 9359 for my $sub (qw( 9360 add_alias 9361 add_anomalous_entry 9362 add_comment 9363 add_conflicting 9364 add_description 9365 add_duplicate 9366 add_note 9367 aliases 9368 comment 9369 complete_name 9370 containing_range 9371 count 9372 default_map 9373 definition 9374 delete_range 9375 description 9376 each_range 9377 external_name 9378 fate 9379 file_path 9380 format 9381 initialize 9382 inverse_list 9383 is_empty 9384 replacement_property 9385 name 9386 note 9387 perl_extension 9388 property 9389 range_count 9390 ranges 9391 range_size_1 9392 replace_map 9393 reset_each_range 9394 set_comment 9395 set_default_map 9396 set_file_path 9397 set_final_comment 9398 _set_format 9399 set_range_size_1 9400 set_status 9401 set_to_output_map 9402 short_name 9403 status 9404 status_info 9405 to_output_map 9406 type_of 9407 value_of 9408 write 9409 )) 9410 # 'property' above is for symmetry, so that one can take 9411 # the property of a property and get itself, and so don't 9412 # have to distinguish between properties and tables in 9413 # calling code 9414 { 9415 no strict "refs"; 9416 *$sub = sub { 9417 use strict "refs"; 9418 my $self = shift; 9419 no overloading; 9420 return $map{pack 'J', $self}->$sub(@_); 9421 } 9422 } 9423 9424 9425} # End closure 9426 9427package main; 9428 9429sub display_chr { 9430 # Converts an ordinal printable character value to a displayable string, 9431 # using a dotted circle to hold combining characters. 9432 9433 my $ord = shift; 9434 my $chr = chr $ord; 9435 return $chr if $ccc->table(0)->contains($ord); 9436 return "\x{25CC}$chr"; 9437} 9438 9439sub join_lines($return) { 9440 # Returns lines of the input joined together, so that they can be folded 9441 # properly. 9442 # This causes continuation lines to be joined together into one long line 9443 # for folding. A continuation line is any line that doesn't begin with a 9444 # space or "\b" (the latter is stripped from the output). This is so 9445 # lines can be in a HERE document so as to fit nicely in the terminal 9446 # width, but be joined together in one long line, and then folded with 9447 # indents, '#' prefixes, etc, properly handled. 9448 # A blank separates the joined lines except if there is a break; an extra 9449 # blank is inserted after a period ending a line. 9450 9451 # Initialize the return with the first line. 9452 my ( @lines ) = split "\n", $return; 9453 9454 # If the first line is null, it was an empty line, add the \n back in 9455 $return = "\n" if $return eq ""; 9456 9457 # Now join the remainder of the physical lines. 9458 for my $line (@lines) { 9459 9460 # An empty line means wanted a blank line, so add two \n's to get that 9461 # effect, and go to the next line. 9462 if (length $line == 0) { 9463 $return .= "\n\n"; 9464 next; 9465 } 9466 9467 # Look at the last character of what we have so far. 9468 my $previous_char = substr($return, -1, 1); 9469 9470 # And at the next char to be output. 9471 my $next_char = substr($line, 0, 1); 9472 9473 if ($previous_char ne "\n") { 9474 9475 # Here didn't end wth a nl. If the next char a blank or \b, it 9476 # means that here there is a break anyway. So add a nl to the 9477 # output. 9478 if ($next_char eq " " || $next_char eq "\b") { 9479 $previous_char = "\n"; 9480 $return .= $previous_char; 9481 } 9482 9483 # Add an extra space after periods. 9484 $return .= " " if $previous_char eq '.'; 9485 } 9486 9487 # Here $previous_char is still the latest character to be output. If 9488 # it isn't a nl, it means that the next line is to be a continuation 9489 # line, with a blank inserted between them. 9490 $return .= " " if $previous_char ne "\n"; 9491 9492 # Get rid of any \b 9493 substr($line, 0, 1) = "" if $next_char eq "\b"; 9494 9495 # And append this next line. 9496 $return .= $line; 9497 } 9498 9499 return $return; 9500} 9501 9502sub simple_fold( $line, $prefix="", $hanging_indent=0, $right_margin=0) { 9503 # Returns a string of the input (string or an array of strings) folded 9504 # into multiple-lines each of no more than $MAX_LINE_WIDTH characters plus 9505 # a \n 9506 # This is tailored for the kind of text written by this program, 9507 # especially the pod file, which can have very long names with 9508 # underscores in the middle, or words like AbcDefgHij.... We allow 9509 # breaking in the middle of such constructs if the line won't fit 9510 # otherwise. The break in such cases will come either just after an 9511 # underscore, or just before one of the Capital letters. 9512 9513 local $to_trace = 0 if main::DEBUG; 9514 9515 # $prefix Optional string to prepend to each output line 9516 # $hanging_indent Optional number of spaces to indent 9517 # continuation lines 9518 # $right_margin Optional number of spaces to narrow the 9519 # total width by. 9520 9521 # The space available doesn't include what's automatically prepended 9522 # to each line, or what's reserved on the right. 9523 my $max = $MAX_LINE_WIDTH - length($prefix) - $right_margin; 9524 # XXX Instead of using the 'nofold' perhaps better to look up the stack 9525 9526 if (DEBUG && $hanging_indent >= $max) { 9527 Carp::my_carp("Too large a hanging indent ($hanging_indent); must be < $max. Using 0", 'nofold'); 9528 $hanging_indent = 0; 9529 } 9530 9531 # First, split into the current physical lines. 9532 my @line; 9533 if (ref $line) { # Better be an array, because not bothering to 9534 # test 9535 foreach my $line (@{$line}) { 9536 push @line, split /\n/, $line; 9537 } 9538 } 9539 else { 9540 @line = split /\n/, $line; 9541 } 9542 9543 #local $to_trace = 1 if main::DEBUG; 9544 trace "", join(" ", @line), "\n" if main::DEBUG && $to_trace; 9545 9546 # Look at each current physical line. 9547 for (my $i = 0; $i < @line; $i++) { 9548 Carp::my_carp("Tabs don't work well.", 'nofold') if $line[$i] =~ /\t/; 9549 #local $to_trace = 1 if main::DEBUG; 9550 trace "i=$i: $line[$i]\n" if main::DEBUG && $to_trace; 9551 9552 # Remove prefix, because will be added back anyway, don't want 9553 # doubled prefix 9554 $line[$i] =~ s/^$prefix//; 9555 9556 # Remove trailing space 9557 $line[$i] =~ s/\s+\Z//; 9558 9559 # If the line is too long, fold it. 9560 if (length $line[$i] > $max) { 9561 my $remainder; 9562 9563 # Here needs to fold. Save the leading space in the line for 9564 # later. 9565 $line[$i] =~ /^ ( \s* )/x; 9566 my $leading_space = $1; 9567 trace "line length", length $line[$i], "; lead length", length($leading_space) if main::DEBUG && $to_trace; 9568 9569 # If character at final permissible position is white space, 9570 # fold there, which will delete that white space 9571 if (substr($line[$i], $max - 1, 1) =~ /\s/) { 9572 $remainder = substr($line[$i], $max); 9573 $line[$i] = substr($line[$i], 0, $max - 1); 9574 } 9575 else { 9576 9577 # Otherwise fold at an acceptable break char closest to 9578 # the max length. Look at just the maximal initial 9579 # segment of the line 9580 my $segment = substr($line[$i], 0, $max - 1); 9581 if ($segment =~ 9582 /^ ( .{$hanging_indent} # Don't look before the 9583 # indent. 9584 \ * # Don't look in leading 9585 # blanks past the indent 9586 [^ ] .* # Find the right-most 9587 (?: # acceptable break: 9588 [ \s = ] # space or equal 9589 | - (?! [.0-9] ) # or non-unary minus. 9590 | [^\\[(] (?= \\ )# break before single backslash 9591 # not immediately after opening 9592 # punctuation 9593 ) # $1 includes the character 9594 )/x) 9595 { 9596 # Split into the initial part that fits, and remaining 9597 # part of the input 9598 $remainder = substr($line[$i], length $1); 9599 $line[$i] = $1; 9600 trace $line[$i] if DEBUG && $to_trace; 9601 trace $remainder if DEBUG && $to_trace; 9602 } 9603 9604 # If didn't find a good breaking spot, see if there is a 9605 # not-so-good breaking spot. These are just after 9606 # underscores or where the case changes from lower to 9607 # upper. Use \a as a soft hyphen, but give up 9608 # and don't break the line if there is actually a \a 9609 # already in the input. We use an ascii character for the 9610 # soft-hyphen to avoid any attempt by miniperl to try to 9611 # access the files that this program is creating. 9612 elsif ($segment !~ /\a/ 9613 && ($segment =~ s/_/_\a/g 9614 || $segment =~ s/ ( (?!\\) [a-z] ) (?= [A-Z] )/$1\a/xg)) 9615 { 9616 # Here were able to find at least one place to insert 9617 # our substitute soft hyphen. Find the right-most one 9618 # and replace it by a real hyphen. 9619 trace $segment if DEBUG && $to_trace; 9620 substr($segment, 9621 rindex($segment, "\a"), 9622 1) = '-'; 9623 9624 # Then remove the soft hyphen substitutes. 9625 $segment =~ s/\a//g; 9626 trace $segment if DEBUG && $to_trace; 9627 9628 # And split into the initial part that fits, and 9629 # remainder of the line 9630 my $pos = rindex($segment, '-'); 9631 $remainder = substr($line[$i], $pos); 9632 trace $remainder if DEBUG && $to_trace; 9633 $line[$i] = substr($segment, 0, $pos + 1); 9634 } 9635 } 9636 9637 # Here we know if we can fold or not. If we can, $remainder 9638 # is what remains to be processed in the next iteration. 9639 if (defined $remainder) { 9640 trace "folded='$line[$i]'" if main::DEBUG && $to_trace; 9641 9642 # Insert the folded remainder of the line as a new element 9643 # of the array. (It may still be too long, but we will 9644 # deal with that next time through the loop.) Omit any 9645 # leading space in the remainder. 9646 $remainder =~ s/^\s+//; 9647 trace "remainder='$remainder'" if main::DEBUG && $to_trace; 9648 9649 # But then indent by whichever is larger of: 9650 # 1) the leading space on the input line; 9651 # 2) the hanging indent. 9652 # This preserves indentation in the original line. 9653 my $lead = ($leading_space) 9654 ? length $leading_space 9655 : $hanging_indent; 9656 $lead = max($lead, $hanging_indent); 9657 splice @line, $i+1, 0, (" " x $lead) . $remainder; 9658 } 9659 } 9660 9661 # Ready to output the line. Get rid of any trailing space 9662 # And prefix by the required $prefix passed in. 9663 $line[$i] =~ s/\s+$//; 9664 $line[$i] = "$prefix$line[$i]\n"; 9665 } # End of looping through all the lines. 9666 9667 return join "", @line; 9668} 9669 9670sub property_ref { # Returns a reference to a property object. 9671 return Property::property_ref(@_); 9672} 9673 9674sub force_unlink ($filename) { 9675 return unless file_exists($filename); 9676 return if CORE::unlink($filename); 9677 9678 # We might need write permission 9679 chmod 0777, $filename; 9680 CORE::unlink($filename) or Carp::my_carp("Couldn't unlink $filename. Proceeding anyway: $!"); 9681 return; 9682} 9683 9684sub write ($file, $use_utf8, @lines) { 9685 # Given a filename and references to arrays of lines, write the lines of 9686 # each array to the file 9687 # Filename can be given as an arrayref of directory names 9688 9689 # Get into a single string if an array, and get rid of, in Unix terms, any 9690 # leading '.' 9691 $file= File::Spec->join(@$file) if ref $file eq 'ARRAY'; 9692 $file = File::Spec->canonpath($file); 9693 9694 # If has directories, make sure that they all exist 9695 (undef, my $directories, undef) = File::Spec->splitpath($file); 9696 File::Path::mkpath($directories) if $directories && ! -d $directories; 9697 9698 push @files_actually_output, $file; 9699 9700 force_unlink ($file); 9701 9702 my $OUT; 9703 if (not open $OUT, ">", $file) { 9704 Carp::my_carp("can't open $file for output. Skipping this file: $!"); 9705 return; 9706 } 9707 9708 binmode $OUT, ":utf8" if $use_utf8; 9709 9710 foreach my $lines_ref (@lines) { 9711 unless (@$lines_ref) { 9712 Carp::my_carp("An array of lines for writing to file '$file' is empty; writing it anyway;"); 9713 } 9714 9715 print $OUT @$lines_ref or die Carp::my_carp("write to '$file' failed: $!"); 9716 } 9717 close $OUT or die Carp::my_carp("close '$file' failed: $!"); 9718 9719 print "$file written.\n" if $verbosity >= $VERBOSE; 9720 9721 return; 9722} 9723 9724 9725sub Standardize($name=undef) { 9726 # This converts the input name string into a standardized equivalent to 9727 # use internally. 9728 9729 unless (defined $name) { 9730 Carp::my_carp_bug("Standardize() called with undef. Returning undef."); 9731 return; 9732 } 9733 9734 # Remove any leading or trailing white space 9735 $name =~ s/^\s+//g; 9736 $name =~ s/\s+$//g; 9737 9738 # Convert interior white space and hyphens into underscores. 9739 $name =~ s/ (?<= .) [ -]+ (.) /_$1/xg; 9740 9741 # Capitalize the letter following an underscore, and convert a sequence of 9742 # multiple underscores to a single one 9743 $name =~ s/ (?<= .) _+ (.) /_\u$1/xg; 9744 9745 # And capitalize the first letter, but not for the special cjk ones. 9746 $name = ucfirst($name) unless $name =~ /^k[A-Z]/; 9747 return $name; 9748} 9749 9750sub standardize ($str=undef) { 9751 # Returns a lower-cased standardized name, without underscores. This form 9752 # is chosen so that it can distinguish between any real versus superficial 9753 # Unicode name differences. It relies on the fact that Unicode doesn't 9754 # have interior underscores, white space, nor dashes in any 9755 # stricter-matched name. It should not be used on Unicode code point 9756 # names (the Name property), as they mostly, but not always follow these 9757 # rules. 9758 9759 my $name = Standardize($str); 9760 return if !defined $name; 9761 9762 $name =~ s/ (?<= .) _ (?= . ) //xg; 9763 return lc $name; 9764} 9765 9766sub UCD_name ($table, $alias) { 9767 # Returns the name that Unicode::UCD will use to find a table. XXX 9768 # perhaps this function should be placed somewhere, like UCD.pm so that 9769 # Unicode::UCD can use it directly without duplicating code that can get 9770 # out-of sync. 9771 9772 my $property = $table->property; 9773 $property = ($property == $perl) 9774 ? "" # 'perl' is never explicitly stated 9775 : standardize($property->name) . '='; 9776 if ($alias->loose_match) { 9777 return $property . standardize($alias->name); 9778 } 9779 else { 9780 return lc ($property . $alias->name); 9781 } 9782 9783 return; 9784} 9785 9786{ # Closure 9787 9788 my $indent_increment = " " x ( $debugging_build ? 2 : 0); 9789 %main::already_output = (); 9790 9791 $main::simple_dumper_nesting = 0; 9792 9793 sub simple_dumper( $item, $indent = "" ) { 9794 # Like Simple Data::Dumper. Good enough for our needs. We can't use 9795 # the real thing as we have to run under miniperl. 9796 9797 # It is designed so that on input it is at the beginning of a line, 9798 # and the final thing output in any call is a trailing ",\n". 9799 9800 $indent = "" if ! $debugging_build; 9801 9802 # nesting level is localized, so that as the call stack pops, it goes 9803 # back to the prior value. 9804 local $main::simple_dumper_nesting = $main::simple_dumper_nesting; 9805 local %main::already_output = %main::already_output; 9806 $main::simple_dumper_nesting++; 9807 #print STDERR __LINE__, ": $main::simple_dumper_nesting: $indent$item\n"; 9808 9809 # Determine the indent for recursive calls. 9810 my $next_indent = $indent . $indent_increment; 9811 9812 my $output; 9813 if (! ref $item) { 9814 9815 # Dump of scalar: just output it in quotes if not a number. To do 9816 # so we must escape certain characters, and therefore need to 9817 # operate on a copy to avoid changing the original 9818 my $copy = $item; 9819 $copy = $UNDEF unless defined $copy; 9820 9821 # Quote non-integers (integers also have optional leading '-') 9822 if ($copy eq "" || $copy !~ /^ -? \d+ $/x) { 9823 9824 # Escape apostrophe and backslash 9825 $copy =~ s/ ( ['\\] ) /\\$1/xg; 9826 $copy = "'$copy'"; 9827 } 9828 $output = "$indent$copy,\n"; 9829 } 9830 else { 9831 9832 # Keep track of cycles in the input, and refuse to infinitely loop 9833 my $addr = do { no overloading; pack 'J', $item; }; 9834 if (defined $main::already_output{$addr}) { 9835 return "${indent}ALREADY OUTPUT: $item\n"; 9836 } 9837 $main::already_output{$addr} = $item; 9838 9839 if (ref $item eq 'ARRAY') { 9840 my $using_brackets; 9841 $output = $indent; 9842 if ($main::simple_dumper_nesting > 1) { 9843 $output .= '['; 9844 $using_brackets = 1; 9845 } 9846 else { 9847 $using_brackets = 0; 9848 } 9849 9850 # If the array is empty, put the closing bracket on the same 9851 # line. Otherwise, recursively add each array element 9852 if (@$item == 0) { 9853 $output .= " "; 9854 } 9855 else { 9856 $output .= "\n"; 9857 for (my $i = 0; $i < @$item; $i++) { 9858 9859 # Indent array elements one level 9860 $output .= &simple_dumper($item->[$i], $next_indent); 9861 next if ! $debugging_build; 9862 $output =~ s/\n$//; # Remove any trailing nl so 9863 $output .= " # [$i]\n"; # as to add a comment giving 9864 # the array index 9865 } 9866 $output .= $indent; # Indent closing ']' to orig level 9867 } 9868 $output .= ']' if $using_brackets; 9869 $output .= ",\n"; 9870 } 9871 elsif (ref $item eq 'HASH') { 9872 my $is_first_line; 9873 my $using_braces; 9874 my $body_indent; 9875 9876 # No surrounding braces at top level 9877 $output .= $indent; 9878 if ($main::simple_dumper_nesting > 1) { 9879 $output .= "{\n"; 9880 $is_first_line = 0; 9881 $body_indent = $next_indent; 9882 $next_indent .= $indent_increment; 9883 $using_braces = 1; 9884 } 9885 else { 9886 $is_first_line = 1; 9887 $body_indent = $indent; 9888 $using_braces = 0; 9889 } 9890 9891 # Output hashes sorted alphabetically instead of apparently 9892 # random. Use caseless alphabetic sort 9893 foreach my $key (sort { lc $a cmp lc $b } keys %$item) 9894 { 9895 if ($is_first_line) { 9896 $is_first_line = 0; 9897 } 9898 else { 9899 $output .= "$body_indent"; 9900 } 9901 9902 # The key must be a scalar, but this recursive call quotes 9903 # it 9904 $output .= &simple_dumper($key); 9905 9906 # And change the trailing comma and nl to the hash fat 9907 # comma for clarity, and so the value can be on the same 9908 # line 9909 $output =~ s/,\n$/ => /; 9910 9911 # Recursively call to get the value's dump. 9912 my $next = &simple_dumper($item->{$key}, $next_indent); 9913 9914 # If the value is all on one line, remove its indent, so 9915 # will follow the => immediately. If it takes more than 9916 # one line, start it on a new line. 9917 if ($next !~ /\n.*\n/) { 9918 $next =~ s/^ *//; 9919 } 9920 else { 9921 $output .= "\n"; 9922 } 9923 $output .= $next; 9924 } 9925 9926 $output .= "$indent},\n" if $using_braces; 9927 } 9928 elsif (ref $item eq 'CODE' || ref $item eq 'GLOB') { 9929 $output = $indent . ref($item) . "\n"; 9930 # XXX see if blessed 9931 } 9932 elsif ($item->can('dump')) { 9933 9934 # By convention in this program, objects furnish a 'dump' 9935 # method. Since not doing any output at this level, just pass 9936 # on the input indent 9937 $output = $item->dump($indent); 9938 } 9939 else { 9940 Carp::my_carp("Can't cope with dumping a " . ref($item) . ". Skipping."); 9941 } 9942 } 9943 return $output; 9944 } 9945} 9946 9947sub dump_inside_out( $object, $fields_ref ) { 9948 # Dump inside-out hashes in an object's state by converting them to a 9949 # regular hash and then calling simple_dumper on that. 9950 9951 my $addr = do { no overloading; pack 'J', $object; }; 9952 9953 my %hash; 9954 foreach my $key (keys %$fields_ref) { 9955 $hash{$key} = $fields_ref->{$key}{$addr}; 9956 } 9957 9958 return simple_dumper(\%hash, @_); 9959} 9960 9961sub _operator_dot($self, $other="", $reversed=0) { 9962 # Overloaded '.' method that is common to all packages. It uses the 9963 # package's stringify method. 9964 9965 foreach my $which (\$self, \$other) { 9966 next unless ref $$which; 9967 if ($$which->can('_operator_stringify')) { 9968 $$which = $$which->_operator_stringify; 9969 } 9970 else { 9971 my $ref = ref $$which; 9972 my $addr = do { no overloading; pack 'J', $$which; }; 9973 $$which = "$ref ($addr)"; 9974 } 9975 } 9976 return ($reversed) 9977 ? "$other$self" 9978 : "$self$other"; 9979} 9980 9981sub _operator_dot_equal($self, $other="", $reversed=0) { 9982 # Overloaded '.=' method that is common to all packages. 9983 9984 if ($reversed) { 9985 return $other .= "$self"; 9986 } 9987 else { 9988 return "$self" . "$other"; 9989 } 9990} 9991 9992sub _operator_equal($self, $other, @) { 9993 # Generic overloaded '==' routine. To be equal, they must be the exact 9994 # same object 9995 9996 return 0 unless defined $other; 9997 return 0 unless ref $other; 9998 no overloading; 9999 return $self == $other; 10000} 10001 10002sub _operator_not_equal($self, $other, @) { 10003 return ! _operator_equal($self, $other); 10004} 10005 10006sub substitute_PropertyAliases($file_object) { 10007 # Deal with early releases that don't have the crucial PropertyAliases.txt 10008 # file. 10009 10010 $file_object->insert_lines(get_old_property_aliases()); 10011 10012 process_PropertyAliases($file_object); 10013} 10014 10015 10016sub process_PropertyAliases($file) { 10017 # This reads in the PropertyAliases.txt file, which contains almost all 10018 # the character properties in Unicode and their equivalent aliases: 10019 # scf ; Simple_Case_Folding ; sfc 10020 # 10021 # Field 0 is the preferred short name for the property. 10022 # Field 1 is the full name. 10023 # Any succeeding ones are other accepted names. 10024 10025 # Add any cjk properties that may have been defined. 10026 $file->insert_lines(@cjk_properties); 10027 10028 while ($file->next_line) { 10029 10030 my @data = split /\s*;\s*/; 10031 10032 my $full = $data[1]; 10033 10034 # This line is defective in early Perls. The property in Unihan.txt 10035 # is kRSUnicode. 10036 if ($full eq 'Unicode_Radical_Stroke' && @data < 3) { 10037 push @data, qw(cjkRSUnicode kRSUnicode); 10038 } 10039 10040 my $this = Property->new($data[0], Full_Name => $full); 10041 10042 $this->set_fate($SUPPRESSED, $why_suppressed{$full}) 10043 if $why_suppressed{$full}; 10044 10045 # Start looking for more aliases after these two. 10046 for my $i (2 .. @data - 1) { 10047 $this->add_alias($data[$i]); 10048 } 10049 10050 } 10051 10052 my $scf = property_ref("Simple_Case_Folding"); 10053 $scf->add_alias("scf"); 10054 $scf->add_alias("sfc"); 10055 10056 return; 10057} 10058 10059sub finish_property_setup($file) { 10060 # Finishes setting up after PropertyAliases. 10061 10062 # This entry was missing from this file in earlier Unicode versions 10063 if (-e 'Jamo.txt' && ! defined property_ref('JSN')) { 10064 Property->new('JSN', Full_Name => 'Jamo_Short_Name'); 10065 } 10066 10067 # These are used so much, that we set globals for them. 10068 $gc = property_ref('General_Category'); 10069 $block = property_ref('Block'); 10070 $script = property_ref('Script'); 10071 $age = property_ref('Age'); 10072 10073 # Perl adds this alias. 10074 $gc->add_alias('Category'); 10075 10076 # Unicode::Normalize expects this file with this name and directory. 10077 $ccc = property_ref('Canonical_Combining_Class'); 10078 if (defined $ccc) { 10079 $ccc->set_file('CombiningClass'); 10080 $ccc->set_directory(File::Spec->curdir()); 10081 } 10082 10083 # These two properties aren't actually used in the core, but unfortunately 10084 # the names just above that are in the core interfere with these, so 10085 # choose different names. These aren't a problem unless the map tables 10086 # for these files get written out. 10087 my $lowercase = property_ref('Lowercase'); 10088 $lowercase->set_file('IsLower') if defined $lowercase; 10089 my $uppercase = property_ref('Uppercase'); 10090 $uppercase->set_file('IsUpper') if defined $uppercase; 10091 10092 # Set up the hard-coded default mappings, but only on properties defined 10093 # for this release 10094 foreach my $property (keys %default_mapping) { 10095 my $property_object = property_ref($property); 10096 next if ! defined $property_object; 10097 my $default_map = $default_mapping{$property}; 10098 $property_object->set_default_map($default_map); 10099 10100 # A map of <code point> implies the property is string. 10101 if ($property_object->type == $UNKNOWN 10102 && $default_map eq $CODE_POINT) 10103 { 10104 $property_object->set_type($STRING); 10105 } 10106 } 10107 10108 # The following use the Multi_Default class to create objects for 10109 # defaults. 10110 10111 # Bidi class has a complicated default, but the derived file takes care of 10112 # the complications, leaving just 'L'. 10113 if (file_exists("${EXTRACTED}DBidiClass.txt")) { 10114 property_ref('Bidi_Class')->set_default_map('L'); 10115 } 10116 else { 10117 my $default; 10118 10119 # The derived file was introduced in 3.1.1. The values below are 10120 # taken from table 3-8, TUS 3.0 10121 my $default_R = 10122 'my $default = Range_List->new; 10123 $default->add_range(0x0590, 0x05FF); 10124 $default->add_range(0xFB1D, 0xFB4F);' 10125 ; 10126 10127 # The defaults apply only to unassigned characters 10128 $default_R .= '$gc->table("Unassigned") & $default;'; 10129 10130 if ($v_version lt v3.0.0) { 10131 $default = Multi_Default->new(R => $default_R, 'L'); 10132 } 10133 else { 10134 10135 # AL apparently not introduced until 3.0: TUS 2.x references are 10136 # not on-line to check it out 10137 my $default_AL = 10138 'my $default = Range_List->new; 10139 $default->add_range(0x0600, 0x07BF); 10140 $default->add_range(0xFB50, 0xFDFF); 10141 $default->add_range(0xFE70, 0xFEFF);' 10142 ; 10143 10144 # Non-character code points introduced in this release; aren't AL 10145 if ($v_version ge 3.1.0) { 10146 $default_AL .= '$default->delete_range(0xFDD0, 0xFDEF);'; 10147 } 10148 $default_AL .= '$gc->table("Unassigned") & $default'; 10149 $default = Multi_Default->new(AL => $default_AL, 10150 R => $default_R, 10151 'L'); 10152 } 10153 property_ref('Bidi_Class')->set_default_map($default); 10154 } 10155 10156 # Joining type has a complicated default, but the derived file takes care 10157 # of the complications, leaving just 'U' (or Non_Joining), except the file 10158 # is bad in 3.1.0 10159 if (file_exists("${EXTRACTED}DJoinType.txt") || -e 'ArabicShaping.txt') { 10160 if (file_exists("${EXTRACTED}DJoinType.txt") && $v_version ne 3.1.0) { 10161 property_ref('Joining_Type')->set_default_map('Non_Joining'); 10162 } 10163 else { 10164 10165 # Otherwise, there are not one, but two possibilities for the 10166 # missing defaults: T and U. 10167 # The missing defaults that evaluate to T are given by: 10168 # T = Mn + Cf - ZWNJ - ZWJ 10169 # where Mn and Cf are the general category values. In other words, 10170 # any non-spacing mark or any format control character, except 10171 # U+200C ZERO WIDTH NON-JOINER (joining type U) and U+200D ZERO 10172 # WIDTH JOINER (joining type C). 10173 my $default = Multi_Default->new( 10174 'T' => '$gc->table("Mn") + $gc->table("Cf") - 0x200C - 0x200D', 10175 'Non_Joining'); 10176 property_ref('Joining_Type')->set_default_map($default); 10177 } 10178 } 10179 10180 # Line break has a complicated default in early releases. It is 'Unknown' 10181 # for non-assigned code points; 'AL' for assigned. 10182 if (file_exists("${EXTRACTED}DLineBreak.txt") || -e 'LineBreak.txt') { 10183 my $lb = property_ref('Line_Break'); 10184 if (file_exists("${EXTRACTED}DLineBreak.txt")) { 10185 $lb->set_default_map('Unknown'); 10186 } 10187 else { 10188 my $default = Multi_Default->new('AL' => '~ $gc->table("Cn")', 10189 'Unknown', 10190 ); 10191 $lb->set_default_map($default); 10192 } 10193 } 10194 10195 # For backwards compatibility with applications that may read the mapping 10196 # file directly (it was documented in 5.12 and 5.14 as being thusly 10197 # usable), keep it from being adjusted. (range_size_1 is 10198 # used to force the traditional format.) 10199 if (defined (my $nfkc_cf = property_ref('NFKC_Casefold'))) { 10200 $nfkc_cf->set_to_output_map($EXTERNAL_MAP); 10201 $nfkc_cf->set_range_size_1(1); 10202 } 10203 if (defined (my $bmg = property_ref('Bidi_Mirroring_Glyph'))) { 10204 $bmg->set_to_output_map($EXTERNAL_MAP); 10205 $bmg->set_range_size_1(1); 10206 } 10207 10208 property_ref('Numeric_Value')->set_to_output_map($OUTPUT_ADJUSTED); 10209 10210 return; 10211} 10212 10213sub get_old_property_aliases() { 10214 # Returns what would be in PropertyAliases.txt if it existed in very old 10215 # versions of Unicode. It was derived from the one in 3.2, and pared 10216 # down based on the data that was actually in the older releases. 10217 # An attempt was made to use the existence of files to mean inclusion or 10218 # not of various aliases, but if this was not sufficient, using version 10219 # numbers was resorted to. 10220 10221 my @return; 10222 10223 # These are to be used in all versions (though some are constructed by 10224 # this program if missing) 10225 push @return, split /\n/, <<'END'; 10226bc ; Bidi_Class 10227Bidi_M ; Bidi_Mirrored 10228cf ; Case_Folding 10229ccc ; Canonical_Combining_Class 10230dm ; Decomposition_Mapping 10231dt ; Decomposition_Type 10232gc ; General_Category 10233isc ; ISO_Comment 10234lc ; Lowercase_Mapping 10235na ; Name 10236na1 ; Unicode_1_Name 10237nt ; Numeric_Type 10238nv ; Numeric_Value 10239scf ; Simple_Case_Folding 10240slc ; Simple_Lowercase_Mapping 10241stc ; Simple_Titlecase_Mapping 10242suc ; Simple_Uppercase_Mapping 10243tc ; Titlecase_Mapping 10244uc ; Uppercase_Mapping 10245END 10246 10247 if (-e 'Blocks.txt') { 10248 push @return, "blk ; Block\n"; 10249 } 10250 if (-e 'ArabicShaping.txt') { 10251 push @return, split /\n/, <<'END'; 10252jg ; Joining_Group 10253jt ; Joining_Type 10254END 10255 } 10256 if (-e 'PropList.txt') { 10257 10258 # This first set is in the original old-style proplist. 10259 push @return, split /\n/, <<'END'; 10260Bidi_C ; Bidi_Control 10261Dash ; Dash 10262Dia ; Diacritic 10263Ext ; Extender 10264Hex ; Hex_Digit 10265Hyphen ; Hyphen 10266IDC ; ID_Continue 10267Ideo ; Ideographic 10268Join_C ; Join_Control 10269Math ; Math 10270QMark ; Quotation_Mark 10271Term ; Terminal_Punctuation 10272WSpace ; White_Space 10273END 10274 # The next sets were added later 10275 if ($v_version ge v3.0.0) { 10276 push @return, split /\n/, <<'END'; 10277Upper ; Uppercase 10278Lower ; Lowercase 10279END 10280 } 10281 if ($v_version ge v3.0.1) { 10282 push @return, split /\n/, <<'END'; 10283NChar ; Noncharacter_Code_Point 10284END 10285 } 10286 # The next sets were added in the new-style 10287 if ($v_version ge v3.1.0) { 10288 push @return, split /\n/, <<'END'; 10289OAlpha ; Other_Alphabetic 10290OLower ; Other_Lowercase 10291OMath ; Other_Math 10292OUpper ; Other_Uppercase 10293END 10294 } 10295 if ($v_version ge v3.1.1) { 10296 push @return, "AHex ; ASCII_Hex_Digit\n"; 10297 } 10298 } 10299 if (-e 'EastAsianWidth.txt') { 10300 push @return, "ea ; East_Asian_Width\n"; 10301 } 10302 if (-e 'CompositionExclusions.txt') { 10303 push @return, "CE ; Composition_Exclusion\n"; 10304 } 10305 if (-e 'LineBreak.txt') { 10306 push @return, "lb ; Line_Break\n"; 10307 } 10308 if (-e 'BidiMirroring.txt') { 10309 push @return, "bmg ; Bidi_Mirroring_Glyph\n"; 10310 } 10311 if (-e 'Scripts.txt') { 10312 push @return, "sc ; Script\n"; 10313 } 10314 if (-e 'DNormalizationProps.txt') { 10315 push @return, split /\n/, <<'END'; 10316Comp_Ex ; Full_Composition_Exclusion 10317FC_NFKC ; FC_NFKC_Closure 10318NFC_QC ; NFC_Quick_Check 10319NFD_QC ; NFD_Quick_Check 10320NFKC_QC ; NFKC_Quick_Check 10321NFKD_QC ; NFKD_Quick_Check 10322XO_NFC ; Expands_On_NFC 10323XO_NFD ; Expands_On_NFD 10324XO_NFKC ; Expands_On_NFKC 10325XO_NFKD ; Expands_On_NFKD 10326END 10327 } 10328 if (-e 'DCoreProperties.txt') { 10329 push @return, split /\n/, <<'END'; 10330Alpha ; Alphabetic 10331IDS ; ID_Start 10332XIDC ; XID_Continue 10333XIDS ; XID_Start 10334END 10335 # These can also appear in some versions of PropList.txt 10336 push @return, "Lower ; Lowercase\n" 10337 unless grep { $_ =~ /^Lower\b/} @return; 10338 push @return, "Upper ; Uppercase\n" 10339 unless grep { $_ =~ /^Upper\b/} @return; 10340 } 10341 10342 # This flag requires the DAge.txt file to be copied into the directory. 10343 if (DEBUG && $compare_versions) { 10344 push @return, 'age ; Age'; 10345 } 10346 10347 return @return; 10348} 10349 10350sub substitute_PropValueAliases($file_object) { 10351 # Deal with early releases that don't have the crucial 10352 # PropValueAliases.txt file. 10353 10354 $file_object->insert_lines(get_old_property_value_aliases()); 10355 10356 process_PropValueAliases($file_object); 10357} 10358 10359sub process_PropValueAliases($file) { 10360 # This file contains values that properties look like: 10361 # bc ; AL ; Arabic_Letter 10362 # blk; n/a ; Greek_And_Coptic ; Greek 10363 # 10364 # Field 0 is the property. 10365 # Field 1 is the short name of a property value or 'n/a' if no 10366 # short name exists; 10367 # Field 2 is the full property value name; 10368 # Any other fields are more synonyms for the property value. 10369 # Purely numeric property values are omitted from the file; as are some 10370 # others, fewer and fewer in later releases 10371 10372 # Entries for the ccc property have an extra field before the 10373 # abbreviation: 10374 # ccc; 0; NR ; Not_Reordered 10375 # It is the numeric value that the names are synonyms for. 10376 10377 # There are comment entries for values missing from this file: 10378 # # @missing: 0000..10FFFF; ISO_Comment; <none> 10379 # # @missing: 0000..10FFFF; Lowercase_Mapping; <code point> 10380 10381 if ($v_version lt 4.0.0) { 10382 $file->insert_lines(split /\n/, <<'END' 10383Hangul_Syllable_Type; L ; Leading_Jamo 10384Hangul_Syllable_Type; LV ; LV_Syllable 10385Hangul_Syllable_Type; LVT ; LVT_Syllable 10386Hangul_Syllable_Type; NA ; Not_Applicable 10387Hangul_Syllable_Type; T ; Trailing_Jamo 10388Hangul_Syllable_Type; V ; Vowel_Jamo 10389END 10390 ); 10391 } 10392 if ($v_version lt 4.1.0) { 10393 $file->insert_lines(split /\n/, <<'END' 10394_Perl_GCB; CN ; Control 10395_Perl_GCB; CR ; CR 10396_Perl_GCB; EX ; Extend 10397_Perl_GCB; L ; L 10398_Perl_GCB; LF ; LF 10399_Perl_GCB; LV ; LV 10400_Perl_GCB; LVT ; LVT 10401_Perl_GCB; T ; T 10402_Perl_GCB; V ; V 10403_Perl_GCB; XX ; Other 10404END 10405 ); 10406 } 10407 10408 # Add any explicit cjk values 10409 $file->insert_lines(@cjk_property_values); 10410 10411 # This line is used only for testing the code that checks for name 10412 # conflicts. There is a script Inherited, and when this line is executed 10413 # it causes there to be a name conflict with the 'Inherited' that this 10414 # program generates for this block property value 10415 #$file->insert_lines('blk; n/a; Herited'); 10416 10417 # Process each line of the file ... 10418 while ($file->next_line) { 10419 10420 # Fix typo in input file 10421 s/CCC133/CCC132/g if $v_version eq v6.1.0; 10422 10423 my ($property, @data) = split /\s*;\s*/; 10424 10425 # The ccc property has an extra field at the beginning, which is the 10426 # numeric value. Move it to be after the other two, mnemonic, fields, 10427 # so that those will be used as the property value's names, and the 10428 # number will be an extra alias. (Rightmost splice removes field 1-2, 10429 # returning them in a slice; left splice inserts that before anything, 10430 # thus shifting the former field 0 to after them.) 10431 splice (@data, 0, 0, splice(@data, 1, 2)) if $property eq 'ccc'; 10432 10433 if ($v_version le v5.0.0 && $property eq 'blk' && $data[1] =~ /-/) { 10434 my $new_style = $data[1] =~ s/-/_/gr; 10435 splice @data, 1, 0, $new_style; 10436 } 10437 10438 # Field 0 is a short name unless "n/a"; field 1 is the full name. If 10439 # there is no short name, use the full one in element 1 10440 if ($data[0] eq "n/a") { 10441 $data[0] = $data[1]; 10442 } 10443 elsif ($data[0] ne $data[1] 10444 && standardize($data[0]) eq standardize($data[1]) 10445 && $data[1] !~ /[[:upper:]]/) 10446 { 10447 # Also, there is a bug in the file in which "n/a" is omitted, and 10448 # the two fields are identical except for case, and the full name 10449 # is all lower case. Copy the "short" name unto the full one to 10450 # give it some upper case. 10451 10452 $data[1] = $data[0]; 10453 } 10454 10455 # Earlier releases had the pseudo property 'qc' that should expand to 10456 # the ones that replace it below. 10457 if ($property eq 'qc') { 10458 if (lc $data[0] eq 'y') { 10459 $file->insert_lines('NFC_QC; Y ; Yes', 10460 'NFD_QC; Y ; Yes', 10461 'NFKC_QC; Y ; Yes', 10462 'NFKD_QC; Y ; Yes', 10463 ); 10464 } 10465 elsif (lc $data[0] eq 'n') { 10466 $file->insert_lines('NFC_QC; N ; No', 10467 'NFD_QC; N ; No', 10468 'NFKC_QC; N ; No', 10469 'NFKD_QC; N ; No', 10470 ); 10471 } 10472 elsif (lc $data[0] eq 'm') { 10473 $file->insert_lines('NFC_QC; M ; Maybe', 10474 'NFKC_QC; M ; Maybe', 10475 ); 10476 } 10477 else { 10478 $file->carp_bad_line("qc followed by unexpected '$data[0]"); 10479 } 10480 next; 10481 } 10482 10483 # The first field is the short name, 2nd is the full one. 10484 my $property_object = property_ref($property); 10485 my $table = $property_object->add_match_table($data[0], 10486 Full_Name => $data[1]); 10487 10488 # Start looking for more aliases after these two. 10489 for my $i (2 .. @data - 1) { 10490 $table->add_alias($data[$i]); 10491 } 10492 } # End of looping through the file 10493 10494 # As noted in the comments early in the program, it generates tables for 10495 # the default values for all releases, even those for which the concept 10496 # didn't exist at the time. Here we add those if missing. 10497 if (defined $age && ! defined $age->table('Unassigned')) { 10498 $age->add_match_table('Unassigned'); 10499 } 10500 $block->add_match_table('No_Block') if -e 'Blocks.txt' 10501 && ! defined $block->table('No_Block'); 10502 10503 10504 # Now set the default mappings of the properties from the file. This is 10505 # done after the loop because a number of properties have only @missings 10506 # entries in the file, and may not show up until the end. 10507 my @defaults = $file->get_missings; 10508 foreach my $default_ref (@defaults) { 10509 my $default = $default_ref->[0]; 10510 my $property = property_ref($default_ref->[1]); 10511 $property->set_default_map($default); 10512 } 10513 return; 10514} 10515 10516sub get_old_property_value_aliases () { 10517 # Returns what would be in PropValueAliases.txt if it existed in very old 10518 # versions of Unicode. It was derived from the one in 3.2, and pared 10519 # down. An attempt was made to use the existence of files to mean 10520 # inclusion or not of various aliases, but if this was not sufficient, 10521 # using version numbers was resorted to. 10522 10523 my @return = split /\n/, <<'END'; 10524bc ; AN ; Arabic_Number 10525bc ; B ; Paragraph_Separator 10526bc ; CS ; Common_Separator 10527bc ; EN ; European_Number 10528bc ; ES ; European_Separator 10529bc ; ET ; European_Terminator 10530bc ; L ; Left_To_Right 10531bc ; ON ; Other_Neutral 10532bc ; R ; Right_To_Left 10533bc ; WS ; White_Space 10534 10535Bidi_M; N; No; F; False 10536Bidi_M; Y; Yes; T; True 10537 10538# The standard combining classes are very much different in v1, so only use 10539# ones that look right (not checked thoroughly) 10540ccc; 0; NR ; Not_Reordered 10541ccc; 1; OV ; Overlay 10542ccc; 7; NK ; Nukta 10543ccc; 8; KV ; Kana_Voicing 10544ccc; 9; VR ; Virama 10545ccc; 202; ATBL ; Attached_Below_Left 10546ccc; 216; ATAR ; Attached_Above_Right 10547ccc; 218; BL ; Below_Left 10548ccc; 220; B ; Below 10549ccc; 222; BR ; Below_Right 10550ccc; 224; L ; Left 10551ccc; 228; AL ; Above_Left 10552ccc; 230; A ; Above 10553ccc; 232; AR ; Above_Right 10554ccc; 234; DA ; Double_Above 10555 10556dt ; can ; canonical 10557dt ; enc ; circle 10558dt ; fin ; final 10559dt ; font ; font 10560dt ; fra ; fraction 10561dt ; init ; initial 10562dt ; iso ; isolated 10563dt ; med ; medial 10564dt ; n/a ; none 10565dt ; nb ; noBreak 10566dt ; sqr ; square 10567dt ; sub ; sub 10568dt ; sup ; super 10569 10570gc ; C ; Other # Cc | Cf | Cn | Co | Cs 10571gc ; Cc ; Control 10572gc ; Cn ; Unassigned 10573gc ; Co ; Private_Use 10574gc ; L ; Letter # Ll | Lm | Lo | Lt | Lu 10575gc ; LC ; Cased_Letter # Ll | Lt | Lu 10576gc ; Ll ; Lowercase_Letter 10577gc ; Lm ; Modifier_Letter 10578gc ; Lo ; Other_Letter 10579gc ; Lu ; Uppercase_Letter 10580gc ; M ; Mark # Mc | Me | Mn 10581gc ; Mc ; Spacing_Mark 10582gc ; Mn ; Nonspacing_Mark 10583gc ; N ; Number # Nd | Nl | No 10584gc ; Nd ; Decimal_Number 10585gc ; No ; Other_Number 10586gc ; P ; Punctuation # Pc | Pd | Pe | Pf | Pi | Po | Ps 10587gc ; Pd ; Dash_Punctuation 10588gc ; Pe ; Close_Punctuation 10589gc ; Po ; Other_Punctuation 10590gc ; Ps ; Open_Punctuation 10591gc ; S ; Symbol # Sc | Sk | Sm | So 10592gc ; Sc ; Currency_Symbol 10593gc ; Sm ; Math_Symbol 10594gc ; So ; Other_Symbol 10595gc ; Z ; Separator # Zl | Zp | Zs 10596gc ; Zl ; Line_Separator 10597gc ; Zp ; Paragraph_Separator 10598gc ; Zs ; Space_Separator 10599 10600nt ; de ; Decimal 10601nt ; di ; Digit 10602nt ; n/a ; None 10603nt ; nu ; Numeric 10604END 10605 10606 if (-e 'ArabicShaping.txt') { 10607 push @return, split /\n/, <<'END'; 10608jg ; n/a ; AIN 10609jg ; n/a ; ALEF 10610jg ; n/a ; DAL 10611jg ; n/a ; GAF 10612jg ; n/a ; LAM 10613jg ; n/a ; MEEM 10614jg ; n/a ; NO_JOINING_GROUP 10615jg ; n/a ; NOON 10616jg ; n/a ; QAF 10617jg ; n/a ; SAD 10618jg ; n/a ; SEEN 10619jg ; n/a ; TAH 10620jg ; n/a ; WAW 10621 10622jt ; C ; Join_Causing 10623jt ; D ; Dual_Joining 10624jt ; L ; Left_Joining 10625jt ; R ; Right_Joining 10626jt ; U ; Non_Joining 10627jt ; T ; Transparent 10628END 10629 if ($v_version ge v3.0.0) { 10630 push @return, split /\n/, <<'END'; 10631jg ; n/a ; ALAPH 10632jg ; n/a ; BEH 10633jg ; n/a ; BETH 10634jg ; n/a ; DALATH_RISH 10635jg ; n/a ; E 10636jg ; n/a ; FEH 10637jg ; n/a ; FINAL_SEMKATH 10638jg ; n/a ; GAMAL 10639jg ; n/a ; HAH 10640jg ; n/a ; HAMZA_ON_HEH_GOAL 10641jg ; n/a ; HE 10642jg ; n/a ; HEH 10643jg ; n/a ; HEH_GOAL 10644jg ; n/a ; HETH 10645jg ; n/a ; KAF 10646jg ; n/a ; KAPH 10647jg ; n/a ; KNOTTED_HEH 10648jg ; n/a ; LAMADH 10649jg ; n/a ; MIM 10650jg ; n/a ; NUN 10651jg ; n/a ; PE 10652jg ; n/a ; QAPH 10653jg ; n/a ; REH 10654jg ; n/a ; REVERSED_PE 10655jg ; n/a ; SADHE 10656jg ; n/a ; SEMKATH 10657jg ; n/a ; SHIN 10658jg ; n/a ; SWASH_KAF 10659jg ; n/a ; TAW 10660jg ; n/a ; TEH_MARBUTA 10661jg ; n/a ; TETH 10662jg ; n/a ; YEH 10663jg ; n/a ; YEH_BARREE 10664jg ; n/a ; YEH_WITH_TAIL 10665jg ; n/a ; YUDH 10666jg ; n/a ; YUDH_HE 10667jg ; n/a ; ZAIN 10668END 10669 } 10670 } 10671 10672 10673 if (-e 'EastAsianWidth.txt') { 10674 push @return, split /\n/, <<'END'; 10675ea ; A ; Ambiguous 10676ea ; F ; Fullwidth 10677ea ; H ; Halfwidth 10678ea ; N ; Neutral 10679ea ; Na ; Narrow 10680ea ; W ; Wide 10681END 10682 } 10683 10684 if (-e 'LineBreak.txt' || -e 'LBsubst.txt') { 10685 my @lb = split /\n/, <<'END'; 10686lb ; AI ; Ambiguous 10687lb ; AL ; Alphabetic 10688lb ; B2 ; Break_Both 10689lb ; BA ; Break_After 10690lb ; BB ; Break_Before 10691lb ; BK ; Mandatory_Break 10692lb ; CB ; Contingent_Break 10693lb ; CL ; Close_Punctuation 10694lb ; CM ; Combining_Mark 10695lb ; CR ; Carriage_Return 10696lb ; EX ; Exclamation 10697lb ; GL ; Glue 10698lb ; HY ; Hyphen 10699lb ; ID ; Ideographic 10700lb ; IN ; Inseperable 10701lb ; IS ; Infix_Numeric 10702lb ; LF ; Line_Feed 10703lb ; NS ; Nonstarter 10704lb ; NU ; Numeric 10705lb ; OP ; Open_Punctuation 10706lb ; PO ; Postfix_Numeric 10707lb ; PR ; Prefix_Numeric 10708lb ; QU ; Quotation 10709lb ; SA ; Complex_Context 10710lb ; SG ; Surrogate 10711lb ; SP ; Space 10712lb ; SY ; Break_Symbols 10713lb ; XX ; Unknown 10714lb ; ZW ; ZWSpace 10715END 10716 # If this Unicode version predates the lb property, we use our 10717 # substitute one 10718 if (-e 'LBsubst.txt') { 10719 $_ = s/^lb/_Perl_LB/r for @lb; 10720 } 10721 push @return, @lb; 10722 } 10723 10724 if (-e 'DNormalizationProps.txt') { 10725 push @return, split /\n/, <<'END'; 10726qc ; M ; Maybe 10727qc ; N ; No 10728qc ; Y ; Yes 10729END 10730 } 10731 10732 if (-e 'Scripts.txt') { 10733 push @return, split /\n/, <<'END'; 10734sc ; Arab ; Arabic 10735sc ; Armn ; Armenian 10736sc ; Beng ; Bengali 10737sc ; Bopo ; Bopomofo 10738sc ; Cans ; Canadian_Aboriginal 10739sc ; Cher ; Cherokee 10740sc ; Cyrl ; Cyrillic 10741sc ; Deva ; Devanagari 10742sc ; Dsrt ; Deseret 10743sc ; Ethi ; Ethiopic 10744sc ; Geor ; Georgian 10745sc ; Goth ; Gothic 10746sc ; Grek ; Greek 10747sc ; Gujr ; Gujarati 10748sc ; Guru ; Gurmukhi 10749sc ; Hang ; Hangul 10750sc ; Hani ; Han 10751sc ; Hebr ; Hebrew 10752sc ; Hira ; Hiragana 10753sc ; Ital ; Old_Italic 10754sc ; Kana ; Katakana 10755sc ; Khmr ; Khmer 10756sc ; Knda ; Kannada 10757sc ; Laoo ; Lao 10758sc ; Latn ; Latin 10759sc ; Mlym ; Malayalam 10760sc ; Mong ; Mongolian 10761sc ; Mymr ; Myanmar 10762sc ; Ogam ; Ogham 10763sc ; Orya ; Oriya 10764sc ; Qaai ; Inherited 10765sc ; Runr ; Runic 10766sc ; Sinh ; Sinhala 10767sc ; Syrc ; Syriac 10768sc ; Taml ; Tamil 10769sc ; Telu ; Telugu 10770sc ; Thaa ; Thaana 10771sc ; Thai ; Thai 10772sc ; Tibt ; Tibetan 10773sc ; Yiii ; Yi 10774sc ; Zyyy ; Common 10775END 10776 } 10777 10778 if ($v_version ge v2.0.0) { 10779 push @return, split /\n/, <<'END'; 10780dt ; com ; compat 10781dt ; nar ; narrow 10782dt ; sml ; small 10783dt ; vert ; vertical 10784dt ; wide ; wide 10785 10786gc ; Cf ; Format 10787gc ; Cs ; Surrogate 10788gc ; Lt ; Titlecase_Letter 10789gc ; Me ; Enclosing_Mark 10790gc ; Nl ; Letter_Number 10791gc ; Pc ; Connector_Punctuation 10792gc ; Sk ; Modifier_Symbol 10793END 10794 } 10795 if ($v_version ge v2.1.2) { 10796 push @return, "bc ; S ; Segment_Separator\n"; 10797 } 10798 if ($v_version ge v2.1.5) { 10799 push @return, split /\n/, <<'END'; 10800gc ; Pf ; Final_Punctuation 10801gc ; Pi ; Initial_Punctuation 10802END 10803 } 10804 if ($v_version ge v2.1.8) { 10805 push @return, "ccc; 240; IS ; Iota_Subscript\n"; 10806 } 10807 10808 if ($v_version ge v3.0.0) { 10809 push @return, split /\n/, <<'END'; 10810bc ; AL ; Arabic_Letter 10811bc ; BN ; Boundary_Neutral 10812bc ; LRE ; Left_To_Right_Embedding 10813bc ; LRO ; Left_To_Right_Override 10814bc ; NSM ; Nonspacing_Mark 10815bc ; PDF ; Pop_Directional_Format 10816bc ; RLE ; Right_To_Left_Embedding 10817bc ; RLO ; Right_To_Left_Override 10818 10819ccc; 233; DB ; Double_Below 10820END 10821 } 10822 10823 if ($v_version ge v3.1.0) { 10824 push @return, "ccc; 226; R ; Right\n"; 10825 } 10826 10827 return @return; 10828} 10829 10830sub process_NormalizationsTest($file) { 10831 10832 # Each line looks like: 10833 # source code point; NFC; NFD; NFKC; NFKD 10834 # e.g. 10835 # 1E0A;1E0A;0044 0307;1E0A;0044 0307; 10836 10837 # Process each line of the file ... 10838 while ($file->next_line) { 10839 10840 next if /^@/; 10841 10842 my ($c1, $c2, $c3, $c4, $c5) = split /\s*;\s*/; 10843 10844 foreach my $var (\$c1, \$c2, \$c3, \$c4, \$c5) { 10845 $$var = pack "U0U*", map { hex } split " ", $$var; 10846 $$var =~ s/(\\)/$1$1/g; 10847 } 10848 10849 push @normalization_tests, 10850 "Test_N(q$c1, q$c2, q$c3, q$c4, q$c5);\n"; 10851 } # End of looping through the file 10852} 10853 10854sub output_perl_charnames_line ($a, $b) { 10855 10856 # Output the entries in Perl_charnames specially, using 5 digits instead 10857 # of four. This makes the entries a constant length, and simplifies 10858 # charnames.pm which this table is for. Unicode can have 6 digit 10859 # ordinals, but they are all private use or noncharacters which do not 10860 # have names, so won't be in this table. 10861 10862 return sprintf "%05X\n%s\n\n", $_[0], $_[1]; 10863} 10864 10865{ # Closure 10866 10867 # These are constants to the $property_info hash in this subroutine, to 10868 # avoid using a quoted-string which might have a typo. 10869 my $TYPE = 'type'; 10870 my $DEFAULT_MAP = 'default_map'; 10871 my $DEFAULT_TABLE = 'default_table'; 10872 my $PSEUDO_MAP_TYPE = 'pseudo_map_type'; 10873 my $MISSINGS = 'missings'; 10874 10875 sub process_generic_property_file($file) { 10876 # This processes a file containing property mappings and puts them 10877 # into internal map tables. It should be used to handle any property 10878 # files that have mappings from a code point or range thereof to 10879 # something else. This means almost all the UCD .txt files. 10880 # each_line_handlers() should be set to adjust the lines of these 10881 # files, if necessary, to what this routine understands: 10882 # 10883 # 0374 ; NFD_QC; N 10884 # 003C..003E ; Math 10885 # 10886 # the fields are: "codepoint-range ; property; map" 10887 # 10888 # meaning the codepoints in the range all have the value 'map' under 10889 # 'property'. 10890 # Beginning and trailing white space in each field are not significant. 10891 # Note there is not a trailing semi-colon in the above. A trailing 10892 # semi-colon means the map is a null-string. An omitted map, as 10893 # opposed to a null-string, is assumed to be 'Y', based on Unicode 10894 # table syntax. (This could have been hidden from this routine by 10895 # doing it in the $file object, but that would require parsing of the 10896 # line there, so would have to parse it twice, or change the interface 10897 # to pass this an array. So not done.) 10898 # 10899 # The map field may begin with a sequence of commands that apply to 10900 # this range. Each such command begins and ends with $CMD_DELIM. 10901 # These are used to indicate, for example, that the mapping for a 10902 # range has a non-default type. 10903 # 10904 # This loops through the file, calling its next_line() method, and 10905 # then taking the map and adding it to the property's table. 10906 # Complications arise because any number of properties can be in the 10907 # file, in any order, interspersed in any way. The first time a 10908 # property is seen, it gets information about that property and 10909 # caches it for quick retrieval later. It also normalizes the maps 10910 # so that only one of many synonyms is stored. The Unicode input 10911 # files do use some multiple synonyms. 10912 10913 my %property_info; # To keep track of what properties 10914 # have already had entries in the 10915 # current file, and info about each, 10916 # so don't have to recompute. 10917 my $property_name; # property currently being worked on 10918 my $property_type; # and its type 10919 my $previous_property_name = ""; # name from last time through loop 10920 my $property_object; # pointer to the current property's 10921 # object 10922 my $property_addr; # the address of that object 10923 my $default_map; # the string that code points missing 10924 # from the file map to 10925 my $default_table; # For non-string properties, a 10926 # reference to the match table that 10927 # will contain the list of code 10928 # points that map to $default_map. 10929 10930 # Get the next real non-comment line 10931 LINE: 10932 while ($file->next_line) { 10933 10934 # Default replacement type; means that if parts of the range have 10935 # already been stored in our tables, the new map overrides them if 10936 # they differ more than cosmetically 10937 my $replace = $IF_NOT_EQUIVALENT; 10938 my $map_type; # Default type for the map of this range 10939 10940 #local $to_trace = 1 if main::DEBUG; 10941 trace $_ if main::DEBUG && $to_trace; 10942 10943 # Split the line into components 10944 my ($range, $property_name, $map, @remainder) 10945 = split /\s*;\s*/, $_, -1; # -1 => retain trailing null fields 10946 10947 # If more or less on the line than we are expecting, warn and skip 10948 # the line 10949 if (@remainder) { 10950 $file->carp_bad_line('Extra fields'); 10951 next LINE; 10952 } 10953 elsif ( ! defined $property_name) { 10954 $file->carp_bad_line('Missing property'); 10955 next LINE; 10956 } 10957 10958 # Examine the range. 10959 if ($range !~ /^ ($code_point_re) (?:\.\. ($code_point_re) )? $/x) 10960 { 10961 $file->carp_bad_line("Range '$range' not of the form 'CP1' or 'CP1..CP2' (where CP1,2 are code points in hex)"); 10962 next LINE; 10963 } 10964 my $low = hex $1; 10965 my $high = (defined $2) ? hex $2 : $low; 10966 10967 # If changing to a new property, get the things constant per 10968 # property 10969 if ($previous_property_name ne $property_name) { 10970 10971 $property_object = property_ref($property_name); 10972 if (! defined $property_object) { 10973 $file->carp_bad_line("Unexpected property '$property_name'. Skipped"); 10974 next LINE; 10975 } 10976 { no overloading; $property_addr = pack 'J', $property_object; } 10977 10978 # Defer changing names until have a line that is acceptable 10979 # (the 'next' statement above means is unacceptable) 10980 $previous_property_name = $property_name; 10981 10982 # If not the first time for this property, retrieve info about 10983 # it from the cache 10984 if (defined ($property_info{$property_addr}{$TYPE})) { 10985 $property_type = $property_info{$property_addr}{$TYPE}; 10986 $default_map = $property_info{$property_addr}{$DEFAULT_MAP}; 10987 $map_type 10988 = $property_info{$property_addr}{$PSEUDO_MAP_TYPE}; 10989 $default_table 10990 = $property_info{$property_addr}{$DEFAULT_TABLE}; 10991 } 10992 else { 10993 10994 # Here, is the first time for this property. Set up the 10995 # cache. 10996 $property_type = $property_info{$property_addr}{$TYPE} 10997 = $property_object->type; 10998 $map_type 10999 = $property_info{$property_addr}{$PSEUDO_MAP_TYPE} 11000 = $property_object->pseudo_map_type; 11001 11002 # The Unicode files are set up so that if the map is not 11003 # defined, it is a binary property 11004 if (! defined $map && $property_type != $BINARY) { 11005 if ($property_type != $UNKNOWN 11006 && $property_type != $NON_STRING) 11007 { 11008 $file->carp_bad_line("No mapping defined on a non-binary property. Using 'Y' for the map"); 11009 } 11010 else { 11011 $property_object->set_type($BINARY); 11012 $property_type 11013 = $property_info{$property_addr}{$TYPE} 11014 = $BINARY; 11015 } 11016 } 11017 11018 # Get any @missings default for this property. This 11019 # should precede the first entry for the property in the 11020 # input file, and is located in a comment that has been 11021 # stored by the Input_file class until we access it here. 11022 # It's possible that there is more than one such line 11023 # waiting for us; collect them all, and parse 11024 my @missings_list = $file->get_missings 11025 if $file->has_missings_defaults; 11026 foreach my $default_ref (@missings_list) { 11027 my $default = $default_ref->[0]; 11028 my $addr = do { no overloading; pack 'J', property_ref($default_ref->[1]); }; 11029 11030 # For string properties, the default is just what the 11031 # file says, but non-string properties should already 11032 # have set up a table for the default property value; 11033 # use the table for these, so can resolve synonyms 11034 # later to a single standard one. 11035 if ($property_type == $STRING 11036 || $property_type == $UNKNOWN) 11037 { 11038 $property_info{$addr}{$MISSINGS} = $default; 11039 } 11040 else { 11041 $property_info{$addr}{$MISSINGS} 11042 = $property_object->table($default); 11043 } 11044 } 11045 11046 # Finished storing all the @missings defaults in the input 11047 # file so far. Get the one for the current property. 11048 my $missings = $property_info{$property_addr}{$MISSINGS}; 11049 11050 # But we likely have separately stored what the default 11051 # should be. (This is to accommodate versions of the 11052 # standard where the @missings lines are absent or 11053 # incomplete.) Hopefully the two will match. But check 11054 # it out. 11055 $default_map = $property_object->default_map; 11056 11057 # If the map is a ref, it means that the default won't be 11058 # processed until later, so undef it, so next few lines 11059 # will redefine it to something that nothing will match 11060 undef $default_map if ref $default_map; 11061 11062 # Create a $default_map if don't have one; maybe a dummy 11063 # that won't match anything. 11064 if (! defined $default_map) { 11065 11066 # Use any @missings line in the file. 11067 if (defined $missings) { 11068 if (ref $missings) { 11069 $default_map = $missings->full_name; 11070 $default_table = $missings; 11071 } 11072 else { 11073 $default_map = $missings; 11074 } 11075 11076 # And store it with the property for outside use. 11077 $property_object->set_default_map($default_map); 11078 } 11079 else { 11080 11081 # Neither an @missings nor a default map. Create 11082 # a dummy one, so won't have to test definedness 11083 # in the main loop. 11084 $default_map = '_Perl This will never be in a file 11085 from Unicode'; 11086 } 11087 } 11088 11089 # Here, we have $default_map defined, possibly in terms of 11090 # $missings, but maybe not, and possibly is a dummy one. 11091 if (defined $missings) { 11092 11093 # Make sure there is no conflict between the two. 11094 # $missings has priority. 11095 if (ref $missings) { 11096 $default_table 11097 = $property_object->table($default_map); 11098 if (! defined $default_table 11099 || $default_table != $missings) 11100 { 11101 if (! defined $default_table) { 11102 $default_table = $UNDEF; 11103 } 11104 $file->carp_bad_line(<<END 11105The \@missings line for $property_name in $file says that missings default to 11106$missings, but we expect it to be $default_table. $missings used. 11107END 11108 ); 11109 $default_table = $missings; 11110 $default_map = $missings->full_name; 11111 } 11112 $property_info{$property_addr}{$DEFAULT_TABLE} 11113 = $default_table; 11114 } 11115 elsif ($default_map ne $missings) { 11116 $file->carp_bad_line(<<END 11117The \@missings line for $property_name in $file says that missings default to 11118$missings, but we expect it to be $default_map. $missings used. 11119END 11120 ); 11121 $default_map = $missings; 11122 } 11123 } 11124 11125 $property_info{$property_addr}{$DEFAULT_MAP} 11126 = $default_map; 11127 11128 # If haven't done so already, find the table corresponding 11129 # to this map for non-string properties. 11130 if (! defined $default_table 11131 && $property_type != $STRING 11132 && $property_type != $UNKNOWN) 11133 { 11134 $default_table = $property_info{$property_addr} 11135 {$DEFAULT_TABLE} 11136 = $property_object->table($default_map); 11137 } 11138 } # End of is first time for this property 11139 } # End of switching properties. 11140 11141 # Ready to process the line. 11142 # The Unicode files are set up so that if the map is not defined, 11143 # it is a binary property with value 'Y' 11144 if (! defined $map) { 11145 $map = 'Y'; 11146 } 11147 else { 11148 11149 # If the map begins with a special command to us (enclosed in 11150 # delimiters), extract the command(s). 11151 while ($map =~ s/ ^ $CMD_DELIM (.*?) $CMD_DELIM //x) { 11152 my $command = $1; 11153 if ($command =~ / ^ $REPLACE_CMD= (.*) /x) { 11154 $replace = $1; 11155 } 11156 elsif ($command =~ / ^ $MAP_TYPE_CMD= (.*) /x) { 11157 $map_type = $1; 11158 } 11159 else { 11160 $file->carp_bad_line("Unknown command line: '$1'"); 11161 next LINE; 11162 } 11163 } 11164 } 11165 11166 if ($default_map eq $CODE_POINT && $map =~ / ^ $code_point_re $/x) 11167 { 11168 11169 # Here, we have a map to a particular code point, and the 11170 # default map is to a code point itself. If the range 11171 # includes the particular code point, change that portion of 11172 # the range to the default. This makes sure that in the final 11173 # table only the non-defaults are listed. 11174 my $decimal_map = hex $map; 11175 if ($low <= $decimal_map && $decimal_map <= $high) { 11176 11177 # If the range includes stuff before or after the map 11178 # we're changing, split it and process the split-off parts 11179 # later. 11180 if ($low < $decimal_map) { 11181 $file->insert_adjusted_lines( 11182 sprintf("%04X..%04X; %s; %s", 11183 $low, 11184 $decimal_map - 1, 11185 $property_name, 11186 $map)); 11187 } 11188 if ($high > $decimal_map) { 11189 $file->insert_adjusted_lines( 11190 sprintf("%04X..%04X; %s; %s", 11191 $decimal_map + 1, 11192 $high, 11193 $property_name, 11194 $map)); 11195 } 11196 $low = $high = $decimal_map; 11197 $map = $CODE_POINT; 11198 } 11199 } 11200 11201 # If we can tell that this is a synonym for the default map, use 11202 # the default one instead. 11203 if ($property_type != $STRING 11204 && $property_type != $UNKNOWN) 11205 { 11206 my $table = $property_object->table($map); 11207 if (defined $table && $table == $default_table) { 11208 $map = $default_map; 11209 } 11210 } 11211 11212 # And figure out the map type if not known. 11213 if (! defined $map_type || $map_type == $COMPUTE_NO_MULTI_CP) { 11214 if ($map eq "") { # Nulls are always $NULL map type 11215 $map_type = $NULL; 11216 } # Otherwise, non-strings, and those that don't allow 11217 # $MULTI_CP, and those that aren't multiple code points are 11218 # 0 11219 elsif 11220 (($property_type != $STRING && $property_type != $UNKNOWN) 11221 || (defined $map_type && $map_type == $COMPUTE_NO_MULTI_CP) 11222 || $map !~ /^ $code_point_re ( \ $code_point_re )+ $ /x) 11223 { 11224 $map_type = 0; 11225 } 11226 else { 11227 $map_type = $MULTI_CP; 11228 } 11229 } 11230 11231 $property_object->add_map($low, $high, 11232 $map, 11233 Type => $map_type, 11234 Replace => $replace); 11235 } # End of loop through file's lines 11236 11237 return; 11238 } 11239} 11240 11241{ # Closure for UnicodeData.txt handling 11242 11243 # This file was the first one in the UCD; its design leads to some 11244 # awkwardness in processing. Here is a sample line: 11245 # 0041;LATIN CAPITAL LETTER A;Lu;0;L;;;;;N;;;;0061; 11246 # The fields in order are: 11247 my $i = 0; # The code point is in field 0, and is shifted off. 11248 my $CHARNAME = $i++; # character name (e.g. "LATIN CAPITAL LETTER A") 11249 my $CATEGORY = $i++; # category (e.g. "Lu") 11250 my $CCC = $i++; # Canonical combining class (e.g. "230") 11251 my $BIDI = $i++; # directional class (e.g. "L") 11252 my $PERL_DECOMPOSITION = $i++; # decomposition mapping 11253 my $PERL_DECIMAL_DIGIT = $i++; # decimal digit value 11254 my $NUMERIC_TYPE_OTHER_DIGIT = $i++; # digit value, like a superscript 11255 # Dual-use in this program; see below 11256 my $NUMERIC = $i++; # numeric value 11257 my $MIRRORED = $i++; # ? mirrored 11258 my $UNICODE_1_NAME = $i++; # name in Unicode 1.0 11259 my $COMMENT = $i++; # iso comment 11260 my $UPPER = $i++; # simple uppercase mapping 11261 my $LOWER = $i++; # simple lowercase mapping 11262 my $TITLE = $i++; # simple titlecase mapping 11263 my $input_field_count = $i; 11264 11265 # This routine in addition outputs these extra fields: 11266 11267 my $DECOMP_TYPE = $i++; # Decomposition type 11268 11269 # These fields are modifications of ones above, and are usually 11270 # suppressed; they must come last, as for speed, the loop upper bound is 11271 # normally set to ignore them 11272 my $NAME = $i++; # This is the strict name field, not the one that 11273 # charnames uses. 11274 my $DECOMP_MAP = $i++; # Strict decomposition mapping; not the one used 11275 # by Unicode::Normalize 11276 my $last_field = $i - 1; 11277 11278 # All these are read into an array for each line, with the indices defined 11279 # above. The empty fields in the example line above indicate that the 11280 # value is defaulted. The handler called for each line of the input 11281 # changes these to their defaults. 11282 11283 # Here are the official names of the properties, in a parallel array: 11284 my @field_names; 11285 $field_names[$BIDI] = 'Bidi_Class'; 11286 $field_names[$CATEGORY] = 'General_Category'; 11287 $field_names[$CCC] = 'Canonical_Combining_Class'; 11288 $field_names[$CHARNAME] = 'Perl_Charnames'; 11289 $field_names[$COMMENT] = 'ISO_Comment'; 11290 $field_names[$DECOMP_MAP] = 'Decomposition_Mapping'; 11291 $field_names[$DECOMP_TYPE] = 'Decomposition_Type'; 11292 $field_names[$LOWER] = 'Lowercase_Mapping'; 11293 $field_names[$MIRRORED] = 'Bidi_Mirrored'; 11294 $field_names[$NAME] = 'Name'; 11295 $field_names[$NUMERIC] = 'Numeric_Value'; 11296 $field_names[$NUMERIC_TYPE_OTHER_DIGIT] = 'Numeric_Type'; 11297 $field_names[$PERL_DECIMAL_DIGIT] = 'Perl_Decimal_Digit'; 11298 $field_names[$PERL_DECOMPOSITION] = 'Perl_Decomposition_Mapping'; 11299 $field_names[$TITLE] = 'Titlecase_Mapping'; 11300 $field_names[$UNICODE_1_NAME] = 'Unicode_1_Name'; 11301 $field_names[$UPPER] = 'Uppercase_Mapping'; 11302 11303 # Some of these need a little more explanation: 11304 # The $PERL_DECIMAL_DIGIT field does not lead to an official Unicode 11305 # property, but is used in calculating the Numeric_Type. Perl however, 11306 # creates a file from this field, so a Perl property is created from it. 11307 # Similarly, the Other_Digit field is used only for calculating the 11308 # Numeric_Type, and so it can be safely re-used as the place to store 11309 # the value for Numeric_Type; hence it is referred to as 11310 # $NUMERIC_TYPE_OTHER_DIGIT. 11311 # The input field named $PERL_DECOMPOSITION is a combination of both the 11312 # decomposition mapping and its type. Perl creates a file containing 11313 # exactly this field, so it is used for that. The two properties are 11314 # separated into two extra output fields, $DECOMP_MAP and $DECOMP_TYPE. 11315 # $DECOMP_MAP is usually suppressed (unless the lists are changed to 11316 # output it), as Perl doesn't use it directly. 11317 # The input field named here $CHARNAME is used to construct the 11318 # Perl_Charnames property, which is a combination of the Name property 11319 # (which the input field contains), and the Unicode_1_Name property, and 11320 # others from other files. Since, the strict Name property is not used 11321 # by Perl, this field is used for the table that Perl does use. The 11322 # strict Name property table is usually suppressed (unless the lists are 11323 # changed to output it), so it is accumulated in a separate field, 11324 # $NAME, which to save time is discarded unless the table is actually to 11325 # be output 11326 11327 # This file is processed like most in this program. Control is passed to 11328 # process_generic_property_file() which calls filter_UnicodeData_line() 11329 # for each input line. This filter converts the input into line(s) that 11330 # process_generic_property_file() understands. There is also a setup 11331 # routine called before any of the file is processed, and a handler for 11332 # EOF processing, all in this closure. 11333 11334 # A huge speed-up occurred at the cost of some added complexity when these 11335 # routines were altered to buffer the outputs into ranges. Almost all the 11336 # lines of the input file apply to just one code point, and for most 11337 # properties, the map for the next code point up is the same as the 11338 # current one. So instead of creating a line for each property for each 11339 # input line, filter_UnicodeData_line() remembers what the previous map 11340 # of a property was, and doesn't generate a line to pass on until it has 11341 # to, as when the map changes; and that passed-on line encompasses the 11342 # whole contiguous range of code points that have the same map for that 11343 # property. This means a slight amount of extra setup, and having to 11344 # flush these buffers on EOF, testing if the maps have changed, plus 11345 # remembering state information in the closure. But it means a lot less 11346 # real time in not having to change the data base for each property on 11347 # each line. 11348 11349 # Another complication is that there are already a few ranges designated 11350 # in the input. There are two lines for each, with the same maps except 11351 # the code point and name on each line. This was actually the hardest 11352 # thing to design around. The code points in those ranges may actually 11353 # have real maps not given by these two lines. These maps will either 11354 # be algorithmically determinable, or be in the extracted files furnished 11355 # with the UCD. In the event of conflicts between these extracted files, 11356 # and this one, Unicode says that this one prevails. But it shouldn't 11357 # prevail for conflicts that occur in these ranges. The data from the 11358 # extracted files prevails in those cases. So, this program is structured 11359 # so that those files are processed first, storing maps. Then the other 11360 # files are processed, generally overwriting what the extracted files 11361 # stored. But just the range lines in this input file are processed 11362 # without overwriting. This is accomplished by adding a special string to 11363 # the lines output to tell process_generic_property_file() to turn off the 11364 # overwriting for just this one line. 11365 # A similar mechanism is used to tell it that the map is of a non-default 11366 # type. 11367 11368 sub setup_UnicodeData($file) { # Called before any lines of the input are read 11369 11370 # Create a new property specially located that is a combination of 11371 # various Name properties: Name, Unicode_1_Name, Named Sequences, and 11372 # _Perl_Name_Alias properties. (The final one duplicates elements of the 11373 # first, and starting in v6.1, is the same as the 'Name_Alias 11374 # property.) A comment for the new property will later be constructed 11375 # based on the actual properties present and used 11376 $perl_charname = Property->new('Perl_Charnames', 11377 Default_Map => "", 11378 Directory => File::Spec->curdir(), 11379 File => 'Name', 11380 Fate => $INTERNAL_ONLY, 11381 Perl_Extension => 1, 11382 Range_Size_1 => \&output_perl_charnames_line, 11383 Type => $STRING, 11384 ); 11385 $perl_charname->set_proxy_for('Name'); 11386 11387 my $Perl_decomp = Property->new('Perl_Decomposition_Mapping', 11388 Directory => File::Spec->curdir(), 11389 File => 'Decomposition', 11390 Format => $DECOMP_STRING_FORMAT, 11391 Fate => $INTERNAL_ONLY, 11392 Perl_Extension => 1, 11393 Default_Map => $CODE_POINT, 11394 11395 # normalize.pm can't cope with these 11396 Output_Range_Counts => 0, 11397 11398 # This is a specially formatted table 11399 # explicitly for normalize.pm, which 11400 # is expecting a particular format, 11401 # which means that mappings containing 11402 # multiple code points are in the main 11403 # body of the table 11404 Map_Type => $COMPUTE_NO_MULTI_CP, 11405 Type => $STRING, 11406 To_Output_Map => $INTERNAL_MAP, 11407 ); 11408 $Perl_decomp->set_proxy_for('Decomposition_Mapping', 'Decomposition_Type'); 11409 $Perl_decomp->add_comment(join_lines(<<END 11410This mapping is a combination of the Unicode 'Decomposition_Type' and 11411'Decomposition_Mapping' properties, formatted for use by normalize.pm. It is 11412identical to the official Unicode 'Decomposition_Mapping' property except for 11413two things: 11414 1) It omits the algorithmically determinable Hangul syllable decompositions, 11415which normalize.pm handles algorithmically. 11416 2) It contains the decomposition type as well. Non-canonical decompositions 11417begin with a word in angle brackets, like <super>, which denotes the 11418compatible decomposition type. If the map does not begin with the <angle 11419brackets>, the decomposition is canonical. 11420END 11421 )); 11422 11423 my $Decimal_Digit = Property->new("Perl_Decimal_Digit", 11424 Default_Map => "", 11425 Perl_Extension => 1, 11426 Directory => $map_directory, 11427 Type => $STRING, 11428 To_Output_Map => $OUTPUT_ADJUSTED, 11429 ); 11430 $Decimal_Digit->add_comment(join_lines(<<END 11431This file gives the mapping of all code points which represent a single 11432decimal digit [0-9] to their respective digits, but it has ranges of 10 code 11433points, and the mapping of each non-initial element of each range is actually 11434not to "0", but to the offset that element has from its corresponding DIGIT 0. 11435These code points are those that have Numeric_Type=Decimal; not special 11436things, like subscripts nor Roman numerals. 11437END 11438 )); 11439 11440 # These properties are not used for generating anything else, and are 11441 # usually not output. By making them last in the list, we can just 11442 # change the high end of the loop downwards to avoid the work of 11443 # generating a table(s) that is/are just going to get thrown away. 11444 if (! property_ref('Decomposition_Mapping')->to_output_map 11445 && ! property_ref('Name')->to_output_map) 11446 { 11447 $last_field = min($NAME, $DECOMP_MAP) - 1; 11448 } elsif (property_ref('Decomposition_Mapping')->to_output_map) { 11449 $last_field = $DECOMP_MAP; 11450 } elsif (property_ref('Name')->to_output_map) { 11451 $last_field = $NAME; 11452 } 11453 return; 11454 } 11455 11456 my $first_time = 1; # ? Is this the first line of the file 11457 my $in_range = 0; # ? Are we in one of the file's ranges 11458 my $previous_cp; # hex code point of previous line 11459 my $decimal_previous_cp = -1; # And its decimal equivalent 11460 my @start; # For each field, the current starting 11461 # code point in hex for the range 11462 # being accumulated. 11463 my @fields; # The input fields; 11464 my @previous_fields; # And those from the previous call 11465 11466 sub filter_UnicodeData_line($file) { 11467 # Handle a single input line from UnicodeData.txt; see comments above 11468 # Conceptually this takes a single line from the file containing N 11469 # properties, and converts it into N lines with one property per line, 11470 # which is what the final handler expects. But there are 11471 # complications due to the quirkiness of the input file, and to save 11472 # time, it accumulates ranges where the property values don't change 11473 # and only emits lines when necessary. This is about an order of 11474 # magnitude fewer lines emitted. 11475 11476 # $_ contains the input line. 11477 # -1 in split means retain trailing null fields 11478 (my $cp, @fields) = split /\s*;\s*/, $_, -1; 11479 11480 #local $to_trace = 1 if main::DEBUG; 11481 trace $cp, @fields , $input_field_count if main::DEBUG && $to_trace; 11482 if (@fields > $input_field_count) { 11483 $file->carp_bad_line('Extra fields'); 11484 $_ = ""; 11485 return; 11486 } 11487 11488 my $decimal_cp = hex $cp; 11489 11490 # We have to output all the buffered ranges when the next code point 11491 # is not exactly one after the previous one, which means there is a 11492 # gap in the ranges. 11493 my $force_output = ($decimal_cp != $decimal_previous_cp + 1); 11494 11495 # The decomposition mapping field requires special handling. It looks 11496 # like either: 11497 # 11498 # <compat> 0032 0020 11499 # 0041 0300 11500 # 11501 # The decomposition type is enclosed in <brackets>; if missing, it 11502 # means the type is canonical. There are two decomposition mapping 11503 # tables: the one for use by Perl's normalize.pm has a special format 11504 # which is this field intact; the other, for general use is of 11505 # standard format. In either case we have to find the decomposition 11506 # type. Empty fields have None as their type, and map to the code 11507 # point itself 11508 if ($fields[$PERL_DECOMPOSITION] eq "") { 11509 $fields[$DECOMP_TYPE] = 'None'; 11510 $fields[$DECOMP_MAP] = $fields[$PERL_DECOMPOSITION] = $CODE_POINT; 11511 } 11512 else { 11513 ($fields[$DECOMP_TYPE], my $map) = $fields[$PERL_DECOMPOSITION] 11514 =~ / < ( .+? ) > \s* ( .+ ) /x; 11515 if (! defined $fields[$DECOMP_TYPE]) { 11516 $fields[$DECOMP_TYPE] = 'Canonical'; 11517 $fields[$DECOMP_MAP] = $fields[$PERL_DECOMPOSITION]; 11518 } 11519 else { 11520 $fields[$DECOMP_MAP] = $map; 11521 } 11522 } 11523 11524 # The 3 numeric fields also require special handling. The 2 digit 11525 # fields must be either empty or match the number field. This means 11526 # that if it is empty, they must be as well, and the numeric type is 11527 # None, and the numeric value is 'Nan'. 11528 # The decimal digit field must be empty or match the other digit 11529 # field. If the decimal digit field is non-empty, the code point is 11530 # a decimal digit, and the other two fields will have the same value. 11531 # If it is empty, but the other digit field is non-empty, the code 11532 # point is an 'other digit', and the number field will have the same 11533 # value as the other digit field. If the other digit field is empty, 11534 # but the number field is non-empty, the code point is a generic 11535 # numeric type. 11536 if ($fields[$NUMERIC] eq "") { 11537 if ($fields[$PERL_DECIMAL_DIGIT] ne "" 11538 || $fields[$NUMERIC_TYPE_OTHER_DIGIT] ne "" 11539 ) { 11540 $file->carp_bad_line("Numeric values inconsistent. Trying to process anyway"); 11541 } 11542 $fields[$NUMERIC_TYPE_OTHER_DIGIT] = 'None'; 11543 $fields[$NUMERIC] = 'NaN'; 11544 } 11545 else { 11546 $file->carp_bad_line("'$fields[$NUMERIC]' should be a whole or rational number. Processing as if it were") if $fields[$NUMERIC] !~ qr{ ^ -? \d+ ( / \d+ )? $ }x; 11547 if ($fields[$PERL_DECIMAL_DIGIT] ne "") { 11548 $file->carp_bad_line("$fields[$PERL_DECIMAL_DIGIT] should equal $fields[$NUMERIC]. Processing anyway") if $fields[$PERL_DECIMAL_DIGIT] != $fields[$NUMERIC]; 11549 $file->carp_bad_line("$fields[$PERL_DECIMAL_DIGIT] should be empty since the general category ($fields[$CATEGORY]) isn't 'Nd'. Processing as Decimal") if $fields[$CATEGORY] ne "Nd"; 11550 $fields[$NUMERIC_TYPE_OTHER_DIGIT] = 'Decimal'; 11551 } 11552 elsif ($fields[$NUMERIC_TYPE_OTHER_DIGIT] ne "") { 11553 $file->carp_bad_line("$fields[$NUMERIC_TYPE_OTHER_DIGIT] should equal $fields[$NUMERIC]. Processing anyway") if $fields[$NUMERIC_TYPE_OTHER_DIGIT] != $fields[$NUMERIC]; 11554 $fields[$NUMERIC_TYPE_OTHER_DIGIT] = 'Digit'; 11555 } 11556 else { 11557 $fields[$NUMERIC_TYPE_OTHER_DIGIT] = 'Numeric'; 11558 11559 # Rationals require extra effort. 11560 if ($fields[$NUMERIC] =~ qr{/}) { 11561 reduce_fraction(\$fields[$NUMERIC]); 11562 register_fraction($fields[$NUMERIC]) 11563 } 11564 } 11565 } 11566 11567 # For the properties that have empty fields in the file, and which 11568 # mean something different from empty, change them to that default. 11569 # Certain fields just haven't been empty so far in any Unicode 11570 # version, so don't look at those, namely $MIRRORED, $BIDI, $CCC, 11571 # $CATEGORY. This leaves just the two fields, and so we hard-code in 11572 # the defaults; which are very unlikely to ever change. 11573 $fields[$UPPER] = $CODE_POINT if $fields[$UPPER] eq ""; 11574 $fields[$LOWER] = $CODE_POINT if $fields[$LOWER] eq ""; 11575 11576 # UAX44 says that if title is empty, it is the same as whatever upper 11577 # is, 11578 $fields[$TITLE] = $fields[$UPPER] if $fields[$TITLE] eq ""; 11579 11580 # There are a few pairs of lines like: 11581 # AC00;<Hangul Syllable, First>;Lo;0;L;;;;;N;;;;; 11582 # D7A3;<Hangul Syllable, Last>;Lo;0;L;;;;;N;;;;; 11583 # that define ranges. These should be processed after the fields are 11584 # adjusted above, as they may override some of them; but mostly what 11585 # is left is to possibly adjust the $CHARNAME field. The names of all the 11586 # paired lines start with a '<', but this is also true of '<control>, 11587 # which isn't one of these special ones. 11588 if ($fields[$CHARNAME] eq '<control>') { 11589 11590 # Some code points in this file have the pseudo-name 11591 # '<control>', but the official name for such ones is the null 11592 # string. 11593 $fields[$NAME] = $fields[$CHARNAME] = ""; 11594 11595 # We had better not be in between range lines. 11596 if ($in_range) { 11597 $file->carp_bad_line("Expecting a closing range line, not a $fields[$CHARNAME]'. Trying anyway"); 11598 $in_range = 0; 11599 } 11600 } 11601 elsif (substr($fields[$CHARNAME], 0, 1) ne '<') { 11602 11603 # Here is a non-range line. We had better not be in between range 11604 # lines. 11605 if ($in_range) { 11606 $file->carp_bad_line("Expecting a closing range line, not a $fields[$CHARNAME]'. Trying anyway"); 11607 $in_range = 0; 11608 } 11609 if ($fields[$CHARNAME] =~ s/- $cp $//x) { 11610 11611 # These are code points whose names end in their code points, 11612 # which means the names are algorithmically derivable from the 11613 # code points. To shorten the output Name file, the algorithm 11614 # for deriving these is placed in the file instead of each 11615 # code point, so they have map type $CP_IN_NAME 11616 $fields[$CHARNAME] = $CMD_DELIM 11617 . $MAP_TYPE_CMD 11618 . '=' 11619 . $CP_IN_NAME 11620 . $CMD_DELIM 11621 . $fields[$CHARNAME]; 11622 } 11623 $fields[$NAME] = $fields[$CHARNAME]; 11624 } 11625 elsif ($fields[$CHARNAME] =~ /^<(.+), First>$/) { 11626 $fields[$CHARNAME] = $fields[$NAME] = $1; 11627 11628 # Here we are at the beginning of a range pair. 11629 if ($in_range) { 11630 $file->carp_bad_line("Expecting a closing range line, not a beginning one, $fields[$CHARNAME]'. Trying anyway"); 11631 } 11632 $in_range = 1; 11633 11634 # Because the properties in the range do not overwrite any already 11635 # in the db, we must flush the buffers of what's already there, so 11636 # they get handled in the normal scheme. 11637 $force_output = 1; 11638 11639 } 11640 elsif ($fields[$CHARNAME] !~ s/^<(.+), Last>$/$1/) { 11641 $file->carp_bad_line("Unexpected name starting with '<' $fields[$CHARNAME]. Ignoring this line."); 11642 $_ = ""; 11643 return; 11644 } 11645 else { # Here, we are at the last line of a range pair. 11646 11647 if (! $in_range) { 11648 $file->carp_bad_line("Unexpected end of range $fields[$CHARNAME] when not in one. Ignoring this line."); 11649 $_ = ""; 11650 return; 11651 } 11652 $in_range = 0; 11653 11654 $fields[$NAME] = $fields[$CHARNAME]; 11655 11656 # Check that the input is valid: that the closing of the range is 11657 # the same as the beginning. 11658 foreach my $i (0 .. $last_field) { 11659 next if $fields[$i] eq $previous_fields[$i]; 11660 $file->carp_bad_line("Expecting '$fields[$i]' to be the same as '$previous_fields[$i]'. Bad News. Trying anyway"); 11661 } 11662 11663 # The processing differs depending on the type of range, 11664 # determined by its $CHARNAME 11665 if ($fields[$CHARNAME] =~ /^Hangul Syllable/) { 11666 11667 # Check that the data looks right. 11668 if ($decimal_previous_cp != $SBase) { 11669 $file->carp_bad_line("Unexpected Hangul syllable start = $previous_cp. Bad News. Results will be wrong"); 11670 } 11671 if ($decimal_cp != $SBase + $SCount - 1) { 11672 $file->carp_bad_line("Unexpected Hangul syllable end = $cp. Bad News. Results will be wrong"); 11673 } 11674 11675 # The Hangul syllable range has a somewhat complicated name 11676 # generation algorithm. Each code point in it has a canonical 11677 # decomposition also computable by an algorithm. The 11678 # perl decomposition map table built from these is used only 11679 # by normalize.pm, which has the algorithm built in it, so the 11680 # decomposition maps are not needed, and are large, so are 11681 # omitted from it. If the full decomposition map table is to 11682 # be output, the decompositions are generated for it, in the 11683 # EOF handling code for this input file. 11684 11685 $previous_fields[$DECOMP_TYPE] = 'Canonical'; 11686 11687 # This range is stored in our internal structure with its 11688 # own map type, different from all others. 11689 $previous_fields[$CHARNAME] = $previous_fields[$NAME] 11690 = $CMD_DELIM 11691 . $MAP_TYPE_CMD 11692 . '=' 11693 . $HANGUL_SYLLABLE 11694 . $CMD_DELIM 11695 . $fields[$CHARNAME]; 11696 } 11697 elsif ($fields[$CATEGORY] eq 'Lo') { # Is a letter 11698 11699 # All the CJK ranges like this have the name given as a 11700 # special case in the next code line. And for the others, we 11701 # hope that Unicode continues to use the correct name in 11702 # future releases, so we don't have to make further special 11703 # cases. 11704 my $name = ($fields[$CHARNAME] =~ /^CJK/) 11705 ? 'CJK UNIFIED IDEOGRAPH' 11706 : uc $fields[$CHARNAME]; 11707 11708 # The name for these contains the code point itself, and all 11709 # are defined to have the same base name, regardless of what 11710 # is in the file. They are stored in our internal structure 11711 # with a map type of $CP_IN_NAME 11712 $previous_fields[$CHARNAME] = $previous_fields[$NAME] 11713 = $CMD_DELIM 11714 . $MAP_TYPE_CMD 11715 . '=' 11716 . $CP_IN_NAME 11717 . $CMD_DELIM 11718 . $name; 11719 11720 } 11721 elsif ($fields[$CATEGORY] eq 'Co' 11722 || $fields[$CATEGORY] eq 'Cs') 11723 { 11724 # The names of all the code points in these ranges are set to 11725 # null, as there are no names for the private use and 11726 # surrogate code points. 11727 11728 $previous_fields[$CHARNAME] = $previous_fields[$NAME] = ""; 11729 } 11730 else { 11731 $file->carp_bad_line("Unexpected code point range $fields[$CHARNAME] because category is $fields[$CATEGORY]. Attempting to process it."); 11732 } 11733 11734 # The first line of the range caused everything else to be output, 11735 # and then its values were stored as the beginning values for the 11736 # next set of ranges, which this one ends. Now, for each value, 11737 # add a command to tell the handler that these values should not 11738 # replace any existing ones in our database. 11739 foreach my $i (0 .. $last_field) { 11740 $previous_fields[$i] = $CMD_DELIM 11741 . $REPLACE_CMD 11742 . '=' 11743 . $NO 11744 . $CMD_DELIM 11745 . $previous_fields[$i]; 11746 } 11747 11748 # And change things so it looks like the entire range has been 11749 # gone through with this being the final part of it. Adding the 11750 # command above to each field will cause this range to be flushed 11751 # during the next iteration, as it guaranteed that the stored 11752 # field won't match whatever value the next one has. 11753 $previous_cp = $cp; 11754 $decimal_previous_cp = $decimal_cp; 11755 11756 # We are now set up for the next iteration; so skip the remaining 11757 # code in this subroutine that does the same thing, but doesn't 11758 # know about these ranges. 11759 $_ = ""; 11760 11761 return; 11762 } 11763 11764 # On the very first line, we fake it so the code below thinks there is 11765 # nothing to output, and initialize so that when it does get output it 11766 # uses the first line's values for the lowest part of the range. 11767 # (One could avoid this by using peek(), but then one would need to 11768 # know the adjustments done above and do the same ones in the setup 11769 # routine; not worth it) 11770 if ($first_time) { 11771 $first_time = 0; 11772 @previous_fields = @fields; 11773 @start = ($cp) x scalar @fields; 11774 $decimal_previous_cp = $decimal_cp - 1; 11775 } 11776 11777 # For each field, output the stored up ranges that this code point 11778 # doesn't fit in. Earlier we figured out if all ranges should be 11779 # terminated because of changing the replace or map type styles, or if 11780 # there is a gap between this new code point and the previous one, and 11781 # that is stored in $force_output. But even if those aren't true, we 11782 # need to output the range if this new code point's value for the 11783 # given property doesn't match the stored range's. 11784 #local $to_trace = 1 if main::DEBUG; 11785 foreach my $i (0 .. $last_field) { 11786 my $field = $fields[$i]; 11787 if ($force_output || $field ne $previous_fields[$i]) { 11788 11789 # Flush the buffer of stored values. 11790 $file->insert_adjusted_lines("$start[$i]..$previous_cp; $field_names[$i]; $previous_fields[$i]"); 11791 11792 # Start a new range with this code point and its value 11793 $start[$i] = $cp; 11794 $previous_fields[$i] = $field; 11795 } 11796 } 11797 11798 # Set the values for the next time. 11799 $previous_cp = $cp; 11800 $decimal_previous_cp = $decimal_cp; 11801 11802 # The input line has generated whatever adjusted lines are needed, and 11803 # should not be looked at further. 11804 $_ = ""; 11805 return; 11806 } 11807 11808 sub EOF_UnicodeData($file) { 11809 # Called upon EOF to flush the buffers, and create the Hangul 11810 # decomposition mappings if needed. 11811 11812 # Flush the buffers. 11813 foreach my $i (0 .. $last_field) { 11814 $file->insert_adjusted_lines("$start[$i]..$previous_cp; $field_names[$i]; $previous_fields[$i]"); 11815 } 11816 11817 if (-e 'Jamo.txt') { 11818 11819 # The algorithm is published by Unicode, based on values in 11820 # Jamo.txt, (which should have been processed before this 11821 # subroutine), and the results left in %Jamo 11822 unless (%Jamo) { 11823 Carp::my_carp_bug("Jamo.txt should be processed before Unicode.txt. Hangul syllables not generated."); 11824 return; 11825 } 11826 11827 # If the full decomposition map table is being output, insert 11828 # into it the Hangul syllable mappings. This is to avoid having 11829 # to publish a subroutine in it to compute them. (which would 11830 # essentially be this code.) This uses the algorithm published by 11831 # Unicode. (No hangul syllables in version 1) 11832 if ($v_version ge v2.0.0 11833 && property_ref('Decomposition_Mapping')->to_output_map) { 11834 for (my $S = $SBase; $S < $SBase + $SCount; $S++) { 11835 use integer; 11836 my $SIndex = $S - $SBase; 11837 my $L = $LBase + $SIndex / $NCount; 11838 my $V = $VBase + ($SIndex % $NCount) / $TCount; 11839 my $T = $TBase + $SIndex % $TCount; 11840 11841 trace "L=$L, V=$V, T=$T" if main::DEBUG && $to_trace; 11842 my $decomposition = sprintf("%04X %04X", $L, $V); 11843 $decomposition .= sprintf(" %04X", $T) if $T != $TBase; 11844 $file->insert_adjusted_lines( 11845 sprintf("%04X; Decomposition_Mapping; %s", 11846 $S, 11847 $decomposition)); 11848 } 11849 } 11850 } 11851 11852 return; 11853 } 11854 11855 sub filter_v1_ucd($file) { 11856 # Fix UCD lines in version 1. This is probably overkill, but this 11857 # fixes some glaring errors in Version 1 UnicodeData.txt. That file: 11858 # 1) had many Hangul (U+3400 - U+4DFF) code points that were later 11859 # removed. This program retains them 11860 # 2) didn't include ranges, which it should have, and which are now 11861 # added in @corrected_lines below. It was hand populated by 11862 # taking the data from Version 2, verified by analyzing 11863 # DAge.txt. 11864 # 3) There is a syntax error in the entry for U+09F8 which could 11865 # cause problems for Unicode::UCD, and so is changed. It's 11866 # numeric value was simply a minus sign, without any number. 11867 # (Eventually Unicode changed the code point to non-numeric.) 11868 # 4) The decomposition types often don't match later versions 11869 # exactly, and the whole syntax of that field is different; so 11870 # the syntax is changed as well as the types to their later 11871 # terminology. Otherwise normalize.pm would be very unhappy 11872 # 5) Many ccc classes are different. These are left intact. 11873 # 6) U+FF10..U+FF19 are missing their numeric values in all three 11874 # fields. These are unchanged because it doesn't really cause 11875 # problems for Perl. 11876 # 7) A number of code points, such as controls, don't have their 11877 # Unicode Version 1 Names in this file. These are added. 11878 # 8) A number of Symbols were marked as Lm. This changes those in 11879 # the Latin1 range, so that regexes work. 11880 # 9) The odd characters U+03DB .. U+03E1 weren't encoded but are 11881 # referred to by their lc equivalents. Not fixed. 11882 11883 my @corrected_lines = split /\n/, <<'END'; 118844E00;<CJK Ideograph, First>;Lo;0;L;;;;;N;;;;; 118859FA5;<CJK Ideograph, Last>;Lo;0;L;;;;;N;;;;; 11886E000;<Private Use, First>;Co;0;L;;;;;N;;;;; 11887F8FF;<Private Use, Last>;Co;0;L;;;;;N;;;;; 11888F900;<CJK Compatibility Ideograph, First>;Lo;0;L;;;;;N;;;;; 11889FA2D;<CJK Compatibility Ideograph, Last>;Lo;0;L;;;;;N;;;;; 11890END 11891 11892 #local $to_trace = 1 if main::DEBUG; 11893 trace $_ if main::DEBUG && $to_trace; 11894 11895 # -1 => retain trailing null fields 11896 my ($code_point, @fields) = split /\s*;\s*/, $_, -1; 11897 11898 # At the first place that is wrong in the input, insert all the 11899 # corrections, replacing the wrong line. 11900 if ($code_point eq '4E00') { 11901 my @copy = @corrected_lines; 11902 $_ = shift @copy; 11903 ($code_point, @fields) = split /\s*;\s*/, $_, -1; 11904 11905 $file->insert_lines(@copy); 11906 } 11907 elsif ($code_point =~ /^00/ && $fields[$CATEGORY] eq 'Lm') { 11908 11909 # There are no Lm characters in Latin1; these should be 'Sk', but 11910 # there isn't that in V1. 11911 $fields[$CATEGORY] = 'So'; 11912 } 11913 11914 if ($fields[$NUMERIC] eq '-') { 11915 $fields[$NUMERIC] = '-1'; # This is what 2.0 made it. 11916 } 11917 11918 if ($fields[$PERL_DECOMPOSITION] ne "") { 11919 11920 # Several entries have this change to superscript 2 or 3 in the 11921 # middle. Convert these to the modern version, which is to use 11922 # the actual U+00B2 and U+00B3 (the superscript forms) instead. 11923 # So 'HHHH HHHH <+sup> 0033 <-sup> HHHH' becomes 11924 # 'HHHH HHHH 00B3 HHHH'. 11925 # It turns out that all of these that don't have another 11926 # decomposition defined at the beginning of the line have the 11927 # <square> decomposition in later releases. 11928 if ($code_point ne '00B2' && $code_point ne '00B3') { 11929 if ($fields[$PERL_DECOMPOSITION] 11930 =~ s/<\+sup> 003([23]) <-sup>/00B$1/) 11931 { 11932 if (substr($fields[$PERL_DECOMPOSITION], 0, 1) ne '<') { 11933 $fields[$PERL_DECOMPOSITION] = '<square> ' 11934 . $fields[$PERL_DECOMPOSITION]; 11935 } 11936 } 11937 } 11938 11939 # If is like '<+circled> 0052 <-circled>', convert to 11940 # '<circled> 0052' 11941 $fields[$PERL_DECOMPOSITION] =~ 11942 s/ < \+ ( .*? ) > \s* (.*?) \s* <-\1> /<$1> $2/xg; 11943 11944 # Convert '<join> HHHH HHHH <join>' to '<medial> HHHH HHHH', etc. 11945 $fields[$PERL_DECOMPOSITION] =~ 11946 s/ <join> \s* (.*?) \s* <no-join> /<final> $1/x 11947 or $fields[$PERL_DECOMPOSITION] =~ 11948 s/ <join> \s* (.*?) \s* <join> /<medial> $1/x 11949 or $fields[$PERL_DECOMPOSITION] =~ 11950 s/ <no-join> \s* (.*?) \s* <join> /<initial> $1/x 11951 or $fields[$PERL_DECOMPOSITION] =~ 11952 s/ <no-join> \s* (.*?) \s* <no-join> /<isolated> $1/x; 11953 11954 # Convert '<break> HHHH HHHH <break>' to '<break> HHHH', etc. 11955 $fields[$PERL_DECOMPOSITION] =~ 11956 s/ <(break|no-break)> \s* (.*?) \s* <\1> /<$1> $2/x; 11957 11958 # Change names to modern form. 11959 $fields[$PERL_DECOMPOSITION] =~ s/<font variant>/<font>/g; 11960 $fields[$PERL_DECOMPOSITION] =~ s/<no-break>/<noBreak>/g; 11961 $fields[$PERL_DECOMPOSITION] =~ s/<circled>/<circle>/g; 11962 $fields[$PERL_DECOMPOSITION] =~ s/<break>/<fraction>/g; 11963 11964 # One entry has weird braces 11965 $fields[$PERL_DECOMPOSITION] =~ s/[{}]//g; 11966 11967 # One entry at U+2116 has an extra <sup> 11968 $fields[$PERL_DECOMPOSITION] =~ s/( < .*? > .* ) < .*? > \ * /$1/x; 11969 } 11970 11971 $_ = join ';', $code_point, @fields; 11972 trace $_ if main::DEBUG && $to_trace; 11973 return; 11974 } 11975 11976 sub filter_bad_Nd_ucd { 11977 # Early versions specified a value in the decimal digit field even 11978 # though the code point wasn't a decimal digit. Clear the field in 11979 # that situation, so that the main code doesn't think it is a decimal 11980 # digit. 11981 11982 my ($code_point, @fields) = split /\s*;\s*/, $_, -1; 11983 if ($fields[$PERL_DECIMAL_DIGIT] ne "" && $fields[$CATEGORY] ne 'Nd') { 11984 $fields[$PERL_DECIMAL_DIGIT] = ""; 11985 $_ = join ';', $code_point, @fields; 11986 } 11987 return; 11988 } 11989 11990 my @U1_control_names = split /\n/, <<'END'; 11991NULL 11992START OF HEADING 11993START OF TEXT 11994END OF TEXT 11995END OF TRANSMISSION 11996ENQUIRY 11997ACKNOWLEDGE 11998BELL 11999BACKSPACE 12000HORIZONTAL TABULATION 12001LINE FEED 12002VERTICAL TABULATION 12003FORM FEED 12004CARRIAGE RETURN 12005SHIFT OUT 12006SHIFT IN 12007DATA LINK ESCAPE 12008DEVICE CONTROL ONE 12009DEVICE CONTROL TWO 12010DEVICE CONTROL THREE 12011DEVICE CONTROL FOUR 12012NEGATIVE ACKNOWLEDGE 12013SYNCHRONOUS IDLE 12014END OF TRANSMISSION BLOCK 12015CANCEL 12016END OF MEDIUM 12017SUBSTITUTE 12018ESCAPE 12019FILE SEPARATOR 12020GROUP SEPARATOR 12021RECORD SEPARATOR 12022UNIT SEPARATOR 12023DELETE 12024BREAK PERMITTED HERE 12025NO BREAK HERE 12026INDEX 12027NEXT LINE 12028START OF SELECTED AREA 12029END OF SELECTED AREA 12030CHARACTER TABULATION SET 12031CHARACTER TABULATION WITH JUSTIFICATION 12032LINE TABULATION SET 12033PARTIAL LINE DOWN 12034PARTIAL LINE UP 12035REVERSE LINE FEED 12036SINGLE SHIFT TWO 12037SINGLE SHIFT THREE 12038DEVICE CONTROL STRING 12039PRIVATE USE ONE 12040PRIVATE USE TWO 12041SET TRANSMIT STATE 12042CANCEL CHARACTER 12043MESSAGE WAITING 12044START OF GUARDED AREA 12045END OF GUARDED AREA 12046START OF STRING 12047SINGLE CHARACTER INTRODUCER 12048CONTROL SEQUENCE INTRODUCER 12049STRING TERMINATOR 12050OPERATING SYSTEM COMMAND 12051PRIVACY MESSAGE 12052APPLICATION PROGRAM COMMAND 12053END 12054 12055 sub filter_early_U1_names { 12056 # Very early versions did not have the Unicode_1_name field specified. 12057 # They differed in which ones were present; make sure a U1 name 12058 # exists, so that Unicode::UCD::charinfo will work 12059 12060 my ($code_point, @fields) = split /\s*;\s*/, $_, -1; 12061 12062 12063 # @U1_control names above are entirely positional, so we pull them out 12064 # in the exact order required, with gaps for the ones that don't have 12065 # names. 12066 if ($code_point =~ /^00[01]/ 12067 || $code_point eq '007F' 12068 || $code_point =~ /^008[2-9A-F]/ 12069 || $code_point =~ /^009[0-8A-F]/) 12070 { 12071 my $u1_name = shift @U1_control_names; 12072 $fields[$UNICODE_1_NAME] = $u1_name unless $fields[$UNICODE_1_NAME]; 12073 $_ = join ';', $code_point, @fields; 12074 } 12075 return; 12076 } 12077 12078 sub filter_v2_1_5_ucd { 12079 # A dozen entries in this 2.1.5 file had the mirrored and numeric 12080 # columns swapped; These all had mirrored be 'N'. So if the numeric 12081 # column appears to be N, swap it back. 12082 12083 my ($code_point, @fields) = split /\s*;\s*/, $_, -1; 12084 if ($fields[$NUMERIC] eq 'N') { 12085 $fields[$NUMERIC] = $fields[$MIRRORED]; 12086 $fields[$MIRRORED] = 'N'; 12087 $_ = join ';', $code_point, @fields; 12088 } 12089 return; 12090 } 12091 12092 sub filter_v6_ucd { 12093 12094 # Unicode 6.0 co-opted the name BELL for U+1F514, but until 5.17, 12095 # it wasn't accepted, to allow for some deprecation cycles. This 12096 # function is not called after 5.16 12097 12098 return if $_ !~ /^(?:0007|1F514|070F);/; 12099 12100 my ($code_point, @fields) = split /\s*;\s*/, $_, -1; 12101 if ($code_point eq '0007') { 12102 $fields[$CHARNAME] = ""; 12103 } 12104 elsif ($code_point eq '070F') { # Unicode Corrigendum #8; see 12105 # http://www.unicode.org/versions/corrigendum8.html 12106 $fields[$BIDI] = "AL"; 12107 } 12108 elsif ($^V lt v5.18.0) { # For 5.18 will convert to use Unicode's name 12109 $fields[$CHARNAME] = ""; 12110 } 12111 12112 $_ = join ';', $code_point, @fields; 12113 12114 return; 12115 } 12116} # End closure for UnicodeData 12117 12118sub process_GCB_test($file) { 12119 12120 while ($file->next_line) { 12121 push @backslash_X_tests, $_; 12122 } 12123 12124 return; 12125} 12126 12127sub process_LB_test($file) { 12128 12129 while ($file->next_line) { 12130 push @LB_tests, $_; 12131 } 12132 12133 return; 12134} 12135 12136sub process_SB_test($file) { 12137 12138 while ($file->next_line) { 12139 push @SB_tests, $_; 12140 } 12141 12142 return; 12143} 12144 12145sub process_WB_test($file) { 12146 12147 while ($file->next_line) { 12148 push @WB_tests, $_; 12149 } 12150 12151 return; 12152} 12153 12154sub process_NamedSequences($file) { 12155 # NamedSequences.txt entries are just added to an array. Because these 12156 # don't look like the other tables, they have their own handler. 12157 # An example: 12158 # LATIN CAPITAL LETTER A WITH MACRON AND GRAVE;0100 0300 12159 # 12160 # This just adds the sequence to an array for later handling 12161 12162 while ($file->next_line) { 12163 my ($name, $sequence, @remainder) = split /\s*;\s*/, $_, -1; 12164 if (@remainder) { 12165 $file->carp_bad_line( 12166 "Doesn't look like 'KHMER VOWEL SIGN OM;17BB 17C6'"); 12167 next; 12168 } 12169 12170 # Code points need to be 5 digits long like the other entries in 12171 # Name.pl, for regcomp.c parsing; and the ones below 0x0100 need to be 12172 # converted to native 12173 $sequence = join " ", map { sprintf("%05X", 12174 utf8::unicode_to_native(hex $_)) 12175 } split / /, $sequence; 12176 push @named_sequences, "$sequence\n$name\n"; 12177 } 12178 return; 12179} 12180 12181{ # Closure 12182 12183 my $first_range; 12184 12185 sub filter_early_ea_lb { 12186 # Fixes early EastAsianWidth.txt and LineBreak.txt files. These had a 12187 # third field be the name of the code point, which can be ignored in 12188 # most cases. But it can be meaningful if it marks a range: 12189 # 33FE;W;IDEOGRAPHIC TELEGRAPH SYMBOL FOR DAY THIRTY-ONE 12190 # 3400;W;<CJK Ideograph Extension A, First> 12191 # 12192 # We need to see the First in the example above to know it's a range. 12193 # They did not use the later range syntaxes. This routine changes it 12194 # to use the modern syntax. 12195 # $1 is the Input_file object. 12196 12197 my @fields = split /\s*;\s*/; 12198 if ($fields[2] =~ /^<.*, First>/) { 12199 $first_range = $fields[0]; 12200 $_ = ""; 12201 } 12202 elsif ($fields[2] =~ /^<.*, Last>/) { 12203 $_ = $_ = "$first_range..$fields[0]; $fields[1]"; 12204 } 12205 else { 12206 undef $first_range; 12207 $_ = "$fields[0]; $fields[1]"; 12208 } 12209 12210 return; 12211 } 12212} 12213 12214sub filter_substitute_lb { 12215 # Used on Unicodes that predate the LB property, where there is a 12216 # substitute file. This just does the regular ea_lb handling for such 12217 # files, and then substitutes the long property value name for the short 12218 # one that comes with the file. (The other break files have the long 12219 # names in them, so this is the odd one out.) The reason for doing this 12220 # kludge is that regen/mk_invlists.pl is expecting the long name. This 12221 # also fixes the typo 'Inseperable' that leads to problems. 12222 12223 filter_early_ea_lb; 12224 return unless $_; 12225 12226 my @fields = split /\s*;\s*/; 12227 $fields[1] = property_ref('_Perl_LB')->table($fields[1])->full_name; 12228 $fields[1] = 'Inseparable' if lc $fields[1] eq 'inseperable'; 12229 $_ = join '; ', @fields; 12230} 12231 12232sub filter_old_style_arabic_shaping { 12233 # Early versions used a different term for the later one. 12234 12235 my @fields = split /\s*;\s*/; 12236 $fields[3] =~ s/<no shaping>/No_Joining_Group/; 12237 $fields[3] =~ s/\s+/_/g; # Change spaces to underscores 12238 $_ = join ';', @fields; 12239 return; 12240} 12241 12242{ # Closure 12243 my $lc; # Table for lowercase mapping 12244 my $tc; 12245 my $uc; 12246 my %special_casing_code_points; 12247 12248 sub setup_special_casing($file) { 12249 # SpecialCasing.txt contains the non-simple case change mappings. The 12250 # simple ones are in UnicodeData.txt, which should already have been 12251 # read in to the full property data structures, so as to initialize 12252 # these with the simple ones. Then the SpecialCasing.txt entries 12253 # add or overwrite the ones which have different full mappings. 12254 12255 # This routine sees if the simple mappings are to be output, and if 12256 # so, copies what has already been put into the full mapping tables, 12257 # while they still contain only the simple mappings. 12258 12259 # The reason it is done this way is that the simple mappings are 12260 # probably not going to be output, so it saves work to initialize the 12261 # full tables with the simple mappings, and then overwrite those 12262 # relatively few entries in them that have different full mappings, 12263 # and thus skip the simple mapping tables altogether. 12264 12265 $lc = property_ref('lc'); 12266 $tc = property_ref('tc'); 12267 $uc = property_ref('uc'); 12268 12269 # For each of the case change mappings... 12270 foreach my $full_casing_table ($lc, $tc, $uc) { 12271 my $full_casing_name = $full_casing_table->name; 12272 my $full_casing_full_name = $full_casing_table->full_name; 12273 unless (defined $full_casing_table 12274 && ! $full_casing_table->is_empty) 12275 { 12276 Carp::my_carp_bug("Need to process UnicodeData before SpecialCasing. Only special casing will be generated."); 12277 } 12278 12279 # Create a table in the old-style format and with the original 12280 # file name for backwards compatibility with applications that 12281 # read it directly. The new tables contain both the simple and 12282 # full maps, and the old are missing simple maps when there is a 12283 # conflicting full one. Probably it would have been ok to add 12284 # those to the legacy version, as was already done in 5.14 to the 12285 # case folding one, but this was not done, out of an abundance of 12286 # caution. The tables are set up here before we deal with the 12287 # full maps so that as we handle those, we can override the simple 12288 # maps for them in the legacy table, and merely add them in the 12289 # new-style one. 12290 my $legacy = Property->new("Legacy_" . $full_casing_full_name, 12291 File => $full_casing_full_name 12292 =~ s/case_Mapping//r, 12293 Format => $HEX_FORMAT, 12294 Default_Map => $CODE_POINT, 12295 Initialize => $full_casing_table, 12296 Replacement_Property => $full_casing_full_name, 12297 ); 12298 12299 $full_casing_table->add_comment(join_lines( <<END 12300This file includes both the simple and full case changing maps. The simple 12301ones are in the main body of the table below, and the full ones adding to or 12302overriding them are in the hash. 12303END 12304 )); 12305 12306 # The simple version's name in each mapping merely has an 's' in 12307 # front of the full one's 12308 my $simple_name = 's' . $full_casing_name; 12309 my $simple = property_ref($simple_name); 12310 $simple->initialize($full_casing_table) if $simple->to_output_map(); 12311 } 12312 12313 return; 12314 } 12315 12316 sub filter_2_1_8_special_casing_line { 12317 12318 # This version had duplicate entries in this file. Delete all but the 12319 # first one 12320 my @fields = split /\s*;\s*/, $_, -1; # -1 => retain trailing null 12321 # fields 12322 if (exists $special_casing_code_points{$fields[0]}) { 12323 $_ = ""; 12324 return; 12325 } 12326 12327 $special_casing_code_points{$fields[0]} = 1; 12328 filter_special_casing_line(@_); 12329 } 12330 12331 sub filter_special_casing_line($file) { 12332 # Change the format of $_ from SpecialCasing.txt into something that 12333 # the generic handler understands. Each input line contains three 12334 # case mappings. This will generate three lines to pass to the 12335 # generic handler for each of those. 12336 12337 # The input syntax (after stripping comments and trailing white space 12338 # is like one of the following (with the final two being entries that 12339 # we ignore): 12340 # 00DF; 00DF; 0053 0073; 0053 0053; # LATIN SMALL LETTER SHARP S 12341 # 03A3; 03C2; 03A3; 03A3; Final_Sigma; 12342 # 0307; ; 0307; 0307; tr After_I; # COMBINING DOT ABOVE 12343 # Note the trailing semi-colon, unlike many of the input files. That 12344 # means that there will be an extra null field generated by the split 12345 12346 my @fields = split /\s*;\s*/, $_, -1; # -1 => retain trailing null 12347 # fields 12348 12349 # field #4 is when this mapping is conditional. If any of these get 12350 # implemented, it would be by hard-coding in the casing functions in 12351 # the Perl core, not through tables. But if there is a new condition 12352 # we don't know about, output a warning. We know about all the 12353 # conditions through 6.0 12354 if ($fields[4] ne "") { 12355 my @conditions = split ' ', $fields[4]; 12356 if ($conditions[0] ne 'tr' # We know that these languages have 12357 # conditions, and some are multiple 12358 && $conditions[0] ne 'az' 12359 && $conditions[0] ne 'lt' 12360 12361 # And, we know about a single condition Final_Sigma, but 12362 # nothing else. 12363 && ($v_version gt v5.2.0 12364 && (@conditions > 1 || $conditions[0] ne 'Final_Sigma'))) 12365 { 12366 $file->carp_bad_line("Unknown condition '$fields[4]'. You should inspect it and either add code to handle it, or add to list of those that are to ignore"); 12367 } 12368 elsif ($conditions[0] ne 'Final_Sigma') { 12369 12370 # Don't print out a message for Final_Sigma, because we 12371 # have hard-coded handling for it. (But the standard 12372 # could change what the rule should be, but it wouldn't 12373 # show up here anyway. 12374 12375 print "# SKIPPING Special Casing: $_\n" 12376 if $verbosity >= $VERBOSE; 12377 } 12378 $_ = ""; 12379 return; 12380 } 12381 elsif (@fields > 6 || (@fields == 6 && $fields[5] ne "" )) { 12382 $file->carp_bad_line('Extra fields'); 12383 $_ = ""; 12384 return; 12385 } 12386 12387 my $decimal_code_point = hex $fields[0]; 12388 12389 # Loop to handle each of the three mappings in the input line, in 12390 # order, with $i indicating the current field number. 12391 my $i = 0; 12392 for my $object ($lc, $tc, $uc) { 12393 $i++; # First time through, $i = 0 ... 3rd time = 3 12394 12395 my $value = $object->value_of($decimal_code_point); 12396 $value = ($value eq $CODE_POINT) 12397 ? $decimal_code_point 12398 : hex $value; 12399 12400 # If this isn't a multi-character mapping, it should already have 12401 # been read in. 12402 if ($fields[$i] !~ / /) { 12403 if ($value != hex $fields[$i]) { 12404 Carp::my_carp("Bad news. UnicodeData.txt thinks " 12405 . $object->name 12406 . "(0x$fields[0]) is $value" 12407 . " and SpecialCasing.txt thinks it is " 12408 . hex($fields[$i]) 12409 . ". Good luck. Retaining UnicodeData value, and proceeding anyway."); 12410 } 12411 } 12412 else { 12413 12414 # The mapping goes into both the legacy table, in which it 12415 # replaces the simple one... 12416 $file->insert_adjusted_lines("$fields[0]; Legacy_" 12417 . $object->full_name 12418 . "; $fields[$i]"); 12419 12420 # ... and the regular table, in which it is additional, 12421 # beyond the simple mapping. 12422 $file->insert_adjusted_lines("$fields[0]; " 12423 . $object->name 12424 . "; " 12425 . $CMD_DELIM 12426 . "$REPLACE_CMD=$MULTIPLE_BEFORE" 12427 . $CMD_DELIM 12428 . $fields[$i]); 12429 } 12430 } 12431 12432 # Everything has been handled by the insert_adjusted_lines() 12433 $_ = ""; 12434 12435 return; 12436 } 12437} 12438 12439sub filter_old_style_case_folding($file) { 12440 # This transforms $_ containing the case folding style of 3.0.1, to 3.1 12441 # and later style. Different letters were used in the earlier. 12442 12443 my @fields = split /\s*;\s*/; 12444 12445 if ($fields[1] eq 'L') { 12446 $fields[1] = 'C'; # L => C always 12447 } 12448 elsif ($fields[1] eq 'E') { 12449 if ($fields[2] =~ / /) { # E => C if one code point; F otherwise 12450 $fields[1] = 'F' 12451 } 12452 else { 12453 $fields[1] = 'C' 12454 } 12455 } 12456 else { 12457 $file->carp_bad_line("Expecting L or E in second field"); 12458 $_ = ""; 12459 return; 12460 } 12461 $_ = join("; ", @fields) . ';'; 12462 return; 12463} 12464 12465{ # Closure for case folding 12466 12467 # Create the map for simple only if are going to output it, for otherwise 12468 # it takes no part in anything we do. 12469 my $to_output_simple; 12470 12471 sub setup_case_folding { 12472 # Read in the case foldings in CaseFolding.txt. This handles both 12473 # simple and full case folding. 12474 12475 $to_output_simple 12476 = property_ref('Simple_Case_Folding')->to_output_map; 12477 12478 if (! $to_output_simple) { 12479 property_ref('Case_Folding')->set_proxy_for('Simple_Case_Folding'); 12480 } 12481 12482 # If we ever wanted to show that these tables were combined, a new 12483 # property method could be created, like set_combined_props() 12484 property_ref('Case_Folding')->add_comment(join_lines( <<END 12485This file includes both the simple and full case folding maps. The simple 12486ones are in the main body of the table below, and the full ones adding to or 12487overriding them are in the hash. 12488END 12489 )); 12490 return; 12491 } 12492 12493 sub filter_case_folding_line($file) { 12494 # Called for each line in CaseFolding.txt 12495 # Input lines look like: 12496 # 0041; C; 0061; # LATIN CAPITAL LETTER A 12497 # 00DF; F; 0073 0073; # LATIN SMALL LETTER SHARP S 12498 # 1E9E; S; 00DF; # LATIN CAPITAL LETTER SHARP S 12499 # 12500 # 'C' means that folding is the same for both simple and full 12501 # 'F' that it is only for full folding 12502 # 'S' that it is only for simple folding 12503 # 'T' is locale-dependent, and ignored 12504 # 'I' is a type of 'F' used in some early releases. 12505 # Note the trailing semi-colon, unlike many of the input files. That 12506 # means that there will be an extra null field generated by the split 12507 # below, which we ignore and hence is not an error. 12508 12509 my ($range, $type, $map, @remainder) = split /\s*;\s*/, $_, -1; 12510 if (@remainder > 1 || (@remainder == 1 && $remainder[0] ne "" )) { 12511 $file->carp_bad_line('Extra fields'); 12512 $_ = ""; 12513 return; 12514 } 12515 12516 if ($type =~ / ^ [IT] $/x) { # Skip Turkic case folding, is locale dependent 12517 $_ = ""; 12518 return; 12519 } 12520 12521 # C: complete, F: full, or I: dotted uppercase I -> dotless lowercase 12522 # I are all full foldings; S is single-char. For S, there is always 12523 # an F entry, so we must allow multiple values for the same code 12524 # point. Fortunately this table doesn't need further manipulation 12525 # which would preclude using multiple-values. The S is now included 12526 # so that _swash_inversion_hash() is able to construct closures 12527 # without having to worry about F mappings. 12528 if ($type eq 'C' || $type eq 'F' || $type eq 'I' || $type eq 'S') { 12529 $_ = "$range; Case_Folding; " 12530 . "$CMD_DELIM$REPLACE_CMD=$MULTIPLE_BEFORE$CMD_DELIM$map"; 12531 } 12532 else { 12533 $_ = ""; 12534 $file->carp_bad_line('Expecting C F I S or T in second field'); 12535 } 12536 12537 # C and S are simple foldings, but simple case folding is not needed 12538 # unless we explicitly want its map table output. 12539 if ($to_output_simple && $type eq 'C' || $type eq 'S') { 12540 $file->insert_adjusted_lines("$range; Simple_Case_Folding; $map"); 12541 } 12542 12543 return; 12544 } 12545 12546} # End case fold closure 12547 12548sub filter_jamo_line { 12549 # Filter Jamo.txt lines. This routine mainly is used to populate hashes 12550 # from this file that is used in generating the Name property for Jamo 12551 # code points. But, it also is used to convert early versions' syntax 12552 # into the modern form. Here are two examples: 12553 # 1100; G # HANGUL CHOSEONG KIYEOK # Modern syntax 12554 # U+1100; G; HANGUL CHOSEONG KIYEOK # 2.0 syntax 12555 # 12556 # The input is $_, the output is $_ filtered. 12557 12558 my @fields = split /\s*;\s*/, $_, -1; # -1 => retain trailing null fields 12559 12560 # Let the caller handle unexpected input. In earlier versions, there was 12561 # a third field which is supposed to be a comment, but did not have a '#' 12562 # before it. 12563 return if @fields > (($v_version gt v3.0.0) ? 2 : 3); 12564 12565 $fields[0] =~ s/^U\+//; # Also, early versions had this extraneous 12566 # beginning. 12567 12568 # Some 2.1 versions had this wrong. Causes havoc with the algorithm. 12569 $fields[1] = 'R' if $fields[0] eq '1105'; 12570 12571 # Add to structure so can generate Names from it. 12572 my $cp = hex $fields[0]; 12573 my $short_name = $fields[1]; 12574 $Jamo{$cp} = $short_name; 12575 if ($cp <= $LBase + $LCount) { 12576 $Jamo_L{$short_name} = $cp - $LBase; 12577 } 12578 elsif ($cp <= $VBase + $VCount) { 12579 $Jamo_V{$short_name} = $cp - $VBase; 12580 } 12581 elsif ($cp <= $TBase + $TCount) { 12582 $Jamo_T{$short_name} = $cp - $TBase; 12583 } 12584 else { 12585 Carp::my_carp_bug("Unexpected Jamo code point in $_"); 12586 } 12587 12588 12589 # Reassemble using just the first two fields to look like a typical 12590 # property file line 12591 $_ = "$fields[0]; $fields[1]"; 12592 12593 return; 12594} 12595 12596sub register_fraction($rational) { 12597 # This registers the input rational number so that it can be passed on to 12598 # Unicode::UCD, both in rational and floating forms. 12599 12600 my $floating = eval $rational; 12601 12602 my @floats = sprintf "%.*e", $E_FLOAT_PRECISION, $floating; 12603 12604 # See if the denominator is a power of 2. 12605 $rational =~ m!.*/(.*)!; 12606 my $denominator = $1; 12607 if (defined $denominator && (($denominator & ($denominator - 1)) == 0)) { 12608 12609 # Here the denominator is a power of 2. This means it has an exact 12610 # representation in binary, so rounding could go either way. It turns 12611 # out that Windows doesn't necessarily round towards even, so output 12612 # an extra entry. This happens when the final digit we output is even 12613 # and the next digits would be 50* to the precision of the machine. 12614 my $extra_digit_float = sprintf "%e", $floating; 12615 my $q = $E_FLOAT_PRECISION - 1; 12616 if ($extra_digit_float =~ / ( .* \. \d{$q} ) 12617 ( [02468] ) 5 0* ( e .*) 12618 /ix) 12619 { 12620 push @floats, $1 . ($2 + 1) . $3; 12621 } 12622 } 12623 12624 foreach my $float (@floats) { 12625 # Strip off any leading zeros beyond 2 digits to make it C99 12626 # compliant. (Windows has 3 digit exponents, contrary to C99) 12627 $float =~ s/ ( .* e [-+] ) 0* ( \d{2,}? ) /$1$2/x; 12628 12629 if ( defined $nv_floating_to_rational{$float} 12630 && $nv_floating_to_rational{$float} ne $rational) 12631 { 12632 die Carp::my_carp_bug("Both '$rational' and" 12633 . " '$nv_floating_to_rational{$float}' evaluate to" 12634 . " the same floating point number." 12635 . " \$E_FLOAT_PRECISION must be increased"); 12636 } 12637 $nv_floating_to_rational{$float} = $rational; 12638 } 12639 return; 12640} 12641 12642sub gcd($a, $b) { # Greatest-common-divisor; from 12643 # http://en.wikipedia.org/wiki/Euclidean_algorithm 12644 use integer; 12645 12646 while ($b != 0) { 12647 my $temp = $b; 12648 $b = $a % $b; 12649 $a = $temp; 12650 } 12651 return $a; 12652} 12653 12654sub reduce_fraction($fraction_ref) { 12655 # Reduce a fraction to lowest terms. The Unicode data may be reducible, 12656 # hence this is needed. The argument is a reference to the 12657 # string denoting the fraction, which must be of the form: 12658 if ($$fraction_ref !~ / ^ (-?) (\d+) \/ (\d+) $ /ax) { 12659 Carp::my_carp_bug("Non-fraction input '$$fraction_ref'. Unchanged"); 12660 return; 12661 } 12662 12663 my $sign = $1; 12664 my $numerator = $2; 12665 my $denominator = $3; 12666 12667 use integer; 12668 12669 # Find greatest common divisor 12670 my $gcd = gcd($numerator, $denominator); 12671 12672 # And reduce using the gcd. 12673 if ($gcd != 1) { 12674 $numerator /= $gcd; 12675 $denominator /= $gcd; 12676 $$fraction_ref = "$sign$numerator/$denominator"; 12677 } 12678 12679 return; 12680} 12681 12682sub filter_numeric_value_line($file) { 12683 # DNumValues contains lines of a different syntax than the typical 12684 # property file: 12685 # 0F33 ; -0.5 ; ; -1/2 # No TIBETAN DIGIT HALF ZERO 12686 # 12687 # This routine transforms $_ containing the anomalous syntax to the 12688 # typical, by filtering out the extra columns, and convert early version 12689 # decimal numbers to strings that look like rational numbers. 12690 12691 # Starting in 5.1, there is a rational field. Just use that, omitting the 12692 # extra columns. Otherwise convert the decimal number in the second field 12693 # to a rational, and omit extraneous columns. 12694 my @fields = split /\s*;\s*/, $_, -1; 12695 my $rational; 12696 12697 if ($v_version ge v5.1.0) { 12698 if (@fields != 4) { 12699 $file->carp_bad_line('Not 4 semi-colon separated fields'); 12700 $_ = ""; 12701 return; 12702 } 12703 reduce_fraction(\$fields[3]) if $fields[3] =~ qr{/}; 12704 $rational = $fields[3]; 12705 12706 $_ = join '; ', @fields[ 0, 3 ]; 12707 } 12708 else { 12709 12710 # Here, is an older Unicode file, which has decimal numbers instead of 12711 # rationals in it. Use the fraction to calculate the denominator and 12712 # convert to rational. 12713 12714 if (@fields != 2 && @fields != 3) { 12715 $file->carp_bad_line('Not 2 or 3 semi-colon separated fields'); 12716 $_ = ""; 12717 return; 12718 } 12719 12720 my $codepoints = $fields[0]; 12721 my $decimal = $fields[1]; 12722 if ($decimal =~ s/\.0+$//) { 12723 12724 # Anything ending with a decimal followed by nothing but 0's is an 12725 # integer 12726 $_ = "$codepoints; $decimal"; 12727 $rational = $decimal; 12728 } 12729 else { 12730 12731 my $denominator; 12732 if ($decimal =~ /\.50*$/) { 12733 $denominator = 2; 12734 } 12735 12736 # Here have the hardcoded repeating decimals in the fraction, and 12737 # the denominator they imply. There were only a few denominators 12738 # in the older Unicode versions of this file which this code 12739 # handles, so it is easy to convert them. 12740 12741 # The 4 is because of a round-off error in the Unicode 3.2 files 12742 elsif ($decimal =~ /\.33*[34]$/ || $decimal =~ /\.6+7$/) { 12743 $denominator = 3; 12744 } 12745 elsif ($decimal =~ /\.[27]50*$/) { 12746 $denominator = 4; 12747 } 12748 elsif ($decimal =~ /\.[2468]0*$/) { 12749 $denominator = 5; 12750 } 12751 elsif ($decimal =~ /\.16+7$/ || $decimal =~ /\.83+$/) { 12752 $denominator = 6; 12753 } 12754 elsif ($decimal =~ /\.(12|37|62|87)50*$/) { 12755 $denominator = 8; 12756 } 12757 if ($denominator) { 12758 my $sign = ($decimal < 0) ? "-" : ""; 12759 my $numerator = int((abs($decimal) * $denominator) + .5); 12760 $rational = "$sign$numerator/$denominator"; 12761 $_ = "$codepoints; $rational"; 12762 } 12763 else { 12764 $file->carp_bad_line("Can't cope with number '$decimal'."); 12765 $_ = ""; 12766 return; 12767 } 12768 } 12769 } 12770 12771 register_fraction($rational) if $rational =~ qr{/}; 12772 return; 12773} 12774 12775{ # Closure 12776 my %unihan_properties; 12777 12778 sub construct_unihan($file_object) { 12779 12780 return unless file_exists($file_object->file); 12781 12782 if ($v_version lt v4.0.0) { 12783 push @cjk_properties, 'URS ; Unicode_Radical_Stroke'; 12784 push @cjk_property_values, split "\n", <<'END'; 12785# @missing: 0000..10FFFF; Unicode_Radical_Stroke; <none> 12786END 12787 } 12788 12789 if ($v_version ge v3.0.0) { 12790 push @cjk_properties, split "\n", <<'END'; 12791cjkIRG_GSource; kIRG_GSource 12792cjkIRG_JSource; kIRG_JSource 12793cjkIRG_KSource; kIRG_KSource 12794cjkIRG_TSource; kIRG_TSource 12795cjkIRG_VSource; kIRG_VSource 12796END 12797 push @cjk_property_values, split "\n", <<'END'; 12798# @missing: 0000..10FFFF; cjkIRG_GSource; <none> 12799# @missing: 0000..10FFFF; cjkIRG_JSource; <none> 12800# @missing: 0000..10FFFF; cjkIRG_KSource; <none> 12801# @missing: 0000..10FFFF; cjkIRG_TSource; <none> 12802# @missing: 0000..10FFFF; cjkIRG_VSource; <none> 12803END 12804 } 12805 if ($v_version ge v3.1.0) { 12806 push @cjk_properties, 'cjkIRG_HSource; kIRG_HSource'; 12807 push @cjk_property_values, '# @missing: 0000..10FFFF; cjkIRG_HSource; <none>'; 12808 } 12809 if ($v_version ge v3.1.1) { 12810 push @cjk_properties, 'cjkIRG_KPSource; kIRG_KPSource'; 12811 push @cjk_property_values, '# @missing: 0000..10FFFF; cjkIRG_KPSource; <none>'; 12812 } 12813 if ($v_version ge v3.2.0) { 12814 push @cjk_properties, split "\n", <<'END'; 12815cjkAccountingNumeric; kAccountingNumeric 12816cjkCompatibilityVariant; kCompatibilityVariant 12817cjkOtherNumeric; kOtherNumeric 12818cjkPrimaryNumeric; kPrimaryNumeric 12819END 12820 push @cjk_property_values, split "\n", <<'END'; 12821# @missing: 0000..10FFFF; cjkAccountingNumeric; NaN 12822# @missing: 0000..10FFFF; cjkCompatibilityVariant; <code point> 12823# @missing: 0000..10FFFF; cjkOtherNumeric; NaN 12824# @missing: 0000..10FFFF; cjkPrimaryNumeric; NaN 12825END 12826 } 12827 if ($v_version gt v4.0.0) { 12828 push @cjk_properties, 'cjkIRG_USource; kIRG_USource'; 12829 push @cjk_property_values, '# @missing: 0000..10FFFF; cjkIRG_USource; <none>'; 12830 } 12831 12832 if ($v_version ge v4.1.0) { 12833 push @cjk_properties, 'cjkIICore ; kIICore'; 12834 push @cjk_property_values, '# @missing: 0000..10FFFF; cjkIICore; <none>'; 12835 } 12836 } 12837 12838 sub setup_unihan { 12839 # Do any special setup for Unihan properties. 12840 12841 # This property gives the wrong computed type, so override. 12842 my $usource = property_ref('kIRG_USource'); 12843 $usource->set_type($STRING) if defined $usource; 12844 12845 # This property is to be considered binary (it says so in 12846 # http://www.unicode.org/reports/tr38/) 12847 my $iicore = property_ref('kIICore'); 12848 if (defined $iicore) { 12849 $iicore->set_type($FORCED_BINARY); 12850 $iicore->table("Y")->add_note("Matches any code point which has a non-null value for this property; see unicode.org UAX #38."); 12851 12852 # Unicode doesn't include the maps for this property, so don't 12853 # warn that they are missing. 12854 $iicore->set_pre_declared_maps(0); 12855 $iicore->add_comment(join_lines( <<END 12856This property contains string values, but any non-empty ones are considered to 12857be 'core', so Perl creates tables for both: 1) its string values, plus 2) 12858tables so that \\p{kIICore} matches any code point which has a non-empty 12859value for this property. 12860END 12861 )); 12862 } 12863 12864 return; 12865 } 12866 12867 sub filter_unihan_line { 12868 # Change unihan db lines to look like the others in the db. Here is 12869 # an input sample: 12870 # U+341C kCangjie IEKN 12871 12872 # Tabs are used instead of semi-colons to separate fields; therefore 12873 # they may have semi-colons embedded in them. Change these to periods 12874 # so won't screw up the rest of the code. 12875 s/;/./g; 12876 12877 # Remove lines that don't look like ones we accept. 12878 if ($_ !~ /^ [^\t]* \t ( [^\t]* ) /x) { 12879 $_ = ""; 12880 return; 12881 } 12882 12883 # Extract the property, and save a reference to its object. 12884 my $property = $1; 12885 if (! exists $unihan_properties{$property}) { 12886 $unihan_properties{$property} = property_ref($property); 12887 } 12888 12889 # Don't do anything unless the property is one we're handling, which 12890 # we determine by seeing if there is an object defined for it or not 12891 if (! defined $unihan_properties{$property}) { 12892 $_ = ""; 12893 return; 12894 } 12895 12896 # Convert the tab separators to our standard semi-colons, and convert 12897 # the U+HHHH notation to the rest of the standard's HHHH 12898 s/\t/;/g; 12899 s/\b U \+ (?= $code_point_re )//xg; 12900 12901 #local $to_trace = 1 if main::DEBUG; 12902 trace $_ if main::DEBUG && $to_trace; 12903 12904 return; 12905 } 12906} 12907 12908sub filter_blocks_lines($file) { 12909 # In the Blocks.txt file, the names of the blocks don't quite match the 12910 # names given in PropertyValueAliases.txt, so this changes them so they 12911 # do match: Blanks and hyphens are changed into underscores. Also makes 12912 # early release versions look like later ones 12913 # 12914 # $_ is transformed to the correct value. 12915 12916 if ($v_version lt v3.2.0) { 12917 if (/FEFF.*Specials/) { # Bug in old versions: line wrongly inserted 12918 $_ = ""; 12919 return; 12920 } 12921 12922 # Old versions used a different syntax to mark the range. 12923 $_ =~ s/;\s+/../ if $v_version lt v3.1.0; 12924 } 12925 12926 my @fields = split /\s*;\s*/, $_, -1; 12927 if (@fields != 2) { 12928 $file->carp_bad_line("Expecting exactly two fields"); 12929 $_ = ""; 12930 return; 12931 } 12932 12933 # Change hyphens and blanks in the block name field only 12934 $fields[1] =~ s/[ -]/_/g; 12935 $fields[1] =~ s/_ ( [a-z] ) /_\u$1/xg; # Capitalize first letter of word 12936 12937 $_ = join("; ", @fields); 12938 return; 12939} 12940 12941{ # Closure 12942 my $current_property; 12943 12944 sub filter_old_style_proplist { 12945 # PropList.txt has been in Unicode since version 2.0. Until 3.1, it 12946 # was in a completely different syntax. Ken Whistler of Unicode says 12947 # that it was something he used as an aid for his own purposes, but 12948 # was never an official part of the standard. Many of the properties 12949 # in it were incorporated into the later PropList.txt, but some were 12950 # not. This program uses this early file to generate property tables 12951 # that are otherwise not accessible in the early UCD's. It does this 12952 # for the ones that eventually became official, and don't appear to be 12953 # too different in their contents from the later official version, and 12954 # throws away the rest. It could be argued that the ones it generates 12955 # were probably not really official at that time, so should be 12956 # ignored. You can easily modify things to skip all of them by 12957 # changing this function to just set $_ to "", and return; and to skip 12958 # certain of them by simply removing their declarations from 12959 # get_old_property_aliases(). 12960 # 12961 # Here is a list of all the ones that are thrown away: 12962 # Alphabetic The definitions for this are very 12963 # defective, so better to not mislead 12964 # people into thinking it works. 12965 # Instead the Perl extension of the 12966 # same name is constructed from first 12967 # principles. 12968 # Bidi=* duplicates UnicodeData.txt 12969 # Combining never made into official property; 12970 # is \P{ccc=0} 12971 # Composite never made into official property. 12972 # Currency Symbol duplicates UnicodeData.txt: gc=sc 12973 # Decimal Digit duplicates UnicodeData.txt: gc=nd 12974 # Delimiter never made into official property; 12975 # removed in 3.0.1 12976 # Format Control never made into official property; 12977 # similar to gc=cf 12978 # High Surrogate duplicates Blocks.txt 12979 # Ignorable Control never made into official property; 12980 # similar to di=y 12981 # ISO Control duplicates UnicodeData.txt: gc=cc 12982 # Left of Pair never made into official property; 12983 # Line Separator duplicates UnicodeData.txt: gc=zl 12984 # Low Surrogate duplicates Blocks.txt 12985 # Non-break was actually listed as a property 12986 # in 3.2, but without any code 12987 # points. Unicode denies that this 12988 # was ever an official property 12989 # Non-spacing duplicate UnicodeData.txt: gc=mn 12990 # Numeric duplicates UnicodeData.txt: gc=cc 12991 # Paired Punctuation never made into official property; 12992 # appears to be gc=ps + gc=pe 12993 # Paragraph Separator duplicates UnicodeData.txt: gc=cc 12994 # Private Use duplicates UnicodeData.txt: gc=co 12995 # Private Use High Surrogate duplicates Blocks.txt 12996 # Punctuation duplicates UnicodeData.txt: gc=p 12997 # Space different definition than eventual 12998 # one. 12999 # Titlecase duplicates UnicodeData.txt: gc=lt 13000 # Unassigned Code Value duplicates UnicodeData.txt: gc=cn 13001 # Zero-width never made into official property; 13002 # subset of gc=cf 13003 # Most of the properties have the same names in this file as in later 13004 # versions, but a couple do not. 13005 # 13006 # This subroutine filters $_, converting it from the old style into 13007 # the new style. Here's a sample of the old-style 13008 # 13009 # ******************************************* 13010 # 13011 # Property dump for: 0x100000A0 (Join Control) 13012 # 13013 # 200C..200D (2 chars) 13014 # 13015 # In the example, the property is "Join Control". It is kept in this 13016 # closure between calls to the subroutine. The numbers beginning with 13017 # 0x were internal to Ken's program that generated this file. 13018 13019 # If this line contains the property name, extract it. 13020 if (/^Property dump for: [^(]*\((.*)\)/) { 13021 $_ = $1; 13022 13023 # Convert white space to underscores. 13024 s/ /_/g; 13025 13026 # Convert the few properties that don't have the same name as 13027 # their modern counterparts 13028 s/Identifier_Part/ID_Continue/ 13029 or s/Not_a_Character/NChar/; 13030 13031 # If the name matches an existing property, use it. 13032 if (defined property_ref($_)) { 13033 trace "new property=", $_ if main::DEBUG && $to_trace; 13034 $current_property = $_; 13035 } 13036 else { # Otherwise discard it 13037 trace "rejected property=", $_ if main::DEBUG && $to_trace; 13038 undef $current_property; 13039 } 13040 $_ = ""; # The property is saved for the next lines of the 13041 # file, but this defining line is of no further use, 13042 # so clear it so that the caller won't process it 13043 # further. 13044 } 13045 elsif (! defined $current_property || $_ !~ /^$code_point_re/) { 13046 13047 # Here, the input line isn't a header defining a property for the 13048 # following section, and either we aren't in such a section, or 13049 # the line doesn't look like one that defines the code points in 13050 # such a section. Ignore this line. 13051 $_ = ""; 13052 } 13053 else { 13054 13055 # Here, we have a line defining the code points for the current 13056 # stashed property. Anything starting with the first blank is 13057 # extraneous. Otherwise, it should look like a normal range to 13058 # the caller. Append the property name so that it looks just like 13059 # a modern PropList entry. 13060 13061 $_ =~ s/\s.*//; 13062 $_ .= "; $current_property"; 13063 } 13064 trace $_ if main::DEBUG && $to_trace; 13065 return; 13066 } 13067} # End closure for old style proplist 13068 13069sub filter_old_style_normalization_lines { 13070 # For early releases of Unicode, the lines were like: 13071 # 74..2A76 ; NFKD_NO 13072 # For later releases this became: 13073 # 74..2A76 ; NFKD_QC; N 13074 # Filter $_ to look like those in later releases. 13075 # Similarly for MAYBEs 13076 13077 s/ _NO \b /_QC; N/x || s/ _MAYBE \b /_QC; M/x; 13078 13079 # Also, the property FC_NFKC was abbreviated to FNC 13080 s/FNC/FC_NFKC/; 13081 return; 13082} 13083 13084sub setup_script_extensions { 13085 # The Script_Extensions property starts out with a clone of the Script 13086 # property. 13087 13088 $scx = property_ref("Script_Extensions"); 13089 return unless defined $scx; 13090 13091 $scx->_set_format($STRING_WHITE_SPACE_LIST); 13092 $scx->initialize($script); 13093 $scx->set_default_map($script->default_map); 13094 $scx->set_pre_declared_maps(0); # PropValueAliases doesn't list these 13095 $scx->add_comment(join_lines( <<END 13096The values for code points that appear in one script are just the same as for 13097the 'Script' property. Likewise the values for those that appear in many 13098scripts are either 'Common' or 'Inherited', same as with 'Script'. But the 13099values of code points that appear in a few scripts are a space separated list 13100of those scripts. 13101END 13102 )); 13103 13104 # Initialize scx's tables and the aliases for them to be the same as sc's 13105 foreach my $table ($script->tables) { 13106 my $scx_table = $scx->add_match_table($table->name, 13107 Full_Name => $table->full_name); 13108 foreach my $alias ($table->aliases) { 13109 $scx_table->add_alias($alias->name); 13110 } 13111 } 13112} 13113 13114sub filter_script_extensions_line { 13115 # The Scripts file comes with the full name for the scripts; the 13116 # ScriptExtensions, with the short name. The final mapping file is a 13117 # combination of these, and without adjustment, would have inconsistent 13118 # entries. This filters the latter file to convert to full names. 13119 # Entries look like this: 13120 # 064B..0655 ; Arab Syrc # Mn [11] ARABIC FATHATAN..ARABIC HAMZA BELOW 13121 13122 my @fields = split /\s*;\s*/; 13123 13124 # This script was erroneously omitted in this Unicode version. 13125 $fields[1] .= ' Takr' if $v_version eq v6.1.0 && $fields[0] =~ /^0964/; 13126 13127 my @full_names; 13128 foreach my $short_name (split " ", $fields[1]) { 13129 push @full_names, $script->table($short_name)->full_name; 13130 } 13131 $fields[1] = join " ", @full_names; 13132 $_ = join "; ", @fields; 13133 13134 return; 13135} 13136 13137sub setup_emojidata { 13138 my $prop_ref = Property->new('ExtPict', 13139 Full_Name => 'Extended_Pictographic', 13140 ); 13141 $prop_ref->set_fate($PLACEHOLDER, 13142 "Not part of the Unicode Character Database"); 13143} 13144 13145sub filter_emojidata_line { 13146 # We only are interested in this single property from this non-UCD data 13147 # file, and we turn it into a Perl property, so that it isn't accessible 13148 # to the users 13149 13150 $_ = "" unless /\bExtended_Pictographic\b/; 13151 13152 return; 13153} 13154 13155sub setup_IdStatus { 13156 my $ids = Property->new('Identifier_Status', 13157 Match_SubDir => 'IdStatus', 13158 Default_Map => 'Restricted', 13159 ); 13160 $ids->add_match_table('Allowed'); 13161} 13162 13163sub setup_IdType { 13164 $idt = Property->new('Identifier_Type', 13165 Match_SubDir => 'IdType', 13166 Default_Map => 'Not_Character', 13167 Format => $STRING_WHITE_SPACE_LIST, 13168 ); 13169} 13170 13171sub filter_IdType_line { 13172 13173 # Some code points have more than one type, separated by spaces on the 13174 # input. For now, we just add everything as a property value. Later when 13175 # we look for properties with format $STRING_WHITE_SPACE_LIST, we resolve 13176 # things 13177 13178 my @fields = split /\s*;\s*/; 13179 my $types = $fields[1]; 13180 $idt->add_match_table($types) unless defined $idt->table($types); 13181 13182 return; 13183} 13184 13185sub generate_hst($file) { 13186 13187 # Populates the Hangul Syllable Type property from first principles 13188 13189 # These few ranges are hard-coded in. 13190 $file->insert_lines(split /\n/, <<'END' 131911100..1159 ; L 13192115F ; L 131931160..11A2 ; V 1319411A8..11F9 ; T 13195END 13196); 13197 13198 # The Hangul syllables in version 1 are at different code points than 13199 # those that came along starting in version 2, and have different names; 13200 # they comprise about 60% of the code points of the later version. 13201 # From my (khw) research on them (see <558493EB.4000807@att.net>), the 13202 # initial set is a subset of the later version, with different English 13203 # transliterations. I did not see an easy mapping between them. The 13204 # later set includes essentially all possibilities, even ones that aren't 13205 # in modern use (if they ever were), and over 96% of the new ones are type 13206 # LVT. Mathematically, the early set must also contain a preponderance of 13207 # LVT values. In lieu of doing nothing, we just set them all to LVT, and 13208 # expect that this will be right most of the time, which is better than 13209 # not being right at all. 13210 if ($v_version lt v2.0.0) { 13211 my $property = property_ref($file->property); 13212 $file->insert_lines(sprintf("%04X..%04X; LVT\n", 13213 $FIRST_REMOVED_HANGUL_SYLLABLE, 13214 $FINAL_REMOVED_HANGUL_SYLLABLE)); 13215 push @tables_that_may_be_empty, $property->table('LV')->complete_name; 13216 return; 13217 } 13218 13219 # The algorithmically derived syllables are almost all LVT ones, so 13220 # initialize the whole range with that. 13221 $file->insert_lines(sprintf "%04X..%04X; LVT\n", 13222 $SBase, $SBase + $SCount -1); 13223 13224 # Those ones that aren't LVT are LV, and they occur at intervals of 13225 # $TCount code points, starting with the first code point, at $SBase. 13226 for (my $i = $SBase; $i < $SBase + $SCount; $i += $TCount) { 13227 $file->insert_lines(sprintf "%04X..%04X; LV\n", $i, $i); 13228 } 13229 13230 return; 13231} 13232 13233sub generate_GCB($file) { 13234 13235 # Populates the Grapheme Cluster Break property from first principles 13236 13237 # All these definitions are from 13238 # http://www.unicode.org/reports/tr29/tr29-3.html with confirmation 13239 # from http://www.unicode.org/reports/tr29/tr29-4.html 13240 13241 foreach my $range ($gc->ranges) { 13242 13243 # Extend includes gc=Me and gc=Mn, while Control includes gc=Cc 13244 # and gc=Cf 13245 if ($range->value =~ / ^ M [en] $ /x) { 13246 $file->insert_lines(sprintf "%04X..%04X; Extend", 13247 $range->start, $range->end); 13248 } 13249 elsif ($range->value =~ / ^ C [cf] $ /x) { 13250 $file->insert_lines(sprintf "%04X..%04X; Control", 13251 $range->start, $range->end); 13252 } 13253 } 13254 $file->insert_lines("2028; Control"); # Line Separator 13255 $file->insert_lines("2029; Control"); # Paragraph Separator 13256 13257 $file->insert_lines("000D; CR"); 13258 $file->insert_lines("000A; LF"); 13259 13260 # Also from http://www.unicode.org/reports/tr29/tr29-3.html. 13261 foreach my $code_point ( qw{ 13262 09BE 09D7 0B3E 0B57 0BBE 0BD7 0CC2 0CD5 0CD6 13263 0D3E 0D57 0DCF 0DDF FF9E FF9F 1D165 1D16E 1D16F 13264 } 13265 ) { 13266 my $category = $gc->value_of(hex $code_point); 13267 next if ! defined $category || $category eq 'Cn'; # But not if 13268 # unassigned in this 13269 # release 13270 $file->insert_lines("$code_point; Extend"); 13271 } 13272 13273 my $hst = property_ref('Hangul_Syllable_Type'); 13274 if ($hst->count > 0) { 13275 foreach my $range ($hst->ranges) { 13276 $file->insert_lines(sprintf "%04X..%04X; %s", 13277 $range->start, $range->end, $range->value); 13278 } 13279 } 13280 else { 13281 generate_hst($file); 13282 } 13283 13284 main::process_generic_property_file($file); 13285} 13286 13287 13288sub fixup_early_perl_name_alias($file) { 13289 13290 # Different versions of Unicode have varying support for the name synonyms 13291 # below. Just include everything. As of 6.1, all these are correct in 13292 # the Unicode-supplied file. 13293 13294 # ALERT did not come along until 6.0, at which point it became preferred 13295 # over BELL. By inserting it last in early releases, BELL is preferred 13296 # over it; and vice-vers in 6.0 13297 my $type_for_bell = ($v_version lt v6.0.0) 13298 ? 'correction' 13299 : 'alternate'; 13300 $file->insert_lines(split /\n/, <<END 133010007;BELL; $type_for_bell 13302000A;LINE FEED (LF);alternate 13303000C;FORM FEED (FF);alternate 13304000D;CARRIAGE RETURN (CR);alternate 133050085;NEXT LINE (NEL);alternate 13306END 13307 13308 ); 13309 13310 # One might think that the 'Unicode_1_Name' field, could work for most 13311 # of the above names, but sadly that field varies depending on the 13312 # release. Version 1.1.5 had no names for any of the controls; Version 13313 # 2.0 introduced names for the C0 controls, and 3.0 introduced C1 names. 13314 # 3.0.1 removed the name INDEX; and 3.2 changed some names: 13315 # changed to parenthesized versions like "NEXT LINE" to 13316 # "NEXT LINE (NEL)"; 13317 # changed PARTIAL LINE DOWN to PARTIAL LINE FORWARD 13318 # changed PARTIAL LINE UP to PARTIAL LINE BACKWARD;; 13319 # changed e.g. FILE SEPARATOR to INFORMATION SEPARATOR FOUR 13320 # 13321 # All these are present in the 6.1 NameAliases.txt 13322 13323 return; 13324} 13325 13326sub filter_later_version_name_alias_line { 13327 13328 # This file has an extra entry per line for the alias type. This is 13329 # handled by creating a compound entry: "$alias: $type"; First, split 13330 # the line into components. 13331 my ($range, $alias, $type, @remainder) 13332 = split /\s*;\s*/, $_, -1; # -1 => retain trailing null fields 13333 13334 # This file contains multiple entries for some components, so tell the 13335 # downstream code to allow this in our internal tables; the 13336 # $MULTIPLE_AFTER preserves the input ordering. 13337 $_ = join ";", $range, $CMD_DELIM 13338 . $REPLACE_CMD 13339 . '=' 13340 . $MULTIPLE_AFTER 13341 . $CMD_DELIM 13342 . "$alias: $type", 13343 @remainder; 13344 return; 13345} 13346 13347sub filter_early_version_name_alias_line { 13348 13349 # Early versions did not have the trailing alias type field; implicitly it 13350 # was 'correction'. 13351 $_ .= "; correction"; 13352 13353 filter_later_version_name_alias_line; 13354 return; 13355} 13356 13357sub filter_all_caps_script_names { 13358 13359 # Some early Unicode releases had the script names in all CAPS. This 13360 # converts them to just the first letter of each word being capital. 13361 13362 my ($range, $script, @remainder) 13363 = split /\s*;\s*/, $_, -1; # -1 => retain trailing null fields 13364 my @words = split /[_-]/, $script; 13365 for my $word (@words) { 13366 $word = 13367 ucfirst(lc($word)) if $word ne 'CJK'; 13368 } 13369 $script = join "_", @words; 13370 $_ = join ";", $range, $script, @remainder; 13371} 13372 13373sub finish_Unicode() { 13374 # This routine should be called after all the Unicode files have been read 13375 # in. It: 13376 # 1) Creates properties that are missing from the version of Unicode being 13377 # compiled, and which, for whatever reason, are needed for the Perl 13378 # core to function properly. These are minimally populated as 13379 # necessary. 13380 # 2) Adds the mappings for code points missing from the files which have 13381 # defaults specified for them. 13382 # 3) At this point all mappings are known, so it computes the type of 13383 # each property whose type hasn't been determined yet. 13384 # 4) Calculates all the regular expression match tables based on the 13385 # mappings. 13386 # 5) Calculates and adds the tables which are defined by Unicode, but 13387 # which aren't derived by them, and certain derived tables that Perl 13388 # uses. 13389 13390 # Folding information was introduced later into Unicode data. To get 13391 # Perl's case ignore (/i) to work at all in releases that don't have 13392 # folding, use the best available alternative, which is lower casing. 13393 my $fold = property_ref('Case_Folding'); 13394 if ($fold->is_empty) { 13395 $fold->initialize(property_ref('Lowercase_Mapping')); 13396 $fold->add_note(join_lines(<<END 13397WARNING: This table uses lower case as a substitute for missing fold 13398information 13399END 13400 )); 13401 } 13402 13403 # Multiple-character mapping was introduced later into Unicode data, so it 13404 # is by default the simple version. If to output the simple versions and 13405 # not present, just use the regular (which in these Unicode versions is 13406 # the simple as well). 13407 foreach my $map (qw { Uppercase_Mapping 13408 Lowercase_Mapping 13409 Titlecase_Mapping 13410 Case_Folding 13411 } ) 13412 { 13413 my $comment = <<END; 13414 13415Note that although the Perl core uses this file, it has the standard values 13416for code points from U+0000 to U+00FF compiled in, so changing this table will 13417not change the core's behavior with respect to these code points. Use 13418Unicode::Casing to override this table. 13419END 13420 if ($map eq 'Case_Folding') { 13421 $comment .= <<END; 13422(/i regex matching is not overridable except by using a custom regex engine) 13423END 13424 } 13425 property_ref($map)->add_comment(join_lines($comment)); 13426 my $simple = property_ref("Simple_$map"); 13427 next if ! $simple->is_empty; 13428 if ($simple->to_output_map) { 13429 $simple->initialize(property_ref($map)); 13430 } 13431 else { 13432 property_ref($map)->set_proxy_for($simple->name); 13433 } 13434 } 13435 13436 # For each property, fill in any missing mappings, and calculate the re 13437 # match tables. If a property has more than one missing mapping, the 13438 # default is a reference to a data structure, and may require data from 13439 # other properties to resolve. The sort is used to cause these to be 13440 # processed last, after all the other properties have been calculated. 13441 # (Fortunately, the missing properties so far don't depend on each other.) 13442 foreach my $property 13443 (sort { (defined $a->default_map && ref $a->default_map) ? 1 : -1 } 13444 property_ref('*')) 13445 { 13446 # $perl has been defined, but isn't one of the Unicode properties that 13447 # need to be finished up. 13448 next if $property == $perl; 13449 13450 # Nor do we need to do anything with properties that aren't going to 13451 # be output. 13452 next if $property->fate == $SUPPRESSED; 13453 13454 # Handle the properties that have more than one possible default 13455 if (ref $property->default_map) { 13456 my $default_map = $property->default_map; 13457 13458 # These properties have stored in the default_map: 13459 # One or more of: 13460 # 1) A default map which applies to all code points in a 13461 # certain class 13462 # 2) an expression which will evaluate to the list of code 13463 # points in that class 13464 # And 13465 # 3) the default map which applies to every other missing code 13466 # point. 13467 # 13468 # Go through each list. 13469 while (my ($default, $eval) = $default_map->get_next_defaults) { 13470 13471 # Get the class list, and intersect it with all the so-far 13472 # unspecified code points yielding all the code points 13473 # in the class that haven't been specified. 13474 my $list = eval $eval; 13475 if ($@) { 13476 Carp::my_carp("Can't set some defaults for missing code points for $property because eval '$eval' failed with '$@'"); 13477 last; 13478 } 13479 13480 # Narrow down the list to just those code points we don't have 13481 # maps for yet. 13482 $list = $list & $property->inverse_list; 13483 13484 # Add mappings to the property for each code point in the list 13485 foreach my $range ($list->ranges) { 13486 $property->add_map($range->start, $range->end, $default, 13487 Replace => $CROAK); 13488 } 13489 } 13490 13491 # All remaining code points have the other mapping. Set that up 13492 # so the normal single-default mapping code will work on them 13493 $property->set_default_map($default_map->other_default); 13494 13495 # And fall through to do that 13496 } 13497 13498 # We should have enough data now to compute the type of the property. 13499 my $property_name = $property->name; 13500 $property->compute_type; 13501 my $property_type = $property->type; 13502 13503 next if ! $property->to_create_match_tables; 13504 13505 # Here want to create match tables for this property 13506 13507 # The Unicode db always (so far, and they claim into the future) have 13508 # the default for missing entries in binary properties be 'N' (unless 13509 # there is a '@missing' line that specifies otherwise) 13510 if (! defined $property->default_map) { 13511 if ($property_type == $BINARY) { 13512 $property->set_default_map('N'); 13513 } 13514 elsif ($property_type == $ENUM) { 13515 Carp::my_carp("Property '$property_name doesn't have a default mapping. Using a fake one"); 13516 $property->set_default_map('XXX This makes sure there is a default map'); 13517 } 13518 } 13519 13520 # Add any remaining code points to the mapping, using the default for 13521 # missing code points. 13522 my $default_table; 13523 my $default_map = $property->default_map; 13524 if ($property_type == $FORCED_BINARY) { 13525 13526 # A forced binary property creates a 'Y' table that matches all 13527 # non-default values. The actual string values are also written out 13528 # as a map table. (The default value will almost certainly be the 13529 # empty string, so the pod glosses over the distinction, and just 13530 # talks about empty vs non-empty.) 13531 my $yes = $property->table("Y"); 13532 foreach my $range ($property->ranges) { 13533 next if $range->value eq $default_map; 13534 $yes->add_range($range->start, $range->end); 13535 } 13536 $property->table("N")->set_complement($yes); 13537 } 13538 else { 13539 if (defined $default_map) { 13540 13541 # Make sure there is a match table for the default 13542 if (! defined ($default_table = $property->table($default_map))) 13543 { 13544 $default_table = $property->add_match_table($default_map); 13545 } 13546 13547 # And, if the property is binary, the default table will just 13548 # be the complement of the other table. 13549 if ($property_type == $BINARY) { 13550 my $non_default_table; 13551 13552 # Find the non-default table. 13553 for my $table ($property->tables) { 13554 if ($table == $default_table) { 13555 if ($v_version le v5.0.0) { 13556 $table->add_alias($_) for qw(N No F False); 13557 } 13558 next; 13559 } elsif ($v_version le v5.0.0) { 13560 $table->add_alias($_) for qw(Y Yes T True); 13561 } 13562 $non_default_table = $table; 13563 } 13564 $default_table->set_complement($non_default_table); 13565 } 13566 else { 13567 13568 # This fills in any missing values with the default. It's 13569 # not necessary to do this with binary properties, as the 13570 # default is defined completely in terms of the Y table. 13571 $property->add_map(0, $MAX_WORKING_CODEPOINT, 13572 $default_map, Replace => $NO); 13573 } 13574 } 13575 13576 # Have all we need to populate the match tables. 13577 my $maps_should_be_defined = $property->pre_declared_maps; 13578 foreach my $range ($property->ranges) { 13579 my $map = $range->value; 13580 my $table = $property->table($map); 13581 if (! defined $table) { 13582 13583 # Integral and rational property values are not 13584 # necessarily defined in PropValueAliases, but whether all 13585 # the other ones should be depends on the property. 13586 if ($maps_should_be_defined 13587 && $map !~ /^ -? \d+ ( \/ \d+ )? $/x) 13588 { 13589 Carp::my_carp("Table '$property_name=$map' should " 13590 . "have been defined. Defining it now.") 13591 } 13592 $table = $property->add_match_table($map); 13593 } 13594 13595 next if $table->complement != 0; # Don't need to populate these 13596 $table->add_range($range->start, $range->end); 13597 } 13598 } 13599 13600 # For Perl 5.6 compatibility, all properties matchable in regexes can 13601 # have an optional 'Is_' prefix. This is now done in Unicode::UCD. 13602 # But warn if this creates a conflict with a (new) Unicode property 13603 # name, although it appears that Unicode has made a decision never to 13604 # begin a property name with 'Is_', so this shouldn't happen. 13605 foreach my $alias ($property->aliases) { 13606 my $Is_name = 'Is_' . $alias->name; 13607 if (defined (my $pre_existing = property_ref($Is_name))) { 13608 Carp::my_carp(<<END 13609There is already an alias named $Is_name (from " . $pre_existing . "), so 13610creating one for $property won't work. This is bad news. If it is not too 13611late, get Unicode to back off. Otherwise go back to the old scheme (findable 13612from the git blame log for this area of the code that suppressed individual 13613aliases that conflict with the new Unicode names. Proceeding anyway. 13614END 13615 ); 13616 } 13617 } # End of loop through aliases for this property 13618 13619 13620 # Properties that have sets of values for some characters are now 13621 # converted. For example, the Script_Extensions property started out 13622 # as a clone of the Script property. But processing its data file 13623 # caused some elements to be replaced with different data. (These 13624 # elements were for the Common and Inherited properties.) This data 13625 # is a qw() list of all the scripts that the code points in the given 13626 # range are in. An example line is: 13627 # 13628 # 060C ; Arab Syrc Thaa # Po ARABIC COMMA 13629 # 13630 # Code executed earlier has created a new match table named "Arab Syrc 13631 # Thaa" which contains 060C. (The cloned table started out with this 13632 # code point mapping to "Common".) Now we add 060C to each of the 13633 # Arab, Syrc, and Thaa match tables. Then we delete the now spurious 13634 # "Arab Syrc Thaa" match table. This is repeated for all these tables 13635 # and ranges. The map data is retained in the map table for 13636 # reference, but the spurious match tables are deleted. 13637 my $format = $property->format; 13638 if (defined $format && $format eq $STRING_WHITE_SPACE_LIST) { 13639 foreach my $table ($property->tables) { 13640 13641 # Space separates the entries which should go in multiple 13642 # tables 13643 next unless $table->name =~ /\s/; 13644 13645 # The list of the entries, hence the names of the tables that 13646 # everything in this combo table should be added to. 13647 my @list = split /\s+/, $table->name; 13648 13649 # Add the entries from the combo table to each individual 13650 # table 13651 foreach my $individual (@list) { 13652 my $existing_table = $property->table($individual); 13653 13654 # This should only be necessary if this particular entry 13655 # occurs only in combo with others. 13656 $existing_table = $property->add_match_table($individual) 13657 unless defined $existing_table; 13658 $existing_table += $table; 13659 } 13660 $property->delete_match_table($table); 13661 } 13662 } 13663 } # End of loop through all Unicode properties. 13664 13665 # Fill in the mappings that Unicode doesn't completely furnish. First the 13666 # single letter major general categories. If Unicode were to start 13667 # delivering the values, this would be redundant, but better that than to 13668 # try to figure out if should skip and not get it right. Ths could happen 13669 # if a new major category were to be introduced, and the hard-coded test 13670 # wouldn't know about it. 13671 # This routine depends on the standard names for the general categories 13672 # being what it thinks they are, like 'Cn'. The major categories are the 13673 # union of all the general category tables which have the same first 13674 # letters. eg. L = Lu + Lt + Ll + Lo + Lm 13675 foreach my $minor_table ($gc->tables) { 13676 my $minor_name = $minor_table->name; 13677 next if length $minor_name == 1; 13678 if (length $minor_name != 2) { 13679 Carp::my_carp_bug("Unexpected general category '$minor_name'. Skipped."); 13680 next; 13681 } 13682 13683 my $major_name = uc(substr($minor_name, 0, 1)); 13684 my $major_table = $gc->table($major_name); 13685 $major_table += $minor_table; 13686 } 13687 13688 # LC is Ll, Lu, and Lt. (used to be L& or L_, but PropValueAliases.txt 13689 # defines it as LC) 13690 my $LC = $gc->table('LC'); 13691 $LC->add_alias('L_', Status => $DISCOURAGED); # For backwards... 13692 $LC->add_alias('L&', Status => $DISCOURAGED); # compatibility. 13693 13694 13695 if ($LC->is_empty) { # Assume if not empty that Unicode has started to 13696 # deliver the correct values in it 13697 $LC->initialize($gc->table('Ll') + $gc->table('Lu')); 13698 13699 # Lt not in release 1. 13700 if (defined $gc->table('Lt')) { 13701 $LC += $gc->table('Lt'); 13702 $gc->table('Lt')->set_caseless_equivalent($LC); 13703 } 13704 } 13705 $LC->add_description('[\p{Ll}\p{Lu}\p{Lt}]'); 13706 13707 $gc->table('Ll')->set_caseless_equivalent($LC); 13708 $gc->table('Lu')->set_caseless_equivalent($LC); 13709 13710 # Create digit and case fold tables with the original file names for 13711 # backwards compatibility with applications that read them directly. 13712 my $Digit = Property->new("Legacy_Perl_Decimal_Digit", 13713 Default_Map => "", 13714 File => 'Digit', # Trad. location 13715 Directory => $map_directory, 13716 Type => $STRING, 13717 Replacement_Property => "Perl_Decimal_Digit", 13718 Initialize => property_ref('Perl_Decimal_Digit'), 13719 ); 13720 $Digit->add_comment(join_lines(<<END 13721This file gives the mapping of all code points which represent a single 13722decimal digit [0-9] to their respective digits. For example, the code point 13723U+0031 (an ASCII '1') is mapped to a numeric 1. These code points are those 13724that have Numeric_Type=Decimal; not special things, like subscripts nor Roman 13725numerals. 13726END 13727 )); 13728 13729 # Make sure this assumption in perl core code is valid in this Unicode 13730 # release, with known exceptions 13731 foreach my $range (property_ref('Numeric-Type')->table('Decimal')->ranges) { 13732 next if $range->end - $range->start == 9; 13733 next if $range->start == 0x1D7CE; # This whole range was added in 3.1 13734 next if $range->end == 0x19DA && $v_version eq v5.2.0; 13735 next if $range->end - $range->start < 9 && $v_version le 4.0.0; 13736 Carp::my_carp("Range $range unexpectedly doesn't contain 10" 13737 . " decimal digits. Code in regcomp.c assumes it does," 13738 . " and will have to be fixed. Proceeding anyway."); 13739 } 13740 13741 Property->new('Legacy_Case_Folding', 13742 File => "Fold", 13743 Directory => $map_directory, 13744 Default_Map => $CODE_POINT, 13745 Type => $STRING, 13746 Replacement_Property => "Case_Folding", 13747 Format => $HEX_FORMAT, 13748 Initialize => property_ref('cf'), 13749 ); 13750 13751 # Mark the scx table as the parent of the corresponding sc table for those 13752 # which are identical. This causes the pod for the script table to refer 13753 # to the corresponding scx one. This is done after everything, so as to 13754 # wait until the tables are stabilized before checking for equivalency. 13755 if (defined $scx) { 13756 if (defined $pod_directory) { 13757 foreach my $table ($scx->tables) { 13758 my $plain_sc_equiv = $script->table($table->name); 13759 if ($table->matches_identically_to($plain_sc_equiv)) { 13760 $plain_sc_equiv->set_equivalent_to($table, Related => 1); 13761 } 13762 } 13763 } 13764 } 13765 13766 return; 13767} 13768 13769sub pre_3_dot_1_Nl () { 13770 13771 # Return a range list for gc=nl for Unicode versions prior to 3.1, which 13772 # is when Unicode's became fully usable. These code points were 13773 # determined by inspection and experimentation. gc=nl is important for 13774 # certain Perl-extension properties that should be available in all 13775 # releases. 13776 13777 my $Nl = Range_List->new(); 13778 if (defined (my $official = $gc->table('Nl'))) { 13779 $Nl += $official; 13780 } 13781 else { 13782 $Nl->add_range(0x2160, 0x2182); 13783 $Nl->add_range(0x3007, 0x3007); 13784 $Nl->add_range(0x3021, 0x3029); 13785 } 13786 $Nl->add_range(0xFE20, 0xFE23); 13787 $Nl->add_range(0x16EE, 0x16F0) if $v_version ge v3.0.0; # 3.0 was when 13788 # these were added 13789 return $Nl; 13790} 13791 13792sub calculate_Assigned() { # Set $Assigned to the gc != Cn code points; may be 13793 # called before the Cn's are completely filled. 13794 # Works on Unicodes earlier than ones that 13795 # explicitly specify Cn. 13796 return if defined $Assigned; 13797 13798 if (! defined $gc || $gc->is_empty()) { 13799 Carp::my_carp_bug("calculate_Assigned() called before $gc is populated"); 13800 } 13801 13802 $Assigned = $perl->add_match_table('Assigned', 13803 Description => "All assigned code points", 13804 ); 13805 while (defined (my $range = $gc->each_range())) { 13806 my $standard_value = standardize($range->value); 13807 next if $standard_value eq 'cn' || $standard_value eq 'unassigned'; 13808 $Assigned->add_range($range->start, $range->end); 13809 } 13810} 13811 13812sub calculate_DI() { # Set $DI to a Range_List equivalent to the 13813 # Default_Ignorable_Code_Point property. Works on 13814 # Unicodes earlier than ones that explicitly specify 13815 # DI. 13816 return if defined $DI; 13817 13818 if (defined (my $di = property_ref('Default_Ignorable_Code_Point'))) { 13819 $DI = $di->table('Y'); 13820 } 13821 else { 13822 $DI = Range_List->new(Initialize => [ 0x180B .. 0x180D, 13823 0x2060 .. 0x206F, 13824 0xFE00 .. 0xFE0F, 13825 0xFFF0 .. 0xFFFB, 13826 ]); 13827 if ($v_version ge v2.0) { 13828 $DI += $gc->table('Cf') 13829 + $gc->table('Cs'); 13830 13831 # These are above the Unicode version 1 max 13832 $DI->add_range(0xE0000, 0xE0FFF); 13833 } 13834 $DI += $gc->table('Cc') 13835 - ord("\t") 13836 - utf8::unicode_to_native(0x0A) # LINE FEED 13837 - utf8::unicode_to_native(0x0B) # VERTICAL TAB 13838 - ord("\f") 13839 - utf8::unicode_to_native(0x0D) # CARRIAGE RETURN 13840 - utf8::unicode_to_native(0x85); # NEL 13841 } 13842} 13843 13844sub calculate_NChar() { # Create a Perl extension match table which is the 13845 # same as the Noncharacter_Code_Point property, and 13846 # set $NChar to point to it. Works on Unicodes 13847 # earlier than ones that explicitly specify NChar 13848 return if defined $NChar; 13849 13850 $NChar = $perl->add_match_table('_Perl_Nchar', 13851 Perl_Extension => 1, 13852 Fate => $INTERNAL_ONLY); 13853 if (defined (my $off_nchar = property_ref('NChar'))) { 13854 $NChar->initialize($off_nchar->table('Y')); 13855 } 13856 else { 13857 $NChar->initialize([ 0xFFFE .. 0xFFFF ]); 13858 if ($v_version ge v2.0) { # First release with these nchars 13859 for (my $i = 0x1FFFE; $i <= 0x10FFFE; $i += 0x10000) { 13860 $NChar += [ $i .. $i+1 ]; 13861 } 13862 } 13863 } 13864} 13865 13866sub handle_compare_versions () { 13867 # This fixes things up for the $compare_versions capability, where we 13868 # compare Unicode version X with version Y (with Y > X), and we are 13869 # running it on the Unicode Data for version Y. 13870 # 13871 # It works by calculating the code points whose meaning has been specified 13872 # after release X, by using the Age property. The complement of this set 13873 # is the set of code points whose meaning is unchanged between the 13874 # releases. This is the set the program restricts itself to. It includes 13875 # everything whose meaning has been specified by the time version X came 13876 # along, plus those still unassigned by the time of version Y. (We will 13877 # continue to use the word 'assigned' to mean 'meaning has been 13878 # specified', as it's shorter and is accurate in all cases except the 13879 # Noncharacter code points.) 13880 # 13881 # This function is run after all the properties specified by Unicode have 13882 # been calculated for release Y. This makes sure we get all the nuances 13883 # of Y's rules. (It is done before the Perl extensions are calculated, as 13884 # those are based entirely on the Unicode ones.) But doing it after the 13885 # Unicode table calculations means we have to fix up the Unicode tables. 13886 # We do this by subtracting the code points that have been assigned since 13887 # X (which is actually done by ANDing each table of assigned code points 13888 # with the set of unchanged code points). Most Unicode properties are of 13889 # the form such that all unassigned code points have a default, grab-bag, 13890 # property value which is changed when the code point gets assigned. For 13891 # these, we just remove the changed code points from the table for the 13892 # latter property value, and add them back in to the grab-bag one. A few 13893 # other properties are not entirely of this form and have values for some 13894 # or all unassigned code points that are not the grab-bag one. These have 13895 # to be handled specially, and are hard-coded in to this routine based on 13896 # manual inspection of the Unicode character database. A list of the 13897 # outlier code points is made for each of these properties, and those 13898 # outliers are excluded from adding and removing from tables. 13899 # 13900 # Note that there are glitches when comparing against Unicode 1.1, as some 13901 # Hangul syllables in it were later ripped out and eventually replaced 13902 # with other things. 13903 13904 print "Fixing up for version comparison\n" if $verbosity >= $PROGRESS; 13905 13906 my $after_first_version = "All matching code points were added after " 13907 . "Unicode $string_compare_versions"; 13908 13909 # Calculate the delta as those code points that have been newly assigned 13910 # since the first compare version. 13911 my $delta = Range_List->new(); 13912 foreach my $table ($age->tables) { 13913 use version; 13914 next if $table == $age->table('Unassigned'); 13915 next if version->parse($table->name) 13916 le version->parse($string_compare_versions); 13917 $delta += $table; 13918 } 13919 if ($delta->is_empty) { 13920 die ("No changes; perhaps you need a 'DAge.txt' file?"); 13921 } 13922 13923 my $unchanged = ~ $delta; 13924 13925 calculate_Assigned() if ! defined $Assigned; 13926 $Assigned &= $unchanged; 13927 13928 # $Assigned now contains the code points that were assigned as of Unicode 13929 # version X. 13930 13931 # A block is all or nothing. If nothing is assigned in it, it all goes 13932 # back to the No_Block pool; but if even one code point is assigned, the 13933 # block is retained. 13934 my $no_block = $block->table('No_Block'); 13935 foreach my $this_block ($block->tables) { 13936 next if $this_block == $no_block 13937 || ! ($this_block & $Assigned)->is_empty; 13938 $this_block->set_fate($SUPPRESSED, $after_first_version); 13939 foreach my $range ($this_block->ranges) { 13940 $block->replace_map($range->start, $range->end, 'No_Block') 13941 } 13942 $no_block += $this_block; 13943 } 13944 13945 my @special_delta_properties; # List of properties that have to be 13946 # handled specially. 13947 my %restricted_delta; # Keys are the entries in 13948 # @special_delta_properties; values 13949 # are the range list of the code points 13950 # that behave normally when they get 13951 # assigned. 13952 13953 # In the next three properties, the Default Ignorable code points are 13954 # outliers. 13955 calculate_DI(); 13956 $DI &= $unchanged; 13957 13958 push @special_delta_properties, property_ref('_Perl_GCB'); 13959 $restricted_delta{$special_delta_properties[-1]} = ~ $DI; 13960 13961 if (defined (my $cwnfkcc = property_ref('Changes_When_NFKC_Casefolded'))) 13962 { 13963 push @special_delta_properties, $cwnfkcc; 13964 $restricted_delta{$special_delta_properties[-1]} = ~ $DI; 13965 } 13966 13967 calculate_NChar(); # Non-character code points 13968 $NChar &= $unchanged; 13969 13970 # This may have to be updated from time-to-time to get the most accurate 13971 # results. 13972 my $default_BC_non_LtoR = Range_List->new(Initialize => 13973 # These came from the comments in v8.0 DBidiClass.txt 13974 [ # AL 13975 0x0600 .. 0x07BF, 13976 0x08A0 .. 0x08FF, 13977 0xFB50 .. 0xFDCF, 13978 0xFDF0 .. 0xFDFF, 13979 0xFE70 .. 0xFEFF, 13980 0x1EE00 .. 0x1EEFF, 13981 # R 13982 0x0590 .. 0x05FF, 13983 0x07C0 .. 0x089F, 13984 0xFB1D .. 0xFB4F, 13985 0x10800 .. 0x10FFF, 13986 0x1E800 .. 0x1EDFF, 13987 0x1EF00 .. 0x1EFFF, 13988 # ET 13989 0x20A0 .. 0x20CF, 13990 ] 13991 ); 13992 $default_BC_non_LtoR += $DI + $NChar; 13993 push @special_delta_properties, property_ref('BidiClass'); 13994 $restricted_delta{$special_delta_properties[-1]} = ~ $default_BC_non_LtoR; 13995 13996 if (defined (my $eaw = property_ref('East_Asian_Width'))) { 13997 13998 my $default_EA_width_W = Range_List->new(Initialize => 13999 # From comments in v8.0 EastAsianWidth.txt 14000 [ 14001 0x3400 .. 0x4DBF, 14002 0x4E00 .. 0x9FFF, 14003 0xF900 .. 0xFAFF, 14004 0x20000 .. 0x2A6DF, 14005 0x2A700 .. 0x2B73F, 14006 0x2B740 .. 0x2B81F, 14007 0x2B820 .. 0x2CEAF, 14008 0x2F800 .. 0x2FA1F, 14009 0x20000 .. 0x2FFFD, 14010 0x30000 .. 0x3FFFD, 14011 ] 14012 ); 14013 push @special_delta_properties, $eaw; 14014 $restricted_delta{$special_delta_properties[-1]} 14015 = ~ $default_EA_width_W; 14016 14017 # Line break came along in the same release as East_Asian_Width, and 14018 # the non-grab-bag default set is a superset of the EAW one. 14019 if (defined (my $lb = property_ref('Line_Break'))) { 14020 my $default_LB_non_XX = Range_List->new(Initialize => 14021 # From comments in v8.0 LineBreak.txt 14022 [ 0x20A0 .. 0x20CF ]); 14023 $default_LB_non_XX += $default_EA_width_W; 14024 push @special_delta_properties, $lb; 14025 $restricted_delta{$special_delta_properties[-1]} 14026 = ~ $default_LB_non_XX; 14027 } 14028 } 14029 14030 # Go through every property, skipping those we've already worked on, those 14031 # that are immutable, and the perl ones that will be calculated after this 14032 # routine has done its fixup. 14033 foreach my $property (property_ref('*')) { 14034 next if $property == $perl # Done later in the program 14035 || $property == $block # Done just above 14036 || $property == $DI # Done just above 14037 || $property == $NChar # Done just above 14038 14039 # The next two are invariant across Unicode versions 14040 || $property == property_ref('Pattern_Syntax') 14041 || $property == property_ref('Pattern_White_Space'); 14042 14043 # Find the grab-bag value. 14044 my $default_map = $property->default_map; 14045 14046 if (! $property->to_create_match_tables) { 14047 14048 # Here there aren't any match tables. So far, all such properties 14049 # have a default map, and don't require special handling. Just 14050 # change each newly assigned code point back to the default map, 14051 # as if they were unassigned. 14052 foreach my $range ($delta->ranges) { 14053 $property->add_map($range->start, 14054 $range->end, 14055 $default_map, 14056 Replace => $UNCONDITIONALLY); 14057 } 14058 } 14059 else { # Here there are match tables. Find the one (if any) for the 14060 # grab-bag value that unassigned code points go to. 14061 my $default_table; 14062 if (defined $default_map) { 14063 $default_table = $property->table($default_map); 14064 } 14065 14066 # If some code points don't go back to the grab-bag when they 14067 # are considered unassigned, exclude them from the list that does 14068 # that. 14069 my $this_delta = $delta; 14070 my $this_unchanged = $unchanged; 14071 if (grep { $_ == $property } @special_delta_properties) { 14072 $this_delta = $delta & $restricted_delta{$property}; 14073 $this_unchanged = ~ $this_delta; 14074 } 14075 14076 # Fix up each match table for this property. 14077 foreach my $table ($property->tables) { 14078 if (defined $default_table && $table == $default_table) { 14079 14080 # The code points assigned after release X (the ones we 14081 # are excluding in this routine) go back on to the default 14082 # (grab-bag) table. However, some of these tables don't 14083 # actually exist, but are specified solely by the other 14084 # tables. (In a binary property, we don't need to 14085 # actually have an 'N' table, as it's just the complement 14086 # of the 'Y' table.) Such tables will be locked, so just 14087 # skip those. 14088 $table += $this_delta unless $table->locked; 14089 } 14090 else { 14091 14092 # Here the table is not for the default value. We need to 14093 # subtract the code points we are ignoring for this 14094 # comparison (the deltas) from it. But if the table 14095 # started out with nothing, no need to exclude anything, 14096 # and want to skip it here anyway, so it gets listed 14097 # properly in the pod. 14098 next if $table->is_empty; 14099 14100 # Save the deltas for later, before we do the subtraction 14101 my $deltas = $table & $this_delta; 14102 14103 $table &= $this_unchanged; 14104 14105 # Suppress the table if the subtraction left it with 14106 # nothing in it 14107 if ($table->is_empty) { 14108 if ($property->type == $BINARY) { 14109 push @tables_that_may_be_empty, $table->complete_name; 14110 } 14111 else { 14112 $table->set_fate($SUPPRESSED, $after_first_version); 14113 } 14114 } 14115 14116 # Now we add the removed code points to the property's 14117 # map, as they should now map to the grab-bag default 14118 # property (which they did in the first comparison 14119 # version). But we don't have to do this if the map is 14120 # only for internal use. 14121 if (defined $default_map && $property->to_output_map) { 14122 14123 # The gc property has pseudo property values whose names 14124 # have length 1. These are the union of all the 14125 # property values whose name is longer than 1 and 14126 # whose first letter is all the same. The replacement 14127 # is done once for the longer-named tables. 14128 next if $property == $gc && length $table->name == 1; 14129 14130 foreach my $range ($deltas->ranges) { 14131 $property->add_map($range->start, 14132 $range->end, 14133 $default_map, 14134 Replace => $UNCONDITIONALLY); 14135 } 14136 } 14137 } 14138 } 14139 } 14140 } 14141 14142 # The above code doesn't work on 'gc=C', as it is a superset of the default 14143 # ('Cn') table. It's easiest to just special case it here. 14144 my $C = $gc->table('C'); 14145 $C += $gc->table('Cn'); 14146 14147 return; 14148} 14149 14150sub compile_perl() { 14151 # Create perl-defined tables. Almost all are part of the pseudo-property 14152 # named 'perl' internally to this program. Many of these are recommended 14153 # in UTS#18 "Unicode Regular Expressions", and their derivations are based 14154 # on those found there. 14155 # Almost all of these are equivalent to some Unicode property. 14156 # A number of these properties have equivalents restricted to the ASCII 14157 # range, with their names prefaced by 'Posix', to signify that these match 14158 # what the Posix standard says they should match. A couple are 14159 # effectively this, but the name doesn't have 'Posix' in it because there 14160 # just isn't any Posix equivalent. 'XPosix' are the Posix tables extended 14161 # to the full Unicode range, by our guesses as to what is appropriate. 14162 14163 # 'All' is all code points. As an error check, instead of just setting it 14164 # to be that, construct it to be the union of all the major categories 14165 $All = $perl->add_match_table('All', 14166 Description 14167 => "All code points, including those above Unicode. Same as qr/./s", 14168 Matches_All => 1); 14169 14170 foreach my $major_table ($gc->tables) { 14171 14172 # Major categories are the ones with single letter names. 14173 next if length($major_table->name) != 1; 14174 14175 $All += $major_table; 14176 } 14177 14178 if ($All->max != $MAX_WORKING_CODEPOINT) { 14179 Carp::my_carp_bug("Generated highest code point (" 14180 . sprintf("%X", $All->max) 14181 . ") doesn't match expected value $MAX_WORKING_CODEPOINT_STRING.") 14182 } 14183 if ($All->range_count != 1 || $All->min != 0) { 14184 Carp::my_carp_bug("Generated table 'All' doesn't match all code points.") 14185 } 14186 14187 my $Any = $perl->add_match_table('Any', 14188 Description => "All Unicode code points"); 14189 $Any->add_range(0, $MAX_UNICODE_CODEPOINT); 14190 $Any->add_alias('Unicode'); 14191 14192 calculate_Assigned(); 14193 14194 my $ASCII = $perl->add_match_table('ASCII'); 14195 if (defined $block) { # This is equivalent to the block if have it. 14196 my $Unicode_ASCII = $block->table('Basic_Latin'); 14197 if (defined $Unicode_ASCII && ! $Unicode_ASCII->is_empty) { 14198 $ASCII->set_equivalent_to($Unicode_ASCII, Related => 1); 14199 } 14200 } 14201 14202 # Very early releases didn't have blocks, so initialize ASCII ourselves if 14203 # necessary 14204 if ($ASCII->is_empty) { 14205 if (! NON_ASCII_PLATFORM) { 14206 $ASCII->add_range(0, 127); 14207 } 14208 else { 14209 for my $i (0 .. 127) { 14210 $ASCII->add_range(utf8::unicode_to_native($i), 14211 utf8::unicode_to_native($i)); 14212 } 14213 } 14214 } 14215 14216 # Get the best available case definitions. Early Unicode versions didn't 14217 # have Uppercase and Lowercase defined, so use the general category 14218 # instead for them, modified by hard-coding in the code points each is 14219 # missing. 14220 my $Lower = $perl->add_match_table('XPosixLower'); 14221 my $Unicode_Lower = property_ref('Lowercase'); 14222 if (defined $Unicode_Lower && ! $Unicode_Lower->is_empty) { 14223 $Lower->set_equivalent_to($Unicode_Lower->table('Y'), Related => 1); 14224 14225 } 14226 else { 14227 $Lower += $gc->table('Lowercase_Letter'); 14228 14229 # There are quite a few code points in Lower, that aren't in gc=lc, 14230 # and not all are in all releases. 14231 my $temp = Range_List->new(Initialize => [ 14232 utf8::unicode_to_native(0xAA), 14233 utf8::unicode_to_native(0xBA), 14234 0x02B0 .. 0x02B8, 14235 0x02C0 .. 0x02C1, 14236 0x02E0 .. 0x02E4, 14237 0x0345, 14238 0x037A, 14239 0x1D2C .. 0x1D6A, 14240 0x1D78, 14241 0x1D9B .. 0x1DBF, 14242 0x2071, 14243 0x207F, 14244 0x2090 .. 0x209C, 14245 0x2170 .. 0x217F, 14246 0x24D0 .. 0x24E9, 14247 0x2C7C .. 0x2C7D, 14248 0xA770, 14249 0xA7F8 .. 0xA7F9, 14250 ]); 14251 $Lower += $temp & $Assigned; 14252 } 14253 my $Posix_Lower = $perl->add_match_table("PosixLower", 14254 Initialize => $Lower & $ASCII, 14255 ); 14256 14257 my $Upper = $perl->add_match_table("XPosixUpper"); 14258 my $Unicode_Upper = property_ref('Uppercase'); 14259 if (defined $Unicode_Upper && ! $Unicode_Upper->is_empty) { 14260 $Upper->set_equivalent_to($Unicode_Upper->table('Y'), Related => 1); 14261 } 14262 else { 14263 14264 # Unlike Lower, there are only two ranges in Upper that aren't in 14265 # gc=Lu, and all code points were assigned in all releases. 14266 $Upper += $gc->table('Uppercase_Letter'); 14267 $Upper->add_range(0x2160, 0x216F); # Uppercase Roman numerals 14268 $Upper->add_range(0x24B6, 0x24CF); # Circled Latin upper case letters 14269 } 14270 my $Posix_Upper = $perl->add_match_table("PosixUpper", 14271 Initialize => $Upper & $ASCII, 14272 ); 14273 14274 # Earliest releases didn't have title case. Initialize it to empty if not 14275 # otherwise present 14276 my $Title = $perl->add_match_table('Title', Full_Name => 'Titlecase', 14277 Description => '(= \p{Gc=Lt})'); 14278 my $lt = $gc->table('Lt'); 14279 14280 # Earlier versions of mktables had this related to $lt since they have 14281 # identical code points, but their caseless equivalents are not the same, 14282 # one being 'Cased' and the other being 'LC', and so now must be kept as 14283 # separate entities. 14284 if (defined $lt) { 14285 $Title += $lt; 14286 } 14287 else { 14288 push @tables_that_may_be_empty, $Title->complete_name; 14289 } 14290 14291 my $Unicode_Cased = property_ref('Cased'); 14292 if (defined $Unicode_Cased) { 14293 my $yes = $Unicode_Cased->table('Y'); 14294 my $no = $Unicode_Cased->table('N'); 14295 $Title->set_caseless_equivalent($yes); 14296 if (defined $Unicode_Upper) { 14297 $Unicode_Upper->table('Y')->set_caseless_equivalent($yes); 14298 $Unicode_Upper->table('N')->set_caseless_equivalent($no); 14299 } 14300 $Upper->set_caseless_equivalent($yes); 14301 if (defined $Unicode_Lower) { 14302 $Unicode_Lower->table('Y')->set_caseless_equivalent($yes); 14303 $Unicode_Lower->table('N')->set_caseless_equivalent($no); 14304 } 14305 $Lower->set_caseless_equivalent($yes); 14306 } 14307 else { 14308 # If this Unicode version doesn't have Cased, set up the Perl 14309 # extension from first principles. From Unicode 5.1: Definition D120: 14310 # A character C is defined to be cased if and only if C has the 14311 # Lowercase or Uppercase property or has a General_Category value of 14312 # Titlecase_Letter. 14313 my $cased = $perl->add_match_table('Cased', 14314 Initialize => $Lower + $Upper + $Title, 14315 Description => 'Uppercase or Lowercase or Titlecase', 14316 ); 14317 # $notcased is purely for the caseless equivalents below 14318 my $notcased = $perl->add_match_table('_Not_Cased', 14319 Initialize => ~ $cased, 14320 Fate => $INTERNAL_ONLY, 14321 Description => 'All not-cased code points'); 14322 $Title->set_caseless_equivalent($cased); 14323 if (defined $Unicode_Upper) { 14324 $Unicode_Upper->table('Y')->set_caseless_equivalent($cased); 14325 $Unicode_Upper->table('N')->set_caseless_equivalent($notcased); 14326 } 14327 $Upper->set_caseless_equivalent($cased); 14328 if (defined $Unicode_Lower) { 14329 $Unicode_Lower->table('Y')->set_caseless_equivalent($cased); 14330 $Unicode_Lower->table('N')->set_caseless_equivalent($notcased); 14331 } 14332 $Lower->set_caseless_equivalent($cased); 14333 } 14334 14335 # The remaining perl defined tables are mostly based on Unicode TR 18, 14336 # "Annex C: Compatibility Properties". All of these have two versions, 14337 # one whose name generally begins with Posix that is posix-compliant, and 14338 # one that matches Unicode characters beyond the Posix, ASCII range 14339 14340 my $Alpha = $perl->add_match_table('XPosixAlpha'); 14341 14342 # Alphabetic was not present in early releases 14343 my $Alphabetic = property_ref('Alphabetic'); 14344 if (defined $Alphabetic && ! $Alphabetic->is_empty) { 14345 $Alpha->set_equivalent_to($Alphabetic->table('Y'), Related => 1); 14346 } 14347 else { 14348 14349 # The Alphabetic property doesn't exist for early releases, so 14350 # generate it. The actual definition, in 5.2 terms is: 14351 # 14352 # gc=L + gc=Nl + Other_Alphabetic 14353 # 14354 # Other_Alphabetic is also not defined in these early releases, but it 14355 # contains one gc=So range plus most of gc=Mn and gc=Mc, so we add 14356 # those last two as well, then subtract the relatively few of them that 14357 # shouldn't have been added. (The gc=So range is the circled capital 14358 # Latin characters. Early releases mistakenly didn't also include the 14359 # lower-case versions of these characters, and so we don't either, to 14360 # maintain consistency with those releases that first had this 14361 # property. 14362 $Alpha->initialize($gc->table('Letter') 14363 + pre_3_dot_1_Nl() 14364 + $gc->table('Mn') 14365 + $gc->table('Mc') 14366 ); 14367 $Alpha->add_range(0x24D0, 0x24E9); # gc=So 14368 foreach my $range ( [ 0x0300, 0x0344 ], 14369 [ 0x0346, 0x034E ], 14370 [ 0x0360, 0x0362 ], 14371 [ 0x0483, 0x0486 ], 14372 [ 0x0591, 0x05AF ], 14373 [ 0x06DF, 0x06E0 ], 14374 [ 0x06EA, 0x06EC ], 14375 [ 0x0740, 0x074A ], 14376 0x093C, 14377 0x094D, 14378 [ 0x0951, 0x0954 ], 14379 0x09BC, 14380 0x09CD, 14381 0x0A3C, 14382 0x0A4D, 14383 0x0ABC, 14384 0x0ACD, 14385 0x0B3C, 14386 0x0B4D, 14387 0x0BCD, 14388 0x0C4D, 14389 0x0CCD, 14390 0x0D4D, 14391 0x0DCA, 14392 [ 0x0E47, 0x0E4C ], 14393 0x0E4E, 14394 [ 0x0EC8, 0x0ECC ], 14395 [ 0x0F18, 0x0F19 ], 14396 0x0F35, 14397 0x0F37, 14398 0x0F39, 14399 [ 0x0F3E, 0x0F3F ], 14400 [ 0x0F82, 0x0F84 ], 14401 [ 0x0F86, 0x0F87 ], 14402 0x0FC6, 14403 0x1037, 14404 0x1039, 14405 [ 0x17C9, 0x17D3 ], 14406 [ 0x20D0, 0x20DC ], 14407 0x20E1, 14408 [ 0x302A, 0x302F ], 14409 [ 0x3099, 0x309A ], 14410 [ 0xFE20, 0xFE23 ], 14411 [ 0x1D165, 0x1D169 ], 14412 [ 0x1D16D, 0x1D172 ], 14413 [ 0x1D17B, 0x1D182 ], 14414 [ 0x1D185, 0x1D18B ], 14415 [ 0x1D1AA, 0x1D1AD ], 14416 ) { 14417 if (ref $range) { 14418 $Alpha->delete_range($range->[0], $range->[1]); 14419 } 14420 else { 14421 $Alpha->delete_range($range, $range); 14422 } 14423 } 14424 $Alpha->add_description('Alphabetic'); 14425 $Alpha->add_alias('Alphabetic'); 14426 } 14427 my $Posix_Alpha = $perl->add_match_table("PosixAlpha", 14428 Initialize => $Alpha & $ASCII, 14429 ); 14430 $Posix_Upper->set_caseless_equivalent($Posix_Alpha); 14431 $Posix_Lower->set_caseless_equivalent($Posix_Alpha); 14432 14433 my $Alnum = $perl->add_match_table('Alnum', Full_Name => 'XPosixAlnum', 14434 Description => 'Alphabetic and (decimal) Numeric', 14435 Initialize => $Alpha + $gc->table('Decimal_Number'), 14436 ); 14437 $perl->add_match_table("PosixAlnum", 14438 Initialize => $Alnum & $ASCII, 14439 ); 14440 14441 my $Word = $perl->add_match_table('Word', Full_Name => 'XPosixWord', 14442 Description => '\w, including beyond ASCII;' 14443 . ' = \p{Alnum} + \pM + \p{Pc}' 14444 . ' + \p{Join_Control}', 14445 Initialize => $Alnum + $gc->table('Mark'), 14446 ); 14447 my $Pc = $gc->table('Connector_Punctuation'); # 'Pc' Not in release 1 14448 if (defined $Pc) { 14449 $Word += $Pc; 14450 } 14451 else { 14452 $Word += ord('_'); # Make sure this is a $Word 14453 } 14454 my $JC = property_ref('Join_Control'); # Wasn't in release 1 14455 if (defined $JC) { 14456 $Word += $JC->table('Y'); 14457 } 14458 else { 14459 $Word += 0x200C + 0x200D; 14460 } 14461 14462 # This is a Perl extension, so the name doesn't begin with Posix. 14463 my $PerlWord = $perl->add_match_table('PosixWord', 14464 Description => '\w, restricted to ASCII', 14465 Initialize => $Word & $ASCII, 14466 ); 14467 $PerlWord->add_alias('PerlWord'); 14468 14469 my $Blank = $perl->add_match_table('Blank', Full_Name => 'XPosixBlank', 14470 Description => '\h, Horizontal white space', 14471 14472 # 200B is Zero Width Space which is for line 14473 # break control, and was listed as 14474 # Space_Separator in early releases 14475 Initialize => $gc->table('Space_Separator') 14476 + ord("\t") 14477 - 0x200B, # ZWSP 14478 ); 14479 $Blank->add_alias('HorizSpace'); # Another name for it. 14480 $perl->add_match_table("PosixBlank", 14481 Initialize => $Blank & $ASCII, 14482 ); 14483 14484 my $VertSpace = $perl->add_match_table('VertSpace', 14485 Description => '\v', 14486 Initialize => 14487 $gc->table('Line_Separator') 14488 + $gc->table('Paragraph_Separator') 14489 + utf8::unicode_to_native(0x0A) # LINE FEED 14490 + utf8::unicode_to_native(0x0B) # VERTICAL TAB 14491 + ord("\f") 14492 + utf8::unicode_to_native(0x0D) # CARRIAGE RETURN 14493 + utf8::unicode_to_native(0x85) # NEL 14494 ); 14495 # No Posix equivalent for vertical space 14496 14497 my $Space = $perl->add_match_table('XPosixSpace', 14498 Description => '\s including beyond ASCII and vertical tab', 14499 Initialize => $Blank + $VertSpace, 14500 ); 14501 $Space->add_alias('XPerlSpace'); # Pre-existing synonyms 14502 $Space->add_alias('SpacePerl'); 14503 $Space->add_alias('Space') if $v_version lt v4.1.0; 14504 14505 my $Posix_space = $perl->add_match_table("PosixSpace", 14506 Initialize => $Space & $ASCII, 14507 ); 14508 $Posix_space->add_alias('PerlSpace'); # A pre-existing synonym 14509 14510 my $Cntrl = $perl->add_match_table('Cntrl', Full_Name => 'XPosixCntrl', 14511 Description => 'Control characters'); 14512 $Cntrl->set_equivalent_to($gc->table('Cc'), Related => 1); 14513 $perl->add_match_table("PosixCntrl", 14514 Description => "ASCII control characters", 14515 Definition => "ACK, BEL, BS, CAN, CR, DC1, DC2," 14516 . " DC3, DC4, DEL, DLE, ENQ, EOM," 14517 . " EOT, ESC, ETB, ETX, FF, FS, GS," 14518 . " HT, LF, NAK, NUL, RS, SI, SO," 14519 . " SOH, STX, SUB, SYN, US, VT", 14520 Initialize => $Cntrl & $ASCII, 14521 ); 14522 14523 my $perl_surrogate = $perl->add_match_table('_Perl_Surrogate'); 14524 my $Cs = $gc->table('Cs'); 14525 if (defined $Cs && ! $Cs->is_empty) { 14526 $perl_surrogate += $Cs; 14527 } 14528 else { 14529 push @tables_that_may_be_empty, '_Perl_Surrogate'; 14530 } 14531 14532 # $controls is a temporary used to construct Graph. 14533 my $controls = Range_List->new(Initialize => $gc->table('Unassigned') 14534 + $gc->table('Control') 14535 + $perl_surrogate); 14536 14537 # Graph is ~space & ~(Cc|Cs|Cn) = ~(space + $controls) 14538 my $Graph = $perl->add_match_table('Graph', Full_Name => 'XPosixGraph', 14539 Description => 'Characters that are graphical', 14540 Initialize => ~ ($Space + $controls), 14541 ); 14542 $perl->add_match_table("PosixGraph", 14543 Initialize => $Graph & $ASCII, 14544 ); 14545 14546 $print = $perl->add_match_table('Print', Full_Name => 'XPosixPrint', 14547 Description => 'Characters that are graphical plus space characters (but no controls)', 14548 Initialize => $Blank + $Graph - $gc->table('Control'), 14549 ); 14550 $perl->add_match_table("PosixPrint", 14551 Initialize => $print & $ASCII, 14552 ); 14553 14554 my $Punct = $perl->add_match_table('Punct'); 14555 $Punct->set_equivalent_to($gc->table('Punctuation'), Related => 1); 14556 14557 # \p{punct} doesn't include the symbols, which posix does 14558 my $XPosixPunct = $perl->add_match_table('XPosixPunct', 14559 Description => '\p{Punct} + ASCII-range \p{Symbol}', 14560 Initialize => $gc->table('Punctuation') 14561 + ($ASCII & $gc->table('Symbol')), 14562 Perl_Extension => 1 14563 ); 14564 $perl->add_match_table('PosixPunct', Perl_Extension => 1, 14565 Initialize => $ASCII & $XPosixPunct, 14566 ); 14567 14568 my $Digit = $perl->add_match_table('Digit', Full_Name => 'XPosixDigit', 14569 Description => '[0-9] + all other decimal digits'); 14570 $Digit->set_equivalent_to($gc->table('Decimal_Number'), Related => 1); 14571 my $PosixDigit = $perl->add_match_table("PosixDigit", 14572 Initialize => $Digit & $ASCII, 14573 ); 14574 14575 # Hex_Digit was not present in first release 14576 my $Xdigit = $perl->add_match_table('XDigit', Full_Name => 'XPosixXDigit'); 14577 my $Hex = property_ref('Hex_Digit'); 14578 if (defined $Hex && ! $Hex->is_empty) { 14579 $Xdigit->set_equivalent_to($Hex->table('Y'), Related => 1); 14580 } 14581 else { 14582 $Xdigit->initialize([ ord('0') .. ord('9'), 14583 ord('A') .. ord('F'), 14584 ord('a') .. ord('f'), 14585 0xFF10..0xFF19, 0xFF21..0xFF26, 0xFF41..0xFF46]); 14586 } 14587 14588 # AHex was not present in early releases 14589 my $PosixXDigit = $perl->add_match_table('PosixXDigit'); 14590 my $AHex = property_ref('ASCII_Hex_Digit'); 14591 if (defined $AHex && ! $AHex->is_empty) { 14592 $PosixXDigit->set_equivalent_to($AHex->table('Y'), Related => 1); 14593 } 14594 else { 14595 $PosixXDigit->initialize($Xdigit & $ASCII); 14596 $PosixXDigit->add_alias('AHex'); 14597 $PosixXDigit->add_alias('Ascii_Hex_Digit'); 14598 } 14599 14600 my $any_folds = $perl->add_match_table("_Perl_Any_Folds", 14601 Description => "Code points that particpate in some fold", 14602 ); 14603 my $loc_problem_folds = $perl->add_match_table( 14604 "_Perl_Problematic_Locale_Folds", 14605 Description => 14606 "Code points that are in some way problematic under locale", 14607 ); 14608 14609 # This allows regexec.c to skip some work when appropriate. Some of the 14610 # entries in _Perl_Problematic_Locale_Folds are multi-character folds, 14611 my $loc_problem_folds_start = $perl->add_match_table( 14612 "_Perl_Problematic_Locale_Foldeds_Start", 14613 Description => 14614 "The first character of every sequence in _Perl_Problematic_Locale_Folds", 14615 ); 14616 14617 my $cf = property_ref('Case_Folding'); 14618 14619 # Every character 0-255 is problematic because what each folds to depends 14620 # on the current locale 14621 $loc_problem_folds->add_range(0, 255); 14622 $loc_problem_folds->add_range(0x130, 0x131); # These are problematic in 14623 # Turkic locales 14624 $loc_problem_folds_start += $loc_problem_folds; 14625 14626 # Also problematic are anything these fold to outside the range. Likely 14627 # forever the only thing folded to by these outside the 0-255 range is the 14628 # GREEK SMALL MU (from the MICRO SIGN), but it's easy to make the code 14629 # completely general, which should catch any unexpected changes or errors. 14630 # We look at each code point 0-255, and add its fold (including each part 14631 # of a multi-char fold) to the list. See commit message 14632 # 31f05a37c4e9c37a7263491f2fc0237d836e1a80 for a more complete description 14633 # of the MU issue. 14634 foreach my $range ($loc_problem_folds->ranges) { 14635 foreach my $code_point ($range->start .. $range->end) { 14636 my $fold_range = $cf->containing_range($code_point); 14637 next unless defined $fold_range; 14638 14639 # Skip if folds to itself 14640 next if $fold_range->value eq $CODE_POINT; 14641 14642 my @hex_folds = split " ", $fold_range->value; 14643 my $start_cp = $hex_folds[0]; 14644 next if $start_cp eq $CODE_POINT; 14645 $start_cp = hex $start_cp; 14646 foreach my $i (0 .. @hex_folds - 1) { 14647 my $cp = $hex_folds[$i]; 14648 next if $cp eq $CODE_POINT; 14649 $cp = hex $cp; 14650 next unless $cp > 255; # Already have the < 256 ones 14651 14652 $loc_problem_folds->add_range($cp, $cp); 14653 $loc_problem_folds_start->add_range($start_cp, $start_cp); 14654 } 14655 } 14656 } 14657 14658 my $folds_to_multi_char = $perl->add_match_table( 14659 "_Perl_Folds_To_Multi_Char", 14660 Description => 14661 "Code points whose fold is a string of more than one character", 14662 ); 14663 my $in_multi_fold = $perl->add_match_table( 14664 "_Perl_Is_In_Multi_Char_Fold", 14665 Description => 14666 "Code points that are in some multiple character fold", 14667 ); 14668 if ($v_version lt v3.0.1) { 14669 push @tables_that_may_be_empty, '_Perl_Folds_To_Multi_Char', 14670 '_Perl_Is_In_Multi_Char_Fold', 14671 '_Perl_Non_Final_Folds'; 14672 } 14673 14674 # Look through all the known folds to populate these tables. 14675 foreach my $range ($cf->ranges) { 14676 next if $range->value eq $CODE_POINT; 14677 my $start = $range->start; 14678 my $end = $range->end; 14679 $any_folds->add_range($start, $end); 14680 14681 my @hex_folds = split " ", $range->value; 14682 if (@hex_folds > 1) { # Is multi-char fold 14683 $folds_to_multi_char->add_range($start, $end); 14684 } 14685 14686 my $found_locale_problematic = 0; 14687 14688 my $folded_count = @hex_folds; 14689 if ($folded_count > 3) { 14690 die Carp::my_carp("Maximum number of characters in a fold should be 3: Instead, it's $folded_count for U+" . sprintf "%04X", $range->start); 14691 } 14692 14693 # Look at each of the folded-to characters... 14694 foreach my $i (1 .. $folded_count) { 14695 my $cp = hex $hex_folds[$i-1]; 14696 $any_folds->add_range($cp, $cp); 14697 14698 # The fold is problematic if any of the folded-to characters is 14699 # already considered problematic. 14700 if ($loc_problem_folds->contains($cp)) { 14701 $loc_problem_folds->add_range($start, $end); 14702 $found_locale_problematic = 1; 14703 } 14704 14705 if ($folded_count > 1) { 14706 $in_multi_fold->add_range($cp, $cp); 14707 } 14708 } 14709 14710 # If this is a problematic fold, add to the start chars the 14711 # folding-from characters and first folded-to character. 14712 if ($found_locale_problematic) { 14713 $loc_problem_folds_start->add_range($start, $end); 14714 my $cp = hex $hex_folds[0]; 14715 $loc_problem_folds_start->add_range($cp, $cp); 14716 } 14717 } 14718 14719 my $dt = property_ref('Decomposition_Type'); 14720 $dt->add_match_table('Non_Canon', Full_Name => 'Non_Canonical', 14721 Initialize => ~ ($dt->table('None') + $dt->table('Canonical')), 14722 Perl_Extension => 1, 14723 Note => 'Union of all non-canonical decompositions', 14724 ); 14725 14726 # For backward compatibility, Perl has its own definition for IDStart. 14727 # It is regular XID_Start plus the underscore, but all characters must be 14728 # Word characters as well 14729 my $XID_Start = property_ref('XID_Start'); 14730 my $perl_xids = $perl->add_match_table('_Perl_IDStart', 14731 Perl_Extension => 1, 14732 Fate => $INTERNAL_ONLY, 14733 Initialize => ord('_') 14734 ); 14735 if (defined $XID_Start 14736 || defined ($XID_Start = property_ref('ID_Start'))) 14737 { 14738 $perl_xids += $XID_Start->table('Y'); 14739 } 14740 else { 14741 # For Unicode versions that don't have the property, construct our own 14742 # from first principles. The actual definition is: 14743 # Letters 14744 # + letter numbers (Nl) 14745 # - Pattern_Syntax 14746 # - Pattern_White_Space 14747 # + stability extensions 14748 # - NKFC modifications 14749 # 14750 # What we do in the code below is to include the identical code points 14751 # that are in the first release that had Unicode's version of this 14752 # property, essentially extrapolating backwards. There were no 14753 # stability extensions until v4.1, so none are included; likewise in 14754 # no Unicode version so far do subtracting PatSyn and PatWS make any 14755 # difference, so those also are ignored. 14756 $perl_xids += $gc->table('Letter') + pre_3_dot_1_Nl(); 14757 14758 # We do subtract the NFKC modifications that are in the first version 14759 # that had this property. We don't bother to test if they are in the 14760 # version in question, because if they aren't, the operation is a 14761 # no-op. The NKFC modifications are discussed in 14762 # http://www.unicode.org/reports/tr31/#NFKC_Modifications 14763 foreach my $range ( 0x037A, 14764 0x0E33, 14765 0x0EB3, 14766 [ 0xFC5E, 0xFC63 ], 14767 [ 0xFDFA, 0xFE70 ], 14768 [ 0xFE72, 0xFE76 ], 14769 0xFE78, 14770 0xFE7A, 14771 0xFE7C, 14772 0xFE7E, 14773 [ 0xFF9E, 0xFF9F ], 14774 ) { 14775 if (ref $range) { 14776 $perl_xids->delete_range($range->[0], $range->[1]); 14777 } 14778 else { 14779 $perl_xids->delete_range($range, $range); 14780 } 14781 } 14782 } 14783 14784 $perl_xids &= $Word; 14785 14786 my $perl_xidc = $perl->add_match_table('_Perl_IDCont', 14787 Perl_Extension => 1, 14788 Fate => $INTERNAL_ONLY); 14789 my $XIDC = property_ref('XID_Continue'); 14790 if (defined $XIDC 14791 || defined ($XIDC = property_ref('ID_Continue'))) 14792 { 14793 $perl_xidc += $XIDC->table('Y'); 14794 } 14795 else { 14796 # Similarly, we construct our own XIDC if necessary for early Unicode 14797 # versions. The definition is: 14798 # everything in XIDS 14799 # + Gc=Mn 14800 # + Gc=Mc 14801 # + Gc=Nd 14802 # + Gc=Pc 14803 # - Pattern_Syntax 14804 # - Pattern_White_Space 14805 # + stability extensions 14806 # - NFKC modifications 14807 # 14808 # The same thing applies to this as with XIDS for the PatSyn, PatWS, 14809 # and stability extensions. There is a somewhat different set of NFKC 14810 # mods to remove (and add in this case). The ones below make this 14811 # have identical code points as in the first release that defined it. 14812 $perl_xidc += $perl_xids 14813 + $gc->table('L') 14814 + $gc->table('Mn') 14815 + $gc->table('Mc') 14816 + $gc->table('Nd') 14817 + utf8::unicode_to_native(0xB7) 14818 ; 14819 if (defined (my $pc = $gc->table('Pc'))) { 14820 $perl_xidc += $pc; 14821 } 14822 else { # 1.1.5 didn't have Pc, but these should have been in it 14823 $perl_xidc += 0xFF3F; 14824 $perl_xidc->add_range(0x203F, 0x2040); 14825 $perl_xidc->add_range(0xFE33, 0xFE34); 14826 $perl_xidc->add_range(0xFE4D, 0xFE4F); 14827 } 14828 14829 # Subtract the NFKC mods 14830 foreach my $range ( 0x037A, 14831 [ 0xFC5E, 0xFC63 ], 14832 [ 0xFDFA, 0xFE1F ], 14833 0xFE70, 14834 [ 0xFE72, 0xFE76 ], 14835 0xFE78, 14836 0xFE7A, 14837 0xFE7C, 14838 0xFE7E, 14839 ) { 14840 if (ref $range) { 14841 $perl_xidc->delete_range($range->[0], $range->[1]); 14842 } 14843 else { 14844 $perl_xidc->delete_range($range, $range); 14845 } 14846 } 14847 } 14848 14849 $perl_xidc &= $Word; 14850 14851 my $charname_begin = $perl->add_match_table('_Perl_Charname_Begin', 14852 Perl_Extension => 1, 14853 Fate => $INTERNAL_ONLY, 14854 Initialize => $gc->table('Letter') & $Alpha & $perl_xids, 14855 ); 14856 14857 my $charname_continue = $perl->add_match_table('_Perl_Charname_Continue', 14858 Perl_Extension => 1, 14859 Fate => $INTERNAL_ONLY, 14860 Initialize => $perl_xidc 14861 + ord(" ") 14862 + ord("(") 14863 + ord(")") 14864 + ord("-") 14865 ); 14866 14867 my @composition = ('Name', 'Unicode_1_Name', '_Perl_Name_Alias'); 14868 14869 if (@named_sequences) { 14870 push @composition, 'Named_Sequence'; 14871 foreach my $sequence (@named_sequences) { 14872 $perl_charname->add_anomalous_entry($sequence); 14873 } 14874 } 14875 14876 my $alias_sentence = ""; 14877 my %abbreviations; 14878 my $alias = property_ref('_Perl_Name_Alias'); 14879 $perl_charname->set_proxy_for('_Perl_Name_Alias'); 14880 14881 # Add each entry in _Perl_Name_Alias to Perl_Charnames. Where these go 14882 # with respect to any existing entry depends on the entry type. 14883 # Corrections go before said entry, as they should be returned in 14884 # preference over the existing entry. (A correction to a correction 14885 # should be later in the _Perl_Name_Alias table, so it will correctly 14886 # precede the erroneous correction in Perl_Charnames.) 14887 # 14888 # Abbreviations go after everything else, so they are saved temporarily in 14889 # a hash for later. 14890 # 14891 # Everything else is added afterwards, which preserves the input 14892 # ordering 14893 14894 foreach my $range ($alias->ranges) { 14895 next if $range->value eq ""; 14896 my $code_point = $range->start; 14897 if ($code_point != $range->end) { 14898 Carp::my_carp_bug("Bad News. Expecting only one code point in the range $range. Just to keep going, using only the first code point;"); 14899 } 14900 my ($value, $type) = split ': ', $range->value; 14901 my $replace_type; 14902 if ($type eq 'correction') { 14903 $replace_type = $MULTIPLE_BEFORE; 14904 } 14905 elsif ($type eq 'abbreviation') { 14906 14907 # Save for later 14908 $abbreviations{$value} = $code_point; 14909 next; 14910 } 14911 else { 14912 $replace_type = $MULTIPLE_AFTER; 14913 } 14914 14915 # Actually add; before or after current entry(ies) as determined 14916 # above. 14917 14918 $perl_charname->add_duplicate($code_point, $value, Replace => $replace_type); 14919 } 14920 $alias_sentence = <<END; 14921The _Perl_Name_Alias property adds duplicate code point entries that are 14922alternatives to the original name. If an addition is a corrected 14923name, it will be physically first in the table. The original (less correct, 14924but still valid) name will be next; then any alternatives, in no particular 14925order; and finally any abbreviations, again in no particular order. 14926END 14927 14928 # Now add the Unicode_1 names for the controls. The Unicode_1 names had 14929 # precedence before 6.1, including the awful ones like "LINE FEED (LF)", 14930 # so should be first in the file; the other names have precedence starting 14931 # in 6.1, 14932 my $before_or_after = ($v_version lt v6.1.0) 14933 ? $MULTIPLE_BEFORE 14934 : $MULTIPLE_AFTER; 14935 14936 foreach my $range (property_ref('Unicode_1_Name')->ranges) { 14937 my $code_point = $range->start; 14938 my $unicode_1_value = $range->value; 14939 next if $unicode_1_value eq ""; # Skip if name doesn't exist. 14940 14941 if ($code_point != $range->end) { 14942 Carp::my_carp_bug("Bad News. Expecting only one code point in the range $range. Just to keep going, using only the first code point;"); 14943 } 14944 14945 # To handle EBCDIC, we don't hard code in the code points of the 14946 # controls; instead realizing that all of them are below 256. 14947 last if $code_point > 255; 14948 14949 # We only add in the controls. 14950 next if $gc->value_of($code_point) ne 'Cc'; 14951 14952 # We reject this Unicode1 name for later Perls, as it is used for 14953 # another code point 14954 next if $unicode_1_value eq 'BELL' && $^V ge v5.17.0; 14955 14956 # This won't add an exact duplicate. 14957 $perl_charname->add_duplicate($code_point, $unicode_1_value, 14958 Replace => $before_or_after); 14959 } 14960 14961 # Now that have everything added, add in abbreviations after 14962 # everything else. Sort so results don't change between runs of this 14963 # program 14964 foreach my $value (sort keys %abbreviations) { 14965 $perl_charname->add_duplicate($abbreviations{$value}, $value, 14966 Replace => $MULTIPLE_AFTER); 14967 } 14968 14969 my $comment; 14970 if (@composition <= 2) { # Always at least 2 14971 $comment = join " and ", @composition; 14972 } 14973 else { 14974 $comment = join ", ", @composition[0 .. scalar @composition - 2]; 14975 $comment .= ", and $composition[-1]"; 14976 } 14977 14978 $perl_charname->add_comment(join_lines( <<END 14979This file is for charnames.pm. It is the union of the $comment properties. 14980Unicode_1_Name entries are used only for nameless code points in the Name 14981property. 14982$alias_sentence 14983This file doesn't include the algorithmically determinable names. For those, 14984use 'unicore/Name.pm' 14985END 14986 )); 14987 property_ref('Name')->add_comment(join_lines( <<END 14988This file doesn't include the algorithmically determinable names. For those, 14989use 'unicore/Name.pm' 14990END 14991 )); 14992 14993 # Construct the Present_In property from the Age property. 14994 if (-e 'DAge.txt' && defined $age) { 14995 my $default_map = $age->default_map; 14996 my $in = Property->new('In', 14997 Default_Map => $default_map, 14998 Full_Name => "Present_In", 14999 Perl_Extension => 1, 15000 Type => $ENUM, 15001 Initialize => $age, 15002 ); 15003 $in->add_comment(join_lines(<<END 15004THIS FILE SHOULD NOT BE USED FOR ANY PURPOSE. The values in this file are the 15005same as for $age, and not for what $in really means. This is because anything 15006defined in a given release should have multiple values: that release and all 15007higher ones. But only one value per code point can be represented in a table 15008like this. 15009END 15010 )); 15011 15012 # The Age tables are named like 1.5, 2.0, 2.1, .... Sort so that the 15013 # lowest numbered (earliest) come first, with the non-numeric one 15014 # last. 15015 my ($first_age, @rest_ages) = sort { ($a->name !~ /^[\d.]*$/) 15016 ? 1 15017 : ($b->name !~ /^[\d.]*$/) 15018 ? -1 15019 : $a->name <=> $b->name 15020 } $age->tables; 15021 15022 # The Present_In property is the cumulative age properties. The first 15023 # one hence is identical to the first age one. 15024 my $previous_in = $in->add_match_table($first_age->name); 15025 $previous_in->set_equivalent_to($first_age, Related => 1); 15026 15027 my $description_start = "Code point's usage introduced in version "; 15028 $first_age->add_description($description_start . $first_age->name); 15029 15030 # To construct the accumulated values, for each of the age tables 15031 # starting with the 2nd earliest, merge the earliest with it, to get 15032 # all those code points existing in the 2nd earliest. Repeat merging 15033 # the new 2nd earliest with the 3rd earliest to get all those existing 15034 # in the 3rd earliest, and so on. 15035 foreach my $current_age (@rest_ages) { 15036 next if $current_age->name !~ /^[\d.]*$/; # Skip the non-numeric 15037 15038 my $current_in = $in->add_match_table( 15039 $current_age->name, 15040 Initialize => $current_age + $previous_in, 15041 Description => $description_start 15042 . $current_age->name 15043 . ' or earlier', 15044 ); 15045 foreach my $alias ($current_age->aliases) { 15046 $current_in->add_alias($alias->name); 15047 } 15048 $previous_in = $current_in; 15049 15050 # Add clarifying material for the corresponding age file. This is 15051 # in part because of the confusing and contradictory information 15052 # given in the Standard's documentation itself, as of 5.2. 15053 $current_age->add_description( 15054 "Code point's usage was introduced in version " 15055 . $current_age->name); 15056 $current_age->add_note("See also $in"); 15057 15058 } 15059 15060 # And finally the code points whose usages have yet to be decided are 15061 # the same in both properties. Note that permanently unassigned code 15062 # points actually have their usage assigned (as being permanently 15063 # unassigned), so that these tables are not the same as gc=cn. 15064 my $unassigned = $in->add_match_table($default_map); 15065 my $age_default = $age->table($default_map); 15066 $age_default->add_description(<<END 15067Code point's usage has not been assigned in any Unicode release thus far. 15068END 15069 ); 15070 $unassigned->set_equivalent_to($age_default, Related => 1); 15071 } 15072 15073 my $patws = $perl->add_match_table('_Perl_PatWS', 15074 Perl_Extension => 1, 15075 Fate => $INTERNAL_ONLY); 15076 if (defined (my $off_patws = property_ref('Pattern_White_Space'))) { 15077 $patws->initialize($off_patws->table('Y')); 15078 } 15079 else { 15080 $patws->initialize([ ord("\t"), 15081 ord("\n"), 15082 utf8::unicode_to_native(0x0B), # VT 15083 ord("\f"), 15084 ord("\r"), 15085 ord(" "), 15086 utf8::unicode_to_native(0x85), # NEL 15087 0x200E..0x200F, # Left, Right marks 15088 0x2028..0x2029 # Line, Paragraph seps 15089 ] ); 15090 } 15091 15092 # See L<perlfunc/quotemeta> 15093 my $quotemeta = $perl->add_match_table('_Perl_Quotemeta', 15094 Perl_Extension => 1, 15095 Fate => $INTERNAL_ONLY, 15096 15097 # Initialize to what's common in 15098 # all Unicode releases. 15099 Initialize => 15100 $gc->table('Control') 15101 + $Space 15102 + $patws 15103 + ((~ $Word) & $ASCII) 15104 ); 15105 15106 if (defined (my $patsyn = property_ref('Pattern_Syntax'))) { 15107 $quotemeta += $patsyn->table('Y'); 15108 } 15109 else { 15110 $quotemeta += ((~ $Word) & Range->new(0, 255)) 15111 - utf8::unicode_to_native(0xA8) 15112 - utf8::unicode_to_native(0xAF) 15113 - utf8::unicode_to_native(0xB2) 15114 - utf8::unicode_to_native(0xB3) 15115 - utf8::unicode_to_native(0xB4) 15116 - utf8::unicode_to_native(0xB7) 15117 - utf8::unicode_to_native(0xB8) 15118 - utf8::unicode_to_native(0xB9) 15119 - utf8::unicode_to_native(0xBC) 15120 - utf8::unicode_to_native(0xBD) 15121 - utf8::unicode_to_native(0xBE); 15122 $quotemeta += [ # These are above-Latin1 patsyn; hence should be the 15123 # same in all releases 15124 0x2010 .. 0x2027, 15125 0x2030 .. 0x203E, 15126 0x2041 .. 0x2053, 15127 0x2055 .. 0x205E, 15128 0x2190 .. 0x245F, 15129 0x2500 .. 0x2775, 15130 0x2794 .. 0x2BFF, 15131 0x2E00 .. 0x2E7F, 15132 0x3001 .. 0x3003, 15133 0x3008 .. 0x3020, 15134 0x3030 .. 0x3030, 15135 0xFD3E .. 0xFD3F, 15136 0xFE45 .. 0xFE46 15137 ]; 15138 } 15139 15140 if (defined (my $di = property_ref('Default_Ignorable_Code_Point'))) { 15141 $quotemeta += $di->table('Y') 15142 } 15143 else { 15144 if ($v_version ge v2.0) { 15145 $quotemeta += $gc->table('Cf') 15146 + $gc->table('Cs'); 15147 15148 # These are above the Unicode version 1 max 15149 $quotemeta->add_range(0xE0000, 0xE0FFF); 15150 } 15151 $quotemeta += $gc->table('Cc') 15152 - $Space; 15153 my $temp = Range_List->new(Initialize => [ 0x180B .. 0x180D, 15154 0x2060 .. 0x206F, 15155 0xFE00 .. 0xFE0F, 15156 0xFFF0 .. 0xFFFB, 15157 ]); 15158 $temp->add_range(0xE0000, 0xE0FFF) if $v_version ge v2.0; 15159 $quotemeta += $temp; 15160 } 15161 calculate_DI(); 15162 $quotemeta += $DI; 15163 15164 calculate_NChar(); 15165 15166 # Finished creating all the perl properties. All non-internal non-string 15167 # ones have a synonym of 'Is_' prefixed. (Internal properties begin with 15168 # an underscore.) These do not get a separate entry in the pod file 15169 foreach my $table ($perl->tables) { 15170 foreach my $alias ($table->aliases) { 15171 next if $alias->name =~ /^_/; 15172 $table->add_alias('Is_' . $alias->name, 15173 Re_Pod_Entry => 0, 15174 UCD => 0, 15175 Status => $alias->status, 15176 OK_as_Filename => 0); 15177 } 15178 } 15179 15180 # Perl tailors the WordBreak property so that \b{wb} doesn't split 15181 # adjacent spaces into separate words. Unicode 11.0 moved in that 15182 # direction, but left TAB, FIGURE SPACE (U+2007), and (ironically) NO 15183 # BREAK SPACE as breaking, so we retained the original Perl customization. 15184 # To do this, in the Perl copy of WB, simply replace the mappings of 15185 # horizontal space characters that otherwise would map to the default or 15186 # the 11.0 'WSegSpace' to instead map to our tailoring. 15187 my $perl_wb = property_ref('_Perl_WB'); 15188 my $default = $perl_wb->default_map; 15189 for my $range ($Blank->ranges) { 15190 for my $i ($range->start .. $range->end) { 15191 my $value = $perl_wb->value_of($i); 15192 15193 next unless $value eq $default || $value eq 'WSegSpace'; 15194 $perl_wb->add_map($i, $i, 'Perl_Tailored_HSpace', 15195 Replace => $UNCONDITIONALLY); 15196 } 15197 } 15198 15199 # Also starting in Unicode 11.0, rules for some of the boundary types are 15200 # based on a non-UCD property (which we have read in if it exists). 15201 # Recall that these boundary properties partition the code points into 15202 # equivalence classes (represented as enums). 15203 # 15204 # The loop below goes through each code point that matches the non-UCD 15205 # property, and for each current equivalence class containing such a code 15206 # point, splits it so that those that are in both are now in a newly 15207 # created equivalence class whose name is a combination of the property 15208 # and the old class name, leaving unchanged everything that doesn't match 15209 # the non-UCD property. 15210 my $pictographic_emoji = property_ref('ExtPict'); 15211 if (defined $pictographic_emoji) { 15212 foreach my $base_property (property_ref('GCB'), 15213 property_ref('WB')) 15214 { 15215 my $property = property_ref('_Perl_' . $base_property->name); 15216 foreach my $range ($pictographic_emoji->table('Y')->ranges) { 15217 foreach my $i ($range->start .. $range->end) { 15218 my $current = $property->value_of($i); 15219 $current = $property->table($current)->short_name; 15220 $property->add_map($i, $i, 'ExtPict_' . $current, 15221 Replace => $UNCONDITIONALLY); 15222 } 15223 } 15224 } 15225 } 15226 15227 # Create a version of the LineBreak property with the mappings that are 15228 # omitted in the default algorithm remapped to what 15229 # http://www.unicode.org/reports/tr14 says they should be. 15230 # 15231 # Original Resolved General_Category 15232 # AI, SG, XX AL Any 15233 # SA CM Only Mn or Mc 15234 # SA AL Any except Mn and Mc 15235 # CJ NS Any 15236 # 15237 # All property values are also written out in their long form, as 15238 # regen/mk_invlist.pl expects that. This also fixes occurrences of the 15239 # typo in early Unicode versions: 'inseperable'. 15240 my $perl_lb = property_ref('_Perl_LB'); 15241 if (! defined $perl_lb) { 15242 $perl_lb = Property->new('_Perl_LB', 15243 Fate => $INTERNAL_ONLY, 15244 Perl_Extension => 1, 15245 Directory => $map_directory, 15246 Type => $STRING); 15247 my $lb = property_ref('Line_Break'); 15248 15249 # Populate from $lb, but use full name and fix typo. 15250 foreach my $range ($lb->ranges) { 15251 my $full_name = $lb->table($range->value)->full_name; 15252 $full_name = 'Inseparable' 15253 if standardize($full_name) eq 'inseperable'; 15254 $perl_lb->add_map($range->start, $range->end, $full_name); 15255 } 15256 } 15257 15258 $perl_lb->set_default_map('Alphabetic', 'full_name'); # XX -> AL 15259 my $ea = property_ref('East_Asian_Width'); 15260 15261 for my $range ($perl_lb->ranges) { 15262 my $value = standardize($range->value); 15263 if ( $value eq standardize('Unknown') 15264 || $value eq standardize('Ambiguous') 15265 || $value eq standardize('Surrogate')) 15266 { 15267 $perl_lb->add_map($range->start, $range->end, 'Alphabetic', 15268 Replace => $UNCONDITIONALLY); 15269 } 15270 elsif ($value eq standardize('Conditional_Japanese_Starter')) { 15271 $perl_lb->add_map($range->start, $range->end, 'Nonstarter', 15272 Replace => $UNCONDITIONALLY); 15273 } 15274 elsif ($value eq standardize('Complex_Context')) { 15275 for my $i ($range->start .. $range->end) { 15276 my $gc_val = $gc->value_of($i); 15277 if ($gc_val eq 'Mn' || $gc_val eq 'Mc') { 15278 $perl_lb->add_map($i, $i, 'Combining_Mark', 15279 Replace => $UNCONDITIONALLY); 15280 } 15281 else { 15282 $perl_lb->add_map($i, $i, 'Alphabetic', 15283 Replace => $UNCONDITIONALLY); 15284 } 15285 } 15286 } 15287 elsif ( defined $ea 15288 && ( $value eq standardize('Close_Parenthesis') 15289 || $value eq standardize('Open_Punctuation'))) 15290 { 15291 # Unicode 13 splits the OP and CP properties each into East Asian, 15292 # and non-. We retain the (now somewhat misleading) names OP and 15293 # CP for the non-East Asian variety, as there are very few East 15294 # Asian ones. 15295 my $replace = ($value eq standardize('Open_Punctuation')) 15296 ? 'East_Asian_OP' 15297 : 'East_Asian_CP'; 15298 for my $i ($range->start .. $range->end) { 15299 my $ea_val = $ea->value_of($i); 15300 if ($ea_val eq 'F' || $ea_val eq 'W' || $ea_val eq 'H') { 15301 $perl_lb->add_map($i, $i, $replace, 15302 Replace => $UNCONDITIONALLY); 15303 } 15304 } 15305 } 15306 } 15307 15308 # This property is a modification of the scx property 15309 my $perl_scx = Property->new('_Perl_SCX', 15310 Fate => $INTERNAL_ONLY, 15311 Perl_Extension => 1, 15312 Directory => $map_directory, 15313 Type => $ENUM); 15314 my $source; 15315 15316 # Use scx if available; otherwise sc; if neither is there (a very old 15317 # Unicode version, just say that everything is 'Common' 15318 if (defined $scx) { 15319 $source = $scx; 15320 $perl_scx->set_default_map('Unknown'); 15321 } 15322 elsif (defined $script) { 15323 $source = $script; 15324 15325 # Early versions of 'sc', had everything be 'Common' 15326 if (defined $script->table('Unknown')) { 15327 $perl_scx->set_default_map('Unknown'); 15328 } 15329 else { 15330 $perl_scx->set_default_map('Common'); 15331 } 15332 } else { 15333 $perl_scx->add_match_table('Common'); 15334 $perl_scx->add_map(0, $MAX_UNICODE_CODEPOINT, 'Common'); 15335 15336 $perl_scx->add_match_table('Unknown'); 15337 $perl_scx->set_default_map('Unknown'); 15338 } 15339 15340 $perl_scx->_set_format($STRING_WHITE_SPACE_LIST); 15341 $perl_scx->set_pre_declared_maps(0); # PropValueAliases doesn't list these 15342 15343 if (defined $source) { 15344 $perl_scx->initialize($source); 15345 15346 # UTS 39 says that the scx property should be modified for these 15347 # countries where certain mixed scripts are commonly used. 15348 for my $range ($perl_scx->ranges) { 15349 my $value = $range->value; 15350 my $changed = $value =~ s/ ( \b Han i? \b ) /$1 Hanb Jpan Kore/xi; 15351 $changed |= $value =~ s/ ( \b Hira (gana)? \b ) /$1 Jpan/xi; 15352 $changed |= $value =~ s/ ( \b Kata (kana)? \b ) /$1 Jpan/xi; 15353 $changed |= $value =~ s{ ( \b Katakana_or_Hiragana \b ) } 15354 {$1 Katakana Hiragana Jpan}xi; 15355 $changed |= $value =~ s/ ( \b Hang (ul)? \b ) /$1 Kore/xi; 15356 $changed |= $value =~ s/ ( \b Bopo (mofo)? \b ) /$1 Hanb/xi; 15357 15358 if ($changed) { 15359 $value = join " ", uniques split " ", $value; 15360 $range->set_value($value) 15361 } 15362 } 15363 15364 foreach my $table ($source->tables) { 15365 my $scx_table = $perl_scx->add_match_table($table->name, 15366 Full_Name => $table->full_name); 15367 foreach my $alias ($table->aliases) { 15368 $scx_table->add_alias($alias->name); 15369 } 15370 } 15371 } 15372 15373 # Here done with all the basic stuff. Ready to populate the information 15374 # about each character if annotating them. 15375 if ($annotate) { 15376 15377 # See comments at its declaration 15378 $annotate_ranges = Range_Map->new; 15379 15380 # This separates out the non-characters from the other unassigneds, so 15381 # can give different annotations for each. 15382 $unassigned_sans_noncharacters = Range_List->new( 15383 Initialize => $gc->table('Unassigned')); 15384 $unassigned_sans_noncharacters &= (~ $NChar); 15385 15386 for (my $i = 0; $i <= $MAX_UNICODE_CODEPOINT + 1; $i++ ) { 15387 $i = populate_char_info($i); # Note sets $i so may cause skips 15388 15389 } 15390 } 15391 15392 return; 15393} 15394 15395sub add_perl_synonyms() { 15396 # A number of Unicode tables have Perl synonyms that are expressed in 15397 # the single-form, \p{name}. These are: 15398 # All the binary property Y tables, so that \p{Name=Y} gets \p{Name} and 15399 # \p{Is_Name} as synonyms 15400 # \p{Script_Extensions=Value} gets \p{Value}, \p{Is_Value} as synonyms 15401 # \p{General_Category=Value} gets \p{Value}, \p{Is_Value} as synonyms 15402 # \p{Block=Value} gets \p{In_Value} as a synonym, and, if there is no 15403 # conflict, \p{Value} and \p{Is_Value} as well 15404 # 15405 # This routine generates these synonyms, warning of any unexpected 15406 # conflicts. 15407 15408 # Construct the list of tables to get synonyms for. Start with all the 15409 # binary and the General_Category ones. 15410 my @tables = grep { $_->type == $BINARY || $_->type == $FORCED_BINARY } 15411 property_ref('*'); 15412 push @tables, $gc->tables; 15413 15414 # If the version of Unicode includes the Script Extensions (preferably), 15415 # or Script property, add its tables 15416 if (defined $scx) { 15417 push @tables, $scx->tables; 15418 } 15419 else { 15420 push @tables, $script->tables if defined $script; 15421 } 15422 15423 # The Block tables are kept separate because they are treated differently. 15424 # And the earliest versions of Unicode didn't include them, so add only if 15425 # there are some. 15426 my @blocks; 15427 push @blocks, $block->tables if defined $block; 15428 15429 # Here, have the lists of tables constructed. Process blocks last so that 15430 # if there are name collisions with them, blocks have lowest priority. 15431 # Should there ever be other collisions, manual intervention would be 15432 # required. See the comments at the beginning of the program for a 15433 # possible way to handle those semi-automatically. 15434 foreach my $table (@tables, @blocks) { 15435 15436 # For non-binary properties, the synonym is just the name of the 15437 # table, like Greek, but for binary properties the synonym is the name 15438 # of the property, and means the code points in its 'Y' table. 15439 my $nominal = $table; 15440 my $nominal_property = $nominal->property; 15441 my $actual; 15442 if (! $nominal->isa('Property')) { 15443 $actual = $table; 15444 } 15445 else { 15446 15447 # Here is a binary property. Use the 'Y' table. Verify that is 15448 # there 15449 my $yes = $nominal->table('Y'); 15450 unless (defined $yes) { # Must be defined, but is permissible to 15451 # be empty. 15452 Carp::my_carp_bug("Undefined $nominal, 'Y'. Skipping."); 15453 next; 15454 } 15455 $actual = $yes; 15456 } 15457 15458 foreach my $alias ($nominal->aliases) { 15459 15460 # Attempt to create a table in the perl directory for the 15461 # candidate table, using whatever aliases in it that don't 15462 # conflict. Also add non-conflicting aliases for all these 15463 # prefixed by 'Is_' (and/or 'In_' for Block property tables) 15464 PREFIX: 15465 foreach my $prefix ("", 'Is_', 'In_') { 15466 15467 # Only Block properties can have added 'In_' aliases. 15468 next if $prefix eq 'In_' and $nominal_property != $block; 15469 15470 my $proposed_name = $prefix . $alias->name; 15471 15472 # No Is_Is, In_In, nor combinations thereof 15473 trace "$proposed_name is a no-no" if main::DEBUG && $to_trace && $proposed_name =~ /^ I [ns] _I [ns] _/x; 15474 next if $proposed_name =~ /^ I [ns] _I [ns] _/x; 15475 15476 trace "Seeing if can add alias or table: 'perl=$proposed_name' based on $nominal" if main::DEBUG && $to_trace; 15477 15478 # Get a reference to any existing table in the perl 15479 # directory with the desired name. 15480 my $pre_existing = $perl->table($proposed_name); 15481 15482 if (! defined $pre_existing) { 15483 15484 # No name collision, so OK to add the perl synonym. 15485 15486 my $make_re_pod_entry; 15487 my $ok_as_filename; 15488 my $status = $alias->status; 15489 if ($nominal_property == $block) { 15490 15491 # For block properties, only the compound form is 15492 # preferred for external use; the others are 15493 # discouraged. The pod file contains wild cards for 15494 # the 'In' and 'Is' forms so no entries for those; and 15495 # we don't want people using the name without any 15496 # prefix, so discourage that. 15497 if ($prefix eq "") { 15498 $make_re_pod_entry = 1; 15499 $status = $status || $DISCOURAGED; 15500 $ok_as_filename = 0; 15501 } 15502 elsif ($prefix eq 'In_') { 15503 $make_re_pod_entry = 0; 15504 $status = $status || $DISCOURAGED; 15505 $ok_as_filename = 1; 15506 } 15507 else { 15508 $make_re_pod_entry = 0; 15509 $status = $status || $DISCOURAGED; 15510 $ok_as_filename = 0; 15511 } 15512 } 15513 elsif ($prefix ne "") { 15514 15515 # The 'Is' prefix is handled in the pod by a wild 15516 # card, and we won't use it for an external name 15517 $make_re_pod_entry = 0; 15518 $status = $status || $NORMAL; 15519 $ok_as_filename = 0; 15520 } 15521 else { 15522 15523 # Here, is an empty prefix, non block. This gets its 15524 # own pod entry and can be used for an external name. 15525 $make_re_pod_entry = 1; 15526 $status = $status || $NORMAL; 15527 $ok_as_filename = 1; 15528 } 15529 15530 # Here, there isn't a perl pre-existing table with the 15531 # name. Look through the list of equivalents of this 15532 # table to see if one is a perl table. 15533 foreach my $equivalent ($actual->leader->equivalents) { 15534 next if $equivalent->property != $perl; 15535 15536 # Here, have found a table for $perl. Add this alias 15537 # to it, and are done with this prefix. 15538 $equivalent->add_alias($proposed_name, 15539 Re_Pod_Entry => $make_re_pod_entry, 15540 15541 # Currently don't output these in the 15542 # ucd pod, as are strongly discouraged 15543 # from being used 15544 UCD => 0, 15545 15546 Status => $status, 15547 OK_as_Filename => $ok_as_filename); 15548 trace "adding alias perl=$proposed_name to $equivalent" if main::DEBUG && $to_trace; 15549 next PREFIX; 15550 } 15551 15552 # Here, $perl doesn't already have a table that is a 15553 # synonym for this property, add one. 15554 my $added_table = $perl->add_match_table($proposed_name, 15555 Re_Pod_Entry => $make_re_pod_entry, 15556 15557 # See UCD comment just above 15558 UCD => 0, 15559 15560 Status => $status, 15561 OK_as_Filename => $ok_as_filename); 15562 # And it will be related to the actual table, since it is 15563 # based on it. 15564 $added_table->set_equivalent_to($actual, Related => 1); 15565 trace "added ", $perl->table($proposed_name) if main::DEBUG && $to_trace; 15566 next; 15567 } # End of no pre-existing. 15568 15569 # Here, there is a pre-existing table that has the proposed 15570 # name. We could be in trouble, but not if this is just a 15571 # synonym for another table that we have already made a child 15572 # of the pre-existing one. 15573 if ($pre_existing->is_set_equivalent_to($actual)) { 15574 trace "$pre_existing is already equivalent to $actual; adding alias perl=$proposed_name to it" if main::DEBUG && $to_trace; 15575 $pre_existing->add_alias($proposed_name); 15576 next; 15577 } 15578 15579 # Here, there is a name collision, but it still could be OK if 15580 # the tables match the identical set of code points, in which 15581 # case, we can combine the names. Compare each table's code 15582 # point list to see if they are identical. 15583 trace "Potential name conflict with $pre_existing having ", $pre_existing->count, " code points" if main::DEBUG && $to_trace; 15584 if ($pre_existing->matches_identically_to($actual)) { 15585 15586 # Here, they do match identically. Not a real conflict. 15587 # Make the perl version a child of the Unicode one, except 15588 # in the non-obvious case of where the perl name is 15589 # already a synonym of another Unicode property. (This is 15590 # excluded by the test for it being its own parent.) The 15591 # reason for this exclusion is that then the two Unicode 15592 # properties become related; and we don't really know if 15593 # they are or not. We generate documentation based on 15594 # relatedness, and this would be misleading. Code 15595 # later executed in the process will cause the tables to 15596 # be represented by a single file anyway, without making 15597 # it look in the pod like they are necessarily related. 15598 if ($pre_existing->parent == $pre_existing 15599 && ($pre_existing->property == $perl 15600 || $actual->property == $perl)) 15601 { 15602 trace "Setting $pre_existing equivalent to $actual since one is \$perl, and match identical sets" if main::DEBUG && $to_trace; 15603 $pre_existing->set_equivalent_to($actual, Related => 1); 15604 } 15605 elsif (main::DEBUG && $to_trace) { 15606 trace "$pre_existing is equivalent to $actual since match identical sets, but not setting them equivalent, to preserve the separateness of the perl aliases"; 15607 trace $pre_existing->parent; 15608 } 15609 next PREFIX; 15610 } 15611 15612 # Here they didn't match identically, there is a real conflict 15613 # between our new name and a pre-existing property. 15614 $actual->add_conflicting($proposed_name, 'p', $pre_existing); 15615 $pre_existing->add_conflicting($nominal->full_name, 15616 'p', 15617 $actual); 15618 15619 # Don't output a warning for aliases for the block 15620 # properties (unless they start with 'In_') as it is 15621 # expected that there will be conflicts and the block 15622 # form loses. 15623 if ($verbosity >= $NORMAL_VERBOSITY 15624 && ($actual->property != $block || $prefix eq 'In_')) 15625 { 15626 print simple_fold(join_lines(<<END 15627There is already an alias named $proposed_name (from $pre_existing), 15628so not creating this alias for $actual 15629END 15630 ), "", 4); 15631 } 15632 15633 # Keep track for documentation purposes. 15634 $has_In_conflicts++ if $prefix eq 'In_'; 15635 $has_Is_conflicts++ if $prefix eq 'Is_'; 15636 } 15637 } 15638 } 15639 15640 # There are some properties which have No and Yes (and N and Y) as 15641 # property values, but aren't binary, and could possibly be confused with 15642 # binary ones. So create caveats for them. There are tables that are 15643 # named 'No', and tables that are named 'N', but confusion is not likely 15644 # unless they are the same table. For example, N meaning Number or 15645 # Neutral is not likely to cause confusion, so don't add caveats to things 15646 # like them. 15647 foreach my $property (grep { $_->type != $BINARY 15648 && $_->type != $FORCED_BINARY } 15649 property_ref('*')) 15650 { 15651 my $yes = $property->table('Yes'); 15652 if (defined $yes) { 15653 my $y = $property->table('Y'); 15654 if (defined $y && $yes == $y) { 15655 foreach my $alias ($property->aliases) { 15656 $yes->add_conflicting($alias->name); 15657 } 15658 } 15659 } 15660 my $no = $property->table('No'); 15661 if (defined $no) { 15662 my $n = $property->table('N'); 15663 if (defined $n && $no == $n) { 15664 foreach my $alias ($property->aliases) { 15665 $no->add_conflicting($alias->name, 'P'); 15666 } 15667 } 15668 } 15669 } 15670 15671 return; 15672} 15673 15674sub register_file_for_name($table, $directory_ref, $file) { 15675 # Given info about a table and a datafile that it should be associated 15676 # with, register that association 15677 15678 # $directory_ref # Array of the directory path for the file 15679 # $file # The file name in the final directory. 15680 15681 trace "table=$table, file=$file, directory=@$directory_ref, fate=", $table->fate if main::DEBUG && $to_trace; 15682 15683 if ($table->isa('Property')) { 15684 $table->set_file_path(@$directory_ref, $file); 15685 push @map_properties, $table; 15686 15687 # No swash means don't do the rest of this. 15688 return if $table->fate != $ORDINARY 15689 && ! ($table->name =~ /^_/ && $table->fate == $INTERNAL_ONLY); 15690 15691 # Get the path to the file 15692 my @path = $table->file_path; 15693 15694 # Use just the file name if no subdirectory. 15695 shift @path if $path[0] eq File::Spec->curdir(); 15696 15697 my $file = join '/', @path; 15698 15699 # Create a hash entry for Unicode::UCD to get the file that stores this 15700 # property's map table 15701 foreach my $alias ($table->aliases) { 15702 my $name = $alias->name; 15703 if ($name =~ /^_/) { 15704 $strict_property_to_file_of{lc $name} = $file; 15705 } 15706 else { 15707 $loose_property_to_file_of{standardize($name)} = $file; 15708 } 15709 } 15710 15711 # And a way for Unicode::UCD to find the proper key in the SwashInfo 15712 # hash for this property. 15713 $file_to_swash_name{$file} = "To" . $table->swash_name; 15714 return; 15715 } 15716 15717 # Do all of the work for all equivalent tables when called with the leader 15718 # table, so skip if isn't the leader. 15719 return if $table->leader != $table; 15720 15721 # If this is a complement of another file, use that other file instead, 15722 # with a ! prepended to it. 15723 my $complement; 15724 if (($complement = $table->complement) != 0) { 15725 my @directories = $complement->file_path; 15726 15727 # This assumes that the 0th element is something like 'lib', 15728 # the 1th element the property name (in its own directory), like 15729 # 'AHex', and the 2th element the file like 'Y' which will have a .pl 15730 # appended to it later. 15731 $directories[1] =~ s/^/!/; 15732 $file = pop @directories; 15733 $directory_ref =\@directories; 15734 } 15735 15736 # Join all the file path components together, using slashes. 15737 my $full_filename = join('/', @$directory_ref, $file); 15738 15739 # All go in the same subdirectory of unicore, or the special 15740 # pseudo-directory '#' 15741 if ($directory_ref->[0] !~ / ^ $matches_directory | \# $ /x) { 15742 Carp::my_carp("Unexpected directory in " 15743 . join('/', @{$directory_ref}, $file)); 15744 } 15745 15746 # For this table and all its equivalents ... 15747 foreach my $table ($table, $table->equivalents) { 15748 15749 # Associate it with its file internally. Don't include the 15750 # $matches_directory first component 15751 $table->set_file_path(@$directory_ref, $file); 15752 15753 # No swash means don't do the rest of this. 15754 next if $table->isa('Map_Table') && $table->fate != $ORDINARY; 15755 15756 my $sub_filename = join('/', $directory_ref->[1, -1], $file); 15757 15758 my $property = $table->property; 15759 my $property_name = ($property == $perl) 15760 ? "" # 'perl' is never explicitly stated 15761 : standardize($property->name) . '='; 15762 15763 my $is_default = 0; # Is this table the default one for the property? 15764 15765 # To calculate $is_default, we find if this table is the same as the 15766 # default one for the property. But this is complicated by the 15767 # possibility that there is a master table for this one, and the 15768 # information is stored there instead of here. 15769 my $parent = $table->parent; 15770 my $leader_prop = $parent->property; 15771 my $default_map = $leader_prop->default_map; 15772 if (defined $default_map) { 15773 my $default_table = $leader_prop->table($default_map); 15774 $is_default = 1 if defined $default_table && $parent == $default_table; 15775 } 15776 15777 # Calculate the loose name for this table. Mostly it's just its name, 15778 # standardized. But in the case of Perl tables that are single-form 15779 # equivalents to Unicode properties, it is the latter's name. 15780 my $loose_table_name = 15781 ($property != $perl || $leader_prop == $perl) 15782 ? standardize($table->name) 15783 : standardize($parent->name); 15784 15785 my $deprecated = ($table->status eq $DEPRECATED) 15786 ? $table->status_info 15787 : ""; 15788 my $caseless_equivalent = $table->caseless_equivalent; 15789 15790 # And for each of the table's aliases... This inner loop eventually 15791 # goes through all aliases in the UCD that we generate regex match 15792 # files for 15793 foreach my $alias ($table->aliases) { 15794 my $standard = UCD_name($table, $alias); 15795 15796 # Generate an entry in either the loose or strict hashes, which 15797 # will translate the property and alias names combination into the 15798 # file where the table for them is stored. 15799 if ($alias->loose_match) { 15800 if (exists $loose_to_file_of{$standard}) { 15801 Carp::my_carp("Can't change file registered to $loose_to_file_of{$standard} to '$sub_filename'."); 15802 } 15803 else { 15804 $loose_to_file_of{$standard} = $sub_filename; 15805 } 15806 } 15807 else { 15808 if (exists $stricter_to_file_of{$standard}) { 15809 Carp::my_carp("Can't change file registered to $stricter_to_file_of{$standard} to '$sub_filename'."); 15810 } 15811 else { 15812 $stricter_to_file_of{$standard} = $sub_filename; 15813 15814 # Tightly coupled with how Unicode::UCD works, for a 15815 # floating point number that is a whole number, get rid of 15816 # the trailing decimal point and 0's, so that Unicode::UCD 15817 # will work. Also note that this assumes that such a 15818 # number is matched strictly; so if that were to change, 15819 # this would be wrong. 15820 if ((my $integer_name = $alias->name) 15821 =~ s/^ ( -? \d+ ) \.0+ $ /$1/x) 15822 { 15823 $stricter_to_file_of{$property_name . $integer_name} 15824 = $sub_filename; 15825 } 15826 } 15827 } 15828 15829 # For Unicode::UCD, create a mapping of the prop=value to the 15830 # canonical =value for that property. 15831 if ($standard =~ /=/) { 15832 15833 # This could happen if a strict name mapped into an existing 15834 # loose name. In that event, the strict names would have to 15835 # be moved to a new hash. 15836 if (exists($loose_to_standard_value{$standard})) { 15837 Carp::my_carp_bug("'$standard' conflicts with a pre-existing use. Bad News. Continuing anyway"); 15838 } 15839 $loose_to_standard_value{$standard} = $loose_table_name; 15840 } 15841 15842 # Keep a list of the deprecated properties and their filenames 15843 if ($deprecated && $complement == 0) { 15844 $Unicode::UCD::why_deprecated{$sub_filename} = $deprecated; 15845 } 15846 15847 # And a substitute table, if any, for case-insensitive matching 15848 if ($caseless_equivalent != 0) { 15849 $caseless_equivalent_to{$standard} = $caseless_equivalent; 15850 } 15851 15852 # Add to defaults list if the table this alias belongs to is the 15853 # default one 15854 $loose_defaults{$standard} = 1 if $is_default; 15855 } 15856 } 15857 15858 return; 15859} 15860 15861{ # Closure 15862 my %base_names; # Names already used for avoiding DOS 8.3 filesystem 15863 # conflicts 15864 my %full_dir_name_of; # Full length names of directories used. 15865 15866 sub construct_filename($name, $mutable, $directories_ref) { 15867 # Return a file name for a table, based on the table name, but perhaps 15868 # changed to get rid of non-portable characters in it, and to make 15869 # sure that it is unique on a file system that allows the names before 15870 # any period to be at most 8 characters (DOS). While we're at it 15871 # check and complain if there are any directory conflicts. 15872 15873 # $name # The name to start with 15874 # $mutable # Boolean: can it be changed? If no, but 15875 # yet it must be to work properly, a warning 15876 # is given 15877 # $directories_ref # A reference to an array containing the 15878 # path to the file, with each element one path 15879 # component. This is used because the same 15880 # name can be used in different directories. 15881 15882 my $warn = ! defined wantarray; # If true, then if the name is 15883 # changed, a warning is issued as well. 15884 15885 if (! defined $name) { 15886 Carp::my_carp("Undefined name in directory " 15887 . File::Spec->join(@$directories_ref) 15888 . ". '_' used"); 15889 return '_'; 15890 } 15891 15892 # Make sure that no directory names conflict with each other. Look at 15893 # each directory in the input file's path. If it is already in use, 15894 # assume it is correct, and is merely being re-used, but if we 15895 # truncate it to 8 characters, and find that there are two directories 15896 # that are the same for the first 8 characters, but differ after that, 15897 # then that is a problem. 15898 foreach my $directory (@$directories_ref) { 15899 my $short_dir = substr($directory, 0, 8); 15900 if (defined $full_dir_name_of{$short_dir}) { 15901 next if $full_dir_name_of{$short_dir} eq $directory; 15902 Carp::my_carp("Directory $directory conflicts with directory $full_dir_name_of{$short_dir}. Bad News. Continuing anyway"); 15903 } 15904 else { 15905 $full_dir_name_of{$short_dir} = $directory; 15906 } 15907 } 15908 15909 my $path = join '/', @$directories_ref; 15910 $path .= '/' if $path; 15911 15912 # Remove interior underscores. 15913 (my $filename = $name) =~ s/ (?<=.) _ (?=.) //xg; 15914 15915 # Convert the dot in floating point numbers to an underscore 15916 $filename =~ s/\./_/ if $filename =~ / ^ \d+ \. \d+ $ /x; 15917 15918 my $suffix = ""; 15919 15920 # Extract any suffix, delete any non-word character, and truncate to 3 15921 # after the dot 15922 if ($filename =~ m/ ( .*? ) ( \. .* ) /x) { 15923 $filename = $1; 15924 $suffix = $2; 15925 $suffix =~ s/\W+//g; 15926 substr($suffix, 4) = "" if length($suffix) > 4; 15927 } 15928 15929 # Change any non-word character outside the suffix into an underscore, 15930 # and truncate to 8. 15931 $filename =~ s/\W+/_/g; # eg., "L&" -> "L_" 15932 substr($filename, 8) = "" if length($filename) > 8; 15933 15934 # Make sure the basename doesn't conflict with something we 15935 # might have already written. If we have, say, 15936 # InGreekExtended1 15937 # InGreekExtended2 15938 # they become 15939 # InGreekE 15940 # InGreek2 15941 my $warned = 0; 15942 while (my $num = $base_names{$path}{lc "$filename$suffix"}++) { 15943 $num++; # so basenames with numbers start with '2', which 15944 # just looks more natural. 15945 15946 # Want to append $num, but if it'll make the basename longer 15947 # than 8 characters, pre-truncate $filename so that the result 15948 # is acceptable. 15949 my $delta = length($filename) + length($num) - 8; 15950 if ($delta > 0) { 15951 substr($filename, -$delta) = $num; 15952 } 15953 else { 15954 $filename .= $num; 15955 } 15956 if ($warn && ! $warned) { 15957 $warned = 1; 15958 Carp::my_carp("'$path$name' conflicts with another name on a filesystem with 8 significant characters (like DOS). Proceeding anyway."); 15959 } 15960 } 15961 15962 return $filename if $mutable; 15963 15964 # If not changeable, must return the input name, but warn if needed to 15965 # change it beyond shortening it. 15966 if ($name ne $filename 15967 && substr($name, 0, length($filename)) ne $filename) { 15968 Carp::my_carp("'$path$name' had to be changed into '$filename'. Bad News. Proceeding anyway."); 15969 } 15970 return $name; 15971 } 15972} 15973 15974# The pod file contains a very large table. Many of the lines in that table 15975# would exceed a typical output window's size, and so need to be wrapped with 15976# a hanging indent to make them look good. The pod language is really 15977# insufficient here. There is no general construct to do that in pod, so it 15978# is done here by beginning each such line with a space to cause the result to 15979# be output without formatting, and doing all the formatting here. This leads 15980# to the result that if the eventual display window is too narrow it won't 15981# look good, and if the window is too wide, no advantage is taken of that 15982# extra width. A further complication is that the output may be indented by 15983# the formatter so that there is less space than expected. What I (khw) have 15984# done is to assume that that indent is a particular number of spaces based on 15985# what it is in my Linux system; people can always resize their windows if 15986# necessary, but this is obviously less than desirable, but the best that can 15987# be expected. 15988my $automatic_pod_indent = 8; 15989 15990# Try to format so that uses fewest lines, but few long left column entries 15991# slide into the right column. An experiment on 5.1 data yielded the 15992# following percentages that didn't cut into the other side along with the 15993# associated first-column widths 15994# 69% = 24 15995# 80% not too bad except for a few blocks 15996# 90% = 33; # , cuts 353/3053 lines from 37 = 12% 15997# 95% = 37; 15998my $indent_info_column = 27; # 75% of lines didn't have overlap 15999 16000my $FILLER = 3; # Length of initial boiler-plate columns in a pod line 16001 # The 3 is because of: 16002 # 1 for the leading space to tell the pod formatter to 16003 # output as-is 16004 # 1 for the flag 16005 # 1 for the space between the flag and the main data 16006 16007sub format_pod_line($first_column_width, $entry, $info, $status = "", $loose_match = 1 ) { 16008 # Take a pod line and return it, formatted properly 16009 16010 # $entry Contents of left column 16011 # $info Contents of right column 16012 16013 my $flags = ""; 16014 $flags .= $STRICTER if ! $loose_match; 16015 16016 $flags .= $status if $status; 16017 16018 # There is a blank in the left column to cause the pod formatter to 16019 # output the line as-is. 16020 return sprintf " %-*s%-*s %s\n", 16021 # The first * in the format is replaced by this, the -1 is 16022 # to account for the leading blank. There isn't a 16023 # hard-coded blank after this to separate the flags from 16024 # the rest of the line, so that in the unlikely event that 16025 # multiple flags are shown on the same line, they both 16026 # will get displayed at the expense of that separation, 16027 # but since they are left justified, a blank will be 16028 # inserted in the normal case. 16029 $FILLER - 1, 16030 $flags, 16031 16032 # The other * in the format is replaced by this number to 16033 # cause the first main column to right fill with blanks. 16034 # The -1 is for the guaranteed blank following it. 16035 $first_column_width - $FILLER - 1, 16036 $entry, 16037 $info; 16038} 16039 16040my @zero_match_tables; # List of tables that have no matches in this release 16041 16042sub make_re_pod_entries($input_table) { 16043 # This generates the entries for the pod file for a given table. 16044 # Also done at this time are any children tables. The output looks like: 16045 # \p{Common} \p{Script=Common} (Short: \p{Zyyy}) (5178) 16046 16047 # Generate parent and all its children at the same time. 16048 return if $input_table->parent != $input_table; 16049 16050 my $property = $input_table->property; 16051 my $type = $property->type; 16052 my $full_name = $property->full_name; 16053 16054 my $count = $input_table->count; 16055 my $unicode_count; 16056 my $non_unicode_string; 16057 if ($count > $MAX_UNICODE_CODEPOINTS) { 16058 $unicode_count = $count - ($MAX_WORKING_CODEPOINT 16059 - $MAX_UNICODE_CODEPOINT); 16060 $non_unicode_string = " plus all above-Unicode code points"; 16061 } 16062 else { 16063 $unicode_count = $count; 16064 $non_unicode_string = ""; 16065 } 16066 16067 my $string_count = clarify_number($unicode_count) . $non_unicode_string; 16068 16069 my $definition = $input_table->calculate_table_definition; 16070 if ($definition) { 16071 16072 # Save the definition for later use. 16073 $input_table->set_definition($definition); 16074 16075 $definition = ": $definition"; 16076 } 16077 16078 my $status = $input_table->status; 16079 my $status_info = $input_table->status_info; 16080 my $caseless_equivalent = $input_table->caseless_equivalent; 16081 16082 # Don't mention a placeholder equivalent as it isn't to be listed in the 16083 # pod 16084 $caseless_equivalent = 0 if $caseless_equivalent != 0 16085 && $caseless_equivalent->fate > $ORDINARY; 16086 16087 my $entry_for_first_table; # The entry for the first table output. 16088 # Almost certainly, it is the parent. 16089 16090 # For each related table (including itself), we will generate a pod entry 16091 # for each name each table goes by 16092 foreach my $table ($input_table, $input_table->children) { 16093 16094 # Unicode::UCD cannot deal with null string property values, so skip 16095 # any tables that have no non-null names. 16096 next if ! grep { $_->name ne "" } $table->aliases; 16097 16098 # First, gather all the info that applies to this table as a whole. 16099 16100 push @zero_match_tables, $table if $count == 0 16101 # Don't mention special tables 16102 # as being zero length 16103 && $table->fate == $ORDINARY; 16104 16105 my $table_property = $table->property; 16106 16107 # The short name has all the underscores removed, while the full name 16108 # retains them. Later, we decide whether to output a short synonym 16109 # for the full one, we need to compare apples to apples, so we use the 16110 # short name's length including underscores. 16111 my $table_property_short_name_length; 16112 my $table_property_short_name 16113 = $table_property->short_name(\$table_property_short_name_length); 16114 my $table_property_full_name = $table_property->full_name; 16115 16116 # Get how much savings there is in the short name over the full one 16117 # (delta will always be <= 0) 16118 my $table_property_short_delta = $table_property_short_name_length 16119 - length($table_property_full_name); 16120 my @table_description = $table->description; 16121 my @table_note = $table->note; 16122 16123 # Generate an entry for each alias in this table. 16124 my $entry_for_first_alias; # saves the first one encountered. 16125 foreach my $alias ($table->aliases) { 16126 16127 # Skip if not to go in pod. 16128 next unless $alias->make_re_pod_entry; 16129 16130 # Start gathering all the components for the entry 16131 my $name = $alias->name; 16132 16133 # Skip if name is empty, as can't be accessed by regexes. 16134 next if $name eq ""; 16135 16136 my $entry; # Holds the left column, may include extras 16137 my $entry_ref; # To refer to the left column's contents from 16138 # another entry; has no extras 16139 16140 # First the left column of the pod entry. Tables for the $perl 16141 # property always use the single form. 16142 if ($table_property == $perl) { 16143 $entry = "\\p{$name}"; 16144 $entry .= " \\p$name" if length $name == 1; # Show non-braced 16145 # form too 16146 $entry_ref = "\\p{$name}"; 16147 } 16148 else { # Compound form. 16149 16150 # Only generate one entry for all the aliases that mean true 16151 # or false in binary properties. Append a '*' to indicate 16152 # some are missing. (The heading comment notes this.) 16153 my $rhs; 16154 if ($type == $BINARY) { 16155 next if $name ne 'N' && $name ne 'Y'; 16156 $rhs = "$name*"; 16157 } 16158 elsif ($type != $FORCED_BINARY) { 16159 $rhs = $name; 16160 } 16161 else { 16162 16163 # Forced binary properties require special handling. It 16164 # has two sets of tables, one set is true/false; and the 16165 # other set is everything else. Entries are generated for 16166 # each set. Use the Bidi_Mirrored property (which appears 16167 # in all Unicode versions) to get a list of the aliases 16168 # for the true/false tables. Of these, only output the N 16169 # and Y ones, the same as, a regular binary property. And 16170 # output all the rest, same as a non-binary property. 16171 my $bm = property_ref("Bidi_Mirrored"); 16172 if ($name eq 'N' || $name eq 'Y') { 16173 $rhs = "$name*"; 16174 } elsif (grep { $name eq $_->name } $bm->table("Y")->aliases, 16175 $bm->table("N")->aliases) 16176 { 16177 next; 16178 } 16179 else { 16180 $rhs = $name; 16181 } 16182 } 16183 16184 # Colon-space is used to give a little more space to be easier 16185 # to read; 16186 $entry = "\\p{" 16187 . $table_property_full_name 16188 . ": $rhs}"; 16189 16190 # But for the reference to this entry, which will go in the 16191 # right column, where space is at a premium, use equals 16192 # without a space 16193 $entry_ref = "\\p{" . $table_property_full_name . "=$name}"; 16194 } 16195 16196 # Then the right (info) column. This is stored as components of 16197 # an array for the moment, then joined into a string later. For 16198 # non-internal only properties, begin the info with the entry for 16199 # the first table we encountered (if any), as things are ordered 16200 # so that that one is the most descriptive. This leads to the 16201 # info column of an entry being a more descriptive version of the 16202 # name column 16203 my @info; 16204 if ($name =~ /^_/) { 16205 push @info, 16206 '(For internal use by Perl, not necessarily stable)'; 16207 } 16208 elsif ($entry_for_first_alias) { 16209 push @info, $entry_for_first_alias; 16210 } 16211 16212 # If this entry is equivalent to another, add that to the info, 16213 # using the first such table we encountered 16214 if ($entry_for_first_table) { 16215 if (@info) { 16216 push @info, "(= $entry_for_first_table)"; 16217 } 16218 else { 16219 push @info, $entry_for_first_table; 16220 } 16221 } 16222 16223 # If the name is a large integer, add an equivalent with an 16224 # exponent for better readability 16225 if ($name =~ /^[+-]?[\d]+$/ && $name >= 10_000) { 16226 push @info, sprintf "(= %.1e)", $name 16227 } 16228 16229 my $parenthesized = ""; 16230 if (! $entry_for_first_alias) { 16231 16232 # This is the first alias for the current table. The alias 16233 # array is ordered so that this is the fullest, most 16234 # descriptive alias, so it gets the fullest info. The other 16235 # aliases are mostly merely pointers to this one, using the 16236 # information already added above. 16237 16238 # Display any status message, but only on the parent table 16239 if ($status && ! $entry_for_first_table) { 16240 push @info, $status_info; 16241 } 16242 16243 # Put out any descriptive info 16244 if (@table_description || @table_note) { 16245 push @info, join "; ", @table_description, @table_note; 16246 } 16247 16248 # Look to see if there is a shorter name we can point people 16249 # at 16250 my $standard_name = standardize($name); 16251 my $short_name; 16252 my $proposed_short = $table->short_name; 16253 if (defined $proposed_short) { 16254 my $standard_short = standardize($proposed_short); 16255 16256 # If the short name is shorter than the standard one, or 16257 # even if it's not, but the combination of it and its 16258 # short property name (as in \p{prop=short} ($perl doesn't 16259 # have this form)) saves at least two characters, then, 16260 # cause it to be listed as a shorter synonym. 16261 if (length $standard_short < length $standard_name 16262 || ($table_property != $perl 16263 && (length($standard_short) 16264 - length($standard_name) 16265 + $table_property_short_delta) # (<= 0) 16266 < -2)) 16267 { 16268 $short_name = $proposed_short; 16269 if ($table_property != $perl) { 16270 $short_name = $table_property_short_name 16271 . "=$short_name"; 16272 } 16273 $short_name = "\\p{$short_name}"; 16274 } 16275 } 16276 16277 # And if this is a compound form name, see if there is a 16278 # single form equivalent 16279 my $single_form; 16280 if ($table_property != $perl && $table_property != $block) { 16281 16282 # Special case the binary N tables, so that will print 16283 # \P{single}, but use the Y table values to populate 16284 # 'single', as we haven't likewise populated the N table. 16285 # For forced binary tables, we can't just look at the N 16286 # table, but must see if this table is equivalent to the N 16287 # one, as there are two equivalent beasts in these 16288 # properties. 16289 my $test_table; 16290 my $p; 16291 if ( ($type == $BINARY 16292 && $input_table == $property->table('No')) 16293 || ($type == $FORCED_BINARY 16294 && $property->table('No')-> 16295 is_set_equivalent_to($input_table))) 16296 { 16297 $test_table = $property->table('Yes'); 16298 $p = 'P'; 16299 } 16300 else { 16301 $test_table = $input_table; 16302 $p = 'p'; 16303 } 16304 16305 # Look for a single form amongst all the children. 16306 foreach my $table ($test_table->children) { 16307 next if $table->property != $perl; 16308 my $proposed_name = $table->short_name; 16309 next if ! defined $proposed_name; 16310 16311 # Don't mention internal-only properties as a possible 16312 # single form synonym 16313 next if substr($proposed_name, 0, 1) eq '_'; 16314 16315 $proposed_name = "\\$p\{$proposed_name}"; 16316 if (! defined $single_form 16317 || length($proposed_name) < length $single_form) 16318 { 16319 $single_form = $proposed_name; 16320 16321 # The goal here is to find a single form; not the 16322 # shortest possible one. We've already found a 16323 # short name. So, stop at the first single form 16324 # found, which is likely to be closer to the 16325 # original. 16326 last; 16327 } 16328 } 16329 } 16330 16331 # Output both short and single in the same parenthesized 16332 # expression, but with only one of 'Single', 'Short' if there 16333 # are both items. 16334 if ($short_name || $single_form || $table->conflicting) { 16335 $parenthesized .= "Short: $short_name" if $short_name; 16336 if ($short_name && $single_form) { 16337 $parenthesized .= ', '; 16338 } 16339 elsif ($single_form) { 16340 $parenthesized .= 'Single: '; 16341 } 16342 $parenthesized .= $single_form if $single_form; 16343 } 16344 } 16345 16346 if ($caseless_equivalent != 0) { 16347 $parenthesized .= '; ' if $parenthesized ne ""; 16348 $parenthesized .= "/i= " . $caseless_equivalent->complete_name; 16349 } 16350 16351 16352 # Warn if this property isn't the same as one that a 16353 # semi-casual user might expect. The other components of this 16354 # parenthesized structure are calculated only for the first entry 16355 # for this table, but the conflicting is deemed important enough 16356 # to go on every entry. 16357 my $conflicting = join " NOR ", $table->conflicting; 16358 if ($conflicting) { 16359 $parenthesized .= '; ' if $parenthesized ne ""; 16360 $parenthesized .= "NOT $conflicting"; 16361 } 16362 16363 push @info, "($parenthesized)" if $parenthesized; 16364 16365 if ($name =~ /_$/ && $alias->loose_match) { 16366 push @info, "Note the trailing '_' matters in spite of loose matching rules."; 16367 } 16368 16369 if ($table_property != $perl && $table->perl_extension) { 16370 push @info, '(Perl extension)'; 16371 } 16372 my $definition = $table->definition // ""; 16373 $definition = "" if $entry_for_first_alias; 16374 $definition = ": $definition" if $definition; 16375 push @info, "($string_count$definition)"; 16376 16377 # Now, we have both the entry and info so add them to the 16378 # list of all the properties. 16379 push @match_properties, 16380 format_pod_line($indent_info_column, 16381 $entry, 16382 join( " ", @info), 16383 $alias->status, 16384 $alias->loose_match); 16385 16386 $entry_for_first_alias = $entry_ref unless $entry_for_first_alias; 16387 } # End of looping through the aliases for this table. 16388 16389 if (! $entry_for_first_table) { 16390 $entry_for_first_table = $entry_for_first_alias; 16391 } 16392 } # End of looping through all the related tables 16393 return; 16394} 16395 16396sub make_ucd_table_pod_entries($table) { 16397 # Generate the entries for the UCD section of the pod for $table. This 16398 # also calculates if names are ambiguous, so has to be called even if the 16399 # pod is not being output 16400 16401 my $short_name = $table->name; 16402 my $standard_short_name = standardize($short_name); 16403 my $full_name = $table->full_name; 16404 my $standard_full_name = standardize($full_name); 16405 16406 my $full_info = ""; # Text of info column for full-name entries 16407 my $other_info = ""; # Text of info column for short-name entries 16408 my $short_info = ""; # Text of info column for other entries 16409 my $meaning = ""; # Synonym of this table 16410 16411 my $property = ($table->isa('Property')) 16412 ? $table 16413 : $table->parent->property; 16414 16415 my $perl_extension = $table->perl_extension; 16416 my $is_perl_extension_match_table_but_not_dollar_perl 16417 = $property != $perl 16418 && $perl_extension 16419 && $property != $table; 16420 16421 # Get the more official name for perl extensions that aren't 16422 # stand-alone properties 16423 if ($is_perl_extension_match_table_but_not_dollar_perl) { 16424 if ($property->type == $BINARY) { 16425 $meaning = $property->full_name; 16426 } 16427 else { 16428 $meaning = $table->parent->complete_name; 16429 } 16430 } 16431 16432 # There are three types of info column. One for the short name, one for 16433 # the full name, and one for everything else. They mostly are the same, 16434 # so initialize in the same loop. 16435 16436 foreach my $info_ref (\$full_info, \$short_info, \$other_info) { 16437 if ($info_ref != \$full_info) { 16438 16439 # The non-full name columns include the full name 16440 $$info_ref .= $full_name; 16441 } 16442 16443 16444 if ($is_perl_extension_match_table_but_not_dollar_perl) { 16445 16446 # Add the synonymous name for the non-full name entries; and to 16447 # the full-name entry if it adds extra information 16448 if ( standardize($meaning) ne $standard_full_name 16449 || $info_ref == \$other_info 16450 || $info_ref == \$short_info) 16451 { 16452 my $parenthesized = $info_ref != \$full_info; 16453 $$info_ref .= " " if $$info_ref && $parenthesized; 16454 $$info_ref .= "(=" if $parenthesized; 16455 $$info_ref .= "$meaning"; 16456 $$info_ref .= ")" if $parenthesized; 16457 $$info_ref .= "."; 16458 } 16459 } 16460 16461 # And the full-name entry includes the short name, if shorter 16462 if ($info_ref == \$full_info 16463 && length $standard_short_name < length $standard_full_name) 16464 { 16465 $full_info =~ s/\.\Z//; 16466 $full_info .= " " if $full_info; 16467 $full_info .= "(Short: $short_name)"; 16468 } 16469 16470 if ($table->perl_extension) { 16471 $$info_ref =~ s/\.\Z//; 16472 $$info_ref .= ". " if $$info_ref; 16473 $$info_ref .= "(Perl extension)"; 16474 } 16475 } 16476 16477 my $definition; 16478 my $definition_table; 16479 my $type = $table->property->type; 16480 if ($type == $BINARY || $type == $FORCED_BINARY) { 16481 $definition_table = $table->property->table('Y'); 16482 } 16483 elsif ($table->isa('Match_Table')) { 16484 $definition_table = $table; 16485 } 16486 16487 $definition = $definition_table->calculate_table_definition 16488 if defined $definition_table 16489 && $definition_table != 0; 16490 16491 # Add any extra annotations to the full name entry 16492 foreach my $more_info ($table->description, 16493 $definition, 16494 $table->note, 16495 $table->status_info) 16496 { 16497 next unless $more_info; 16498 $full_info =~ s/\.\Z//; 16499 $full_info .= ". " if $full_info; 16500 $full_info .= $more_info; 16501 } 16502 if ($table->property->type == $FORCED_BINARY) { 16503 if ($full_info) { 16504 $full_info =~ s/\.\Z//; 16505 $full_info .= ". "; 16506 } 16507 $full_info .= "This is a combination property which has both:" 16508 . " 1) a map to various string values; and" 16509 . " 2) a map to boolean Y/N, where 'Y' means the" 16510 . " string value is non-empty. Add the prefix 'is'" 16511 . " to the prop_invmap() call to get the latter"; 16512 } 16513 16514 # These keep track if have created full and short name pod entries for the 16515 # property 16516 my $done_full = 0; 16517 my $done_short = 0; 16518 16519 # Every possible name is kept track of, even those that aren't going to be 16520 # output. This way we can be sure to find the ambiguities. 16521 foreach my $alias ($table->aliases) { 16522 my $name = $alias->name; 16523 my $standard = standardize($name); 16524 my $info; 16525 my $output_this = $alias->ucd; 16526 16527 # If the full and short names are the same, we want to output the full 16528 # one's entry, so it has priority. 16529 if ($standard eq $standard_full_name) { 16530 next if $done_full; 16531 $done_full = 1; 16532 $info = $full_info; 16533 } 16534 elsif ($standard eq $standard_short_name) { 16535 next if $done_short; 16536 $done_short = 1; 16537 next if $standard_short_name eq $standard_full_name; 16538 $info = $short_info; 16539 } 16540 else { 16541 $info = $other_info; 16542 } 16543 16544 $combination_property{$standard} = 1 16545 if $table->property->type == $FORCED_BINARY; 16546 16547 # Here, we have set up the two columns for this entry. But if an 16548 # entry already exists for this name, we have to decide which one 16549 # we're going to later output. 16550 if (exists $ucd_pod{$standard}) { 16551 16552 # If the two entries refer to the same property, it's not going to 16553 # be ambiguous. (Likely it's because the names when standardized 16554 # are the same.) But that means if they are different properties, 16555 # there is ambiguity. 16556 if ($ucd_pod{$standard}->{'property'} != $property) { 16557 16558 # Here, we have an ambiguity. This code assumes that one is 16559 # scheduled to be output and one not and that one is a perl 16560 # extension (which is not to be output) and the other isn't. 16561 # If those assumptions are wrong, things have to be rethought. 16562 if ($ucd_pod{$standard}{'output_this'} == $output_this 16563 || $ucd_pod{$standard}{'perl_extension'} == $perl_extension 16564 || $output_this == $perl_extension) 16565 { 16566 Carp::my_carp("Bad news. $property and $ucd_pod{$standard}->{'property'} have unexpected output status and perl-extension combinations. Proceeding anyway."); 16567 } 16568 16569 # We modify the info column of the one being output to 16570 # indicate the ambiguity. Set $which to point to that one's 16571 # info. 16572 my $which; 16573 if ($ucd_pod{$standard}{'output_this'}) { 16574 $which = \$ucd_pod{$standard}->{'info'}; 16575 } 16576 else { 16577 $which = \$info; 16578 $meaning = $ucd_pod{$standard}{'meaning'}; 16579 } 16580 16581 chomp $$which; 16582 $$which =~ s/\.\Z//; 16583 $$which .= "; NOT '$standard' meaning '$meaning'"; 16584 16585 $ambiguous_names{$standard} = 1; 16586 } 16587 16588 # Use the non-perl-extension variant 16589 next unless $ucd_pod{$standard}{'perl_extension'}; 16590 } 16591 16592 # Store enough information about this entry that we can later look for 16593 # ambiguities, and output it properly. 16594 $ucd_pod{$standard} = { 'name' => $name, 16595 'info' => $info, 16596 'meaning' => $meaning, 16597 'output_this' => $output_this, 16598 'perl_extension' => $perl_extension, 16599 'property' => $property, 16600 'status' => $alias->status, 16601 }; 16602 } # End of looping through all this table's aliases 16603 16604 return; 16605} 16606 16607sub pod_alphanumeric_sort { 16608 # Sort pod entries alphanumerically. 16609 16610 # The first few character columns are filler, plus the '\p{'; and get rid 16611 # of all the trailing stuff, starting with the trailing '}', so as to sort 16612 # on just 'Name=Value' 16613 (my $a = lc $a) =~ s/^ .*? \{ //x; 16614 $a =~ s/}.*//; 16615 (my $b = lc $b) =~ s/^ .*? \{ //x; 16616 $b =~ s/}.*//; 16617 16618 # Determine if the two operands are both internal only or both not. 16619 # Character 0 should be a '\'; 1 should be a p; 2 should be '{', so 3 16620 # should be the underscore that begins internal only 16621 my $a_is_internal = (substr($a, 0, 1) eq '_'); 16622 my $b_is_internal = (substr($b, 0, 1) eq '_'); 16623 16624 # Sort so the internals come last in the table instead of first (which the 16625 # leading underscore would otherwise indicate). 16626 if ($a_is_internal != $b_is_internal) { 16627 return 1 if $a_is_internal; 16628 return -1 16629 } 16630 16631 # Determine if the two operands are compound or not, and if so if are 16632 # "numeric" property values or not, like \p{Age: 3.0}. But there are also 16633 # things like \p{Canonical_Combining_Class: CCC133} and \p{Age: V10_0}, 16634 # all of which this considers numeric, and for sorting, looks just at the 16635 # numeric parts. It can also be a rational like \p{Numeric Value=-1/2}. 16636 my $split_re = qr/ 16637 ^ ( [^:=]+ ) # $1 is undef if not a compound form, otherwise is the 16638 # property name 16639 [:=] \s* # The syntax for the compound form 16640 (?: # followed by ... 16641 ( # $2 gets defined if what follows is a "numeric" 16642 # expression, which is ... 16643 ( -? \d+ (?: [.\/] \d+)? # An integer, float, or rational 16644 # number, optionally signed 16645 | [[:alpha:]]{2,} \d+ $ ) # or something like CCC131. Either 16646 # of these go into $3 16647 | ( V \d+ _ \d+ ) # or a Unicode's Age property version 16648 # number, into $4 16649 ) 16650 | .* $ # If not "numeric", accept anything so that $1 gets 16651 # defined if it is any compound form 16652 ) /ix; 16653 my ($a_initial, $a_numeric, $a_number, $a_version) = ($a =~ $split_re); 16654 my ($b_initial, $b_numeric, $b_number, $b_version) = ($b =~ $split_re); 16655 16656 # Sort alphabeticlly on the whole property name if either operand isn't 16657 # compound, or they differ. 16658 return $a cmp $b if ! defined $a_initial 16659 || ! defined $b_initial 16660 || $a_initial ne $b_initial; 16661 16662 if (! defined $a_numeric) { 16663 16664 # If neither is numeric, use alpha sort 16665 return $a cmp $b if ! defined $b_numeric; 16666 return 1; # Sort numeric ahead of alpha 16667 } 16668 16669 # Here $a is numeric 16670 return -1 if ! defined $b_numeric; # Numeric sorts before alpha 16671 16672 # Here they are both numeric in the same property. 16673 # Convert version numbers into regular numbers 16674 if (defined $a_version) { 16675 ($a_number = $a_version) =~ s/^V//i; 16676 $a_number =~ s/_/./; 16677 } 16678 else { # Otherwise get rid of the, e.g., CCC in CCC9 */ 16679 $a_number =~ s/ ^ [[:alpha:]]+ //x; 16680 } 16681 if (defined $b_version) { 16682 ($b_number = $b_version) =~ s/^V//i; 16683 $b_number =~ s/_/./; 16684 } 16685 else { 16686 $b_number =~ s/ ^ [[:alpha:]]+ //x; 16687 } 16688 16689 # Convert rationals to floating for the comparison. 16690 $a_number = eval $a_number if $a_number =~ qr{/}; 16691 $b_number = eval $b_number if $b_number =~ qr{/}; 16692 16693 return $a_number <=> $b_number || $a cmp $b; 16694} 16695 16696sub make_pod () { 16697 # Create the .pod file. This generates the various subsections and then 16698 # combines them in one big HERE document. 16699 16700 my $Is_flags_text = "If an entry has flag(s) at its beginning, like \"$DEPRECATED\", the \"Is_\" form has the same flag(s)"; 16701 16702 return unless defined $pod_directory; 16703 print "Making pod file\n" if $verbosity >= $PROGRESS; 16704 16705 my $exception_message = 16706 '(Any exceptions are individually noted beginning with the word NOT.)'; 16707 my @block_warning; 16708 if (-e 'Blocks.txt') { 16709 16710 # Add the line: '\p{In_*} \p{Block: *}', with the warning message 16711 # if the global $has_In_conflicts indicates we have them. 16712 push @match_properties, format_pod_line($indent_info_column, 16713 '\p{In_*}', 16714 '\p{Block: *}' 16715 . (($has_In_conflicts) 16716 ? " $exception_message" 16717 : ""), 16718 $DISCOURAGED); 16719 @block_warning = << "END"; 16720 16721In particular, matches in the Block property have single forms 16722defined by Perl that begin with C<"In_">, C<"Is_>, or even with no prefix at 16723all, Like all B<DISCOURAGED> forms, these are not stable. For example, 16724C<\\p{Block=Deseret}> can currently be written as C<\\p{In_Deseret}>, 16725C<\\p{Is_Deseret}>, or C<\\p{Deseret}>. But, a new Unicode version may 16726come along that would force Perl to change the meaning of one or more of 16727these, and your program would no longer be correct. Currently there are no 16728such conflicts with the form that begins C<"In_">, but there are many with the 16729other two shortcuts, and Unicode continues to define new properties that begin 16730with C<"In">, so it's quite possible that a conflict will occur in the future. 16731The compound form is guaranteed to not become obsolete, and its meaning is 16732clearer anyway. See L<perlunicode/"Blocks"> for more information about this. 16733 16734User-defined properties must begin with "In" or "Is". These override any 16735Unicode property of the same name. 16736END 16737 } 16738 my $text = $Is_flags_text; 16739 $text = "$exception_message $text" if $has_Is_conflicts; 16740 16741 # And the 'Is_ line'; 16742 push @match_properties, format_pod_line($indent_info_column, 16743 '\p{Is_*}', 16744 "\\p{*} $text"); 16745 push @match_properties, format_pod_line($indent_info_column, 16746 '\p{Name=*}', 16747 "Combination of Name and Name_Alias properties; has special" 16748 . " loose matching rules, for which see Unicode UAX #44"); 16749 push @match_properties, format_pod_line($indent_info_column, 16750 '\p{Na=*}', 16751 '\p{Name=*}'); 16752 16753 # Sort the properties array for output. It is sorted alphabetically 16754 # except numerically for numeric properties, and only output unique lines. 16755 @match_properties = sort pod_alphanumeric_sort uniques @match_properties; 16756 16757 my $formatted_properties = simple_fold(\@match_properties, 16758 "", 16759 # indent succeeding lines by two extra 16760 # which looks better 16761 $indent_info_column + 2, 16762 16763 # shorten the line length by how much 16764 # the formatter indents, so the folded 16765 # line will fit in the space 16766 # presumably available 16767 $automatic_pod_indent); 16768 # Add column headings, indented to be a little more centered, but not 16769 # exactly 16770 $formatted_properties = format_pod_line($indent_info_column, 16771 ' NAME', 16772 ' INFO') 16773 . "\n" 16774 . $formatted_properties; 16775 16776 # Generate pod documentation lines for the tables that match nothing 16777 my $zero_matches = ""; 16778 if (@zero_match_tables) { 16779 @zero_match_tables = uniques(@zero_match_tables); 16780 $zero_matches = join "\n\n", 16781 map { $_ = '=item \p{' . $_->complete_name . "}" } 16782 sort { $a->complete_name cmp $b->complete_name } 16783 @zero_match_tables; 16784 16785 $zero_matches = <<END; 16786 16787=head2 Legal C<\\p{}> and C<\\P{}> constructs that match no characters 16788 16789Unicode has some property-value pairs that currently don't match anything. 16790This happens generally either because they are obsolete, or they exist for 16791symmetry with other forms, but no language has yet been encoded that uses 16792them. In this version of Unicode, the following match zero code points: 16793 16794=over 4 16795 16796$zero_matches 16797 16798=back 16799 16800END 16801 } 16802 16803 # Generate list of properties that we don't accept, grouped by the reasons 16804 # why. This is so only put out the 'why' once, and then list all the 16805 # properties that have that reason under it. 16806 16807 my %why_list; # The keys are the reasons; the values are lists of 16808 # properties that have the key as their reason 16809 16810 # For each property, add it to the list that are suppressed for its reason 16811 # The sort will cause the alphabetically first properties to be added to 16812 # each list first, so each list will be sorted. 16813 foreach my $property (sort keys %why_suppressed) { 16814 next unless $why_suppressed{$property}; 16815 push @{$why_list{$why_suppressed{$property}}}, $property; 16816 } 16817 16818 # For each reason (sorted by the first property that has that reason)... 16819 my @bad_re_properties; 16820 foreach my $why (sort { $why_list{$a}->[0] cmp $why_list{$b}->[0] } 16821 keys %why_list) 16822 { 16823 # Add to the output, all the properties that have that reason. 16824 my $has_item = 0; # Flag if actually output anything. 16825 foreach my $name (@{$why_list{$why}}) { 16826 16827 # Split compound names into $property and $table components 16828 my $property = $name; 16829 my $table; 16830 if ($property =~ / (.*) = (.*) /x) { 16831 $property = $1; 16832 $table = $2; 16833 } 16834 16835 # This release of Unicode may not have a property that is 16836 # suppressed, so don't reference a non-existent one. 16837 $property = property_ref($property); 16838 next if ! defined $property; 16839 16840 # And since this list is only for match tables, don't list the 16841 # ones that don't have match tables. 16842 next if ! $property->to_create_match_tables; 16843 16844 # Find any abbreviation, and turn it into a compound name if this 16845 # is a property=value pair. 16846 my $short_name = $property->name; 16847 $short_name .= '=' . $property->table($table)->name if $table; 16848 16849 # Start with an empty line. 16850 push @bad_re_properties, "\n\n" unless $has_item; 16851 16852 # And add the property as an item for the reason. 16853 push @bad_re_properties, "\n=item I<$name> ($short_name)\n"; 16854 $has_item = 1; 16855 } 16856 16857 # And add the reason under the list of properties, if such a list 16858 # actually got generated. Note that the header got added 16859 # unconditionally before. But pod ignores extra blank lines, so no 16860 # harm. 16861 push @bad_re_properties, "\n$why\n" if $has_item; 16862 16863 } # End of looping through each reason. 16864 16865 if (! @bad_re_properties) { 16866 push @bad_re_properties, 16867 "*** This installation accepts ALL non-Unihan properties ***"; 16868 } 16869 else { 16870 # Add =over only if non-empty to avoid an empty =over/=back section, 16871 # which is considered bad form. 16872 unshift @bad_re_properties, "\n=over 4\n"; 16873 push @bad_re_properties, "\n=back\n"; 16874 } 16875 16876 # Similarly, generate a list of files that we don't use, grouped by the 16877 # reasons why (Don't output if the reason is empty). First, create a hash 16878 # whose keys are the reasons, and whose values are anonymous arrays of all 16879 # the files that share that reason. 16880 my %grouped_by_reason; 16881 foreach my $file (keys %skipped_files) { 16882 next unless $skipped_files{$file}; 16883 push @{$grouped_by_reason{$skipped_files{$file}}}, $file; 16884 } 16885 16886 # Then, sort each group. 16887 foreach my $group (keys %grouped_by_reason) { 16888 @{$grouped_by_reason{$group}} = sort { lc $a cmp lc $b } 16889 @{$grouped_by_reason{$group}} ; 16890 } 16891 16892 # Finally, create the output text. For each reason (sorted by the 16893 # alphabetically first file that has that reason)... 16894 my @unused_files; 16895 foreach my $reason (sort { lc $grouped_by_reason{$a}->[0] 16896 cmp lc $grouped_by_reason{$b}->[0] 16897 } 16898 keys %grouped_by_reason) 16899 { 16900 # Add all the files that have that reason to the output. Start 16901 # with an empty line. 16902 push @unused_files, "\n\n"; 16903 push @unused_files, map { "\n=item F<$_> \n" } 16904 @{$grouped_by_reason{$reason}}; 16905 # And add the reason under the list of files 16906 push @unused_files, "\n$reason\n"; 16907 } 16908 16909 # Similarly, create the output text for the UCD section of the pod 16910 my @ucd_pod; 16911 foreach my $key (keys %ucd_pod) { 16912 next unless $ucd_pod{$key}->{'output_this'}; 16913 push @ucd_pod, format_pod_line($indent_info_column, 16914 $ucd_pod{$key}->{'name'}, 16915 $ucd_pod{$key}->{'info'}, 16916 $ucd_pod{$key}->{'status'}, 16917 ); 16918 } 16919 16920 # Sort alphabetically, and fold for output 16921 @ucd_pod = sort { lc substr($a, 2) cmp lc substr($b, 2) } @ucd_pod; 16922 my $ucd_pod = simple_fold(\@ucd_pod, 16923 ' ', 16924 $indent_info_column, 16925 $automatic_pod_indent); 16926 $ucd_pod = format_pod_line($indent_info_column, 'NAME', ' INFO') 16927 . "\n" 16928 . $ucd_pod; 16929 my $space_hex = sprintf("%02x", ord " "); 16930 local $" = ""; 16931 16932 # Everything is ready to assemble. 16933 my @OUT = << "END"; 16934=begin comment 16935 16936$HEADER 16937 16938To change this file, edit $0 instead. 16939 16940=end comment 16941 16942=head1 NAME 16943 16944$pod_file - Index of Unicode Version $unicode_version character properties in Perl 16945 16946=head1 DESCRIPTION 16947 16948This document provides information about the portion of the Unicode database 16949that deals with character properties, that is the portion that is defined on 16950single code points. (L</Other information in the Unicode data base> 16951below briefly mentions other data that Unicode provides.) 16952 16953Perl can provide access to all non-provisional Unicode character properties, 16954though not all are enabled by default. The omitted ones are the Unihan 16955properties (accessible via the CPAN module L<Unicode::Unihan>) and certain 16956deprecated or Unicode-internal properties. (An installation may choose to 16957recompile Perl's tables to change this. See L</Unicode character 16958properties that are NOT accepted by Perl>.) 16959 16960For most purposes, access to Unicode properties from the Perl core is through 16961regular expression matches, as described in the next section. 16962For some special purposes, and to access the properties that are not suitable 16963for regular expression matching, all the Unicode character properties that 16964Perl handles are accessible via the standard L<Unicode::UCD> module, as 16965described in the section L</Properties accessible through Unicode::UCD>. 16966 16967Perl also provides some additional extensions and short-cut synonyms 16968for Unicode properties. 16969 16970This document merely lists all available properties and does not attempt to 16971explain what each property really means. There is a brief description of each 16972Perl extension; see L<perlunicode/Other Properties> for more information on 16973these. There is some detail about Blocks, Scripts, General_Category, 16974and Bidi_Class in L<perlunicode>, but to find out about the intricacies of the 16975official Unicode properties, refer to the Unicode standard. A good starting 16976place is L<$unicode_reference_url>. 16977 16978Note that you can define your own properties; see 16979L<perlunicode/"User-Defined Character Properties">. 16980 16981=head1 Properties accessible through C<\\p{}> and C<\\P{}> 16982 16983The Perl regular expression C<\\p{}> and C<\\P{}> constructs give access to 16984most of the Unicode character properties. The table below shows all these 16985constructs, both single and compound forms. 16986 16987B<Compound forms> consist of two components, separated by an equals sign or a 16988colon. The first component is the property name, and the second component is 16989the particular value of the property to match against, for example, 16990C<\\p{Script_Extensions: Greek}> and C<\\p{Script_Extensions=Greek}> both mean 16991to match characters whose Script_Extensions property value is Greek. 16992(C<Script_Extensions> is an improved version of the C<Script> property.) 16993 16994B<Single forms>, like C<\\p{Greek}>, are mostly Perl-defined shortcuts for 16995their equivalent compound forms. The table shows these equivalences. (In our 16996example, C<\\p{Greek}> is a just a shortcut for 16997C<\\p{Script_Extensions=Greek}>). There are also a few Perl-defined single 16998forms that are not shortcuts for a compound form. One such is C<\\p{Word}>. 16999These are also listed in the table. 17000 17001In parsing these constructs, Perl always ignores Upper/lower case differences 17002everywhere within the {braces}. Thus C<\\p{Greek}> means the same thing as 17003C<\\p{greek}>. But note that changing the case of the C<"p"> or C<"P"> before 17004the left brace completely changes the meaning of the construct, from "match" 17005(for C<\\p{}>) to "doesn't match" (for C<\\P{}>). Casing in this document is 17006for improved legibility. 17007 17008Also, white space, hyphens, and underscores are normally ignored 17009everywhere between the {braces}, and hence can be freely added or removed 17010even if the C</x> modifier hasn't been specified on the regular expression. 17011But in the table below $a_bold_stricter at the beginning of an entry 17012means that tighter (stricter) rules are used for that entry: 17013 17014=over 4 17015 17016=over 4 17017 17018=item Single form (C<\\p{name}>) tighter rules: 17019 17020White space, hyphens, and underscores ARE significant 17021except for: 17022 17023=over 4 17024 17025=item * white space adjacent to a non-word character 17026 17027=item * underscores separating digits in numbers 17028 17029=back 17030 17031That means, for example, that you can freely add or remove white space 17032adjacent to (but within) the braces without affecting the meaning. 17033 17034=item Compound form (C<\\p{name=value}> or C<\\p{name:value}>) tighter rules: 17035 17036The tighter rules given above for the single form apply to everything to the 17037right of the colon or equals; the looser rules still apply to everything to 17038the left. 17039 17040That means, for example, that you can freely add or remove white space 17041adjacent to (but within) the braces and the colon or equal sign. 17042 17043=back 17044 17045=back 17046 17047Some properties are considered obsolete by Unicode, but still available. 17048There are several varieties of obsolescence: 17049 17050=over 4 17051 17052=over 4 17053 17054=item Stabilized 17055 17056A property may be stabilized. Such a determination does not indicate 17057that the property should or should not be used; instead it is a declaration 17058that the property will not be maintained nor extended for newly encoded 17059characters. Such properties are marked with $a_bold_stabilized in the 17060table. 17061 17062=item Deprecated 17063 17064A property may be deprecated, perhaps because its original intent 17065has been replaced by another property, or because its specification was 17066somehow defective. This means that its use is strongly 17067discouraged, so much so that a warning will be issued if used, unless the 17068regular expression is in the scope of a C<S<no warnings 'deprecated'>> 17069statement. $A_bold_deprecated flags each such entry in the table, and 17070the entry there for the longest, most descriptive version of the property will 17071give the reason it is deprecated, and perhaps advice. Perl may issue such a 17072warning, even for properties that aren't officially deprecated by Unicode, 17073when there used to be characters or code points that were matched by them, but 17074no longer. This is to warn you that your program may not work like it did on 17075earlier Unicode releases. 17076 17077A deprecated property may be made unavailable in a future Perl version, so it 17078is best to move away from them. 17079 17080A deprecated property may also be stabilized, but this fact is not shown. 17081 17082=item Obsolete 17083 17084Properties marked with $a_bold_obsolete in the table are considered (plain) 17085obsolete. Generally this designation is given to properties that Unicode once 17086used for internal purposes (but not any longer). 17087 17088=item Discouraged 17089 17090This is not actually a Unicode-specified obsolescence, but applies to certain 17091Perl extensions that are present for backwards compatibility, but are 17092discouraged from being used. These are not obsolete, but their meanings are 17093not stable. Future Unicode versions could force any of these extensions to be 17094removed without warning, replaced by another property with the same name that 17095means something different. $A_bold_discouraged flags each such entry in the 17096table. Use the equivalent shown instead. 17097 17098@block_warning 17099 17100=back 17101 17102=back 17103 17104The table below has two columns. The left column contains the C<\\p{}> 17105constructs to look up, possibly preceded by the flags mentioned above; and 17106the right column contains information about them, like a description, or 17107synonyms. The table shows both the single and compound forms for each 17108property that has them. If the left column is a short name for a property, 17109the right column will give its longer, more descriptive name; and if the left 17110column is the longest name, the right column will show any equivalent shortest 17111name, in both single and compound forms if applicable. 17112 17113If braces are not needed to specify a property (e.g., C<\\pL>), the left 17114column contains both forms, with and without braces. 17115 17116The right column will also caution you if a property means something different 17117than what might normally be expected. 17118 17119All single forms are Perl extensions; a few compound forms are as well, and 17120are noted as such. 17121 17122Numbers in (parentheses) indicate the total number of Unicode code points 17123matched by the property. For the entries that give the longest, most 17124descriptive version of the property, the count is followed by a list of some 17125of the code points matched by it. The list includes all the matched 17126characters in the 0-255 range, enclosed in the familiar [brackets] the same as 17127a regular expression bracketed character class. Following that, the next few 17128higher matching ranges are also given. To avoid visual ambiguity, the SPACE 17129character is represented as C<\\x$space_hex>. 17130 17131For emphasis, those properties that match no code points at all are listed as 17132well in a separate section following the table. 17133 17134Most properties match the same code points regardless of whether C<"/i"> 17135case-insensitive matching is specified or not. But a few properties are 17136affected. These are shown with the notation S<C<(/i= I<other_property>)>> 17137in the second column. Under case-insensitive matching they match the 17138same code pode points as the property I<other_property>. 17139 17140There is no description given for most non-Perl defined properties (See 17141L<$unicode_reference_url> for that). 17142 17143For compactness, 'B<*>' is used as a wildcard instead of showing all possible 17144combinations. For example, entries like: 17145 17146 \\p{Gc: *} \\p{General_Category: *} 17147 17148mean that 'Gc' is a synonym for 'General_Category', and anything that is valid 17149for the latter is also valid for the former. Similarly, 17150 17151 \\p{Is_*} \\p{*} 17152 17153means that if and only if, for example, C<\\p{Foo}> exists, then 17154C<\\p{Is_Foo}> and C<\\p{IsFoo}> are also valid and all mean the same thing. 17155And similarly, C<\\p{Foo=Bar}> means the same as C<\\p{Is_Foo=Bar}> and 17156C<\\p{IsFoo=Bar}>. "*" here is restricted to something not beginning with an 17157underscore. 17158 17159Also, in binary properties, 'Yes', 'T', and 'True' are all synonyms for 'Y'. 17160And 'No', 'F', and 'False' are all synonyms for 'N'. The table shows 'Y*' and 17161'N*' to indicate this, and doesn't have separate entries for the other 17162possibilities. Note that not all properties which have values 'Yes' and 'No' 17163are binary, and they have all their values spelled out without using this wild 17164card, and a C<NOT> clause in their description that highlights their not being 17165binary. These also require the compound form to match them, whereas true 17166binary properties have both single and compound forms available. 17167 17168Note that all non-essential underscores are removed in the display of the 17169short names below. 17170 17171B<Legend summary:> 17172 17173=over 4 17174 17175=item Z<>B<*> is a wild-card 17176 17177=item B<(\\d+)> in the info column gives the number of Unicode code points matched 17178by this property. 17179 17180=item B<$DEPRECATED> means this is deprecated. 17181 17182=item B<$OBSOLETE> means this is obsolete. 17183 17184=item B<$STABILIZED> means this is stabilized. 17185 17186=item B<$STRICTER> means tighter (stricter) name matching applies. 17187 17188=item B<$DISCOURAGED> means use of this form is discouraged, and may not be 17189stable. 17190 17191=back 17192 17193$formatted_properties 17194 17195$zero_matches 17196 17197=head1 Properties accessible through Unicode::UCD 17198 17199The value of any Unicode (not including Perl extensions) character 17200property mentioned above for any single code point is available through 17201L<Unicode::UCD/charprop()>. L<Unicode::UCD/charprops_all()> returns the 17202values of all the Unicode properties for a given code point. 17203 17204Besides these, all the Unicode character properties mentioned above 17205(except for those marked as for internal use by Perl) are also 17206accessible by L<Unicode::UCD/prop_invlist()>. 17207 17208Due to their nature, not all Unicode character properties are suitable for 17209regular expression matches, nor C<prop_invlist()>. The remaining 17210non-provisional, non-internal ones are accessible via 17211L<Unicode::UCD/prop_invmap()> (except for those that this Perl installation 17212hasn't included; see L<below for which those are|/Unicode character properties 17213that are NOT accepted by Perl>). 17214 17215For compatibility with other parts of Perl, all the single forms given in the 17216table in the L<section above|/Properties accessible through \\p{} and \\P{}> 17217are recognized. BUT, there are some ambiguities between some Perl extensions 17218and the Unicode properties, all of which are silently resolved in favor of the 17219official Unicode property. To avoid surprises, you should only use 17220C<prop_invmap()> for forms listed in the table below, which omits the 17221non-recommended ones. The affected forms are the Perl single form equivalents 17222of Unicode properties, such as C<\\p{sc}> being a single-form equivalent of 17223C<\\p{gc=sc}>, which is treated by C<prop_invmap()> as the C<Script> property, 17224whose short name is C<sc>. The table indicates the current ambiguities in the 17225INFO column, beginning with the word C<"NOT">. 17226 17227The standard Unicode properties listed below are documented in 17228L<$unicode_reference_url>; Perl_Decimal_Digit is documented in 17229L<Unicode::UCD/prop_invmap()>. The other Perl extensions are in 17230L<perlunicode/Other Properties>; 17231 17232The first column in the table is a name for the property; the second column is 17233an alternative name, if any, plus possibly some annotations. The alternative 17234name is the property's full name, unless that would simply repeat the first 17235column, in which case the second column indicates the property's short name 17236(if different). The annotations are given only in the entry for the full 17237name. The annotations for binary properties include a list of the first few 17238ranges that the property matches. To avoid any ambiguity, the SPACE character 17239is represented as C<\\x$space_hex>. 17240 17241If a property is obsolete, etc, the entry will be flagged with the same 17242characters used in the table in the L<section above|/Properties accessible 17243through \\p{} and \\P{}>, like B<$DEPRECATED> or B<$STABILIZED>. 17244 17245$ucd_pod 17246 17247=head1 Properties accessible through other means 17248 17249Certain properties are accessible also via core function calls. These are: 17250 17251 Lowercase_Mapping lc() and lcfirst() 17252 Titlecase_Mapping ucfirst() 17253 Uppercase_Mapping uc() 17254 17255Also, Case_Folding is accessible through the C</i> modifier in regular 17256expressions, the C<\\F> transliteration escape, and the C<L<fc|perlfunc/fc>> 17257operator. 17258 17259Besides being able to say C<\\p{Name=...}>, the Name and Name_Aliases 17260properties are accessible through the C<\\N{}> interpolation in double-quoted 17261strings and regular expressions; and functions C<charnames::viacode()>, 17262C<charnames::vianame()>, and C<charnames::string_vianame()> (which require a 17263C<use charnames ();> to be specified. 17264 17265Finally, most properties related to decomposition are accessible via 17266L<Unicode::Normalize>. 17267 17268=head1 Unicode character properties that are NOT accepted by Perl 17269 17270Perl will generate an error for a few character properties in Unicode when 17271used in a regular expression. The non-Unihan ones are listed below, with the 17272reasons they are not accepted, perhaps with work-arounds. The short names for 17273the properties are listed enclosed in (parentheses). 17274As described after the list, an installation can change the defaults and choose 17275to accept any of these. The list is machine generated based on the 17276choices made for the installation that generated this document. 17277 17278@bad_re_properties 17279 17280An installation can choose to allow any of these to be matched by downloading 17281the Unicode database from L<http://www.unicode.org/Public/> to 17282C<\$Config{privlib}>/F<unicore/> in the Perl source tree, changing the 17283controlling lists contained in the program 17284C<\$Config{privlib}>/F<unicore/mktables> and then re-compiling and installing. 17285(C<\%Config> is available from the Config module). 17286 17287Also, perl can be recompiled to operate on an earlier version of the Unicode 17288standard. Further information is at 17289C<\$Config{privlib}>/F<unicore/README.perl>. 17290 17291=head1 Other information in the Unicode data base 17292 17293The Unicode data base is delivered in two different formats. The XML version 17294is valid for more modern Unicode releases. The other version is a collection 17295of files. The two are intended to give equivalent information. Perl uses the 17296older form; this allows you to recompile Perl to use early Unicode releases. 17297 17298The only non-character property that Perl currently supports is Named 17299Sequences, in which a sequence of code points 17300is given a name and generally treated as a single entity. (Perl supports 17301these via the C<\\N{...}> double-quotish construct, 17302L<charnames/charnames::string_vianame(name)>, and L<Unicode::UCD/namedseq()>. 17303 17304Below is a list of the files in the Unicode data base that Perl doesn't 17305currently use, along with very brief descriptions of their purposes. 17306Some of the names of the files have been shortened from those that Unicode 17307uses, in order to allow them to be distinguishable from similarly named files 17308on file systems for which only the first 8 characters of a name are 17309significant. 17310 17311=over 4 17312 17313@unused_files 17314 17315=back 17316 17317=head1 SEE ALSO 17318 17319L<$unicode_reference_url> 17320 17321L<perlrecharclass> 17322 17323L<perlunicode> 17324 17325END 17326 17327 # And write it. The 0 means no utf8. 17328 main::write([ $pod_directory, "$pod_file.pod" ], 0, \@OUT); 17329 return; 17330} 17331 17332sub make_Name_pm () { 17333 # Create and write Name.pm, which contains subroutines and data to use in 17334 # conjunction with Name.pl 17335 17336 # Maybe there's nothing to do. 17337 return unless $has_hangul_syllables || @code_points_ending_in_code_point; 17338 17339 my @name = <<END; 17340$HEADER 17341$INTERNAL_ONLY_HEADER 17342END 17343 17344 # Convert these structures to output format. 17345 my $code_points_ending_in_code_point = 17346 main::simple_dumper(\@code_points_ending_in_code_point, 17347 ' ' x 8); 17348 my $names = main::simple_dumper(\%names_ending_in_code_point, 17349 ' ' x 8); 17350 my $loose_names = main::simple_dumper(\%loose_names_ending_in_code_point, 17351 ' ' x 8); 17352 17353 # Do the same with the Hangul names, 17354 my $jamo; 17355 my $jamo_l; 17356 my $jamo_v; 17357 my $jamo_t; 17358 my $jamo_re; 17359 if ($has_hangul_syllables) { 17360 17361 # Construct a regular expression of all the possible 17362 # combinations of the Hangul syllables. 17363 my @L_re; # Leading consonants 17364 for my $i ($LBase .. $LBase + $LCount - 1) { 17365 push @L_re, $Jamo{$i} 17366 } 17367 my @V_re; # Middle vowels 17368 for my $i ($VBase .. $VBase + $VCount - 1) { 17369 push @V_re, $Jamo{$i} 17370 } 17371 my @T_re; # Trailing consonants 17372 for my $i ($TBase + 1 .. $TBase + $TCount - 1) { 17373 push @T_re, $Jamo{$i} 17374 } 17375 17376 # The whole re is made up of the L V T combination. 17377 $jamo_re = '(' 17378 . join ('|', sort @L_re) 17379 . ')(' 17380 . join ('|', sort @V_re) 17381 . ')(' 17382 . join ('|', sort @T_re) 17383 . ')?'; 17384 17385 # These hashes needed by the algorithm were generated 17386 # during reading of the Jamo.txt file 17387 $jamo = main::simple_dumper(\%Jamo, ' ' x 8); 17388 $jamo_l = main::simple_dumper(\%Jamo_L, ' ' x 8); 17389 $jamo_v = main::simple_dumper(\%Jamo_V, ' ' x 8); 17390 $jamo_t = main::simple_dumper(\%Jamo_T, ' ' x 8); 17391 } 17392 17393 push @name, <<END; 17394 17395package charnames; 17396 17397# This module contains machine-generated tables and code for the 17398# algorithmically-determinable Unicode character names. The following 17399# routines can be used to translate between name and code point and vice versa 17400 17401{ # Closure 17402 17403 # Matches legal code point. 4-6 hex numbers, If there are 6, the first 17404 # two must be 10; if there are 5, the first must not be a 0. Written this 17405 # way to decrease backtracking. The first regex allows the code point to 17406 # be at the end of a word, but to work properly, the word shouldn't end 17407 # with a valid hex character. The second one won't match a code point at 17408 # the end of a word, and doesn't have the run-on issue 17409 my \$run_on_code_point_re = qr/$run_on_code_point_re/; 17410 my \$code_point_re = qr/$code_point_re/; 17411 17412 # In the following hash, the keys are the bases of names which include 17413 # the code point in the name, like CJK UNIFIED IDEOGRAPH-4E01. The value 17414 # of each key is another hash which is used to get the low and high ends 17415 # for each range of code points that apply to the name. 17416 my %names_ending_in_code_point = ( 17417$names 17418 ); 17419 17420 # The following hash is a copy of the previous one, except is for loose 17421 # matching, so each name has blanks and dashes squeezed out 17422 my %loose_names_ending_in_code_point = ( 17423$loose_names 17424 ); 17425 17426 # And the following array gives the inverse mapping from code points to 17427 # names. Lowest code points are first 17428 \@code_points_ending_in_code_point = ( 17429$code_points_ending_in_code_point 17430 ); 17431 17432 # Is exportable, make read-only 17433 Internals::SvREADONLY(\@code_points_ending_in_code_point, 1); 17434END 17435 # Earlier releases didn't have Jamos. No sense outputting 17436 # them unless will be used. 17437 if ($has_hangul_syllables) { 17438 push @name, <<END; 17439 17440 # Convert from code point to Jamo short name for use in composing Hangul 17441 # syllable names 17442 my %Jamo = ( 17443$jamo 17444 ); 17445 17446 # Leading consonant (can be null) 17447 my %Jamo_L = ( 17448$jamo_l 17449 ); 17450 17451 # Vowel 17452 my %Jamo_V = ( 17453$jamo_v 17454 ); 17455 17456 # Optional trailing consonant 17457 my %Jamo_T = ( 17458$jamo_t 17459 ); 17460 17461 # Computed re that splits up a Hangul name into LVT or LV syllables 17462 my \$syllable_re = qr/$jamo_re/; 17463 17464 my \$HANGUL_SYLLABLE = "HANGUL SYLLABLE "; 17465 my \$loose_HANGUL_SYLLABLE = "HANGULSYLLABLE"; 17466 17467 # These constants names and values were taken from the Unicode standard, 17468 # version 5.1, section 3.12. They are used in conjunction with Hangul 17469 # syllables 17470 my \$SBase = $SBase_string; 17471 my \$LBase = $LBase_string; 17472 my \$VBase = $VBase_string; 17473 my \$TBase = $TBase_string; 17474 my \$SCount = $SCount; 17475 my \$LCount = $LCount; 17476 my \$VCount = $VCount; 17477 my \$TCount = $TCount; 17478 my \$NCount = \$VCount * \$TCount; 17479END 17480 } # End of has Jamos 17481 17482 push @name, << 'END'; 17483 17484 sub name_to_code_point_special { 17485 my ($name, $loose) = @_; 17486 17487 # Returns undef if not one of the specially handled names; otherwise 17488 # returns the code point equivalent to the input name 17489 # $loose is non-zero if to use loose matching, 'name' in that case 17490 # must be input as upper case with all blanks and dashes squeezed out. 17491END 17492 if ($has_hangul_syllables) { 17493 push @name, << 'END'; 17494 17495 if ((! $loose && $name =~ s/$HANGUL_SYLLABLE//) 17496 || ($loose && $name =~ s/$loose_HANGUL_SYLLABLE//)) 17497 { 17498 return if $name !~ qr/^$syllable_re$/; 17499 my $L = $Jamo_L{$1}; 17500 my $V = $Jamo_V{$2}; 17501 my $T = (defined $3) ? $Jamo_T{$3} : 0; 17502 return ($L * $VCount + $V) * $TCount + $T + $SBase; 17503 } 17504END 17505 } 17506 push @name, << 'END'; 17507 17508 # Name must end in 'code_point' for this to handle. 17509 return if (($loose && $name !~ /^ (.*?) ($run_on_code_point_re) $/x) 17510 || (! $loose && $name !~ /^ (.*) ($code_point_re) $/x)); 17511 17512 my $base = $1; 17513 my $code_point = CORE::hex $2; 17514 my $names_ref; 17515 17516 if ($loose) { 17517 $names_ref = \%loose_names_ending_in_code_point; 17518 } 17519 else { 17520 return if $base !~ s/-$//; 17521 $names_ref = \%names_ending_in_code_point; 17522 } 17523 17524 # Name must be one of the ones which has the code point in it. 17525 return if ! $names_ref->{$base}; 17526 17527 # Look through the list of ranges that apply to this name to see if 17528 # the code point is in one of them. 17529 for (my $i = 0; $i < scalar @{$names_ref->{$base}{'low'}}; $i++) { 17530 return if $names_ref->{$base}{'low'}->[$i] > $code_point; 17531 next if $names_ref->{$base}{'high'}->[$i] < $code_point; 17532 17533 # Here, the code point is in the range. 17534 return $code_point; 17535 } 17536 17537 # Here, looked like the name had a code point number in it, but 17538 # did not match one of the valid ones. 17539 return; 17540 } 17541 17542 sub code_point_to_name_special { 17543 my $code_point = shift; 17544 17545 # Returns the name of a code point if algorithmically determinable; 17546 # undef if not 17547END 17548 if ($has_hangul_syllables) { 17549 push @name, << 'END'; 17550 17551 # If in the Hangul range, calculate the name based on Unicode's 17552 # algorithm 17553 if ($code_point >= $SBase && $code_point <= $SBase + $SCount -1) { 17554 use integer; 17555 my $SIndex = $code_point - $SBase; 17556 my $L = $LBase + $SIndex / $NCount; 17557 my $V = $VBase + ($SIndex % $NCount) / $TCount; 17558 my $T = $TBase + $SIndex % $TCount; 17559 $name = "$HANGUL_SYLLABLE$Jamo{$L}$Jamo{$V}"; 17560 $name .= $Jamo{$T} if $T != $TBase; 17561 return $name; 17562 } 17563END 17564 } 17565 push @name, << 'END'; 17566 17567 # Look through list of these code points for one in range. 17568 foreach my $hash (@code_points_ending_in_code_point) { 17569 return if $code_point < $hash->{'low'}; 17570 if ($code_point <= $hash->{'high'}) { 17571 return sprintf("%s-%04X", $hash->{'name'}, $code_point); 17572 } 17573 } 17574 return; # None found 17575 } 17576} # End closure 17577 175781; 17579END 17580 17581 main::write("Name.pm", 0, \@name); # The 0 means no utf8. 17582 return; 17583} 17584 17585sub make_UCD () { 17586 # Create and write UCD.pl, which passes info about the tables to 17587 # Unicode::UCD 17588 17589 # Stringify structures for output 17590 my $loose_property_name_of 17591 = simple_dumper(\%loose_property_name_of, ' ' x 4); 17592 chomp $loose_property_name_of; 17593 17594 my $strict_property_name_of 17595 = simple_dumper(\%strict_property_name_of, ' ' x 4); 17596 chomp $strict_property_name_of; 17597 17598 my $stricter_to_file_of = simple_dumper(\%stricter_to_file_of, ' ' x 4); 17599 chomp $stricter_to_file_of; 17600 17601 my $inline_definitions = simple_dumper(\@inline_definitions, " " x 4); 17602 chomp $inline_definitions; 17603 17604 my $loose_to_file_of = simple_dumper(\%loose_to_file_of, ' ' x 4); 17605 chomp $loose_to_file_of; 17606 17607 my $nv_floating_to_rational 17608 = simple_dumper(\%nv_floating_to_rational, ' ' x 4); 17609 chomp $nv_floating_to_rational; 17610 17611 my $why_deprecated = simple_dumper(\%Unicode::UCD::why_deprecated, ' ' x 4); 17612 chomp $why_deprecated; 17613 17614 # We set the key to the file when we associated files with tables, but we 17615 # couldn't do the same for the value then, as we might not have the file 17616 # for the alternate table figured out at that time. 17617 foreach my $cased (keys %caseless_equivalent_to) { 17618 my @path = $caseless_equivalent_to{$cased}->file_path; 17619 my $path; 17620 if ($path[0] eq "#") { # Pseudo-directory '#' 17621 $path = join '/', @path; 17622 } 17623 else { # Gets rid of lib/ 17624 $path = join '/', @path[1, -1]; 17625 } 17626 $caseless_equivalent_to{$cased} = $path; 17627 } 17628 my $caseless_equivalent_to 17629 = simple_dumper(\%caseless_equivalent_to, ' ' x 4); 17630 chomp $caseless_equivalent_to; 17631 17632 my $loose_property_to_file_of 17633 = simple_dumper(\%loose_property_to_file_of, ' ' x 4); 17634 chomp $loose_property_to_file_of; 17635 17636 my $strict_property_to_file_of 17637 = simple_dumper(\%strict_property_to_file_of, ' ' x 4); 17638 chomp $strict_property_to_file_of; 17639 17640 my $file_to_swash_name = simple_dumper(\%file_to_swash_name, ' ' x 4); 17641 chomp $file_to_swash_name; 17642 17643 # Create a mapping from each alias of Perl single-form extensions to all 17644 # its equivalent aliases, for quick look-up. 17645 my %perlprop_to_aliases; 17646 foreach my $table ($perl->tables) { 17647 17648 # First create the list of the aliases of each extension 17649 my @aliases_list; # List of legal aliases for this extension 17650 17651 my $table_name = $table->name; 17652 my $standard_table_name = standardize($table_name); 17653 my $table_full_name = $table->full_name; 17654 my $standard_table_full_name = standardize($table_full_name); 17655 17656 # Make sure that the list has both the short and full names 17657 push @aliases_list, $table_name, $table_full_name; 17658 17659 my $found_ucd = 0; # ? Did we actually get an alias that should be 17660 # output for this table 17661 17662 # Go through all the aliases (including the two just added), and add 17663 # any new unique ones to the list 17664 foreach my $alias ($table->aliases) { 17665 17666 # Skip non-legal names 17667 next unless $alias->ok_as_filename; 17668 next unless $alias->ucd; 17669 17670 $found_ucd = 1; # have at least one legal name 17671 17672 my $name = $alias->name; 17673 my $standard = standardize($name); 17674 17675 # Don't repeat a name that is equivalent to one already on the 17676 # list 17677 next if $standard eq $standard_table_name; 17678 next if $standard eq $standard_table_full_name; 17679 17680 push @aliases_list, $name; 17681 } 17682 17683 # If there were no legal names, don't output anything. 17684 next unless $found_ucd; 17685 17686 # To conserve memory in the program reading these in, omit full names 17687 # that are identical to the short name, when those are the only two 17688 # aliases for the property. 17689 if (@aliases_list == 2 && $aliases_list[0] eq $aliases_list[1]) { 17690 pop @aliases_list; 17691 } 17692 17693 # Here, @aliases_list is the list of all the aliases that this 17694 # extension legally has. Now can create a map to it from each legal 17695 # standardized alias 17696 foreach my $alias ($table->aliases) { 17697 next unless $alias->ucd; 17698 next unless $alias->ok_as_filename; 17699 push @{$perlprop_to_aliases{standardize($alias->name)}}, 17700 uniques @aliases_list; 17701 } 17702 } 17703 17704 # Make a list of all combinations of properties/values that are suppressed. 17705 my @suppressed; 17706 if (! $debug_skip) { # This tends to fail in this debug mode 17707 foreach my $property_name (keys %why_suppressed) { 17708 17709 # Just the value 17710 my $value_name = $1 if $property_name =~ s/ = ( .* ) //x; 17711 17712 # The hash may contain properties not in this release of Unicode 17713 next unless defined (my $property = property_ref($property_name)); 17714 17715 # Find all combinations 17716 foreach my $prop_alias ($property->aliases) { 17717 my $prop_alias_name = standardize($prop_alias->name); 17718 17719 # If no =value, there's just one combination possible for this 17720 if (! $value_name) { 17721 17722 # The property may be suppressed, but there may be a proxy 17723 # for it, so it shouldn't be listed as suppressed 17724 next if $prop_alias->ucd; 17725 push @suppressed, $prop_alias_name; 17726 } 17727 else { # Otherwise 17728 foreach my $value_alias 17729 ($property->table($value_name)->aliases) 17730 { 17731 next if $value_alias->ucd; 17732 17733 push @suppressed, "$prop_alias_name=" 17734 . standardize($value_alias->name); 17735 } 17736 } 17737 } 17738 } 17739 } 17740 @suppressed = sort @suppressed; # So doesn't change between runs of this 17741 # program 17742 17743 # Convert the structure below (designed for Name.pm) to a form that UCD 17744 # wants, so it doesn't have to modify it at all; i.e. so that it includes 17745 # an element for the Hangul syllables in the appropriate place, and 17746 # otherwise changes the name to include the "-<code point>" suffix. 17747 my @algorithm_names; 17748 my $done_hangul = $v_version lt v2.0.0; # Hanguls as we know them came 17749 # along in this version 17750 # Copy it linearly. 17751 for my $i (0 .. @code_points_ending_in_code_point - 1) { 17752 17753 # Insert the hanguls in the correct place. 17754 if (! $done_hangul 17755 && $code_points_ending_in_code_point[$i]->{'low'} > $SBase) 17756 { 17757 $done_hangul = 1; 17758 push @algorithm_names, { low => $SBase, 17759 high => $SBase + $SCount - 1, 17760 name => '<hangul syllable>', 17761 }; 17762 } 17763 17764 # Copy the current entry, modified. 17765 push @algorithm_names, { 17766 low => $code_points_ending_in_code_point[$i]->{'low'}, 17767 high => $code_points_ending_in_code_point[$i]->{'high'}, 17768 name => 17769 "$code_points_ending_in_code_point[$i]->{'name'}-<code point>", 17770 }; 17771 } 17772 17773 # Serialize these structures for output. 17774 my $loose_to_standard_value 17775 = simple_dumper(\%loose_to_standard_value, ' ' x 4); 17776 chomp $loose_to_standard_value; 17777 17778 my $string_property_loose_to_name 17779 = simple_dumper(\%string_property_loose_to_name, ' ' x 4); 17780 chomp $string_property_loose_to_name; 17781 17782 my $perlprop_to_aliases = simple_dumper(\%perlprop_to_aliases, ' ' x 4); 17783 chomp $perlprop_to_aliases; 17784 17785 my $prop_aliases = simple_dumper(\%prop_aliases, ' ' x 4); 17786 chomp $prop_aliases; 17787 17788 my $prop_value_aliases = simple_dumper(\%prop_value_aliases, ' ' x 4); 17789 chomp $prop_value_aliases; 17790 17791 my $suppressed = (@suppressed) ? simple_dumper(\@suppressed, ' ' x 4) : ""; 17792 chomp $suppressed; 17793 17794 my $algorithm_names = simple_dumper(\@algorithm_names, ' ' x 4); 17795 chomp $algorithm_names; 17796 17797 my $ambiguous_names = simple_dumper(\%ambiguous_names, ' ' x 4); 17798 chomp $ambiguous_names; 17799 17800 my $combination_property = simple_dumper(\%combination_property, ' ' x 4); 17801 chomp $combination_property; 17802 17803 my $loose_defaults = simple_dumper(\%loose_defaults, ' ' x 4); 17804 chomp $loose_defaults; 17805 17806 my @ucd = <<END; 17807$HEADER 17808$INTERNAL_ONLY_HEADER 17809 17810# This file is for the use of Unicode::UCD 17811 17812# Highest legal Unicode code point 17813\$Unicode::UCD::MAX_UNICODE_CODEPOINT = 0x$MAX_UNICODE_CODEPOINT_STRING; 17814 17815# Hangul syllables 17816\$Unicode::UCD::HANGUL_BEGIN = $SBase_string; 17817\$Unicode::UCD::HANGUL_COUNT = $SCount; 17818 17819# Maps Unicode (not Perl single-form extensions) property names in loose 17820# standard form to their corresponding standard names 17821\%Unicode::UCD::loose_property_name_of = ( 17822$loose_property_name_of 17823); 17824 17825# Same, but strict names 17826\%Unicode::UCD::strict_property_name_of = ( 17827$strict_property_name_of 17828); 17829 17830# Gives the definitions (in the form of inversion lists) for those properties 17831# whose definitions aren't kept in files 17832\@Unicode::UCD::inline_definitions = ( 17833$inline_definitions 17834); 17835 17836# Maps property, table to file for those using stricter matching. For paths 17837# whose directory is '#', the file is in the form of a numeric index into 17838# \@inline_definitions 17839\%Unicode::UCD::stricter_to_file_of = ( 17840$stricter_to_file_of 17841); 17842 17843# Maps property, table to file for those using loose matching. For paths 17844# whose directory is '#', the file is in the form of a numeric index into 17845# \@inline_definitions 17846\%Unicode::UCD::loose_to_file_of = ( 17847$loose_to_file_of 17848); 17849 17850# Maps floating point to fractional form 17851\%Unicode::UCD::nv_floating_to_rational = ( 17852$nv_floating_to_rational 17853); 17854 17855# If a %e floating point number doesn't have this number of digits in it after 17856# the decimal point to get this close to a fraction, it isn't considered to be 17857# that fraction even if all the digits it does have match. 17858\$Unicode::UCD::e_precision = $E_FLOAT_PRECISION; 17859 17860# Deprecated tables to generate a warning for. The key is the file containing 17861# the table, so as to avoid duplication, as many property names can map to the 17862# file, but we only need one entry for all of them. 17863\%Unicode::UCD::why_deprecated = ( 17864$why_deprecated 17865); 17866 17867# A few properties have different behavior under /i matching. This maps 17868# those to substitute files to use under /i. 17869\%Unicode::UCD::caseless_equivalent = ( 17870$caseless_equivalent_to 17871); 17872 17873# Property names to mapping files 17874\%Unicode::UCD::loose_property_to_file_of = ( 17875$loose_property_to_file_of 17876); 17877 17878# Property names to mapping files 17879\%Unicode::UCD::strict_property_to_file_of = ( 17880$strict_property_to_file_of 17881); 17882 17883# Files to the swash names within them. 17884\%Unicode::UCD::file_to_swash_name = ( 17885$file_to_swash_name 17886); 17887 17888# Keys are all the possible "prop=value" combinations, in loose form; values 17889# are the standard loose name for the 'value' part of the key 17890\%Unicode::UCD::loose_to_standard_value = ( 17891$loose_to_standard_value 17892); 17893 17894# String property loose names to standard loose name 17895\%Unicode::UCD::string_property_loose_to_name = ( 17896$string_property_loose_to_name 17897); 17898 17899# Keys are Perl extensions in loose form; values are each one's list of 17900# aliases 17901\%Unicode::UCD::loose_perlprop_to_name = ( 17902$perlprop_to_aliases 17903); 17904 17905# Keys are standard property name; values are each one's aliases 17906\%Unicode::UCD::prop_aliases = ( 17907$prop_aliases 17908); 17909 17910# Keys of top level are standard property name; values are keys to another 17911# hash, Each one is one of the property's values, in standard form. The 17912# values are that prop-val's aliases. If only one specified, the short and 17913# long alias are identical. 17914\%Unicode::UCD::prop_value_aliases = ( 17915$prop_value_aliases 17916); 17917 17918# Ordered (by code point ordinal) list of the ranges of code points whose 17919# names are algorithmically determined. Each range entry is an anonymous hash 17920# of the start and end points and a template for the names within it. 17921\@Unicode::UCD::algorithmic_named_code_points = ( 17922$algorithm_names 17923); 17924 17925# The properties that as-is have two meanings, and which must be disambiguated 17926\%Unicode::UCD::ambiguous_names = ( 17927$ambiguous_names 17928); 17929 17930# Keys are the prop-val combinations which are the default values for the 17931# given property, expressed in standard loose form 17932\%Unicode::UCD::loose_defaults = ( 17933$loose_defaults 17934); 17935 17936# The properties that are combinations, in that they have both a map table and 17937# a match table. This is actually for UCD.t, so it knows how to test for 17938# these. 17939\%Unicode::UCD::combination_property = ( 17940$combination_property 17941); 17942 17943# All combinations of names that are suppressed. 17944# This is actually for UCD.t, so it knows which properties shouldn't have 17945# entries. If it got any bigger, would probably want to put it in its own 17946# file to use memory only when it was needed, in testing. 17947\@Unicode::UCD::suppressed_properties = ( 17948$suppressed 17949); 17950 179511; 17952END 17953 17954 main::write("UCD.pl", 0, \@ucd); # The 0 means no utf8. 17955 return; 17956} 17957 17958sub write_all_tables() { 17959 # Write out all the tables generated by this program to files, as well as 17960 # the supporting data structures, pod file, and .t file. 17961 17962 my @writables; # List of tables that actually get written 17963 my %match_tables_to_write; # Used to collapse identical match tables 17964 # into one file. Each key is a hash function 17965 # result to partition tables into buckets. 17966 # Each value is an array of the tables that 17967 # fit in the bucket. 17968 17969 # For each property ... 17970 # (sort so that if there is an immutable file name, it has precedence, so 17971 # some other property can't come in and take over its file name. (We 17972 # don't care if both defined, as they had better be different anyway.) 17973 # The property named 'Perl' needs to be first (it doesn't have any 17974 # immutable file name) because empty properties are defined in terms of 17975 # its table named 'All' under the -annotate option.) We also sort by 17976 # the property's name. This is just for repeatability of the outputs 17977 # between runs of this program, but does not affect correctness. 17978 PROPERTY: 17979 foreach my $property ($perl, 17980 sort { return -1 if defined $a->file; 17981 return 1 if defined $b->file; 17982 return $a->name cmp $b->name; 17983 } grep { $_ != $perl } property_ref('*')) 17984 { 17985 my $type = $property->type; 17986 17987 # And for each table for that property, starting with the mapping 17988 # table for it ... 17989 TABLE: 17990 foreach my $table($property, 17991 17992 # and all the match tables for it (if any), sorted so 17993 # the ones with the shortest associated file name come 17994 # first. The length sorting prevents problems of a 17995 # longer file taking a name that might have to be used 17996 # by a shorter one. The alphabetic sorting prevents 17997 # differences between releases 17998 sort { my $ext_a = $a->external_name; 17999 return 1 if ! defined $ext_a; 18000 my $ext_b = $b->external_name; 18001 return -1 if ! defined $ext_b; 18002 18003 # But return the non-complement table before 18004 # the complement one, as the latter is defined 18005 # in terms of the former, and needs to have 18006 # the information for the former available. 18007 return 1 if $a->complement != 0; 18008 return -1 if $b->complement != 0; 18009 18010 # Similarly, return a subservient table after 18011 # a leader 18012 return 1 if $a->leader != $a; 18013 return -1 if $b->leader != $b; 18014 18015 my $cmp = length $ext_a <=> length $ext_b; 18016 18017 # Return result if lengths not equal 18018 return $cmp if $cmp; 18019 18020 # Alphabetic if lengths equal 18021 return $ext_a cmp $ext_b 18022 } $property->tables 18023 ) 18024 { 18025 18026 # Here we have a table associated with a property. It could be 18027 # the map table (done first for each property), or one of the 18028 # other tables. Determine which type. 18029 my $is_property = $table->isa('Property'); 18030 18031 my $name = $table->name; 18032 my $complete_name = $table->complete_name; 18033 18034 # See if should suppress the table if is empty, but warn if it 18035 # contains something. 18036 my $suppress_if_empty_warn_if_not 18037 = $why_suppress_if_empty_warn_if_not{$complete_name} || 0; 18038 18039 # Calculate if this table should have any code points associated 18040 # with it or not. 18041 my $expected_empty = 18042 18043 # $perl should be empty 18044 ($is_property && ($table == $perl)) 18045 18046 # Match tables in properties we skipped populating should be 18047 # empty 18048 || (! $is_property && ! $property->to_create_match_tables) 18049 18050 # Tables and properties that are expected to have no code 18051 # points should be empty 18052 || $suppress_if_empty_warn_if_not 18053 ; 18054 18055 # Set a boolean if this table is the complement of an empty binary 18056 # table 18057 my $is_complement_of_empty_binary = 18058 $type == $BINARY && 18059 (($table == $property->table('Y') 18060 && $property->table('N')->is_empty) 18061 || ($table == $property->table('N') 18062 && $property->table('Y')->is_empty)); 18063 18064 if ($table->is_empty) { 18065 18066 if ($suppress_if_empty_warn_if_not) { 18067 $table->set_fate($SUPPRESSED, 18068 $suppress_if_empty_warn_if_not); 18069 } 18070 18071 # Suppress (by skipping them) expected empty tables. 18072 next TABLE if $expected_empty; 18073 18074 # And setup to later output a warning for those that aren't 18075 # known to be allowed to be empty. Don't do the warning if 18076 # this table is a child of another one to avoid duplicating 18077 # the warning that should come from the parent one. 18078 if (($table == $property || $table->parent == $table) 18079 && $table->fate != $SUPPRESSED 18080 && $table->fate != $MAP_PROXIED 18081 && ! grep { $complete_name =~ /^$_$/ } 18082 @tables_that_may_be_empty) 18083 { 18084 push @unhandled_properties, "$table"; 18085 } 18086 18087 # The old way of expressing an empty match list was to 18088 # complement the list that matches everything. The new way is 18089 # to create an empty inversion list, but this doesn't work for 18090 # annotating, so use the old way then. 18091 $table->set_complement($All) if $annotate 18092 && $table != $property; 18093 } 18094 elsif ($expected_empty) { 18095 my $because = ""; 18096 if ($suppress_if_empty_warn_if_not) { 18097 $because = " because $suppress_if_empty_warn_if_not"; 18098 } 18099 18100 Carp::my_carp("Not expecting property $table$because. Generating file for it anyway."); 18101 } 18102 18103 # Some tables should match everything 18104 my $expected_full = 18105 ($table->fate == $SUPPRESSED) 18106 ? 0 18107 : ($is_property) 18108 ? # All these types of map tables will be full because 18109 # they will have been populated with defaults 18110 ($type == $ENUM) 18111 18112 : # A match table should match everything if its method 18113 # shows it should 18114 ($table->matches_all 18115 18116 # The complement of an empty binary table will match 18117 # everything 18118 || $is_complement_of_empty_binary 18119 ) 18120 ; 18121 18122 my $count = $table->count; 18123 if ($expected_full) { 18124 if ($count != $MAX_WORKING_CODEPOINTS) { 18125 Carp::my_carp("$table matches only " 18126 . clarify_number($count) 18127 . " Unicode code points but should match " 18128 . clarify_number($MAX_WORKING_CODEPOINTS) 18129 . " (off by " 18130 . clarify_number(abs($MAX_WORKING_CODEPOINTS - $count)) 18131 . "). Proceeding anyway."); 18132 } 18133 18134 # Here is expected to be full. If it is because it is the 18135 # complement of an (empty) binary table that is to be 18136 # suppressed, then suppress this one as well. 18137 if ($is_complement_of_empty_binary) { 18138 my $opposing_name = ($name eq 'Y') ? 'N' : 'Y'; 18139 my $opposing = $property->table($opposing_name); 18140 my $opposing_status = $opposing->status; 18141 if ($opposing_status) { 18142 $table->set_status($opposing_status, 18143 $opposing->status_info); 18144 } 18145 } 18146 } 18147 elsif ($count == $MAX_UNICODE_CODEPOINTS 18148 && $name ne "Any" 18149 && ($table == $property || $table->leader == $table) 18150 && $table->property->status ne $NORMAL) 18151 { 18152 Carp::my_carp("$table unexpectedly matches all Unicode code points. Proceeding anyway."); 18153 } 18154 18155 if ($table->fate >= $SUPPRESSED) { 18156 if (! $is_property) { 18157 my @children = $table->children; 18158 foreach my $child (@children) { 18159 if ($child->fate < $SUPPRESSED) { 18160 Carp::my_carp_bug("'$table' is suppressed and has a child '$child' which isn't"); 18161 } 18162 } 18163 } 18164 next TABLE; 18165 18166 } 18167 18168 if (! $is_property) { 18169 18170 make_ucd_table_pod_entries($table) if $table->property == $perl; 18171 18172 # Several things need to be done just once for each related 18173 # group of match tables. Do them on the parent. 18174 if ($table->parent == $table) { 18175 18176 # Add an entry in the pod file for the table; it also does 18177 # the children. 18178 make_re_pod_entries($table) if defined $pod_directory; 18179 18180 # See if the table matches identical code points with 18181 # something that has already been processed and is ready 18182 # for output. In that case, no need to have two files 18183 # with the same code points in them. We use the table's 18184 # hash() method to store these in buckets, so that it is 18185 # quite likely that if two tables are in the same bucket 18186 # they will be identical, so don't have to compare tables 18187 # frequently. The tables have to have the same status to 18188 # share a file, so add this to the bucket hash. (The 18189 # reason for this latter is that UCD.pm associates a 18190 # status with a file.) We don't check tables that are 18191 # inverses of others, as it would lead to some coding 18192 # complications, and checking all the regular ones should 18193 # find everything. 18194 if ($table->complement == 0) { 18195 my $hash = $table->hash . ';' . $table->status; 18196 18197 # Look at each table that is in the same bucket as 18198 # this one would be. 18199 foreach my $comparison 18200 (@{$match_tables_to_write{$hash}}) 18201 { 18202 # If the table doesn't point back to this one, we 18203 # see if it matches identically 18204 if ( $comparison->leader != $table 18205 && $table->matches_identically_to($comparison)) 18206 { 18207 $table->set_equivalent_to($comparison, 18208 Related => 0); 18209 next TABLE; 18210 } 18211 } 18212 18213 # Here, not equivalent, add this table to the bucket. 18214 push @{$match_tables_to_write{$hash}}, $table; 18215 } 18216 } 18217 } 18218 else { 18219 18220 # Here is the property itself. 18221 # Don't write out or make references to the $perl property 18222 next if $table == $perl; 18223 18224 make_ucd_table_pod_entries($table); 18225 18226 # There is a mapping stored of the various synonyms to the 18227 # standardized name of the property for Unicode::UCD. 18228 # Also, the pod file contains entries of the form: 18229 # \p{alias: *} \p{full: *} 18230 # rather than show every possible combination of things. 18231 18232 my @property_aliases = $property->aliases; 18233 18234 my $full_property_name = $property->full_name; 18235 my $property_name = $property->name; 18236 my $standard_property_name = standardize($property_name); 18237 my $standard_property_full_name 18238 = standardize($full_property_name); 18239 18240 # We also create for Unicode::UCD a list of aliases for 18241 # the property. The list starts with the property name; 18242 # then its full name. Legacy properties are not listed in 18243 # Unicode::UCD. 18244 my @property_list; 18245 my @standard_list; 18246 if ( $property->fate <= $MAP_PROXIED) { 18247 @property_list = ($property_name, $full_property_name); 18248 @standard_list = ($standard_property_name, 18249 $standard_property_full_name); 18250 } 18251 18252 # For each synonym ... 18253 for my $i (0 .. @property_aliases - 1) { 18254 my $alias = $property_aliases[$i]; 18255 my $alias_name = $alias->name; 18256 my $alias_standard = standardize($alias_name); 18257 18258 18259 # Add other aliases to the list of property aliases 18260 if ($property->fate <= $MAP_PROXIED 18261 && ! grep { $alias_standard eq $_ } @standard_list) 18262 { 18263 push @property_list, $alias_name; 18264 push @standard_list, $alias_standard; 18265 } 18266 18267 # For Unicode::UCD, set the mapping of the alias to the 18268 # property 18269 if ($type == $STRING) { 18270 if ($property->fate <= $MAP_PROXIED) { 18271 $string_property_loose_to_name{$alias_standard} 18272 = $standard_property_name; 18273 } 18274 } 18275 else { 18276 my $hash_ref = ($alias_standard =~ /^_/) 18277 ? \%strict_property_name_of 18278 : \%loose_property_name_of; 18279 if (exists $hash_ref->{$alias_standard}) { 18280 Carp::my_carp("There already is a property with the same standard name as $alias_name: $hash_ref->{$alias_standard}. Old name is retained"); 18281 } 18282 else { 18283 $hash_ref->{$alias_standard} 18284 = $standard_property_name; 18285 } 18286 18287 # Now for the re pod entry for this alias. Skip if not 18288 # outputting a pod; skip the first one, which is the 18289 # full name so won't have an entry like: '\p{full: *} 18290 # \p{full: *}', and skip if don't want an entry for 18291 # this one. 18292 next if $i == 0 18293 || ! defined $pod_directory 18294 || ! $alias->make_re_pod_entry; 18295 18296 my $rhs = "\\p{$full_property_name: *}"; 18297 if ($property != $perl && $table->perl_extension) { 18298 $rhs .= ' (Perl extension)'; 18299 } 18300 push @match_properties, 18301 format_pod_line($indent_info_column, 18302 '\p{' . $alias->name . ': *}', 18303 $rhs, 18304 $alias->status); 18305 } 18306 } 18307 18308 # The list of all possible names is attached to each alias, so 18309 # lookup is easy 18310 if (@property_list) { 18311 push @{$prop_aliases{$standard_list[0]}}, @property_list; 18312 } 18313 18314 if ($property->fate <= $MAP_PROXIED) { 18315 18316 # Similarly, we create for Unicode::UCD a list of 18317 # property-value aliases. 18318 18319 # Look at each table in the property... 18320 foreach my $table ($property->tables) { 18321 my @values_list; 18322 my $table_full_name = $table->full_name; 18323 my $standard_table_full_name 18324 = standardize($table_full_name); 18325 my $table_name = $table->name; 18326 my $standard_table_name = standardize($table_name); 18327 18328 # The list starts with the table name and its full 18329 # name. 18330 push @values_list, $table_name, $table_full_name; 18331 18332 # We add to the table each unique alias that isn't 18333 # discouraged from use. 18334 foreach my $alias ($table->aliases) { 18335 next if $alias->status 18336 && $alias->status eq $DISCOURAGED; 18337 my $name = $alias->name; 18338 my $standard = standardize($name); 18339 next if $standard eq $standard_table_name; 18340 next if $standard eq $standard_table_full_name; 18341 push @values_list, $name; 18342 } 18343 18344 # Here @values_list is a list of all the aliases for 18345 # the table. That is, all the property-values given 18346 # by this table. By agreement with Unicode::UCD, 18347 # if the name and full name are identical, and there 18348 # are no other names, drop the duplicate entry to save 18349 # memory. 18350 if (@values_list == 2 18351 && $values_list[0] eq $values_list[1]) 18352 { 18353 pop @values_list 18354 } 18355 18356 # To save memory, unlike the similar list for property 18357 # aliases above, only the standard forms have the list. 18358 # This forces an extra step of converting from input 18359 # name to standard name, but the savings are 18360 # considerable. (There is only marginal savings if we 18361 # did this with the property aliases.) 18362 push @{$prop_value_aliases{$standard_property_name}{$standard_table_name}}, @values_list; 18363 } 18364 } 18365 18366 # Don't write out a mapping file if not desired. 18367 next if ! $property->to_output_map; 18368 } 18369 18370 # Here, we know we want to write out the table, but don't do it 18371 # yet because there may be other tables that come along and will 18372 # want to share the file, and the file's comments will change to 18373 # mention them. So save for later. 18374 push @writables, $table; 18375 18376 } # End of looping through the property and all its tables. 18377 } # End of looping through all properties. 18378 18379 # Now have all the tables that will have files written for them. Do it. 18380 foreach my $table (@writables) { 18381 my @directory; 18382 my $filename; 18383 my $property = $table->property; 18384 my $is_property = ($table == $property); 18385 18386 # For very short tables, instead of writing them out to actual files, 18387 # we in-line their inversion list definitions into UCD.pm. The 18388 # definition replaces the file name, and the special pseudo-directory 18389 # '#' is used to signal this. This significantly cuts down the number 18390 # of files written at little extra cost to the hashes in UCD.pm. 18391 # And it means, no run-time files to read to get the definitions. 18392 if (! $is_property 18393 && ! $annotate # For annotation, we want to explicitly show 18394 # everything, so keep in files 18395 && $table->ranges <= 3) 18396 { 18397 my @ranges = $table->ranges; 18398 my $count = @ranges; 18399 if ($count == 0) { # 0th index reserved for 0-length lists 18400 $filename = 0; 18401 } 18402 elsif ($table->leader != $table) { 18403 18404 # Here, is a table that is equivalent to another; code 18405 # in register_file_for_name() causes its leader's definition 18406 # to be used 18407 18408 next; 18409 } 18410 else { # No equivalent table so far. 18411 18412 # Build up its definition range-by-range. 18413 my $definition = ""; 18414 while (defined (my $range = shift @ranges)) { 18415 my $end = $range->end; 18416 if ($end < $MAX_WORKING_CODEPOINT) { 18417 $count++; 18418 $end = "\n" . ($end + 1); 18419 } 18420 else { # Extends to infinity, hence no 'end' 18421 $end = ""; 18422 } 18423 $definition .= "\n" . $range->start . $end; 18424 } 18425 $definition = "V$count" . $definition; 18426 $filename = @inline_definitions; 18427 push @inline_definitions, $definition; 18428 } 18429 @directory = "#"; 18430 register_file_for_name($table, \@directory, $filename); 18431 next; 18432 } 18433 18434 if (! $is_property) { 18435 # Match tables for the property go in lib/$subdirectory, which is 18436 # the property's name. Don't use the standard file name for this, 18437 # as may get an unfamiliar alias 18438 @directory = ($matches_directory, ($property->match_subdir) 18439 ? $property->match_subdir 18440 : $property->external_name); 18441 } 18442 else { 18443 18444 @directory = $table->directory; 18445 $filename = $table->file; 18446 } 18447 18448 # Use specified filename if available, or default to property's 18449 # shortest name. We need an 8.3 safe filename (which means "an 8 18450 # safe" filename, since after the dot is only 'pl', which is < 3) 18451 # The 2nd parameter is if the filename shouldn't be changed, and 18452 # it shouldn't iff there is a hard-coded name for this table. 18453 $filename = construct_filename( 18454 $filename || $table->external_name, 18455 ! $filename, # mutable if no filename 18456 \@directory); 18457 18458 register_file_for_name($table, \@directory, $filename); 18459 18460 # Only need to write one file when shared by more than one 18461 # property 18462 next if ! $is_property 18463 && ($table->leader != $table || $table->complement != 0); 18464 18465 # Construct a nice comment to add to the file 18466 $table->set_final_comment; 18467 18468 $table->write; 18469 } 18470 18471 18472 # Write out the pod file 18473 make_pod; 18474 18475 # And Name.pm, UCD.pl 18476 make_Name_pm; 18477 make_UCD; 18478 18479 make_property_test_script() if $make_test_script; 18480 make_normalization_test_script() if $make_norm_test_script; 18481 return; 18482} 18483 18484my @white_space_separators = ( # This used only for making the test script. 18485 "", 18486 ' ', 18487 "\t", 18488 ' ' 18489 ); 18490 18491sub generate_separator($lhs) { 18492 # This used only for making the test script. It generates the colon or 18493 # equal separator between the property and property value, with random 18494 # white space surrounding the separator 18495 18496 return "" if $lhs eq ""; # No separator if there's only one (the r) side 18497 18498 # Choose space before and after randomly 18499 my $spaces_before =$white_space_separators[rand(@white_space_separators)]; 18500 my $spaces_after = $white_space_separators[rand(@white_space_separators)]; 18501 18502 # And return the whole complex, half the time using a colon, half the 18503 # equals 18504 return $spaces_before 18505 . (rand() < 0.5) ? '=' : ':' 18506 . $spaces_after; 18507} 18508 18509sub generate_tests($lhs, $rhs, $valid_code, $invalid_code, $warning) { 18510 # This used only for making the test script. It generates test cases that 18511 # are expected to compile successfully in perl. Note that the LHS and 18512 # RHS are assumed to already be as randomized as the caller wants. 18513 18514 # $lhs # The property: what's to the left of the colon 18515 # or equals separator 18516 # $rhs # The property value; what's to the right 18517 # $valid_code # A code point that's known to be in the 18518 # table given by LHS=RHS; undef if table is 18519 # empty 18520 # $invalid_code # A code point known to not be in the table; 18521 # undef if the table is all code points 18522 # $warning 18523 18524 # Get the colon or equal 18525 my $separator = generate_separator($lhs); 18526 18527 # The whole 'property=value' 18528 my $name = "$lhs$separator$rhs"; 18529 18530 my @output; 18531 # Create a complete set of tests, with complements. 18532 if (defined $valid_code) { 18533 push @output, <<"EOC" 18534Expect(1, $valid_code, '\\p{$name}', $warning); 18535Expect(0, $valid_code, '\\p{^$name}', $warning); 18536Expect(0, $valid_code, '\\P{$name}', $warning); 18537Expect(1, $valid_code, '\\P{^$name}', $warning); 18538EOC 18539 } 18540 if (defined $invalid_code) { 18541 push @output, <<"EOC" 18542Expect(0, $invalid_code, '\\p{$name}', $warning); 18543Expect(1, $invalid_code, '\\p{^$name}', $warning); 18544Expect(1, $invalid_code, '\\P{$name}', $warning); 18545Expect(0, $invalid_code, '\\P{^$name}', $warning); 18546EOC 18547 } 18548 return @output; 18549} 18550 18551sub generate_wildcard_tests($lhs, $rhs, $valid_code, $invalid_code, $warning) { 18552 # This used only for making the test script. It generates wildcardl 18553 # matching test cases that are expected to compile successfully in perl. 18554 18555 # $lhs # The property: what's to the left of the 18556 # or equals separator 18557 # $rhs # The property value; what's to the right 18558 # $valid_code # A code point that's known to be in the 18559 # table given by LHS=RHS; undef if table is 18560 # empty 18561 # $invalid_code # A code point known to not be in the table; 18562 # undef if the table is all code points 18563 # $warning 18564 18565 return if $lhs eq ""; 18566 return if $lhs =~ / ^ Is_ /x; # These are not currently supported 18567 18568 # Generate a standardized pattern, with colon being the delimitter 18569 my $wildcard = "$lhs=:\\A$rhs\\z:"; 18570 18571 my @output; 18572 push @output, "Expect(1, $valid_code, '\\p{$wildcard}', $warning);" 18573 if defined $valid_code; 18574 push @output, "Expect(0, $invalid_code, '\\p{$wildcard}', $warning);" 18575 if defined $invalid_code; 18576 return @output; 18577} 18578 18579sub generate_error($lhs, $rhs, $already_in_error=0) { 18580 # This used only for making the test script. It generates test cases that 18581 # are expected to not only not match, but to be syntax or similar errors 18582 18583 # $lhs # The property: what's to the left of the 18584 # colon or equals separator 18585 # $rhs # The property value; what's to the right 18586 # $already_in_error # Boolean; if true it's known that the 18587 # unmodified LHS and RHS will cause an error. 18588 # This routine should not force another one 18589 # Get the colon or equal 18590 my $separator = generate_separator($lhs); 18591 18592 # Since this is an error only, don't bother to randomly decide whether to 18593 # put the error on the left or right side; and assume that the RHS is 18594 # loosely matched, again for convenience rather than rigor. 18595 $rhs = randomize_loose_name($rhs, 'ERROR') unless $already_in_error; 18596 18597 my $property = $lhs . $separator . $rhs; 18598 18599 return <<"EOC"; 18600Error('\\p{$property}'); 18601Error('\\P{$property}'); 18602EOC 18603} 18604 18605# These are used only for making the test script 18606# XXX Maybe should also have a bad strict seps, which includes underscore. 18607 18608my @good_loose_seps = ( 18609 " ", 18610 "-", 18611 "\t", 18612 "", 18613 "_", 18614 ); 18615my @bad_loose_seps = ( 18616 "/a/", 18617 ':=', 18618 ); 18619 18620sub randomize_stricter_name($name) { 18621 # This used only for making the test script. Take the input name and 18622 # return a randomized, but valid version of it under the stricter matching 18623 # rules. 18624 18625 # If the name looks like a number (integer, floating, or rational), do 18626 # some extra work 18627 if ($name =~ qr{ ^ ( -? ) (\d+ ( ( [./] ) \d+ )? ) $ }x) { 18628 my $sign = $1; 18629 my $number = $2; 18630 my $separator = $3; 18631 18632 # If there isn't a sign, part of the time add a plus 18633 # Note: Not testing having any denominator having a minus sign 18634 if (! $sign) { 18635 $sign = '+' if rand() <= .3; 18636 } 18637 18638 # And add 0 or more leading zeros. 18639 $name = $sign . ('0' x int rand(10)) . $number; 18640 18641 if (defined $separator) { 18642 my $extra_zeros = '0' x int rand(10); 18643 18644 if ($separator eq '.') { 18645 18646 # Similarly, add 0 or more trailing zeros after a decimal 18647 # point 18648 $name .= $extra_zeros; 18649 } 18650 else { 18651 18652 # Or, leading zeros before the denominator 18653 $name =~ s,/,/$extra_zeros,; 18654 } 18655 } 18656 } 18657 18658 # For legibility of the test, only change the case of whole sections at a 18659 # time. To do this, first split into sections. The split returns the 18660 # delimiters 18661 my @sections; 18662 for my $section (split / ( [ - + \s _ . ]+ ) /x, $name) { 18663 trace $section if main::DEBUG && $to_trace; 18664 18665 if (length $section > 1 && $section !~ /\D/) { 18666 18667 # If the section is a sequence of digits, about half the time 18668 # randomly add underscores between some of them. 18669 if (rand() > .5) { 18670 18671 # Figure out how many underscores to add. max is 1 less than 18672 # the number of digits. (But add 1 at the end to make sure 18673 # result isn't 0, and compensate earlier by subtracting 2 18674 # instead of 1) 18675 my $num_underscores = int rand(length($section) - 2) + 1; 18676 18677 # And add them evenly throughout, for convenience, not rigor 18678 use integer; 18679 my $spacing = (length($section) - 1)/ $num_underscores; 18680 my $temp = $section; 18681 $section = ""; 18682 for my $i (1 .. $num_underscores) { 18683 $section .= substr($temp, 0, $spacing, "") . '_'; 18684 } 18685 $section .= $temp; 18686 } 18687 push @sections, $section; 18688 } 18689 else { 18690 18691 # Here not a sequence of digits. Change the case of the section 18692 # randomly 18693 my $switch = int rand(4); 18694 if ($switch == 0) { 18695 push @sections, uc $section; 18696 } 18697 elsif ($switch == 1) { 18698 push @sections, lc $section; 18699 } 18700 elsif ($switch == 2) { 18701 push @sections, ucfirst $section; 18702 } 18703 else { 18704 push @sections, $section; 18705 } 18706 } 18707 } 18708 trace "returning", join "", @sections if main::DEBUG && $to_trace; 18709 return join "", @sections; 18710} 18711 18712sub randomize_loose_name($name, $want_error=0) { 18713 # This used only for making the test script 18714 18715 $name = randomize_stricter_name($name); 18716 18717 my @parts; 18718 push @parts, $good_loose_seps[rand(@good_loose_seps)]; 18719 18720 # Preserve trailing ones for the sake of not stripping the underscore from 18721 # 'L_' 18722 for my $part (split /[-\s_]+ (?= . )/, $name) { 18723 if (@parts) { 18724 if ($want_error and rand() < 0.3) { 18725 push @parts, $bad_loose_seps[rand(@bad_loose_seps)]; 18726 $want_error = 0; 18727 } 18728 else { 18729 push @parts, $good_loose_seps[rand(@good_loose_seps)]; 18730 } 18731 } 18732 push @parts, $part; 18733 } 18734 my $new = join("", @parts); 18735 trace "$name => $new" if main::DEBUG && $to_trace; 18736 18737 if ($want_error) { 18738 if (rand() >= 0.5) { 18739 $new .= $bad_loose_seps[rand(@bad_loose_seps)]; 18740 } 18741 else { 18742 $new = $bad_loose_seps[rand(@bad_loose_seps)] . $new; 18743 } 18744 } 18745 return $new; 18746} 18747 18748# Used to make sure don't generate duplicate test cases. 18749my %test_generated; 18750 18751sub make_property_test_script() { 18752 # This used only for making the test script 18753 # this written directly -- it's huge. 18754 18755 print "Making test script\n" if $verbosity >= $PROGRESS; 18756 18757 # This uses randomness to test different possibilities without testing all 18758 # possibilities. To ensure repeatability, set the seed to 0. But if 18759 # tests are added, it will perturb all later ones in the .t file 18760 srand 0; 18761 18762 $t_path = 'TestProp.pl' unless defined $t_path; # the traditional name 18763 18764 # Create a list of what the %f representation is for each rational number. 18765 # This will be used below. 18766 my @valid_base_floats = '0.0'; 18767 foreach my $e_representation (keys %nv_floating_to_rational) { 18768 push @valid_base_floats, 18769 eval $nv_floating_to_rational{$e_representation}; 18770 } 18771 18772 # It doesn't matter whether the elements of this array contain single lines 18773 # or multiple lines. main::write doesn't count the lines. 18774 my @output; 18775 18776 push @output, <<'EOF_CODE'; 18777Error('\p{Script=InGreek}'); # Bug #69018 18778Test_GCB("1100 $nobreak 1161"); # Bug #70940 18779Expect(0, 0x2028, '\p{Print}', ""); # Bug # 71722 18780Expect(0, 0x2029, '\p{Print}', ""); # Bug # 71722 18781Expect(1, 0xFF10, '\p{XDigit}', ""); # Bug # 71726 18782Error('\p{InKana}'); # 'Kana' is not a block so InKana shouldn't compile 18783 18784# Make sure this gets tested; it was not part of the official test suite at 18785# the time this was added. Note that this is as it would appear in the 18786# official suite, and gets modified to check for the perl tailoring by 18787# Test_WB() 18788Test_WB("$breakable 0020 $breakable 0020 $breakable 0308 $breakable"); 18789Test_LB("$nobreak 200B $nobreak 0020 $nobreak 0020 $breakable 2060 $breakable"); 18790Expect(1, ord(" "), '\p{gc=:(?aa)s:}', ""); # /aa is valid 18791Expect(1, ord(" "), '\p{gc=:(?-s)s:}', ""); # /-s is valid 18792EOF_CODE 18793 18794 # Sort these so get results in same order on different runs of this 18795 # program 18796 foreach my $property (sort { $a->has_dependency <=> $b->has_dependency 18797 or 18798 lc $a->name cmp lc $b->name 18799 } property_ref('*')) 18800 { 18801 # Non-binary properties should not match \p{}; Test all for that. 18802 if ($property->type != $BINARY && $property->type != $FORCED_BINARY) { 18803 my @property_aliases = grep { $_->status ne $INTERNAL_ALIAS } 18804 $property->aliases; 18805 foreach my $property_alias ($property->aliases) { 18806 my $name = standardize($property_alias->name); 18807 18808 # But some names are ambiguous, meaning a binary property with 18809 # the same name when used in \p{}, and a different 18810 # (non-binary) property in other contexts. 18811 next if grep { $name eq $_ } keys %ambiguous_names; 18812 18813 push @output, <<"EOF_CODE"; 18814Error('\\p{$name}'); 18815Error('\\P{$name}'); 18816EOF_CODE 18817 } 18818 } 18819 foreach my $table (sort { $a->has_dependency <=> $b->has_dependency 18820 or 18821 lc $a->name cmp lc $b->name 18822 } $property->tables) 18823 { 18824 18825 # Find code points that match, and don't match this table. 18826 my $valid = $table->get_valid_code_point; 18827 my $invalid = $table->get_invalid_code_point; 18828 my $warning = ($table->status eq $DEPRECATED) 18829 ? "'deprecated'" 18830 : '""'; 18831 18832 # Test each possible combination of the property's aliases with 18833 # the table's. If this gets to be too many, could do what is done 18834 # in the set_final_comment() for Tables 18835 my @table_aliases = grep { $_->status ne $INTERNAL_ALIAS } $table->aliases; 18836 next unless @table_aliases; 18837 my @property_aliases = grep { $_->status ne $INTERNAL_ALIAS } $table->property->aliases; 18838 next unless @property_aliases; 18839 18840 # Every property can be optionally be prefixed by 'Is_', so test 18841 # that those work, by creating such a new alias for each 18842 # pre-existing one. 18843 push @property_aliases, map { Alias->new("Is_" . $_->name, 18844 $_->loose_match, 18845 $_->make_re_pod_entry, 18846 $_->ok_as_filename, 18847 $_->status, 18848 $_->ucd, 18849 ) 18850 } @property_aliases; 18851 my $max = max(scalar @table_aliases, scalar @property_aliases); 18852 for my $j (0 .. $max - 1) { 18853 18854 # The current alias for property is the next one on the list, 18855 # or if beyond the end, start over. Similarly for table 18856 my $property_name 18857 = $property_aliases[$j % @property_aliases]->name; 18858 18859 $property_name = "" if $table->property == $perl; 18860 my $table_alias = $table_aliases[$j % @table_aliases]; 18861 my $table_name = $table_alias->name; 18862 my $loose_match = $table_alias->loose_match; 18863 18864 # If the table doesn't have a file, any test for it is 18865 # already guaranteed to be in error 18866 my $already_error = ! $table->file_path; 18867 18868 # A table that begins with these could actually be a 18869 # user-defined property, so won't be compile time errors, as 18870 # the definitions of those can be deferred until runtime 18871 next if $already_error && $table_name =~ / ^ I[ns] /x; 18872 18873 # Generate error cases for this alias. 18874 push @output, generate_error($property_name, 18875 $table_name, 18876 $already_error); 18877 18878 # If the table is guaranteed to always generate an error, 18879 # quit now without generating success cases. 18880 next if $already_error; 18881 18882 # Now for the success cases. First, wildcard matching, as it 18883 # shouldn't have any randomization. 18884 if ($table_alias->status eq $NORMAL) { 18885 push @output, generate_wildcard_tests($property_name, 18886 $table_name, 18887 $valid, 18888 $invalid, 18889 $warning, 18890 ); 18891 } 18892 my $random; 18893 if ($loose_match) { 18894 18895 # For loose matching, create an extra test case for the 18896 # standard name. 18897 my $standard = standardize($table_name); 18898 18899 # $test_name should be a unique combination for each test 18900 # case; used just to avoid duplicate tests 18901 my $test_name = "$property_name=$standard"; 18902 18903 # Don't output duplicate test cases. 18904 if (! exists $test_generated{$test_name}) { 18905 $test_generated{$test_name} = 1; 18906 push @output, generate_tests($property_name, 18907 $standard, 18908 $valid, 18909 $invalid, 18910 $warning, 18911 ); 18912 if ($table_alias->status eq $NORMAL) { 18913 push @output, generate_wildcard_tests( 18914 $property_name, 18915 $standard, 18916 $valid, 18917 $invalid, 18918 $warning, 18919 ); 18920 } 18921 } 18922 $random = randomize_loose_name($table_name) 18923 } 18924 else { # Stricter match 18925 $random = randomize_stricter_name($table_name); 18926 } 18927 18928 # Now for the main test case for this alias. 18929 my $test_name = "$property_name=$random"; 18930 if (! exists $test_generated{$test_name}) { 18931 $test_generated{$test_name} = 1; 18932 push @output, generate_tests($property_name, 18933 $random, 18934 $valid, 18935 $invalid, 18936 $warning, 18937 ); 18938 18939 if ($property->name eq 'nv') { 18940 if ($table_name !~ qr{/}) { 18941 push @output, generate_tests($property_name, 18942 sprintf("%.15e", $table_name), 18943 $valid, 18944 $invalid, 18945 $warning, 18946 ); 18947 } 18948 else { 18949 # If the name is a rational number, add tests for a 18950 # non-reduced form, and for a floating point equivalent. 18951 18952 # 60 is a number divisible by a bunch of things 18953 my ($numerator, $denominator) = $table_name 18954 =~ m! (.+) / (.+) !x; 18955 $numerator *= 60; 18956 $denominator *= 60; 18957 push @output, generate_tests($property_name, 18958 "$numerator/$denominator", 18959 $valid, 18960 $invalid, 18961 $warning, 18962 ); 18963 18964 # Calculate the float, and the %e representation 18965 my $float = eval $table_name; 18966 my $e_representation = sprintf("%.*e", 18967 $E_FLOAT_PRECISION, $float); 18968 # Parse that 18969 my ($non_zeros, $zeros, $exponent_sign, $exponent) 18970 = $e_representation 18971 =~ / -? [1-9] \. (\d*?) (0*) e ([+-]) (\d+) /x; 18972 my $min_e_precision; 18973 my $min_f_precision; 18974 18975 if ($exponent_sign eq '+' && $exponent != 0) { 18976 Carp::my_carp_bug("Not yet equipped to handle" 18977 . " positive exponents"); 18978 return; 18979 } 18980 else { 18981 # We're trying to find the minimum precision that 18982 # is needed to indicate this particular rational 18983 # for the given $E_FLOAT_PRECISION. For %e, any 18984 # trailing zeros, like 1.500e-02 aren't needed, so 18985 # the correct value is how many non-trailing zeros 18986 # there are after the decimal point. 18987 $min_e_precision = length $non_zeros; 18988 18989 # For %f, like .01500, we want at least 18990 # $E_FLOAT_PRECISION digits, but any trailing 18991 # zeros aren't needed, so we can subtract the 18992 # length of those. But we also need to include 18993 # the zeros after the decimal point, but before 18994 # the first significant digit. 18995 $min_f_precision = $E_FLOAT_PRECISION 18996 + $exponent 18997 - length $zeros; 18998 } 18999 19000 # Make tests for each possible precision from 1 to 19001 # just past the worst case. 19002 my $upper_limit = ($min_e_precision > $min_f_precision) 19003 ? $min_e_precision 19004 : $min_f_precision; 19005 19006 for my $i (1 .. $upper_limit + 1) { 19007 for my $format ("e", "f") { 19008 my $this_table 19009 = sprintf("%.*$format", $i, $float); 19010 19011 # If we don't have enough precision digits, 19012 # make a fail test; otherwise a pass test. 19013 my $pass = ($format eq "e") 19014 ? $i >= $min_e_precision 19015 : $i >= $min_f_precision; 19016 if ($pass) { 19017 push @output, generate_tests($property_name, 19018 $this_table, 19019 $valid, 19020 $invalid, 19021 $warning, 19022 ); 19023 } 19024 elsif ( $format eq "e" 19025 19026 # Here we would fail, but in the %f 19027 # case, the representation at this 19028 # precision could actually be a 19029 # valid one for some other rational 19030 || ! grep { $this_table 19031 =~ / ^ $_ 0* $ /x } 19032 @valid_base_floats) 19033 { 19034 push @output, 19035 generate_error($property_name, 19036 $this_table, 19037 1 # 1 => already an 19038 # error 19039 ); 19040 } 19041 } 19042 } 19043 } 19044 } 19045 } 19046 } 19047 $table->DESTROY(); 19048 } 19049 $property->DESTROY(); 19050 } 19051 19052 # Make any test of the boundary (break) properties TODO if the code 19053 # doesn't match the version being compiled 19054 my $TODO_FAILING_BREAKS = ($version_of_mk_invlist_bounds ne $v_version) 19055 ? "\nsub TODO_FAILING_BREAKS { 1 }\n" 19056 : "\nsub TODO_FAILING_BREAKS { 0 }\n"; 19057 19058 @output= map { 19059 map s/^/ /mgr, 19060 map "$_;\n", 19061 split /;\n/, $_ 19062 } @output; 19063 19064 # Cause there to be 'if' statements to only execute a portion of this 19065 # long-running test each time, so that we can have a bunch of .t's running 19066 # in parallel 19067 my $chunks = 10 # Number of test files 19068 - 1 # For GCB & SB 19069 - 1 # For WB 19070 - 4; # LB split into this many files 19071 my @output_chunked; 19072 my $chunk_count=0; 19073 my $chunk_size= int(@output / $chunks) + 1; 19074 while (@output) { 19075 $chunk_count++; 19076 my @chunk= splice @output, 0, $chunk_size; 19077 push @output_chunked, 19078 "if (!\$::TESTCHUNK or \$::TESTCHUNK == $chunk_count) {\n", 19079 @chunk, 19080 "}\n"; 19081 } 19082 19083 $chunk_count++; 19084 push @output_chunked, 19085 "if (!\$::TESTCHUNK or \$::TESTCHUNK == $chunk_count) {\n", 19086 (map {" Test_GCB('$_');\n"} @backslash_X_tests), 19087 (map {" Test_SB('$_');\n"} @SB_tests), 19088 "}\n"; 19089 19090 19091 $chunk_size= int(@LB_tests / 4) + 1; 19092 @LB_tests = map {" Test_LB('$_');\n"} @LB_tests; 19093 while (@LB_tests) { 19094 $chunk_count++; 19095 my @chunk= splice @LB_tests, 0, $chunk_size; 19096 push @output_chunked, 19097 "if (!\$::TESTCHUNK or \$::TESTCHUNK == $chunk_count) {\n", 19098 @chunk, 19099 "}\n"; 19100 } 19101 19102 $chunk_count++; 19103 push @output_chunked, 19104 "if (!\$::TESTCHUNK or \$::TESTCHUNK == $chunk_count) {\n", 19105 (map {" Test_WB('$_');\n"} @WB_tests), 19106 "}\n"; 19107 19108 &write($t_path, 19109 0, # Not utf8; 19110 [$HEADER, 19111 $TODO_FAILING_BREAKS, 19112 <DATA>, 19113 @output_chunked, 19114 "Finished();\n", 19115 ]); 19116 19117 return; 19118} 19119 19120sub make_normalization_test_script() { 19121 print "Making normalization test script\n" if $verbosity >= $PROGRESS; 19122 19123 my $n_path = 'TestNorm.pl'; 19124 19125 unshift @normalization_tests, <<'END'; 19126use utf8; 19127use Test::More; 19128 19129sub ord_string { # Convert packed ords to printable string 19130 use charnames (); 19131 return "'" . join("", map { '\N{' . charnames::viacode($_) . '}' } 19132 unpack "U*", shift) . "'"; 19133 #return "'" . join(" ", map { sprintf "%04X", $_ } unpack "U*", shift) . "'"; 19134} 19135 19136sub Test_N { 19137 my ($source, $nfc, $nfd, $nfkc, $nfkd) = @_; 19138 my $display_source = ord_string($source); 19139 my $display_nfc = ord_string($nfc); 19140 my $display_nfd = ord_string($nfd); 19141 my $display_nfkc = ord_string($nfkc); 19142 my $display_nfkd = ord_string($nfkd); 19143 19144 use Unicode::Normalize; 19145 # NFC 19146 # nfc == toNFC(source) == toNFC(nfc) == toNFC(nfd) 19147 # nfkc == toNFC(nfkc) == toNFC(nfkd) 19148 # 19149 # NFD 19150 # nfd == toNFD(source) == toNFD(nfc) == toNFD(nfd) 19151 # nfkd == toNFD(nfkc) == toNFD(nfkd) 19152 # 19153 # NFKC 19154 # nfkc == toNFKC(source) == toNFKC(nfc) == toNFKC(nfd) == 19155 # toNFKC(nfkc) == toNFKC(nfkd) 19156 # 19157 # NFKD 19158 # nfkd == toNFKD(source) == toNFKD(nfc) == toNFKD(nfd) == 19159 # toNFKD(nfkc) == toNFKD(nfkd) 19160 19161 is(NFC($source), $nfc, "NFC($display_source) eq $display_nfc"); 19162 is(NFC($nfc), $nfc, "NFC($display_nfc) eq $display_nfc"); 19163 is(NFC($nfd), $nfc, "NFC($display_nfd) eq $display_nfc"); 19164 is(NFC($nfkc), $nfkc, "NFC($display_nfkc) eq $display_nfkc"); 19165 is(NFC($nfkd), $nfkc, "NFC($display_nfkd) eq $display_nfkc"); 19166 19167 is(NFD($source), $nfd, "NFD($display_source) eq $display_nfd"); 19168 is(NFD($nfc), $nfd, "NFD($display_nfc) eq $display_nfd"); 19169 is(NFD($nfd), $nfd, "NFD($display_nfd) eq $display_nfd"); 19170 is(NFD($nfkc), $nfkd, "NFD($display_nfkc) eq $display_nfkd"); 19171 is(NFD($nfkd), $nfkd, "NFD($display_nfkd) eq $display_nfkd"); 19172 19173 is(NFKC($source), $nfkc, "NFKC($display_source) eq $display_nfkc"); 19174 is(NFKC($nfc), $nfkc, "NFKC($display_nfc) eq $display_nfkc"); 19175 is(NFKC($nfd), $nfkc, "NFKC($display_nfd) eq $display_nfkc"); 19176 is(NFKC($nfkc), $nfkc, "NFKC($display_nfkc) eq $display_nfkc"); 19177 is(NFKC($nfkd), $nfkc, "NFKC($display_nfkd) eq $display_nfkc"); 19178 19179 is(NFKD($source), $nfkd, "NFKD($display_source) eq $display_nfkd"); 19180 is(NFKD($nfc), $nfkd, "NFKD($display_nfc) eq $display_nfkd"); 19181 is(NFKD($nfd), $nfkd, "NFKD($display_nfd) eq $display_nfkd"); 19182 is(NFKD($nfkc), $nfkd, "NFKD($display_nfkc) eq $display_nfkd"); 19183 is(NFKD($nfkd), $nfkd, "NFKD($display_nfkd) eq $display_nfkd"); 19184} 19185END 19186 19187 &write($n_path, 19188 1, # Is utf8; 19189 [ 19190 @normalization_tests, 19191 'done_testing();' 19192 ]); 19193 return; 19194} 19195 19196# Skip reasons, so will be exact same text and hence the files with each 19197# reason will get grouped together in perluniprops. 19198my $Documentation = "Documentation"; 19199my $Indic_Skip 19200 = "Provisional; for the analysis and processing of Indic scripts"; 19201my $Validation = "Validation Tests"; 19202my $Validation_Documentation = "Documentation of validation Tests"; 19203 19204# This is a list of the input files and how to handle them. The files are 19205# processed in their order in this list. Some reordering is possible if 19206# desired, but the PropertyAliases and PropValueAliases files should be first, 19207# and the extracted before the others (as data in an extracted file can be 19208# over-ridden by the non-extracted. Some other files depend on data derived 19209# from an earlier file, like UnicodeData requires data from Jamo, and the case 19210# changing and folding requires data from Unicode. Mostly, it is safest to 19211# order by first version releases in (except the Jamo). 19212# 19213# The version strings allow the program to know whether to expect a file or 19214# not, but if a file exists in the directory, it will be processed, even if it 19215# is in a version earlier than expected, so you can copy files from a later 19216# release into an earlier release's directory. 19217my @input_file_objects = ( 19218 Input_file->new('PropertyAliases.txt', v3.2, 19219 Handler => \&process_PropertyAliases, 19220 Early => [ \&substitute_PropertyAliases ], 19221 Required_Even_in_Debug_Skip => 1, 19222 ), 19223 Input_file->new(undef, v0, # No file associated with this 19224 Progress_Message => 'Finishing property setup', 19225 Handler => \&finish_property_setup, 19226 ), 19227 Input_file->new('PropValueAliases.txt', v3.2, 19228 Handler => \&process_PropValueAliases, 19229 Early => [ \&substitute_PropValueAliases ], 19230 Has_Missings_Defaults => $NOT_IGNORED, 19231 Required_Even_in_Debug_Skip => 1, 19232 ), 19233 Input_file->new("${EXTRACTED}DGeneralCategory.txt", v3.1.0, 19234 Property => 'General_Category', 19235 ), 19236 Input_file->new("${EXTRACTED}DCombiningClass.txt", v3.1.0, 19237 Property => 'Canonical_Combining_Class', 19238 Has_Missings_Defaults => $NOT_IGNORED, 19239 ), 19240 Input_file->new("${EXTRACTED}DNumType.txt", v3.1.0, 19241 Property => 'Numeric_Type', 19242 Has_Missings_Defaults => $NOT_IGNORED, 19243 ), 19244 Input_file->new("${EXTRACTED}DEastAsianWidth.txt", v3.1.0, 19245 Property => 'East_Asian_Width', 19246 Has_Missings_Defaults => $NOT_IGNORED, 19247 ), 19248 Input_file->new("${EXTRACTED}DLineBreak.txt", v3.1.0, 19249 Property => 'Line_Break', 19250 Has_Missings_Defaults => $NOT_IGNORED, 19251 ), 19252 Input_file->new("${EXTRACTED}DBidiClass.txt", v3.1.1, 19253 Property => 'Bidi_Class', 19254 Has_Missings_Defaults => $NOT_IGNORED, 19255 ), 19256 Input_file->new("${EXTRACTED}DDecompositionType.txt", v3.1.0, 19257 Property => 'Decomposition_Type', 19258 Has_Missings_Defaults => $NOT_IGNORED, 19259 ), 19260 Input_file->new("${EXTRACTED}DBinaryProperties.txt", v3.1.0), 19261 Input_file->new("${EXTRACTED}DNumValues.txt", v3.1.0, 19262 Property => 'Numeric_Value', 19263 Each_Line_Handler => \&filter_numeric_value_line, 19264 Has_Missings_Defaults => $NOT_IGNORED, 19265 ), 19266 Input_file->new("${EXTRACTED}DJoinGroup.txt", v3.1.0, 19267 Property => 'Joining_Group', 19268 Has_Missings_Defaults => $NOT_IGNORED, 19269 ), 19270 19271 Input_file->new("${EXTRACTED}DJoinType.txt", v3.1.0, 19272 Property => 'Joining_Type', 19273 Has_Missings_Defaults => $NOT_IGNORED, 19274 ), 19275 Input_file->new("${EXTRACTED}DName.txt", v10.0.0, 19276 Skip => 'This file adds no new information not already' 19277 . ' present in other files', 19278 # And it's unnecessary programmer work to handle this new 19279 # format. Previous Derived files actually had bug fixes 19280 # in them that were useful, but that should not be the 19281 # case here. 19282 ), 19283 Input_file->new('Jamo.txt', v2.0.0, 19284 Property => 'Jamo_Short_Name', 19285 Each_Line_Handler => \&filter_jamo_line, 19286 ), 19287 Input_file->new('UnicodeData.txt', v1.1.5, 19288 Pre_Handler => \&setup_UnicodeData, 19289 19290 # We clean up this file for some early versions. 19291 Each_Line_Handler => [ (($v_version lt v2.0.0 ) 19292 ? \&filter_v1_ucd 19293 : ($v_version eq v2.1.5) 19294 ? \&filter_v2_1_5_ucd 19295 19296 # And for 5.14 Perls with 6.0, 19297 # have to also make changes 19298 : ($v_version ge v6.0.0 19299 && $^V lt v5.17.0) 19300 ? \&filter_v6_ucd 19301 : undef), 19302 19303 # Early versions did not have the 19304 # proper Unicode_1 names for the 19305 # controls 19306 (($v_version lt v3.0.0) 19307 ? \&filter_early_U1_names 19308 : undef), 19309 19310 # Early versions did not correctly 19311 # use the later method for giving 19312 # decimal digit values 19313 (($v_version le v3.2.0) 19314 ? \&filter_bad_Nd_ucd 19315 : undef), 19316 19317 # And the main filter 19318 \&filter_UnicodeData_line, 19319 ], 19320 EOF_Handler => \&EOF_UnicodeData, 19321 ), 19322 Input_file->new('CJKXREF.TXT', v1.1.5, 19323 Withdrawn => v2.0.0, 19324 Skip => 'Gives the mapping of CJK code points ' 19325 . 'between Unicode and various other standards', 19326 ), 19327 Input_file->new('ArabicShaping.txt', v2.0.0, 19328 Each_Line_Handler => 19329 ($v_version lt 4.1.0) 19330 ? \&filter_old_style_arabic_shaping 19331 : undef, 19332 # The first field after the range is a "schematic name" 19333 # not used by Perl 19334 Properties => [ '<ignored>', 'Joining_Type', 'Joining_Group' ], 19335 Has_Missings_Defaults => $NOT_IGNORED, 19336 ), 19337 Input_file->new('Blocks.txt', v2.0.0, 19338 Property => 'Block', 19339 Has_Missings_Defaults => $NOT_IGNORED, 19340 Each_Line_Handler => \&filter_blocks_lines 19341 ), 19342 Input_file->new('Index.txt', v2.0.0, 19343 Skip => 'Alphabetical index of Unicode characters', 19344 ), 19345 Input_file->new('NamesList.txt', v2.0.0, 19346 Skip => 'Annotated list of characters', 19347 ), 19348 Input_file->new('PropList.txt', v2.0.0, 19349 Each_Line_Handler => (($v_version lt v3.1.0) 19350 ? \&filter_old_style_proplist 19351 : undef), 19352 ), 19353 Input_file->new('Props.txt', v2.0.0, 19354 Withdrawn => v3.0.0, 19355 Skip => 'A subset of F<PropList.txt> (which is used instead)', 19356 ), 19357 Input_file->new('ReadMe.txt', v2.0.0, 19358 Skip => $Documentation, 19359 ), 19360 Input_file->new('Unihan.txt', v2.0.0, 19361 Withdrawn => v5.2.0, 19362 Construction_Time_Handler => \&construct_unihan, 19363 Pre_Handler => \&setup_unihan, 19364 Optional => [ "", 19365 'Unicode_Radical_Stroke' 19366 ], 19367 Each_Line_Handler => \&filter_unihan_line, 19368 ), 19369 Input_file->new('SpecialCasing.txt', v2.1.8, 19370 Each_Line_Handler => ($v_version eq 2.1.8) 19371 ? \&filter_2_1_8_special_casing_line 19372 : \&filter_special_casing_line, 19373 Pre_Handler => \&setup_special_casing, 19374 Has_Missings_Defaults => $IGNORED, 19375 ), 19376 Input_file->new( 19377 'LineBreak.txt', v3.0.0, 19378 Has_Missings_Defaults => $NOT_IGNORED, 19379 Property => 'Line_Break', 19380 # Early versions had problematic syntax 19381 Each_Line_Handler => ($v_version ge v3.1.0) 19382 ? undef 19383 : ($v_version lt v3.0.0) 19384 ? \&filter_substitute_lb 19385 : \&filter_early_ea_lb, 19386 # Must use long names for property values see comments at 19387 # sub filter_substitute_lb 19388 Early => [ "LBsubst.txt", '_Perl_LB', 'Alphabetic', 19389 'Alphabetic', # default to this because XX -> 19390 # AL 19391 19392 # Don't use _Perl_LB as a synonym for 19393 # Line_Break in later perls, as it is tailored 19394 # and isn't the same as Line_Break 19395 'ONLY_EARLY' ], 19396 ), 19397 Input_file->new('EastAsianWidth.txt', v3.0.0, 19398 Property => 'East_Asian_Width', 19399 Has_Missings_Defaults => $NOT_IGNORED, 19400 # Early versions had problematic syntax 19401 Each_Line_Handler => (($v_version lt v3.1.0) 19402 ? \&filter_early_ea_lb 19403 : undef), 19404 ), 19405 Input_file->new('CompositionExclusions.txt', v3.0.0, 19406 Property => 'Composition_Exclusion', 19407 ), 19408 Input_file->new('UnicodeData.html', v3.0.0, 19409 Withdrawn => v4.0.1, 19410 Skip => $Documentation, 19411 ), 19412 Input_file->new('BidiMirroring.txt', v3.0.1, 19413 Property => 'Bidi_Mirroring_Glyph', 19414 Has_Missings_Defaults => ($v_version lt v6.2.0) 19415 ? $NO_DEFAULTS 19416 # Is <none> which doesn't mean 19417 # anything to us, we will use the 19418 # null string 19419 : $IGNORED, 19420 ), 19421 Input_file->new('NamesList.html', v3.0.0, 19422 Skip => 'Describes the format and contents of ' 19423 . 'F<NamesList.txt>', 19424 ), 19425 Input_file->new('UnicodeCharacterDatabase.html', v3.0.0, 19426 Withdrawn => v5.1, 19427 Skip => $Documentation, 19428 ), 19429 Input_file->new('CaseFolding.txt', v3.0.1, 19430 Pre_Handler => \&setup_case_folding, 19431 Each_Line_Handler => 19432 [ ($v_version lt v3.1.0) 19433 ? \&filter_old_style_case_folding 19434 : undef, 19435 \&filter_case_folding_line 19436 ], 19437 Has_Missings_Defaults => $IGNORED, 19438 ), 19439 Input_file->new("NormTest.txt", v3.0.1, 19440 Handler => \&process_NormalizationsTest, 19441 Skip => ($make_norm_test_script) ? 0 : $Validation, 19442 ), 19443 Input_file->new('DCoreProperties.txt', v3.1.0, 19444 # 5.2 changed this file 19445 Has_Missings_Defaults => (($v_version ge v5.2.0) 19446 ? $NOT_IGNORED 19447 : $NO_DEFAULTS), 19448 ), 19449 Input_file->new('DProperties.html', v3.1.0, 19450 Withdrawn => v3.2.0, 19451 Skip => $Documentation, 19452 ), 19453 Input_file->new('PropList.html', v3.1.0, 19454 Withdrawn => v5.1, 19455 Skip => $Documentation, 19456 ), 19457 Input_file->new('Scripts.txt', v3.1.0, 19458 Property => 'Script', 19459 Each_Line_Handler => (($v_version le v4.0.0) 19460 ? \&filter_all_caps_script_names 19461 : undef), 19462 Has_Missings_Defaults => $NOT_IGNORED, 19463 ), 19464 Input_file->new('DNormalizationProps.txt', v3.1.0, 19465 Has_Missings_Defaults => $NOT_IGNORED, 19466 Each_Line_Handler => (($v_version lt v4.0.1) 19467 ? \&filter_old_style_normalization_lines 19468 : undef), 19469 ), 19470 Input_file->new('DerivedProperties.html', v3.1.1, 19471 Withdrawn => v5.1, 19472 Skip => $Documentation, 19473 ), 19474 Input_file->new('DAge.txt', v3.2.0, 19475 Has_Missings_Defaults => $NOT_IGNORED, 19476 Property => 'Age' 19477 ), 19478 Input_file->new('HangulSyllableType.txt', v4.0, 19479 Has_Missings_Defaults => $NOT_IGNORED, 19480 Early => [ \&generate_hst, 'Hangul_Syllable_Type' ], 19481 Property => 'Hangul_Syllable_Type' 19482 ), 19483 Input_file->new('NormalizationCorrections.txt', v3.2.0, 19484 # This documents the cumulative fixes to erroneous 19485 # normalizations in earlier Unicode versions. Its main 19486 # purpose is so that someone running on an earlier 19487 # version can use this file to override what got 19488 # published in that earlier release. It would be easy 19489 # for mktables to handle this file. But all the 19490 # corrections in it should already be in the other files 19491 # for the release it is. To get it to actually mean 19492 # something useful, someone would have to be using an 19493 # earlier Unicode release, and copy it into the directory 19494 # for that release and recompile. So far there has been 19495 # no demand to do that, so this hasn't been implemented. 19496 Skip => 'Documentation of corrections already ' 19497 . 'incorporated into the Unicode data base', 19498 ), 19499 Input_file->new('StandardizedVariants.html', v3.2.0, 19500 Skip => 'Obsoleted as of Unicode 9.0, but previously ' 19501 . 'provided a visual display of the standard ' 19502 . 'variant sequences derived from ' 19503 . 'F<StandardizedVariants.txt>.', 19504 # I don't know why the html came earlier than the 19505 # .txt, but both are skipped anyway, so it doesn't 19506 # matter. 19507 ), 19508 Input_file->new('StandardizedVariants.txt', v4.0.0, 19509 Skip => 'Certain glyph variations for character display ' 19510 . 'are standardized. This lists the non-Unihan ' 19511 . 'ones; the Unihan ones are also not used by ' 19512 . 'Perl, and are in a separate Unicode data base ' 19513 . 'L<http://www.unicode.org/ivd>', 19514 ), 19515 Input_file->new('UCD.html', v4.0.0, 19516 Withdrawn => v5.2, 19517 Skip => $Documentation, 19518 ), 19519 Input_file->new("$AUXILIARY/WordBreakProperty.txt", v4.1.0, 19520 Early => [ "WBsubst.txt", '_Perl_WB', 'ALetter' ], 19521 Property => 'Word_Break', 19522 Has_Missings_Defaults => $NOT_IGNORED, 19523 ), 19524 Input_file->new("$AUXILIARY/GraphemeBreakProperty.txt", v4.1, 19525 Early => [ \&generate_GCB, '_Perl_GCB' ], 19526 Property => 'Grapheme_Cluster_Break', 19527 Has_Missings_Defaults => $NOT_IGNORED, 19528 ), 19529 Input_file->new("$AUXILIARY/GCBTest.txt", v4.1.0, 19530 Handler => \&process_GCB_test, 19531 retain_trailing_comments => 1, 19532 ), 19533 Input_file->new("$AUXILIARY/GraphemeBreakTest.html", v4.1.0, 19534 Skip => $Validation_Documentation, 19535 ), 19536 Input_file->new("$AUXILIARY/SBTest.txt", v4.1.0, 19537 Handler => \&process_SB_test, 19538 retain_trailing_comments => 1, 19539 ), 19540 Input_file->new("$AUXILIARY/SentenceBreakTest.html", v4.1.0, 19541 Skip => $Validation_Documentation, 19542 ), 19543 Input_file->new("$AUXILIARY/WBTest.txt", v4.1.0, 19544 Handler => \&process_WB_test, 19545 retain_trailing_comments => 1, 19546 ), 19547 Input_file->new("$AUXILIARY/WordBreakTest.html", v4.1.0, 19548 Skip => $Validation_Documentation, 19549 ), 19550 Input_file->new("$AUXILIARY/SentenceBreakProperty.txt", v4.1.0, 19551 Property => 'Sentence_Break', 19552 Early => [ "SBsubst.txt", '_Perl_SB', 'OLetter' ], 19553 Has_Missings_Defaults => $NOT_IGNORED, 19554 ), 19555 Input_file->new('NamedSequences.txt', v4.1.0, 19556 Handler => \&process_NamedSequences 19557 ), 19558 Input_file->new('Unihan.html', v4.1.0, 19559 Withdrawn => v5.2, 19560 Skip => $Documentation, 19561 ), 19562 Input_file->new('NameAliases.txt', v5.0, 19563 Property => 'Name_Alias', 19564 Each_Line_Handler => ($v_version le v6.0.0) 19565 ? \&filter_early_version_name_alias_line 19566 : \&filter_later_version_name_alias_line, 19567 ), 19568 # NameAliases.txt came along in v5.0. The above constructor handles 19569 # this. But until 6.1, it was lacking some information needed by core 19570 # perl. The constructor below handles that. It is either a kludge or 19571 # clever, depending on your point of view. The 'Withdrawn' parameter 19572 # indicates not to use it at all starting in 6.1 (so the above 19573 # constructor applies), and the 'v6.1' parameter indicates to use the 19574 # Early parameter before 6.1. Therefore 'Early" is always used, 19575 # yielding the internal-only property '_Perl_Name_Alias', which it 19576 # gets from a NameAliases.txt from 6.1 or later stored in 19577 # N_Asubst.txt. In combination with the above constructor, 19578 # 'Name_Alias' is publicly accessible starting with v5.0, and the 19579 # better 6.1 version is accessible to perl core in all releases. 19580 Input_file->new("NameAliases.txt", v6.1, 19581 Withdrawn => v6.1, 19582 Early => [ "N_Asubst.txt", '_Perl_Name_Alias', "" ], 19583 Property => 'Name_Alias', 19584 EOF_Handler => \&fixup_early_perl_name_alias, 19585 Each_Line_Handler => 19586 \&filter_later_version_name_alias_line, 19587 ), 19588 Input_file->new('NamedSqProv.txt', v5.0.0, 19589 Skip => 'Named sequences proposed for inclusion in a ' 19590 . 'later version of the Unicode Standard; if you ' 19591 . 'need them now, you can append this file to ' 19592 . 'F<NamedSequences.txt> and recompile perl', 19593 ), 19594 Input_file->new("$AUXILIARY/LBTest.txt", v5.1.0, 19595 Handler => \&process_LB_test, 19596 retain_trailing_comments => 1, 19597 ), 19598 Input_file->new("$AUXILIARY/LineBreakTest.html", v5.1.0, 19599 Skip => $Validation_Documentation, 19600 ), 19601 Input_file->new("BidiTest.txt", v5.2.0, 19602 Skip => $Validation, 19603 ), 19604 Input_file->new('UnihanIndicesDictionary.txt', v5.2.0, 19605 Optional => "", 19606 Each_Line_Handler => \&filter_unihan_line, 19607 ), 19608 Input_file->new('UnihanDataDictionaryLike.txt', v5.2.0, 19609 Optional => "", 19610 Each_Line_Handler => \&filter_unihan_line, 19611 ), 19612 Input_file->new('UnihanIRGSources.txt', v5.2.0, 19613 Optional => [ "", 19614 'kCompatibilityVariant', 19615 'kIICore', 19616 'kIRG_GSource', 19617 'kIRG_HSource', 19618 'kIRG_JSource', 19619 'kIRG_KPSource', 19620 'kIRG_MSource', 19621 'kIRG_KSource', 19622 'kIRG_SSource', 19623 'kIRG_TSource', 19624 'kIRG_USource', 19625 'kIRG_UKSource', 19626 'kIRG_VSource', 19627 ], 19628 Pre_Handler => \&setup_unihan, 19629 Each_Line_Handler => \&filter_unihan_line, 19630 ), 19631 Input_file->new('UnihanNumericValues.txt', v5.2.0, 19632 Optional => [ "", 19633 'kAccountingNumeric', 19634 'kOtherNumeric', 19635 'kPrimaryNumeric', 19636 ], 19637 Each_Line_Handler => \&filter_unihan_line, 19638 ), 19639 Input_file->new('UnihanOtherMappings.txt', v5.2.0, 19640 Optional => "", 19641 Each_Line_Handler => \&filter_unihan_line, 19642 ), 19643 Input_file->new('UnihanRadicalStrokeCounts.txt', v5.2.0, 19644 Optional => [ "", 19645 'Unicode_Radical_Stroke' 19646 ], 19647 Each_Line_Handler => \&filter_unihan_line, 19648 ), 19649 Input_file->new('UnihanReadings.txt', v5.2.0, 19650 Optional => "", 19651 Each_Line_Handler => \&filter_unihan_line, 19652 ), 19653 Input_file->new('UnihanVariants.txt', v5.2.0, 19654 Optional => "", 19655 Each_Line_Handler => \&filter_unihan_line, 19656 ), 19657 Input_file->new('CJKRadicals.txt', v5.2.0, 19658 Skip => 'Maps the kRSUnicode property values to ' 19659 . 'corresponding code points', 19660 ), 19661 Input_file->new('EmojiSources.txt', v6.0.0, 19662 Skip => 'Maps certain Unicode code points to their ' 19663 . 'legacy Japanese cell-phone values', 19664 ), 19665 # This file is actually not usable as-is until 6.1.0, because the property 19666 # is provisional, so its name is missing from PropertyAliases.txt until 19667 # that release, so that further work would have to be done to get it to 19668 # work properly 19669 Input_file->new('ScriptExtensions.txt', v6.0.0, 19670 Property => 'Script_Extensions', 19671 Early => [ sub {} ], # Doesn't do anything but ensures 19672 # that this isn't skipped for early 19673 # versions 19674 Pre_Handler => \&setup_script_extensions, 19675 Each_Line_Handler => \&filter_script_extensions_line, 19676 Has_Missings_Defaults => (($v_version le v6.0.0) 19677 ? $NO_DEFAULTS 19678 : $IGNORED), 19679 ), 19680 # These two Indic files are actually not usable as-is until 6.1.0, 19681 # because they are provisional, so their property values are missing from 19682 # PropValueAliases.txt until that release, so that further work would have 19683 # to be done to get them to work properly. 19684 Input_file->new('IndicMatraCategory.txt', v6.0.0, 19685 Withdrawn => v8.0.0, 19686 Property => 'Indic_Matra_Category', 19687 Has_Missings_Defaults => $NOT_IGNORED, 19688 Skip => $Indic_Skip, 19689 ), 19690 Input_file->new('IndicSyllabicCategory.txt', v6.0.0, 19691 Property => 'Indic_Syllabic_Category', 19692 Has_Missings_Defaults => $NOT_IGNORED, 19693 Skip => (($v_version lt v8.0.0) 19694 ? $Indic_Skip 19695 : 0), 19696 ), 19697 Input_file->new('USourceData.txt', v6.2.0, 19698 Skip => 'Documentation of status and cross reference of ' 19699 . 'proposals for encoding by Unicode of Unihan ' 19700 . 'characters', 19701 ), 19702 Input_file->new('USourceGlyphs.pdf', v6.2.0, 19703 Skip => 'Pictures of the characters in F<USourceData.txt>', 19704 ), 19705 Input_file->new('BidiBrackets.txt', v6.3.0, 19706 Properties => [ 'Bidi_Paired_Bracket', 19707 'Bidi_Paired_Bracket_Type' 19708 ], 19709 Has_Missings_Defaults => $NO_DEFAULTS, 19710 ), 19711 Input_file->new("BidiCharacterTest.txt", v6.3.0, 19712 Skip => $Validation, 19713 ), 19714 Input_file->new('IndicPositionalCategory.txt', v8.0.0, 19715 Property => 'Indic_Positional_Category', 19716 Has_Missings_Defaults => $NOT_IGNORED, 19717 ), 19718 Input_file->new('TangutSources.txt', v9.0.0, 19719 Skip => 'Specifies source mappings for Tangut ideographs' 19720 . ' and components. This data file also includes' 19721 . ' informative radical-stroke values that are used' 19722 . ' internally by Unicode', 19723 ), 19724 Input_file->new('VerticalOrientation.txt', v10.0.0, 19725 Property => 'Vertical_Orientation', 19726 Has_Missings_Defaults => $NOT_IGNORED, 19727 ), 19728 Input_file->new('NushuSources.txt', v10.0.0, 19729 Skip => 'Specifies source material for Nushu characters', 19730 ), 19731 Input_file->new('EquivalentUnifiedIdeograph.txt', v11.0.0, 19732 Property => 'Equivalent_Unified_Ideograph', 19733 Has_Missings_Defaults => $NOT_IGNORED, 19734 ), 19735 Input_file->new('EmojiData.txt', v11.0.0, 19736 # Is in UAX #51 and not the UCD, so must be updated 19737 # separately, and the first line edited to indicate the 19738 # UCD release we're pretending it to be in. The UTC says 19739 # this is a transitional state, and in fact was moved to 19740 # the UCD in 13.0 19741 Withdrawn => v13.0.0, 19742 Pre_Handler => \&setup_emojidata, 19743 Has_Missings_Defaults => $NOT_IGNORED, 19744 Each_Line_Handler => \&filter_emojidata_line, 19745 UCD => 0, 19746 ), 19747 Input_file->new("$EMOJI/emoji.txt", v13.0.0, 19748 Has_Missings_Defaults => $NOT_IGNORED, 19749 UCD => 0, 19750 ), 19751 Input_file->new("$EMOJI/ReadMe.txt", v13.0.0, 19752 Skip => $Documentation, 19753 UCD => 0, 19754 ), 19755 Input_file->new('IdStatus.txt', v13.0.0, 19756 Pre_Handler => \&setup_IdStatus, 19757 Property => 'Identifier_Status', 19758 UCD => 0, 19759 ), 19760 Input_file->new('IdType.txt', v13.0.0, 19761 Pre_Handler => \&setup_IdType, 19762 Each_Line_Handler => \&filter_IdType_line, 19763 Property => 'Identifier_Type', 19764 UCD => 0, 19765 ), 19766); 19767 19768# End of all the preliminaries. 19769# Do it... 19770 19771if (@missing_early_files) { 19772 print simple_fold(join_lines(<<END 19773 19774The compilation cannot be completed because one or more required input files, 19775listed below, are missing. This is because you are compiling Unicode version 19776$unicode_version, which predates the existence of these file(s). To fully 19777function, perl needs the data that these files would have contained if they 19778had been in this release. To work around this, create copies of later 19779versions of the missing files in the directory containing '$0'. (Perl will 19780make the necessary adjustments to the data to compensate for it not being the 19781same version as is being compiled.) The files are available from unicode.org, 19782via either ftp or http. If using http, they will be under 19783www.unicode.org/versions/. Below are listed the source file name of each 19784missing file, the Unicode version to copy it from, and the name to store it 19785as. (Note that the listed source file name may not be exactly the one that 19786Unicode calls it. If you don't find it, you can look it up in 'README.perl' 19787to get the correct name.) 19788END 19789 )); 19790 print simple_fold(join_lines("\n$_")) for @missing_early_files; 19791 exit 2; 19792} 19793 19794if ($compare_versions) { 19795 Carp::my_carp(<<END 19796Warning. \$compare_versions is set. Output is not suitable for production 19797END 19798 ); 19799} 19800 19801# Put into %potential_files a list of all the files in the directory structure 19802# that could be inputs to this program 19803File::Find::find({ 19804 wanted=>sub { 19805 return unless / \. ( txt | htm l? ) $ /xi; # Some platforms change the 19806 # name's case 19807 my $full = lc(File::Spec->rel2abs($_)); 19808 $potential_files{$full} = 1; 19809 return; 19810 } 19811}, File::Spec->curdir()); 19812 19813my @mktables_list_output_files; 19814my $old_start_time = 0; 19815my $old_options = ""; 19816 19817if (! -e $file_list) { 19818 print "'$file_list' doesn't exist, so forcing rebuild.\n" if $verbosity >= $VERBOSE; 19819 $write_unchanged_files = 1; 19820} elsif ($write_unchanged_files) { 19821 print "Not checking file list '$file_list'.\n" if $verbosity >= $VERBOSE; 19822} 19823else { 19824 print "Reading file list '$file_list'\n" if $verbosity >= $VERBOSE; 19825 my $file_handle; 19826 if (! open $file_handle, "<", $file_list) { 19827 Carp::my_carp("Failed to open '$file_list'; turning on -globlist option instead: $!"); 19828 $glob_list = 1; 19829 } 19830 else { 19831 my @input; 19832 19833 # Read and parse mktables.lst, placing the results from the first part 19834 # into @input, and the second part into @mktables_list_output_files 19835 for my $list ( \@input, \@mktables_list_output_files ) { 19836 while (<$file_handle>) { 19837 s/^ \s+ | \s+ $//xg; 19838 if (/^ \s* \# \s* Autogenerated\ starting\ on\ (\d+)/x) { 19839 $old_start_time = $1; 19840 next; 19841 } 19842 if (/^ \s* \# \s* From\ options\ (.+) /x) { 19843 $old_options = $1; 19844 next; 19845 } 19846 next if /^ \s* (?: \# .* )? $/x; 19847 last if /^ =+ $/x; 19848 my ( $file ) = split /\t/; 19849 push @$list, $file; 19850 } 19851 @$list = uniques(@$list); 19852 next; 19853 } 19854 19855 # Look through all the input files 19856 foreach my $input (@input) { 19857 next if $input eq 'version'; # Already have checked this. 19858 19859 # Ignore if doesn't exist. The checking about whether we care or 19860 # not is done via the Input_file object. 19861 next if ! file_exists($input); 19862 19863 # The paths are stored with relative names, and with '/' as the 19864 # delimiter; convert to absolute on this machine 19865 my $full = lc(File::Spec->rel2abs(internal_file_to_platform($input))); 19866 $potential_files{lc $full} = 1; 19867 } 19868 } 19869 19870 close $file_handle; 19871} 19872 19873if ($glob_list) { 19874 19875 # Here wants to process all .txt files in the directory structure. 19876 # Convert them to full path names. They are stored in the platform's 19877 # relative style 19878 my @known_files; 19879 foreach my $object (@input_file_objects) { 19880 my $file = $object->file; 19881 next unless defined $file; 19882 push @known_files, File::Spec->rel2abs($file); 19883 } 19884 19885 my @unknown_input_files; 19886 foreach my $file (keys %potential_files) { # The keys are stored in lc 19887 next if grep { $file eq lc($_) } @known_files; 19888 19889 # Here, the file is unknown to us. Get relative path name 19890 $file = File::Spec->abs2rel($file); 19891 push @unknown_input_files, $file; 19892 19893 # What will happen is we create a data structure for it, and add it to 19894 # the list of input files to process. First get the subdirectories 19895 # into an array 19896 my (undef, $directories, undef) = File::Spec->splitpath($file); 19897 $directories =~ s;/$;;; # Can have extraneous trailing '/' 19898 my @directories = File::Spec->splitdir($directories); 19899 19900 # If the file isn't extracted (meaning none of the directories is the 19901 # extracted one), just add it to the end of the list of inputs. 19902 if (! grep { $EXTRACTED_DIR eq $_ } @directories) { 19903 push @input_file_objects, Input_file->new($file, v0); 19904 } 19905 else { 19906 19907 # Here, the file is extracted. It needs to go ahead of most other 19908 # processing. Search for the first input file that isn't a 19909 # special required property (that is, find one whose first_release 19910 # is non-0), and isn't extracted. Also, the Age property file is 19911 # processed before the extracted ones, just in case 19912 # $compare_versions is set. 19913 for (my $i = 0; $i < @input_file_objects; $i++) { 19914 if ($input_file_objects[$i]->first_released ne v0 19915 && lc($input_file_objects[$i]->file) ne 'dage.txt' 19916 && $input_file_objects[$i]->file !~ /$EXTRACTED_DIR/i) 19917 { 19918 splice @input_file_objects, $i, 0, 19919 Input_file->new($file, v0); 19920 last; 19921 } 19922 } 19923 19924 } 19925 } 19926 if (@unknown_input_files) { 19927 print STDERR simple_fold(join_lines(<<END 19928 19929The following files are unknown as to how to handle. Assuming they are 19930typical property files. You'll know by later error messages if it worked or 19931not: 19932END 19933 ) . " " . join(", ", @unknown_input_files) . "\n\n"); 19934 } 19935} # End of looking through directory structure for more .txt files. 19936 19937# Create the list of input files from the objects we have defined, plus 19938# version 19939my @input_files = qw(version Makefile); 19940foreach my $object (@input_file_objects) { 19941 my $file = $object->file; 19942 next if ! defined $file; # Not all objects have files 19943 next if defined $object->skip;; 19944 push @input_files, $file; 19945} 19946 19947if ( $verbosity >= $VERBOSE ) { 19948 print "Expecting ".scalar( @input_files )." input files. ", 19949 "Checking ".scalar( @mktables_list_output_files )." output files.\n"; 19950} 19951 19952# We set $most_recent to be the most recently changed input file, including 19953# this program itself (done much earlier in this file) 19954foreach my $in (@input_files) { 19955 next unless -e $in; # Keep going even if missing a file 19956 my $mod_time = (stat $in)[9]; 19957 $most_recent = $mod_time if $mod_time > $most_recent; 19958 19959 # See that the input files have distinct names, to warn someone if they 19960 # are adding a new one 19961 if ($make_list) { 19962 my ($volume, $directories, $file ) = File::Spec->splitpath($in); 19963 $directories =~ s;/$;;; # Can have extraneous trailing '/' 19964 my @directories = File::Spec->splitdir($directories); 19965 construct_filename($file, 'mutable', \@directories); 19966 } 19967} 19968 19969# We use 'Makefile' just to see if it has changed since the last time we 19970# rebuilt. Now discard it. 19971@input_files = grep { $_ ne 'Makefile' } @input_files; 19972 19973my $rebuild = $write_unchanged_files # Rebuild: if unconditional rebuild 19974 || ! scalar @mktables_list_output_files # or if no outputs known 19975 || $old_start_time < $most_recent # or out-of-date 19976 || $old_options ne $command_line_arguments; # or with different 19977 # options 19978 19979# Now we check to see if any output files are older than youngest, if 19980# they are, we need to continue on, otherwise we can presumably bail. 19981if (! $rebuild) { 19982 foreach my $out (@mktables_list_output_files) { 19983 if ( ! file_exists($out)) { 19984 print "'$out' is missing.\n" if $verbosity >= $VERBOSE; 19985 $rebuild = 1; 19986 last; 19987 } 19988 #local $to_trace = 1 if main::DEBUG; 19989 trace $most_recent, (stat $out)[9] if main::DEBUG && $to_trace; 19990 if ( (stat $out)[9] <= $most_recent ) { 19991 #trace "$out: most recent mod time: ", (stat $out)[9], ", youngest: $most_recent\n" if main::DEBUG && $to_trace; 19992 print "'$out' is too old.\n" if $verbosity >= $VERBOSE; 19993 $rebuild = 1; 19994 last; 19995 } 19996 } 19997} 19998if (! $rebuild) { 19999 print "$0: Files seem to be ok, not bothering to rebuild. Add '-w' option to force build\n"; 20000 exit(0); 20001} 20002print "$0: Must rebuild tables.\n" if $verbosity >= $VERBOSE; 20003 20004# Ready to do the major processing. First create the perl pseudo-property. 20005$perl = Property->new('perl', Type => $NON_STRING, Perl_Extension => 1); 20006 20007# Process each input file 20008foreach my $file (@input_file_objects) { 20009 $file->run; 20010} 20011 20012# Finish the table generation. 20013 20014print "Finishing processing Unicode properties\n" if $verbosity >= $PROGRESS; 20015finish_Unicode(); 20016 20017# For the very specialized case of comparing two Unicode versions... 20018if (DEBUG && $compare_versions) { 20019 handle_compare_versions(); 20020} 20021 20022print "Compiling Perl properties\n" if $verbosity >= $PROGRESS; 20023compile_perl(); 20024 20025print "Creating Perl synonyms\n" if $verbosity >= $PROGRESS; 20026add_perl_synonyms(); 20027 20028print "Writing tables\n" if $verbosity >= $PROGRESS; 20029write_all_tables(); 20030 20031# Write mktables.lst 20032if ( $file_list and $make_list ) { 20033 20034 print "Updating '$file_list'\n" if $verbosity >= $PROGRESS; 20035 foreach my $file (@input_files, @files_actually_output) { 20036 my (undef, $directories, $basefile) = File::Spec->splitpath($file); 20037 my @directories = grep length, File::Spec->splitdir($directories); 20038 $file = join '/', @directories, $basefile; 20039 } 20040 20041 my $ofh; 20042 if (! open $ofh,">",$file_list) { 20043 Carp::my_carp("Can't write to '$file_list'. Skipping: $!"); 20044 return 20045 } 20046 else { 20047 my $localtime = localtime $start_time; 20048 print $ofh <<"END"; 20049# 20050# $file_list -- File list for $0. 20051# 20052# Autogenerated starting on $start_time ($localtime) 20053# From options $command_line_arguments 20054# 20055# - First section is input files 20056# ($0 itself is not listed but is automatically considered an input) 20057# - Section separator is /^=+\$/ 20058# - Second section is a list of output files. 20059# - Lines matching /^\\s*#/ are treated as comments 20060# which along with blank lines are ignored. 20061# 20062 20063# Input files: 20064 20065END 20066 print $ofh "$_\n" for sort(@input_files); 20067 print $ofh "\n=================================\n# Output files:\n\n"; 20068 print $ofh "$_\n" for sort @files_actually_output; 20069 print $ofh "\n# ",scalar(@input_files)," input files\n", 20070 "# ",scalar(@files_actually_output)+1," output files\n\n", 20071 "# End list\n"; 20072 close $ofh 20073 or Carp::my_carp("Failed to close $ofh: $!"); 20074 20075 print "Filelist has ",scalar(@input_files)," input files and ", 20076 scalar(@files_actually_output)+1," output files\n" 20077 if $verbosity >= $VERBOSE; 20078 } 20079} 20080 20081# Output these warnings unless -q explicitly specified. 20082if ($verbosity >= $NORMAL_VERBOSITY && ! $debug_skip) { 20083 if (@unhandled_properties) { 20084 print "\nProperties and tables that unexpectedly have no code points\n"; 20085 foreach my $property (sort @unhandled_properties) { 20086 print $property, "\n"; 20087 } 20088 } 20089 20090 if (%potential_files) { 20091 print "\nInput files that are not considered:\n"; 20092 foreach my $file (sort keys %potential_files) { 20093 print File::Spec->abs2rel($file), "\n"; 20094 } 20095 } 20096 print "\nAll done\n" if $verbosity >= $VERBOSE; 20097} 20098 20099if ($version_of_mk_invlist_bounds lt $v_version) { 20100 Carp::my_carp("WARNING: \\b{} algorithms (regen/mk_invlist.pl) need" 20101 . " to be checked and possibly updated to Unicode" 20102 . " $string_version. Failing tests will be marked TODO"); 20103} 20104 20105exit(0); 20106 20107# TRAILING CODE IS USED BY make_property_test_script() 20108__DATA__ 20109 20110use strict; 20111use warnings; 20112 20113use feature 'signatures'; 20114 20115no warnings 'experimental::signatures'; 20116no warnings 'experimental::uniprop_wildcards'; 20117 20118# Test qr/\X/ and the \p{} regular expression constructs. This file is 20119# constructed by mktables from the tables it generates, so if mktables is 20120# buggy, this won't necessarily catch those bugs. Tests are generated for all 20121# feasible properties; a few aren't currently feasible; see 20122# is_code_point_usable() in mktables for details. 20123 20124# Standard test packages are not used because this manipulates SIG_WARN. It 20125# exits 0 if every non-skipped test succeeded; -1 if any failed. 20126 20127my $Tests = 0; 20128my $Fails = 0; 20129 20130# loc_tools.pl requires this function to be defined 20131sub ok($pass, @msg) { 20132 print "not " unless $pass; 20133 print "ok "; 20134 print ++$Tests; 20135 print " - ", join "", @msg if @msg; 20136 print "\n"; 20137} 20138 20139sub Expect($expected, $ord, $regex, $warning_type='') { 20140 my $line = (caller)[2]; 20141 20142 # Convert the code point to hex form 20143 my $string = sprintf "\"\\x{%04X}\"", $ord; 20144 20145 my @tests = ""; 20146 20147 # The first time through, use all warnings. If the input should generate 20148 # a warning, add another time through with them turned off 20149 push @tests, "no warnings '$warning_type';" if $warning_type; 20150 20151 foreach my $no_warnings (@tests) { 20152 20153 # Store any warning messages instead of outputting them 20154 local $SIG{__WARN__} = $SIG{__WARN__}; 20155 my $warning_message; 20156 $SIG{__WARN__} = sub { $warning_message = $_[0] }; 20157 20158 $Tests++; 20159 20160 # A string eval is needed because of the 'no warnings'. 20161 # Assumes no parentheses in the regular expression 20162 my $result = eval "$no_warnings 20163 my \$RegObj = qr($regex); 20164 $string =~ \$RegObj ? 1 : 0"; 20165 if (not defined $result) { 20166 print "not ok $Tests - couldn't compile /$regex/; line $line: $@\n"; 20167 $Fails++; 20168 } 20169 elsif ($result ^ $expected) { 20170 print "not ok $Tests - expected $expected but got $result for $string =~ qr/$regex/; line $line\n"; 20171 $Fails++; 20172 } 20173 elsif ($warning_message) { 20174 if (! $warning_type || ($warning_type && $no_warnings)) { 20175 print "not ok $Tests - for qr/$regex/ did not expect warning message '$warning_message'; line $line\n"; 20176 $Fails++; 20177 } 20178 else { 20179 print "ok $Tests - expected and got a warning message for qr/$regex/; line $line\n"; 20180 } 20181 } 20182 elsif ($warning_type && ! $no_warnings) { 20183 print "not ok $Tests - for qr/$regex/ expected a $warning_type warning message, but got none; line $line\n"; 20184 $Fails++; 20185 } 20186 else { 20187 print "ok $Tests - got $result for $string =~ qr/$regex/; line $line\n"; 20188 } 20189 } 20190 return; 20191} 20192 20193sub Error($regex) { 20194 $Tests++; 20195 if (eval { 'x' =~ qr/$regex/; 1 }) { 20196 $Fails++; 20197 my $line = (caller)[2]; 20198 print "not ok $Tests - re compiled ok, but expected error for qr/$regex/; line $line: $@\n"; 20199 } 20200 else { 20201 my $line = (caller)[2]; 20202 print "ok $Tests - got and expected error for qr/$regex/; line $line\n"; 20203 } 20204 return; 20205} 20206 20207# Break test files (e.g. GCBTest.txt) character that break allowed here 20208my $breakable_utf8 = my $breakable = chr(utf8::unicode_to_native(0xF7)); 20209utf8::upgrade($breakable_utf8); 20210 20211# Break test files (e.g. GCBTest.txt) character that indicates can't break 20212# here 20213my $nobreak_utf8 = my $nobreak = chr(utf8::unicode_to_native(0xD7)); 20214utf8::upgrade($nobreak_utf8); 20215 20216my $are_ctype_locales_available; 20217my $utf8_locale; 20218chdir 't' if -d 't'; 20219eval { require "./loc_tools.pl" }; 20220if (defined &locales_enabled) { 20221 $are_ctype_locales_available = locales_enabled('LC_CTYPE'); 20222 if ($are_ctype_locales_available) { 20223 $utf8_locale = &find_utf8_ctype_locale; 20224 } 20225} 20226 20227# Eval'd so can run on versions earlier than the property is available in 20228my $WB_Extend_or_Format_re = eval 'qr/[\p{WB=Extend}\p{WB=Format}\p{WB=ZWJ}]/'; 20229if (! defined $WB_Extend_or_Format_re) { 20230 $WB_Extend_or_Format_re = eval 'qr/[\p{WB=Extend}\p{WB=Format}]/'; 20231} 20232 20233sub _test_break($template, $break_type) { 20234 # Test various break property matches. The 2nd parameter gives the 20235 # property name. The input is a line from auxiliary/*Test.txt for the 20236 # given property. Each such line is a sequence of Unicode (not native) 20237 # code points given by their hex numbers, separated by the two characters 20238 # defined just before this subroutine that indicate that either there can 20239 # or cannot be a break between the adjacent code points. All these are 20240 # tested. 20241 # 20242 # For the gcb property extra tests are made. if there isn't a break, that 20243 # means the sequence forms an extended grapheme cluster, which means that 20244 # \X should match the whole thing. If there is a break, \X should stop 20245 # there. This is all converted by this routine into a match: $string =~ 20246 # /(\X)/, Each \X should match the next cluster; and that is what is 20247 # checked. 20248 20249 my $line = (caller 1)[2]; # Line number 20250 my $comment = ""; 20251 20252 if ($template =~ / ( .*? ) \s* \# (.*) /x) { 20253 $template = $1; 20254 $comment = $2; 20255 20256 # Replace leading spaces with a single one. 20257 $comment =~ s/ ^ \s* / # /x; 20258 } 20259 20260 # The line contains characters above the ASCII range, but in Latin1. It 20261 # may or may not be in utf8, and if it is, it may or may not know it. So, 20262 # convert these characters to 8 bits. If knows is in utf8, simply 20263 # downgrade. 20264 if (utf8::is_utf8($template)) { 20265 utf8::downgrade($template); 20266 } else { 20267 20268 # Otherwise, if it is in utf8, but doesn't know it, the next lines 20269 # convert the two problematic characters to their 8-bit equivalents. 20270 # If it isn't in utf8, they don't harm anything. 20271 use bytes; 20272 $template =~ s/$nobreak_utf8/$nobreak/g; 20273 $template =~ s/$breakable_utf8/$breakable/g; 20274 } 20275 20276 # Perl customizes wb. So change the official tests accordingly 20277 if ($break_type eq 'wb' && $WB_Extend_or_Format_re) { 20278 20279 # Split into elements that alternate between code point and 20280 # break/no-break 20281 my @line = split / +/, $template; 20282 20283 # Look at each code point and its following one 20284 for (my $i = 1; $i < @line - 1 - 1; $i+=2) { 20285 20286 # The customization only involves changing some breaks to 20287 # non-breaks. 20288 next if $line[$i+1] =~ /$nobreak/; 20289 20290 my $lhs = chr utf8::unicode_to_native(hex $line[$i]); 20291 my $rhs = chr utf8::unicode_to_native(hex $line[$i+2]); 20292 20293 # And it only affects adjacent space characters. 20294 next if $lhs !~ /\s/u; 20295 20296 # But, we want to make sure to test spaces followed by a Extend 20297 # or Format. 20298 next if $rhs !~ /\s|$WB_Extend_or_Format_re/; 20299 20300 # To test the customization, add some white-space before this to 20301 # create a span. The $lhs white space may or may not be bound to 20302 # that span, and also with the $rhs. If the $rhs is a binding 20303 # character, the $lhs is bound to it and not to the span, unless 20304 # $lhs is vertical space. In all other cases, the $lhs is bound 20305 # to the span. If the $rhs is white space, it is bound to the 20306 # $lhs 20307 my $bound; 20308 my $span; 20309 if ($rhs =~ /$WB_Extend_or_Format_re/) { 20310 if ($lhs =~ /\v/) { 20311 $bound = $breakable; 20312 $span = $nobreak; 20313 } 20314 else { 20315 $bound = $nobreak; 20316 $span = $breakable; 20317 } 20318 } 20319 else { 20320 $span = $nobreak; 20321 $bound = $nobreak; 20322 } 20323 20324 splice @line, $i, 0, ( '0020', $nobreak, '0020', $span); 20325 $i += 4; 20326 $line[$i+1] = $bound; 20327 } 20328 $template = join " ", @line; 20329 } 20330 20331 # The input is just the break/no-break symbols and sequences of Unicode 20332 # code points as hex digits separated by spaces for legibility. e.g.: 20333 # ÷ 0020 × 0308 ÷ 0020 ÷ 20334 # Convert to native \x format 20335 $template =~ s/ \s* ( [[:xdigit:]]+ ) \s* /sprintf("\\x{%02X}", utf8::unicode_to_native(hex $1))/gex; 20336 $template =~ s/ \s* //gx; # Probably the line above removed all spaces; 20337 # but be sure 20338 20339 # Make a copy of the input with the symbols replaced by \b{} and \B{} as 20340 # appropriate 20341 my $break_pattern = $template =~ s/ $breakable /\\b{$break_type}/grx; 20342 $break_pattern =~ s/ $nobreak /\\B{$break_type}/gx; 20343 20344 my $display_string = $template =~ s/[$breakable$nobreak]//gr; 20345 my $string = eval "\"$display_string\""; 20346 20347 # The remaining massaging of the input is for the \X tests. Get rid of 20348 # the leading and trailing breakables 20349 $template =~ s/^ \s* $breakable \s* //x; 20350 $template =~ s/ \s* $breakable \s* $ //x; 20351 20352 # Delete no-breaks 20353 $template =~ s/ \s* $nobreak \s* //xg; 20354 20355 # Split the input into segments that are breakable between them. 20356 my @should_display = split /\s*$breakable\s*/, $template; 20357 my @should_match = map { eval "\"$_\"" } @should_display; 20358 20359 # If a string can be represented in both non-ut8 and utf8, test both cases 20360 my $display_upgrade = ""; 20361 UPGRADE: 20362 for my $to_upgrade (0 .. 1) { 20363 20364 if ($to_upgrade) { 20365 20366 # If already in utf8, would just be a repeat 20367 next UPGRADE if utf8::is_utf8($string); 20368 20369 utf8::upgrade($string); 20370 $display_upgrade = " (utf8-upgraded)"; 20371 } 20372 20373 my @modifiers = qw(a aa d u i); 20374 if ($are_ctype_locales_available) { 20375 push @modifiers, "l$utf8_locale" if defined $utf8_locale; 20376 20377 # The /l modifier has C after it to indicate the locale to try 20378 push @modifiers, "lC"; 20379 } 20380 20381 # Test for each of the regex modifiers. 20382 for my $modifier (@modifiers) { 20383 my $display_locale = ""; 20384 20385 # For /l, set the locale to what it says to. 20386 if ($modifier =~ / ^ l (.*) /x) { 20387 my $locale = $1; 20388 $display_locale = "(locale = $locale)"; 20389 POSIX::setlocale(&POSIX::LC_CTYPE, $locale); 20390 $modifier = 'l'; 20391 } 20392 20393 no warnings qw(locale regexp surrogate); 20394 my $pattern = "(?$modifier:$break_pattern)"; 20395 20396 # Actually do the test 20397 my $matched_text; 20398 my $matched = $string =~ qr/$pattern/; 20399 if ($matched) { 20400 $matched_text = "matched"; 20401 } 20402 else { 20403 $matched_text = "failed to match"; 20404 print "not "; 20405 20406 if (TODO_FAILING_BREAKS) { 20407 $comment = " # $comment" unless $comment =~ / ^ \s* \# /x; 20408 $comment =~ s/#/# TODO/; 20409 } 20410 } 20411 print "ok ", ++$Tests, " - \"$display_string\" $matched_text /$pattern/$display_upgrade; line $line $display_locale$comment\n"; 20412 20413 # Only print the comment on the first use of this line 20414 $comment = ""; 20415 20416 # Repeat with the first \B{} in the pattern. This makes sure the 20417 # code in regexec.c:find_byclass() for \B gets executed 20418 if ($pattern =~ / ( .*? : ) .* ( \\B\{ .* ) /x) { 20419 my $B_pattern = "$1$2"; 20420 $matched = $string =~ qr/$B_pattern/; 20421 print "not " unless $matched; 20422 $matched_text = ($matched) ? "matched" : "failed to match"; 20423 print "ok ", ++$Tests, " - \"$display_string\" $matched_text /$B_pattern/$display_upgrade; line $line $display_locale"; 20424 print " # TODO" if TODO_FAILING_BREAKS && ! $matched; 20425 print "\n"; 20426 } 20427 } 20428 20429 next if $break_type ne 'gcb'; 20430 20431 # Finally, do the \X match. 20432 my @matches = $string =~ /(\X)/g; 20433 20434 # Look through each matched cluster to verify that it matches what we 20435 # expect. 20436 my $min = (@matches < @should_match) ? @matches : @should_match; 20437 for my $i (0 .. $min - 1) { 20438 $Tests++; 20439 if ($matches[$i] eq $should_match[$i]) { 20440 print "ok $Tests - "; 20441 if ($i == 0) { 20442 print "In \"$display_string\" =~ /(\\X)/g, \\X #1"; 20443 } else { 20444 print "And \\X #", $i + 1, 20445 } 20446 print " correctly matched $should_display[$i]; line $line\n"; 20447 } else { 20448 $matches[$i] = join("", map { sprintf "\\x{%04X}", ord $_ } 20449 split "", $matches[$i]); 20450 print "not ok $Tests -"; 20451 print " # TODO" if TODO_FAILING_BREAKS; 20452 print " In \"$display_string\" =~ /(\\X)/g, \\X #", 20453 $i + 1, 20454 " should have matched $should_display[$i]", 20455 " but instead matched $matches[$i]", 20456 ". Abandoning rest of line $line\n"; 20457 next UPGRADE; 20458 } 20459 } 20460 20461 # And the number of matches should equal the number of expected matches. 20462 $Tests++; 20463 if (@matches == @should_match) { 20464 print "ok $Tests - Nothing was left over; line $line\n"; 20465 } else { 20466 print "not ok $Tests - There were ", scalar @should_match, " \\X matches expected, but got ", scalar @matches, " instead; line $line"; 20467 print " # TODO" if TODO_FAILING_BREAKS; 20468 print "\n"; 20469 } 20470 } 20471 20472 return; 20473} 20474 20475sub Test_GCB($t) { 20476 _test_break($t, 'gcb'); 20477} 20478 20479sub Test_LB($t) { 20480 _test_break($t, 'lb'); 20481} 20482 20483sub Test_SB($t) { 20484 _test_break($t, 'sb'); 20485} 20486 20487sub Test_WB($t) { 20488 _test_break($t, 'wb'); 20489} 20490 20491sub Finished() { 20492 print "1..$Tests\n"; 20493 exit($Fails ? -1 : 0); 20494} 20495 20496