1#!/usr/bin/perl -w 2 3# !!!!!!!!!!!!!! IF YOU MODIFY THIS FILE !!!!!!!!!!!!!!!!!!!!!!!!! 4# Any files created or read by this program should be listed in 'mktables.lst' 5# Use -makelist to regenerate it. 6 7# There was an attempt when this was first rewritten to make it 5.8 8# compatible, but that has now been abandoned, and newer constructs are used 9# as convenient. 10 11# NOTE: this script can run quite slowly in older/slower systems. 12# It can also consume a lot of memory (128 MB or more), you may need 13# to raise your process resource limits (e.g. in bash, "ulimit -a" 14# to inspect, and "ulimit -d ..." or "ulimit -m ..." to set) 15 16my $start_time; 17BEGIN { # Get the time the script started running; do it at compilation to 18 # get it as close as possible 19 $start_time= time; 20} 21 22require 5.010_001; 23use strict; 24use warnings; 25use Carp; 26use Config; 27use File::Find; 28use File::Path; 29use File::Spec; 30use Text::Tabs; 31use re "/aa"; 32 33use feature 'state'; 34use feature 'signatures'; 35no warnings 'experimental::signatures'; 36 37sub DEBUG () { 0 } # Set to 0 for production; 1 for development 38my $debugging_build = $Config{"ccflags"} =~ /-DDEBUGGING/; 39 40sub NON_ASCII_PLATFORM { ord("A") != 65 } 41 42# When a new version of Unicode is published, unfortunately the algorithms for 43# dealing with various bounds, like \b{gcb}, \b{lb} may have to be updated 44# manually. The changes may or may not be backward compatible with older 45# releases. The code is in regen/mk_invlist.pl and regexec.c. Make the 46# changes, then come back here and set the variable below to what version the 47# code is expecting. If a newer version of Unicode is being compiled than 48# expected, a warning will be generated. If an older version is being 49# compiled, any bounds tests that fail in the generated test file (-maketest 50# option) will be marked as TODO. 51my $version_of_mk_invlist_bounds = v14.0.0; 52 53########################################################################## 54# 55# mktables -- create the runtime Perl Unicode files (lib/unicore/.../*.pl), 56# from the Unicode database files (lib/unicore/.../*.txt), It also generates 57# a pod file and .t files, depending on option parameters. 58# 59# The structure of this file is: 60# First these introductory comments; then 61# code needed for everywhere, such as debugging stuff; then 62# code to handle input parameters; then 63# data structures likely to be of external interest (some of which depend on 64# the input parameters, so follows them; then 65# more data structures and subroutine and package (class) definitions; then 66# the small actual loop to process the input files and finish up; then 67# a __DATA__ section, for the .t tests 68# 69# This program works on all releases of Unicode so far. The outputs have been 70# scrutinized most intently for release 5.1. The others have been checked for 71# somewhat more than just sanity. It can handle all non-provisional Unicode 72# character properties in those releases. 73# 74# This program is mostly about Unicode character (or code point) properties. 75# A property describes some attribute or quality of a code point, like if it 76# is lowercase or not, its name, what version of Unicode it was first defined 77# in, or what its uppercase equivalent is. Unicode deals with these disparate 78# possibilities by making all properties into mappings from each code point 79# into some corresponding value. In the case of it being lowercase or not, 80# the mapping is either to 'Y' or 'N' (or various synonyms thereof). Each 81# property maps each Unicode code point to a single value, called a "property 82# value". (Some more recently defined properties, map a code point to a set 83# of values.) 84# 85# When using a property in a regular expression, what is desired isn't the 86# mapping of the code point to its property's value, but the reverse (or the 87# mathematical "inverse relation"): starting with the property value, "Does a 88# code point map to it?" These are written in a "compound" form: 89# \p{property=value}, e.g., \p{category=punctuation}. This program generates 90# files containing the lists of code points that map to each such regular 91# expression property value, one file per list 92# 93# There is also a single form shortcut that Perl adds for many of the commonly 94# used properties. This happens for all binary properties, plus script, 95# general_category, and block properties. 96# 97# Thus the outputs of this program are files. There are map files, mostly in 98# the 'To' directory; and there are list files for use in regular expression 99# matching, all in subdirectories of the 'lib' directory, with each 100# subdirectory being named for the property that the lists in it are for. 101# Bookkeeping, test, and documentation files are also generated. 102 103my $matches_directory = 'lib'; # Where match (\p{}) files go. 104my $map_directory = 'To'; # Where map files go. 105 106# DATA STRUCTURES 107# 108# The major data structures of this program are Property, of course, but also 109# Table. There are two kinds of tables, very similar to each other. 110# "Match_Table" is the data structure giving the list of code points that have 111# a particular property value, mentioned above. There is also a "Map_Table" 112# data structure which gives the property's mapping from code point to value. 113# There are two structures because the match tables need to be combined in 114# various ways, such as constructing unions, intersections, complements, etc., 115# and the map ones don't. And there would be problems, perhaps subtle, if 116# a map table were inadvertently operated on in some of those ways. 117# The use of separate classes with operations defined on one but not the other 118# prevents accidentally confusing the two. 119# 120# At the heart of each table's data structure is a "Range_List", which is just 121# an ordered list of "Ranges", plus ancillary information, and methods to 122# operate on them. A Range is a compact way to store property information. 123# Each range has a starting code point, an ending code point, and a value that 124# is meant to apply to all the code points between the two end points, 125# inclusive. For a map table, this value is the property value for those 126# code points. Two such ranges could be written like this: 127# 0x41 .. 0x5A, 'Upper', 128# 0x61 .. 0x7A, 'Lower' 129# 130# Each range also has a type used as a convenience to classify the values. 131# Most ranges in this program will be Type 0, or normal, but there are some 132# ranges that have a non-zero type. These are used only in map tables, and 133# are for mappings that don't fit into the normal scheme of things. Mappings 134# that require a hash entry to communicate with utf8.c are one example; 135# another example is mappings for charnames.pm to use which indicate a name 136# that is algorithmically determinable from its code point (and the reverse). 137# These are used to significantly compact these tables, instead of listing 138# each one of the tens of thousands individually. 139# 140# In a match table, the value of a range is irrelevant (and hence the type as 141# well, which will always be 0), and arbitrarily set to the empty string. 142# Using the example above, there would be two match tables for those two 143# entries, one named Upper would contain the 0x41..0x5A range, and the other 144# named Lower would contain 0x61..0x7A. 145# 146# Actually, there are two types of range lists, "Range_Map" is the one 147# associated with map tables, and "Range_List" with match tables. 148# Again, this is so that methods can be defined on one and not the others so 149# as to prevent operating on them in incorrect ways. 150# 151# Eventually, most tables are written out to files to be read by Unicode::UCD. 152# All tables could in theory be written, but some are suppressed because there 153# is no current practical use for them. It is easy to change which get 154# written by changing various lists that are near the top of the actual code 155# in this file. The table data structures contain enough ancillary 156# information to allow them to be treated as separate entities for writing, 157# such as the path to each one's file. There is a heading in each map table 158# that gives the format of its entries, and what the map is for all the code 159# points missing from it. (This allows tables to be more compact.) 160# 161# The Property data structure contains one or more tables. All properties 162# contain a map table (except the $perl property which is a 163# pseudo-property containing only match tables), and any properties that 164# are usable in regular expression matches also contain various matching 165# tables, one for each value the property can have. A binary property can 166# have two values, True and False (or Y and N, which are preferred by Unicode 167# terminology). Thus each of these properties will have a map table that 168# takes every code point and maps it to Y or N (but having ranges cuts the 169# number of entries in that table way down), and two match tables, one 170# which has a list of all the code points that map to Y, and one for all the 171# code points that map to N. (For each binary property, a third table is also 172# generated for the pseudo Perl property. It contains the identical code 173# points as the Y table, but can be written in regular expressions, not in the 174# compound form, but in a "single" form like \p{IsUppercase}.) Many 175# properties are binary, but some properties have several possible values, 176# some have many, and properties like Name have a different value for every 177# named code point. Those will not, unless the controlling lists are changed, 178# have their match tables written out. But all the ones which can be used in 179# regular expression \p{} and \P{} constructs will. Prior to 5.14, generally 180# a property would have either its map table or its match tables written but 181# not both. Again, what gets written is controlled by lists which can easily 182# be changed. Starting in 5.14, advantage was taken of this, and all the map 183# tables needed to reconstruct the Unicode db are now written out, while 184# suppressing the Unicode .txt files that contain the data. Our tables are 185# much more compact than the .txt files, so a significant space savings was 186# achieved. Also, tables are not written out that are trivially derivable 187# from tables that do get written. So, there typically is no file containing 188# the code points not matched by a binary property (the table for \P{} versus 189# lowercase \p{}), since you just need to invert the True table to get the 190# False table. 191 192# Properties have a 'Type', like 'binary', or 'string', or 'enum' depending on 193# how many match tables there are and the content of the maps. This 'Type' is 194# different than a range 'Type', so don't get confused by the two concepts 195# having the same name. 196# 197# For information about the Unicode properties, see Unicode's UAX44 document: 198 199my $unicode_reference_url = 'http://www.unicode.org/reports/tr44/'; 200 201# As stated earlier, this program will work on any release of Unicode so far. 202# Most obvious problems in earlier data have NOT been corrected except when 203# necessary to make Perl or this program work reasonably, and to keep out 204# potential security issues. For example, no folding information was given in 205# early releases, so this program substitutes lower case instead, just so that 206# a regular expression with the /i option will do something that actually 207# gives the right results in many cases. There are also a couple other 208# corrections for version 1.1.5, commented at the point they are made. As an 209# example of corrections that weren't made (but could be) is this statement 210# from DerivedAge.txt: "The supplementary private use code points and the 211# non-character code points were assigned in version 2.0, but not specifically 212# listed in the UCD until versions 3.0 and 3.1 respectively." (To be precise 213# it was 3.0.1 not 3.0.0) More information on Unicode version glitches is 214# further down in these introductory comments. 215# 216# This program works on all non-provisional properties as of the current 217# Unicode release, though the files for some are suppressed for various 218# reasons. You can change which are output by changing lists in this program. 219# 220# The old version of mktables emphasized the term "Fuzzy" to mean Unicode's 221# loose matchings rules (from Unicode TR18): 222# 223# The recommended names for UCD properties and property values are in 224# PropertyAliases.txt [Prop] and PropertyValueAliases.txt 225# [PropValue]. There are both abbreviated names and longer, more 226# descriptive names. It is strongly recommended that both names be 227# recognized, and that loose matching of property names be used, 228# whereby the case distinctions, whitespace, hyphens, and underbar 229# are ignored. 230# 231# The program still allows Fuzzy to override its determination of if loose 232# matching should be used, but it isn't currently used, as it is no longer 233# needed; the calculations it makes are good enough. 234# 235# SUMMARY OF HOW IT WORKS: 236# 237# Process arguments 238# 239# A list is constructed containing each input file that is to be processed 240# 241# Each file on the list is processed in a loop, using the associated handler 242# code for each: 243# The PropertyAliases.txt and PropValueAliases.txt files are processed 244# first. These files name the properties and property values. 245# Objects are created of all the property and property value names 246# that the rest of the input should expect, including all synonyms. 247# The other input files give mappings from properties to property 248# values. That is, they list code points and say what the mapping 249# is under the given property. Some files give the mappings for 250# just one property; and some for many. This program goes through 251# each file and populates the properties and their map tables from 252# them. Some properties are listed in more than one file, and 253# Unicode has set up a precedence as to which has priority if there 254# is a conflict. Thus the order of processing matters, and this 255# program handles the conflict possibility by processing the 256# overriding input files last, so that if necessary they replace 257# earlier values. 258# After this is all done, the program creates the property mappings not 259# furnished by Unicode, but derivable from what it does give. 260# The tables of code points that match each property value in each 261# property that is accessible by regular expressions are created. 262# The Perl-defined properties are created and populated. Many of these 263# require data determined from the earlier steps 264# Any Perl-defined synonyms are created, and name clashes between Perl 265# and Unicode are reconciled and warned about. 266# All the properties are written to files 267# Any other files are written, and final warnings issued. 268# 269# For clarity, a number of operators have been overloaded to work on tables: 270# ~ means invert (take all characters not in the set). The more 271# conventional '!' is not used because of the possibility of confusing 272# it with the actual boolean operation. 273# + means union 274# - means subtraction 275# & means intersection 276# The precedence of these is the order listed. Parentheses should be 277# copiously used. These are not a general scheme. The operations aren't 278# defined for a number of things, deliberately, to avoid getting into trouble. 279# Operations are done on references and affect the underlying structures, so 280# that the copy constructors for them have been overloaded to not return a new 281# clone, but the input object itself. 282# 283# The bool operator is deliberately not overloaded to avoid confusion with 284# "should it mean if the object merely exists, or also is non-empty?". 285# 286# WHY CERTAIN DESIGN DECISIONS WERE MADE 287# 288# This program needs to be able to run under miniperl. Therefore, it uses a 289# minimum of other modules, and hence implements some things itself that could 290# be gotten from CPAN 291# 292# This program uses inputs published by the Unicode Consortium. These can 293# change incompatibly between releases without the Perl maintainers realizing 294# it. Therefore this program is now designed to try to flag these. It looks 295# at the directories where the inputs are, and flags any unrecognized files. 296# It keeps track of all the properties in the files it handles, and flags any 297# that it doesn't know how to handle. It also flags any input lines that 298# don't match the expected syntax, among other checks. 299# 300# It is also designed so if a new input file matches one of the known 301# templates, one hopefully just needs to add it to a list to have it 302# processed. 303# 304# As mentioned earlier, some properties are given in more than one file. In 305# particular, the files in the extracted directory are supposedly just 306# reformattings of the others. But they contain information not easily 307# derivable from the other files, including results for Unihan (which isn't 308# usually available to this program) and for unassigned code points. They 309# also have historically had errors or been incomplete. In an attempt to 310# create the best possible data, this program thus processes them first to 311# glean information missing from the other files; then processes those other 312# files to override any errors in the extracted ones. Much of the design was 313# driven by this need to store things and then possibly override them. 314# 315# It tries to keep fatal errors to a minimum, to generate something usable for 316# testing purposes. It always looks for files that could be inputs, and will 317# warn about any that it doesn't know how to handle (the -q option suppresses 318# the warning). 319# 320# Why is there more than one type of range? 321# This simplified things. There are some very specialized code points that 322# have to be handled specially for output, such as Hangul syllable names. 323# By creating a range type (done late in the development process), it 324# allowed this to be stored with the range, and overridden by other input. 325# Originally these were stored in another data structure, and it became a 326# mess trying to decide if a second file that was for the same property was 327# overriding the earlier one or not. 328# 329# Why are there two kinds of tables, match and map? 330# (And there is a base class shared by the two as well.) As stated above, 331# they actually are for different things. Development proceeded much more 332# smoothly when I (khw) realized the distinction. Map tables are used to 333# give the property value for every code point (actually every code point 334# that doesn't map to a default value). Match tables are used for regular 335# expression matches, and are essentially the inverse mapping. Separating 336# the two allows more specialized methods, and error checks so that one 337# can't just take the intersection of two map tables, for example, as that 338# is nonsensical. 339# 340# What about 'fate' and 'status'. The concept of a table's fate was created 341# late when it became clear that something more was needed. The difference 342# between this and 'status' is unclean, and could be improved if someone 343# wanted to spend the effort. 344# 345# DEBUGGING 346# 347# This program is written so it will run under miniperl. Occasionally changes 348# will cause an error where the backtrace doesn't work well under miniperl. 349# To diagnose the problem, you can instead run it under regular perl, if you 350# have one compiled. 351# 352# There is a good trace facility. To enable it, first sub DEBUG must be set 353# to return true. Then a line like 354# 355# local $to_trace = 1 if main::DEBUG; 356# 357# can be added to enable tracing in its lexical scope (plus dynamic) or until 358# you insert another line: 359# 360# local $to_trace = 0 if main::DEBUG; 361# 362# To actually trace, use a line like "trace $a, @b, %c, ...; 363# 364# Some of the more complex subroutines already have trace statements in them. 365# Permanent trace statements should be like: 366# 367# trace ... if main::DEBUG && $to_trace; 368# 369# main::stack_trace() will display what its name implies 370# 371# If there is just one or a few files that you're debugging, you can easily 372# cause most everything else to be skipped. Change the line 373# 374# my $debug_skip = 0; 375# 376# to 1, and every file whose object is in @input_file_objects and doesn't have 377# a, 'non_skip => 1,' in its constructor will be skipped. However, skipping 378# Jamo.txt or UnicodeData.txt will likely cause fatal errors. 379# 380# To compare the output tables, it may be useful to specify the -annotate 381# flag. (As of this writing, this can't be done on a clean workspace, due to 382# requirements in Text::Tabs used in this option; so first run mktables 383# without this option.) This option adds comment lines to each table, one for 384# each non-algorithmically named character giving, currently its code point, 385# name, and graphic representation if printable (and you have a font that 386# knows about it). This makes it easier to see what the particular code 387# points are in each output table. Non-named code points are annotated with a 388# description of their status, and contiguous ones with the same description 389# will be output as a range rather than individually. Algorithmically named 390# characters are also output as ranges, except when there are just a few 391# contiguous ones. 392# 393# FUTURE ISSUES 394# 395# The program would break if Unicode were to change its names so that 396# interior white space, underscores, or dashes differences were significant 397# within property and property value names. 398# 399# It might be easier to use the xml versions of the UCD if this program ever 400# would need heavy revision, and the ability to handle old versions was not 401# required. 402# 403# There is the potential for name collisions, in that Perl has chosen names 404# that Unicode could decide it also likes. There have been such collisions in 405# the past, with mostly Perl deciding to adopt the Unicode definition of the 406# name. However in the 5.2 Unicode beta testing, there were a number of such 407# collisions, which were withdrawn before the final release, because of Perl's 408# and other's protests. These all involved new properties which began with 409# 'Is'. Based on the protests, Unicode is unlikely to try that again. Also, 410# many of the Perl-defined synonyms, like Any, Word, etc, are listed in a 411# Unicode document, so they are unlikely to be used by Unicode for another 412# purpose. However, they might try something beginning with 'In', or use any 413# of the other Perl-defined properties. This program will warn you of name 414# collisions, and refuse to generate tables with them, but manual intervention 415# will be required in this event. One scheme that could be implemented, if 416# necessary, would be to have this program generate another file, or add a 417# field to mktables.lst that gives the date of first definition of a property. 418# Each new release of Unicode would use that file as a basis for the next 419# iteration. And the Perl synonym addition code could sort based on the age 420# of the property, so older properties get priority, and newer ones that clash 421# would be refused; hence existing code would not be impacted, and some other 422# synonym would have to be used for the new property. This is ugly, and 423# manual intervention would certainly be easier to do in the short run; lets 424# hope it never comes to this. 425# 426# A NOTE ON UNIHAN 427# 428# This program can generate tables from the Unihan database. But that DB 429# isn't normally available, so it is marked as optional. Prior to version 430# 5.2, this database was in a single file, Unihan.txt. In 5.2 the database 431# was split into 8 different files, all beginning with the letters 'Unihan'. 432# If you plunk those files down into the directory mktables ($0) is in, this 433# program will read them and automatically create tables for the properties 434# from it that are listed in PropertyAliases.txt and PropValueAliases.txt, 435# plus any you add to the @cjk_properties array and the @cjk_property_values 436# array, being sure to add necessary '# @missings' lines to the latter. For 437# Unicode versions earlier than 5.2, most of the Unihan properties are not 438# listed at all in PropertyAliases nor PropValueAliases. This program assumes 439# for these early releases that you want the properties that are specified in 440# the 5.2 release. 441# 442# You may need to adjust the entries to suit your purposes. setup_unihan(), 443# and filter_unihan_line() are the functions where this is done. This program 444# already does some adjusting to make the lines look more like the rest of the 445# Unicode DB; You can see what that is in filter_unihan_line() 446# 447# There is a bug in the 3.2 data file in which some values for the 448# kPrimaryNumeric property have commas and an unexpected comment. A filter 449# could be added to correct these; or for a particular installation, the 450# Unihan.txt file could be edited to fix them. 451# 452# HOW TO ADD A FILE TO BE PROCESSED 453# 454# A new file from Unicode needs to have an object constructed for it in 455# @input_file_objects, probably at the end or at the end of the extracted 456# ones. The program should warn you if its name will clash with others on 457# restrictive file systems, like DOS. If so, figure out a better name, and 458# add lines to the README.perl file giving that. If the file is a character 459# property, it should be in the format that Unicode has implicitly 460# standardized for such files for the more recently introduced ones. 461# If so, the Input_file constructor for @input_file_objects can just be the 462# file name and release it first appeared in. If not, then it should be 463# possible to construct an each_line_handler() to massage the line into the 464# standardized form. 465# 466# For non-character properties, more code will be needed. You can look at 467# the existing entries for clues. 468# 469# UNICODE VERSIONS NOTES 470# 471# The Unicode UCD has had a number of errors in it over the versions. And 472# these remain, by policy, in the standard for that version. Therefore it is 473# risky to correct them, because code may be expecting the error. So this 474# program doesn't generally make changes, unless the error breaks the Perl 475# core. As an example, some versions of 2.1.x Jamo.txt have the wrong value 476# for U+1105, which causes real problems for the algorithms for Jamo 477# calculations, so it is changed here. 478# 479# But it isn't so clear cut as to what to do about concepts that are 480# introduced in a later release; should they extend back to earlier releases 481# where the concept just didn't exist? It was easier to do this than to not, 482# so that's what was done. For example, the default value for code points not 483# in the files for various properties was probably undefined until changed by 484# some version. No_Block for blocks is such an example. This program will 485# assign No_Block even in Unicode versions that didn't have it. This has the 486# benefit that code being written doesn't have to special case earlier 487# versions; and the detriment that it doesn't match the Standard precisely for 488# the affected versions. 489# 490# Here are some observations about some of the issues in early versions: 491# 492# Prior to version 3.0, there were 3 character decompositions. These are not 493# handled by Unicode::Normalize, nor will it compile when presented a version 494# that has them. However, you can trivially get it to compile by simply 495# ignoring those decompositions, by changing the croak to a carp. At the time 496# of this writing, the line (in dist/Unicode-Normalize/Normalize.pm or 497# dist/Unicode-Normalize/mkheader) reads 498# 499# croak("Weird Canonical Decomposition of U+$h"); 500# 501# Simply comment it out. It will compile, but will not know about any three 502# character decompositions. 503 504# The number of code points in \p{alpha=True} halved in 2.1.9. It turns out 505# that the reason is that the CJK block starting at 4E00 was removed from 506# PropList, and was not put back in until 3.1.0. The Perl extension (the 507# single property name \p{alpha}) has the correct values. But the compound 508# form is simply not generated until 3.1, as it can be argued that prior to 509# this release, this was not an official property. The comments for 510# filter_old_style_proplist() give more details. 511# 512# Unicode introduced the synonym Space for White_Space in 4.1. Perl has 513# always had a \p{Space}. In release 3.2 only, they are not synonymous. The 514# reason is that 3.2 introduced U+205F=medium math space, which was not 515# classed as white space, but Perl figured out that it should have been. 4.0 516# reclassified it correctly. 517# 518# Another change between 3.2 and 4.0 is the CCC property value ATBL. In 3.2 519# this was erroneously a synonym for 202 (it should be 200). In 4.0, ATB 520# became 202, and ATBL was left with no code points, as all the ones that 521# mapped to 202 stayed mapped to 202. Thus if your program used the numeric 522# name for the class, it would not have been affected, but if it used the 523# mnemonic, it would have been. 524# 525# \p{Script=Hrkt} (Katakana_Or_Hiragana) came in 4.0.1. Before that, code 526# points which eventually came to have this script property value, instead 527# mapped to "Unknown". But in the next release all these code points were 528# moved to \p{sc=common} instead. 529 530# The tests furnished by Unicode for testing WordBreak and SentenceBreak 531# generate errors in 5.0 and earlier. 532# 533# The default for missing code points for BidiClass is complicated. Starting 534# in 3.1.1, the derived file DBidiClass.txt handles this, but this program 535# tries to do the best it can for earlier releases. It is done in 536# process_PropertyAliases() 537# 538# In version 2.1.2, the entry in UnicodeData.txt: 539# 0275;LATIN SMALL LETTER BARRED O;Ll;0;L;;;;;N;;;;019F; 540# should instead be 541# 0275;LATIN SMALL LETTER BARRED O;Ll;0;L;;;;;N;;;019F;;019F 542# Without this change, there are casing problems for this character. 543# 544# Search for $string_compare_versions to see how to compare changes to 545# properties between Unicode versions 546# 547############################################################################## 548 549my $UNDEF = ':UNDEF:'; # String to print out for undefined values in tracing 550 # and errors 551my $MAX_LINE_WIDTH = 78; 552 553# Debugging aid to skip most files so as to not be distracted by them when 554# concentrating on the ones being debugged. Add 555# non_skip => 1, 556# to the constructor for those files you want processed when you set this. 557# Files with a first version number of 0 are special: they are always 558# processed regardless of the state of this flag. Generally, Jamo.txt and 559# UnicodeData.txt must not be skipped if you want this program to not die 560# before normal completion. 561my $debug_skip = 0; 562 563 564# Normally these are suppressed. 565my $write_Unicode_deprecated_tables = 0; 566 567# Set to 1 to enable tracing. 568our $to_trace = 0; 569 570{ # Closure for trace: debugging aid 571 my $print_caller = 1; # ? Include calling subroutine name 572 my $main_with_colon = 'main::'; 573 my $main_colon_length = length($main_with_colon); 574 575 sub trace { 576 return unless $to_trace; # Do nothing if global flag not set 577 578 my @input = @_; 579 580 local $DB::trace = 0; 581 $DB::trace = 0; # Quiet 'used only once' message 582 583 my $line_number; 584 585 # Loop looking up the stack to get the first non-trace caller 586 my $caller_line; 587 my $caller_name; 588 my $i = 0; 589 do { 590 $line_number = $caller_line; 591 (my $pkg, my $file, $caller_line, my $caller) = caller $i++; 592 $caller = $main_with_colon unless defined $caller; 593 594 $caller_name = $caller; 595 596 # get rid of pkg 597 $caller_name =~ s/.*:://; 598 if (substr($caller_name, 0, $main_colon_length) 599 eq $main_with_colon) 600 { 601 $caller_name = substr($caller_name, $main_colon_length); 602 } 603 604 } until ($caller_name ne 'trace'); 605 606 # If the stack was empty, we were called from the top level 607 $caller_name = 'main' if ($caller_name eq "" 608 || $caller_name eq 'trace'); 609 610 my $output = ""; 611 #print STDERR __LINE__, ": ", join ", ", @input, "\n"; 612 foreach my $string (@input) { 613 if (ref $string eq 'ARRAY' || ref $string eq 'HASH') { 614 $output .= simple_dumper($string); 615 } 616 else { 617 $string = "$string" if ref $string; 618 $string = $UNDEF unless defined $string; 619 chomp $string; 620 $string = '""' if $string eq ""; 621 $output .= " " if $output ne "" 622 && $string ne "" 623 && substr($output, -1, 1) ne " " 624 && substr($string, 0, 1) ne " "; 625 $output .= $string; 626 } 627 } 628 629 print STDERR sprintf "%4d: ", $line_number if defined $line_number; 630 print STDERR "$caller_name: " if $print_caller; 631 print STDERR $output, "\n"; 632 return; 633 } 634} 635 636sub stack_trace() { 637 local $to_trace = 1 if main::DEBUG; 638 my $line = (caller(0))[2]; 639 my $i = 1; 640 641 # Accumulate the stack trace 642 while (1) { 643 my ($pkg, $file, $caller_line, $caller) = caller $i++; 644 645 last unless defined $caller; 646 647 trace "called from $caller() at line $line"; 648 $line = $caller_line; 649 } 650} 651 652# This is for a rarely used development feature that allows you to compare two 653# versions of the Unicode standard without having to deal with changes caused 654# by the code points introduced in the later version. You probably also want 655# to use the -annotate option when using this. Run this program on a unicore 656# containing the starting release you want to compare. Save that output 657# structure. Then, switching to a unicore with the ending release, change the 658# "" in the $string_compare_versions definition just below to a string 659# containing a SINGLE dotted Unicode release number (e.g. "2.1") corresponding 660# to the starting release. This program will then compile, but throw away all 661# code points introduced after the starting release. Finally use a diff tool 662# to compare the two directory structures. They include only the code points 663# common to both releases, and you can see the changes caused just by the 664# underlying release semantic changes. For versions earlier than 3.2, you 665# must copy a version of DAge.txt into the directory. 666my $string_compare_versions = DEBUG && ""; 667my $compare_versions = DEBUG 668 && $string_compare_versions 669 && pack "C*", split /\./, $string_compare_versions; 670 671sub uniques { 672 # Returns non-duplicated input values. From "Perl Best Practices: 673 # Encapsulated Cleverness". p. 455 in first edition. 674 675 my %seen; 676 # Arguably this breaks encapsulation, if the goal is to permit multiple 677 # distinct objects to stringify to the same value, and be interchangeable. 678 # However, for this program, no two objects stringify identically, and all 679 # lists passed to this function are either objects or strings. So this 680 # doesn't affect correctness, but it does give a couple of percent speedup. 681 no overloading; 682 return grep { ! $seen{$_}++ } @_; 683} 684 685$0 = File::Spec->canonpath($0); 686 687my $make_test_script = 0; # ? Should we output a test script 688my $make_norm_test_script = 0; # ? Should we output a normalization test script 689my $write_unchanged_files = 0; # ? Should we update the output files even if 690 # we don't think they have changed 691my $use_directory = ""; # ? Should we chdir somewhere. 692my $pod_directory; # input directory to store the pod file. 693my $pod_file = 'perluniprops'; 694my $t_path; # Path to the .t test file 695my $file_list = 'mktables.lst'; # File to store input and output file names. 696 # This is used to speed up the build, by not 697 # executing the main body of the program if 698 # nothing on the list has changed since the 699 # previous build 700my $make_list = 1; # ? Should we write $file_list. Set to always 701 # make a list so that when the release manager 702 # is preparing a release, they won't have to do 703 # special things 704my $glob_list = 0; # ? Should we try to include unknown .txt files 705 # in the input. 706my $output_range_counts = $debugging_build; # ? Should we include the number 707 # of code points in ranges in 708 # the output 709my $annotate = 0; # ? Should character names be in the output 710 711# Verbosity levels; 0 is quiet 712my $NORMAL_VERBOSITY = 1; 713my $PROGRESS = 2; 714my $VERBOSE = 3; 715 716my $verbosity = $NORMAL_VERBOSITY; 717 718# Stored in mktables.lst so that if this program is called with different 719# options, will regenerate even if the files otherwise look like they're 720# up-to-date. 721my $command_line_arguments = join " ", @ARGV; 722 723# Process arguments 724while (@ARGV) { 725 my $arg = shift @ARGV; 726 if ($arg eq '-v') { 727 $verbosity = $VERBOSE; 728 } 729 elsif ($arg eq '-p') { 730 $verbosity = $PROGRESS; 731 $| = 1; # Flush buffers as we go. 732 } 733 elsif ($arg eq '-q') { 734 $verbosity = 0; 735 } 736 elsif ($arg eq '-w') { 737 # update the files even if they haven't changed 738 $write_unchanged_files = 1; 739 } 740 elsif ($arg eq '-check') { 741 my $this = shift @ARGV; 742 my $ok = shift @ARGV; 743 if ($this ne $ok) { 744 print "Skipping as check params are not the same.\n"; 745 exit(0); 746 } 747 } 748 elsif ($arg eq '-P' && defined ($pod_directory = shift)) { 749 -d $pod_directory or croak "Directory '$pod_directory' doesn't exist"; 750 } 751 elsif ($arg eq '-maketest' || ($arg eq '-T' && defined ($t_path = shift))) 752 { 753 $make_test_script = 1; 754 } 755 elsif ($arg eq '-makenormtest') 756 { 757 $make_norm_test_script = 1; 758 } 759 elsif ($arg eq '-makelist') { 760 $make_list = 1; 761 } 762 elsif ($arg eq '-C' && defined ($use_directory = shift)) { 763 -d $use_directory or croak "Unknown directory '$use_directory'"; 764 } 765 elsif ($arg eq '-L') { 766 767 # Existence not tested until have chdir'd 768 $file_list = shift; 769 } 770 elsif ($arg eq '-globlist') { 771 $glob_list = 1; 772 } 773 elsif ($arg eq '-c') { 774 $output_range_counts = ! $output_range_counts 775 } 776 elsif ($arg eq '-annotate') { 777 $annotate = 1; 778 $debugging_build = 1; 779 $output_range_counts = 1; 780 } 781 else { 782 my $with_c = 'with'; 783 $with_c .= 'out' if $output_range_counts; # Complements the state 784 croak <<END; 785usage: $0 [-c|-p|-q|-v|-w] [-C dir] [-L filelist] [ -P pod_dir ] 786 [ -T test_file_path ] [-globlist] [-makelist] [-maketest] 787 [-check A B ] 788 -c : Output comments $with_c number of code points in ranges 789 -q : Quiet Mode: Only output serious warnings. 790 -p : Set verbosity level to normal plus show progress. 791 -v : Set Verbosity level high: Show progress and non-serious 792 warnings 793 -w : Write files regardless 794 -C dir : Change to this directory before proceeding. All relative paths 795 except those specified by the -P and -T options will be done 796 with respect to this directory. 797 -P dir : Output $pod_file file to directory 'dir'. 798 -T path : Create a test script as 'path'; overrides -maketest 799 -L filelist : Use alternate 'filelist' instead of standard one 800 -globlist : Take as input all non-Test *.txt files in current and sub 801 directories 802 -maketest : Make test script 'TestProp.pl' in current (or -C directory), 803 overrides -T 804 -makelist : Rewrite the file list $file_list based on current setup 805 -annotate : Output an annotation for each character in the table files; 806 useful for debugging mktables, looking at diffs; but is slow 807 and memory intensive 808 -check A B : Executes $0 only if A and B are the same 809END 810 } 811} 812 813# Stores the most-recently changed file. If none have changed, can skip the 814# build 815my $most_recent = (stat $0)[9]; # Do this before the chdir! 816 817# Change directories now, because need to read 'version' early. 818if ($use_directory) { 819 if ($pod_directory && ! File::Spec->file_name_is_absolute($pod_directory)) { 820 $pod_directory = File::Spec->rel2abs($pod_directory); 821 } 822 if ($t_path && ! File::Spec->file_name_is_absolute($t_path)) { 823 $t_path = File::Spec->rel2abs($t_path); 824 } 825 chdir $use_directory or croak "Failed to chdir to '$use_directory':$!"; 826 if ($pod_directory && File::Spec->file_name_is_absolute($pod_directory)) { 827 $pod_directory = File::Spec->abs2rel($pod_directory); 828 } 829 if ($t_path && File::Spec->file_name_is_absolute($t_path)) { 830 $t_path = File::Spec->abs2rel($t_path); 831 } 832} 833 834# Get Unicode version into regular and v-string. This is done now because 835# various tables below get populated based on it. These tables are populated 836# here to be near the top of the file, and so easily seeable by those needing 837# to modify things. 838open my $VERSION, "<", "version" 839 or croak "$0: can't open required file 'version': $!\n"; 840my $string_version = <$VERSION>; 841close $VERSION; 842chomp $string_version; 843my $v_version = pack "C*", split /\./, $string_version; # v string 844 845my $unicode_version = ($compare_versions) 846 ? ( "$string_compare_versions (using " 847 . "$string_version rules)") 848 : $string_version; 849 850# The following are the complete names of properties with property values that 851# are known to not match any code points in some versions of Unicode, but that 852# may change in the future so they should be matchable, hence an empty file is 853# generated for them. 854my @tables_that_may_be_empty; 855push @tables_that_may_be_empty, 'Joining_Type=Left_Joining' 856 if $v_version lt v6.3.0; 857push @tables_that_may_be_empty, 'Script=Common' if $v_version le v4.0.1; 858push @tables_that_may_be_empty, 'Title' if $v_version lt v2.0.0; 859push @tables_that_may_be_empty, 'Script=Katakana_Or_Hiragana' 860 if $v_version ge v4.1.0; 861push @tables_that_may_be_empty, 'Script_Extensions=Katakana_Or_Hiragana' 862 if $v_version ge v6.0.0; 863push @tables_that_may_be_empty, 'Grapheme_Cluster_Break=Prepend' 864 if $v_version ge v6.1.0; 865push @tables_that_may_be_empty, 'Canonical_Combining_Class=CCC133' 866 if $v_version ge v6.2.0; 867 868# The lists below are hashes, so the key is the item in the list, and the 869# value is the reason why it is in the list. This makes generation of 870# documentation easier. 871 872my %why_suppressed; # No file generated for these. 873 874# Files aren't generated for empty extraneous properties. This is arguable. 875# Extraneous properties generally come about because a property is no longer 876# used in a newer version of Unicode. If we generated a file without code 877# points, programs that used to work on that property will still execute 878# without errors. It just won't ever match (or will always match, with \P{}). 879# This means that the logic is now likely wrong. I (khw) think its better to 880# find this out by getting an error message. Just move them to the table 881# above to change this behavior 882my %why_suppress_if_empty_warn_if_not = ( 883 884 # It is the only property that has ever officially been removed from the 885 # Standard. The database never contained any code points for it. 886 'Special_Case_Condition' => 'Obsolete', 887 888 # Apparently never official, but there were code points in some versions of 889 # old-style PropList.txt 890 'Non_Break' => 'Obsolete', 891); 892 893# These would normally go in the warn table just above, but they were changed 894# a long time before this program was written, so warnings about them are 895# moot. 896if ($v_version gt v3.2.0) { 897 push @tables_that_may_be_empty, 898 'Canonical_Combining_Class=Attached_Below_Left' 899} 900 901# Obsoleted 902if ($v_version ge v11.0.0) { 903 push @tables_that_may_be_empty, qw( 904 Grapheme_Cluster_Break=E_Base 905 Grapheme_Cluster_Break=E_Base_GAZ 906 Grapheme_Cluster_Break=E_Modifier 907 Grapheme_Cluster_Break=Glue_After_Zwj 908 Word_Break=E_Base 909 Word_Break=E_Base_GAZ 910 Word_Break=E_Modifier 911 Word_Break=Glue_After_Zwj); 912} 913 914# Enum values for to_output_map() method in the Map_Table package. (0 is don't 915# output) 916my $EXTERNAL_MAP = 1; 917my $INTERNAL_MAP = 2; 918my $OUTPUT_ADJUSTED = 3; 919 920# To override computed values for writing the map tables for these properties. 921# The default for enum map tables is to write them out, so that the Unicode 922# .txt files can be removed, but all the data to compute any property value 923# for any code point is available in a more compact form. 924my %global_to_output_map = ( 925 # Needed by UCD.pm, but don't want to publicize that it exists, so won't 926 # get stuck supporting it if things change. Since it is a STRING 927 # property, it normally would be listed in the pod, but INTERNAL_MAP 928 # suppresses that. 929 Unicode_1_Name => $INTERNAL_MAP, 930 931 Present_In => 0, # Suppress, as easily computed from Age 932 Block => (NON_ASCII_PLATFORM) ? 1 : 0, # Suppress, as Blocks.txt is 933 # retained, but needed for 934 # non-ASCII 935 936 # Suppress, as mapping can be found instead from the 937 # Perl_Decomposition_Mapping file 938 Decomposition_Type => 0, 939); 940 941# There are several types of obsolete properties defined by Unicode. These 942# must be hand-edited for every new Unicode release. 943my %why_deprecated; # Generates a deprecated warning message if used. 944my %why_stabilized; # Documentation only 945my %why_obsolete; # Documentation only 946 947{ # Closure 948 my $simple = 'Perl uses the more complete version'; 949 my $unihan = 'Unihan properties are by default not enabled in the Perl core.'; 950 951 my $other_properties = 'other properties'; 952 my $contributory = "Used by Unicode internally for generating $other_properties and not intended to be used stand-alone"; 953 my $why_no_expand = "Deprecated by Unicode. These are characters that expand to more than one character in the specified normalization form, but whether they actually take up more bytes or not depends on the encoding being used. For example, a UTF-8 encoded character may expand to a different number of bytes than a UTF-32 encoded character."; 954 955 %why_deprecated = ( 956 'Grapheme_Link' => 'Duplicates ccc=vr (Canonical_Combining_Class=Virama)', 957 'Jamo_Short_Name' => $contributory, 958 'Line_Break=Surrogate' => 'Surrogates should never appear in well-formed text, and therefore shouldn\'t be the basis for line breaking', 959 'Other_Alphabetic' => $contributory, 960 'Other_Default_Ignorable_Code_Point' => $contributory, 961 'Other_Grapheme_Extend' => $contributory, 962 'Other_ID_Continue' => $contributory, 963 'Other_ID_Start' => $contributory, 964 'Other_Lowercase' => $contributory, 965 'Other_Math' => $contributory, 966 'Other_Uppercase' => $contributory, 967 'Expands_On_NFC' => $why_no_expand, 968 'Expands_On_NFD' => $why_no_expand, 969 'Expands_On_NFKC' => $why_no_expand, 970 'Expands_On_NFKD' => $why_no_expand, 971 ); 972 973 %why_suppressed = ( 974 # There is a lib/unicore/Decomposition.pl (used by Normalize.pm) which 975 # contains the same information, but without the algorithmically 976 # determinable Hangul syllables'. This file is not published, so it's 977 # existence is not noted in the comment. 978 'Decomposition_Mapping' => 'Accessible via Unicode::Normalize or prop_invmap() or charprop() in Unicode::UCD::', 979 980 # Don't suppress ISO_Comment, as otherwise special handling is needed 981 # to differentiate between it and gc=c, which can be written as 'isc', 982 # which is the same characters as ISO_Comment's short name. 983 984 'Name' => "Accessible via \\N{...} or 'use charnames;' or charprop() or prop_invmap() in Unicode::UCD::", 985 986 'Simple_Case_Folding' => "$simple. Can access this through casefold(), charprop(), or prop_invmap() in Unicode::UCD", 987 'Simple_Lowercase_Mapping' => "$simple. Can access this through charinfo(), charprop(), or prop_invmap() in Unicode::UCD", 988 'Simple_Titlecase_Mapping' => "$simple. Can access this through charinfo(), charprop(), or prop_invmap() in Unicode::UCD", 989 'Simple_Uppercase_Mapping' => "$simple. Can access this through charinfo(), charprop(), or prop_invmap() in Unicode::UCD", 990 991 FC_NFKC_Closure => 'Deprecated by Unicode, and supplanted in usage by NFKC_Casefold; otherwise not useful', 992 ); 993 994 foreach my $property ( 995 996 # The following are suppressed because they were made contributory 997 # or deprecated by Unicode before Perl ever thought about 998 # supporting them. 999 'Jamo_Short_Name', 1000 'Grapheme_Link', 1001 'Expands_On_NFC', 1002 'Expands_On_NFD', 1003 'Expands_On_NFKC', 1004 'Expands_On_NFKD', 1005 1006 # The following are suppressed because they have been marked 1007 # as deprecated for a sufficient amount of time 1008 'Other_Alphabetic', 1009 'Other_Default_Ignorable_Code_Point', 1010 'Other_Grapheme_Extend', 1011 'Other_ID_Continue', 1012 'Other_ID_Start', 1013 'Other_Lowercase', 1014 'Other_Math', 1015 'Other_Uppercase', 1016 ) { 1017 $why_suppressed{$property} = $why_deprecated{$property}; 1018 } 1019 1020 # Customize the message for all the 'Other_' properties 1021 foreach my $property (keys %why_deprecated) { 1022 next if (my $main_property = $property) !~ s/^Other_//; 1023 $why_deprecated{$property} =~ s/$other_properties/the $main_property property (which should be used instead)/; 1024 } 1025} 1026 1027if ($write_Unicode_deprecated_tables) { 1028 foreach my $property (keys %why_suppressed) { 1029 delete $why_suppressed{$property} if $property =~ 1030 / ^ Other | Grapheme /x; 1031 } 1032} 1033 1034if ($v_version ge 4.0.0) { 1035 $why_stabilized{'Hyphen'} = 'Use the Line_Break property instead; see www.unicode.org/reports/tr14'; 1036 if ($v_version ge 6.0.0) { 1037 $why_deprecated{'Hyphen'} = 'Supplanted by Line_Break property values; see www.unicode.org/reports/tr14'; 1038 } 1039} 1040if ($v_version ge 5.2.0 && $v_version lt 6.0.0) { 1041 $why_obsolete{'ISO_Comment'} = 'Code points for it have been removed'; 1042 if ($v_version ge 6.0.0) { 1043 $why_deprecated{'ISO_Comment'} = 'No longer needed for Unicode\'s internal chart generation; otherwise not useful, and code points for it have been removed'; 1044 } 1045} 1046 1047# Probably obsolete forever 1048if ($v_version ge v4.1.0) { 1049 $why_suppressed{'Script=Katakana_Or_Hiragana'} = 'Obsolete. All code points previously matched by this have been moved to "Script=Common".'; 1050} 1051if ($v_version ge v6.0.0) { 1052 $why_suppressed{'Script=Katakana_Or_Hiragana'} .= ' Consider instead using "Script_Extensions=Katakana" or "Script_Extensions=Hiragana" (or both)'; 1053 $why_suppressed{'Script_Extensions=Katakana_Or_Hiragana'} = 'All code points that would be matched by this are matched by either "Script_Extensions=Katakana" or "Script_Extensions=Hiragana"'; 1054} 1055 1056# This program can create files for enumerated-like properties, such as 1057# 'Numeric_Type'. This file would be the same format as for a string 1058# property, with a mapping from code point to its value, so you could look up, 1059# for example, the script a code point is in. But no one so far wants this 1060# mapping, or they have found another way to get it since this is a new 1061# feature. So no file is generated except if it is in this list. 1062my @output_mapped_properties = split "\n", <<END; 1063END 1064 1065# If you want more Unihan properties than the default, you need to add them to 1066# these arrays. Depending on the property type, @missing lines might have to 1067# be added to the second array. A sample entry would be (including the '#'): 1068# @missing: 0000..10FFFF; cjkAccountingNumeric; NaN 1069my @cjk_properties = split "\n", <<'END'; 1070END 1071my @cjk_property_values = split "\n", <<'END'; 1072END 1073 1074# The input files don't list every code point. Those not listed are to be 1075# defaulted to some value. Below are hard-coded what those values are for 1076# non-binary properties as of 5.1. Starting in 5.0, there are 1077# machine-parsable comment lines in the files that give the defaults; so this 1078# list shouldn't have to be extended. The claim is that all missing entries 1079# for binary properties will default to 'N'. Unicode tried to change that in 1080# 5.2, but the beta period produced enough protest that they backed off. 1081# 1082# The defaults for the fields that appear in UnicodeData.txt in this hash must 1083# be in the form that it expects. The others may be synonyms. 1084my $CODE_POINT = '<code point>'; 1085my %default_mapping = ( 1086 Age => "Unassigned", 1087 # Bidi_Class => Complicated; set in code 1088 Bidi_Mirroring_Glyph => "", 1089 Block => 'No_Block', 1090 Canonical_Combining_Class => 0, 1091 Case_Folding => $CODE_POINT, 1092 Decomposition_Mapping => $CODE_POINT, 1093 Decomposition_Type => 'None', 1094 East_Asian_Width => "Neutral", 1095 FC_NFKC_Closure => $CODE_POINT, 1096 General_Category => ($v_version le 6.3.0) ? 'Cn' : 'Unassigned', 1097 Grapheme_Cluster_Break => 'Other', 1098 Hangul_Syllable_Type => 'NA', 1099 ISO_Comment => "", 1100 Jamo_Short_Name => "", 1101 Joining_Group => "No_Joining_Group", 1102 # Joining_Type => Complicated; set in code 1103 kIICore => 'N', # Is converted to binary 1104 #Line_Break => Complicated; set in code 1105 Lowercase_Mapping => $CODE_POINT, 1106 Name => "", 1107 Name_Alias => "", 1108 NFC_QC => 'Yes', 1109 NFD_QC => 'Yes', 1110 NFKC_QC => 'Yes', 1111 NFKD_QC => 'Yes', 1112 Numeric_Type => 'None', 1113 Numeric_Value => 'NaN', 1114 Script => ($v_version le 4.1.0) ? 'Common' : 'Unknown', 1115 Sentence_Break => 'Other', 1116 Simple_Case_Folding => $CODE_POINT, 1117 Simple_Lowercase_Mapping => $CODE_POINT, 1118 Simple_Titlecase_Mapping => $CODE_POINT, 1119 Simple_Uppercase_Mapping => $CODE_POINT, 1120 Titlecase_Mapping => $CODE_POINT, 1121 Unicode_1_Name => "", 1122 Unicode_Radical_Stroke => "", 1123 Uppercase_Mapping => $CODE_POINT, 1124 Word_Break => 'Other', 1125); 1126 1127### End of externally interesting definitions, except for @input_file_objects 1128 1129my $HEADER=<<"EOF"; 1130# !!!!!!! DO NOT EDIT THIS FILE !!!!!!! 1131# This file is machine-generated by $0 from the Unicode 1132# database, Version $unicode_version. Any changes made here will be lost! 1133EOF 1134 1135my $INTERNAL_ONLY_HEADER = <<"EOF"; 1136 1137# !!!!!!! INTERNAL PERL USE ONLY !!!!!!! 1138# This file is for internal use by core Perl only. The format and even the 1139# name or existence of this file are subject to change without notice. Don't 1140# use it directly. Use Unicode::UCD to access the Unicode character data 1141# base. 1142EOF 1143 1144my $DEVELOPMENT_ONLY=<<"EOF"; 1145# !!!!!!! DEVELOPMENT USE ONLY !!!!!!! 1146# This file contains information artificially constrained to code points 1147# present in Unicode release $string_compare_versions. 1148# IT CANNOT BE RELIED ON. It is for use during development only and should 1149# not be used for production. 1150 1151EOF 1152 1153my $MAX_UNICODE_CODEPOINT_STRING = ($v_version ge v2.0.0) 1154 ? "10FFFF" 1155 : "FFFF"; 1156my $MAX_UNICODE_CODEPOINT = hex $MAX_UNICODE_CODEPOINT_STRING; 1157my $MAX_UNICODE_CODEPOINTS = $MAX_UNICODE_CODEPOINT + 1; 1158 1159# We work with above-Unicode code points, up to IV_MAX, but we may want to use 1160# sentinels above that number. Therefore for internal use, we use a much 1161# smaller number, translating it to IV_MAX only for output. The exact number 1162# is immaterial (all above-Unicode code points are treated exactly the same), 1163# but the algorithm requires it to be at least 1164# 2 * $MAX_UNICODE_CODEPOINTS + 1 1165my $MAX_WORKING_CODEPOINTS= $MAX_UNICODE_CODEPOINT * 8; 1166my $MAX_WORKING_CODEPOINT = $MAX_WORKING_CODEPOINTS - 1; 1167my $MAX_WORKING_CODEPOINT_STRING = sprintf("%X", $MAX_WORKING_CODEPOINT); 1168 1169my $MAX_PLATFORM_CODEPOINT = ~0 >> 1; 1170 1171# Matches legal code point. 4-6 hex numbers, If there are 6, the first 1172# two must be 10; if there are 5, the first must not be a 0. Written this way 1173# to decrease backtracking. The first regex allows the code point to be at 1174# the end of a word, but to work properly, the word shouldn't end with a valid 1175# hex character. The second one won't match a code point at the end of a 1176# word, and doesn't have the run-on issue 1177my $run_on_code_point_re = 1178 qr/ (?: 10[0-9A-F]{4} | [1-9A-F][0-9A-F]{4} | [0-9A-F]{4} ) \b/x; 1179my $code_point_re = qr/\b$run_on_code_point_re/; 1180 1181# This matches the beginning of the line in the Unicode DB files that give the 1182# defaults for code points not listed (i.e., missing) in the file. The code 1183# depends on this ending with a semi-colon, so it can assume it is a valid 1184# field when the line is split() by semi-colons 1185my $missing_defaults_prefix = qr/^#\s+\@missing:\s+0000\.\.10FFFF\s*;/; 1186 1187# Property types. Unicode has more types, but these are sufficient for our 1188# purposes. 1189my $UNKNOWN = -1; # initialized to illegal value 1190my $NON_STRING = 1; # Either binary or enum 1191my $BINARY = 2; 1192my $FORCED_BINARY = 3; # Not a binary property, but, besides its normal 1193 # tables, additional true and false tables are 1194 # generated so that false is anything matching the 1195 # default value, and true is everything else. 1196my $ENUM = 4; # Include catalog 1197my $STRING = 5; # Anything else: string or misc 1198 1199# Some input files have lines that give default values for code points not 1200# contained in the file. Sometimes these should be ignored. 1201my $NO_DEFAULTS = 0; # Must evaluate to false 1202my $NOT_IGNORED = 1; 1203my $IGNORED = 2; 1204 1205# Range types. Each range has a type. Most ranges are type 0, for normal, 1206# and will appear in the main body of the tables in the output files, but 1207# there are other types of ranges as well, listed below, that are specially 1208# handled. There are pseudo-types as well that will never be stored as a 1209# type, but will affect the calculation of the type. 1210 1211# 0 is for normal, non-specials 1212my $MULTI_CP = 1; # Sequence of more than code point 1213my $HANGUL_SYLLABLE = 2; 1214my $CP_IN_NAME = 3; # The NAME contains the code point appended to it. 1215my $NULL = 4; # The map is to the null string; utf8.c can't 1216 # handle these, nor is there an accepted syntax 1217 # for them in \p{} constructs 1218my $COMPUTE_NO_MULTI_CP = 5; # Pseudo-type; means that ranges that would 1219 # otherwise be $MULTI_CP type are instead type 0 1220 1221# process_generic_property_file() can accept certain overrides in its input. 1222# Each of these must begin AND end with $CMD_DELIM. 1223my $CMD_DELIM = "\a"; 1224my $REPLACE_CMD = 'replace'; # Override the Replace 1225my $MAP_TYPE_CMD = 'map_type'; # Override the Type 1226 1227my $NO = 0; 1228my $YES = 1; 1229 1230# Values for the Replace argument to add_range. 1231# $NO # Don't replace; add only the code points not 1232 # already present. 1233my $IF_NOT_EQUIVALENT = 1; # Replace only under certain conditions; details in 1234 # the comments at the subroutine definition. 1235my $UNCONDITIONALLY = 2; # Replace without conditions. 1236my $MULTIPLE_BEFORE = 4; # Don't replace, but add a duplicate record if 1237 # already there 1238my $MULTIPLE_AFTER = 5; # Don't replace, but add a duplicate record if 1239 # already there 1240my $CROAK = 6; # Die with an error if is already there 1241 1242# Flags to give property statuses. The phrases are to remind maintainers that 1243# if the flag is changed, the indefinite article referring to it in the 1244# documentation may need to be as well. 1245my $NORMAL = ""; 1246my $DEPRECATED = 'D'; 1247my $a_bold_deprecated = "a 'B<$DEPRECATED>'"; 1248my $A_bold_deprecated = "A 'B<$DEPRECATED>'"; 1249my $DISCOURAGED = 'X'; 1250my $a_bold_discouraged = "an 'B<$DISCOURAGED>'"; 1251my $A_bold_discouraged = "An 'B<$DISCOURAGED>'"; 1252my $STRICTER = 'T'; 1253my $a_bold_stricter = "a 'B<$STRICTER>'"; 1254my $A_bold_stricter = "A 'B<$STRICTER>'"; 1255my $STABILIZED = 'S'; 1256my $a_bold_stabilized = "an 'B<$STABILIZED>'"; 1257my $A_bold_stabilized = "An 'B<$STABILIZED>'"; 1258my $OBSOLETE = 'O'; 1259my $a_bold_obsolete = "an 'B<$OBSOLETE>'"; 1260my $A_bold_obsolete = "An 'B<$OBSOLETE>'"; 1261 1262# Aliases can also have an extra status: 1263my $INTERNAL_ALIAS = 'P'; 1264 1265my %status_past_participles = ( 1266 $DISCOURAGED => 'discouraged', 1267 $STABILIZED => 'stabilized', 1268 $OBSOLETE => 'obsolete', 1269 $DEPRECATED => 'deprecated', 1270 $INTERNAL_ALIAS => 'reserved for Perl core internal use only', 1271); 1272 1273# Table fates. These are somewhat ordered, so that fates < $MAP_PROXIED should be 1274# externally documented. 1275my $ORDINARY = 0; # The normal fate. 1276my $MAP_PROXIED = 1; # The map table for the property isn't written out, 1277 # but there is a file written that can be used to 1278 # reconstruct this table 1279my $INTERNAL_ONLY = 2; # The file for this table is written out, but it is 1280 # for Perl's internal use only 1281my $SUPPRESSED = 3; # The file for this table is not written out, and as a 1282 # result, we don't bother to do many computations on 1283 # it. 1284my $PLACEHOLDER = 4; # Like $SUPPRESSED, but we go through all the 1285 # computations anyway, as the values are needed for 1286 # things to work. This happens when we have Perl 1287 # extensions that depend on Unicode tables that 1288 # wouldn't normally be in a given Unicode version. 1289 1290# The format of the values of the tables: 1291my $EMPTY_FORMAT = ""; 1292my $BINARY_FORMAT = 'b'; 1293my $DECIMAL_FORMAT = 'd'; 1294my $FLOAT_FORMAT = 'f'; 1295my $INTEGER_FORMAT = 'i'; 1296my $HEX_FORMAT = 'x'; 1297my $RATIONAL_FORMAT = 'r'; 1298my $STRING_FORMAT = 's'; 1299my $ADJUST_FORMAT = 'a'; 1300my $HEX_ADJUST_FORMAT = 'ax'; 1301my $DECOMP_STRING_FORMAT = 'c'; 1302my $STRING_WHITE_SPACE_LIST = 'sw'; 1303 1304my %map_table_formats = ( 1305 $BINARY_FORMAT => 'binary', 1306 $DECIMAL_FORMAT => 'single decimal digit', 1307 $FLOAT_FORMAT => 'floating point number', 1308 $INTEGER_FORMAT => 'integer', 1309 $HEX_FORMAT => 'non-negative hex whole number; a code point', 1310 $RATIONAL_FORMAT => 'rational: an integer or a fraction', 1311 $STRING_FORMAT => 'string', 1312 $ADJUST_FORMAT => 'some entries need adjustment', 1313 $HEX_ADJUST_FORMAT => 'mapped value in hex; some entries need adjustment', 1314 $DECOMP_STRING_FORMAT => 'Perl\'s internal (Normalize.pm) decomposition mapping', 1315 $STRING_WHITE_SPACE_LIST => 'string, but some elements are interpreted as a list; white space occurs only as list item separators' 1316); 1317 1318# Unicode didn't put such derived files in a separate directory at first. 1319my $EXTRACTED_DIR = (-d 'extracted') ? 'extracted' : ""; 1320my $EXTRACTED = ($EXTRACTED_DIR) ? "$EXTRACTED_DIR/" : ""; 1321my $AUXILIARY = 'auxiliary'; 1322my $EMOJI = 'emoji'; 1323 1324# Hashes and arrays that will eventually go into UCD.pl for the use of UCD.pm 1325my %loose_to_file_of; # loosely maps table names to their respective 1326 # files 1327my %stricter_to_file_of; # same; but for stricter mapping. 1328my %loose_property_to_file_of; # Maps a loose property name to its map file 1329my %strict_property_to_file_of; # Same, but strict 1330my @inline_definitions = "V0"; # Each element gives a definition of a unique 1331 # inversion list. When a definition is inlined, 1332 # its value in the hash it's in (one of the two 1333 # defined just above) will include an index into 1334 # this array. The 0th element is initialized to 1335 # the definition for a zero length inversion list 1336my %file_to_swash_name; # Maps the file name to its corresponding key name 1337 # in the hash %Unicode::UCD::SwashInfo 1338my %nv_floating_to_rational; # maps numeric values floating point numbers to 1339 # their rational equivalent 1340my %loose_property_name_of; # Loosely maps (non_string) property names to 1341 # standard form 1342my %strict_property_name_of; # Strictly maps (non_string) property names to 1343 # standard form 1344my %string_property_loose_to_name; # Same, for string properties. 1345my %loose_defaults; # keys are of form "prop=value", where 'prop' is 1346 # the property name in standard loose form, and 1347 # 'value' is the default value for that property, 1348 # also in standard loose form. 1349my %loose_to_standard_value; # loosely maps table names to the canonical 1350 # alias for them 1351my %ambiguous_names; # keys are alias names (in standard form) that 1352 # have more than one possible meaning. 1353my %combination_property; # keys are alias names (in standard form) that 1354 # have both a map table, and a binary one that 1355 # yields true for all non-null maps. 1356my %prop_aliases; # Keys are standard property name; values are each 1357 # one's aliases 1358my %prop_value_aliases; # Keys of top level are standard property name; 1359 # values are keys to another hash, Each one is 1360 # one of the property's values, in standard form. 1361 # The values are that prop-val's aliases. 1362my %skipped_files; # List of files that we skip 1363my %ucd_pod; # Holds entries that will go into the UCD section of the pod 1364 1365# Most properties are immune to caseless matching, otherwise you would get 1366# nonsensical results, as properties are a function of a code point, not 1367# everything that is caselessly equivalent to that code point. For example, 1368# Changes_When_Case_Folded('s') should be false, whereas caselessly it would 1369# be true because 's' and 'S' are equivalent caselessly. However, 1370# traditionally, [:upper:] and [:lower:] are equivalent caselessly, so we 1371# extend that concept to those very few properties that are like this. Each 1372# such property will match the full range caselessly. They are hard-coded in 1373# the program; it's not worth trying to make it general as it's extremely 1374# unlikely that they will ever change. 1375my %caseless_equivalent_to; 1376 1377# This is the range of characters that were in Release 1 of Unicode, and 1378# removed in Release 2 (replaced with the current Hangul syllables starting at 1379# U+AC00). The range was reused starting in Release 3 for other purposes. 1380my $FIRST_REMOVED_HANGUL_SYLLABLE = 0x3400; 1381my $FINAL_REMOVED_HANGUL_SYLLABLE = 0x4DFF; 1382 1383# These constants names and values were taken from the Unicode standard, 1384# version 5.1, section 3.12. They are used in conjunction with Hangul 1385# syllables. The '_string' versions are so generated tables can retain the 1386# hex format, which is the more familiar value 1387my $SBase_string = "0xAC00"; 1388my $SBase = CORE::hex $SBase_string; 1389my $LBase_string = "0x1100"; 1390my $LBase = CORE::hex $LBase_string; 1391my $VBase_string = "0x1161"; 1392my $VBase = CORE::hex $VBase_string; 1393my $TBase_string = "0x11A7"; 1394my $TBase = CORE::hex $TBase_string; 1395my $SCount = 11172; 1396my $LCount = 19; 1397my $VCount = 21; 1398my $TCount = 28; 1399my $NCount = $VCount * $TCount; 1400 1401# For Hangul syllables; These store the numbers from Jamo.txt in conjunction 1402# with the above published constants. 1403my %Jamo; 1404my %Jamo_L; # Leading consonants 1405my %Jamo_V; # Vowels 1406my %Jamo_T; # Trailing consonants 1407 1408# For code points whose name contains its ordinal as a '-ABCD' suffix. 1409# The key is the base name of the code point, and the value is an 1410# array giving all the ranges that use this base name. Each range 1411# is actually a hash giving the 'low' and 'high' values of it. 1412my %names_ending_in_code_point; 1413my %loose_names_ending_in_code_point; # Same as above, but has blanks, dashes 1414 # removed from the names 1415# Inverse mapping. The list of ranges that have these kinds of 1416# names. Each element contains the low, high, and base names in an 1417# anonymous hash. 1418my @code_points_ending_in_code_point; 1419 1420# To hold Unicode's normalization test suite 1421my @normalization_tests; 1422 1423# Boolean: does this Unicode version have the hangul syllables, and are we 1424# writing out a table for them? 1425my $has_hangul_syllables = 0; 1426 1427# Does this Unicode version have code points whose names end in their 1428# respective code points, and are we writing out a table for them? 0 for no; 1429# otherwise points to first property that a table is needed for them, so that 1430# if multiple tables are needed, we don't create duplicates 1431my $needing_code_points_ending_in_code_point = 0; 1432 1433my @backslash_X_tests; # List of tests read in for testing \X 1434my @LB_tests; # List of tests read in for testing \b{lb} 1435my @SB_tests; # List of tests read in for testing \b{sb} 1436my @WB_tests; # List of tests read in for testing \b{wb} 1437my @unhandled_properties; # Will contain a list of properties found in 1438 # the input that we didn't process. 1439my @match_properties; # Properties that have match tables, to be 1440 # listed in the pod 1441my @map_properties; # Properties that get map files written 1442my @named_sequences; # NamedSequences.txt contents. 1443my %potential_files; # Generated list of all .txt files in the directory 1444 # structure so we can warn if something is being 1445 # ignored. 1446my @missing_early_files; # Generated list of absent files that we need to 1447 # proceed in compiling this early Unicode version 1448my @files_actually_output; # List of files we generated. 1449my @more_Names; # Some code point names are compound; this is used 1450 # to store the extra components of them. 1451my $E_FLOAT_PRECISION = 2; # The minimum number of digits after the decimal 1452 # point of a normalized floating point number 1453 # needed to match before we consider it equivalent 1454 # to a candidate rational 1455 1456# These store references to certain commonly used property objects 1457my $age; 1458my $ccc; 1459my $gc; 1460my $perl; 1461my $block; 1462my $perl_charname; 1463my $print; 1464my $All; 1465my $Assigned; # All assigned characters in this Unicode release 1466my $DI; # Default_Ignorable_Code_Point property 1467my $NChar; # Noncharacter_Code_Point property 1468my $script; 1469my $scx; # Script_Extensions property 1470my $idt; # Identifier_Type property 1471 1472# Are there conflicting names because of beginning with 'In_', or 'Is_' 1473my $has_In_conflicts = 0; 1474my $has_Is_conflicts = 0; 1475 1476sub internal_file_to_platform ($file=undef) { 1477 # Convert our file paths which have '/' separators to those of the 1478 # platform. 1479 1480 return undef unless defined $file; 1481 1482 return File::Spec->join(split '/', $file); 1483} 1484 1485sub file_exists ($file=undef) { # platform independent '-e'. This program internally 1486 # uses slash as a path separator. 1487 return 0 unless defined $file; 1488 return -e internal_file_to_platform($file); 1489} 1490 1491sub objaddr($addr) { 1492 # Returns the address of the blessed input object. 1493 # It doesn't check for blessedness because that would do a string eval 1494 # every call, and the program is structured so that this is never called 1495 # for a non-blessed object. 1496 1497 no overloading; # If overloaded, numifying below won't work. 1498 1499 # Numifying a ref gives its address. 1500 return pack 'J', $addr; 1501} 1502 1503# These are used only if $annotate is true. 1504# The entire range of Unicode characters is examined to populate these 1505# after all the input has been processed. But most can be skipped, as they 1506# have the same descriptive phrases, such as being unassigned 1507my @viacode; # Contains the 1 million character names 1508my @age; # And their ages ("" if none) 1509my @printable; # boolean: And are those characters printable? 1510my @annotate_char_type; # Contains a type of those characters, specifically 1511 # for the purposes of annotation. 1512my $annotate_ranges; # A map of ranges of code points that have the same 1513 # name for the purposes of annotation. They map to the 1514 # upper edge of the range, so that the end point can 1515 # be immediately found. This is used to skip ahead to 1516 # the end of a range, and avoid processing each 1517 # individual code point in it. 1518my $unassigned_sans_noncharacters; # A Range_List of the unassigned 1519 # characters, but excluding those which are 1520 # also noncharacter code points 1521 1522# The annotation types are an extension of the regular range types, though 1523# some of the latter are folded into one. Make the new types negative to 1524# avoid conflicting with the regular types 1525my $SURROGATE_TYPE = -1; 1526my $UNASSIGNED_TYPE = -2; 1527my $PRIVATE_USE_TYPE = -3; 1528my $NONCHARACTER_TYPE = -4; 1529my $CONTROL_TYPE = -5; 1530my $ABOVE_UNICODE_TYPE = -6; 1531my $UNKNOWN_TYPE = -7; # Used only if there is a bug in this program 1532 1533sub populate_char_info ($i) { 1534 # Used only with the $annotate option. Populates the arrays with the 1535 # input code point's info that are needed for outputting more detailed 1536 # comments. If calling context wants a return, it is the end point of 1537 # any contiguous range of characters that share essentially the same info 1538 1539 $viacode[$i] = $perl_charname->value_of($i) || ""; 1540 $age[$i] = (defined $age) 1541 ? (($age->value_of($i) =~ / ^ \d+ \. \d+ $ /x) 1542 ? $age->value_of($i) 1543 : "") 1544 : ""; 1545 1546 # A character is generally printable if Unicode says it is, 1547 # but below we make sure that most Unicode general category 'C' types 1548 # aren't. 1549 $printable[$i] = $print->contains($i); 1550 1551 # But the characters in this range were removed in v2.0 and replaced by 1552 # different ones later. Modern fonts will be for the replacement 1553 # characters, so suppress printing them. 1554 if (($v_version lt v2.0 1555 || ($compare_versions && $compare_versions lt v2.0)) 1556 && ( $i >= $FIRST_REMOVED_HANGUL_SYLLABLE 1557 && $i <= $FINAL_REMOVED_HANGUL_SYLLABLE)) 1558 { 1559 $printable[$i] = 0; 1560 } 1561 1562 $annotate_char_type[$i] = $perl_charname->type_of($i) || 0; 1563 1564 # Only these two regular types are treated specially for annotations 1565 # purposes 1566 $annotate_char_type[$i] = 0 if $annotate_char_type[$i] != $CP_IN_NAME 1567 && $annotate_char_type[$i] != $HANGUL_SYLLABLE; 1568 1569 # Give a generic name to all code points that don't have a real name. 1570 # We output ranges, if applicable, for these. Also calculate the end 1571 # point of the range. 1572 my $end; 1573 if (! $viacode[$i]) { 1574 if ($i > $MAX_UNICODE_CODEPOINT) { 1575 $viacode[$i] = 'Above-Unicode'; 1576 $annotate_char_type[$i] = $ABOVE_UNICODE_TYPE; 1577 $printable[$i] = 0; 1578 $end = $MAX_WORKING_CODEPOINT; 1579 } 1580 elsif ($gc-> table('Private_use')->contains($i)) { 1581 $viacode[$i] = 'Private Use'; 1582 $annotate_char_type[$i] = $PRIVATE_USE_TYPE; 1583 $printable[$i] = 0; 1584 $end = $gc->table('Private_Use')->containing_range($i)->end; 1585 } 1586 elsif ($NChar->contains($i)) { 1587 $viacode[$i] = 'Noncharacter'; 1588 $annotate_char_type[$i] = $NONCHARACTER_TYPE; 1589 $printable[$i] = 0; 1590 $end = $NChar->containing_range($i)->end; 1591 } 1592 elsif ($gc-> table('Control')->contains($i)) { 1593 my $name_ref = property_ref('Name_Alias'); 1594 $name_ref = property_ref('Unicode_1_Name') if ! defined $name_ref; 1595 $viacode[$i] = (defined $name_ref) 1596 ? $name_ref->value_of($i) 1597 : 'Control'; 1598 $annotate_char_type[$i] = $CONTROL_TYPE; 1599 $printable[$i] = 0; 1600 } 1601 elsif ($gc-> table('Unassigned')->contains($i)) { 1602 $annotate_char_type[$i] = $UNASSIGNED_TYPE; 1603 $printable[$i] = 0; 1604 $viacode[$i] = 'Unassigned'; 1605 1606 if (defined $block) { # No blocks in earliest releases 1607 $viacode[$i] .= ', block=' . $block-> value_of($i); 1608 $end = $gc-> table('Unassigned')->containing_range($i)->end; 1609 1610 # Because we name the unassigned by the blocks they are in, it 1611 # can't go past the end of that block, and it also can't go 1612 # past the unassigned range it is in. The special table makes 1613 # sure that the non-characters, which are unassigned, are 1614 # separated out. 1615 $end = min($block->containing_range($i)->end, 1616 $unassigned_sans_noncharacters-> 1617 containing_range($i)->end); 1618 } 1619 else { 1620 $end = $i + 1; 1621 while ($unassigned_sans_noncharacters->contains($end)) { 1622 $end++; 1623 } 1624 $end--; 1625 } 1626 } 1627 elsif ($perl->table('_Perl_Surrogate')->contains($i)) { 1628 $viacode[$i] = 'Surrogate'; 1629 $annotate_char_type[$i] = $SURROGATE_TYPE; 1630 $printable[$i] = 0; 1631 $end = $gc->table('Surrogate')->containing_range($i)->end; 1632 } 1633 else { 1634 Carp::my_carp_bug("Can't figure out how to annotate " 1635 . sprintf("U+%04X", $i) 1636 . ". Proceeding anyway."); 1637 $viacode[$i] = 'UNKNOWN'; 1638 $annotate_char_type[$i] = $UNKNOWN_TYPE; 1639 $printable[$i] = 0; 1640 } 1641 } 1642 1643 # Here, has a name, but if it's one in which the code point number is 1644 # appended to the name, do that. 1645 elsif ($annotate_char_type[$i] == $CP_IN_NAME) { 1646 $viacode[$i] .= sprintf("-%04X", $i); 1647 1648 my $limit = $perl_charname->containing_range($i)->end; 1649 if (defined $age) { 1650 # Do all these as groups of the same age, instead of individually, 1651 # because their names are so meaningless, and there are typically 1652 # large quantities of them. 1653 $end = $i + 1; 1654 while ($end <= $limit && $age->value_of($end) == $age[$i]) { 1655 $end++; 1656 } 1657 $end--; 1658 } 1659 else { 1660 $end = $limit; 1661 } 1662 } 1663 1664 # And here, has a name, but if it's a hangul syllable one, replace it with 1665 # the correct name from the Unicode algorithm 1666 elsif ($annotate_char_type[$i] == $HANGUL_SYLLABLE) { 1667 use integer; 1668 my $SIndex = $i - $SBase; 1669 my $L = $LBase + $SIndex / $NCount; 1670 my $V = $VBase + ($SIndex % $NCount) / $TCount; 1671 my $T = $TBase + $SIndex % $TCount; 1672 $viacode[$i] = "HANGUL SYLLABLE $Jamo{$L}$Jamo{$V}"; 1673 $viacode[$i] .= $Jamo{$T} if $T != $TBase; 1674 $end = $perl_charname->containing_range($i)->end; 1675 } 1676 1677 return if ! defined wantarray; 1678 return $i if ! defined $end; # If not a range, return the input 1679 1680 # Save this whole range so can find the end point quickly 1681 $annotate_ranges->add_map($i, $end, $end); 1682 1683 return $end; 1684} 1685 1686sub max($a, $b) { 1687 return $a >= $b ? $a : $b; 1688} 1689 1690sub min($a, $b) { 1691 return $a <= $b ? $a : $b; 1692} 1693 1694sub clarify_number ($number) { 1695 # This returns the input number with underscores inserted every 3 digits 1696 # in large (5 digits or more) numbers. Input must be entirely digits, not 1697 # checked. 1698 1699 my $pos = length($number) - 3; 1700 return $number if $pos <= 1; 1701 while ($pos > 0) { 1702 substr($number, $pos, 0) = '_'; 1703 $pos -= 3; 1704 } 1705 return $number; 1706} 1707 1708sub clarify_code_point_count ($number) { 1709 # This is like clarify_number(), but the input is assumed to be a count of 1710 # code points, rather than a generic number. 1711 1712 my $append = ""; 1713 1714 if ($number > $MAX_UNICODE_CODEPOINTS) { 1715 $number -= ($MAX_WORKING_CODEPOINTS - $MAX_UNICODE_CODEPOINTS); 1716 return "All above-Unicode code points" if $number == 0; 1717 $append = " + all above-Unicode code points"; 1718 } 1719 return clarify_number($number) . $append; 1720} 1721 1722package Carp; 1723 1724# These routines give a uniform treatment of messages in this program. They 1725# are placed in the Carp package to cause the stack trace to not include them, 1726# although an alternative would be to use another package and set @CARP_NOT 1727# for it. 1728 1729our $Verbose = 1 if main::DEBUG; # Useful info when debugging 1730 1731# This is a work-around suggested by Nicholas Clark to fix a problem with Carp 1732# and overload trying to load Scalar:Util under miniperl. See 1733# http://www.xray.mpe.mpg.de/mailing-lists/perl5-porters/2009-11/msg01057.html 1734undef $overload::VERSION; 1735 1736sub my_carp($message="", $nofold=0) { 1737 1738 if ($message) { 1739 $message = main::join_lines($message); 1740 $message =~ s/^$0: *//; # Remove initial program name 1741 $message =~ s/[.;,]+$//; # Remove certain ending punctuation 1742 $message = "\n$0: $message;"; 1743 1744 # Fold the message with program name, semi-colon end punctuation 1745 # (which looks good with the message that carp appends to it), and a 1746 # hanging indent for continuation lines. 1747 $message = main::simple_fold($message, "", 4) unless $nofold; 1748 $message =~ s/\n$//; # Remove the trailing nl so what carp 1749 # appends is to the same line 1750 } 1751 1752 return $message if defined wantarray; # If a caller just wants the msg 1753 1754 carp $message; 1755 return; 1756} 1757 1758sub my_carp_bug($message="") { 1759 # This is called when it is clear that the problem is caused by a bug in 1760 # this program. 1761 $message =~ s/^$0: *//; 1762 $message = my_carp("Bug in $0. Please report it by running perlbug or if that is unavailable, by sending email to perbug\@perl.org:\n$message"); 1763 carp $message; 1764 return; 1765} 1766 1767sub carp_too_few_args($args_ref, $count) { 1768 my_carp_bug("Need at least $count arguments to " 1769 . (caller 1)[3] 1770 . ". Instead got: '" 1771 . join ', ', @$args_ref 1772 . "'. No action taken."); 1773 return; 1774} 1775 1776sub carp_extra_args($args_ref) { 1777 unless (ref $args_ref) { 1778 my_carp_bug("Argument to 'carp_extra_args' ($args_ref) must be a ref. Not checking arguments."); 1779 return; 1780 } 1781 my ($package, $file, $line) = caller; 1782 my $subroutine = (caller 1)[3]; 1783 1784 my $list; 1785 if (ref $args_ref eq 'HASH') { 1786 foreach my $key (keys %$args_ref) { 1787 $args_ref->{$key} = $UNDEF unless defined $args_ref->{$key}; 1788 } 1789 $list = join ', ', each %{$args_ref}; 1790 } 1791 elsif (ref $args_ref eq 'ARRAY') { 1792 foreach my $arg (@$args_ref) { 1793 $arg = $UNDEF unless defined $arg; 1794 } 1795 $list = join ', ', @$args_ref; 1796 } 1797 else { 1798 my_carp_bug("Can't cope with ref " 1799 . ref($args_ref) 1800 . " . argument to 'carp_extra_args'. Not checking arguments."); 1801 return; 1802 } 1803 1804 my_carp_bug("Unrecognized parameters in options: '$list' to $subroutine. Skipped."); 1805 return; 1806} 1807 1808package main; 1809 1810{ # Closure 1811 1812 # This program uses the inside-out method for objects, as recommended in 1813 # "Perl Best Practices". (This is the best solution still, since this has 1814 # to run under miniperl.) This closure aids in generating those. There 1815 # are two routines. setup_package() is called once per package to set 1816 # things up, and then set_access() is called for each hash representing a 1817 # field in the object. These routines arrange for the object to be 1818 # properly destroyed when no longer used, and for standard accessor 1819 # functions to be generated. If you need more complex accessors, just 1820 # write your own and leave those accesses out of the call to set_access(). 1821 # More details below. 1822 1823 my %constructor_fields; # fields that are to be used in constructors; see 1824 # below 1825 1826 # The values of this hash will be the package names as keys to other 1827 # hashes containing the name of each field in the package as keys, and 1828 # references to their respective hashes as values. 1829 my %package_fields; 1830 1831 sub setup_package { 1832 # Sets up the package, creating standard DESTROY and dump methods 1833 # (unless already defined). The dump method is used in debugging by 1834 # simple_dumper(). 1835 # The optional parameters are: 1836 # a) a reference to a hash, that gets populated by later 1837 # set_access() calls with one of the accesses being 1838 # 'constructor'. The caller can then refer to this, but it is 1839 # not otherwise used by these two routines. 1840 # b) a reference to a callback routine to call during destruction 1841 # of the object, before any fields are actually destroyed 1842 1843 my %args = @_; 1844 my $constructor_ref = delete $args{'Constructor_Fields'}; 1845 my $destroy_callback = delete $args{'Destroy_Callback'}; 1846 Carp::carp_extra_args(\@_) if main::DEBUG && %args; 1847 1848 my %fields; 1849 my $package = (caller)[0]; 1850 1851 $package_fields{$package} = \%fields; 1852 $constructor_fields{$package} = $constructor_ref; 1853 1854 unless ($package->can('DESTROY')) { 1855 my $destroy_name = "${package}::DESTROY"; 1856 no strict "refs"; 1857 1858 # Use typeglob to give the anonymous subroutine the name we want 1859 *$destroy_name = sub { 1860 my $self = shift; 1861 my $addr = do { no overloading; pack 'J', $self; }; 1862 1863 $self->$destroy_callback if $destroy_callback; 1864 foreach my $field (keys %{$package_fields{$package}}) { 1865 #print STDERR __LINE__, ": Destroying ", ref $self, " ", sprintf("%04X", $addr), ": ", $field, "\n"; 1866 delete $package_fields{$package}{$field}{$addr}; 1867 } 1868 return; 1869 } 1870 } 1871 1872 unless ($package->can('dump')) { 1873 my $dump_name = "${package}::dump"; 1874 no strict "refs"; 1875 *$dump_name = sub { 1876 my $self = shift; 1877 return dump_inside_out($self, $package_fields{$package}, @_); 1878 } 1879 } 1880 return; 1881 } 1882 1883 sub set_access($name, $field, @accessors) { 1884 # Arrange for the input field to be garbage collected when no longer 1885 # needed. Also, creates standard accessor functions for the field 1886 # based on the optional parameters-- none if none of these parameters: 1887 # 'addable' creates an 'add_NAME()' accessor function. 1888 # 'readable' or 'readable_array' creates a 'NAME()' accessor 1889 # function. 1890 # 'settable' creates a 'set_NAME()' accessor function. 1891 # 'constructor' doesn't create an accessor function, but adds the 1892 # field to the hash that was previously passed to 1893 # setup_package(); 1894 # Any of the accesses can be abbreviated down, so that 'a', 'ad', 1895 # 'add' etc. all mean 'addable'. 1896 # The read accessor function will work on both array and scalar 1897 # values. If another accessor in the parameter list is 'a', the read 1898 # access assumes an array. You can also force it to be array access 1899 # by specifying 'readable_array' instead of 'readable' 1900 # 1901 # A sort-of 'protected' access can be set-up by preceding the addable, 1902 # readable or settable with some initial portion of 'protected_' (but, 1903 # the underscore is required), like 'p_a', 'pro_set', etc. The 1904 # "protection" is only by convention. All that happens is that the 1905 # accessor functions' names begin with an underscore. So instead of 1906 # calling set_foo, the call is _set_foo. (Real protection could be 1907 # accomplished by having a new subroutine, end_package, called at the 1908 # end of each package, and then storing the __LINE__ ranges and 1909 # checking them on every accessor. But that is way overkill.) 1910 1911 # We create anonymous subroutines as the accessors and then use 1912 # typeglobs to assign them to the proper package and name 1913 1914 # $name Name of the field 1915 # $field Reference to the inside-out hash containing the 1916 # field 1917 1918 my $package = (caller)[0]; 1919 1920 if (! exists $package_fields{$package}) { 1921 croak "$0: Must call 'setup_package' before 'set_access'"; 1922 } 1923 1924 # Stash the field so DESTROY can get it. 1925 $package_fields{$package}{$name} = $field; 1926 1927 # Remaining arguments are the accessors. For each... 1928 foreach my $access (@accessors) { 1929 my $access = lc $access; 1930 1931 my $protected = ""; 1932 1933 # Match the input as far as it goes. 1934 if ($access =~ /^(p[^_]*)_/) { 1935 $protected = $1; 1936 if (substr('protected_', 0, length $protected) 1937 eq $protected) 1938 { 1939 1940 # Add 1 for the underscore not included in $protected 1941 $access = substr($access, length($protected) + 1); 1942 $protected = '_'; 1943 } 1944 else { 1945 $protected = ""; 1946 } 1947 } 1948 1949 if (substr('addable', 0, length $access) eq $access) { 1950 my $subname = "${package}::${protected}add_$name"; 1951 no strict "refs"; 1952 1953 # add_ accessor. Don't add if already there, which we 1954 # determine using 'eq' for scalars and '==' otherwise. 1955 *$subname = sub ($self, $value) { 1956 use strict "refs"; 1957 my $addr = do { no overloading; pack 'J', $self; }; 1958 if (ref $value) { 1959 return if grep { $value == $_ } @{$field->{$addr}}; 1960 } 1961 else { 1962 return if grep { $value eq $_ } @{$field->{$addr}}; 1963 } 1964 push @{$field->{$addr}}, $value; 1965 return; 1966 } 1967 } 1968 elsif (substr('constructor', 0, length $access) eq $access) { 1969 if ($protected) { 1970 Carp::my_carp_bug("Can't set-up 'protected' constructors") 1971 } 1972 else { 1973 $constructor_fields{$package}{$name} = $field; 1974 } 1975 } 1976 elsif (substr('readable_array', 0, length $access) eq $access) { 1977 1978 # Here has read access. If one of the other parameters for 1979 # access is array, or this one specifies array (by being more 1980 # than just 'readable_'), then create a subroutine that 1981 # assumes the data is an array. Otherwise just a scalar 1982 my $subname = "${package}::${protected}$name"; 1983 if (grep { /^a/i } @_ 1984 or length($access) > length('readable_')) 1985 { 1986 no strict "refs"; 1987 *$subname = sub ($_addr) { 1988 use strict "refs"; 1989 my $addr = do { no overloading; pack 'J', $_addr; }; 1990 if (ref $field->{$addr} ne 'ARRAY') { 1991 my $type = ref $field->{$addr}; 1992 $type = 'scalar' unless $type; 1993 Carp::my_carp_bug("Trying to read $name as an array when it is a $type. Big problems."); 1994 return; 1995 } 1996 return scalar @{$field->{$addr}} unless wantarray; 1997 1998 # Make a copy; had problems with caller modifying the 1999 # original otherwise 2000 my @return = @{$field->{$addr}}; 2001 return @return; 2002 } 2003 } 2004 else { 2005 2006 # Here not an array value, a simpler function. 2007 no strict "refs"; 2008 *$subname = sub ($addr) { 2009 use strict "refs"; 2010 no overloading; 2011 return $field->{pack 'J', $addr}; 2012 } 2013 } 2014 } 2015 elsif (substr('settable', 0, length $access) eq $access) { 2016 my $subname = "${package}::${protected}set_$name"; 2017 no strict "refs"; 2018 *$subname = sub ($self, $value) { 2019 use strict "refs"; 2020 # $self is $_[0]; $value is $_[1] 2021 no overloading; 2022 $field->{pack 'J', $self} = $value; 2023 return; 2024 } 2025 } 2026 else { 2027 Carp::my_carp_bug("Unknown accessor type $access. No accessor set."); 2028 } 2029 } 2030 return; 2031 } 2032} 2033 2034package Input_file; 2035 2036# All input files use this object, which stores various attributes about them, 2037# and provides for convenient, uniform handling. The run method wraps the 2038# processing. It handles all the bookkeeping of opening, reading, and closing 2039# the file, returning only significant input lines. 2040# 2041# Each object gets a handler which processes the body of the file, and is 2042# called by run(). All character property files must use the generic, 2043# default handler, which has code scrubbed to handle things you might not 2044# expect, including automatic EBCDIC handling. For files that don't deal with 2045# mapping code points to a property value, such as test files, 2046# PropertyAliases, PropValueAliases, and named sequences, you can override the 2047# handler to be a custom one. Such a handler should basically be a 2048# while(next_line()) {...} loop. 2049# 2050# You can also set up handlers to 2051# 0) call during object construction time, after everything else is done 2052# 1) call before the first line is read, for pre processing 2053# 2) call to adjust each line of the input before the main handler gets 2054# them. This can be automatically generated, if appropriately simple 2055# enough, by specifying a Properties parameter in the constructor. 2056# 3) call upon EOF before the main handler exits its loop 2057# 4) call at the end, for post processing 2058# 2059# $_ is used to store the input line, and is to be filtered by the 2060# each_line_handler()s. So, if the format of the line is not in the desired 2061# format for the main handler, these are used to do that adjusting. They can 2062# be stacked (by enclosing them in an [ anonymous array ] in the constructor, 2063# so the $_ output of one is used as the input to the next. The EOF handler 2064# is also stackable, but none of the others are, but could easily be changed 2065# to be so. 2066# 2067# Some properties are used by the Perl core but aren't defined until later 2068# Unicode releases. The perl interpreter would have problems working when 2069# compiled with an earlier Unicode version that doesn't have them, so we need 2070# to define them somehow for those releases. The 'Early' constructor 2071# parameter can be used to automatically handle this. It is essentially 2072# ignored if the Unicode version being compiled has a data file for this 2073# property. Either code to execute or a file to read can be specified. 2074# Details are at the %early definition. 2075# 2076# Most of the handlers can call insert_lines() or insert_adjusted_lines() 2077# which insert the parameters as lines to be processed before the next input 2078# file line is read. This allows the EOF handler(s) to flush buffers, for 2079# example. The difference between the two routines is that the lines inserted 2080# by insert_lines() are subjected to the each_line_handler()s. (So if you 2081# called it from such a handler, you would get infinite recursion without some 2082# mechanism to prevent that.) Lines inserted by insert_adjusted_lines() go 2083# directly to the main handler without any adjustments. If the 2084# post-processing handler calls any of these, there will be no effect. Some 2085# error checking for these conditions could be added, but it hasn't been done. 2086# 2087# carp_bad_line() should be called to warn of bad input lines, which clears $_ 2088# to prevent further processing of the line. This routine will output the 2089# message as a warning once, and then keep a count of the lines that have the 2090# same message, and output that count at the end of the file's processing. 2091# This keeps the number of messages down to a manageable amount. 2092# 2093# get_missings() should be called to retrieve any @missing input lines. 2094# Messages will be raised if this isn't done if the options aren't to ignore 2095# missings. 2096 2097sub trace { return main::trace(@_); } 2098 2099{ # Closure 2100 # Keep track of fields that are to be put into the constructor. 2101 my %constructor_fields; 2102 2103 main::setup_package(Constructor_Fields => \%constructor_fields); 2104 2105 my %file; # Input file name, required 2106 main::set_access('file', \%file, qw{ c r }); 2107 2108 my %first_released; # Unicode version file was first released in, required 2109 main::set_access('first_released', \%first_released, qw{ c r }); 2110 2111 my %handler; # Subroutine to process the input file, defaults to 2112 # 'process_generic_property_file' 2113 main::set_access('handler', \%handler, qw{ c }); 2114 2115 my %property; 2116 # name of property this file is for. defaults to none, meaning not 2117 # applicable, or is otherwise determinable, for example, from each line. 2118 main::set_access('property', \%property, qw{ c r }); 2119 2120 my %optional; 2121 # This is either an unsigned number, or a list of property names. In the 2122 # former case, if it is non-zero, it means the file is optional, so if the 2123 # file is absent, no warning about that is output. In the latter case, it 2124 # is a list of properties that the file (exclusively) defines. If the 2125 # file is present, tables for those properties will be produced; if 2126 # absent, none will, even if they are listed elsewhere (namely 2127 # PropertyAliases.txt and PropValueAliases.txt) as being in this release, 2128 # and no warnings will be raised about them not being available. (And no 2129 # warning about the file itself will be raised.) 2130 main::set_access('optional', \%optional, qw{ c readable_array } ); 2131 2132 my %non_skip; 2133 # This is used for debugging, to skip processing of all but a few input 2134 # files. Add 'non_skip => 1' to the constructor for those files you want 2135 # processed when you set the $debug_skip global. 2136 main::set_access('non_skip', \%non_skip, 'c'); 2137 2138 my %skip; 2139 # This is used to skip processing of this input file (semi-) permanently. 2140 # The value should be the reason the file is being skipped. It is used 2141 # for files that we aren't planning to process anytime soon, but want to 2142 # allow to be in the directory and be checked for their names not 2143 # conflicting with any other files on a DOS 8.3 name filesystem, but to 2144 # not otherwise be processed, and to not raise a warning about not being 2145 # handled. In the constructor call, any value that evaluates to a numeric 2146 # 0 or undef means don't skip. Any other value is a string giving the 2147 # reason it is being skipped, and this will appear in generated pod. 2148 # However, an empty string reason will suppress the pod entry. 2149 # Internally, calls that evaluate to numeric 0 are changed into undef to 2150 # distinguish them from an empty string call. 2151 main::set_access('skip', \%skip, 'c', 'r'); 2152 2153 my %each_line_handler; 2154 # list of subroutines to look at and filter each non-comment line in the 2155 # file. defaults to none. The subroutines are called in order, each is 2156 # to adjust $_ for the next one, and the final one adjusts it for 2157 # 'handler' 2158 main::set_access('each_line_handler', \%each_line_handler, 'c'); 2159 2160 my %retain_trailing_comments; 2161 # This is used to not discard the comments that end data lines. This 2162 # would be used only for files with non-typical syntax, and most code here 2163 # assumes that comments have been stripped, so special handlers would have 2164 # to be written. It is assumed that the code will use these in 2165 # single-quoted contexts, and so any "'" marks in the comment will be 2166 # prefixed by a backslash. 2167 main::set_access('retain_trailing_comments', \%retain_trailing_comments, 'c'); 2168 2169 my %properties; # Optional ordered list of the properties that occur in each 2170 # meaningful line of the input file. If present, an appropriate 2171 # each_line_handler() is automatically generated and pushed onto the stack 2172 # of such handlers. This is useful when a file contains multiple 2173 # properties per line, but no other special considerations are necessary. 2174 # The special value "<ignored>" means to discard the corresponding input 2175 # field. 2176 # Any @missing lines in the file should also match this syntax; no such 2177 # files exist as of 6.3. But if it happens in a future release, the code 2178 # could be expanded to properly parse them. 2179 main::set_access('properties', \%properties, qw{ c r }); 2180 2181 my %has_missings_defaults; 2182 # ? Are there lines in the file giving default values for code points 2183 # missing from it?. Defaults to NO_DEFAULTS. Otherwise NOT_IGNORED is 2184 # the norm, but IGNORED means it has such lines, but the handler doesn't 2185 # use them. Having these three states allows us to catch changes to the 2186 # UCD that this program should track. XXX This could be expanded to 2187 # specify the syntax for such lines, like %properties above. 2188 main::set_access('has_missings_defaults', 2189 \%has_missings_defaults, qw{ c r }); 2190 2191 my %construction_time_handler; 2192 # Subroutine to call at the end of the new method. If undef, no such 2193 # handler is called. 2194 main::set_access('construction_time_handler', 2195 \%construction_time_handler, qw{ c }); 2196 2197 my %pre_handler; 2198 # Subroutine to call before doing anything else in the file. If undef, no 2199 # such handler is called. 2200 main::set_access('pre_handler', \%pre_handler, qw{ c }); 2201 2202 my %eof_handler; 2203 # Subroutines to call upon getting an EOF on the input file, but before 2204 # that is returned to the main handler. This is to allow buffers to be 2205 # flushed. The handler is expected to call insert_lines() or 2206 # insert_adjusted() with the buffered material 2207 main::set_access('eof_handler', \%eof_handler, qw{ c }); 2208 2209 my %post_handler; 2210 # Subroutine to call after all the lines of the file are read in and 2211 # processed. If undef, no such handler is called. Note that this cannot 2212 # add lines to be processed; instead use eof_handler 2213 main::set_access('post_handler', \%post_handler, qw{ c }); 2214 2215 my %progress_message; 2216 # Message to print to display progress in lieu of the standard one 2217 main::set_access('progress_message', \%progress_message, qw{ c }); 2218 2219 my %handle; 2220 # cache open file handle, internal. Is undef if file hasn't been 2221 # processed at all, empty if has; 2222 main::set_access('handle', \%handle); 2223 2224 my %added_lines; 2225 # cache of lines added virtually to the file, internal 2226 main::set_access('added_lines', \%added_lines); 2227 2228 my %remapped_lines; 2229 # cache of lines added virtually to the file, internal 2230 main::set_access('remapped_lines', \%remapped_lines); 2231 2232 my %errors; 2233 # cache of errors found, internal 2234 main::set_access('errors', \%errors); 2235 2236 my %missings; 2237 # storage of '@missing' defaults lines 2238 main::set_access('missings', \%missings); 2239 2240 my %early; 2241 # Used for properties that must be defined (for Perl's purposes) on 2242 # versions of Unicode earlier than Unicode itself defines them. The 2243 # parameter is an array (it would be better to be a hash, but not worth 2244 # bothering about due to its rare use). 2245 # 2246 # The first element is either a code reference to call when in a release 2247 # earlier than the Unicode file is available in, or it is an alternate 2248 # file to use instead of the non-existent one. This file must have been 2249 # plunked down in the same directory as mktables. Should you be compiling 2250 # on a release that needs such a file, mktables will abort the 2251 # compilation, and tell you where to get the necessary file(s), and what 2252 # name(s) to use to store them as. 2253 # In the case of specifying an alternate file, the array must contain two 2254 # further elements: 2255 # 2256 # [1] is the name of the property that will be generated by this file. 2257 # The class automatically takes the input file and excludes any code 2258 # points in it that were not assigned in the Unicode version being 2259 # compiled. It then uses this result to define the property in the given 2260 # version. Since the property doesn't actually exist in the Unicode 2261 # version being compiled, this should be a name accessible only by core 2262 # perl. If it is the same name as the regular property, the constructor 2263 # will mark the output table as a $PLACEHOLDER so that it doesn't actually 2264 # get output, and so will be unusable by non-core code. Otherwise it gets 2265 # marked as $INTERNAL_ONLY. 2266 # 2267 # [2] is a property value to assign (only when compiling Unicode 1.1.5) to 2268 # the Hangul syllables in that release (which were ripped out in version 2269 # 2) for the given property . (Hence it is ignored except when compiling 2270 # version 1. You only get one value that applies to all of them, which 2271 # may not be the actual reality, but probably nobody cares anyway for 2272 # these obsolete characters.) 2273 # 2274 # [3] if present is the default value for the property to assign for code 2275 # points not given in the input. If not present, the default from the 2276 # normal property is used 2277 # 2278 # [-1] If there is an extra final element that is the string 'ONLY_EARLY'. 2279 # it means to not add the name in [1] as an alias to the property name 2280 # used for these. Normally, when compiling Unicode versions that don't 2281 # invoke the early handling, the name is added as a synonym. 2282 # 2283 # Not all files can be handled in the above way, and so the code ref 2284 # alternative is available. It can do whatever it needs to. The other 2285 # array elements are optional in this case, and the code is free to use or 2286 # ignore them if they are present. 2287 # 2288 # Internally, the constructor unshifts a 0 or 1 onto this array to 2289 # indicate if an early alternative is actually being used or not. This 2290 # makes for easier testing later on. 2291 main::set_access('early', \%early, 'c'); 2292 2293 my %only_early; 2294 main::set_access('only_early', \%only_early, 'c'); 2295 2296 my %required_even_in_debug_skip; 2297 # debug_skip is used to speed up compilation during debugging by skipping 2298 # processing files that are not needed for the task at hand. However, 2299 # some files pretty much can never be skipped, and this is used to specify 2300 # that this is one of them. In order to skip this file, the call to the 2301 # constructor must be edited to comment out this parameter. 2302 main::set_access('required_even_in_debug_skip', 2303 \%required_even_in_debug_skip, 'c'); 2304 2305 my %withdrawn; 2306 # Some files get removed from the Unicode DB. This is a version object 2307 # giving the first release without this file. 2308 main::set_access('withdrawn', \%withdrawn, 'c'); 2309 2310 my %ucd; 2311 # Some files are not actually part of the Unicode Character Database. 2312 # These typically have a different way of indicating their version 2313 main::set_access('ucd', \%ucd, 'c'); 2314 2315 my %in_this_release; 2316 # Calculated value from %first_released and %withdrawn. Are we compiling 2317 # a Unicode release which includes this file? 2318 main::set_access('in_this_release', \%in_this_release); 2319 2320 sub _next_line; 2321 sub _next_line_with_remapped_range; 2322 2323 sub new { 2324 my $class = shift; 2325 2326 my $self = bless \do{ my $anonymous_scalar }, $class; 2327 my $addr = do { no overloading; pack 'J', $self; }; 2328 2329 # Set defaults 2330 $handler{$addr} = \&main::process_generic_property_file; 2331 $retain_trailing_comments{$addr} = 0; 2332 $non_skip{$addr} = 0; 2333 $skip{$addr} = undef; 2334 $has_missings_defaults{$addr} = $NO_DEFAULTS; 2335 $handle{$addr} = undef; 2336 $added_lines{$addr} = [ ]; 2337 $remapped_lines{$addr} = [ ]; 2338 $each_line_handler{$addr} = [ ]; 2339 $eof_handler{$addr} = [ ]; 2340 $errors{$addr} = { }; 2341 $missings{$addr} = [ ]; 2342 $early{$addr} = [ ]; 2343 $optional{$addr} = [ ]; 2344 $ucd{$addr} = 1; 2345 2346 # Two positional parameters. 2347 return Carp::carp_too_few_args(\@_, 2) if main::DEBUG && @_ < 2; 2348 $file{$addr} = main::internal_file_to_platform(shift); 2349 $first_released{$addr} = shift; 2350 2351 # The rest of the arguments are key => value pairs 2352 # %constructor_fields has been set up earlier to list all possible 2353 # ones. Either set or push, depending on how the default has been set 2354 # up just above. 2355 my %args = @_; 2356 foreach my $key (keys %args) { 2357 my $argument = $args{$key}; 2358 2359 # Note that the fields are the lower case of the constructor keys 2360 my $hash = $constructor_fields{lc $key}; 2361 if (! defined $hash) { 2362 Carp::my_carp_bug("Unrecognized parameters '$key => $argument' to new() for $self. Skipped"); 2363 next; 2364 } 2365 if (ref $hash->{$addr} eq 'ARRAY') { 2366 if (ref $argument eq 'ARRAY') { 2367 foreach my $argument (@{$argument}) { 2368 next if ! defined $argument; 2369 push @{$hash->{$addr}}, $argument; 2370 } 2371 } 2372 else { 2373 push @{$hash->{$addr}}, $argument if defined $argument; 2374 } 2375 } 2376 else { 2377 $hash->{$addr} = $argument; 2378 } 2379 delete $args{$key}; 2380 }; 2381 2382 $non_skip{$addr} = 1 if $required_even_in_debug_skip{$addr}; 2383 2384 # Convert 0 (meaning don't skip) to undef 2385 undef $skip{$addr} unless $skip{$addr}; 2386 2387 # Handle the case where this file is optional 2388 my $pod_message_for_non_existent_optional = ""; 2389 if ($optional{$addr}->@*) { 2390 2391 # First element is the pod message 2392 $pod_message_for_non_existent_optional 2393 = shift $optional{$addr}->@*; 2394 # Convert a 0 'Optional' argument to an empty list to make later 2395 # code more concise. 2396 if ( $optional{$addr}->@* 2397 && $optional{$addr}->@* == 1 2398 && $optional{$addr}[0] ne "" 2399 && $optional{$addr}[0] !~ /\D/ 2400 && $optional{$addr}[0] == 0) 2401 { 2402 $optional{$addr} = [ ]; 2403 } 2404 else { # But if the only element doesn't evaluate to 0, make sure 2405 # that this file is indeed considered optional below. 2406 unshift $optional{$addr}->@*, 1; 2407 } 2408 } 2409 2410 my $progress; 2411 my $function_instead_of_file = 0; 2412 2413 if ($early{$addr}->@* && $early{$addr}[-1] eq 'ONLY_EARLY') { 2414 $only_early{$addr} = 1; 2415 pop $early{$addr}->@*; 2416 } 2417 2418 # If we are compiling a Unicode release earlier than the file became 2419 # available, the constructor may have supplied a substitute 2420 if ($first_released{$addr} gt $v_version && $early{$addr}->@*) { 2421 2422 # Yes, we have a substitute, that we will use; mark it so 2423 unshift $early{$addr}->@*, 1; 2424 2425 # See the definition of %early for what the array elements mean. 2426 # Note that we have just unshifted onto the array, so the numbers 2427 # below are +1 of those in the %early description. 2428 # If we have a property this defines, create a table and default 2429 # map for it now (at essentially compile time), so that it will be 2430 # available for the whole of run time. (We will want to add this 2431 # name as an alias when we are using the official property name; 2432 # but this must be deferred until run(), because at construction 2433 # time the official names have yet to be defined.) 2434 if ($early{$addr}[2]) { 2435 my $fate = ($property{$addr} 2436 && $property{$addr} eq $early{$addr}[2]) 2437 ? $PLACEHOLDER 2438 : $INTERNAL_ONLY; 2439 my $prop_object = Property->new($early{$addr}[2], 2440 Fate => $fate, 2441 Perl_Extension => 1, 2442 ); 2443 2444 # If not specified by the constructor, use the default mapping 2445 # for the regular property for this substitute one. 2446 if ($early{$addr}[4]) { 2447 $prop_object->set_default_map($early{$addr}[4]); 2448 } 2449 elsif ( defined $property{$addr} 2450 && defined $default_mapping{$property{$addr}}) 2451 { 2452 $prop_object 2453 ->set_default_map($default_mapping{$property{$addr}}); 2454 } 2455 } 2456 2457 if (ref $early{$addr}[1] eq 'CODE') { 2458 $function_instead_of_file = 1; 2459 2460 # If the first element of the array is a code ref, the others 2461 # are optional. 2462 $handler{$addr} = $early{$addr}[1]; 2463 $property{$addr} = $early{$addr}[2] 2464 if defined $early{$addr}[2]; 2465 $progress = "substitute $file{$addr}"; 2466 2467 undef $file{$addr}; 2468 } 2469 else { # Specifying a substitute file 2470 2471 if (! main::file_exists($early{$addr}[1])) { 2472 2473 # If we don't see the substitute file, generate an error 2474 # message giving the needed things, and add it to the list 2475 # of such to output before actual processing happens 2476 # (hence the user finds out all of them in one run). 2477 # Instead of creating a general method for NameAliases, 2478 # hard-code it here, as there is unlikely to ever be a 2479 # second one which needs special handling. 2480 my $string_version = ($file{$addr} eq "NameAliases.txt") 2481 ? 'at least 6.1 (the later, the better)' 2482 : sprintf "%vd", $first_released{$addr}; 2483 push @missing_early_files, <<END; 2484'$file{$addr}' version $string_version should be copied to '$early{$addr}[1]'. 2485END 2486 ; 2487 return; 2488 } 2489 $progress = $early{$addr}[1]; 2490 $progress .= ", substituting for $file{$addr}" if $file{$addr}; 2491 $file{$addr} = $early{$addr}[1]; 2492 $property{$addr} = $early{$addr}[2]; 2493 2494 # Ignore code points not in the version being compiled 2495 push $each_line_handler{$addr}->@*, \&_exclude_unassigned; 2496 2497 if ( $v_version lt v2.0 # Hanguls in this release ... 2498 && defined $early{$addr}[3]) # ... need special treatment 2499 { 2500 push $eof_handler{$addr}->@*, \&_fixup_obsolete_hanguls; 2501 } 2502 } 2503 2504 # And this substitute is valid for all releases. 2505 $first_released{$addr} = v0; 2506 } 2507 else { # Normal behavior 2508 $progress = $file{$addr}; 2509 unshift $early{$addr}->@*, 0; # No substitute 2510 } 2511 2512 my $file = $file{$addr}; 2513 $progress_message{$addr} = "Processing $progress" 2514 unless $progress_message{$addr}; 2515 2516 # A file should be there if it is within the window of versions for 2517 # which Unicode supplies it 2518 if ($withdrawn{$addr} && $withdrawn{$addr} le $v_version) { 2519 $in_this_release{$addr} = 0; 2520 $skip{$addr} = ""; 2521 } 2522 else { 2523 $in_this_release{$addr} = $first_released{$addr} le $v_version; 2524 2525 # Check that the file for this object (possibly using a substitute 2526 # for early releases) exists or we have a function alternative 2527 if ( ! $function_instead_of_file 2528 && ! main::file_exists($file)) 2529 { 2530 # Here there is nothing available for this release. This is 2531 # fine if we aren't expecting anything in this release. 2532 if (! $in_this_release{$addr}) { 2533 $skip{$addr} = ""; # Don't remark since we expected 2534 # nothing and got nothing 2535 } 2536 elsif ($optional{$addr}->@*) { 2537 2538 # Here the file is optional in this release; Use the 2539 # passed in text to document this case in the pod. 2540 $skip{$addr} = $pod_message_for_non_existent_optional; 2541 } 2542 elsif ( $in_this_release{$addr} 2543 && ! defined $skip{$addr} 2544 && defined $file) 2545 { # Doesn't exist but should. 2546 $skip{$addr} = "'$file' not found. Possibly Big problems"; 2547 Carp::my_carp($skip{$addr}); 2548 } 2549 } 2550 elsif ($debug_skip && ! defined $skip{$addr} && ! $non_skip{$addr}) 2551 { 2552 2553 # The file exists; if not skipped for another reason, and we are 2554 # skipping most everything during debugging builds, use that as 2555 # the skip reason. 2556 $skip{$addr} = '$debug_skip is on' 2557 } 2558 } 2559 2560 if ( ! $debug_skip 2561 && $non_skip{$addr} 2562 && ! $required_even_in_debug_skip{$addr} 2563 && $verbosity) 2564 { 2565 print "Warning: " . __PACKAGE__ . " constructor for $file has useless 'non_skip' in it\n"; 2566 } 2567 2568 # Here, we have figured out if we will be skipping this file or not. 2569 # If so, we add any single property it defines to any passed in 2570 # optional property list. These will be dealt with at run time. 2571 if (defined $skip{$addr}) { 2572 if ($property{$addr}) { 2573 push $optional{$addr}->@*, $property{$addr}; 2574 } 2575 } # Otherwise, are going to process the file. 2576 elsif ($property{$addr}) { 2577 2578 # If the file has a property defined in the constructor for it, it 2579 # means that the property is not listed in the file's entries. So 2580 # add a handler (to the list of line handlers) to insert the 2581 # property name into the lines, to provide a uniform interface to 2582 # the final processing subroutine. 2583 push @{$each_line_handler{$addr}}, \&_insert_property_into_line; 2584 } 2585 elsif ($properties{$addr}) { 2586 2587 # Similarly, there may be more than one property represented on 2588 # each line, with no clue but the constructor input what those 2589 # might be. Add a handler for each line in the input so that it 2590 # creates a separate input line for each property in those input 2591 # lines, thus making them suitable to handle generically. 2592 2593 push @{$each_line_handler{$addr}}, 2594 sub { 2595 my $file = shift; 2596 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 2597 my @fields = split /\s*;\s*/, $_, -1; 2598 2599 if (@fields - 1 > @{$properties{$addr}}) { 2600 $file->carp_bad_line('Extra fields'); 2601 $_ = ""; 2602 return; 2603 } 2604 my $range = shift @fields; # 0th element is always the 2605 # range 2606 2607 # The next fields in the input line correspond 2608 # respectively to the stored properties. 2609 for my $i (0 .. @{$properties{$addr}} - 1) { 2610 my $property_name = $properties{$addr}[$i]; 2611 next if $property_name eq '<ignored>'; 2612 $file->insert_adjusted_lines( 2613 "$range; $property_name; $fields[$i]"); 2614 } 2615 $_ = ""; 2616 2617 return; 2618 }; 2619 } 2620 2621 { # On non-ascii platforms, we use a special pre-handler 2622 no strict; 2623 no warnings 'once'; 2624 *next_line = (main::NON_ASCII_PLATFORM) 2625 ? *_next_line_with_remapped_range 2626 : *_next_line; 2627 } 2628 2629 &{$construction_time_handler{$addr}}($self) 2630 if $construction_time_handler{$addr}; 2631 2632 return $self; 2633 } 2634 2635 2636 use overload 2637 fallback => 0, 2638 qw("") => "_operator_stringify", 2639 "." => \&main::_operator_dot, 2640 ".=" => \&main::_operator_dot_equal, 2641 ; 2642 2643 sub _operator_stringify($self, $other="", $reversed=0) { 2644 return __PACKAGE__ . " object for " . $self->file; 2645 } 2646 2647 sub run($self) { 2648 # Process the input object $self. This opens and closes the file and 2649 # calls all the handlers for it. Currently, this can only be called 2650 # once per file, as it destroy's the EOF handlers 2651 2652 # flag to make sure extracted files are processed early 2653 state $seen_non_extracted = 0; 2654 2655 my $addr = do { no overloading; pack 'J', $self; }; 2656 2657 my $file = $file{$addr}; 2658 2659 if (! $file) { 2660 $handle{$addr} = 'pretend_is_open'; 2661 } 2662 else { 2663 if ($seen_non_extracted) { 2664 if ($file =~ /$EXTRACTED/i) # Some platforms may change the 2665 # case of the file's name 2666 { 2667 Carp::my_carp_bug(main::join_lines(<<END 2668$file should be processed just after the 'Prop...Alias' files, and before 2669anything not in the $EXTRACTED_DIR directory. Proceeding, but the results may 2670have subtle problems 2671END 2672 )); 2673 } 2674 } 2675 elsif ($EXTRACTED_DIR 2676 2677 # We only do this check for generic property files 2678 && $handler{$addr} == \&main::process_generic_property_file 2679 2680 && $file !~ /$EXTRACTED/i) 2681 { 2682 # We don't set this (by the 'if' above) if we have no 2683 # extracted directory, so if running on an early version, 2684 # this test won't work. Not worth worrying about. 2685 $seen_non_extracted = 1; 2686 } 2687 2688 # Mark the file as having being processed, and warn if it 2689 # isn't a file we are expecting. As we process the files, 2690 # they are deleted from the hash, so any that remain at the 2691 # end of the program are files that we didn't process. 2692 my $fkey = File::Spec->rel2abs($file); 2693 my $exists = delete $potential_files{lc($fkey)}; 2694 2695 Carp::my_carp("Was not expecting '$file'.") 2696 if $exists && ! $in_this_release{$addr}; 2697 2698 # If there is special handling for compiling Unicode releases 2699 # earlier than the first one in which Unicode defines this 2700 # property ... 2701 if ($early{$addr}->@* > 1) { 2702 2703 # Mark as processed any substitute file that would be used in 2704 # such a release 2705 $fkey = File::Spec->rel2abs($early{$addr}[1]); 2706 delete $potential_files{lc($fkey)}; 2707 2708 # As commented in the constructor code, when using the 2709 # official property, we still have to allow the publicly 2710 # inaccessible early name so that the core code which uses it 2711 # will work regardless. 2712 if ( ! $only_early{$addr} 2713 && ! $early{$addr}[0] 2714 && $early{$addr}->@* > 2) 2715 { 2716 my $early_property_name = $early{$addr}[2]; 2717 if ($property{$addr} ne $early_property_name) { 2718 main::property_ref($property{$addr}) 2719 ->add_alias($early_property_name); 2720 } 2721 } 2722 } 2723 2724 # We may be skipping this file ... 2725 if (defined $skip{$addr}) { 2726 2727 # If the file isn't supposed to be in this release, there is 2728 # nothing to do 2729 if ($in_this_release{$addr}) { 2730 2731 # But otherwise, we may print a message 2732 if ($debug_skip) { 2733 print STDERR "Skipping input file '$file'", 2734 " because '$skip{$addr}'\n"; 2735 } 2736 2737 # And add it to the list of skipped files, which is later 2738 # used to make the pod 2739 $skipped_files{$file} = $skip{$addr}; 2740 2741 # The 'optional' list contains properties that are also to 2742 # be skipped along with the file. (There may also be 2743 # digits which are just placeholders to make sure it isn't 2744 # an empty list 2745 foreach my $property ($optional{$addr}->@*) { 2746 next unless $property =~ /\D/; 2747 my $prop_object = main::property_ref($property); 2748 next unless defined $prop_object; 2749 $prop_object->set_fate($SUPPRESSED, $skip{$addr}); 2750 } 2751 } 2752 2753 return; 2754 } 2755 2756 # Here, we are going to process the file. Open it, converting the 2757 # slashes used in this program into the proper form for the OS 2758 my $file_handle; 2759 if (not open $file_handle, "<", $file) { 2760 Carp::my_carp("Can't open $file. Skipping: $!"); 2761 return; 2762 } 2763 $handle{$addr} = $file_handle; # Cache the open file handle 2764 2765 # If possible, make sure that the file is the correct version. 2766 # (This data isn't available on early Unicode releases or in 2767 # UnicodeData.txt.) We don't do this check if we are using a 2768 # substitute file instead of the official one (though the code 2769 # could be extended to do so). 2770 if ($in_this_release{$addr} 2771 && ! $early{$addr}[0] 2772 && lc($file) ne 'unicodedata.txt') 2773 { 2774 my $this_version; 2775 2776 if ($file !~ /^Unihan/i) { 2777 2778 # The non-Unihan files started getting version numbers in 2779 # 3.2, but some files in 4.0 are unchanged from 3.2, and 2780 # marked as 3.2. 4.0.1 is the first version where there 2781 # are no files marked as being from less than 4.0, though 2782 # some are marked as 4.0. In versions after that, the 2783 # numbers are correct. 2784 if ($v_version ge v4.0.1) { 2785 $_ = <$file_handle>; # The version number is in the 2786 # very first line if it is a 2787 # UCD file; otherwise, it 2788 # might be 2789 goto valid_version if $_ =~ / - $string_version \. /x; 2790 chomp; 2791 if ($ucd{$addr}) { 2792 $_ =~ s/^#\s*//; 2793 2794 # 4.0.1 had some valid files that weren't updated. 2795 goto valid_version 2796 if $v_version eq v4.0.1 && $_ =~ /4\.0\.0/; 2797 $this_version = $_; 2798 goto wrong_version; 2799 } 2800 else { 2801 my $BOM = "\x{FEFF}"; 2802 utf8::encode($BOM); 2803 my $BOM_re = qr/ ^ (?:$BOM)? /x; 2804 2805 while ($_ =~ s/$BOM_re//) { # BOM; seems to be on 2806 # many lines in some files!! 2807 $_ = <$file_handle>; 2808 chomp; 2809 if ($_ =~ /^# Version: (.*)/) { 2810 $this_version = $1; 2811 goto valid_version 2812 if $this_version eq $string_version; 2813 goto valid_version 2814 if "$this_version.0" eq $string_version; 2815 goto wrong_version; 2816 } 2817 } 2818 goto no_version; 2819 } 2820 } 2821 } 2822 elsif ($v_version ge v6.0.0) { # Unihan 2823 2824 # Unihan files didn't get accurate version numbers until 2825 # 6.0. The version is somewhere in the first comment 2826 # block 2827 while (<$file_handle>) { 2828 goto no_version if $_ !~ /^#/; 2829 chomp; 2830 $_ =~ s/^#\s*//; 2831 next if $_ !~ / version: /x; 2832 goto valid_version if $_ =~ /$string_version/; 2833 goto wrong_version; 2834 } 2835 goto no_version; 2836 } 2837 else { # Old Unihan; have to assume is valid 2838 goto valid_version; 2839 } 2840 2841 wrong_version: 2842 die Carp::my_carp("File '$file' is version " 2843 . "'$this_version'. It should be " 2844 . "version $string_version"); 2845 no_version: 2846 Carp::my_carp_bug("Could not find the expected " 2847 . "version info in file '$file'"); 2848 } 2849 } 2850 2851 valid_version: 2852 print "$progress_message{$addr}\n" if $verbosity >= $PROGRESS; 2853 2854 # Call any special handler for before the file. 2855 &{$pre_handler{$addr}}($self) if $pre_handler{$addr}; 2856 2857 # Then the main handler 2858 &{$handler{$addr}}($self); 2859 2860 # Then any special post-file handler. 2861 &{$post_handler{$addr}}($self) if $post_handler{$addr}; 2862 2863 # If any errors have been accumulated, output the counts (as the first 2864 # error message in each class was output when it was encountered). 2865 if ($errors{$addr}) { 2866 my $total = 0; 2867 my $types = 0; 2868 foreach my $error (keys %{$errors{$addr}}) { 2869 $total += $errors{$addr}->{$error}; 2870 delete $errors{$addr}->{$error}; 2871 $types++; 2872 } 2873 if ($total > 1) { 2874 my $message 2875 = "A total of $total lines had errors in $file. "; 2876 2877 $message .= ($types == 1) 2878 ? '(Only the first one was displayed.)' 2879 : '(Only the first of each type was displayed.)'; 2880 Carp::my_carp($message); 2881 } 2882 } 2883 2884 if (@{$missings{$addr}}) { 2885 Carp::my_carp_bug("Handler for $file didn't look at all the \@missing lines. Generated tables likely are wrong"); 2886 } 2887 2888 # If a real file handle, close it. 2889 close $handle{$addr} or Carp::my_carp("Can't close $file: $!") if 2890 ref $handle{$addr}; 2891 $handle{$addr} = ""; # Uses empty to indicate that has already seen 2892 # the file, as opposed to undef 2893 return; 2894 } 2895 2896 sub _next_line($self) { 2897 # Sets $_ to be the next logical input line, if any. Returns non-zero 2898 # if such a line exists. 'logical' means that any lines that have 2899 # been added via insert_lines() will be returned in $_ before the file 2900 # is read again. 2901 2902 my $addr = do { no overloading; pack 'J', $self; }; 2903 2904 # Here the file is open (or if the handle is not a ref, is an open 2905 # 'virtual' file). Get the next line; any inserted lines get priority 2906 # over the file itself. 2907 my $adjusted; 2908 2909 LINE: 2910 while (1) { # Loop until find non-comment, non-empty line 2911 #local $to_trace = 1 if main::DEBUG; 2912 my $inserted_ref = shift @{$added_lines{$addr}}; 2913 if (defined $inserted_ref) { 2914 ($adjusted, $_) = @{$inserted_ref}; 2915 trace $adjusted, $_ if main::DEBUG && $to_trace; 2916 return 1 if $adjusted; 2917 } 2918 else { 2919 last if ! ref $handle{$addr}; # Don't read unless is real file 2920 last if ! defined ($_ = readline $handle{$addr}); 2921 } 2922 chomp; 2923 trace $_ if main::DEBUG && $to_trace; 2924 2925 # See if this line is the comment line that defines what property 2926 # value that code points that are not listed in the file should 2927 # have. The format or existence of these lines is not guaranteed 2928 # by Unicode since they are comments, but the documentation says 2929 # that this was added for machine-readability, so probably won't 2930 # change. This works starting in Unicode Version 5.0. They look 2931 # like: 2932 # 2933 # @missing: 0000..10FFFF; Not_Reordered 2934 # @missing: 0000..10FFFF; Decomposition_Mapping; <code point> 2935 # @missing: 0000..10FFFF; ; NaN 2936 # 2937 # Save the line for a later get_missings() call. 2938 if (/$missing_defaults_prefix/) { 2939 if ($has_missings_defaults{$addr} == $NO_DEFAULTS) { 2940 $self->carp_bad_line("Unexpected \@missing line. Assuming no missing entries"); 2941 } 2942 elsif ($has_missings_defaults{$addr} == $NOT_IGNORED) { 2943 my @defaults = split /\s* ; \s*/x, $_; 2944 2945 # The first field is the @missing, which ends in a 2946 # semi-colon, so can safely shift. 2947 shift @defaults; 2948 2949 # Some of these lines may have empty field placeholders 2950 # which get in the way. An example is: 2951 # @missing: 0000..10FFFF; ; NaN 2952 # Remove them. Process starting from the top so the 2953 # splice doesn't affect things still to be looked at. 2954 for (my $i = @defaults - 1; $i >= 0; $i--) { 2955 next if $defaults[$i] ne ""; 2956 splice @defaults, $i, 1; 2957 } 2958 2959 # What's left should be just the property (maybe) and the 2960 # default. Having only one element means it doesn't have 2961 # the property. 2962 my $default; 2963 my $property; 2964 if (@defaults >= 1) { 2965 if (@defaults == 1) { 2966 $default = $defaults[0]; 2967 } 2968 else { 2969 $property = $defaults[0]; 2970 $default = $defaults[1]; 2971 } 2972 } 2973 2974 if (@defaults < 1 2975 || @defaults > 2 2976 || ($default =~ /^</ 2977 && $default !~ /^<code *point>$/i 2978 && $default !~ /^<none>$/i 2979 && $default !~ /^<script>$/i)) 2980 { 2981 $self->carp_bad_line("Unrecognized \@missing line: $_. Assuming no missing entries"); 2982 } 2983 else { 2984 2985 # If the property is missing from the line, it should 2986 # be the one for the whole file 2987 $property = $property{$addr} if ! defined $property; 2988 2989 # Change <none> to the null string, which is what it 2990 # really means. If the default is the code point 2991 # itself, set it to <code point>, which is what 2992 # Unicode uses (but sometimes they've forgotten the 2993 # space) 2994 if ($default =~ /^<none>$/i) { 2995 $default = ""; 2996 } 2997 elsif ($default =~ /^<code *point>$/i) { 2998 $default = $CODE_POINT; 2999 } 3000 elsif ($default =~ /^<script>$/i) { 3001 3002 # Special case this one. Currently is from 3003 # ScriptExtensions.txt, and means for all unlisted 3004 # code points, use their Script property values. 3005 # For the code points not listed in that file, the 3006 # default value is 'Unknown'. 3007 $default = "Unknown"; 3008 } 3009 3010 # Store them as a sub-arrays with both components. 3011 push @{$missings{$addr}}, [ $default, $property ]; 3012 } 3013 } 3014 3015 # There is nothing for the caller to process on this comment 3016 # line. 3017 next; 3018 } 3019 3020 # Unless to keep, remove comments. If to keep, ignore 3021 # comment-only lines 3022 if ($retain_trailing_comments{$addr}) { 3023 next if / ^ \s* \# /x; 3024 3025 # But escape any single quotes (done in both the comment and 3026 # non-comment portion; this could be a bug someday, but not 3027 # likely) 3028 s/'/\\'/g; 3029 } 3030 else { 3031 s/#.*//; 3032 } 3033 3034 # Remove trailing space, and skip this line if the result is empty 3035 s/\s+$//; 3036 next if /^$/; 3037 3038 # Call any handlers for this line, and skip further processing of 3039 # the line if the handler sets the line to null. 3040 foreach my $sub_ref (@{$each_line_handler{$addr}}) { 3041 &{$sub_ref}($self); 3042 next LINE if /^$/; 3043 } 3044 3045 # Here the line is ok. return success. 3046 return 1; 3047 } # End of looping through lines. 3048 3049 # If there are EOF handlers, call each (only once) and if it generates 3050 # more lines to process go back in the loop to handle them. 3051 while ($eof_handler{$addr}->@*) { 3052 &{$eof_handler{$addr}[0]}($self); 3053 shift $eof_handler{$addr}->@*; # Currently only get one shot at it. 3054 goto LINE if $added_lines{$addr}; 3055 } 3056 3057 # Return failure -- no more lines. 3058 return 0; 3059 3060 } 3061 3062 sub _next_line_with_remapped_range($self) { 3063 # like _next_line(), but for use on non-ASCII platforms. It sets $_ 3064 # to be the next logical input line, if any. Returns non-zero if such 3065 # a line exists. 'logical' means that any lines that have been added 3066 # via insert_lines() will be returned in $_ before the file is read 3067 # again. 3068 # 3069 # The difference from _next_line() is that this remaps the Unicode 3070 # code points in the input to those of the native platform. Each 3071 # input line contains a single code point, or a single contiguous 3072 # range of them This routine splits each range into its individual 3073 # code points and caches them. It returns the cached values, 3074 # translated into their native equivalents, one at a time, for each 3075 # call, before reading the next line. Since native values can only be 3076 # a single byte wide, no translation is needed for code points above 3077 # 0xFF, and ranges that are entirely above that number are not split. 3078 # If an input line contains the range 254-1000, it would be split into 3079 # three elements: 254, 255, and 256-1000. (The downstream table 3080 # insertion code will sort and coalesce the individual code points 3081 # into appropriate ranges.) 3082 3083 my $addr = do { no overloading; pack 'J', $self; }; 3084 3085 while (1) { 3086 3087 # Look in cache before reading the next line. Return any cached 3088 # value, translated 3089 my $inserted = shift @{$remapped_lines{$addr}}; 3090 if (defined $inserted) { 3091 trace $inserted if main::DEBUG && $to_trace; 3092 $_ = $inserted =~ s/^ ( \d+ ) /sprintf("%04X", utf8::unicode_to_native($1))/xer; 3093 trace $_ if main::DEBUG && $to_trace; 3094 return 1; 3095 } 3096 3097 # Get the next line. 3098 return 0 unless _next_line($self); 3099 3100 # If there is a special handler for it, return the line, 3101 # untranslated. This should happen only for files that are 3102 # special, not being code-point related, such as property names. 3103 return 1 if $handler{$addr} 3104 != \&main::process_generic_property_file; 3105 3106 my ($range, $property_name, $map, @remainder) 3107 = split /\s*;\s*/, $_, -1; # -1 => retain trailing null fields 3108 3109 if (@remainder 3110 || ! defined $property_name 3111 || $range !~ /^ ($code_point_re) (?:\.\. ($code_point_re) )? $/x) 3112 { 3113 Carp::my_carp_bug("Unrecognized input line '$_'. Ignored"); 3114 } 3115 3116 my $low = hex $1; 3117 my $high = (defined $2) ? hex $2 : $low; 3118 3119 # If the input maps the range to another code point, remap the 3120 # target if it is between 0 and 255. 3121 my $tail; 3122 if (defined $map) { 3123 $map =~ s/\b 00 ( [0-9A-F]{2} ) \b/sprintf("%04X", utf8::unicode_to_native(hex $1))/gxe; 3124 $tail = "$property_name; $map"; 3125 $_ = "$range; $tail"; 3126 } 3127 else { 3128 $tail = $property_name; 3129 } 3130 3131 # If entire range is above 255, just return it, unchanged (except 3132 # any mapped-to code point, already changed above) 3133 return 1 if $low > 255; 3134 3135 # Cache an entry for every code point < 255. For those in the 3136 # range above 255, return a dummy entry for just that portion of 3137 # the range. Note that this will be out-of-order, but that is not 3138 # a problem. 3139 foreach my $code_point ($low .. $high) { 3140 if ($code_point > 255) { 3141 $_ = sprintf "%04X..%04X; $tail", $code_point, $high; 3142 return 1; 3143 } 3144 push @{$remapped_lines{$addr}}, "$code_point; $tail"; 3145 } 3146 } # End of looping through lines. 3147 3148 # NOTREACHED 3149 } 3150 3151# Not currently used, not fully tested. 3152# sub peek { 3153# # Non-destructive lookahead one non-adjusted, non-comment, non-blank 3154# # record. Not callable from an each_line_handler(), nor does it call 3155# # an each_line_handler() on the line. 3156# 3157# my $self = shift; 3158# my $addr = do { no overloading; pack 'J', $self; }; 3159# 3160# foreach my $inserted_ref (@{$added_lines{$addr}}) { 3161# my ($adjusted, $line) = @{$inserted_ref}; 3162# next if $adjusted; 3163# 3164# # Remove comments and trailing space, and return a non-empty 3165# # resulting line 3166# $line =~ s/#.*//; 3167# $line =~ s/\s+$//; 3168# return $line if $line ne ""; 3169# } 3170# 3171# return if ! ref $handle{$addr}; # Don't read unless is real file 3172# while (1) { # Loop until find non-comment, non-empty line 3173# local $to_trace = 1 if main::DEBUG; 3174# trace $_ if main::DEBUG && $to_trace; 3175# return if ! defined (my $line = readline $handle{$addr}); 3176# chomp $line; 3177# push @{$added_lines{$addr}}, [ 0, $line ]; 3178# 3179# $line =~ s/#.*//; 3180# $line =~ s/\s+$//; 3181# return $line if $line ne ""; 3182# } 3183# 3184# return; 3185# } 3186 3187 3188 sub insert_lines($self, @lines) { 3189 # Lines can be inserted so that it looks like they were in the input 3190 # file at the place it was when this routine is called. See also 3191 # insert_adjusted_lines(). Lines inserted via this routine go through 3192 # any each_line_handler() 3193 3194 # Each inserted line is an array, with the first element being 0 to 3195 # indicate that this line hasn't been adjusted, and needs to be 3196 # processed. 3197 no overloading; 3198 push @{$added_lines{pack 'J', $self}}, map { [ 0, $_ ] } @lines; 3199 return; 3200 } 3201 3202 sub insert_adjusted_lines($self, @lines) { 3203 # Lines can be inserted so that it looks like they were in the input 3204 # file at the place it was when this routine is called. See also 3205 # insert_lines(). Lines inserted via this routine are already fully 3206 # adjusted, ready to be processed; each_line_handler()s handlers will 3207 # not be called. This means this is not a completely general 3208 # facility, as only the last each_line_handler on the stack should 3209 # call this. It could be made more general, by passing to each of the 3210 # line_handlers their position on the stack, which they would pass on 3211 # to this routine, and that would replace the boolean first element in 3212 # the anonymous array pushed here, so that the next_line routine could 3213 # use that to call only those handlers whose index is after it on the 3214 # stack. But this is overkill for what is needed now. 3215 3216 trace $_[0] if main::DEBUG && $to_trace; 3217 3218 # Each inserted line is an array, with the first element being 1 to 3219 # indicate that this line has been adjusted 3220 no overloading; 3221 push @{$added_lines{pack 'J', $self}}, map { [ 1, $_ ] } @lines; 3222 return; 3223 } 3224 3225 sub get_missings($self) { 3226 # Returns the stored up @missings lines' values, and clears the list. 3227 # The values are in an array, consisting of the default in the first 3228 # element, and the property in the 2nd. However, since these lines 3229 # can be stacked up, the return is an array of all these arrays. 3230 3231 my $addr = do { no overloading; pack 'J', $self; }; 3232 3233 # If not accepting a list return, just return the first one. 3234 return shift @{$missings{$addr}} unless wantarray; 3235 3236 my @return = @{$missings{$addr}}; 3237 undef @{$missings{$addr}}; 3238 return @return; 3239 } 3240 3241 sub _exclude_unassigned($self) { 3242 3243 # Takes the range in $_ and excludes code points that aren't assigned 3244 # in this release 3245 3246 state $skip_inserted_count = 0; 3247 3248 # Ignore recursive calls. 3249 if ($skip_inserted_count) { 3250 $skip_inserted_count--; 3251 return; 3252 } 3253 3254 # Find what code points are assigned in this release 3255 main::calculate_Assigned() if ! defined $Assigned; 3256 3257 my $addr = do { no overloading; pack 'J', $self; }; 3258 3259 my ($range, @remainder) 3260 = split /\s*;\s*/, $_, -1; # -1 => retain trailing null fields 3261 3262 # Examine the range. 3263 if ($range =~ /^ ($code_point_re) (?:\.\. ($code_point_re) )? $/x) 3264 { 3265 my $low = hex $1; 3266 my $high = (defined $2) ? hex $2 : $low; 3267 3268 # Split the range into subranges of just those code points in it 3269 # that are assigned. 3270 my @ranges = (Range_List->new(Initialize 3271 => Range->new($low, $high)) & $Assigned)->ranges; 3272 3273 # Do nothing if nothing in the original range is assigned in this 3274 # release; handle normally if everything is in this release. 3275 if (! @ranges) { 3276 $_ = ""; 3277 } 3278 elsif (@ranges != 1) { 3279 3280 # Here, some code points in the original range aren't in this 3281 # release; @ranges gives the ones that are. Create fake input 3282 # lines for each of the ranges, and set things up so that when 3283 # this routine is called on that fake input, it will do 3284 # nothing. 3285 $skip_inserted_count = @ranges; 3286 my $remainder = join ";", @remainder; 3287 for my $range (@ranges) { 3288 $self->insert_lines(sprintf("%04X..%04X;%s", 3289 $range->start, $range->end, $remainder)); 3290 } 3291 $_ = ""; # The original range is now defunct. 3292 } 3293 } 3294 3295 return; 3296 } 3297 3298 sub _fixup_obsolete_hanguls($self) { 3299 3300 # This is called only when compiling Unicode version 1. All Unicode 3301 # data for subsequent releases assumes that the code points that were 3302 # Hangul syllables in this release only are something else, so if 3303 # using such data, we have to override it 3304 3305 my $addr = do { no overloading; pack 'J', $self; }; 3306 3307 my $object = main::property_ref($property{$addr}); 3308 $object->add_map($FIRST_REMOVED_HANGUL_SYLLABLE, 3309 $FINAL_REMOVED_HANGUL_SYLLABLE, 3310 $early{$addr}[3], # Passed-in value for these 3311 Replace => $UNCONDITIONALLY); 3312 } 3313 3314 sub _insert_property_into_line($self) { 3315 # Add a property field to $_, if this file requires it. 3316 3317 my $addr = do { no overloading; pack 'J', $self; }; 3318 my $property = $property{$addr}; 3319 3320 $_ =~ s/(;|$)/; $property$1/; 3321 return; 3322 } 3323 3324 sub carp_bad_line($self, $message="") { 3325 # Output consistent error messages, using either a generic one, or the 3326 # one given by the optional parameter. To avoid gazillions of the 3327 # same message in case the syntax of a file is way off, this routine 3328 # only outputs the first instance of each message, incrementing a 3329 # count so the totals can be output at the end of the file. 3330 3331 my $addr = do { no overloading; pack 'J', $self; }; 3332 3333 $message = 'Unexpected line' unless $message; 3334 3335 # No trailing punctuation so as to fit with our addenda. 3336 $message =~ s/[.:;,]$//; 3337 3338 # If haven't seen this exact message before, output it now. Otherwise 3339 # increment the count of how many times it has occurred 3340 unless ($errors{$addr}->{$message}) { 3341 Carp::my_carp("$message in '$_' in " 3342 . $file{$addr} 3343 . " at line $.. Skipping this line;"); 3344 $errors{$addr}->{$message} = 1; 3345 } 3346 else { 3347 $errors{$addr}->{$message}++; 3348 } 3349 3350 # Clear the line to prevent any further (meaningful) processing of it. 3351 $_ = ""; 3352 3353 return; 3354 } 3355} # End closure 3356 3357package Multi_Default; 3358 3359# Certain properties in early versions of Unicode had more than one possible 3360# default for code points missing from the files. In these cases, one 3361# default applies to everything left over after all the others are applied, 3362# and for each of the others, there is a description of which class of code 3363# points applies to it. This object helps implement this by storing the 3364# defaults, and for all but that final default, an eval string that generates 3365# the class that it applies to. 3366 3367use strict; 3368use warnings; 3369 3370use feature 'signatures'; 3371no warnings 'experimental::signatures'; 3372 3373{ # Closure 3374 3375 main::setup_package(); 3376 3377 my %class_defaults; 3378 # The defaults structure for the classes 3379 main::set_access('class_defaults', \%class_defaults); 3380 3381 my %other_default; 3382 # The default that applies to everything left over. 3383 main::set_access('other_default', \%other_default, 'r'); 3384 3385 3386 sub new { 3387 # The constructor is called with default => eval pairs, terminated by 3388 # the left-over default. e.g. 3389 # Multi_Default->new( 3390 # 'T' => '$gc->table("Mn") + $gc->table("Cf") - 0x200C 3391 # - 0x200D', 3392 # 'R' => 'some other expression that evaluates to code points', 3393 # . 3394 # . 3395 # . 3396 # 'U')); 3397 # It is best to leave the final value be the one that matches the 3398 # above-Unicode code points. 3399 3400 my $class = shift; 3401 3402 my $self = bless \do{my $anonymous_scalar}, $class; 3403 my $addr = do { no overloading; pack 'J', $self; }; 3404 3405 while (@_ > 1) { 3406 my $default = shift; 3407 my $eval = shift; 3408 $class_defaults{$addr}->{$default} = $eval; 3409 } 3410 3411 $other_default{$addr} = shift; 3412 3413 return $self; 3414 } 3415 3416 sub get_next_defaults($self) { 3417 # Iterates and returns the next class of defaults. 3418 3419 my $addr = do { no overloading; pack 'J', $self; }; 3420 3421 return each %{$class_defaults{$addr}}; 3422 } 3423} 3424 3425package Alias; 3426 3427# An alias is one of the names that a table goes by. This class defines them 3428# including some attributes. Everything is currently setup in the 3429# constructor. 3430 3431use strict; 3432use warnings; 3433 3434use feature 'signatures'; 3435no warnings 'experimental::signatures'; 3436 3437 3438{ # Closure 3439 3440 main::setup_package(); 3441 3442 my %name; 3443 main::set_access('name', \%name, 'r'); 3444 3445 my %loose_match; 3446 # Should this name match loosely or not. 3447 main::set_access('loose_match', \%loose_match, 'r'); 3448 3449 my %make_re_pod_entry; 3450 # Some aliases should not get their own entries in the re section of the 3451 # pod, because they are covered by a wild-card, and some we want to 3452 # discourage use of. Binary 3453 main::set_access('make_re_pod_entry', \%make_re_pod_entry, 'r', 's'); 3454 3455 my %ucd; 3456 # Is this documented to be accessible via Unicode::UCD 3457 main::set_access('ucd', \%ucd, 'r', 's'); 3458 3459 my %status; 3460 # Aliases have a status, like deprecated, or even suppressed (which means 3461 # they don't appear in documentation). Enum 3462 main::set_access('status', \%status, 'r'); 3463 3464 my %ok_as_filename; 3465 # Similarly, some aliases should not be considered as usable ones for 3466 # external use, such as file names, or we don't want documentation to 3467 # recommend them. Boolean 3468 main::set_access('ok_as_filename', \%ok_as_filename, 'r'); 3469 3470 sub new { 3471 my $class = shift; 3472 3473 my $self = bless \do { my $anonymous_scalar }, $class; 3474 my $addr = do { no overloading; pack 'J', $self; }; 3475 3476 $name{$addr} = shift; 3477 $loose_match{$addr} = shift; 3478 $make_re_pod_entry{$addr} = shift; 3479 $ok_as_filename{$addr} = shift; 3480 $status{$addr} = shift; 3481 $ucd{$addr} = shift; 3482 3483 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 3484 3485 # Null names are never ok externally 3486 $ok_as_filename{$addr} = 0 if $name{$addr} eq ""; 3487 3488 return $self; 3489 } 3490} 3491 3492package Range; 3493 3494# A range is the basic unit for storing code points, and is described in the 3495# comments at the beginning of the program. Each range has a starting code 3496# point; an ending code point (not less than the starting one); a value 3497# that applies to every code point in between the two end-points, inclusive; 3498# and an enum type that applies to the value. The type is for the user's 3499# convenience, and has no meaning here, except that a non-zero type is 3500# considered to not obey the normal Unicode rules for having standard forms. 3501# 3502# The same structure is used for both map and match tables, even though in the 3503# latter, the value (and hence type) is irrelevant and could be used as a 3504# comment. In map tables, the value is what all the code points in the range 3505# map to. Type 0 values have the standardized version of the value stored as 3506# well, so as to not have to recalculate it a lot. 3507 3508use strict; 3509use warnings; 3510 3511use feature 'signatures'; 3512no warnings 'experimental::signatures'; 3513 3514sub trace { return main::trace(@_); } 3515 3516{ # Closure 3517 3518 main::setup_package(); 3519 3520 my %start; 3521 main::set_access('start', \%start, 'r', 's'); 3522 3523 my %end; 3524 main::set_access('end', \%end, 'r', 's'); 3525 3526 my %value; 3527 main::set_access('value', \%value, 'r', 's'); 3528 3529 my %type; 3530 main::set_access('type', \%type, 'r'); 3531 3532 my %standard_form; 3533 # The value in internal standard form. Defined only if the type is 0. 3534 main::set_access('standard_form', \%standard_form); 3535 3536 # Note that if these fields change, the dump() method should as well 3537 3538 sub new($class, $_addr, $_end, @_args) { 3539 my $self = bless \do { my $anonymous_scalar }, $class; 3540 my $addr = do { no overloading; pack 'J', $self; }; 3541 3542 $start{$addr} = $_addr; 3543 $end{$addr} = $_end; 3544 3545 my %args = @_args; 3546 3547 my $value = delete $args{'Value'}; # Can be 0 3548 $value = "" unless defined $value; 3549 $value{$addr} = $value; 3550 3551 $type{$addr} = delete $args{'Type'} || 0; 3552 3553 Carp::carp_extra_args(\%args) if main::DEBUG && %args; 3554 3555 return $self; 3556 } 3557 3558 use overload 3559 fallback => 0, 3560 qw("") => "_operator_stringify", 3561 "." => \&main::_operator_dot, 3562 ".=" => \&main::_operator_dot_equal, 3563 ; 3564 3565 sub _operator_stringify($self, $other="", $reversed=0) { 3566 my $addr = do { no overloading; pack 'J', $self; }; 3567 3568 # Output it like '0041..0065 (value)' 3569 my $return = sprintf("%04X", $start{$addr}) 3570 . '..' 3571 . sprintf("%04X", $end{$addr}); 3572 my $value = $value{$addr}; 3573 my $type = $type{$addr}; 3574 $return .= ' ('; 3575 $return .= "$value"; 3576 $return .= ", Type=$type" if $type != 0; 3577 $return .= ')'; 3578 3579 return $return; 3580 } 3581 3582 sub standard_form($self) { 3583 # Calculate the standard form only if needed, and cache the result. 3584 # The standard form is the value itself if the type is special. 3585 # This represents a considerable CPU and memory saving - at the time 3586 # of writing there are 368676 non-special objects, but the standard 3587 # form is only requested for 22047 of them - ie about 6%. 3588 3589 my $addr = do { no overloading; pack 'J', $self; }; 3590 3591 return $standard_form{$addr} if defined $standard_form{$addr}; 3592 3593 my $value = $value{$addr}; 3594 return $value if $type{$addr}; 3595 return $standard_form{$addr} = main::standardize($value); 3596 } 3597 3598 sub dump($self, $indent) { 3599 # Human, not machine readable. For machine readable, comment out this 3600 # entire routine and let the standard one take effect. 3601 my $addr = do { no overloading; pack 'J', $self; }; 3602 3603 my $return = $indent 3604 . sprintf("%04X", $start{$addr}) 3605 . '..' 3606 . sprintf("%04X", $end{$addr}) 3607 . " '$value{$addr}';"; 3608 if (! defined $standard_form{$addr}) { 3609 $return .= "(type=$type{$addr})"; 3610 } 3611 elsif ($standard_form{$addr} ne $value{$addr}) { 3612 $return .= "(standard '$standard_form{$addr}')"; 3613 } 3614 return $return; 3615 } 3616} # End closure 3617 3618package _Range_List_Base; 3619 3620use strict; 3621use warnings; 3622 3623use feature 'signatures'; 3624no warnings 'experimental::signatures'; 3625 3626# Base class for range lists. A range list is simply an ordered list of 3627# ranges, so that the ranges with the lowest starting numbers are first in it. 3628# 3629# When a new range is added that is adjacent to an existing range that has the 3630# same value and type, it merges with it to form a larger range. 3631# 3632# Ranges generally do not overlap, except that there can be multiple entries 3633# of single code point ranges. This is because of NameAliases.txt. 3634# 3635# In this program, there is a standard value such that if two different 3636# values, have the same standard value, they are considered equivalent. This 3637# value was chosen so that it gives correct results on Unicode data 3638 3639# There are a number of methods to manipulate range lists, and some operators 3640# are overloaded to handle them. 3641 3642sub trace { return main::trace(@_); } 3643 3644{ # Closure 3645 3646 our $addr; 3647 3648 # Max is initialized to a negative value that isn't adjacent to 0, for 3649 # simpler tests 3650 my $max_init = -2; 3651 3652 main::setup_package(); 3653 3654 my %ranges; 3655 # The list of ranges 3656 main::set_access('ranges', \%ranges, 'readable_array'); 3657 3658 my %max; 3659 # The highest code point in the list. This was originally a method, but 3660 # actual measurements said it was used a lot. 3661 main::set_access('max', \%max, 'r'); 3662 3663 my %each_range_iterator; 3664 # Iterator position for each_range() 3665 main::set_access('each_range_iterator', \%each_range_iterator); 3666 3667 my %owner_name_of; 3668 # Name of parent this is attached to, if any. Solely for better error 3669 # messages. 3670 main::set_access('owner_name_of', \%owner_name_of, 'p_r'); 3671 3672 my %_search_ranges_cache; 3673 # A cache of the previous result from _search_ranges(), for better 3674 # performance 3675 main::set_access('_search_ranges_cache', \%_search_ranges_cache); 3676 3677 sub new { 3678 my $class = shift; 3679 my %args = @_; 3680 3681 # Optional initialization data for the range list. 3682 my $initialize = delete $args{'Initialize'}; 3683 3684 my $self; 3685 3686 # Use _union() to initialize. _union() returns an object of this 3687 # class, which means that it will call this constructor recursively. 3688 # But it won't have this $initialize parameter so that it won't 3689 # infinitely loop on this. 3690 return _union($class, $initialize, %args) if defined $initialize; 3691 3692 $self = bless \do { my $anonymous_scalar }, $class; 3693 my $addr = do { no overloading; pack 'J', $self; }; 3694 3695 # Optional parent object, only for debug info. 3696 $owner_name_of{$addr} = delete $args{'Owner'}; 3697 $owner_name_of{$addr} = "" if ! defined $owner_name_of{$addr}; 3698 3699 # Stringify, in case it is an object. 3700 $owner_name_of{$addr} = "$owner_name_of{$addr}"; 3701 3702 # This is used only for error messages, and so a colon is added 3703 $owner_name_of{$addr} .= ": " if $owner_name_of{$addr} ne ""; 3704 3705 Carp::carp_extra_args(\%args) if main::DEBUG && %args; 3706 3707 $max{$addr} = $max_init; 3708 3709 $_search_ranges_cache{$addr} = 0; 3710 $ranges{$addr} = []; 3711 3712 return $self; 3713 } 3714 3715 use overload 3716 fallback => 0, 3717 qw("") => "_operator_stringify", 3718 "." => \&main::_operator_dot, 3719 ".=" => \&main::_operator_dot_equal, 3720 ; 3721 3722 sub _operator_stringify($self, $other="", $reversed=0) { 3723 my $addr = do { no overloading; pack 'J', $self; }; 3724 3725 return "Range_List attached to '$owner_name_of{$addr}'" 3726 if $owner_name_of{$addr}; 3727 return "anonymous Range_List " . \$self; 3728 } 3729 3730 sub _union { 3731 # Returns the union of the input code points. It can be called as 3732 # either a constructor or a method. If called as a method, the result 3733 # will be a new() instance of the calling object, containing the union 3734 # of that object with the other parameter's code points; if called as 3735 # a constructor, the first parameter gives the class that the new object 3736 # should be, and the second parameter gives the code points to go into 3737 # it. 3738 # In either case, there are two parameters looked at by this routine; 3739 # any additional parameters are passed to the new() constructor. 3740 # 3741 # The code points can come in the form of some object that contains 3742 # ranges, and has a conventionally named method to access them; or 3743 # they can be an array of individual code points (as integers); or 3744 # just a single code point. 3745 # 3746 # If they are ranges, this routine doesn't make any effort to preserve 3747 # the range values and types of one input over the other. Therefore 3748 # this base class should not allow _union to be called from other than 3749 # initialization code, so as to prevent two tables from being added 3750 # together where the range values matter. The general form of this 3751 # routine therefore belongs in a derived class, but it was moved here 3752 # to avoid duplication of code. The failure to overload this in this 3753 # class keeps it safe. 3754 # 3755 # It does make the effort during initialization to accept tables with 3756 # multiple values for the same code point, and to preserve the order 3757 # of these. If there is only one input range or range set, it doesn't 3758 # sort (as it should already be sorted to the desired order), and will 3759 # accept multiple values per code point. Otherwise it will merge 3760 # multiple values into a single one. 3761 3762 my $self; 3763 my @args; # Arguments to pass to the constructor 3764 3765 my $class = shift; 3766 3767 # If a method call, will start the union with the object itself, and 3768 # the class of the new object will be the same as self. 3769 if (ref $class) { 3770 $self = $class; 3771 $class = ref $self; 3772 push @args, $self; 3773 } 3774 3775 # Add the other required parameter. 3776 push @args, shift; 3777 # Rest of parameters are passed on to the constructor 3778 3779 # Accumulate all records from both lists. 3780 my @records; 3781 my $input_count = 0; 3782 for my $arg (@args) { 3783 #local $to_trace = 0 if main::DEBUG; 3784 trace "argument = $arg" if main::DEBUG && $to_trace; 3785 if (! defined $arg) { 3786 my $message = ""; 3787 if (defined $self) { 3788 no overloading; 3789 $message .= $owner_name_of{pack 'J', $self}; 3790 } 3791 Carp::my_carp_bug($message . "Undefined argument to _union. No union done."); 3792 return; 3793 } 3794 3795 $arg = [ $arg ] if ! ref $arg; 3796 my $type = ref $arg; 3797 if ($type eq 'ARRAY') { 3798 foreach my $element (@$arg) { 3799 push @records, Range->new($element, $element); 3800 $input_count++; 3801 } 3802 } 3803 elsif ($arg->isa('Range')) { 3804 push @records, $arg; 3805 $input_count++; 3806 } 3807 elsif ($arg->can('ranges')) { 3808 push @records, $arg->ranges; 3809 $input_count++; 3810 } 3811 else { 3812 my $message = ""; 3813 if (defined $self) { 3814 no overloading; 3815 $message .= $owner_name_of{pack 'J', $self}; 3816 } 3817 Carp::my_carp_bug($message . "Cannot take the union of a $type. No union done."); 3818 return; 3819 } 3820 } 3821 3822 # Sort with the range containing the lowest ordinal first, but if 3823 # two ranges start at the same code point, sort with the bigger range 3824 # of the two first, because it takes fewer cycles. 3825 if ($input_count > 1) { 3826 @records = sort { ($a->start <=> $b->start) 3827 or 3828 # if b is shorter than a, b->end will be 3829 # less than a->end, and we want to select 3830 # a, so want to return -1 3831 ($b->end <=> $a->end) 3832 } @records; 3833 } 3834 3835 my $new = $class->new(@_); 3836 3837 # Fold in records so long as they add new information. 3838 for my $set (@records) { 3839 my $start = $set->start; 3840 my $end = $set->end; 3841 my $value = $set->value; 3842 my $type = $set->type; 3843 if ($start > $new->max) { 3844 $new->_add_delete('+', $start, $end, $value, Type => $type); 3845 } 3846 elsif ($end > $new->max) { 3847 $new->_add_delete('+', $new->max +1, $end, $value, 3848 Type => $type); 3849 } 3850 elsif ($input_count == 1) { 3851 # Here, overlaps existing range, but is from a single input, 3852 # so preserve the multiple values from that input. 3853 $new->_add_delete('+', $start, $end, $value, Type => $type, 3854 Replace => $MULTIPLE_AFTER); 3855 } 3856 } 3857 3858 return $new; 3859 } 3860 3861 sub range_count($self) { # Return the number of ranges in the range list 3862 no overloading; 3863 return scalar @{$ranges{pack 'J', $self}}; 3864 } 3865 3866 sub min($self) { 3867 # Returns the minimum code point currently in the range list, or if 3868 # the range list is empty, 2 beyond the max possible. This is a 3869 # method because used so rarely, that not worth saving between calls, 3870 # and having to worry about changing it as ranges are added and 3871 # deleted. 3872 3873 my $addr = do { no overloading; pack 'J', $self; }; 3874 3875 # If the range list is empty, return a large value that isn't adjacent 3876 # to any that could be in the range list, for simpler tests 3877 return $MAX_WORKING_CODEPOINT + 2 unless scalar @{$ranges{$addr}}; 3878 return $ranges{$addr}->[0]->start; 3879 } 3880 3881 sub contains($self, $codepoint) { 3882 # Boolean: Is argument in the range list? If so returns $i such that: 3883 # range[$i]->end < $codepoint <= range[$i+1]->end 3884 # which is one beyond what you want; this is so that the 0th range 3885 # doesn't return false 3886 3887 my $i = $self->_search_ranges($codepoint); 3888 return 0 unless defined $i; 3889 3890 # The search returns $i, such that 3891 # range[$i-1]->end < $codepoint <= range[$i]->end 3892 # So is in the table if and only iff it is at least the start position 3893 # of range $i. 3894 no overloading; 3895 return 0 if $ranges{pack 'J', $self}->[$i]->start > $codepoint; 3896 return $i + 1; 3897 } 3898 3899 sub containing_range($self, $codepoint) { 3900 # Returns the range object that contains the code point, undef if none 3901 my $i = $self->contains($codepoint); 3902 return unless $i; 3903 3904 # contains() returns 1 beyond where we should look 3905 no overloading; 3906 return $ranges{pack 'J', $self}->[$i-1]; 3907 } 3908 3909 sub value_of($self, $codepoint) { 3910 # Returns the value associated with the code point, undef if none 3911 my $range = $self->containing_range($codepoint); 3912 return unless defined $range; 3913 3914 return $range->value; 3915 } 3916 3917 sub type_of($self, $codepoint) { 3918 # Returns the type of the range containing the code point, undef if 3919 # the code point is not in the table 3920 my $range = $self->containing_range($codepoint); 3921 return unless defined $range; 3922 3923 return $range->type; 3924 } 3925 3926 sub _search_ranges($self, $code_point) { 3927 # Find the range in the list which contains a code point, or where it 3928 # should go if were to add it. That is, it returns $i, such that: 3929 # range[$i-1]->end < $codepoint <= range[$i]->end 3930 # Returns undef if no such $i is possible (e.g. at end of table), or 3931 # if there is an error. 3932 my $addr = do { no overloading; pack 'J', $self; }; 3933 3934 return if $code_point > $max{$addr}; 3935 my $r = $ranges{$addr}; # The current list of ranges 3936 my $range_list_size = scalar @$r; 3937 my $i; 3938 3939 use integer; # want integer division 3940 3941 # Use the cached result as the starting guess for this one, because, 3942 # an experiment on 5.1 showed that 90% of the time the cache was the 3943 # same as the result on the next call (and 7% it was one less). 3944 $i = $_search_ranges_cache{$addr}; 3945 $i = 0 if $i >= $range_list_size; # Reset if no longer valid (prob. 3946 # from an intervening deletion 3947 #local $to_trace = 1 if main::DEBUG; 3948 trace "previous \$i is still valid: $i" if main::DEBUG && $to_trace && $code_point <= $r->[$i]->end && ($i == 0 || $r->[$i-1]->end < $code_point); 3949 return $i if $code_point <= $r->[$i]->end 3950 && ($i == 0 || $r->[$i-1]->end < $code_point); 3951 3952 # Here the cache doesn't yield the correct $i. Try adding 1. 3953 if ($i < $range_list_size - 1 3954 && $r->[$i]->end < $code_point && 3955 $code_point <= $r->[$i+1]->end) 3956 { 3957 $i++; 3958 trace "next \$i is correct: $i" if main::DEBUG && $to_trace; 3959 $_search_ranges_cache{$addr} = $i; 3960 return $i; 3961 } 3962 3963 # Here, adding 1 also didn't work. We do a binary search to 3964 # find the correct position, starting with current $i 3965 my $lower = 0; 3966 my $upper = $range_list_size - 1; 3967 while (1) { 3968 trace "top of loop i=$i:", sprintf("%04X", $r->[$lower]->start), "[$lower] .. ", sprintf("%04X", $r->[$i]->start), "[$i] .. ", sprintf("%04X", $r->[$upper]->start), "[$upper]" if main::DEBUG && $to_trace; 3969 3970 if ($code_point <= $r->[$i]->end) { 3971 3972 # Here we have met the upper constraint. We can quit if we 3973 # also meet the lower one. 3974 last if $i == 0 || $r->[$i-1]->end < $code_point; 3975 3976 $upper = $i; # Still too high. 3977 3978 } 3979 else { 3980 3981 # Here, $r[$i]->end < $code_point, so look higher up. 3982 $lower = $i; 3983 } 3984 3985 # Split search domain in half to try again. 3986 my $temp = ($upper + $lower) / 2; 3987 3988 # No point in continuing unless $i changes for next time 3989 # in the loop. 3990 if ($temp == $i) { 3991 3992 # We can't reach the highest element because of the averaging. 3993 # So if one below the upper edge, force it there and try one 3994 # more time. 3995 if ($i == $range_list_size - 2) { 3996 3997 trace "Forcing to upper edge" if main::DEBUG && $to_trace; 3998 $i = $range_list_size - 1; 3999 4000 # Change $lower as well so if fails next time through, 4001 # taking the average will yield the same $i, and we will 4002 # quit with the error message just below. 4003 $lower = $i; 4004 next; 4005 } 4006 Carp::my_carp_bug("$owner_name_of{$addr}Can't find where the range ought to go. No action taken."); 4007 return; 4008 } 4009 $i = $temp; 4010 } # End of while loop 4011 4012 if (main::DEBUG && $to_trace) { 4013 trace 'i-1=[', $i-1, ']', $r->[$i-1] if $i; 4014 trace "i= [ $i ]", $r->[$i]; 4015 trace 'i+1=[', $i+1, ']', $r->[$i+1] if $i < $range_list_size - 1; 4016 } 4017 4018 # Here we have found the offset. Cache it as a starting point for the 4019 # next call. 4020 $_search_ranges_cache{$addr} = $i; 4021 return $i; 4022 } 4023 4024 sub _add_delete { 4025 # Add, replace or delete ranges to or from a list. The $type 4026 # parameter gives which: 4027 # '+' => insert or replace a range, returning a list of any changed 4028 # ranges. 4029 # '-' => delete a range, returning a list of any deleted ranges. 4030 # 4031 # The next three parameters give respectively the start, end, and 4032 # value associated with the range. 'value' should be null unless the 4033 # operation is '+'; 4034 # 4035 # The range list is kept sorted so that the range with the lowest 4036 # starting position is first in the list, and generally, adjacent 4037 # ranges with the same values are merged into a single larger one (see 4038 # exceptions below). 4039 # 4040 # There are more parameters; all are key => value pairs: 4041 # Type gives the type of the value. It is only valid for '+'. 4042 # All ranges have types; if this parameter is omitted, 0 is 4043 # assumed. Ranges with type 0 are assumed to obey the 4044 # Unicode rules for casing, etc; ranges with other types are 4045 # not. Otherwise, the type is arbitrary, for the caller's 4046 # convenience, and looked at only by this routine to keep 4047 # adjacent ranges of different types from being merged into 4048 # a single larger range, and when Replace => 4049 # $IF_NOT_EQUIVALENT is specified (see just below). 4050 # Replace determines what to do if the range list already contains 4051 # ranges which coincide with all or portions of the input 4052 # range. It is only valid for '+': 4053 # => $NO means that the new value is not to replace 4054 # any existing ones, but any empty gaps of the 4055 # range list coinciding with the input range 4056 # will be filled in with the new value. 4057 # => $UNCONDITIONALLY means to replace the existing values with 4058 # this one unconditionally. However, if the 4059 # new and old values are identical, the 4060 # replacement is skipped to save cycles 4061 # => $IF_NOT_EQUIVALENT means to replace the existing values 4062 # (the default) with this one if they are not equivalent. 4063 # Ranges are equivalent if their types are the 4064 # same, and they are the same string; or if 4065 # both are type 0 ranges, if their Unicode 4066 # standard forms are identical. In this last 4067 # case, the routine chooses the more "modern" 4068 # one to use. This is because some of the 4069 # older files are formatted with values that 4070 # are, for example, ALL CAPs, whereas the 4071 # derived files have a more modern style, 4072 # which looks better. By looking for this 4073 # style when the pre-existing and replacement 4074 # standard forms are the same, we can move to 4075 # the modern style 4076 # => $MULTIPLE_BEFORE means that if this range duplicates an 4077 # existing one, but has a different value, 4078 # don't replace the existing one, but insert 4079 # this one so that the same range can occur 4080 # multiple times. They are stored LIFO, so 4081 # that the final one inserted is the first one 4082 # returned in an ordered search of the table. 4083 # If this is an exact duplicate, including the 4084 # value, the original will be moved to be 4085 # first, before any other duplicate ranges 4086 # with different values. 4087 # => $MULTIPLE_AFTER is like $MULTIPLE_BEFORE, but is stored 4088 # FIFO, so that this one is inserted after all 4089 # others that currently exist. If this is an 4090 # exact duplicate, including value, of an 4091 # existing range, this one is discarded 4092 # (leaving the existing one in its original, 4093 # higher priority position 4094 # => $CROAK Die with an error if is already there 4095 # => anything else is the same as => $IF_NOT_EQUIVALENT 4096 # 4097 # "same value" means identical for non-type-0 ranges, and it means 4098 # having the same standard forms for type-0 ranges. 4099 4100 return Carp::carp_too_few_args(\@_, 5) if main::DEBUG && @_ < 5; 4101 4102 my $self = shift; 4103 my $operation = shift; # '+' for add/replace; '-' for delete; 4104 my $start = shift; 4105 my $end = shift; 4106 my $value = shift; 4107 4108 my %args = @_; 4109 4110 $value = "" if not defined $value; # warning: $value can be "0" 4111 4112 my $replace = delete $args{'Replace'}; 4113 $replace = $IF_NOT_EQUIVALENT unless defined $replace; 4114 4115 my $type = delete $args{'Type'}; 4116 $type = 0 unless defined $type; 4117 4118 Carp::carp_extra_args(\%args) if main::DEBUG && %args; 4119 4120 my $addr = do { no overloading; pack 'J', $self; }; 4121 4122 if ($operation ne '+' && $operation ne '-') { 4123 Carp::my_carp_bug("$owner_name_of{$addr}First parameter to _add_delete must be '+' or '-'. No action taken."); 4124 return; 4125 } 4126 unless (defined $start && defined $end) { 4127 Carp::my_carp_bug("$owner_name_of{$addr}Undefined start and/or end to _add_delete. No action taken."); 4128 return; 4129 } 4130 unless ($end >= $start) { 4131 Carp::my_carp_bug("$owner_name_of{$addr}End of range (" . sprintf("%04X", $end) . ") must not be before start (" . sprintf("%04X", $start) . "). No action taken."); 4132 return; 4133 } 4134 #local $to_trace = 1 if main::DEBUG; 4135 4136 if ($operation eq '-') { 4137 if ($replace != $IF_NOT_EQUIVALENT) { 4138 Carp::my_carp_bug("$owner_name_of{$addr}Replace => \$IF_NOT_EQUIVALENT is required when deleting a range from a range list. Assuming Replace => \$IF_NOT_EQUIVALENT."); 4139 $replace = $IF_NOT_EQUIVALENT; 4140 } 4141 if ($type) { 4142 Carp::my_carp_bug("$owner_name_of{$addr}Type => 0 is required when deleting a range from a range list. Assuming Type => 0."); 4143 $type = 0; 4144 } 4145 if ($value ne "") { 4146 Carp::my_carp_bug("$owner_name_of{$addr}Value => \"\" is required when deleting a range from a range list. Assuming Value => \"\"."); 4147 $value = ""; 4148 } 4149 } 4150 4151 my $r = $ranges{$addr}; # The current list of ranges 4152 my $range_list_size = scalar @$r; # And its size 4153 my $max = $max{$addr}; # The current high code point in 4154 # the list of ranges 4155 4156 # Do a special case requiring fewer machine cycles when the new range 4157 # starts after the current highest point. The Unicode input data is 4158 # structured so this is common. 4159 if ($start > $max) { 4160 4161 trace "$owner_name_of{$addr} $operation", sprintf("%04X..%04X (%s) type=%d; prev max=%04X", $start, $end, $value, $type, $max) if main::DEBUG && $to_trace; 4162 return if $operation eq '-'; # Deleting a non-existing range is a 4163 # no-op 4164 4165 # If the new range doesn't logically extend the current final one 4166 # in the range list, create a new range at the end of the range 4167 # list. (max cleverly is initialized to a negative number not 4168 # adjacent to 0 if the range list is empty, so even adding a range 4169 # to an empty range list starting at 0 will have this 'if' 4170 # succeed.) 4171 if ($start > $max + 1 # non-adjacent means can't extend. 4172 || @{$r}[-1]->value ne $value # values differ, can't extend. 4173 || @{$r}[-1]->type != $type # types differ, can't extend. 4174 ) { 4175 push @$r, Range->new($start, $end, 4176 Value => $value, 4177 Type => $type); 4178 } 4179 else { 4180 4181 # Here, the new range starts just after the current highest in 4182 # the range list, and they have the same type and value. 4183 # Extend the existing range to incorporate the new one. 4184 @{$r}[-1]->set_end($end); 4185 } 4186 4187 # This becomes the new maximum. 4188 $max{$addr} = $end; 4189 4190 return; 4191 } 4192 #local $to_trace = 0 if main::DEBUG; 4193 4194 trace "$owner_name_of{$addr} $operation", sprintf("%04X", $start) . '..' . sprintf("%04X", $end) . " ($value) replace=$replace" if main::DEBUG && $to_trace; 4195 4196 # Here, the input range isn't after the whole rest of the range list. 4197 # Most likely 'splice' will be needed. The rest of the routine finds 4198 # the needed splice parameters, and if necessary, does the splice. 4199 # First, find the offset parameter needed by the splice function for 4200 # the input range. Note that the input range may span multiple 4201 # existing ones, but we'll worry about that later. For now, just find 4202 # the beginning. If the input range is to be inserted starting in a 4203 # position not currently in the range list, it must (obviously) come 4204 # just after the range below it, and just before the range above it. 4205 # Slightly less obviously, it will occupy the position currently 4206 # occupied by the range that is to come after it. More formally, we 4207 # are looking for the position, $i, in the array of ranges, such that: 4208 # 4209 # r[$i-1]->start <= r[$i-1]->end < $start < r[$i]->start <= r[$i]->end 4210 # 4211 # (The ordered relationships within existing ranges are also shown in 4212 # the equation above). However, if the start of the input range is 4213 # within an existing range, the splice offset should point to that 4214 # existing range's position in the list; that is $i satisfies a 4215 # somewhat different equation, namely: 4216 # 4217 #r[$i-1]->start <= r[$i-1]->end < r[$i]->start <= $start <= r[$i]->end 4218 # 4219 # More briefly, $start can come before or after r[$i]->start, and at 4220 # this point, we don't know which it will be. However, these 4221 # two equations share these constraints: 4222 # 4223 # r[$i-1]->end < $start <= r[$i]->end 4224 # 4225 # And that is good enough to find $i. 4226 4227 my $i = $self->_search_ranges($start); 4228 if (! defined $i) { 4229 Carp::my_carp_bug("Searching $self for range beginning with $start unexpectedly returned undefined. Operation '$operation' not performed"); 4230 return; 4231 } 4232 4233 # The search function returns $i such that: 4234 # 4235 # r[$i-1]->end < $start <= r[$i]->end 4236 # 4237 # That means that $i points to the first range in the range list 4238 # that could possibly be affected by this operation. We still don't 4239 # know if the start of the input range is within r[$i], or if it 4240 # points to empty space between r[$i-1] and r[$i]. 4241 trace "[$i] is the beginning splice point. Existing range there is ", $r->[$i] if main::DEBUG && $to_trace; 4242 4243 # Special case the insertion of data that is not to replace any 4244 # existing data. 4245 if ($replace == $NO) { # If $NO, has to be operation '+' 4246 #local $to_trace = 1 if main::DEBUG; 4247 trace "Doesn't replace" if main::DEBUG && $to_trace; 4248 4249 # Here, the new range is to take effect only on those code points 4250 # that aren't already in an existing range. This can be done by 4251 # looking through the existing range list and finding the gaps in 4252 # the ranges that this new range affects, and then calling this 4253 # function recursively on each of those gaps, leaving untouched 4254 # anything already in the list. Gather up a list of the changed 4255 # gaps first so that changes to the internal state as new ranges 4256 # are added won't be a problem. 4257 my @gap_list; 4258 4259 # First, if the starting point of the input range is outside an 4260 # existing one, there is a gap from there to the beginning of the 4261 # existing range -- add a span to fill the part that this new 4262 # range occupies 4263 if ($start < $r->[$i]->start) { 4264 push @gap_list, Range->new($start, 4265 main::min($end, 4266 $r->[$i]->start - 1), 4267 Type => $type); 4268 trace "gap before $r->[$i] [$i], will add", $gap_list[-1] if main::DEBUG && $to_trace; 4269 } 4270 4271 # Then look through the range list for other gaps until we reach 4272 # the highest range affected by the input one. 4273 my $j; 4274 for ($j = $i+1; $j < $range_list_size; $j++) { 4275 trace "j=[$j]", $r->[$j] if main::DEBUG && $to_trace; 4276 last if $end < $r->[$j]->start; 4277 4278 # If there is a gap between when this range starts and the 4279 # previous one ends, add a span to fill it. Note that just 4280 # because there are two ranges doesn't mean there is a 4281 # non-zero gap between them. It could be that they have 4282 # different values or types 4283 if ($r->[$j-1]->end + 1 != $r->[$j]->start) { 4284 push @gap_list, 4285 Range->new($r->[$j-1]->end + 1, 4286 $r->[$j]->start - 1, 4287 Type => $type); 4288 trace "gap between $r->[$j-1] and $r->[$j] [$j], will add: $gap_list[-1]" if main::DEBUG && $to_trace; 4289 } 4290 } 4291 4292 # Here, we have either found an existing range in the range list, 4293 # beyond the area affected by the input one, or we fell off the 4294 # end of the loop because the input range affects the whole rest 4295 # of the range list. In either case, $j is 1 higher than the 4296 # highest affected range. If $j == $i, it means that there are no 4297 # affected ranges, that the entire insertion is in the gap between 4298 # r[$i-1], and r[$i], which we already have taken care of before 4299 # the loop. 4300 # On the other hand, if there are affected ranges, it might be 4301 # that there is a gap that needs filling after the final such 4302 # range to the end of the input range 4303 if ($r->[$j-1]->end < $end) { 4304 push @gap_list, Range->new(main::max($start, 4305 $r->[$j-1]->end + 1), 4306 $end, 4307 Type => $type); 4308 trace "gap after $r->[$j-1], will add $gap_list[-1]" if main::DEBUG && $to_trace; 4309 } 4310 4311 # Call recursively to fill in all the gaps. 4312 foreach my $gap (@gap_list) { 4313 $self->_add_delete($operation, 4314 $gap->start, 4315 $gap->end, 4316 $value, 4317 Type => $type); 4318 } 4319 4320 return; 4321 } 4322 4323 # Here, we have taken care of the case where $replace is $NO. 4324 # Remember that here, r[$i-1]->end < $start <= r[$i]->end 4325 # If inserting a multiple record, this is where it goes, before the 4326 # first (if any) existing one if inserting LIFO. (If this is to go 4327 # afterwards, FIFO, we below move the pointer to there.) These imply 4328 # an insertion, and no change to any existing ranges. Note that $i 4329 # can be -1 if this new range doesn't actually duplicate any existing, 4330 # and comes at the beginning of the list. 4331 if ($replace == $MULTIPLE_BEFORE || $replace == $MULTIPLE_AFTER) { 4332 4333 if ($start != $end) { 4334 Carp::my_carp_bug("$owner_name_of{$addr}Can't cope with adding a multiple record when the range ($start..$end) contains more than one code point. No action taken."); 4335 return; 4336 } 4337 4338 # If the new code point is within a current range ... 4339 if ($end >= $r->[$i]->start) { 4340 4341 # Don't add an exact duplicate, as it isn't really a multiple 4342 my $existing_value = $r->[$i]->value; 4343 my $existing_type = $r->[$i]->type; 4344 return if $value eq $existing_value && $type eq $existing_type; 4345 4346 # If the multiple value is part of an existing range, we want 4347 # to split up that range, so that only the single code point 4348 # is affected. To do this, we first call ourselves 4349 # recursively to delete that code point from the table, having 4350 # preserved its current data above. Then we call ourselves 4351 # recursively again to add the new multiple, which we know by 4352 # the test just above is different than the current code 4353 # point's value, so it will become a range containing a single 4354 # code point: just itself. Finally, we add back in the 4355 # pre-existing code point, which will again be a single code 4356 # point range. Because 'i' likely will have changed as a 4357 # result of these operations, we can't just continue on, but 4358 # do this operation recursively as well. If we are inserting 4359 # LIFO, the pre-existing code point needs to go after the new 4360 # one, so use MULTIPLE_AFTER; and vice versa. 4361 if ($r->[$i]->start != $r->[$i]->end) { 4362 $self->_add_delete('-', $start, $end, ""); 4363 $self->_add_delete('+', $start, $end, $value, Type => $type); 4364 return $self->_add_delete('+', 4365 $start, $end, 4366 $existing_value, 4367 Type => $existing_type, 4368 Replace => ($replace == $MULTIPLE_BEFORE) 4369 ? $MULTIPLE_AFTER 4370 : $MULTIPLE_BEFORE); 4371 } 4372 } 4373 4374 # If to place this new record after, move to beyond all existing 4375 # ones; but don't add this one if identical to any of them, as it 4376 # isn't really a multiple. This leaves the original order, so 4377 # that the current request is ignored. The reasoning is that the 4378 # previous request that wanted this record to have high priority 4379 # should have precedence. 4380 if ($replace == $MULTIPLE_AFTER) { 4381 while ($i < @$r && $r->[$i]->start == $start) { 4382 return if $value eq $r->[$i]->value 4383 && $type eq $r->[$i]->type; 4384 $i++; 4385 } 4386 } 4387 else { 4388 # If instead we are to place this new record before any 4389 # existing ones, remove any identical ones that come after it. 4390 # This changes the existing order so that the new one is 4391 # first, as is being requested. 4392 for (my $j = $i + 1; 4393 $j < @$r && $r->[$j]->start == $start; 4394 $j++) 4395 { 4396 if ($value eq $r->[$j]->value && $type eq $r->[$j]->type) { 4397 splice @$r, $j, 1; 4398 last; # There should only be one instance, so no 4399 # need to keep looking 4400 } 4401 } 4402 } 4403 4404 trace "Adding multiple record at $i with $start..$end, $value" if main::DEBUG && $to_trace; 4405 my @return = splice @$r, 4406 $i, 4407 0, 4408 Range->new($start, 4409 $end, 4410 Value => $value, 4411 Type => $type); 4412 if (main::DEBUG && $to_trace) { 4413 trace "After splice:"; 4414 trace 'i-2=[', $i-2, ']', $r->[$i-2] if $i >= 2; 4415 trace 'i-1=[', $i-1, ']', $r->[$i-1] if $i >= 1; 4416 trace "i =[", $i, "]", $r->[$i] if $i >= 0; 4417 trace 'i+1=[', $i+1, ']', $r->[$i+1] if $i < @$r - 1; 4418 trace 'i+2=[', $i+2, ']', $r->[$i+2] if $i < @$r - 2; 4419 trace 'i+3=[', $i+3, ']', $r->[$i+3] if $i < @$r - 3; 4420 } 4421 return @return; 4422 } 4423 4424 # Here, we have taken care of $NO and $MULTIPLE_foo replaces. This 4425 # leaves delete, insert, and replace either unconditionally or if not 4426 # equivalent. $i still points to the first potential affected range. 4427 # Now find the highest range affected, which will determine the length 4428 # parameter to splice. (The input range can span multiple existing 4429 # ones.) If this isn't a deletion, while we are looking through the 4430 # range list, see also if this is a replacement rather than a clean 4431 # insertion; that is if it will change the values of at least one 4432 # existing range. Start off assuming it is an insert, until find it 4433 # isn't. 4434 my $clean_insert = $operation eq '+'; 4435 my $j; # This will point to the highest affected range 4436 4437 # For non-zero types, the standard form is the value itself; 4438 my $standard_form = ($type) ? $value : main::standardize($value); 4439 4440 for ($j = $i; $j < $range_list_size; $j++) { 4441 trace "Looking for highest affected range; the one at $j is ", $r->[$j] if main::DEBUG && $to_trace; 4442 4443 # If find a range that it doesn't overlap into, we can stop 4444 # searching 4445 last if $end < $r->[$j]->start; 4446 4447 # Here, overlaps the range at $j. If the values don't match, 4448 # and so far we think this is a clean insertion, it becomes a 4449 # non-clean insertion, i.e., a 'change' or 'replace' instead. 4450 if ($clean_insert) { 4451 if ($r->[$j]->standard_form ne $standard_form) { 4452 $clean_insert = 0; 4453 if ($replace == $CROAK) { 4454 main::croak("The range to add " 4455 . sprintf("%04X", $start) 4456 . '-' 4457 . sprintf("%04X", $end) 4458 . " with value '$value' overlaps an existing range $r->[$j]"); 4459 } 4460 } 4461 else { 4462 4463 # Here, the two values are essentially the same. If the 4464 # two are actually identical, replacing wouldn't change 4465 # anything so skip it. 4466 my $pre_existing = $r->[$j]->value; 4467 if ($pre_existing ne $value) { 4468 4469 # Here the new and old standardized values are the 4470 # same, but the non-standardized values aren't. If 4471 # replacing unconditionally, then replace 4472 if( $replace == $UNCONDITIONALLY) { 4473 $clean_insert = 0; 4474 } 4475 else { 4476 4477 # Here, are replacing conditionally. Decide to 4478 # replace or not based on which appears to look 4479 # the "nicest". If one is mixed case and the 4480 # other isn't, choose the mixed case one. 4481 my $new_mixed = $value =~ /[A-Z]/ 4482 && $value =~ /[a-z]/; 4483 my $old_mixed = $pre_existing =~ /[A-Z]/ 4484 && $pre_existing =~ /[a-z]/; 4485 4486 if ($old_mixed != $new_mixed) { 4487 $clean_insert = 0 if $new_mixed; 4488 if (main::DEBUG && $to_trace) { 4489 if ($clean_insert) { 4490 trace "Retaining $pre_existing over $value"; 4491 } 4492 else { 4493 trace "Replacing $pre_existing with $value"; 4494 } 4495 } 4496 } 4497 else { 4498 4499 # Here casing wasn't different between the two. 4500 # If one has hyphens or underscores and the 4501 # other doesn't, choose the one with the 4502 # punctuation. 4503 my $new_punct = $value =~ /[-_]/; 4504 my $old_punct = $pre_existing =~ /[-_]/; 4505 4506 if ($old_punct != $new_punct) { 4507 $clean_insert = 0 if $new_punct; 4508 if (main::DEBUG && $to_trace) { 4509 if ($clean_insert) { 4510 trace "Retaining $pre_existing over $value"; 4511 } 4512 else { 4513 trace "Replacing $pre_existing with $value"; 4514 } 4515 } 4516 } # else existing one is just as "good"; 4517 # retain it to save cycles. 4518 } 4519 } 4520 } 4521 } 4522 } 4523 } # End of loop looking for highest affected range. 4524 4525 # Here, $j points to one beyond the highest range that this insertion 4526 # affects (hence to beyond the range list if that range is the final 4527 # one in the range list). 4528 4529 # The splice length is all the affected ranges. Get it before 4530 # subtracting, for efficiency, so we don't have to later add 1. 4531 my $length = $j - $i; 4532 4533 $j--; # $j now points to the highest affected range. 4534 trace "Final affected range is $j: $r->[$j]" if main::DEBUG && $to_trace; 4535 4536 # Here, have taken care of $NO and $MULTIPLE_foo replaces. 4537 # $j points to the highest affected range. But it can be < $i or even 4538 # -1. These happen only if the insertion is entirely in the gap 4539 # between r[$i-1] and r[$i]. Here's why: j < i means that the j loop 4540 # above exited first time through with $end < $r->[$i]->start. (And 4541 # then we subtracted one from j) This implies also that $start < 4542 # $r->[$i]->start, but we know from above that $r->[$i-1]->end < 4543 # $start, so the entire input range is in the gap. 4544 if ($j < $i) { 4545 4546 # Here the entire input range is in the gap before $i. 4547 4548 if (main::DEBUG && $to_trace) { 4549 if ($i) { 4550 trace "Entire range is between $r->[$i-1] and $r->[$i]"; 4551 } 4552 else { 4553 trace "Entire range is before $r->[$i]"; 4554 } 4555 } 4556 return if $operation ne '+'; # Deletion of a non-existent range is 4557 # a no-op 4558 } 4559 else { 4560 4561 # Here part of the input range is not in the gap before $i. Thus, 4562 # there is at least one affected one, and $j points to the highest 4563 # such one. 4564 4565 # At this point, here is the situation: 4566 # This is not an insertion of a multiple, nor of tentative ($NO) 4567 # data. 4568 # $i points to the first element in the current range list that 4569 # may be affected by this operation. In fact, we know 4570 # that the range at $i is affected because we are in 4571 # the else branch of this 'if' 4572 # $j points to the highest affected range. 4573 # In other words, 4574 # r[$i-1]->end < $start <= r[$i]->end 4575 # And: 4576 # r[$i-1]->end < $start <= $end < r[$j+1]->start 4577 # 4578 # Also: 4579 # $clean_insert is a boolean which is set true if and only if 4580 # this is a "clean insertion", i.e., not a change nor a 4581 # deletion (multiple was handled above). 4582 4583 # We now have enough information to decide if this call is a no-op 4584 # or not. It is a no-op if this is an insertion of already 4585 # existing data. To be so, it must be contained entirely in one 4586 # range. 4587 4588 if (main::DEBUG && $to_trace && $clean_insert 4589 && $start >= $r->[$i]->start 4590 && $end <= $r->[$i]->end) 4591 { 4592 trace "no-op"; 4593 } 4594 return if $clean_insert 4595 && $start >= $r->[$i]->start 4596 && $end <= $r->[$i]->end; 4597 } 4598 4599 # Here, we know that some action will have to be taken. We have 4600 # calculated the offset and length (though adjustments may be needed) 4601 # for the splice. Now start constructing the replacement list. 4602 my @replacement; 4603 my $splice_start = $i; 4604 4605 my $extends_below; 4606 my $extends_above; 4607 4608 # See if should extend any adjacent ranges. 4609 if ($operation eq '-') { # Don't extend deletions 4610 $extends_below = $extends_above = 0; 4611 } 4612 else { # Here, should extend any adjacent ranges. See if there are 4613 # any. 4614 $extends_below = ($i > 0 4615 # can't extend unless adjacent 4616 && $r->[$i-1]->end == $start -1 4617 # can't extend unless are same standard value 4618 && $r->[$i-1]->standard_form eq $standard_form 4619 # can't extend unless share type 4620 && $r->[$i-1]->type == $type); 4621 $extends_above = ($j+1 < $range_list_size 4622 && $r->[$j+1]->start == $end +1 4623 && $r->[$j+1]->standard_form eq $standard_form 4624 && $r->[$j+1]->type == $type); 4625 } 4626 if ($extends_below && $extends_above) { # Adds to both 4627 $splice_start--; # start replace at element below 4628 $length += 2; # will replace on both sides 4629 trace "Extends both below and above ranges" if main::DEBUG && $to_trace; 4630 4631 # The result will fill in any gap, replacing both sides, and 4632 # create one large range. 4633 @replacement = Range->new($r->[$i-1]->start, 4634 $r->[$j+1]->end, 4635 Value => $value, 4636 Type => $type); 4637 } 4638 else { 4639 4640 # Here we know that the result won't just be the conglomeration of 4641 # a new range with both its adjacent neighbors. But it could 4642 # extend one of them. 4643 4644 if ($extends_below) { 4645 4646 # Here the new element adds to the one below, but not to the 4647 # one above. If inserting, and only to that one range, can 4648 # just change its ending to include the new one. 4649 if ($length == 0 && $clean_insert) { 4650 $r->[$i-1]->set_end($end); 4651 trace "inserted range extends range to below so it is now $r->[$i-1]" if main::DEBUG && $to_trace; 4652 return; 4653 } 4654 else { 4655 trace "Changing inserted range to start at ", sprintf("%04X", $r->[$i-1]->start), " instead of ", sprintf("%04X", $start) if main::DEBUG && $to_trace; 4656 $splice_start--; # start replace at element below 4657 $length++; # will replace the element below 4658 $start = $r->[$i-1]->start; 4659 } 4660 } 4661 elsif ($extends_above) { 4662 4663 # Here the new element adds to the one above, but not below. 4664 # Mirror the code above 4665 if ($length == 0 && $clean_insert) { 4666 $r->[$j+1]->set_start($start); 4667 trace "inserted range extends range to above so it is now $r->[$j+1]" if main::DEBUG && $to_trace; 4668 return; 4669 } 4670 else { 4671 trace "Changing inserted range to end at ", sprintf("%04X", $r->[$j+1]->end), " instead of ", sprintf("%04X", $end) if main::DEBUG && $to_trace; 4672 $length++; # will replace the element above 4673 $end = $r->[$j+1]->end; 4674 } 4675 } 4676 4677 trace "Range at $i is $r->[$i]" if main::DEBUG && $to_trace; 4678 4679 # Finally, here we know there will have to be a splice. 4680 # If the change or delete affects only the highest portion of the 4681 # first affected range, the range will have to be split. The 4682 # splice will remove the whole range, but will replace it by a new 4683 # range containing just the unaffected part. So, in this case, 4684 # add to the replacement list just this unaffected portion. 4685 if (! $extends_below 4686 && $start > $r->[$i]->start && $start <= $r->[$i]->end) 4687 { 4688 push @replacement, 4689 Range->new($r->[$i]->start, 4690 $start - 1, 4691 Value => $r->[$i]->value, 4692 Type => $r->[$i]->type); 4693 } 4694 4695 # In the case of an insert or change, but not a delete, we have to 4696 # put in the new stuff; this comes next. 4697 if ($operation eq '+') { 4698 push @replacement, Range->new($start, 4699 $end, 4700 Value => $value, 4701 Type => $type); 4702 } 4703 4704 trace "Range at $j is $r->[$j]" if main::DEBUG && $to_trace && $j != $i; 4705 #trace "$end >=", $r->[$j]->start, " && $end <", $r->[$j]->end if main::DEBUG && $to_trace; 4706 4707 # And finally, if we're changing or deleting only a portion of the 4708 # highest affected range, it must be split, as the lowest one was. 4709 if (! $extends_above 4710 && $j >= 0 # Remember that j can be -1 if before first 4711 # current element 4712 && $end >= $r->[$j]->start 4713 && $end < $r->[$j]->end) 4714 { 4715 push @replacement, 4716 Range->new($end + 1, 4717 $r->[$j]->end, 4718 Value => $r->[$j]->value, 4719 Type => $r->[$j]->type); 4720 } 4721 } 4722 4723 # And do the splice, as calculated above 4724 if (main::DEBUG && $to_trace) { 4725 trace "replacing $length element(s) at $i with "; 4726 foreach my $replacement (@replacement) { 4727 trace " $replacement"; 4728 } 4729 trace "Before splice:"; 4730 trace 'i-2=[', $i-2, ']', $r->[$i-2] if $i >= 2; 4731 trace 'i-1=[', $i-1, ']', $r->[$i-1] if $i >= 1; 4732 trace "i =[", $i, "]", $r->[$i]; 4733 trace 'i+1=[', $i+1, ']', $r->[$i+1] if $i < @$r - 1; 4734 trace 'i+2=[', $i+2, ']', $r->[$i+2] if $i < @$r - 2; 4735 } 4736 4737 my @return = splice @$r, $splice_start, $length, @replacement; 4738 4739 if (main::DEBUG && $to_trace) { 4740 trace "After splice:"; 4741 trace 'i-2=[', $i-2, ']', $r->[$i-2] if $i >= 2; 4742 trace 'i-1=[', $i-1, ']', $r->[$i-1] if $i >= 1; 4743 trace "i =[", $i, "]", $r->[$i]; 4744 trace 'i+1=[', $i+1, ']', $r->[$i+1] if $i < @$r - 1; 4745 trace 'i+2=[', $i+2, ']', $r->[$i+2] if $i < @$r - 2; 4746 trace "removed ", @return if @return; 4747 } 4748 4749 # An actual deletion could have changed the maximum in the list. 4750 # There was no deletion if the splice didn't return something, but 4751 # otherwise recalculate it. This is done too rarely to worry about 4752 # performance. 4753 if ($operation eq '-' && @return) { 4754 if (@$r) { 4755 $max{$addr} = $r->[-1]->end; 4756 } 4757 else { # Now empty 4758 $max{$addr} = $max_init; 4759 } 4760 } 4761 return @return; 4762 } 4763 4764 sub reset_each_range($self) { # reset the iterator for each_range(); 4765 no overloading; 4766 undef $each_range_iterator{pack 'J', $self}; 4767 return; 4768 } 4769 4770 sub each_range($self) { 4771 # Iterate over each range in a range list. Results are undefined if 4772 # the range list is changed during the iteration. 4773 my $addr = do { no overloading; pack 'J', $self; }; 4774 4775 return if $self->is_empty; 4776 4777 $each_range_iterator{$addr} = -1 4778 if ! defined $each_range_iterator{$addr}; 4779 $each_range_iterator{$addr}++; 4780 return $ranges{$addr}->[$each_range_iterator{$addr}] 4781 if $each_range_iterator{$addr} < @{$ranges{$addr}}; 4782 undef $each_range_iterator{$addr}; 4783 return; 4784 } 4785 4786 sub count($self) { # Returns count of code points in range list 4787 my $addr = do { no overloading; pack 'J', $self; }; 4788 4789 my $count = 0; 4790 foreach my $range (@{$ranges{$addr}}) { 4791 $count += $range->end - $range->start + 1; 4792 } 4793 return $count; 4794 } 4795 4796 sub delete_range($self, $start, $end) { # Delete a range 4797 return $self->_add_delete('-', $start, $end, ""); 4798 } 4799 4800 sub is_empty($self) { # Returns boolean as to if a range list is empty 4801 no overloading; 4802 return scalar @{$ranges{pack 'J', $self}} == 0; 4803 } 4804 4805 sub hash($self) { 4806 # Quickly returns a scalar suitable for separating tables into 4807 # buckets, i.e. it is a hash function of the contents of a table, so 4808 # there are relatively few conflicts. 4809 my $addr = do { no overloading; pack 'J', $self; }; 4810 4811 # These are quickly computable. Return looks like 'min..max;count' 4812 return $self->min . "..$max{$addr};" . scalar @{$ranges{$addr}}; 4813 } 4814} # End closure for _Range_List_Base 4815 4816package Range_List; 4817use parent '-norequire', '_Range_List_Base'; 4818 4819use warnings; 4820use strict; 4821 4822use feature 'signatures'; 4823no warnings 'experimental::signatures'; 4824 4825# A Range_List is a range list for match tables; i.e. the range values are 4826# not significant. Thus a number of operations can be safely added to it, 4827# such as inversion, intersection. Note that union is also an unsafe 4828# operation when range values are cared about, and that method is in the base 4829# class, not here. But things are set up so that that method is callable only 4830# during initialization. Only in this derived class, is there an operation 4831# that combines two tables. A Range_Map can thus be used to initialize a 4832# Range_List, and its mappings will be in the list, but are not significant to 4833# this class. 4834 4835sub trace { return main::trace(@_); } 4836 4837{ # Closure 4838 4839 use overload 4840 fallback => 0, 4841 '+' => sub { my $self = shift; 4842 my $other = shift; 4843 4844 return $self->_union($other) 4845 }, 4846 '+=' => sub { my $self = shift; 4847 my $other = shift; 4848 my $reversed = shift; 4849 4850 if ($reversed) { 4851 Carp::my_carp_bug("Bad news. Can't cope with '" 4852 . ref($other) 4853 . ' += ' 4854 . ref($self) 4855 . "'. undef returned."); 4856 return; 4857 } 4858 4859 return $self->_union($other) 4860 }, 4861 '&' => sub { my $self = shift; 4862 my $other = shift; 4863 4864 return $self->_intersect($other, 0); 4865 }, 4866 '&=' => sub { my $self = shift; 4867 my $other = shift; 4868 my $reversed = shift; 4869 4870 if ($reversed) { 4871 Carp::my_carp_bug("Bad news. Can't cope with '" 4872 . ref($other) 4873 . ' &= ' 4874 . ref($self) 4875 . "'. undef returned."); 4876 return; 4877 } 4878 4879 return $self->_intersect($other, 0); 4880 }, 4881 '~' => "_invert", 4882 '-' => "_subtract", 4883 ; 4884 4885 sub _invert($self, @) { 4886 # Returns a new Range_List that gives all code points not in $self. 4887 my $new = Range_List->new; 4888 4889 # Go through each range in the table, finding the gaps between them 4890 my $max = -1; # Set so no gap before range beginning at 0 4891 for my $range ($self->ranges) { 4892 my $start = $range->start; 4893 my $end = $range->end; 4894 4895 # If there is a gap before this range, the inverse will contain 4896 # that gap. 4897 if ($start > $max + 1) { 4898 $new->add_range($max + 1, $start - 1); 4899 } 4900 $max = $end; 4901 } 4902 4903 # And finally, add the gap from the end of the table to the max 4904 # possible code point 4905 if ($max < $MAX_WORKING_CODEPOINT) { 4906 $new->add_range($max + 1, $MAX_WORKING_CODEPOINT); 4907 } 4908 return $new; 4909 } 4910 4911 sub _subtract($self, $other, $reversed=0) { 4912 # Returns a new Range_List with the argument deleted from it. The 4913 # argument can be a single code point, a range, or something that has 4914 # a range, with the _range_list() method on it returning them 4915 4916 if ($reversed) { 4917 Carp::my_carp_bug("Bad news. Can't cope with '" 4918 . ref($other) 4919 . ' - ' 4920 . ref($self) 4921 . "'. undef returned."); 4922 return; 4923 } 4924 4925 my $new = Range_List->new(Initialize => $self); 4926 4927 if (! ref $other) { # Single code point 4928 $new->delete_range($other, $other); 4929 } 4930 elsif ($other->isa('Range')) { 4931 $new->delete_range($other->start, $other->end); 4932 } 4933 elsif ($other->can('_range_list')) { 4934 foreach my $range ($other->_range_list->ranges) { 4935 $new->delete_range($range->start, $range->end); 4936 } 4937 } 4938 else { 4939 Carp::my_carp_bug("Can't cope with a " 4940 . ref($other) 4941 . " argument to '-'. Subtraction ignored." 4942 ); 4943 return $self; 4944 } 4945 4946 return $new; 4947 } 4948 4949 sub _intersect($a_object, $b_object, $check_if_overlapping=0) { 4950 # Returns either a boolean giving whether the two inputs' range lists 4951 # intersect (overlap), or a new Range_List containing the intersection 4952 # of the two lists. The optional final parameter being true indicates 4953 # to do the check instead of the intersection. 4954 4955 if (! defined $b_object) { 4956 my $message = ""; 4957 $message .= $a_object->_owner_name_of if defined $a_object; 4958 Carp::my_carp_bug($message .= "Called with undefined value. Intersection not done."); 4959 return; 4960 } 4961 4962 # a & b = !(!a | !b), or in our terminology = ~ ( ~a + -b ) 4963 # Thus the intersection could be much more simply be written: 4964 # return ~(~$a_object + ~$b_object); 4965 # But, this is slower, and when taking the inverse of a large 4966 # range_size_1 table, back when such tables were always stored that 4967 # way, it became prohibitively slow, hence the code was changed to the 4968 # below 4969 4970 if ($b_object->isa('Range')) { 4971 $b_object = Range_List->new(Initialize => $b_object, 4972 Owner => $a_object->_owner_name_of); 4973 } 4974 $b_object = $b_object->_range_list if $b_object->can('_range_list'); 4975 4976 my @a_ranges = $a_object->ranges; 4977 my @b_ranges = $b_object->ranges; 4978 4979 #local $to_trace = 1 if main::DEBUG; 4980 trace "intersecting $a_object with ", scalar @a_ranges, "ranges and $b_object with", scalar @b_ranges, " ranges" if main::DEBUG && $to_trace; 4981 4982 # Start with the first range in each list 4983 my $a_i = 0; 4984 my $range_a = $a_ranges[$a_i]; 4985 my $b_i = 0; 4986 my $range_b = $b_ranges[$b_i]; 4987 4988 my $new = __PACKAGE__->new(Owner => $a_object->_owner_name_of) 4989 if ! $check_if_overlapping; 4990 4991 # If either list is empty, there is no intersection and no overlap 4992 if (! defined $range_a || ! defined $range_b) { 4993 return $check_if_overlapping ? 0 : $new; 4994 } 4995 trace "range_a[$a_i]=$range_a; range_b[$b_i]=$range_b" if main::DEBUG && $to_trace; 4996 4997 # Otherwise, must calculate the intersection/overlap. Start with the 4998 # very first code point in each list 4999 my $a = $range_a->start; 5000 my $b = $range_b->start; 5001 5002 # Loop through all the ranges of each list; in each iteration, $a and 5003 # $b are the current code points in their respective lists 5004 while (1) { 5005 5006 # If $a and $b are the same code point, ... 5007 if ($a == $b) { 5008 5009 # it means the lists overlap. If just checking for overlap 5010 # know the answer now, 5011 return 1 if $check_if_overlapping; 5012 5013 # The intersection includes this code point plus anything else 5014 # common to both current ranges. 5015 my $start = $a; 5016 my $end = main::min($range_a->end, $range_b->end); 5017 if (! $check_if_overlapping) { 5018 trace "adding intersection range ", sprintf("%04X", $start) . ".." . sprintf("%04X", $end) if main::DEBUG && $to_trace; 5019 $new->add_range($start, $end); 5020 } 5021 5022 # Skip ahead to the end of the current intersect 5023 $a = $b = $end; 5024 5025 # If the current intersect ends at the end of either range (as 5026 # it must for at least one of them), the next possible one 5027 # will be the beginning code point in it's list's next range. 5028 if ($a == $range_a->end) { 5029 $range_a = $a_ranges[++$a_i]; 5030 last unless defined $range_a; 5031 $a = $range_a->start; 5032 } 5033 if ($b == $range_b->end) { 5034 $range_b = $b_ranges[++$b_i]; 5035 last unless defined $range_b; 5036 $b = $range_b->start; 5037 } 5038 5039 trace "range_a[$a_i]=$range_a; range_b[$b_i]=$range_b" if main::DEBUG && $to_trace; 5040 } 5041 elsif ($a < $b) { 5042 5043 # Not equal, but if the range containing $a encompasses $b, 5044 # change $a to be the middle of the range where it does equal 5045 # $b, so the next iteration will get the intersection 5046 if ($range_a->end >= $b) { 5047 $a = $b; 5048 } 5049 else { 5050 5051 # Here, the current range containing $a is entirely below 5052 # $b. Go try to find a range that could contain $b. 5053 $a_i = $a_object->_search_ranges($b); 5054 5055 # If no range found, quit. 5056 last unless defined $a_i; 5057 5058 # The search returns $a_i, such that 5059 # range_a[$a_i-1]->end < $b <= range_a[$a_i]->end 5060 # Set $a to the beginning of this new range, and repeat. 5061 $range_a = $a_ranges[$a_i]; 5062 $a = $range_a->start; 5063 } 5064 } 5065 else { # Here, $b < $a. 5066 5067 # Mirror image code to the leg just above 5068 if ($range_b->end >= $a) { 5069 $b = $a; 5070 } 5071 else { 5072 $b_i = $b_object->_search_ranges($a); 5073 last unless defined $b_i; 5074 $range_b = $b_ranges[$b_i]; 5075 $b = $range_b->start; 5076 } 5077 } 5078 } # End of looping through ranges. 5079 5080 # Intersection fully computed, or now know that there is no overlap 5081 return $check_if_overlapping ? 0 : $new; 5082 } 5083 5084 sub overlaps($self, $other) { 5085 # Returns boolean giving whether the two arguments overlap somewhere 5086 return $self->_intersect($other, 1); 5087 } 5088 5089 sub add_range($self, $start, $end) { 5090 # Add a range to the list. 5091 return $self->_add_delete('+', $start, $end, ""); 5092 } 5093 5094 sub matches_identically_to($self, $other) { 5095 # Return a boolean as to whether or not two Range_Lists match identical 5096 # sets of code points. 5097 # These are ordered in increasing real time to figure out (at least 5098 # until a patch changes that and doesn't change this) 5099 return 0 if $self->max != $other->max; 5100 return 0 if $self->min != $other->min; 5101 return 0 if $self->range_count != $other->range_count; 5102 return 0 if $self->count != $other->count; 5103 5104 # Here they could be identical because all the tests above passed. 5105 # The loop below is somewhat simpler since we know they have the same 5106 # number of elements. Compare range by range, until reach the end or 5107 # find something that differs. 5108 my @a_ranges = $self->ranges; 5109 my @b_ranges = $other->ranges; 5110 for my $i (0 .. @a_ranges - 1) { 5111 my $a = $a_ranges[$i]; 5112 my $b = $b_ranges[$i]; 5113 trace "self $a; other $b" if main::DEBUG && $to_trace; 5114 return 0 if ! defined $b 5115 || $a->start != $b->start 5116 || $a->end != $b->end; 5117 } 5118 return 1; 5119 } 5120 5121 sub is_code_point_usable($code, $try_hard) { 5122 # This used only for making the test script. See if the input 5123 # proposed trial code point is one that Perl will handle. If second 5124 # parameter is 0, it won't select some code points for various 5125 # reasons, noted below. 5126 return 0 if $code < 0; # Never use a negative 5127 5128 # shun null. I'm (khw) not sure why this was done, but NULL would be 5129 # the character very frequently used. 5130 return $try_hard if $code == 0x0000; 5131 5132 # shun non-character code points. 5133 return $try_hard if $code >= 0xFDD0 && $code <= 0xFDEF; 5134 return $try_hard if ($code & 0xFFFE) == 0xFFFE; # includes FFFF 5135 5136 return $try_hard if $code > $MAX_UNICODE_CODEPOINT; # keep in range 5137 return $try_hard if $code >= 0xD800 && $code <= 0xDFFF; # no surrogate 5138 5139 return 1; 5140 } 5141 5142 sub get_valid_code_point($self) { 5143 # Return a code point that's part of the range list. Returns nothing 5144 # if the table is empty or we can't find a suitable code point. This 5145 # used only for making the test script. 5146 my $addr = do { no overloading; pack 'J', $self; }; 5147 5148 # On first pass, don't choose less desirable code points; if no good 5149 # one is found, repeat, allowing a less desirable one to be selected. 5150 for my $try_hard (0, 1) { 5151 5152 # Look through all the ranges for a usable code point. 5153 for my $set (reverse $self->ranges) { 5154 5155 # Try the edge cases first, starting with the end point of the 5156 # range. 5157 my $end = $set->end; 5158 return $end if is_code_point_usable($end, $try_hard); 5159 $end = $MAX_UNICODE_CODEPOINT + 1 if $end > $MAX_UNICODE_CODEPOINT; 5160 5161 # End point didn't, work. Start at the beginning and try 5162 # every one until find one that does work. 5163 for my $trial ($set->start .. $end - 1) { 5164 return $trial if is_code_point_usable($trial, $try_hard); 5165 } 5166 } 5167 } 5168 return (); # If none found, give up. 5169 } 5170 5171 sub get_invalid_code_point($self) { 5172 # Return a code point that's not part of the table. Returns nothing 5173 # if the table covers all code points or a suitable code point can't 5174 # be found. This used only for making the test script. 5175 5176 # Just find a valid code point of the inverse, if any. 5177 return Range_List->new(Initialize => ~ $self)->get_valid_code_point; 5178 } 5179} # end closure for Range_List 5180 5181package Range_Map; 5182use parent '-norequire', '_Range_List_Base'; 5183 5184use strict; 5185use warnings; 5186 5187use feature 'signatures'; 5188no warnings 'experimental::signatures'; 5189 5190# A Range_Map is a range list in which the range values (called maps) are 5191# significant, and hence shouldn't be manipulated by our other code, which 5192# could be ambiguous or lose things. For example, in taking the union of two 5193# lists, which share code points, but which have differing values, which one 5194# has precedence in the union? 5195# It turns out that these operations aren't really necessary for map tables, 5196# and so this class was created to make sure they aren't accidentally 5197# applied to them. 5198 5199{ # Closure 5200 5201 sub add_map($self, @add) { 5202 # Add a range containing a mapping value to the list 5203 return $self->_add_delete('+', @add); 5204 } 5205 5206 sub replace_map($self, @list) { 5207 # Replace a range 5208 return $self->_add_delete('+', @list, Replace => $UNCONDITIONALLY); 5209 } 5210 5211 sub add_duplicate { 5212 # Adds entry to a range list which can duplicate an existing entry 5213 5214 my $self = shift; 5215 my $code_point = shift; 5216 my $value = shift; 5217 my %args = @_; 5218 my $replace = delete $args{'Replace'} // $MULTIPLE_BEFORE; 5219 Carp::carp_extra_args(\%args) if main::DEBUG && %args; 5220 5221 return $self->add_map($code_point, $code_point, 5222 $value, Replace => $replace); 5223 } 5224} # End of closure for package Range_Map 5225 5226package _Base_Table; 5227 5228use strict; 5229use warnings; 5230 5231use feature 'signatures'; 5232no warnings 'experimental::signatures'; 5233 5234# A table is the basic data structure that gets written out into a file for 5235# use by the Perl core. This is the abstract base class implementing the 5236# common elements from the derived ones. A list of the methods to be 5237# furnished by an implementing class is just after the constructor. 5238 5239sub standardize { return main::standardize($_[0]); } 5240sub trace { return main::trace(@_); } 5241 5242{ # Closure 5243 5244 main::setup_package(); 5245 5246 my %range_list; 5247 # Object containing the ranges of the table. 5248 main::set_access('range_list', \%range_list, 'p_r', 'p_s'); 5249 5250 my %full_name; 5251 # The full table name. 5252 main::set_access('full_name', \%full_name, 'r'); 5253 5254 my %name; 5255 # The table name, almost always shorter 5256 main::set_access('name', \%name, 'r'); 5257 5258 my %short_name; 5259 # The shortest of all the aliases for this table, with underscores removed 5260 main::set_access('short_name', \%short_name); 5261 5262 my %nominal_short_name_length; 5263 # The length of short_name before removing underscores 5264 main::set_access('nominal_short_name_length', 5265 \%nominal_short_name_length); 5266 5267 my %complete_name; 5268 # The complete name, including property. 5269 main::set_access('complete_name', \%complete_name, 'r'); 5270 5271 my %property; 5272 # Parent property this table is attached to. 5273 main::set_access('property', \%property, 'r'); 5274 5275 my %aliases; 5276 # Ordered list of alias objects of the table's name. The first ones in 5277 # the list are output first in comments 5278 main::set_access('aliases', \%aliases, 'readable_array'); 5279 5280 my %comment; 5281 # A comment associated with the table for human readers of the files 5282 main::set_access('comment', \%comment, 's'); 5283 5284 my %description; 5285 # A comment giving a short description of the table's meaning for human 5286 # readers of the files. 5287 main::set_access('description', \%description, 'readable_array'); 5288 5289 my %note; 5290 # A comment giving a short note about the table for human readers of the 5291 # files. 5292 main::set_access('note', \%note, 'readable_array'); 5293 5294 my %fate; 5295 # Enum; there are a number of possibilities for what happens to this 5296 # table: it could be normal, or suppressed, or not for external use. See 5297 # values at definition for $SUPPRESSED. 5298 main::set_access('fate', \%fate, 'r'); 5299 5300 my %find_table_from_alias; 5301 # The parent property passes this pointer to a hash which this class adds 5302 # all its aliases to, so that the parent can quickly take an alias and 5303 # find this table. 5304 main::set_access('find_table_from_alias', \%find_table_from_alias, 'p_r'); 5305 5306 my %locked; 5307 # After this table is made equivalent to another one; we shouldn't go 5308 # changing the contents because that could mean it's no longer equivalent 5309 main::set_access('locked', \%locked, 'r'); 5310 5311 my %file_path; 5312 # This gives the final path to the file containing the table. Each 5313 # directory in the path is an element in the array 5314 main::set_access('file_path', \%file_path, 'readable_array'); 5315 5316 my %status; 5317 # What is the table's status, normal, $OBSOLETE, etc. Enum 5318 main::set_access('status', \%status, 'r'); 5319 5320 my %status_info; 5321 # A comment about its being obsolete, or whatever non normal status it has 5322 main::set_access('status_info', \%status_info, 'r'); 5323 5324 my %caseless_equivalent; 5325 # The table this is equivalent to under /i matching, if any. 5326 main::set_access('caseless_equivalent', \%caseless_equivalent, 'r', 's'); 5327 5328 my %range_size_1; 5329 # Is the table to be output with each range only a single code point? 5330 # This is done to avoid breaking existing code that may have come to rely 5331 # on this behavior in previous versions of this program.) 5332 main::set_access('range_size_1', \%range_size_1, 'r', 's'); 5333 5334 my %perl_extension; 5335 # A boolean set iff this table is a Perl extension to the Unicode 5336 # standard. 5337 main::set_access('perl_extension', \%perl_extension, 'r'); 5338 5339 my %output_range_counts; 5340 # A boolean set iff this table is to have comments written in the 5341 # output file that contain the number of code points in the range. 5342 # The constructor can override the global flag of the same name. 5343 main::set_access('output_range_counts', \%output_range_counts, 'r'); 5344 5345 my %write_as_invlist; 5346 # A boolean set iff the output file for this table is to be in the form of 5347 # an inversion list/map. 5348 main::set_access('write_as_invlist', \%write_as_invlist, 'r'); 5349 5350 my %format; 5351 # The format of the entries of the table. This is calculated from the 5352 # data in the table (or passed in the constructor). This is an enum e.g., 5353 # $STRING_FORMAT. It is marked protected as it should not be generally 5354 # used to override calculations. 5355 main::set_access('format', \%format, 'r', 'p_s'); 5356 5357 my %has_dependency; 5358 # A boolean that gives whether some other table in this property is 5359 # defined as the complement of this table. This is a crude, but currently 5360 # sufficient, mechanism to make this table not get destroyed before what 5361 # is dependent on it is. Other dependencies could be added, so the name 5362 # was chosen to reflect a more general situation than actually is 5363 # currently the case. 5364 main::set_access('has_dependency', \%has_dependency, 'r', 's'); 5365 5366 sub new { 5367 # All arguments are key => value pairs, which you can see below, most 5368 # of which match fields documented above. Otherwise: Re_Pod_Entry, 5369 # OK_as_Filename, and Fuzzy apply to the names of the table, and are 5370 # documented in the Alias package 5371 5372 return Carp::carp_too_few_args(\@_, 2) if main::DEBUG && @_ < 2; 5373 5374 my $class = shift; 5375 5376 my $self = bless \do { my $anonymous_scalar }, $class; 5377 my $addr = do { no overloading; pack 'J', $self; }; 5378 5379 my %args = @_; 5380 5381 $name{$addr} = delete $args{'Name'}; 5382 $find_table_from_alias{$addr} = delete $args{'_Alias_Hash'}; 5383 $full_name{$addr} = delete $args{'Full_Name'}; 5384 my $complete_name = $complete_name{$addr} 5385 = delete $args{'Complete_Name'}; 5386 $format{$addr} = delete $args{'Format'}; 5387 $output_range_counts{$addr} = delete $args{'Output_Range_Counts'}; 5388 $property{$addr} = delete $args{'_Property'}; 5389 $range_list{$addr} = delete $args{'_Range_List'}; 5390 $status{$addr} = delete $args{'Status'} || $NORMAL; 5391 $status_info{$addr} = delete $args{'_Status_Info'} || ""; 5392 $range_size_1{$addr} = delete $args{'Range_Size_1'} || 0; 5393 $caseless_equivalent{$addr} = delete $args{'Caseless_Equivalent'} || 0; 5394 $fate{$addr} = delete $args{'Fate'} || $ORDINARY; 5395 $write_as_invlist{$addr} = delete $args{'Write_As_Invlist'};# No default 5396 my $ucd = delete $args{'UCD'}; 5397 5398 my $description = delete $args{'Description'}; 5399 my $ok_as_filename = delete $args{'OK_as_Filename'}; 5400 my $loose_match = delete $args{'Fuzzy'}; 5401 my $note = delete $args{'Note'}; 5402 my $make_re_pod_entry = delete $args{'Re_Pod_Entry'}; 5403 my $perl_extension = delete $args{'Perl_Extension'}; 5404 my $suppression_reason = delete $args{'Suppression_Reason'}; 5405 5406 # Shouldn't have any left over 5407 Carp::carp_extra_args(\%args) if main::DEBUG && %args; 5408 5409 # Can't use || above because conceivably the name could be 0, and 5410 # can't use // operator in case this program gets used in Perl 5.8 5411 $full_name{$addr} = $name{$addr} if ! defined $full_name{$addr}; 5412 $output_range_counts{$addr} = $output_range_counts if 5413 ! defined $output_range_counts{$addr}; 5414 5415 $aliases{$addr} = [ ]; 5416 $comment{$addr} = [ ]; 5417 $description{$addr} = [ ]; 5418 $note{$addr} = [ ]; 5419 $file_path{$addr} = [ ]; 5420 $locked{$addr} = ""; 5421 $has_dependency{$addr} = 0; 5422 5423 push @{$description{$addr}}, $description if $description; 5424 push @{$note{$addr}}, $note if $note; 5425 5426 if ($fate{$addr} == $PLACEHOLDER) { 5427 5428 # A placeholder table doesn't get documented, is a perl extension, 5429 # and quite likely will be empty 5430 $make_re_pod_entry = 0 if ! defined $make_re_pod_entry; 5431 $perl_extension = 1 if ! defined $perl_extension; 5432 $ucd = 0 if ! defined $ucd; 5433 push @tables_that_may_be_empty, $complete_name{$addr}; 5434 $self->add_comment(<<END); 5435This is a placeholder because it is not in Version $string_version of Unicode, 5436but is needed by the Perl core to work gracefully. Because it is not in this 5437version of Unicode, it will not be listed in $pod_file.pod 5438END 5439 } 5440 elsif (exists $why_suppressed{$complete_name} 5441 # Don't suppress if overridden 5442 && ! grep { $_ eq $complete_name{$addr} } 5443 @output_mapped_properties) 5444 { 5445 $fate{$addr} = $SUPPRESSED; 5446 } 5447 elsif ($fate{$addr} == $SUPPRESSED) { 5448 Carp::my_carp_bug("Need reason for suppressing") unless $suppression_reason; 5449 # Though currently unused 5450 } 5451 elsif ($suppression_reason) { 5452 Carp::my_carp_bug("A reason was given for suppressing, but not suppressed"); 5453 } 5454 5455 # If hasn't set its status already, see if it is on one of the 5456 # lists of properties or tables that have particular statuses; if 5457 # not, is normal. The lists are prioritized so the most serious 5458 # ones are checked first 5459 if (! $status{$addr}) { 5460 if (exists $why_deprecated{$complete_name}) { 5461 $status{$addr} = $DEPRECATED; 5462 } 5463 elsif (exists $why_stabilized{$complete_name}) { 5464 $status{$addr} = $STABILIZED; 5465 } 5466 elsif (exists $why_obsolete{$complete_name}) { 5467 $status{$addr} = $OBSOLETE; 5468 } 5469 5470 # Existence above doesn't necessarily mean there is a message 5471 # associated with it. Use the most serious message. 5472 if ($status{$addr}) { 5473 if ($why_deprecated{$complete_name}) { 5474 $status_info{$addr} 5475 = $why_deprecated{$complete_name}; 5476 } 5477 elsif ($why_stabilized{$complete_name}) { 5478 $status_info{$addr} 5479 = $why_stabilized{$complete_name}; 5480 } 5481 elsif ($why_obsolete{$complete_name}) { 5482 $status_info{$addr} 5483 = $why_obsolete{$complete_name}; 5484 } 5485 } 5486 } 5487 5488 $perl_extension{$addr} = $perl_extension || 0; 5489 5490 # Don't list a property by default that is internal only 5491 if ($fate{$addr} > $MAP_PROXIED) { 5492 $make_re_pod_entry = 0 if ! defined $make_re_pod_entry; 5493 $ucd = 0 if ! defined $ucd; 5494 } 5495 else { 5496 $ucd = 1 if ! defined $ucd; 5497 } 5498 5499 # By convention what typically gets printed only or first is what's 5500 # first in the list, so put the full name there for good output 5501 # clarity. Other routines rely on the full name being first on the 5502 # list 5503 $self->add_alias($full_name{$addr}, 5504 OK_as_Filename => $ok_as_filename, 5505 Fuzzy => $loose_match, 5506 Re_Pod_Entry => $make_re_pod_entry, 5507 Status => $status{$addr}, 5508 UCD => $ucd, 5509 ); 5510 5511 # Then comes the other name, if meaningfully different. 5512 if (standardize($full_name{$addr}) ne standardize($name{$addr})) { 5513 $self->add_alias($name{$addr}, 5514 OK_as_Filename => $ok_as_filename, 5515 Fuzzy => $loose_match, 5516 Re_Pod_Entry => $make_re_pod_entry, 5517 Status => $status{$addr}, 5518 UCD => $ucd, 5519 ); 5520 } 5521 5522 return $self; 5523 } 5524 5525 # Here are the methods that are required to be defined by any derived 5526 # class 5527 for my $sub (qw( 5528 handle_special_range 5529 append_to_body 5530 pre_body 5531 )) 5532 # write() knows how to write out normal ranges, but it calls 5533 # handle_special_range() when it encounters a non-normal one. 5534 # append_to_body() is called by it after it has handled all 5535 # ranges to add anything after the main portion of the table. 5536 # And finally, pre_body() is called after all this to build up 5537 # anything that should appear before the main portion of the 5538 # table. Doing it this way allows things in the middle to 5539 # affect what should appear before the main portion of the 5540 # table. 5541 { 5542 no strict "refs"; 5543 *$sub = sub { 5544 Carp::my_carp_bug( __LINE__ 5545 . ": Must create method '$sub()' for " 5546 . ref shift); 5547 return; 5548 } 5549 } 5550 5551 use overload 5552 fallback => 0, 5553 "." => \&main::_operator_dot, 5554 ".=" => \&main::_operator_dot_equal, 5555 '!=' => \&main::_operator_not_equal, 5556 '==' => \&main::_operator_equal, 5557 ; 5558 5559 sub ranges { 5560 # Returns the array of ranges associated with this table. 5561 5562 no overloading; 5563 return $range_list{pack 'J', shift}->ranges; 5564 } 5565 5566 sub add_alias { 5567 # Add a synonym for this table. 5568 5569 return Carp::carp_too_few_args(\@_, 3) if main::DEBUG && @_ < 3; 5570 5571 my $self = shift; 5572 my $name = shift; # The name to add. 5573 my $pointer = shift; # What the alias hash should point to. For 5574 # map tables, this is the parent property; 5575 # for match tables, it is the table itself. 5576 5577 my %args = @_; 5578 my $loose_match = delete $args{'Fuzzy'}; 5579 5580 my $ok_as_filename = delete $args{'OK_as_Filename'}; 5581 $ok_as_filename = 1 unless defined $ok_as_filename; 5582 5583 # An internal name does not get documented, unless overridden by the 5584 # input; same for making tests for it. 5585 my $status = delete $args{'Status'} || (($name =~ /^_/) 5586 ? $INTERNAL_ALIAS 5587 : $NORMAL); 5588 my $make_re_pod_entry = delete $args{'Re_Pod_Entry'} 5589 // (($status ne $INTERNAL_ALIAS) 5590 ? (($name =~ /^_/) ? $NO : $YES) 5591 : $NO); 5592 my $ucd = delete $args{'UCD'} // (($name =~ /^_/) ? 0 : 1); 5593 5594 Carp::carp_extra_args(\%args) if main::DEBUG && %args; 5595 5596 # Capitalize the first letter of the alias unless it is one of the CJK 5597 # ones which specifically begins with a lower 'k'. Do this because 5598 # Unicode has varied whether they capitalize first letters or not, and 5599 # have later changed their minds and capitalized them, but not the 5600 # other way around. So do it always and avoid changes from release to 5601 # release 5602 $name = ucfirst($name) unless $name =~ /^k[A-Z]/; 5603 5604 my $addr = do { no overloading; pack 'J', $self; }; 5605 5606 # Figure out if should be loosely matched if not already specified. 5607 if (! defined $loose_match) { 5608 5609 # Is a loose_match if isn't null, and doesn't begin with an 5610 # underscore and isn't just a number 5611 if ($name ne "" 5612 && substr($name, 0, 1) ne '_' 5613 && $name !~ qr{^[0-9_.+-/]+$}) 5614 { 5615 $loose_match = 1; 5616 } 5617 else { 5618 $loose_match = 0; 5619 } 5620 } 5621 5622 # If this alias has already been defined, do nothing. 5623 return if defined $find_table_from_alias{$addr}->{$name}; 5624 5625 # That includes if it is standardly equivalent to an existing alias, 5626 # in which case, add this name to the list, so won't have to search 5627 # for it again. 5628 my $standard_name = main::standardize($name); 5629 if (defined $find_table_from_alias{$addr}->{$standard_name}) { 5630 $find_table_from_alias{$addr}->{$name} 5631 = $find_table_from_alias{$addr}->{$standard_name}; 5632 return; 5633 } 5634 5635 # Set the index hash for this alias for future quick reference. 5636 $find_table_from_alias{$addr}->{$name} = $pointer; 5637 $find_table_from_alias{$addr}->{$standard_name} = $pointer; 5638 local $to_trace = 0 if main::DEBUG; 5639 trace "adding alias $name to $pointer" if main::DEBUG && $to_trace; 5640 trace "adding alias $standard_name to $pointer" if main::DEBUG && $to_trace; 5641 5642 5643 # Put the new alias at the end of the list of aliases unless the final 5644 # element begins with an underscore (meaning it is for internal perl 5645 # use) or is all numeric, in which case, put the new one before that 5646 # one. This floats any all-numeric or underscore-beginning aliases to 5647 # the end. This is done so that they are listed last in output lists, 5648 # to encourage the user to use a better name (either more descriptive 5649 # or not an internal-only one) instead. This ordering is relied on 5650 # implicitly elsewhere in this program, like in short_name() 5651 my $list = $aliases{$addr}; 5652 my $insert_position = (@$list == 0 5653 || (substr($list->[-1]->name, 0, 1) ne '_' 5654 && $list->[-1]->name =~ /\D/)) 5655 ? @$list 5656 : @$list - 1; 5657 splice @$list, 5658 $insert_position, 5659 0, 5660 Alias->new($name, $loose_match, $make_re_pod_entry, 5661 $ok_as_filename, $status, $ucd); 5662 5663 # This name may be shorter than any existing ones, so clear the cache 5664 # of the shortest, so will have to be recalculated. 5665 no overloading; 5666 undef $short_name{pack 'J', $self}; 5667 return; 5668 } 5669 5670 sub short_name($self, $nominal_length_ptr=undef) { 5671 # Returns a name suitable for use as the base part of a file name. 5672 # That is, shorter wins. It can return undef if there is no suitable 5673 # name. The name has all non-essential underscores removed. 5674 5675 # The optional second parameter is a reference to a scalar in which 5676 # this routine will store the length the returned name had before the 5677 # underscores were removed, or undef if the return is undef. 5678 5679 # The shortest name can change if new aliases are added. So using 5680 # this should be deferred until after all these are added. The code 5681 # that does that should clear this one's cache. 5682 # Any name with alphabetics is preferred over an all numeric one, even 5683 # if longer. 5684 5685 my $addr = do { no overloading; pack 'J', $self; }; 5686 5687 # For efficiency, don't recalculate, but this means that adding new 5688 # aliases could change what the shortest is, so the code that does 5689 # that needs to undef this. 5690 if (defined $short_name{$addr}) { 5691 if ($nominal_length_ptr) { 5692 $$nominal_length_ptr = $nominal_short_name_length{$addr}; 5693 } 5694 return $short_name{$addr}; 5695 } 5696 5697 # Look at each alias 5698 my $is_last_resort = 0; 5699 my $deprecated_or_discouraged 5700 = qr/ ^ (?: $DEPRECATED | $DISCOURAGED ) $/x; 5701 foreach my $alias ($self->aliases()) { 5702 5703 # Don't use an alias that isn't ok to use for an external name. 5704 next if ! $alias->ok_as_filename; 5705 5706 my $name = main::Standardize($alias->name); 5707 trace $self, $name if main::DEBUG && $to_trace; 5708 5709 # Take the first one, or any non-deprecated non-discouraged one 5710 # over one that is, or a shorter one that isn't numeric. This 5711 # relies on numeric aliases always being last in the array 5712 # returned by aliases(). Any alpha one will have precedence. 5713 if ( ! defined $short_name{$addr} 5714 || ( $is_last_resort 5715 && $alias->status !~ $deprecated_or_discouraged) 5716 || ($name =~ /\D/ 5717 && length($name) < length($short_name{$addr}))) 5718 { 5719 # Remove interior underscores. 5720 ($short_name{$addr} = $name) =~ s/ (?<= . ) _ (?= . ) //xg; 5721 5722 $nominal_short_name_length{$addr} = length $name; 5723 $is_last_resort = $alias->status =~ $deprecated_or_discouraged; 5724 } 5725 } 5726 5727 # If the short name isn't a nice one, perhaps an equivalent table has 5728 # a better one. 5729 if ( $self->can('children') 5730 && ( ! defined $short_name{$addr} 5731 || $short_name{$addr} eq "" 5732 || $short_name{$addr} eq "_")) 5733 { 5734 my $return; 5735 foreach my $follower ($self->children) { # All equivalents 5736 my $follower_name = $follower->short_name; 5737 next unless defined $follower_name; 5738 5739 # Anything (except undefined) is better than underscore or 5740 # empty 5741 if (! defined $return || $return eq "_") { 5742 $return = $follower_name; 5743 next; 5744 } 5745 5746 # If the new follower name isn't "_" and is shorter than the 5747 # current best one, prefer the new one. 5748 next if $follower_name eq "_"; 5749 next if length $follower_name > length $return; 5750 $return = $follower_name; 5751 } 5752 $short_name{$addr} = $return if defined $return; 5753 } 5754 5755 # If no suitable external name return undef 5756 if (! defined $short_name{$addr}) { 5757 $$nominal_length_ptr = undef if $nominal_length_ptr; 5758 return; 5759 } 5760 5761 # Don't allow a null short name. 5762 if ($short_name{$addr} eq "") { 5763 $short_name{$addr} = '_'; 5764 $nominal_short_name_length{$addr} = 1; 5765 } 5766 5767 trace $self, $short_name{$addr} if main::DEBUG && $to_trace; 5768 5769 if ($nominal_length_ptr) { 5770 $$nominal_length_ptr = $nominal_short_name_length{$addr}; 5771 } 5772 return $short_name{$addr}; 5773 } 5774 5775 sub external_name($self) { 5776 # Returns the external name that this table should be known by. This 5777 # is usually the short_name, but not if the short_name is undefined, 5778 # in which case the external_name is arbitrarily set to the 5779 # underscore. 5780 5781 my $short = $self->short_name; 5782 return $short if defined $short; 5783 5784 return '_'; 5785 } 5786 5787 sub add_description($self, $description) { # Adds the parameter as a short description. 5788 no overloading; 5789 push @{$description{pack 'J', $self}}, $description; 5790 5791 return; 5792 } 5793 5794 sub add_note($self, $note) { # Adds the parameter as a short note. 5795 no overloading; 5796 push @{$note{pack 'J', $self}}, $note; 5797 5798 return; 5799 } 5800 5801 sub add_comment($self, $comment) { # Adds the parameter as a comment. 5802 5803 return unless $debugging_build; 5804 5805 chomp $comment; 5806 5807 no overloading; 5808 push @{$comment{pack 'J', $self}}, $comment; 5809 5810 return; 5811 } 5812 5813 sub comment($self) { 5814 # Return the current comment for this table. If called in list 5815 # context, returns the array of comments. In scalar, returns a string 5816 # of each element joined together with a period ending each. 5817 5818 my $addr = do { no overloading; pack 'J', $self; }; 5819 my @list = @{$comment{$addr}}; 5820 return @list if wantarray; 5821 my $return = ""; 5822 foreach my $sentence (@list) { 5823 $return .= '. ' if $return; 5824 $return .= $sentence; 5825 $return =~ s/\.$//; 5826 } 5827 $return .= '.' if $return; 5828 return $return; 5829 } 5830 5831 sub initialize($self, $initialization) { 5832 # Initialize the table with the argument which is any valid 5833 # initialization for range lists. 5834 5835 my $addr = do { no overloading; pack 'J', $self; }; 5836 5837 # Replace the current range list with a new one of the same exact 5838 # type. 5839 my $class = ref $range_list{$addr}; 5840 $range_list{$addr} = $class->new(Owner => $self, 5841 Initialize => $initialization); 5842 return; 5843 5844 } 5845 5846 sub header($self) { 5847 # The header that is output for the table in the file it is written 5848 # in. 5849 my $return = ""; 5850 $return .= $DEVELOPMENT_ONLY if $compare_versions; 5851 $return .= $HEADER; 5852 return $return; 5853 } 5854 5855 sub merge_single_annotation_line ($output, $annotation, $annotation_column) { 5856 5857 # This appends an annotation comment, $annotation, to $output, 5858 # starting in or after column $annotation_column, removing any 5859 # pre-existing comment from $output. 5860 5861 $annotation =~ s/^ \s* \# \ //x; 5862 $output =~ s/ \s* ( \# \N* )? \n //x; 5863 $output = Text::Tabs::expand($output); 5864 5865 my $spaces = $annotation_column - length $output; 5866 $spaces = 2 if $spaces < 0; # Have 2 blanks before the comment 5867 5868 $output = sprintf "%s%*s# %s", 5869 $output, 5870 $spaces, 5871 " ", 5872 $annotation; 5873 return Text::Tabs::unexpand $output; 5874 } 5875 5876 sub write($self, $use_adjustments=0, $suppress_value=0) { 5877 # Write a representation of the table to its file. It calls several 5878 # functions furnished by sub-classes of this abstract base class to 5879 # handle non-normal ranges, to add stuff before the table, and at its 5880 # end. If the table is to be written so that adjustments are 5881 # required, this does that conversion. 5882 5883 5884 # $use_adjustments ? output in adjusted format or not 5885 # $suppress_value Optional, if the value associated with 5886 # a range equals this one, don't write 5887 # the range 5888 5889 my $addr = do { no overloading; pack 'J', $self; }; 5890 my $write_as_invlist = $write_as_invlist{$addr}; 5891 5892 # Start with the header 5893 my @HEADER = $self->header; 5894 5895 # Then the comments 5896 push @HEADER, "\n", main::simple_fold($comment{$addr}, '# '), "\n" 5897 if $comment{$addr}; 5898 5899 # Things discovered processing the main body of the document may 5900 # affect what gets output before it, therefore pre_body() isn't called 5901 # until after all other processing of the table is done. 5902 5903 # The main body looks like a 'here' document. If there are comments, 5904 # get rid of them when processing it. 5905 my @OUT; 5906 if ($annotate || $output_range_counts) { 5907 # Use the line below in Perls that don't have /r 5908 #push @OUT, 'return join "\n", map { s/\s*#.*//mg; $_ } split "\n", <<\'END\';' . "\n"; 5909 push @OUT, "return <<'END' =~ s/\\s*#.*//mgr;\n"; 5910 } else { 5911 push @OUT, "return <<'END';\n"; 5912 } 5913 5914 if ($range_list{$addr}->is_empty) { 5915 5916 # This is a kludge for empty tables to silence a warning in 5917 # utf8.c, which can't really deal with empty tables, but it can 5918 # deal with a table that matches nothing, as the inverse of 'All' 5919 # does. 5920 push @OUT, "!Unicode::UCD::All\n"; 5921 } 5922 elsif ($self->name eq 'N' 5923 5924 # To save disk space and table cache space, avoid putting out 5925 # binary N tables, but instead create a file which just inverts 5926 # the Y table. Since the file will still exist and occupy a 5927 # certain number of blocks, might as well output the whole 5928 # thing if it all will fit in one block. The number of 5929 # ranges below is an approximate number for that. 5930 && ($self->property->type == $BINARY 5931 || $self->property->type == $FORCED_BINARY) 5932 # && $self->property->tables == 2 Can't do this because the 5933 # non-binary properties, like NFDQC aren't specifiable 5934 # by the notation 5935 && $range_list{$addr}->ranges > 15 5936 && ! $annotate) # Under --annotate, want to see everything 5937 { 5938 push @OUT, "!Unicode::UCD::" . $self->property->name . "\n"; 5939 } 5940 else { 5941 my $range_size_1 = $range_size_1{$addr}; 5942 5943 # To make it more readable, use a minimum indentation 5944 my $comment_indent; 5945 5946 # These are used only in $annotate option 5947 my $format; # e.g. $HEX_ADJUST_FORMAT 5948 my $include_name; # ? Include the character's name in the 5949 # annotation? 5950 my $include_cp; # ? Include its code point 5951 5952 if (! $annotate) { 5953 $comment_indent = ($self->isa('Map_Table')) 5954 ? 24 5955 : ($write_as_invlist) 5956 ? 8 5957 : 16; 5958 } 5959 else { 5960 $format = $self->format; 5961 5962 # The name of the character is output only for tables that 5963 # don't already include the name in the output. 5964 my $property = $self->property; 5965 $include_name = 5966 ! ($property == $perl_charname 5967 || $property == main::property_ref('Unicode_1_Name') 5968 || $property == main::property_ref('Name') 5969 || $property == main::property_ref('Name_Alias') 5970 ); 5971 5972 # Don't include the code point in the annotation where all 5973 # lines are a single code point, so it can be easily found in 5974 # the first column 5975 $include_cp = ! $range_size_1; 5976 5977 if (! $self->isa('Map_Table')) { 5978 $comment_indent = ($write_as_invlist) ? 8 : 16; 5979 } 5980 else { 5981 $comment_indent = 16; 5982 5983 # There are just a few short ranges in this table, so no 5984 # need to include the code point in the annotation. 5985 $include_cp = 0 if $format eq $DECOMP_STRING_FORMAT; 5986 5987 # We're trying to get this to look good, as the whole 5988 # point is to make human-readable tables. It is easier to 5989 # read if almost all the annotation comments begin in the 5990 # same column. Map tables have varying width maps, so can 5991 # create a jagged comment appearance. This code does a 5992 # preliminary pass through these tables looking for the 5993 # maximum width map in each, and causing the comments to 5994 # begin just to the right of that. However, if the 5995 # comments begin too far to the right of most lines, it's 5996 # hard to line them up horizontally with their real data. 5997 # Therefore we ignore the longest outliers 5998 my $ignore_longest_X_percent = 2; # Discard longest X% 5999 6000 # Each key in this hash is a width of at least one of the 6001 # maps in the table. Its value is how many lines have 6002 # that width. 6003 my %widths; 6004 6005 # We won't space things further left than one tab stop 6006 # after the rest of the line; initializing it to that 6007 # number saves some work. 6008 my $max_map_width = 8; 6009 6010 # Fill in the %widths hash 6011 my $total = 0; 6012 for my $set ($range_list{$addr}->ranges) { 6013 my $value = $set->value; 6014 6015 # These range types don't appear in the main table 6016 next if $set->type == 0 6017 && defined $suppress_value 6018 && $value eq $suppress_value; 6019 next if $set->type == $MULTI_CP 6020 || $set->type == $NULL; 6021 6022 # Include 2 spaces before the beginning of the 6023 # comment 6024 my $this_width = length($value) + 2; 6025 6026 # Ranges of the remaining non-zero types usually 6027 # occupy just one line (maybe occasionally two, but 6028 # this doesn't have to be dead accurate). This is 6029 # because these ranges are like "unassigned code 6030 # points" 6031 my $count = ($set->type != 0) 6032 ? 1 6033 : $set->end - $set->start + 1; 6034 $widths{$this_width} += $count; 6035 $total += $count; 6036 $max_map_width = $this_width 6037 if $max_map_width < $this_width; 6038 } 6039 6040 # If the widest map gives us less than two tab stops 6041 # worth, just take it as-is. 6042 if ($max_map_width > 16) { 6043 6044 # Otherwise go through %widths until we have included 6045 # the desired percentage of lines in the whole table. 6046 my $running_total = 0; 6047 foreach my $width (sort { $a <=> $b } keys %widths) 6048 { 6049 $running_total += $widths{$width}; 6050 use integer; 6051 if ($running_total * 100 / $total 6052 >= 100 - $ignore_longest_X_percent) 6053 { 6054 $max_map_width = $width; 6055 last; 6056 } 6057 } 6058 } 6059 $comment_indent += $max_map_width; 6060 } 6061 } 6062 6063 # Values for previous time through the loop. Initialize to 6064 # something that won't be adjacent to the first iteration; 6065 # only $previous_end matters for that. 6066 my $previous_start; 6067 my $previous_end = -2; 6068 my $previous_value; 6069 6070 # Values for next time through the portion of the loop that splits 6071 # the range. 0 in $next_start means there is no remaining portion 6072 # to deal with. 6073 my $next_start = 0; 6074 my $next_end; 6075 my $next_value; 6076 my $offset = 0; 6077 my $invlist_count = 0; 6078 6079 my $output_value_in_hex = $self->isa('Map_Table') 6080 && ($self->format eq $HEX_ADJUST_FORMAT 6081 || $self->to_output_map == $EXTERNAL_MAP); 6082 # Use leading zeroes just for files whose format should not be 6083 # changed from what it has been. Otherwise, they just take up 6084 # space and time to process. 6085 my $hex_format = ($self->isa('Map_Table') 6086 && $self->to_output_map == $EXTERNAL_MAP) 6087 ? "%04X" 6088 : "%X"; 6089 6090 # The values for some of these tables are stored in mktables as 6091 # hex strings. Normally, these are just output as strings without 6092 # change, but when we are doing adjustments, we have to operate on 6093 # these numerically, so we convert those to decimal to do that, 6094 # and back to hex for output 6095 my $convert_map_to_from_hex = 0; 6096 my $output_map_in_hex = 0; 6097 if ($self->isa('Map_Table')) { 6098 $convert_map_to_from_hex 6099 = ($use_adjustments && $self->format eq $HEX_ADJUST_FORMAT) 6100 || ($annotate && $self->format eq $HEX_FORMAT); 6101 $output_map_in_hex = $convert_map_to_from_hex 6102 || $self->format eq $HEX_FORMAT; 6103 } 6104 6105 # To store any annotations about the characters. 6106 my @annotation; 6107 6108 # Output each range as part of the here document. 6109 RANGE: 6110 for my $set ($range_list{$addr}->ranges) { 6111 if ($set->type != 0) { 6112 $self->handle_special_range($set); 6113 next RANGE; 6114 } 6115 my $start = $set->start; 6116 my $end = $set->end; 6117 my $value = $set->value; 6118 6119 # Don't output ranges whose value is the one to suppress 6120 next RANGE if defined $suppress_value 6121 && $value eq $suppress_value; 6122 6123 $value = CORE::hex $value if $convert_map_to_from_hex; 6124 6125 6126 { # This bare block encloses the scope where we may need to 6127 # 'redo' to. Consider a table that is to be written out 6128 # using single item ranges. This is given in the 6129 # $range_size_1 boolean. To accomplish this, we split the 6130 # range each time through the loop into two portions, the 6131 # first item, and the rest. We handle that first item 6132 # this time in the loop, and 'redo' to repeat the process 6133 # for the rest of the range. 6134 # 6135 # We may also have to do it, with other special handling, 6136 # if the table has adjustments. Consider the table that 6137 # contains the lowercasing maps. mktables stores the 6138 # ASCII range ones as 26 ranges: 6139 # ord('A') => ord('a'), .. ord('Z') => ord('z') 6140 # For compactness, the table that gets written has this as 6141 # just one range 6142 # ( ord('A') .. ord('Z') ) => ord('a') 6143 # and the software that reads the tables is smart enough 6144 # to "connect the dots". This change is accomplished in 6145 # this loop by looking to see if the current iteration 6146 # fits the paradigm of the previous iteration, and if so, 6147 # we merge them by replacing the final output item with 6148 # the merged data. Repeated 25 times, this gets A-Z. But 6149 # we also have to make sure we don't screw up cases where 6150 # we have internally stored 6151 # ( 0x1C4 .. 0x1C6 ) => 0x1C5 6152 # This single internal range has to be output as 3 ranges, 6153 # which is done by splitting, like we do for $range_size_1 6154 # tables. (There are very few of such ranges that need to 6155 # be split, so the gain of doing the combining of other 6156 # ranges far outweighs the splitting of these.) The 6157 # values to use for the redo at the end of this block are 6158 # set up just below in the scalars whose names begin with 6159 # '$next_'. 6160 6161 if (($use_adjustments || $range_size_1) && $end != $start) 6162 { 6163 $next_start = $start + 1; 6164 $next_end = $end; 6165 $next_value = $value; 6166 $end = $start; 6167 } 6168 6169 if ($use_adjustments && ! $range_size_1) { 6170 6171 # If this range is adjacent to the previous one, and 6172 # the values in each are integers that are also 6173 # adjacent (differ by 1), then this range really 6174 # extends the previous one that is already in element 6175 # $OUT[-1]. So we pop that element, and pretend that 6176 # the range starts with whatever it started with. 6177 # $offset is incremented by 1 each time so that it 6178 # gives the current offset from the first element in 6179 # the accumulating range, and we keep in $value the 6180 # value of that first element. 6181 if ($start == $previous_end + 1 6182 && $value =~ /^ -? \d+ $/xa 6183 && $previous_value =~ /^ -? \d+ $/xa 6184 && ($value == ($previous_value + ++$offset))) 6185 { 6186 pop @OUT; 6187 $start = $previous_start; 6188 $value = $previous_value; 6189 } 6190 else { 6191 $offset = 0; 6192 if (@annotation == 1) { 6193 $OUT[-1] = merge_single_annotation_line( 6194 $OUT[-1], $annotation[0], $comment_indent); 6195 } 6196 else { 6197 push @OUT, @annotation; 6198 } 6199 } 6200 undef @annotation; 6201 6202 # Save the current values for the next time through 6203 # the loop. 6204 $previous_start = $start; 6205 $previous_end = $end; 6206 $previous_value = $value; 6207 } 6208 6209 if ($write_as_invlist) { 6210 if ( $previous_end > 0 6211 && $output_range_counts{$addr}) 6212 { 6213 my $complement_count = $start - $previous_end - 1; 6214 if ($complement_count > 1) { 6215 $OUT[-1] = merge_single_annotation_line( 6216 $OUT[-1], 6217 "#" 6218 . (" " x 17) 6219 . "[" 6220 . main::clarify_code_point_count( 6221 $complement_count) 6222 . "] in complement\n", 6223 $comment_indent); 6224 } 6225 } 6226 6227 # Inversion list format has a single number per line, 6228 # the starting code point of a range that matches the 6229 # property 6230 push @OUT, $start, "\n"; 6231 $invlist_count++; 6232 6233 # Add a comment with the size of the range, if 6234 # requested. 6235 if ($output_range_counts{$addr}) { 6236 $OUT[-1] = merge_single_annotation_line( 6237 $OUT[-1], 6238 "# [" 6239 . main::clarify_code_point_count($end - $start + 1) 6240 . "]\n", 6241 $comment_indent); 6242 } 6243 } 6244 elsif ($start != $end) { # If there is a range 6245 if ($end == $MAX_WORKING_CODEPOINT) { 6246 push @OUT, sprintf "$hex_format\t$hex_format", 6247 $start, 6248 $MAX_PLATFORM_CODEPOINT; 6249 } 6250 else { 6251 push @OUT, sprintf "$hex_format\t$hex_format", 6252 $start, $end; 6253 } 6254 if (length $value) { 6255 if ($convert_map_to_from_hex) { 6256 $OUT[-1] .= sprintf "\t$hex_format\n", $value; 6257 } 6258 else { 6259 $OUT[-1] .= "\t$value\n"; 6260 } 6261 } 6262 6263 # Add a comment with the size of the range, if 6264 # requested. 6265 if ($output_range_counts{$addr}) { 6266 $OUT[-1] = merge_single_annotation_line( 6267 $OUT[-1], 6268 "# [" 6269 . main::clarify_code_point_count($end - $start + 1) 6270 . "]\n", 6271 $comment_indent); 6272 } 6273 } 6274 else { # Here to output a single code point per line. 6275 6276 # Use any passed in subroutine to output. 6277 if (ref $range_size_1 eq 'CODE') { 6278 for my $i ($start .. $end) { 6279 push @OUT, &{$range_size_1}($i, $value); 6280 } 6281 } 6282 else { 6283 6284 # Here, caller is ok with default output. 6285 for (my $i = $start; $i <= $end; $i++) { 6286 if ($convert_map_to_from_hex) { 6287 push @OUT, 6288 sprintf "$hex_format\t\t$hex_format\n", 6289 $i, $value; 6290 } 6291 else { 6292 push @OUT, sprintf $hex_format, $i; 6293 $OUT[-1] .= "\t\t$value" if $value ne ""; 6294 $OUT[-1] .= "\n"; 6295 } 6296 } 6297 } 6298 } 6299 6300 if ($annotate) { 6301 for (my $i = $start; $i <= $end; $i++) { 6302 my $annotation = ""; 6303 6304 # Get character information if don't have it already 6305 main::populate_char_info($i) 6306 if ! defined $viacode[$i]; 6307 my $type = $annotate_char_type[$i]; 6308 6309 # Figure out if should output the next code points 6310 # as part of a range or not. If this is not in an 6311 # annotation range, then won't output as a range, 6312 # so returns $i. Otherwise use the end of the 6313 # annotation range, but no further than the 6314 # maximum possible end point of the loop. 6315 my $range_end = 6316 $range_size_1 6317 ? $start 6318 : main::min( 6319 $annotate_ranges->value_of($i) || $i, 6320 $end); 6321 6322 # Use a range if it is a range, and either is one 6323 # of the special annotation ranges, or the range 6324 # is at most 3 long. This last case causes the 6325 # algorithmically named code points to be output 6326 # individually in spans of at most 3, as they are 6327 # the ones whose $type is > 0. 6328 if ($range_end != $i 6329 && ( $type < 0 || $range_end - $i > 2)) 6330 { 6331 # Here is to output a range. We don't allow a 6332 # caller-specified output format--just use the 6333 # standard one. 6334 my $range_name = $viacode[$i]; 6335 6336 # For the code points which end in their hex 6337 # value, we eliminate that from the output 6338 # annotation, and capitalize only the first 6339 # letter of each word. 6340 if ($type == $CP_IN_NAME) { 6341 my $hex = sprintf $hex_format, $i; 6342 $range_name =~ s/-$hex$//; 6343 my @words = split " ", $range_name; 6344 for my $word (@words) { 6345 $word = 6346 ucfirst(lc($word)) if $word ne 'CJK'; 6347 } 6348 $range_name = join " ", @words; 6349 } 6350 elsif ($type == $HANGUL_SYLLABLE) { 6351 $range_name = "Hangul Syllable"; 6352 } 6353 6354 # If the annotation would just repeat what's 6355 # already being output as the range, skip it. 6356 # (When an inversion list is being written, it 6357 # isn't a repeat, as that always is in 6358 # decimal) 6359 if ( $write_as_invlist 6360 || $i != $start 6361 || $range_end < $end) 6362 { 6363 if ($range_end < $MAX_WORKING_CODEPOINT) 6364 { 6365 $annotation = sprintf "%04X..%04X", 6366 $i, $range_end; 6367 } 6368 else { 6369 $annotation = sprintf "%04X..INFINITY", 6370 $i; 6371 } 6372 } 6373 else { # Indent if not displaying code points 6374 $annotation = " " x 4; 6375 } 6376 6377 if ($range_name) { 6378 $annotation .= " $age[$i]" if $age[$i]; 6379 $annotation .= " $range_name"; 6380 } 6381 6382 # Include the number of code points in the 6383 # range 6384 my $count = 6385 main::clarify_code_point_count($range_end - $i + 1); 6386 $annotation .= " [$count]\n"; 6387 6388 # Skip to the end of the range 6389 $i = $range_end; 6390 } 6391 else { # Not in a range. 6392 my $comment = ""; 6393 6394 # When outputting the names of each character, 6395 # use the character itself if printable 6396 $comment .= "'" . main::display_chr($i) . "' " 6397 if $printable[$i]; 6398 6399 my $output_value = $value; 6400 6401 # Determine the annotation 6402 if ($format eq $DECOMP_STRING_FORMAT) { 6403 6404 # This is very specialized, with the type 6405 # of decomposition beginning the line 6406 # enclosed in <...>, and the code points 6407 # that the code point decomposes to 6408 # separated by blanks. Create two 6409 # strings, one of the printable 6410 # characters, and one of their official 6411 # names. 6412 (my $map = $output_value) 6413 =~ s/ \ * < .*? > \ +//x; 6414 my $tostr = ""; 6415 my $to_name = ""; 6416 my $to_chr = ""; 6417 foreach my $to (split " ", $map) { 6418 $to = CORE::hex $to; 6419 $to_name .= " + " if $to_name; 6420 $to_chr .= main::display_chr($to); 6421 main::populate_char_info($to) 6422 if ! defined $viacode[$to]; 6423 $to_name .= $viacode[$to]; 6424 } 6425 6426 $comment .= 6427 "=> '$to_chr'; $viacode[$i] => $to_name"; 6428 } 6429 else { 6430 $output_value += $i - $start 6431 if $use_adjustments 6432 # Don't try to adjust a 6433 # non-integer 6434 && $output_value !~ /[-\D]/; 6435 6436 if ($output_map_in_hex) { 6437 main::populate_char_info($output_value) 6438 if ! defined $viacode[$output_value]; 6439 $comment .= " => '" 6440 . main::display_chr($output_value) 6441 . "'; " if $printable[$output_value]; 6442 } 6443 if ($include_name && $viacode[$i]) { 6444 $comment .= " " if $comment; 6445 $comment .= $viacode[$i]; 6446 } 6447 if ($output_map_in_hex) { 6448 $comment .= 6449 " => $viacode[$output_value]" 6450 if $viacode[$output_value]; 6451 $output_value = sprintf($hex_format, 6452 $output_value); 6453 } 6454 } 6455 6456 if ($include_cp) { 6457 $annotation = sprintf "%04X %s", $i, $age[$i]; 6458 if ($use_adjustments) { 6459 $annotation .= " => $output_value"; 6460 } 6461 } 6462 6463 if ($comment ne "") { 6464 $annotation .= " " if $annotation ne ""; 6465 $annotation .= $comment; 6466 } 6467 $annotation .= "\n" if $annotation ne ""; 6468 } 6469 6470 if ($annotation ne "") { 6471 push @annotation, (" " x $comment_indent) 6472 . "# $annotation"; 6473 } 6474 } 6475 6476 # If not adjusting, we don't have to go through the 6477 # loop again to know that the annotation comes next 6478 # in the output. 6479 if (! $use_adjustments) { 6480 if (@annotation == 1) { 6481 $OUT[-1] = merge_single_annotation_line( 6482 $OUT[-1], $annotation[0], $comment_indent); 6483 } 6484 else { 6485 push @OUT, map { Text::Tabs::unexpand $_ } 6486 @annotation; 6487 } 6488 undef @annotation; 6489 } 6490 } 6491 6492 # Add the beginning of the range that doesn't match the 6493 # property, except if the just added match range extends 6494 # to infinity. We do this after any annotations for the 6495 # match range. 6496 if ($write_as_invlist && $end < $MAX_WORKING_CODEPOINT) { 6497 push @OUT, $end + 1, "\n"; 6498 $invlist_count++; 6499 } 6500 6501 # If we split the range, set up so the next time through 6502 # we get the remainder, and redo. 6503 if ($next_start) { 6504 $start = $next_start; 6505 $end = $next_end; 6506 $value = $next_value; 6507 $next_start = 0; 6508 redo; 6509 } 6510 } # End of redo block 6511 } # End of loop through all the table's ranges 6512 6513 push @OUT, @annotation; # Add orphaned annotation, if any 6514 6515 splice @OUT, 1, 0, "V$invlist_count\n" if $invlist_count; 6516 } 6517 6518 # Add anything that goes after the main body, but within the here 6519 # document, 6520 my $append_to_body = $self->append_to_body; 6521 push @OUT, $append_to_body if $append_to_body; 6522 6523 # And finish the here document. 6524 push @OUT, "END\n"; 6525 6526 # Done with the main portion of the body. Can now figure out what 6527 # should appear before it in the file. 6528 my $pre_body = $self->pre_body; 6529 push @HEADER, $pre_body, "\n" if $pre_body; 6530 6531 # All these files should have a .pl suffix added to them. 6532 my @file_with_pl = @{$file_path{$addr}}; 6533 $file_with_pl[-1] .= '.pl'; 6534 6535 main::write(\@file_with_pl, 6536 $annotate, # utf8 iff annotating 6537 \@HEADER, 6538 \@OUT); 6539 return; 6540 } 6541 6542 sub set_status($self, $status, $info) { # Set the table's status 6543 # status The status enum value 6544 # info Any message associated with it. 6545 my $addr = do { no overloading; pack 'J', $self; }; 6546 6547 $status{$addr} = $status; 6548 $status_info{$addr} = $info; 6549 return; 6550 } 6551 6552 sub set_fate($self, $fate, $reason=undef) { # Set the fate of a table 6553 my $addr = do { no overloading; pack 'J', $self; }; 6554 6555 return if $fate{$addr} == $fate; # If no-op 6556 6557 # Can only change the ordinary fate, except if going to $MAP_PROXIED 6558 return if $fate{$addr} != $ORDINARY && $fate != $MAP_PROXIED; 6559 6560 $fate{$addr} = $fate; 6561 6562 # Don't document anything to do with a non-normal fated table 6563 if ($fate != $ORDINARY) { 6564 my $put_in_pod = ($fate == $MAP_PROXIED) ? 1 : 0; 6565 foreach my $alias ($self->aliases) { 6566 $alias->set_ucd($put_in_pod); 6567 6568 # MAP_PROXIED doesn't affect the match tables 6569 next if $fate == $MAP_PROXIED; 6570 $alias->set_make_re_pod_entry($put_in_pod); 6571 } 6572 } 6573 6574 # Save the reason for suppression for output 6575 if ($fate >= $SUPPRESSED) { 6576 $reason = "" unless defined $reason; 6577 $why_suppressed{$complete_name{$addr}} = $reason; 6578 } 6579 6580 return; 6581 } 6582 6583 sub lock($self) { 6584 # Don't allow changes to the table from now on. This stores a stack 6585 # trace of where it was called, so that later attempts to modify it 6586 # can immediately show where it got locked. 6587 my $addr = do { no overloading; pack 'J', $self; }; 6588 6589 $locked{$addr} = ""; 6590 6591 my $line = (caller(0))[2]; 6592 my $i = 1; 6593 6594 # Accumulate the stack trace 6595 while (1) { 6596 my ($pkg, $file, $caller_line, $caller) = caller $i++; 6597 6598 last unless defined $caller; 6599 6600 $locked{$addr} .= " called from $caller() at line $line\n"; 6601 $line = $caller_line; 6602 } 6603 $locked{$addr} .= " called from main at line $line\n"; 6604 6605 return; 6606 } 6607 6608 sub carp_if_locked($self) { 6609 # Return whether a table is locked or not, and, by the way, complain 6610 # if is locked 6611 my $addr = do { no overloading; pack 'J', $self; }; 6612 6613 return 0 if ! $locked{$addr}; 6614 Carp::my_carp_bug("Can't modify a locked table. Stack trace of locking:\n$locked{$addr}\n\n"); 6615 return 1; 6616 } 6617 6618 sub set_file_path($self, @path) { # Set the final directory path for this table 6619 no overloading; 6620 @{$file_path{pack 'J', $self}} = @path; 6621 return 6622 } 6623 6624 # Accessors for the range list stored in this table. First for 6625 # unconditional 6626 for my $sub (qw( 6627 containing_range 6628 contains 6629 count 6630 each_range 6631 hash 6632 is_empty 6633 matches_identically_to 6634 max 6635 min 6636 range_count 6637 reset_each_range 6638 type_of 6639 value_of 6640 )) 6641 { 6642 no strict "refs"; 6643 *$sub = sub { 6644 use strict "refs"; 6645 my $self = shift; 6646 return $self->_range_list->$sub(@_); 6647 } 6648 } 6649 6650 # Then for ones that should fail if locked 6651 for my $sub (qw( 6652 delete_range 6653 )) 6654 { 6655 no strict "refs"; 6656 *$sub = sub { 6657 use strict "refs"; 6658 my $self = shift; 6659 6660 return if $self->carp_if_locked; 6661 no overloading; 6662 return $self->_range_list->$sub(@_); 6663 } 6664 } 6665 6666} # End closure 6667 6668package Map_Table; 6669use parent '-norequire', '_Base_Table'; 6670 6671# A Map Table is a table that contains the mappings from code points to 6672# values. There are two weird cases: 6673# 1) Anomalous entries are ones that aren't maps of ranges of code points, but 6674# are written in the table's file at the end of the table nonetheless. It 6675# requires specially constructed code to handle these; utf8.c can not read 6676# these in, so they should not go in $map_directory. As of this writing, 6677# the only case that these happen is for named sequences used in 6678# charnames.pm. But this code doesn't enforce any syntax on these, so 6679# something else could come along that uses it. 6680# 2) Specials are anything that doesn't fit syntactically into the body of the 6681# table. The ranges for these have a map type of non-zero. The code below 6682# knows about and handles each possible type. In most cases, these are 6683# written as part of the header. 6684# 6685# A map table deliberately can't be manipulated at will unlike match tables. 6686# This is because of the ambiguities having to do with what to do with 6687# overlapping code points. And there just isn't a need for those things; 6688# what one wants to do is just query, add, replace, or delete mappings, plus 6689# write the final result. 6690# However, there is a method to get the list of possible ranges that aren't in 6691# this table to use for defaulting missing code point mappings. And, 6692# map_add_or_replace_non_nulls() does allow one to add another table to this 6693# one, but it is clearly very specialized, and defined that the other's 6694# non-null values replace this one's if there is any overlap. 6695 6696sub trace { return main::trace(@_); } 6697 6698{ # Closure 6699 6700 main::setup_package(); 6701 6702 my %default_map; 6703 # Many input files omit some entries; this gives what the mapping for the 6704 # missing entries should be 6705 main::set_access('default_map', \%default_map, 'r'); 6706 6707 my %anomalous_entries; 6708 # Things that go in the body of the table which don't fit the normal 6709 # scheme of things, like having a range. Not much can be done with these 6710 # once there except to output them. This was created to handle named 6711 # sequences. 6712 main::set_access('anomalous_entry', \%anomalous_entries, 'a'); 6713 main::set_access('anomalous_entries', # Append singular, read plural 6714 \%anomalous_entries, 6715 'readable_array'); 6716 my %to_output_map; 6717 # Enum as to whether or not to write out this map table, and how: 6718 # 0 don't output 6719 # $EXTERNAL_MAP means its existence is noted in the documentation, and 6720 # it should not be removed nor its format changed. This 6721 # is done for those files that have traditionally been 6722 # output. 6723 # $INTERNAL_MAP means Perl reserves the right to do anything it wants 6724 # with this file 6725 # $OUTPUT_ADJUSTED means that it is an $INTERNAL_MAP, and instead of 6726 # outputting the actual mappings as-is, we adjust things 6727 # to create a much more compact table. Only those few 6728 # tables where the mapping is convertible at least to an 6729 # integer and compacting makes a big difference should 6730 # have this. Hence, the default is to not do this 6731 # unless the table's default mapping is to $CODE_POINT, 6732 # and the range size is not 1. 6733 main::set_access('to_output_map', \%to_output_map, 's'); 6734 6735 sub new { 6736 my $class = shift; 6737 my $name = shift; 6738 6739 my %args = @_; 6740 6741 # Optional initialization data for the table. 6742 my $initialize = delete $args{'Initialize'}; 6743 6744 my $default_map = delete $args{'Default_Map'}; 6745 my $property = delete $args{'_Property'}; 6746 my $full_name = delete $args{'Full_Name'}; 6747 my $to_output_map = delete $args{'To_Output_Map'}; 6748 6749 # Rest of parameters passed on 6750 6751 my $range_list = Range_Map->new(Owner => $property); 6752 6753 my $self = $class->SUPER::new( 6754 Name => $name, 6755 Complete_Name => $full_name, 6756 Full_Name => $full_name, 6757 _Property => $property, 6758 _Range_List => $range_list, 6759 Write_As_Invlist => 0, 6760 %args); 6761 6762 my $addr = do { no overloading; pack 'J', $self; }; 6763 6764 $anomalous_entries{$addr} = []; 6765 $default_map{$addr} = $default_map; 6766 $to_output_map{$addr} = $to_output_map; 6767 6768 $self->initialize($initialize) if defined $initialize; 6769 6770 return $self; 6771 } 6772 6773 use overload 6774 fallback => 0, 6775 qw("") => "_operator_stringify", 6776 ; 6777 6778 sub _operator_stringify($self, $other="", $reversed=0) { 6779 6780 my $name = $self->property->full_name; 6781 $name = '""' if $name eq ""; 6782 return "Map table for Property '$name'"; 6783 } 6784 6785 sub add_alias { 6786 # Add a synonym for this table (which means the property itself) 6787 my $self = shift; 6788 my $name = shift; 6789 # Rest of parameters passed on. 6790 6791 $self->SUPER::add_alias($name, $self->property, @_); 6792 return; 6793 } 6794 6795 sub add_map { 6796 # Add a range of code points to the list of specially-handled code 6797 # points. 0 is assumed if the type of special is not passed 6798 # in. 6799 6800 my $self = shift; 6801 my $lower = shift; 6802 my $upper = shift; 6803 my $string = shift; 6804 my %args = @_; 6805 6806 my $type = delete $args{'Type'} || 0; 6807 # Rest of parameters passed on 6808 6809 # Can't change the table if locked. 6810 return if $self->carp_if_locked; 6811 6812 my $addr = do { no overloading; pack 'J', $self; }; 6813 6814 $self->_range_list->add_map($lower, $upper, 6815 $string, 6816 @_, 6817 Type => $type); 6818 return; 6819 } 6820 6821 sub append_to_body($self) { 6822 # Adds to the written HERE document of the table's body any anomalous 6823 # entries in the table.. 6824 my $addr = do { no overloading; pack 'J', $self; }; 6825 6826 return "" unless @{$anomalous_entries{$addr}}; 6827 return join("\n", @{$anomalous_entries{$addr}}) . "\n"; 6828 } 6829 6830 sub map_add_or_replace_non_nulls($self, $other) { 6831 # This adds the mappings in the table $other to $self. Non-null 6832 # mappings from $other override those in $self. It essentially merges 6833 # the two tables, with the second having priority except for null 6834 # mappings. 6835 return if $self->carp_if_locked; 6836 6837 if (! $other->isa(__PACKAGE__)) { 6838 Carp::my_carp_bug("$other should be a " 6839 . __PACKAGE__ 6840 . ". Not a '" 6841 . ref($other) 6842 . "'. Not added;"); 6843 return; 6844 } 6845 6846 my $addr = do { no overloading; pack 'J', $self; }; 6847 my $other_addr = do { no overloading; pack 'J', $other; }; 6848 6849 local $to_trace = 0 if main::DEBUG; 6850 6851 my $self_range_list = $self->_range_list; 6852 my $other_range_list = $other->_range_list; 6853 foreach my $range ($other_range_list->ranges) { 6854 my $value = $range->value; 6855 next if $value eq ""; 6856 $self_range_list->_add_delete('+', 6857 $range->start, 6858 $range->end, 6859 $value, 6860 Type => $range->type, 6861 Replace => $UNCONDITIONALLY); 6862 } 6863 6864 return; 6865 } 6866 6867 sub set_default_map($self, $map, $use_full_name=0) { 6868 # Define what code points that are missing from the input files should 6869 # map to. The optional second parameter 'full_name' indicates to 6870 # force using the full name of the map instead of its standard name. 6871 if ($use_full_name && $use_full_name ne 'full_name') { 6872 Carp::my_carp_bug("Second parameter to set_default_map() if" 6873 . " present, must be 'full_name'"); 6874 } 6875 6876 my $addr = do { no overloading; pack 'J', $self; }; 6877 6878 # Convert the input to the standard equivalent, if any (won't have any 6879 # for $STRING properties) 6880 my $standard = $self->property->table($map); 6881 if (defined $standard) { 6882 $map = ($use_full_name) 6883 ? $standard->full_name 6884 : $standard->name; 6885 } 6886 6887 # Warn if there already is a non-equivalent default map for this 6888 # property. Note that a default map can be a ref, which means that 6889 # what it actually means is delayed until later in the program, and it 6890 # IS permissible to override it here without a message. 6891 my $default_map = $default_map{$addr}; 6892 if (defined $default_map 6893 && ! ref($default_map) 6894 && $default_map ne $map 6895 && main::Standardize($map) ne $default_map) 6896 { 6897 my $property = $self->property; 6898 my $map_table = $property->table($map); 6899 my $default_table = $property->table($default_map); 6900 if (defined $map_table 6901 && defined $default_table 6902 && $map_table != $default_table) 6903 { 6904 Carp::my_carp("Changing the default mapping for " 6905 . $property 6906 . " from $default_map to $map'"); 6907 } 6908 } 6909 6910 $default_map{$addr} = $map; 6911 6912 # Don't also create any missing table for this map at this point, 6913 # because if we did, it could get done before the main table add is 6914 # done for PropValueAliases.txt; instead the caller will have to make 6915 # sure it exists, if desired. 6916 return; 6917 } 6918 6919 sub to_output_map($self) { 6920 # Returns boolean: should we write this map table? 6921 my $addr = do { no overloading; pack 'J', $self; }; 6922 6923 # If overridden, use that 6924 return $to_output_map{$addr} if defined $to_output_map{$addr}; 6925 6926 my $full_name = $self->full_name; 6927 return $global_to_output_map{$full_name} 6928 if defined $global_to_output_map{$full_name}; 6929 6930 # If table says to output, do so; if says to suppress it, do so. 6931 my $fate = $self->fate; 6932 return $INTERNAL_MAP if $fate == $INTERNAL_ONLY; 6933 return $EXTERNAL_MAP if grep { $_ eq $full_name } @output_mapped_properties; 6934 return 0 if $fate == $SUPPRESSED || $fate == $MAP_PROXIED; 6935 6936 my $type = $self->property->type; 6937 6938 # Don't want to output binary map tables even for debugging. 6939 return 0 if $type == $BINARY; 6940 6941 # But do want to output string ones. All the ones that remain to 6942 # be dealt with (i.e. which haven't explicitly been set to external) 6943 # are for internal Perl use only. The default for those that map to 6944 # $CODE_POINT and haven't been restricted to a single element range 6945 # is to use the adjusted form. 6946 if ($type == $STRING) { 6947 return $INTERNAL_MAP if $self->range_size_1 6948 || $default_map{$addr} ne $CODE_POINT; 6949 return $OUTPUT_ADJUSTED; 6950 } 6951 6952 # Otherwise is an $ENUM, do output it, for Perl's purposes 6953 return $INTERNAL_MAP; 6954 } 6955 6956 sub inverse_list($self) { 6957 # Returns a Range_List that is gaps of the current table. That is, 6958 # the inversion 6959 my $current = Range_List->new(Initialize => $self->_range_list, 6960 Owner => $self->property); 6961 return ~ $current; 6962 } 6963 6964 sub header($self) { 6965 my $return = $self->SUPER::header(); 6966 6967 if ($self->to_output_map >= $INTERNAL_MAP) { 6968 $return .= $INTERNAL_ONLY_HEADER; 6969 } 6970 else { 6971 # Other properties have fixed formats. 6972 my $property_name = $self->property->full_name; 6973 6974 $return .= <<END; 6975 6976# !!!!!!! IT IS DEPRECATED TO USE THIS FILE !!!!!!! 6977 6978# This file is for internal use by core Perl only. It is retained for 6979# backwards compatibility with applications that may have come to rely on it, 6980# but its format and even its name or existence are subject to change without 6981# notice in a future Perl version. Don't use it directly. Instead, its 6982# contents are now retrievable through a stable API in the Unicode::UCD 6983# module: Unicode::UCD::prop_invmap('$property_name') (Values for individual 6984# code points can be retrieved via Unicode::UCD::charprop()); 6985END 6986 } 6987 return $return; 6988 } 6989 6990 sub set_final_comment($self) { 6991 # Just before output, create the comment that heads the file 6992 # containing this table. 6993 6994 return unless $debugging_build; 6995 6996 # No sense generating a comment if aren't going to write it out. 6997 return if ! $self->to_output_map; 6998 6999 my $addr = do { no overloading; pack 'J', $self; }; 7000 7001 my $property = $self->property; 7002 7003 # Get all the possible names for this property. Don't use any that 7004 # aren't ok for use in a file name, etc. This is perhaps causing that 7005 # flag to do double duty, and may have to be changed in the future to 7006 # have our own flag for just this purpose; but it works now to exclude 7007 # Perl generated synonyms from the lists for properties, where the 7008 # name is always the proper Unicode one. 7009 my @property_aliases = grep { $_->ok_as_filename } $self->aliases; 7010 7011 my $count = $self->count; 7012 my $default_map = $default_map{$addr}; 7013 7014 # The ranges that map to the default aren't output, so subtract that 7015 # to get those actually output. A property with matching tables 7016 # already has the information calculated. 7017 if ($property->type != $STRING && $property->type != $FORCED_BINARY) { 7018 $count -= $property->table($default_map)->count; 7019 } 7020 elsif (defined $default_map) { 7021 7022 # But for $STRING properties, must calculate now. Subtract the 7023 # count from each range that maps to the default. 7024 foreach my $range ($self->_range_list->ranges) { 7025 if ($range->value eq $default_map) { 7026 $count -= $range->end +1 - $range->start; 7027 } 7028 } 7029 7030 } 7031 7032 # Get a string version of $count with underscores in large numbers, 7033 # for clarity. 7034 my $string_count = main::clarify_code_point_count($count); 7035 7036 my $code_points = ($count == 1) 7037 ? 'single code point' 7038 : "$string_count code points"; 7039 7040 my $mapping; 7041 my $these_mappings; 7042 my $are; 7043 if (@property_aliases <= 1) { 7044 $mapping = 'mapping'; 7045 $these_mappings = 'this mapping'; 7046 $are = 'is' 7047 } 7048 else { 7049 $mapping = 'synonymous mappings'; 7050 $these_mappings = 'these mappings'; 7051 $are = 'are' 7052 } 7053 my $cp; 7054 if ($count >= $MAX_UNICODE_CODEPOINTS) { 7055 $cp = "any code point in Unicode Version $string_version"; 7056 } 7057 else { 7058 my $map_to; 7059 if ($default_map eq "") { 7060 $map_to = 'the empty string'; 7061 } 7062 elsif ($default_map eq $CODE_POINT) { 7063 $map_to = "itself"; 7064 } 7065 else { 7066 $map_to = "'$default_map'"; 7067 } 7068 if ($count == 1) { 7069 $cp = "the single code point"; 7070 } 7071 else { 7072 $cp = "one of the $code_points"; 7073 } 7074 $cp .= " in Unicode Version $unicode_version for which the mapping is not to $map_to"; 7075 } 7076 7077 my $comment = ""; 7078 7079 my $status = $self->status; 7080 if ($status ne $NORMAL) { 7081 my $warn = uc $status_past_participles{$status}; 7082 $comment .= <<END; 7083 7084!!!!!!! $warn !!!!!!!!!!!!!!!!!!! 7085 All property or property=value combinations contained in this file are $warn. 7086 See $unicode_reference_url for what this means. 7087 7088END 7089 } 7090 $comment .= "This file returns the $mapping:\n"; 7091 7092 my $ucd_accessible_name = ""; 7093 my $has_underscore_name = 0; 7094 my $full_name = $self->property->full_name; 7095 for my $i (0 .. @property_aliases - 1) { 7096 my $name = $property_aliases[$i]->name; 7097 $has_underscore_name = 1 if $name =~ /^_/; 7098 $comment .= sprintf("%-8s%s\n", " ", $name . '(cp)'); 7099 if ($property_aliases[$i]->ucd) { 7100 if ($name eq $full_name) { 7101 $ucd_accessible_name = $full_name; 7102 } 7103 elsif (! $ucd_accessible_name) { 7104 $ucd_accessible_name = $name; 7105 } 7106 } 7107 } 7108 $comment .= "\nwhere 'cp' is $cp."; 7109 if ($ucd_accessible_name) { 7110 $comment .= " Note that $these_mappings"; 7111 if ($has_underscore_name) { 7112 $comment .= " (except for the one(s) that begin with an underscore)"; 7113 } 7114 $comment .= " $are accessible via the functions prop_invmap('$full_name') or charprop() in Unicode::UCD"; 7115 7116 } 7117 7118 # And append any commentary already set from the actual property. 7119 $comment .= "\n\n" . $self->comment if $self->comment; 7120 if ($self->description) { 7121 $comment .= "\n\n" . join " ", $self->description; 7122 } 7123 if ($self->note) { 7124 $comment .= "\n\n" . join " ", $self->note; 7125 } 7126 $comment .= "\n"; 7127 7128 if (! $self->perl_extension) { 7129 $comment .= <<END; 7130 7131For information about what this property really means, see: 7132$unicode_reference_url 7133END 7134 } 7135 7136 if ($count) { # Format differs for empty table 7137 $comment.= "\nThe format of the "; 7138 if ($self->range_size_1) { 7139 $comment.= <<END; 7140main body of lines of this file is: CODE_POINT\\t\\tMAPPING where CODE_POINT 7141is in hex; MAPPING is what CODE_POINT maps to. 7142END 7143 } 7144 else { 7145 7146 # There are tables which end up only having one element per 7147 # range, but it is not worth keeping track of for making just 7148 # this comment a little better. 7149 $comment .= <<END; 7150non-comment portions of the main body of lines of this file is: 7151START\\tSTOP\\tMAPPING where START is the starting code point of the 7152range, in hex; STOP is the ending point, or if omitted, the range has just one 7153code point; MAPPING is what each code point between START and STOP maps to. 7154END 7155 if ($self->output_range_counts) { 7156 $comment .= <<END; 7157Numbers in comments in [brackets] indicate how many code points are in the 7158range (omitted when the range is a single code point or if the mapping is to 7159the null string). 7160END 7161 } 7162 } 7163 } 7164 $self->set_comment(main::join_lines($comment)); 7165 return; 7166 } 7167 7168 my %swash_keys; # Makes sure don't duplicate swash names. 7169 7170 # The remaining variables are temporaries used while writing each table, 7171 # to output special ranges. 7172 my @multi_code_point_maps; # Map is to more than one code point. 7173 7174 sub handle_special_range($self, $range) { 7175 # Called in the middle of write when it finds a range it doesn't know 7176 # how to handle. 7177 7178 my $addr = do { no overloading; pack 'J', $self; }; 7179 7180 my $type = $range->type; 7181 7182 my $low = $range->start; 7183 my $high = $range->end; 7184 my $map = $range->value; 7185 7186 # No need to output the range if it maps to the default. 7187 return if $map eq $default_map{$addr}; 7188 7189 my $property = $self->property; 7190 7191 # Switch based on the map type... 7192 if ($type == $HANGUL_SYLLABLE) { 7193 7194 # These are entirely algorithmically determinable based on 7195 # some constants furnished by Unicode; for now, just set a 7196 # flag to indicate that have them. After everything is figured 7197 # out, we will output the code that does the algorithm. (Don't 7198 # output them if not needed because we are suppressing this 7199 # property.) 7200 $has_hangul_syllables = 1 if $property->to_output_map; 7201 } 7202 elsif ($type == $CP_IN_NAME) { 7203 7204 # Code points whose name ends in their code point are also 7205 # algorithmically determinable, but need information about the map 7206 # to do so. Both the map and its inverse are stored in data 7207 # structures output in the file. They are stored in the mean time 7208 # in global lists The lists will be written out later into Name.pm, 7209 # which is created only if needed. In order to prevent duplicates 7210 # in the list, only add to them for one property, should multiple 7211 # ones need them. 7212 if ($needing_code_points_ending_in_code_point == 0) { 7213 $needing_code_points_ending_in_code_point = $property; 7214 } 7215 if ($property == $needing_code_points_ending_in_code_point) { 7216 push @{$names_ending_in_code_point{$map}->{'low'}}, $low; 7217 push @{$names_ending_in_code_point{$map}->{'high'}}, $high; 7218 7219 my $squeezed = $map =~ s/[-\s]+//gr; 7220 push @{$loose_names_ending_in_code_point{$squeezed}->{'low'}}, 7221 $low; 7222 push @{$loose_names_ending_in_code_point{$squeezed}->{'high'}}, 7223 $high; 7224 7225 # Calculate the set of legal characters in names of this 7226 # series. It includes every character in the name prefix. 7227 my %legal; 7228 $legal{$_} = 1 for split //, $map; 7229 7230 # Plus the hex code point chars, blank, and minus. Also \n 7231 # can show up as being required due to anchoring 7232 for my $i ('0' .. '9', 'A' .. 'F', '-', ' ', "\n") { 7233 $legal{$i} = 1; 7234 } 7235 my $legal = join "", sort { $a cmp $b } keys %legal; 7236 7237 # The legal chars can be used in match optimizations 7238 push @code_points_ending_in_code_point, { low => $low, 7239 high => $high, 7240 name => $map, 7241 legal => $legal, 7242 }; 7243 } 7244 } 7245 elsif ($range->type == $MULTI_CP || $range->type == $NULL) { 7246 7247 # Multi-code point maps and null string maps have an entry 7248 # for each code point in the range. They use the same 7249 # output format. 7250 for my $code_point ($low .. $high) { 7251 7252 # The pack() below can't cope with surrogates. XXX This may 7253 # no longer be true 7254 if ($code_point >= 0xD800 && $code_point <= 0xDFFF) { 7255 Carp::my_carp("Surrogate code point '$code_point' in mapping to '$map' in $self. No map created"); 7256 next; 7257 } 7258 7259 # Generate the hash entries for these in the form that 7260 # utf8.c understands. 7261 my $tostr = ""; 7262 my $to_name = ""; 7263 my $to_chr = ""; 7264 foreach my $to (split " ", $map) { 7265 if ($to !~ /^$code_point_re$/) { 7266 Carp::my_carp("Illegal code point '$to' in mapping '$map' from $code_point in $self. No map created"); 7267 next; 7268 } 7269 $tostr .= sprintf "\\x{%s}", $to; 7270 $to = CORE::hex $to; 7271 if ($annotate) { 7272 $to_name .= " + " if $to_name; 7273 $to_chr .= main::display_chr($to); 7274 main::populate_char_info($to) 7275 if ! defined $viacode[$to]; 7276 $to_name .= $viacode[$to]; 7277 } 7278 } 7279 7280 # The unpack yields a list of the bytes that comprise the 7281 # UTF-8 of $code_point, which are each placed in \xZZ format 7282 # and output in the %s to map to $tostr, so the result looks 7283 # like: 7284 # "\xC4\xB0" => "\x{0069}\x{0307}", 7285 my $utf8 = sprintf(qq["%s" => "$tostr",], 7286 join("", map { sprintf "\\x%02X", $_ } 7287 unpack("U0C*", chr $code_point))); 7288 7289 # Add a comment so that a human reader can more easily 7290 # see what's going on. 7291 push @multi_code_point_maps, 7292 sprintf("%-45s # U+%04X", $utf8, $code_point); 7293 if (! $annotate) { 7294 $multi_code_point_maps[-1] .= " => $map"; 7295 } 7296 else { 7297 main::populate_char_info($code_point) 7298 if ! defined $viacode[$code_point]; 7299 $multi_code_point_maps[-1] .= " '" 7300 . main::display_chr($code_point) 7301 . "' => '$to_chr'; $viacode[$code_point] => $to_name"; 7302 } 7303 } 7304 } 7305 else { 7306 Carp::my_carp("Unrecognized map type '$range->type' in '$range' in $self. Not written"); 7307 } 7308 7309 return; 7310 } 7311 7312 sub pre_body($self) { 7313 # Returns the string that should be output in the file before the main 7314 # body of this table. It isn't called until the main body is 7315 # calculated, saving a pass. The string includes some hash entries 7316 # identifying the format of the body, and what the single value should 7317 # be for all ranges missing from it. It also includes any code points 7318 # which have map_types that don't go in the main table. 7319 7320 my $addr = do { no overloading; pack 'J', $self; }; 7321 7322 my $name = $self->property->swash_name; 7323 7324 # Currently there is nothing in the pre_body unless a swash is being 7325 # generated. 7326 return unless defined $name; 7327 7328 if (defined $swash_keys{$name}) { 7329 Carp::my_carp(main::join_lines(<<END 7330Already created a swash name '$name' for $swash_keys{$name}. This means that 7331the same name desired for $self shouldn't be used. Bad News. This must be 7332fixed before production use, but proceeding anyway 7333END 7334 )); 7335 } 7336 $swash_keys{$name} = "$self"; 7337 7338 my $pre_body = ""; 7339 7340 # Here we assume we were called after have gone through the whole 7341 # file. If we actually generated anything for each map type, add its 7342 # respective header and trailer 7343 my $specials_name = ""; 7344 if (@multi_code_point_maps) { 7345 $specials_name = "Unicode::UCD::ToSpec$name"; 7346 $pre_body .= <<END; 7347 7348# Some code points require special handling because their mappings are each to 7349# multiple code points. These do not appear in the main body, but are defined 7350# in the hash below. 7351 7352# Each key is the string of N bytes that together make up the UTF-8 encoding 7353# for the code point. (i.e. the same as looking at the code point's UTF-8 7354# under "use bytes"). Each value is the UTF-8 of the translation, for speed. 7355\%$specials_name = ( 7356END 7357 $pre_body .= join("\n", @multi_code_point_maps) . "\n);\n"; 7358 } 7359 7360 my $format = $self->format; 7361 7362 my $return = ""; 7363 7364 my $output_adjusted = ($self->to_output_map == $OUTPUT_ADJUSTED); 7365 if ($output_adjusted) { 7366 if ($specials_name) { 7367 $return .= <<END; 7368# The mappings in the non-hash portion of this file must be modified to get the 7369# correct values by adding the code point ordinal number to each one that is 7370# numeric. 7371END 7372 } 7373 else { 7374 $return .= <<END; 7375# The mappings must be modified to get the correct values by adding the code 7376# point ordinal number to each one that is numeric. 7377END 7378 } 7379 } 7380 7381 $return .= <<END; 7382 7383# The name this table is to be known by, with the format of the mappings in 7384# the main body of the table, and what all code points missing from this file 7385# map to. 7386\$Unicode::UCD::SwashInfo{'To$name'}{'format'} = '$format'; # $map_table_formats{$format} 7387END 7388 if ($specials_name) { 7389 $return .= <<END; 7390\$Unicode::UCD::SwashInfo{'To$name'}{'specials_name'} = '$specials_name'; # Name of hash of special mappings 7391END 7392 } 7393 my $default_map = $default_map{$addr}; 7394 7395 # For $CODE_POINT default maps and using adjustments, instead the default 7396 # becomes zero. 7397 $return .= "\$Unicode::UCD::SwashInfo{'To$name'}{'missing'} = '" 7398 . (($output_adjusted && $default_map eq $CODE_POINT) 7399 ? "0" 7400 : $default_map) 7401 . "';"; 7402 7403 if ($default_map eq $CODE_POINT) { 7404 $return .= ' # code point maps to itself'; 7405 } 7406 elsif ($default_map eq "") { 7407 $return .= ' # code point maps to the empty string'; 7408 } 7409 $return .= "\n"; 7410 7411 $return .= $pre_body; 7412 7413 return $return; 7414 } 7415 7416 sub write($self) { 7417 # Write the table to the file. 7418 7419 my $addr = do { no overloading; pack 'J', $self; }; 7420 7421 # Clear the temporaries 7422 undef @multi_code_point_maps; 7423 7424 # Calculate the format of the table if not already done. 7425 my $format = $self->format; 7426 my $type = $self->property->type; 7427 my $default_map = $self->default_map; 7428 if (! defined $format) { 7429 if ($type == $BINARY) { 7430 7431 # Don't bother checking the values, because we elsewhere 7432 # verify that a binary table has only 2 values. 7433 $format = $BINARY_FORMAT; 7434 } 7435 else { 7436 my @ranges = $self->_range_list->ranges; 7437 7438 # default an empty table based on its type and default map 7439 if (! @ranges) { 7440 7441 # But it turns out that the only one we can say is a 7442 # non-string (besides binary, handled above) is when the 7443 # table is a string and the default map is to a code point 7444 if ($type == $STRING && $default_map eq $CODE_POINT) { 7445 $format = $HEX_FORMAT; 7446 } 7447 else { 7448 $format = $STRING_FORMAT; 7449 } 7450 } 7451 else { 7452 7453 # Start with the most restrictive format, and as we find 7454 # something that doesn't fit with that, change to the next 7455 # most restrictive, and so on. 7456 $format = $DECIMAL_FORMAT; 7457 foreach my $range (@ranges) { 7458 next if $range->type != 0; # Non-normal ranges don't 7459 # affect the main body 7460 my $map = $range->value; 7461 if ($map ne $default_map) { 7462 last if $format eq $STRING_FORMAT; # already at 7463 # least 7464 # restrictive 7465 $format = $INTEGER_FORMAT 7466 if $format eq $DECIMAL_FORMAT 7467 && $map !~ / ^ [0-9] $ /x; 7468 $format = $FLOAT_FORMAT 7469 if $format eq $INTEGER_FORMAT 7470 && $map !~ / ^ -? [0-9]+ $ /x; 7471 $format = $RATIONAL_FORMAT 7472 if $format eq $FLOAT_FORMAT 7473 && $map !~ / ^ -? [0-9]+ \. [0-9]* $ /x; 7474 $format = $HEX_FORMAT 7475 if ($format eq $RATIONAL_FORMAT 7476 && $map !~ 7477 m/ ^ -? [0-9]+ ( \/ [0-9]+ )? $ /x) 7478 # Assume a leading zero means hex, 7479 # even if all digits are 0-9 7480 || ($format eq $INTEGER_FORMAT 7481 && $map =~ /^0[0-9A-F]/); 7482 $format = $STRING_FORMAT if $format eq $HEX_FORMAT 7483 && $map =~ /[^0-9A-F]/; 7484 } 7485 } 7486 } 7487 } 7488 } # end of calculating format 7489 7490 if ($default_map eq $CODE_POINT 7491 && $format ne $HEX_FORMAT 7492 && ! defined $self->format) # manual settings are always 7493 # considered ok 7494 { 7495 Carp::my_carp_bug("Expecting hex format for mapping table for $self, instead got '$format'") 7496 } 7497 7498 # If the output is to be adjusted, the format of the table that gets 7499 # output is actually 'a' or 'ax' instead of whatever it is stored 7500 # internally as. 7501 my $output_adjusted = ($self->to_output_map == $OUTPUT_ADJUSTED); 7502 if ($output_adjusted) { 7503 if ($default_map eq $CODE_POINT) { 7504 $format = $HEX_ADJUST_FORMAT; 7505 } 7506 else { 7507 $format = $ADJUST_FORMAT; 7508 } 7509 } 7510 7511 $self->_set_format($format); 7512 7513 return $self->SUPER::write( 7514 $output_adjusted, 7515 $default_map); # don't write defaulteds 7516 } 7517 7518 # Accessors for the underlying list that should fail if locked. 7519 for my $sub (qw( 7520 add_duplicate 7521 replace_map 7522 )) 7523 { 7524 no strict "refs"; 7525 *$sub = sub { 7526 use strict "refs"; 7527 my $self = shift; 7528 7529 return if $self->carp_if_locked; 7530 return $self->_range_list->$sub(@_); 7531 } 7532 } 7533} # End closure for Map_Table 7534 7535package Match_Table; 7536use parent '-norequire', '_Base_Table'; 7537 7538# A Match table is one which is a list of all the code points that have 7539# the same property and property value, for use in \p{property=value} 7540# constructs in regular expressions. It adds very little data to the base 7541# structure, but many methods, as these lists can be combined in many ways to 7542# form new ones. 7543# There are only a few concepts added: 7544# 1) Equivalents and Relatedness. 7545# Two tables can match the identical code points, but have different names. 7546# This always happens when there is a perl single form extension 7547# \p{IsProperty} for the Unicode compound form \P{Property=True}. The two 7548# tables are set to be related, with the Perl extension being a child, and 7549# the Unicode property being the parent. 7550# 7551# It may be that two tables match the identical code points and we don't 7552# know if they are related or not. This happens most frequently when the 7553# Block and Script properties have the exact range. But note that a 7554# revision to Unicode could add new code points to the script, which would 7555# now have to be in a different block (as the block was filled, or there 7556# would have been 'Unknown' script code points in it and they wouldn't have 7557# been identical). So we can't rely on any two properties from Unicode 7558# always matching the same code points from release to release, and thus 7559# these tables are considered coincidentally equivalent--not related. When 7560# two tables are unrelated but equivalent, one is arbitrarily chosen as the 7561# 'leader', and the others are 'equivalents'. This concept is useful 7562# to minimize the number of tables written out. Only one file is used for 7563# any identical set of code points, with entries in UCD.pl mapping all 7564# the involved tables to it. 7565# 7566# Related tables will always be identical; we set them up to be so. Thus 7567# if the Unicode one is deprecated, the Perl one will be too. Not so for 7568# unrelated tables. Relatedness makes generating the documentation easier. 7569# 7570# 2) Complement. 7571# Like equivalents, two tables may be the inverses of each other, the 7572# intersection between them is null, and the union is every Unicode code 7573# point. The two tables that occupy a binary property are necessarily like 7574# this. By specifying one table as the complement of another, we can avoid 7575# storing it on disk (using the other table and performing a fast 7576# transform), and some memory and calculations. 7577# 7578# 3) Conflicting. It may be that there will eventually be name clashes, with 7579# the same name meaning different things. For a while, there actually were 7580# conflicts, but they have so far been resolved by changing Perl's or 7581# Unicode's definitions to match the other, but when this code was written, 7582# it wasn't clear that that was what was going to happen. (Unicode changed 7583# because of protests during their beta period.) Name clashes are warned 7584# about during compilation, and the documentation. The generated tables 7585# are sane, free of name clashes, because the code suppresses the Perl 7586# version. But manual intervention to decide what the actual behavior 7587# should be may be required should this happen. The introductory comments 7588# have more to say about this. 7589# 7590# 4) Definition. This is a string for human consumption that specifies the 7591# code points that this table matches. This is used only for the generated 7592# pod file. It may be specified explicitly, or automatically computed. 7593# Only the first portion of complicated definitions is computed and 7594# displayed. 7595 7596sub standardize { return main::standardize($_[0]); } 7597sub trace { return main::trace(@_); } 7598 7599 7600{ # Closure 7601 7602 main::setup_package(); 7603 7604 my %leader; 7605 # The leader table of this one; initially $self. 7606 main::set_access('leader', \%leader, 'r'); 7607 7608 my %equivalents; 7609 # An array of any tables that have this one as their leader 7610 main::set_access('equivalents', \%equivalents, 'readable_array'); 7611 7612 my %parent; 7613 # The parent table to this one, initially $self. This allows us to 7614 # distinguish between equivalent tables that are related (for which this 7615 # is set to), and those which may not be, but share the same output file 7616 # because they match the exact same set of code points in the current 7617 # Unicode release. 7618 main::set_access('parent', \%parent, 'r'); 7619 7620 my %children; 7621 # An array of any tables that have this one as their parent 7622 main::set_access('children', \%children, 'readable_array'); 7623 7624 my %conflicting; 7625 # Array of any tables that would have the same name as this one with 7626 # a different meaning. This is used for the generated documentation. 7627 main::set_access('conflicting', \%conflicting, 'readable_array'); 7628 7629 my %matches_all; 7630 # Set in the constructor for tables that are expected to match all code 7631 # points. 7632 main::set_access('matches_all', \%matches_all, 'r'); 7633 7634 my %complement; 7635 # Points to the complement that this table is expressed in terms of; 0 if 7636 # none. 7637 main::set_access('complement', \%complement, 'r'); 7638 7639 my %definition; 7640 # Human readable string of the first few ranges of code points matched by 7641 # this table 7642 main::set_access('definition', \%definition, 'r', 's'); 7643 7644 sub new { 7645 my $class = shift; 7646 7647 my %args = @_; 7648 7649 # The property for which this table is a listing of property values. 7650 my $property = delete $args{'_Property'}; 7651 7652 my $name = delete $args{'Name'}; 7653 my $full_name = delete $args{'Full_Name'}; 7654 $full_name = $name if ! defined $full_name; 7655 7656 # Optional 7657 my $initialize = delete $args{'Initialize'}; 7658 my $matches_all = delete $args{'Matches_All'} || 0; 7659 my $format = delete $args{'Format'}; 7660 my $definition = delete $args{'Definition'} // ""; 7661 # Rest of parameters passed on. 7662 7663 my $range_list = Range_List->new(Initialize => $initialize, 7664 Owner => $property); 7665 7666 my $complete = $full_name; 7667 $complete = '""' if $complete eq ""; # A null name shouldn't happen, 7668 # but this helps debug if it 7669 # does 7670 # The complete name for a match table includes it's property in a 7671 # compound form 'property=table', except if the property is the 7672 # pseudo-property, perl, in which case it is just the single form, 7673 # 'table' (If you change the '=' must also change the ':' in lots of 7674 # places in this program that assume an equal sign) 7675 $complete = $property->full_name . "=$complete" if $property != $perl; 7676 7677 my $self = $class->SUPER::new(%args, 7678 Name => $name, 7679 Complete_Name => $complete, 7680 Full_Name => $full_name, 7681 _Property => $property, 7682 _Range_List => $range_list, 7683 Format => $EMPTY_FORMAT, 7684 Write_As_Invlist => 1, 7685 ); 7686 my $addr = do { no overloading; pack 'J', $self; }; 7687 7688 $conflicting{$addr} = [ ]; 7689 $equivalents{$addr} = [ ]; 7690 $children{$addr} = [ ]; 7691 $matches_all{$addr} = $matches_all; 7692 $leader{$addr} = $self; 7693 $parent{$addr} = $self; 7694 $complement{$addr} = 0; 7695 $definition{$addr} = $definition; 7696 7697 if (defined $format && $format ne $EMPTY_FORMAT) { 7698 Carp::my_carp_bug("'Format' must be '$EMPTY_FORMAT' in a match table instead of '$format'. Using '$EMPTY_FORMAT'"); 7699 } 7700 7701 return $self; 7702 } 7703 7704 # See this program's beginning comment block about overloading these. 7705 use overload 7706 fallback => 0, 7707 qw("") => "_operator_stringify", 7708 '=' => sub { 7709 my $self = shift; 7710 7711 return if $self->carp_if_locked; 7712 return $self; 7713 }, 7714 7715 '+' => sub { 7716 my $self = shift; 7717 my $other = shift; 7718 7719 return $self->_range_list + $other; 7720 }, 7721 '&' => sub { 7722 my $self = shift; 7723 my $other = shift; 7724 7725 return $self->_range_list & $other; 7726 }, 7727 '+=' => sub { 7728 my $self = shift; 7729 my $other = shift; 7730 my $reversed = shift; 7731 7732 if ($reversed) { 7733 Carp::my_carp_bug("Bad news. Can't cope with '" 7734 . ref($other) 7735 . ' += ' 7736 . ref($self) 7737 . "'. undef returned."); 7738 return; 7739 } 7740 7741 return if $self->carp_if_locked; 7742 7743 my $addr = do { no overloading; pack 'J', $self; }; 7744 7745 if (ref $other) { 7746 7747 # Change the range list of this table to be the 7748 # union of the two. 7749 $self->_set_range_list($self->_range_list 7750 + $other); 7751 } 7752 else { # $other is just a simple value 7753 $self->add_range($other, $other); 7754 } 7755 return $self; 7756 }, 7757 '&=' => sub { 7758 my $self = shift; 7759 my $other = shift; 7760 my $reversed = shift; 7761 7762 if ($reversed) { 7763 Carp::my_carp_bug("Bad news. Can't cope with '" 7764 . ref($other) 7765 . ' &= ' 7766 . ref($self) 7767 . "'. undef returned."); 7768 return; 7769 } 7770 7771 return if $self->carp_if_locked; 7772 $self->_set_range_list($self->_range_list & $other); 7773 return $self; 7774 }, 7775 '-' => sub { my $self = shift; 7776 my $other = shift; 7777 my $reversed = shift; 7778 if ($reversed) { 7779 Carp::my_carp_bug("Bad news. Can't cope with '" 7780 . ref($other) 7781 . ' - ' 7782 . ref($self) 7783 . "'. undef returned."); 7784 return; 7785 } 7786 7787 return $self->_range_list - $other; 7788 }, 7789 '~' => sub { my $self = shift; 7790 return ~ $self->_range_list; 7791 }, 7792 ; 7793 7794 sub _operator_stringify($self, $other="", $reversed=0) { 7795 7796 my $name = $self->complete_name; 7797 return "Table '$name'"; 7798 } 7799 7800 sub _range_list { 7801 # Returns the range list associated with this table, which will be the 7802 # complement's if it has one. 7803 7804 my $self = shift; 7805 my $complement = $self->complement; 7806 7807 # In order to avoid re-complementing on each access, only do the 7808 # complement the first time, and store the result in this table's 7809 # range list to use henceforth. However, this wouldn't work if the 7810 # controlling (complement) table changed after we do this, so lock it. 7811 # Currently, the value of the complement isn't needed until after it 7812 # is fully constructed, so this works. If this were to change, the 7813 # each_range iteration functionality would no longer work on this 7814 # complement. 7815 if ($complement != 0 && $self->SUPER::_range_list->count == 0) { 7816 $self->_set_range_list($self->SUPER::_range_list 7817 + ~ $complement->_range_list); 7818 $complement->lock; 7819 } 7820 7821 return $self->SUPER::_range_list; 7822 } 7823 7824 sub add_alias { 7825 # Add a synonym for this table. See the comments in the base class 7826 7827 my $self = shift; 7828 my $name = shift; 7829 # Rest of parameters passed on. 7830 7831 $self->SUPER::add_alias($name, $self, @_); 7832 return; 7833 } 7834 7835 sub add_conflicting { 7836 # Add the name of some other object to the list of ones that name 7837 # clash with this match table. 7838 7839 my $self = shift; 7840 my $conflicting_name = shift; # The name of the conflicting object 7841 my $p = shift || 'p'; # Optional, is this a \p{} or \P{} ? 7842 my $conflicting_object = shift; # Optional, the conflicting object 7843 # itself. This is used to 7844 # disambiguate the text if the input 7845 # name is identical to any of the 7846 # aliases $self is known by. 7847 # Sometimes the conflicting object is 7848 # merely hypothetical, so this has to 7849 # be an optional parameter. 7850 Carp::carp_extra_args(\@_) if main::DEBUG && @_; 7851 7852 my $addr = do { no overloading; pack 'J', $self; }; 7853 7854 # Check if the conflicting name is exactly the same as any existing 7855 # alias in this table (as long as there is a real object there to 7856 # disambiguate with). 7857 if (defined $conflicting_object) { 7858 foreach my $alias ($self->aliases) { 7859 if (standardize($alias->name) eq standardize($conflicting_name)) { 7860 7861 # Here, there is an exact match. This results in 7862 # ambiguous comments, so disambiguate by changing the 7863 # conflicting name to its object's complete equivalent. 7864 $conflicting_name = $conflicting_object->complete_name; 7865 last; 7866 } 7867 } 7868 } 7869 7870 # Convert to the \p{...} final name 7871 $conflicting_name = "\\$p" . "{$conflicting_name}"; 7872 7873 # Only add once 7874 return if grep { $conflicting_name eq $_ } @{$conflicting{$addr}}; 7875 7876 push @{$conflicting{$addr}}, $conflicting_name; 7877 7878 return; 7879 } 7880 7881 sub is_set_equivalent_to($self, $other=undef) { 7882 # Return boolean of whether or not the other object is a table of this 7883 # type and has been marked equivalent to this one. 7884 7885 return 0 if ! defined $other; # Can happen for incomplete early 7886 # releases 7887 unless ($other->isa(__PACKAGE__)) { 7888 my $ref_other = ref $other; 7889 my $ref_self = ref $self; 7890 Carp::my_carp_bug("Argument to 'is_set_equivalent_to' must be another $ref_self, not a '$ref_other'. $other not set equivalent to $self."); 7891 return 0; 7892 } 7893 7894 # Two tables are equivalent if they have the same leader. 7895 no overloading; 7896 return $leader{pack 'J', $self} == $leader{pack 'J', $other}; 7897 return; 7898 } 7899 7900 sub set_equivalent_to { 7901 # Set $self equivalent to the parameter table. 7902 # The required Related => 'x' parameter is a boolean indicating 7903 # whether these tables are related or not. If related, $other becomes 7904 # the 'parent' of $self; if unrelated it becomes the 'leader' 7905 # 7906 # Related tables share all characteristics except names; equivalents 7907 # not quite so many. 7908 # If they are related, one must be a perl extension. This is because 7909 # we can't guarantee that Unicode won't change one or the other in a 7910 # later release even if they are identical now. 7911 7912 my $self = shift; 7913 my $other = shift; 7914 7915 my %args = @_; 7916 my $related = delete $args{'Related'}; 7917 7918 Carp::carp_extra_args(\%args) if main::DEBUG && %args; 7919 7920 return if ! defined $other; # Keep on going; happens in some early 7921 # Unicode releases. 7922 7923 if (! defined $related) { 7924 Carp::my_carp_bug("set_equivalent_to must have 'Related => [01] parameter. Assuming $self is not related to $other"); 7925 $related = 0; 7926 } 7927 7928 # If already are equivalent, no need to re-do it; if subroutine 7929 # returns null, it found an error, also do nothing 7930 my $are_equivalent = $self->is_set_equivalent_to($other); 7931 return if ! defined $are_equivalent || $are_equivalent; 7932 7933 my $addr = do { no overloading; pack 'J', $self; }; 7934 my $current_leader = ($related) ? $parent{$addr} : $leader{$addr}; 7935 7936 if ($related) { 7937 if ($current_leader->perl_extension) { 7938 if ($other->perl_extension) { 7939 Carp::my_carp_bug("Use add_alias() to set two Perl tables '$self' and '$other', equivalent."); 7940 return; 7941 } 7942 } elsif ($self->property != $other->property # Depending on 7943 # situation, might 7944 # be better to use 7945 # add_alias() 7946 # instead for same 7947 # property 7948 && ! $other->perl_extension 7949 7950 # We allow the sc and scx properties to be marked as 7951 # related. They are in fact related, and this allows 7952 # the pod to show that better. This test isn't valid 7953 # if this is an early Unicode release without the scx 7954 # property (having that also implies the sc property 7955 # exists, so don't have to test for no 'sc') 7956 && ( ! defined $scx 7957 && ! ( ( $self->property == $script 7958 || $self->property == $scx) 7959 && ( $self->property == $script 7960 || $self->property == $scx)))) 7961 { 7962 Carp::my_carp_bug("set_equivalent_to should have 'Related => 0 for equivalencing two Unicode properties. Assuming $self is not related to $other"); 7963 $related = 0; 7964 } 7965 } 7966 7967 if (! $self->is_empty && ! $self->matches_identically_to($other)) { 7968 Carp::my_carp_bug("$self should be empty or match identically to $other. Not setting equivalent"); 7969 return; 7970 } 7971 7972 my $leader = do { no overloading; pack 'J', $current_leader; }; 7973 my $other_addr = do { no overloading; pack 'J', $other; }; 7974 7975 # Any tables that are equivalent to or children of this table must now 7976 # instead be equivalent to or (children) to the new leader (parent), 7977 # still equivalent. The equivalency includes their matches_all info, 7978 # and for related tables, their fate and status. 7979 # All related tables are of necessity equivalent, but the converse 7980 # isn't necessarily true 7981 my $status = $other->status; 7982 my $status_info = $other->status_info; 7983 my $fate = $other->fate; 7984 my $matches_all = $matches_all{other_addr}; 7985 my $caseless_equivalent = $other->caseless_equivalent; 7986 foreach my $table ($current_leader, @{$equivalents{$leader}}) { 7987 next if $table == $other; 7988 trace "setting $other to be the leader of $table, status=$status" if main::DEBUG && $to_trace; 7989 7990 my $table_addr = do { no overloading; pack 'J', $table; }; 7991 $leader{$table_addr} = $other; 7992 $matches_all{$table_addr} = $matches_all; 7993 $self->_set_range_list($other->_range_list); 7994 push @{$equivalents{$other_addr}}, $table; 7995 if ($related) { 7996 $parent{$table_addr} = $other; 7997 push @{$children{$other_addr}}, $table; 7998 $table->set_status($status, $status_info); 7999 8000 # This reason currently doesn't get exposed outside; otherwise 8001 # would have to look up the parent's reason and use it instead. 8002 $table->set_fate($fate, "Parent's fate"); 8003 8004 $self->set_caseless_equivalent($caseless_equivalent); 8005 } 8006 } 8007 8008 # Now that we've declared these to be equivalent, any changes to one 8009 # of the tables would invalidate that equivalency. 8010 $self->lock; 8011 $other->lock; 8012 return; 8013 } 8014 8015 sub set_complement($self, $other) { 8016 # Set $self to be the complement of the parameter table. $self is 8017 # locked, as what it contains should all come from the other table. 8018 8019 if ($other->complement != 0) { 8020 Carp::my_carp_bug("Can't set $self to be the complement of $other, which itself is the complement of " . $other->complement); 8021 return; 8022 } 8023 my $addr = do { no overloading; pack 'J', $self; }; 8024 $complement{$addr} = $other; 8025 8026 # Be sure the other property knows we are depending on them; or the 8027 # other table if it is one in the current property. 8028 if ($self->property != $other->property) { 8029 $other->property->set_has_dependency(1); 8030 } 8031 else { 8032 $other->set_has_dependency(1); 8033 } 8034 $self->lock; 8035 return; 8036 } 8037 8038 sub add_range($self, @range) { # Add a range to the list for this table. 8039 # Rest of parameters passed on 8040 8041 return if $self->carp_if_locked; 8042 return $self->_range_list->add_range(@range); 8043 } 8044 8045 sub header($self) { 8046 # All match tables are to be used only by the Perl core. 8047 return $self->SUPER::header() . $INTERNAL_ONLY_HEADER; 8048 } 8049 8050 sub pre_body { # Does nothing for match tables. 8051 return 8052 } 8053 8054 sub append_to_body { # Does nothing for match tables. 8055 return 8056 } 8057 8058 sub set_fate($self, $fate, $reason=undef) { 8059 $self->SUPER::set_fate($fate, $reason); 8060 8061 # All children share this fate 8062 foreach my $child ($self->children) { 8063 $child->set_fate($fate, $reason); 8064 } 8065 return; 8066 } 8067 8068 sub calculate_table_definition 8069 { 8070 # Returns a human-readable string showing some or all of the code 8071 # points matched by this table. The string will include a 8072 # bracketed-character class for all characters matched in the 00-FF 8073 # range, and the first few ranges matched beyond that. 8074 my $max_ranges = 6; 8075 8076 my $self = shift; 8077 my $definition = $self->definition || ""; 8078 8079 # Skip this if already have a definition. 8080 return $definition if $definition; 8081 8082 my $lows_string = ""; # The string representation of the 0-FF 8083 # characters 8084 my $string_range = ""; # The string rep. of the above FF ranges 8085 my $range_count = 0; # How many ranges in $string_rage 8086 8087 my @lows_invlist; # The inversion list of the 0-FF code points 8088 my $first_non_control = ord(" "); # Everything below this is a 8089 # control, on ASCII or EBCDIC 8090 my $max_table_code_point = $self->max; 8091 8092 # On ASCII platforms, the range 80-FF contains no printables. 8093 my $highest_printable = ((main::NON_ASCII_PLATFORM) ? 255 : 126); 8094 8095 8096 # Look through the first few ranges matched by this table. 8097 $self->reset_each_range; # Defensive programming 8098 while (defined (my $range = $self->each_range())) { 8099 my $start = $range->start; 8100 my $end = $range->end; 8101 8102 # Accumulate an inversion list of the 00-FF code points 8103 if ($start < 256 && ($start > 0 || $end < 256)) { 8104 push @lows_invlist, $start; 8105 push @lows_invlist, 1 + (($end < 256) ? $end : 255); 8106 8107 # Get next range if there are more ranges below 256 8108 next if $end < 256 && $end < $max_table_code_point; 8109 8110 # If the range straddles the 255/256 boundary, we split it 8111 # there. We already added above the low portion to the 8112 # inversion list 8113 $start = 256 if $end > 256; 8114 } 8115 8116 # Here, @lows_invlist contains the code points below 256, and 8117 # there is no other range, or the current one starts at or above 8118 # 256. Generate the [char class] for the 0-255 ones. 8119 while (@lows_invlist) { 8120 8121 # If this range (necessarily the first one, by the way) starts 8122 # at 0 ... 8123 if ($lows_invlist[0] == 0) { 8124 8125 # If it ends within the block of controls, that means that 8126 # some controls are in it and some aren't. Since Unicode 8127 # properties pretty much only know about a few of the 8128 # controls, like \n, \t, this means that its one of them 8129 # that isn't in the range. Complement the inversion list 8130 # which will likely cause these to be output using their 8131 # mnemonics, hence being clearer. 8132 if ($lows_invlist[1] < $first_non_control) { 8133 $lows_string .= '^'; 8134 shift @lows_invlist; 8135 push @lows_invlist, 256; 8136 } 8137 elsif ($lows_invlist[1] <= $highest_printable) { 8138 8139 # Here, it extends into the printables block. Split 8140 # into two ranges so that the controls are separate. 8141 $lows_string .= sprintf "\\x00-\\x%02x", 8142 $first_non_control - 1; 8143 $lows_invlist[0] = $first_non_control; 8144 } 8145 } 8146 8147 # If the range completely contains the printables, don't 8148 # individually spell out the printables. 8149 if ( $lows_invlist[0] <= $first_non_control 8150 && $lows_invlist[1] > $highest_printable) 8151 { 8152 $lows_string .= sprintf "\\x%02x-\\x%02x", 8153 $lows_invlist[0], $lows_invlist[1] - 1; 8154 shift @lows_invlist; 8155 shift @lows_invlist; 8156 next; 8157 } 8158 8159 # Here, the range may include some but not all printables. 8160 # Look at each one individually 8161 foreach my $ord (shift @lows_invlist .. shift(@lows_invlist) - 1) { 8162 my $char = chr $ord; 8163 8164 # If there is already something in the list, an 8165 # alphanumeric char could be the next in sequence. If so, 8166 # we start or extend a range. That is, we could have so 8167 # far something like 'a-c', and the next char is a 'd', so 8168 # we change it to 'a-d'. We use native_to_unicode() 8169 # because a-z on EBCDIC means 26 chars, and excludes the 8170 # gap ones. 8171 if ($lows_string ne "" && $char =~ /[[:alnum:]]/) { 8172 my $prev = substr($lows_string, -1); 8173 if ( $prev !~ /[[:alnum:]]/ 8174 || utf8::native_to_unicode(ord $prev) + 1 8175 != utf8::native_to_unicode(ord $char)) 8176 { 8177 # Not extending the range 8178 $lows_string .= $char; 8179 } 8180 elsif ( length $lows_string > 1 8181 && substr($lows_string, -2, 1) eq '-') 8182 { 8183 # We had a sequence like '-c' and the current 8184 # character is 'd'. Extend the range. 8185 substr($lows_string, -1, 1) = $char; 8186 } 8187 else { 8188 # We had something like 'd' and this is 'e'. 8189 # Start a range. 8190 $lows_string .= "-$char"; 8191 } 8192 } 8193 elsif ($char =~ /[[:graph:]]/) { 8194 8195 # We output a graphic char as-is, preceded by a 8196 # backslash if it is a metacharacter 8197 $lows_string .= '\\' 8198 if $char =~ /[\\\^\$\@\%\|()\[\]\{\}\-\/"']/; 8199 $lows_string .= $char; 8200 } # Otherwise use mnemonic for any that have them 8201 elsif ($char =~ /[\a]/) { 8202 $lows_string .= '\a'; 8203 } 8204 elsif ($char =~ /[\b]/) { 8205 $lows_string .= '\b'; 8206 } 8207 elsif ($char eq "\e") { 8208 $lows_string .= '\e'; 8209 } 8210 elsif ($char eq "\f") { 8211 $lows_string .= '\f'; 8212 } 8213 elsif ($char eq "\cK") { 8214 $lows_string .= '\cK'; 8215 } 8216 elsif ($char eq "\n") { 8217 $lows_string .= '\n'; 8218 } 8219 elsif ($char eq "\r") { 8220 $lows_string .= '\r'; 8221 } 8222 elsif ($char eq "\t") { 8223 $lows_string .= '\t'; 8224 } 8225 else { 8226 8227 # Here is a non-graphic without a mnemonic. We use \x 8228 # notation. But if the ordinal of this is one above 8229 # the previous, create or extend the range 8230 my $hex_representation = sprintf("%02x", ord $char); 8231 if ( length $lows_string >= 4 8232 && substr($lows_string, -4, 2) eq '\\x' 8233 && hex(substr($lows_string, -2)) + 1 == ord $char) 8234 { 8235 if ( length $lows_string >= 5 8236 && substr($lows_string, -5, 1) eq '-' 8237 && ( length $lows_string == 5 8238 || substr($lows_string, -6, 1) ne '\\')) 8239 { 8240 substr($lows_string, -2) = $hex_representation; 8241 } 8242 else { 8243 $lows_string .= '-\\x' . $hex_representation; 8244 } 8245 } 8246 else { 8247 $lows_string .= '\\x' . $hex_representation; 8248 } 8249 } 8250 } 8251 } 8252 8253 # Done with assembling the string of all lows. If there are only 8254 # lows in the property, are completely done. 8255 if ($max_table_code_point < 256) { 8256 $self->reset_each_range; 8257 last; 8258 } 8259 8260 # Otherwise, quit if reached max number of non-lows ranges. If 8261 # there are lows, count them as one unit towards the maximum. 8262 $range_count++; 8263 if ($range_count > (($lows_string eq "") ? $max_ranges : $max_ranges - 1)) { 8264 $string_range .= " ..."; 8265 $self->reset_each_range; 8266 last; 8267 } 8268 8269 # Otherwise add this range. 8270 $string_range .= ", " if $string_range ne ""; 8271 if ($start == $end) { 8272 $string_range .= sprintf("U+%04X", $start); 8273 } 8274 elsif ($end >= $MAX_WORKING_CODEPOINT) { 8275 $string_range .= sprintf("U+%04X..infinity", $start); 8276 } 8277 else { 8278 $string_range .= sprintf("U+%04X..%04X", 8279 $start, $end); 8280 } 8281 } 8282 8283 # Done with all the ranges we're going to look at. Assemble the 8284 # definition from the lows + non-lows. 8285 8286 if ($lows_string ne "" || $string_range ne "") { 8287 if ($lows_string ne "") { 8288 $definition .= "[$lows_string]"; 8289 $definition .= ", " if $string_range; 8290 } 8291 $definition .= $string_range; 8292 } 8293 8294 return $definition; 8295 } 8296 8297 sub write($self) { 8298 return $self->SUPER::write(0); # No adjustments 8299 } 8300 8301 # $leader - Should only be called on the leader table of an equivalent group 8302 sub set_final_comment($leader) { 8303 # This creates a comment for the file that is to hold the match table 8304 # $self. It is somewhat convoluted to make the English read nicely, 8305 # but, heh, it's just a comment. 8306 # This should be called only with the leader match table of all the 8307 # ones that share the same file. It lists all such tables, ordered so 8308 # that related ones are together. 8309 8310 return unless $debugging_build; 8311 8312 my $addr = do { no overloading; pack 'J', $leader; }; 8313 8314 if ($leader{$addr} != $leader) { 8315 Carp::my_carp_bug(<<END 8316set_final_comment() must be called on a leader table, which $leader is not. 8317It is equivalent to $leader{$addr}. No comment created 8318END 8319 ); 8320 return; 8321 } 8322 8323 # Get the number of code points matched by each of the tables in this 8324 # file, and add underscores for clarity. 8325 my $count = $leader->count; 8326 my $unicode_count; 8327 my $non_unicode_string; 8328 if ($count > $MAX_UNICODE_CODEPOINTS) { 8329 $unicode_count = $count - ($MAX_WORKING_CODEPOINT 8330 - $MAX_UNICODE_CODEPOINT); 8331 $non_unicode_string = "All above-Unicode code points match as well, and are also returned"; 8332 } 8333 else { 8334 $unicode_count = $count; 8335 $non_unicode_string = ""; 8336 } 8337 my $string_count = main::clarify_code_point_count($unicode_count); 8338 8339 my $loose_count = 0; # how many aliases loosely matched 8340 my $compound_name = ""; # ? Are any names compound?, and if so, an 8341 # example 8342 my $properties_with_compound_names = 0; # count of these 8343 8344 8345 my %flags; # The status flags used in the file 8346 my $total_entries = 0; # number of entries written in the comment 8347 my $matches_comment = ""; # The portion of the comment about the 8348 # \p{}'s 8349 my @global_comments; # List of all the tables' comments that are 8350 # there before this routine was called. 8351 my $has_ucd_alias = 0; # If there is an alias that is accessible via 8352 # Unicode::UCD. If not, then don't say it is 8353 # in the comment 8354 8355 # Get list of all the parent tables that are equivalent to this one 8356 # (including itself). 8357 my @parents = grep { $parent{main::objaddr $_} == $_ } 8358 main::uniques($leader, @{$equivalents{$addr}}); 8359 my $has_unrelated = (@parents >= 2); # boolean, ? are there unrelated 8360 # tables 8361 for my $parent (@parents) { 8362 8363 my $property = $parent->property; 8364 8365 # Special case 'N' tables in properties with two match tables when 8366 # the other is a 'Y' one. These are likely to be binary tables, 8367 # but not necessarily. In either case, \P{} will match the 8368 # complement of \p{}, and so if something is a synonym of \p, the 8369 # complement of that something will be the synonym of \P. This 8370 # would be true of any property with just two match tables, not 8371 # just those whose values are Y and N; but that would require a 8372 # little extra work, and there are none such so far in Unicode. 8373 my $perl_p = 'p'; # which is it? \p{} or \P{} 8374 my @yes_perl_synonyms; # list of any synonyms for the 'Y' table 8375 8376 if (scalar $property->tables == 2 8377 && $parent == $property->table('N') 8378 && defined (my $yes = $property->table('Y'))) 8379 { 8380 my $yes_addr = do { no overloading; pack 'J', $yes; }; 8381 @yes_perl_synonyms 8382 = grep { $_->property == $perl } 8383 main::uniques($yes, 8384 $parent{$yes_addr}, 8385 $parent{$yes_addr}->children); 8386 8387 # But these synonyms are \P{} ,not \p{} 8388 $perl_p = 'P'; 8389 } 8390 8391 my @description; # Will hold the table description 8392 my @note; # Will hold the table notes. 8393 my @conflicting; # Will hold the table conflicts. 8394 8395 # Look at the parent, any yes synonyms, and all the children 8396 my $parent_addr = do { no overloading; pack 'J', $parent; }; 8397 for my $table ($parent, 8398 @yes_perl_synonyms, 8399 @{$children{$parent_addr}}) 8400 { 8401 my $table_addr = do { no overloading; pack 'J', $table; }; 8402 my $table_property = $table->property; 8403 8404 # Tables are separated by a blank line to create a grouping. 8405 $matches_comment .= "\n" if $matches_comment; 8406 8407 # The table is named based on the property and value 8408 # combination it is for, like script=greek. But there may be 8409 # a number of synonyms for each side, like 'sc' for 'script', 8410 # and 'grek' for 'greek'. Any combination of these is a valid 8411 # name for this table. In this case, there are three more, 8412 # 'sc=grek', 'sc=greek', and 'script='grek'. Rather than 8413 # listing all possible combinations in the comment, we make 8414 # sure that each synonym occurs at least once, and add 8415 # commentary that the other combinations are possible. 8416 # Because regular expressions don't recognize things like 8417 # \p{jsn=}, only look at non-null right-hand-sides 8418 my @property_aliases = grep { $_->status ne $INTERNAL_ALIAS } $table_property->aliases; 8419 my @table_aliases = grep { $_->name ne "" } $table->aliases; 8420 8421 # The alias lists above are already ordered in the order we 8422 # want to output them. To ensure that each synonym is listed, 8423 # we must use the max of the two numbers. But if there are no 8424 # legal synonyms (nothing in @table_aliases), then we don't 8425 # list anything. 8426 my $listed_combos = (@table_aliases) 8427 ? main::max(scalar @table_aliases, 8428 scalar @property_aliases) 8429 : 0; 8430 trace "$listed_combos, tables=", scalar @table_aliases, "; property names=", scalar @property_aliases if main::DEBUG; 8431 8432 my $property_had_compound_name = 0; 8433 8434 for my $i (0 .. $listed_combos - 1) { 8435 $total_entries++; 8436 8437 # The current alias for the property is the next one on 8438 # the list, or if beyond the end, start over. Similarly 8439 # for the table (\p{prop=table}) 8440 my $property_alias = $property_aliases 8441 [$i % @property_aliases]->name; 8442 my $table_alias_object = $table_aliases 8443 [$i % @table_aliases]; 8444 my $table_alias = $table_alias_object->name; 8445 my $loose_match = $table_alias_object->loose_match; 8446 $has_ucd_alias |= $table_alias_object->ucd; 8447 8448 if ($table_alias !~ /\D/) { # Clarify large numbers. 8449 $table_alias = main::clarify_number($table_alias) 8450 } 8451 8452 # Add a comment for this alias combination 8453 my $current_match_comment; 8454 if ($table_property == $perl) { 8455 $current_match_comment = "\\$perl_p" 8456 . "{$table_alias}"; 8457 } 8458 else { 8459 $current_match_comment 8460 = "\\p{$property_alias=$table_alias}"; 8461 $property_had_compound_name = 1; 8462 } 8463 8464 # Flag any abnormal status for this table. 8465 my $flag = $property->status 8466 || $table->status 8467 || $table_alias_object->status; 8468 if ($flag && $flag ne $PLACEHOLDER) { 8469 $flags{$flag} = $status_past_participles{$flag}; 8470 } 8471 8472 $loose_count++; 8473 8474 # Pretty up the comment. Note the \b; it says don't make 8475 # this line a continuation. 8476 $matches_comment .= sprintf("\b%-1s%-s%s\n", 8477 $flag, 8478 " " x 7, 8479 $current_match_comment); 8480 } # End of generating the entries for this table. 8481 8482 # Save these for output after this group of related tables. 8483 push @description, $table->description; 8484 push @note, $table->note; 8485 push @conflicting, $table->conflicting; 8486 8487 # And this for output after all the tables. 8488 push @global_comments, $table->comment; 8489 8490 # Compute an alternate compound name using the final property 8491 # synonym and the first table synonym with a colon instead of 8492 # the equal sign used elsewhere. 8493 if ($property_had_compound_name) { 8494 $properties_with_compound_names ++; 8495 if (! $compound_name || @property_aliases > 1) { 8496 $compound_name = $property_aliases[-1]->name 8497 . ': ' 8498 . $table_aliases[0]->name; 8499 } 8500 } 8501 } # End of looping through all children of this table 8502 8503 # Here have assembled in $matches_comment all the related tables 8504 # to the current parent (preceded by the same info for all the 8505 # previous parents). Put out information that applies to all of 8506 # the current family. 8507 if (@conflicting) { 8508 8509 # But output the conflicting information now, as it applies to 8510 # just this table. 8511 my $conflicting = join ", ", @conflicting; 8512 if ($conflicting) { 8513 $matches_comment .= <<END; 8514 8515 Note that contrary to what you might expect, the above is NOT the same as 8516END 8517 $matches_comment .= "any of: " if @conflicting > 1; 8518 $matches_comment .= "$conflicting\n"; 8519 } 8520 } 8521 if (@description) { 8522 $matches_comment .= "\n Meaning: " 8523 . join('; ', @description) 8524 . "\n"; 8525 } 8526 if (@note) { 8527 $matches_comment .= "\n Note: " 8528 . join("\n ", @note) 8529 . "\n"; 8530 } 8531 } # End of looping through all tables 8532 8533 $matches_comment .= "\n$non_unicode_string\n" if $non_unicode_string; 8534 8535 8536 my $code_points; 8537 my $match; 8538 my $any_of_these; 8539 if ($unicode_count == 1) { 8540 $match = 'matches'; 8541 $code_points = 'single code point'; 8542 } 8543 else { 8544 $match = 'match'; 8545 $code_points = "$string_count code points"; 8546 } 8547 8548 my $synonyms; 8549 my $entries; 8550 if ($total_entries == 1) { 8551 $synonyms = ""; 8552 $entries = 'entry'; 8553 $any_of_these = 'this' 8554 } 8555 else { 8556 $synonyms = " any of the following regular expression constructs"; 8557 $entries = 'entries'; 8558 $any_of_these = 'any of these' 8559 } 8560 8561 my $comment = ""; 8562 if ($has_ucd_alias) { 8563 $comment .= "Use Unicode::UCD::prop_invlist() to access the contents of this file.\n\n"; 8564 } 8565 if ($has_unrelated) { 8566 $comment .= <<END; 8567This file is for tables that are not necessarily related: To conserve 8568resources, every table that matches the identical set of code points in this 8569version of Unicode uses this file. Each one is listed in a separate group 8570below. It could be that the tables will match the same set of code points in 8571other Unicode releases, or it could be purely coincidence that they happen to 8572be the same in Unicode $unicode_version, and hence may not in other versions. 8573 8574END 8575 } 8576 8577 if (%flags) { 8578 foreach my $flag (sort keys %flags) { 8579 $comment .= <<END; 8580'$flag' below means that this form is $flags{$flag}. 8581END 8582 if ($flag eq $INTERNAL_ALIAS) { 8583 $comment .= "DO NOT USE!!!"; 8584 } 8585 else { 8586 $comment .= "Consult $pod_file.pod"; 8587 } 8588 $comment .= "\n"; 8589 } 8590 $comment .= "\n"; 8591 } 8592 8593 if ($total_entries == 0) { 8594 Carp::my_carp("No regular expression construct can match $leader, as all names for it are the null string. Creating file anyway."); 8595 $comment .= <<END; 8596This file returns the $code_points in Unicode Version 8597$unicode_version for 8598$leader, but it is inaccessible through Perl regular expressions, as 8599"\\p{prop=}" is not recognized. 8600END 8601 8602 } else { 8603 $comment .= <<END; 8604This file returns the $code_points in Unicode Version 8605$unicode_version that 8606$match$synonyms: 8607 8608$matches_comment 8609$pod_file.pod should be consulted for the syntax rules for $any_of_these, 8610including if adding or subtracting white space, underscore, and hyphen 8611characters matters or doesn't matter, and other permissible syntactic 8612variants. Upper/lower case distinctions never matter. 8613END 8614 8615 } 8616 if ($compound_name) { 8617 $comment .= <<END; 8618 8619A colon can be substituted for the equals sign, and 8620END 8621 if ($properties_with_compound_names > 1) { 8622 $comment .= <<END; 8623within each group above, 8624END 8625 } 8626 $compound_name = sprintf("%-8s\\p{%s}", " ", $compound_name); 8627 8628 # Note the \b below, it says don't make that line a continuation. 8629 $comment .= <<END; 8630anything to the left of the equals (or colon) can be combined with anything to 8631the right. Thus, for example, 8632$compound_name 8633\bis also valid. 8634END 8635 } 8636 8637 # And append any comment(s) from the actual tables. They are all 8638 # gathered here, so may not read all that well. 8639 if (@global_comments) { 8640 $comment .= "\n" . join("\n\n", @global_comments) . "\n"; 8641 } 8642 8643 if ($count) { # The format differs if no code points, and needs no 8644 # explanation in that case 8645 if ($leader->write_as_invlist) { 8646 $comment.= <<END; 8647 8648The first data line of this file begins with the letter V to indicate it is in 8649inversion list format. The number following the V gives the number of lines 8650remaining. Each of those remaining lines is a single number representing the 8651starting code point of a range which goes up to but not including the number 8652on the next line; The 0th, 2nd, 4th... ranges are for code points that match 8653the property; the 1st, 3rd, 5th... are ranges of code points that don't match 8654the property. The final line's range extends to the platform's infinity. 8655END 8656 } 8657 else { 8658 $comment.= <<END; 8659The format of the lines of this file is: 8660START\\tSTOP\\twhere START is the starting code point of the range, in hex; 8661STOP is the ending point, or if omitted, the range has just one code point. 8662END 8663 } 8664 if ($leader->output_range_counts) { 8665 $comment .= <<END; 8666Numbers in comments in [brackets] indicate how many code points are in the 8667range. 8668END 8669 } 8670 } 8671 8672 $leader->set_comment(main::join_lines($comment)); 8673 return; 8674 } 8675 8676 # Accessors for the underlying list 8677 for my $sub (qw( 8678 get_valid_code_point 8679 get_invalid_code_point 8680 )) 8681 { 8682 no strict "refs"; 8683 *$sub = sub { 8684 use strict "refs"; 8685 my $self = shift; 8686 8687 return $self->_range_list->$sub(@_); 8688 } 8689 } 8690} # End closure for Match_Table 8691 8692package Property; 8693 8694# The Property class represents a Unicode property, or the $perl 8695# pseudo-property. It contains a map table initialized empty at construction 8696# time, and for properties accessible through regular expressions, various 8697# match tables, created through the add_match_table() method, and referenced 8698# by the table('NAME') or tables() methods, the latter returning a list of all 8699# of the match tables. Otherwise table operations implicitly are for the map 8700# table. 8701# 8702# Most of the data in the property is actually about its map table, so it 8703# mostly just uses that table's accessors for most methods. The two could 8704# have been combined into one object, but for clarity because of their 8705# differing semantics, they have been kept separate. It could be argued that 8706# the 'file' and 'directory' fields should be kept with the map table. 8707# 8708# Each property has a type. This can be set in the constructor, or in the 8709# set_type accessor, but mostly it is figured out by the data. Every property 8710# starts with unknown type, overridden by a parameter to the constructor, or 8711# as match tables are added, or ranges added to the map table, the data is 8712# inspected, and the type changed. After the table is mostly or entirely 8713# filled, compute_type() should be called to finalize they analysis. 8714# 8715# There are very few operations defined. One can safely remove a range from 8716# the map table, and property_add_or_replace_non_nulls() adds the maps from another 8717# table to this one, replacing any in the intersection of the two. 8718 8719sub standardize { return main::standardize($_[0]); } 8720sub trace { return main::trace(@_) if main::DEBUG && $to_trace } 8721 8722{ # Closure 8723 8724 # This hash will contain as keys, all the aliases of all properties, and 8725 # as values, pointers to their respective property objects. This allows 8726 # quick look-up of a property from any of its names. 8727 my %alias_to_property_of; 8728 8729 sub dump_alias_to_property_of { 8730 # For debugging 8731 8732 print "\n", main::simple_dumper (\%alias_to_property_of), "\n"; 8733 return; 8734 } 8735 8736 sub property_ref($name) { 8737 # This is a package subroutine, not called as a method. 8738 # If the single parameter is a literal '*' it returns a list of all 8739 # defined properties. 8740 # Otherwise, the single parameter is a name, and it returns a pointer 8741 # to the corresponding property object, or undef if none. 8742 # 8743 # Properties can have several different names. The 'standard' form of 8744 # each of them is stored in %alias_to_property_of as they are defined. 8745 # But it's possible that this subroutine will be called with some 8746 # variant, so if the initial lookup fails, it is repeated with the 8747 # standardized form of the input name. If found, besides returning the 8748 # result, the input name is added to the list so future calls won't 8749 # have to do the conversion again. 8750 8751 if (! defined $name) { 8752 Carp::my_carp_bug("Undefined input property. No action taken."); 8753 return; 8754 } 8755 8756 return main::uniques(values %alias_to_property_of) if $name eq '*'; 8757 8758 # Return cached result if have it. 8759 my $result = $alias_to_property_of{$name}; 8760 return $result if defined $result; 8761 8762 # Convert the input to standard form. 8763 my $standard_name = standardize($name); 8764 8765 $result = $alias_to_property_of{$standard_name}; 8766 return unless defined $result; # Don't cache undefs 8767 8768 # Cache the result before returning it. 8769 $alias_to_property_of{$name} = $result; 8770 return $result; 8771 } 8772 8773 8774 main::setup_package(); 8775 8776 my %map; 8777 # A pointer to the map table object for this property 8778 main::set_access('map', \%map); 8779 8780 my %full_name; 8781 # The property's full name. This is a duplicate of the copy kept in the 8782 # map table, but is needed because stringify needs it during 8783 # construction of the map table, and then would have a chicken before egg 8784 # problem. 8785 main::set_access('full_name', \%full_name, 'r'); 8786 8787 my %table_ref; 8788 # This hash will contain as keys, all the aliases of any match tables 8789 # attached to this property, and as values, the pointers to their 8790 # respective tables. This allows quick look-up of a table from any of its 8791 # names. 8792 main::set_access('table_ref', \%table_ref); 8793 8794 my %type; 8795 # The type of the property, $ENUM, $BINARY, etc 8796 main::set_access('type', \%type, 'r'); 8797 8798 my %file; 8799 # The filename where the map table will go (if actually written). 8800 # Normally defaulted, but can be overridden. 8801 main::set_access('file', \%file, 'r', 's'); 8802 8803 my %directory; 8804 # The directory where the map table will go (if actually written). 8805 # Normally defaulted, but can be overridden. 8806 main::set_access('directory', \%directory, 's'); 8807 8808 my %pseudo_map_type; 8809 # This is used to affect the calculation of the map types for all the 8810 # ranges in the table. It should be set to one of the values that signify 8811 # to alter the calculation. 8812 main::set_access('pseudo_map_type', \%pseudo_map_type, 'r'); 8813 8814 my %has_only_code_point_maps; 8815 # A boolean used to help in computing the type of data in the map table. 8816 main::set_access('has_only_code_point_maps', \%has_only_code_point_maps); 8817 8818 my %unique_maps; 8819 # A list of the first few distinct mappings this property has. This is 8820 # used to disambiguate between binary and enum property types, so don't 8821 # have to keep more than three. 8822 main::set_access('unique_maps', \%unique_maps); 8823 8824 my %pre_declared_maps; 8825 # A boolean that gives whether the input data should declare all the 8826 # tables used, or not. If the former, unknown ones raise a warning. 8827 main::set_access('pre_declared_maps', 8828 \%pre_declared_maps, 'r', 's'); 8829 8830 my %match_subdir; 8831 # For properties whose shortest names are too long for a DOS 8.3 8832 # filesystem to distinguish between, this is used to manually give short 8833 # names for the directory name immediately under $match_tables that the 8834 # match tables for this property should be placed in. 8835 main::set_access('match_subdir', \%match_subdir, 'r'); 8836 8837 my %has_dependency; 8838 # A boolean that gives whether some table somewhere is defined as the 8839 # complement of a table in this property. This is a crude, but currently 8840 # sufficient, mechanism to make this property not get destroyed before 8841 # what is dependent on it is. Other dependencies could be added, so the 8842 # name was chosen to reflect a more general situation than actually is 8843 # currently the case. 8844 main::set_access('has_dependency', \%has_dependency, 'r', 's'); 8845 8846 sub new { 8847 # The only required parameter is the positionally first, name. All 8848 # other parameters are key => value pairs. See the documentation just 8849 # above for the meanings of the ones not passed directly on to the map 8850 # table constructor. 8851 8852 my $class = shift; 8853 my $name = shift || ""; 8854 8855 my $self = property_ref($name); 8856 if (defined $self) { 8857 my $options_string = join ", ", @_; 8858 $options_string = ". Ignoring options $options_string" if $options_string; 8859 Carp::my_carp("$self is already in use. Using existing one$options_string;"); 8860 return $self; 8861 } 8862 8863 my %args = @_; 8864 8865 $self = bless \do { my $anonymous_scalar }, $class; 8866 my $addr = do { no overloading; pack 'J', $self; }; 8867 8868 $directory{$addr} = delete $args{'Directory'}; 8869 $file{$addr} = delete $args{'File'}; 8870 $full_name{$addr} = delete $args{'Full_Name'} || $name; 8871 $type{$addr} = delete $args{'Type'} || $UNKNOWN; 8872 $pseudo_map_type{$addr} = delete $args{'Map_Type'}; 8873 $pre_declared_maps{$addr} = delete $args{'Pre_Declared_Maps'} 8874 # Starting in this release, property 8875 # values should be defined for all 8876 # properties, except those overriding this 8877 // $v_version ge v5.1.0; 8878 $match_subdir{$addr} = delete $args{'Match_SubDir'}; 8879 8880 # Rest of parameters passed on. 8881 8882 $has_only_code_point_maps{$addr} = 1; 8883 $table_ref{$addr} = { }; 8884 $unique_maps{$addr} = { }; 8885 $has_dependency{$addr} = 0; 8886 8887 $map{$addr} = Map_Table->new($name, 8888 Full_Name => $full_name{$addr}, 8889 _Alias_Hash => \%alias_to_property_of, 8890 _Property => $self, 8891 %args); 8892 return $self; 8893 } 8894 8895 # See this program's beginning comment block about overloading the copy 8896 # constructor. Few operations are defined on properties, but a couple are 8897 # useful. It is safe to take the inverse of a property, and to remove a 8898 # single code point from it. 8899 use overload 8900 fallback => 0, 8901 qw("") => "_operator_stringify", 8902 "." => \&main::_operator_dot, 8903 ".=" => \&main::_operator_dot_equal, 8904 '==' => \&main::_operator_equal, 8905 '!=' => \&main::_operator_not_equal, 8906 '=' => sub { return shift }, 8907 '-=' => "_minus_and_equal", 8908 ; 8909 8910 sub _operator_stringify($self, $other="", $reversed=0) { 8911 return "Property '" . shift->full_name . "'"; 8912 } 8913 8914 sub _minus_and_equal($self, $other, $reversed=0) { 8915 # Remove a single code point from the map table of a property. 8916 if (ref $other) { 8917 Carp::my_carp_bug("Bad news. Can't cope with a " 8918 . ref($other) 8919 . " argument to '-='. Subtraction ignored."); 8920 return $self; 8921 } 8922 elsif ($reversed) { # Shouldn't happen in a -=, but just in case 8923 Carp::my_carp_bug("Bad news. Can't cope with subtracting a " 8924 . ref $self 8925 . " from a non-object. undef returned."); 8926 return; 8927 } 8928 else { 8929 no overloading; 8930 $map{pack 'J', $self}->delete_range($other, $other); 8931 } 8932 return $self; 8933 } 8934 8935 sub add_match_table { 8936 # Add a new match table for this property, with name given by the 8937 # parameter. It returns a pointer to the table. 8938 8939 my $self = shift; 8940 my $name = shift; 8941 my %args = @_; 8942 8943 my $addr = do { no overloading; pack 'J', $self; }; 8944 8945 my $table = $table_ref{$addr}{$name}; 8946 my $standard_name = main::standardize($name); 8947 if (defined $table 8948 || (defined ($table = $table_ref{$addr}{$standard_name}))) 8949 { 8950 Carp::my_carp("Table '$name' in $self is already in use. Using existing one"); 8951 $table_ref{$addr}{$name} = $table; 8952 return $table; 8953 } 8954 else { 8955 8956 # See if this is a perl extension, if not passed in. 8957 my $perl_extension = delete $args{'Perl_Extension'}; 8958 $perl_extension 8959 = $self->perl_extension if ! defined $perl_extension; 8960 8961 my $fate; 8962 my $suppression_reason = ""; 8963 if ($self->name =~ /^_/) { 8964 $fate = $SUPPRESSED; 8965 $suppression_reason = "Parent property is internal only"; 8966 } 8967 elsif ($self->fate >= $SUPPRESSED) { 8968 $fate = $self->fate; 8969 $suppression_reason = $why_suppressed{$self->complete_name}; 8970 8971 } 8972 elsif ($name =~ /^_/) { 8973 $fate = $INTERNAL_ONLY; 8974 } 8975 $table = Match_Table->new( 8976 Name => $name, 8977 Perl_Extension => $perl_extension, 8978 _Alias_Hash => $table_ref{$addr}, 8979 _Property => $self, 8980 Fate => $fate, 8981 Suppression_Reason => $suppression_reason, 8982 Status => $self->status, 8983 _Status_Info => $self->status_info, 8984 %args); 8985 return unless defined $table; 8986 } 8987 8988 # Save the names for quick look up 8989 $table_ref{$addr}{$standard_name} = $table; 8990 $table_ref{$addr}{$name} = $table; 8991 8992 # Perhaps we can figure out the type of this property based on the 8993 # fact of adding this match table. First, string properties don't 8994 # have match tables; second, a binary property can't have 3 match 8995 # tables 8996 if ($type{$addr} == $UNKNOWN) { 8997 $type{$addr} = $NON_STRING; 8998 } 8999 elsif ($type{$addr} == $STRING) { 9000 Carp::my_carp("$self Added a match table '$name' to a string property '$self'. Changed it to a non-string property. Bad News."); 9001 $type{$addr} = $NON_STRING; 9002 } 9003 elsif ($type{$addr} != $ENUM && $type{$addr} != $FORCED_BINARY) { 9004 if (scalar main::uniques(values %{$table_ref{$addr}}) > 2) { 9005 if ($type{$addr} == $BINARY) { 9006 Carp::my_carp("$self now has more than 2 tables (with the addition of '$name'), and so is no longer binary. Changing its type to 'enum'. Bad News."); 9007 } 9008 $type{$addr} = $ENUM; 9009 } 9010 } 9011 9012 return $table; 9013 } 9014 9015 sub delete_match_table($self, $table_to_remove) { 9016 # Delete the table referred to by $2 from the property $1. 9017 my $addr = do { no overloading; pack 'J', $self; }; 9018 9019 # Remove all names that refer to it. 9020 foreach my $key (keys %{$table_ref{$addr}}) { 9021 delete $table_ref{$addr}{$key} 9022 if $table_ref{$addr}{$key} == $table_to_remove; 9023 } 9024 9025 $table_to_remove->DESTROY; 9026 return; 9027 } 9028 9029 sub table($self, $name) { 9030 # Return a pointer to the match table (with name given by the 9031 # parameter) associated with this property; undef if none. 9032 my $addr = do { no overloading; pack 'J', $self; }; 9033 9034 return $table_ref{$addr}{$name} if defined $table_ref{$addr}{$name}; 9035 9036 # If quick look-up failed, try again using the standard form of the 9037 # input name. If that succeeds, cache the result before returning so 9038 # won't have to standardize this input name again. 9039 my $standard_name = main::standardize($name); 9040 return unless defined $table_ref{$addr}{$standard_name}; 9041 9042 $table_ref{$addr}{$name} = $table_ref{$addr}{$standard_name}; 9043 return $table_ref{$addr}{$name}; 9044 } 9045 9046 sub tables { 9047 # Return a list of pointers to all the match tables attached to this 9048 # property 9049 9050 no overloading; 9051 return main::uniques(values %{$table_ref{pack 'J', shift}}); 9052 } 9053 9054 sub directory { 9055 # Returns the directory the map table for this property should be 9056 # output in. If a specific directory has been specified, that has 9057 # priority; 'undef' is returned if the type isn't defined; 9058 # or $map_directory for everything else. 9059 9060 my $addr = do { no overloading; pack 'J', shift; }; 9061 9062 return $directory{$addr} if defined $directory{$addr}; 9063 return undef if $type{$addr} == $UNKNOWN; 9064 return $map_directory; 9065 } 9066 9067 sub swash_name($self) { 9068 # Return the name that is used to both: 9069 # 1) Name the file that the map table is written to. 9070 # 2) The name of swash related stuff inside that file. 9071 # The reason for this is that the Perl core historically has used 9072 # certain names that aren't the same as the Unicode property names. 9073 # To continue using these, $file is hard-coded in this file for those, 9074 # but otherwise the standard name is used. This is different from the 9075 # external_name, so that the rest of the files, like in lib can use 9076 # the standard name always, without regard to historical precedent. 9077 my $addr = do { no overloading; pack 'J', $self; }; 9078 9079 # Swash names are used only on either 9080 # 1) regular or internal-only map tables 9081 # 2) otherwise there should be no access to the 9082 # property map table from other parts of Perl. 9083 return if $map{$addr}->fate != $ORDINARY 9084 && ! ($map{$addr}->name =~ /^_/ 9085 && $map{$addr}->fate == $INTERNAL_ONLY); 9086 9087 return $file{$addr} if defined $file{$addr}; 9088 return $map{$addr}->external_name; 9089 } 9090 9091 sub to_create_match_tables($self) { 9092 # Returns a boolean as to whether or not match tables should be 9093 # created for this property. 9094 9095 # The whole point of this pseudo property is match tables. 9096 return 1 if $self == $perl; 9097 9098 my $addr = do { no overloading; pack 'J', $self; }; 9099 9100 # Don't generate tables of code points that match the property values 9101 # of a string property. Such a list would most likely have many 9102 # property values, each with just one or very few code points mapping 9103 # to it. 9104 return 0 if $type{$addr} == $STRING; 9105 9106 # Otherwise, do. 9107 return 1; 9108 } 9109 9110 sub property_add_or_replace_non_nulls($self, $other) { 9111 # This adds the mappings in the property $other to $self. Non-null 9112 # mappings from $other override those in $self. It essentially merges 9113 # the two properties, with the second having priority except for null 9114 # mappings. 9115 9116 if (! $other->isa(__PACKAGE__)) { 9117 Carp::my_carp_bug("$other should be a " 9118 . __PACKAGE__ 9119 . ". Not a '" 9120 . ref($other) 9121 . "'. Not added;"); 9122 return; 9123 } 9124 9125 no overloading; 9126 return $map{pack 'J', $self}->map_add_or_replace_non_nulls($map{pack 'J', $other}); 9127 } 9128 9129 sub set_proxy_for { 9130 # Certain tables are not generally written out to files, but 9131 # Unicode::UCD has the intelligence to know that the file for $self 9132 # can be used to reconstruct those tables. This routine just changes 9133 # things so that UCD pod entries for those suppressed tables are 9134 # generated, so the fact that a proxy is used is invisible to the 9135 # user. 9136 9137 my $self = shift; 9138 9139 foreach my $property_name (@_) { 9140 my $ref = property_ref($property_name); 9141 next if $ref->to_output_map; 9142 $ref->set_fate($MAP_PROXIED); 9143 } 9144 } 9145 9146 sub set_type($self, $type) { 9147 # Set the type of the property. Mostly this is figured out by the 9148 # data in the table. But this is used to set it explicitly. The 9149 # reason it is not a standard accessor is that when setting a binary 9150 # property, we need to make sure that all the true/false aliases are 9151 # present, as they were omitted in early Unicode releases. 9152 9153 if ($type != $ENUM 9154 && $type != $BINARY 9155 && $type != $FORCED_BINARY 9156 && $type != $STRING) 9157 { 9158 Carp::my_carp("Unrecognized type '$type'. Type not set"); 9159 return; 9160 } 9161 9162 { no overloading; $type{pack 'J', $self} = $type; } 9163 return if $type != $BINARY && $type != $FORCED_BINARY; 9164 9165 my $yes = $self->table('Y'); 9166 $yes = $self->table('Yes') if ! defined $yes; 9167 $yes = $self->add_match_table('Y', Full_Name => 'Yes') 9168 if ! defined $yes; 9169 9170 # Add aliases in order wanted, duplicates will be ignored. We use a 9171 # binary property present in all releases for its ordered lists of 9172 # true/false aliases. Note, that could run into problems in 9173 # outputting things in that we don't distinguish between the name and 9174 # full name of these. Hopefully, if the table was already created 9175 # before this code is executed, it was done with these set properly. 9176 my $bm = property_ref("Bidi_Mirrored"); 9177 foreach my $alias ($bm->table("Y")->aliases) { 9178 $yes->add_alias($alias->name); 9179 } 9180 my $no = $self->table('N'); 9181 $no = $self->table('No') if ! defined $no; 9182 $no = $self->add_match_table('N', Full_Name => 'No') if ! defined $no; 9183 foreach my $alias ($bm->table("N")->aliases) { 9184 $no->add_alias($alias->name); 9185 } 9186 9187 return; 9188 } 9189 9190 sub add_map { 9191 # Add a map to the property's map table. This also keeps 9192 # track of the maps so that the property type can be determined from 9193 # its data. 9194 9195 my $self = shift; 9196 my $start = shift; # First code point in range 9197 my $end = shift; # Final code point in range 9198 my $map = shift; # What the range maps to. 9199 # Rest of parameters passed on. 9200 9201 my $addr = do { no overloading; pack 'J', $self; }; 9202 9203 # If haven't the type of the property, gather information to figure it 9204 # out. 9205 if ($type{$addr} == $UNKNOWN) { 9206 9207 # If the map contains an interior blank or dash, or most other 9208 # nonword characters, it will be a string property. This 9209 # heuristic may actually miss some string properties. If so, they 9210 # may need to have explicit set_types called for them. This 9211 # happens in the Unihan properties. 9212 if ($map =~ / (?<= . ) [ -] (?= . ) /x 9213 || $map =~ / [^\w.\/\ -] /x) 9214 { 9215 $self->set_type($STRING); 9216 9217 # $unique_maps is used for disambiguating between ENUM and 9218 # BINARY later; since we know the property is not going to be 9219 # one of those, no point in keeping the data around 9220 undef $unique_maps{$addr}; 9221 } 9222 else { 9223 9224 # Not necessarily a string. The final decision has to be 9225 # deferred until all the data are in. We keep track of if all 9226 # the values are code points for that eventual decision. 9227 $has_only_code_point_maps{$addr} &= 9228 $map =~ / ^ $code_point_re $/x; 9229 9230 # For the purposes of disambiguating between binary and other 9231 # enumerations at the end, we keep track of the first three 9232 # distinct property values. Once we get to three, we know 9233 # it's not going to be binary, so no need to track more. 9234 if (scalar keys %{$unique_maps{$addr}} < 3) { 9235 $unique_maps{$addr}{main::standardize($map)} = 1; 9236 } 9237 } 9238 } 9239 9240 # Add the mapping by calling our map table's method 9241 return $map{$addr}->add_map($start, $end, $map, @_); 9242 } 9243 9244 sub compute_type($self) { 9245 # Compute the type of the property: $ENUM, $STRING, or $BINARY. This 9246 # should be called after the property is mostly filled with its maps. 9247 # We have been keeping track of what the property values have been, 9248 # and now have the necessary information to figure out the type. 9249 9250 my $addr = do { no overloading; pack 'J', $self; }; 9251 9252 my $type = $type{$addr}; 9253 9254 # If already have figured these out, no need to do so again, but we do 9255 # a double check on ENUMS to make sure that a string property hasn't 9256 # improperly been classified as an ENUM, so continue on with those. 9257 return if $type == $STRING 9258 || $type == $BINARY 9259 || $type == $FORCED_BINARY; 9260 9261 # If every map is to a code point, is a string property. 9262 if ($type == $UNKNOWN 9263 && ($has_only_code_point_maps{$addr} 9264 || (defined $map{$addr}->default_map 9265 && $map{$addr}->default_map eq ""))) 9266 { 9267 $self->set_type($STRING); 9268 } 9269 else { 9270 9271 # Otherwise, it is to some sort of enumeration. (The case where 9272 # it is a Unicode miscellaneous property, and treated like a 9273 # string in this program is handled in add_map()). Distinguish 9274 # between binary and some other enumeration type. Of course, if 9275 # there are more than two values, it's not binary. But more 9276 # subtle is the test that the default mapping is defined means it 9277 # isn't binary. This in fact may change in the future if Unicode 9278 # changes the way its data is structured. But so far, no binary 9279 # properties ever have @missing lines for them, so the default map 9280 # isn't defined for them. The few properties that are two-valued 9281 # and aren't considered binary have the default map defined 9282 # starting in Unicode 5.0, when the @missing lines appeared; and 9283 # this program has special code to put in a default map for them 9284 # for earlier than 5.0 releases. 9285 if ($type == $ENUM 9286 || scalar keys %{$unique_maps{$addr}} > 2 9287 || defined $self->default_map) 9288 { 9289 my $tables = $self->tables; 9290 my $count = $self->count; 9291 if ($verbosity && $tables > 500 && $tables/$count > .1) { 9292 Carp::my_carp_bug("It appears that $self should be a \$STRING property, not an \$ENUM because it has too many match tables: $tables\n"); 9293 } 9294 $self->set_type($ENUM); 9295 } 9296 else { 9297 $self->set_type($BINARY); 9298 } 9299 } 9300 undef $unique_maps{$addr}; # Garbage collect 9301 return; 9302 } 9303 9304 # $reaons - Ignored unless suppressing 9305 sub set_fate($self, $fate, $reason=undef) { 9306 my $addr = do { no overloading; pack 'J', $self; }; 9307 if ($fate >= $SUPPRESSED) { 9308 $why_suppressed{$self->complete_name} = $reason; 9309 } 9310 9311 # Each table shares the property's fate, except that MAP_PROXIED 9312 # doesn't affect match tables 9313 $map{$addr}->set_fate($fate, $reason); 9314 if ($fate != $MAP_PROXIED) { 9315 foreach my $table ($map{$addr}, $self->tables) { 9316 $table->set_fate($fate, $reason); 9317 } 9318 } 9319 return; 9320 } 9321 9322 9323 # Most of the accessors for a property actually apply to its map table. 9324 # Setup up accessor functions for those, referring to %map 9325 for my $sub (qw( 9326 add_alias 9327 add_anomalous_entry 9328 add_comment 9329 add_conflicting 9330 add_description 9331 add_duplicate 9332 add_note 9333 aliases 9334 comment 9335 complete_name 9336 containing_range 9337 count 9338 default_map 9339 definition 9340 delete_range 9341 description 9342 each_range 9343 external_name 9344 fate 9345 file_path 9346 format 9347 initialize 9348 inverse_list 9349 is_empty 9350 name 9351 note 9352 perl_extension 9353 property 9354 range_count 9355 ranges 9356 range_size_1 9357 replace_map 9358 reset_each_range 9359 set_comment 9360 set_default_map 9361 set_file_path 9362 set_final_comment 9363 _set_format 9364 set_range_size_1 9365 set_status 9366 set_to_output_map 9367 short_name 9368 status 9369 status_info 9370 to_output_map 9371 type_of 9372 value_of 9373 write 9374 )) 9375 # 'property' above is for symmetry, so that one can take 9376 # the property of a property and get itself, and so don't 9377 # have to distinguish between properties and tables in 9378 # calling code 9379 { 9380 no strict "refs"; 9381 *$sub = sub { 9382 use strict "refs"; 9383 my $self = shift; 9384 no overloading; 9385 return $map{pack 'J', $self}->$sub(@_); 9386 } 9387 } 9388 9389 9390} # End closure 9391 9392package main; 9393 9394sub display_chr { 9395 # Converts an ordinal printable character value to a displayable string, 9396 # using a dotted circle to hold combining characters. 9397 9398 my $ord = shift; 9399 my $chr = chr $ord; 9400 return $chr if $ccc->table(0)->contains($ord); 9401 return "\x{25CC}$chr"; 9402} 9403 9404sub join_lines($input) { 9405 # Returns lines of the input joined together, so that they can be folded 9406 # properly. 9407 # This causes continuation lines to be joined together into one long line 9408 # for folding. A continuation line is any line that doesn't begin with a 9409 # space or "\b" (the latter is stripped from the output). This is so 9410 # lines can be in a HERE document so as to fit nicely in the terminal 9411 # width, but be joined together in one long line, and then folded with 9412 # indents, '#' prefixes, etc, properly handled. 9413 # A blank separates the joined lines except if there is a break; an extra 9414 # blank is inserted after a period ending a line. 9415 9416 # Initialize the return with the first line. 9417 my ($return, @lines) = split "\n", $input; 9418 9419 # If the first line is null, it was an empty line, add the \n back in 9420 $return = "\n" if $return eq ""; 9421 9422 # Now join the remainder of the physical lines. 9423 for my $line (@lines) { 9424 9425 # An empty line means wanted a blank line, so add two \n's to get that 9426 # effect, and go to the next line. 9427 if (length $line == 0) { 9428 $return .= "\n\n"; 9429 next; 9430 } 9431 9432 # Look at the last character of what we have so far. 9433 my $previous_char = substr($return, -1, 1); 9434 9435 # And at the next char to be output. 9436 my $next_char = substr($line, 0, 1); 9437 9438 if ($previous_char ne "\n") { 9439 9440 # Here didn't end wth a nl. If the next char a blank or \b, it 9441 # means that here there is a break anyway. So add a nl to the 9442 # output. 9443 if ($next_char eq " " || $next_char eq "\b") { 9444 $previous_char = "\n"; 9445 $return .= $previous_char; 9446 } 9447 9448 # Add an extra space after periods. 9449 $return .= " " if $previous_char eq '.'; 9450 } 9451 9452 # Here $previous_char is still the latest character to be output. If 9453 # it isn't a nl, it means that the next line is to be a continuation 9454 # line, with a blank inserted between them. 9455 $return .= " " if $previous_char ne "\n"; 9456 9457 # Get rid of any \b 9458 substr($line, 0, 1) = "" if $next_char eq "\b"; 9459 9460 # And append this next line. 9461 $return .= $line; 9462 } 9463 9464 return $return; 9465} 9466 9467sub simple_fold( $line, $prefix="", $hanging_indent=0, $right_margin=0) { 9468 # Returns a string of the input (string or an array of strings) folded 9469 # into multiple-lines each of no more than $MAX_LINE_WIDTH characters plus 9470 # a \n 9471 # This is tailored for the kind of text written by this program, 9472 # especially the pod file, which can have very long names with 9473 # underscores in the middle, or words like AbcDefgHij.... We allow 9474 # breaking in the middle of such constructs if the line won't fit 9475 # otherwise. The break in such cases will come either just after an 9476 # underscore, or just before one of the Capital letters. 9477 9478 local $to_trace = 0 if main::DEBUG; 9479 9480 # $prefix Optional string to prepend to each output line 9481 # $hanging_indent Optional number of spaces to indent 9482 # continuation lines 9483 # $right_margin Optional number of spaces to narrow the 9484 # total width by. 9485 9486 # The space available doesn't include what's automatically prepended 9487 # to each line, or what's reserved on the right. 9488 my $max = $MAX_LINE_WIDTH - length($prefix) - $right_margin; 9489 # XXX Instead of using the 'nofold' perhaps better to look up the stack 9490 9491 if (DEBUG && $hanging_indent >= $max) { 9492 Carp::my_carp("Too large a hanging indent ($hanging_indent); must be < $max. Using 0", 'nofold'); 9493 $hanging_indent = 0; 9494 } 9495 9496 # First, split into the current physical lines. 9497 my @line; 9498 if (ref $line) { # Better be an array, because not bothering to 9499 # test 9500 foreach my $line (@{$line}) { 9501 push @line, split /\n/, $line; 9502 } 9503 } 9504 else { 9505 @line = split /\n/, $line; 9506 } 9507 9508 #local $to_trace = 1 if main::DEBUG; 9509 trace "", join(" ", @line), "\n" if main::DEBUG && $to_trace; 9510 9511 # Look at each current physical line. 9512 for (my $i = 0; $i < @line; $i++) { 9513 Carp::my_carp("Tabs don't work well.", 'nofold') if $line[$i] =~ /\t/; 9514 #local $to_trace = 1 if main::DEBUG; 9515 trace "i=$i: $line[$i]\n" if main::DEBUG && $to_trace; 9516 9517 # Remove prefix, because will be added back anyway, don't want 9518 # doubled prefix 9519 $line[$i] =~ s/^$prefix//; 9520 9521 # Remove trailing space 9522 $line[$i] =~ s/\s+\Z//; 9523 9524 # If the line is too long, fold it. 9525 if (length $line[$i] > $max) { 9526 my $remainder; 9527 9528 # Here needs to fold. Save the leading space in the line for 9529 # later. 9530 $line[$i] =~ /^ ( \s* )/x; 9531 my $leading_space = $1; 9532 trace "line length", length $line[$i], "; lead length", length($leading_space) if main::DEBUG && $to_trace; 9533 9534 # If character at final permissible position is white space, 9535 # fold there, which will delete that white space 9536 if (substr($line[$i], $max - 1, 1) =~ /\s/) { 9537 $remainder = substr($line[$i], $max); 9538 $line[$i] = substr($line[$i], 0, $max - 1); 9539 } 9540 else { 9541 9542 # Otherwise fold at an acceptable break char closest to 9543 # the max length. Look at just the maximal initial 9544 # segment of the line 9545 my $segment = substr($line[$i], 0, $max - 1); 9546 if ($segment =~ 9547 /^ ( .{$hanging_indent} # Don't look before the 9548 # indent. 9549 \ * # Don't look in leading 9550 # blanks past the indent 9551 [^ ] .* # Find the right-most 9552 (?: # acceptable break: 9553 [ \s = ] # space or equal 9554 | - (?! [.0-9] ) # or non-unary minus. 9555 | [^\\[(] (?= \\ )# break before single backslash 9556 # not immediately after opening 9557 # punctuation 9558 ) # $1 includes the character 9559 )/x) 9560 { 9561 # Split into the initial part that fits, and remaining 9562 # part of the input 9563 $remainder = substr($line[$i], length $1); 9564 $line[$i] = $1; 9565 trace $line[$i] if DEBUG && $to_trace; 9566 trace $remainder if DEBUG && $to_trace; 9567 } 9568 9569 # If didn't find a good breaking spot, see if there is a 9570 # not-so-good breaking spot. These are just after 9571 # underscores or where the case changes from lower to 9572 # upper. Use \a as a soft hyphen, but give up 9573 # and don't break the line if there is actually a \a 9574 # already in the input. We use an ascii character for the 9575 # soft-hyphen to avoid any attempt by miniperl to try to 9576 # access the files that this program is creating. 9577 elsif ($segment !~ /\a/ 9578 && ($segment =~ s/_/_\a/g 9579 || $segment =~ s/ ( (?!\\) [a-z] ) (?= [A-Z] )/$1\a/xg)) 9580 { 9581 # Here were able to find at least one place to insert 9582 # our substitute soft hyphen. Find the right-most one 9583 # and replace it by a real hyphen. 9584 trace $segment if DEBUG && $to_trace; 9585 substr($segment, 9586 rindex($segment, "\a"), 9587 1) = '-'; 9588 9589 # Then remove the soft hyphen substitutes. 9590 $segment =~ s/\a//g; 9591 trace $segment if DEBUG && $to_trace; 9592 9593 # And split into the initial part that fits, and 9594 # remainder of the line 9595 my $pos = rindex($segment, '-'); 9596 $remainder = substr($line[$i], $pos); 9597 trace $remainder if DEBUG && $to_trace; 9598 $line[$i] = substr($segment, 0, $pos + 1); 9599 } 9600 } 9601 9602 # Here we know if we can fold or not. If we can, $remainder 9603 # is what remains to be processed in the next iteration. 9604 if (defined $remainder) { 9605 trace "folded='$line[$i]'" if main::DEBUG && $to_trace; 9606 9607 # Insert the folded remainder of the line as a new element 9608 # of the array. (It may still be too long, but we will 9609 # deal with that next time through the loop.) Omit any 9610 # leading space in the remainder. 9611 $remainder =~ s/^\s+//; 9612 trace "remainder='$remainder'" if main::DEBUG && $to_trace; 9613 9614 # But then indent by whichever is larger of: 9615 # 1) the leading space on the input line; 9616 # 2) the hanging indent. 9617 # This preserves indentation in the original line. 9618 my $lead = ($leading_space) 9619 ? length $leading_space 9620 : $hanging_indent; 9621 $lead = max($lead, $hanging_indent); 9622 splice @line, $i+1, 0, (" " x $lead) . $remainder; 9623 } 9624 } 9625 9626 # Ready to output the line. Get rid of any trailing space 9627 # And prefix by the required $prefix passed in. 9628 $line[$i] =~ s/\s+$//; 9629 $line[$i] = "$prefix$line[$i]\n"; 9630 } # End of looping through all the lines. 9631 9632 return join "", @line; 9633} 9634 9635sub property_ref { # Returns a reference to a property object. 9636 return Property::property_ref(@_); 9637} 9638 9639sub force_unlink ($filename) { 9640 return unless file_exists($filename); 9641 return if CORE::unlink($filename); 9642 9643 # We might need write permission 9644 chmod 0777, $filename; 9645 CORE::unlink($filename) or Carp::my_carp("Couldn't unlink $filename. Proceeding anyway: $!"); 9646 return; 9647} 9648 9649sub write ($file, $use_utf8, @lines) { 9650 # Given a filename and references to arrays of lines, write the lines of 9651 # each array to the file 9652 # Filename can be given as an arrayref of directory names 9653 9654 # Get into a single string if an array, and get rid of, in Unix terms, any 9655 # leading '.' 9656 $file= File::Spec->join(@$file) if ref $file eq 'ARRAY'; 9657 $file = File::Spec->canonpath($file); 9658 9659 # If has directories, make sure that they all exist 9660 (undef, my $directories, undef) = File::Spec->splitpath($file); 9661 File::Path::mkpath($directories) if $directories && ! -d $directories; 9662 9663 push @files_actually_output, $file; 9664 9665 force_unlink ($file); 9666 9667 my $OUT; 9668 if (not open $OUT, ">", $file) { 9669 Carp::my_carp("can't open $file for output. Skipping this file: $!"); 9670 return; 9671 } 9672 9673 binmode $OUT, ":utf8" if $use_utf8; 9674 9675 foreach my $lines_ref (@lines) { 9676 unless (@$lines_ref) { 9677 Carp::my_carp("An array of lines for writing to file '$file' is empty; writing it anyway;"); 9678 } 9679 9680 print $OUT @$lines_ref or die Carp::my_carp("write to '$file' failed: $!"); 9681 } 9682 close $OUT or die Carp::my_carp("close '$file' failed: $!"); 9683 9684 print "$file written.\n" if $verbosity >= $VERBOSE; 9685 9686 return; 9687} 9688 9689 9690sub Standardize($name=undef) { 9691 # This converts the input name string into a standardized equivalent to 9692 # use internally. 9693 9694 unless (defined $name) { 9695 Carp::my_carp_bug("Standardize() called with undef. Returning undef."); 9696 return; 9697 } 9698 9699 # Remove any leading or trailing white space 9700 $name =~ s/^\s+//g; 9701 $name =~ s/\s+$//g; 9702 9703 # Convert interior white space and hyphens into underscores. 9704 $name =~ s/ (?<= .) [ -]+ (.) /_$1/xg; 9705 9706 # Capitalize the letter following an underscore, and convert a sequence of 9707 # multiple underscores to a single one 9708 $name =~ s/ (?<= .) _+ (.) /_\u$1/xg; 9709 9710 # And capitalize the first letter, but not for the special cjk ones. 9711 $name = ucfirst($name) unless $name =~ /^k[A-Z]/; 9712 return $name; 9713} 9714 9715sub standardize ($str=undef) { 9716 # Returns a lower-cased standardized name, without underscores. This form 9717 # is chosen so that it can distinguish between any real versus superficial 9718 # Unicode name differences. It relies on the fact that Unicode doesn't 9719 # have interior underscores, white space, nor dashes in any 9720 # stricter-matched name. It should not be used on Unicode code point 9721 # names (the Name property), as they mostly, but not always follow these 9722 # rules. 9723 9724 my $name = Standardize($str); 9725 return if !defined $name; 9726 9727 $name =~ s/ (?<= .) _ (?= . ) //xg; 9728 return lc $name; 9729} 9730 9731sub UCD_name ($table, $alias) { 9732 # Returns the name that Unicode::UCD will use to find a table. XXX 9733 # perhaps this function should be placed somewhere, like UCD.pm so that 9734 # Unicode::UCD can use it directly without duplicating code that can get 9735 # out-of sync. 9736 9737 my $property = $table->property; 9738 $property = ($property == $perl) 9739 ? "" # 'perl' is never explicitly stated 9740 : standardize($property->name) . '='; 9741 if ($alias->loose_match) { 9742 return $property . standardize($alias->name); 9743 } 9744 else { 9745 return lc ($property . $alias->name); 9746 } 9747 9748 return; 9749} 9750 9751{ # Closure 9752 9753 my $indent_increment = " " x ( $debugging_build ? 2 : 0); 9754 %main::already_output = (); 9755 9756 $main::simple_dumper_nesting = 0; 9757 9758 sub simple_dumper( $item, $indent = "" ) { 9759 # Like Simple Data::Dumper. Good enough for our needs. We can't use 9760 # the real thing as we have to run under miniperl. 9761 9762 # It is designed so that on input it is at the beginning of a line, 9763 # and the final thing output in any call is a trailing ",\n". 9764 9765 $indent = "" if ! $debugging_build; 9766 9767 # nesting level is localized, so that as the call stack pops, it goes 9768 # back to the prior value. 9769 local $main::simple_dumper_nesting = $main::simple_dumper_nesting; 9770 local %main::already_output = %main::already_output; 9771 $main::simple_dumper_nesting++; 9772 #print STDERR __LINE__, ": $main::simple_dumper_nesting: $indent$item\n"; 9773 9774 # Determine the indent for recursive calls. 9775 my $next_indent = $indent . $indent_increment; 9776 9777 my $output; 9778 if (! ref $item) { 9779 9780 # Dump of scalar: just output it in quotes if not a number. To do 9781 # so we must escape certain characters, and therefore need to 9782 # operate on a copy to avoid changing the original 9783 my $copy = $item; 9784 $copy = $UNDEF unless defined $copy; 9785 9786 # Quote non-integers (integers also have optional leading '-') 9787 if ($copy eq "" || $copy !~ /^ -? \d+ $/x) { 9788 9789 # Escape apostrophe and backslash 9790 $copy =~ s/ ( ['\\] ) /\\$1/xg; 9791 $copy = "'$copy'"; 9792 } 9793 $output = "$indent$copy,\n"; 9794 } 9795 else { 9796 9797 # Keep track of cycles in the input, and refuse to infinitely loop 9798 my $addr = do { no overloading; pack 'J', $item; }; 9799 if (defined $main::already_output{$addr}) { 9800 return "${indent}ALREADY OUTPUT: $item\n"; 9801 } 9802 $main::already_output{$addr} = $item; 9803 9804 if (ref $item eq 'ARRAY') { 9805 my $using_brackets; 9806 $output = $indent; 9807 if ($main::simple_dumper_nesting > 1) { 9808 $output .= '['; 9809 $using_brackets = 1; 9810 } 9811 else { 9812 $using_brackets = 0; 9813 } 9814 9815 # If the array is empty, put the closing bracket on the same 9816 # line. Otherwise, recursively add each array element 9817 if (@$item == 0) { 9818 $output .= " "; 9819 } 9820 else { 9821 $output .= "\n"; 9822 for (my $i = 0; $i < @$item; $i++) { 9823 9824 # Indent array elements one level 9825 $output .= &simple_dumper($item->[$i], $next_indent); 9826 next if ! $debugging_build; 9827 $output =~ s/\n$//; # Remove any trailing nl so 9828 $output .= " # [$i]\n"; # as to add a comment giving 9829 # the array index 9830 } 9831 $output .= $indent; # Indent closing ']' to orig level 9832 } 9833 $output .= ']' if $using_brackets; 9834 $output .= ",\n"; 9835 } 9836 elsif (ref $item eq 'HASH') { 9837 my $is_first_line; 9838 my $using_braces; 9839 my $body_indent; 9840 9841 # No surrounding braces at top level 9842 $output .= $indent; 9843 if ($main::simple_dumper_nesting > 1) { 9844 $output .= "{\n"; 9845 $is_first_line = 0; 9846 $body_indent = $next_indent; 9847 $next_indent .= $indent_increment; 9848 $using_braces = 1; 9849 } 9850 else { 9851 $is_first_line = 1; 9852 $body_indent = $indent; 9853 $using_braces = 0; 9854 } 9855 9856 # Output hashes sorted alphabetically instead of apparently 9857 # random. Use caseless alphabetic sort 9858 foreach my $key (sort { lc $a cmp lc $b } keys %$item) 9859 { 9860 if ($is_first_line) { 9861 $is_first_line = 0; 9862 } 9863 else { 9864 $output .= "$body_indent"; 9865 } 9866 9867 # The key must be a scalar, but this recursive call quotes 9868 # it 9869 $output .= &simple_dumper($key); 9870 9871 # And change the trailing comma and nl to the hash fat 9872 # comma for clarity, and so the value can be on the same 9873 # line 9874 $output =~ s/,\n$/ => /; 9875 9876 # Recursively call to get the value's dump. 9877 my $next = &simple_dumper($item->{$key}, $next_indent); 9878 9879 # If the value is all on one line, remove its indent, so 9880 # will follow the => immediately. If it takes more than 9881 # one line, start it on a new line. 9882 if ($next !~ /\n.*\n/) { 9883 $next =~ s/^ *//; 9884 } 9885 else { 9886 $output .= "\n"; 9887 } 9888 $output .= $next; 9889 } 9890 9891 $output .= "$indent},\n" if $using_braces; 9892 } 9893 elsif (ref $item eq 'CODE' || ref $item eq 'GLOB') { 9894 $output = $indent . ref($item) . "\n"; 9895 # XXX see if blessed 9896 } 9897 elsif ($item->can('dump')) { 9898 9899 # By convention in this program, objects furnish a 'dump' 9900 # method. Since not doing any output at this level, just pass 9901 # on the input indent 9902 $output = $item->dump($indent); 9903 } 9904 else { 9905 Carp::my_carp("Can't cope with dumping a " . ref($item) . ". Skipping."); 9906 } 9907 } 9908 return $output; 9909 } 9910} 9911 9912sub dump_inside_out( $object, $fields_ref ) { 9913 # Dump inside-out hashes in an object's state by converting them to a 9914 # regular hash and then calling simple_dumper on that. 9915 9916 my $addr = do { no overloading; pack 'J', $object; }; 9917 9918 my %hash; 9919 foreach my $key (keys %$fields_ref) { 9920 $hash{$key} = $fields_ref->{$key}{$addr}; 9921 } 9922 9923 return simple_dumper(\%hash, @_); 9924} 9925 9926sub _operator_dot($self, $other="", $reversed=0) { 9927 # Overloaded '.' method that is common to all packages. It uses the 9928 # package's stringify method. 9929 9930 foreach my $which (\$self, \$other) { 9931 next unless ref $$which; 9932 if ($$which->can('_operator_stringify')) { 9933 $$which = $$which->_operator_stringify; 9934 } 9935 else { 9936 my $ref = ref $$which; 9937 my $addr = do { no overloading; pack 'J', $$which; }; 9938 $$which = "$ref ($addr)"; 9939 } 9940 } 9941 return ($reversed) 9942 ? "$other$self" 9943 : "$self$other"; 9944} 9945 9946sub _operator_dot_equal($self, $other="", $reversed=0) { 9947 # Overloaded '.=' method that is common to all packages. 9948 9949 if ($reversed) { 9950 return $other .= "$self"; 9951 } 9952 else { 9953 return "$self" . "$other"; 9954 } 9955} 9956 9957sub _operator_equal($self, $other, @) { 9958 # Generic overloaded '==' routine. To be equal, they must be the exact 9959 # same object 9960 9961 return 0 unless defined $other; 9962 return 0 unless ref $other; 9963 no overloading; 9964 return $self == $other; 9965} 9966 9967sub _operator_not_equal($self, $other, @) { 9968 return ! _operator_equal($self, $other); 9969} 9970 9971sub substitute_PropertyAliases($file_object) { 9972 # Deal with early releases that don't have the crucial PropertyAliases.txt 9973 # file. 9974 9975 $file_object->insert_lines(get_old_property_aliases()); 9976 9977 process_PropertyAliases($file_object); 9978} 9979 9980 9981sub process_PropertyAliases($file) { 9982 # This reads in the PropertyAliases.txt file, which contains almost all 9983 # the character properties in Unicode and their equivalent aliases: 9984 # scf ; Simple_Case_Folding ; sfc 9985 # 9986 # Field 0 is the preferred short name for the property. 9987 # Field 1 is the full name. 9988 # Any succeeding ones are other accepted names. 9989 9990 # Add any cjk properties that may have been defined. 9991 $file->insert_lines(@cjk_properties); 9992 9993 while ($file->next_line) { 9994 9995 my @data = split /\s*;\s*/; 9996 9997 my $full = $data[1]; 9998 9999 # This line is defective in early Perls. The property in Unihan.txt 10000 # is kRSUnicode. 10001 if ($full eq 'Unicode_Radical_Stroke' && @data < 3) { 10002 push @data, qw(cjkRSUnicode kRSUnicode); 10003 } 10004 10005 my $this = Property->new($data[0], Full_Name => $full); 10006 10007 $this->set_fate($SUPPRESSED, $why_suppressed{$full}) 10008 if $why_suppressed{$full}; 10009 10010 # Start looking for more aliases after these two. 10011 for my $i (2 .. @data - 1) { 10012 $this->add_alias($data[$i]); 10013 } 10014 10015 } 10016 10017 my $scf = property_ref("Simple_Case_Folding"); 10018 $scf->add_alias("scf"); 10019 $scf->add_alias("sfc"); 10020 10021 return; 10022} 10023 10024sub finish_property_setup($file) { 10025 # Finishes setting up after PropertyAliases. 10026 10027 # This entry was missing from this file in earlier Unicode versions 10028 if (-e 'Jamo.txt' && ! defined property_ref('JSN')) { 10029 Property->new('JSN', Full_Name => 'Jamo_Short_Name'); 10030 } 10031 10032 # These are used so much, that we set globals for them. 10033 $gc = property_ref('General_Category'); 10034 $block = property_ref('Block'); 10035 $script = property_ref('Script'); 10036 $age = property_ref('Age'); 10037 10038 # Perl adds this alias. 10039 $gc->add_alias('Category'); 10040 10041 # Unicode::Normalize expects this file with this name and directory. 10042 $ccc = property_ref('Canonical_Combining_Class'); 10043 if (defined $ccc) { 10044 $ccc->set_file('CombiningClass'); 10045 $ccc->set_directory(File::Spec->curdir()); 10046 } 10047 10048 # These two properties aren't actually used in the core, but unfortunately 10049 # the names just above that are in the core interfere with these, so 10050 # choose different names. These aren't a problem unless the map tables 10051 # for these files get written out. 10052 my $lowercase = property_ref('Lowercase'); 10053 $lowercase->set_file('IsLower') if defined $lowercase; 10054 my $uppercase = property_ref('Uppercase'); 10055 $uppercase->set_file('IsUpper') if defined $uppercase; 10056 10057 # Set up the hard-coded default mappings, but only on properties defined 10058 # for this release 10059 foreach my $property (keys %default_mapping) { 10060 my $property_object = property_ref($property); 10061 next if ! defined $property_object; 10062 my $default_map = $default_mapping{$property}; 10063 $property_object->set_default_map($default_map); 10064 10065 # A map of <code point> implies the property is string. 10066 if ($property_object->type == $UNKNOWN 10067 && $default_map eq $CODE_POINT) 10068 { 10069 $property_object->set_type($STRING); 10070 } 10071 } 10072 10073 # The following use the Multi_Default class to create objects for 10074 # defaults. 10075 10076 # Bidi class has a complicated default, but the derived file takes care of 10077 # the complications, leaving just 'L'. 10078 if (file_exists("${EXTRACTED}DBidiClass.txt")) { 10079 property_ref('Bidi_Class')->set_default_map('L'); 10080 } 10081 else { 10082 my $default; 10083 10084 # The derived file was introduced in 3.1.1. The values below are 10085 # taken from table 3-8, TUS 3.0 10086 my $default_R = 10087 'my $default = Range_List->new; 10088 $default->add_range(0x0590, 0x05FF); 10089 $default->add_range(0xFB1D, 0xFB4F);' 10090 ; 10091 10092 # The defaults apply only to unassigned characters 10093 $default_R .= '$gc->table("Unassigned") & $default;'; 10094 10095 if ($v_version lt v3.0.0) { 10096 $default = Multi_Default->new(R => $default_R, 'L'); 10097 } 10098 else { 10099 10100 # AL apparently not introduced until 3.0: TUS 2.x references are 10101 # not on-line to check it out 10102 my $default_AL = 10103 'my $default = Range_List->new; 10104 $default->add_range(0x0600, 0x07BF); 10105 $default->add_range(0xFB50, 0xFDFF); 10106 $default->add_range(0xFE70, 0xFEFF);' 10107 ; 10108 10109 # Non-character code points introduced in this release; aren't AL 10110 if ($v_version ge 3.1.0) { 10111 $default_AL .= '$default->delete_range(0xFDD0, 0xFDEF);'; 10112 } 10113 $default_AL .= '$gc->table("Unassigned") & $default'; 10114 $default = Multi_Default->new(AL => $default_AL, 10115 R => $default_R, 10116 'L'); 10117 } 10118 property_ref('Bidi_Class')->set_default_map($default); 10119 } 10120 10121 # Joining type has a complicated default, but the derived file takes care 10122 # of the complications, leaving just 'U' (or Non_Joining), except the file 10123 # is bad in 3.1.0 10124 if (file_exists("${EXTRACTED}DJoinType.txt") || -e 'ArabicShaping.txt') { 10125 if (file_exists("${EXTRACTED}DJoinType.txt") && $v_version ne 3.1.0) { 10126 property_ref('Joining_Type')->set_default_map('Non_Joining'); 10127 } 10128 else { 10129 10130 # Otherwise, there are not one, but two possibilities for the 10131 # missing defaults: T and U. 10132 # The missing defaults that evaluate to T are given by: 10133 # T = Mn + Cf - ZWNJ - ZWJ 10134 # where Mn and Cf are the general category values. In other words, 10135 # any non-spacing mark or any format control character, except 10136 # U+200C ZERO WIDTH NON-JOINER (joining type U) and U+200D ZERO 10137 # WIDTH JOINER (joining type C). 10138 my $default = Multi_Default->new( 10139 'T' => '$gc->table("Mn") + $gc->table("Cf") - 0x200C - 0x200D', 10140 'Non_Joining'); 10141 property_ref('Joining_Type')->set_default_map($default); 10142 } 10143 } 10144 10145 # Line break has a complicated default in early releases. It is 'Unknown' 10146 # for non-assigned code points; 'AL' for assigned. 10147 if (file_exists("${EXTRACTED}DLineBreak.txt") || -e 'LineBreak.txt') { 10148 my $lb = property_ref('Line_Break'); 10149 if (file_exists("${EXTRACTED}DLineBreak.txt")) { 10150 $lb->set_default_map('Unknown'); 10151 } 10152 else { 10153 my $default = Multi_Default->new('AL' => '~ $gc->table("Cn")', 10154 'Unknown', 10155 ); 10156 $lb->set_default_map($default); 10157 } 10158 } 10159 10160 # For backwards compatibility with applications that may read the mapping 10161 # file directly (it was documented in 5.12 and 5.14 as being thusly 10162 # usable), keep it from being adjusted. (range_size_1 is 10163 # used to force the traditional format.) 10164 if (defined (my $nfkc_cf = property_ref('NFKC_Casefold'))) { 10165 $nfkc_cf->set_to_output_map($EXTERNAL_MAP); 10166 $nfkc_cf->set_range_size_1(1); 10167 } 10168 if (defined (my $bmg = property_ref('Bidi_Mirroring_Glyph'))) { 10169 $bmg->set_to_output_map($EXTERNAL_MAP); 10170 $bmg->set_range_size_1(1); 10171 } 10172 10173 property_ref('Numeric_Value')->set_to_output_map($OUTPUT_ADJUSTED); 10174 10175 return; 10176} 10177 10178sub get_old_property_aliases() { 10179 # Returns what would be in PropertyAliases.txt if it existed in very old 10180 # versions of Unicode. It was derived from the one in 3.2, and pared 10181 # down based on the data that was actually in the older releases. 10182 # An attempt was made to use the existence of files to mean inclusion or 10183 # not of various aliases, but if this was not sufficient, using version 10184 # numbers was resorted to. 10185 10186 my @return; 10187 10188 # These are to be used in all versions (though some are constructed by 10189 # this program if missing) 10190 push @return, split /\n/, <<'END'; 10191bc ; Bidi_Class 10192Bidi_M ; Bidi_Mirrored 10193cf ; Case_Folding 10194ccc ; Canonical_Combining_Class 10195dm ; Decomposition_Mapping 10196dt ; Decomposition_Type 10197gc ; General_Category 10198isc ; ISO_Comment 10199lc ; Lowercase_Mapping 10200na ; Name 10201na1 ; Unicode_1_Name 10202nt ; Numeric_Type 10203nv ; Numeric_Value 10204scf ; Simple_Case_Folding 10205slc ; Simple_Lowercase_Mapping 10206stc ; Simple_Titlecase_Mapping 10207suc ; Simple_Uppercase_Mapping 10208tc ; Titlecase_Mapping 10209uc ; Uppercase_Mapping 10210END 10211 10212 if (-e 'Blocks.txt') { 10213 push @return, "blk ; Block\n"; 10214 } 10215 if (-e 'ArabicShaping.txt') { 10216 push @return, split /\n/, <<'END'; 10217jg ; Joining_Group 10218jt ; Joining_Type 10219END 10220 } 10221 if (-e 'PropList.txt') { 10222 10223 # This first set is in the original old-style proplist. 10224 push @return, split /\n/, <<'END'; 10225Bidi_C ; Bidi_Control 10226Dash ; Dash 10227Dia ; Diacritic 10228Ext ; Extender 10229Hex ; Hex_Digit 10230Hyphen ; Hyphen 10231IDC ; ID_Continue 10232Ideo ; Ideographic 10233Join_C ; Join_Control 10234Math ; Math 10235QMark ; Quotation_Mark 10236Term ; Terminal_Punctuation 10237WSpace ; White_Space 10238END 10239 # The next sets were added later 10240 if ($v_version ge v3.0.0) { 10241 push @return, split /\n/, <<'END'; 10242Upper ; Uppercase 10243Lower ; Lowercase 10244END 10245 } 10246 if ($v_version ge v3.0.1) { 10247 push @return, split /\n/, <<'END'; 10248NChar ; Noncharacter_Code_Point 10249END 10250 } 10251 # The next sets were added in the new-style 10252 if ($v_version ge v3.1.0) { 10253 push @return, split /\n/, <<'END'; 10254OAlpha ; Other_Alphabetic 10255OLower ; Other_Lowercase 10256OMath ; Other_Math 10257OUpper ; Other_Uppercase 10258END 10259 } 10260 if ($v_version ge v3.1.1) { 10261 push @return, "AHex ; ASCII_Hex_Digit\n"; 10262 } 10263 } 10264 if (-e 'EastAsianWidth.txt') { 10265 push @return, "ea ; East_Asian_Width\n"; 10266 } 10267 if (-e 'CompositionExclusions.txt') { 10268 push @return, "CE ; Composition_Exclusion\n"; 10269 } 10270 if (-e 'LineBreak.txt') { 10271 push @return, "lb ; Line_Break\n"; 10272 } 10273 if (-e 'BidiMirroring.txt') { 10274 push @return, "bmg ; Bidi_Mirroring_Glyph\n"; 10275 } 10276 if (-e 'Scripts.txt') { 10277 push @return, "sc ; Script\n"; 10278 } 10279 if (-e 'DNormalizationProps.txt') { 10280 push @return, split /\n/, <<'END'; 10281Comp_Ex ; Full_Composition_Exclusion 10282FC_NFKC ; FC_NFKC_Closure 10283NFC_QC ; NFC_Quick_Check 10284NFD_QC ; NFD_Quick_Check 10285NFKC_QC ; NFKC_Quick_Check 10286NFKD_QC ; NFKD_Quick_Check 10287XO_NFC ; Expands_On_NFC 10288XO_NFD ; Expands_On_NFD 10289XO_NFKC ; Expands_On_NFKC 10290XO_NFKD ; Expands_On_NFKD 10291END 10292 } 10293 if (-e 'DCoreProperties.txt') { 10294 push @return, split /\n/, <<'END'; 10295Alpha ; Alphabetic 10296IDS ; ID_Start 10297XIDC ; XID_Continue 10298XIDS ; XID_Start 10299END 10300 # These can also appear in some versions of PropList.txt 10301 push @return, "Lower ; Lowercase\n" 10302 unless grep { $_ =~ /^Lower\b/} @return; 10303 push @return, "Upper ; Uppercase\n" 10304 unless grep { $_ =~ /^Upper\b/} @return; 10305 } 10306 10307 # This flag requires the DAge.txt file to be copied into the directory. 10308 if (DEBUG && $compare_versions) { 10309 push @return, 'age ; Age'; 10310 } 10311 10312 return @return; 10313} 10314 10315sub substitute_PropValueAliases($file_object) { 10316 # Deal with early releases that don't have the crucial 10317 # PropValueAliases.txt file. 10318 10319 $file_object->insert_lines(get_old_property_value_aliases()); 10320 10321 process_PropValueAliases($file_object); 10322} 10323 10324sub process_PropValueAliases($file) { 10325 # This file contains values that properties look like: 10326 # bc ; AL ; Arabic_Letter 10327 # blk; n/a ; Greek_And_Coptic ; Greek 10328 # 10329 # Field 0 is the property. 10330 # Field 1 is the short name of a property value or 'n/a' if no 10331 # short name exists; 10332 # Field 2 is the full property value name; 10333 # Any other fields are more synonyms for the property value. 10334 # Purely numeric property values are omitted from the file; as are some 10335 # others, fewer and fewer in later releases 10336 10337 # Entries for the ccc property have an extra field before the 10338 # abbreviation: 10339 # ccc; 0; NR ; Not_Reordered 10340 # It is the numeric value that the names are synonyms for. 10341 10342 # There are comment entries for values missing from this file: 10343 # # @missing: 0000..10FFFF; ISO_Comment; <none> 10344 # # @missing: 0000..10FFFF; Lowercase_Mapping; <code point> 10345 10346 if ($v_version lt 4.0.0) { 10347 $file->insert_lines(split /\n/, <<'END' 10348Hangul_Syllable_Type; L ; Leading_Jamo 10349Hangul_Syllable_Type; LV ; LV_Syllable 10350Hangul_Syllable_Type; LVT ; LVT_Syllable 10351Hangul_Syllable_Type; NA ; Not_Applicable 10352Hangul_Syllable_Type; T ; Trailing_Jamo 10353Hangul_Syllable_Type; V ; Vowel_Jamo 10354END 10355 ); 10356 } 10357 if ($v_version lt 4.1.0) { 10358 $file->insert_lines(split /\n/, <<'END' 10359_Perl_GCB; CN ; Control 10360_Perl_GCB; CR ; CR 10361_Perl_GCB; EX ; Extend 10362_Perl_GCB; L ; L 10363_Perl_GCB; LF ; LF 10364_Perl_GCB; LV ; LV 10365_Perl_GCB; LVT ; LVT 10366_Perl_GCB; T ; T 10367_Perl_GCB; V ; V 10368_Perl_GCB; XX ; Other 10369END 10370 ); 10371 } 10372 10373 # Add any explicit cjk values 10374 $file->insert_lines(@cjk_property_values); 10375 10376 # This line is used only for testing the code that checks for name 10377 # conflicts. There is a script Inherited, and when this line is executed 10378 # it causes there to be a name conflict with the 'Inherited' that this 10379 # program generates for this block property value 10380 #$file->insert_lines('blk; n/a; Herited'); 10381 10382 # Process each line of the file ... 10383 while ($file->next_line) { 10384 10385 # Fix typo in input file 10386 s/CCC133/CCC132/g if $v_version eq v6.1.0; 10387 10388 my ($property, @data) = split /\s*;\s*/; 10389 10390 # The ccc property has an extra field at the beginning, which is the 10391 # numeric value. Move it to be after the other two, mnemonic, fields, 10392 # so that those will be used as the property value's names, and the 10393 # number will be an extra alias. (Rightmost splice removes field 1-2, 10394 # returning them in a slice; left splice inserts that before anything, 10395 # thus shifting the former field 0 to after them.) 10396 splice (@data, 0, 0, splice(@data, 1, 2)) if $property eq 'ccc'; 10397 10398 if ($v_version le v5.0.0 && $property eq 'blk' && $data[1] =~ /-/) { 10399 my $new_style = $data[1] =~ s/-/_/gr; 10400 splice @data, 1, 0, $new_style; 10401 } 10402 10403 # Field 0 is a short name unless "n/a"; field 1 is the full name. If 10404 # there is no short name, use the full one in element 1 10405 if ($data[0] eq "n/a") { 10406 $data[0] = $data[1]; 10407 } 10408 elsif ($data[0] ne $data[1] 10409 && standardize($data[0]) eq standardize($data[1]) 10410 && $data[1] !~ /[[:upper:]]/) 10411 { 10412 # Also, there is a bug in the file in which "n/a" is omitted, and 10413 # the two fields are identical except for case, and the full name 10414 # is all lower case. Copy the "short" name unto the full one to 10415 # give it some upper case. 10416 10417 $data[1] = $data[0]; 10418 } 10419 10420 # Earlier releases had the pseudo property 'qc' that should expand to 10421 # the ones that replace it below. 10422 if ($property eq 'qc') { 10423 if (lc $data[0] eq 'y') { 10424 $file->insert_lines('NFC_QC; Y ; Yes', 10425 'NFD_QC; Y ; Yes', 10426 'NFKC_QC; Y ; Yes', 10427 'NFKD_QC; Y ; Yes', 10428 ); 10429 } 10430 elsif (lc $data[0] eq 'n') { 10431 $file->insert_lines('NFC_QC; N ; No', 10432 'NFD_QC; N ; No', 10433 'NFKC_QC; N ; No', 10434 'NFKD_QC; N ; No', 10435 ); 10436 } 10437 elsif (lc $data[0] eq 'm') { 10438 $file->insert_lines('NFC_QC; M ; Maybe', 10439 'NFKC_QC; M ; Maybe', 10440 ); 10441 } 10442 else { 10443 $file->carp_bad_line("qc followed by unexpected '$data[0]"); 10444 } 10445 next; 10446 } 10447 10448 # The first field is the short name, 2nd is the full one. 10449 my $property_object = property_ref($property); 10450 my $table = $property_object->add_match_table($data[0], 10451 Full_Name => $data[1]); 10452 10453 # Start looking for more aliases after these two. 10454 for my $i (2 .. @data - 1) { 10455 $table->add_alias($data[$i]); 10456 } 10457 } # End of looping through the file 10458 10459 # As noted in the comments early in the program, it generates tables for 10460 # the default values for all releases, even those for which the concept 10461 # didn't exist at the time. Here we add those if missing. 10462 if (defined $age && ! defined $age->table('Unassigned')) { 10463 $age->add_match_table('Unassigned'); 10464 } 10465 $block->add_match_table('No_Block') if -e 'Blocks.txt' 10466 && ! defined $block->table('No_Block'); 10467 10468 10469 # Now set the default mappings of the properties from the file. This is 10470 # done after the loop because a number of properties have only @missings 10471 # entries in the file, and may not show up until the end. 10472 my @defaults = $file->get_missings; 10473 foreach my $default_ref (@defaults) { 10474 my $default = $default_ref->[0]; 10475 my $property = property_ref($default_ref->[1]); 10476 $property->set_default_map($default); 10477 } 10478 return; 10479} 10480 10481sub get_old_property_value_aliases () { 10482 # Returns what would be in PropValueAliases.txt if it existed in very old 10483 # versions of Unicode. It was derived from the one in 3.2, and pared 10484 # down. An attempt was made to use the existence of files to mean 10485 # inclusion or not of various aliases, but if this was not sufficient, 10486 # using version numbers was resorted to. 10487 10488 my @return = split /\n/, <<'END'; 10489bc ; AN ; Arabic_Number 10490bc ; B ; Paragraph_Separator 10491bc ; CS ; Common_Separator 10492bc ; EN ; European_Number 10493bc ; ES ; European_Separator 10494bc ; ET ; European_Terminator 10495bc ; L ; Left_To_Right 10496bc ; ON ; Other_Neutral 10497bc ; R ; Right_To_Left 10498bc ; WS ; White_Space 10499 10500Bidi_M; N; No; F; False 10501Bidi_M; Y; Yes; T; True 10502 10503# The standard combining classes are very much different in v1, so only use 10504# ones that look right (not checked thoroughly) 10505ccc; 0; NR ; Not_Reordered 10506ccc; 1; OV ; Overlay 10507ccc; 7; NK ; Nukta 10508ccc; 8; KV ; Kana_Voicing 10509ccc; 9; VR ; Virama 10510ccc; 202; ATBL ; Attached_Below_Left 10511ccc; 216; ATAR ; Attached_Above_Right 10512ccc; 218; BL ; Below_Left 10513ccc; 220; B ; Below 10514ccc; 222; BR ; Below_Right 10515ccc; 224; L ; Left 10516ccc; 228; AL ; Above_Left 10517ccc; 230; A ; Above 10518ccc; 232; AR ; Above_Right 10519ccc; 234; DA ; Double_Above 10520 10521dt ; can ; canonical 10522dt ; enc ; circle 10523dt ; fin ; final 10524dt ; font ; font 10525dt ; fra ; fraction 10526dt ; init ; initial 10527dt ; iso ; isolated 10528dt ; med ; medial 10529dt ; n/a ; none 10530dt ; nb ; noBreak 10531dt ; sqr ; square 10532dt ; sub ; sub 10533dt ; sup ; super 10534 10535gc ; C ; Other # Cc | Cf | Cn | Co | Cs 10536gc ; Cc ; Control 10537gc ; Cn ; Unassigned 10538gc ; Co ; Private_Use 10539gc ; L ; Letter # Ll | Lm | Lo | Lt | Lu 10540gc ; LC ; Cased_Letter # Ll | Lt | Lu 10541gc ; Ll ; Lowercase_Letter 10542gc ; Lm ; Modifier_Letter 10543gc ; Lo ; Other_Letter 10544gc ; Lu ; Uppercase_Letter 10545gc ; M ; Mark # Mc | Me | Mn 10546gc ; Mc ; Spacing_Mark 10547gc ; Mn ; Nonspacing_Mark 10548gc ; N ; Number # Nd | Nl | No 10549gc ; Nd ; Decimal_Number 10550gc ; No ; Other_Number 10551gc ; P ; Punctuation # Pc | Pd | Pe | Pf | Pi | Po | Ps 10552gc ; Pd ; Dash_Punctuation 10553gc ; Pe ; Close_Punctuation 10554gc ; Po ; Other_Punctuation 10555gc ; Ps ; Open_Punctuation 10556gc ; S ; Symbol # Sc | Sk | Sm | So 10557gc ; Sc ; Currency_Symbol 10558gc ; Sm ; Math_Symbol 10559gc ; So ; Other_Symbol 10560gc ; Z ; Separator # Zl | Zp | Zs 10561gc ; Zl ; Line_Separator 10562gc ; Zp ; Paragraph_Separator 10563gc ; Zs ; Space_Separator 10564 10565nt ; de ; Decimal 10566nt ; di ; Digit 10567nt ; n/a ; None 10568nt ; nu ; Numeric 10569END 10570 10571 if (-e 'ArabicShaping.txt') { 10572 push @return, split /\n/, <<'END'; 10573jg ; n/a ; AIN 10574jg ; n/a ; ALEF 10575jg ; n/a ; DAL 10576jg ; n/a ; GAF 10577jg ; n/a ; LAM 10578jg ; n/a ; MEEM 10579jg ; n/a ; NO_JOINING_GROUP 10580jg ; n/a ; NOON 10581jg ; n/a ; QAF 10582jg ; n/a ; SAD 10583jg ; n/a ; SEEN 10584jg ; n/a ; TAH 10585jg ; n/a ; WAW 10586 10587jt ; C ; Join_Causing 10588jt ; D ; Dual_Joining 10589jt ; L ; Left_Joining 10590jt ; R ; Right_Joining 10591jt ; U ; Non_Joining 10592jt ; T ; Transparent 10593END 10594 if ($v_version ge v3.0.0) { 10595 push @return, split /\n/, <<'END'; 10596jg ; n/a ; ALAPH 10597jg ; n/a ; BEH 10598jg ; n/a ; BETH 10599jg ; n/a ; DALATH_RISH 10600jg ; n/a ; E 10601jg ; n/a ; FEH 10602jg ; n/a ; FINAL_SEMKATH 10603jg ; n/a ; GAMAL 10604jg ; n/a ; HAH 10605jg ; n/a ; HAMZA_ON_HEH_GOAL 10606jg ; n/a ; HE 10607jg ; n/a ; HEH 10608jg ; n/a ; HEH_GOAL 10609jg ; n/a ; HETH 10610jg ; n/a ; KAF 10611jg ; n/a ; KAPH 10612jg ; n/a ; KNOTTED_HEH 10613jg ; n/a ; LAMADH 10614jg ; n/a ; MIM 10615jg ; n/a ; NUN 10616jg ; n/a ; PE 10617jg ; n/a ; QAPH 10618jg ; n/a ; REH 10619jg ; n/a ; REVERSED_PE 10620jg ; n/a ; SADHE 10621jg ; n/a ; SEMKATH 10622jg ; n/a ; SHIN 10623jg ; n/a ; SWASH_KAF 10624jg ; n/a ; TAW 10625jg ; n/a ; TEH_MARBUTA 10626jg ; n/a ; TETH 10627jg ; n/a ; YEH 10628jg ; n/a ; YEH_BARREE 10629jg ; n/a ; YEH_WITH_TAIL 10630jg ; n/a ; YUDH 10631jg ; n/a ; YUDH_HE 10632jg ; n/a ; ZAIN 10633END 10634 } 10635 } 10636 10637 10638 if (-e 'EastAsianWidth.txt') { 10639 push @return, split /\n/, <<'END'; 10640ea ; A ; Ambiguous 10641ea ; F ; Fullwidth 10642ea ; H ; Halfwidth 10643ea ; N ; Neutral 10644ea ; Na ; Narrow 10645ea ; W ; Wide 10646END 10647 } 10648 10649 if (-e 'LineBreak.txt' || -e 'LBsubst.txt') { 10650 my @lb = split /\n/, <<'END'; 10651lb ; AI ; Ambiguous 10652lb ; AL ; Alphabetic 10653lb ; B2 ; Break_Both 10654lb ; BA ; Break_After 10655lb ; BB ; Break_Before 10656lb ; BK ; Mandatory_Break 10657lb ; CB ; Contingent_Break 10658lb ; CL ; Close_Punctuation 10659lb ; CM ; Combining_Mark 10660lb ; CR ; Carriage_Return 10661lb ; EX ; Exclamation 10662lb ; GL ; Glue 10663lb ; HY ; Hyphen 10664lb ; ID ; Ideographic 10665lb ; IN ; Inseperable 10666lb ; IS ; Infix_Numeric 10667lb ; LF ; Line_Feed 10668lb ; NS ; Nonstarter 10669lb ; NU ; Numeric 10670lb ; OP ; Open_Punctuation 10671lb ; PO ; Postfix_Numeric 10672lb ; PR ; Prefix_Numeric 10673lb ; QU ; Quotation 10674lb ; SA ; Complex_Context 10675lb ; SG ; Surrogate 10676lb ; SP ; Space 10677lb ; SY ; Break_Symbols 10678lb ; XX ; Unknown 10679lb ; ZW ; ZWSpace 10680END 10681 # If this Unicode version predates the lb property, we use our 10682 # substitute one 10683 if (-e 'LBsubst.txt') { 10684 $_ = s/^lb/_Perl_LB/r for @lb; 10685 } 10686 push @return, @lb; 10687 } 10688 10689 if (-e 'DNormalizationProps.txt') { 10690 push @return, split /\n/, <<'END'; 10691qc ; M ; Maybe 10692qc ; N ; No 10693qc ; Y ; Yes 10694END 10695 } 10696 10697 if (-e 'Scripts.txt') { 10698 push @return, split /\n/, <<'END'; 10699sc ; Arab ; Arabic 10700sc ; Armn ; Armenian 10701sc ; Beng ; Bengali 10702sc ; Bopo ; Bopomofo 10703sc ; Cans ; Canadian_Aboriginal 10704sc ; Cher ; Cherokee 10705sc ; Cyrl ; Cyrillic 10706sc ; Deva ; Devanagari 10707sc ; Dsrt ; Deseret 10708sc ; Ethi ; Ethiopic 10709sc ; Geor ; Georgian 10710sc ; Goth ; Gothic 10711sc ; Grek ; Greek 10712sc ; Gujr ; Gujarati 10713sc ; Guru ; Gurmukhi 10714sc ; Hang ; Hangul 10715sc ; Hani ; Han 10716sc ; Hebr ; Hebrew 10717sc ; Hira ; Hiragana 10718sc ; Ital ; Old_Italic 10719sc ; Kana ; Katakana 10720sc ; Khmr ; Khmer 10721sc ; Knda ; Kannada 10722sc ; Laoo ; Lao 10723sc ; Latn ; Latin 10724sc ; Mlym ; Malayalam 10725sc ; Mong ; Mongolian 10726sc ; Mymr ; Myanmar 10727sc ; Ogam ; Ogham 10728sc ; Orya ; Oriya 10729sc ; Qaai ; Inherited 10730sc ; Runr ; Runic 10731sc ; Sinh ; Sinhala 10732sc ; Syrc ; Syriac 10733sc ; Taml ; Tamil 10734sc ; Telu ; Telugu 10735sc ; Thaa ; Thaana 10736sc ; Thai ; Thai 10737sc ; Tibt ; Tibetan 10738sc ; Yiii ; Yi 10739sc ; Zyyy ; Common 10740END 10741 } 10742 10743 if ($v_version ge v2.0.0) { 10744 push @return, split /\n/, <<'END'; 10745dt ; com ; compat 10746dt ; nar ; narrow 10747dt ; sml ; small 10748dt ; vert ; vertical 10749dt ; wide ; wide 10750 10751gc ; Cf ; Format 10752gc ; Cs ; Surrogate 10753gc ; Lt ; Titlecase_Letter 10754gc ; Me ; Enclosing_Mark 10755gc ; Nl ; Letter_Number 10756gc ; Pc ; Connector_Punctuation 10757gc ; Sk ; Modifier_Symbol 10758END 10759 } 10760 if ($v_version ge v2.1.2) { 10761 push @return, "bc ; S ; Segment_Separator\n"; 10762 } 10763 if ($v_version ge v2.1.5) { 10764 push @return, split /\n/, <<'END'; 10765gc ; Pf ; Final_Punctuation 10766gc ; Pi ; Initial_Punctuation 10767END 10768 } 10769 if ($v_version ge v2.1.8) { 10770 push @return, "ccc; 240; IS ; Iota_Subscript\n"; 10771 } 10772 10773 if ($v_version ge v3.0.0) { 10774 push @return, split /\n/, <<'END'; 10775bc ; AL ; Arabic_Letter 10776bc ; BN ; Boundary_Neutral 10777bc ; LRE ; Left_To_Right_Embedding 10778bc ; LRO ; Left_To_Right_Override 10779bc ; NSM ; Nonspacing_Mark 10780bc ; PDF ; Pop_Directional_Format 10781bc ; RLE ; Right_To_Left_Embedding 10782bc ; RLO ; Right_To_Left_Override 10783 10784ccc; 233; DB ; Double_Below 10785END 10786 } 10787 10788 if ($v_version ge v3.1.0) { 10789 push @return, "ccc; 226; R ; Right\n"; 10790 } 10791 10792 return @return; 10793} 10794 10795sub process_NormalizationsTest($file) { 10796 10797 # Each line looks like: 10798 # source code point; NFC; NFD; NFKC; NFKD 10799 # e.g. 10800 # 1E0A;1E0A;0044 0307;1E0A;0044 0307; 10801 10802 # Process each line of the file ... 10803 while ($file->next_line) { 10804 10805 next if /^@/; 10806 10807 my ($c1, $c2, $c3, $c4, $c5) = split /\s*;\s*/; 10808 10809 foreach my $var (\$c1, \$c2, \$c3, \$c4, \$c5) { 10810 $$var = pack "U0U*", map { hex } split " ", $$var; 10811 $$var =~ s/(\\)/$1$1/g; 10812 } 10813 10814 push @normalization_tests, 10815 "Test_N(q$c1, q$c2, q$c3, q$c4, q$c5);\n"; 10816 } # End of looping through the file 10817} 10818 10819sub output_perl_charnames_line ($a, $b) { 10820 10821 # Output the entries in Perl_charnames specially, using 5 digits instead 10822 # of four. This makes the entries a constant length, and simplifies 10823 # charnames.pm which this table is for. Unicode can have 6 digit 10824 # ordinals, but they are all private use or noncharacters which do not 10825 # have names, so won't be in this table. 10826 10827 return sprintf "%05X\n%s\n\n", $_[0], $_[1]; 10828} 10829 10830{ # Closure 10831 10832 # These are constants to the $property_info hash in this subroutine, to 10833 # avoid using a quoted-string which might have a typo. 10834 my $TYPE = 'type'; 10835 my $DEFAULT_MAP = 'default_map'; 10836 my $DEFAULT_TABLE = 'default_table'; 10837 my $PSEUDO_MAP_TYPE = 'pseudo_map_type'; 10838 my $MISSINGS = 'missings'; 10839 10840 sub process_generic_property_file($file) { 10841 # This processes a file containing property mappings and puts them 10842 # into internal map tables. It should be used to handle any property 10843 # files that have mappings from a code point or range thereof to 10844 # something else. This means almost all the UCD .txt files. 10845 # each_line_handlers() should be set to adjust the lines of these 10846 # files, if necessary, to what this routine understands: 10847 # 10848 # 0374 ; NFD_QC; N 10849 # 003C..003E ; Math 10850 # 10851 # the fields are: "codepoint-range ; property; map" 10852 # 10853 # meaning the codepoints in the range all have the value 'map' under 10854 # 'property'. 10855 # Beginning and trailing white space in each field are not significant. 10856 # Note there is not a trailing semi-colon in the above. A trailing 10857 # semi-colon means the map is a null-string. An omitted map, as 10858 # opposed to a null-string, is assumed to be 'Y', based on Unicode 10859 # table syntax. (This could have been hidden from this routine by 10860 # doing it in the $file object, but that would require parsing of the 10861 # line there, so would have to parse it twice, or change the interface 10862 # to pass this an array. So not done.) 10863 # 10864 # The map field may begin with a sequence of commands that apply to 10865 # this range. Each such command begins and ends with $CMD_DELIM. 10866 # These are used to indicate, for example, that the mapping for a 10867 # range has a non-default type. 10868 # 10869 # This loops through the file, calling its next_line() method, and 10870 # then taking the map and adding it to the property's table. 10871 # Complications arise because any number of properties can be in the 10872 # file, in any order, interspersed in any way. The first time a 10873 # property is seen, it gets information about that property and 10874 # caches it for quick retrieval later. It also normalizes the maps 10875 # so that only one of many synonyms is stored. The Unicode input 10876 # files do use some multiple synonyms. 10877 10878 my %property_info; # To keep track of what properties 10879 # have already had entries in the 10880 # current file, and info about each, 10881 # so don't have to recompute. 10882 my $property_name; # property currently being worked on 10883 my $property_type; # and its type 10884 my $previous_property_name = ""; # name from last time through loop 10885 my $property_object; # pointer to the current property's 10886 # object 10887 my $property_addr; # the address of that object 10888 my $default_map; # the string that code points missing 10889 # from the file map to 10890 my $default_table; # For non-string properties, a 10891 # reference to the match table that 10892 # will contain the list of code 10893 # points that map to $default_map. 10894 10895 # Get the next real non-comment line 10896 LINE: 10897 while ($file->next_line) { 10898 10899 # Default replacement type; means that if parts of the range have 10900 # already been stored in our tables, the new map overrides them if 10901 # they differ more than cosmetically 10902 my $replace = $IF_NOT_EQUIVALENT; 10903 my $map_type; # Default type for the map of this range 10904 10905 #local $to_trace = 1 if main::DEBUG; 10906 trace $_ if main::DEBUG && $to_trace; 10907 10908 # Split the line into components 10909 my ($range, $property_name, $map, @remainder) 10910 = split /\s*;\s*/, $_, -1; # -1 => retain trailing null fields 10911 10912 # If more or less on the line than we are expecting, warn and skip 10913 # the line 10914 if (@remainder) { 10915 $file->carp_bad_line('Extra fields'); 10916 next LINE; 10917 } 10918 elsif ( ! defined $property_name) { 10919 $file->carp_bad_line('Missing property'); 10920 next LINE; 10921 } 10922 10923 # Examine the range. 10924 if ($range !~ /^ ($code_point_re) (?:\.\. ($code_point_re) )? $/x) 10925 { 10926 $file->carp_bad_line("Range '$range' not of the form 'CP1' or 'CP1..CP2' (where CP1,2 are code points in hex)"); 10927 next LINE; 10928 } 10929 my $low = hex $1; 10930 my $high = (defined $2) ? hex $2 : $low; 10931 10932 # If changing to a new property, get the things constant per 10933 # property 10934 if ($previous_property_name ne $property_name) { 10935 10936 $property_object = property_ref($property_name); 10937 if (! defined $property_object) { 10938 $file->carp_bad_line("Unexpected property '$property_name'. Skipped"); 10939 next LINE; 10940 } 10941 { no overloading; $property_addr = pack 'J', $property_object; } 10942 10943 # Defer changing names until have a line that is acceptable 10944 # (the 'next' statement above means is unacceptable) 10945 $previous_property_name = $property_name; 10946 10947 # If not the first time for this property, retrieve info about 10948 # it from the cache 10949 if (defined ($property_info{$property_addr}{$TYPE})) { 10950 $property_type = $property_info{$property_addr}{$TYPE}; 10951 $default_map = $property_info{$property_addr}{$DEFAULT_MAP}; 10952 $map_type 10953 = $property_info{$property_addr}{$PSEUDO_MAP_TYPE}; 10954 $default_table 10955 = $property_info{$property_addr}{$DEFAULT_TABLE}; 10956 } 10957 else { 10958 10959 # Here, is the first time for this property. Set up the 10960 # cache. 10961 $property_type = $property_info{$property_addr}{$TYPE} 10962 = $property_object->type; 10963 $map_type 10964 = $property_info{$property_addr}{$PSEUDO_MAP_TYPE} 10965 = $property_object->pseudo_map_type; 10966 10967 # The Unicode files are set up so that if the map is not 10968 # defined, it is a binary property 10969 if (! defined $map && $property_type != $BINARY) { 10970 if ($property_type != $UNKNOWN 10971 && $property_type != $NON_STRING) 10972 { 10973 $file->carp_bad_line("No mapping defined on a non-binary property. Using 'Y' for the map"); 10974 } 10975 else { 10976 $property_object->set_type($BINARY); 10977 $property_type 10978 = $property_info{$property_addr}{$TYPE} 10979 = $BINARY; 10980 } 10981 } 10982 10983 # Get any @missings default for this property. This 10984 # should precede the first entry for the property in the 10985 # input file, and is located in a comment that has been 10986 # stored by the Input_file class until we access it here. 10987 # It's possible that there is more than one such line 10988 # waiting for us; collect them all, and parse 10989 my @missings_list = $file->get_missings 10990 if $file->has_missings_defaults; 10991 foreach my $default_ref (@missings_list) { 10992 my $default = $default_ref->[0]; 10993 my $addr = do { no overloading; pack 'J', property_ref($default_ref->[1]); }; 10994 10995 # For string properties, the default is just what the 10996 # file says, but non-string properties should already 10997 # have set up a table for the default property value; 10998 # use the table for these, so can resolve synonyms 10999 # later to a single standard one. 11000 if ($property_type == $STRING 11001 || $property_type == $UNKNOWN) 11002 { 11003 $property_info{$addr}{$MISSINGS} = $default; 11004 } 11005 else { 11006 $property_info{$addr}{$MISSINGS} 11007 = $property_object->table($default); 11008 } 11009 } 11010 11011 # Finished storing all the @missings defaults in the input 11012 # file so far. Get the one for the current property. 11013 my $missings = $property_info{$property_addr}{$MISSINGS}; 11014 11015 # But we likely have separately stored what the default 11016 # should be. (This is to accommodate versions of the 11017 # standard where the @missings lines are absent or 11018 # incomplete.) Hopefully the two will match. But check 11019 # it out. 11020 $default_map = $property_object->default_map; 11021 11022 # If the map is a ref, it means that the default won't be 11023 # processed until later, so undef it, so next few lines 11024 # will redefine it to something that nothing will match 11025 undef $default_map if ref $default_map; 11026 11027 # Create a $default_map if don't have one; maybe a dummy 11028 # that won't match anything. 11029 if (! defined $default_map) { 11030 11031 # Use any @missings line in the file. 11032 if (defined $missings) { 11033 if (ref $missings) { 11034 $default_map = $missings->full_name; 11035 $default_table = $missings; 11036 } 11037 else { 11038 $default_map = $missings; 11039 } 11040 11041 # And store it with the property for outside use. 11042 $property_object->set_default_map($default_map); 11043 } 11044 else { 11045 11046 # Neither an @missings nor a default map. Create 11047 # a dummy one, so won't have to test definedness 11048 # in the main loop. 11049 $default_map = '_Perl This will never be in a file 11050 from Unicode'; 11051 } 11052 } 11053 11054 # Here, we have $default_map defined, possibly in terms of 11055 # $missings, but maybe not, and possibly is a dummy one. 11056 if (defined $missings) { 11057 11058 # Make sure there is no conflict between the two. 11059 # $missings has priority. 11060 if (ref $missings) { 11061 $default_table 11062 = $property_object->table($default_map); 11063 if (! defined $default_table 11064 || $default_table != $missings) 11065 { 11066 if (! defined $default_table) { 11067 $default_table = $UNDEF; 11068 } 11069 $file->carp_bad_line(<<END 11070The \@missings line for $property_name in $file says that missings default to 11071$missings, but we expect it to be $default_table. $missings used. 11072END 11073 ); 11074 $default_table = $missings; 11075 $default_map = $missings->full_name; 11076 } 11077 $property_info{$property_addr}{$DEFAULT_TABLE} 11078 = $default_table; 11079 } 11080 elsif ($default_map ne $missings) { 11081 $file->carp_bad_line(<<END 11082The \@missings line for $property_name in $file says that missings default to 11083$missings, but we expect it to be $default_map. $missings used. 11084END 11085 ); 11086 $default_map = $missings; 11087 } 11088 } 11089 11090 $property_info{$property_addr}{$DEFAULT_MAP} 11091 = $default_map; 11092 11093 # If haven't done so already, find the table corresponding 11094 # to this map for non-string properties. 11095 if (! defined $default_table 11096 && $property_type != $STRING 11097 && $property_type != $UNKNOWN) 11098 { 11099 $default_table = $property_info{$property_addr} 11100 {$DEFAULT_TABLE} 11101 = $property_object->table($default_map); 11102 } 11103 } # End of is first time for this property 11104 } # End of switching properties. 11105 11106 # Ready to process the line. 11107 # The Unicode files are set up so that if the map is not defined, 11108 # it is a binary property with value 'Y' 11109 if (! defined $map) { 11110 $map = 'Y'; 11111 } 11112 else { 11113 11114 # If the map begins with a special command to us (enclosed in 11115 # delimiters), extract the command(s). 11116 while ($map =~ s/ ^ $CMD_DELIM (.*?) $CMD_DELIM //x) { 11117 my $command = $1; 11118 if ($command =~ / ^ $REPLACE_CMD= (.*) /x) { 11119 $replace = $1; 11120 } 11121 elsif ($command =~ / ^ $MAP_TYPE_CMD= (.*) /x) { 11122 $map_type = $1; 11123 } 11124 else { 11125 $file->carp_bad_line("Unknown command line: '$1'"); 11126 next LINE; 11127 } 11128 } 11129 } 11130 11131 if ($default_map eq $CODE_POINT && $map =~ / ^ $code_point_re $/x) 11132 { 11133 11134 # Here, we have a map to a particular code point, and the 11135 # default map is to a code point itself. If the range 11136 # includes the particular code point, change that portion of 11137 # the range to the default. This makes sure that in the final 11138 # table only the non-defaults are listed. 11139 my $decimal_map = hex $map; 11140 if ($low <= $decimal_map && $decimal_map <= $high) { 11141 11142 # If the range includes stuff before or after the map 11143 # we're changing, split it and process the split-off parts 11144 # later. 11145 if ($low < $decimal_map) { 11146 $file->insert_adjusted_lines( 11147 sprintf("%04X..%04X; %s; %s", 11148 $low, 11149 $decimal_map - 1, 11150 $property_name, 11151 $map)); 11152 } 11153 if ($high > $decimal_map) { 11154 $file->insert_adjusted_lines( 11155 sprintf("%04X..%04X; %s; %s", 11156 $decimal_map + 1, 11157 $high, 11158 $property_name, 11159 $map)); 11160 } 11161 $low = $high = $decimal_map; 11162 $map = $CODE_POINT; 11163 } 11164 } 11165 11166 # If we can tell that this is a synonym for the default map, use 11167 # the default one instead. 11168 if ($property_type != $STRING 11169 && $property_type != $UNKNOWN) 11170 { 11171 my $table = $property_object->table($map); 11172 if (defined $table && $table == $default_table) { 11173 $map = $default_map; 11174 } 11175 } 11176 11177 # And figure out the map type if not known. 11178 if (! defined $map_type || $map_type == $COMPUTE_NO_MULTI_CP) { 11179 if ($map eq "") { # Nulls are always $NULL map type 11180 $map_type = $NULL; 11181 } # Otherwise, non-strings, and those that don't allow 11182 # $MULTI_CP, and those that aren't multiple code points are 11183 # 0 11184 elsif 11185 (($property_type != $STRING && $property_type != $UNKNOWN) 11186 || (defined $map_type && $map_type == $COMPUTE_NO_MULTI_CP) 11187 || $map !~ /^ $code_point_re ( \ $code_point_re )+ $ /x) 11188 { 11189 $map_type = 0; 11190 } 11191 else { 11192 $map_type = $MULTI_CP; 11193 } 11194 } 11195 11196 $property_object->add_map($low, $high, 11197 $map, 11198 Type => $map_type, 11199 Replace => $replace); 11200 } # End of loop through file's lines 11201 11202 return; 11203 } 11204} 11205 11206{ # Closure for UnicodeData.txt handling 11207 11208 # This file was the first one in the UCD; its design leads to some 11209 # awkwardness in processing. Here is a sample line: 11210 # 0041;LATIN CAPITAL LETTER A;Lu;0;L;;;;;N;;;;0061; 11211 # The fields in order are: 11212 my $i = 0; # The code point is in field 0, and is shifted off. 11213 my $CHARNAME = $i++; # character name (e.g. "LATIN CAPITAL LETTER A") 11214 my $CATEGORY = $i++; # category (e.g. "Lu") 11215 my $CCC = $i++; # Canonical combining class (e.g. "230") 11216 my $BIDI = $i++; # directional class (e.g. "L") 11217 my $PERL_DECOMPOSITION = $i++; # decomposition mapping 11218 my $PERL_DECIMAL_DIGIT = $i++; # decimal digit value 11219 my $NUMERIC_TYPE_OTHER_DIGIT = $i++; # digit value, like a superscript 11220 # Dual-use in this program; see below 11221 my $NUMERIC = $i++; # numeric value 11222 my $MIRRORED = $i++; # ? mirrored 11223 my $UNICODE_1_NAME = $i++; # name in Unicode 1.0 11224 my $COMMENT = $i++; # iso comment 11225 my $UPPER = $i++; # simple uppercase mapping 11226 my $LOWER = $i++; # simple lowercase mapping 11227 my $TITLE = $i++; # simple titlecase mapping 11228 my $input_field_count = $i; 11229 11230 # This routine in addition outputs these extra fields: 11231 11232 my $DECOMP_TYPE = $i++; # Decomposition type 11233 11234 # These fields are modifications of ones above, and are usually 11235 # suppressed; they must come last, as for speed, the loop upper bound is 11236 # normally set to ignore them 11237 my $NAME = $i++; # This is the strict name field, not the one that 11238 # charnames uses. 11239 my $DECOMP_MAP = $i++; # Strict decomposition mapping; not the one used 11240 # by Unicode::Normalize 11241 my $last_field = $i - 1; 11242 11243 # All these are read into an array for each line, with the indices defined 11244 # above. The empty fields in the example line above indicate that the 11245 # value is defaulted. The handler called for each line of the input 11246 # changes these to their defaults. 11247 11248 # Here are the official names of the properties, in a parallel array: 11249 my @field_names; 11250 $field_names[$BIDI] = 'Bidi_Class'; 11251 $field_names[$CATEGORY] = 'General_Category'; 11252 $field_names[$CCC] = 'Canonical_Combining_Class'; 11253 $field_names[$CHARNAME] = 'Perl_Charnames'; 11254 $field_names[$COMMENT] = 'ISO_Comment'; 11255 $field_names[$DECOMP_MAP] = 'Decomposition_Mapping'; 11256 $field_names[$DECOMP_TYPE] = 'Decomposition_Type'; 11257 $field_names[$LOWER] = 'Lowercase_Mapping'; 11258 $field_names[$MIRRORED] = 'Bidi_Mirrored'; 11259 $field_names[$NAME] = 'Name'; 11260 $field_names[$NUMERIC] = 'Numeric_Value'; 11261 $field_names[$NUMERIC_TYPE_OTHER_DIGIT] = 'Numeric_Type'; 11262 $field_names[$PERL_DECIMAL_DIGIT] = 'Perl_Decimal_Digit'; 11263 $field_names[$PERL_DECOMPOSITION] = 'Perl_Decomposition_Mapping'; 11264 $field_names[$TITLE] = 'Titlecase_Mapping'; 11265 $field_names[$UNICODE_1_NAME] = 'Unicode_1_Name'; 11266 $field_names[$UPPER] = 'Uppercase_Mapping'; 11267 11268 # Some of these need a little more explanation: 11269 # The $PERL_DECIMAL_DIGIT field does not lead to an official Unicode 11270 # property, but is used in calculating the Numeric_Type. Perl however, 11271 # creates a file from this field, so a Perl property is created from it. 11272 # Similarly, the Other_Digit field is used only for calculating the 11273 # Numeric_Type, and so it can be safely re-used as the place to store 11274 # the value for Numeric_Type; hence it is referred to as 11275 # $NUMERIC_TYPE_OTHER_DIGIT. 11276 # The input field named $PERL_DECOMPOSITION is a combination of both the 11277 # decomposition mapping and its type. Perl creates a file containing 11278 # exactly this field, so it is used for that. The two properties are 11279 # separated into two extra output fields, $DECOMP_MAP and $DECOMP_TYPE. 11280 # $DECOMP_MAP is usually suppressed (unless the lists are changed to 11281 # output it), as Perl doesn't use it directly. 11282 # The input field named here $CHARNAME is used to construct the 11283 # Perl_Charnames property, which is a combination of the Name property 11284 # (which the input field contains), and the Unicode_1_Name property, and 11285 # others from other files. Since, the strict Name property is not used 11286 # by Perl, this field is used for the table that Perl does use. The 11287 # strict Name property table is usually suppressed (unless the lists are 11288 # changed to output it), so it is accumulated in a separate field, 11289 # $NAME, which to save time is discarded unless the table is actually to 11290 # be output 11291 11292 # This file is processed like most in this program. Control is passed to 11293 # process_generic_property_file() which calls filter_UnicodeData_line() 11294 # for each input line. This filter converts the input into line(s) that 11295 # process_generic_property_file() understands. There is also a setup 11296 # routine called before any of the file is processed, and a handler for 11297 # EOF processing, all in this closure. 11298 11299 # A huge speed-up occurred at the cost of some added complexity when these 11300 # routines were altered to buffer the outputs into ranges. Almost all the 11301 # lines of the input file apply to just one code point, and for most 11302 # properties, the map for the next code point up is the same as the 11303 # current one. So instead of creating a line for each property for each 11304 # input line, filter_UnicodeData_line() remembers what the previous map 11305 # of a property was, and doesn't generate a line to pass on until it has 11306 # to, as when the map changes; and that passed-on line encompasses the 11307 # whole contiguous range of code points that have the same map for that 11308 # property. This means a slight amount of extra setup, and having to 11309 # flush these buffers on EOF, testing if the maps have changed, plus 11310 # remembering state information in the closure. But it means a lot less 11311 # real time in not having to change the data base for each property on 11312 # each line. 11313 11314 # Another complication is that there are already a few ranges designated 11315 # in the input. There are two lines for each, with the same maps except 11316 # the code point and name on each line. This was actually the hardest 11317 # thing to design around. The code points in those ranges may actually 11318 # have real maps not given by these two lines. These maps will either 11319 # be algorithmically determinable, or be in the extracted files furnished 11320 # with the UCD. In the event of conflicts between these extracted files, 11321 # and this one, Unicode says that this one prevails. But it shouldn't 11322 # prevail for conflicts that occur in these ranges. The data from the 11323 # extracted files prevails in those cases. So, this program is structured 11324 # so that those files are processed first, storing maps. Then the other 11325 # files are processed, generally overwriting what the extracted files 11326 # stored. But just the range lines in this input file are processed 11327 # without overwriting. This is accomplished by adding a special string to 11328 # the lines output to tell process_generic_property_file() to turn off the 11329 # overwriting for just this one line. 11330 # A similar mechanism is used to tell it that the map is of a non-default 11331 # type. 11332 11333 sub setup_UnicodeData($file) { # Called before any lines of the input are read 11334 11335 # Create a new property specially located that is a combination of 11336 # various Name properties: Name, Unicode_1_Name, Named Sequences, and 11337 # _Perl_Name_Alias properties. (The final one duplicates elements of the 11338 # first, and starting in v6.1, is the same as the 'Name_Alias 11339 # property.) A comment for the new property will later be constructed 11340 # based on the actual properties present and used 11341 $perl_charname = Property->new('Perl_Charnames', 11342 Default_Map => "", 11343 Directory => File::Spec->curdir(), 11344 File => 'Name', 11345 Fate => $INTERNAL_ONLY, 11346 Perl_Extension => 1, 11347 Range_Size_1 => \&output_perl_charnames_line, 11348 Type => $STRING, 11349 ); 11350 $perl_charname->set_proxy_for('Name'); 11351 11352 my $Perl_decomp = Property->new('Perl_Decomposition_Mapping', 11353 Directory => File::Spec->curdir(), 11354 File => 'Decomposition', 11355 Format => $DECOMP_STRING_FORMAT, 11356 Fate => $INTERNAL_ONLY, 11357 Perl_Extension => 1, 11358 Default_Map => $CODE_POINT, 11359 11360 # normalize.pm can't cope with these 11361 Output_Range_Counts => 0, 11362 11363 # This is a specially formatted table 11364 # explicitly for normalize.pm, which 11365 # is expecting a particular format, 11366 # which means that mappings containing 11367 # multiple code points are in the main 11368 # body of the table 11369 Map_Type => $COMPUTE_NO_MULTI_CP, 11370 Type => $STRING, 11371 To_Output_Map => $INTERNAL_MAP, 11372 ); 11373 $Perl_decomp->set_proxy_for('Decomposition_Mapping', 'Decomposition_Type'); 11374 $Perl_decomp->add_comment(join_lines(<<END 11375This mapping is a combination of the Unicode 'Decomposition_Type' and 11376'Decomposition_Mapping' properties, formatted for use by normalize.pm. It is 11377identical to the official Unicode 'Decomposition_Mapping' property except for 11378two things: 11379 1) It omits the algorithmically determinable Hangul syllable decompositions, 11380which normalize.pm handles algorithmically. 11381 2) It contains the decomposition type as well. Non-canonical decompositions 11382begin with a word in angle brackets, like <super>, which denotes the 11383compatible decomposition type. If the map does not begin with the <angle 11384brackets>, the decomposition is canonical. 11385END 11386 )); 11387 11388 my $Decimal_Digit = Property->new("Perl_Decimal_Digit", 11389 Default_Map => "", 11390 Perl_Extension => 1, 11391 Directory => $map_directory, 11392 Type => $STRING, 11393 To_Output_Map => $OUTPUT_ADJUSTED, 11394 ); 11395 $Decimal_Digit->add_comment(join_lines(<<END 11396This file gives the mapping of all code points which represent a single 11397decimal digit [0-9] to their respective digits, but it has ranges of 10 code 11398points, and the mapping of each non-initial element of each range is actually 11399not to "0", but to the offset that element has from its corresponding DIGIT 0. 11400These code points are those that have Numeric_Type=Decimal; not special 11401things, like subscripts nor Roman numerals. 11402END 11403 )); 11404 11405 # These properties are not used for generating anything else, and are 11406 # usually not output. By making them last in the list, we can just 11407 # change the high end of the loop downwards to avoid the work of 11408 # generating a table(s) that is/are just going to get thrown away. 11409 if (! property_ref('Decomposition_Mapping')->to_output_map 11410 && ! property_ref('Name')->to_output_map) 11411 { 11412 $last_field = min($NAME, $DECOMP_MAP) - 1; 11413 } elsif (property_ref('Decomposition_Mapping')->to_output_map) { 11414 $last_field = $DECOMP_MAP; 11415 } elsif (property_ref('Name')->to_output_map) { 11416 $last_field = $NAME; 11417 } 11418 return; 11419 } 11420 11421 my $first_time = 1; # ? Is this the first line of the file 11422 my $in_range = 0; # ? Are we in one of the file's ranges 11423 my $previous_cp; # hex code point of previous line 11424 my $decimal_previous_cp = -1; # And its decimal equivalent 11425 my @start; # For each field, the current starting 11426 # code point in hex for the range 11427 # being accumulated. 11428 my @fields; # The input fields; 11429 my @previous_fields; # And those from the previous call 11430 11431 sub filter_UnicodeData_line($file) { 11432 # Handle a single input line from UnicodeData.txt; see comments above 11433 # Conceptually this takes a single line from the file containing N 11434 # properties, and converts it into N lines with one property per line, 11435 # which is what the final handler expects. But there are 11436 # complications due to the quirkiness of the input file, and to save 11437 # time, it accumulates ranges where the property values don't change 11438 # and only emits lines when necessary. This is about an order of 11439 # magnitude fewer lines emitted. 11440 11441 # $_ contains the input line. 11442 # -1 in split means retain trailing null fields 11443 (my $cp, @fields) = split /\s*;\s*/, $_, -1; 11444 11445 #local $to_trace = 1 if main::DEBUG; 11446 trace $cp, @fields , $input_field_count if main::DEBUG && $to_trace; 11447 if (@fields > $input_field_count) { 11448 $file->carp_bad_line('Extra fields'); 11449 $_ = ""; 11450 return; 11451 } 11452 11453 my $decimal_cp = hex $cp; 11454 11455 # We have to output all the buffered ranges when the next code point 11456 # is not exactly one after the previous one, which means there is a 11457 # gap in the ranges. 11458 my $force_output = ($decimal_cp != $decimal_previous_cp + 1); 11459 11460 # The decomposition mapping field requires special handling. It looks 11461 # like either: 11462 # 11463 # <compat> 0032 0020 11464 # 0041 0300 11465 # 11466 # The decomposition type is enclosed in <brackets>; if missing, it 11467 # means the type is canonical. There are two decomposition mapping 11468 # tables: the one for use by Perl's normalize.pm has a special format 11469 # which is this field intact; the other, for general use is of 11470 # standard format. In either case we have to find the decomposition 11471 # type. Empty fields have None as their type, and map to the code 11472 # point itself 11473 if ($fields[$PERL_DECOMPOSITION] eq "") { 11474 $fields[$DECOMP_TYPE] = 'None'; 11475 $fields[$DECOMP_MAP] = $fields[$PERL_DECOMPOSITION] = $CODE_POINT; 11476 } 11477 else { 11478 ($fields[$DECOMP_TYPE], my $map) = $fields[$PERL_DECOMPOSITION] 11479 =~ / < ( .+? ) > \s* ( .+ ) /x; 11480 if (! defined $fields[$DECOMP_TYPE]) { 11481 $fields[$DECOMP_TYPE] = 'Canonical'; 11482 $fields[$DECOMP_MAP] = $fields[$PERL_DECOMPOSITION]; 11483 } 11484 else { 11485 $fields[$DECOMP_MAP] = $map; 11486 } 11487 } 11488 11489 # The 3 numeric fields also require special handling. The 2 digit 11490 # fields must be either empty or match the number field. This means 11491 # that if it is empty, they must be as well, and the numeric type is 11492 # None, and the numeric value is 'Nan'. 11493 # The decimal digit field must be empty or match the other digit 11494 # field. If the decimal digit field is non-empty, the code point is 11495 # a decimal digit, and the other two fields will have the same value. 11496 # If it is empty, but the other digit field is non-empty, the code 11497 # point is an 'other digit', and the number field will have the same 11498 # value as the other digit field. If the other digit field is empty, 11499 # but the number field is non-empty, the code point is a generic 11500 # numeric type. 11501 if ($fields[$NUMERIC] eq "") { 11502 if ($fields[$PERL_DECIMAL_DIGIT] ne "" 11503 || $fields[$NUMERIC_TYPE_OTHER_DIGIT] ne "" 11504 ) { 11505 $file->carp_bad_line("Numeric values inconsistent. Trying to process anyway"); 11506 } 11507 $fields[$NUMERIC_TYPE_OTHER_DIGIT] = 'None'; 11508 $fields[$NUMERIC] = 'NaN'; 11509 } 11510 else { 11511 $file->carp_bad_line("'$fields[$NUMERIC]' should be a whole or rational number. Processing as if it were") if $fields[$NUMERIC] !~ qr{ ^ -? \d+ ( / \d+ )? $ }x; 11512 if ($fields[$PERL_DECIMAL_DIGIT] ne "") { 11513 $file->carp_bad_line("$fields[$PERL_DECIMAL_DIGIT] should equal $fields[$NUMERIC]. Processing anyway") if $fields[$PERL_DECIMAL_DIGIT] != $fields[$NUMERIC]; 11514 $file->carp_bad_line("$fields[$PERL_DECIMAL_DIGIT] should be empty since the general category ($fields[$CATEGORY]) isn't 'Nd'. Processing as Decimal") if $fields[$CATEGORY] ne "Nd"; 11515 $fields[$NUMERIC_TYPE_OTHER_DIGIT] = 'Decimal'; 11516 } 11517 elsif ($fields[$NUMERIC_TYPE_OTHER_DIGIT] ne "") { 11518 $file->carp_bad_line("$fields[$NUMERIC_TYPE_OTHER_DIGIT] should equal $fields[$NUMERIC]. Processing anyway") if $fields[$NUMERIC_TYPE_OTHER_DIGIT] != $fields[$NUMERIC]; 11519 $fields[$NUMERIC_TYPE_OTHER_DIGIT] = 'Digit'; 11520 } 11521 else { 11522 $fields[$NUMERIC_TYPE_OTHER_DIGIT] = 'Numeric'; 11523 11524 # Rationals require extra effort. 11525 if ($fields[$NUMERIC] =~ qr{/}) { 11526 reduce_fraction(\$fields[$NUMERIC]); 11527 register_fraction($fields[$NUMERIC]) 11528 } 11529 } 11530 } 11531 11532 # For the properties that have empty fields in the file, and which 11533 # mean something different from empty, change them to that default. 11534 # Certain fields just haven't been empty so far in any Unicode 11535 # version, so don't look at those, namely $MIRRORED, $BIDI, $CCC, 11536 # $CATEGORY. This leaves just the two fields, and so we hard-code in 11537 # the defaults; which are very unlikely to ever change. 11538 $fields[$UPPER] = $CODE_POINT if $fields[$UPPER] eq ""; 11539 $fields[$LOWER] = $CODE_POINT if $fields[$LOWER] eq ""; 11540 11541 # UAX44 says that if title is empty, it is the same as whatever upper 11542 # is, 11543 $fields[$TITLE] = $fields[$UPPER] if $fields[$TITLE] eq ""; 11544 11545 # There are a few pairs of lines like: 11546 # AC00;<Hangul Syllable, First>;Lo;0;L;;;;;N;;;;; 11547 # D7A3;<Hangul Syllable, Last>;Lo;0;L;;;;;N;;;;; 11548 # that define ranges. These should be processed after the fields are 11549 # adjusted above, as they may override some of them; but mostly what 11550 # is left is to possibly adjust the $CHARNAME field. The names of all the 11551 # paired lines start with a '<', but this is also true of '<control>, 11552 # which isn't one of these special ones. 11553 if ($fields[$CHARNAME] eq '<control>') { 11554 11555 # Some code points in this file have the pseudo-name 11556 # '<control>', but the official name for such ones is the null 11557 # string. 11558 $fields[$NAME] = $fields[$CHARNAME] = ""; 11559 11560 # We had better not be in between range lines. 11561 if ($in_range) { 11562 $file->carp_bad_line("Expecting a closing range line, not a $fields[$CHARNAME]'. Trying anyway"); 11563 $in_range = 0; 11564 } 11565 } 11566 elsif (substr($fields[$CHARNAME], 0, 1) ne '<') { 11567 11568 # Here is a non-range line. We had better not be in between range 11569 # lines. 11570 if ($in_range) { 11571 $file->carp_bad_line("Expecting a closing range line, not a $fields[$CHARNAME]'. Trying anyway"); 11572 $in_range = 0; 11573 } 11574 if ($fields[$CHARNAME] =~ s/- $cp $//x) { 11575 11576 # These are code points whose names end in their code points, 11577 # which means the names are algorithmically derivable from the 11578 # code points. To shorten the output Name file, the algorithm 11579 # for deriving these is placed in the file instead of each 11580 # code point, so they have map type $CP_IN_NAME 11581 $fields[$CHARNAME] = $CMD_DELIM 11582 . $MAP_TYPE_CMD 11583 . '=' 11584 . $CP_IN_NAME 11585 . $CMD_DELIM 11586 . $fields[$CHARNAME]; 11587 } 11588 $fields[$NAME] = $fields[$CHARNAME]; 11589 } 11590 elsif ($fields[$CHARNAME] =~ /^<(.+), First>$/) { 11591 $fields[$CHARNAME] = $fields[$NAME] = $1; 11592 11593 # Here we are at the beginning of a range pair. 11594 if ($in_range) { 11595 $file->carp_bad_line("Expecting a closing range line, not a beginning one, $fields[$CHARNAME]'. Trying anyway"); 11596 } 11597 $in_range = 1; 11598 11599 # Because the properties in the range do not overwrite any already 11600 # in the db, we must flush the buffers of what's already there, so 11601 # they get handled in the normal scheme. 11602 $force_output = 1; 11603 11604 } 11605 elsif ($fields[$CHARNAME] !~ s/^<(.+), Last>$/$1/) { 11606 $file->carp_bad_line("Unexpected name starting with '<' $fields[$CHARNAME]. Ignoring this line."); 11607 $_ = ""; 11608 return; 11609 } 11610 else { # Here, we are at the last line of a range pair. 11611 11612 if (! $in_range) { 11613 $file->carp_bad_line("Unexpected end of range $fields[$CHARNAME] when not in one. Ignoring this line."); 11614 $_ = ""; 11615 return; 11616 } 11617 $in_range = 0; 11618 11619 $fields[$NAME] = $fields[$CHARNAME]; 11620 11621 # Check that the input is valid: that the closing of the range is 11622 # the same as the beginning. 11623 foreach my $i (0 .. $last_field) { 11624 next if $fields[$i] eq $previous_fields[$i]; 11625 $file->carp_bad_line("Expecting '$fields[$i]' to be the same as '$previous_fields[$i]'. Bad News. Trying anyway"); 11626 } 11627 11628 # The processing differs depending on the type of range, 11629 # determined by its $CHARNAME 11630 if ($fields[$CHARNAME] =~ /^Hangul Syllable/) { 11631 11632 # Check that the data looks right. 11633 if ($decimal_previous_cp != $SBase) { 11634 $file->carp_bad_line("Unexpected Hangul syllable start = $previous_cp. Bad News. Results will be wrong"); 11635 } 11636 if ($decimal_cp != $SBase + $SCount - 1) { 11637 $file->carp_bad_line("Unexpected Hangul syllable end = $cp. Bad News. Results will be wrong"); 11638 } 11639 11640 # The Hangul syllable range has a somewhat complicated name 11641 # generation algorithm. Each code point in it has a canonical 11642 # decomposition also computable by an algorithm. The 11643 # perl decomposition map table built from these is used only 11644 # by normalize.pm, which has the algorithm built in it, so the 11645 # decomposition maps are not needed, and are large, so are 11646 # omitted from it. If the full decomposition map table is to 11647 # be output, the decompositions are generated for it, in the 11648 # EOF handling code for this input file. 11649 11650 $previous_fields[$DECOMP_TYPE] = 'Canonical'; 11651 11652 # This range is stored in our internal structure with its 11653 # own map type, different from all others. 11654 $previous_fields[$CHARNAME] = $previous_fields[$NAME] 11655 = $CMD_DELIM 11656 . $MAP_TYPE_CMD 11657 . '=' 11658 . $HANGUL_SYLLABLE 11659 . $CMD_DELIM 11660 . $fields[$CHARNAME]; 11661 } 11662 elsif ($fields[$CATEGORY] eq 'Lo') { # Is a letter 11663 11664 # All the CJK ranges like this have the name given as a 11665 # special case in the next code line. And for the others, we 11666 # hope that Unicode continues to use the correct name in 11667 # future releases, so we don't have to make further special 11668 # cases. 11669 my $name = ($fields[$CHARNAME] =~ /^CJK/) 11670 ? 'CJK UNIFIED IDEOGRAPH' 11671 : uc $fields[$CHARNAME]; 11672 11673 # The name for these contains the code point itself, and all 11674 # are defined to have the same base name, regardless of what 11675 # is in the file. They are stored in our internal structure 11676 # with a map type of $CP_IN_NAME 11677 $previous_fields[$CHARNAME] = $previous_fields[$NAME] 11678 = $CMD_DELIM 11679 . $MAP_TYPE_CMD 11680 . '=' 11681 . $CP_IN_NAME 11682 . $CMD_DELIM 11683 . $name; 11684 11685 } 11686 elsif ($fields[$CATEGORY] eq 'Co' 11687 || $fields[$CATEGORY] eq 'Cs') 11688 { 11689 # The names of all the code points in these ranges are set to 11690 # null, as there are no names for the private use and 11691 # surrogate code points. 11692 11693 $previous_fields[$CHARNAME] = $previous_fields[$NAME] = ""; 11694 } 11695 else { 11696 $file->carp_bad_line("Unexpected code point range $fields[$CHARNAME] because category is $fields[$CATEGORY]. Attempting to process it."); 11697 } 11698 11699 # The first line of the range caused everything else to be output, 11700 # and then its values were stored as the beginning values for the 11701 # next set of ranges, which this one ends. Now, for each value, 11702 # add a command to tell the handler that these values should not 11703 # replace any existing ones in our database. 11704 foreach my $i (0 .. $last_field) { 11705 $previous_fields[$i] = $CMD_DELIM 11706 . $REPLACE_CMD 11707 . '=' 11708 . $NO 11709 . $CMD_DELIM 11710 . $previous_fields[$i]; 11711 } 11712 11713 # And change things so it looks like the entire range has been 11714 # gone through with this being the final part of it. Adding the 11715 # command above to each field will cause this range to be flushed 11716 # during the next iteration, as it guaranteed that the stored 11717 # field won't match whatever value the next one has. 11718 $previous_cp = $cp; 11719 $decimal_previous_cp = $decimal_cp; 11720 11721 # We are now set up for the next iteration; so skip the remaining 11722 # code in this subroutine that does the same thing, but doesn't 11723 # know about these ranges. 11724 $_ = ""; 11725 11726 return; 11727 } 11728 11729 # On the very first line, we fake it so the code below thinks there is 11730 # nothing to output, and initialize so that when it does get output it 11731 # uses the first line's values for the lowest part of the range. 11732 # (One could avoid this by using peek(), but then one would need to 11733 # know the adjustments done above and do the same ones in the setup 11734 # routine; not worth it) 11735 if ($first_time) { 11736 $first_time = 0; 11737 @previous_fields = @fields; 11738 @start = ($cp) x scalar @fields; 11739 $decimal_previous_cp = $decimal_cp - 1; 11740 } 11741 11742 # For each field, output the stored up ranges that this code point 11743 # doesn't fit in. Earlier we figured out if all ranges should be 11744 # terminated because of changing the replace or map type styles, or if 11745 # there is a gap between this new code point and the previous one, and 11746 # that is stored in $force_output. But even if those aren't true, we 11747 # need to output the range if this new code point's value for the 11748 # given property doesn't match the stored range's. 11749 #local $to_trace = 1 if main::DEBUG; 11750 foreach my $i (0 .. $last_field) { 11751 my $field = $fields[$i]; 11752 if ($force_output || $field ne $previous_fields[$i]) { 11753 11754 # Flush the buffer of stored values. 11755 $file->insert_adjusted_lines("$start[$i]..$previous_cp; $field_names[$i]; $previous_fields[$i]"); 11756 11757 # Start a new range with this code point and its value 11758 $start[$i] = $cp; 11759 $previous_fields[$i] = $field; 11760 } 11761 } 11762 11763 # Set the values for the next time. 11764 $previous_cp = $cp; 11765 $decimal_previous_cp = $decimal_cp; 11766 11767 # The input line has generated whatever adjusted lines are needed, and 11768 # should not be looked at further. 11769 $_ = ""; 11770 return; 11771 } 11772 11773 sub EOF_UnicodeData($file) { 11774 # Called upon EOF to flush the buffers, and create the Hangul 11775 # decomposition mappings if needed. 11776 11777 # Flush the buffers. 11778 foreach my $i (0 .. $last_field) { 11779 $file->insert_adjusted_lines("$start[$i]..$previous_cp; $field_names[$i]; $previous_fields[$i]"); 11780 } 11781 11782 if (-e 'Jamo.txt') { 11783 11784 # The algorithm is published by Unicode, based on values in 11785 # Jamo.txt, (which should have been processed before this 11786 # subroutine), and the results left in %Jamo 11787 unless (%Jamo) { 11788 Carp::my_carp_bug("Jamo.txt should be processed before Unicode.txt. Hangul syllables not generated."); 11789 return; 11790 } 11791 11792 # If the full decomposition map table is being output, insert 11793 # into it the Hangul syllable mappings. This is to avoid having 11794 # to publish a subroutine in it to compute them. (which would 11795 # essentially be this code.) This uses the algorithm published by 11796 # Unicode. (No hangul syllables in version 1) 11797 if ($v_version ge v2.0.0 11798 && property_ref('Decomposition_Mapping')->to_output_map) { 11799 for (my $S = $SBase; $S < $SBase + $SCount; $S++) { 11800 use integer; 11801 my $SIndex = $S - $SBase; 11802 my $L = $LBase + $SIndex / $NCount; 11803 my $V = $VBase + ($SIndex % $NCount) / $TCount; 11804 my $T = $TBase + $SIndex % $TCount; 11805 11806 trace "L=$L, V=$V, T=$T" if main::DEBUG && $to_trace; 11807 my $decomposition = sprintf("%04X %04X", $L, $V); 11808 $decomposition .= sprintf(" %04X", $T) if $T != $TBase; 11809 $file->insert_adjusted_lines( 11810 sprintf("%04X; Decomposition_Mapping; %s", 11811 $S, 11812 $decomposition)); 11813 } 11814 } 11815 } 11816 11817 return; 11818 } 11819 11820 sub filter_v1_ucd($file) { 11821 # Fix UCD lines in version 1. This is probably overkill, but this 11822 # fixes some glaring errors in Version 1 UnicodeData.txt. That file: 11823 # 1) had many Hangul (U+3400 - U+4DFF) code points that were later 11824 # removed. This program retains them 11825 # 2) didn't include ranges, which it should have, and which are now 11826 # added in @corrected_lines below. It was hand populated by 11827 # taking the data from Version 2, verified by analyzing 11828 # DAge.txt. 11829 # 3) There is a syntax error in the entry for U+09F8 which could 11830 # cause problems for Unicode::UCD, and so is changed. It's 11831 # numeric value was simply a minus sign, without any number. 11832 # (Eventually Unicode changed the code point to non-numeric.) 11833 # 4) The decomposition types often don't match later versions 11834 # exactly, and the whole syntax of that field is different; so 11835 # the syntax is changed as well as the types to their later 11836 # terminology. Otherwise normalize.pm would be very unhappy 11837 # 5) Many ccc classes are different. These are left intact. 11838 # 6) U+FF10..U+FF19 are missing their numeric values in all three 11839 # fields. These are unchanged because it doesn't really cause 11840 # problems for Perl. 11841 # 7) A number of code points, such as controls, don't have their 11842 # Unicode Version 1 Names in this file. These are added. 11843 # 8) A number of Symbols were marked as Lm. This changes those in 11844 # the Latin1 range, so that regexes work. 11845 # 9) The odd characters U+03DB .. U+03E1 weren't encoded but are 11846 # referred to by their lc equivalents. Not fixed. 11847 11848 my @corrected_lines = split /\n/, <<'END'; 118494E00;<CJK Ideograph, First>;Lo;0;L;;;;;N;;;;; 118509FA5;<CJK Ideograph, Last>;Lo;0;L;;;;;N;;;;; 11851E000;<Private Use, First>;Co;0;L;;;;;N;;;;; 11852F8FF;<Private Use, Last>;Co;0;L;;;;;N;;;;; 11853F900;<CJK Compatibility Ideograph, First>;Lo;0;L;;;;;N;;;;; 11854FA2D;<CJK Compatibility Ideograph, Last>;Lo;0;L;;;;;N;;;;; 11855END 11856 11857 #local $to_trace = 1 if main::DEBUG; 11858 trace $_ if main::DEBUG && $to_trace; 11859 11860 # -1 => retain trailing null fields 11861 my ($code_point, @fields) = split /\s*;\s*/, $_, -1; 11862 11863 # At the first place that is wrong in the input, insert all the 11864 # corrections, replacing the wrong line. 11865 if ($code_point eq '4E00') { 11866 my @copy = @corrected_lines; 11867 $_ = shift @copy; 11868 ($code_point, @fields) = split /\s*;\s*/, $_, -1; 11869 11870 $file->insert_lines(@copy); 11871 } 11872 elsif ($code_point =~ /^00/ && $fields[$CATEGORY] eq 'Lm') { 11873 11874 # There are no Lm characters in Latin1; these should be 'Sk', but 11875 # there isn't that in V1. 11876 $fields[$CATEGORY] = 'So'; 11877 } 11878 11879 if ($fields[$NUMERIC] eq '-') { 11880 $fields[$NUMERIC] = '-1'; # This is what 2.0 made it. 11881 } 11882 11883 if ($fields[$PERL_DECOMPOSITION] ne "") { 11884 11885 # Several entries have this change to superscript 2 or 3 in the 11886 # middle. Convert these to the modern version, which is to use 11887 # the actual U+00B2 and U+00B3 (the superscript forms) instead. 11888 # So 'HHHH HHHH <+sup> 0033 <-sup> HHHH' becomes 11889 # 'HHHH HHHH 00B3 HHHH'. 11890 # It turns out that all of these that don't have another 11891 # decomposition defined at the beginning of the line have the 11892 # <square> decomposition in later releases. 11893 if ($code_point ne '00B2' && $code_point ne '00B3') { 11894 if ($fields[$PERL_DECOMPOSITION] 11895 =~ s/<\+sup> 003([23]) <-sup>/00B$1/) 11896 { 11897 if (substr($fields[$PERL_DECOMPOSITION], 0, 1) ne '<') { 11898 $fields[$PERL_DECOMPOSITION] = '<square> ' 11899 . $fields[$PERL_DECOMPOSITION]; 11900 } 11901 } 11902 } 11903 11904 # If is like '<+circled> 0052 <-circled>', convert to 11905 # '<circled> 0052' 11906 $fields[$PERL_DECOMPOSITION] =~ 11907 s/ < \+ ( .*? ) > \s* (.*?) \s* <-\1> /<$1> $2/xg; 11908 11909 # Convert '<join> HHHH HHHH <join>' to '<medial> HHHH HHHH', etc. 11910 $fields[$PERL_DECOMPOSITION] =~ 11911 s/ <join> \s* (.*?) \s* <no-join> /<final> $1/x 11912 or $fields[$PERL_DECOMPOSITION] =~ 11913 s/ <join> \s* (.*?) \s* <join> /<medial> $1/x 11914 or $fields[$PERL_DECOMPOSITION] =~ 11915 s/ <no-join> \s* (.*?) \s* <join> /<initial> $1/x 11916 or $fields[$PERL_DECOMPOSITION] =~ 11917 s/ <no-join> \s* (.*?) \s* <no-join> /<isolated> $1/x; 11918 11919 # Convert '<break> HHHH HHHH <break>' to '<break> HHHH', etc. 11920 $fields[$PERL_DECOMPOSITION] =~ 11921 s/ <(break|no-break)> \s* (.*?) \s* <\1> /<$1> $2/x; 11922 11923 # Change names to modern form. 11924 $fields[$PERL_DECOMPOSITION] =~ s/<font variant>/<font>/g; 11925 $fields[$PERL_DECOMPOSITION] =~ s/<no-break>/<noBreak>/g; 11926 $fields[$PERL_DECOMPOSITION] =~ s/<circled>/<circle>/g; 11927 $fields[$PERL_DECOMPOSITION] =~ s/<break>/<fraction>/g; 11928 11929 # One entry has weird braces 11930 $fields[$PERL_DECOMPOSITION] =~ s/[{}]//g; 11931 11932 # One entry at U+2116 has an extra <sup> 11933 $fields[$PERL_DECOMPOSITION] =~ s/( < .*? > .* ) < .*? > \ * /$1/x; 11934 } 11935 11936 $_ = join ';', $code_point, @fields; 11937 trace $_ if main::DEBUG && $to_trace; 11938 return; 11939 } 11940 11941 sub filter_bad_Nd_ucd { 11942 # Early versions specified a value in the decimal digit field even 11943 # though the code point wasn't a decimal digit. Clear the field in 11944 # that situation, so that the main code doesn't think it is a decimal 11945 # digit. 11946 11947 my ($code_point, @fields) = split /\s*;\s*/, $_, -1; 11948 if ($fields[$PERL_DECIMAL_DIGIT] ne "" && $fields[$CATEGORY] ne 'Nd') { 11949 $fields[$PERL_DECIMAL_DIGIT] = ""; 11950 $_ = join ';', $code_point, @fields; 11951 } 11952 return; 11953 } 11954 11955 my @U1_control_names = split /\n/, <<'END'; 11956NULL 11957START OF HEADING 11958START OF TEXT 11959END OF TEXT 11960END OF TRANSMISSION 11961ENQUIRY 11962ACKNOWLEDGE 11963BELL 11964BACKSPACE 11965HORIZONTAL TABULATION 11966LINE FEED 11967VERTICAL TABULATION 11968FORM FEED 11969CARRIAGE RETURN 11970SHIFT OUT 11971SHIFT IN 11972DATA LINK ESCAPE 11973DEVICE CONTROL ONE 11974DEVICE CONTROL TWO 11975DEVICE CONTROL THREE 11976DEVICE CONTROL FOUR 11977NEGATIVE ACKNOWLEDGE 11978SYNCHRONOUS IDLE 11979END OF TRANSMISSION BLOCK 11980CANCEL 11981END OF MEDIUM 11982SUBSTITUTE 11983ESCAPE 11984FILE SEPARATOR 11985GROUP SEPARATOR 11986RECORD SEPARATOR 11987UNIT SEPARATOR 11988DELETE 11989BREAK PERMITTED HERE 11990NO BREAK HERE 11991INDEX 11992NEXT LINE 11993START OF SELECTED AREA 11994END OF SELECTED AREA 11995CHARACTER TABULATION SET 11996CHARACTER TABULATION WITH JUSTIFICATION 11997LINE TABULATION SET 11998PARTIAL LINE DOWN 11999PARTIAL LINE UP 12000REVERSE LINE FEED 12001SINGLE SHIFT TWO 12002SINGLE SHIFT THREE 12003DEVICE CONTROL STRING 12004PRIVATE USE ONE 12005PRIVATE USE TWO 12006SET TRANSMIT STATE 12007CANCEL CHARACTER 12008MESSAGE WAITING 12009START OF GUARDED AREA 12010END OF GUARDED AREA 12011START OF STRING 12012SINGLE CHARACTER INTRODUCER 12013CONTROL SEQUENCE INTRODUCER 12014STRING TERMINATOR 12015OPERATING SYSTEM COMMAND 12016PRIVACY MESSAGE 12017APPLICATION PROGRAM COMMAND 12018END 12019 12020 sub filter_early_U1_names { 12021 # Very early versions did not have the Unicode_1_name field specified. 12022 # They differed in which ones were present; make sure a U1 name 12023 # exists, so that Unicode::UCD::charinfo will work 12024 12025 my ($code_point, @fields) = split /\s*;\s*/, $_, -1; 12026 12027 12028 # @U1_control names above are entirely positional, so we pull them out 12029 # in the exact order required, with gaps for the ones that don't have 12030 # names. 12031 if ($code_point =~ /^00[01]/ 12032 || $code_point eq '007F' 12033 || $code_point =~ /^008[2-9A-F]/ 12034 || $code_point =~ /^009[0-8A-F]/) 12035 { 12036 my $u1_name = shift @U1_control_names; 12037 $fields[$UNICODE_1_NAME] = $u1_name unless $fields[$UNICODE_1_NAME]; 12038 $_ = join ';', $code_point, @fields; 12039 } 12040 return; 12041 } 12042 12043 sub filter_v2_1_5_ucd { 12044 # A dozen entries in this 2.1.5 file had the mirrored and numeric 12045 # columns swapped; These all had mirrored be 'N'. So if the numeric 12046 # column appears to be N, swap it back. 12047 12048 my ($code_point, @fields) = split /\s*;\s*/, $_, -1; 12049 if ($fields[$NUMERIC] eq 'N') { 12050 $fields[$NUMERIC] = $fields[$MIRRORED]; 12051 $fields[$MIRRORED] = 'N'; 12052 $_ = join ';', $code_point, @fields; 12053 } 12054 return; 12055 } 12056 12057 sub filter_v6_ucd { 12058 12059 # Unicode 6.0 co-opted the name BELL for U+1F514, but until 5.17, 12060 # it wasn't accepted, to allow for some deprecation cycles. This 12061 # function is not called after 5.16 12062 12063 return if $_ !~ /^(?:0007|1F514|070F);/; 12064 12065 my ($code_point, @fields) = split /\s*;\s*/, $_, -1; 12066 if ($code_point eq '0007') { 12067 $fields[$CHARNAME] = ""; 12068 } 12069 elsif ($code_point eq '070F') { # Unicode Corrigendum #8; see 12070 # http://www.unicode.org/versions/corrigendum8.html 12071 $fields[$BIDI] = "AL"; 12072 } 12073 elsif ($^V lt v5.18.0) { # For 5.18 will convert to use Unicode's name 12074 $fields[$CHARNAME] = ""; 12075 } 12076 12077 $_ = join ';', $code_point, @fields; 12078 12079 return; 12080 } 12081} # End closure for UnicodeData 12082 12083sub process_GCB_test($file) { 12084 12085 while ($file->next_line) { 12086 push @backslash_X_tests, $_; 12087 } 12088 12089 return; 12090} 12091 12092sub process_LB_test($file) { 12093 12094 while ($file->next_line) { 12095 push @LB_tests, $_; 12096 } 12097 12098 return; 12099} 12100 12101sub process_SB_test($file) { 12102 12103 while ($file->next_line) { 12104 push @SB_tests, $_; 12105 } 12106 12107 return; 12108} 12109 12110sub process_WB_test($file) { 12111 12112 while ($file->next_line) { 12113 push @WB_tests, $_; 12114 } 12115 12116 return; 12117} 12118 12119sub process_NamedSequences($file) { 12120 # NamedSequences.txt entries are just added to an array. Because these 12121 # don't look like the other tables, they have their own handler. 12122 # An example: 12123 # LATIN CAPITAL LETTER A WITH MACRON AND GRAVE;0100 0300 12124 # 12125 # This just adds the sequence to an array for later handling 12126 12127 while ($file->next_line) { 12128 my ($name, $sequence, @remainder) = split /\s*;\s*/, $_, -1; 12129 if (@remainder) { 12130 $file->carp_bad_line( 12131 "Doesn't look like 'KHMER VOWEL SIGN OM;17BB 17C6'"); 12132 next; 12133 } 12134 12135 # Code points need to be 5 digits long like the other entries in 12136 # Name.pl, for regcomp.c parsing; and the ones below 0x0100 need to be 12137 # converted to native 12138 $sequence = join " ", map { sprintf("%05X", 12139 utf8::unicode_to_native(hex $_)) 12140 } split / /, $sequence; 12141 push @named_sequences, "$sequence\n$name\n"; 12142 } 12143 return; 12144} 12145 12146{ # Closure 12147 12148 my $first_range; 12149 12150 sub filter_early_ea_lb { 12151 # Fixes early EastAsianWidth.txt and LineBreak.txt files. These had a 12152 # third field be the name of the code point, which can be ignored in 12153 # most cases. But it can be meaningful if it marks a range: 12154 # 33FE;W;IDEOGRAPHIC TELEGRAPH SYMBOL FOR DAY THIRTY-ONE 12155 # 3400;W;<CJK Ideograph Extension A, First> 12156 # 12157 # We need to see the First in the example above to know it's a range. 12158 # They did not use the later range syntaxes. This routine changes it 12159 # to use the modern syntax. 12160 # $1 is the Input_file object. 12161 12162 my @fields = split /\s*;\s*/; 12163 if ($fields[2] =~ /^<.*, First>/) { 12164 $first_range = $fields[0]; 12165 $_ = ""; 12166 } 12167 elsif ($fields[2] =~ /^<.*, Last>/) { 12168 $_ = $_ = "$first_range..$fields[0]; $fields[1]"; 12169 } 12170 else { 12171 undef $first_range; 12172 $_ = "$fields[0]; $fields[1]"; 12173 } 12174 12175 return; 12176 } 12177} 12178 12179sub filter_substitute_lb { 12180 # Used on Unicodes that predate the LB property, where there is a 12181 # substitute file. This just does the regular ea_lb handling for such 12182 # files, and then substitutes the long property value name for the short 12183 # one that comes with the file. (The other break files have the long 12184 # names in them, so this is the odd one out.) The reason for doing this 12185 # kludge is that regen/mk_invlists.pl is expecting the long name. This 12186 # also fixes the typo 'Inseperable' that leads to problems. 12187 12188 filter_early_ea_lb; 12189 return unless $_; 12190 12191 my @fields = split /\s*;\s*/; 12192 $fields[1] = property_ref('_Perl_LB')->table($fields[1])->full_name; 12193 $fields[1] = 'Inseparable' if lc $fields[1] eq 'inseperable'; 12194 $_ = join '; ', @fields; 12195} 12196 12197sub filter_old_style_arabic_shaping { 12198 # Early versions used a different term for the later one. 12199 12200 my @fields = split /\s*;\s*/; 12201 $fields[3] =~ s/<no shaping>/No_Joining_Group/; 12202 $fields[3] =~ s/\s+/_/g; # Change spaces to underscores 12203 $_ = join ';', @fields; 12204 return; 12205} 12206 12207{ # Closure 12208 my $lc; # Table for lowercase mapping 12209 my $tc; 12210 my $uc; 12211 my %special_casing_code_points; 12212 12213 sub setup_special_casing($file) { 12214 # SpecialCasing.txt contains the non-simple case change mappings. The 12215 # simple ones are in UnicodeData.txt, which should already have been 12216 # read in to the full property data structures, so as to initialize 12217 # these with the simple ones. Then the SpecialCasing.txt entries 12218 # add or overwrite the ones which have different full mappings. 12219 12220 # This routine sees if the simple mappings are to be output, and if 12221 # so, copies what has already been put into the full mapping tables, 12222 # while they still contain only the simple mappings. 12223 12224 # The reason it is done this way is that the simple mappings are 12225 # probably not going to be output, so it saves work to initialize the 12226 # full tables with the simple mappings, and then overwrite those 12227 # relatively few entries in them that have different full mappings, 12228 # and thus skip the simple mapping tables altogether. 12229 12230 $lc = property_ref('lc'); 12231 $tc = property_ref('tc'); 12232 $uc = property_ref('uc'); 12233 12234 # For each of the case change mappings... 12235 foreach my $full_casing_table ($lc, $tc, $uc) { 12236 my $full_casing_name = $full_casing_table->name; 12237 my $full_casing_full_name = $full_casing_table->full_name; 12238 unless (defined $full_casing_table 12239 && ! $full_casing_table->is_empty) 12240 { 12241 Carp::my_carp_bug("Need to process UnicodeData before SpecialCasing. Only special casing will be generated."); 12242 } 12243 12244 $full_casing_table->add_comment(join_lines( <<END 12245This file includes both the simple and full case changing maps. The simple 12246ones are in the main body of the table below, and the full ones adding to or 12247overriding them are in the hash. 12248END 12249 )); 12250 12251 # The simple version's name in each mapping merely has an 's' in 12252 # front of the full one's 12253 my $simple_name = 's' . $full_casing_name; 12254 my $simple = property_ref($simple_name); 12255 $simple->initialize($full_casing_table) if $simple->to_output_map(); 12256 } 12257 12258 return; 12259 } 12260 12261 sub filter_2_1_8_special_casing_line { 12262 12263 # This version had duplicate entries in this file. Delete all but the 12264 # first one 12265 my @fields = split /\s*;\s*/, $_, -1; # -1 => retain trailing null 12266 # fields 12267 if (exists $special_casing_code_points{$fields[0]}) { 12268 $_ = ""; 12269 return; 12270 } 12271 12272 $special_casing_code_points{$fields[0]} = 1; 12273 filter_special_casing_line(@_); 12274 } 12275 12276 sub filter_special_casing_line($file) { 12277 # Change the format of $_ from SpecialCasing.txt into something that 12278 # the generic handler understands. Each input line contains three 12279 # case mappings. This will generate three lines to pass to the 12280 # generic handler for each of those. 12281 12282 # The input syntax (after stripping comments and trailing white space 12283 # is like one of the following (with the final two being entries that 12284 # we ignore): 12285 # 00DF; 00DF; 0053 0073; 0053 0053; # LATIN SMALL LETTER SHARP S 12286 # 03A3; 03C2; 03A3; 03A3; Final_Sigma; 12287 # 0307; ; 0307; 0307; tr After_I; # COMBINING DOT ABOVE 12288 # Note the trailing semi-colon, unlike many of the input files. That 12289 # means that there will be an extra null field generated by the split 12290 12291 my @fields = split /\s*;\s*/, $_, -1; # -1 => retain trailing null 12292 # fields 12293 12294 # field #4 is when this mapping is conditional. If any of these get 12295 # implemented, it would be by hard-coding in the casing functions in 12296 # the Perl core, not through tables. But if there is a new condition 12297 # we don't know about, output a warning. We know about all the 12298 # conditions through 6.0 12299 if ($fields[4] ne "") { 12300 my @conditions = split ' ', $fields[4]; 12301 if ($conditions[0] ne 'tr' # We know that these languages have 12302 # conditions, and some are multiple 12303 && $conditions[0] ne 'az' 12304 && $conditions[0] ne 'lt' 12305 12306 # And, we know about a single condition Final_Sigma, but 12307 # nothing else. 12308 && ($v_version gt v5.2.0 12309 && (@conditions > 1 || $conditions[0] ne 'Final_Sigma'))) 12310 { 12311 $file->carp_bad_line("Unknown condition '$fields[4]'. You should inspect it and either add code to handle it, or add to list of those that are to ignore"); 12312 } 12313 elsif ($conditions[0] ne 'Final_Sigma') { 12314 12315 # Don't print out a message for Final_Sigma, because we 12316 # have hard-coded handling for it. (But the standard 12317 # could change what the rule should be, but it wouldn't 12318 # show up here anyway. 12319 12320 print "# SKIPPING Special Casing: $_\n" 12321 if $verbosity >= $VERBOSE; 12322 } 12323 $_ = ""; 12324 return; 12325 } 12326 elsif (@fields > 6 || (@fields == 6 && $fields[5] ne "" )) { 12327 $file->carp_bad_line('Extra fields'); 12328 $_ = ""; 12329 return; 12330 } 12331 12332 my $decimal_code_point = hex $fields[0]; 12333 12334 # Loop to handle each of the three mappings in the input line, in 12335 # order, with $i indicating the current field number. 12336 my $i = 0; 12337 for my $object ($lc, $tc, $uc) { 12338 $i++; # First time through, $i = 0 ... 3rd time = 3 12339 12340 my $value = $object->value_of($decimal_code_point); 12341 $value = ($value eq $CODE_POINT) 12342 ? $decimal_code_point 12343 : hex $value; 12344 12345 # If this isn't a multi-character mapping, it should already have 12346 # been read in. 12347 if ($fields[$i] !~ / /) { 12348 if ($value != hex $fields[$i]) { 12349 Carp::my_carp("Bad news. UnicodeData.txt thinks " 12350 . $object->name 12351 . "(0x$fields[0]) is $value" 12352 . " and SpecialCasing.txt thinks it is " 12353 . hex($fields[$i]) 12354 . ". Good luck. Retaining UnicodeData value, and proceeding anyway."); 12355 } 12356 } 12357 else { 12358 12359 # The mapping is additional, beyond the simple mapping. 12360 $file->insert_adjusted_lines("$fields[0]; " 12361 . $object->name 12362 . "; " 12363 . $CMD_DELIM 12364 . "$REPLACE_CMD=$MULTIPLE_BEFORE" 12365 . $CMD_DELIM 12366 . $fields[$i]); 12367 } 12368 } 12369 12370 # Everything has been handled by the insert_adjusted_lines() 12371 $_ = ""; 12372 12373 return; 12374 } 12375} 12376 12377sub filter_old_style_case_folding($file) { 12378 # This transforms $_ containing the case folding style of 3.0.1, to 3.1 12379 # and later style. Different letters were used in the earlier. 12380 12381 my @fields = split /\s*;\s*/; 12382 12383 if ($fields[1] eq 'L') { 12384 $fields[1] = 'C'; # L => C always 12385 } 12386 elsif ($fields[1] eq 'E') { 12387 if ($fields[2] =~ / /) { # E => C if one code point; F otherwise 12388 $fields[1] = 'F' 12389 } 12390 else { 12391 $fields[1] = 'C' 12392 } 12393 } 12394 else { 12395 $file->carp_bad_line("Expecting L or E in second field"); 12396 $_ = ""; 12397 return; 12398 } 12399 $_ = join("; ", @fields) . ';'; 12400 return; 12401} 12402 12403{ # Closure for case folding 12404 12405 # Create the map for simple only if are going to output it, for otherwise 12406 # it takes no part in anything we do. 12407 my $to_output_simple; 12408 12409 sub setup_case_folding { 12410 # Read in the case foldings in CaseFolding.txt. This handles both 12411 # simple and full case folding. 12412 12413 $to_output_simple 12414 = property_ref('Simple_Case_Folding')->to_output_map; 12415 12416 if (! $to_output_simple) { 12417 property_ref('Case_Folding')->set_proxy_for('Simple_Case_Folding'); 12418 } 12419 12420 # If we ever wanted to show that these tables were combined, a new 12421 # property method could be created, like set_combined_props() 12422 property_ref('Case_Folding')->add_comment(join_lines( <<END 12423This file includes both the simple and full case folding maps. The simple 12424ones are in the main body of the table below, and the full ones adding to or 12425overriding them are in the hash. 12426END 12427 )); 12428 return; 12429 } 12430 12431 sub filter_case_folding_line($file) { 12432 # Called for each line in CaseFolding.txt 12433 # Input lines look like: 12434 # 0041; C; 0061; # LATIN CAPITAL LETTER A 12435 # 00DF; F; 0073 0073; # LATIN SMALL LETTER SHARP S 12436 # 1E9E; S; 00DF; # LATIN CAPITAL LETTER SHARP S 12437 # 12438 # 'C' means that folding is the same for both simple and full 12439 # 'F' that it is only for full folding 12440 # 'S' that it is only for simple folding 12441 # 'T' is locale-dependent, and ignored 12442 # 'I' is a type of 'F' used in some early releases. 12443 # Note the trailing semi-colon, unlike many of the input files. That 12444 # means that there will be an extra null field generated by the split 12445 # below, which we ignore and hence is not an error. 12446 12447 my ($range, $type, $map, @remainder) = split /\s*;\s*/, $_, -1; 12448 if (@remainder > 1 || (@remainder == 1 && $remainder[0] ne "" )) { 12449 $file->carp_bad_line('Extra fields'); 12450 $_ = ""; 12451 return; 12452 } 12453 12454 if ($type =~ / ^ [IT] $/x) { # Skip Turkic case folding, is locale dependent 12455 $_ = ""; 12456 return; 12457 } 12458 12459 # C: complete, F: full, or I: dotted uppercase I -> dotless lowercase 12460 # I are all full foldings; S is single-char. For S, there is always 12461 # an F entry, so we must allow multiple values for the same code 12462 # point. Fortunately this table doesn't need further manipulation 12463 # which would preclude using multiple-values. The S is now included 12464 # so that _swash_inversion_hash() is able to construct closures 12465 # without having to worry about F mappings. 12466 if ($type eq 'C' || $type eq 'F' || $type eq 'I' || $type eq 'S') { 12467 $_ = "$range; Case_Folding; " 12468 . "$CMD_DELIM$REPLACE_CMD=$MULTIPLE_BEFORE$CMD_DELIM$map"; 12469 } 12470 else { 12471 $_ = ""; 12472 $file->carp_bad_line('Expecting C F I S or T in second field'); 12473 } 12474 12475 # C and S are simple foldings, but simple case folding is not needed 12476 # unless we explicitly want its map table output. 12477 if ($to_output_simple && $type eq 'C' || $type eq 'S') { 12478 $file->insert_adjusted_lines("$range; Simple_Case_Folding; $map"); 12479 } 12480 12481 return; 12482 } 12483 12484} # End case fold closure 12485 12486sub filter_jamo_line { 12487 # Filter Jamo.txt lines. This routine mainly is used to populate hashes 12488 # from this file that is used in generating the Name property for Jamo 12489 # code points. But, it also is used to convert early versions' syntax 12490 # into the modern form. Here are two examples: 12491 # 1100; G # HANGUL CHOSEONG KIYEOK # Modern syntax 12492 # U+1100; G; HANGUL CHOSEONG KIYEOK # 2.0 syntax 12493 # 12494 # The input is $_, the output is $_ filtered. 12495 12496 my @fields = split /\s*;\s*/, $_, -1; # -1 => retain trailing null fields 12497 12498 # Let the caller handle unexpected input. In earlier versions, there was 12499 # a third field which is supposed to be a comment, but did not have a '#' 12500 # before it. 12501 return if @fields > (($v_version gt v3.0.0) ? 2 : 3); 12502 12503 $fields[0] =~ s/^U\+//; # Also, early versions had this extraneous 12504 # beginning. 12505 12506 # Some 2.1 versions had this wrong. Causes havoc with the algorithm. 12507 $fields[1] = 'R' if $fields[0] eq '1105'; 12508 12509 # Add to structure so can generate Names from it. 12510 my $cp = hex $fields[0]; 12511 my $short_name = $fields[1]; 12512 $Jamo{$cp} = $short_name; 12513 if ($cp <= $LBase + $LCount) { 12514 $Jamo_L{$short_name} = $cp - $LBase; 12515 } 12516 elsif ($cp <= $VBase + $VCount) { 12517 $Jamo_V{$short_name} = $cp - $VBase; 12518 } 12519 elsif ($cp <= $TBase + $TCount) { 12520 $Jamo_T{$short_name} = $cp - $TBase; 12521 } 12522 else { 12523 Carp::my_carp_bug("Unexpected Jamo code point in $_"); 12524 } 12525 12526 12527 # Reassemble using just the first two fields to look like a typical 12528 # property file line 12529 $_ = "$fields[0]; $fields[1]"; 12530 12531 return; 12532} 12533 12534sub register_fraction($rational) { 12535 # This registers the input rational number so that it can be passed on to 12536 # Unicode::UCD, both in rational and floating forms. 12537 12538 my $floating = eval $rational; 12539 12540 my @floats = sprintf "%.*e", $E_FLOAT_PRECISION, $floating; 12541 12542 # See if the denominator is a power of 2. 12543 $rational =~ m!.*/(.*)!; 12544 my $denominator = $1; 12545 if (defined $denominator && (($denominator & ($denominator - 1)) == 0)) { 12546 12547 # Here the denominator is a power of 2. This means it has an exact 12548 # representation in binary, so rounding could go either way. It turns 12549 # out that Windows doesn't necessarily round towards even, so output 12550 # an extra entry. This happens when the final digit we output is even 12551 # and the next digits would be 50* to the precision of the machine. 12552 my $extra_digit_float = sprintf "%e", $floating; 12553 my $q = $E_FLOAT_PRECISION - 1; 12554 if ($extra_digit_float =~ / ( .* \. \d{$q} ) 12555 ( [02468] ) 5 0* ( e .*) 12556 /ix) 12557 { 12558 push @floats, $1 . ($2 + 1) . $3; 12559 } 12560 } 12561 12562 foreach my $float (@floats) { 12563 # Strip off any leading zeros beyond 2 digits to make it C99 12564 # compliant. (Windows has 3 digit exponents, contrary to C99) 12565 $float =~ s/ ( .* e [-+] ) 0* ( \d{2,}? ) /$1$2/x; 12566 12567 if ( defined $nv_floating_to_rational{$float} 12568 && $nv_floating_to_rational{$float} ne $rational) 12569 { 12570 die Carp::my_carp_bug("Both '$rational' and" 12571 . " '$nv_floating_to_rational{$float}' evaluate to" 12572 . " the same floating point number." 12573 . " \$E_FLOAT_PRECISION must be increased"); 12574 } 12575 $nv_floating_to_rational{$float} = $rational; 12576 } 12577 return; 12578} 12579 12580sub gcd($a, $b) { # Greatest-common-divisor; from 12581 # http://en.wikipedia.org/wiki/Euclidean_algorithm 12582 use integer; 12583 12584 while ($b != 0) { 12585 my $temp = $b; 12586 $b = $a % $b; 12587 $a = $temp; 12588 } 12589 return $a; 12590} 12591 12592sub reduce_fraction($fraction_ref) { 12593 # Reduce a fraction to lowest terms. The Unicode data may be reducible, 12594 # hence this is needed. The argument is a reference to the 12595 # string denoting the fraction, which must be of the form: 12596 if ($$fraction_ref !~ / ^ (-?) (\d+) \/ (\d+) $ /ax) { 12597 Carp::my_carp_bug("Non-fraction input '$$fraction_ref'. Unchanged"); 12598 return; 12599 } 12600 12601 my $sign = $1; 12602 my $numerator = $2; 12603 my $denominator = $3; 12604 12605 use integer; 12606 12607 # Find greatest common divisor 12608 my $gcd = gcd($numerator, $denominator); 12609 12610 # And reduce using the gcd. 12611 if ($gcd != 1) { 12612 $numerator /= $gcd; 12613 $denominator /= $gcd; 12614 $$fraction_ref = "$sign$numerator/$denominator"; 12615 } 12616 12617 return; 12618} 12619 12620sub filter_numeric_value_line($file) { 12621 # DNumValues contains lines of a different syntax than the typical 12622 # property file: 12623 # 0F33 ; -0.5 ; ; -1/2 # No TIBETAN DIGIT HALF ZERO 12624 # 12625 # This routine transforms $_ containing the anomalous syntax to the 12626 # typical, by filtering out the extra columns, and convert early version 12627 # decimal numbers to strings that look like rational numbers. 12628 12629 # Starting in 5.1, there is a rational field. Just use that, omitting the 12630 # extra columns. Otherwise convert the decimal number in the second field 12631 # to a rational, and omit extraneous columns. 12632 my @fields = split /\s*;\s*/, $_, -1; 12633 my $rational; 12634 12635 if ($v_version ge v5.1.0) { 12636 if (@fields != 4) { 12637 $file->carp_bad_line('Not 4 semi-colon separated fields'); 12638 $_ = ""; 12639 return; 12640 } 12641 reduce_fraction(\$fields[3]) if $fields[3] =~ qr{/}; 12642 $rational = $fields[3]; 12643 12644 $_ = join '; ', @fields[ 0, 3 ]; 12645 } 12646 else { 12647 12648 # Here, is an older Unicode file, which has decimal numbers instead of 12649 # rationals in it. Use the fraction to calculate the denominator and 12650 # convert to rational. 12651 12652 if (@fields != 2 && @fields != 3) { 12653 $file->carp_bad_line('Not 2 or 3 semi-colon separated fields'); 12654 $_ = ""; 12655 return; 12656 } 12657 12658 my $codepoints = $fields[0]; 12659 my $decimal = $fields[1]; 12660 if ($decimal =~ s/\.0+$//) { 12661 12662 # Anything ending with a decimal followed by nothing but 0's is an 12663 # integer 12664 $_ = "$codepoints; $decimal"; 12665 $rational = $decimal; 12666 } 12667 else { 12668 12669 my $denominator; 12670 if ($decimal =~ /\.50*$/) { 12671 $denominator = 2; 12672 } 12673 12674 # Here have the hardcoded repeating decimals in the fraction, and 12675 # the denominator they imply. There were only a few denominators 12676 # in the older Unicode versions of this file which this code 12677 # handles, so it is easy to convert them. 12678 12679 # The 4 is because of a round-off error in the Unicode 3.2 files 12680 elsif ($decimal =~ /\.33*[34]$/ || $decimal =~ /\.6+7$/) { 12681 $denominator = 3; 12682 } 12683 elsif ($decimal =~ /\.[27]50*$/) { 12684 $denominator = 4; 12685 } 12686 elsif ($decimal =~ /\.[2468]0*$/) { 12687 $denominator = 5; 12688 } 12689 elsif ($decimal =~ /\.16+7$/ || $decimal =~ /\.83+$/) { 12690 $denominator = 6; 12691 } 12692 elsif ($decimal =~ /\.(12|37|62|87)50*$/) { 12693 $denominator = 8; 12694 } 12695 if ($denominator) { 12696 my $sign = ($decimal < 0) ? "-" : ""; 12697 my $numerator = int((abs($decimal) * $denominator) + .5); 12698 $rational = "$sign$numerator/$denominator"; 12699 $_ = "$codepoints; $rational"; 12700 } 12701 else { 12702 $file->carp_bad_line("Can't cope with number '$decimal'."); 12703 $_ = ""; 12704 return; 12705 } 12706 } 12707 } 12708 12709 register_fraction($rational) if $rational =~ qr{/}; 12710 return; 12711} 12712 12713{ # Closure 12714 my %unihan_properties; 12715 12716 sub construct_unihan($file_object) { 12717 12718 return unless file_exists($file_object->file); 12719 12720 if ($v_version lt v4.0.0) { 12721 push @cjk_properties, 'URS ; Unicode_Radical_Stroke'; 12722 push @cjk_property_values, split "\n", <<'END'; 12723# @missing: 0000..10FFFF; Unicode_Radical_Stroke; <none> 12724END 12725 } 12726 12727 if ($v_version ge v3.0.0) { 12728 push @cjk_properties, split "\n", <<'END'; 12729cjkIRG_GSource; kIRG_GSource 12730cjkIRG_JSource; kIRG_JSource 12731cjkIRG_KSource; kIRG_KSource 12732cjkIRG_TSource; kIRG_TSource 12733cjkIRG_VSource; kIRG_VSource 12734END 12735 push @cjk_property_values, split "\n", <<'END'; 12736# @missing: 0000..10FFFF; cjkIRG_GSource; <none> 12737# @missing: 0000..10FFFF; cjkIRG_JSource; <none> 12738# @missing: 0000..10FFFF; cjkIRG_KSource; <none> 12739# @missing: 0000..10FFFF; cjkIRG_TSource; <none> 12740# @missing: 0000..10FFFF; cjkIRG_VSource; <none> 12741END 12742 } 12743 if ($v_version ge v3.1.0) { 12744 push @cjk_properties, 'cjkIRG_HSource; kIRG_HSource'; 12745 push @cjk_property_values, '# @missing: 0000..10FFFF; cjkIRG_HSource; <none>'; 12746 } 12747 if ($v_version ge v3.1.1) { 12748 push @cjk_properties, 'cjkIRG_KPSource; kIRG_KPSource'; 12749 push @cjk_property_values, '# @missing: 0000..10FFFF; cjkIRG_KPSource; <none>'; 12750 } 12751 if ($v_version ge v3.2.0) { 12752 push @cjk_properties, split "\n", <<'END'; 12753cjkAccountingNumeric; kAccountingNumeric 12754cjkCompatibilityVariant; kCompatibilityVariant 12755cjkOtherNumeric; kOtherNumeric 12756cjkPrimaryNumeric; kPrimaryNumeric 12757END 12758 push @cjk_property_values, split "\n", <<'END'; 12759# @missing: 0000..10FFFF; cjkAccountingNumeric; NaN 12760# @missing: 0000..10FFFF; cjkCompatibilityVariant; <code point> 12761# @missing: 0000..10FFFF; cjkOtherNumeric; NaN 12762# @missing: 0000..10FFFF; cjkPrimaryNumeric; NaN 12763END 12764 } 12765 if ($v_version gt v4.0.0) { 12766 push @cjk_properties, 'cjkIRG_USource; kIRG_USource'; 12767 push @cjk_property_values, '# @missing: 0000..10FFFF; cjkIRG_USource; <none>'; 12768 } 12769 12770 if ($v_version ge v4.1.0) { 12771 push @cjk_properties, 'cjkIICore ; kIICore'; 12772 push @cjk_property_values, '# @missing: 0000..10FFFF; cjkIICore; <none>'; 12773 } 12774 } 12775 12776 sub setup_unihan { 12777 # Do any special setup for Unihan properties. 12778 12779 # This property gives the wrong computed type, so override. 12780 my $usource = property_ref('kIRG_USource'); 12781 $usource->set_type($STRING) if defined $usource; 12782 12783 # This property is to be considered binary (it says so in 12784 # http://www.unicode.org/reports/tr38/) 12785 my $iicore = property_ref('kIICore'); 12786 if (defined $iicore) { 12787 $iicore->set_type($FORCED_BINARY); 12788 $iicore->table("Y")->add_note("Matches any code point which has a non-null value for this property; see unicode.org UAX #38."); 12789 12790 # Unicode doesn't include the maps for this property, so don't 12791 # warn that they are missing. 12792 $iicore->set_pre_declared_maps(0); 12793 $iicore->add_comment(join_lines( <<END 12794This property contains string values, but any non-empty ones are considered to 12795be 'core', so Perl creates tables for both: 1) its string values, plus 2) 12796tables so that \\p{kIICore} matches any code point which has a non-empty 12797value for this property. 12798END 12799 )); 12800 } 12801 12802 return; 12803 } 12804 12805 sub filter_unihan_line { 12806 # Change unihan db lines to look like the others in the db. Here is 12807 # an input sample: 12808 # U+341C kCangjie IEKN 12809 12810 # Tabs are used instead of semi-colons to separate fields; therefore 12811 # they may have semi-colons embedded in them. Change these to periods 12812 # so won't screw up the rest of the code. 12813 s/;/./g; 12814 12815 # Remove lines that don't look like ones we accept. 12816 if ($_ !~ /^ [^\t]* \t ( [^\t]* ) /x) { 12817 $_ = ""; 12818 return; 12819 } 12820 12821 # Extract the property, and save a reference to its object. 12822 my $property = $1; 12823 if (! exists $unihan_properties{$property}) { 12824 $unihan_properties{$property} = property_ref($property); 12825 } 12826 12827 # Don't do anything unless the property is one we're handling, which 12828 # we determine by seeing if there is an object defined for it or not 12829 if (! defined $unihan_properties{$property}) { 12830 $_ = ""; 12831 return; 12832 } 12833 12834 # Convert the tab separators to our standard semi-colons, and convert 12835 # the U+HHHH notation to the rest of the standard's HHHH 12836 s/\t/;/g; 12837 s/\b U \+ (?= $code_point_re )//xg; 12838 12839 #local $to_trace = 1 if main::DEBUG; 12840 trace $_ if main::DEBUG && $to_trace; 12841 12842 return; 12843 } 12844} 12845 12846sub filter_blocks_lines($file) { 12847 # In the Blocks.txt file, the names of the blocks don't quite match the 12848 # names given in PropertyValueAliases.txt, so this changes them so they 12849 # do match: Blanks and hyphens are changed into underscores. Also makes 12850 # early release versions look like later ones 12851 # 12852 # $_ is transformed to the correct value. 12853 12854 if ($v_version lt v3.2.0) { 12855 if (/FEFF.*Specials/) { # Bug in old versions: line wrongly inserted 12856 $_ = ""; 12857 return; 12858 } 12859 12860 # Old versions used a different syntax to mark the range. 12861 $_ =~ s/;\s+/../ if $v_version lt v3.1.0; 12862 } 12863 12864 my @fields = split /\s*;\s*/, $_, -1; 12865 if (@fields != 2) { 12866 $file->carp_bad_line("Expecting exactly two fields"); 12867 $_ = ""; 12868 return; 12869 } 12870 12871 # Change hyphens and blanks in the block name field only 12872 $fields[1] =~ s/[ -]/_/g; 12873 $fields[1] =~ s/_ ( [a-z] ) /_\u$1/xg; # Capitalize first letter of word 12874 12875 $_ = join("; ", @fields); 12876 return; 12877} 12878 12879{ # Closure 12880 my $current_property; 12881 12882 sub filter_old_style_proplist { 12883 # PropList.txt has been in Unicode since version 2.0. Until 3.1, it 12884 # was in a completely different syntax. Ken Whistler of Unicode says 12885 # that it was something he used as an aid for his own purposes, but 12886 # was never an official part of the standard. Many of the properties 12887 # in it were incorporated into the later PropList.txt, but some were 12888 # not. This program uses this early file to generate property tables 12889 # that are otherwise not accessible in the early UCD's. It does this 12890 # for the ones that eventually became official, and don't appear to be 12891 # too different in their contents from the later official version, and 12892 # throws away the rest. It could be argued that the ones it generates 12893 # were probably not really official at that time, so should be 12894 # ignored. You can easily modify things to skip all of them by 12895 # changing this function to just set $_ to "", and return; and to skip 12896 # certain of them by simply removing their declarations from 12897 # get_old_property_aliases(). 12898 # 12899 # Here is a list of all the ones that are thrown away: 12900 # Alphabetic The definitions for this are very 12901 # defective, so better to not mislead 12902 # people into thinking it works. 12903 # Instead the Perl extension of the 12904 # same name is constructed from first 12905 # principles. 12906 # Bidi=* duplicates UnicodeData.txt 12907 # Combining never made into official property; 12908 # is \P{ccc=0} 12909 # Composite never made into official property. 12910 # Currency Symbol duplicates UnicodeData.txt: gc=sc 12911 # Decimal Digit duplicates UnicodeData.txt: gc=nd 12912 # Delimiter never made into official property; 12913 # removed in 3.0.1 12914 # Format Control never made into official property; 12915 # similar to gc=cf 12916 # High Surrogate duplicates Blocks.txt 12917 # Ignorable Control never made into official property; 12918 # similar to di=y 12919 # ISO Control duplicates UnicodeData.txt: gc=cc 12920 # Left of Pair never made into official property; 12921 # Line Separator duplicates UnicodeData.txt: gc=zl 12922 # Low Surrogate duplicates Blocks.txt 12923 # Non-break was actually listed as a property 12924 # in 3.2, but without any code 12925 # points. Unicode denies that this 12926 # was ever an official property 12927 # Non-spacing duplicate UnicodeData.txt: gc=mn 12928 # Numeric duplicates UnicodeData.txt: gc=cc 12929 # Paired Punctuation never made into official property; 12930 # appears to be gc=ps + gc=pe 12931 # Paragraph Separator duplicates UnicodeData.txt: gc=cc 12932 # Private Use duplicates UnicodeData.txt: gc=co 12933 # Private Use High Surrogate duplicates Blocks.txt 12934 # Punctuation duplicates UnicodeData.txt: gc=p 12935 # Space different definition than eventual 12936 # one. 12937 # Titlecase duplicates UnicodeData.txt: gc=lt 12938 # Unassigned Code Value duplicates UnicodeData.txt: gc=cn 12939 # Zero-width never made into official property; 12940 # subset of gc=cf 12941 # Most of the properties have the same names in this file as in later 12942 # versions, but a couple do not. 12943 # 12944 # This subroutine filters $_, converting it from the old style into 12945 # the new style. Here's a sample of the old-style 12946 # 12947 # ******************************************* 12948 # 12949 # Property dump for: 0x100000A0 (Join Control) 12950 # 12951 # 200C..200D (2 chars) 12952 # 12953 # In the example, the property is "Join Control". It is kept in this 12954 # closure between calls to the subroutine. The numbers beginning with 12955 # 0x were internal to Ken's program that generated this file. 12956 12957 # If this line contains the property name, extract it. 12958 if (/^Property dump for: [^(]*\((.*)\)/) { 12959 $_ = $1; 12960 12961 # Convert white space to underscores. 12962 s/ /_/g; 12963 12964 # Convert the few properties that don't have the same name as 12965 # their modern counterparts 12966 s/Identifier_Part/ID_Continue/ 12967 or s/Not_a_Character/NChar/; 12968 12969 # If the name matches an existing property, use it. 12970 if (defined property_ref($_)) { 12971 trace "new property=", $_ if main::DEBUG && $to_trace; 12972 $current_property = $_; 12973 } 12974 else { # Otherwise discard it 12975 trace "rejected property=", $_ if main::DEBUG && $to_trace; 12976 undef $current_property; 12977 } 12978 $_ = ""; # The property is saved for the next lines of the 12979 # file, but this defining line is of no further use, 12980 # so clear it so that the caller won't process it 12981 # further. 12982 } 12983 elsif (! defined $current_property || $_ !~ /^$code_point_re/) { 12984 12985 # Here, the input line isn't a header defining a property for the 12986 # following section, and either we aren't in such a section, or 12987 # the line doesn't look like one that defines the code points in 12988 # such a section. Ignore this line. 12989 $_ = ""; 12990 } 12991 else { 12992 12993 # Here, we have a line defining the code points for the current 12994 # stashed property. Anything starting with the first blank is 12995 # extraneous. Otherwise, it should look like a normal range to 12996 # the caller. Append the property name so that it looks just like 12997 # a modern PropList entry. 12998 12999 $_ =~ s/\s.*//; 13000 $_ .= "; $current_property"; 13001 } 13002 trace $_ if main::DEBUG && $to_trace; 13003 return; 13004 } 13005} # End closure for old style proplist 13006 13007sub filter_old_style_normalization_lines { 13008 # For early releases of Unicode, the lines were like: 13009 # 74..2A76 ; NFKD_NO 13010 # For later releases this became: 13011 # 74..2A76 ; NFKD_QC; N 13012 # Filter $_ to look like those in later releases. 13013 # Similarly for MAYBEs 13014 13015 s/ _NO \b /_QC; N/x || s/ _MAYBE \b /_QC; M/x; 13016 13017 # Also, the property FC_NFKC was abbreviated to FNC 13018 s/FNC/FC_NFKC/; 13019 return; 13020} 13021 13022sub setup_script_extensions { 13023 # The Script_Extensions property starts out with a clone of the Script 13024 # property. 13025 13026 $scx = property_ref("Script_Extensions"); 13027 return unless defined $scx; 13028 13029 $scx->_set_format($STRING_WHITE_SPACE_LIST); 13030 $scx->initialize($script); 13031 $scx->set_default_map($script->default_map); 13032 $scx->set_pre_declared_maps(0); # PropValueAliases doesn't list these 13033 $scx->add_comment(join_lines( <<END 13034The values for code points that appear in one script are just the same as for 13035the 'Script' property. Likewise the values for those that appear in many 13036scripts are either 'Common' or 'Inherited', same as with 'Script'. But the 13037values of code points that appear in a few scripts are a space separated list 13038of those scripts. 13039END 13040 )); 13041 13042 # Initialize scx's tables and the aliases for them to be the same as sc's 13043 foreach my $table ($script->tables) { 13044 my $scx_table = $scx->add_match_table($table->name, 13045 Full_Name => $table->full_name); 13046 foreach my $alias ($table->aliases) { 13047 $scx_table->add_alias($alias->name); 13048 } 13049 } 13050} 13051 13052sub filter_script_extensions_line { 13053 # The Scripts file comes with the full name for the scripts; the 13054 # ScriptExtensions, with the short name. The final mapping file is a 13055 # combination of these, and without adjustment, would have inconsistent 13056 # entries. This filters the latter file to convert to full names. 13057 # Entries look like this: 13058 # 064B..0655 ; Arab Syrc # Mn [11] ARABIC FATHATAN..ARABIC HAMZA BELOW 13059 13060 my @fields = split /\s*;\s*/; 13061 13062 # This script was erroneously omitted in this Unicode version. 13063 $fields[1] .= ' Takr' if $v_version eq v6.1.0 && $fields[0] =~ /^0964/; 13064 13065 my @full_names; 13066 foreach my $short_name (split " ", $fields[1]) { 13067 push @full_names, $script->table($short_name)->full_name; 13068 } 13069 $fields[1] = join " ", @full_names; 13070 $_ = join "; ", @fields; 13071 13072 return; 13073} 13074 13075sub setup_emojidata { 13076 my $prop_ref = Property->new('ExtPict', 13077 Full_Name => 'Extended_Pictographic', 13078 ); 13079 $prop_ref->set_fate($PLACEHOLDER, 13080 "Not part of the Unicode Character Database"); 13081} 13082 13083sub filter_emojidata_line { 13084 # We only are interested in this single property from this non-UCD data 13085 # file, and we turn it into a Perl property, so that it isn't accessible 13086 # to the users 13087 13088 $_ = "" unless /\bExtended_Pictographic\b/; 13089 13090 return; 13091} 13092 13093sub setup_IdStatus { 13094 my $ids = Property->new('Identifier_Status', 13095 Match_SubDir => 'IdStatus', 13096 Default_Map => 'Restricted', 13097 ); 13098 $ids->add_match_table('Allowed'); 13099} 13100 13101sub setup_IdType { 13102 $idt = Property->new('Identifier_Type', 13103 Match_SubDir => 'IdType', 13104 Default_Map => 'Not_Character', 13105 Format => $STRING_WHITE_SPACE_LIST, 13106 ); 13107} 13108 13109sub filter_IdType_line { 13110 13111 # Some code points have more than one type, separated by spaces on the 13112 # input. For now, we just add everything as a property value. Later when 13113 # we look for properties with format $STRING_WHITE_SPACE_LIST, we resolve 13114 # things 13115 13116 my @fields = split /\s*;\s*/; 13117 my $types = $fields[1]; 13118 $idt->add_match_table($types) unless defined $idt->table($types); 13119 13120 return; 13121} 13122 13123sub generate_hst($file) { 13124 13125 # Populates the Hangul Syllable Type property from first principles 13126 13127 # These few ranges are hard-coded in. 13128 $file->insert_lines(split /\n/, <<'END' 131291100..1159 ; L 13130115F ; L 131311160..11A2 ; V 1313211A8..11F9 ; T 13133END 13134); 13135 13136 # The Hangul syllables in version 1 are at different code points than 13137 # those that came along starting in version 2, and have different names; 13138 # they comprise about 60% of the code points of the later version. 13139 # From my (khw) research on them (see <558493EB.4000807@att.net>), the 13140 # initial set is a subset of the later version, with different English 13141 # transliterations. I did not see an easy mapping between them. The 13142 # later set includes essentially all possibilities, even ones that aren't 13143 # in modern use (if they ever were), and over 96% of the new ones are type 13144 # LVT. Mathematically, the early set must also contain a preponderance of 13145 # LVT values. In lieu of doing nothing, we just set them all to LVT, and 13146 # expect that this will be right most of the time, which is better than 13147 # not being right at all. 13148 if ($v_version lt v2.0.0) { 13149 my $property = property_ref($file->property); 13150 $file->insert_lines(sprintf("%04X..%04X; LVT\n", 13151 $FIRST_REMOVED_HANGUL_SYLLABLE, 13152 $FINAL_REMOVED_HANGUL_SYLLABLE)); 13153 push @tables_that_may_be_empty, $property->table('LV')->complete_name; 13154 return; 13155 } 13156 13157 # The algorithmically derived syllables are almost all LVT ones, so 13158 # initialize the whole range with that. 13159 $file->insert_lines(sprintf "%04X..%04X; LVT\n", 13160 $SBase, $SBase + $SCount -1); 13161 13162 # Those ones that aren't LVT are LV, and they occur at intervals of 13163 # $TCount code points, starting with the first code point, at $SBase. 13164 for (my $i = $SBase; $i < $SBase + $SCount; $i += $TCount) { 13165 $file->insert_lines(sprintf "%04X..%04X; LV\n", $i, $i); 13166 } 13167 13168 return; 13169} 13170 13171sub generate_GCB($file) { 13172 13173 # Populates the Grapheme Cluster Break property from first principles 13174 13175 # All these definitions are from 13176 # http://www.unicode.org/reports/tr29/tr29-3.html with confirmation 13177 # from http://www.unicode.org/reports/tr29/tr29-4.html 13178 13179 foreach my $range ($gc->ranges) { 13180 13181 # Extend includes gc=Me and gc=Mn, while Control includes gc=Cc 13182 # and gc=Cf 13183 if ($range->value =~ / ^ M [en] $ /x) { 13184 $file->insert_lines(sprintf "%04X..%04X; Extend", 13185 $range->start, $range->end); 13186 } 13187 elsif ($range->value =~ / ^ C [cf] $ /x) { 13188 $file->insert_lines(sprintf "%04X..%04X; Control", 13189 $range->start, $range->end); 13190 } 13191 } 13192 $file->insert_lines("2028; Control"); # Line Separator 13193 $file->insert_lines("2029; Control"); # Paragraph Separator 13194 13195 $file->insert_lines("000D; CR"); 13196 $file->insert_lines("000A; LF"); 13197 13198 # Also from http://www.unicode.org/reports/tr29/tr29-3.html. 13199 foreach my $code_point ( qw{ 13200 09BE 09D7 0B3E 0B57 0BBE 0BD7 0CC2 0CD5 0CD6 13201 0D3E 0D57 0DCF 0DDF FF9E FF9F 1D165 1D16E 1D16F 13202 } 13203 ) { 13204 my $category = $gc->value_of(hex $code_point); 13205 next if ! defined $category || $category eq 'Cn'; # But not if 13206 # unassigned in this 13207 # release 13208 $file->insert_lines("$code_point; Extend"); 13209 } 13210 13211 my $hst = property_ref('Hangul_Syllable_Type'); 13212 if ($hst->count > 0) { 13213 foreach my $range ($hst->ranges) { 13214 $file->insert_lines(sprintf "%04X..%04X; %s", 13215 $range->start, $range->end, $range->value); 13216 } 13217 } 13218 else { 13219 generate_hst($file); 13220 } 13221 13222 main::process_generic_property_file($file); 13223} 13224 13225 13226sub fixup_early_perl_name_alias($file) { 13227 13228 # Different versions of Unicode have varying support for the name synonyms 13229 # below. Just include everything. As of 6.1, all these are correct in 13230 # the Unicode-supplied file. 13231 13232 # ALERT did not come along until 6.0, at which point it became preferred 13233 # over BELL. By inserting it last in early releases, BELL is preferred 13234 # over it; and vice-vers in 6.0 13235 my $type_for_bell = ($v_version lt v6.0.0) 13236 ? 'correction' 13237 : 'alternate'; 13238 $file->insert_lines(split /\n/, <<END 132390007;BELL; $type_for_bell 13240000A;LINE FEED (LF);alternate 13241000C;FORM FEED (FF);alternate 13242000D;CARRIAGE RETURN (CR);alternate 132430085;NEXT LINE (NEL);alternate 13244END 13245 13246 ); 13247 13248 # One might think that the 'Unicode_1_Name' field, could work for most 13249 # of the above names, but sadly that field varies depending on the 13250 # release. Version 1.1.5 had no names for any of the controls; Version 13251 # 2.0 introduced names for the C0 controls, and 3.0 introduced C1 names. 13252 # 3.0.1 removed the name INDEX; and 3.2 changed some names: 13253 # changed to parenthesized versions like "NEXT LINE" to 13254 # "NEXT LINE (NEL)"; 13255 # changed PARTIAL LINE DOWN to PARTIAL LINE FORWARD 13256 # changed PARTIAL LINE UP to PARTIAL LINE BACKWARD;; 13257 # changed e.g. FILE SEPARATOR to INFORMATION SEPARATOR FOUR 13258 # 13259 # All these are present in the 6.1 NameAliases.txt 13260 13261 return; 13262} 13263 13264sub filter_later_version_name_alias_line { 13265 13266 # This file has an extra entry per line for the alias type. This is 13267 # handled by creating a compound entry: "$alias: $type"; First, split 13268 # the line into components. 13269 my ($range, $alias, $type, @remainder) 13270 = split /\s*;\s*/, $_, -1; # -1 => retain trailing null fields 13271 13272 # This file contains multiple entries for some components, so tell the 13273 # downstream code to allow this in our internal tables; the 13274 # $MULTIPLE_AFTER preserves the input ordering. 13275 $_ = join ";", $range, $CMD_DELIM 13276 . $REPLACE_CMD 13277 . '=' 13278 . $MULTIPLE_AFTER 13279 . $CMD_DELIM 13280 . "$alias: $type", 13281 @remainder; 13282 return; 13283} 13284 13285sub filter_early_version_name_alias_line { 13286 13287 # Early versions did not have the trailing alias type field; implicitly it 13288 # was 'correction'. 13289 $_ .= "; correction"; 13290 13291 filter_later_version_name_alias_line; 13292 return; 13293} 13294 13295sub filter_all_caps_script_names { 13296 13297 # Some early Unicode releases had the script names in all CAPS. This 13298 # converts them to just the first letter of each word being capital. 13299 13300 my ($range, $script, @remainder) 13301 = split /\s*;\s*/, $_, -1; # -1 => retain trailing null fields 13302 my @words = split /[_-]/, $script; 13303 for my $word (@words) { 13304 $word = 13305 ucfirst(lc($word)) if $word ne 'CJK'; 13306 } 13307 $script = join "_", @words; 13308 $_ = join ";", $range, $script, @remainder; 13309} 13310 13311sub finish_Unicode() { 13312 # This routine should be called after all the Unicode files have been read 13313 # in. It: 13314 # 1) Creates properties that are missing from the version of Unicode being 13315 # compiled, and which, for whatever reason, are needed for the Perl 13316 # core to function properly. These are minimally populated as 13317 # necessary. 13318 # 2) Adds the mappings for code points missing from the files which have 13319 # defaults specified for them. 13320 # 3) At this point all mappings are known, so it computes the type of 13321 # each property whose type hasn't been determined yet. 13322 # 4) Calculates all the regular expression match tables based on the 13323 # mappings. 13324 # 5) Calculates and adds the tables which are defined by Unicode, but 13325 # which aren't derived by them, and certain derived tables that Perl 13326 # uses. 13327 13328 # Folding information was introduced later into Unicode data. To get 13329 # Perl's case ignore (/i) to work at all in releases that don't have 13330 # folding, use the best available alternative, which is lower casing. 13331 my $fold = property_ref('Case_Folding'); 13332 if ($fold->is_empty) { 13333 $fold->initialize(property_ref('Lowercase_Mapping')); 13334 $fold->add_note(join_lines(<<END 13335WARNING: This table uses lower case as a substitute for missing fold 13336information 13337END 13338 )); 13339 } 13340 13341 # Multiple-character mapping was introduced later into Unicode data, so it 13342 # is by default the simple version. If to output the simple versions and 13343 # not present, just use the regular (which in these Unicode versions is 13344 # the simple as well). 13345 foreach my $map (qw { Uppercase_Mapping 13346 Lowercase_Mapping 13347 Titlecase_Mapping 13348 Case_Folding 13349 } ) 13350 { 13351 my $comment = <<END; 13352 13353Note that although the Perl core uses this file, it has the standard values 13354for code points from U+0000 to U+00FF compiled in, so changing this table will 13355not change the core's behavior with respect to these code points. Use 13356Unicode::Casing to override this table. 13357END 13358 if ($map eq 'Case_Folding') { 13359 $comment .= <<END; 13360(/i regex matching is not overridable except by using a custom regex engine) 13361END 13362 } 13363 property_ref($map)->add_comment(join_lines($comment)); 13364 my $simple = property_ref("Simple_$map"); 13365 next if ! $simple->is_empty; 13366 if ($simple->to_output_map) { 13367 $simple->initialize(property_ref($map)); 13368 } 13369 else { 13370 property_ref($map)->set_proxy_for($simple->name); 13371 } 13372 } 13373 13374 # For each property, fill in any missing mappings, and calculate the re 13375 # match tables. If a property has more than one missing mapping, the 13376 # default is a reference to a data structure, and may require data from 13377 # other properties to resolve. The sort is used to cause these to be 13378 # processed last, after all the other properties have been calculated. 13379 # (Fortunately, the missing properties so far don't depend on each other.) 13380 foreach my $property 13381 (sort { (defined $a->default_map && ref $a->default_map) ? 1 : -1 } 13382 property_ref('*')) 13383 { 13384 # $perl has been defined, but isn't one of the Unicode properties that 13385 # need to be finished up. 13386 next if $property == $perl; 13387 13388 # Nor do we need to do anything with properties that aren't going to 13389 # be output. 13390 next if $property->fate == $SUPPRESSED; 13391 13392 # Handle the properties that have more than one possible default 13393 if (ref $property->default_map) { 13394 my $default_map = $property->default_map; 13395 13396 # These properties have stored in the default_map: 13397 # One or more of: 13398 # 1) A default map which applies to all code points in a 13399 # certain class 13400 # 2) an expression which will evaluate to the list of code 13401 # points in that class 13402 # And 13403 # 3) the default map which applies to every other missing code 13404 # point. 13405 # 13406 # Go through each list. 13407 while (my ($default, $eval) = $default_map->get_next_defaults) { 13408 13409 # Get the class list, and intersect it with all the so-far 13410 # unspecified code points yielding all the code points 13411 # in the class that haven't been specified. 13412 my $list = eval $eval; 13413 if ($@) { 13414 Carp::my_carp("Can't set some defaults for missing code points for $property because eval '$eval' failed with '$@'"); 13415 last; 13416 } 13417 13418 # Narrow down the list to just those code points we don't have 13419 # maps for yet. 13420 $list = $list & $property->inverse_list; 13421 13422 # Add mappings to the property for each code point in the list 13423 foreach my $range ($list->ranges) { 13424 $property->add_map($range->start, $range->end, $default, 13425 Replace => $CROAK); 13426 } 13427 } 13428 13429 # All remaining code points have the other mapping. Set that up 13430 # so the normal single-default mapping code will work on them 13431 $property->set_default_map($default_map->other_default); 13432 13433 # And fall through to do that 13434 } 13435 13436 # We should have enough data now to compute the type of the property. 13437 my $property_name = $property->name; 13438 $property->compute_type; 13439 my $property_type = $property->type; 13440 13441 next if ! $property->to_create_match_tables; 13442 13443 # Here want to create match tables for this property 13444 13445 # The Unicode db always (so far, and they claim into the future) have 13446 # the default for missing entries in binary properties be 'N' (unless 13447 # there is a '@missing' line that specifies otherwise) 13448 if (! defined $property->default_map) { 13449 if ($property_type == $BINARY) { 13450 $property->set_default_map('N'); 13451 } 13452 elsif ($property_type == $ENUM) { 13453 Carp::my_carp("Property '$property_name doesn't have a default mapping. Using a fake one"); 13454 $property->set_default_map('XXX This makes sure there is a default map'); 13455 } 13456 } 13457 13458 # Add any remaining code points to the mapping, using the default for 13459 # missing code points. 13460 my $default_table; 13461 my $default_map = $property->default_map; 13462 if ($property_type == $FORCED_BINARY) { 13463 13464 # A forced binary property creates a 'Y' table that matches all 13465 # non-default values. The actual string values are also written out 13466 # as a map table. (The default value will almost certainly be the 13467 # empty string, so the pod glosses over the distinction, and just 13468 # talks about empty vs non-empty.) 13469 my $yes = $property->table("Y"); 13470 foreach my $range ($property->ranges) { 13471 next if $range->value eq $default_map; 13472 $yes->add_range($range->start, $range->end); 13473 } 13474 $property->table("N")->set_complement($yes); 13475 } 13476 else { 13477 if (defined $default_map) { 13478 13479 # Make sure there is a match table for the default 13480 if (! defined ($default_table = $property->table($default_map))) 13481 { 13482 $default_table = $property->add_match_table($default_map); 13483 } 13484 13485 # And, if the property is binary, the default table will just 13486 # be the complement of the other table. 13487 if ($property_type == $BINARY) { 13488 my $non_default_table; 13489 13490 # Find the non-default table. 13491 for my $table ($property->tables) { 13492 if ($table == $default_table) { 13493 if ($v_version le v5.0.0) { 13494 $table->add_alias($_) for qw(N No F False); 13495 } 13496 next; 13497 } elsif ($v_version le v5.0.0) { 13498 $table->add_alias($_) for qw(Y Yes T True); 13499 } 13500 $non_default_table = $table; 13501 } 13502 $default_table->set_complement($non_default_table); 13503 } 13504 else { 13505 13506 # This fills in any missing values with the default. It's 13507 # not necessary to do this with binary properties, as the 13508 # default is defined completely in terms of the Y table. 13509 $property->add_map(0, $MAX_WORKING_CODEPOINT, 13510 $default_map, Replace => $NO); 13511 } 13512 } 13513 13514 # Have all we need to populate the match tables. 13515 my $maps_should_be_defined = $property->pre_declared_maps; 13516 foreach my $range ($property->ranges) { 13517 my $map = $range->value; 13518 my $table = $property->table($map); 13519 if (! defined $table) { 13520 13521 # Integral and rational property values are not 13522 # necessarily defined in PropValueAliases, but whether all 13523 # the other ones should be depends on the property. 13524 if ($maps_should_be_defined 13525 && $map !~ /^ -? \d+ ( \/ \d+ )? $/x) 13526 { 13527 Carp::my_carp("Table '$property_name=$map' should " 13528 . "have been defined. Defining it now.") 13529 } 13530 $table = $property->add_match_table($map); 13531 } 13532 13533 next if $table->complement != 0; # Don't need to populate these 13534 $table->add_range($range->start, $range->end); 13535 } 13536 } 13537 13538 # For Perl 5.6 compatibility, all properties matchable in regexes can 13539 # have an optional 'Is_' prefix. This is now done in Unicode::UCD. 13540 # But warn if this creates a conflict with a (new) Unicode property 13541 # name, although it appears that Unicode has made a decision never to 13542 # begin a property name with 'Is_', so this shouldn't happen. 13543 foreach my $alias ($property->aliases) { 13544 my $Is_name = 'Is_' . $alias->name; 13545 if (defined (my $pre_existing = property_ref($Is_name))) { 13546 Carp::my_carp(<<END 13547There is already an alias named $Is_name (from " . $pre_existing . "), so 13548creating one for $property won't work. This is bad news. If it is not too 13549late, get Unicode to back off. Otherwise go back to the old scheme (findable 13550from the git blame log for this area of the code that suppressed individual 13551aliases that conflict with the new Unicode names. Proceeding anyway. 13552END 13553 ); 13554 } 13555 } # End of loop through aliases for this property 13556 13557 13558 # Properties that have sets of values for some characters are now 13559 # converted. For example, the Script_Extensions property started out 13560 # as a clone of the Script property. But processing its data file 13561 # caused some elements to be replaced with different data. (These 13562 # elements were for the Common and Inherited properties.) This data 13563 # is a qw() list of all the scripts that the code points in the given 13564 # range are in. An example line is: 13565 # 13566 # 060C ; Arab Syrc Thaa # Po ARABIC COMMA 13567 # 13568 # Code executed earlier has created a new match table named "Arab Syrc 13569 # Thaa" which contains 060C. (The cloned table started out with this 13570 # code point mapping to "Common".) Now we add 060C to each of the 13571 # Arab, Syrc, and Thaa match tables. Then we delete the now spurious 13572 # "Arab Syrc Thaa" match table. This is repeated for all these tables 13573 # and ranges. The map data is retained in the map table for 13574 # reference, but the spurious match tables are deleted. 13575 my $format = $property->format; 13576 if (defined $format && $format eq $STRING_WHITE_SPACE_LIST) { 13577 foreach my $table ($property->tables) { 13578 13579 # Space separates the entries which should go in multiple 13580 # tables 13581 next unless $table->name =~ /\s/; 13582 13583 # The list of the entries, hence the names of the tables that 13584 # everything in this combo table should be added to. 13585 my @list = split /\s+/, $table->name; 13586 13587 # Add the entries from the combo table to each individual 13588 # table 13589 foreach my $individual (@list) { 13590 my $existing_table = $property->table($individual); 13591 13592 # This should only be necessary if this particular entry 13593 # occurs only in combo with others. 13594 $existing_table = $property->add_match_table($individual) 13595 unless defined $existing_table; 13596 $existing_table += $table; 13597 } 13598 $property->delete_match_table($table); 13599 } 13600 } 13601 } # End of loop through all Unicode properties. 13602 13603 # Fill in the mappings that Unicode doesn't completely furnish. First the 13604 # single letter major general categories. If Unicode were to start 13605 # delivering the values, this would be redundant, but better that than to 13606 # try to figure out if should skip and not get it right. Ths could happen 13607 # if a new major category were to be introduced, and the hard-coded test 13608 # wouldn't know about it. 13609 # This routine depends on the standard names for the general categories 13610 # being what it thinks they are, like 'Cn'. The major categories are the 13611 # union of all the general category tables which have the same first 13612 # letters. eg. L = Lu + Lt + Ll + Lo + Lm 13613 foreach my $minor_table ($gc->tables) { 13614 my $minor_name = $minor_table->name; 13615 next if length $minor_name == 1; 13616 if (length $minor_name != 2) { 13617 Carp::my_carp_bug("Unexpected general category '$minor_name'. Skipped."); 13618 next; 13619 } 13620 13621 my $major_name = uc(substr($minor_name, 0, 1)); 13622 my $major_table = $gc->table($major_name); 13623 $major_table += $minor_table; 13624 } 13625 13626 # LC is Ll, Lu, and Lt. (used to be L& or L_, but PropValueAliases.txt 13627 # defines it as LC) 13628 my $LC = $gc->table('LC'); 13629 $LC->add_alias('L_', Status => $DISCOURAGED); # For backwards... 13630 $LC->add_alias('L&', Status => $DISCOURAGED); # compatibility. 13631 13632 13633 if ($LC->is_empty) { # Assume if not empty that Unicode has started to 13634 # deliver the correct values in it 13635 $LC->initialize($gc->table('Ll') + $gc->table('Lu')); 13636 13637 # Lt not in release 1. 13638 if (defined $gc->table('Lt')) { 13639 $LC += $gc->table('Lt'); 13640 $gc->table('Lt')->set_caseless_equivalent($LC); 13641 } 13642 } 13643 $LC->add_description('[\p{Ll}\p{Lu}\p{Lt}]'); 13644 13645 $gc->table('Ll')->set_caseless_equivalent($LC); 13646 $gc->table('Lu')->set_caseless_equivalent($LC); 13647 13648 # Make sure this assumption in perl core code is valid in this Unicode 13649 # release, with known exceptions 13650 foreach my $range (property_ref('Numeric-Type')->table('Decimal')->ranges) { 13651 next if $range->end - $range->start == 9; 13652 next if $range->start == 0x1D7CE; # This whole range was added in 3.1 13653 next if $range->end == 0x19DA && $v_version eq v5.2.0; 13654 next if $range->end - $range->start < 9 && $v_version le 4.0.0; 13655 Carp::my_carp("Range $range unexpectedly doesn't contain 10" 13656 . " decimal digits. Code in regcomp.c assumes it does," 13657 . " and will have to be fixed. Proceeding anyway."); 13658 } 13659 13660 # Mark the scx table as the parent of the corresponding sc table for those 13661 # which are identical. This causes the pod for the script table to refer 13662 # to the corresponding scx one. This is done after everything, so as to 13663 # wait until the tables are stabilized before checking for equivalency. 13664 if (defined $scx) { 13665 if (defined $pod_directory) { 13666 foreach my $table ($scx->tables) { 13667 my $plain_sc_equiv = $script->table($table->name); 13668 if ($table->matches_identically_to($plain_sc_equiv)) { 13669 $plain_sc_equiv->set_equivalent_to($table, Related => 1); 13670 } 13671 } 13672 } 13673 } 13674 13675 return; 13676} 13677 13678sub pre_3_dot_1_Nl () { 13679 13680 # Return a range list for gc=nl for Unicode versions prior to 3.1, which 13681 # is when Unicode's became fully usable. These code points were 13682 # determined by inspection and experimentation. gc=nl is important for 13683 # certain Perl-extension properties that should be available in all 13684 # releases. 13685 13686 my $Nl = Range_List->new(); 13687 if (defined (my $official = $gc->table('Nl'))) { 13688 $Nl += $official; 13689 } 13690 else { 13691 $Nl->add_range(0x2160, 0x2182); 13692 $Nl->add_range(0x3007, 0x3007); 13693 $Nl->add_range(0x3021, 0x3029); 13694 } 13695 $Nl->add_range(0xFE20, 0xFE23); 13696 $Nl->add_range(0x16EE, 0x16F0) if $v_version ge v3.0.0; # 3.0 was when 13697 # these were added 13698 return $Nl; 13699} 13700 13701sub calculate_Assigned() { # Set $Assigned to the gc != Cn code points; may be 13702 # called before the Cn's are completely filled. 13703 # Works on Unicodes earlier than ones that 13704 # explicitly specify Cn. 13705 return if defined $Assigned; 13706 13707 if (! defined $gc || $gc->is_empty()) { 13708 Carp::my_carp_bug("calculate_Assigned() called before $gc is populated"); 13709 } 13710 13711 $Assigned = $perl->add_match_table('Assigned', 13712 Description => "All assigned code points", 13713 ); 13714 while (defined (my $range = $gc->each_range())) { 13715 my $standard_value = standardize($range->value); 13716 next if $standard_value eq 'cn' || $standard_value eq 'unassigned'; 13717 $Assigned->add_range($range->start, $range->end); 13718 } 13719} 13720 13721sub calculate_DI() { # Set $DI to a Range_List equivalent to the 13722 # Default_Ignorable_Code_Point property. Works on 13723 # Unicodes earlier than ones that explicitly specify 13724 # DI. 13725 return if defined $DI; 13726 13727 if (defined (my $di = property_ref('Default_Ignorable_Code_Point'))) { 13728 $DI = $di->table('Y'); 13729 } 13730 else { 13731 $DI = Range_List->new(Initialize => [ 0x180B .. 0x180D, 13732 0x2060 .. 0x206F, 13733 0xFE00 .. 0xFE0F, 13734 0xFFF0 .. 0xFFFB, 13735 ]); 13736 if ($v_version ge v2.0) { 13737 $DI += $gc->table('Cf') 13738 + $gc->table('Cs'); 13739 13740 # These are above the Unicode version 1 max 13741 $DI->add_range(0xE0000, 0xE0FFF); 13742 } 13743 $DI += $gc->table('Cc') 13744 - ord("\t") 13745 - utf8::unicode_to_native(0x0A) # LINE FEED 13746 - utf8::unicode_to_native(0x0B) # VERTICAL TAB 13747 - ord("\f") 13748 - utf8::unicode_to_native(0x0D) # CARRIAGE RETURN 13749 - utf8::unicode_to_native(0x85); # NEL 13750 } 13751} 13752 13753sub calculate_NChar() { # Create a Perl extension match table which is the 13754 # same as the Noncharacter_Code_Point property, and 13755 # set $NChar to point to it. Works on Unicodes 13756 # earlier than ones that explicitly specify NChar 13757 return if defined $NChar; 13758 13759 $NChar = $perl->add_match_table('_Perl_Nchar', 13760 Perl_Extension => 1, 13761 Fate => $INTERNAL_ONLY); 13762 if (defined (my $off_nchar = property_ref('NChar'))) { 13763 $NChar->initialize($off_nchar->table('Y')); 13764 } 13765 else { 13766 $NChar->initialize([ 0xFFFE .. 0xFFFF ]); 13767 if ($v_version ge v2.0) { # First release with these nchars 13768 for (my $i = 0x1FFFE; $i <= 0x10FFFE; $i += 0x10000) { 13769 $NChar += [ $i .. $i+1 ]; 13770 } 13771 } 13772 } 13773} 13774 13775sub handle_compare_versions () { 13776 # This fixes things up for the $compare_versions capability, where we 13777 # compare Unicode version X with version Y (with Y > X), and we are 13778 # running it on the Unicode Data for version Y. 13779 # 13780 # It works by calculating the code points whose meaning has been specified 13781 # after release X, by using the Age property. The complement of this set 13782 # is the set of code points whose meaning is unchanged between the 13783 # releases. This is the set the program restricts itself to. It includes 13784 # everything whose meaning has been specified by the time version X came 13785 # along, plus those still unassigned by the time of version Y. (We will 13786 # continue to use the word 'assigned' to mean 'meaning has been 13787 # specified', as it's shorter and is accurate in all cases except the 13788 # Noncharacter code points.) 13789 # 13790 # This function is run after all the properties specified by Unicode have 13791 # been calculated for release Y. This makes sure we get all the nuances 13792 # of Y's rules. (It is done before the Perl extensions are calculated, as 13793 # those are based entirely on the Unicode ones.) But doing it after the 13794 # Unicode table calculations means we have to fix up the Unicode tables. 13795 # We do this by subtracting the code points that have been assigned since 13796 # X (which is actually done by ANDing each table of assigned code points 13797 # with the set of unchanged code points). Most Unicode properties are of 13798 # the form such that all unassigned code points have a default, grab-bag, 13799 # property value which is changed when the code point gets assigned. For 13800 # these, we just remove the changed code points from the table for the 13801 # latter property value, and add them back in to the grab-bag one. A few 13802 # other properties are not entirely of this form and have values for some 13803 # or all unassigned code points that are not the grab-bag one. These have 13804 # to be handled specially, and are hard-coded in to this routine based on 13805 # manual inspection of the Unicode character database. A list of the 13806 # outlier code points is made for each of these properties, and those 13807 # outliers are excluded from adding and removing from tables. 13808 # 13809 # Note that there are glitches when comparing against Unicode 1.1, as some 13810 # Hangul syllables in it were later ripped out and eventually replaced 13811 # with other things. 13812 13813 print "Fixing up for version comparison\n" if $verbosity >= $PROGRESS; 13814 13815 my $after_first_version = "All matching code points were added after " 13816 . "Unicode $string_compare_versions"; 13817 13818 # Calculate the delta as those code points that have been newly assigned 13819 # since the first compare version. 13820 my $delta = Range_List->new(); 13821 foreach my $table ($age->tables) { 13822 use version; 13823 next if $table == $age->table('Unassigned'); 13824 next if version->parse($table->name) 13825 le version->parse($string_compare_versions); 13826 $delta += $table; 13827 } 13828 if ($delta->is_empty) { 13829 die ("No changes; perhaps you need a 'DAge.txt' file?"); 13830 } 13831 13832 my $unchanged = ~ $delta; 13833 13834 calculate_Assigned() if ! defined $Assigned; 13835 $Assigned &= $unchanged; 13836 13837 # $Assigned now contains the code points that were assigned as of Unicode 13838 # version X. 13839 13840 # A block is all or nothing. If nothing is assigned in it, it all goes 13841 # back to the No_Block pool; but if even one code point is assigned, the 13842 # block is retained. 13843 my $no_block = $block->table('No_Block'); 13844 foreach my $this_block ($block->tables) { 13845 next if $this_block == $no_block 13846 || ! ($this_block & $Assigned)->is_empty; 13847 $this_block->set_fate($SUPPRESSED, $after_first_version); 13848 foreach my $range ($this_block->ranges) { 13849 $block->replace_map($range->start, $range->end, 'No_Block') 13850 } 13851 $no_block += $this_block; 13852 } 13853 13854 my @special_delta_properties; # List of properties that have to be 13855 # handled specially. 13856 my %restricted_delta; # Keys are the entries in 13857 # @special_delta_properties; values 13858 # are the range list of the code points 13859 # that behave normally when they get 13860 # assigned. 13861 13862 # In the next three properties, the Default Ignorable code points are 13863 # outliers. 13864 calculate_DI(); 13865 $DI &= $unchanged; 13866 13867 push @special_delta_properties, property_ref('_Perl_GCB'); 13868 $restricted_delta{$special_delta_properties[-1]} = ~ $DI; 13869 13870 if (defined (my $cwnfkcc = property_ref('Changes_When_NFKC_Casefolded'))) 13871 { 13872 push @special_delta_properties, $cwnfkcc; 13873 $restricted_delta{$special_delta_properties[-1]} = ~ $DI; 13874 } 13875 13876 calculate_NChar(); # Non-character code points 13877 $NChar &= $unchanged; 13878 13879 # This may have to be updated from time-to-time to get the most accurate 13880 # results. 13881 my $default_BC_non_LtoR = Range_List->new(Initialize => 13882 # These came from the comments in v8.0 DBidiClass.txt 13883 [ # AL 13884 0x0600 .. 0x07BF, 13885 0x08A0 .. 0x08FF, 13886 0xFB50 .. 0xFDCF, 13887 0xFDF0 .. 0xFDFF, 13888 0xFE70 .. 0xFEFF, 13889 0x1EE00 .. 0x1EEFF, 13890 # R 13891 0x0590 .. 0x05FF, 13892 0x07C0 .. 0x089F, 13893 0xFB1D .. 0xFB4F, 13894 0x10800 .. 0x10FFF, 13895 0x1E800 .. 0x1EDFF, 13896 0x1EF00 .. 0x1EFFF, 13897 # ET 13898 0x20A0 .. 0x20CF, 13899 ] 13900 ); 13901 $default_BC_non_LtoR += $DI + $NChar; 13902 push @special_delta_properties, property_ref('BidiClass'); 13903 $restricted_delta{$special_delta_properties[-1]} = ~ $default_BC_non_LtoR; 13904 13905 if (defined (my $eaw = property_ref('East_Asian_Width'))) { 13906 13907 my $default_EA_width_W = Range_List->new(Initialize => 13908 # From comments in v8.0 EastAsianWidth.txt 13909 [ 13910 0x3400 .. 0x4DBF, 13911 0x4E00 .. 0x9FFF, 13912 0xF900 .. 0xFAFF, 13913 0x20000 .. 0x2A6DF, 13914 0x2A700 .. 0x2B73F, 13915 0x2B740 .. 0x2B81F, 13916 0x2B820 .. 0x2CEAF, 13917 0x2F800 .. 0x2FA1F, 13918 0x20000 .. 0x2FFFD, 13919 0x30000 .. 0x3FFFD, 13920 ] 13921 ); 13922 push @special_delta_properties, $eaw; 13923 $restricted_delta{$special_delta_properties[-1]} 13924 = ~ $default_EA_width_W; 13925 13926 # Line break came along in the same release as East_Asian_Width, and 13927 # the non-grab-bag default set is a superset of the EAW one. 13928 if (defined (my $lb = property_ref('Line_Break'))) { 13929 my $default_LB_non_XX = Range_List->new(Initialize => 13930 # From comments in v8.0 LineBreak.txt 13931 [ 0x20A0 .. 0x20CF ]); 13932 $default_LB_non_XX += $default_EA_width_W; 13933 push @special_delta_properties, $lb; 13934 $restricted_delta{$special_delta_properties[-1]} 13935 = ~ $default_LB_non_XX; 13936 } 13937 } 13938 13939 # Go through every property, skipping those we've already worked on, those 13940 # that are immutable, and the perl ones that will be calculated after this 13941 # routine has done its fixup. 13942 foreach my $property (property_ref('*')) { 13943 next if $property == $perl # Done later in the program 13944 || $property == $block # Done just above 13945 || $property == $DI # Done just above 13946 || $property == $NChar # Done just above 13947 13948 # The next two are invariant across Unicode versions 13949 || $property == property_ref('Pattern_Syntax') 13950 || $property == property_ref('Pattern_White_Space'); 13951 13952 # Find the grab-bag value. 13953 my $default_map = $property->default_map; 13954 13955 if (! $property->to_create_match_tables) { 13956 13957 # Here there aren't any match tables. So far, all such properties 13958 # have a default map, and don't require special handling. Just 13959 # change each newly assigned code point back to the default map, 13960 # as if they were unassigned. 13961 foreach my $range ($delta->ranges) { 13962 $property->add_map($range->start, 13963 $range->end, 13964 $default_map, 13965 Replace => $UNCONDITIONALLY); 13966 } 13967 } 13968 else { # Here there are match tables. Find the one (if any) for the 13969 # grab-bag value that unassigned code points go to. 13970 my $default_table; 13971 if (defined $default_map) { 13972 $default_table = $property->table($default_map); 13973 } 13974 13975 # If some code points don't go back to the grab-bag when they 13976 # are considered unassigned, exclude them from the list that does 13977 # that. 13978 my $this_delta = $delta; 13979 my $this_unchanged = $unchanged; 13980 if (grep { $_ == $property } @special_delta_properties) { 13981 $this_delta = $delta & $restricted_delta{$property}; 13982 $this_unchanged = ~ $this_delta; 13983 } 13984 13985 # Fix up each match table for this property. 13986 foreach my $table ($property->tables) { 13987 if (defined $default_table && $table == $default_table) { 13988 13989 # The code points assigned after release X (the ones we 13990 # are excluding in this routine) go back on to the default 13991 # (grab-bag) table. However, some of these tables don't 13992 # actually exist, but are specified solely by the other 13993 # tables. (In a binary property, we don't need to 13994 # actually have an 'N' table, as it's just the complement 13995 # of the 'Y' table.) Such tables will be locked, so just 13996 # skip those. 13997 $table += $this_delta unless $table->locked; 13998 } 13999 else { 14000 14001 # Here the table is not for the default value. We need to 14002 # subtract the code points we are ignoring for this 14003 # comparison (the deltas) from it. But if the table 14004 # started out with nothing, no need to exclude anything, 14005 # and want to skip it here anyway, so it gets listed 14006 # properly in the pod. 14007 next if $table->is_empty; 14008 14009 # Save the deltas for later, before we do the subtraction 14010 my $deltas = $table & $this_delta; 14011 14012 $table &= $this_unchanged; 14013 14014 # Suppress the table if the subtraction left it with 14015 # nothing in it 14016 if ($table->is_empty) { 14017 if ($property->type == $BINARY) { 14018 push @tables_that_may_be_empty, $table->complete_name; 14019 } 14020 else { 14021 $table->set_fate($SUPPRESSED, $after_first_version); 14022 } 14023 } 14024 14025 # Now we add the removed code points to the property's 14026 # map, as they should now map to the grab-bag default 14027 # property (which they did in the first comparison 14028 # version). But we don't have to do this if the map is 14029 # only for internal use. 14030 if (defined $default_map && $property->to_output_map) { 14031 14032 # The gc property has pseudo property values whose names 14033 # have length 1. These are the union of all the 14034 # property values whose name is longer than 1 and 14035 # whose first letter is all the same. The replacement 14036 # is done once for the longer-named tables. 14037 next if $property == $gc && length $table->name == 1; 14038 14039 foreach my $range ($deltas->ranges) { 14040 $property->add_map($range->start, 14041 $range->end, 14042 $default_map, 14043 Replace => $UNCONDITIONALLY); 14044 } 14045 } 14046 } 14047 } 14048 } 14049 } 14050 14051 # The above code doesn't work on 'gc=C', as it is a superset of the default 14052 # ('Cn') table. It's easiest to just special case it here. 14053 my $C = $gc->table('C'); 14054 $C += $gc->table('Cn'); 14055 14056 return; 14057} 14058 14059sub compile_perl() { 14060 # Create perl-defined tables. Almost all are part of the pseudo-property 14061 # named 'perl' internally to this program. Many of these are recommended 14062 # in UTS#18 "Unicode Regular Expressions", and their derivations are based 14063 # on those found there. 14064 # Almost all of these are equivalent to some Unicode property. 14065 # A number of these properties have equivalents restricted to the ASCII 14066 # range, with their names prefaced by 'Posix', to signify that these match 14067 # what the Posix standard says they should match. A couple are 14068 # effectively this, but the name doesn't have 'Posix' in it because there 14069 # just isn't any Posix equivalent. 'XPosix' are the Posix tables extended 14070 # to the full Unicode range, by our guesses as to what is appropriate. 14071 14072 # 'All' is all code points. As an error check, instead of just setting it 14073 # to be that, construct it to be the union of all the major categories 14074 $All = $perl->add_match_table('All', 14075 Description 14076 => "All code points, including those above Unicode. Same as qr/./s", 14077 Matches_All => 1); 14078 14079 foreach my $major_table ($gc->tables) { 14080 14081 # Major categories are the ones with single letter names. 14082 next if length($major_table->name) != 1; 14083 14084 $All += $major_table; 14085 } 14086 14087 if ($All->max != $MAX_WORKING_CODEPOINT) { 14088 Carp::my_carp_bug("Generated highest code point (" 14089 . sprintf("%X", $All->max) 14090 . ") doesn't match expected value $MAX_WORKING_CODEPOINT_STRING.") 14091 } 14092 if ($All->range_count != 1 || $All->min != 0) { 14093 Carp::my_carp_bug("Generated table 'All' doesn't match all code points.") 14094 } 14095 14096 my $Any = $perl->add_match_table('Any', 14097 Description => "All Unicode code points"); 14098 $Any->add_range(0, $MAX_UNICODE_CODEPOINT); 14099 $Any->add_alias('Unicode'); 14100 14101 calculate_Assigned(); 14102 14103 my $ASCII = $perl->add_match_table('ASCII'); 14104 if (defined $block) { # This is equivalent to the block if have it. 14105 my $Unicode_ASCII = $block->table('Basic_Latin'); 14106 if (defined $Unicode_ASCII && ! $Unicode_ASCII->is_empty) { 14107 $ASCII->set_equivalent_to($Unicode_ASCII, Related => 1); 14108 } 14109 } 14110 14111 # Very early releases didn't have blocks, so initialize ASCII ourselves if 14112 # necessary 14113 if ($ASCII->is_empty) { 14114 if (! NON_ASCII_PLATFORM) { 14115 $ASCII->add_range(0, 127); 14116 } 14117 else { 14118 for my $i (0 .. 127) { 14119 $ASCII->add_range(utf8::unicode_to_native($i), 14120 utf8::unicode_to_native($i)); 14121 } 14122 } 14123 } 14124 14125 # Get the best available case definitions. Early Unicode versions didn't 14126 # have Uppercase and Lowercase defined, so use the general category 14127 # instead for them, modified by hard-coding in the code points each is 14128 # missing. 14129 my $Lower = $perl->add_match_table('XPosixLower'); 14130 my $Unicode_Lower = property_ref('Lowercase'); 14131 if (defined $Unicode_Lower && ! $Unicode_Lower->is_empty) { 14132 $Lower->set_equivalent_to($Unicode_Lower->table('Y'), Related => 1); 14133 14134 } 14135 else { 14136 $Lower += $gc->table('Lowercase_Letter'); 14137 14138 # There are quite a few code points in Lower, that aren't in gc=lc, 14139 # and not all are in all releases. 14140 my $temp = Range_List->new(Initialize => [ 14141 utf8::unicode_to_native(0xAA), 14142 utf8::unicode_to_native(0xBA), 14143 0x02B0 .. 0x02B8, 14144 0x02C0 .. 0x02C1, 14145 0x02E0 .. 0x02E4, 14146 0x0345, 14147 0x037A, 14148 0x1D2C .. 0x1D6A, 14149 0x1D78, 14150 0x1D9B .. 0x1DBF, 14151 0x2071, 14152 0x207F, 14153 0x2090 .. 0x209C, 14154 0x2170 .. 0x217F, 14155 0x24D0 .. 0x24E9, 14156 0x2C7C .. 0x2C7D, 14157 0xA770, 14158 0xA7F8 .. 0xA7F9, 14159 ]); 14160 $Lower += $temp & $Assigned; 14161 } 14162 my $Posix_Lower = $perl->add_match_table("PosixLower", 14163 Initialize => $Lower & $ASCII, 14164 ); 14165 14166 my $Upper = $perl->add_match_table("XPosixUpper"); 14167 my $Unicode_Upper = property_ref('Uppercase'); 14168 if (defined $Unicode_Upper && ! $Unicode_Upper->is_empty) { 14169 $Upper->set_equivalent_to($Unicode_Upper->table('Y'), Related => 1); 14170 } 14171 else { 14172 14173 # Unlike Lower, there are only two ranges in Upper that aren't in 14174 # gc=Lu, and all code points were assigned in all releases. 14175 $Upper += $gc->table('Uppercase_Letter'); 14176 $Upper->add_range(0x2160, 0x216F); # Uppercase Roman numerals 14177 $Upper->add_range(0x24B6, 0x24CF); # Circled Latin upper case letters 14178 } 14179 my $Posix_Upper = $perl->add_match_table("PosixUpper", 14180 Initialize => $Upper & $ASCII, 14181 ); 14182 14183 # Earliest releases didn't have title case. Initialize it to empty if not 14184 # otherwise present 14185 my $Title = $perl->add_match_table('Title', Full_Name => 'Titlecase', 14186 Description => '(= \p{Gc=Lt})'); 14187 my $lt = $gc->table('Lt'); 14188 14189 # Earlier versions of mktables had this related to $lt since they have 14190 # identical code points, but their caseless equivalents are not the same, 14191 # one being 'Cased' and the other being 'LC', and so now must be kept as 14192 # separate entities. 14193 if (defined $lt) { 14194 $Title += $lt; 14195 } 14196 else { 14197 push @tables_that_may_be_empty, $Title->complete_name; 14198 } 14199 14200 my $Unicode_Cased = property_ref('Cased'); 14201 if (defined $Unicode_Cased) { 14202 my $yes = $Unicode_Cased->table('Y'); 14203 my $no = $Unicode_Cased->table('N'); 14204 $Title->set_caseless_equivalent($yes); 14205 if (defined $Unicode_Upper) { 14206 $Unicode_Upper->table('Y')->set_caseless_equivalent($yes); 14207 $Unicode_Upper->table('N')->set_caseless_equivalent($no); 14208 } 14209 $Upper->set_caseless_equivalent($yes); 14210 if (defined $Unicode_Lower) { 14211 $Unicode_Lower->table('Y')->set_caseless_equivalent($yes); 14212 $Unicode_Lower->table('N')->set_caseless_equivalent($no); 14213 } 14214 $Lower->set_caseless_equivalent($yes); 14215 } 14216 else { 14217 # If this Unicode version doesn't have Cased, set up the Perl 14218 # extension from first principles. From Unicode 5.1: Definition D120: 14219 # A character C is defined to be cased if and only if C has the 14220 # Lowercase or Uppercase property or has a General_Category value of 14221 # Titlecase_Letter. 14222 my $cased = $perl->add_match_table('Cased', 14223 Initialize => $Lower + $Upper + $Title, 14224 Description => 'Uppercase or Lowercase or Titlecase', 14225 ); 14226 # $notcased is purely for the caseless equivalents below 14227 my $notcased = $perl->add_match_table('_Not_Cased', 14228 Initialize => ~ $cased, 14229 Fate => $INTERNAL_ONLY, 14230 Description => 'All not-cased code points'); 14231 $Title->set_caseless_equivalent($cased); 14232 if (defined $Unicode_Upper) { 14233 $Unicode_Upper->table('Y')->set_caseless_equivalent($cased); 14234 $Unicode_Upper->table('N')->set_caseless_equivalent($notcased); 14235 } 14236 $Upper->set_caseless_equivalent($cased); 14237 if (defined $Unicode_Lower) { 14238 $Unicode_Lower->table('Y')->set_caseless_equivalent($cased); 14239 $Unicode_Lower->table('N')->set_caseless_equivalent($notcased); 14240 } 14241 $Lower->set_caseless_equivalent($cased); 14242 } 14243 14244 # The remaining perl defined tables are mostly based on Unicode TR 18, 14245 # "Annex C: Compatibility Properties". All of these have two versions, 14246 # one whose name generally begins with Posix that is posix-compliant, and 14247 # one that matches Unicode characters beyond the Posix, ASCII range 14248 14249 my $Alpha = $perl->add_match_table('XPosixAlpha'); 14250 14251 # Alphabetic was not present in early releases 14252 my $Alphabetic = property_ref('Alphabetic'); 14253 if (defined $Alphabetic && ! $Alphabetic->is_empty) { 14254 $Alpha->set_equivalent_to($Alphabetic->table('Y'), Related => 1); 14255 } 14256 else { 14257 14258 # The Alphabetic property doesn't exist for early releases, so 14259 # generate it. The actual definition, in 5.2 terms is: 14260 # 14261 # gc=L + gc=Nl + Other_Alphabetic 14262 # 14263 # Other_Alphabetic is also not defined in these early releases, but it 14264 # contains one gc=So range plus most of gc=Mn and gc=Mc, so we add 14265 # those last two as well, then subtract the relatively few of them that 14266 # shouldn't have been added. (The gc=So range is the circled capital 14267 # Latin characters. Early releases mistakenly didn't also include the 14268 # lower-case versions of these characters, and so we don't either, to 14269 # maintain consistency with those releases that first had this 14270 # property. 14271 $Alpha->initialize($gc->table('Letter') 14272 + pre_3_dot_1_Nl() 14273 + $gc->table('Mn') 14274 + $gc->table('Mc') 14275 ); 14276 $Alpha->add_range(0x24D0, 0x24E9); # gc=So 14277 foreach my $range ( [ 0x0300, 0x0344 ], 14278 [ 0x0346, 0x034E ], 14279 [ 0x0360, 0x0362 ], 14280 [ 0x0483, 0x0486 ], 14281 [ 0x0591, 0x05AF ], 14282 [ 0x06DF, 0x06E0 ], 14283 [ 0x06EA, 0x06EC ], 14284 [ 0x0740, 0x074A ], 14285 0x093C, 14286 0x094D, 14287 [ 0x0951, 0x0954 ], 14288 0x09BC, 14289 0x09CD, 14290 0x0A3C, 14291 0x0A4D, 14292 0x0ABC, 14293 0x0ACD, 14294 0x0B3C, 14295 0x0B4D, 14296 0x0BCD, 14297 0x0C4D, 14298 0x0CCD, 14299 0x0D4D, 14300 0x0DCA, 14301 [ 0x0E47, 0x0E4C ], 14302 0x0E4E, 14303 [ 0x0EC8, 0x0ECC ], 14304 [ 0x0F18, 0x0F19 ], 14305 0x0F35, 14306 0x0F37, 14307 0x0F39, 14308 [ 0x0F3E, 0x0F3F ], 14309 [ 0x0F82, 0x0F84 ], 14310 [ 0x0F86, 0x0F87 ], 14311 0x0FC6, 14312 0x1037, 14313 0x1039, 14314 [ 0x17C9, 0x17D3 ], 14315 [ 0x20D0, 0x20DC ], 14316 0x20E1, 14317 [ 0x302A, 0x302F ], 14318 [ 0x3099, 0x309A ], 14319 [ 0xFE20, 0xFE23 ], 14320 [ 0x1D165, 0x1D169 ], 14321 [ 0x1D16D, 0x1D172 ], 14322 [ 0x1D17B, 0x1D182 ], 14323 [ 0x1D185, 0x1D18B ], 14324 [ 0x1D1AA, 0x1D1AD ], 14325 ) { 14326 if (ref $range) { 14327 $Alpha->delete_range($range->[0], $range->[1]); 14328 } 14329 else { 14330 $Alpha->delete_range($range, $range); 14331 } 14332 } 14333 $Alpha->add_description('Alphabetic'); 14334 $Alpha->add_alias('Alphabetic'); 14335 } 14336 my $Posix_Alpha = $perl->add_match_table("PosixAlpha", 14337 Initialize => $Alpha & $ASCII, 14338 ); 14339 $Posix_Upper->set_caseless_equivalent($Posix_Alpha); 14340 $Posix_Lower->set_caseless_equivalent($Posix_Alpha); 14341 14342 my $Alnum = $perl->add_match_table('Alnum', Full_Name => 'XPosixAlnum', 14343 Description => 'Alphabetic and (decimal) Numeric', 14344 Initialize => $Alpha + $gc->table('Decimal_Number'), 14345 ); 14346 $perl->add_match_table("PosixAlnum", 14347 Initialize => $Alnum & $ASCII, 14348 ); 14349 14350 my $Word = $perl->add_match_table('Word', Full_Name => 'XPosixWord', 14351 Description => '\w, including beyond ASCII;' 14352 . ' = \p{Alnum} + \pM + \p{Pc}' 14353 . ' + \p{Join_Control}', 14354 Initialize => $Alnum + $gc->table('Mark'), 14355 ); 14356 my $Pc = $gc->table('Connector_Punctuation'); # 'Pc' Not in release 1 14357 if (defined $Pc) { 14358 $Word += $Pc; 14359 } 14360 else { 14361 $Word += ord('_'); # Make sure this is a $Word 14362 } 14363 my $JC = property_ref('Join_Control'); # Wasn't in release 1 14364 if (defined $JC) { 14365 $Word += $JC->table('Y'); 14366 } 14367 else { 14368 $Word += 0x200C + 0x200D; 14369 } 14370 14371 # This is a Perl extension, so the name doesn't begin with Posix. 14372 my $PerlWord = $perl->add_match_table('PosixWord', 14373 Description => '\w, restricted to ASCII', 14374 Initialize => $Word & $ASCII, 14375 ); 14376 $PerlWord->add_alias('PerlWord'); 14377 14378 my $Blank = $perl->add_match_table('Blank', Full_Name => 'XPosixBlank', 14379 Description => '\h, Horizontal white space', 14380 14381 # 200B is Zero Width Space which is for line 14382 # break control, and was listed as 14383 # Space_Separator in early releases 14384 Initialize => $gc->table('Space_Separator') 14385 + ord("\t") 14386 - 0x200B, # ZWSP 14387 ); 14388 $Blank->add_alias('HorizSpace'); # Another name for it. 14389 $perl->add_match_table("PosixBlank", 14390 Initialize => $Blank & $ASCII, 14391 ); 14392 14393 my $VertSpace = $perl->add_match_table('VertSpace', 14394 Description => '\v', 14395 Initialize => 14396 $gc->table('Line_Separator') 14397 + $gc->table('Paragraph_Separator') 14398 + utf8::unicode_to_native(0x0A) # LINE FEED 14399 + utf8::unicode_to_native(0x0B) # VERTICAL TAB 14400 + ord("\f") 14401 + utf8::unicode_to_native(0x0D) # CARRIAGE RETURN 14402 + utf8::unicode_to_native(0x85) # NEL 14403 ); 14404 # No Posix equivalent for vertical space 14405 14406 my $Space = $perl->add_match_table('XPosixSpace', 14407 Description => '\s including beyond ASCII and vertical tab', 14408 Initialize => $Blank + $VertSpace, 14409 ); 14410 $Space->add_alias('XPerlSpace'); # Pre-existing synonyms 14411 $Space->add_alias('SpacePerl'); 14412 $Space->add_alias('Space') if $v_version lt v4.1.0; 14413 14414 my $Posix_space = $perl->add_match_table("PosixSpace", 14415 Initialize => $Space & $ASCII, 14416 ); 14417 $Posix_space->add_alias('PerlSpace'); # A pre-existing synonym 14418 14419 my $Cntrl = $perl->add_match_table('Cntrl', Full_Name => 'XPosixCntrl', 14420 Description => 'Control characters'); 14421 $Cntrl->set_equivalent_to($gc->table('Cc'), Related => 1); 14422 $perl->add_match_table("PosixCntrl", 14423 Description => "ASCII control characters", 14424 Definition => "ACK, BEL, BS, CAN, CR, DC1, DC2," 14425 . " DC3, DC4, DEL, DLE, ENQ, EOM," 14426 . " EOT, ESC, ETB, ETX, FF, FS, GS," 14427 . " HT, LF, NAK, NUL, RS, SI, SO," 14428 . " SOH, STX, SUB, SYN, US, VT", 14429 Initialize => $Cntrl & $ASCII, 14430 ); 14431 14432 my $perl_surrogate = $perl->add_match_table('_Perl_Surrogate'); 14433 my $Cs = $gc->table('Cs'); 14434 if (defined $Cs && ! $Cs->is_empty) { 14435 $perl_surrogate += $Cs; 14436 } 14437 else { 14438 push @tables_that_may_be_empty, '_Perl_Surrogate'; 14439 } 14440 14441 # $controls is a temporary used to construct Graph. 14442 my $controls = Range_List->new(Initialize => $gc->table('Unassigned') 14443 + $gc->table('Control') 14444 + $perl_surrogate); 14445 14446 # Graph is ~space & ~(Cc|Cs|Cn) = ~(space + $controls) 14447 my $Graph = $perl->add_match_table('Graph', Full_Name => 'XPosixGraph', 14448 Description => 'Characters that are graphical', 14449 Initialize => ~ ($Space + $controls), 14450 ); 14451 $perl->add_match_table("PosixGraph", 14452 Initialize => $Graph & $ASCII, 14453 ); 14454 14455 $print = $perl->add_match_table('Print', Full_Name => 'XPosixPrint', 14456 Description => 'Characters that are graphical plus space characters (but no controls)', 14457 Initialize => $Blank + $Graph - $gc->table('Control'), 14458 ); 14459 $perl->add_match_table("PosixPrint", 14460 Initialize => $print & $ASCII, 14461 ); 14462 14463 my $Punct = $perl->add_match_table('Punct'); 14464 $Punct->set_equivalent_to($gc->table('Punctuation'), Related => 1); 14465 14466 # \p{punct} doesn't include the symbols, which posix does 14467 my $XPosixPunct = $perl->add_match_table('XPosixPunct', 14468 Description => '\p{Punct} + ASCII-range \p{Symbol}', 14469 Initialize => $gc->table('Punctuation') 14470 + ($ASCII & $gc->table('Symbol')), 14471 Perl_Extension => 1 14472 ); 14473 $perl->add_match_table('PosixPunct', Perl_Extension => 1, 14474 Initialize => $ASCII & $XPosixPunct, 14475 ); 14476 14477 my $Digit = $perl->add_match_table('Digit', Full_Name => 'XPosixDigit', 14478 Description => '[0-9] + all other decimal digits'); 14479 $Digit->set_equivalent_to($gc->table('Decimal_Number'), Related => 1); 14480 my $PosixDigit = $perl->add_match_table("PosixDigit", 14481 Initialize => $Digit & $ASCII, 14482 ); 14483 14484 # Hex_Digit was not present in first release 14485 my $Xdigit = $perl->add_match_table('XDigit', Full_Name => 'XPosixXDigit'); 14486 my $Hex = property_ref('Hex_Digit'); 14487 if (defined $Hex && ! $Hex->is_empty) { 14488 $Xdigit->set_equivalent_to($Hex->table('Y'), Related => 1); 14489 } 14490 else { 14491 $Xdigit->initialize([ ord('0') .. ord('9'), 14492 ord('A') .. ord('F'), 14493 ord('a') .. ord('f'), 14494 0xFF10..0xFF19, 0xFF21..0xFF26, 0xFF41..0xFF46]); 14495 } 14496 14497 # AHex was not present in early releases 14498 my $PosixXDigit = $perl->add_match_table('PosixXDigit'); 14499 my $AHex = property_ref('ASCII_Hex_Digit'); 14500 if (defined $AHex && ! $AHex->is_empty) { 14501 $PosixXDigit->set_equivalent_to($AHex->table('Y'), Related => 1); 14502 } 14503 else { 14504 $PosixXDigit->initialize($Xdigit & $ASCII); 14505 $PosixXDigit->add_alias('AHex'); 14506 $PosixXDigit->add_alias('Ascii_Hex_Digit'); 14507 } 14508 14509 my $any_folds = $perl->add_match_table("_Perl_Any_Folds", 14510 Description => "Code points that particpate in some fold", 14511 ); 14512 my $loc_problem_folds = $perl->add_match_table( 14513 "_Perl_Problematic_Locale_Folds", 14514 Description => 14515 "Code points that are in some way problematic under locale", 14516 ); 14517 14518 # This allows regexec.c to skip some work when appropriate. Some of the 14519 # entries in _Perl_Problematic_Locale_Folds are multi-character folds, 14520 my $loc_problem_folds_start = $perl->add_match_table( 14521 "_Perl_Problematic_Locale_Foldeds_Start", 14522 Description => 14523 "The first character of every sequence in _Perl_Problematic_Locale_Folds", 14524 ); 14525 14526 my $cf = property_ref('Case_Folding'); 14527 14528 # Every character 0-255 is problematic because what each folds to depends 14529 # on the current locale 14530 $loc_problem_folds->add_range(0, 255); 14531 $loc_problem_folds->add_range(0x130, 0x131); # These are problematic in 14532 # Turkic locales 14533 $loc_problem_folds_start += $loc_problem_folds; 14534 14535 # Also problematic are anything these fold to outside the range. Likely 14536 # forever the only thing folded to by these outside the 0-255 range is the 14537 # GREEK SMALL MU (from the MICRO SIGN), but it's easy to make the code 14538 # completely general, which should catch any unexpected changes or errors. 14539 # We look at each code point 0-255, and add its fold (including each part 14540 # of a multi-char fold) to the list. See commit message 14541 # 31f05a37c4e9c37a7263491f2fc0237d836e1a80 for a more complete description 14542 # of the MU issue. 14543 foreach my $range ($loc_problem_folds->ranges) { 14544 foreach my $code_point ($range->start .. $range->end) { 14545 my $fold_range = $cf->containing_range($code_point); 14546 next unless defined $fold_range; 14547 14548 # Skip if folds to itself 14549 next if $fold_range->value eq $CODE_POINT; 14550 14551 my @hex_folds = split " ", $fold_range->value; 14552 my $start_cp = $hex_folds[0]; 14553 next if $start_cp eq $CODE_POINT; 14554 $start_cp = hex $start_cp; 14555 foreach my $i (0 .. @hex_folds - 1) { 14556 my $cp = $hex_folds[$i]; 14557 next if $cp eq $CODE_POINT; 14558 $cp = hex $cp; 14559 next unless $cp > 255; # Already have the < 256 ones 14560 14561 $loc_problem_folds->add_range($cp, $cp); 14562 $loc_problem_folds_start->add_range($start_cp, $start_cp); 14563 } 14564 } 14565 } 14566 14567 my $folds_to_multi_char = $perl->add_match_table( 14568 "_Perl_Folds_To_Multi_Char", 14569 Description => 14570 "Code points whose fold is a string of more than one character", 14571 ); 14572 my $in_multi_fold = $perl->add_match_table( 14573 "_Perl_Is_In_Multi_Char_Fold", 14574 Description => 14575 "Code points that are in some multiple character fold", 14576 ); 14577 if ($v_version lt v3.0.1) { 14578 push @tables_that_may_be_empty, '_Perl_Folds_To_Multi_Char', 14579 '_Perl_Is_In_Multi_Char_Fold', 14580 '_Perl_Non_Final_Folds'; 14581 } 14582 14583 # Look through all the known folds to populate these tables. 14584 foreach my $range ($cf->ranges) { 14585 next if $range->value eq $CODE_POINT; 14586 my $start = $range->start; 14587 my $end = $range->end; 14588 $any_folds->add_range($start, $end); 14589 14590 my @hex_folds = split " ", $range->value; 14591 if (@hex_folds > 1) { # Is multi-char fold 14592 $folds_to_multi_char->add_range($start, $end); 14593 } 14594 14595 my $found_locale_problematic = 0; 14596 14597 my $folded_count = @hex_folds; 14598 if ($folded_count > 3) { 14599 die Carp::my_carp("Maximum number of characters in a fold should be 3: Instead, it's $folded_count for U+" . sprintf "%04X", $range->start); 14600 } 14601 14602 # Look at each of the folded-to characters... 14603 foreach my $i (1 .. $folded_count) { 14604 my $cp = hex $hex_folds[$i-1]; 14605 $any_folds->add_range($cp, $cp); 14606 14607 # The fold is problematic if any of the folded-to characters is 14608 # already considered problematic. 14609 if ($loc_problem_folds->contains($cp)) { 14610 $loc_problem_folds->add_range($start, $end); 14611 $found_locale_problematic = 1; 14612 } 14613 14614 if ($folded_count > 1) { 14615 $in_multi_fold->add_range($cp, $cp); 14616 } 14617 } 14618 14619 # If this is a problematic fold, add to the start chars the 14620 # folding-from characters and first folded-to character. 14621 if ($found_locale_problematic) { 14622 $loc_problem_folds_start->add_range($start, $end); 14623 my $cp = hex $hex_folds[0]; 14624 $loc_problem_folds_start->add_range($cp, $cp); 14625 } 14626 } 14627 14628 my $dt = property_ref('Decomposition_Type'); 14629 $dt->add_match_table('Non_Canon', Full_Name => 'Non_Canonical', 14630 Initialize => ~ ($dt->table('None') + $dt->table('Canonical')), 14631 Perl_Extension => 1, 14632 Note => 'Union of all non-canonical decompositions', 14633 ); 14634 14635 # For backward compatibility, Perl has its own definition for IDStart. 14636 # It is regular XID_Start plus the underscore, but all characters must be 14637 # Word characters as well 14638 my $XID_Start = property_ref('XID_Start'); 14639 my $perl_xids = $perl->add_match_table('_Perl_IDStart', 14640 Perl_Extension => 1, 14641 Fate => $INTERNAL_ONLY, 14642 Initialize => ord('_') 14643 ); 14644 if (defined $XID_Start 14645 || defined ($XID_Start = property_ref('ID_Start'))) 14646 { 14647 $perl_xids += $XID_Start->table('Y'); 14648 } 14649 else { 14650 # For Unicode versions that don't have the property, construct our own 14651 # from first principles. The actual definition is: 14652 # Letters 14653 # + letter numbers (Nl) 14654 # - Pattern_Syntax 14655 # - Pattern_White_Space 14656 # + stability extensions 14657 # - NKFC modifications 14658 # 14659 # What we do in the code below is to include the identical code points 14660 # that are in the first release that had Unicode's version of this 14661 # property, essentially extrapolating backwards. There were no 14662 # stability extensions until v4.1, so none are included; likewise in 14663 # no Unicode version so far do subtracting PatSyn and PatWS make any 14664 # difference, so those also are ignored. 14665 $perl_xids += $gc->table('Letter') + pre_3_dot_1_Nl(); 14666 14667 # We do subtract the NFKC modifications that are in the first version 14668 # that had this property. We don't bother to test if they are in the 14669 # version in question, because if they aren't, the operation is a 14670 # no-op. The NKFC modifications are discussed in 14671 # http://www.unicode.org/reports/tr31/#NFKC_Modifications 14672 foreach my $range ( 0x037A, 14673 0x0E33, 14674 0x0EB3, 14675 [ 0xFC5E, 0xFC63 ], 14676 [ 0xFDFA, 0xFE70 ], 14677 [ 0xFE72, 0xFE76 ], 14678 0xFE78, 14679 0xFE7A, 14680 0xFE7C, 14681 0xFE7E, 14682 [ 0xFF9E, 0xFF9F ], 14683 ) { 14684 if (ref $range) { 14685 $perl_xids->delete_range($range->[0], $range->[1]); 14686 } 14687 else { 14688 $perl_xids->delete_range($range, $range); 14689 } 14690 } 14691 } 14692 14693 $perl_xids &= $Word; 14694 14695 my $perl_xidc = $perl->add_match_table('_Perl_IDCont', 14696 Perl_Extension => 1, 14697 Fate => $INTERNAL_ONLY); 14698 my $XIDC = property_ref('XID_Continue'); 14699 if (defined $XIDC 14700 || defined ($XIDC = property_ref('ID_Continue'))) 14701 { 14702 $perl_xidc += $XIDC->table('Y'); 14703 } 14704 else { 14705 # Similarly, we construct our own XIDC if necessary for early Unicode 14706 # versions. The definition is: 14707 # everything in XIDS 14708 # + Gc=Mn 14709 # + Gc=Mc 14710 # + Gc=Nd 14711 # + Gc=Pc 14712 # - Pattern_Syntax 14713 # - Pattern_White_Space 14714 # + stability extensions 14715 # - NFKC modifications 14716 # 14717 # The same thing applies to this as with XIDS for the PatSyn, PatWS, 14718 # and stability extensions. There is a somewhat different set of NFKC 14719 # mods to remove (and add in this case). The ones below make this 14720 # have identical code points as in the first release that defined it. 14721 $perl_xidc += $perl_xids 14722 + $gc->table('L') 14723 + $gc->table('Mn') 14724 + $gc->table('Mc') 14725 + $gc->table('Nd') 14726 + utf8::unicode_to_native(0xB7) 14727 ; 14728 if (defined (my $pc = $gc->table('Pc'))) { 14729 $perl_xidc += $pc; 14730 } 14731 else { # 1.1.5 didn't have Pc, but these should have been in it 14732 $perl_xidc += 0xFF3F; 14733 $perl_xidc->add_range(0x203F, 0x2040); 14734 $perl_xidc->add_range(0xFE33, 0xFE34); 14735 $perl_xidc->add_range(0xFE4D, 0xFE4F); 14736 } 14737 14738 # Subtract the NFKC mods 14739 foreach my $range ( 0x037A, 14740 [ 0xFC5E, 0xFC63 ], 14741 [ 0xFDFA, 0xFE1F ], 14742 0xFE70, 14743 [ 0xFE72, 0xFE76 ], 14744 0xFE78, 14745 0xFE7A, 14746 0xFE7C, 14747 0xFE7E, 14748 ) { 14749 if (ref $range) { 14750 $perl_xidc->delete_range($range->[0], $range->[1]); 14751 } 14752 else { 14753 $perl_xidc->delete_range($range, $range); 14754 } 14755 } 14756 } 14757 14758 $perl_xidc &= $Word; 14759 14760 my $charname_begin = $perl->add_match_table('_Perl_Charname_Begin', 14761 Perl_Extension => 1, 14762 Fate => $INTERNAL_ONLY, 14763 Initialize => $gc->table('Letter') & $Alpha & $perl_xids, 14764 ); 14765 14766 my $charname_continue = $perl->add_match_table('_Perl_Charname_Continue', 14767 Perl_Extension => 1, 14768 Fate => $INTERNAL_ONLY, 14769 Initialize => $perl_xidc 14770 + ord(" ") 14771 + ord("(") 14772 + ord(")") 14773 + ord("-") 14774 ); 14775 14776 my @composition = ('Name', 'Unicode_1_Name', '_Perl_Name_Alias'); 14777 14778 if (@named_sequences) { 14779 push @composition, 'Named_Sequence'; 14780 foreach my $sequence (@named_sequences) { 14781 $perl_charname->add_anomalous_entry($sequence); 14782 } 14783 } 14784 14785 my $alias_sentence = ""; 14786 my %abbreviations; 14787 my $alias = property_ref('_Perl_Name_Alias'); 14788 $perl_charname->set_proxy_for('_Perl_Name_Alias'); 14789 14790 # Add each entry in _Perl_Name_Alias to Perl_Charnames. Where these go 14791 # with respect to any existing entry depends on the entry type. 14792 # Corrections go before said entry, as they should be returned in 14793 # preference over the existing entry. (A correction to a correction 14794 # should be later in the _Perl_Name_Alias table, so it will correctly 14795 # precede the erroneous correction in Perl_Charnames.) 14796 # 14797 # Abbreviations go after everything else, so they are saved temporarily in 14798 # a hash for later. 14799 # 14800 # Everything else is added afterwards, which preserves the input 14801 # ordering 14802 14803 foreach my $range ($alias->ranges) { 14804 next if $range->value eq ""; 14805 my $code_point = $range->start; 14806 if ($code_point != $range->end) { 14807 Carp::my_carp_bug("Bad News. Expecting only one code point in the range $range. Just to keep going, using only the first code point;"); 14808 } 14809 my ($value, $type) = split ': ', $range->value; 14810 my $replace_type; 14811 if ($type eq 'correction') { 14812 $replace_type = $MULTIPLE_BEFORE; 14813 } 14814 elsif ($type eq 'abbreviation') { 14815 14816 # Save for later 14817 $abbreviations{$value} = $code_point; 14818 next; 14819 } 14820 else { 14821 $replace_type = $MULTIPLE_AFTER; 14822 } 14823 14824 # Actually add; before or after current entry(ies) as determined 14825 # above. 14826 14827 $perl_charname->add_duplicate($code_point, $value, Replace => $replace_type); 14828 } 14829 $alias_sentence = <<END; 14830The _Perl_Name_Alias property adds duplicate code point entries that are 14831alternatives to the original name. If an addition is a corrected 14832name, it will be physically first in the table. The original (less correct, 14833but still valid) name will be next; then any alternatives, in no particular 14834order; and finally any abbreviations, again in no particular order. 14835END 14836 14837 # Now add the Unicode_1 names for the controls. The Unicode_1 names had 14838 # precedence before 6.1, including the awful ones like "LINE FEED (LF)", 14839 # so should be first in the file; the other names have precedence starting 14840 # in 6.1, 14841 my $before_or_after = ($v_version lt v6.1.0) 14842 ? $MULTIPLE_BEFORE 14843 : $MULTIPLE_AFTER; 14844 14845 foreach my $range (property_ref('Unicode_1_Name')->ranges) { 14846 my $code_point = $range->start; 14847 my $unicode_1_value = $range->value; 14848 next if $unicode_1_value eq ""; # Skip if name doesn't exist. 14849 14850 if ($code_point != $range->end) { 14851 Carp::my_carp_bug("Bad News. Expecting only one code point in the range $range. Just to keep going, using only the first code point;"); 14852 } 14853 14854 # To handle EBCDIC, we don't hard code in the code points of the 14855 # controls; instead realizing that all of them are below 256. 14856 last if $code_point > 255; 14857 14858 # We only add in the controls. 14859 next if $gc->value_of($code_point) ne 'Cc'; 14860 14861 # We reject this Unicode1 name for later Perls, as it is used for 14862 # another code point 14863 next if $unicode_1_value eq 'BELL' && $^V ge v5.17.0; 14864 14865 # This won't add an exact duplicate. 14866 $perl_charname->add_duplicate($code_point, $unicode_1_value, 14867 Replace => $before_or_after); 14868 } 14869 14870 # Now that have everything added, add in abbreviations after 14871 # everything else. Sort so results don't change between runs of this 14872 # program 14873 foreach my $value (sort keys %abbreviations) { 14874 $perl_charname->add_duplicate($abbreviations{$value}, $value, 14875 Replace => $MULTIPLE_AFTER); 14876 } 14877 14878 my $comment; 14879 if (@composition <= 2) { # Always at least 2 14880 $comment = join " and ", @composition; 14881 } 14882 else { 14883 $comment = join ", ", @composition[0 .. scalar @composition - 2]; 14884 $comment .= ", and $composition[-1]"; 14885 } 14886 14887 $perl_charname->add_comment(join_lines( <<END 14888This file is for charnames.pm. It is the union of the $comment properties. 14889Unicode_1_Name entries are used only for nameless code points in the Name 14890property. 14891$alias_sentence 14892This file doesn't include the algorithmically determinable names. For those, 14893use 'unicore/Name.pm' 14894END 14895 )); 14896 property_ref('Name')->add_comment(join_lines( <<END 14897This file doesn't include the algorithmically determinable names. For those, 14898use 'unicore/Name.pm' 14899END 14900 )); 14901 14902 # Construct the Present_In property from the Age property. 14903 if (-e 'DAge.txt' && defined $age) { 14904 my $default_map = $age->default_map; 14905 my $in = Property->new('In', 14906 Default_Map => $default_map, 14907 Full_Name => "Present_In", 14908 Perl_Extension => 1, 14909 Type => $ENUM, 14910 Initialize => $age, 14911 ); 14912 $in->add_comment(join_lines(<<END 14913THIS FILE SHOULD NOT BE USED FOR ANY PURPOSE. The values in this file are the 14914same as for $age, and not for what $in really means. This is because anything 14915defined in a given release should have multiple values: that release and all 14916higher ones. But only one value per code point can be represented in a table 14917like this. 14918END 14919 )); 14920 14921 # The Age tables are named like 1.5, 2.0, 2.1, .... Sort so that the 14922 # lowest numbered (earliest) come first, with the non-numeric one 14923 # last. 14924 my ($first_age, @rest_ages) = sort { ($a->name !~ /^[\d.]*$/) 14925 ? 1 14926 : ($b->name !~ /^[\d.]*$/) 14927 ? -1 14928 : $a->name <=> $b->name 14929 } $age->tables; 14930 14931 # The Present_In property is the cumulative age properties. The first 14932 # one hence is identical to the first age one. 14933 my $previous_in = $in->add_match_table($first_age->name); 14934 $previous_in->set_equivalent_to($first_age, Related => 1); 14935 14936 my $description_start = "Code point's usage introduced in version "; 14937 $first_age->add_description($description_start . $first_age->name); 14938 14939 # To construct the accumulated values, for each of the age tables 14940 # starting with the 2nd earliest, merge the earliest with it, to get 14941 # all those code points existing in the 2nd earliest. Repeat merging 14942 # the new 2nd earliest with the 3rd earliest to get all those existing 14943 # in the 3rd earliest, and so on. 14944 foreach my $current_age (@rest_ages) { 14945 next if $current_age->name !~ /^[\d.]*$/; # Skip the non-numeric 14946 14947 my $current_in = $in->add_match_table( 14948 $current_age->name, 14949 Initialize => $current_age + $previous_in, 14950 Description => $description_start 14951 . $current_age->name 14952 . ' or earlier', 14953 ); 14954 foreach my $alias ($current_age->aliases) { 14955 $current_in->add_alias($alias->name); 14956 } 14957 $previous_in = $current_in; 14958 14959 # Add clarifying material for the corresponding age file. This is 14960 # in part because of the confusing and contradictory information 14961 # given in the Standard's documentation itself, as of 5.2. 14962 $current_age->add_description( 14963 "Code point's usage was introduced in version " 14964 . $current_age->name); 14965 $current_age->add_note("See also $in"); 14966 14967 } 14968 14969 # And finally the code points whose usages have yet to be decided are 14970 # the same in both properties. Note that permanently unassigned code 14971 # points actually have their usage assigned (as being permanently 14972 # unassigned), so that these tables are not the same as gc=cn. 14973 my $unassigned = $in->add_match_table($default_map); 14974 my $age_default = $age->table($default_map); 14975 $age_default->add_description(<<END 14976Code point's usage has not been assigned in any Unicode release thus far. 14977END 14978 ); 14979 $unassigned->set_equivalent_to($age_default, Related => 1); 14980 } 14981 14982 my $patws = $perl->add_match_table('_Perl_PatWS', 14983 Perl_Extension => 1, 14984 Fate => $INTERNAL_ONLY); 14985 if (defined (my $off_patws = property_ref('Pattern_White_Space'))) { 14986 $patws->initialize($off_patws->table('Y')); 14987 } 14988 else { 14989 $patws->initialize([ ord("\t"), 14990 ord("\n"), 14991 utf8::unicode_to_native(0x0B), # VT 14992 ord("\f"), 14993 ord("\r"), 14994 ord(" "), 14995 utf8::unicode_to_native(0x85), # NEL 14996 0x200E..0x200F, # Left, Right marks 14997 0x2028..0x2029 # Line, Paragraph seps 14998 ] ); 14999 } 15000 15001 # See L<perlfunc/quotemeta> 15002 my $quotemeta = $perl->add_match_table('_Perl_Quotemeta', 15003 Perl_Extension => 1, 15004 Fate => $INTERNAL_ONLY, 15005 15006 # Initialize to what's common in 15007 # all Unicode releases. 15008 Initialize => 15009 $gc->table('Control') 15010 + $Space 15011 + $patws 15012 + ((~ $Word) & $ASCII) 15013 ); 15014 15015 if (defined (my $patsyn = property_ref('Pattern_Syntax'))) { 15016 $quotemeta += $patsyn->table('Y'); 15017 } 15018 else { 15019 $quotemeta += ((~ $Word) & Range->new(0, 255)) 15020 - utf8::unicode_to_native(0xA8) 15021 - utf8::unicode_to_native(0xAF) 15022 - utf8::unicode_to_native(0xB2) 15023 - utf8::unicode_to_native(0xB3) 15024 - utf8::unicode_to_native(0xB4) 15025 - utf8::unicode_to_native(0xB7) 15026 - utf8::unicode_to_native(0xB8) 15027 - utf8::unicode_to_native(0xB9) 15028 - utf8::unicode_to_native(0xBC) 15029 - utf8::unicode_to_native(0xBD) 15030 - utf8::unicode_to_native(0xBE); 15031 $quotemeta += [ # These are above-Latin1 patsyn; hence should be the 15032 # same in all releases 15033 0x2010 .. 0x2027, 15034 0x2030 .. 0x203E, 15035 0x2041 .. 0x2053, 15036 0x2055 .. 0x205E, 15037 0x2190 .. 0x245F, 15038 0x2500 .. 0x2775, 15039 0x2794 .. 0x2BFF, 15040 0x2E00 .. 0x2E7F, 15041 0x3001 .. 0x3003, 15042 0x3008 .. 0x3020, 15043 0x3030 .. 0x3030, 15044 0xFD3E .. 0xFD3F, 15045 0xFE45 .. 0xFE46 15046 ]; 15047 } 15048 15049 if (defined (my $di = property_ref('Default_Ignorable_Code_Point'))) { 15050 $quotemeta += $di->table('Y') 15051 } 15052 else { 15053 if ($v_version ge v2.0) { 15054 $quotemeta += $gc->table('Cf') 15055 + $gc->table('Cs'); 15056 15057 # These are above the Unicode version 1 max 15058 $quotemeta->add_range(0xE0000, 0xE0FFF); 15059 } 15060 $quotemeta += $gc->table('Cc') 15061 - $Space; 15062 my $temp = Range_List->new(Initialize => [ 0x180B .. 0x180D, 15063 0x2060 .. 0x206F, 15064 0xFE00 .. 0xFE0F, 15065 0xFFF0 .. 0xFFFB, 15066 ]); 15067 $temp->add_range(0xE0000, 0xE0FFF) if $v_version ge v2.0; 15068 $quotemeta += $temp; 15069 } 15070 calculate_DI(); 15071 $quotemeta += $DI; 15072 15073 calculate_NChar(); 15074 15075 # Finished creating all the perl properties. All non-internal non-string 15076 # ones have a synonym of 'Is_' prefixed. (Internal properties begin with 15077 # an underscore.) These do not get a separate entry in the pod file 15078 foreach my $table ($perl->tables) { 15079 foreach my $alias ($table->aliases) { 15080 next if $alias->name =~ /^_/; 15081 $table->add_alias('Is_' . $alias->name, 15082 Re_Pod_Entry => 0, 15083 UCD => 0, 15084 Status => $alias->status, 15085 OK_as_Filename => 0); 15086 } 15087 } 15088 15089 # Perl tailors the WordBreak property so that \b{wb} doesn't split 15090 # adjacent spaces into separate words. Unicode 11.0 moved in that 15091 # direction, but left TAB, FIGURE SPACE (U+2007), and (ironically) NO 15092 # BREAK SPACE as breaking, so we retained the original Perl customization. 15093 # To do this, in the Perl copy of WB, simply replace the mappings of 15094 # horizontal space characters that otherwise would map to the default or 15095 # the 11.0 'WSegSpace' to instead map to our tailoring. 15096 my $perl_wb = property_ref('_Perl_WB'); 15097 my $default = $perl_wb->default_map; 15098 for my $range ($Blank->ranges) { 15099 for my $i ($range->start .. $range->end) { 15100 my $value = $perl_wb->value_of($i); 15101 15102 next unless $value eq $default || $value eq 'WSegSpace'; 15103 $perl_wb->add_map($i, $i, 'Perl_Tailored_HSpace', 15104 Replace => $UNCONDITIONALLY); 15105 } 15106 } 15107 15108 # Also starting in Unicode 11.0, rules for some of the boundary types are 15109 # based on a non-UCD property (which we have read in if it exists). 15110 # Recall that these boundary properties partition the code points into 15111 # equivalence classes (represented as enums). 15112 # 15113 # The loop below goes through each code point that matches the non-UCD 15114 # property, and for each current equivalence class containing such a code 15115 # point, splits it so that those that are in both are now in a newly 15116 # created equivalence class whose name is a combination of the property 15117 # and the old class name, leaving unchanged everything that doesn't match 15118 # the non-UCD property. 15119 my $ep = property_ref('ExtPict'); 15120 $ep = $ep->table('Y') if defined $ep; 15121 if (defined $ep) { 15122 foreach my $base_property (property_ref('GCB'), 15123 property_ref('WB')) 15124 { 15125 my $property = property_ref('_Perl_' . $base_property->name); 15126 foreach my $range ($ep->ranges) { 15127 foreach my $i ($range->start .. $range->end) { 15128 my $current = $property->value_of($i); 15129 $current = $property->table($current)->short_name; 15130 $property->add_map($i, $i, 'ExtPict_' . $current, 15131 Replace => $UNCONDITIONALLY); 15132 } 15133 } 15134 } 15135 } 15136 15137 # Create a version of the LineBreak property with the mappings that are 15138 # omitted in the default algorithm remapped to what 15139 # http://www.unicode.org/reports/tr14 says they should be. 15140 # 15141 # First, create a plain copy, but with all property values written out in 15142 # their long form, as regen/mk_invlist.pl expects that, and also fix 15143 # occurrences of the typo in early Unicode versions: 'inseperable'. 15144 my $perl_lb = property_ref('_Perl_LB'); 15145 if (! defined $perl_lb) { 15146 $perl_lb = Property->new('_Perl_LB', 15147 Fate => $INTERNAL_ONLY, 15148 Perl_Extension => 1, 15149 Directory => $map_directory, 15150 Type => $STRING); 15151 my $lb = property_ref('Line_Break'); 15152 15153 # Populate from $lb, but use full name and fix typo. 15154 foreach my $range ($lb->ranges) { 15155 my $full_name = $lb->table($range->value)->full_name; 15156 $full_name = 'Inseparable' 15157 if standardize($full_name) eq 'inseperable'; 15158 $perl_lb->add_map($range->start, $range->end, $full_name); 15159 } 15160 } 15161 15162 # What tr14 says is this: 15163 15164 # Original Resolved General_Category 15165 # AI, SG, XX AL Any 15166 # SA CM Only Mn or Mc 15167 # SA AL Any except Mn and Mc 15168 # CJ NS Any 15169 15170 $perl_lb->set_default_map('Alphabetic', 'full_name'); # XX -> AL 15171 15172 my $ea = property_ref('East_Asian_Width'); 15173 my $Cn_EP; 15174 $Cn_EP = $ep & $gc->table('Unassigned') if defined $ep; 15175 15176 for my $range ($perl_lb->ranges) { 15177 my $value = standardize($range->value); 15178 if ( $value eq standardize('Unknown') 15179 || $value eq standardize('Ambiguous') 15180 || $value eq standardize('Surrogate')) 15181 { 15182 $perl_lb->add_map($range->start, $range->end, 'Alphabetic', 15183 Replace => $UNCONDITIONALLY); 15184 } 15185 elsif ($value eq standardize('Conditional_Japanese_Starter')) { 15186 $perl_lb->add_map($range->start, $range->end, 'Nonstarter', 15187 Replace => $UNCONDITIONALLY); 15188 } 15189 elsif ($value eq standardize('Complex_Context')) { 15190 for my $i ($range->start .. $range->end) { 15191 my $gc_val = $gc->value_of($i); 15192 if ($gc_val eq 'Mn' || $gc_val eq 'Mc') { 15193 $perl_lb->add_map($i, $i, 'Combining_Mark', 15194 Replace => $UNCONDITIONALLY); 15195 } 15196 else { 15197 $perl_lb->add_map($i, $i, 'Alphabetic', 15198 Replace => $UNCONDITIONALLY); 15199 } 15200 } 15201 } 15202 elsif (defined $ep && $value eq standardize('Ideographic')) { 15203 15204 # Unicode 14 adds a rule to not break lines before any potential 15205 # EBase, They say that any unassigned code point that is ExtPict, 15206 # is potentially an EBase. In 14.0, all such ones are in the 15207 # ExtPict=ID category. We must split that category for the 15208 # pairwise rule table to work. 15209 for my $i ($range->start .. $range->end) { 15210 if ($Cn_EP->contains($i)) { 15211 $perl_lb->add_map($i, $i, 15212 'Unassigned_Extended_Pictographic_Ideographic', 15213 Replace => $UNCONDITIONALLY); 15214 } 15215 } 15216 } 15217 elsif ( defined $ea 15218 && ( $value eq standardize('Close_Parenthesis') 15219 || $value eq standardize('Open_Punctuation'))) 15220 { 15221 # Unicode 13 splits the OP and CP properties each into East Asian, 15222 # and non-. We retain the (now somewhat misleading) names OP and 15223 # CP for the non-East Asian variety, as there are very few East 15224 # Asian ones. 15225 my $replace = ($value eq standardize('Open_Punctuation')) 15226 ? 'East_Asian_OP' 15227 : 'East_Asian_CP'; 15228 for my $i ($range->start .. $range->end) { 15229 my $ea_val = $ea->value_of($i); 15230 if ($ea_val eq 'F' || $ea_val eq 'W' || $ea_val eq 'H') { 15231 $perl_lb->add_map($i, $i, $replace, 15232 Replace => $UNCONDITIONALLY); 15233 } 15234 } 15235 } 15236 } 15237 15238 # This property is a modification of the scx property 15239 my $perl_scx = Property->new('_Perl_SCX', 15240 Fate => $INTERNAL_ONLY, 15241 Perl_Extension => 1, 15242 Directory => $map_directory, 15243 Type => $ENUM); 15244 my $source; 15245 15246 # Use scx if available; otherwise sc; if neither is there (a very old 15247 # Unicode version, just say that everything is 'Common' 15248 if (defined $scx) { 15249 $source = $scx; 15250 $perl_scx->set_default_map('Unknown'); 15251 } 15252 elsif (defined $script) { 15253 $source = $script; 15254 15255 # Early versions of 'sc', had everything be 'Common' 15256 if (defined $script->table('Unknown')) { 15257 $perl_scx->set_default_map('Unknown'); 15258 } 15259 else { 15260 $perl_scx->set_default_map('Common'); 15261 } 15262 } else { 15263 $perl_scx->add_match_table('Common'); 15264 $perl_scx->add_map(0, $MAX_UNICODE_CODEPOINT, 'Common'); 15265 15266 $perl_scx->add_match_table('Unknown'); 15267 $perl_scx->set_default_map('Unknown'); 15268 } 15269 15270 $perl_scx->_set_format($STRING_WHITE_SPACE_LIST); 15271 $perl_scx->set_pre_declared_maps(0); # PropValueAliases doesn't list these 15272 15273 if (defined $source) { 15274 $perl_scx->initialize($source); 15275 15276 # UTS 39 says that the scx property should be modified for these 15277 # countries where certain mixed scripts are commonly used. 15278 for my $range ($perl_scx->ranges) { 15279 my $value = $range->value; 15280 my $changed = $value =~ s/ ( \b Han i? \b ) /$1 Hanb Jpan Kore/xi; 15281 $changed |= $value =~ s/ ( \b Hira (gana)? \b ) /$1 Jpan/xi; 15282 $changed |= $value =~ s/ ( \b Kata (kana)? \b ) /$1 Jpan/xi; 15283 $changed |= $value =~ s{ ( \b Katakana_or_Hiragana \b ) } 15284 {$1 Katakana Hiragana Jpan}xi; 15285 $changed |= $value =~ s/ ( \b Hang (ul)? \b ) /$1 Kore/xi; 15286 $changed |= $value =~ s/ ( \b Bopo (mofo)? \b ) /$1 Hanb/xi; 15287 15288 if ($changed) { 15289 $value = join " ", uniques split " ", $value; 15290 $range->set_value($value) 15291 } 15292 } 15293 15294 foreach my $table ($source->tables) { 15295 my $scx_table = $perl_scx->add_match_table($table->name, 15296 Full_Name => $table->full_name); 15297 foreach my $alias ($table->aliases) { 15298 $scx_table->add_alias($alias->name); 15299 } 15300 } 15301 } 15302 15303 # Here done with all the basic stuff. Ready to populate the information 15304 # about each character if annotating them. 15305 if ($annotate) { 15306 15307 # See comments at its declaration 15308 $annotate_ranges = Range_Map->new; 15309 15310 # This separates out the non-characters from the other unassigneds, so 15311 # can give different annotations for each. 15312 $unassigned_sans_noncharacters = Range_List->new( 15313 Initialize => $gc->table('Unassigned')); 15314 $unassigned_sans_noncharacters &= (~ $NChar); 15315 15316 for (my $i = 0; $i <= $MAX_UNICODE_CODEPOINT + 1; $i++ ) { 15317 $i = populate_char_info($i); # Note sets $i so may cause skips 15318 15319 } 15320 } 15321 15322 return; 15323} 15324 15325sub add_perl_synonyms() { 15326 # A number of Unicode tables have Perl synonyms that are expressed in 15327 # the single-form, \p{name}. These are: 15328 # All the binary property Y tables, so that \p{Name=Y} gets \p{Name} and 15329 # \p{Is_Name} as synonyms 15330 # \p{Script_Extensions=Value} gets \p{Value}, \p{Is_Value} as synonyms 15331 # \p{General_Category=Value} gets \p{Value}, \p{Is_Value} as synonyms 15332 # \p{Block=Value} gets \p{In_Value} as a synonym, and, if there is no 15333 # conflict, \p{Value} and \p{Is_Value} as well 15334 # 15335 # This routine generates these synonyms, warning of any unexpected 15336 # conflicts. 15337 15338 # Construct the list of tables to get synonyms for. Start with all the 15339 # binary and the General_Category ones. 15340 my @tables = grep { $_->type == $BINARY || $_->type == $FORCED_BINARY } 15341 property_ref('*'); 15342 push @tables, $gc->tables; 15343 15344 # If the version of Unicode includes the Script Extensions (preferably), 15345 # or Script property, add its tables 15346 if (defined $scx) { 15347 push @tables, $scx->tables; 15348 } 15349 else { 15350 push @tables, $script->tables if defined $script; 15351 } 15352 15353 # The Block tables are kept separate because they are treated differently. 15354 # And the earliest versions of Unicode didn't include them, so add only if 15355 # there are some. 15356 my @blocks; 15357 push @blocks, $block->tables if defined $block; 15358 15359 # Here, have the lists of tables constructed. Process blocks last so that 15360 # if there are name collisions with them, blocks have lowest priority. 15361 # Should there ever be other collisions, manual intervention would be 15362 # required. See the comments at the beginning of the program for a 15363 # possible way to handle those semi-automatically. 15364 foreach my $table (@tables, @blocks) { 15365 15366 # For non-binary properties, the synonym is just the name of the 15367 # table, like Greek, but for binary properties the synonym is the name 15368 # of the property, and means the code points in its 'Y' table. 15369 my $nominal = $table; 15370 my $nominal_property = $nominal->property; 15371 my $actual; 15372 if (! $nominal->isa('Property')) { 15373 $actual = $table; 15374 } 15375 else { 15376 15377 # Here is a binary property. Use the 'Y' table. Verify that is 15378 # there 15379 my $yes = $nominal->table('Y'); 15380 unless (defined $yes) { # Must be defined, but is permissible to 15381 # be empty. 15382 Carp::my_carp_bug("Undefined $nominal, 'Y'. Skipping."); 15383 next; 15384 } 15385 $actual = $yes; 15386 } 15387 15388 foreach my $alias ($nominal->aliases) { 15389 15390 # Attempt to create a table in the perl directory for the 15391 # candidate table, using whatever aliases in it that don't 15392 # conflict. Also add non-conflicting aliases for all these 15393 # prefixed by 'Is_' (and/or 'In_' for Block property tables) 15394 PREFIX: 15395 foreach my $prefix ("", 'Is_', 'In_') { 15396 15397 # Only Block properties can have added 'In_' aliases. 15398 next if $prefix eq 'In_' and $nominal_property != $block; 15399 15400 my $proposed_name = $prefix . $alias->name; 15401 15402 # No Is_Is, In_In, nor combinations thereof 15403 trace "$proposed_name is a no-no" if main::DEBUG && $to_trace && $proposed_name =~ /^ I [ns] _I [ns] _/x; 15404 next if $proposed_name =~ /^ I [ns] _I [ns] _/x; 15405 15406 trace "Seeing if can add alias or table: 'perl=$proposed_name' based on $nominal" if main::DEBUG && $to_trace; 15407 15408 # Get a reference to any existing table in the perl 15409 # directory with the desired name. 15410 my $pre_existing = $perl->table($proposed_name); 15411 15412 if (! defined $pre_existing) { 15413 15414 # No name collision, so OK to add the perl synonym. 15415 15416 my $make_re_pod_entry; 15417 my $ok_as_filename; 15418 my $status = $alias->status; 15419 if ($nominal_property == $block) { 15420 15421 # For block properties, only the compound form is 15422 # preferred for external use; the others are 15423 # discouraged. The pod file contains wild cards for 15424 # the 'In' and 'Is' forms so no entries for those; and 15425 # we don't want people using the name without any 15426 # prefix, so discourage that. 15427 if ($prefix eq "") { 15428 $make_re_pod_entry = 1; 15429 $status = $status || $DISCOURAGED; 15430 $ok_as_filename = 0; 15431 } 15432 elsif ($prefix eq 'In_') { 15433 $make_re_pod_entry = 0; 15434 $status = $status || $DISCOURAGED; 15435 $ok_as_filename = 1; 15436 } 15437 else { 15438 $make_re_pod_entry = 0; 15439 $status = $status || $DISCOURAGED; 15440 $ok_as_filename = 0; 15441 } 15442 } 15443 elsif ($prefix ne "") { 15444 15445 # The 'Is' prefix is handled in the pod by a wild 15446 # card, and we won't use it for an external name 15447 $make_re_pod_entry = 0; 15448 $status = $status || $NORMAL; 15449 $ok_as_filename = 0; 15450 } 15451 else { 15452 15453 # Here, is an empty prefix, non block. This gets its 15454 # own pod entry and can be used for an external name. 15455 $make_re_pod_entry = 1; 15456 $status = $status || $NORMAL; 15457 $ok_as_filename = 1; 15458 } 15459 15460 # Here, there isn't a perl pre-existing table with the 15461 # name. Look through the list of equivalents of this 15462 # table to see if one is a perl table. 15463 foreach my $equivalent ($actual->leader->equivalents) { 15464 next if $equivalent->property != $perl; 15465 15466 # Here, have found a table for $perl. Add this alias 15467 # to it, and are done with this prefix. 15468 $equivalent->add_alias($proposed_name, 15469 Re_Pod_Entry => $make_re_pod_entry, 15470 15471 # Currently don't output these in the 15472 # ucd pod, as are strongly discouraged 15473 # from being used 15474 UCD => 0, 15475 15476 Status => $status, 15477 OK_as_Filename => $ok_as_filename); 15478 trace "adding alias perl=$proposed_name to $equivalent" if main::DEBUG && $to_trace; 15479 next PREFIX; 15480 } 15481 15482 # Here, $perl doesn't already have a table that is a 15483 # synonym for this property, add one. 15484 my $added_table = $perl->add_match_table($proposed_name, 15485 Re_Pod_Entry => $make_re_pod_entry, 15486 15487 # See UCD comment just above 15488 UCD => 0, 15489 15490 Status => $status, 15491 OK_as_Filename => $ok_as_filename); 15492 # And it will be related to the actual table, since it is 15493 # based on it. 15494 $added_table->set_equivalent_to($actual, Related => 1); 15495 trace "added ", $perl->table($proposed_name) if main::DEBUG && $to_trace; 15496 next; 15497 } # End of no pre-existing. 15498 15499 # Here, there is a pre-existing table that has the proposed 15500 # name. We could be in trouble, but not if this is just a 15501 # synonym for another table that we have already made a child 15502 # of the pre-existing one. 15503 if ($pre_existing->is_set_equivalent_to($actual)) { 15504 trace "$pre_existing is already equivalent to $actual; adding alias perl=$proposed_name to it" if main::DEBUG && $to_trace; 15505 $pre_existing->add_alias($proposed_name); 15506 next; 15507 } 15508 15509 # Here, there is a name collision, but it still could be OK if 15510 # the tables match the identical set of code points, in which 15511 # case, we can combine the names. Compare each table's code 15512 # point list to see if they are identical. 15513 trace "Potential name conflict with $pre_existing having ", $pre_existing->count, " code points" if main::DEBUG && $to_trace; 15514 if ($pre_existing->matches_identically_to($actual)) { 15515 15516 # Here, they do match identically. Not a real conflict. 15517 # Make the perl version a child of the Unicode one, except 15518 # in the non-obvious case of where the perl name is 15519 # already a synonym of another Unicode property. (This is 15520 # excluded by the test for it being its own parent.) The 15521 # reason for this exclusion is that then the two Unicode 15522 # properties become related; and we don't really know if 15523 # they are or not. We generate documentation based on 15524 # relatedness, and this would be misleading. Code 15525 # later executed in the process will cause the tables to 15526 # be represented by a single file anyway, without making 15527 # it look in the pod like they are necessarily related. 15528 if ($pre_existing->parent == $pre_existing 15529 && ($pre_existing->property == $perl 15530 || $actual->property == $perl)) 15531 { 15532 trace "Setting $pre_existing equivalent to $actual since one is \$perl, and match identical sets" if main::DEBUG && $to_trace; 15533 $pre_existing->set_equivalent_to($actual, Related => 1); 15534 } 15535 elsif (main::DEBUG && $to_trace) { 15536 trace "$pre_existing is equivalent to $actual since match identical sets, but not setting them equivalent, to preserve the separateness of the perl aliases"; 15537 trace $pre_existing->parent; 15538 } 15539 next PREFIX; 15540 } 15541 15542 # Here they didn't match identically, there is a real conflict 15543 # between our new name and a pre-existing property. 15544 $actual->add_conflicting($proposed_name, 'p', $pre_existing); 15545 $pre_existing->add_conflicting($nominal->full_name, 15546 'p', 15547 $actual); 15548 15549 # Don't output a warning for aliases for the block 15550 # properties (unless they start with 'In_') as it is 15551 # expected that there will be conflicts and the block 15552 # form loses. 15553 if ($verbosity >= $NORMAL_VERBOSITY 15554 && ($actual->property != $block || $prefix eq 'In_')) 15555 { 15556 print simple_fold(join_lines(<<END 15557There is already an alias named $proposed_name (from $pre_existing), 15558so not creating this alias for $actual 15559END 15560 ), "", 4); 15561 } 15562 15563 # Keep track for documentation purposes. 15564 $has_In_conflicts++ if $prefix eq 'In_'; 15565 $has_Is_conflicts++ if $prefix eq 'Is_'; 15566 } 15567 } 15568 } 15569 15570 # There are some properties which have No and Yes (and N and Y) as 15571 # property values, but aren't binary, and could possibly be confused with 15572 # binary ones. So create caveats for them. There are tables that are 15573 # named 'No', and tables that are named 'N', but confusion is not likely 15574 # unless they are the same table. For example, N meaning Number or 15575 # Neutral is not likely to cause confusion, so don't add caveats to things 15576 # like them. 15577 foreach my $property (grep { $_->type != $BINARY 15578 && $_->type != $FORCED_BINARY } 15579 property_ref('*')) 15580 { 15581 my $yes = $property->table('Yes'); 15582 if (defined $yes) { 15583 my $y = $property->table('Y'); 15584 if (defined $y && $yes == $y) { 15585 foreach my $alias ($property->aliases) { 15586 $yes->add_conflicting($alias->name); 15587 } 15588 } 15589 } 15590 my $no = $property->table('No'); 15591 if (defined $no) { 15592 my $n = $property->table('N'); 15593 if (defined $n && $no == $n) { 15594 foreach my $alias ($property->aliases) { 15595 $no->add_conflicting($alias->name, 'P'); 15596 } 15597 } 15598 } 15599 } 15600 15601 return; 15602} 15603 15604sub register_file_for_name($table, $directory_ref, $file) { 15605 # Given info about a table and a datafile that it should be associated 15606 # with, register that association 15607 15608 # $directory_ref # Array of the directory path for the file 15609 # $file # The file name in the final directory. 15610 15611 trace "table=$table, file=$file, directory=@$directory_ref, fate=", $table->fate if main::DEBUG && $to_trace; 15612 15613 if ($table->isa('Property')) { 15614 $table->set_file_path(@$directory_ref, $file); 15615 push @map_properties, $table; 15616 15617 # No swash means don't do the rest of this. 15618 return if $table->fate != $ORDINARY 15619 && ! ($table->name =~ /^_/ && $table->fate == $INTERNAL_ONLY); 15620 15621 # Get the path to the file 15622 my @path = $table->file_path; 15623 15624 # Use just the file name if no subdirectory. 15625 shift @path if $path[0] eq File::Spec->curdir(); 15626 15627 my $file = join '/', @path; 15628 15629 # Create a hash entry for Unicode::UCD to get the file that stores this 15630 # property's map table 15631 foreach my $alias ($table->aliases) { 15632 my $name = $alias->name; 15633 if ($name =~ /^_/) { 15634 $strict_property_to_file_of{lc $name} = $file; 15635 } 15636 else { 15637 $loose_property_to_file_of{standardize($name)} = $file; 15638 } 15639 } 15640 15641 # And a way for Unicode::UCD to find the proper key in the SwashInfo 15642 # hash for this property. 15643 $file_to_swash_name{$file} = "To" . $table->swash_name; 15644 return; 15645 } 15646 15647 # Do all of the work for all equivalent tables when called with the leader 15648 # table, so skip if isn't the leader. 15649 return if $table->leader != $table; 15650 15651 # If this is a complement of another file, use that other file instead, 15652 # with a ! prepended to it. 15653 my $complement; 15654 if (($complement = $table->complement) != 0) { 15655 my @directories = $complement->file_path; 15656 15657 # This assumes that the 0th element is something like 'lib', 15658 # the 1th element the property name (in its own directory), like 15659 # 'AHex', and the 2th element the file like 'Y' which will have a .pl 15660 # appended to it later. 15661 $directories[1] =~ s/^/!/; 15662 $file = pop @directories; 15663 $directory_ref =\@directories; 15664 } 15665 15666 # Join all the file path components together, using slashes. 15667 my $full_filename = join('/', @$directory_ref, $file); 15668 15669 # All go in the same subdirectory of unicore, or the special 15670 # pseudo-directory '#' 15671 if ($directory_ref->[0] !~ / ^ $matches_directory | \# $ /x) { 15672 Carp::my_carp("Unexpected directory in " 15673 . join('/', @{$directory_ref}, $file)); 15674 } 15675 15676 # For this table and all its equivalents ... 15677 foreach my $table ($table, $table->equivalents) { 15678 15679 # Associate it with its file internally. Don't include the 15680 # $matches_directory first component 15681 $table->set_file_path(@$directory_ref, $file); 15682 15683 # No swash means don't do the rest of this. 15684 next if $table->isa('Map_Table') && $table->fate != $ORDINARY; 15685 15686 my $sub_filename = join('/', $directory_ref->[1, -1], $file); 15687 15688 my $property = $table->property; 15689 my $property_name = ($property == $perl) 15690 ? "" # 'perl' is never explicitly stated 15691 : standardize($property->name) . '='; 15692 15693 my $is_default = 0; # Is this table the default one for the property? 15694 15695 # To calculate $is_default, we find if this table is the same as the 15696 # default one for the property. But this is complicated by the 15697 # possibility that there is a master table for this one, and the 15698 # information is stored there instead of here. 15699 my $parent = $table->parent; 15700 my $leader_prop = $parent->property; 15701 my $default_map = $leader_prop->default_map; 15702 if (defined $default_map) { 15703 my $default_table = $leader_prop->table($default_map); 15704 $is_default = 1 if defined $default_table && $parent == $default_table; 15705 } 15706 15707 # Calculate the loose name for this table. Mostly it's just its name, 15708 # standardized. But in the case of Perl tables that are single-form 15709 # equivalents to Unicode properties, it is the latter's name. 15710 my $loose_table_name = 15711 ($property != $perl || $leader_prop == $perl) 15712 ? standardize($table->name) 15713 : standardize($parent->name); 15714 15715 my $deprecated = ($table->status eq $DEPRECATED) 15716 ? $table->status_info 15717 : ""; 15718 my $caseless_equivalent = $table->caseless_equivalent; 15719 15720 # And for each of the table's aliases... This inner loop eventually 15721 # goes through all aliases in the UCD that we generate regex match 15722 # files for 15723 foreach my $alias ($table->aliases) { 15724 my $standard = UCD_name($table, $alias); 15725 15726 # Generate an entry in either the loose or strict hashes, which 15727 # will translate the property and alias names combination into the 15728 # file where the table for them is stored. 15729 if ($alias->loose_match) { 15730 if (exists $loose_to_file_of{$standard}) { 15731 Carp::my_carp("Can't change file registered to $loose_to_file_of{$standard} to '$sub_filename'."); 15732 } 15733 else { 15734 $loose_to_file_of{$standard} = $sub_filename; 15735 } 15736 } 15737 else { 15738 if (exists $stricter_to_file_of{$standard}) { 15739 Carp::my_carp("Can't change file registered to $stricter_to_file_of{$standard} to '$sub_filename'."); 15740 } 15741 else { 15742 $stricter_to_file_of{$standard} = $sub_filename; 15743 15744 # Tightly coupled with how Unicode::UCD works, for a 15745 # floating point number that is a whole number, get rid of 15746 # the trailing decimal point and 0's, so that Unicode::UCD 15747 # will work. Also note that this assumes that such a 15748 # number is matched strictly; so if that were to change, 15749 # this would be wrong. 15750 if ((my $integer_name = $alias->name) 15751 =~ s/^ ( -? \d+ ) \.0+ $ /$1/x) 15752 { 15753 $stricter_to_file_of{$property_name . $integer_name} 15754 = $sub_filename; 15755 } 15756 } 15757 } 15758 15759 # For Unicode::UCD, create a mapping of the prop=value to the 15760 # canonical =value for that property. 15761 if ($standard =~ /=/) { 15762 15763 # This could happen if a strict name mapped into an existing 15764 # loose name. In that event, the strict names would have to 15765 # be moved to a new hash. 15766 if (exists($loose_to_standard_value{$standard})) { 15767 Carp::my_carp_bug("'$standard' conflicts with a pre-existing use. Bad News. Continuing anyway"); 15768 } 15769 $loose_to_standard_value{$standard} = $loose_table_name; 15770 } 15771 15772 # Keep a list of the deprecated properties and their filenames 15773 if ($deprecated && $complement == 0) { 15774 $Unicode::UCD::why_deprecated{$sub_filename} = $deprecated; 15775 } 15776 15777 # And a substitute table, if any, for case-insensitive matching 15778 if ($caseless_equivalent != 0) { 15779 $caseless_equivalent_to{$standard} = $caseless_equivalent; 15780 } 15781 15782 # Add to defaults list if the table this alias belongs to is the 15783 # default one 15784 $loose_defaults{$standard} = 1 if $is_default; 15785 } 15786 } 15787 15788 return; 15789} 15790 15791{ # Closure 15792 my %base_names; # Names already used for avoiding DOS 8.3 filesystem 15793 # conflicts 15794 my %full_dir_name_of; # Full length names of directories used. 15795 15796 sub construct_filename($name, $mutable, $directories_ref) { 15797 # Return a file name for a table, based on the table name, but perhaps 15798 # changed to get rid of non-portable characters in it, and to make 15799 # sure that it is unique on a file system that allows the names before 15800 # any period to be at most 8 characters (DOS). While we're at it 15801 # check and complain if there are any directory conflicts. 15802 15803 # $name # The name to start with 15804 # $mutable # Boolean: can it be changed? If no, but 15805 # yet it must be to work properly, a warning 15806 # is given 15807 # $directories_ref # A reference to an array containing the 15808 # path to the file, with each element one path 15809 # component. This is used because the same 15810 # name can be used in different directories. 15811 15812 my $warn = ! defined wantarray; # If true, then if the name is 15813 # changed, a warning is issued as well. 15814 15815 if (! defined $name) { 15816 Carp::my_carp("Undefined name in directory " 15817 . File::Spec->join(@$directories_ref) 15818 . ". '_' used"); 15819 return '_'; 15820 } 15821 15822 # Make sure that no directory names conflict with each other. Look at 15823 # each directory in the input file's path. If it is already in use, 15824 # assume it is correct, and is merely being re-used, but if we 15825 # truncate it to 8 characters, and find that there are two directories 15826 # that are the same for the first 8 characters, but differ after that, 15827 # then that is a problem. 15828 foreach my $directory (@$directories_ref) { 15829 my $short_dir = substr($directory, 0, 8); 15830 if (defined $full_dir_name_of{$short_dir}) { 15831 next if $full_dir_name_of{$short_dir} eq $directory; 15832 Carp::my_carp("Directory $directory conflicts with directory $full_dir_name_of{$short_dir}. Bad News. Continuing anyway"); 15833 } 15834 else { 15835 $full_dir_name_of{$short_dir} = $directory; 15836 } 15837 } 15838 15839 my $path = join '/', @$directories_ref; 15840 $path .= '/' if $path; 15841 15842 # Remove interior underscores. 15843 (my $filename = $name) =~ s/ (?<=.) _ (?=.) //xg; 15844 15845 # Convert the dot in floating point numbers to an underscore 15846 $filename =~ s/\./_/ if $filename =~ / ^ \d+ \. \d+ $ /x; 15847 15848 my $suffix = ""; 15849 15850 # Extract any suffix, delete any non-word character, and truncate to 3 15851 # after the dot 15852 if ($filename =~ m/ ( .*? ) ( \. .* ) /x) { 15853 $filename = $1; 15854 $suffix = $2; 15855 $suffix =~ s/\W+//g; 15856 substr($suffix, 4) = "" if length($suffix) > 4; 15857 } 15858 15859 # Change any non-word character outside the suffix into an underscore, 15860 # and truncate to 8. 15861 $filename =~ s/\W+/_/g; # eg., "L&" -> "L_" 15862 substr($filename, 8) = "" if length($filename) > 8; 15863 15864 # Make sure the basename doesn't conflict with something we 15865 # might have already written. If we have, say, 15866 # InGreekExtended1 15867 # InGreekExtended2 15868 # they become 15869 # InGreekE 15870 # InGreek2 15871 my $warned = 0; 15872 while (my $num = $base_names{$path}{lc "$filename$suffix"}++) { 15873 $num++; # so basenames with numbers start with '2', which 15874 # just looks more natural. 15875 15876 # Want to append $num, but if it'll make the basename longer 15877 # than 8 characters, pre-truncate $filename so that the result 15878 # is acceptable. 15879 my $delta = length($filename) + length($num) - 8; 15880 if ($delta > 0) { 15881 substr($filename, -$delta) = $num; 15882 } 15883 else { 15884 $filename .= $num; 15885 } 15886 if ($warn && ! $warned) { 15887 $warned = 1; 15888 Carp::my_carp("'$path$name' conflicts with another name on a filesystem with 8 significant characters (like DOS). Proceeding anyway."); 15889 } 15890 } 15891 15892 return $filename if $mutable; 15893 15894 # If not changeable, must return the input name, but warn if needed to 15895 # change it beyond shortening it. 15896 if ($name ne $filename 15897 && substr($name, 0, length($filename)) ne $filename) { 15898 Carp::my_carp("'$path$name' had to be changed into '$filename'. Bad News. Proceeding anyway."); 15899 } 15900 return $name; 15901 } 15902} 15903 15904# The pod file contains a very large table. Many of the lines in that table 15905# would exceed a typical output window's size, and so need to be wrapped with 15906# a hanging indent to make them look good. The pod language is really 15907# insufficient here. There is no general construct to do that in pod, so it 15908# is done here by beginning each such line with a space to cause the result to 15909# be output without formatting, and doing all the formatting here. This leads 15910# to the result that if the eventual display window is too narrow it won't 15911# look good, and if the window is too wide, no advantage is taken of that 15912# extra width. A further complication is that the output may be indented by 15913# the formatter so that there is less space than expected. What I (khw) have 15914# done is to assume that that indent is a particular number of spaces based on 15915# what it is in my Linux system; people can always resize their windows if 15916# necessary, but this is obviously less than desirable, but the best that can 15917# be expected. 15918my $automatic_pod_indent = 8; 15919 15920# Try to format so that uses fewest lines, but few long left column entries 15921# slide into the right column. An experiment on 5.1 data yielded the 15922# following percentages that didn't cut into the other side along with the 15923# associated first-column widths 15924# 69% = 24 15925# 80% not too bad except for a few blocks 15926# 90% = 33; # , cuts 353/3053 lines from 37 = 12% 15927# 95% = 37; 15928my $indent_info_column = 27; # 75% of lines didn't have overlap 15929 15930my $FILLER = 3; # Length of initial boiler-plate columns in a pod line 15931 # The 3 is because of: 15932 # 1 for the leading space to tell the pod formatter to 15933 # output as-is 15934 # 1 for the flag 15935 # 1 for the space between the flag and the main data 15936 15937sub format_pod_line($first_column_width, $entry, $info, $status = "", $loose_match = 1 ) { 15938 # Take a pod line and return it, formatted properly 15939 15940 # $entry Contents of left column 15941 # $info Contents of right column 15942 15943 my $flags = ""; 15944 $flags .= $STRICTER if ! $loose_match; 15945 15946 $flags .= $status if $status; 15947 15948 # There is a blank in the left column to cause the pod formatter to 15949 # output the line as-is. 15950 return sprintf " %-*s%-*s %s\n", 15951 # The first * in the format is replaced by this, the -1 is 15952 # to account for the leading blank. There isn't a 15953 # hard-coded blank after this to separate the flags from 15954 # the rest of the line, so that in the unlikely event that 15955 # multiple flags are shown on the same line, they both 15956 # will get displayed at the expense of that separation, 15957 # but since they are left justified, a blank will be 15958 # inserted in the normal case. 15959 $FILLER - 1, 15960 $flags, 15961 15962 # The other * in the format is replaced by this number to 15963 # cause the first main column to right fill with blanks. 15964 # The -1 is for the guaranteed blank following it. 15965 $first_column_width - $FILLER - 1, 15966 $entry, 15967 $info; 15968} 15969 15970my @zero_match_tables; # List of tables that have no matches in this release 15971 15972sub make_re_pod_entries($input_table) { 15973 # This generates the entries for the pod file for a given table. 15974 # Also done at this time are any children tables. The output looks like: 15975 # \p{Common} \p{Script=Common} (Short: \p{Zyyy}) (5178) 15976 15977 # Generate parent and all its children at the same time. 15978 return if $input_table->parent != $input_table; 15979 15980 my $property = $input_table->property; 15981 my $type = $property->type; 15982 my $full_name = $property->full_name; 15983 15984 my $count = $input_table->count; 15985 my $unicode_count; 15986 my $non_unicode_string; 15987 if ($count > $MAX_UNICODE_CODEPOINTS) { 15988 $unicode_count = $count - ($MAX_WORKING_CODEPOINT 15989 - $MAX_UNICODE_CODEPOINT); 15990 $non_unicode_string = " plus all above-Unicode code points"; 15991 } 15992 else { 15993 $unicode_count = $count; 15994 $non_unicode_string = ""; 15995 } 15996 15997 my $string_count = clarify_number($unicode_count) . $non_unicode_string; 15998 15999 my $definition = $input_table->calculate_table_definition; 16000 if ($definition) { 16001 16002 # Save the definition for later use. 16003 $input_table->set_definition($definition); 16004 16005 $definition = ": $definition"; 16006 } 16007 16008 my $status = $input_table->status; 16009 my $status_info = $input_table->status_info; 16010 my $caseless_equivalent = $input_table->caseless_equivalent; 16011 16012 # Don't mention a placeholder equivalent as it isn't to be listed in the 16013 # pod 16014 $caseless_equivalent = 0 if $caseless_equivalent != 0 16015 && $caseless_equivalent->fate > $ORDINARY; 16016 16017 my $entry_for_first_table; # The entry for the first table output. 16018 # Almost certainly, it is the parent. 16019 16020 # For each related table (including itself), we will generate a pod entry 16021 # for each name each table goes by 16022 foreach my $table ($input_table, $input_table->children) { 16023 16024 # Unicode::UCD cannot deal with null string property values, so skip 16025 # any tables that have no non-null names. 16026 next if ! grep { $_->name ne "" } $table->aliases; 16027 16028 # First, gather all the info that applies to this table as a whole. 16029 16030 push @zero_match_tables, $table if $count == 0 16031 # Don't mention special tables 16032 # as being zero length 16033 && $table->fate == $ORDINARY; 16034 16035 my $table_property = $table->property; 16036 16037 # The short name has all the underscores removed, while the full name 16038 # retains them. Later, we decide whether to output a short synonym 16039 # for the full one, we need to compare apples to apples, so we use the 16040 # short name's length including underscores. 16041 my $table_property_short_name_length; 16042 my $table_property_short_name 16043 = $table_property->short_name(\$table_property_short_name_length); 16044 my $table_property_full_name = $table_property->full_name; 16045 16046 # Get how much savings there is in the short name over the full one 16047 # (delta will always be <= 0) 16048 my $table_property_short_delta = $table_property_short_name_length 16049 - length($table_property_full_name); 16050 my @table_description = $table->description; 16051 my @table_note = $table->note; 16052 16053 # Generate an entry for each alias in this table. 16054 my $entry_for_first_alias; # saves the first one encountered. 16055 foreach my $alias ($table->aliases) { 16056 16057 # Skip if not to go in pod. 16058 next unless $alias->make_re_pod_entry; 16059 16060 # Start gathering all the components for the entry 16061 my $name = $alias->name; 16062 16063 # Skip if name is empty, as can't be accessed by regexes. 16064 next if $name eq ""; 16065 16066 my $entry; # Holds the left column, may include extras 16067 my $entry_ref; # To refer to the left column's contents from 16068 # another entry; has no extras 16069 16070 # First the left column of the pod entry. Tables for the $perl 16071 # property always use the single form. 16072 if ($table_property == $perl) { 16073 $entry = "\\p{$name}"; 16074 $entry .= " \\p$name" if length $name == 1; # Show non-braced 16075 # form too 16076 $entry_ref = "\\p{$name}"; 16077 } 16078 else { # Compound form. 16079 16080 # Only generate one entry for all the aliases that mean true 16081 # or false in binary properties. Append a '*' to indicate 16082 # some are missing. (The heading comment notes this.) 16083 my $rhs; 16084 if ($type == $BINARY) { 16085 next if $name ne 'N' && $name ne 'Y'; 16086 $rhs = "$name*"; 16087 } 16088 elsif ($type != $FORCED_BINARY) { 16089 $rhs = $name; 16090 } 16091 else { 16092 16093 # Forced binary properties require special handling. It 16094 # has two sets of tables, one set is true/false; and the 16095 # other set is everything else. Entries are generated for 16096 # each set. Use the Bidi_Mirrored property (which appears 16097 # in all Unicode versions) to get a list of the aliases 16098 # for the true/false tables. Of these, only output the N 16099 # and Y ones, the same as, a regular binary property. And 16100 # output all the rest, same as a non-binary property. 16101 my $bm = property_ref("Bidi_Mirrored"); 16102 if ($name eq 'N' || $name eq 'Y') { 16103 $rhs = "$name*"; 16104 } elsif (grep { $name eq $_->name } $bm->table("Y")->aliases, 16105 $bm->table("N")->aliases) 16106 { 16107 next; 16108 } 16109 else { 16110 $rhs = $name; 16111 } 16112 } 16113 16114 # Colon-space is used to give a little more space to be easier 16115 # to read; 16116 $entry = "\\p{" 16117 . $table_property_full_name 16118 . ": $rhs}"; 16119 16120 # But for the reference to this entry, which will go in the 16121 # right column, where space is at a premium, use equals 16122 # without a space 16123 $entry_ref = "\\p{" . $table_property_full_name . "=$name}"; 16124 } 16125 16126 # Then the right (info) column. This is stored as components of 16127 # an array for the moment, then joined into a string later. For 16128 # non-internal only properties, begin the info with the entry for 16129 # the first table we encountered (if any), as things are ordered 16130 # so that that one is the most descriptive. This leads to the 16131 # info column of an entry being a more descriptive version of the 16132 # name column 16133 my @info; 16134 if ($name =~ /^_/) { 16135 push @info, 16136 '(For internal use by Perl, not necessarily stable)'; 16137 } 16138 elsif ($entry_for_first_alias) { 16139 push @info, $entry_for_first_alias; 16140 } 16141 16142 # If this entry is equivalent to another, add that to the info, 16143 # using the first such table we encountered 16144 if ($entry_for_first_table) { 16145 if (@info) { 16146 push @info, "(= $entry_for_first_table)"; 16147 } 16148 else { 16149 push @info, $entry_for_first_table; 16150 } 16151 } 16152 16153 # If the name is a large integer, add an equivalent with an 16154 # exponent for better readability 16155 if ($name =~ /^[+-]?[\d]+$/ && $name >= 10_000) { 16156 push @info, sprintf "(= %.1e)", $name 16157 } 16158 16159 my $parenthesized = ""; 16160 if (! $entry_for_first_alias) { 16161 16162 # This is the first alias for the current table. The alias 16163 # array is ordered so that this is the fullest, most 16164 # descriptive alias, so it gets the fullest info. The other 16165 # aliases are mostly merely pointers to this one, using the 16166 # information already added above. 16167 16168 # Display any status message, but only on the parent table 16169 if ($status && ! $entry_for_first_table) { 16170 push @info, $status_info; 16171 } 16172 16173 # Put out any descriptive info 16174 if (@table_description || @table_note) { 16175 push @info, join "; ", @table_description, @table_note; 16176 } 16177 16178 # Look to see if there is a shorter name we can point people 16179 # at 16180 my $standard_name = standardize($name); 16181 my $short_name; 16182 my $proposed_short = $table->short_name; 16183 if (defined $proposed_short) { 16184 my $standard_short = standardize($proposed_short); 16185 16186 # If the short name is shorter than the standard one, or 16187 # even if it's not, but the combination of it and its 16188 # short property name (as in \p{prop=short} ($perl doesn't 16189 # have this form)) saves at least two characters, then, 16190 # cause it to be listed as a shorter synonym. 16191 if (length $standard_short < length $standard_name 16192 || ($table_property != $perl 16193 && (length($standard_short) 16194 - length($standard_name) 16195 + $table_property_short_delta) # (<= 0) 16196 < -2)) 16197 { 16198 $short_name = $proposed_short; 16199 if ($table_property != $perl) { 16200 $short_name = $table_property_short_name 16201 . "=$short_name"; 16202 } 16203 $short_name = "\\p{$short_name}"; 16204 } 16205 } 16206 16207 # And if this is a compound form name, see if there is a 16208 # single form equivalent 16209 my $single_form; 16210 if ($table_property != $perl && $table_property != $block) { 16211 16212 # Special case the binary N tables, so that will print 16213 # \P{single}, but use the Y table values to populate 16214 # 'single', as we haven't likewise populated the N table. 16215 # For forced binary tables, we can't just look at the N 16216 # table, but must see if this table is equivalent to the N 16217 # one, as there are two equivalent beasts in these 16218 # properties. 16219 my $test_table; 16220 my $p; 16221 if ( ($type == $BINARY 16222 && $input_table == $property->table('No')) 16223 || ($type == $FORCED_BINARY 16224 && $property->table('No')-> 16225 is_set_equivalent_to($input_table))) 16226 { 16227 $test_table = $property->table('Yes'); 16228 $p = 'P'; 16229 } 16230 else { 16231 $test_table = $input_table; 16232 $p = 'p'; 16233 } 16234 16235 # Look for a single form amongst all the children. 16236 foreach my $table ($test_table->children) { 16237 next if $table->property != $perl; 16238 my $proposed_name = $table->short_name; 16239 next if ! defined $proposed_name; 16240 16241 # Don't mention internal-only properties as a possible 16242 # single form synonym 16243 next if substr($proposed_name, 0, 1) eq '_'; 16244 16245 $proposed_name = "\\$p\{$proposed_name}"; 16246 if (! defined $single_form 16247 || length($proposed_name) < length $single_form) 16248 { 16249 $single_form = $proposed_name; 16250 16251 # The goal here is to find a single form; not the 16252 # shortest possible one. We've already found a 16253 # short name. So, stop at the first single form 16254 # found, which is likely to be closer to the 16255 # original. 16256 last; 16257 } 16258 } 16259 } 16260 16261 # Output both short and single in the same parenthesized 16262 # expression, but with only one of 'Single', 'Short' if there 16263 # are both items. 16264 if ($short_name || $single_form || $table->conflicting) { 16265 $parenthesized .= "Short: $short_name" if $short_name; 16266 if ($short_name && $single_form) { 16267 $parenthesized .= ', '; 16268 } 16269 elsif ($single_form) { 16270 $parenthesized .= 'Single: '; 16271 } 16272 $parenthesized .= $single_form if $single_form; 16273 } 16274 } 16275 16276 if ($caseless_equivalent != 0) { 16277 $parenthesized .= '; ' if $parenthesized ne ""; 16278 $parenthesized .= "/i= " . $caseless_equivalent->complete_name; 16279 } 16280 16281 16282 # Warn if this property isn't the same as one that a 16283 # semi-casual user might expect. The other components of this 16284 # parenthesized structure are calculated only for the first entry 16285 # for this table, but the conflicting is deemed important enough 16286 # to go on every entry. 16287 my $conflicting = join " NOR ", $table->conflicting; 16288 if ($conflicting) { 16289 $parenthesized .= '; ' if $parenthesized ne ""; 16290 $parenthesized .= "NOT $conflicting"; 16291 } 16292 16293 push @info, "($parenthesized)" if $parenthesized; 16294 16295 if ($name =~ /_$/ && $alias->loose_match) { 16296 push @info, "Note the trailing '_' matters in spite of loose matching rules."; 16297 } 16298 16299 if ($table_property != $perl && $table->perl_extension) { 16300 push @info, '(Perl extension)'; 16301 } 16302 my $definition = $table->definition // ""; 16303 $definition = "" if $entry_for_first_alias; 16304 $definition = ": $definition" if $definition; 16305 push @info, "($string_count$definition)"; 16306 16307 # Now, we have both the entry and info so add them to the 16308 # list of all the properties. 16309 push @match_properties, 16310 format_pod_line($indent_info_column, 16311 $entry, 16312 join( " ", @info), 16313 $alias->status, 16314 $alias->loose_match); 16315 16316 $entry_for_first_alias = $entry_ref unless $entry_for_first_alias; 16317 } # End of looping through the aliases for this table. 16318 16319 if (! $entry_for_first_table) { 16320 $entry_for_first_table = $entry_for_first_alias; 16321 } 16322 } # End of looping through all the related tables 16323 return; 16324} 16325 16326sub make_ucd_table_pod_entries($table) { 16327 # Generate the entries for the UCD section of the pod for $table. This 16328 # also calculates if names are ambiguous, so has to be called even if the 16329 # pod is not being output 16330 16331 my $short_name = $table->name; 16332 my $standard_short_name = standardize($short_name); 16333 my $full_name = $table->full_name; 16334 my $standard_full_name = standardize($full_name); 16335 16336 my $full_info = ""; # Text of info column for full-name entries 16337 my $other_info = ""; # Text of info column for short-name entries 16338 my $short_info = ""; # Text of info column for other entries 16339 my $meaning = ""; # Synonym of this table 16340 16341 my $property = ($table->isa('Property')) 16342 ? $table 16343 : $table->parent->property; 16344 16345 my $perl_extension = $table->perl_extension; 16346 my $is_perl_extension_match_table_but_not_dollar_perl 16347 = $property != $perl 16348 && $perl_extension 16349 && $property != $table; 16350 16351 # Get the more official name for perl extensions that aren't 16352 # stand-alone properties 16353 if ($is_perl_extension_match_table_but_not_dollar_perl) { 16354 if ($property->type == $BINARY) { 16355 $meaning = $property->full_name; 16356 } 16357 else { 16358 $meaning = $table->parent->complete_name; 16359 } 16360 } 16361 16362 # There are three types of info column. One for the short name, one for 16363 # the full name, and one for everything else. They mostly are the same, 16364 # so initialize in the same loop. 16365 16366 foreach my $info_ref (\$full_info, \$short_info, \$other_info) { 16367 if ($info_ref != \$full_info) { 16368 16369 # The non-full name columns include the full name 16370 $$info_ref .= $full_name; 16371 } 16372 16373 16374 if ($is_perl_extension_match_table_but_not_dollar_perl) { 16375 16376 # Add the synonymous name for the non-full name entries; and to 16377 # the full-name entry if it adds extra information 16378 if ( standardize($meaning) ne $standard_full_name 16379 || $info_ref == \$other_info 16380 || $info_ref == \$short_info) 16381 { 16382 my $parenthesized = $info_ref != \$full_info; 16383 $$info_ref .= " " if $$info_ref && $parenthesized; 16384 $$info_ref .= "(=" if $parenthesized; 16385 $$info_ref .= "$meaning"; 16386 $$info_ref .= ")" if $parenthesized; 16387 $$info_ref .= "."; 16388 } 16389 } 16390 16391 # And the full-name entry includes the short name, if shorter 16392 if ($info_ref == \$full_info 16393 && length $standard_short_name < length $standard_full_name) 16394 { 16395 $full_info =~ s/\.\Z//; 16396 $full_info .= " " if $full_info; 16397 $full_info .= "(Short: $short_name)"; 16398 } 16399 16400 if ($table->perl_extension) { 16401 $$info_ref =~ s/\.\Z//; 16402 $$info_ref .= ". " if $$info_ref; 16403 $$info_ref .= "(Perl extension)"; 16404 } 16405 } 16406 16407 my $definition; 16408 my $definition_table; 16409 my $type = $table->property->type; 16410 if ($type == $BINARY || $type == $FORCED_BINARY) { 16411 $definition_table = $table->property->table('Y'); 16412 } 16413 elsif ($table->isa('Match_Table')) { 16414 $definition_table = $table; 16415 } 16416 16417 $definition = $definition_table->calculate_table_definition 16418 if defined $definition_table 16419 && $definition_table != 0; 16420 16421 # Add any extra annotations to the full name entry 16422 foreach my $more_info ($table->description, 16423 $definition, 16424 $table->note, 16425 $table->status_info) 16426 { 16427 next unless $more_info; 16428 $full_info =~ s/\.\Z//; 16429 $full_info .= ". " if $full_info; 16430 $full_info .= $more_info; 16431 } 16432 if ($table->property->type == $FORCED_BINARY) { 16433 if ($full_info) { 16434 $full_info =~ s/\.\Z//; 16435 $full_info .= ". "; 16436 } 16437 $full_info .= "This is a combination property which has both:" 16438 . " 1) a map to various string values; and" 16439 . " 2) a map to boolean Y/N, where 'Y' means the" 16440 . " string value is non-empty. Add the prefix 'is'" 16441 . " to the prop_invmap() call to get the latter"; 16442 } 16443 16444 # These keep track if have created full and short name pod entries for the 16445 # property 16446 my $done_full = 0; 16447 my $done_short = 0; 16448 16449 # Every possible name is kept track of, even those that aren't going to be 16450 # output. This way we can be sure to find the ambiguities. 16451 foreach my $alias ($table->aliases) { 16452 my $name = $alias->name; 16453 my $standard = standardize($name); 16454 my $info; 16455 my $output_this = $alias->ucd; 16456 16457 # If the full and short names are the same, we want to output the full 16458 # one's entry, so it has priority. 16459 if ($standard eq $standard_full_name) { 16460 next if $done_full; 16461 $done_full = 1; 16462 $info = $full_info; 16463 } 16464 elsif ($standard eq $standard_short_name) { 16465 next if $done_short; 16466 $done_short = 1; 16467 next if $standard_short_name eq $standard_full_name; 16468 $info = $short_info; 16469 } 16470 else { 16471 $info = $other_info; 16472 } 16473 16474 $combination_property{$standard} = 1 16475 if $table->property->type == $FORCED_BINARY; 16476 16477 # Here, we have set up the two columns for this entry. But if an 16478 # entry already exists for this name, we have to decide which one 16479 # we're going to later output. 16480 if (exists $ucd_pod{$standard}) { 16481 16482 # If the two entries refer to the same property, it's not going to 16483 # be ambiguous. (Likely it's because the names when standardized 16484 # are the same.) But that means if they are different properties, 16485 # there is ambiguity. 16486 if ($ucd_pod{$standard}->{'property'} != $property) { 16487 16488 # Here, we have an ambiguity. This code assumes that one is 16489 # scheduled to be output and one not and that one is a perl 16490 # extension (which is not to be output) and the other isn't. 16491 # If those assumptions are wrong, things have to be rethought. 16492 if ($ucd_pod{$standard}{'output_this'} == $output_this 16493 || $ucd_pod{$standard}{'perl_extension'} == $perl_extension 16494 || $output_this == $perl_extension) 16495 { 16496 Carp::my_carp("Bad news. $property and $ucd_pod{$standard}->{'property'} have unexpected output status and perl-extension combinations. Proceeding anyway."); 16497 } 16498 16499 # We modify the info column of the one being output to 16500 # indicate the ambiguity. Set $which to point to that one's 16501 # info. 16502 my $which; 16503 if ($ucd_pod{$standard}{'output_this'}) { 16504 $which = \$ucd_pod{$standard}->{'info'}; 16505 } 16506 else { 16507 $which = \$info; 16508 $meaning = $ucd_pod{$standard}{'meaning'}; 16509 } 16510 16511 chomp $$which; 16512 $$which =~ s/\.\Z//; 16513 $$which .= "; NOT '$standard' meaning '$meaning'"; 16514 16515 $ambiguous_names{$standard} = 1; 16516 } 16517 16518 # Use the non-perl-extension variant 16519 next unless $ucd_pod{$standard}{'perl_extension'}; 16520 } 16521 16522 # Store enough information about this entry that we can later look for 16523 # ambiguities, and output it properly. 16524 $ucd_pod{$standard} = { 'name' => $name, 16525 'info' => $info, 16526 'meaning' => $meaning, 16527 'output_this' => $output_this, 16528 'perl_extension' => $perl_extension, 16529 'property' => $property, 16530 'status' => $alias->status, 16531 }; 16532 } # End of looping through all this table's aliases 16533 16534 return; 16535} 16536 16537sub pod_alphanumeric_sort { 16538 # Sort pod entries alphanumerically. 16539 16540 # The first few character columns are filler, plus the '\p{'; and get rid 16541 # of all the trailing stuff, starting with the trailing '}', so as to sort 16542 # on just 'Name=Value' 16543 (my $a = lc $a) =~ s/^ .*? \{ //x; 16544 $a =~ s/}.*//; 16545 (my $b = lc $b) =~ s/^ .*? \{ //x; 16546 $b =~ s/}.*//; 16547 16548 # Determine if the two operands are both internal only or both not. 16549 # Character 0 should be a '\'; 1 should be a p; 2 should be '{', so 3 16550 # should be the underscore that begins internal only 16551 my $a_is_internal = (substr($a, 0, 1) eq '_'); 16552 my $b_is_internal = (substr($b, 0, 1) eq '_'); 16553 16554 # Sort so the internals come last in the table instead of first (which the 16555 # leading underscore would otherwise indicate). 16556 if ($a_is_internal != $b_is_internal) { 16557 return 1 if $a_is_internal; 16558 return -1 16559 } 16560 16561 # Determine if the two operands are compound or not, and if so if are 16562 # "numeric" property values or not, like \p{Age: 3.0}. But there are also 16563 # things like \p{Canonical_Combining_Class: CCC133} and \p{Age: V10_0}, 16564 # all of which this considers numeric, and for sorting, looks just at the 16565 # numeric parts. It can also be a rational like \p{Numeric Value=-1/2}. 16566 my $split_re = qr/ 16567 ^ ( [^:=]+ ) # $1 is undef if not a compound form, otherwise is the 16568 # property name 16569 [:=] \s* # The syntax for the compound form 16570 (?: # followed by ... 16571 ( # $2 gets defined if what follows is a "numeric" 16572 # expression, which is ... 16573 ( -? \d+ (?: [.\/] \d+)? # An integer, float, or rational 16574 # number, optionally signed 16575 | [[:alpha:]]{2,} \d+ $ ) # or something like CCC131. Either 16576 # of these go into $3 16577 | ( V \d+ _ \d+ ) # or a Unicode's Age property version 16578 # number, into $4 16579 ) 16580 | .* $ # If not "numeric", accept anything so that $1 gets 16581 # defined if it is any compound form 16582 ) /ix; 16583 my ($a_initial, $a_numeric, $a_number, $a_version) = ($a =~ $split_re); 16584 my ($b_initial, $b_numeric, $b_number, $b_version) = ($b =~ $split_re); 16585 16586 # Sort alphabeticlly on the whole property name if either operand isn't 16587 # compound, or they differ. 16588 return $a cmp $b if ! defined $a_initial 16589 || ! defined $b_initial 16590 || $a_initial ne $b_initial; 16591 16592 if (! defined $a_numeric) { 16593 16594 # If neither is numeric, use alpha sort 16595 return $a cmp $b if ! defined $b_numeric; 16596 return 1; # Sort numeric ahead of alpha 16597 } 16598 16599 # Here $a is numeric 16600 return -1 if ! defined $b_numeric; # Numeric sorts before alpha 16601 16602 # Here they are both numeric in the same property. 16603 # Convert version numbers into regular numbers 16604 if (defined $a_version) { 16605 ($a_number = $a_version) =~ s/^V//i; 16606 $a_number =~ s/_/./; 16607 } 16608 else { # Otherwise get rid of the, e.g., CCC in CCC9 */ 16609 $a_number =~ s/ ^ [[:alpha:]]+ //x; 16610 } 16611 if (defined $b_version) { 16612 ($b_number = $b_version) =~ s/^V//i; 16613 $b_number =~ s/_/./; 16614 } 16615 else { 16616 $b_number =~ s/ ^ [[:alpha:]]+ //x; 16617 } 16618 16619 # Convert rationals to floating for the comparison. 16620 $a_number = eval $a_number if $a_number =~ qr{/}; 16621 $b_number = eval $b_number if $b_number =~ qr{/}; 16622 16623 return $a_number <=> $b_number || $a cmp $b; 16624} 16625 16626sub make_pod () { 16627 # Create the .pod file. This generates the various subsections and then 16628 # combines them in one big HERE document. 16629 16630 my $Is_flags_text = "If an entry has flag(s) at its beginning, like \"$DEPRECATED\", the \"Is_\" form has the same flag(s)"; 16631 16632 return unless defined $pod_directory; 16633 print "Making pod file\n" if $verbosity >= $PROGRESS; 16634 16635 my $exception_message = 16636 '(Any exceptions are individually noted beginning with the word NOT.)'; 16637 my @block_warning; 16638 if (-e 'Blocks.txt') { 16639 16640 # Add the line: '\p{In_*} \p{Block: *}', with the warning message 16641 # if the global $has_In_conflicts indicates we have them. 16642 push @match_properties, format_pod_line($indent_info_column, 16643 '\p{In_*}', 16644 '\p{Block: *}' 16645 . (($has_In_conflicts) 16646 ? " $exception_message" 16647 : ""), 16648 $DISCOURAGED); 16649 @block_warning = << "END"; 16650 16651In particular, matches in the Block property have single forms 16652defined by Perl that begin with C<"In_">, C<"Is_>, or even with no prefix at 16653all, Like all B<DISCOURAGED> forms, these are not stable. For example, 16654C<\\p{Block=Deseret}> can currently be written as C<\\p{In_Deseret}>, 16655C<\\p{Is_Deseret}>, or C<\\p{Deseret}>. But, a new Unicode version may 16656come along that would force Perl to change the meaning of one or more of 16657these, and your program would no longer be correct. Currently there are no 16658such conflicts with the form that begins C<"In_">, but there are many with the 16659other two shortcuts, and Unicode continues to define new properties that begin 16660with C<"In">, so it's quite possible that a conflict will occur in the future. 16661The compound form is guaranteed to not become obsolete, and its meaning is 16662clearer anyway. See L<perlunicode/"Blocks"> for more information about this. 16663 16664User-defined properties must begin with "In" or "Is". These override any 16665Unicode property of the same name. 16666END 16667 } 16668 my $text = $Is_flags_text; 16669 $text = "$exception_message $text" if $has_Is_conflicts; 16670 16671 # And the 'Is_ line'; 16672 push @match_properties, format_pod_line($indent_info_column, 16673 '\p{Is_*}', 16674 "\\p{*} $text"); 16675 push @match_properties, format_pod_line($indent_info_column, 16676 '\p{Name=*}', 16677 "Combination of Name and Name_Alias properties; has special" 16678 . " loose matching rules, for which see Unicode UAX #44"); 16679 push @match_properties, format_pod_line($indent_info_column, 16680 '\p{Na=*}', 16681 '\p{Name=*}'); 16682 16683 # Sort the properties array for output. It is sorted alphabetically 16684 # except numerically for numeric properties, and only output unique lines. 16685 @match_properties = sort pod_alphanumeric_sort uniques @match_properties; 16686 16687 my $formatted_properties = simple_fold(\@match_properties, 16688 "", 16689 # indent succeeding lines by two extra 16690 # which looks better 16691 $indent_info_column + 2, 16692 16693 # shorten the line length by how much 16694 # the formatter indents, so the folded 16695 # line will fit in the space 16696 # presumably available 16697 $automatic_pod_indent); 16698 # Add column headings, indented to be a little more centered, but not 16699 # exactly 16700 $formatted_properties = format_pod_line($indent_info_column, 16701 ' NAME', 16702 ' INFO') 16703 . "\n" 16704 . $formatted_properties; 16705 16706 # Generate pod documentation lines for the tables that match nothing 16707 my $zero_matches = ""; 16708 if (@zero_match_tables) { 16709 @zero_match_tables = uniques(@zero_match_tables); 16710 $zero_matches = join "\n\n", 16711 map { $_ = '=item \p{' . $_->complete_name . "}" } 16712 sort { $a->complete_name cmp $b->complete_name } 16713 @zero_match_tables; 16714 16715 $zero_matches = <<END; 16716 16717=head2 Legal C<\\p{}> and C<\\P{}> constructs that match no characters 16718 16719Unicode has some property-value pairs that currently don't match anything. 16720This happens generally either because they are obsolete, or they exist for 16721symmetry with other forms, but no language has yet been encoded that uses 16722them. In this version of Unicode, the following match zero code points: 16723 16724=over 4 16725 16726$zero_matches 16727 16728=back 16729 16730END 16731 } 16732 16733 # Generate list of properties that we don't accept, grouped by the reasons 16734 # why. This is so only put out the 'why' once, and then list all the 16735 # properties that have that reason under it. 16736 16737 my %why_list; # The keys are the reasons; the values are lists of 16738 # properties that have the key as their reason 16739 16740 # For each property, add it to the list that are suppressed for its reason 16741 # The sort will cause the alphabetically first properties to be added to 16742 # each list first, so each list will be sorted. 16743 foreach my $property (sort keys %why_suppressed) { 16744 next unless $why_suppressed{$property}; 16745 push @{$why_list{$why_suppressed{$property}}}, $property; 16746 } 16747 16748 # For each reason (sorted by the first property that has that reason)... 16749 my @bad_re_properties; 16750 foreach my $why (sort { $why_list{$a}->[0] cmp $why_list{$b}->[0] } 16751 keys %why_list) 16752 { 16753 # Add to the output, all the properties that have that reason. 16754 my $has_item = 0; # Flag if actually output anything. 16755 foreach my $name (@{$why_list{$why}}) { 16756 16757 # Split compound names into $property and $table components 16758 my $property = $name; 16759 my $table; 16760 if ($property =~ / (.*) = (.*) /x) { 16761 $property = $1; 16762 $table = $2; 16763 } 16764 16765 # This release of Unicode may not have a property that is 16766 # suppressed, so don't reference a non-existent one. 16767 $property = property_ref($property); 16768 next if ! defined $property; 16769 16770 # And since this list is only for match tables, don't list the 16771 # ones that don't have match tables. 16772 next if ! $property->to_create_match_tables; 16773 16774 # Find any abbreviation, and turn it into a compound name if this 16775 # is a property=value pair. 16776 my $short_name = $property->name; 16777 $short_name .= '=' . $property->table($table)->name if $table; 16778 16779 # Start with an empty line. 16780 push @bad_re_properties, "\n\n" unless $has_item; 16781 16782 # And add the property as an item for the reason. 16783 push @bad_re_properties, "\n=item I<$name> ($short_name)\n"; 16784 $has_item = 1; 16785 } 16786 16787 # And add the reason under the list of properties, if such a list 16788 # actually got generated. Note that the header got added 16789 # unconditionally before. But pod ignores extra blank lines, so no 16790 # harm. 16791 push @bad_re_properties, "\n$why\n" if $has_item; 16792 16793 } # End of looping through each reason. 16794 16795 if (! @bad_re_properties) { 16796 push @bad_re_properties, 16797 "*** This installation accepts ALL non-Unihan properties ***"; 16798 } 16799 else { 16800 # Add =over only if non-empty to avoid an empty =over/=back section, 16801 # which is considered bad form. 16802 unshift @bad_re_properties, "\n=over 4\n"; 16803 push @bad_re_properties, "\n=back\n"; 16804 } 16805 16806 # Similarly, generate a list of files that we don't use, grouped by the 16807 # reasons why (Don't output if the reason is empty). First, create a hash 16808 # whose keys are the reasons, and whose values are anonymous arrays of all 16809 # the files that share that reason. 16810 my %grouped_by_reason; 16811 foreach my $file (keys %skipped_files) { 16812 next unless $skipped_files{$file}; 16813 push @{$grouped_by_reason{$skipped_files{$file}}}, $file; 16814 } 16815 16816 # Then, sort each group. 16817 foreach my $group (keys %grouped_by_reason) { 16818 @{$grouped_by_reason{$group}} = sort { lc $a cmp lc $b } 16819 @{$grouped_by_reason{$group}} ; 16820 } 16821 16822 # Finally, create the output text. For each reason (sorted by the 16823 # alphabetically first file that has that reason)... 16824 my @unused_files; 16825 foreach my $reason (sort { lc $grouped_by_reason{$a}->[0] 16826 cmp lc $grouped_by_reason{$b}->[0] 16827 } 16828 keys %grouped_by_reason) 16829 { 16830 # Add all the files that have that reason to the output. Start 16831 # with an empty line. 16832 push @unused_files, "\n\n"; 16833 push @unused_files, map { "\n=item F<$_> \n" } 16834 @{$grouped_by_reason{$reason}}; 16835 # And add the reason under the list of files 16836 push @unused_files, "\n$reason\n"; 16837 } 16838 16839 # Similarly, create the output text for the UCD section of the pod 16840 my @ucd_pod; 16841 foreach my $key (keys %ucd_pod) { 16842 next unless $ucd_pod{$key}->{'output_this'}; 16843 push @ucd_pod, format_pod_line($indent_info_column, 16844 $ucd_pod{$key}->{'name'}, 16845 $ucd_pod{$key}->{'info'}, 16846 $ucd_pod{$key}->{'status'}, 16847 ); 16848 } 16849 16850 # Sort alphabetically, and fold for output 16851 @ucd_pod = sort { lc substr($a, 2) cmp lc substr($b, 2) } @ucd_pod; 16852 my $ucd_pod = simple_fold(\@ucd_pod, 16853 ' ', 16854 $indent_info_column, 16855 $automatic_pod_indent); 16856 $ucd_pod = format_pod_line($indent_info_column, 'NAME', ' INFO') 16857 . "\n" 16858 . $ucd_pod; 16859 my $space_hex = sprintf("%02x", ord " "); 16860 local $" = ""; 16861 16862 # Everything is ready to assemble. 16863 my @OUT = << "END"; 16864=begin comment 16865 16866$HEADER 16867 16868To change this file, edit $0 instead. 16869 16870=end comment 16871 16872=head1 NAME 16873 16874$pod_file - Index of Unicode Version $unicode_version character properties in Perl 16875 16876=head1 DESCRIPTION 16877 16878This document provides information about the portion of the Unicode database 16879that deals with character properties, that is the portion that is defined on 16880single code points. (L</Other information in the Unicode data base> 16881below briefly mentions other data that Unicode provides.) 16882 16883Perl can provide access to all non-provisional Unicode character properties, 16884though not all are enabled by default. The omitted ones are the Unihan 16885properties and certain 16886deprecated or Unicode-internal properties. (An installation may choose to 16887recompile Perl's tables to change this. See L</Unicode character 16888properties that are NOT accepted by Perl>.) 16889 16890For most purposes, access to Unicode properties from the Perl core is through 16891regular expression matches, as described in the next section. 16892For some special purposes, and to access the properties that are not suitable 16893for regular expression matching, all the Unicode character properties that 16894Perl handles are accessible via the standard L<Unicode::UCD> module, as 16895described in the section L</Properties accessible through Unicode::UCD>. 16896 16897Perl also provides some additional extensions and short-cut synonyms 16898for Unicode properties. 16899 16900This document merely lists all available properties and does not attempt to 16901explain what each property really means. There is a brief description of each 16902Perl extension; see L<perlunicode/Other Properties> for more information on 16903these. There is some detail about Blocks, Scripts, General_Category, 16904and Bidi_Class in L<perlunicode>, but to find out about the intricacies of the 16905official Unicode properties, refer to the Unicode standard. A good starting 16906place is L<$unicode_reference_url>. 16907 16908Note that you can define your own properties; see 16909L<perlunicode/"User-Defined Character Properties">. 16910 16911=head1 Properties accessible through C<\\p{}> and C<\\P{}> 16912 16913The Perl regular expression C<\\p{}> and C<\\P{}> constructs give access to 16914most of the Unicode character properties. The table below shows all these 16915constructs, both single and compound forms. 16916 16917B<Compound forms> consist of two components, separated by an equals sign or a 16918colon. The first component is the property name, and the second component is 16919the particular value of the property to match against, for example, 16920C<\\p{Script_Extensions: Greek}> and C<\\p{Script_Extensions=Greek}> both mean 16921to match characters whose Script_Extensions property value is Greek. 16922(C<Script_Extensions> is an improved version of the C<Script> property.) 16923 16924B<Single forms>, like C<\\p{Greek}>, are mostly Perl-defined shortcuts for 16925their equivalent compound forms. The table shows these equivalences. (In our 16926example, C<\\p{Greek}> is a just a shortcut for 16927C<\\p{Script_Extensions=Greek}>). There are also a few Perl-defined single 16928forms that are not shortcuts for a compound form. One such is C<\\p{Word}>. 16929These are also listed in the table. 16930 16931In parsing these constructs, Perl always ignores Upper/lower case differences 16932everywhere within the {braces}. Thus C<\\p{Greek}> means the same thing as 16933C<\\p{greek}>. But note that changing the case of the C<"p"> or C<"P"> before 16934the left brace completely changes the meaning of the construct, from "match" 16935(for C<\\p{}>) to "doesn't match" (for C<\\P{}>). Casing in this document is 16936for improved legibility. 16937 16938Also, white space, hyphens, and underscores are normally ignored 16939everywhere between the {braces}, and hence can be freely added or removed 16940even if the C</x> modifier hasn't been specified on the regular expression. 16941But in the table below $a_bold_stricter at the beginning of an entry 16942means that tighter (stricter) rules are used for that entry: 16943 16944=over 4 16945 16946=over 4 16947 16948=item Single form (C<\\p{name}>) tighter rules: 16949 16950White space, hyphens, and underscores ARE significant 16951except for: 16952 16953=over 4 16954 16955=item * white space adjacent to a non-word character 16956 16957=item * underscores separating digits in numbers 16958 16959=back 16960 16961That means, for example, that you can freely add or remove white space 16962adjacent to (but within) the braces without affecting the meaning. 16963 16964=item Compound form (C<\\p{name=value}> or C<\\p{name:value}>) tighter rules: 16965 16966The tighter rules given above for the single form apply to everything to the 16967right of the colon or equals; the looser rules still apply to everything to 16968the left. 16969 16970That means, for example, that you can freely add or remove white space 16971adjacent to (but within) the braces and the colon or equal sign. 16972 16973=back 16974 16975=back 16976 16977Some properties are considered obsolete by Unicode, but still available. 16978There are several varieties of obsolescence: 16979 16980=over 4 16981 16982=over 4 16983 16984=item Stabilized 16985 16986A property may be stabilized. Such a determination does not indicate 16987that the property should or should not be used; instead it is a declaration 16988that the property will not be maintained nor extended for newly encoded 16989characters. Such properties are marked with $a_bold_stabilized in the 16990table. 16991 16992=item Deprecated 16993 16994A property may be deprecated, perhaps because its original intent 16995has been replaced by another property, or because its specification was 16996somehow defective. This means that its use is strongly 16997discouraged, so much so that a warning will be issued if used, unless the 16998regular expression is in the scope of a C<S<no warnings 'deprecated'>> 16999statement. $A_bold_deprecated flags each such entry in the table, and 17000the entry there for the longest, most descriptive version of the property will 17001give the reason it is deprecated, and perhaps advice. Perl may issue such a 17002warning, even for properties that aren't officially deprecated by Unicode, 17003when there used to be characters or code points that were matched by them, but 17004no longer. This is to warn you that your program may not work like it did on 17005earlier Unicode releases. 17006 17007A deprecated property may be made unavailable in a future Perl version, so it 17008is best to move away from them. 17009 17010A deprecated property may also be stabilized, but this fact is not shown. 17011 17012=item Obsolete 17013 17014Properties marked with $a_bold_obsolete in the table are considered (plain) 17015obsolete. Generally this designation is given to properties that Unicode once 17016used for internal purposes (but not any longer). 17017 17018=item Discouraged 17019 17020This is not actually a Unicode-specified obsolescence, but applies to certain 17021Perl extensions that are present for backwards compatibility, but are 17022discouraged from being used. These are not obsolete, but their meanings are 17023not stable. Future Unicode versions could force any of these extensions to be 17024removed without warning, replaced by another property with the same name that 17025means something different. $A_bold_discouraged flags each such entry in the 17026table. Use the equivalent shown instead. 17027 17028@block_warning 17029 17030=back 17031 17032=back 17033 17034The table below has two columns. The left column contains the C<\\p{}> 17035constructs to look up, possibly preceded by the flags mentioned above; and 17036the right column contains information about them, like a description, or 17037synonyms. The table shows both the single and compound forms for each 17038property that has them. If the left column is a short name for a property, 17039the right column will give its longer, more descriptive name; and if the left 17040column is the longest name, the right column will show any equivalent shortest 17041name, in both single and compound forms if applicable. 17042 17043If braces are not needed to specify a property (e.g., C<\\pL>), the left 17044column contains both forms, with and without braces. 17045 17046The right column will also caution you if a property means something different 17047than what might normally be expected. 17048 17049All single forms are Perl extensions; a few compound forms are as well, and 17050are noted as such. 17051 17052Numbers in (parentheses) indicate the total number of Unicode code points 17053matched by the property. For the entries that give the longest, most 17054descriptive version of the property, the count is followed by a list of some 17055of the code points matched by it. The list includes all the matched 17056characters in the 0-255 range, enclosed in the familiar [brackets] the same as 17057a regular expression bracketed character class. Following that, the next few 17058higher matching ranges are also given. To avoid visual ambiguity, the SPACE 17059character is represented as C<\\x$space_hex>. 17060 17061For emphasis, those properties that match no code points at all are listed as 17062well in a separate section following the table. 17063 17064Most properties match the same code points regardless of whether C<"/i"> 17065case-insensitive matching is specified or not. But a few properties are 17066affected. These are shown with the notation S<C<(/i= I<other_property>)>> 17067in the second column. Under case-insensitive matching they match the 17068same code pode points as the property I<other_property>. 17069 17070There is no description given for most non-Perl defined properties (See 17071L<$unicode_reference_url> for that). 17072 17073For compactness, 'B<*>' is used as a wildcard instead of showing all possible 17074combinations. For example, entries like: 17075 17076 \\p{Gc: *} \\p{General_Category: *} 17077 17078mean that 'Gc' is a synonym for 'General_Category', and anything that is valid 17079for the latter is also valid for the former. Similarly, 17080 17081 \\p{Is_*} \\p{*} 17082 17083means that if and only if, for example, C<\\p{Foo}> exists, then 17084C<\\p{Is_Foo}> and C<\\p{IsFoo}> are also valid and all mean the same thing. 17085And similarly, C<\\p{Foo=Bar}> means the same as C<\\p{Is_Foo=Bar}> and 17086C<\\p{IsFoo=Bar}>. "*" here is restricted to something not beginning with an 17087underscore. 17088 17089Also, in binary properties, 'Yes', 'T', and 'True' are all synonyms for 'Y'. 17090And 'No', 'F', and 'False' are all synonyms for 'N'. The table shows 'Y*' and 17091'N*' to indicate this, and doesn't have separate entries for the other 17092possibilities. Note that not all properties which have values 'Yes' and 'No' 17093are binary, and they have all their values spelled out without using this wild 17094card, and a C<NOT> clause in their description that highlights their not being 17095binary. These also require the compound form to match them, whereas true 17096binary properties have both single and compound forms available. 17097 17098Note that all non-essential underscores are removed in the display of the 17099short names below. 17100 17101B<Legend summary:> 17102 17103=over 4 17104 17105=item Z<>B<*> is a wild-card 17106 17107=item B<(\\d+)> in the info column gives the number of Unicode code points matched 17108by this property. 17109 17110=item B<$DEPRECATED> means this is deprecated. 17111 17112=item B<$OBSOLETE> means this is obsolete. 17113 17114=item B<$STABILIZED> means this is stabilized. 17115 17116=item B<$STRICTER> means tighter (stricter) name matching applies. 17117 17118=item B<$DISCOURAGED> means use of this form is discouraged, and may not be 17119stable. 17120 17121=back 17122 17123$formatted_properties 17124 17125$zero_matches 17126 17127=head1 Properties accessible through Unicode::UCD 17128 17129The value of any Unicode (not including Perl extensions) character 17130property mentioned above for any single code point is available through 17131L<Unicode::UCD/charprop()>. L<Unicode::UCD/charprops_all()> returns the 17132values of all the Unicode properties for a given code point. 17133 17134Besides these, all the Unicode character properties mentioned above 17135(except for those marked as for internal use by Perl) are also 17136accessible by L<Unicode::UCD/prop_invlist()>. 17137 17138Due to their nature, not all Unicode character properties are suitable for 17139regular expression matches, nor C<prop_invlist()>. The remaining 17140non-provisional, non-internal ones are accessible via 17141L<Unicode::UCD/prop_invmap()> (except for those that this Perl installation 17142hasn't included; see L<below for which those are|/Unicode character properties 17143that are NOT accepted by Perl>). 17144 17145For compatibility with other parts of Perl, all the single forms given in the 17146table in the L<section above|/Properties accessible through \\p{} and \\P{}> 17147are recognized. BUT, there are some ambiguities between some Perl extensions 17148and the Unicode properties, all of which are silently resolved in favor of the 17149official Unicode property. To avoid surprises, you should only use 17150C<prop_invmap()> for forms listed in the table below, which omits the 17151non-recommended ones. The affected forms are the Perl single form equivalents 17152of Unicode properties, such as C<\\p{sc}> being a single-form equivalent of 17153C<\\p{gc=sc}>, which is treated by C<prop_invmap()> as the C<Script> property, 17154whose short name is C<sc>. The table indicates the current ambiguities in the 17155INFO column, beginning with the word C<"NOT">. 17156 17157The standard Unicode properties listed below are documented in 17158L<$unicode_reference_url>; Perl_Decimal_Digit is documented in 17159L<Unicode::UCD/prop_invmap()>. The other Perl extensions are in 17160L<perlunicode/Other Properties>; 17161 17162The first column in the table is a name for the property; the second column is 17163an alternative name, if any, plus possibly some annotations. The alternative 17164name is the property's full name, unless that would simply repeat the first 17165column, in which case the second column indicates the property's short name 17166(if different). The annotations are given only in the entry for the full 17167name. The annotations for binary properties include a list of the first few 17168ranges that the property matches. To avoid any ambiguity, the SPACE character 17169is represented as C<\\x$space_hex>. 17170 17171If a property is obsolete, etc, the entry will be flagged with the same 17172characters used in the table in the L<section above|/Properties accessible 17173through \\p{} and \\P{}>, like B<$DEPRECATED> or B<$STABILIZED>. 17174 17175$ucd_pod 17176 17177=head1 Properties accessible through other means 17178 17179Certain properties are accessible also via core function calls. These are: 17180 17181 Lowercase_Mapping lc() and lcfirst() 17182 Titlecase_Mapping ucfirst() 17183 Uppercase_Mapping uc() 17184 17185Also, Case_Folding is accessible through the C</i> modifier in regular 17186expressions, the C<\\F> transliteration escape, and the C<L<fc|perlfunc/fc>> 17187operator. 17188 17189Besides being able to say C<\\p{Name=...}>, the Name and Name_Aliases 17190properties are accessible through the C<\\N{}> interpolation in double-quoted 17191strings and regular expressions; and functions C<charnames::viacode()>, 17192C<charnames::vianame()>, and C<charnames::string_vianame()> (which require a 17193C<use charnames ();> to be specified. 17194 17195Finally, most properties related to decomposition are accessible via 17196L<Unicode::Normalize>. 17197 17198=head1 Unicode character properties that are NOT accepted by Perl 17199 17200Perl will generate an error for a few character properties in Unicode when 17201used in a regular expression. The non-Unihan ones are listed below, with the 17202reasons they are not accepted, perhaps with work-arounds. The short names for 17203the properties are listed enclosed in (parentheses). 17204As described after the list, an installation can change the defaults and choose 17205to accept any of these. The list is machine generated based on the 17206choices made for the installation that generated this document. 17207 17208@bad_re_properties 17209 17210An installation can choose to allow any of these to be matched by downloading 17211the Unicode database from L<http://www.unicode.org/Public/> to 17212C<\$Config{privlib}>/F<unicore/> in the Perl source tree, changing the 17213controlling lists contained in the program 17214C<\$Config{privlib}>/F<unicore/mktables> and then re-compiling and installing. 17215(C<\%Config> is available from the Config module). 17216 17217Also, perl can be recompiled to operate on an earlier version of the Unicode 17218standard. Further information is at 17219C<\$Config{privlib}>/F<unicore/README.perl>. 17220 17221=head1 Other information in the Unicode data base 17222 17223The Unicode data base is delivered in two different formats. The XML version 17224is valid for more modern Unicode releases. The other version is a collection 17225of files. The two are intended to give equivalent information. Perl uses the 17226older form; this allows you to recompile Perl to use early Unicode releases. 17227 17228The only non-character property that Perl currently supports is Named 17229Sequences, in which a sequence of code points 17230is given a name and generally treated as a single entity. (Perl supports 17231these via the C<\\N{...}> double-quotish construct, 17232L<charnames/charnames::string_vianame(name)>, and L<Unicode::UCD/namedseq()>. 17233 17234Below is a list of the files in the Unicode data base that Perl doesn't 17235currently use, along with very brief descriptions of their purposes. 17236Some of the names of the files have been shortened from those that Unicode 17237uses, in order to allow them to be distinguishable from similarly named files 17238on file systems for which only the first 8 characters of a name are 17239significant. 17240 17241=over 4 17242 17243@unused_files 17244 17245=back 17246 17247=head1 SEE ALSO 17248 17249L<$unicode_reference_url> 17250 17251L<perlrecharclass> 17252 17253L<perlunicode> 17254 17255END 17256 17257 # And write it. The 0 means no utf8. 17258 main::write([ $pod_directory, "$pod_file.pod" ], 0, \@OUT); 17259 return; 17260} 17261 17262sub make_Name_pm () { 17263 # Create and write Name.pm, which contains subroutines and data to use in 17264 # conjunction with Name.pl 17265 17266 # Maybe there's nothing to do. 17267 return unless $has_hangul_syllables || @code_points_ending_in_code_point; 17268 17269 my @name = <<END; 17270$HEADER 17271$INTERNAL_ONLY_HEADER 17272 17273=head1 NAME -- Internal generated file for use by charnames 17274 17275=cut 17276 17277END 17278 17279 # Convert these structures to output format. 17280 my $code_points_ending_in_code_point = 17281 main::simple_dumper(\@code_points_ending_in_code_point, 17282 ' ' x 8); 17283 my $names = main::simple_dumper(\%names_ending_in_code_point, 17284 ' ' x 8); 17285 my $loose_names = main::simple_dumper(\%loose_names_ending_in_code_point, 17286 ' ' x 8); 17287 17288 # Do the same with the Hangul names, 17289 my $jamo; 17290 my $jamo_l; 17291 my $jamo_v; 17292 my $jamo_t; 17293 my $jamo_re; 17294 if ($has_hangul_syllables) { 17295 17296 # Construct a regular expression of all the possible 17297 # combinations of the Hangul syllables. 17298 my @L_re; # Leading consonants 17299 for my $i ($LBase .. $LBase + $LCount - 1) { 17300 push @L_re, $Jamo{$i} 17301 } 17302 my @V_re; # Middle vowels 17303 for my $i ($VBase .. $VBase + $VCount - 1) { 17304 push @V_re, $Jamo{$i} 17305 } 17306 my @T_re; # Trailing consonants 17307 for my $i ($TBase + 1 .. $TBase + $TCount - 1) { 17308 push @T_re, $Jamo{$i} 17309 } 17310 17311 # The whole re is made up of the L V T combination. 17312 $jamo_re = '(' 17313 . join ('|', sort @L_re) 17314 . ')(' 17315 . join ('|', sort @V_re) 17316 . ')(' 17317 . join ('|', sort @T_re) 17318 . ')?'; 17319 17320 # These hashes needed by the algorithm were generated 17321 # during reading of the Jamo.txt file 17322 $jamo = main::simple_dumper(\%Jamo, ' ' x 8); 17323 $jamo_l = main::simple_dumper(\%Jamo_L, ' ' x 8); 17324 $jamo_v = main::simple_dumper(\%Jamo_V, ' ' x 8); 17325 $jamo_t = main::simple_dumper(\%Jamo_T, ' ' x 8); 17326 } 17327 17328 push @name, <<END; 17329 17330package charnames; 17331 17332# This module contains machine-generated tables and code for the 17333# algorithmically-determinable Unicode character names. The following 17334# routines can be used to translate between name and code point and vice versa 17335 17336{ # Closure 17337 17338 # Matches legal code point. 4-6 hex numbers, If there are 6, the first 17339 # two must be 10; if there are 5, the first must not be a 0. Written this 17340 # way to decrease backtracking. The first regex allows the code point to 17341 # be at the end of a word, but to work properly, the word shouldn't end 17342 # with a valid hex character. The second one won't match a code point at 17343 # the end of a word, and doesn't have the run-on issue 17344 my \$run_on_code_point_re = qr/$run_on_code_point_re/; 17345 my \$code_point_re = qr/$code_point_re/; 17346 17347 # In the following hash, the keys are the bases of names which include 17348 # the code point in the name, like CJK UNIFIED IDEOGRAPH-4E01. The value 17349 # of each key is another hash which is used to get the low and high ends 17350 # for each range of code points that apply to the name. 17351 my %names_ending_in_code_point = ( 17352$names 17353 ); 17354 17355 # The following hash is a copy of the previous one, except is for loose 17356 # matching, so each name has blanks and dashes squeezed out 17357 my %loose_names_ending_in_code_point = ( 17358$loose_names 17359 ); 17360 17361 # And the following array gives the inverse mapping from code points to 17362 # names. Lowest code points are first 17363 \@code_points_ending_in_code_point = ( 17364$code_points_ending_in_code_point 17365 ); 17366 17367 # Is exportable, make read-only 17368 Internals::SvREADONLY(\@code_points_ending_in_code_point, 1); 17369END 17370 # Earlier releases didn't have Jamos. No sense outputting 17371 # them unless will be used. 17372 if ($has_hangul_syllables) { 17373 push @name, <<END; 17374 17375 # Convert from code point to Jamo short name for use in composing Hangul 17376 # syllable names 17377 my %Jamo = ( 17378$jamo 17379 ); 17380 17381 # Leading consonant (can be null) 17382 my %Jamo_L = ( 17383$jamo_l 17384 ); 17385 17386 # Vowel 17387 my %Jamo_V = ( 17388$jamo_v 17389 ); 17390 17391 # Optional trailing consonant 17392 my %Jamo_T = ( 17393$jamo_t 17394 ); 17395 17396 # Computed re that splits up a Hangul name into LVT or LV syllables 17397 my \$syllable_re = qr/$jamo_re/; 17398 17399 my \$HANGUL_SYLLABLE = "HANGUL SYLLABLE "; 17400 my \$loose_HANGUL_SYLLABLE = "HANGULSYLLABLE"; 17401 17402 # These constants names and values were taken from the Unicode standard, 17403 # version 5.1, section 3.12. They are used in conjunction with Hangul 17404 # syllables 17405 my \$SBase = $SBase_string; 17406 my \$LBase = $LBase_string; 17407 my \$VBase = $VBase_string; 17408 my \$TBase = $TBase_string; 17409 my \$SCount = $SCount; 17410 my \$LCount = $LCount; 17411 my \$VCount = $VCount; 17412 my \$TCount = $TCount; 17413 my \$NCount = \$VCount * \$TCount; 17414END 17415 } # End of has Jamos 17416 17417 push @name, << 'END'; 17418 17419 sub name_to_code_point_special { 17420 my ($name, $loose) = @_; 17421 17422 # Returns undef if not one of the specially handled names; otherwise 17423 # returns the code point equivalent to the input name 17424 # $loose is non-zero if to use loose matching, 'name' in that case 17425 # must be input as upper case with all blanks and dashes squeezed out. 17426END 17427 if ($has_hangul_syllables) { 17428 push @name, << 'END'; 17429 17430 if ((! $loose && $name =~ s/$HANGUL_SYLLABLE//) 17431 || ($loose && $name =~ s/$loose_HANGUL_SYLLABLE//)) 17432 { 17433 return if $name !~ qr/^$syllable_re$/; 17434 my $L = $Jamo_L{$1}; 17435 my $V = $Jamo_V{$2}; 17436 my $T = (defined $3) ? $Jamo_T{$3} : 0; 17437 return ($L * $VCount + $V) * $TCount + $T + $SBase; 17438 } 17439END 17440 } 17441 push @name, << 'END'; 17442 17443 # Name must end in 'code_point' for this to handle. 17444 return if (($loose && $name !~ /^ (.*?) ($run_on_code_point_re) $/x) 17445 || (! $loose && $name !~ /^ (.*) ($code_point_re) $/x)); 17446 17447 my $base = $1; 17448 my $code_point = CORE::hex $2; 17449 my $names_ref; 17450 17451 if ($loose) { 17452 $names_ref = \%loose_names_ending_in_code_point; 17453 } 17454 else { 17455 return if $base !~ s/-$//; 17456 $names_ref = \%names_ending_in_code_point; 17457 } 17458 17459 # Name must be one of the ones which has the code point in it. 17460 return if ! $names_ref->{$base}; 17461 17462 # Look through the list of ranges that apply to this name to see if 17463 # the code point is in one of them. 17464 for (my $i = 0; $i < scalar @{$names_ref->{$base}{'low'}}; $i++) { 17465 return if $names_ref->{$base}{'low'}->[$i] > $code_point; 17466 next if $names_ref->{$base}{'high'}->[$i] < $code_point; 17467 17468 # Here, the code point is in the range. 17469 return $code_point; 17470 } 17471 17472 # Here, looked like the name had a code point number in it, but 17473 # did not match one of the valid ones. 17474 return; 17475 } 17476 17477 sub code_point_to_name_special { 17478 my $code_point = shift; 17479 17480 # Returns the name of a code point if algorithmically determinable; 17481 # undef if not 17482END 17483 if ($has_hangul_syllables) { 17484 push @name, << 'END'; 17485 17486 # If in the Hangul range, calculate the name based on Unicode's 17487 # algorithm 17488 if ($code_point >= $SBase && $code_point <= $SBase + $SCount -1) { 17489 use integer; 17490 my $SIndex = $code_point - $SBase; 17491 my $L = $LBase + $SIndex / $NCount; 17492 my $V = $VBase + ($SIndex % $NCount) / $TCount; 17493 my $T = $TBase + $SIndex % $TCount; 17494 $name = "$HANGUL_SYLLABLE$Jamo{$L}$Jamo{$V}"; 17495 $name .= $Jamo{$T} if $T != $TBase; 17496 return $name; 17497 } 17498END 17499 } 17500 push @name, << 'END'; 17501 17502 # Look through list of these code points for one in range. 17503 foreach my $hash (@code_points_ending_in_code_point) { 17504 return if $code_point < $hash->{'low'}; 17505 if ($code_point <= $hash->{'high'}) { 17506 return sprintf("%s-%04X", $hash->{'name'}, $code_point); 17507 } 17508 } 17509 return; # None found 17510 } 17511} # End closure 17512 175131; 17514END 17515 17516 main::write("Name.pm", 0, \@name); # The 0 means no utf8. 17517 return; 17518} 17519 17520sub make_UCD () { 17521 # Create and write UCD.pl, which passes info about the tables to 17522 # Unicode::UCD 17523 17524 # Stringify structures for output 17525 my $loose_property_name_of 17526 = simple_dumper(\%loose_property_name_of, ' ' x 4); 17527 chomp $loose_property_name_of; 17528 17529 my $strict_property_name_of 17530 = simple_dumper(\%strict_property_name_of, ' ' x 4); 17531 chomp $strict_property_name_of; 17532 17533 my $stricter_to_file_of = simple_dumper(\%stricter_to_file_of, ' ' x 4); 17534 chomp $stricter_to_file_of; 17535 17536 my $inline_definitions = simple_dumper(\@inline_definitions, " " x 4); 17537 chomp $inline_definitions; 17538 17539 my $loose_to_file_of = simple_dumper(\%loose_to_file_of, ' ' x 4); 17540 chomp $loose_to_file_of; 17541 17542 my $nv_floating_to_rational 17543 = simple_dumper(\%nv_floating_to_rational, ' ' x 4); 17544 chomp $nv_floating_to_rational; 17545 17546 my $why_deprecated = simple_dumper(\%Unicode::UCD::why_deprecated, ' ' x 4); 17547 chomp $why_deprecated; 17548 17549 # We set the key to the file when we associated files with tables, but we 17550 # couldn't do the same for the value then, as we might not have the file 17551 # for the alternate table figured out at that time. 17552 foreach my $cased (keys %caseless_equivalent_to) { 17553 my @path = $caseless_equivalent_to{$cased}->file_path; 17554 my $path; 17555 if ($path[0] eq "#") { # Pseudo-directory '#' 17556 $path = join '/', @path; 17557 } 17558 else { # Gets rid of lib/ 17559 $path = join '/', @path[1, -1]; 17560 } 17561 $caseless_equivalent_to{$cased} = $path; 17562 } 17563 my $caseless_equivalent_to 17564 = simple_dumper(\%caseless_equivalent_to, ' ' x 4); 17565 chomp $caseless_equivalent_to; 17566 17567 my $loose_property_to_file_of 17568 = simple_dumper(\%loose_property_to_file_of, ' ' x 4); 17569 chomp $loose_property_to_file_of; 17570 17571 my $strict_property_to_file_of 17572 = simple_dumper(\%strict_property_to_file_of, ' ' x 4); 17573 chomp $strict_property_to_file_of; 17574 17575 my $file_to_swash_name = simple_dumper(\%file_to_swash_name, ' ' x 4); 17576 chomp $file_to_swash_name; 17577 17578 # Create a mapping from each alias of Perl single-form extensions to all 17579 # its equivalent aliases, for quick look-up. 17580 my %perlprop_to_aliases; 17581 foreach my $table ($perl->tables) { 17582 17583 # First create the list of the aliases of each extension 17584 my @aliases_list; # List of legal aliases for this extension 17585 17586 my $table_name = $table->name; 17587 my $standard_table_name = standardize($table_name); 17588 my $table_full_name = $table->full_name; 17589 my $standard_table_full_name = standardize($table_full_name); 17590 17591 # Make sure that the list has both the short and full names 17592 push @aliases_list, $table_name, $table_full_name; 17593 17594 my $found_ucd = 0; # ? Did we actually get an alias that should be 17595 # output for this table 17596 17597 # Go through all the aliases (including the two just added), and add 17598 # any new unique ones to the list 17599 foreach my $alias ($table->aliases) { 17600 17601 # Skip non-legal names 17602 next unless $alias->ok_as_filename; 17603 next unless $alias->ucd; 17604 17605 $found_ucd = 1; # have at least one legal name 17606 17607 my $name = $alias->name; 17608 my $standard = standardize($name); 17609 17610 # Don't repeat a name that is equivalent to one already on the 17611 # list 17612 next if $standard eq $standard_table_name; 17613 next if $standard eq $standard_table_full_name; 17614 17615 push @aliases_list, $name; 17616 } 17617 17618 # If there were no legal names, don't output anything. 17619 next unless $found_ucd; 17620 17621 # To conserve memory in the program reading these in, omit full names 17622 # that are identical to the short name, when those are the only two 17623 # aliases for the property. 17624 if (@aliases_list == 2 && $aliases_list[0] eq $aliases_list[1]) { 17625 pop @aliases_list; 17626 } 17627 17628 # Here, @aliases_list is the list of all the aliases that this 17629 # extension legally has. Now can create a map to it from each legal 17630 # standardized alias 17631 foreach my $alias ($table->aliases) { 17632 next unless $alias->ucd; 17633 next unless $alias->ok_as_filename; 17634 push @{$perlprop_to_aliases{standardize($alias->name)}}, 17635 uniques @aliases_list; 17636 } 17637 } 17638 17639 # Make a list of all combinations of properties/values that are suppressed. 17640 my @suppressed; 17641 if (! $debug_skip) { # This tends to fail in this debug mode 17642 foreach my $property_name (keys %why_suppressed) { 17643 17644 # Just the value 17645 my $value_name = $1 if $property_name =~ s/ = ( .* ) //x; 17646 17647 # The hash may contain properties not in this release of Unicode 17648 next unless defined (my $property = property_ref($property_name)); 17649 17650 # Find all combinations 17651 foreach my $prop_alias ($property->aliases) { 17652 my $prop_alias_name = standardize($prop_alias->name); 17653 17654 # If no =value, there's just one combination possible for this 17655 if (! $value_name) { 17656 17657 # The property may be suppressed, but there may be a proxy 17658 # for it, so it shouldn't be listed as suppressed 17659 next if $prop_alias->ucd; 17660 push @suppressed, $prop_alias_name; 17661 } 17662 else { # Otherwise 17663 foreach my $value_alias 17664 ($property->table($value_name)->aliases) 17665 { 17666 next if $value_alias->ucd; 17667 17668 push @suppressed, "$prop_alias_name=" 17669 . standardize($value_alias->name); 17670 } 17671 } 17672 } 17673 } 17674 } 17675 @suppressed = sort @suppressed; # So doesn't change between runs of this 17676 # program 17677 17678 # Convert the structure below (designed for Name.pm) to a form that UCD 17679 # wants, so it doesn't have to modify it at all; i.e. so that it includes 17680 # an element for the Hangul syllables in the appropriate place, and 17681 # otherwise changes the name to include the "-<code point>" suffix. 17682 my @algorithm_names; 17683 my $done_hangul = $v_version lt v2.0.0; # Hanguls as we know them came 17684 # along in this version 17685 # Copy it linearly. 17686 for my $i (0 .. @code_points_ending_in_code_point - 1) { 17687 17688 # Insert the hanguls in the correct place. 17689 if (! $done_hangul 17690 && $code_points_ending_in_code_point[$i]->{'low'} > $SBase) 17691 { 17692 $done_hangul = 1; 17693 push @algorithm_names, { low => $SBase, 17694 high => $SBase + $SCount - 1, 17695 name => '<hangul syllable>', 17696 }; 17697 } 17698 17699 # Copy the current entry, modified. 17700 push @algorithm_names, { 17701 low => $code_points_ending_in_code_point[$i]->{'low'}, 17702 high => $code_points_ending_in_code_point[$i]->{'high'}, 17703 name => 17704 "$code_points_ending_in_code_point[$i]->{'name'}-<code point>", 17705 }; 17706 } 17707 17708 # Serialize these structures for output. 17709 my $loose_to_standard_value 17710 = simple_dumper(\%loose_to_standard_value, ' ' x 4); 17711 chomp $loose_to_standard_value; 17712 17713 my $string_property_loose_to_name 17714 = simple_dumper(\%string_property_loose_to_name, ' ' x 4); 17715 chomp $string_property_loose_to_name; 17716 17717 my $perlprop_to_aliases = simple_dumper(\%perlprop_to_aliases, ' ' x 4); 17718 chomp $perlprop_to_aliases; 17719 17720 my $prop_aliases = simple_dumper(\%prop_aliases, ' ' x 4); 17721 chomp $prop_aliases; 17722 17723 my $prop_value_aliases = simple_dumper(\%prop_value_aliases, ' ' x 4); 17724 chomp $prop_value_aliases; 17725 17726 my $suppressed = (@suppressed) ? simple_dumper(\@suppressed, ' ' x 4) : ""; 17727 chomp $suppressed; 17728 17729 my $algorithm_names = simple_dumper(\@algorithm_names, ' ' x 4); 17730 chomp $algorithm_names; 17731 17732 my $ambiguous_names = simple_dumper(\%ambiguous_names, ' ' x 4); 17733 chomp $ambiguous_names; 17734 17735 my $combination_property = simple_dumper(\%combination_property, ' ' x 4); 17736 chomp $combination_property; 17737 17738 my $loose_defaults = simple_dumper(\%loose_defaults, ' ' x 4); 17739 chomp $loose_defaults; 17740 17741 my @ucd = <<END; 17742$HEADER 17743$INTERNAL_ONLY_HEADER 17744 17745# This file is for the use of Unicode::UCD 17746 17747# Highest legal Unicode code point 17748\$Unicode::UCD::MAX_UNICODE_CODEPOINT = 0x$MAX_UNICODE_CODEPOINT_STRING; 17749 17750# Hangul syllables 17751\$Unicode::UCD::HANGUL_BEGIN = $SBase_string; 17752\$Unicode::UCD::HANGUL_COUNT = $SCount; 17753 17754# Maps Unicode (not Perl single-form extensions) property names in loose 17755# standard form to their corresponding standard names 17756\%Unicode::UCD::loose_property_name_of = ( 17757$loose_property_name_of 17758); 17759 17760# Same, but strict names 17761\%Unicode::UCD::strict_property_name_of = ( 17762$strict_property_name_of 17763); 17764 17765# Gives the definitions (in the form of inversion lists) for those properties 17766# whose definitions aren't kept in files 17767\@Unicode::UCD::inline_definitions = ( 17768$inline_definitions 17769); 17770 17771# Maps property, table to file for those using stricter matching. For paths 17772# whose directory is '#', the file is in the form of a numeric index into 17773# \@inline_definitions 17774\%Unicode::UCD::stricter_to_file_of = ( 17775$stricter_to_file_of 17776); 17777 17778# Maps property, table to file for those using loose matching. For paths 17779# whose directory is '#', the file is in the form of a numeric index into 17780# \@inline_definitions 17781\%Unicode::UCD::loose_to_file_of = ( 17782$loose_to_file_of 17783); 17784 17785# Maps floating point to fractional form 17786\%Unicode::UCD::nv_floating_to_rational = ( 17787$nv_floating_to_rational 17788); 17789 17790# If a %e floating point number doesn't have this number of digits in it after 17791# the decimal point to get this close to a fraction, it isn't considered to be 17792# that fraction even if all the digits it does have match. 17793\$Unicode::UCD::e_precision = $E_FLOAT_PRECISION; 17794 17795# Deprecated tables to generate a warning for. The key is the file containing 17796# the table, so as to avoid duplication, as many property names can map to the 17797# file, but we only need one entry for all of them. 17798\%Unicode::UCD::why_deprecated = ( 17799$why_deprecated 17800); 17801 17802# A few properties have different behavior under /i matching. This maps 17803# those to substitute files to use under /i. 17804\%Unicode::UCD::caseless_equivalent = ( 17805$caseless_equivalent_to 17806); 17807 17808# Property names to mapping files 17809\%Unicode::UCD::loose_property_to_file_of = ( 17810$loose_property_to_file_of 17811); 17812 17813# Property names to mapping files 17814\%Unicode::UCD::strict_property_to_file_of = ( 17815$strict_property_to_file_of 17816); 17817 17818# Files to the swash names within them. 17819\%Unicode::UCD::file_to_swash_name = ( 17820$file_to_swash_name 17821); 17822 17823# Keys are all the possible "prop=value" combinations, in loose form; values 17824# are the standard loose name for the 'value' part of the key 17825\%Unicode::UCD::loose_to_standard_value = ( 17826$loose_to_standard_value 17827); 17828 17829# String property loose names to standard loose name 17830\%Unicode::UCD::string_property_loose_to_name = ( 17831$string_property_loose_to_name 17832); 17833 17834# Keys are Perl extensions in loose form; values are each one's list of 17835# aliases 17836\%Unicode::UCD::loose_perlprop_to_name = ( 17837$perlprop_to_aliases 17838); 17839 17840# Keys are standard property name; values are each one's aliases 17841\%Unicode::UCD::prop_aliases = ( 17842$prop_aliases 17843); 17844 17845# Keys of top level are standard property name; values are keys to another 17846# hash, Each one is one of the property's values, in standard form. The 17847# values are that prop-val's aliases. If only one specified, the short and 17848# long alias are identical. 17849\%Unicode::UCD::prop_value_aliases = ( 17850$prop_value_aliases 17851); 17852 17853# Ordered (by code point ordinal) list of the ranges of code points whose 17854# names are algorithmically determined. Each range entry is an anonymous hash 17855# of the start and end points and a template for the names within it. 17856\@Unicode::UCD::algorithmic_named_code_points = ( 17857$algorithm_names 17858); 17859 17860# The properties that as-is have two meanings, and which must be disambiguated 17861\%Unicode::UCD::ambiguous_names = ( 17862$ambiguous_names 17863); 17864 17865# Keys are the prop-val combinations which are the default values for the 17866# given property, expressed in standard loose form 17867\%Unicode::UCD::loose_defaults = ( 17868$loose_defaults 17869); 17870 17871# The properties that are combinations, in that they have both a map table and 17872# a match table. This is actually for UCD.t, so it knows how to test for 17873# these. 17874\%Unicode::UCD::combination_property = ( 17875$combination_property 17876); 17877 17878# All combinations of names that are suppressed. 17879# This is actually for UCD.t, so it knows which properties shouldn't have 17880# entries. If it got any bigger, would probably want to put it in its own 17881# file to use memory only when it was needed, in testing. 17882\@Unicode::UCD::suppressed_properties = ( 17883$suppressed 17884); 17885 178861; 17887END 17888 17889 main::write("UCD.pl", 0, \@ucd); # The 0 means no utf8. 17890 return; 17891} 17892 17893sub write_all_tables() { 17894 # Write out all the tables generated by this program to files, as well as 17895 # the supporting data structures, pod file, and .t file. 17896 17897 my @writables; # List of tables that actually get written 17898 my %match_tables_to_write; # Used to collapse identical match tables 17899 # into one file. Each key is a hash function 17900 # result to partition tables into buckets. 17901 # Each value is an array of the tables that 17902 # fit in the bucket. 17903 17904 # For each property ... 17905 # (sort so that if there is an immutable file name, it has precedence, so 17906 # some other property can't come in and take over its file name. (We 17907 # don't care if both defined, as they had better be different anyway.) 17908 # The property named 'Perl' needs to be first (it doesn't have any 17909 # immutable file name) because empty properties are defined in terms of 17910 # its table named 'All' under the -annotate option.) We also sort by 17911 # the property's name. This is just for repeatability of the outputs 17912 # between runs of this program, but does not affect correctness. 17913 PROPERTY: 17914 foreach my $property ($perl, 17915 sort { return -1 if defined $a->file; 17916 return 1 if defined $b->file; 17917 return $a->name cmp $b->name; 17918 } grep { $_ != $perl } property_ref('*')) 17919 { 17920 my $type = $property->type; 17921 17922 # And for each table for that property, starting with the mapping 17923 # table for it ... 17924 TABLE: 17925 foreach my $table($property, 17926 17927 # and all the match tables for it (if any), sorted so 17928 # the ones with the shortest associated file name come 17929 # first. The length sorting prevents problems of a 17930 # longer file taking a name that might have to be used 17931 # by a shorter one. The alphabetic sorting prevents 17932 # differences between releases 17933 sort { my $ext_a = $a->external_name; 17934 return 1 if ! defined $ext_a; 17935 my $ext_b = $b->external_name; 17936 return -1 if ! defined $ext_b; 17937 17938 # But return the non-complement table before 17939 # the complement one, as the latter is defined 17940 # in terms of the former, and needs to have 17941 # the information for the former available. 17942 return 1 if $a->complement != 0; 17943 return -1 if $b->complement != 0; 17944 17945 # Similarly, return a subservient table after 17946 # a leader 17947 return 1 if $a->leader != $a; 17948 return -1 if $b->leader != $b; 17949 17950 my $cmp = length $ext_a <=> length $ext_b; 17951 17952 # Return result if lengths not equal 17953 return $cmp if $cmp; 17954 17955 # Alphabetic if lengths equal 17956 return $ext_a cmp $ext_b 17957 } $property->tables 17958 ) 17959 { 17960 17961 # Here we have a table associated with a property. It could be 17962 # the map table (done first for each property), or one of the 17963 # other tables. Determine which type. 17964 my $is_property = $table->isa('Property'); 17965 17966 my $name = $table->name; 17967 my $complete_name = $table->complete_name; 17968 17969 # See if should suppress the table if is empty, but warn if it 17970 # contains something. 17971 my $suppress_if_empty_warn_if_not 17972 = $why_suppress_if_empty_warn_if_not{$complete_name} || 0; 17973 17974 # Calculate if this table should have any code points associated 17975 # with it or not. 17976 my $expected_empty = 17977 17978 # $perl should be empty 17979 ($is_property && ($table == $perl)) 17980 17981 # Match tables in properties we skipped populating should be 17982 # empty 17983 || (! $is_property && ! $property->to_create_match_tables) 17984 17985 # Tables and properties that are expected to have no code 17986 # points should be empty 17987 || $suppress_if_empty_warn_if_not 17988 ; 17989 17990 # Set a boolean if this table is the complement of an empty binary 17991 # table 17992 my $is_complement_of_empty_binary = 17993 $type == $BINARY && 17994 (($table == $property->table('Y') 17995 && $property->table('N')->is_empty) 17996 || ($table == $property->table('N') 17997 && $property->table('Y')->is_empty)); 17998 17999 if ($table->is_empty) { 18000 18001 if ($suppress_if_empty_warn_if_not) { 18002 $table->set_fate($SUPPRESSED, 18003 $suppress_if_empty_warn_if_not); 18004 } 18005 18006 # Suppress (by skipping them) expected empty tables. 18007 next TABLE if $expected_empty; 18008 18009 # And setup to later output a warning for those that aren't 18010 # known to be allowed to be empty. Don't do the warning if 18011 # this table is a child of another one to avoid duplicating 18012 # the warning that should come from the parent one. 18013 if (($table == $property || $table->parent == $table) 18014 && $table->fate != $SUPPRESSED 18015 && $table->fate != $MAP_PROXIED 18016 && ! grep { $complete_name =~ /^$_$/ } 18017 @tables_that_may_be_empty) 18018 { 18019 push @unhandled_properties, "$table"; 18020 } 18021 18022 # The old way of expressing an empty match list was to 18023 # complement the list that matches everything. The new way is 18024 # to create an empty inversion list, but this doesn't work for 18025 # annotating, so use the old way then. 18026 $table->set_complement($All) if $annotate 18027 && $table != $property; 18028 } 18029 elsif ($expected_empty) { 18030 my $because = ""; 18031 if ($suppress_if_empty_warn_if_not) { 18032 $because = " because $suppress_if_empty_warn_if_not"; 18033 } 18034 18035 Carp::my_carp("Not expecting property $table$because. Generating file for it anyway."); 18036 } 18037 18038 # Some tables should match everything 18039 my $expected_full = 18040 ($table->fate == $SUPPRESSED) 18041 ? 0 18042 : ($is_property) 18043 ? # All these types of map tables will be full because 18044 # they will have been populated with defaults 18045 ($type == $ENUM) 18046 18047 : # A match table should match everything if its method 18048 # shows it should 18049 ($table->matches_all 18050 18051 # The complement of an empty binary table will match 18052 # everything 18053 || $is_complement_of_empty_binary 18054 ) 18055 ; 18056 18057 my $count = $table->count; 18058 if ($expected_full) { 18059 if ($count != $MAX_WORKING_CODEPOINTS) { 18060 Carp::my_carp("$table matches only " 18061 . clarify_number($count) 18062 . " Unicode code points but should match " 18063 . clarify_number($MAX_WORKING_CODEPOINTS) 18064 . " (off by " 18065 . clarify_number(abs($MAX_WORKING_CODEPOINTS - $count)) 18066 . "). Proceeding anyway."); 18067 } 18068 18069 # Here is expected to be full. If it is because it is the 18070 # complement of an (empty) binary table that is to be 18071 # suppressed, then suppress this one as well. 18072 if ($is_complement_of_empty_binary) { 18073 my $opposing_name = ($name eq 'Y') ? 'N' : 'Y'; 18074 my $opposing = $property->table($opposing_name); 18075 my $opposing_status = $opposing->status; 18076 if ($opposing_status) { 18077 $table->set_status($opposing_status, 18078 $opposing->status_info); 18079 } 18080 } 18081 } 18082 elsif ($count == $MAX_UNICODE_CODEPOINTS 18083 && $name ne "Any" 18084 && ($table == $property || $table->leader == $table) 18085 && $table->property->status ne $NORMAL) 18086 { 18087 Carp::my_carp("$table unexpectedly matches all Unicode code points. Proceeding anyway."); 18088 } 18089 18090 if ($table->fate >= $SUPPRESSED) { 18091 if (! $is_property) { 18092 my @children = $table->children; 18093 foreach my $child (@children) { 18094 if ($child->fate < $SUPPRESSED) { 18095 Carp::my_carp_bug("'$table' is suppressed and has a child '$child' which isn't"); 18096 } 18097 } 18098 } 18099 next TABLE; 18100 18101 } 18102 18103 if (! $is_property) { 18104 18105 make_ucd_table_pod_entries($table) if $table->property == $perl; 18106 18107 # Several things need to be done just once for each related 18108 # group of match tables. Do them on the parent. 18109 if ($table->parent == $table) { 18110 18111 # Add an entry in the pod file for the table; it also does 18112 # the children. 18113 make_re_pod_entries($table) if defined $pod_directory; 18114 18115 # See if the table matches identical code points with 18116 # something that has already been processed and is ready 18117 # for output. In that case, no need to have two files 18118 # with the same code points in them. We use the table's 18119 # hash() method to store these in buckets, so that it is 18120 # quite likely that if two tables are in the same bucket 18121 # they will be identical, so don't have to compare tables 18122 # frequently. The tables have to have the same status to 18123 # share a file, so add this to the bucket hash. (The 18124 # reason for this latter is that UCD.pm associates a 18125 # status with a file.) We don't check tables that are 18126 # inverses of others, as it would lead to some coding 18127 # complications, and checking all the regular ones should 18128 # find everything. 18129 if ($table->complement == 0) { 18130 my $hash = $table->hash . ';' . $table->status; 18131 18132 # Look at each table that is in the same bucket as 18133 # this one would be. 18134 foreach my $comparison 18135 (@{$match_tables_to_write{$hash}}) 18136 { 18137 # If the table doesn't point back to this one, we 18138 # see if it matches identically 18139 if ( $comparison->leader != $table 18140 && $table->matches_identically_to($comparison)) 18141 { 18142 $table->set_equivalent_to($comparison, 18143 Related => 0); 18144 next TABLE; 18145 } 18146 } 18147 18148 # Here, not equivalent, add this table to the bucket. 18149 push @{$match_tables_to_write{$hash}}, $table; 18150 } 18151 } 18152 } 18153 else { 18154 18155 # Here is the property itself. 18156 # Don't write out or make references to the $perl property 18157 next if $table == $perl; 18158 18159 make_ucd_table_pod_entries($table); 18160 18161 # There is a mapping stored of the various synonyms to the 18162 # standardized name of the property for Unicode::UCD. 18163 # Also, the pod file contains entries of the form: 18164 # \p{alias: *} \p{full: *} 18165 # rather than show every possible combination of things. 18166 18167 my @property_aliases = $property->aliases; 18168 18169 my $full_property_name = $property->full_name; 18170 my $property_name = $property->name; 18171 my $standard_property_name = standardize($property_name); 18172 my $standard_property_full_name 18173 = standardize($full_property_name); 18174 18175 # We also create for Unicode::UCD a list of aliases for 18176 # the property. The list starts with the property name; 18177 # then its full name. 18178 my @property_list; 18179 my @standard_list; 18180 if ( $property->fate <= $MAP_PROXIED) { 18181 @property_list = ($property_name, $full_property_name); 18182 @standard_list = ($standard_property_name, 18183 $standard_property_full_name); 18184 } 18185 18186 # For each synonym ... 18187 for my $i (0 .. @property_aliases - 1) { 18188 my $alias = $property_aliases[$i]; 18189 my $alias_name = $alias->name; 18190 my $alias_standard = standardize($alias_name); 18191 18192 18193 # Add other aliases to the list of property aliases 18194 if ($property->fate <= $MAP_PROXIED 18195 && ! grep { $alias_standard eq $_ } @standard_list) 18196 { 18197 push @property_list, $alias_name; 18198 push @standard_list, $alias_standard; 18199 } 18200 18201 # For Unicode::UCD, set the mapping of the alias to the 18202 # property 18203 if ($type == $STRING) { 18204 if ($property->fate <= $MAP_PROXIED) { 18205 $string_property_loose_to_name{$alias_standard} 18206 = $standard_property_name; 18207 } 18208 } 18209 else { 18210 my $hash_ref = ($alias_standard =~ /^_/) 18211 ? \%strict_property_name_of 18212 : \%loose_property_name_of; 18213 if (exists $hash_ref->{$alias_standard}) { 18214 Carp::my_carp("There already is a property with the same standard name as $alias_name: $hash_ref->{$alias_standard}. Old name is retained"); 18215 } 18216 else { 18217 $hash_ref->{$alias_standard} 18218 = $standard_property_name; 18219 } 18220 18221 # Now for the re pod entry for this alias. Skip if not 18222 # outputting a pod; skip the first one, which is the 18223 # full name so won't have an entry like: '\p{full: *} 18224 # \p{full: *}', and skip if don't want an entry for 18225 # this one. 18226 next if $i == 0 18227 || ! defined $pod_directory 18228 || ! $alias->make_re_pod_entry; 18229 18230 my $rhs = "\\p{$full_property_name: *}"; 18231 if ($property != $perl && $table->perl_extension) { 18232 $rhs .= ' (Perl extension)'; 18233 } 18234 push @match_properties, 18235 format_pod_line($indent_info_column, 18236 '\p{' . $alias->name . ': *}', 18237 $rhs, 18238 $alias->status); 18239 } 18240 } 18241 18242 # The list of all possible names is attached to each alias, so 18243 # lookup is easy 18244 if (@property_list) { 18245 push @{$prop_aliases{$standard_list[0]}}, @property_list; 18246 } 18247 18248 if ($property->fate <= $MAP_PROXIED) { 18249 18250 # Similarly, we create for Unicode::UCD a list of 18251 # property-value aliases. 18252 18253 # Look at each table in the property... 18254 foreach my $table ($property->tables) { 18255 my @values_list; 18256 my $table_full_name = $table->full_name; 18257 my $standard_table_full_name 18258 = standardize($table_full_name); 18259 my $table_name = $table->name; 18260 my $standard_table_name = standardize($table_name); 18261 18262 # The list starts with the table name and its full 18263 # name. 18264 push @values_list, $table_name, $table_full_name; 18265 18266 # We add to the table each unique alias that isn't 18267 # discouraged from use. 18268 foreach my $alias ($table->aliases) { 18269 next if $alias->status 18270 && $alias->status eq $DISCOURAGED; 18271 my $name = $alias->name; 18272 my $standard = standardize($name); 18273 next if $standard eq $standard_table_name; 18274 next if $standard eq $standard_table_full_name; 18275 push @values_list, $name; 18276 } 18277 18278 # Here @values_list is a list of all the aliases for 18279 # the table. That is, all the property-values given 18280 # by this table. By agreement with Unicode::UCD, 18281 # if the name and full name are identical, and there 18282 # are no other names, drop the duplicate entry to save 18283 # memory. 18284 if (@values_list == 2 18285 && $values_list[0] eq $values_list[1]) 18286 { 18287 pop @values_list 18288 } 18289 18290 # To save memory, unlike the similar list for property 18291 # aliases above, only the standard forms have the list. 18292 # This forces an extra step of converting from input 18293 # name to standard name, but the savings are 18294 # considerable. (There is only marginal savings if we 18295 # did this with the property aliases.) 18296 push @{$prop_value_aliases{$standard_property_name}{$standard_table_name}}, @values_list; 18297 } 18298 } 18299 18300 # Don't write out a mapping file if not desired. 18301 next if ! $property->to_output_map; 18302 } 18303 18304 # Here, we know we want to write out the table, but don't do it 18305 # yet because there may be other tables that come along and will 18306 # want to share the file, and the file's comments will change to 18307 # mention them. So save for later. 18308 push @writables, $table; 18309 18310 } # End of looping through the property and all its tables. 18311 } # End of looping through all properties. 18312 18313 # Now have all the tables that will have files written for them. Do it. 18314 foreach my $table (@writables) { 18315 my @directory; 18316 my $filename; 18317 my $property = $table->property; 18318 my $is_property = ($table == $property); 18319 18320 # For very short tables, instead of writing them out to actual files, 18321 # we in-line their inversion list definitions into UCD.pm. The 18322 # definition replaces the file name, and the special pseudo-directory 18323 # '#' is used to signal this. This significantly cuts down the number 18324 # of files written at little extra cost to the hashes in UCD.pm. 18325 # And it means, no run-time files to read to get the definitions. 18326 if (! $is_property 18327 && ! $annotate # For annotation, we want to explicitly show 18328 # everything, so keep in files 18329 && $table->ranges <= 3) 18330 { 18331 my @ranges = $table->ranges; 18332 my $count = @ranges; 18333 if ($count == 0) { # 0th index reserved for 0-length lists 18334 $filename = 0; 18335 } 18336 elsif ($table->leader != $table) { 18337 18338 # Here, is a table that is equivalent to another; code 18339 # in register_file_for_name() causes its leader's definition 18340 # to be used 18341 18342 next; 18343 } 18344 else { # No equivalent table so far. 18345 18346 # Build up its definition range-by-range. 18347 my $definition = ""; 18348 while (defined (my $range = shift @ranges)) { 18349 my $end = $range->end; 18350 if ($end < $MAX_WORKING_CODEPOINT) { 18351 $count++; 18352 $end = "\n" . ($end + 1); 18353 } 18354 else { # Extends to infinity, hence no 'end' 18355 $end = ""; 18356 } 18357 $definition .= "\n" . $range->start . $end; 18358 } 18359 $definition = "V$count" . $definition; 18360 $filename = @inline_definitions; 18361 push @inline_definitions, $definition; 18362 } 18363 @directory = "#"; 18364 register_file_for_name($table, \@directory, $filename); 18365 next; 18366 } 18367 18368 if (! $is_property) { 18369 # Match tables for the property go in lib/$subdirectory, which is 18370 # the property's name. Don't use the standard file name for this, 18371 # as may get an unfamiliar alias 18372 @directory = ($matches_directory, ($property->match_subdir) 18373 ? $property->match_subdir 18374 : $property->external_name); 18375 } 18376 else { 18377 18378 @directory = $table->directory; 18379 $filename = $table->file; 18380 } 18381 18382 # Use specified filename if available, or default to property's 18383 # shortest name. We need an 8.3 safe filename (which means "an 8 18384 # safe" filename, since after the dot is only 'pl', which is < 3) 18385 # The 2nd parameter is if the filename shouldn't be changed, and 18386 # it shouldn't iff there is a hard-coded name for this table. 18387 $filename = construct_filename( 18388 $filename || $table->external_name, 18389 ! $filename, # mutable if no filename 18390 \@directory); 18391 18392 register_file_for_name($table, \@directory, $filename); 18393 18394 # Only need to write one file when shared by more than one 18395 # property 18396 next if ! $is_property 18397 && ($table->leader != $table || $table->complement != 0); 18398 18399 # Construct a nice comment to add to the file 18400 $table->set_final_comment; 18401 18402 $table->write; 18403 } 18404 18405 18406 # Write out the pod file 18407 make_pod; 18408 18409 # And Name.pm, UCD.pl 18410 make_Name_pm; 18411 make_UCD; 18412 18413 make_property_test_script() if $make_test_script; 18414 make_normalization_test_script() if $make_norm_test_script; 18415 return; 18416} 18417 18418my @white_space_separators = ( # This used only for making the test script. 18419 "", 18420 ' ', 18421 "\t", 18422 ' ' 18423 ); 18424 18425sub generate_separator($lhs) { 18426 # This used only for making the test script. It generates the colon or 18427 # equal separator between the property and property value, with random 18428 # white space surrounding the separator 18429 18430 return "" if $lhs eq ""; # No separator if there's only one (the r) side 18431 18432 # Choose space before and after randomly 18433 my $spaces_before =$white_space_separators[rand(@white_space_separators)]; 18434 my $spaces_after = $white_space_separators[rand(@white_space_separators)]; 18435 18436 # And return the whole complex, half the time using a colon, half the 18437 # equals 18438 return $spaces_before 18439 . (rand() < 0.5) ? '=' : ':' 18440 . $spaces_after; 18441} 18442 18443sub generate_tests($lhs, $rhs, $valid_code, $invalid_code, $warning) { 18444 # This used only for making the test script. It generates test cases that 18445 # are expected to compile successfully in perl. Note that the LHS and 18446 # RHS are assumed to already be as randomized as the caller wants. 18447 18448 # $lhs # The property: what's to the left of the colon 18449 # or equals separator 18450 # $rhs # The property value; what's to the right 18451 # $valid_code # A code point that's known to be in the 18452 # table given by LHS=RHS; undef if table is 18453 # empty 18454 # $invalid_code # A code point known to not be in the table; 18455 # undef if the table is all code points 18456 # $warning 18457 18458 # Get the colon or equal 18459 my $separator = generate_separator($lhs); 18460 18461 # The whole 'property=value' 18462 my $name = "$lhs$separator$rhs"; 18463 18464 my @output; 18465 # Create a complete set of tests, with complements. 18466 if (defined $valid_code) { 18467 push @output, <<"EOC" 18468Expect(1, $valid_code, '\\p{$name}', $warning); 18469Expect(0, $valid_code, '\\p{^$name}', $warning); 18470Expect(0, $valid_code, '\\P{$name}', $warning); 18471Expect(1, $valid_code, '\\P{^$name}', $warning); 18472EOC 18473 } 18474 if (defined $invalid_code) { 18475 push @output, <<"EOC" 18476Expect(0, $invalid_code, '\\p{$name}', $warning); 18477Expect(1, $invalid_code, '\\p{^$name}', $warning); 18478Expect(1, $invalid_code, '\\P{$name}', $warning); 18479Expect(0, $invalid_code, '\\P{^$name}', $warning); 18480EOC 18481 } 18482 return @output; 18483} 18484 18485sub generate_wildcard_tests($lhs, $rhs, $valid_code, $invalid_code, $warning) { 18486 # This used only for making the test script. It generates wildcardl 18487 # matching test cases that are expected to compile successfully in perl. 18488 18489 # $lhs # The property: what's to the left of the 18490 # or equals separator 18491 # $rhs # The property value; what's to the right 18492 # $valid_code # A code point that's known to be in the 18493 # table given by LHS=RHS; undef if table is 18494 # empty 18495 # $invalid_code # A code point known to not be in the table; 18496 # undef if the table is all code points 18497 # $warning 18498 18499 return if $lhs eq ""; 18500 return if $lhs =~ / ^ Is_ /x; # These are not currently supported 18501 18502 # Generate a standardized pattern, with colon being the delimitter 18503 my $wildcard = "$lhs=:\\A$rhs\\z:"; 18504 18505 my @output; 18506 push @output, "Expect(1, $valid_code, '\\p{$wildcard}', $warning);" 18507 if defined $valid_code; 18508 push @output, "Expect(0, $invalid_code, '\\p{$wildcard}', $warning);" 18509 if defined $invalid_code; 18510 return @output; 18511} 18512 18513sub generate_error($lhs, $rhs, $already_in_error=0) { 18514 # This used only for making the test script. It generates test cases that 18515 # are expected to not only not match, but to be syntax or similar errors 18516 18517 # $lhs # The property: what's to the left of the 18518 # colon or equals separator 18519 # $rhs # The property value; what's to the right 18520 # $already_in_error # Boolean; if true it's known that the 18521 # unmodified LHS and RHS will cause an error. 18522 # This routine should not force another one 18523 # Get the colon or equal 18524 my $separator = generate_separator($lhs); 18525 18526 # Since this is an error only, don't bother to randomly decide whether to 18527 # put the error on the left or right side; and assume that the RHS is 18528 # loosely matched, again for convenience rather than rigor. 18529 $rhs = randomize_loose_name($rhs, 'ERROR') unless $already_in_error; 18530 18531 my $property = $lhs . $separator . $rhs; 18532 18533 return <<"EOC"; 18534Error('\\p{$property}'); 18535Error('\\P{$property}'); 18536EOC 18537} 18538 18539# These are used only for making the test script 18540# XXX Maybe should also have a bad strict seps, which includes underscore. 18541 18542my @good_loose_seps = ( 18543 " ", 18544 "-", 18545 "\t", 18546 "", 18547 "_", 18548 ); 18549my @bad_loose_seps = ( 18550 "/a/", 18551 ':=', 18552 ); 18553 18554sub randomize_stricter_name($name) { 18555 # This used only for making the test script. Take the input name and 18556 # return a randomized, but valid version of it under the stricter matching 18557 # rules. 18558 18559 # If the name looks like a number (integer, floating, or rational), do 18560 # some extra work 18561 if ($name =~ qr{ ^ ( -? ) (\d+ ( ( [./] ) \d+ )? ) $ }x) { 18562 my $sign = $1; 18563 my $number = $2; 18564 my $separator = $3; 18565 18566 # If there isn't a sign, part of the time add a plus 18567 # Note: Not testing having any denominator having a minus sign 18568 if (! $sign) { 18569 $sign = '+' if rand() <= .3; 18570 } 18571 18572 # And add 0 or more leading zeros. 18573 $name = $sign . ('0' x int rand(10)) . $number; 18574 18575 if (defined $separator) { 18576 my $extra_zeros = '0' x int rand(10); 18577 18578 if ($separator eq '.') { 18579 18580 # Similarly, add 0 or more trailing zeros after a decimal 18581 # point 18582 $name .= $extra_zeros; 18583 } 18584 else { 18585 18586 # Or, leading zeros before the denominator 18587 $name =~ s,/,/$extra_zeros,; 18588 } 18589 } 18590 } 18591 18592 # For legibility of the test, only change the case of whole sections at a 18593 # time. To do this, first split into sections. The split returns the 18594 # delimiters 18595 my @sections; 18596 for my $section (split / ( [ - + \s _ . ]+ ) /x, $name) { 18597 trace $section if main::DEBUG && $to_trace; 18598 18599 if (length $section > 1 && $section !~ /\D/) { 18600 18601 # If the section is a sequence of digits, about half the time 18602 # randomly add underscores between some of them. 18603 if (rand() > .5) { 18604 18605 # Figure out how many underscores to add. max is 1 less than 18606 # the number of digits. (But add 1 at the end to make sure 18607 # result isn't 0, and compensate earlier by subtracting 2 18608 # instead of 1) 18609 my $num_underscores = int rand(length($section) - 2) + 1; 18610 18611 # And add them evenly throughout, for convenience, not rigor 18612 use integer; 18613 my $spacing = (length($section) - 1)/ $num_underscores; 18614 my $temp = $section; 18615 $section = ""; 18616 for my $i (1 .. $num_underscores) { 18617 $section .= substr($temp, 0, $spacing, "") . '_'; 18618 } 18619 $section .= $temp; 18620 } 18621 push @sections, $section; 18622 } 18623 else { 18624 18625 # Here not a sequence of digits. Change the case of the section 18626 # randomly 18627 my $switch = int rand(4); 18628 if ($switch == 0) { 18629 push @sections, uc $section; 18630 } 18631 elsif ($switch == 1) { 18632 push @sections, lc $section; 18633 } 18634 elsif ($switch == 2) { 18635 push @sections, ucfirst $section; 18636 } 18637 else { 18638 push @sections, $section; 18639 } 18640 } 18641 } 18642 trace "returning", join "", @sections if main::DEBUG && $to_trace; 18643 return join "", @sections; 18644} 18645 18646sub randomize_loose_name($name, $want_error=0) { 18647 # This used only for making the test script 18648 18649 $name = randomize_stricter_name($name); 18650 18651 my @parts; 18652 push @parts, $good_loose_seps[rand(@good_loose_seps)]; 18653 18654 # Preserve trailing ones for the sake of not stripping the underscore from 18655 # 'L_' 18656 for my $part (split /[-\s_]+ (?= . )/, $name) { 18657 if (@parts) { 18658 if ($want_error and rand() < 0.3) { 18659 push @parts, $bad_loose_seps[rand(@bad_loose_seps)]; 18660 $want_error = 0; 18661 } 18662 else { 18663 push @parts, $good_loose_seps[rand(@good_loose_seps)]; 18664 } 18665 } 18666 push @parts, $part; 18667 } 18668 my $new = join("", @parts); 18669 trace "$name => $new" if main::DEBUG && $to_trace; 18670 18671 if ($want_error) { 18672 if (rand() >= 0.5) { 18673 $new .= $bad_loose_seps[rand(@bad_loose_seps)]; 18674 } 18675 else { 18676 $new = $bad_loose_seps[rand(@bad_loose_seps)] . $new; 18677 } 18678 } 18679 return $new; 18680} 18681 18682# Used to make sure don't generate duplicate test cases. 18683my %test_generated; 18684 18685sub make_property_test_script() { 18686 # This used only for making the test script 18687 # this written directly -- it's huge. 18688 18689 print "Making test script\n" if $verbosity >= $PROGRESS; 18690 18691 # This uses randomness to test different possibilities without testing all 18692 # possibilities. To ensure repeatability, set the seed to 0. But if 18693 # tests are added, it will perturb all later ones in the .t file 18694 srand 0; 18695 18696 $t_path = 'TestProp.pl' unless defined $t_path; # the traditional name 18697 18698 # Create a list of what the %f representation is for each rational number. 18699 # This will be used below. 18700 my @valid_base_floats = '0.0'; 18701 foreach my $e_representation (keys %nv_floating_to_rational) { 18702 push @valid_base_floats, 18703 eval $nv_floating_to_rational{$e_representation}; 18704 } 18705 18706 # It doesn't matter whether the elements of this array contain single lines 18707 # or multiple lines. main::write doesn't count the lines. 18708 my @output; 18709 18710 push @output, <<'EOF_CODE'; 18711Error('\p{Script=InGreek}'); # Bug #69018 18712Test_GCB("1100 $nobreak 1161"); # Bug #70940 18713Expect(0, 0x2028, '\p{Print}', ""); # Bug # 71722 18714Expect(0, 0x2029, '\p{Print}', ""); # Bug # 71722 18715Expect(1, 0xFF10, '\p{XDigit}', ""); # Bug # 71726 18716Error('\p{InKana}'); # 'Kana' is not a block so InKana shouldn't compile 18717 18718# Make sure this gets tested; it was not part of the official test suite at 18719# the time this was added. Note that this is as it would appear in the 18720# official suite, and gets modified to check for the perl tailoring by 18721# Test_WB() 18722Test_WB("$breakable 0020 $breakable 0020 $breakable 0308 $breakable"); 18723Test_LB("$nobreak 200B $nobreak 0020 $nobreak 0020 $breakable 2060 $breakable"); 18724Expect(1, ord(" "), '\p{gc=:(?aa)s:}', ""); # /aa is valid 18725Expect(1, ord(" "), '\p{gc=:(?-s)s:}', ""); # /-s is valid 18726EOF_CODE 18727 18728 # Sort these so get results in same order on different runs of this 18729 # program 18730 foreach my $property (sort { $a->has_dependency <=> $b->has_dependency 18731 or 18732 lc $a->name cmp lc $b->name 18733 } property_ref('*')) 18734 { 18735 # Non-binary properties should not match \p{}; Test all for that. 18736 if ($property->type != $BINARY && $property->type != $FORCED_BINARY) { 18737 my @property_aliases = grep { $_->status ne $INTERNAL_ALIAS } 18738 $property->aliases; 18739 foreach my $property_alias ($property->aliases) { 18740 my $name = standardize($property_alias->name); 18741 18742 # But some names are ambiguous, meaning a binary property with 18743 # the same name when used in \p{}, and a different 18744 # (non-binary) property in other contexts. 18745 next if grep { $name eq $_ } keys %ambiguous_names; 18746 18747 push @output, <<"EOF_CODE"; 18748Error('\\p{$name}'); 18749Error('\\P{$name}'); 18750EOF_CODE 18751 } 18752 } 18753 foreach my $table (sort { $a->has_dependency <=> $b->has_dependency 18754 or 18755 lc $a->name cmp lc $b->name 18756 } $property->tables) 18757 { 18758 18759 # Find code points that match, and don't match this table. 18760 my $valid = $table->get_valid_code_point; 18761 my $invalid = $table->get_invalid_code_point; 18762 my $warning = ($table->status eq $DEPRECATED) 18763 ? "'deprecated'" 18764 : '""'; 18765 18766 # Test each possible combination of the property's aliases with 18767 # the table's. If this gets to be too many, could do what is done 18768 # in the set_final_comment() for Tables 18769 my @table_aliases = grep { $_->status ne $INTERNAL_ALIAS } $table->aliases; 18770 next unless @table_aliases; 18771 my @property_aliases = grep { $_->status ne $INTERNAL_ALIAS } $table->property->aliases; 18772 next unless @property_aliases; 18773 18774 # Every property can be optionally be prefixed by 'Is_', so test 18775 # that those work, by creating such a new alias for each 18776 # pre-existing one. 18777 push @property_aliases, map { Alias->new("Is_" . $_->name, 18778 $_->loose_match, 18779 $_->make_re_pod_entry, 18780 $_->ok_as_filename, 18781 $_->status, 18782 $_->ucd, 18783 ) 18784 } @property_aliases; 18785 my $max = max(scalar @table_aliases, scalar @property_aliases); 18786 for my $j (0 .. $max - 1) { 18787 18788 # The current alias for property is the next one on the list, 18789 # or if beyond the end, start over. Similarly for table 18790 my $property_name 18791 = $property_aliases[$j % @property_aliases]->name; 18792 18793 $property_name = "" if $table->property == $perl; 18794 my $table_alias = $table_aliases[$j % @table_aliases]; 18795 my $table_name = $table_alias->name; 18796 my $loose_match = $table_alias->loose_match; 18797 18798 # If the table doesn't have a file, any test for it is 18799 # already guaranteed to be in error 18800 my $already_error = ! $table->file_path; 18801 18802 # A table that begins with these could actually be a 18803 # user-defined property, so won't be compile time errors, as 18804 # the definitions of those can be deferred until runtime 18805 next if $already_error && $table_name =~ / ^ I[ns] /x; 18806 18807 # Generate error cases for this alias. 18808 push @output, generate_error($property_name, 18809 $table_name, 18810 $already_error); 18811 18812 # If the table is guaranteed to always generate an error, 18813 # quit now without generating success cases. 18814 next if $already_error; 18815 18816 # Now for the success cases. First, wildcard matching, as it 18817 # shouldn't have any randomization. 18818 if ($table_alias->status eq $NORMAL) { 18819 push @output, generate_wildcard_tests($property_name, 18820 $table_name, 18821 $valid, 18822 $invalid, 18823 $warning, 18824 ); 18825 } 18826 my $random; 18827 if ($loose_match) { 18828 18829 # For loose matching, create an extra test case for the 18830 # standard name. 18831 my $standard = standardize($table_name); 18832 18833 # $test_name should be a unique combination for each test 18834 # case; used just to avoid duplicate tests 18835 my $test_name = "$property_name=$standard"; 18836 18837 # Don't output duplicate test cases. 18838 if (! exists $test_generated{$test_name}) { 18839 $test_generated{$test_name} = 1; 18840 push @output, generate_tests($property_name, 18841 $standard, 18842 $valid, 18843 $invalid, 18844 $warning, 18845 ); 18846 if ($table_alias->status eq $NORMAL) { 18847 push @output, generate_wildcard_tests( 18848 $property_name, 18849 $standard, 18850 $valid, 18851 $invalid, 18852 $warning, 18853 ); 18854 } 18855 } 18856 $random = randomize_loose_name($table_name) 18857 } 18858 else { # Stricter match 18859 $random = randomize_stricter_name($table_name); 18860 } 18861 18862 # Now for the main test case for this alias. 18863 my $test_name = "$property_name=$random"; 18864 if (! exists $test_generated{$test_name}) { 18865 $test_generated{$test_name} = 1; 18866 push @output, generate_tests($property_name, 18867 $random, 18868 $valid, 18869 $invalid, 18870 $warning, 18871 ); 18872 18873 if ($property->name eq 'nv') { 18874 if ($table_name !~ qr{/}) { 18875 push @output, generate_tests($property_name, 18876 sprintf("%.15e", $table_name), 18877 $valid, 18878 $invalid, 18879 $warning, 18880 ); 18881 } 18882 else { 18883 # If the name is a rational number, add tests for a 18884 # non-reduced form, and for a floating point equivalent. 18885 18886 # 60 is a number divisible by a bunch of things 18887 my ($numerator, $denominator) = $table_name 18888 =~ m! (.+) / (.+) !x; 18889 $numerator *= 60; 18890 $denominator *= 60; 18891 push @output, generate_tests($property_name, 18892 "$numerator/$denominator", 18893 $valid, 18894 $invalid, 18895 $warning, 18896 ); 18897 18898 # Calculate the float, and the %e representation 18899 my $float = eval $table_name; 18900 my $e_representation = sprintf("%.*e", 18901 $E_FLOAT_PRECISION, $float); 18902 # Parse that 18903 my ($non_zeros, $zeros, $exponent_sign, $exponent) 18904 = $e_representation 18905 =~ / -? [1-9] \. (\d*?) (0*) e ([+-]) (\d+) /x; 18906 my $min_e_precision; 18907 my $min_f_precision; 18908 18909 if ($exponent_sign eq '+' && $exponent != 0) { 18910 Carp::my_carp_bug("Not yet equipped to handle" 18911 . " positive exponents"); 18912 return; 18913 } 18914 else { 18915 # We're trying to find the minimum precision that 18916 # is needed to indicate this particular rational 18917 # for the given $E_FLOAT_PRECISION. For %e, any 18918 # trailing zeros, like 1.500e-02 aren't needed, so 18919 # the correct value is how many non-trailing zeros 18920 # there are after the decimal point. 18921 $min_e_precision = length $non_zeros; 18922 18923 # For %f, like .01500, we want at least 18924 # $E_FLOAT_PRECISION digits, but any trailing 18925 # zeros aren't needed, so we can subtract the 18926 # length of those. But we also need to include 18927 # the zeros after the decimal point, but before 18928 # the first significant digit. 18929 $min_f_precision = $E_FLOAT_PRECISION 18930 + $exponent 18931 - length $zeros; 18932 } 18933 18934 # Make tests for each possible precision from 1 to 18935 # just past the worst case. 18936 my $upper_limit = ($min_e_precision > $min_f_precision) 18937 ? $min_e_precision 18938 : $min_f_precision; 18939 18940 for my $i (1 .. $upper_limit + 1) { 18941 for my $format ("e", "f") { 18942 my $this_table 18943 = sprintf("%.*$format", $i, $float); 18944 18945 # If we don't have enough precision digits, 18946 # make a fail test; otherwise a pass test. 18947 my $pass = ($format eq "e") 18948 ? $i >= $min_e_precision 18949 : $i >= $min_f_precision; 18950 if ($pass) { 18951 push @output, generate_tests($property_name, 18952 $this_table, 18953 $valid, 18954 $invalid, 18955 $warning, 18956 ); 18957 } 18958 elsif ( $format eq "e" 18959 18960 # Here we would fail, but in the %f 18961 # case, the representation at this 18962 # precision could actually be a 18963 # valid one for some other rational 18964 || ! grep { $this_table 18965 =~ / ^ $_ 0* $ /x } 18966 @valid_base_floats) 18967 { 18968 push @output, 18969 generate_error($property_name, 18970 $this_table, 18971 1 # 1 => already an 18972 # error 18973 ); 18974 } 18975 } 18976 } 18977 } 18978 } 18979 } 18980 } 18981 $table->DESTROY(); 18982 } 18983 $property->DESTROY(); 18984 } 18985 18986 # Make any test of the boundary (break) properties TODO if the code 18987 # doesn't match the version being compiled 18988 my $TODO_FAILING_BREAKS = ($version_of_mk_invlist_bounds ne $v_version) 18989 ? "\nsub TODO_FAILING_BREAKS { 1 }\n" 18990 : "\nsub TODO_FAILING_BREAKS { 0 }\n"; 18991 18992 @output= map { 18993 map s/^/ /mgr, 18994 map "$_;\n", 18995 split /;\n/, $_ 18996 } @output; 18997 18998 # Cause there to be 'if' statements to only execute a portion of this 18999 # long-running test each time, so that we can have a bunch of .t's running 19000 # in parallel 19001 my $chunks = 10 # Number of test files 19002 - 1 # For GCB & SB 19003 - 1 # For WB 19004 - 4; # LB split into this many files 19005 my @output_chunked; 19006 my $chunk_count=0; 19007 my $chunk_size= int(@output / $chunks) + 1; 19008 while (@output) { 19009 $chunk_count++; 19010 my @chunk= splice @output, 0, $chunk_size; 19011 push @output_chunked, 19012 "if (!\$::TESTCHUNK or \$::TESTCHUNK == $chunk_count) {\n", 19013 @chunk, 19014 "}\n"; 19015 } 19016 19017 $chunk_count++; 19018 push @output_chunked, 19019 "if (!\$::TESTCHUNK or \$::TESTCHUNK == $chunk_count) {\n", 19020 (map {" Test_GCB('$_');\n"} @backslash_X_tests), 19021 (map {" Test_SB('$_');\n"} @SB_tests), 19022 "}\n"; 19023 19024 19025 $chunk_size= int(@LB_tests / 4) + 1; 19026 @LB_tests = map {" Test_LB('$_');\n"} @LB_tests; 19027 while (@LB_tests) { 19028 $chunk_count++; 19029 my @chunk= splice @LB_tests, 0, $chunk_size; 19030 push @output_chunked, 19031 "if (!\$::TESTCHUNK or \$::TESTCHUNK == $chunk_count) {\n", 19032 @chunk, 19033 "}\n"; 19034 } 19035 19036 $chunk_count++; 19037 push @output_chunked, 19038 "if (!\$::TESTCHUNK or \$::TESTCHUNK == $chunk_count) {\n", 19039 (map {" Test_WB('$_');\n"} @WB_tests), 19040 "}\n"; 19041 19042 &write($t_path, 19043 0, # Not utf8; 19044 [$HEADER, 19045 $TODO_FAILING_BREAKS, 19046 <DATA>, 19047 @output_chunked, 19048 "Finished();\n", 19049 ]); 19050 19051 return; 19052} 19053 19054sub make_normalization_test_script() { 19055 print "Making normalization test script\n" if $verbosity >= $PROGRESS; 19056 19057 my $n_path = 'TestNorm.pl'; 19058 19059 unshift @normalization_tests, <<'END'; 19060use utf8; 19061use Test::More; 19062 19063sub ord_string { # Convert packed ords to printable string 19064 use charnames (); 19065 return "'" . join("", map { '\N{' . charnames::viacode($_) . '}' } 19066 unpack "U*", shift) . "'"; 19067 #return "'" . join(" ", map { sprintf "%04X", $_ } unpack "U*", shift) . "'"; 19068} 19069 19070sub Test_N { 19071 my ($source, $nfc, $nfd, $nfkc, $nfkd) = @_; 19072 my $display_source = ord_string($source); 19073 my $display_nfc = ord_string($nfc); 19074 my $display_nfd = ord_string($nfd); 19075 my $display_nfkc = ord_string($nfkc); 19076 my $display_nfkd = ord_string($nfkd); 19077 19078 use Unicode::Normalize; 19079 # NFC 19080 # nfc == toNFC(source) == toNFC(nfc) == toNFC(nfd) 19081 # nfkc == toNFC(nfkc) == toNFC(nfkd) 19082 # 19083 # NFD 19084 # nfd == toNFD(source) == toNFD(nfc) == toNFD(nfd) 19085 # nfkd == toNFD(nfkc) == toNFD(nfkd) 19086 # 19087 # NFKC 19088 # nfkc == toNFKC(source) == toNFKC(nfc) == toNFKC(nfd) == 19089 # toNFKC(nfkc) == toNFKC(nfkd) 19090 # 19091 # NFKD 19092 # nfkd == toNFKD(source) == toNFKD(nfc) == toNFKD(nfd) == 19093 # toNFKD(nfkc) == toNFKD(nfkd) 19094 19095 is(NFC($source), $nfc, "NFC($display_source) eq $display_nfc"); 19096 is(NFC($nfc), $nfc, "NFC($display_nfc) eq $display_nfc"); 19097 is(NFC($nfd), $nfc, "NFC($display_nfd) eq $display_nfc"); 19098 is(NFC($nfkc), $nfkc, "NFC($display_nfkc) eq $display_nfkc"); 19099 is(NFC($nfkd), $nfkc, "NFC($display_nfkd) eq $display_nfkc"); 19100 19101 is(NFD($source), $nfd, "NFD($display_source) eq $display_nfd"); 19102 is(NFD($nfc), $nfd, "NFD($display_nfc) eq $display_nfd"); 19103 is(NFD($nfd), $nfd, "NFD($display_nfd) eq $display_nfd"); 19104 is(NFD($nfkc), $nfkd, "NFD($display_nfkc) eq $display_nfkd"); 19105 is(NFD($nfkd), $nfkd, "NFD($display_nfkd) eq $display_nfkd"); 19106 19107 is(NFKC($source), $nfkc, "NFKC($display_source) eq $display_nfkc"); 19108 is(NFKC($nfc), $nfkc, "NFKC($display_nfc) eq $display_nfkc"); 19109 is(NFKC($nfd), $nfkc, "NFKC($display_nfd) eq $display_nfkc"); 19110 is(NFKC($nfkc), $nfkc, "NFKC($display_nfkc) eq $display_nfkc"); 19111 is(NFKC($nfkd), $nfkc, "NFKC($display_nfkd) eq $display_nfkc"); 19112 19113 is(NFKD($source), $nfkd, "NFKD($display_source) eq $display_nfkd"); 19114 is(NFKD($nfc), $nfkd, "NFKD($display_nfc) eq $display_nfkd"); 19115 is(NFKD($nfd), $nfkd, "NFKD($display_nfd) eq $display_nfkd"); 19116 is(NFKD($nfkc), $nfkd, "NFKD($display_nfkc) eq $display_nfkd"); 19117 is(NFKD($nfkd), $nfkd, "NFKD($display_nfkd) eq $display_nfkd"); 19118} 19119END 19120 19121 &write($n_path, 19122 1, # Is utf8; 19123 [ 19124 @normalization_tests, 19125 'done_testing();' 19126 ]); 19127 return; 19128} 19129 19130# Skip reasons, so will be exact same text and hence the files with each 19131# reason will get grouped together in perluniprops. 19132my $Documentation = "Documentation"; 19133my $Indic_Skip 19134 = "Provisional; for the analysis and processing of Indic scripts"; 19135my $Validation = "Validation Tests"; 19136my $Validation_Documentation = "Documentation of validation Tests"; 19137 19138# This is a list of the input files and how to handle them. The files are 19139# processed in their order in this list. Some reordering is possible if 19140# desired, but the PropertyAliases and PropValueAliases files should be first, 19141# and the extracted before the others (as data in an extracted file can be 19142# over-ridden by the non-extracted. Some other files depend on data derived 19143# from an earlier file, like UnicodeData requires data from Jamo, and the case 19144# changing and folding requires data from Unicode. Mostly, it is safest to 19145# order by first version releases in (except the Jamo). 19146# 19147# The version strings allow the program to know whether to expect a file or 19148# not, but if a file exists in the directory, it will be processed, even if it 19149# is in a version earlier than expected, so you can copy files from a later 19150# release into an earlier release's directory. 19151my @input_file_objects = ( 19152 Input_file->new('PropertyAliases.txt', v3.2, 19153 Handler => \&process_PropertyAliases, 19154 Early => [ \&substitute_PropertyAliases ], 19155 Required_Even_in_Debug_Skip => 1, 19156 ), 19157 Input_file->new(undef, v0, # No file associated with this 19158 Progress_Message => 'Finishing property setup', 19159 Handler => \&finish_property_setup, 19160 ), 19161 Input_file->new('PropValueAliases.txt', v3.2, 19162 Handler => \&process_PropValueAliases, 19163 Early => [ \&substitute_PropValueAliases ], 19164 Has_Missings_Defaults => $NOT_IGNORED, 19165 Required_Even_in_Debug_Skip => 1, 19166 ), 19167 Input_file->new("${EXTRACTED}DGeneralCategory.txt", v3.1.0, 19168 Property => 'General_Category', 19169 ), 19170 Input_file->new("${EXTRACTED}DCombiningClass.txt", v3.1.0, 19171 Property => 'Canonical_Combining_Class', 19172 Has_Missings_Defaults => $NOT_IGNORED, 19173 ), 19174 Input_file->new("${EXTRACTED}DNumType.txt", v3.1.0, 19175 Property => 'Numeric_Type', 19176 Has_Missings_Defaults => $NOT_IGNORED, 19177 ), 19178 Input_file->new("${EXTRACTED}DEastAsianWidth.txt", v3.1.0, 19179 Property => 'East_Asian_Width', 19180 Has_Missings_Defaults => $NOT_IGNORED, 19181 ), 19182 Input_file->new("${EXTRACTED}DLineBreak.txt", v3.1.0, 19183 Property => 'Line_Break', 19184 Has_Missings_Defaults => $NOT_IGNORED, 19185 ), 19186 Input_file->new("${EXTRACTED}DBidiClass.txt", v3.1.1, 19187 Property => 'Bidi_Class', 19188 Has_Missings_Defaults => $NOT_IGNORED, 19189 ), 19190 Input_file->new("${EXTRACTED}DDecompositionType.txt", v3.1.0, 19191 Property => 'Decomposition_Type', 19192 Has_Missings_Defaults => $NOT_IGNORED, 19193 ), 19194 Input_file->new("${EXTRACTED}DBinaryProperties.txt", v3.1.0), 19195 Input_file->new("${EXTRACTED}DNumValues.txt", v3.1.0, 19196 Property => 'Numeric_Value', 19197 Each_Line_Handler => \&filter_numeric_value_line, 19198 Has_Missings_Defaults => $NOT_IGNORED, 19199 ), 19200 Input_file->new("${EXTRACTED}DJoinGroup.txt", v3.1.0, 19201 Property => 'Joining_Group', 19202 Has_Missings_Defaults => $NOT_IGNORED, 19203 ), 19204 19205 Input_file->new("${EXTRACTED}DJoinType.txt", v3.1.0, 19206 Property => 'Joining_Type', 19207 Has_Missings_Defaults => $NOT_IGNORED, 19208 ), 19209 Input_file->new("${EXTRACTED}DName.txt", v10.0.0, 19210 Skip => 'This file adds no new information not already' 19211 . ' present in other files', 19212 # And it's unnecessary programmer work to handle this new 19213 # format. Previous Derived files actually had bug fixes 19214 # in them that were useful, but that should not be the 19215 # case here. 19216 ), 19217 Input_file->new('Jamo.txt', v2.0.0, 19218 Property => 'Jamo_Short_Name', 19219 Each_Line_Handler => \&filter_jamo_line, 19220 ), 19221 Input_file->new('UnicodeData.txt', v1.1.5, 19222 Pre_Handler => \&setup_UnicodeData, 19223 19224 # We clean up this file for some early versions. 19225 Each_Line_Handler => [ (($v_version lt v2.0.0 ) 19226 ? \&filter_v1_ucd 19227 : ($v_version eq v2.1.5) 19228 ? \&filter_v2_1_5_ucd 19229 19230 # And for 5.14 Perls with 6.0, 19231 # have to also make changes 19232 : ($v_version ge v6.0.0 19233 && $^V lt v5.17.0) 19234 ? \&filter_v6_ucd 19235 : undef), 19236 19237 # Early versions did not have the 19238 # proper Unicode_1 names for the 19239 # controls 19240 (($v_version lt v3.0.0) 19241 ? \&filter_early_U1_names 19242 : undef), 19243 19244 # Early versions did not correctly 19245 # use the later method for giving 19246 # decimal digit values 19247 (($v_version le v3.2.0) 19248 ? \&filter_bad_Nd_ucd 19249 : undef), 19250 19251 # And the main filter 19252 \&filter_UnicodeData_line, 19253 ], 19254 EOF_Handler => \&EOF_UnicodeData, 19255 ), 19256 Input_file->new('CJKXREF.TXT', v1.1.5, 19257 Withdrawn => v2.0.0, 19258 Skip => 'Gives the mapping of CJK code points ' 19259 . 'between Unicode and various other standards', 19260 ), 19261 Input_file->new('ArabicShaping.txt', v2.0.0, 19262 Each_Line_Handler => 19263 ($v_version lt 4.1.0) 19264 ? \&filter_old_style_arabic_shaping 19265 : undef, 19266 # The first field after the range is a "schematic name" 19267 # not used by Perl 19268 Properties => [ '<ignored>', 'Joining_Type', 'Joining_Group' ], 19269 Has_Missings_Defaults => $NOT_IGNORED, 19270 ), 19271 Input_file->new('Blocks.txt', v2.0.0, 19272 Property => 'Block', 19273 Has_Missings_Defaults => $NOT_IGNORED, 19274 Each_Line_Handler => \&filter_blocks_lines 19275 ), 19276 Input_file->new('Index.txt', v2.0.0, 19277 Skip => 'Alphabetical index of Unicode characters', 19278 ), 19279 Input_file->new('NamesList.txt', v2.0.0, 19280 Skip => 'Annotated list of characters', 19281 ), 19282 Input_file->new('PropList.txt', v2.0.0, 19283 Each_Line_Handler => (($v_version lt v3.1.0) 19284 ? \&filter_old_style_proplist 19285 : undef), 19286 ), 19287 Input_file->new('Props.txt', v2.0.0, 19288 Withdrawn => v3.0.0, 19289 Skip => 'A subset of F<PropList.txt> (which is used instead)', 19290 ), 19291 Input_file->new('ReadMe.txt', v2.0.0, 19292 Skip => $Documentation, 19293 ), 19294 Input_file->new('Unihan.txt', v2.0.0, 19295 Withdrawn => v5.2.0, 19296 Construction_Time_Handler => \&construct_unihan, 19297 Pre_Handler => \&setup_unihan, 19298 Optional => [ "", 19299 'Unicode_Radical_Stroke' 19300 ], 19301 Each_Line_Handler => \&filter_unihan_line, 19302 ), 19303 Input_file->new('SpecialCasing.txt', v2.1.8, 19304 Each_Line_Handler => ($v_version eq 2.1.8) 19305 ? \&filter_2_1_8_special_casing_line 19306 : \&filter_special_casing_line, 19307 Pre_Handler => \&setup_special_casing, 19308 Has_Missings_Defaults => $IGNORED, 19309 ), 19310 Input_file->new( 19311 'LineBreak.txt', v3.0.0, 19312 Has_Missings_Defaults => $NOT_IGNORED, 19313 Property => 'Line_Break', 19314 # Early versions had problematic syntax 19315 Each_Line_Handler => ($v_version ge v3.1.0) 19316 ? undef 19317 : ($v_version lt v3.0.0) 19318 ? \&filter_substitute_lb 19319 : \&filter_early_ea_lb, 19320 # Must use long names for property values see comments at 19321 # sub filter_substitute_lb 19322 Early => [ "LBsubst.txt", '_Perl_LB', 'Alphabetic', 19323 'Alphabetic', # default to this because XX -> 19324 # AL 19325 19326 # Don't use _Perl_LB as a synonym for 19327 # Line_Break in later perls, as it is tailored 19328 # and isn't the same as Line_Break 19329 'ONLY_EARLY' ], 19330 ), 19331 Input_file->new('EastAsianWidth.txt', v3.0.0, 19332 Property => 'East_Asian_Width', 19333 Has_Missings_Defaults => $NOT_IGNORED, 19334 # Early versions had problematic syntax 19335 Each_Line_Handler => (($v_version lt v3.1.0) 19336 ? \&filter_early_ea_lb 19337 : undef), 19338 ), 19339 Input_file->new('CompositionExclusions.txt', v3.0.0, 19340 Property => 'Composition_Exclusion', 19341 ), 19342 Input_file->new('UnicodeData.html', v3.0.0, 19343 Withdrawn => v4.0.1, 19344 Skip => $Documentation, 19345 ), 19346 Input_file->new('BidiMirroring.txt', v3.0.1, 19347 Property => 'Bidi_Mirroring_Glyph', 19348 Has_Missings_Defaults => ($v_version lt v6.2.0) 19349 ? $NO_DEFAULTS 19350 # Is <none> which doesn't mean 19351 # anything to us, we will use the 19352 # null string 19353 : $IGNORED, 19354 ), 19355 Input_file->new('NamesList.html', v3.0.0, 19356 Skip => 'Describes the format and contents of ' 19357 . 'F<NamesList.txt>', 19358 ), 19359 Input_file->new('UnicodeCharacterDatabase.html', v3.0.0, 19360 Withdrawn => v5.1, 19361 Skip => $Documentation, 19362 ), 19363 Input_file->new('CaseFolding.txt', v3.0.1, 19364 Pre_Handler => \&setup_case_folding, 19365 Each_Line_Handler => 19366 [ ($v_version lt v3.1.0) 19367 ? \&filter_old_style_case_folding 19368 : undef, 19369 \&filter_case_folding_line 19370 ], 19371 Has_Missings_Defaults => $IGNORED, 19372 ), 19373 Input_file->new("NormTest.txt", v3.0.1, 19374 Handler => \&process_NormalizationsTest, 19375 Skip => ($make_norm_test_script) ? 0 : $Validation, 19376 ), 19377 Input_file->new('DCoreProperties.txt', v3.1.0, 19378 # 5.2 changed this file 19379 Has_Missings_Defaults => (($v_version ge v5.2.0) 19380 ? $NOT_IGNORED 19381 : $NO_DEFAULTS), 19382 ), 19383 Input_file->new('DProperties.html', v3.1.0, 19384 Withdrawn => v3.2.0, 19385 Skip => $Documentation, 19386 ), 19387 Input_file->new('PropList.html', v3.1.0, 19388 Withdrawn => v5.1, 19389 Skip => $Documentation, 19390 ), 19391 Input_file->new('Scripts.txt', v3.1.0, 19392 Property => 'Script', 19393 Each_Line_Handler => (($v_version le v4.0.0) 19394 ? \&filter_all_caps_script_names 19395 : undef), 19396 Has_Missings_Defaults => $NOT_IGNORED, 19397 ), 19398 Input_file->new('DNormalizationProps.txt', v3.1.0, 19399 Has_Missings_Defaults => $NOT_IGNORED, 19400 Each_Line_Handler => (($v_version lt v4.0.1) 19401 ? \&filter_old_style_normalization_lines 19402 : undef), 19403 ), 19404 Input_file->new('DerivedProperties.html', v3.1.1, 19405 Withdrawn => v5.1, 19406 Skip => $Documentation, 19407 ), 19408 Input_file->new('DAge.txt', v3.2.0, 19409 Has_Missings_Defaults => $NOT_IGNORED, 19410 Property => 'Age' 19411 ), 19412 Input_file->new('HangulSyllableType.txt', v4.0, 19413 Has_Missings_Defaults => $NOT_IGNORED, 19414 Early => [ \&generate_hst, 'Hangul_Syllable_Type' ], 19415 Property => 'Hangul_Syllable_Type' 19416 ), 19417 Input_file->new('NormalizationCorrections.txt', v3.2.0, 19418 # This documents the cumulative fixes to erroneous 19419 # normalizations in earlier Unicode versions. Its main 19420 # purpose is so that someone running on an earlier 19421 # version can use this file to override what got 19422 # published in that earlier release. It would be easy 19423 # for mktables to handle this file. But all the 19424 # corrections in it should already be in the other files 19425 # for the release it is. To get it to actually mean 19426 # something useful, someone would have to be using an 19427 # earlier Unicode release, and copy it into the directory 19428 # for that release and recompile. So far there has been 19429 # no demand to do that, so this hasn't been implemented. 19430 Skip => 'Documentation of corrections already ' 19431 . 'incorporated into the Unicode data base', 19432 ), 19433 Input_file->new('StandardizedVariants.html', v3.2.0, 19434 Skip => 'Obsoleted as of Unicode 9.0, but previously ' 19435 . 'provided a visual display of the standard ' 19436 . 'variant sequences derived from ' 19437 . 'F<StandardizedVariants.txt>.', 19438 # I don't know why the html came earlier than the 19439 # .txt, but both are skipped anyway, so it doesn't 19440 # matter. 19441 ), 19442 Input_file->new('StandardizedVariants.txt', v4.0.0, 19443 Skip => 'Certain glyph variations for character display ' 19444 . 'are standardized. This lists the non-Unihan ' 19445 . 'ones; the Unihan ones are also not used by ' 19446 . 'Perl, and are in a separate Unicode data base ' 19447 . 'L<http://www.unicode.org/ivd>', 19448 ), 19449 Input_file->new('UCD.html', v4.0.0, 19450 Withdrawn => v5.2, 19451 Skip => $Documentation, 19452 ), 19453 Input_file->new("$AUXILIARY/WordBreakProperty.txt", v4.1.0, 19454 Early => [ "WBsubst.txt", '_Perl_WB', 'ALetter' ], 19455 Property => 'Word_Break', 19456 Has_Missings_Defaults => $NOT_IGNORED, 19457 ), 19458 Input_file->new("$AUXILIARY/GraphemeBreakProperty.txt", v4.1, 19459 Early => [ \&generate_GCB, '_Perl_GCB' ], 19460 Property => 'Grapheme_Cluster_Break', 19461 Has_Missings_Defaults => $NOT_IGNORED, 19462 ), 19463 Input_file->new("$AUXILIARY/GCBTest.txt", v4.1.0, 19464 Handler => \&process_GCB_test, 19465 retain_trailing_comments => 1, 19466 ), 19467 Input_file->new("$AUXILIARY/GraphemeBreakTest.html", v4.1.0, 19468 Skip => $Validation_Documentation, 19469 ), 19470 Input_file->new("$AUXILIARY/SBTest.txt", v4.1.0, 19471 Handler => \&process_SB_test, 19472 retain_trailing_comments => 1, 19473 ), 19474 Input_file->new("$AUXILIARY/SentenceBreakTest.html", v4.1.0, 19475 Skip => $Validation_Documentation, 19476 ), 19477 Input_file->new("$AUXILIARY/WBTest.txt", v4.1.0, 19478 Handler => \&process_WB_test, 19479 retain_trailing_comments => 1, 19480 ), 19481 Input_file->new("$AUXILIARY/WordBreakTest.html", v4.1.0, 19482 Skip => $Validation_Documentation, 19483 ), 19484 Input_file->new("$AUXILIARY/SentenceBreakProperty.txt", v4.1.0, 19485 Property => 'Sentence_Break', 19486 Early => [ "SBsubst.txt", '_Perl_SB', 'OLetter' ], 19487 Has_Missings_Defaults => $NOT_IGNORED, 19488 ), 19489 Input_file->new('NamedSequences.txt', v4.1.0, 19490 Handler => \&process_NamedSequences 19491 ), 19492 Input_file->new('Unihan.html', v4.1.0, 19493 Withdrawn => v5.2, 19494 Skip => $Documentation, 19495 ), 19496 Input_file->new('NameAliases.txt', v5.0, 19497 Property => 'Name_Alias', 19498 Each_Line_Handler => ($v_version le v6.0.0) 19499 ? \&filter_early_version_name_alias_line 19500 : \&filter_later_version_name_alias_line, 19501 ), 19502 # NameAliases.txt came along in v5.0. The above constructor handles 19503 # this. But until 6.1, it was lacking some information needed by core 19504 # perl. The constructor below handles that. It is either a kludge or 19505 # clever, depending on your point of view. The 'Withdrawn' parameter 19506 # indicates not to use it at all starting in 6.1 (so the above 19507 # constructor applies), and the 'v6.1' parameter indicates to use the 19508 # Early parameter before 6.1. Therefore 'Early" is always used, 19509 # yielding the internal-only property '_Perl_Name_Alias', which it 19510 # gets from a NameAliases.txt from 6.1 or later stored in 19511 # N_Asubst.txt. In combination with the above constructor, 19512 # 'Name_Alias' is publicly accessible starting with v5.0, and the 19513 # better 6.1 version is accessible to perl core in all releases. 19514 Input_file->new("NameAliases.txt", v6.1, 19515 Withdrawn => v6.1, 19516 Early => [ "N_Asubst.txt", '_Perl_Name_Alias', "" ], 19517 Property => 'Name_Alias', 19518 EOF_Handler => \&fixup_early_perl_name_alias, 19519 Each_Line_Handler => 19520 \&filter_later_version_name_alias_line, 19521 ), 19522 Input_file->new('NamedSqProv.txt', v5.0.0, 19523 Skip => 'Named sequences proposed for inclusion in a ' 19524 . 'later version of the Unicode Standard; if you ' 19525 . 'need them now, you can append this file to ' 19526 . 'F<NamedSequences.txt> and recompile perl', 19527 ), 19528 Input_file->new("$AUXILIARY/LBTest.txt", v5.1.0, 19529 Handler => \&process_LB_test, 19530 retain_trailing_comments => 1, 19531 ), 19532 Input_file->new("$AUXILIARY/LineBreakTest.html", v5.1.0, 19533 Skip => $Validation_Documentation, 19534 ), 19535 Input_file->new("BidiTest.txt", v5.2.0, 19536 Skip => $Validation, 19537 ), 19538 Input_file->new('UnihanIndicesDictionary.txt', v5.2.0, 19539 Optional => "", 19540 Each_Line_Handler => \&filter_unihan_line, 19541 ), 19542 Input_file->new('UnihanDataDictionaryLike.txt', v5.2.0, 19543 Optional => "", 19544 Each_Line_Handler => \&filter_unihan_line, 19545 ), 19546 Input_file->new('UnihanIRGSources.txt', v5.2.0, 19547 Optional => [ "", 19548 'kCompatibilityVariant', 19549 'kIICore', 19550 'kIRG_GSource', 19551 'kIRG_HSource', 19552 'kIRG_JSource', 19553 'kIRG_KPSource', 19554 'kIRG_MSource', 19555 'kIRG_KSource', 19556 'kIRG_SSource', 19557 'kIRG_TSource', 19558 'kIRG_USource', 19559 'kIRG_UKSource', 19560 'kIRG_VSource', 19561 ], 19562 Pre_Handler => \&setup_unihan, 19563 Each_Line_Handler => \&filter_unihan_line, 19564 ), 19565 Input_file->new('UnihanNumericValues.txt', v5.2.0, 19566 Optional => [ "", 19567 'kAccountingNumeric', 19568 'kOtherNumeric', 19569 'kPrimaryNumeric', 19570 ], 19571 Each_Line_Handler => \&filter_unihan_line, 19572 ), 19573 Input_file->new('UnihanOtherMappings.txt', v5.2.0, 19574 Optional => "", 19575 Each_Line_Handler => \&filter_unihan_line, 19576 ), 19577 Input_file->new('UnihanRadicalStrokeCounts.txt', v5.2.0, 19578 Optional => [ "", 19579 'Unicode_Radical_Stroke' 19580 ], 19581 Each_Line_Handler => \&filter_unihan_line, 19582 ), 19583 Input_file->new('UnihanReadings.txt', v5.2.0, 19584 Optional => "", 19585 Each_Line_Handler => \&filter_unihan_line, 19586 ), 19587 Input_file->new('UnihanVariants.txt', v5.2.0, 19588 Optional => "", 19589 Each_Line_Handler => \&filter_unihan_line, 19590 ), 19591 Input_file->new('CJKRadicals.txt', v5.2.0, 19592 Skip => 'Maps the kRSUnicode property values to ' 19593 . 'corresponding code points', 19594 ), 19595 Input_file->new('EmojiSources.txt', v6.0.0, 19596 Skip => 'Maps certain Unicode code points to their ' 19597 . 'legacy Japanese cell-phone values', 19598 ), 19599 # This file is actually not usable as-is until 6.1.0, because the property 19600 # is provisional, so its name is missing from PropertyAliases.txt until 19601 # that release, so that further work would have to be done to get it to 19602 # work properly 19603 Input_file->new('ScriptExtensions.txt', v6.0.0, 19604 Property => 'Script_Extensions', 19605 Early => [ sub {} ], # Doesn't do anything but ensures 19606 # that this isn't skipped for early 19607 # versions 19608 Pre_Handler => \&setup_script_extensions, 19609 Each_Line_Handler => \&filter_script_extensions_line, 19610 Has_Missings_Defaults => (($v_version le v6.0.0) 19611 ? $NO_DEFAULTS 19612 : $IGNORED), 19613 ), 19614 # These two Indic files are actually not usable as-is until 6.1.0, 19615 # because they are provisional, so their property values are missing from 19616 # PropValueAliases.txt until that release, so that further work would have 19617 # to be done to get them to work properly. 19618 Input_file->new('IndicMatraCategory.txt', v6.0.0, 19619 Withdrawn => v8.0.0, 19620 Property => 'Indic_Matra_Category', 19621 Has_Missings_Defaults => $NOT_IGNORED, 19622 Skip => $Indic_Skip, 19623 ), 19624 Input_file->new('IndicSyllabicCategory.txt', v6.0.0, 19625 Property => 'Indic_Syllabic_Category', 19626 Has_Missings_Defaults => $NOT_IGNORED, 19627 Skip => (($v_version lt v8.0.0) 19628 ? $Indic_Skip 19629 : 0), 19630 ), 19631 Input_file->new('USourceData.txt', v6.2.0, 19632 Skip => 'Documentation of status and cross reference of ' 19633 . 'proposals for encoding by Unicode of Unihan ' 19634 . 'characters', 19635 ), 19636 Input_file->new('USourceGlyphs.pdf', v6.2.0, 19637 Skip => 'Pictures of the characters in F<USourceData.txt>', 19638 ), 19639 Input_file->new('BidiBrackets.txt', v6.3.0, 19640 Properties => [ 'Bidi_Paired_Bracket', 19641 'Bidi_Paired_Bracket_Type' 19642 ], 19643 Has_Missings_Defaults => $NO_DEFAULTS, 19644 ), 19645 Input_file->new("BidiCharacterTest.txt", v6.3.0, 19646 Skip => $Validation, 19647 ), 19648 Input_file->new('IndicPositionalCategory.txt', v8.0.0, 19649 Property => 'Indic_Positional_Category', 19650 Has_Missings_Defaults => $NOT_IGNORED, 19651 ), 19652 Input_file->new('TangutSources.txt', v9.0.0, 19653 Skip => 'Specifies source mappings for Tangut ideographs' 19654 . ' and components. This data file also includes' 19655 . ' informative radical-stroke values that are used' 19656 . ' internally by Unicode', 19657 ), 19658 Input_file->new('VerticalOrientation.txt', v10.0.0, 19659 Property => 'Vertical_Orientation', 19660 Has_Missings_Defaults => $NOT_IGNORED, 19661 ), 19662 Input_file->new('NushuSources.txt', v10.0.0, 19663 Skip => 'Specifies source material for Nushu characters', 19664 ), 19665 Input_file->new('EquivalentUnifiedIdeograph.txt', v11.0.0, 19666 Property => 'Equivalent_Unified_Ideograph', 19667 Has_Missings_Defaults => $NOT_IGNORED, 19668 ), 19669 Input_file->new('EmojiData.txt', v11.0.0, 19670 # Is in UAX #51 and not the UCD, so must be updated 19671 # separately, and the first line edited to indicate the 19672 # UCD release we're pretending it to be in. The UTC says 19673 # this is a transitional state, and in fact was moved to 19674 # the UCD in 13.0 19675 Withdrawn => v13.0.0, 19676 Pre_Handler => \&setup_emojidata, 19677 Has_Missings_Defaults => $NOT_IGNORED, 19678 Each_Line_Handler => \&filter_emojidata_line, 19679 UCD => 0, 19680 ), 19681 Input_file->new("$EMOJI/emoji.txt", v13.0.0, 19682 Has_Missings_Defaults => $NOT_IGNORED, 19683 UCD => 0, 19684 ), 19685 Input_file->new("$EMOJI/ReadMe.txt", v13.0.0, 19686 Skip => $Documentation, 19687 UCD => 0, 19688 ), 19689 Input_file->new('IdStatus.txt', v13.0.0, 19690 Pre_Handler => \&setup_IdStatus, 19691 Property => 'Identifier_Status', 19692 UCD => 0, 19693 ), 19694 Input_file->new('IdType.txt', v13.0.0, 19695 Pre_Handler => \&setup_IdType, 19696 Each_Line_Handler => \&filter_IdType_line, 19697 Property => 'Identifier_Type', 19698 UCD => 0, 19699 ), 19700); 19701 19702# End of all the preliminaries. 19703# Do it... 19704 19705if (@missing_early_files) { 19706 print simple_fold(join_lines(<<END 19707 19708The compilation cannot be completed because one or more required input files, 19709listed below, are missing. This is because you are compiling Unicode version 19710$unicode_version, which predates the existence of these file(s). To fully 19711function, perl needs the data that these files would have contained if they 19712had been in this release. To work around this, create copies of later 19713versions of the missing files in the directory containing '$0'. (Perl will 19714make the necessary adjustments to the data to compensate for it not being the 19715same version as is being compiled.) The files are available from unicode.org, 19716via either ftp or http. If using http, they will be under 19717www.unicode.org/versions/. Below are listed the source file name of each 19718missing file, the Unicode version to copy it from, and the name to store it 19719as. (Note that the listed source file name may not be exactly the one that 19720Unicode calls it. If you don't find it, you can look it up in 'README.perl' 19721to get the correct name.) 19722END 19723 )); 19724 print simple_fold(join_lines("\n$_")) for @missing_early_files; 19725 exit 2; 19726} 19727 19728if ($compare_versions) { 19729 Carp::my_carp(<<END 19730Warning. \$compare_versions is set. Output is not suitable for production 19731END 19732 ); 19733} 19734 19735# Put into %potential_files a list of all the files in the directory structure 19736# that could be inputs to this program 19737File::Find::find({ 19738 wanted=>sub { 19739 return unless / \. ( txt | htm l? ) $ /xi; # Some platforms change the 19740 # name's case 19741 my $full = lc(File::Spec->rel2abs($_)); 19742 $potential_files{$full} = 1; 19743 return; 19744 } 19745}, File::Spec->curdir()); 19746 19747my @mktables_list_output_files; 19748my $old_start_time = 0; 19749my $old_options = ""; 19750 19751if (! -e $file_list) { 19752 print "'$file_list' doesn't exist, so forcing rebuild.\n" if $verbosity >= $VERBOSE; 19753 $write_unchanged_files = 1; 19754} elsif ($write_unchanged_files) { 19755 print "Not checking file list '$file_list'.\n" if $verbosity >= $VERBOSE; 19756} 19757else { 19758 print "Reading file list '$file_list'\n" if $verbosity >= $VERBOSE; 19759 my $file_handle; 19760 if (! open $file_handle, "<", $file_list) { 19761 Carp::my_carp("Failed to open '$file_list'; turning on -globlist option instead: $!"); 19762 $glob_list = 1; 19763 } 19764 else { 19765 my @input; 19766 19767 # Read and parse mktables.lst, placing the results from the first part 19768 # into @input, and the second part into @mktables_list_output_files 19769 for my $list ( \@input, \@mktables_list_output_files ) { 19770 while (<$file_handle>) { 19771 s/^ \s+ | \s+ $//xg; 19772 if (/^ \s* \# \s* Autogenerated\ starting\ on\ (\d+)/x) { 19773 $old_start_time = $1; 19774 next; 19775 } 19776 if (/^ \s* \# \s* From\ options\ (.+) /x) { 19777 $old_options = $1; 19778 next; 19779 } 19780 next if /^ \s* (?: \# .* )? $/x; 19781 last if /^ =+ $/x; 19782 my ( $file ) = split /\t/; 19783 push @$list, $file; 19784 } 19785 @$list = uniques(@$list); 19786 next; 19787 } 19788 19789 # Look through all the input files 19790 foreach my $input (@input) { 19791 next if $input eq 'version'; # Already have checked this. 19792 19793 # Ignore if doesn't exist. The checking about whether we care or 19794 # not is done via the Input_file object. 19795 next if ! file_exists($input); 19796 19797 # The paths are stored with relative names, and with '/' as the 19798 # delimiter; convert to absolute on this machine 19799 my $full = lc(File::Spec->rel2abs(internal_file_to_platform($input))); 19800 $potential_files{lc $full} = 1; 19801 } 19802 } 19803 19804 close $file_handle; 19805} 19806 19807if ($glob_list) { 19808 19809 # Here wants to process all .txt files in the directory structure. 19810 # Convert them to full path names. They are stored in the platform's 19811 # relative style 19812 my @known_files; 19813 foreach my $object (@input_file_objects) { 19814 my $file = $object->file; 19815 next unless defined $file; 19816 push @known_files, File::Spec->rel2abs($file); 19817 } 19818 19819 my @unknown_input_files; 19820 foreach my $file (keys %potential_files) { # The keys are stored in lc 19821 next if grep { $file eq lc($_) } @known_files; 19822 19823 # Here, the file is unknown to us. Get relative path name 19824 $file = File::Spec->abs2rel($file); 19825 push @unknown_input_files, $file; 19826 19827 # What will happen is we create a data structure for it, and add it to 19828 # the list of input files to process. First get the subdirectories 19829 # into an array 19830 my (undef, $directories, undef) = File::Spec->splitpath($file); 19831 $directories =~ s;/$;;; # Can have extraneous trailing '/' 19832 my @directories = File::Spec->splitdir($directories); 19833 19834 # If the file isn't extracted (meaning none of the directories is the 19835 # extracted one), just add it to the end of the list of inputs. 19836 if (! grep { $EXTRACTED_DIR eq $_ } @directories) { 19837 push @input_file_objects, Input_file->new($file, v0); 19838 } 19839 else { 19840 19841 # Here, the file is extracted. It needs to go ahead of most other 19842 # processing. Search for the first input file that isn't a 19843 # special required property (that is, find one whose first_release 19844 # is non-0), and isn't extracted. Also, the Age property file is 19845 # processed before the extracted ones, just in case 19846 # $compare_versions is set. 19847 for (my $i = 0; $i < @input_file_objects; $i++) { 19848 if ($input_file_objects[$i]->first_released ne v0 19849 && lc($input_file_objects[$i]->file) ne 'dage.txt' 19850 && $input_file_objects[$i]->file !~ /$EXTRACTED_DIR/i) 19851 { 19852 splice @input_file_objects, $i, 0, 19853 Input_file->new($file, v0); 19854 last; 19855 } 19856 } 19857 19858 } 19859 } 19860 if (@unknown_input_files) { 19861 print STDERR simple_fold(join_lines(<<END 19862 19863The following files are unknown as to how to handle. Assuming they are 19864typical property files. You'll know by later error messages if it worked or 19865not: 19866END 19867 ) . " " . join(", ", @unknown_input_files) . "\n\n"); 19868 } 19869} # End of looking through directory structure for more .txt files. 19870 19871# Create the list of input files from the objects we have defined, plus 19872# version 19873my @input_files = qw(version Makefile); 19874foreach my $object (@input_file_objects) { 19875 my $file = $object->file; 19876 next if ! defined $file; # Not all objects have files 19877 next if defined $object->skip;; 19878 push @input_files, $file; 19879} 19880 19881if ( $verbosity >= $VERBOSE ) { 19882 print "Expecting ".scalar( @input_files )." input files. ", 19883 "Checking ".scalar( @mktables_list_output_files )." output files.\n"; 19884} 19885 19886# We set $most_recent to be the most recently changed input file, including 19887# this program itself (done much earlier in this file) 19888foreach my $in (@input_files) { 19889 next unless -e $in; # Keep going even if missing a file 19890 my $mod_time = (stat $in)[9]; 19891 $most_recent = $mod_time if $mod_time > $most_recent; 19892 19893 # See that the input files have distinct names, to warn someone if they 19894 # are adding a new one 19895 if ($make_list) { 19896 my ($volume, $directories, $file ) = File::Spec->splitpath($in); 19897 $directories =~ s;/$;;; # Can have extraneous trailing '/' 19898 my @directories = File::Spec->splitdir($directories); 19899 construct_filename($file, 'mutable', \@directories); 19900 } 19901} 19902 19903# We use 'Makefile' just to see if it has changed since the last time we 19904# rebuilt. Now discard it. 19905@input_files = grep { $_ ne 'Makefile' } @input_files; 19906 19907my $rebuild = $write_unchanged_files # Rebuild: if unconditional rebuild 19908 || ! scalar @mktables_list_output_files # or if no outputs known 19909 || $old_start_time < $most_recent # or out-of-date 19910 || $old_options ne $command_line_arguments; # or with different 19911 # options 19912 19913# Now we check to see if any output files are older than youngest, if 19914# they are, we need to continue on, otherwise we can presumably bail. 19915if (! $rebuild) { 19916 foreach my $out (@mktables_list_output_files) { 19917 if ( ! file_exists($out)) { 19918 print "'$out' is missing.\n" if $verbosity >= $VERBOSE; 19919 $rebuild = 1; 19920 last; 19921 } 19922 #local $to_trace = 1 if main::DEBUG; 19923 trace $most_recent, (stat $out)[9] if main::DEBUG && $to_trace; 19924 if ( (stat $out)[9] <= $most_recent ) { 19925 #trace "$out: most recent mod time: ", (stat $out)[9], ", youngest: $most_recent\n" if main::DEBUG && $to_trace; 19926 print "'$out' is too old.\n" if $verbosity >= $VERBOSE; 19927 $rebuild = 1; 19928 last; 19929 } 19930 } 19931} 19932if (! $rebuild) { 19933 print "$0: Files seem to be ok, not bothering to rebuild. Add '-w' option to force build\n"; 19934 exit(0); 19935} 19936print "$0: Must rebuild tables.\n" if $verbosity >= $VERBOSE; 19937 19938# Ready to do the major processing. First create the perl pseudo-property. 19939$perl = Property->new('perl', Type => $NON_STRING, Perl_Extension => 1); 19940 19941# Process each input file 19942foreach my $file (@input_file_objects) { 19943 $file->run; 19944} 19945 19946# Finish the table generation. 19947 19948print "Finishing processing Unicode properties\n" if $verbosity >= $PROGRESS; 19949finish_Unicode(); 19950 19951# For the very specialized case of comparing two Unicode versions... 19952if (DEBUG && $compare_versions) { 19953 handle_compare_versions(); 19954} 19955 19956print "Compiling Perl properties\n" if $verbosity >= $PROGRESS; 19957compile_perl(); 19958 19959print "Creating Perl synonyms\n" if $verbosity >= $PROGRESS; 19960add_perl_synonyms(); 19961 19962print "Writing tables\n" if $verbosity >= $PROGRESS; 19963write_all_tables(); 19964 19965# Write mktables.lst 19966if ( $file_list and $make_list ) { 19967 19968 print "Updating '$file_list'\n" if $verbosity >= $PROGRESS; 19969 foreach my $file (@input_files, @files_actually_output) { 19970 my (undef, $directories, $basefile) = File::Spec->splitpath($file); 19971 my @directories = grep length, File::Spec->splitdir($directories); 19972 $file = join '/', @directories, $basefile; 19973 } 19974 19975 my $ofh; 19976 if (! open $ofh,">",$file_list) { 19977 Carp::my_carp("Can't write to '$file_list'. Skipping: $!"); 19978 return 19979 } 19980 else { 19981 my $localtime = localtime $start_time; 19982 print $ofh <<"END"; 19983# 19984# $file_list -- File list for $0. 19985# 19986# Autogenerated starting on $start_time ($localtime) 19987# From options $command_line_arguments 19988# 19989# - First section is input files 19990# ($0 itself is not listed but is automatically considered an input) 19991# - Section separator is /^=+\$/ 19992# - Second section is a list of output files. 19993# - Lines matching /^\\s*#/ are treated as comments 19994# which along with blank lines are ignored. 19995# 19996 19997# Input files: 19998 19999END 20000 print $ofh "$_\n" for sort(@input_files); 20001 print $ofh "\n=================================\n# Output files:\n\n"; 20002 print $ofh "$_\n" for sort @files_actually_output; 20003 print $ofh "\n# ",scalar(@input_files)," input files\n", 20004 "# ",scalar(@files_actually_output)+1," output files\n\n", 20005 "# End list\n"; 20006 close $ofh 20007 or Carp::my_carp("Failed to close $ofh: $!"); 20008 20009 print "Filelist has ",scalar(@input_files)," input files and ", 20010 scalar(@files_actually_output)+1," output files\n" 20011 if $verbosity >= $VERBOSE; 20012 } 20013} 20014 20015# Output these warnings unless -q explicitly specified. 20016if ($verbosity >= $NORMAL_VERBOSITY && ! $debug_skip) { 20017 if (@unhandled_properties) { 20018 print "\nProperties and tables that unexpectedly have no code points\n"; 20019 foreach my $property (sort @unhandled_properties) { 20020 print $property, "\n"; 20021 } 20022 } 20023 20024 if (%potential_files) { 20025 print "\nInput files that are not considered:\n"; 20026 foreach my $file (sort keys %potential_files) { 20027 print File::Spec->abs2rel($file), "\n"; 20028 } 20029 } 20030 print "\nAll done\n" if $verbosity >= $VERBOSE; 20031} 20032 20033if ($version_of_mk_invlist_bounds lt $v_version) { 20034 Carp::my_carp("WARNING: \\b{} algorithms (regen/mk_invlist.pl) need" 20035 . " to be checked and possibly updated to Unicode" 20036 . " $string_version. Failing tests will be marked TODO"); 20037} 20038 20039exit(0); 20040 20041# TRAILING CODE IS USED BY make_property_test_script() 20042__DATA__ 20043 20044use strict; 20045use warnings; 20046 20047use feature 'signatures'; 20048 20049no warnings 'experimental::signatures'; 20050no warnings 'experimental::uniprop_wildcards'; 20051 20052# Test qr/\X/ and the \p{} regular expression constructs. This file is 20053# constructed by mktables from the tables it generates, so if mktables is 20054# buggy, this won't necessarily catch those bugs. Tests are generated for all 20055# feasible properties; a few aren't currently feasible; see 20056# is_code_point_usable() in mktables for details. 20057 20058# Standard test packages are not used because this manipulates SIG_WARN. It 20059# exits 0 if every non-skipped test succeeded; -1 if any failed. 20060 20061my $Tests = 0; 20062my $Fails = 0; 20063 20064# loc_tools.pl requires this function to be defined 20065sub ok($pass, @msg) { 20066 print "not " unless $pass; 20067 print "ok "; 20068 print ++$Tests; 20069 print " - ", join "", @msg if @msg; 20070 print "\n"; 20071} 20072 20073sub Expect($expected, $ord, $regex, $warning_type='') { 20074 my $line = (caller)[2]; 20075 20076 # Convert the code point to hex form 20077 my $string = sprintf "\"\\x{%04X}\"", $ord; 20078 20079 my @tests = ""; 20080 20081 # The first time through, use all warnings. If the input should generate 20082 # a warning, add another time through with them turned off 20083 push @tests, "no warnings '$warning_type';" if $warning_type; 20084 20085 foreach my $no_warnings (@tests) { 20086 20087 # Store any warning messages instead of outputting them 20088 local $SIG{__WARN__} = $SIG{__WARN__}; 20089 my $warning_message; 20090 $SIG{__WARN__} = sub { $warning_message = $_[0] }; 20091 20092 $Tests++; 20093 20094 # A string eval is needed because of the 'no warnings'. 20095 # Assumes no parentheses in the regular expression 20096 my $result = eval "$no_warnings 20097 my \$RegObj = qr($regex); 20098 $string =~ \$RegObj ? 1 : 0"; 20099 if (not defined $result) { 20100 print "not ok $Tests - couldn't compile /$regex/; line $line: $@\n"; 20101 $Fails++; 20102 } 20103 elsif ($result ^ $expected) { 20104 print "not ok $Tests - expected $expected but got $result for $string =~ qr/$regex/; line $line\n"; 20105 $Fails++; 20106 } 20107 elsif ($warning_message) { 20108 if (! $warning_type || ($warning_type && $no_warnings)) { 20109 print "not ok $Tests - for qr/$regex/ did not expect warning message '$warning_message'; line $line\n"; 20110 $Fails++; 20111 } 20112 else { 20113 print "ok $Tests - expected and got a warning message for qr/$regex/; line $line\n"; 20114 } 20115 } 20116 elsif ($warning_type && ! $no_warnings) { 20117 print "not ok $Tests - for qr/$regex/ expected a $warning_type warning message, but got none; line $line\n"; 20118 $Fails++; 20119 } 20120 else { 20121 print "ok $Tests - got $result for $string =~ qr/$regex/; line $line\n"; 20122 } 20123 } 20124 return; 20125} 20126 20127sub Error($regex) { 20128 $Tests++; 20129 if (eval { 'x' =~ qr/$regex/; 1 }) { 20130 $Fails++; 20131 my $line = (caller)[2]; 20132 print "not ok $Tests - re compiled ok, but expected error for qr/$regex/; line $line: $@\n"; 20133 } 20134 else { 20135 my $line = (caller)[2]; 20136 print "ok $Tests - got and expected error for qr/$regex/; line $line\n"; 20137 } 20138 return; 20139} 20140 20141# Break test files (e.g. GCBTest.txt) character that break allowed here 20142my $breakable_utf8 = my $breakable = chr(utf8::unicode_to_native(0xF7)); 20143utf8::upgrade($breakable_utf8); 20144 20145# Break test files (e.g. GCBTest.txt) character that indicates can't break 20146# here 20147my $nobreak_utf8 = my $nobreak = chr(utf8::unicode_to_native(0xD7)); 20148utf8::upgrade($nobreak_utf8); 20149 20150my $are_ctype_locales_available; 20151my $utf8_locale; 20152chdir 't' if -d 't'; 20153eval { require "./loc_tools.pl" }; 20154if (defined &locales_enabled) { 20155 $are_ctype_locales_available = locales_enabled('LC_CTYPE'); 20156 if ($are_ctype_locales_available) { 20157 $utf8_locale = &find_utf8_ctype_locale; 20158 } 20159} 20160 20161# Eval'd so can run on versions earlier than the property is available in 20162my $WB_Extend_or_Format_re = eval 'qr/[\p{WB=Extend}\p{WB=Format}\p{WB=ZWJ}]/'; 20163if (! defined $WB_Extend_or_Format_re) { 20164 $WB_Extend_or_Format_re = eval 'qr/[\p{WB=Extend}\p{WB=Format}]/'; 20165} 20166 20167sub _test_break($template, $break_type) { 20168 # Test various break property matches. The 2nd parameter gives the 20169 # property name. The input is a line from auxiliary/*Test.txt for the 20170 # given property. Each such line is a sequence of Unicode (not native) 20171 # code points given by their hex numbers, separated by the two characters 20172 # defined just before this subroutine that indicate that either there can 20173 # or cannot be a break between the adjacent code points. All these are 20174 # tested. 20175 # 20176 # For the gcb property extra tests are made. if there isn't a break, that 20177 # means the sequence forms an extended grapheme cluster, which means that 20178 # \X should match the whole thing. If there is a break, \X should stop 20179 # there. This is all converted by this routine into a match: $string =~ 20180 # /(\X)/, Each \X should match the next cluster; and that is what is 20181 # checked. 20182 20183 my $line = (caller 1)[2]; # Line number 20184 my $comment = ""; 20185 20186 if ($template =~ / ( .*? ) \s* \# (.*) /x) { 20187 $template = $1; 20188 $comment = $2; 20189 20190 # Replace leading spaces with a single one. 20191 $comment =~ s/ ^ \s* / # /x; 20192 } 20193 20194 # The line contains characters above the ASCII range, but in Latin1. It 20195 # may or may not be in utf8, and if it is, it may or may not know it. So, 20196 # convert these characters to 8 bits. If knows is in utf8, simply 20197 # downgrade. 20198 if (utf8::is_utf8($template)) { 20199 utf8::downgrade($template); 20200 } else { 20201 20202 # Otherwise, if it is in utf8, but doesn't know it, the next lines 20203 # convert the two problematic characters to their 8-bit equivalents. 20204 # If it isn't in utf8, they don't harm anything. 20205 use bytes; 20206 $template =~ s/$nobreak_utf8/$nobreak/g; 20207 $template =~ s/$breakable_utf8/$breakable/g; 20208 } 20209 20210 # Perl customizes wb. So change the official tests accordingly 20211 if ($break_type eq 'wb' && $WB_Extend_or_Format_re) { 20212 20213 # Split into elements that alternate between code point and 20214 # break/no-break 20215 my @line = split / +/, $template; 20216 20217 # Look at each code point and its following one 20218 for (my $i = 1; $i < @line - 1 - 1; $i+=2) { 20219 20220 # The customization only involves changing some breaks to 20221 # non-breaks. 20222 next if $line[$i+1] =~ /$nobreak/; 20223 20224 my $lhs = chr utf8::unicode_to_native(hex $line[$i]); 20225 my $rhs = chr utf8::unicode_to_native(hex $line[$i+2]); 20226 20227 # And it only affects adjacent space characters. 20228 next if $lhs !~ /\s/u; 20229 20230 # But, we want to make sure to test spaces followed by a Extend 20231 # or Format. 20232 next if $rhs !~ /\s|$WB_Extend_or_Format_re/; 20233 20234 # To test the customization, add some white-space before this to 20235 # create a span. The $lhs white space may or may not be bound to 20236 # that span, and also with the $rhs. If the $rhs is a binding 20237 # character, the $lhs is bound to it and not to the span, unless 20238 # $lhs is vertical space. In all other cases, the $lhs is bound 20239 # to the span. If the $rhs is white space, it is bound to the 20240 # $lhs 20241 my $bound; 20242 my $span; 20243 if ($rhs =~ /$WB_Extend_or_Format_re/) { 20244 if ($lhs =~ /\v/) { 20245 $bound = $breakable; 20246 $span = $nobreak; 20247 } 20248 else { 20249 $bound = $nobreak; 20250 $span = $breakable; 20251 } 20252 } 20253 else { 20254 $span = $nobreak; 20255 $bound = $nobreak; 20256 } 20257 20258 splice @line, $i, 0, ( '0020', $nobreak, '0020', $span); 20259 $i += 4; 20260 $line[$i+1] = $bound; 20261 } 20262 $template = join " ", @line; 20263 } 20264 20265 # The input is just the break/no-break symbols and sequences of Unicode 20266 # code points as hex digits separated by spaces for legibility. e.g.: 20267 # ÷ 0020 × 0308 ÷ 0020 ÷ 20268 # Convert to native \x format 20269 $template =~ s/ \s* ( [[:xdigit:]]+ ) \s* /sprintf("\\x{%02X}", utf8::unicode_to_native(hex $1))/gex; 20270 $template =~ s/ \s* //gx; # Probably the line above removed all spaces; 20271 # but be sure 20272 20273 # Make a copy of the input with the symbols replaced by \b{} and \B{} as 20274 # appropriate 20275 my $break_pattern = $template =~ s/ $breakable /\\b{$break_type}/grx; 20276 $break_pattern =~ s/ $nobreak /\\B{$break_type}/gx; 20277 20278 my $display_string = $template =~ s/[$breakable$nobreak]//gr; 20279 my $string = eval "\"$display_string\""; 20280 20281 # The remaining massaging of the input is for the \X tests. Get rid of 20282 # the leading and trailing breakables 20283 $template =~ s/^ \s* $breakable \s* //x; 20284 $template =~ s/ \s* $breakable \s* $ //x; 20285 20286 # Delete no-breaks 20287 $template =~ s/ \s* $nobreak \s* //xg; 20288 20289 # Split the input into segments that are breakable between them. 20290 my @should_display = split /\s*$breakable\s*/, $template; 20291 my @should_match = map { eval "\"$_\"" } @should_display; 20292 20293 # If a string can be represented in both non-ut8 and utf8, test both cases 20294 my $display_upgrade = ""; 20295 UPGRADE: 20296 for my $to_upgrade (0 .. 1) { 20297 20298 if ($to_upgrade) { 20299 20300 # If already in utf8, would just be a repeat 20301 next UPGRADE if utf8::is_utf8($string); 20302 20303 utf8::upgrade($string); 20304 $display_upgrade = " (utf8-upgraded)"; 20305 } 20306 20307 my @modifiers = qw(a aa d u i); 20308 if ($are_ctype_locales_available) { 20309 push @modifiers, "l$utf8_locale" if defined $utf8_locale; 20310 20311 # The /l modifier has C after it to indicate the locale to try 20312 push @modifiers, "lC"; 20313 } 20314 20315 # Test for each of the regex modifiers. 20316 for my $modifier (@modifiers) { 20317 my $display_locale = ""; 20318 20319 # For /l, set the locale to what it says to. 20320 if ($modifier =~ / ^ l (.*) /x) { 20321 my $locale = $1; 20322 $display_locale = "(locale = $locale)"; 20323 POSIX::setlocale(&POSIX::LC_CTYPE, $locale); 20324 $modifier = 'l'; 20325 } 20326 20327 no warnings qw(locale regexp surrogate); 20328 my $pattern = "(?$modifier:$break_pattern)"; 20329 20330 # Actually do the test 20331 my $matched_text; 20332 my $matched = $string =~ qr/$pattern/; 20333 if ($matched) { 20334 $matched_text = "matched"; 20335 } 20336 else { 20337 $matched_text = "failed to match"; 20338 print "not "; 20339 20340 if (TODO_FAILING_BREAKS) { 20341 $comment = " # $comment" unless $comment =~ / ^ \s* \# /x; 20342 $comment =~ s/#/# TODO/; 20343 } 20344 } 20345 print "ok ", ++$Tests, " - \"$display_string\" $matched_text /$pattern/$display_upgrade; line $line $display_locale$comment\n"; 20346 20347 # Only print the comment on the first use of this line 20348 $comment = ""; 20349 20350 # Repeat with the first \B{} in the pattern. This makes sure the 20351 # code in regexec.c:find_byclass() for \B gets executed 20352 if ($pattern =~ / ( .*? : ) .* ( \\B\{ .* ) /x) { 20353 my $B_pattern = "$1$2"; 20354 $matched = $string =~ qr/$B_pattern/; 20355 print "not " unless $matched; 20356 $matched_text = ($matched) ? "matched" : "failed to match"; 20357 print "ok ", ++$Tests, " - \"$display_string\" $matched_text /$B_pattern/$display_upgrade; line $line $display_locale"; 20358 print " # TODO" if TODO_FAILING_BREAKS && ! $matched; 20359 print "\n"; 20360 } 20361 } 20362 20363 next if $break_type ne 'gcb'; 20364 20365 # Finally, do the \X match. 20366 my @matches = $string =~ /(\X)/g; 20367 20368 # Look through each matched cluster to verify that it matches what we 20369 # expect. 20370 my $min = (@matches < @should_match) ? @matches : @should_match; 20371 for my $i (0 .. $min - 1) { 20372 $Tests++; 20373 if ($matches[$i] eq $should_match[$i]) { 20374 print "ok $Tests - "; 20375 if ($i == 0) { 20376 print "In \"$display_string\" =~ /(\\X)/g, \\X #1"; 20377 } else { 20378 print "And \\X #", $i + 1, 20379 } 20380 print " correctly matched $should_display[$i]; line $line\n"; 20381 } else { 20382 $matches[$i] = join("", map { sprintf "\\x{%04X}", ord $_ } 20383 split "", $matches[$i]); 20384 print "not ok $Tests -"; 20385 print " # TODO" if TODO_FAILING_BREAKS; 20386 print " In \"$display_string\" =~ /(\\X)/g, \\X #", 20387 $i + 1, 20388 " should have matched $should_display[$i]", 20389 " but instead matched $matches[$i]", 20390 ". Abandoning rest of line $line\n"; 20391 next UPGRADE; 20392 } 20393 } 20394 20395 # And the number of matches should equal the number of expected matches. 20396 $Tests++; 20397 if (@matches == @should_match) { 20398 print "ok $Tests - Nothing was left over; line $line\n"; 20399 } else { 20400 print "not ok $Tests - There were ", scalar @should_match, " \\X matches expected, but got ", scalar @matches, " instead; line $line"; 20401 print " # TODO" if TODO_FAILING_BREAKS; 20402 print "\n"; 20403 } 20404 } 20405 20406 return; 20407} 20408 20409sub Test_GCB($t) { 20410 _test_break($t, 'gcb'); 20411} 20412 20413sub Test_LB($t) { 20414 _test_break($t, 'lb'); 20415} 20416 20417sub Test_SB($t) { 20418 _test_break($t, 'sb'); 20419} 20420 20421sub Test_WB($t) { 20422 _test_break($t, 'wb'); 20423} 20424 20425sub Finished() { 20426 print "1..$Tests\n"; 20427 exit($Fails ? -1 : 0); 20428} 20429 20430