1#!perl -w 2use v5.15.8; 3use strict; 4use warnings; 5require './regen/regen_lib.pl'; 6require './regen/charset_translations.pl'; 7use Unicode::UCD 'prop_invlist'; 8 9# This program outputs l1_charclass_tab.h, which defines the guts of the 10# PL_charclass table. Each line is a bit map of properties that the Unicode 11# code point at the corresponding position in the table array has. The first 12# line corresponds to code point 0x0, NULL, the last line to 0xFF. For 13# an application to see if the code point "i" has a particular property, it 14# just does 15# 'PL_charclass[i] & BIT' 16# The bit names are of the form 'CC_property_suffix_', where 'CC' stands for 17# character class, and 'property' is the corresponding property, and 'suffix' 18# is one of '_A' to mean the property is true only if the corresponding code 19# point is ASCII, and '_L1' means that the range includes any Latin1 20# character (ISO-8859-1 including the C0 and C1 controls). A property without 21# these suffixes does not have different forms for both ranges. 22 23# This program need be run only when adding new properties to it, or upon a 24# new Unicode release, to make sure things haven't been changed by it. 25 26# keys are the names of the bits; values are what generates the code points 27# that have the bit set, or 0 if \p{key} is the generator 28my %bit_names = ( 29 NONLATIN1_SIMPLE_FOLD => \&Non_Latin1_Simple_Folds, 30 NONLATIN1_FOLD => \&Non_Latin1_Folds, 31 ALPHANUMERIC => 'Alnum', # Like \w, but no underscore 32 ALPHA => 'XPosixAlpha', 33 ASCII => 0, 34 BLANK => 0, 35 CASED => 0, 36 CHARNAME_CONT => '_Perl_Charname_Continue', 37 CNTRL => 0, 38 DIGIT => 0, 39 GRAPH => 0, 40 IDFIRST => \&Id_First, 41 LOWER => 'XPosixLower', 42 NON_FINAL_FOLD => \&Non_Final_Folds, 43 PRINT => 0, 44 PUNCT => \&Punct_and_Symbols, 45 QUOTEMETA => '_Perl_Quotemeta', 46 SPACE => 'XPerlSpace', 47 UPPER => 'XPosixUpper', 48 WORDCHAR => 'XPosixWord', 49 XDIGIT => 0, 50 VERTSPACE => 0, 51 IS_IN_SOME_FOLD => '_Perl_Any_Folds', 52 BINDIGIT => [ ord '0', ord '1' ], 53 OCTDIGIT => [ ord '0', ord '1', ord '2', ord '3', 54 ord '4', ord '5', ord '6', ord '7' ], 55 56 # These are the control characters that there are mnemonics for 57 MNEMONIC_CNTRL => [ ord "\a", ord "\b", ord "\e", ord "\f", 58 ord "\n", ord "\r", ord "\t" ], 59); 60 61sub uniques { 62 # Returns non-duplicated input values. From "Perl Best Practices: 63 # Encapsulated Cleverness". p. 455 in first edition. 64 65 my %seen; 66 return grep { ! $seen{$_}++ } @_; 67} 68 69sub expand_invlist { 70 # Return the code points that are in the inversion list given by the 71 # argument 72 73 my $invlist_ref = shift; 74 my $i; 75 my @full_list; 76 77 for (my $i = 0; $i < @$invlist_ref; $i += 2) { 78 my $upper = ($i + 1) < @$invlist_ref 79 ? $invlist_ref->[$i+1] - 1 # In range 80 : $Unicode::UCD::MAX_CP; # To infinity. 81 for my $j ($invlist_ref->[$i] .. $upper) { 82 push @full_list, $j; 83 } 84 } 85 86 return @full_list; 87} 88 89# Read in the case fold mappings. 90my %folded_closure; 91my %simple_folded_closure; 92my @non_final_folds; 93my @non_latin1_simple_folds; 94my @folds; 95use Unicode::UCD; 96 97# Use the Unicode data file if we are on an ASCII platform (which its data 98# is for), and it is in the modern format (starting in Unicode 3.1.0) and 99# it is available. This avoids being affected by potential bugs 100# introduced by other layers of Perl 101my $file="lib/unicore/CaseFolding.txt"; 102 103if (ord('A') == 65 104 && pack("C*", split /\./, Unicode::UCD::UnicodeVersion()) ge v3.1.0 105 && open my $fh, "<", $file) 106{ 107 @folds = <$fh>; 108} 109else { 110 my ($invlist_ref, $invmap_ref, undef, $default) 111 = Unicode::UCD::prop_invmap('Case_Folding'); 112 for my $i (0 .. @$invlist_ref - 1 - 1) { 113 next if $invmap_ref->[$i] == $default; 114 my $adjust = -1; 115 for my $j ($invlist_ref->[$i] .. $invlist_ref->[$i+1] -1) { 116 $adjust++; 117 118 # Single-code point maps go to a 'C' type 119 if (! ref $invmap_ref->[$i]) { 120 push @folds, sprintf("%04X; C; %04X\n", 121 $j, 122 $invmap_ref->[$i] + $adjust); 123 } 124 else { # Multi-code point maps go to 'F'. prop_invmap() 125 # guarantees that no adjustment is needed for these, 126 # as the range will contain just one element 127 push @folds, sprintf("%04X; F; %s\n", 128 $j, 129 join " ", map { sprintf "%04X", $_ } 130 @{$invmap_ref->[$i]}); 131 } 132 } 133 } 134} 135 136for (@folds) { 137 chomp; 138 139 # Lines look like (without the initial '#' 140 #0130; F; 0069 0307; # LATIN CAPITAL LETTER I WITH DOT ABOVE 141 # Get rid of comments, ignore blank or comment-only lines 142 my $line = $_ =~ s/ (?: \s* \# .* )? $ //rx; 143 next unless length $line; 144 my ($hex_from, $fold_type, @folded) = split /[\s;]+/, $line; 145 146 my $from = hex $hex_from; 147 148 # Perl only deals with S, C, and F folds 149 next if $fold_type ne 'C' and $fold_type ne 'F' and $fold_type ne 'S'; 150 151 # Get each code point in the range that participates in this line's fold. 152 # The hash has keys of each code point in the range, and values of what it 153 # folds to and what folds to it 154 for my $i (0 .. @folded - 1) { 155 my $fold = hex $folded[$i]; 156 if ($fold < 256) { 157 push @{$folded_closure{$fold}}, $from; 158 push @{$simple_folded_closure{$fold}}, $from if $fold_type ne 'F'; 159 } 160 if ($from < 256) { 161 push @{$folded_closure{$from}}, $fold; 162 push @{$simple_folded_closure{$from}}, $fold if $fold_type ne 'F'; 163 } 164 165 if (($fold_type eq 'C' || $fold_type eq 'S') 166 && ($fold < 256 != $from < 256)) 167 { 168 # Fold is simple (hence can't be a non-final fold, so the 'if' 169 # above is mutualy exclusive from the 'if below) and crosses 170 # 255/256 boundary. We keep track of the Latin1 code points 171 # in such folds. 172 push @non_latin1_simple_folds, ($fold < 256) 173 ? $fold 174 : $from; 175 } 176 elsif ($i < @folded-1 177 && $fold < 256 178 && ! grep { $_ == $fold } @non_final_folds) 179 { 180 push @non_final_folds, $fold; 181 182 # Also add the upper case, which in the latin1 range folds to 183 # $fold 184 push @non_final_folds, ord uc chr $fold; 185 } 186 } 187} 188 189# Now having read all the lines, combine them into the full closure of each 190# code point in the range by adding lists together that share a common 191# element 192foreach my $folded (keys %folded_closure) { 193 foreach my $from (grep { $_ < 256 } @{$folded_closure{$folded}}) { 194 push @{$folded_closure{$from}}, @{$folded_closure{$folded}}; 195 } 196} 197foreach my $folded (keys %simple_folded_closure) { 198 foreach my $from (grep { $_ < 256 } @{$simple_folded_closure{$folded}}) { 199 push @{$simple_folded_closure{$from}}, @{$simple_folded_closure{$folded}}; 200 } 201} 202 203# We have the single-character folds that cross the 255/256, like KELVIN 204# SIGN => 'k', but we need the closure, so add like 'K' to it 205foreach my $folded (@non_latin1_simple_folds) { 206 foreach my $fold (@{$simple_folded_closure{$folded}}) { 207 if ($fold < 256 && ! grep { $fold == $_ } @non_latin1_simple_folds) { 208 push @non_latin1_simple_folds, $fold; 209 } 210 } 211} 212 213sub Id_First { 214 my @alpha_invlist = prop_invlist("XPosixAlpha"); 215 my @ids = expand_invlist(\@alpha_invlist); 216 push @ids, ord "_"; 217 return sort { $a <=> $b } uniques @ids; 218} 219 220sub Non_Latin1_Folds { 221 my @return; 222 223 foreach my $folded (keys %folded_closure) { 224 push @return, $folded if grep { $_ > 255 } @{$folded_closure{$folded}}; 225 } 226 return @return; 227} 228 229sub Non_Latin1_Simple_Folds { # Latin1 code points that are folded to by 230 # non-Latin1 code points as single character 231 # folds 232 return @non_latin1_simple_folds; 233} 234 235sub Non_Final_Folds { 236 return @non_final_folds; 237} 238 239sub Punct_and_Symbols { 240 # Sadly, this is inconsistent: \pP and \pS for the ascii range; 241 # just \pP outside it. 242 243 my @punct_invlist = prop_invlist("Punct"); 244 my @return = expand_invlist(\@punct_invlist); 245 246 my @symbols_invlist = prop_invlist("Symbol"); 247 my @symbols = expand_invlist(\@symbols_invlist); 248 foreach my $cp (@symbols) { 249 last if $cp > 0x7f; 250 push @return, $cp; 251 } 252 253 return sort { $a <=> $b } uniques @return; 254} 255 256my @bits; # Each element is a bit map for a single code point 257 258# For each bit type, calculate which code points should have it set 259foreach my $bit_name (sort keys %bit_names) { 260 my @code_points; 261 262 my $property = $bit_name; # The bit name is the same as its property, 263 # unless overridden 264 $property = $bit_names{$bit_name} if $bit_names{$bit_name}; 265 266 if (! ref $property) { 267 my @invlist = prop_invlist($property, '_perl_core_internal_ok'); 268 @code_points = expand_invlist(\@invlist); 269 } 270 elsif (ref $property eq 'CODE') { 271 @code_points = &$property; 272 } 273 elsif (ref $property eq 'ARRAY') { 274 @code_points = @{$property}; 275 } 276 277 foreach my $cp (@code_points) { 278 last if $cp > 0xFF; 279 $bits[$cp] .= '|' if $bits[$cp]; 280 $bits[$cp] .= "(1U<<CC_${bit_name}_)"; 281 } 282} 283 284my $out_fh = open_new('l1_char_class_tab.h', '>', 285 {style => '*', by => $0, 286 from => "Unicode::UCD"}); 287 288print $out_fh <<END; 289/* For code points whose position is not the same as Unicode, both are shown 290 * in the comment*/ 291END 292 293# Output the table using fairly short names for each char. 294my $is_for_ascii = 1; # get_supported_code_pages() returns the ASCII 295 # character set first 296foreach my $charset (get_supported_code_pages()) { 297 my @a2n = @{get_a2n($charset)}; 298 my @out; 299 my @utf_to_i8; 300 301 if ($is_for_ascii) { 302 $is_for_ascii = 0; 303 } 304 else { # EBCDIC. Calculate mapping from UTF-EBCDIC bytes to I8 305 my $i8_to_utf_ref = get_I8_2_utf($charset); 306 for my $i (0..255) { 307 $utf_to_i8[$i8_to_utf_ref->[$i]] = $i; 308 } 309 } 310 311 print $out_fh "\n" . get_conditional_compile_line_start($charset); 312 for my $ord (0..255) { 313 my $name; 314 my $char = chr $ord; 315 if ($char =~ /\p{PosixGraph}/) { 316 my $quote = $char eq "'" ? '"' : "'"; 317 $name = $quote . chr($ord) . $quote; 318 } 319 elsif ($char =~ /\p{XPosixGraph}/) { 320 use charnames(); 321 $name = charnames::viacode($ord); 322 $name =~ s/LATIN CAPITAL LETTER // 323 or $name =~ s/LATIN SMALL LETTER (.*)/\L$1/ 324 or $name =~ s/ SIGN\b// 325 or $name =~ s/EXCLAMATION MARK/'!'/ 326 or $name =~ s/QUESTION MARK/'?'/ 327 or $name =~ s/QUOTATION MARK/QUOTE/ 328 or $name =~ s/ INDICATOR//; 329 $name =~ s/\bWITH\b/\L$&/; 330 $name =~ s/\bONE\b/1/; 331 $name =~ s/\b(TWO|HALF)\b/2/; 332 $name =~ s/\bTHREE\b/3/; 333 $name =~ s/\b QUARTER S? \b/4/x; 334 $name =~ s/VULGAR FRACTION (.) (.)/$1\/$2/; 335 $name =~ s/\bTILDE\b/'~'/i 336 or $name =~ s/\bCIRCUMFLEX\b/'^'/i 337 or $name =~ s/\bSTROKE\b/'\/'/i 338 or $name =~ s/ ABOVE\b//i; 339 } 340 else { 341 use Unicode::UCD qw(prop_invmap); 342 my ($list_ref, $map_ref, $format) 343 = prop_invmap("_Perl_Name_Alias", '_perl_core_internal_ok'); 344 if ($format !~ /^s/) { 345 use Carp; 346 carp "Unexpected format '$format' for '_Perl_Name_Alias"; 347 last; 348 } 349 my $which = Unicode::UCD::search_invlist($list_ref, $ord); 350 if (! defined $which) { 351 use Carp; 352 carp "No name found for code pont $ord"; 353 } 354 else { 355 my $map = $map_ref->[$which]; 356 if (! ref $map) { 357 $name = $map; 358 } 359 else { 360 # Just pick the first abbreviation if more than one 361 my @names = grep { $_ =~ /abbreviation/ } @$map; 362 $name = $names[0]; 363 } 364 $name =~ s/:.*//; 365 } 366 } 367 368 my $index = $a2n[$ord]; 369 my $i8; 370 $i8 = $utf_to_i8[$index] if @utf_to_i8; 371 372 $out[$index] = "/* "; 373 $out[$index] .= sprintf "0x%02X ", $index if $ord != $index; 374 $out[$index] .= sprintf "U+%02X ", $ord; 375 $out[$index] .= sprintf "I8=%02X ", $i8 if defined $i8 && $i8 != $ord; 376 $out[$index] .= "$name */ "; 377 $out[$index] .= $bits[$ord]; 378 379 $out[$index] .= ",\n"; 380 } 381 $out[-1] =~ s/,$//; # No trailing comma in the final entry 382 383 print $out_fh join "", @out; 384 print $out_fh "\n" . get_conditional_compile_line_end(); 385} 386 387read_only_bottom_close_and_rename($out_fh) 388