1use v5.16.0; 2use strict; 3use warnings; 4no warnings 'experimental::regex_sets'; 5require './regen/regen_lib.pl'; 6require './regen/charset_translations.pl'; 7use Unicode::UCD qw(prop_invlist prop_invmap search_invlist); 8use charnames qw(:loose); 9binmode(STDERR, ":utf8"); 10 11# Set this to 1 temporarily to get on stderr the complete list of paired 12# string delimiters this generates. This list is suitable for plugging into a 13# pod. 14my $output_lists = 0; 15 16# Set this to 1 temporarily to get on stderr the complete list of punctuation 17# marks and symbols that look to be directional but we didn't include for some 18# reason. 19my $output_omitteds = 0; 20 21my $out_fh = open_new('unicode_constants.h', '>', 22 {style => '*', by => $0, 23 from => "Unicode data"}); 24 25print $out_fh <<END; 26 27#ifndef PERL_UNICODE_CONSTANTS_H_ /* Guard against nested #includes */ 28#define PERL_UNICODE_CONSTANTS_H_ 1 29 30/* This file contains #defines for the version of Unicode being used and 31 * various Unicode code points. The values the code point macros expand to 32 * are the native Unicode code point, or all or portions of the UTF-8 encoding 33 * for the code point. In the former case, the macro name has the suffix 34 * "_NATIVE"; otherwise, the suffix "_UTF8". 35 * 36 * The macros that have the suffix "_UTF8" may have further suffixes, as 37 * follows: 38 * "_FIRST_BYTE" if the value is just the first byte of the UTF-8 39 * representation; the value will be a numeric constant. 40 * "_TAIL" if instead it represents all but the first byte. This, and 41 * with no additional suffix are both string constants */ 42 43/* 44=for apidoc_section \$unicode 45 46=for apidoc AmnU|const char *|BOM_UTF8 47 48This is a macro that evaluates to a string constant of the UTF-8 bytes that 49define the Unicode BYTE ORDER MARK (U+FEFF) for the platform that perl 50is compiled on. This allows code to use a mnemonic for this character that 51works on both ASCII and EBCDIC platforms. 52S<C<sizeof(BOM_UTF8) - 1>> can be used to get its length in 53bytes. 54 55=for apidoc AmnU|const char *|REPLACEMENT_CHARACTER_UTF8 56 57This is a macro that evaluates to a string constant of the UTF-8 bytes that 58define the Unicode REPLACEMENT CHARACTER (U+FFFD) for the platform that perl 59is compiled on. This allows code to use a mnemonic for this character that 60works on both ASCII and EBCDIC platforms. 61S<C<sizeof(REPLACEMENT_CHARACTER_UTF8) - 1>> can be used to get its length in 62bytes. 63 64=cut 65*/ 66 67END 68 69sub backslash_x_form($$;$) { 70 # Output the code point represented by the byte string $bytes as a 71 # sequence of \x{} constants. $bytes should be the UTF-8 for the code 72 # point if the final parameter is absent or empty. Otherwise it should be 73 # the Latin1 code point itself. 74 # 75 # The output is translated into the character set '$charset'. 76 77 my ($bytes, $charset, $non_utf8) = @_; 78 if ($non_utf8) { 79 die "Must be utf8 if above 255" if $bytes > 255; 80 my $a2n = get_a2n($charset); 81 return sprintf "\\x%02X", $a2n->[$bytes]; 82 } 83 else { 84 return join "", map { sprintf "\\x%02X", ord $_ } 85 split //, cp_2_utfbytes($bytes, $charset); 86 } 87} 88 89# The most complicated thing this program does is generate paired string 90# delimiters from the Unicode database. Some of these come from the 91# Unicode Bidirectional (bidi) algorithm. 92 93# These all visually look like left and right delimiters 94my @bidi_strong_lefts = ( 'LESS-THAN', 95 'ELEMENT OF', 96 'PRECEDE', 97 'PRECEDES', 98 'SMALLER THAN', 99 'SUBSET', 100 ); 101my @bidi_strong_rights = ( 'GREATER-THAN', 102 'CONTAINS', 103 'SUCCEED', 104 'SUCCEEDS', 105 'LARGER THAN', 106 'SUPERSET', 107 ); 108 109# Create an array of hashes for these, so as to translate between them, and 110# avoid recompiling patterns in the loop. 111my @bidi_strong_directionals; 112for (my $i = 0; $i < @bidi_strong_lefts; $i++) { 113 push @bidi_strong_directionals, 114 { 115 LHS => $bidi_strong_lefts[$i], 116 RHS => $bidi_strong_rights[$i], 117 L_pattern => qr/\b$bidi_strong_lefts[$i]\b/, 118 R_pattern => qr/\b$bidi_strong_rights[$i]\b/, 119 }; 120} 121 122my @ok_bidi_symbols = ( 123 'TACK', 124 'TURNSTILE', 125 ); 126my $ok_bidi_symbols_re = join '|', @ok_bidi_symbols; 127$ok_bidi_symbols_re = qr/\b($ok_bidi_symbols_re)\b/n; 128 129 130# Many characters have mirrors that Unicode hasn't included in their Bidi 131# algorithm. This program uses their names to find them. The next few 132# definitions are towards that end. 133 134# Most horizontal directionality is based on LEFT vs RIGHT. But it's 135# complicated: 136# 1) a barb on one or the other side of a harpoon doesn't indicate 137# directionality of the character. (A HARPOON is the word Unicode uses 138# to indicate an arrow with a one-sided tip.) 139my $no_barb_re = qr/(*nlb:BARB )/; 140 141# 2) RIGHT-SHADED doesn't signify anything about direction of the character 142# itself. These are the suffixes Unicode uses to indicate this. /aa is 143# needed because the wildcard names feature currently requires it for names. 144my $shaded_re = qr/ [- ] (SHADED | SHADOWED) /naax; 145 146# 3a) there are a few anomalies caught here. 'LEFT LUGGAGE' would have been 147# better named UNCLAIMED, and doesn't indicate directionality. 148my $real_LEFT_re = qr/ \b $no_barb_re LEFT (*nla: $shaded_re) 149 (*nla: [ ] LUGGAGE \b) 150 /nx; 151# 3b) And in most cases,a RIGHT TRIANGLE also doesn't refer to 152# directionality, but indicates it contains a 90 degree angle. 153my $real_RIGHT_re = qr/ \b $no_barb_re RIGHT (*nla: $shaded_re) 154 (*nla: [ ] (TRI)? ANGLE \b) 155 /nx; 156# More items could be added to these as needed 157 158# 4) something that is pointing R goes on the left, so is different than 159# the character on the R. For example, a RIGHT BRACKET would be 160# different from a RIGHT-FACING bracket. These patterns capture the 161# typical ways that Unicode character names indicate the latter meaning 162# as a suffix to RIGHT or LEFT 163my $pointing_suffix_re = qr/ ( WARDS # e.g., RIGHTWARDS 164 | [ ] ARROW # A R arrow points to the R 165 | [ -] FACING 166 | [ -] POINTING 167 | [ ] PENCIL # Implies a direction of its 168 # point 169 ) \b /nx; 170# And correspondingly for a prefix for LEFT RIGHT 171my $pointing_prefix_re = qr/ \b ( # e.g. UP RIGHT implies a direction 172 UP ( [ ] AND)? 173 | DOWN ( [ ] AND)? 174 | CONVERGING 175 | POINTING [ ] (DIRECTLY)? 176 | TO [ ] THE 177 ) 178 [ ] 179 /nx; 180 181my @other_directionals = 182 { 183 LHS => 'LEFT', 184 RHS => 'RIGHT', 185 L_pattern => 186 # Something goes on the left if it contains LEFT and doesn't 187 # point left, or it contains RIGHT and does point right. 188 qr/ \b (*nlb: $pointing_prefix_re) $real_LEFT_re 189 (*nla: $pointing_suffix_re) 190 | \b (*plb: $pointing_prefix_re) $real_RIGHT_re \b 191 | \b $real_RIGHT_re (*pla: $pointing_suffix_re) 192 /nx, 193 R_pattern => 194 qr/ \b (*nlb: $pointing_prefix_re) $real_RIGHT_re 195 (*nla: $pointing_suffix_re) 196 | \b (*plb: $pointing_prefix_re) $real_LEFT_re \b 197 | \b $real_LEFT_re (*pla: $pointing_suffix_re) 198 /nx, 199 }; 200 201# Some horizontal directionality is based on EAST vs WEST. These words are 202# almost always used by Unicode to indicate the direction pointing to, without 203# the general consistency in phrasing in L/R above. There are a handful of 204# possible exceptions, with only WEST WIND ever at all possibly an issue 205push @other_directionals, 206 { 207 LHS => 'EAST', 208 RHS => 'WEST', 209 L_pattern => qr/ \b ( EAST (*nla: [ ] WIND) 210 | WEST (*pla: [ ] WIND)) \b /x, 211 R_pattern => qr/ \b ( WEST (*nla: [ ] WIND) 212 | EAST (*pla: [ ] WIND)) \b /x, 213 }; 214 215# The final way the Unicode signals mirroring is by using the words REVERSE or 216# REVERSED; 217my $reverse_re = qr/ \b REVERSE D? [- ] /x; 218 219# Create a mapping from each direction to its opposite one 220my %opposite_of; 221foreach my $directional (@bidi_strong_directionals, @other_directionals) { 222 $opposite_of{$directional->{LHS}} = $directional->{RHS}; 223 $opposite_of{$directional->{RHS}} = $directional->{LHS}; 224} 225 226# Join the two types of each direction as alternatives 227my $L_re = join "|", map { $_->{L_pattern} } @bidi_strong_directionals, 228 @other_directionals; 229my $R_re = join "|", map { $_->{R_pattern} } @bidi_strong_directionals, 230 @other_directionals; 231# And anything containing directionality will be either one of these two 232my $directional_re = join "|", $L_re, $R_re; 233 234# Now compile the strings that result from above 235$L_re = qr/$L_re/; 236$R_re = qr/$R_re/; 237$directional_re = qr/($directional_re)/; # Make sure to capture $1 238 239my @included_symbols = ( 240 0x2326, 0x232B, # ERASE 241 0x23E9 .. 0x23EA, # DOUBLE TRIANGLE 242 0x23ED .. 0x23EE, # DOUBLE TRIANGLE with BAR 243 0x269E .. 0x269F, # THREE LINES CONVERGING 244 0x1D102 .. 0x1D103, # MUSIC STAVES 245 0x1D106 .. 0x1D107, # MUSIC STAVES 246 0x1F57B, # TELEPHONE RECEIVER 247 0x1F57D, # TELEPHONE RECEIVER 248 0x1F508 .. 0x1F50A, # LOUD SPEAKER 249 0x1F568 .. 0x1F56A, # LOUD SPEAKER 250 0x1F5E6 .. 0x1F5E7, # THREE RAYS 251 ); 252my %included_symbols; 253$included_symbols{$_} = 1 for @included_symbols; 254 255sub format_pairs_line($;$) { 256 my ($from, $to) = @_; 257 258 # Format a line containing a character singleton or pair in preparation 259 # for output, suitable for pod. 260 261 my $lhs_name = charnames::viacode($from); 262 my $lhs_hex = sprintf "%04X", $from; 263 my $rhs_name; 264 my $rhs_hex; 265 my $name = $lhs_name; 266 267 my $hanging_indent = 26; 268 269 # Treat a trivial pair as a singleton 270 undef $to if defined $to && $to == $from; 271 272 if (defined $to) { 273 my $rhs_name = charnames::viacode($to); 274 $rhs_hex = sprintf "%04X", $to; 275 276 # Most of the names differ only in LEFT vs RIGHT; some in 277 # LESS-THAN vs GREATER-THAN. It takes less space, and is easier to 278 # understand if they are displayed combined. 279 if ($name =~ s/$directional_re/$opposite_of{$1}/gr eq $rhs_name) { 280 $name =~ s,$directional_re,$1/$opposite_of{$1},g; 281 } 282 else { # Otherwise, display them sequentially 283 $name .= ", " . $rhs_name; 284 } 285 } 286 287 # Handle double-width characters, based on the East Asian Width property. 288 # Add an extra space to non-wide ones so things stay vertically aligned. 289 my $extra = 0; 290 my $output_line = " " # Indent in case output being used for verbatim 291 # pod 292 . chr $from; 293 if (chr($from) =~ /[\p{EA=W}\p{EA=F}]/) { 294 $extra++; # The length() will be shorter than the displayed 295 # width 296 } 297 else { 298 $output_line .= " "; 299 } 300 if (defined $to) { 301 $output_line .= " " . chr $to; 302 if (chr($to) =~ /[\p{EA=W}\p{EA=F}]/) { 303 $extra++; 304 } 305 else { 306 $output_line .= " "; 307 } 308 } 309 else { 310 $output_line .= " "; 311 } 312 313 $output_line .= " U+$lhs_hex"; 314 $output_line .= ", U+$rhs_hex" if defined $to;; 315 my $cur_len = $extra + length $output_line; 316 $output_line .= " " x ($hanging_indent - $cur_len); 317 318 my $max_len = 74; # Pod formatter will indent 4 spaces 319 $cur_len = length $output_line; 320 321 if ($cur_len + length $name <= $max_len) { 322 $output_line .= $name; # It will fit 323 } 324 else { # It won't fit. Append a segment that is unbreakable until would 325 # exceed the available width; then start on a new line 326 # Doesn't handle the case where the whole segment doesn't fit; 327 # this just doesn't come up with the input data. 328 while ($name =~ / ( .+? ) \b{lb} /xg) { 329 my $segment = $1; 330 my $added_length = length $segment; 331 if ($cur_len + $added_length > $max_len) { 332 $output_line =~ s/ +$//; 333 $output_line .= "\n" . " " x $hanging_indent; 334 $cur_len = $hanging_indent; 335 } 336 337 $output_line .= $segment; 338 $cur_len += $added_length; 339 } 340 } 341 342 return $output_line . "\n"; 343} 344 345my $version = Unicode::UCD::UnicodeVersion(); 346my ($major, $dot, $dotdot) = $version =~ / (.*?) \. (.*?) (?: \. (.*) )? $ /x; 347$dotdot = 0 unless defined $dotdot; 348 349print $out_fh <<END; 350#define UNICODE_MAJOR_VERSION $major 351#define UNICODE_DOT_VERSION $dot 352#define UNICODE_DOT_DOT_VERSION $dotdot 353 354END 355 356# Gather the characters in Unicode that have left/right symmetry suitable for 357# paired string delimiters 358my %paireds; 359 360# So don't have to grep an array to determine if have already dealt with the 361# characters that are the keys 362my %inverted_paireds; 363 364# This property is the universe of all characters in Unicode which 365# are of some import to the Bidirectional Algorithm, and for which there is 366# another Unicode character that is a mirror of it. 367my ($bmg_invlist, $bmg_invmap, $format, $bmg_default) = 368 prop_invmap("Bidi_Mirroring_Glyph"); 369 370# Keep track of the characters we don't use, and why not. 371my %discards; 372my $non_directional = 'No perceived horizontal direction'; 373my $not_considered_directional_because = "Not considered directional because"; 374my $trailing_up_down = 'Vertical direction after all L/R direction'; 375my $unpaired = "Didn't find a mirror"; 376my $illegal = "Mirror illegal"; 377my $no_encoded_mate = "Mirrored, but Unicode has no encoded mirror"; 378my $bidirectional = "Bidirectional"; 379 380my %unused_bidi_pairs; 381my %inverted_unused_bidi_pairs; 382my %unused_pairs; # 383my %inverted_unused_pairs; 384 385# Could be more explicit about allowing, e.g. ARROWS, ARROWHEAD, but this 386# suffices 387my $arrow_like_re = qr/\b(ARROW|HARPOON)/; 388 389# Go through the Unicode Punctuation and Symbol characters looking for ones 390# that have mirrors, suitable for being string delimiters. Some of these are 391# easily derivable from Unicode properties dealing with the bidirectional 392# algorithm. But the purpose of that algorithm isn't the same as ours, and 393# excludes many suitable ones. In particular, no arrows are included in it. 394# To find suitable ones, we also look at character names to see if there is a 395# character with that name, but the horizontal direction reversed. That will 396# almost certainly be a mirror. 397foreach my $list (qw(Punctuation Symbol)) { 398 my @invlist = prop_invlist($list); 399 die "Empty list $list" unless @invlist; 400 401 my $is_Symbol = $list eq 'Symbol'; 402 403 # Convert from an inversion list to an array containing everything that 404 # matches. (This uses the recipe given in Unicode::UCD.) 405 my @full_list; 406 for (my $i = 0; $i < @invlist; $i += 2) { 407 my $upper = ($i + 1) < @invlist 408 ? $invlist[$i+1] - 1 # In range 409 : $Unicode::UCD::MAX_CP; # To infinity. 410 for my $j ($invlist[$i] .. $upper) { 411 push @full_list, $j; 412 } 413 } 414 415 CODE_POINT: 416 foreach my $code_point (@full_list) { 417 #print STDERR __FILE__, ": ", __LINE__, ": ", sprintf("%04x ", $code_point), charnames::viacode($code_point), "\n"; 418 my $chr = chr $code_point; 419 420 # Don't reexamine something we've already determined. This happens 421 # when its mate was earlier processed and found this one. 422 foreach my $hash_ref (\%paireds, \%inverted_paireds, 423 \%unused_bidi_pairs, \%inverted_unused_bidi_pairs, 424 \%unused_pairs, \%inverted_unused_pairs) 425 { 426 next CODE_POINT if exists $hash_ref->{$code_point} 427 } 428 429 my $name = charnames::viacode($code_point); 430 my $original_had_REVERSE; 431 my $mirror; 432 my $mirror_code_point; 433 434 # If Unicode considers this to have a mirror, we don't have to go 435 # looking 436 if ($chr =~ /\p{Bidi_Mirrored}/) { 437 my $i = search_invlist($bmg_invlist, $code_point); 438 $mirror_code_point = $bmg_invmap->[$i]; 439 if ( $mirror_code_point eq $bmg_default) { 440 $discards{$code_point} = { reason => $no_encoded_mate, 441 mirror => undef 442 }; 443 next; 444 } 445 446 # Certain Unicode properties classify some mirrored characters as 447 # opening (left) vs closing (right). Skip the closing ones this 448 # iteration; they will be handled later when the opening mate 449 # comes along. 450 if ($chr =~ /(?[ \p{BPT=Close} 451 | \p{Gc=Close_Punctuation} 452 ])/) 453 { 454 next; # Get this when its opening mirror comes up. 455 } 456 elsif ($chr =~ /(?[ \p{BPT=Open} 457 | \p{Gc=Open_Punctuation} 458 | \p{Gc=Initial_Punctuation} 459 | \p{Gc=Final_Punctuation} 460 ])/) 461 { 462 # Here, it's a left delimiter. (The ones in Final Punctuation 463 # can be opening ones in some languages.) 464 $paireds{$code_point} = $mirror_code_point; 465 $inverted_paireds{$mirror_code_point} = $code_point; 466 467 # If the delimiter can be used on either side, add its 468 # complement 469 if ($chr =~ /(?[ \p{Gc=Initial_Punctuation} 470 | \p{Gc=Final_Punctuation} 471 ])/) 472 { 473 $paireds{$mirror_code_point} = $code_point; 474 $inverted_paireds{$code_point} = $mirror_code_point; 475 } 476 477 next; 478 } 479 480 # Unicode doesn't consider '< >' to be brackets, but Perl does. There are 481 # lots of variants of these in Unicode; easiest to accept all of 482 # them that aren't bidirectional (which would be visually 483 # confusing). 484 for (my $i = 0; $i < @bidi_strong_directionals; $i++) { 485 my $hash_ref = $bidi_strong_directionals[$i]; 486 487 next if $name !~ $hash_ref->{L_pattern}; 488 489 if ($name =~ $hash_ref->{R_pattern}) { 490 $discards{$code_point} = { reason => $bidirectional, 491 mirror => $mirror_code_point 492 }; 493 next CODE_POINT; 494 } 495 496 $paireds{$code_point} = $mirror_code_point; 497 $inverted_paireds{$mirror_code_point} = $code_point; 498 $original_had_REVERSE = $name =~ /$reverse_re/; 499 next CODE_POINT; 500 } 501 502 # The other paired symbols are more iffy as being desirable paired 503 # delimiters; we let the code below decide what to do with them. 504 $mirror = charnames::viacode($mirror_code_point); 505 } 506 else { # Here is not involved with the bidirectional algorithm. 507 508 # Get the mirror (if any) from reversing the directions in the 509 # name, and looking that up 510 $mirror = $name; 511 $mirror =~ s/$directional_re/$opposite_of{$1}/g; 512 $original_had_REVERSE = $mirror =~ s/$reverse_re//g; 513 $mirror_code_point = charnames::vianame($mirror); 514 } 515 516 # Letter-like symbols don't really stand on their own and don't look 517 # like traditional delimiters. 518 if ($chr =~ /\p{Sk}/) { 519 $discards{$code_point} 520 = { reason => "Letter-like symbols are not eligible", 521 mirror => $mirror_code_point 522 }; 523 next CODE_POINT; 524 } 525 526 # Certain names are always treated as non directional. 527 if ($name =~ m{ \b ( WITH [ ] (?:LEFT|RIGHT) [ ] HALF [ ] BLACK 528 | BLOCK 529 | BOX [ ] DRAWINGS 530 | CIRCLE [ ] WITH 531 | EXTENSION 532 | (?: UPPER | LOWER ) [ ] HOOK 533 534 # The VERTICAL marks these as not actually 535 # L/R mirrored. 536 | PRESENTATION [ ] FORM [ ] FOR [ ] VERTICAL 537 | QUADRANT 538 | SHADE 539 | SQUARE [ ] WITH 540 ) \b }x) 541 { 542 $discards{$code_point} 543 = { reason => "$not_considered_directional_because name" 544 . " contains '$1'", 545 mirror => $mirror_code_point 546 }; 547 next CODE_POINT; 548 } 549 550 # If these are equal, it means the original had no horizontal 551 # directioning 552 if ($name eq $mirror) { 553 $discards{$code_point} = { reason => $non_directional, 554 mirror => undef 555 }; 556 next CODE_POINT; 557 } 558 559 # If the name has both left and right directions, it is bidirectional, 560 # so not suited to be a paired delimiter. 561 if ($name =~ $L_re && $name =~ $R_re) { 562 $discards{$code_point} = { reason => $bidirectional, 563 mirror => $mirror_code_point 564 }; 565 next CODE_POINT; 566 } 567 568 # If no mate was found, it could be that it's like the case of 569 # SPEAKER vs RIGHT SPEAKER (which probably means the mirror was added 570 # in a later version than the original. Check by removing all 571 # directionality and trying to see if there is a character with that 572 # name. 573 if (! defined $mirror_code_point) { 574 $mirror =~ s/$directional_re //; 575 $mirror_code_point = charnames::vianame($mirror); 576 if (! defined $mirror_code_point) { 577 578 # Still no mate. 579 $discards{$code_point} = { reason => $unpaired, 580 mirror => undef 581 }; 582 next; 583 } 584 } 585 586 if ($code_point == $mirror_code_point) { 587 $discards{$code_point} = 588 { reason => "$unpaired - Single character, multiple" 589 . " names; Unicode name correction", 590 mirror => $mirror_code_point 591 }; 592 next; 593 } 594 595 if ($is_Symbol) { 596 597 # Skip if the direction is followed by a vertical motion 598 # (which defeats the left-right directionality). 599 if ( $name =~ / ^ .* $no_barb_re 600 \b (UP|DOWN|NORTH|SOUTH) /gx 601 and not $name =~ /$directional_re/g) 602 { 603 $discards{$code_point} = { reason => $trailing_up_down, 604 mirror => $mirror_code_point 605 }; 606 next; 607 } 608 } 609 610 # There are a few characters like REVERSED SEMICOLON that are mirrors, 611 # but have always commonly been used unmirrored. There is also the 612 # PILCROW SIGN and its mirror which might be considered to be 613 # legitimate mirrors, but maybe not. Additionally the current 614 # algorithm for finding the mirror depends on each member of a pair 615 # being respresented by the same number of bytes as its mate. By 616 # skipping these, we solve both problems 617 if ($code_point < 256 != $mirror_code_point < 256) { 618 $discards{$code_point} = { reason => $illegal, 619 mirror => $mirror_code_point 620 }; 621 next; 622 } 623 624 # And '/' and '\' are mirrors that we don't accept 625 if ( $name =~ /SOLIDUS/ 626 && $name =~ s/REVERSE SOLIDUS/SOLIDUS/r 627 eq $mirror =~ s/REVERSE SOLIDUS/SOLIDUS/r) 628 { 629 $discards{$code_point} = { reason => $illegal, 630 mirror => $mirror_code_point 631 }; 632 next; 633 } 634 635 # We enter the pair with the original code point on the left; if it 636 # should instead be on the R, swap. Most Symbols that contain the 637 # word REVERSE go on the rhs, except those whose names explicitly 638 # indicate lhs. FINAL in the name indicates stays on the rhs. 639 if ($name =~ $R_re || ( $original_had_REVERSE 640 && $is_Symbol 641 && $name !~ $L_re 642 && $name !~ /\bFINAL\b/ 643 )) 644 { 645 my $temp = $code_point; 646 $code_point = $mirror_code_point; 647 $mirror_code_point = $temp; 648 } 649 650 # Only a few symbols are currently used, determined by inspection, but 651 # all the (few) remaining paired punctuations. 652 if ( ! $is_Symbol 653 || defined $included_symbols{$code_point} 654 || ( $chr =~ /\p{BidiMirrored}/ 655 && ( $name =~ $ok_bidi_symbols_re 656 || $mirror =~ $ok_bidi_symbols_re)) 657 || $name =~ /\bINDEX\b/ # index FINGER pointing 658 659 # Also accept most arrows that don't have N/S in their 660 # names. (Those are almost all currently pointing at an 661 # angle, like SW anyway.) 662 || ( $name !~ /\bNORTH|SOUTH\b/ 663 && $name =~ $arrow_like_re 664 665 # Arguably bi-directional 666 && $name !~ /U-SHAPED/) 667 ) { 668 $paireds{$code_point} = $mirror_code_point; 669 $inverted_paireds{$mirror_code_point} = $code_point; 670 671 # Again, accept either one at either end for these ambiguous 672 # punctuation delimiters 673 if ($chr =~ /[\p{PI}\p{PF}]/x) { 674 $paireds{$mirror_code_point} = $code_point; 675 $inverted_paireds{$code_point} = $mirror_code_point; 676 } 677 } 678 elsif ( $chr =~ /\p{BidiMirrored}/ 679 && ! exists $inverted_unused_bidi_pairs{$code_point} 680 && ! defined $inverted_unused_bidi_pairs{$code_point}) 681 { 682 $unused_bidi_pairs{$code_point} = $mirror_code_point; 683 $inverted_unused_bidi_pairs{$mirror_code_point} = $code_point; 684 } 685 elsif ( ! exists $inverted_unused_pairs{$code_point} 686 && ! defined $inverted_unused_pairs{$code_point}) 687 { # A pair that we don't currently accept 688 $unused_pairs{$code_point} = $mirror_code_point; 689 $inverted_unused_pairs{$mirror_code_point} = $code_point; 690 } 691 } # End of loop through code points 692} # End of loop through properties 693 694# The rest of the data are at __DATA__ in this file. 695 696my @data = <DATA>; 697 698foreach my $charset (get_supported_code_pages()) { 699 print $out_fh "\n" . get_conditional_compile_line_start($charset); 700 701 my @a2n = @{get_a2n($charset)}; 702 703 for ( @data ) { 704 chomp; 705 706 # Convert any '#' comments to /* ... */; empty lines and comments are 707 # output as blank lines 708 if ($_ =~ m/ ^ \s* (?: \# ( .* ) )? $ /x) { 709 my $comment_body = $1 // ""; 710 if ($comment_body ne "") { 711 print $out_fh "/* $comment_body */\n"; 712 } 713 else { 714 print $out_fh "\n"; 715 } 716 next; 717 } 718 719 unless ($_ =~ m/ ^ ( [^\ ]* ) # Name or code point token 720 (?: [\ ]+ ( [^ ]* ) )? # optional flag 721 (?: [\ ]+ ( .* ) )? # name if unnamed; flag is required 722 /x) 723 { 724 die "Unexpected syntax at line $.: $_\n"; 725 } 726 727 my $name_or_cp = $1; 728 my $flag = $2; 729 my $desired_name = $3; 730 731 my $name; 732 my $cp; 733 my $U_cp; # code point in Unicode (not-native) terms 734 735 if ($name_or_cp =~ /^U\+(.*)/) { 736 $U_cp = hex $1; 737 $name = charnames::viacode($name_or_cp); 738 if (! defined $name) { 739 next if $flag =~ /skip_if_undef/; 740 die "Unknown code point '$name_or_cp' at line $.: $_\n" unless $desired_name; 741 $name = ""; 742 } 743 } 744 else { 745 $name = $name_or_cp; 746 die "Unknown name '$name' at line $.: $_\n" unless defined $name; 747 $U_cp = charnames::vianame($name =~ s/_/ /gr); 748 } 749 750 $cp = ($U_cp < 256) 751 ? $a2n[$U_cp] 752 : $U_cp; 753 754 $name = $desired_name if $name eq "" && $desired_name; 755 $name =~ s/[- ]/_/g; # The macro name can have no blanks nor dashes 756 757 my $str; 758 my $suffix; 759 if (defined $flag && $flag eq 'native') { 760 die "Are you sure you want to run this on an above-Latin1 code point?" if $cp > 0xff; 761 $suffix = '_NATIVE'; 762 $str = sprintf "0x%02X", $cp; # Is a numeric constant 763 } 764 else { 765 $str = backslash_x_form($U_cp, $charset); 766 767 $suffix = '_UTF8'; 768 if (! defined $flag || $flag =~ /^ string (_skip_if_undef)? $/x) { 769 $str = "\"$str\""; # Will be a string constant 770 } elsif ($flag eq 'tail') { 771 $str =~ s/\\x..//; # Remove the first byte 772 $suffix .= '_TAIL'; 773 $str = "\"$str\""; # Will be a string constant 774 } 775 elsif ($flag eq 'first') { 776 $str =~ s/ \\x ( .. ) .* /$1/x; # Get the two nibbles of the 1st byte 777 $suffix .= '_FIRST_BYTE'; 778 $str = "0x$str"; # Is a numeric constant 779 } 780 else { 781 die "Unknown flag at line $.: $_\n"; 782 } 783 } 784 printf $out_fh "# define %s%s %s /* U+%04X */\n", $name, $suffix, $str, $U_cp; 785 } 786 787 # Now output the strings of opening/closing delimiters. The Unicode 788 # values were earlier entered into %paireds 789 my $utf8_opening = ""; 790 my $utf8_closing = ""; 791 my $non_utf8_opening = ""; 792 my $non_utf8_closing = ""; 793 my $deprecated_if_not_mirrored = ""; 794 my $non_utf8_deprecated_if_not_mirrored = ""; 795 796 for my $from (sort { $a <=> $b } keys %paireds) { 797 my $to = $paireds{$from}; 798 my $utf8_from_backslashed = backslash_x_form($from, $charset); 799 my $utf8_to_backslashed = backslash_x_form($to, $charset); 800 my $non_utf8_from_backslashed; 801 my $non_utf8_to_backslashed; 802 803 $utf8_opening .= $utf8_from_backslashed; 804 $utf8_closing .= $utf8_to_backslashed; 805 806 if ($from < 256) { 807 $non_utf8_from_backslashed = 808 backslash_x_form($from, $charset, 'not_utf8'); 809 $non_utf8_to_backslashed = 810 backslash_x_form($to, $charset, 'not_utf8'); 811 812 $non_utf8_opening .= $non_utf8_from_backslashed; 813 $non_utf8_closing .= $non_utf8_to_backslashed; 814 } 815 816 # Only the ASCII range paired delimiters have traditionally been 817 # accepted. Until the feature is considered standard, the non-ASCII 818 # opening ones must be deprecated when the feature isn't in effect, so 819 # as to warn about behavior that is planned to change. 820 if ($from > 127) { 821 $deprecated_if_not_mirrored .= $utf8_from_backslashed; 822 $non_utf8_deprecated_if_not_mirrored .= 823 $non_utf8_from_backslashed if $from < 256; 824 825 # We deprecate using any of these strongly directional characters 826 # at either end of the string, in part so we could allow them to 827 # be reversed. 828 $deprecated_if_not_mirrored .= $utf8_to_backslashed 829 if index ($deprecated_if_not_mirrored, 830 $utf8_to_backslashed) < 0; 831 } 832 833 # The implementing code in toke.c assumes that the byte length of each 834 # opening delimiter is the same as its mirrored closing one. This 835 # makes sure of that by checking upon each iteration of the loop. 836 if (length $utf8_opening != length $utf8_closing) { 837 die "Byte length of representation of '" 838 . charnames::viacode($from) 839 . " differs from its mapping '" 840 . charnames::viacode($to) 841 . "'"; 842 } 843 844 print STDERR format_pairs_line($from, $to) if $output_lists; 845 } 846 $output_lists = 0; # Only output in first iteration 847 848 print $out_fh <<~"EOT"; 849 850 # ifdef PERL_IN_TOKE_C 851 /* Paired characters for quote-like operators, in UTF-8 */ 852 # define EXTRA_OPENING_UTF8_BRACKETS "$utf8_opening" 853 # define EXTRA_CLOSING_UTF8_BRACKETS "$utf8_closing" 854 855 /* And not in UTF-8 */ 856 # define EXTRA_OPENING_NON_UTF8_BRACKETS "$non_utf8_opening" 857 # define EXTRA_CLOSING_NON_UTF8_BRACKETS "$non_utf8_closing" 858 859 /* And what's deprecated */ 860 # define DEPRECATED_OPENING_UTF8_BRACKETS "$deprecated_if_not_mirrored" 861 # define DEPRECATED_OPENING_NON_UTF8_BRACKETS "$non_utf8_deprecated_if_not_mirrored" 862 # endif 863 EOT 864 865 my $max_PRINT_A = 0; 866 for my $i (0x20 .. 0x7E) { 867 $max_PRINT_A = $a2n[$i] if $a2n[$i] > $max_PRINT_A; 868 } 869 $max_PRINT_A = sprintf "0x%02X", $max_PRINT_A; 870 print $out_fh <<"EOT"; 871 872# ifdef PERL_IN_REGCOMP_ANY 873# define MAX_PRINT_A $max_PRINT_A /* The max code point that isPRINT_A */ 874# endif 875EOT 876 877 print $out_fh get_conditional_compile_line_end(); 878 879} 880 881if ($output_omitteds) { 882 # We haven't bothered to delete things that later became used. 883 foreach my $which (\%paireds, 884 \%unused_bidi_pairs, 885 \%unused_pairs) 886 { 887 foreach my $lhs (keys $which->%*) { 888 delete $discards{$lhs}; 889 delete $discards{$which->{$lhs}}; 890 } 891 } 892 893 print STDERR "\nMirrored says Unicode, but not currently used as paired string delimiters\n"; 894 foreach my $from (sort { $a <=> $b } keys %unused_bidi_pairs) { 895 print STDERR format_pairs_line($from, $unused_bidi_pairs{$from}); 896 } 897 898 print STDERR "\nMirror found by name, but not currently used as paired string delimiters\n"; 899 foreach my $from (sort { $a <=> $b } keys %unused_pairs) { 900 print STDERR format_pairs_line($from, $unused_pairs{$from}); 901 } 902 903 # Invert %discards so that all the code points for a given reason are 904 # keyed by that reason. 905 my %inverted_discards; 906 foreach my $code_point (sort { $a <=> $b } keys %discards) { 907 my $type = $discards{$code_point}{reason}; 908 push $inverted_discards{$type}->@*, [ $code_point, 909 $discards{$code_point}{mirror} 910 ]; 911 } 912 913 # Then output each list 914 foreach my $type (sort keys %inverted_discards) { 915 print STDERR "\n$type\n" if $type ne ""; 916 foreach my $ref ($inverted_discards{$type}->@*) { 917 print STDERR format_pairs_line($ref->[0], $ref->[1]); 918 } 919 } 920} 921 922my $count = 0; 923my @other_invlist = prop_invlist("Other"); 924for (my $i = 0; $i < @other_invlist; $i += 2) { 925 $count += ((defined $other_invlist[$i+1]) 926 ? $other_invlist[$i+1] 927 : 0x110000) 928 - $other_invlist[$i]; 929} 930$count = 0x110000 - $count; 931print $out_fh <<~"EOT"; 932 933 /* The number of code points not matching \\pC */ 934 #ifdef PERL_IN_REGCOMP_ANY 935 # define NON_OTHER_COUNT $count 936 #endif 937 EOT 938 939# If this release has both the CWCM and CWCF properties, find the highest code 940# point which changes under any case change. We can use this to short-circuit 941# code 942my @cwcm = prop_invlist('CWCM'); 943if (@cwcm) { 944 my @cwcf = prop_invlist('CWCF'); 945 if (@cwcf) { 946 my $max = ($cwcm[-1] < $cwcf[-1]) 947 ? $cwcf[-1] 948 : $cwcm[-1]; 949 $max = sprintf "0x%X", $max - 1; 950 print $out_fh <<~"EOS"; 951 952 /* The highest code point that has any type of case change */ 953 #ifdef PERL_IN_UTF8_C 954 # define HIGHEST_CASE_CHANGING_CP $max 955 #endif 956 EOS 957 } 958} 959 960print $out_fh "\n#endif /* PERL_UNICODE_CONSTANTS_H_ */\n"; 961 962read_only_bottom_close_and_rename($out_fh); 963 964# DATA FORMAT 965# 966# Note that any apidoc comments you want in the file need to be added to one 967# of the prints above 968# 969# A blank line is output as-is. 970# Comments (lines whose first non-blank is a '#') are converted to C-style, 971# though empty comments are converted to blank lines. Otherwise, each line 972# represents one #define, and begins with either a Unicode character name with 973# the blanks and dashes in it squeezed out or replaced by underscores; or it 974# may be a hexadecimal Unicode code point of the form U+xxxx. In the latter 975# case, the name will be looked-up to use as the name of the macro. In either 976# case, the macro name will have suffixes as listed above, and all blanks and 977# dashes will be replaced by underscores. 978# 979# Each line may optionally have one of the following flags on it, separated by 980# white space from the initial token. 981# string indicates that the output is to be of the string form 982# described in the comments above that are placed in the file. 983# string_skip_ifundef is the same as 'string', but instead of dying if the 984# code point doesn't exist, the line is just skipped: no output is 985# generated for it 986# first indicates that the output is to be of the FIRST_BYTE form. 987# tail indicates that the output is of the _TAIL form. 988# native indicates that the output is the code point, converted to the 989# platform's native character set if applicable 990# 991# If the code point has no official name, the desired name may be appended 992# after the flag, which will be ignored if there is an official name. 993# 994# This program is used to make it convenient to create compile time constants 995# of UTF-8, and to generate proper EBCDIC as well as ASCII without manually 996# having to figure things out. 997 998__DATA__ 999U+017F string 1000 1001U+0300 string 1002U+0307 string 1003 1004U+1E9E string_skip_if_undef 1005 1006U+FB05 string 1007U+FB06 string 1008U+0130 string 1009U+0131 string 1010 1011U+2010 string 1012BOM first 1013BOM tail 1014 1015BOM string 1016 1017U+FFFD string 1018 1019U+10FFFF string MAX_UNICODE 1020 1021NBSP native 1022NBSP string 1023 1024DEL native 1025CR native 1026LF native 1027VT native 1028ESC native 1029U+00DF native 1030U+00DF string 1031U+00E5 native 1032U+00C5 native 1033U+00FF native 1034U+00B5 native 1035U+00B5 string 1036