1#!perl -w 2use 5.015; 3use strict; 4use warnings; 5use Unicode::UCD qw(prop_aliases 6 prop_values 7 prop_value_aliases 8 prop_invlist 9 prop_invmap search_invlist 10 charprop 11 num 12 charblock 13 ); 14require './regen/regen_lib.pl'; 15require './regen/charset_translations.pl'; 16require './lib/unicore/Heavy.pl'; 17use re "/aa"; 18 19# This program outputs charclass_invlists.h, which contains various inversion 20# lists in the form of C arrays that are to be used as-is for inversion lists. 21# Thus, the lists it contains are essentially pre-compiled, and need only a 22# light-weight fast wrapper to make them usable at run-time. 23 24# As such, this code knows about the internal structure of these lists, and 25# any change made to that has to be done here as well. A random number stored 26# in the headers is used to minimize the possibility of things getting 27# out-of-sync, or the wrong data structure being passed. Currently that 28# random number is: 29 30my $VERSION_DATA_STRUCTURE_TYPE = 148565664; 31 32# charclass_invlists.h now also contains inversion maps and enum definitions 33# for those maps that have a finite number of possible values 34 35# integer or float (no exponent) 36my $integer_or_float_re = qr/ ^ -? \d+ (:? \. \d+ )? $ /x; 37 38# Also includes rationals 39my $numeric_re = qr! $integer_or_float_re | ^ -? \d+ / \d+ $ !x; 40 41# More than one code point may have the same code point as their fold. This 42# gives the maximum number in the current Unicode release. (The folded-to 43# code point is not included in this count.) Most folds are pairs of code 44# points, like 'B' and 'b', so this number is at least one. 45my $max_fold_froms = 1; 46 47my %keywords; 48my $table_name_prefix = "UNI_"; 49 50# Matches valid C language enum names: begins with ASCII alphabetic, then any 51# ASCII \w 52my $enum_name_re = qr / ^ [[:alpha:]] \w* $ /ax; 53 54my $out_fh = open_new('charclass_invlists.h', '>', 55 {style => '*', by => 'regen/mk_invlists.pl', 56 from => "Unicode::UCD"}); 57 58my $in_file_pound_if = ""; 59 60my $max_hdr_len = 3; # In headings, how wide a name is allowed? 61 62print $out_fh "/* See the generating file for comments */\n\n"; 63 64# enums that should be made public 65my %public_enums = ( 66 _Perl_SCX => 1 67 ); 68 69# The symbols generated by this program are all currently defined only in a 70# single dot c each. The code knows where most of them go, but this hash 71# gives overrides for the exceptions to the typical place 72my %exceptions_to_where_to_define = 73 ( 74 #_Perl_IVCF => 'PERL_IN_REGCOMP_C', 75 ); 76 77my %where_to_define_enums = (); 78 79my $applies_to_all_charsets_text = "all charsets"; 80 81my %gcb_enums; 82my @gcb_short_enums; 83my %gcb_abbreviations; 84my %lb_enums; 85my @lb_short_enums; 86my %lb_abbreviations; 87my %wb_enums; 88my @wb_short_enums; 89my %wb_abbreviations; 90 91my @a2n; 92 93my %prop_name_aliases; 94# Invert this hash so that for each canonical name, we get a list of things 95# that map to it (excluding itself) 96foreach my $name (sort keys %utf8::loose_property_name_of) { 97 my $canonical = $utf8::loose_property_name_of{$name}; 98 push @{$prop_name_aliases{$canonical}}, $name if $canonical ne $name; 99} 100 101# Output these tables in the same vicinity as each other, so that will get 102# paged in at about the same time. These are also assumed to be the exact 103# same list as those properties used internally by perl. 104my %keep_together = ( 105 assigned => 1, 106 ascii => 1, 107 upper => 1, 108 lower => 1, 109 title => 1, 110 cased => 1, 111 uppercaseletter => 1, 112 lowercaseletter => 1, 113 titlecaseletter => 1, 114 casedletter => 1, 115 vertspace => 1, 116 xposixalnum => 1, 117 xposixalpha => 1, 118 xposixblank => 1, 119 xposixcntrl => 1, 120 xposixdigit => 1, 121 xposixgraph => 1, 122 xposixlower => 1, 123 xposixprint => 1, 124 xposixpunct => 1, 125 xposixspace => 1, 126 xposixupper => 1, 127 xposixword => 1, 128 xposixxdigit => 1, 129 posixalnum => 1, 130 posixalpha => 1, 131 posixblank => 1, 132 posixcntrl => 1, 133 posixdigit => 1, 134 posixgraph => 1, 135 posixlower => 1, 136 posixprint => 1, 137 posixpunct => 1, 138 posixspace => 1, 139 posixupper => 1, 140 posixword => 1, 141 posixxdigit => 1, 142 _perl_any_folds => 1, 143 _perl_folds_to_multi_char => 1, 144 _perl_is_in_multi_char_fold => 1, 145 _perl_non_final_folds => 1, 146 _perl_idstart => 1, 147 _perl_idcont => 1, 148 _perl_charname_begin => 1, 149 _perl_charname_continue => 1, 150 _perl_problematic_locale_foldeds_start => 1, 151 _perl_problematic_locale_folds => 1, 152 _perl_quotemeta => 1, 153 ); 154my %perl_tags; # So can find synonyms of the above properties 155 156my $unused_table_hdr = 'u'; # Heading for row or column for unused values 157 158sub uniques { 159 # Returns non-duplicated input values. From "Perl Best Practices: 160 # Encapsulated Cleverness". p. 455 in first edition. 161 162 my %seen; 163 return grep { ! $seen{$_}++ } @_; 164} 165 166sub a2n($) { 167 my $cp = shift; 168 169 # Returns the input Unicode code point translated to native. 170 171 return $cp if $cp !~ $integer_or_float_re || $cp > 255; 172 return $a2n[$cp]; 173} 174 175sub end_file_pound_if { 176 if ($in_file_pound_if) { 177 print $out_fh "\n#endif\t/* $in_file_pound_if */\n"; 178 $in_file_pound_if = ""; 179 } 180} 181 182sub end_charset_pound_if { 183 print $out_fh "\n" . get_conditional_compile_line_end(); 184} 185 186sub switch_pound_if ($$;$) { 187 my $name = shift; 188 my $new_pound_if = shift; 189 my $charset = shift; 190 191 my @new_pound_if = ref ($new_pound_if) 192 ? sort @$new_pound_if 193 : $new_pound_if; 194 195 # Switch to new #if given by the 2nd argument. If there is an override 196 # for this, it instead switches to that. The 1st argument is the 197 # static's name, used only to check if there is an override for this 198 # 199 # The 'charset' parmameter, if present, is used to first end the charset 200 # #if if we actually do a switch, and then restart it afterwards. This 201 # code, then assumes that the charset #if's are enclosed in the file ones. 202 203 if (exists $exceptions_to_where_to_define{$name}) { 204 @new_pound_if = $exceptions_to_where_to_define{$name}; 205 } 206 207 foreach my $element (@new_pound_if) { 208 209 # regcomp.c is arranged so that the tables are not compiled in 210 # re_comp.c */ 211 my $no_xsub = 1 if $element =~ / PERL_IN_ (?: REGCOMP ) _C /x; 212 $element = "defined($element)"; 213 $element = "($element && ! defined(PERL_IN_XSUB_RE))" if $no_xsub; 214 } 215 $new_pound_if = join " || ", @new_pound_if; 216 217 # Change to the new one if different from old 218 if ($in_file_pound_if ne $new_pound_if) { 219 220 end_charset_pound_if() if defined $charset; 221 222 # Exit any current #if 223 if ($in_file_pound_if) { 224 end_file_pound_if; 225 } 226 227 $in_file_pound_if = $new_pound_if; 228 print $out_fh "\n#if $in_file_pound_if\n"; 229 230 start_charset_pound_if ($charset, 1) if defined $charset; 231 } 232} 233 234sub start_charset_pound_if ($;$) { 235 print $out_fh "\n" . get_conditional_compile_line_start(shift, shift); 236} 237 238{ # Closure 239 my $fh; 240 my $in_doinit = 0; 241 242 sub output_table_header($$$;$@) { 243 244 # Output to $fh the heading for a table given by the other inputs 245 246 $fh = shift; 247 my ($type, # typedef of table, like UV, UV* 248 $name, # name of table 249 $comment, # Optional comment to put on header line 250 @sizes # Optional sizes of each array index. If omitted, 251 # there is a single index whose size is computed by 252 # the C compiler. 253 ) = @_; 254 255 $type =~ s/ \s+ $ //x; 256 257 # If a the typedef is a ptr, add in an extra const 258 $type .= " const" if $type =~ / \* $ /x; 259 260 $comment = "" unless defined $comment; 261 $comment = " /* $comment */" if $comment; 262 263 my $array_declaration; 264 if (@sizes) { 265 $array_declaration = ""; 266 $array_declaration .= "[$_]" for @sizes; 267 } 268 else { 269 $array_declaration = '[]'; 270 } 271 272 my $declaration = "$type ${name}$array_declaration"; 273 274 # Things not matching this are static. Otherwise, it is an external 275 # constant, initialized only under DOINIT. 276 # 277 # (Currently everything is static) 278 if ($in_file_pound_if !~ / PERL_IN_ (?: ) _C /x) { 279 $in_doinit = 0; 280 print $fh "\nstatic const $declaration = {$comment\n"; 281 } 282 else { 283 $in_doinit = 1; 284 print $fh <<EOF; 285 286# ifndef DOINIT 287 288EXTCONST $declaration; 289 290# else 291 292EXTCONST $declaration = {$comment 293EOF 294 } 295 } 296 297 sub output_table_trailer() { 298 299 # Close out a table started by output_table_header() 300 301 print $fh "};\n"; 302 if ($in_doinit) { 303 print $fh "\n# endif /* DOINIT */\n\n"; 304 $in_doinit = 0; 305 } 306 } 307} # End closure 308 309 310sub output_invlist ($$;$) { 311 my $name = shift; 312 my $invlist = shift; # Reference to inversion list array 313 my $charset = shift // ""; # name of character set for comment 314 315 die "No inversion list for $name" unless defined $invlist 316 && ref $invlist eq 'ARRAY'; 317 318 # Output the inversion list $invlist using the name $name for it. 319 # It is output in the exact internal form for inversion lists. 320 321 # Is the last element of the header 0, or 1 ? 322 my $zero_or_one = 0; 323 if (@$invlist && $invlist->[0] != 0) { 324 unshift @$invlist, 0; 325 $zero_or_one = 1; 326 } 327 328 $charset = "for $charset" if $charset; 329 output_table_header($out_fh, "UV", "${name}_invlist", $charset); 330 331 my $count = @$invlist; 332 print $out_fh <<EOF; 333\t$count,\t/* Number of elements */ 334\t$VERSION_DATA_STRUCTURE_TYPE, /* Version and data structure type */ 335\t$zero_or_one,\t/* 0 if the list starts at 0; 336\t\t 1 if it starts at the element beyond 0 */ 337EOF 338 339 # The main body are the UVs passed in to this routine. Do the final 340 # element separately 341 for my $i (0 .. @$invlist - 1) { 342 printf $out_fh "\t0x%X", $invlist->[$i]; 343 print $out_fh "," if $i < @$invlist - 1; 344 print $out_fh "\n"; 345 } 346 347 output_table_trailer(); 348} 349 350sub output_invmap ($$$$$$$) { 351 my $name = shift; 352 my $invmap = shift; # Reference to inversion map array 353 my $prop_name = shift; 354 my $input_format = shift; # The inversion map's format 355 my $default = shift; # The property value for code points who 356 # otherwise don't have a value specified. 357 my $extra_enums = shift; # comma-separated list of our additions to the 358 # property's standard possible values 359 my $charset = shift // ""; # name of character set for comment 360 361 # Output the inversion map $invmap for property $prop_name, but use $name 362 # as the actual data structure's name. 363 364 my $count = @$invmap; 365 366 my $output_format; 367 my $invmap_declaration_type; 368 my $enum_declaration_type; 369 my $aux_declaration_type; 370 my %enums; 371 my $name_prefix; 372 373 if ($input_format =~ / ^ [as] l? $ /x) { 374 $prop_name = (prop_aliases($prop_name))[1] // $prop_name =~ s/^_Perl_//r; # Get full name 375 my $short_name = (prop_aliases($prop_name))[0] // $prop_name; 376 my @input_enums; 377 378 # Find all the possible input values. These become the enum names 379 # that comprise the inversion map. For inputs that don't have sub 380 # lists, we can just get the unique values. Otherwise, we have to 381 # expand the sublists first. 382 if ($input_format !~ / ^ a /x) { 383 if ($input_format ne 'sl') { 384 @input_enums = sort(uniques(@$invmap)); 385 } 386 else { 387 foreach my $element (@$invmap) { 388 if (ref $element) { 389 push @input_enums, @$element; 390 } 391 else { 392 push @input_enums, $element; 393 } 394 } 395 @input_enums = sort(uniques(@input_enums)); 396 } 397 } 398 399 # The internal enums come last, and in the order specified. 400 # 401 # The internal one named EDGE is also used a marker. Any ones that 402 # come after it are used in the algorithms below, and so must be 403 # defined, even if the release of Unicode this is being compiled for 404 # doesn't use them. But since no code points are assigned to them in 405 # such a release, those values will never be accessed. We collapse 406 # all of them into a single placholder row and a column. The 407 # algorithms below will fill in those cells with essentially garbage, 408 # but they are never read, so it doesn't matter. This allows the 409 # algorithm to remain the same from release to release. 410 # 411 # In one case, regexec.c also uses a placeholder which must be defined 412 # here, and we put it in the unused row and column as its value is 413 # never read. 414 # 415 my @enums = @input_enums; 416 my @extras; 417 my @unused_enums; 418 my $unused_enum_value = @enums; 419 if ($extra_enums ne "") { 420 @extras = split /,/, $extra_enums; 421 my $seen_EDGE = 0; 422 423 # Don't add if already there. 424 foreach my $this_extra (@extras) { 425 next if grep { $_ eq $this_extra } @enums; 426 if ($this_extra eq 'EDGE') { 427 push @enums, $this_extra; 428 $seen_EDGE = 1; 429 } 430 elsif ($seen_EDGE) { 431 push @unused_enums, $this_extra; 432 } 433 else { 434 push @enums, $this_extra; 435 } 436 } 437 438 @unused_enums = sort @unused_enums; 439 $unused_enum_value = @enums; # All unused have the same value, 440 # one beyond the final used one 441 } 442 443 # Assign a value to each element of the enum type we are creating. 444 # The default value always gets 0; the others are arbitrarily 445 # assigned. 446 my $enum_val = 0; 447 my $canonical_default = prop_value_aliases($prop_name, $default); 448 $default = $canonical_default if defined $canonical_default; 449 $enums{$default} = $enum_val++; 450 451 for my $enum (@enums) { 452 $enums{$enum} = $enum_val++ unless exists $enums{$enum}; 453 } 454 455 # Calculate the data for the special tables output for these properties. 456 if ($name =~ / ^ _Perl_ (?: GCB | LB | WB ) $ /x) { 457 458 # The data includes the hashes %gcb_enums, %lb_enums, etc. 459 # Similarly we calculate column headings for the tables. 460 # 461 # We use string evals to allow the same code to work on 462 # all the tables 463 my $type = lc $prop_name; 464 465 # Skip if we've already done this code, which populated 466 # this hash 467 if (eval "! \%${type}_enums") { 468 469 # For each enum in the type ... 470 foreach my $enum (sort keys %enums) { 471 my $value = $enums{$enum}; 472 my $short; 473 my $abbreviated_from; 474 475 # Special case this wb property value to make the 476 # name more clear 477 if ($enum eq 'Perl_Tailored_HSpace') { 478 $short = 'hs'; 479 $abbreviated_from = $enum; 480 } 481 else { 482 483 # Use the official short name, if found. 484 ($short) = prop_value_aliases($type, $enum); 485 486 if (! defined $short) { 487 488 # But if there is no official name, use the name 489 # that came from the data (if any). Otherwise, 490 # the name had to come from the extras list. 491 # There are two types of values in that list. 492 # 493 # First are those enums that are not part of the 494 # property, but are defined by this code. By 495 # convention these have all-caps names. We use 496 # the lowercased name for these. 497 # 498 # Second are enums that are needed to get the 499 # algorithms below to work and/or to get regexec.c 500 # to compile, but don't exist in all Unicode 501 # releases. These are handled outside this loop 502 # as 'unused_enums' 503 if (grep { $_ eq $enum } @input_enums) { 504 $short = $enum 505 } 506 else { 507 $short = lc $enum; 508 } 509 } 510 } 511 512 # If our short name is too long, or we already 513 # know that the name is an abbreviation, truncate 514 # to make sure it's short enough, and remember 515 # that we did this so we can later add a comment in the 516 # generated file 517 if ( $abbreviated_from 518 || length $short > $max_hdr_len) 519 { 520 $short = substr($short, 0, $max_hdr_len); 521 $abbreviated_from = $enum 522 unless $abbreviated_from; 523 # If the name we are to display conflicts, try 524 # another. 525 while (eval "exists 526 \$${type}_abbreviations{$short}") 527 { 528 die $@ if $@; 529 530 # The increment operator on strings doesn't work 531 # on those containing an '_', so just use the 532 # final portion. 533 my @short = split '_', $short; 534 $short[-1]++; 535 $short = join "_", @short; 536 } 537 538 eval "\$${type}_abbreviations{$short} = '$enum'"; 539 die $@ if $@; 540 } 541 542 # Remember the mapping from the property value 543 # (enum) name to its value. 544 eval "\$${type}_enums{$enum} = $value"; 545 die $@ if $@; 546 547 # Remember the inverse mapping to the short name 548 # so that we can properly label the generated 549 # table's rows and columns 550 eval "\$${type}_short_enums[$value] = '$short'"; 551 die $@ if $@; 552 } 553 554 # Each unused enum has the same value. They all are collapsed 555 # into one row and one column, named $unused_table_hdr. 556 if (@unused_enums) { 557 eval "\$${type}_short_enums['$unused_enum_value'] = '$unused_table_hdr'"; 558 die $@ if $@; 559 560 foreach my $enum (@unused_enums) { 561 eval "\$${type}_enums{$enum} = $unused_enum_value"; 562 die $@ if $@; 563 } 564 } 565 } 566 } 567 568 # The short names tend to be two lower case letters, but it looks 569 # better for those if they are upper. XXX 570 $short_name = uc($short_name) if length($short_name) < 3 571 || substr($short_name, 0, 1) =~ /[[:lower:]]/; 572 $name_prefix = "${short_name}_"; 573 574 # Start the enum definition for this map 575 my @enum_definition; 576 my @enum_list; 577 foreach my $enum (keys %enums) { 578 $enum_list[$enums{$enum}] = $enum; 579 } 580 foreach my $i (0 .. @enum_list - 1) { 581 push @enum_definition, ",\n" if $i > 0; 582 583 my $name = $enum_list[$i]; 584 push @enum_definition, "\t${name_prefix}$name = $i"; 585 } 586 if (@unused_enums) { 587 foreach my $unused (@unused_enums) { 588 push @enum_definition, 589 ",\n\t${name_prefix}$unused = $unused_enum_value"; 590 } 591 } 592 593 # For an 'l' property, we need extra enums, because some of the 594 # elements are lists. Each such distinct list is placed in its own 595 # auxiliary map table. Here, we go through the inversion map, and for 596 # each distinct list found, create an enum value for it, numbered -1, 597 # -2, .... 598 my %multiples; 599 my $aux_table_prefix = "AUX_TABLE_"; 600 if ($input_format =~ /l/) { 601 foreach my $element (@$invmap) { 602 603 # A regular scalar is not one of the lists we're looking for 604 # at this stage. 605 next unless ref $element; 606 607 my $joined; 608 if ($input_format =~ /a/) { # These are already ordered 609 $joined = join ",", @$element; 610 } 611 else { 612 $joined = join ",", sort @$element; 613 } 614 my $already_found = exists $multiples{$joined}; 615 616 my $i; 617 if ($already_found) { # Use any existing one 618 $i = $multiples{$joined}; 619 } 620 else { # Otherwise increment to get a new table number 621 $i = keys(%multiples) + 1; 622 $multiples{$joined} = $i; 623 } 624 625 # This changes the inversion map for this entry to not be the 626 # list 627 $element = "use_$aux_table_prefix$i"; 628 629 # And add to the enum values 630 if (! $already_found) { 631 push @enum_definition, ",\n\t${name_prefix}$element = -$i"; 632 } 633 } 634 } 635 636 $enum_declaration_type = "${name_prefix}enum"; 637 638 # Finished with the enum definition. Inversion map stuff is used only 639 # by regexec or utf-8 (if it is for code points) , unless it is in the 640 # enum exception list 641 my $where = (exists $where_to_define_enums{$name}) 642 ? $where_to_define_enums{$name} 643 : ($input_format =~ /a/) 644 ? 'PERL_IN_UTF8_C' 645 : 'PERL_IN_REGEXEC_C'; 646 647 if (! exists $public_enums{$name}) { 648 switch_pound_if($name, $where, $charset); 649 } 650 else { 651 end_charset_pound_if; 652 end_file_pound_if; 653 start_charset_pound_if($charset, 1); 654 } 655 656 # If the enum only contains one element, that is a dummy, default one 657 if (scalar @enum_definition > 1) { 658 659 # Currently unneeded 660 #print $out_fh "\n#define ${name_prefix}ENUM_COUNT ", 661 # ..scalar keys %enums, "\n"; 662 663 if ($input_format =~ /l/) { 664 print $out_fh 665 "\n", 666 "/* Negative enum values indicate the need to use an", 667 " auxiliary table\n", 668 " * consisting of the list of enums this one expands to.", 669 " The absolute\n", 670 " * values of the negative enums are indices into a table", 671 " of the auxiliary\n", 672 " * tables' addresses */"; 673 } 674 print $out_fh "\ntypedef enum {\n"; 675 print $out_fh join "", @enum_definition; 676 print $out_fh "\n"; 677 print $out_fh "} $enum_declaration_type;\n"; 678 } 679 680 switch_pound_if($name, $where, $charset); 681 682 $invmap_declaration_type = ($input_format =~ /s/) 683 ? $enum_declaration_type 684 : "int"; 685 $aux_declaration_type = ($input_format =~ /s/) 686 ? $enum_declaration_type 687 : "unsigned int"; 688 689 $output_format = "${name_prefix}%s"; 690 691 # If there are auxiliary tables, output them. 692 if (%multiples) { 693 694 print $out_fh "\n#define HAS_${name_prefix}AUX_TABLES\n"; 695 696 # Invert keys and values 697 my %inverted_mults; 698 while (my ($key, $value) = each %multiples) { 699 $inverted_mults{$value} = $key; 700 } 701 702 # Output them in sorted order 703 my @sorted_table_list = sort { $a <=> $b } keys %inverted_mults; 704 705 # Keep track of how big each aux table is 706 my @aux_counts; 707 708 # Output each aux table. 709 foreach my $table_number (@sorted_table_list) { 710 my $table = $inverted_mults{$table_number}; 711 output_table_header($out_fh, 712 $aux_declaration_type, 713 "$name_prefix$aux_table_prefix$table_number"); 714 715 # Earlier, we joined the elements of this table together with a comma 716 my @elements = split ",", $table; 717 718 $aux_counts[$table_number] = scalar @elements; 719 for my $i (0 .. @elements - 1) { 720 print $out_fh ",\n" if $i > 0; 721 if ($input_format =~ /a/) { 722 printf $out_fh "\t0x%X", $elements[$i]; 723 } 724 else { 725 print $out_fh "\t${name_prefix}$elements[$i]"; 726 } 727 } 728 729 print $out_fh "\n"; 730 output_table_trailer(); 731 } 732 733 # Output the table that is indexed by the absolute value of the 734 # aux table enum and contains pointers to the tables output just 735 # above 736 output_table_header($out_fh, "$aux_declaration_type *", 737 "${name_prefix}${aux_table_prefix}ptrs"); 738 print $out_fh "\tNULL,\t/* Placeholder */\n"; 739 for my $i (1 .. @sorted_table_list) { 740 print $out_fh ",\n" if $i > 1; 741 print $out_fh "\t$name_prefix$aux_table_prefix$i"; 742 } 743 print $out_fh "\n"; 744 output_table_trailer(); 745 746 print $out_fh 747 "\n/* Parallel table to the above, giving the number of elements" 748 . " in each table\n * pointed to */\n"; 749 output_table_header($out_fh, "U8", 750 "${name_prefix}${aux_table_prefix}lengths"); 751 print $out_fh "\t0,\t/* Placeholder */\n"; 752 for my $i (1 .. @sorted_table_list) { 753 print $out_fh ",\n" if $i > 1; 754 print $out_fh "\t$aux_counts[$i]\t/* $name_prefix$aux_table_prefix$i */"; 755 } 756 print $out_fh "\n"; 757 output_table_trailer(); 758 } # End of outputting the auxiliary and associated tables 759 760 # The scx property used in regexec.c needs a specialized table which 761 # is most convenient to output here, while the data structures set up 762 # above are still extant. This table contains the code point that is 763 # the zero digit of each script, indexed by script enum value. 764 if (lc $short_name eq 'scx') { 765 my @decimals_invlist = prop_invlist("Numeric_Type=Decimal"); 766 my %script_zeros; 767 768 # Find all the decimal digits. The 0 of each range is always the 769 # 0th element, except in some early Unicode releases, so check for 770 # that. 771 for (my $i = 0; $i < @decimals_invlist; $i += 2) { 772 my $code_point = $decimals_invlist[$i]; 773 next if num(chr($code_point)) ne '0'; 774 775 # Turn the scripts this zero is in into a list. 776 my @scripts = split ",", 777 charprop($code_point, "_Perl_SCX", '_perl_core_internal_ok'); 778 $code_point = sprintf("0x%x", $code_point); 779 780 foreach my $script (@scripts) { 781 if (! exists $script_zeros{$script}) { 782 $script_zeros{$script} = $code_point; 783 } 784 elsif (ref $script_zeros{$script}) { 785 push $script_zeros{$script}->@*, $code_point; 786 } 787 else { # Turn into a list if this is the 2nd zero of the 788 # script 789 my $existing = $script_zeros{$script}; 790 undef $script_zeros{$script}; 791 push $script_zeros{$script}->@*, $existing, $code_point; 792 } 793 } 794 } 795 796 # @script_zeros contains the zero, sorted by the script's enum 797 # value 798 my @script_zeros; 799 foreach my $script (keys %script_zeros) { 800 my $enum_value = $enums{$script}; 801 $script_zeros[$enum_value] = $script_zeros{$script}; 802 } 803 804 print $out_fh 805 "\n/* This table, indexed by the script enum, gives the zero" 806 . " code point for that\n * script; 0 if the script has multiple" 807 . " digit sequences. Scripts without a\n * digit sequence use" 808 . " ASCII [0-9], hence are marked '0' */\n"; 809 output_table_header($out_fh, "UV", "script_zeros"); 810 for my $i (0 .. @script_zeros - 1) { 811 my $code_point = $script_zeros[$i]; 812 if (defined $code_point) { 813 $code_point = " 0" if ref $code_point; 814 print $out_fh "\t$code_point"; 815 } 816 elsif (lc $enum_list[$i] eq 'inherited') { 817 print $out_fh "\t 0"; 818 } 819 else { # The only digits a script without its own set accepts 820 # is [0-9] 821 print $out_fh "\t'0'"; 822 } 823 print $out_fh "," if $i < @script_zeros - 1; 824 print $out_fh "\t/* $enum_list[$i] */"; 825 print $out_fh "\n"; 826 } 827 output_table_trailer(); 828 } # End of special handling of scx 829 } 830 else { 831 die "'$input_format' invmap() format for '$prop_name' unimplemented"; 832 } 833 834 die "No inversion map for $prop_name" unless defined $invmap 835 && ref $invmap eq 'ARRAY' 836 && $count; 837 838 # Now output the inversion map proper 839 $charset = "for $charset" if $charset; 840 output_table_header($out_fh, $invmap_declaration_type, 841 "${name}_invmap", 842 $charset); 843 844 # The main body are the scalars passed in to this routine. 845 for my $i (0 .. $count - 1) { 846 my $element = $invmap->[$i]; 847 my $full_element_name = prop_value_aliases($prop_name, $element); 848 if ($input_format =~ /a/ && $element !~ /\D/) { 849 $element = ($element == 0) 850 ? 0 851 : sprintf("0x%X", $element); 852 } 853 else { 854 $element = $full_element_name if defined $full_element_name; 855 $element = $name_prefix . $element; 856 } 857 print $out_fh "\t$element"; 858 print $out_fh "," if $i < $count - 1; 859 print $out_fh "\n"; 860 } 861 output_table_trailer(); 862} 863 864sub mk_invlist_from_sorted_cp_list { 865 866 # Returns an inversion list constructed from the sorted input array of 867 # code points 868 869 my $list_ref = shift; 870 871 return unless @$list_ref; 872 873 # Initialize to just the first element 874 my @invlist = ( $list_ref->[0], $list_ref->[0] + 1); 875 876 # For each succeeding element, if it extends the previous range, adjust 877 # up, otherwise add it. 878 for my $i (1 .. @$list_ref - 1) { 879 if ($invlist[-1] == $list_ref->[$i]) { 880 $invlist[-1]++; 881 } 882 else { 883 push @invlist, $list_ref->[$i], $list_ref->[$i] + 1; 884 } 885 } 886 return @invlist; 887} 888 889# Read in the Case Folding rules, and construct arrays of code points for the 890# properties we need. 891my ($cp_ref, $folds_ref, $format, $default) = prop_invmap("Case_Folding"); 892die "Could not find inversion map for Case_Folding" unless defined $format; 893die "Incorrect format '$format' for Case_Folding inversion map" 894 unless $format eq 'al' 895 || $format eq 'a'; 896sub _Perl_IVCF { 897 898 # This creates a map of the inversion of case folding. i.e., given a 899 # character, it gives all the other characters that fold to it. 900 # 901 # Inversion maps function kind of like a hash, with the inversion list 902 # specifying the buckets (keys) and the inversion maps specifying the 903 # contents of the corresponding bucket. Effectively this function just 904 # swaps the keys and values of the case fold hash. But there are 905 # complications. Most importantly, More than one character can each have 906 # the same fold. This is solved by having a list of characters that fold 907 # to a given one. 908 909 my %new; 910 911 # Go through the inversion list. 912 for (my $i = 0; $i < @$cp_ref; $i++) { 913 914 # Skip if nothing folds to this 915 next if $folds_ref->[$i] == 0; 916 917 # This entry which is valid from here to up (but not including) the 918 # next entry is for the next $count characters, so that, for example, 919 # A-Z is represented by one entry. 920 my $cur_list = $cp_ref->[$i]; 921 my $count = $cp_ref->[$i+1] - $cur_list; 922 923 # The fold of [$i] can be not just a single character, but a sequence 924 # of multiple ones. We deal with those here by just creating a string 925 # consisting of them. Otherwise, we use the single code point [$i] 926 # folds to. 927 my $cur_map = (ref $folds_ref->[$i]) 928 ? join "", map { chr } $folds_ref->[$i]->@* 929 : $folds_ref->[$i]; 930 931 # Expand out this range 932 while ($count > 0) { 933 push @{$new{$cur_map}}, $cur_list; 934 935 # A multiple-character fold is a string, and shouldn't need 936 # incrementing anyway 937 if (ref $folds_ref->[$i]) { 938 die sprintf("Case fold for %x is multiple chars; should have" 939 . " a count of 1, but instead it was $count", $count) 940 unless $count == 1; 941 } 942 else { 943 $cur_map++; 944 $cur_list++; 945 } 946 $count--; 947 } 948 } 949 950 # Now go through and make some adjustments. We add synthetic entries for 951 # two cases. 952 # 1) Two or more code points can fold to the same multiple character, 953 # sequence, as U+FB05 and U+FB06 both fold to 'st'. This code is only 954 # for single character folds, but FB05 and FB06 are single characters 955 # that are equivalent folded, so we add entries so that they are 956 # considered to fold to each other 957 # 2) If two or more above-Latin1 code points fold to the same Latin1 range 958 # one, we also add entries so that they are considered to fold to each 959 # other. This is so that under /aa or /l matching, where folding to 960 # their Latin1 range code point is illegal, they still can fold to each 961 # other. This situation happens in Unicode 3.0.1, but probably no 962 # other version. 963 foreach my $fold (keys %new) { 964 my $folds_to_string = $fold =~ /\D/; 965 966 # If the bucket contains only one element, convert from an array to a 967 # scalar 968 if (scalar $new{$fold}->@* == 1) { 969 $new{$fold} = $new{$fold}[0]; 970 } 971 else { 972 973 # Otherwise, sort numerically. This places the highest code point 974 # in the list at the tail end. This is because Unicode keeps the 975 # lowercase code points as higher ordinals than the uppercase, at 976 # least for the ones that matter so far. These are synthetic 977 # entries, and we want to predictably have the lowercase (which is 978 # more likely to be what gets folded to) in the same corresponding 979 # position, so that other code can rely on that. If some new 980 # version of Unicode came along that violated this, we might have 981 # to change so that the sort is based on upper vs lower instead. 982 # (The lower-comes-after isn't true of native EBCDIC, but here we 983 # are dealing strictly with Unicode values). 984 @{$new{$fold}} = sort { $a <=> $b } $new{$fold}->@* 985 unless $folds_to_string; 986 # We will be working with a copy of this sorted entry. 987 my @source_list = $new{$fold}->@*; 988 if (! $folds_to_string) { 989 990 # This handles situation 2) listed above, which only arises if 991 # what is being folded-to (the fold) is in the Latin1 range. 992 if ($fold > 255 ) { 993 undef @source_list; 994 } 995 else { 996 # And it only arises if there are two or more folders that 997 # fold to it above Latin1. We look at just those. 998 @source_list = grep { $_ > 255 } @source_list; 999 undef @source_list if @source_list == 1; 1000 } 1001 } 1002 1003 # Here, we've found the items we want to set up synthetic folds 1004 # for. Add entries so that each folds to each other. 1005 foreach my $cp (@source_list) { 1006 my @rest = grep { $cp != $_ } @source_list; 1007 if (@rest == 1) { 1008 $new{$cp} = $rest[0]; 1009 } 1010 else { 1011 push @{$new{$cp}}, @rest; 1012 } 1013 } 1014 } 1015 1016 # We don't otherwise deal with multiple-character folds 1017 delete $new{$fold} if $folds_to_string; 1018 } 1019 1020 1021 # Now we have a hash that is the inversion of the case fold property. 1022 # First find the maximum number of code points that fold to the same one. 1023 foreach my $fold_to (keys %new) { 1024 if (ref $new{$fold_to}) { 1025 my $folders_count = scalar @{$new{$fold_to}}; 1026 $max_fold_froms = $folders_count if $folders_count > $max_fold_froms; 1027 } 1028 } 1029 1030 # Then convert the hash to an inversion map. 1031 my @sorted_folds = sort { $a <=> $b } keys %new; 1032 my (@invlist, @invmap); 1033 1034 # We know that nothing folds to the controls (whose ordinals start at 0). 1035 # And the first real entries are the lowest in the hash. 1036 push @invlist, 0, $sorted_folds[0]; 1037 push @invmap, 0, $new{$sorted_folds[0]}; 1038 1039 # Go through the remainder of the hash keys (which are the folded code 1040 # points) 1041 for (my $i = 1; $i < @sorted_folds; $i++) { 1042 1043 # Get the current one, and the one prior to it. 1044 my $fold = $sorted_folds[$i]; 1045 my $prev_fold = $sorted_folds[$i-1]; 1046 1047 # If the current one is not just 1 away from the prior one, we close 1048 # out the range containing the previous fold, and know that the gap 1049 # doesn't have anything that folds. 1050 if ($fold - 1 != $prev_fold) { 1051 push @invlist, $prev_fold + 1; 1052 push @invmap, 0; 1053 1054 # And start a new range 1055 push @invlist, $fold; 1056 push @invmap, $new{$fold}; 1057 } 1058 elsif ($new{$fold} - 1 != $new{$prev_fold}) { 1059 1060 # Here the current fold is just 1 greater than the previous, but 1061 # the new map isn't correspondingly 1 greater than the previous, 1062 # the old range is ended, but since there is no gap, we don't have 1063 # to insert anything else. 1064 push @invlist, $fold; 1065 push @invmap, $new{$fold}; 1066 1067 } # else { Otherwise, this new entry just extends the previous } 1068 1069 die "In IVCF: $invlist[-1] <= $invlist[-2]" 1070 if $invlist[-1] <= $invlist[-2]; 1071 } 1072 1073 # And add an entry that indicates that everything above this, to infinity, 1074 # does not have a case fold. 1075 push @invlist, $sorted_folds[-1] + 1; 1076 push @invmap, 0; 1077 1078 # All Unicode versions have some places where multiple code points map to 1079 # the same one, so the format always has an 'l' 1080 return \@invlist, \@invmap, 'al', $default; 1081} 1082 1083sub prop_name_for_cmp ($) { # Sort helper 1084 my $name = shift; 1085 1086 # Returns the input lowercased, with non-alphas removed, as well as 1087 # everything starting with a comma 1088 1089 $name =~ s/,.*//; 1090 $name =~ s/[[:^alpha:]]//g; 1091 return lc $name; 1092} 1093 1094sub UpperLatin1 { 1095 my @return = mk_invlist_from_sorted_cp_list([ 128 .. 255 ]); 1096 return \@return; 1097} 1098 1099sub _Perl_CCC_non0_non230 { 1100 1101 # Create an inversion list of code points with non-zero canonical 1102 # combining class that also don't have 230 as the class number. This is 1103 # part of a Unicode Standard rule 1104 1105 my @nonzeros = prop_invlist("ccc=0"); 1106 shift @nonzeros; # Invert so is "ccc != 0" 1107 1108 my @return; 1109 1110 # Expand into list of code points, while excluding those with ccc == 230 1111 for (my $i = 0; $i < @nonzeros; $i += 2) { 1112 my $upper = ($i + 1) < @nonzeros 1113 ? $nonzeros[$i+1] - 1 # In range 1114 : $Unicode::UCD::MAX_CP; # To infinity. 1115 for my $j ($nonzeros[$i] .. $upper) { 1116 my @ccc_names = prop_value_aliases("ccc", charprop($j, "ccc")); 1117 1118 # Final element in @ccc_names will be all numeric 1119 push @return, $j if $ccc_names[-1] != 230; 1120 } 1121 } 1122 1123 @return = sort { $a <=> $b } @return; 1124 @return = mk_invlist_from_sorted_cp_list(\@return); 1125 return \@return; 1126} 1127 1128sub output_table_common { 1129 1130 # Common subroutine to actually output the generated rules table. 1131 1132 my ($property, 1133 $table_value_defines_ref, 1134 $table_ref, 1135 $names_ref, 1136 $abbreviations_ref) = @_; 1137 my $size = @$table_ref; 1138 1139 # Output the #define list, sorted by numeric value 1140 if ($table_value_defines_ref) { 1141 my $max_name_length = 0; 1142 my @defines; 1143 1144 # Put in order, and at the same time find the longest name 1145 while (my ($enum, $value) = each %$table_value_defines_ref) { 1146 $defines[$value] = $enum; 1147 1148 my $length = length $enum; 1149 $max_name_length = $length if $length > $max_name_length; 1150 } 1151 1152 print $out_fh "\n"; 1153 1154 # Output, so that the values are vertically aligned in a column after 1155 # the longest name 1156 foreach my $i (0 .. @defines - 1) { 1157 next unless defined $defines[$i]; 1158 printf $out_fh "#define %-*s %2d\n", 1159 $max_name_length, 1160 $defines[$i], 1161 $i; 1162 } 1163 } 1164 1165 my $column_width = 2; # We currently allow 2 digits for the number 1166 1167 # Being above a U8 is not currently handled 1168 my $table_type = 'U8'; 1169 1170 # If a name is longer than the width set aside for a column, its column 1171 # needs to have increased spacing so that the name doesn't get truncated 1172 # nor run into an adjacent column 1173 my @spacers; 1174 1175 # Is there a row and column for unused values in this release? 1176 my $has_unused = $names_ref->[$size-1] eq $unused_table_hdr; 1177 1178 for my $i (0 .. $size - 1) { 1179 no warnings 'numeric'; 1180 $spacers[$i] = " " x (length($names_ref->[$i]) - $column_width); 1181 } 1182 1183 output_table_header($out_fh, $table_type, "${property}_table", undef, $size, $size); 1184 1185 # Calculate the column heading line 1186 my $header_line = "/* " 1187 . (" " x $max_hdr_len) # We let the row heading meld to 1188 # the '*/' for those that are at 1189 # the max 1190 . " " x 3; # Space for '*/ ' 1191 # Now each column 1192 for my $i (0 .. $size - 1) { 1193 $header_line .= sprintf "%s%*s", 1194 $spacers[$i], 1195 $column_width + 1, # 1 for the ',' 1196 $names_ref->[$i]; 1197 } 1198 $header_line .= " */\n"; 1199 1200 # If we have annotations, output it now. 1201 if ($has_unused || scalar %$abbreviations_ref) { 1202 my $text = ""; 1203 foreach my $abbr (sort keys %$abbreviations_ref) { 1204 $text .= "; " if $text; 1205 $text .= "'$abbr' stands for '$abbreviations_ref->{$abbr}'"; 1206 } 1207 if ($has_unused) { 1208 $text .= "; $unused_table_hdr stands for 'unused in this Unicode" 1209 . " release (and the data in the row or column are garbage)" 1210 } 1211 1212 my $indent = " " x 3; 1213 $text = $indent . "/* $text */"; 1214 1215 # Wrap the text so that it is no wider than the table, which the 1216 # header line gives. 1217 my $output_width = length $header_line; 1218 while (length $text > $output_width) { 1219 my $cur_line = substr($text, 0, $output_width); 1220 1221 # Find the first blank back from the right end to wrap at. 1222 for (my $i = $output_width -1; $i > 0; $i--) { 1223 if (substr($text, $i, 1) eq " ") { 1224 print $out_fh substr($text, 0, $i), "\n"; 1225 1226 # Set so will look at just the remaining tail (which will 1227 # be indented and have a '*' after the indent 1228 $text = $indent . " * " . substr($text, $i + 1); 1229 last; 1230 } 1231 } 1232 } 1233 1234 # And any remaining 1235 print $out_fh $text, "\n" if $text; 1236 } 1237 1238 # We calculated the header line earlier just to get its width so that we 1239 # could make sure the annotations fit into that. 1240 print $out_fh $header_line; 1241 1242 # Now output the bulk of the table. 1243 for my $i (0 .. $size - 1) { 1244 1245 # First the row heading. 1246 printf $out_fh "/* %-*s*/ ", $max_hdr_len, $names_ref->[$i]; 1247 print $out_fh "{"; # Then the brace for this row 1248 1249 # Then each column 1250 for my $j (0 .. $size -1) { 1251 print $out_fh $spacers[$j]; 1252 printf $out_fh "%*d", $column_width, $table_ref->[$i][$j]; 1253 print $out_fh "," if $j < $size - 1; 1254 } 1255 print $out_fh " }"; 1256 print $out_fh "," if $i < $size - 1; 1257 print $out_fh "\n"; 1258 } 1259 1260 output_table_trailer(); 1261} 1262 1263sub output_GCB_table() { 1264 1265 # Create and output the pair table for use in determining Grapheme Cluster 1266 # Breaks, given in http://www.unicode.org/reports/tr29/. 1267 my %gcb_actions = ( 1268 GCB_NOBREAK => 0, 1269 GCB_BREAKABLE => 1, 1270 GCB_RI_then_RI => 2, # Rules 12 and 13 1271 GCB_EX_then_EM => 3, # Rule 10 1272 GCB_Maybe_Emoji_NonBreak => 4, 1273 ); 1274 1275 # The table is constructed in reverse order of the rules, to make the 1276 # lower-numbered, higher priority ones override the later ones, as the 1277 # algorithm stops at the earliest matching rule 1278 1279 my @gcb_table; 1280 my $table_size = @gcb_short_enums; 1281 1282 # Otherwise, break everywhere. 1283 # GB99 Any ÷ Any 1284 for my $i (0 .. $table_size - 1) { 1285 for my $j (0 .. $table_size - 1) { 1286 $gcb_table[$i][$j] = 1; 1287 } 1288 } 1289 1290 # Do not break within emoji flag sequences. That is, do not break between 1291 # regional indicator (RI) symbols if there is an odd number of RI 1292 # characters before the break point. Must be resolved in runtime code. 1293 # 1294 # GB12 sot (RI RI)* RI × RI 1295 # GB13 [^RI] (RI RI)* RI × RI 1296 $gcb_table[$gcb_enums{'Regional_Indicator'}] 1297 [$gcb_enums{'Regional_Indicator'}] = $gcb_actions{GCB_RI_then_RI}; 1298 1299 # Post 11.0: GB11 \p{Extended_Pictographic} Extend* ZWJ 1300 # × \p{Extended_Pictographic} 1301 $gcb_table[$gcb_enums{'ZWJ'}][$gcb_enums{'XPG_XX'}] = 1302 $gcb_actions{GCB_Maybe_Emoji_NonBreak}; 1303 1304 # This and the rule GB10 obsolete starting with Unicode 11.0, can be left 1305 # in as there are no code points that match, so the code won't ever get 1306 # executed. 1307 # Do not break within emoji modifier sequences or emoji zwj sequences. 1308 # Pre 11.0: GB11 ZWJ × ( Glue_After_Zwj | E_Base_GAZ ) 1309 $gcb_table[$gcb_enums{'ZWJ'}][$gcb_enums{'Glue_After_Zwj'}] = 0; 1310 $gcb_table[$gcb_enums{'ZWJ'}][$gcb_enums{'E_Base_GAZ'}] = 0; 1311 1312 # GB10 ( E_Base | E_Base_GAZ ) Extend* × E_Modifier 1313 $gcb_table[$gcb_enums{'Extend'}][$gcb_enums{'E_Modifier'}] 1314 = $gcb_actions{GCB_EX_then_EM}; 1315 $gcb_table[$gcb_enums{'E_Base'}][$gcb_enums{'E_Modifier'}] = 0; 1316 $gcb_table[$gcb_enums{'E_Base_GAZ'}][$gcb_enums{'E_Modifier'}] = 0; 1317 1318 # Do not break before extending characters or ZWJ. 1319 # Do not break before SpacingMarks, or after Prepend characters. 1320 # GB9b Prepend × 1321 # GB9a × SpacingMark 1322 # GB9 × ( Extend | ZWJ ) 1323 for my $i (0 .. @gcb_table - 1) { 1324 $gcb_table[$gcb_enums{'Prepend'}][$i] = 0; 1325 $gcb_table[$i][$gcb_enums{'SpacingMark'}] = 0; 1326 $gcb_table[$i][$gcb_enums{'Extend'}] = 0; 1327 $gcb_table[$i][$gcb_enums{'ZWJ'}] = 0; 1328 } 1329 1330 # Do not break Hangul syllable sequences. 1331 # GB8 ( LVT | T) × T 1332 $gcb_table[$gcb_enums{'LVT'}][$gcb_enums{'T'}] = 0; 1333 $gcb_table[$gcb_enums{'T'}][$gcb_enums{'T'}] = 0; 1334 1335 # GB7 ( LV | V ) × ( V | T ) 1336 $gcb_table[$gcb_enums{'LV'}][$gcb_enums{'V'}] = 0; 1337 $gcb_table[$gcb_enums{'LV'}][$gcb_enums{'T'}] = 0; 1338 $gcb_table[$gcb_enums{'V'}][$gcb_enums{'V'}] = 0; 1339 $gcb_table[$gcb_enums{'V'}][$gcb_enums{'T'}] = 0; 1340 1341 # GB6 L × ( L | V | LV | LVT ) 1342 $gcb_table[$gcb_enums{'L'}][$gcb_enums{'L'}] = 0; 1343 $gcb_table[$gcb_enums{'L'}][$gcb_enums{'V'}] = 0; 1344 $gcb_table[$gcb_enums{'L'}][$gcb_enums{'LV'}] = 0; 1345 $gcb_table[$gcb_enums{'L'}][$gcb_enums{'LVT'}] = 0; 1346 1347 # Do not break between a CR and LF. Otherwise, break before and after 1348 # controls. 1349 # GB5 ÷ ( Control | CR | LF ) 1350 # GB4 ( Control | CR | LF ) ÷ 1351 for my $i (0 .. @gcb_table - 1) { 1352 $gcb_table[$i][$gcb_enums{'Control'}] = 1; 1353 $gcb_table[$i][$gcb_enums{'CR'}] = 1; 1354 $gcb_table[$i][$gcb_enums{'LF'}] = 1; 1355 $gcb_table[$gcb_enums{'Control'}][$i] = 1; 1356 $gcb_table[$gcb_enums{'CR'}][$i] = 1; 1357 $gcb_table[$gcb_enums{'LF'}][$i] = 1; 1358 } 1359 1360 # GB3 CR × LF 1361 $gcb_table[$gcb_enums{'CR'}][$gcb_enums{'LF'}] = 0; 1362 1363 # Break at the start and end of text, unless the text is empty 1364 # GB1 sot ÷ 1365 # GB2 ÷ eot 1366 for my $i (0 .. @gcb_table - 1) { 1367 $gcb_table[$i][$gcb_enums{'EDGE'}] = 1; 1368 $gcb_table[$gcb_enums{'EDGE'}][$i] = 1; 1369 } 1370 $gcb_table[$gcb_enums{'EDGE'}][$gcb_enums{'EDGE'}] = 0; 1371 1372 output_table_common('GCB', \%gcb_actions, 1373 \@gcb_table, \@gcb_short_enums, \%gcb_abbreviations); 1374} 1375 1376sub output_LB_table() { 1377 1378 # Create and output the enums, #defines, and pair table for use in 1379 # determining Line Breaks. This uses the default line break algorithm, 1380 # given in http://www.unicode.org/reports/tr14/, but tailored by example 7 1381 # in that page, as the Unicode-furnished tests assume that tailoring. 1382 1383 # The result is really just true or false. But we follow along with tr14, 1384 # creating a rule which is false for something like X SP* X. That gets 1385 # encoding 2. The rest of the actions are synthetic ones that indicate 1386 # some context handling is required. These each are added to the 1387 # underlying 0, 1, or 2, instead of replacing them, so that the underlying 1388 # value can be retrieved. Actually only rules from 7 through 18 (which 1389 # are the ones where space matter) are possible to have 2 added to them. 1390 # The others below add just 0 or 1. It might be possible for one 1391 # synthetic rule to be added to another, yielding a larger value. This 1392 # doesn't happen in the Unicode 8.0 rule set, and as you can see from the 1393 # names of the middle grouping below, it is impossible for that to occur 1394 # for them because they all start with mutually exclusive classes. That 1395 # the final rule can't be added to any of the others isn't obvious from 1396 # its name, so it is assigned a power of 2 higher than the others can get 1397 # to so any addition would preserve all data. (And the code will reach an 1398 # assert(0) on debugging builds should this happen.) 1399 my %lb_actions = ( 1400 LB_NOBREAK => 0, 1401 LB_BREAKABLE => 1, 1402 LB_NOBREAK_EVEN_WITH_SP_BETWEEN => 2, 1403 1404 LB_CM_ZWJ_foo => 3, # Rule 9 1405 LB_SP_foo => 6, # Rule 18 1406 LB_PR_or_PO_then_OP_or_HY => 9, # Rule 25 1407 LB_SY_or_IS_then_various => 11, # Rule 25 1408 LB_HY_or_BA_then_foo => 13, # Rule 21 1409 LB_RI_then_RI => 15, # Rule 30a 1410 1411 LB_various_then_PO_or_PR => (1<<5), # Rule 25 1412 ); 1413 1414 # Construct the LB pair table. This is based on the rules in 1415 # http://www.unicode.org/reports/tr14/, but modified as those rules are 1416 # designed for someone taking a string of text and sequentially going 1417 # through it to find the break opportunities, whereas, Perl requires 1418 # determining if a given random spot is a break opportunity, without 1419 # knowing all the entire string before it. 1420 # 1421 # The table is constructed in reverse order of the rules, to make the 1422 # lower-numbered, higher priority ones override the later ones, as the 1423 # algorithm stops at the earliest matching rule 1424 1425 my @lb_table; 1426 my $table_size = @lb_short_enums; 1427 1428 # LB31. Break everywhere else 1429 for my $i (0 .. $table_size - 1) { 1430 for my $j (0 .. $table_size - 1) { 1431 $lb_table[$i][$j] = $lb_actions{'LB_BREAKABLE'}; 1432 } 1433 } 1434 1435 # LB30b Do not break between an emoji base and an emoji modifier. 1436 # EB × EM 1437 $lb_table[$lb_enums{'E_Base'}][$lb_enums{'E_Modifier'}] 1438 = $lb_actions{'LB_NOBREAK'}; 1439 1440 # LB30a Break between two regional indicator symbols if and only if there 1441 # are an even number of regional indicators preceding the position of the 1442 # break. 1443 # sot (RI RI)* RI × RI 1444 # [^RI] (RI RI)* RI × RI 1445 $lb_table[$lb_enums{'Regional_Indicator'}] 1446 [$lb_enums{'Regional_Indicator'}] = $lb_actions{'LB_RI_then_RI'}; 1447 1448 # LB30 Do not break between letters, numbers, or ordinary symbols and 1449 # opening or closing parentheses. 1450 # (AL | HL | NU) × OP 1451 $lb_table[$lb_enums{'Alphabetic'}][$lb_enums{'Open_Punctuation'}] 1452 = $lb_actions{'LB_NOBREAK'}; 1453 $lb_table[$lb_enums{'Hebrew_Letter'}][$lb_enums{'Open_Punctuation'}] 1454 = $lb_actions{'LB_NOBREAK'}; 1455 $lb_table[$lb_enums{'Numeric'}][$lb_enums{'Open_Punctuation'}] 1456 = $lb_actions{'LB_NOBREAK'}; 1457 1458 # CP × (AL | HL | NU) 1459 $lb_table[$lb_enums{'Close_Parenthesis'}][$lb_enums{'Alphabetic'}] 1460 = $lb_actions{'LB_NOBREAK'}; 1461 $lb_table[$lb_enums{'Close_Parenthesis'}][$lb_enums{'Hebrew_Letter'}] 1462 = $lb_actions{'LB_NOBREAK'}; 1463 $lb_table[$lb_enums{'Close_Parenthesis'}][$lb_enums{'Numeric'}] 1464 = $lb_actions{'LB_NOBREAK'}; 1465 1466 # LB29 Do not break between numeric punctuation and alphabetics (“e.g.”). 1467 # IS × (AL | HL) 1468 $lb_table[$lb_enums{'Infix_Numeric'}][$lb_enums{'Alphabetic'}] 1469 = $lb_actions{'LB_NOBREAK'}; 1470 $lb_table[$lb_enums{'Infix_Numeric'}][$lb_enums{'Hebrew_Letter'}] 1471 = $lb_actions{'LB_NOBREAK'}; 1472 1473 # LB28 Do not break between alphabetics (“at”). 1474 # (AL | HL) × (AL | HL) 1475 $lb_table[$lb_enums{'Alphabetic'}][$lb_enums{'Alphabetic'}] 1476 = $lb_actions{'LB_NOBREAK'}; 1477 $lb_table[$lb_enums{'Hebrew_Letter'}][$lb_enums{'Alphabetic'}] 1478 = $lb_actions{'LB_NOBREAK'}; 1479 $lb_table[$lb_enums{'Alphabetic'}][$lb_enums{'Hebrew_Letter'}] 1480 = $lb_actions{'LB_NOBREAK'}; 1481 $lb_table[$lb_enums{'Hebrew_Letter'}][$lb_enums{'Hebrew_Letter'}] 1482 = $lb_actions{'LB_NOBREAK'}; 1483 1484 # LB27 Treat a Korean Syllable Block the same as ID. 1485 # (JL | JV | JT | H2 | H3) × IN 1486 $lb_table[$lb_enums{'JL'}][$lb_enums{'Inseparable'}] 1487 = $lb_actions{'LB_NOBREAK'}; 1488 $lb_table[$lb_enums{'JV'}][$lb_enums{'Inseparable'}] 1489 = $lb_actions{'LB_NOBREAK'}; 1490 $lb_table[$lb_enums{'JT'}][$lb_enums{'Inseparable'}] 1491 = $lb_actions{'LB_NOBREAK'}; 1492 $lb_table[$lb_enums{'H2'}][$lb_enums{'Inseparable'}] 1493 = $lb_actions{'LB_NOBREAK'}; 1494 $lb_table[$lb_enums{'H3'}][$lb_enums{'Inseparable'}] 1495 = $lb_actions{'LB_NOBREAK'}; 1496 1497 # (JL | JV | JT | H2 | H3) × PO 1498 $lb_table[$lb_enums{'JL'}][$lb_enums{'Postfix_Numeric'}] 1499 = $lb_actions{'LB_NOBREAK'}; 1500 $lb_table[$lb_enums{'JV'}][$lb_enums{'Postfix_Numeric'}] 1501 = $lb_actions{'LB_NOBREAK'}; 1502 $lb_table[$lb_enums{'JT'}][$lb_enums{'Postfix_Numeric'}] 1503 = $lb_actions{'LB_NOBREAK'}; 1504 $lb_table[$lb_enums{'H2'}][$lb_enums{'Postfix_Numeric'}] 1505 = $lb_actions{'LB_NOBREAK'}; 1506 $lb_table[$lb_enums{'H3'}][$lb_enums{'Postfix_Numeric'}] 1507 = $lb_actions{'LB_NOBREAK'}; 1508 1509 # PR × (JL | JV | JT | H2 | H3) 1510 $lb_table[$lb_enums{'Prefix_Numeric'}][$lb_enums{'JL'}] 1511 = $lb_actions{'LB_NOBREAK'}; 1512 $lb_table[$lb_enums{'Prefix_Numeric'}][$lb_enums{'JV'}] 1513 = $lb_actions{'LB_NOBREAK'}; 1514 $lb_table[$lb_enums{'Prefix_Numeric'}][$lb_enums{'JT'}] 1515 = $lb_actions{'LB_NOBREAK'}; 1516 $lb_table[$lb_enums{'Prefix_Numeric'}][$lb_enums{'H2'}] 1517 = $lb_actions{'LB_NOBREAK'}; 1518 $lb_table[$lb_enums{'Prefix_Numeric'}][$lb_enums{'H3'}] 1519 = $lb_actions{'LB_NOBREAK'}; 1520 1521 # LB26 Do not break a Korean syllable. 1522 # JL × (JL | JV | H2 | H3) 1523 $lb_table[$lb_enums{'JL'}][$lb_enums{'JL'}] = $lb_actions{'LB_NOBREAK'}; 1524 $lb_table[$lb_enums{'JL'}][$lb_enums{'JV'}] = $lb_actions{'LB_NOBREAK'}; 1525 $lb_table[$lb_enums{'JL'}][$lb_enums{'H2'}] = $lb_actions{'LB_NOBREAK'}; 1526 $lb_table[$lb_enums{'JL'}][$lb_enums{'H3'}] = $lb_actions{'LB_NOBREAK'}; 1527 1528 # (JV | H2) × (JV | JT) 1529 $lb_table[$lb_enums{'JV'}][$lb_enums{'JV'}] = $lb_actions{'LB_NOBREAK'}; 1530 $lb_table[$lb_enums{'H2'}][$lb_enums{'JV'}] = $lb_actions{'LB_NOBREAK'}; 1531 $lb_table[$lb_enums{'JV'}][$lb_enums{'JT'}] = $lb_actions{'LB_NOBREAK'}; 1532 $lb_table[$lb_enums{'H2'}][$lb_enums{'JT'}] = $lb_actions{'LB_NOBREAK'}; 1533 1534 # (JT | H3) × JT 1535 $lb_table[$lb_enums{'JT'}][$lb_enums{'JT'}] = $lb_actions{'LB_NOBREAK'}; 1536 $lb_table[$lb_enums{'H3'}][$lb_enums{'JT'}] = $lb_actions{'LB_NOBREAK'}; 1537 1538 # LB25 Do not break between the following pairs of classes relevant to 1539 # numbers, as tailored by example 7 in 1540 # http://www.unicode.org/reports/tr14/#Examples 1541 # We follow that tailoring because Unicode's test cases expect it 1542 # (PR | PO) × ( OP | HY )? NU 1543 $lb_table[$lb_enums{'Prefix_Numeric'}][$lb_enums{'Numeric'}] 1544 = $lb_actions{'LB_NOBREAK'}; 1545 $lb_table[$lb_enums{'Postfix_Numeric'}][$lb_enums{'Numeric'}] 1546 = $lb_actions{'LB_NOBREAK'}; 1547 1548 # Given that (OP | HY )? is optional, we have to test for it in code. 1549 # We add in the action (instead of overriding) for this, so that in 1550 # the code we can recover the underlying break value. 1551 $lb_table[$lb_enums{'Prefix_Numeric'}][$lb_enums{'Open_Punctuation'}] 1552 += $lb_actions{'LB_PR_or_PO_then_OP_or_HY'}; 1553 $lb_table[$lb_enums{'Postfix_Numeric'}][$lb_enums{'Open_Punctuation'}] 1554 += $lb_actions{'LB_PR_or_PO_then_OP_or_HY'}; 1555 $lb_table[$lb_enums{'Prefix_Numeric'}][$lb_enums{'Hyphen'}] 1556 += $lb_actions{'LB_PR_or_PO_then_OP_or_HY'}; 1557 $lb_table[$lb_enums{'Postfix_Numeric'}][$lb_enums{'Hyphen'}] 1558 += $lb_actions{'LB_PR_or_PO_then_OP_or_HY'}; 1559 1560 # ( OP | HY ) × NU 1561 $lb_table[$lb_enums{'Open_Punctuation'}][$lb_enums{'Numeric'}] 1562 = $lb_actions{'LB_NOBREAK'}; 1563 $lb_table[$lb_enums{'Hyphen'}][$lb_enums{'Numeric'}] 1564 = $lb_actions{'LB_NOBREAK'}; 1565 1566 # NU (NU | SY | IS)* × (NU | SY | IS | CL | CP ) 1567 # which can be rewritten as: 1568 # NU (SY | IS)* × (NU | SY | IS | CL | CP ) 1569 $lb_table[$lb_enums{'Numeric'}][$lb_enums{'Numeric'}] 1570 = $lb_actions{'LB_NOBREAK'}; 1571 $lb_table[$lb_enums{'Numeric'}][$lb_enums{'Break_Symbols'}] 1572 = $lb_actions{'LB_NOBREAK'}; 1573 $lb_table[$lb_enums{'Numeric'}][$lb_enums{'Infix_Numeric'}] 1574 = $lb_actions{'LB_NOBREAK'}; 1575 $lb_table[$lb_enums{'Numeric'}][$lb_enums{'Close_Punctuation'}] 1576 = $lb_actions{'LB_NOBREAK'}; 1577 $lb_table[$lb_enums{'Numeric'}][$lb_enums{'Close_Parenthesis'}] 1578 = $lb_actions{'LB_NOBREAK'}; 1579 1580 # Like earlier where we have to test in code, we add in the action so 1581 # that we can recover the underlying values. This is done in rules 1582 # below, as well. The code assumes that we haven't added 2 actions. 1583 # Shoul a later Unicode release break that assumption, then tests 1584 # should start failing. 1585 $lb_table[$lb_enums{'Break_Symbols'}][$lb_enums{'Numeric'}] 1586 += $lb_actions{'LB_SY_or_IS_then_various'}; 1587 $lb_table[$lb_enums{'Break_Symbols'}][$lb_enums{'Break_Symbols'}] 1588 += $lb_actions{'LB_SY_or_IS_then_various'}; 1589 $lb_table[$lb_enums{'Break_Symbols'}][$lb_enums{'Infix_Numeric'}] 1590 += $lb_actions{'LB_SY_or_IS_then_various'}; 1591 $lb_table[$lb_enums{'Break_Symbols'}][$lb_enums{'Close_Punctuation'}] 1592 += $lb_actions{'LB_SY_or_IS_then_various'}; 1593 $lb_table[$lb_enums{'Break_Symbols'}][$lb_enums{'Close_Parenthesis'}] 1594 += $lb_actions{'LB_SY_or_IS_then_various'}; 1595 $lb_table[$lb_enums{'Infix_Numeric'}][$lb_enums{'Numeric'}] 1596 += $lb_actions{'LB_SY_or_IS_then_various'}; 1597 $lb_table[$lb_enums{'Infix_Numeric'}][$lb_enums{'Break_Symbols'}] 1598 += $lb_actions{'LB_SY_or_IS_then_various'}; 1599 $lb_table[$lb_enums{'Infix_Numeric'}][$lb_enums{'Infix_Numeric'}] 1600 += $lb_actions{'LB_SY_or_IS_then_various'}; 1601 $lb_table[$lb_enums{'Infix_Numeric'}][$lb_enums{'Close_Punctuation'}] 1602 += $lb_actions{'LB_SY_or_IS_then_various'}; 1603 $lb_table[$lb_enums{'Infix_Numeric'}][$lb_enums{'Close_Parenthesis'}] 1604 += $lb_actions{'LB_SY_or_IS_then_various'}; 1605 1606 # NU (NU | SY | IS)* (CL | CP)? × (PO | PR) 1607 # which can be rewritten as: 1608 # NU (SY | IS)* (CL | CP)? × (PO | PR) 1609 $lb_table[$lb_enums{'Numeric'}][$lb_enums{'Postfix_Numeric'}] 1610 = $lb_actions{'LB_NOBREAK'}; 1611 $lb_table[$lb_enums{'Numeric'}][$lb_enums{'Prefix_Numeric'}] 1612 = $lb_actions{'LB_NOBREAK'}; 1613 1614 $lb_table[$lb_enums{'Close_Parenthesis'}][$lb_enums{'Postfix_Numeric'}] 1615 += $lb_actions{'LB_various_then_PO_or_PR'}; 1616 $lb_table[$lb_enums{'Close_Punctuation'}][$lb_enums{'Postfix_Numeric'}] 1617 += $lb_actions{'LB_various_then_PO_or_PR'}; 1618 $lb_table[$lb_enums{'Infix_Numeric'}][$lb_enums{'Postfix_Numeric'}] 1619 += $lb_actions{'LB_various_then_PO_or_PR'}; 1620 $lb_table[$lb_enums{'Break_Symbols'}][$lb_enums{'Postfix_Numeric'}] 1621 += $lb_actions{'LB_various_then_PO_or_PR'}; 1622 1623 $lb_table[$lb_enums{'Close_Parenthesis'}][$lb_enums{'Prefix_Numeric'}] 1624 += $lb_actions{'LB_various_then_PO_or_PR'}; 1625 $lb_table[$lb_enums{'Close_Punctuation'}][$lb_enums{'Prefix_Numeric'}] 1626 += $lb_actions{'LB_various_then_PO_or_PR'}; 1627 $lb_table[$lb_enums{'Infix_Numeric'}][$lb_enums{'Prefix_Numeric'}] 1628 += $lb_actions{'LB_various_then_PO_or_PR'}; 1629 $lb_table[$lb_enums{'Break_Symbols'}][$lb_enums{'Prefix_Numeric'}] 1630 += $lb_actions{'LB_various_then_PO_or_PR'}; 1631 1632 # LB24 Do not break between numeric prefix/postfix and letters, or between 1633 # letters and prefix/postfix. 1634 # (PR | PO) × (AL | HL) 1635 $lb_table[$lb_enums{'Prefix_Numeric'}][$lb_enums{'Alphabetic'}] 1636 = $lb_actions{'LB_NOBREAK'}; 1637 $lb_table[$lb_enums{'Prefix_Numeric'}][$lb_enums{'Hebrew_Letter'}] 1638 = $lb_actions{'LB_NOBREAK'}; 1639 $lb_table[$lb_enums{'Postfix_Numeric'}][$lb_enums{'Alphabetic'}] 1640 = $lb_actions{'LB_NOBREAK'}; 1641 $lb_table[$lb_enums{'Postfix_Numeric'}][$lb_enums{'Hebrew_Letter'}] 1642 = $lb_actions{'LB_NOBREAK'}; 1643 1644 # (AL | HL) × (PR | PO) 1645 $lb_table[$lb_enums{'Alphabetic'}][$lb_enums{'Prefix_Numeric'}] 1646 = $lb_actions{'LB_NOBREAK'}; 1647 $lb_table[$lb_enums{'Hebrew_Letter'}][$lb_enums{'Prefix_Numeric'}] 1648 = $lb_actions{'LB_NOBREAK'}; 1649 $lb_table[$lb_enums{'Alphabetic'}][$lb_enums{'Postfix_Numeric'}] 1650 = $lb_actions{'LB_NOBREAK'}; 1651 $lb_table[$lb_enums{'Hebrew_Letter'}][$lb_enums{'Postfix_Numeric'}] 1652 = $lb_actions{'LB_NOBREAK'}; 1653 1654 # LB23a Do not break between numeric prefixes and ideographs, or between 1655 # ideographs and numeric postfixes. 1656 # PR × (ID | EB | EM) 1657 $lb_table[$lb_enums{'Prefix_Numeric'}][$lb_enums{'Ideographic'}] 1658 = $lb_actions{'LB_NOBREAK'}; 1659 $lb_table[$lb_enums{'Prefix_Numeric'}][$lb_enums{'E_Base'}] 1660 = $lb_actions{'LB_NOBREAK'}; 1661 $lb_table[$lb_enums{'Prefix_Numeric'}][$lb_enums{'E_Modifier'}] 1662 = $lb_actions{'LB_NOBREAK'}; 1663 1664 # (ID | EB | EM) × PO 1665 $lb_table[$lb_enums{'Ideographic'}][$lb_enums{'Postfix_Numeric'}] 1666 = $lb_actions{'LB_NOBREAK'}; 1667 $lb_table[$lb_enums{'E_Base'}][$lb_enums{'Postfix_Numeric'}] 1668 = $lb_actions{'LB_NOBREAK'}; 1669 $lb_table[$lb_enums{'E_Modifier'}][$lb_enums{'Postfix_Numeric'}] 1670 = $lb_actions{'LB_NOBREAK'}; 1671 1672 # LB23 Do not break between digits and letters 1673 # (AL | HL) × NU 1674 $lb_table[$lb_enums{'Alphabetic'}][$lb_enums{'Numeric'}] 1675 = $lb_actions{'LB_NOBREAK'}; 1676 $lb_table[$lb_enums{'Hebrew_Letter'}][$lb_enums{'Numeric'}] 1677 = $lb_actions{'LB_NOBREAK'}; 1678 1679 # NU × (AL | HL) 1680 $lb_table[$lb_enums{'Numeric'}][$lb_enums{'Alphabetic'}] 1681 = $lb_actions{'LB_NOBREAK'}; 1682 $lb_table[$lb_enums{'Numeric'}][$lb_enums{'Hebrew_Letter'}] 1683 = $lb_actions{'LB_NOBREAK'}; 1684 1685 # LB22 Do not break between two ellipses, or between letters, numbers or 1686 # exclamations and ellipsis. 1687 # (AL | HL) × IN 1688 $lb_table[$lb_enums{'Alphabetic'}][$lb_enums{'Inseparable'}] 1689 = $lb_actions{'LB_NOBREAK'}; 1690 $lb_table[$lb_enums{'Hebrew_Letter'}][$lb_enums{'Inseparable'}] 1691 = $lb_actions{'LB_NOBREAK'}; 1692 1693 # Exclamation × IN 1694 $lb_table[$lb_enums{'Exclamation'}][$lb_enums{'Inseparable'}] 1695 = $lb_actions{'LB_NOBREAK'}; 1696 1697 # (ID | EB | EM) × IN 1698 $lb_table[$lb_enums{'Ideographic'}][$lb_enums{'Inseparable'}] 1699 = $lb_actions{'LB_NOBREAK'}; 1700 $lb_table[$lb_enums{'E_Base'}][$lb_enums{'Inseparable'}] 1701 = $lb_actions{'LB_NOBREAK'}; 1702 $lb_table[$lb_enums{'E_Modifier'}][$lb_enums{'Inseparable'}] 1703 = $lb_actions{'LB_NOBREAK'}; 1704 1705 # IN × IN 1706 $lb_table[$lb_enums{'Inseparable'}][$lb_enums{'Inseparable'}] 1707 = $lb_actions{'LB_NOBREAK'}; 1708 1709 # NU × IN 1710 $lb_table[$lb_enums{'Numeric'}][$lb_enums{'Inseparable'}] 1711 = $lb_actions{'LB_NOBREAK'}; 1712 1713 # LB21b Don’t break between Solidus and Hebrew letters. 1714 # SY × HL 1715 $lb_table[$lb_enums{'Break_Symbols'}][$lb_enums{'Hebrew_Letter'}] 1716 = $lb_actions{'LB_NOBREAK'}; 1717 1718 # LB21a Don't break after Hebrew + Hyphen. 1719 # HL (HY | BA) × 1720 for my $i (0 .. @lb_table - 1) { 1721 $lb_table[$lb_enums{'Hyphen'}][$i] 1722 += $lb_actions{'LB_HY_or_BA_then_foo'}; 1723 $lb_table[$lb_enums{'Break_After'}][$i] 1724 += $lb_actions{'LB_HY_or_BA_then_foo'}; 1725 } 1726 1727 # LB21 Do not break before hyphen-minus, other hyphens, fixed-width 1728 # spaces, small kana, and other non-starters, or after acute accents. 1729 # × BA 1730 # × HY 1731 # × NS 1732 # BB × 1733 for my $i (0 .. @lb_table - 1) { 1734 $lb_table[$i][$lb_enums{'Break_After'}] = $lb_actions{'LB_NOBREAK'}; 1735 $lb_table[$i][$lb_enums{'Hyphen'}] = $lb_actions{'LB_NOBREAK'}; 1736 $lb_table[$i][$lb_enums{'Nonstarter'}] = $lb_actions{'LB_NOBREAK'}; 1737 $lb_table[$lb_enums{'Break_Before'}][$i] = $lb_actions{'LB_NOBREAK'}; 1738 } 1739 1740 # LB20 Break before and after unresolved CB. 1741 # ÷ CB 1742 # CB ÷ 1743 # Conditional breaks should be resolved external to the line breaking 1744 # rules. However, the default action is to treat unresolved CB as breaking 1745 # before and after. 1746 for my $i (0 .. @lb_table - 1) { 1747 $lb_table[$i][$lb_enums{'Contingent_Break'}] 1748 = $lb_actions{'LB_BREAKABLE'}; 1749 $lb_table[$lb_enums{'Contingent_Break'}][$i] 1750 = $lb_actions{'LB_BREAKABLE'}; 1751 } 1752 1753 # LB19 Do not break before or after quotation marks, such as ‘ ” ’. 1754 # × QU 1755 # QU × 1756 for my $i (0 .. @lb_table - 1) { 1757 $lb_table[$i][$lb_enums{'Quotation'}] = $lb_actions{'LB_NOBREAK'}; 1758 $lb_table[$lb_enums{'Quotation'}][$i] = $lb_actions{'LB_NOBREAK'}; 1759 } 1760 1761 # LB18 Break after spaces 1762 # SP ÷ 1763 for my $i (0 .. @lb_table - 1) { 1764 $lb_table[$lb_enums{'Space'}][$i] = $lb_actions{'LB_BREAKABLE'}; 1765 } 1766 1767 # LB17 Do not break within ‘——’, even with intervening spaces. 1768 # B2 SP* × B2 1769 $lb_table[$lb_enums{'Break_Both'}][$lb_enums{'Break_Both'}] 1770 = $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'}; 1771 1772 # LB16 Do not break between closing punctuation and a nonstarter even with 1773 # intervening spaces. 1774 # (CL | CP) SP* × NS 1775 $lb_table[$lb_enums{'Close_Punctuation'}][$lb_enums{'Nonstarter'}] 1776 = $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'}; 1777 $lb_table[$lb_enums{'Close_Parenthesis'}][$lb_enums{'Nonstarter'}] 1778 = $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'}; 1779 1780 1781 # LB15 Do not break within ‘”[’, even with intervening spaces. 1782 # QU SP* × OP 1783 $lb_table[$lb_enums{'Quotation'}][$lb_enums{'Open_Punctuation'}] 1784 = $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'}; 1785 1786 # LB14 Do not break after ‘[’, even after spaces. 1787 # OP SP* × 1788 for my $i (0 .. @lb_table - 1) { 1789 $lb_table[$lb_enums{'Open_Punctuation'}][$i] 1790 = $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'}; 1791 } 1792 1793 # LB13 Do not break before ‘]’ or ‘!’ or ‘;’ or ‘/’, even after spaces, as 1794 # tailored by example 7 in http://www.unicode.org/reports/tr14/#Examples 1795 # [^NU] × CL 1796 # [^NU] × CP 1797 # × EX 1798 # [^NU] × IS 1799 # [^NU] × SY 1800 for my $i (0 .. @lb_table - 1) { 1801 $lb_table[$i][$lb_enums{'Exclamation'}] 1802 = $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'}; 1803 1804 next if $i == $lb_enums{'Numeric'}; 1805 1806 $lb_table[$i][$lb_enums{'Close_Punctuation'}] 1807 = $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'}; 1808 $lb_table[$i][$lb_enums{'Close_Parenthesis'}] 1809 = $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'}; 1810 $lb_table[$i][$lb_enums{'Infix_Numeric'}] 1811 = $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'}; 1812 $lb_table[$i][$lb_enums{'Break_Symbols'}] 1813 = $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'}; 1814 } 1815 1816 # LB12a Do not break before NBSP and related characters, except after 1817 # spaces and hyphens. 1818 # [^SP BA HY] × GL 1819 for my $i (0 .. @lb_table - 1) { 1820 next if $i == $lb_enums{'Space'} 1821 || $i == $lb_enums{'Break_After'} 1822 || $i == $lb_enums{'Hyphen'}; 1823 1824 # We don't break, but if a property above has said don't break even 1825 # with space between, don't override that (also in the next few rules) 1826 next if $lb_table[$i][$lb_enums{'Glue'}] 1827 == $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'}; 1828 $lb_table[$i][$lb_enums{'Glue'}] = $lb_actions{'LB_NOBREAK'}; 1829 } 1830 1831 # LB12 Do not break after NBSP and related characters. 1832 # GL × 1833 for my $i (0 .. @lb_table - 1) { 1834 next if $lb_table[$lb_enums{'Glue'}][$i] 1835 == $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'}; 1836 $lb_table[$lb_enums{'Glue'}][$i] = $lb_actions{'LB_NOBREAK'}; 1837 } 1838 1839 # LB11 Do not break before or after Word joiner and related characters. 1840 # × WJ 1841 # WJ × 1842 for my $i (0 .. @lb_table - 1) { 1843 if ($lb_table[$i][$lb_enums{'Word_Joiner'}] 1844 != $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'}) 1845 { 1846 $lb_table[$i][$lb_enums{'Word_Joiner'}] = $lb_actions{'LB_NOBREAK'}; 1847 } 1848 if ($lb_table[$lb_enums{'Word_Joiner'}][$i] 1849 != $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'}) 1850 { 1851 $lb_table[$lb_enums{'Word_Joiner'}][$i] = $lb_actions{'LB_NOBREAK'}; 1852 } 1853 } 1854 1855 # Special case this here to avoid having to do a special case in the code, 1856 # by making this the same as other things with a SP in front of them that 1857 # don't break, we avoid an extra test 1858 $lb_table[$lb_enums{'Space'}][$lb_enums{'Word_Joiner'}] 1859 = $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'}; 1860 1861 # LB9 and LB10 are done in the same loop 1862 # 1863 # LB9 Do not break a combining character sequence; treat it as if it has 1864 # the line breaking class of the base character in all of the 1865 # higher-numbered rules. Treat ZWJ as if it were CM 1866 # Treat X (CM|ZWJ)* as if it were X. 1867 # where X is any line break class except BK, CR, LF, NL, SP, or ZW. 1868 1869 # LB10 Treat any remaining combining mark or ZWJ as AL. This catches the 1870 # case where a CM or ZWJ is the first character on the line or follows SP, 1871 # BK, CR, LF, NL, or ZW. 1872 for my $i (0 .. @lb_table - 1) { 1873 1874 # When the CM or ZWJ is the first in the pair, we don't know without 1875 # looking behind whether the CM or ZWJ is going to attach to an 1876 # earlier character, or not. So have to figure this out at runtime in 1877 # the code 1878 $lb_table[$lb_enums{'Combining_Mark'}][$i] 1879 = $lb_actions{'LB_CM_ZWJ_foo'}; 1880 $lb_table[$lb_enums{'ZWJ'}][$i] = $lb_actions{'LB_CM_ZWJ_foo'}; 1881 1882 if ( $i == $lb_enums{'Mandatory_Break'} 1883 || $i == $lb_enums{'EDGE'} 1884 || $i == $lb_enums{'Carriage_Return'} 1885 || $i == $lb_enums{'Line_Feed'} 1886 || $i == $lb_enums{'Next_Line'} 1887 || $i == $lb_enums{'Space'} 1888 || $i == $lb_enums{'ZWSpace'}) 1889 { 1890 # For these classes, a following CM doesn't combine, and should do 1891 # whatever 'Alphabetic' would do. 1892 $lb_table[$i][$lb_enums{'Combining_Mark'}] 1893 = $lb_table[$i][$lb_enums{'Alphabetic'}]; 1894 $lb_table[$i][$lb_enums{'ZWJ'}] 1895 = $lb_table[$i][$lb_enums{'Alphabetic'}]; 1896 } 1897 else { 1898 # For these classes, the CM or ZWJ combines, so doesn't break, 1899 # inheriting the type of nobreak from the master character. 1900 if ($lb_table[$i][$lb_enums{'Combining_Mark'}] 1901 != $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'}) 1902 { 1903 $lb_table[$i][$lb_enums{'Combining_Mark'}] 1904 = $lb_actions{'LB_NOBREAK'}; 1905 } 1906 if ($lb_table[$i][$lb_enums{'ZWJ'}] 1907 != $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'}) 1908 { 1909 $lb_table[$i][$lb_enums{'ZWJ'}] 1910 = $lb_actions{'LB_NOBREAK'}; 1911 } 1912 } 1913 } 1914 1915 # LB8a Do not break after a zero width joiner 1916 # ZWJ × 1917 for my $i (0 .. @lb_table - 1) { 1918 $lb_table[$lb_enums{'ZWJ'}][$i] = $lb_actions{'LB_NOBREAK'}; 1919 } 1920 1921 # LB8 Break before any character following a zero-width space, even if one 1922 # or more spaces intervene. 1923 # ZW SP* ÷ 1924 for my $i (0 .. @lb_table - 1) { 1925 $lb_table[$lb_enums{'ZWSpace'}][$i] = $lb_actions{'LB_BREAKABLE'}; 1926 } 1927 1928 # Because of LB8-10, we need to look at context for "SP x", and this must 1929 # be done in the code. So override the existing rules for that, by adding 1930 # a constant to get new rules that tell the code it needs to look at 1931 # context. By adding this action instead of replacing the existing one, 1932 # we can get back to the original rule if necessary. 1933 for my $i (0 .. @lb_table - 1) { 1934 $lb_table[$lb_enums{'Space'}][$i] += $lb_actions{'LB_SP_foo'}; 1935 } 1936 1937 # LB7 Do not break before spaces or zero width space. 1938 # × SP 1939 # × ZW 1940 for my $i (0 .. @lb_table - 1) { 1941 $lb_table[$i][$lb_enums{'Space'}] = $lb_actions{'LB_NOBREAK'}; 1942 $lb_table[$i][$lb_enums{'ZWSpace'}] = $lb_actions{'LB_NOBREAK'}; 1943 } 1944 1945 # LB6 Do not break before hard line breaks. 1946 # × ( BK | CR | LF | NL ) 1947 for my $i (0 .. @lb_table - 1) { 1948 $lb_table[$i][$lb_enums{'Mandatory_Break'}] = $lb_actions{'LB_NOBREAK'}; 1949 $lb_table[$i][$lb_enums{'Carriage_Return'}] = $lb_actions{'LB_NOBREAK'}; 1950 $lb_table[$i][$lb_enums{'Line_Feed'}] = $lb_actions{'LB_NOBREAK'}; 1951 $lb_table[$i][$lb_enums{'Next_Line'}] = $lb_actions{'LB_NOBREAK'}; 1952 } 1953 1954 # LB5 Treat CR followed by LF, as well as CR, LF, and NL as hard line breaks. 1955 # CR × LF 1956 # CR ! 1957 # LF ! 1958 # NL ! 1959 for my $i (0 .. @lb_table - 1) { 1960 $lb_table[$lb_enums{'Carriage_Return'}][$i] 1961 = $lb_actions{'LB_BREAKABLE'}; 1962 $lb_table[$lb_enums{'Line_Feed'}][$i] = $lb_actions{'LB_BREAKABLE'}; 1963 $lb_table[$lb_enums{'Next_Line'}][$i] = $lb_actions{'LB_BREAKABLE'}; 1964 } 1965 $lb_table[$lb_enums{'Carriage_Return'}][$lb_enums{'Line_Feed'}] 1966 = $lb_actions{'LB_NOBREAK'}; 1967 1968 # LB4 Always break after hard line breaks. 1969 # BK ! 1970 for my $i (0 .. @lb_table - 1) { 1971 $lb_table[$lb_enums{'Mandatory_Break'}][$i] 1972 = $lb_actions{'LB_BREAKABLE'}; 1973 } 1974 1975 # LB3 Always break at the end of text. 1976 # ! eot 1977 # LB2 Never break at the start of text. 1978 # sot × 1979 for my $i (0 .. @lb_table - 1) { 1980 $lb_table[$i][$lb_enums{'EDGE'}] = $lb_actions{'LB_BREAKABLE'}; 1981 $lb_table[$lb_enums{'EDGE'}][$i] = $lb_actions{'LB_NOBREAK'}; 1982 } 1983 1984 # LB1 Assign a line breaking class to each code point of the input. 1985 # Resolve AI, CB, CJ, SA, SG, and XX into other line breaking classes 1986 # depending on criteria outside the scope of this algorithm. 1987 # 1988 # In the absence of such criteria all characters with a specific 1989 # combination of original class and General_Category property value are 1990 # resolved as follows: 1991 # Original Resolved General_Category 1992 # AI, SG, XX AL Any 1993 # SA CM Only Mn or Mc 1994 # SA AL Any except Mn and Mc 1995 # CJ NS Any 1996 # 1997 # This is done in mktables, so we never see any of the remapped-from 1998 # classes. 1999 2000 output_table_common('LB', \%lb_actions, 2001 \@lb_table, \@lb_short_enums, \%lb_abbreviations); 2002} 2003 2004sub output_WB_table() { 2005 2006 # Create and output the enums, #defines, and pair table for use in 2007 # determining Word Breaks, given in http://www.unicode.org/reports/tr29/. 2008 2009 # This uses the same mechanism in the other bounds tables generated by 2010 # this file. The actions that could override a 0 or 1 are added to those 2011 # numbers; the actions that clearly don't depend on the underlying rule 2012 # simply overwrite 2013 my %wb_actions = ( 2014 WB_NOBREAK => 0, 2015 WB_BREAKABLE => 1, 2016 WB_hs_then_hs => 2, 2017 WB_Ex_or_FO_or_ZWJ_then_foo => 3, 2018 WB_DQ_then_HL => 4, 2019 WB_HL_then_DQ => 6, 2020 WB_LE_or_HL_then_MB_or_ML_or_SQ => 8, 2021 WB_MB_or_ML_or_SQ_then_LE_or_HL => 10, 2022 WB_MB_or_MN_or_SQ_then_NU => 12, 2023 WB_NU_then_MB_or_MN_or_SQ => 14, 2024 WB_RI_then_RI => 16, 2025 ); 2026 2027 # Construct the WB pair table. 2028 # The table is constructed in reverse order of the rules, to make the 2029 # lower-numbered, higher priority ones override the later ones, as the 2030 # algorithm stops at the earliest matching rule 2031 2032 my @wb_table; 2033 my $table_size = @wb_short_enums; 2034 2035 # Otherwise, break everywhere (including around ideographs). 2036 # WB99 Any ÷ Any 2037 for my $i (0 .. $table_size - 1) { 2038 for my $j (0 .. $table_size - 1) { 2039 $wb_table[$i][$j] = $wb_actions{'WB_BREAKABLE'}; 2040 } 2041 } 2042 2043 # Do not break within emoji flag sequences. That is, do not break between 2044 # regional indicator (RI) symbols if there is an odd number of RI 2045 # characters before the break point. 2046 # WB16 [^RI] (RI RI)* RI × RI 2047 # WB15 sot (RI RI)* RI × RI 2048 $wb_table[$wb_enums{'Regional_Indicator'}] 2049 [$wb_enums{'Regional_Indicator'}] = $wb_actions{'WB_RI_then_RI'}; 2050 2051 # Do not break within emoji modifier sequences. 2052 # WB14 ( E_Base | EBG ) × E_Modifier 2053 $wb_table[$wb_enums{'E_Base'}][$wb_enums{'E_Modifier'}] 2054 = $wb_actions{'WB_NOBREAK'}; 2055 $wb_table[$wb_enums{'E_Base_GAZ'}][$wb_enums{'E_Modifier'}] 2056 = $wb_actions{'WB_NOBREAK'}; 2057 2058 # Do not break from extenders. 2059 # WB13b ExtendNumLet × (ALetter | Hebrew_Letter | Numeric | Katakana) 2060 $wb_table[$wb_enums{'ExtendNumLet'}][$wb_enums{'ALetter'}] 2061 = $wb_actions{'WB_NOBREAK'}; 2062 $wb_table[$wb_enums{'ExtendNumLet'}][$wb_enums{'XPG_LE'}] 2063 = $wb_actions{'WB_NOBREAK'}; 2064 $wb_table[$wb_enums{'ExtendNumLet'}][$wb_enums{'Hebrew_Letter'}] 2065 = $wb_actions{'WB_NOBREAK'}; 2066 $wb_table[$wb_enums{'ExtendNumLet'}][$wb_enums{'Numeric'}] 2067 = $wb_actions{'WB_NOBREAK'}; 2068 $wb_table[$wb_enums{'ExtendNumLet'}][$wb_enums{'Katakana'}] 2069 = $wb_actions{'WB_NOBREAK'}; 2070 2071 # WB13a (ALetter | Hebrew_Letter | Numeric | Katakana | ExtendNumLet) 2072 # × ExtendNumLet 2073 $wb_table[$wb_enums{'ALetter'}][$wb_enums{'ExtendNumLet'}] 2074 = $wb_actions{'WB_NOBREAK'}; 2075 $wb_table[$wb_enums{'XPG_LE'}][$wb_enums{'ExtendNumLet'}] 2076 = $wb_actions{'WB_NOBREAK'}; 2077 $wb_table[$wb_enums{'Hebrew_Letter'}][$wb_enums{'ExtendNumLet'}] 2078 = $wb_actions{'WB_NOBREAK'}; 2079 $wb_table[$wb_enums{'Numeric'}][$wb_enums{'ExtendNumLet'}] 2080 = $wb_actions{'WB_NOBREAK'}; 2081 $wb_table[$wb_enums{'Katakana'}][$wb_enums{'ExtendNumLet'}] 2082 = $wb_actions{'WB_NOBREAK'}; 2083 $wb_table[$wb_enums{'ExtendNumLet'}][$wb_enums{'ExtendNumLet'}] 2084 = $wb_actions{'WB_NOBREAK'}; 2085 2086 # Do not break between Katakana. 2087 # WB13 Katakana × Katakana 2088 $wb_table[$wb_enums{'Katakana'}][$wb_enums{'Katakana'}] 2089 = $wb_actions{'WB_NOBREAK'}; 2090 2091 # Do not break within sequences, such as “3.2” or “3,456.789”. 2092 # WB12 Numeric × (MidNum | MidNumLet | Single_Quote) Numeric 2093 $wb_table[$wb_enums{'Numeric'}][$wb_enums{'MidNumLet'}] 2094 += $wb_actions{'WB_NU_then_MB_or_MN_or_SQ'}; 2095 $wb_table[$wb_enums{'Numeric'}][$wb_enums{'MidNum'}] 2096 += $wb_actions{'WB_NU_then_MB_or_MN_or_SQ'}; 2097 $wb_table[$wb_enums{'Numeric'}][$wb_enums{'Single_Quote'}] 2098 += $wb_actions{'WB_NU_then_MB_or_MN_or_SQ'}; 2099 2100 # WB11 Numeric (MidNum | (MidNumLet | Single_Quote)) × Numeric 2101 $wb_table[$wb_enums{'MidNumLet'}][$wb_enums{'Numeric'}] 2102 += $wb_actions{'WB_MB_or_MN_or_SQ_then_NU'}; 2103 $wb_table[$wb_enums{'MidNum'}][$wb_enums{'Numeric'}] 2104 += $wb_actions{'WB_MB_or_MN_or_SQ_then_NU'}; 2105 $wb_table[$wb_enums{'Single_Quote'}][$wb_enums{'Numeric'}] 2106 += $wb_actions{'WB_MB_or_MN_or_SQ_then_NU'}; 2107 2108 # Do not break within sequences of digits, or digits adjacent to letters 2109 # (“3a”, or “A3”). 2110 # WB10 Numeric × (ALetter | Hebrew_Letter) 2111 $wb_table[$wb_enums{'Numeric'}][$wb_enums{'ALetter'}] 2112 = $wb_actions{'WB_NOBREAK'}; 2113 $wb_table[$wb_enums{'Numeric'}][$wb_enums{'XPG_LE'}] 2114 = $wb_actions{'WB_NOBREAK'}; 2115 $wb_table[$wb_enums{'Numeric'}][$wb_enums{'Hebrew_Letter'}] 2116 = $wb_actions{'WB_NOBREAK'}; 2117 2118 # WB9 (ALetter | Hebrew_Letter) × Numeric 2119 $wb_table[$wb_enums{'ALetter'}][$wb_enums{'Numeric'}] 2120 = $wb_actions{'WB_NOBREAK'}; 2121 $wb_table[$wb_enums{'XPG_LE'}][$wb_enums{'Numeric'}] 2122 = $wb_actions{'WB_NOBREAK'}; 2123 $wb_table[$wb_enums{'Hebrew_Letter'}][$wb_enums{'Numeric'}] 2124 = $wb_actions{'WB_NOBREAK'}; 2125 2126 # WB8 Numeric × Numeric 2127 $wb_table[$wb_enums{'Numeric'}][$wb_enums{'Numeric'}] 2128 = $wb_actions{'WB_NOBREAK'}; 2129 2130 # Do not break letters across certain punctuation. 2131 # WB7c Hebrew_Letter Double_Quote × Hebrew_Letter 2132 $wb_table[$wb_enums{'Double_Quote'}][$wb_enums{'Hebrew_Letter'}] 2133 += $wb_actions{'WB_DQ_then_HL'}; 2134 2135 # WB7b Hebrew_Letter × Double_Quote Hebrew_Letter 2136 $wb_table[$wb_enums{'Hebrew_Letter'}][$wb_enums{'Double_Quote'}] 2137 += $wb_actions{'WB_HL_then_DQ'}; 2138 2139 # WB7a Hebrew_Letter × Single_Quote 2140 $wb_table[$wb_enums{'Hebrew_Letter'}][$wb_enums{'Single_Quote'}] 2141 = $wb_actions{'WB_NOBREAK'}; 2142 2143 # WB7 (ALetter | Hebrew_Letter) (MidLetter | MidNumLet | Single_Quote) 2144 # × (ALetter | Hebrew_Letter) 2145 $wb_table[$wb_enums{'MidNumLet'}][$wb_enums{'ALetter'}] 2146 += $wb_actions{'WB_MB_or_ML_or_SQ_then_LE_or_HL'}; 2147 $wb_table[$wb_enums{'MidNumLet'}][$wb_enums{'XPG_LE'}] 2148 += $wb_actions{'WB_MB_or_ML_or_SQ_then_LE_or_HL'}; 2149 $wb_table[$wb_enums{'MidNumLet'}][$wb_enums{'Hebrew_Letter'}] 2150 += $wb_actions{'WB_MB_or_ML_or_SQ_then_LE_or_HL'}; 2151 $wb_table[$wb_enums{'MidLetter'}][$wb_enums{'ALetter'}] 2152 += $wb_actions{'WB_MB_or_ML_or_SQ_then_LE_or_HL'}; 2153 $wb_table[$wb_enums{'MidLetter'}][$wb_enums{'XPG_LE'}] 2154 += $wb_actions{'WB_MB_or_ML_or_SQ_then_LE_or_HL'}; 2155 $wb_table[$wb_enums{'MidLetter'}][$wb_enums{'Hebrew_Letter'}] 2156 += $wb_actions{'WB_MB_or_ML_or_SQ_then_LE_or_HL'}; 2157 $wb_table[$wb_enums{'Single_Quote'}][$wb_enums{'ALetter'}] 2158 += $wb_actions{'WB_MB_or_ML_or_SQ_then_LE_or_HL'}; 2159 $wb_table[$wb_enums{'Single_Quote'}][$wb_enums{'XPG_LE'}] 2160 += $wb_actions{'WB_MB_or_ML_or_SQ_then_LE_or_HL'}; 2161 $wb_table[$wb_enums{'Single_Quote'}][$wb_enums{'Hebrew_Letter'}] 2162 += $wb_actions{'WB_MB_or_ML_or_SQ_then_LE_or_HL'}; 2163 2164 # WB6 (ALetter | Hebrew_Letter) × (MidLetter | MidNumLet 2165 # | Single_Quote) (ALetter | Hebrew_Letter) 2166 $wb_table[$wb_enums{'ALetter'}][$wb_enums{'MidNumLet'}] 2167 += $wb_actions{'WB_LE_or_HL_then_MB_or_ML_or_SQ'}; 2168 $wb_table[$wb_enums{'XPG_LE'}][$wb_enums{'MidNumLet'}] 2169 += $wb_actions{'WB_LE_or_HL_then_MB_or_ML_or_SQ'}; 2170 $wb_table[$wb_enums{'Hebrew_Letter'}][$wb_enums{'MidNumLet'}] 2171 += $wb_actions{'WB_LE_or_HL_then_MB_or_ML_or_SQ'}; 2172 $wb_table[$wb_enums{'ALetter'}][$wb_enums{'MidLetter'}] 2173 += $wb_actions{'WB_LE_or_HL_then_MB_or_ML_or_SQ'}; 2174 $wb_table[$wb_enums{'XPG_LE'}][$wb_enums{'MidLetter'}] 2175 += $wb_actions{'WB_LE_or_HL_then_MB_or_ML_or_SQ'}; 2176 $wb_table[$wb_enums{'Hebrew_Letter'}][$wb_enums{'MidLetter'}] 2177 += $wb_actions{'WB_LE_or_HL_then_MB_or_ML_or_SQ'}; 2178 $wb_table[$wb_enums{'ALetter'}][$wb_enums{'Single_Quote'}] 2179 += $wb_actions{'WB_LE_or_HL_then_MB_or_ML_or_SQ'}; 2180 $wb_table[$wb_enums{'XPG_LE'}][$wb_enums{'Single_Quote'}] 2181 += $wb_actions{'WB_LE_or_HL_then_MB_or_ML_or_SQ'}; 2182 $wb_table[$wb_enums{'Hebrew_Letter'}][$wb_enums{'Single_Quote'}] 2183 += $wb_actions{'WB_LE_or_HL_then_MB_or_ML_or_SQ'}; 2184 2185 # Do not break between most letters. 2186 # WB5 (ALetter | Hebrew_Letter) × (ALetter | Hebrew_Letter) 2187 $wb_table[$wb_enums{'ALetter'}][$wb_enums{'ALetter'}] 2188 = $wb_actions{'WB_NOBREAK'}; 2189 $wb_table[$wb_enums{'XPG_LE'}][$wb_enums{'ALetter'}] 2190 = $wb_actions{'WB_NOBREAK'}; 2191 $wb_table[$wb_enums{'ALetter'}][$wb_enums{'Hebrew_Letter'}] 2192 = $wb_actions{'WB_NOBREAK'}; 2193 $wb_table[$wb_enums{'XPG_LE'}][$wb_enums{'Hebrew_Letter'}] 2194 = $wb_actions{'WB_NOBREAK'}; 2195 $wb_table[$wb_enums{'Hebrew_Letter'}][$wb_enums{'ALetter'}] 2196 = $wb_actions{'WB_NOBREAK'}; 2197 $wb_table[$wb_enums{'Hebrew_Letter'}][$wb_enums{'XPG_LE'}] 2198 = $wb_actions{'WB_NOBREAK'}; 2199 $wb_table[$wb_enums{'Hebrew_Letter'}][$wb_enums{'Hebrew_Letter'}] 2200 = $wb_actions{'WB_NOBREAK'}; 2201 $wb_table[$wb_enums{'XPG_LE'}][$wb_enums{'XPG_LE'}] 2202 = $wb_actions{'WB_NOBREAK'}; 2203 2204 # Ignore Format and Extend characters, except after sot, CR, LF, and 2205 # Newline. This also has the effect of: Any × (Format | Extend | ZWJ) 2206 # WB4 X (Extend | Format | ZWJ)* → X 2207 for my $i (0 .. @wb_table - 1) { 2208 $wb_table[$wb_enums{'Extend'}][$i] 2209 = $wb_actions{'WB_Ex_or_FO_or_ZWJ_then_foo'}; 2210 $wb_table[$wb_enums{'Format'}][$i] 2211 = $wb_actions{'WB_Ex_or_FO_or_ZWJ_then_foo'}; 2212 $wb_table[$wb_enums{'ZWJ'}][$i] 2213 = $wb_actions{'WB_Ex_or_FO_or_ZWJ_then_foo'}; 2214 } 2215 for my $i (0 .. @wb_table - 1) { 2216 $wb_table[$i][$wb_enums{'Extend'}] = $wb_actions{'WB_NOBREAK'}; 2217 $wb_table[$i][$wb_enums{'Format'}] = $wb_actions{'WB_NOBREAK'}; 2218 $wb_table[$i][$wb_enums{'ZWJ'}] = $wb_actions{'WB_NOBREAK'}; 2219 } 2220 2221 # Implied is that these attach to the character before them, except for 2222 # the characters that mark the end of a region of text. The rules below 2223 # override the ones set up here, for all the characters that need 2224 # overriding. 2225 for my $i (0 .. @wb_table - 1) { 2226 $wb_table[$i][$wb_enums{'Extend'}] = $wb_actions{'WB_NOBREAK'}; 2227 $wb_table[$i][$wb_enums{'Format'}] = $wb_actions{'WB_NOBREAK'}; 2228 } 2229 2230 # Keep horizontal whitespace together 2231 # Use perl's tailoring instead 2232 # WB3d WSegSpace × WSegSpace 2233 #$wb_table[$wb_enums{'WSegSpace'}][$wb_enums{'WSegSpace'}] 2234 # = $wb_actions{'WB_NOBREAK'}; 2235 2236 # Do not break within emoji zwj sequences. 2237 # WB3c ZWJ × ( Glue_After_Zwj | EBG ) 2238 $wb_table[$wb_enums{'ZWJ'}][$wb_enums{'Glue_After_Zwj'}] 2239 = $wb_actions{'WB_NOBREAK'}; 2240 $wb_table[$wb_enums{'ZWJ'}][$wb_enums{'E_Base_GAZ'}] 2241 = $wb_actions{'WB_NOBREAK'}; 2242 $wb_table[$wb_enums{'ZWJ'}][$wb_enums{'XPG_XX'}] 2243 = $wb_actions{'WB_NOBREAK'}; 2244 $wb_table[$wb_enums{'ZWJ'}][$wb_enums{'XPG_LE'}] 2245 = $wb_actions{'WB_NOBREAK'}; 2246 2247 # Break before and after newlines 2248 # WB3b ÷ (Newline | CR | LF) 2249 # WB3a (Newline | CR | LF) ÷ 2250 # et. al. 2251 for my $i ('CR', 'LF', 'Newline', 'Perl_Tailored_HSpace') { 2252 for my $j (0 .. @wb_table - 1) { 2253 $wb_table[$j][$wb_enums{$i}] = $wb_actions{'WB_BREAKABLE'}; 2254 $wb_table[$wb_enums{$i}][$j] = $wb_actions{'WB_BREAKABLE'}; 2255 } 2256 } 2257 2258 # But do not break within white space. 2259 # WB3 CR × LF 2260 # et.al. 2261 for my $i ('CR', 'LF', 'Newline', 'Perl_Tailored_HSpace') { 2262 for my $j ('CR', 'LF', 'Newline', 'Perl_Tailored_HSpace') { 2263 $wb_table[$wb_enums{$i}][$wb_enums{$j}] = $wb_actions{'WB_NOBREAK'}; 2264 } 2265 } 2266 2267 # And do not break horizontal space followed by Extend or Format or ZWJ 2268 $wb_table[$wb_enums{'Perl_Tailored_HSpace'}][$wb_enums{'Extend'}] 2269 = $wb_actions{'WB_NOBREAK'}; 2270 $wb_table[$wb_enums{'Perl_Tailored_HSpace'}][$wb_enums{'Format'}] 2271 = $wb_actions{'WB_NOBREAK'}; 2272 $wb_table[$wb_enums{'Perl_Tailored_HSpace'}][$wb_enums{'ZWJ'}] 2273 = $wb_actions{'WB_NOBREAK'}; 2274 $wb_table[$wb_enums{'Perl_Tailored_HSpace'}] 2275 [$wb_enums{'Perl_Tailored_HSpace'}] 2276 = $wb_actions{'WB_hs_then_hs'}; 2277 2278 # Break at the start and end of text, unless the text is empty 2279 # WB2 Any ÷ eot 2280 # WB1 sot ÷ Any 2281 for my $i (0 .. @wb_table - 1) { 2282 $wb_table[$i][$wb_enums{'EDGE'}] = $wb_actions{'WB_BREAKABLE'}; 2283 $wb_table[$wb_enums{'EDGE'}][$i] = $wb_actions{'WB_BREAKABLE'}; 2284 } 2285 $wb_table[$wb_enums{'EDGE'}][$wb_enums{'EDGE'}] = 0; 2286 2287 output_table_common('WB', \%wb_actions, 2288 \@wb_table, \@wb_short_enums, \%wb_abbreviations); 2289} 2290 2291sub sanitize_name ($) { 2292 # Change the non-word characters in the input string to standardized word 2293 # equivalents 2294 # 2295 my $sanitized = shift; 2296 $sanitized =~ s/=/__/; 2297 $sanitized =~ s/&/_AMP_/; 2298 $sanitized =~ s/\./_DOT_/; 2299 $sanitized =~ s/-/_MINUS_/; 2300 $sanitized =~ s!/!_SLASH_!; 2301 2302 return $sanitized; 2303} 2304 2305switch_pound_if ('ALL', 'PERL_IN_REGCOMP_C'); 2306 2307output_invlist("Latin1", [ 0, 256 ]); 2308output_invlist("AboveLatin1", [ 256 ]); 2309 2310end_file_pound_if; 2311 2312# We construct lists for all the POSIX and backslash sequence character 2313# classes in two forms: 2314# 1) ones which match only in the ASCII range 2315# 2) ones which match either in the Latin1 range, or the entire Unicode range 2316# 2317# These get compiled in, and hence affect the memory footprint of every Perl 2318# program, even those not using Unicode. To minimize the size, currently 2319# the Latin1 version is generated for the beyond ASCII range except for those 2320# lists that are quite small for the entire range, such as for \s, which is 22 2321# UVs long plus 4 UVs (currently) for the header. 2322# 2323# To save even more memory, the ASCII versions could be derived from the 2324# larger ones at runtime, saving some memory (minus the expense of the machine 2325# instructions to do so), but these are all small anyway, so their total is 2326# about 100 UVs. 2327# 2328# In the list of properties below that get generated, the L1 prefix is a fake 2329# property that means just the Latin1 range of the full property (whose name 2330# has an X prefix instead of L1). 2331# 2332# An initial & means to use the subroutine from this file instead of an 2333# official inversion list. 2334 2335# Below is the list of property names to generate. '&' means to use the 2336# subroutine to generate the inversion list instead of the generic code 2337# below. Some properties have a comma-separated list after the name, 2338# These are extra enums to add to those found in the Unicode tables. 2339no warnings 'qw'; 2340 # Ignore non-alpha in sort 2341my @props; 2342push @props, sort { prop_name_for_cmp($a) cmp prop_name_for_cmp($b) } qw( 2343 &UpperLatin1 2344 _Perl_GCB,EDGE,E_Base,E_Base_GAZ,E_Modifier,Glue_After_Zwj,LV,Prepend,Regional_Indicator,SpacingMark,ZWJ,XPG_XX 2345 _Perl_LB,EDGE,Close_Parenthesis,Hebrew_Letter,Next_Line,Regional_Indicator,ZWJ,Contingent_Break,E_Base,E_Modifier,H2,H3,JL,JT,JV,Word_Joiner 2346 _Perl_SB,EDGE,SContinue,CR,Extend,LF 2347 _Perl_WB,Perl_Tailored_HSpace,EDGE,UNKNOWN,CR,Double_Quote,E_Base,E_Base_GAZ,E_Modifier,Extend,Glue_After_Zwj,Hebrew_Letter,LF,MidNumLet,Newline,Regional_Indicator,Single_Quote,ZWJ,XPG_XX,XPG_LE 2348 _Perl_SCX,Latin,Inherited,Unknown,Kore,Jpan,Hanb,INVALID 2349 Lowercase_Mapping 2350 Titlecase_Mapping 2351 Uppercase_Mapping 2352 Simple_Case_Folding 2353 Case_Folding 2354 &_Perl_IVCF 2355 &_Perl_CCC_non0_non230 2356 ); 2357 # NOTE that the convention is that extra enum values come 2358 # after the property name, separated by commas, with the enums 2359 # that aren't ever defined by Unicode coming last, at least 4 2360 # all-uppercase characters. The others are enum names that 2361 # are needed by perl, but aren't in all Unicode releases. 2362 2363my @bin_props; 2364my @perl_prop_synonyms; 2365my %enums; 2366my @deprecated_messages = ""; # Element [0] is a placeholder 2367my %deprecated_tags; 2368 2369my $float_e_format = qr/ ^ -? \d \. \d+ e [-+] \d+ $ /x; 2370 2371# Create another hash that maps floating point x.yyEzz representation to what 2372# %stricter_to_file_of does for the equivalent rational. A typical entry in 2373# the latter hash is 2374# 2375# 'nv=1/2' => 'Nv/1_2', 2376# 2377# From that, this loop creates an entry 2378# 2379# 'nv=5.00e-01' => 'Nv/1_2', 2380# 2381# %stricter_to_file_of contains far more than just the rationals. Instead we 2382# use %utf8::nv_floating_to_rational which should have an entry for each 2383# nv in the former hash. 2384my %floating_to_file_of; 2385foreach my $key (keys %utf8::nv_floating_to_rational) { 2386 my $value = $utf8::nv_floating_to_rational{$key}; 2387 $floating_to_file_of{$key} = $utf8::stricter_to_file_of{"nv=$value"}; 2388} 2389 2390# Properties that are specified with a prop=value syntax 2391my @equals_properties; 2392 2393# Collect all the binary properties from data in lib/unicore 2394# Sort so that complements come after the main table, and the shortest 2395# names first, finally alphabetically. Also, sort together the tables we want 2396# to be kept together, and prefer those with 'posix' in their names, which is 2397# what the C code is expecting their names to be. 2398foreach my $property (sort 2399 { exists $keep_together{lc $b} <=> exists $keep_together{lc $a} 2400 or $b =~ /posix/i <=> $a =~ /posix/i 2401 or $b =~ /perl/i <=> $a =~ /perl/i 2402 or $a =~ $float_e_format <=> $b =~ $float_e_format 2403 or $a =~ /!/ <=> $b =~ /!/ 2404 or length $a <=> length $b 2405 or $a cmp $b 2406 } keys %utf8::loose_to_file_of, 2407 keys %utf8::stricter_to_file_of, 2408 keys %floating_to_file_of 2409) { 2410 2411 # These two hashes map properties to values that can be considered to 2412 # be checksums. If two properties have the same checksum, they have 2413 # identical entries. Otherwise they differ in some way. 2414 my $tag = $utf8::loose_to_file_of{$property}; 2415 $tag = $utf8::stricter_to_file_of{$property} unless defined $tag; 2416 $tag = $floating_to_file_of{$property} unless defined $tag; 2417 2418 # The tag may contain an '!' meaning it is identical to the one formed 2419 # by removing the !, except that it is inverted. 2420 my $inverted = $tag =~ s/!//; 2421 2422 # This hash is lacking the property name 2423 $property = "nv=$property" if $property =~ $float_e_format; 2424 2425 # The list of 'prop=value' entries that this single entry expands to 2426 my @this_entries; 2427 2428 # Split 'property=value' on the equals sign, with $lhs being the whole 2429 # thing if there is no '=' 2430 my ($lhs, $rhs) = $property =~ / ( [^=]* ) ( =? .*) /x; 2431 2432 # $lhs then becomes the property name. 2433 my $prop_value = $rhs =~ s/ ^ = //rx; 2434 2435 push @equals_properties, $lhs if $prop_value ne ""; 2436 2437 # See if there are any synonyms for this property. 2438 if (exists $prop_name_aliases{$lhs}) { 2439 2440 # If so, do the combinatorics so that a new entry is added for 2441 # each legal property combined with the property value (which is 2442 # $rhs) 2443 foreach my $alias (@{$prop_name_aliases{$lhs}}) { 2444 2445 # But, there are some ambiguities, like 'script' is a synonym 2446 # for 'sc', and 'sc' can stand alone, meaning something 2447 # entirely different than 'script'. 'script' cannot stand 2448 # alone. Don't add if the potential new lhs is in the hash of 2449 # stand-alone properties. 2450 no warnings 'once'; 2451 next if $rhs eq "" && grep { $alias eq $_ } 2452 keys %utf8::loose_property_to_file_of; 2453 2454 my $new_entry = $alias . $rhs; 2455 push @this_entries, $new_entry; 2456 } 2457 } 2458 2459 # Above, we added the synonyms for the base entry we're now 2460 # processing. But we haven't dealt with it yet. If we already have a 2461 # property with the identical characteristics, this becomes just a 2462 # synonym for it. 2463 2464 if (exists $enums{$tag}) { 2465 push @this_entries, $property; 2466 } 2467 else { # Otherwise, create a new entry. 2468 2469 # Add to the list of properties to generate inversion lists for. 2470 push @bin_props, uc $property; 2471 2472 # Create a rule for the parser 2473 if (! exists $keywords{$property}) { 2474 $keywords{$property} = token_name($property); 2475 } 2476 2477 # And create an enum for it. 2478 $enums{$tag} = $table_name_prefix . uc sanitize_name($property); 2479 2480 $perl_tags{$tag} = 1 if exists $keep_together{lc $property}; 2481 2482 # Some properties are deprecated. This hash tells us so, and the 2483 # warning message to raise if they are used. 2484 if (exists $utf8::why_deprecated{$tag}) { 2485 $deprecated_tags{$enums{$tag}} = scalar @deprecated_messages; 2486 push @deprecated_messages, $utf8::why_deprecated{$tag}; 2487 } 2488 2489 # Our sort above should have made sure that we see the 2490 # non-inverted version first, but this makes sure. 2491 warn "$property is inverted!!!" if $inverted; 2492 } 2493 2494 # Everything else is #defined to be the base enum, inversion is 2495 # indicated by negating the value. 2496 my $defined_to = ""; 2497 $defined_to .= "-" if $inverted; 2498 $defined_to .= $enums{$tag}; 2499 2500 # Go through the entries that evaluate to this. 2501 @this_entries = uniques @this_entries; 2502 foreach my $define (@this_entries) { 2503 2504 # There is a rule for the parser for each. 2505 $keywords{$define} = $defined_to; 2506 2507 # And a #define for all simple names equivalent to a perl property, 2508 # except those that begin with 'is' or 'in'; 2509 if (exists $perl_tags{$tag} && $property !~ / ^ i[ns] | = /x) { 2510 push @perl_prop_synonyms, "#define " 2511 . $table_name_prefix 2512 . uc(sanitize_name($define)) 2513 . " $defined_to"; 2514 } 2515 } 2516} 2517 2518@bin_props = sort { exists $keep_together{lc $b} <=> exists $keep_together{lc $a} 2519 or $a cmp $b 2520 } @bin_props; 2521@perl_prop_synonyms = sort(uniques(@perl_prop_synonyms)); 2522push @props, @bin_props; 2523 2524foreach my $prop (@props) { 2525 2526 # For the Latin1 properties, we change to use the eXtended version of the 2527 # base property, then go through the result and get rid of everything not 2528 # in Latin1 (above 255). Actually, we retain the element for the range 2529 # that crosses the 255/256 boundary if it is one that matches the 2530 # property. For example, in the Word property, there is a range of code 2531 # points that start at U+00F8 and goes through U+02C1. Instead of 2532 # artificially cutting that off at 256 because 256 is the first code point 2533 # above Latin1, we let the range go to its natural ending. That gives us 2534 # extra information with no added space taken. But if the range that 2535 # crosses the boundary is one that doesn't match the property, we don't 2536 # start a new range above 255, as that could be construed as going to 2537 # infinity. For example, the Upper property doesn't include the character 2538 # at 255, but does include the one at 256. We don't include the 256 one. 2539 my $prop_name = $prop; 2540 my $is_local_sub = $prop_name =~ s/^&//; 2541 my $extra_enums = ""; 2542 $extra_enums = $1 if $prop_name =~ s/, ( .* ) //x; 2543 my $lookup_prop = $prop_name; 2544 $prop_name = sanitize_name($prop_name); 2545 $prop_name = $table_name_prefix . $prop_name if grep { lc $lookup_prop eq lc $_ } @bin_props; 2546 my $l1_only = ($lookup_prop =~ s/^L1Posix/XPosix/ 2547 or $lookup_prop =~ s/^L1//); 2548 my $nonl1_only = 0; 2549 $nonl1_only = $lookup_prop =~ s/^NonL1// unless $l1_only; 2550 ($lookup_prop, my $has_suffixes) = $lookup_prop =~ / (.*) ( , .* )? /x; 2551 2552 for my $charset (get_supported_code_pages()) { 2553 @a2n = @{get_a2n($charset)}; 2554 2555 my @invlist; 2556 my @invmap; 2557 my $map_format; 2558 my $map_default; 2559 my $maps_to_code_point; 2560 my $to_adjust; 2561 my $same_in_all_code_pages; 2562 if ($is_local_sub) { 2563 my @return = eval $lookup_prop; 2564 die $@ if $@; 2565 my $invlist_ref = shift @return; 2566 @invlist = @$invlist_ref; 2567 if (@return) { # If has other values returned , must be an 2568 # inversion map 2569 my $invmap_ref = shift @return; 2570 @invmap = @$invmap_ref; 2571 $map_format = shift @return; 2572 $map_default = shift @return; 2573 } 2574 } 2575 else { 2576 @invlist = prop_invlist($lookup_prop, '_perl_core_internal_ok'); 2577 if (! @invlist) { 2578 2579 # If couldn't find a non-empty inversion list, see if it is 2580 # instead an inversion map 2581 my ($list_ref, $map_ref, $format, $default) 2582 = prop_invmap($lookup_prop, '_perl_core_internal_ok'); 2583 if (! $list_ref) { 2584 # An empty return here could mean an unknown property, or 2585 # merely that the original inversion list is empty. Call 2586 # in scalar context to differentiate 2587 my $count = prop_invlist($lookup_prop, 2588 '_perl_core_internal_ok'); 2589 if (defined $count) { 2590 # Short-circuit an empty inversion list. 2591 output_invlist($prop_name, \@invlist, $charset); 2592 last; 2593 } 2594 die "Could not find inversion list for '$lookup_prop'" 2595 } 2596 else { 2597 @invlist = @$list_ref; 2598 @invmap = @$map_ref; 2599 $map_format = $format; 2600 $map_default = $default; 2601 $maps_to_code_point = $map_format =~ / a ($ | [^r] ) /x; 2602 $to_adjust = $map_format =~ /a/; 2603 } 2604 } 2605 } 2606 2607 # Re-order the Unicode code points to native ones for this platform. 2608 # This is only needed for code points below 256, because native code 2609 # points are only in that range. For inversion maps of properties 2610 # where the mappings are adjusted (format =~ /a/), this reordering 2611 # could mess up the adjustment pattern that was in the input, so that 2612 # has to be dealt with. 2613 # 2614 # And inversion maps that map to code points need to eventually have 2615 # all those code points remapped to native, and it's better to do that 2616 # here, going through the whole list not just those below 256. This 2617 # is because some inversion maps have adjustments (format =~ /a/) 2618 # which may be affected by the reordering. This code needs to be done 2619 # both for when we are translating the inversion lists for < 256, and 2620 # for the inversion maps for everything. By doing both in this loop, 2621 # we can share that code. 2622 # 2623 # So, we go through everything for an inversion map to code points; 2624 # otherwise, we can skip any remapping at all if we are going to 2625 # output only the above-Latin1 values, or if the range spans the whole 2626 # of 0..256, as the remap will also include all of 0..256 (256 not 2627 # 255 because a re-ordering could cause 256 to need to be in the same 2628 # range as 255.) 2629 if ( (@invmap && $maps_to_code_point) 2630 || ( @invlist 2631 && $invlist[0] < 256 2632 && ( $invlist[0] != 0 2633 || (scalar @invlist != 1 && $invlist[1] < 256)))) 2634 { 2635 $same_in_all_code_pages = 0; 2636 if (! @invmap) { # Straight inversion list 2637 # Look at all the ranges that start before 257. 2638 my @latin1_list; 2639 while (@invlist) { 2640 last if $invlist[0] > 256; 2641 my $upper = @invlist > 1 2642 ? $invlist[1] - 1 # In range 2643 2644 # To infinity. You may want to stop much much 2645 # earlier; going this high may expose perl 2646 # deficiencies with very large numbers. 2647 : 256; 2648 for my $j ($invlist[0] .. $upper) { 2649 push @latin1_list, a2n($j); 2650 } 2651 2652 shift @invlist; # Shift off the range that's in the list 2653 shift @invlist; # Shift off the range not in the list 2654 } 2655 2656 # Here @invlist contains all the ranges in the original that 2657 # start at code points above 256, and @latin1_list contains 2658 # all the native code points for ranges that start with a 2659 # Unicode code point below 257. We sort the latter and 2660 # convert it to inversion list format. Then simply prepend it 2661 # to the list of the higher code points. 2662 @latin1_list = sort { $a <=> $b } @latin1_list; 2663 @latin1_list = mk_invlist_from_sorted_cp_list(\@latin1_list); 2664 unshift @invlist, @latin1_list; 2665 } 2666 else { # Is an inversion map 2667 2668 # This is a similar procedure as plain inversion list, but has 2669 # multiple buckets. A plain inversion list just has two 2670 # buckets, 1) 'in' the list; and 2) 'not' in the list, and we 2671 # pretty much can ignore the 2nd bucket, as it is completely 2672 # defined by the 1st. But here, what we do is create buckets 2673 # which contain the code points that map to each, translated 2674 # to native and turned into an inversion list. Thus each 2675 # bucket is an inversion list of native code points that map 2676 # to it or don't map to it. We use these to create an 2677 # inversion map for the whole property. 2678 2679 # As mentioned earlier, we use this procedure to not just 2680 # remap the inversion list to native values, but also the maps 2681 # of code points to native ones. In the latter case we have 2682 # to look at the whole of the inversion map (or at least to 2683 # above Unicode; as the maps of code points above that should 2684 # all be to the default). 2685 my $upper_limit = (! $maps_to_code_point) 2686 ? 256 2687 : (Unicode::UCD::UnicodeVersion() eq '1.1.5') 2688 ? 0xFFFF 2689 : 0x10FFFF; 2690 2691 my %mapped_lists; # A hash whose keys are the buckets. 2692 while (@invlist) { 2693 last if $invlist[0] > $upper_limit; 2694 2695 # This shouldn't actually happen, as prop_invmap() returns 2696 # an extra element at the end that is beyond $upper_limit 2697 die "inversion map (for $prop_name) that extends to infinity is unimplemented" unless @invlist > 1; 2698 2699 my $bucket; 2700 2701 # A hash key can't be a ref (we are only expecting arrays 2702 # of scalars here), so convert any such to a string that 2703 # will be converted back later (using a vertical tab as 2704 # the separator). 2705 if (ref $invmap[0]) { 2706 $bucket = join "\cK", map { a2n($_) } @{$invmap[0]}; 2707 } 2708 elsif ( $maps_to_code_point 2709 && $invmap[0] =~ $integer_or_float_re) 2710 { 2711 2712 # Do convert to native for maps to single code points. 2713 # There are some properties that have a few outlier 2714 # maps that aren't code points, so the above test 2715 # skips those. 2716 $bucket = a2n($invmap[0]); 2717 } else { 2718 $bucket = $invmap[0]; 2719 } 2720 2721 # We now have the bucket that all code points in the range 2722 # map to, though possibly they need to be adjusted. Go 2723 # through the range and put each translated code point in 2724 # it into its bucket. 2725 my $base_map = $invmap[0]; 2726 for my $j ($invlist[0] .. $invlist[1] - 1) { 2727 if ($to_adjust 2728 # The 1st code point doesn't need adjusting 2729 && $j > $invlist[0] 2730 2731 # Skip any non-numeric maps: these are outliers 2732 # that aren't code points. 2733 && $base_map =~ $integer_or_float_re 2734 2735 # 'ne' because the default can be a string 2736 && $base_map ne $map_default) 2737 { 2738 # We adjust, by incrementing each the bucket and 2739 # the map. For code point maps, translate to 2740 # native 2741 $base_map++; 2742 $bucket = ($maps_to_code_point) 2743 ? a2n($base_map) 2744 : $base_map; 2745 } 2746 2747 # Add the native code point to the bucket for the 2748 # current map 2749 push @{$mapped_lists{$bucket}}, a2n($j); 2750 } # End of loop through all code points in the range 2751 2752 # Get ready for the next range 2753 shift @invlist; 2754 shift @invmap; 2755 } # End of loop through all ranges in the map. 2756 2757 # Here, @invlist and @invmap retain all the ranges from the 2758 # originals that start with code points above $upper_limit. 2759 # Each bucket in %mapped_lists contains all the code points 2760 # that map to that bucket. If the bucket is for a map to a 2761 # single code point, the bucket has been converted to native. 2762 # If something else (including multiple code points), no 2763 # conversion is done. 2764 # 2765 # Now we recreate the inversion map into %xlated, but this 2766 # time for the native character set. 2767 my %xlated; 2768 foreach my $bucket (keys %mapped_lists) { 2769 2770 # Sort and convert this bucket to an inversion list. The 2771 # result will be that ranges that start with even-numbered 2772 # indexes will be for code points that map to this bucket; 2773 # odd ones map to some other bucket, and are discarded 2774 # below. 2775 @{$mapped_lists{$bucket}} 2776 = sort{ $a <=> $b} @{$mapped_lists{$bucket}}; 2777 @{$mapped_lists{$bucket}} 2778 = mk_invlist_from_sorted_cp_list(\@{$mapped_lists{$bucket}}); 2779 2780 # Add each even-numbered range in the bucket to %xlated; 2781 # so that the keys of %xlated become the range start code 2782 # points, and the values are their corresponding maps. 2783 while (@{$mapped_lists{$bucket}}) { 2784 my $range_start = $mapped_lists{$bucket}->[0]; 2785 if ($bucket =~ /\cK/) { 2786 @{$xlated{$range_start}} = split /\cK/, $bucket; 2787 } 2788 else { 2789 # If adjusting, and there is more than one thing 2790 # that maps to the same thing, they must be split 2791 # so that later the adjusting doesn't think the 2792 # subsequent items can go away because of the 2793 # adjusting. 2794 my $range_end = ($to_adjust && $bucket != $map_default) 2795 ? $mapped_lists{$bucket}->[1] - 1 2796 : $range_start; 2797 for my $i ($range_start .. $range_end) { 2798 $xlated{$i} = $bucket; 2799 } 2800 } 2801 shift @{$mapped_lists{$bucket}}; # Discard odd ranges 2802 shift @{$mapped_lists{$bucket}}; # Get ready for next 2803 # iteration 2804 } 2805 } # End of loop through all the buckets. 2806 2807 # Here %xlated's keys are the range starts of all the code 2808 # points in the inversion map. Construct an inversion list 2809 # from them. 2810 my @new_invlist = sort { $a <=> $b } keys %xlated; 2811 2812 # If the list is adjusted, we want to munge this list so that 2813 # we only have one entry for where consecutive code points map 2814 # to consecutive values. We just skip the subsequent entries 2815 # where this is the case. 2816 if ($to_adjust) { 2817 my @temp; 2818 for my $i (0 .. @new_invlist - 1) { 2819 next if $i > 0 2820 && $new_invlist[$i-1] + 1 == $new_invlist[$i] 2821 && $xlated{$new_invlist[$i-1]} 2822 =~ $integer_or_float_re 2823 && $xlated{$new_invlist[$i]} 2824 =~ $integer_or_float_re 2825 && $xlated{$new_invlist[$i-1]} + 1 2826 == $xlated{$new_invlist[$i]}; 2827 push @temp, $new_invlist[$i]; 2828 } 2829 @new_invlist = @temp; 2830 } 2831 2832 # The inversion map comes from %xlated's values. We can 2833 # unshift each onto the front of the untouched portion, in 2834 # reverse order of the portion we did process. 2835 foreach my $start (reverse @new_invlist) { 2836 unshift @invmap, $xlated{$start}; 2837 } 2838 2839 # Finally prepend the inversion list we have just constructed to the 2840 # one that contains anything we didn't process. 2841 unshift @invlist, @new_invlist; 2842 } 2843 } 2844 elsif (@invmap) { # inversion maps can't cope with this variable 2845 # being true, even if it could be true 2846 $same_in_all_code_pages = 0; 2847 } 2848 else { 2849 $same_in_all_code_pages = 1; 2850 } 2851 2852 # prop_invmap() returns an extra final entry, which we can now 2853 # discard. 2854 if (@invmap) { 2855 pop @invlist; 2856 pop @invmap; 2857 } 2858 2859 if ($l1_only) { 2860 die "Unimplemented to do a Latin-1 only inversion map" if @invmap; 2861 for my $i (0 .. @invlist - 1 - 1) { 2862 if ($invlist[$i] > 255) { 2863 2864 # In an inversion list, even-numbered elements give the code 2865 # points that begin ranges that match the property; 2866 # odd-numbered give ones that begin ranges that don't match. 2867 # If $i is odd, we are at the first code point above 255 that 2868 # doesn't match, which means the range it is ending does 2869 # match, and crosses the 255/256 boundary. We want to include 2870 # this ending point, so increment $i, so the splice below 2871 # includes it. Conversely, if $i is even, it is the first 2872 # code point above 255 that matches, which means there was no 2873 # matching range that crossed the boundary, and we don't want 2874 # to include this code point, so splice before it. 2875 $i++ if $i % 2 != 0; 2876 2877 # Remove everything past this. 2878 splice @invlist, $i; 2879 splice @invmap, $i if @invmap; 2880 last; 2881 } 2882 } 2883 } 2884 elsif ($nonl1_only) { 2885 my $found_nonl1 = 0; 2886 for my $i (0 .. @invlist - 1 - 1) { 2887 next if $invlist[$i] < 256; 2888 2889 # Here, we have the first element in the array that indicates an 2890 # element above Latin1. Get rid of all previous ones. 2891 splice @invlist, 0, $i; 2892 splice @invmap, 0, $i if @invmap; 2893 2894 # If this one's index is not divisible by 2, it means that this 2895 # element is inverting away from being in the list, which means 2896 # all code points from 256 to this one are in this list (or 2897 # map to the default for inversion maps) 2898 if ($i % 2 != 0) { 2899 unshift @invlist, 256; 2900 unshift @invmap, $map_default if @invmap; 2901 } 2902 $found_nonl1 = 1; 2903 last; 2904 } 2905 if (! $found_nonl1) { 2906 warn "No non-Latin1 code points in $prop_name"; 2907 output_invlist($prop_name, []); 2908 last; 2909 } 2910 } 2911 2912 switch_pound_if ($prop_name, 'PERL_IN_REGCOMP_C'); 2913 start_charset_pound_if($charset, 1) unless $same_in_all_code_pages; 2914 2915 output_invlist($prop_name, \@invlist, ($same_in_all_code_pages) 2916 ? $applies_to_all_charsets_text 2917 : $charset); 2918 2919 if (@invmap) { 2920 output_invmap($prop_name, \@invmap, $lookup_prop, $map_format, 2921 $map_default, $extra_enums, $charset); 2922 } 2923 2924 last if $same_in_all_code_pages; 2925 end_charset_pound_if; 2926 } 2927} 2928 2929switch_pound_if ('binary_property_tables', 'PERL_IN_REGCOMP_C'); 2930 2931print $out_fh "\nconst char * const deprecated_property_msgs[] = {\n\t"; 2932print $out_fh join ",\n\t", map { "\"$_\"" } @deprecated_messages; 2933print $out_fh "\n};\n"; 2934 2935my @enums = sort values %enums; 2936 2937# Save a copy of these before modification 2938my @invlist_names = map { "${_}_invlist" } @enums; 2939 2940# Post-process the enums for deprecated properties. 2941if (scalar keys %deprecated_tags) { 2942 my $seen_deprecated = 0; 2943 foreach my $enum (@enums) { 2944 if (grep { $_ eq $enum } keys %deprecated_tags) { 2945 2946 # Change the enum name for this deprecated property to a 2947 # munged one to act as a placeholder in the typedef. Then 2948 # make the real name be a #define whose value is such that 2949 # its modulus with the number of enums yields the index into 2950 # the table occupied by the placeholder. And so that dividing 2951 # the #define value by the table length gives an index into 2952 # the table of deprecation messages for the corresponding 2953 # warning. 2954 my $revised_enum = "${enum}_perl_aux"; 2955 if (! $seen_deprecated) { 2956 $seen_deprecated = 1; 2957 print $out_fh "\n"; 2958 } 2959 print $out_fh "#define $enum ($revised_enum + (MAX_UNI_KEYWORD_INDEX * $deprecated_tags{$enum}))\n"; 2960 $enum = $revised_enum; 2961 } 2962 } 2963} 2964 2965print $out_fh "\ntypedef enum {\n\tPERL_BIN_PLACEHOLDER = 0, /* So no real value is zero */\n\t"; 2966print $out_fh join ",\n\t", @enums; 2967print $out_fh "\n"; 2968print $out_fh "} binary_invlist_enum;\n"; 2969print $out_fh "\n#define MAX_UNI_KEYWORD_INDEX $enums[-1]\n"; 2970 2971output_table_header($out_fh, "UV *", "uni_prop_ptrs"); 2972print $out_fh "\tNULL,\t/* Placeholder */\n"; 2973print $out_fh "\t"; 2974print $out_fh join ",\n\t", @invlist_names; 2975print $out_fh "\n"; 2976 2977output_table_trailer(); 2978 2979print $out_fh join "\n", "\n", 2980 #'# ifdef DOINIT', 2981 #"\n", 2982 "/* Synonyms for perl properties */", 2983 @perl_prop_synonyms, 2984 #"\n", 2985 #"# endif /* DOINIT */", 2986 "\n"; 2987 2988switch_pound_if ('Valid property_values', 'PERL_IN_REGCOMP_C'); 2989 2990# Each entry is a pointer to a table of property values for some property. 2991# (Other properties may share this table. The next two data structures allow 2992# this sharing to be implemented.) 2993my @values_tables = "NULL /* Placeholder so zero index is an error */"; 2994 2995# Keys are all the values of a property, strung together. The value of each 2996# key is its index in @values_tables. This is because many properties have 2997# the same values, and this allows the data to appear just once. 2998my %joined_values; 2999 3000# #defines for indices into @values_tables, so can have synonyms resolved by 3001# the C compiler. 3002my @values_indices; 3003 3004# Go through each property which is specifiable by \p{prop=value}, and create 3005# a hash with the keys being the canonicalized short property names, and the 3006# values for each property being all possible values that it can take on. 3007# Both the full value and its short, canonicalized into lc, sans punctuation 3008# version are included. 3009my %all_values; 3010for my $property (sort { prop_name_for_cmp($a) cmp prop_name_for_cmp($b) } 3011 uniques @equals_properties) 3012{ 3013 # Get and canonicalize the short name for this property. 3014 my ($short_name) = prop_aliases($property); 3015 $short_name = lc $short_name; 3016 $short_name =~ s/[ _-]//g; 3017 3018 # Now look at each value this property can take on 3019 foreach my $value (prop_values($short_name)) { 3020 3021 # And for each value, look at each synonym for it 3022 foreach my $alias (prop_value_aliases($short_name, $value)) { 3023 3024 # Add each synonym 3025 push @{$all_values{$short_name}}, $alias; 3026 3027 # As well as its canonicalized name. khw made the decision to not 3028 # support the grandfathered L_ Gc property value 3029 $alias = lc $alias; 3030 $alias =~ s/[ _-]//g unless $alias =~ $numeric_re; 3031 push @{$all_values{$short_name}}, $alias; 3032 } 3033 } 3034} 3035 3036# Also include the old style block names, using the recipe given in 3037# Unicode::UCD 3038foreach my $block (prop_values('block')) { 3039 push @{$all_values{'blk'}}, charblock((prop_invlist("block=$block"))[0]); 3040} 3041 3042# Now create output tables for each property in @equals_properties (the keys 3043# in %all_values) each containing that property's possible values as computed 3044# just above. 3045PROPERTY: 3046for my $property (sort { prop_name_for_cmp($a) cmp prop_name_for_cmp($b) 3047 or $a cmp $b } keys %all_values) 3048{ 3049 @{$all_values{$property}} = uniques(@{$all_values{$property}}); 3050 3051 # String together the values for this property, sorted. This string forms 3052 # a list definition, with each value as an entry in it, indented on a new 3053 # line. The sorting is used to find properties that take on the exact 3054 # same values to share this string. 3055 my $joined = "\t\""; 3056 $joined .= join "\",\n\t\"", 3057 sort { ($a =~ $numeric_re && $b =~ $numeric_re) 3058 ? eval $a <=> eval $b 3059 : prop_name_for_cmp($a) cmp prop_name_for_cmp($b) 3060 or $a cmp $b 3061 } @{$all_values{$property}}; 3062 # And add a trailing marker 3063 $joined .= "\",\n\tNULL\n"; 3064 3065 my $table_name = $table_name_prefix . $property . "_values"; 3066 my $index_name = "${table_name}_index"; 3067 3068 # Add a rule for the parser that is just an empty value. It will need to 3069 # know to look up empty things in the prop_value_ptrs[] table. 3070 3071 $keywords{"$property="} = $index_name; 3072 if (exists $prop_name_aliases{$property}) { 3073 foreach my $alias (@{$prop_name_aliases{$property}}) { 3074 $keywords{"$alias="} = $index_name; 3075 } 3076 } 3077 3078 # Also create rules for the synonyms of this property to point to the same 3079 # thing 3080 3081 # If this property's values are the same as one we've already computed, 3082 # use that instead of creating a duplicate. But we add a #define to point 3083 # to the proper one. 3084 if (exists $joined_values{$joined}) { 3085 push @values_indices, "#define $index_name $joined_values{$joined}\n"; 3086 next PROPERTY; 3087 } 3088 3089 # And this property, now known to have unique values from any other seen 3090 # so far is about to be pushed onto @values_tables. Its index is the 3091 # current count. 3092 push @values_indices, "#define $index_name " 3093 . scalar @values_tables . "\n"; 3094 $joined_values{$joined} = $index_name; 3095 push @values_tables, $table_name; 3096 3097 # Create the table for this set of values. 3098 output_table_header($out_fh, "char *", $table_name); 3099 print $out_fh $joined; 3100 output_table_trailer(); 3101} # End of loop through the properties, and their values 3102 3103# We have completely determined the table of the unique property values 3104output_table_header($out_fh, "char * const *", 3105 "${table_name_prefix}prop_value_ptrs"); 3106print $out_fh join ",\n", @values_tables; 3107print $out_fh "\n"; 3108output_table_trailer(); 3109 3110# And the #defines for the indices in it 3111print $out_fh "\n\n", join "", @values_indices; 3112 3113switch_pound_if('Boundary_pair_tables', 'PERL_IN_REGEXEC_C'); 3114 3115output_GCB_table(); 3116output_LB_table(); 3117output_WB_table(); 3118 3119end_file_pound_if; 3120 3121print $out_fh <<"EOF"; 3122 3123/* More than one code point may have the same code point as their fold. This 3124 * gives the maximum number in the current Unicode release. (The folded-to 3125 * code point is not included in this count.) For example, both 'S' and 3126 * \\x{17F} fold to 's', so the number for that fold is 2. Another way to 3127 * look at it is the maximum length of all the IVCF_AUX_TABLE's */ 3128#define MAX_FOLD_FROMS $max_fold_froms 3129EOF 3130 3131my $sources_list = "lib/unicore/mktables.lst"; 3132my @sources = qw(regen/mk_invlists.pl 3133 lib/unicore/mktables 3134 lib/Unicode/UCD.pm 3135 regen/charset_translations.pl 3136 regen/mk_PL_charclass.pl 3137 ); 3138{ 3139 # Depend on mktables’ own sources. It’s a shorter list of files than 3140 # those that Unicode::UCD uses. 3141 if (! open my $mktables_list, '<', $sources_list) { 3142 3143 # This should force a rebuild once $sources_list exists 3144 push @sources, $sources_list; 3145 } 3146 else { 3147 while(<$mktables_list>) { 3148 last if /===/; 3149 chomp; 3150 push @sources, "lib/unicore/$_" if /^[^#]/; 3151 } 3152 } 3153} 3154 3155read_only_bottom_close_and_rename($out_fh, \@sources); 3156 3157my %name_to_index; 3158for my $i (0 .. @enums - 1) { 3159 my $loose_name = $enums[$i] =~ s/^$table_name_prefix//r; 3160 $loose_name = lc $loose_name; 3161 $loose_name =~ s/__/=/; 3162 $loose_name =~ s/_dot_/./; 3163 $loose_name =~ s/_slash_/\//g; 3164 $name_to_index{$loose_name} = $i + 1; 3165} 3166# unsanitize, exclude &, maybe add these before sanitize 3167for my $i (0 .. @perl_prop_synonyms - 1) { 3168 my $loose_name_pair = $perl_prop_synonyms[$i] =~ s/#\s*define\s*//r; 3169 $loose_name_pair =~ s/\b$table_name_prefix//g; 3170 $loose_name_pair = lc $loose_name_pair; 3171 $loose_name_pair =~ s/__/=/g; 3172 $loose_name_pair =~ s/_dot_/./g; 3173 $loose_name_pair =~ s/_slash_/\//g; 3174 my ($synonym, $primary) = split / +/, $loose_name_pair; 3175 $name_to_index{$synonym} = $name_to_index{$primary}; 3176} 3177 3178my $uni_pl = open_new('lib/unicore/uni_keywords.pl', '>', 3179 {style => '*', by => 'regen/mk_invlists.pl', 3180 from => "Unicode::UCD"}); 3181{ 3182 print $uni_pl "\%utf8::uni_prop_ptrs_indices = (\n"; 3183 for my $name (sort keys %name_to_index) { 3184 print $uni_pl " '$name' => $name_to_index{$name},\n"; 3185 } 3186 print $uni_pl ");\n\n1;\n"; 3187} 3188 3189read_only_bottom_close_and_rename($uni_pl, \@sources); 3190 3191require './regen/mph.pl'; 3192 3193sub token_name 3194{ 3195 my $name = sanitize_name(shift); 3196 warn "$name contains non-word" if $name =~ /\W/; 3197 3198 return "$table_name_prefix\U$name" 3199} 3200 3201my $keywords_fh = open_new('uni_keywords.h', '>', 3202 {style => '*', by => 'regen/mk_invlists.pl', 3203 from => "mph.pl"}); 3204 3205no warnings 'once'; 3206print $keywords_fh <<"EOF"; 3207/* The precision to use in "%.*e" formats */ 3208#define PL_E_FORMAT_PRECISION $utf8::e_precision 3209 3210EOF 3211 3212my ($second_level, $seed1, $length_all_keys, $smart_blob, $rows) = MinimalPerfectHash::make_mph_from_hash(\%keywords); 3213print $keywords_fh MinimalPerfectHash::make_algo($second_level, $seed1, $length_all_keys, $smart_blob, $rows, undef, undef, undef, 'match_uniprop' ); 3214 3215push @sources, 'regen/mph.pl'; 3216read_only_bottom_close_and_rename($keywords_fh, \@sources); 3217