1use v5.16.0; 2use strict; 3use warnings; 4use integer; 5 6BEGIN { unshift @INC, '.' } 7 8require './regen/regen_lib.pl'; 9require './regen/charset_translations.pl'; 10 11# Generates the EBCDIC translation tables that were formerly hard-coded into 12# utfebcdic.h 13 14my $out_fh = open_new('ebcdic_tables.h', '>', 15 {style => '*', by => $0, }); 16 17sub get_column_headers ($$;$) { 18 my ($row_hdr_len, $field_width, $dfa_columns) = @_; 19 my $format; 20 my $final_column_format; 21 my $num_columns; 22 23 if (defined $dfa_columns) { 24 $num_columns = $dfa_columns; 25 26 # Trailing blank to correspond with commas in the rows below 27 $format = "%${field_width}d "; 28 } 29 else { # Is a regular table 30 $num_columns = 16; 31 32 # Use blanks to separate the fields 33 $format = " " x ( $field_width 34 - 2); # For the '_X' 35 $format .= "_%X "; # Again, trailing blank over the commas below 36 } 37 38 my $header = "/*" . " " x ($row_hdr_len - length "/*"); 39 40 # All but the final column 41 $header .= sprintf($format, $_) for 0 .. $num_columns - 2; 42 43 # Get rid of trailing blank, so that the final column takes up one less 44 # space so that the "*/" doesn't extend past the commas in the rows below 45 chop $header; 46 $header .= sprintf $format, $num_columns - 1; 47 48 # Again, remove trailing blank 49 chop $header; 50 51 return $header . "*/\n"; 52} 53 54sub output_table_start($$$;$) { 55 my ($out_fh, $TYPE, $name, $size) = @_; 56 57 $size = "" unless defined $size; 58 59 # Anything locale related will be written on 60 my $const = ($name !~ /locale/i) ? 'CONST' : ""; 61 62 my $declaration = "EXT$const $TYPE $name\[$size\]"; 63 print $out_fh <<EOF; 64# ifndef DOINIT 65 $declaration; 66# else 67 $declaration = { 68EOF 69} 70 71sub output_table_end($) { 72 print $out_fh "};\n# endif\n\n"; 73} 74 75sub output_table ($$;$) { 76 my $table_ref = shift; 77 my $name = shift; 78 79 my $size = @$table_ref; 80 81 # 0 => print in decimal 82 # 1 => print in hex (translates code point to code point) 83 # >= 2 => is a dfa table, like https://bjoern.hoehrmann.de/utf-8/decoder/dfa/ 84 # The number is how many columns in the part after the code point 85 # portion. 86 # 87 # code point tables in hex areasier to debug, but don't fit into 80 88 # columns 89 my $type = shift // 1; 90 91 my $print_in_hex = $type == 1; 92 my $is_dfa = ($type >= 2) ? $type : 0; 93 my $columns_after_256 = 16; 94 95 die "Requres 256 entries in table $name, got @$table_ref" 96 if ! $is_dfa && @$table_ref != 256; 97 if (! $is_dfa) { 98 die "Requres 256 entries in table $name, got @$table_ref" 99 if @$table_ref != 256; 100 } 101 else { 102 $columns_after_256 = $is_dfa; 103 104 print $out_fh <<'EOF'; 105 106/* The table below is adapted from 107 * https://bjoern.hoehrmann.de/utf-8/decoder/dfa/ 108 * See copyright notice at the beginning of this file. 109 */ 110 111EOF 112 } 113 114 # Highest number in the table 115 my $max_entry = 0; 116 $max_entry = map { $_ > $max_entry ? $_ : $max_entry } @$table_ref; 117 118 # We assume that every table has at least one two digit entry, and none 119 # are more than three digit. 120 my $field_width = ($print_in_hex) 121 ? 4 122 : (($max_entry) > 99 ? 3 : 2); 123 124 my $row_hdr_length; 125 my $node_number_field_width; 126 my $node_value_field_width; 127 128 # dfa tables have a special header for the rows in the transitions part of 129 # the table. It is longer than the regular one. 130 if ($is_dfa) { 131 my $max_node_number = ($max_entry - 256) / $columns_after_256 - 1; 132 $node_number_field_width = ($max_node_number > 9) ? 2 : 1; 133 $node_value_field_width = ($max_node_number * $columns_after_256 > 99) 134 ? 3 : 2; 135 # The header starts with this template, and adds in the number of 136 # digits needed to represent the maximum node number and its value 137 $row_hdr_length = length("/*N=*/") 138 + $node_number_field_width 139 + $node_value_field_width; 140 } 141 else { 142 $row_hdr_length = length "/*_X*/"; # Template for what the header 143 # looks like 144 } 145 146 # The table may not be representable in 8 bits. 147 my $TYPE = 'U8'; 148 $TYPE = 'U16' if grep { $_ > 255 } @$table_ref; 149 150 output_table_start $out_fh, $TYPE, $name, $size; 151 152 # First the headers for the columns 153 print $out_fh get_column_headers($row_hdr_length, $field_width); 154 155 # Now the table body 156 my $count = @$table_ref; 157 my $last_was_nl = 1; 158 159 # Print each element individually, arranged in rows of columns 160 for my $i (0 .. $count - 1) { 161 162 # Node number for here is -1 until get into the dfa state transitions 163 my $node = ($i < 256) ? -1 : ($i - 256) / $columns_after_256; 164 165 # Print row header at beginning of each row 166 if ($last_was_nl) { 167 if ($node >= 0) { 168 printf $out_fh "/*N%-*d=%*d*/", $node_number_field_width, $node, 169 $node_value_field_width, $i - 256; 170 } 171 else { # Otherwise is regular row; print its number 172 printf $out_fh "/*%X_", $i / 16; 173 174 # These rows in a dfa table require extra space so columns 175 # will align vertically (because the Ndd=ddd requires extra 176 # space) 177 if ($is_dfa) { 178 print $out_fh " " x ( $node_number_field_width 179 + $node_value_field_width); 180 } 181 print $out_fh "*/"; 182 } 183 } 184 185 if ($print_in_hex) { 186 printf $out_fh "0x%02X", $table_ref->[$i]; 187 } 188 else { 189 printf $out_fh "%${field_width}d", $table_ref->[$i]; 190 } 191 192 print $out_fh ",", if $i < $count -1; # No comma on final entry 193 194 # Add \n if at end of row, which is 16 columns until we get to the 195 # transitions part 196 if ( ($node < 0 && $i % 16 == 15) 197 || ($node >= 0 && ($i -256) % $columns_after_256 198 == $columns_after_256 - 1)) 199 { 200 print $out_fh "\n"; 201 $last_was_nl = 1; 202 } 203 else { 204 $last_was_nl = 0; 205 } 206 } 207 208 # Print column footer 209 print $out_fh get_column_headers($row_hdr_length, $field_width, 210 ($is_dfa) ? $columns_after_256 : undef); 211 212 output_table_end($out_fh); 213} 214 215print $out_fh <<'END'; 216 217#ifndef PERL_EBCDIC_TABLES_H_ /* Guard against nested #includes */ 218#define PERL_EBCDIC_TABLES_H_ 1 219 220/* This file contains definitions for various tables used in EBCDIC handling. 221 * More info is in utfebcdic.h 222 * 223 * Some of the tables are adapted from 224 * https://bjoern.hoehrmann.de/utf-8/decoder/dfa/ 225 * which requires this copyright notice: 226 227Copyright (c) 2008-2009 Bjoern Hoehrmann <bjoern@hoehrmann.de> 228 229Permission is hereby granted, free of charge, to any person obtaining a copy of 230this software and associated documentation files (the "Software"), to deal in 231the Software without restriction, including without limitation the rights to 232use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies 233of the Software, and to permit persons to whom the Software is furnished to do 234so, subject to the following conditions: 235 236The above copyright notice and this permission notice shall be included in all 237copies or substantial portions of the Software. 238 239THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 240IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 241FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 242AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 243LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 244OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 245SOFTWARE. 246 247*/ 248END 249 250my @charsets = get_supported_code_pages(); 251shift @charsets; # ASCII is the 0th, and we don't deal with that here. 252foreach my $charset (@charsets) { 253 # we process the whole array several times, make a copy 254 my @a2e = @{get_a2n($charset)}; 255 my @e2a; 256 257 print $out_fh "\n" . get_conditional_compile_line_start($charset); 258 print $out_fh "\n"; 259 260 print $out_fh "/* Index is ASCII platform code point; value is $charset equivalent */\n"; 261 output_table(\@a2e, "PL_a2e"); 262 263 { # Construct the inverse 264 for my $i (0 .. 255) { 265 $e2a[$a2e[$i]] = $i; 266 } 267 print $out_fh "/* Index is $charset code point; value is ASCII platform equivalent */\n"; 268 output_table(\@e2a, "PL_e2a"); 269 } 270 271 my @i82utf = @{get_I8_2_utf($charset)}; 272 print $out_fh <<END; 273/* (Confusingly named) Index is $charset I8 byte; value is 274 * $charset UTF-EBCDIC equivalent */ 275END 276 output_table(\@i82utf, "PL_utf2e"); 277 278 { #Construct the inverse 279 my @utf2i8; 280 for my $i (0 .. 255) { 281 $utf2i8[$i82utf[$i]] = $i; 282 } 283 print $out_fh <<END; 284/* (Confusingly named) Index is $charset UTF-EBCDIC byte; value is 285 * $charset I8 equivalent */ 286END 287 output_table(\@utf2i8, "PL_e2utf"); 288 } 289 290 { 291 my @utf8skip; 292 293 # These are invariants or continuation bytes. 294 for my $i (0 .. 0xBF) { 295 $utf8skip[$i82utf[$i]] = 1; 296 } 297 298 # These are start bytes; The skip is the number of consecutive highest 299 # order 1-bits (up to 7) 300 for my $i (0xC0 .. 255) { 301 my $count; 302 if ($i == 0b11111111) { 303 no warnings 'once'; 304 $count = $CHARSET_TRANSLATIONS::UTF_EBCDIC_MAXBYTES; 305 } 306 elsif (($i & 0b11111110) == 0b11111110) { 307 $count= 7; 308 } 309 elsif (($i & 0b11111100) == 0b11111100) { 310 $count= 6; 311 } 312 elsif (($i & 0b11111000) == 0b11111000) { 313 $count= 5; 314 } 315 elsif (($i & 0b11110000) == 0b11110000) { 316 $count= 4; 317 } 318 elsif (($i & 0b11100000) == 0b11100000) { 319 $count= 3; 320 } 321 elsif (($i & 0b11000000) == 0b11000000) { 322 $count= 2; 323 } 324 else { 325 die "Something wrong for UTF8SKIP calculation for $i"; 326 } 327 $utf8skip[$i82utf[$i]] = $count; 328 } 329 330 print $out_fh <<END; 331/* Index is $charset UTF-EBCDIC byte; value is UTF8SKIP for start bytes 332 * (including for overlongs); 1 for continuation. Adapted from the shadow 333 * flags table in tr16. The entries marked 9 in tr16 are continuation bytes 334 * and are marked as length 1 here so that we can recover. */ 335END 336 output_table(\@utf8skip, "PL_utf8skip", 0); # The 0 means don't print 337 # in hex 338 } 339 340 use feature 'unicode_strings'; 341 342 { 343 my @lc; 344 for my $i (0 .. 255) { 345 $lc[$a2e[$i]] = $a2e[ord lc chr $i]; 346 } 347 print $out_fh 348 "/* Index is $charset code point; value is its lowercase equivalent */\n"; 349 output_table(\@lc, "PL_latin1_lc"); 350 } 351 352 { 353 my @uc; 354 for my $i (0 .. 255) { 355 my $uc = uc chr $i; 356 if (length $uc > 1 || ord $uc > 255) { 357 $uc = "\N{LATIN SMALL LETTER Y WITH DIAERESIS}"; 358 } 359 $uc[$a2e[$i]] = $a2e[ord $uc]; 360 } 361 print $out_fh <<END; 362/* Index is $charset code point; value is its uppercase equivalent. 363 * The 'mod' in the name means that codepoints whose uppercase is above 255 or 364 * longer than 1 character map to LATIN SMALL LETTER Y WITH DIARESIS */ 365END 366 output_table(\@uc, "PL_mod_latin1_uc"); 367 } 368 369 { # PL_fold 370 my @ascii_fold; 371 for my $i (0 .. 255) { # Initialise to identity map 372 $ascii_fold[$i] = $i; 373 } 374 375 # Overwrite the entries that aren't identity 376 for my $chr ('A' .. 'Z') { 377 $ascii_fold[$a2e[ord $chr]] = $a2e[ord lc $chr]; 378 } 379 for my $chr ('a' .. 'z') { 380 $ascii_fold[$a2e[ord $chr]] = $a2e[ord uc $chr]; 381 } 382 print $out_fh <<END; 383/* Index is $charset code point; For A-Z, value is a-z; for a-z, value 384 * is A-Z; all other code points map to themselves */ 385END 386 output_table(\@ascii_fold, "PL_fold"); 387 388 # This table is also the correct folding for the default C locale 389 output_table(\@ascii_fold, "PL_fold_locale"); 390 } 391 392 { 393 my @latin1_fold; 394 for my $i (0 .. 255) { 395 my $char = chr $i; 396 my $lc = lc $char; 397 398 # lc and uc adequately proxy for fold-case pairs in this 0-255 399 # range 400 my $uc = uc $char; 401 $uc = $char if length $uc > 1 || ord $uc > 255; 402 if ($lc ne $char) { 403 $latin1_fold[$a2e[$i]] = $a2e[ord $lc]; 404 } 405 elsif ($uc ne $char) { 406 $latin1_fold[$a2e[$i]] = $a2e[ord $uc]; 407 } 408 else { 409 $latin1_fold[$a2e[$i]] = $a2e[$i]; 410 } 411 } 412 print $out_fh <<END; 413/* Index is $charset code point; value is its other fold-pair equivalent 414 * (A => a; a => A, etc) in the 0-255 range. If no such equivalent, value is 415 * the code point itself */ 416END 417 output_table(\@latin1_fold, "PL_fold_latin1"); 418 } 419 420 { 421 # This generates the dfa table for perl extended UTF-8, which accepts 422 # surrogates, non-characters, and accepts start bytes up through FE 423 # (start byte FF has to be handled outside this dfa). The class numbers 424 # for start bytes are constrained so that they can be used as a shift 425 # count for masking off the leading one bits 426 # 427 # The classes are 428 # 00-9F 0 429 # A0-A1 7 Not legal immediately after start bytes F0 F8 FC 430 # FE 431 # A2-A3 8 Not legal immediately after start bytes F0 F8 FC 432 # A4-A7 9 Not legal immediately after start bytes F0 F8 433 # A8-AF 10 Not legal immediately after start bytes F0 434 # B0-BF 11 435 # C0-C4 1 436 # C5-DF 2 437 # E0 1 438 # E1-EF 3 439 # F0 12 440 # F1-F7 4 441 # F8 13 442 # F9-FB 5 443 # FC 14 444 # FD 6 445 # FE 15 446 # FF 1 447 # 448 # Here's the I8 for the code points before which overlongs occur: 449 # U+4000: \xF0\xB0\xA0\xA0 450 # U+40000: \xF8\xA8\xA0\xA0\xA0 451 # U+400000: \xFC\xA4\xA0\xA0\xA0\xA0 452 # U+4000000: \xFE\xA2\xA0\xA0\xA0\xA0\xA0 453 # 454 # The first part of the table maps bytes to character classes to reduce 455 # the size of the transition table and create bitmasks. 456 # 457 # The second part is a transition table that maps a combination of a 458 # state of the automaton and a character class to a new state. The 459 # numbering of the original nodes is retained, but some have been split 460 # so that there are new nodes. They mean: 461 # N0 The initial state, and final accepting one. 462 # N1 One continuation byte (A0-BF) left. This is transitioned to 463 # immediately when the start byte indicates a two-byte sequence 464 # N2 Two continuation bytes left. 465 # N3 Three continuation bytes left. 466 # N4 Four continuation bytes left. 467 # N5 Five continuation bytes left. 468 # N6 Start byte is F0. Continuation bytes A[0-F] are illegal 469 # (overlong); the other continuations transition to N2 470 # N7 Start byte is F8. Continuation bytes A[0-7] are illegal 471 # (overlong); the other continuations transition to N3 472 # N8 Start byte is FC. Continuation bytes A[0-3] are illegal 473 # (overlong); the other continuations transition to N4 474 # N9 Start byte is FE. Continuation bytes A[01] are illegal 475 # (overlong); the other continuations transition to N5 476 # 1 Reject. All transitions not mentioned above (except the single 477 # byte ones (as they are always legal) are to this state. 478 479 my $NUM_CLASSES = 16; 480 my $N0 = 0; 481 my $N1 = $N0 + $NUM_CLASSES; 482 my $N2 = $N1 + $NUM_CLASSES; 483 my $N3 = $N2 + $NUM_CLASSES; 484 my $N4 = $N3 + $NUM_CLASSES; 485 my $N5 = $N4 + $NUM_CLASSES; 486 my $N6 = $N5 + $NUM_CLASSES; 487 my $N7 = $N6 + $NUM_CLASSES; 488 my $N8 = $N7 + $NUM_CLASSES; 489 my $N9 = $N8 + $NUM_CLASSES; 490 my $N10 = $N9 + $NUM_CLASSES; 491 492 my @perl_extended_utf8_dfa; 493 my @i8 = ( 494 # 0 1 2 3 4 5 6 7 8 9 A B C D E F 495 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 00-0F 496 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 10-1F 497 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 20-2F 498 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 30-3F 499 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 40-4F 500 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 50-5F 501 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 60-6F 502 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 70-7F 503 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 80-8F 504 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 90-9F 505 7, 7, 8, 8, 9, 9, 9, 9,10,10,10,10,10,10,10,10, # A0-AF 506 11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11, # B0-BF 507 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, # C0-CF 508 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, # D0-DF 509 1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, # E0-EF 510 12, 4, 4, 4, 4, 4, 4, 4,13, 5, 5, 5,14, 6,15, 1, # F0-FF 511 ); 512 $perl_extended_utf8_dfa[$i82utf[$_]] = $i8[$_] for (0 .. 255); 513 push @perl_extended_utf8_dfa, ( 514 # Class: 515 # 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 516 0, 1,$N1,$N2,$N3,$N4,$N5, 1, 1, 1, 1, 1,$N6,$N7,$N8,$N9, # N0 517 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, # N1 518 1, 1, 1, 1, 1, 1, 1,$N1,$N1,$N1,$N1,$N1, 1, 1, 1, 1, # N2 519 1, 1, 1, 1, 1, 1, 1,$N2,$N2,$N2,$N2,$N2, 1, 1, 1, 1, # N3 520 1, 1, 1, 1, 1, 1, 1,$N3,$N3,$N3,$N3,$N3, 1, 1, 1, 1, # N4 521 1, 1, 1, 1, 1, 1, 1,$N4,$N4,$N4,$N4,$N4, 1, 1, 1, 1, # N5 522 523 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,$N2, 1, 1, 1, 1, # N6 524 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,$N3,$N3, 1, 1, 1, 1, # N7 525 1, 1, 1, 1, 1, 1, 1, 1, 1,$N4,$N4,$N4, 1, 1, 1, 1, # N8 526 1, 1, 1, 1, 1, 1, 1, 1,$N5,$N5,$N5,$N5, 1, 1, 1, 1, # N9 527 ); 528 output_table(\@perl_extended_utf8_dfa, "PL_extended_utf8_dfa_tab", 529 $NUM_CLASSES); 530 } 531 532 { 533 # This generates the dfa table for strict UTF-8, which rejects 534 # surrogates, non-characters, and above Unicode. 535 # 536 # The classes are 537 # 00-9F 0 Always legal at start 538 # A0 10 Not legal immediately after start bytes F0 F8 539 # A1 11 Not legal immediately after start bytes F0 F8, 540 # A2-A7 12 Not legal immediately after start bytes F0 F8 F9 541 # A8,AA,AC 13 Not legal immediately after start bytes F0 F9 542 # A9,AB,AD 14 Not legal immediately after start byte F0 543 # AE 15 Not legal immediately after start byte F0 544 # AF 16 Not legal immediately after start bytes F0 545 # B[0248AC] 17 Not legal immediately after start byte F9 546 # B[1359D] 18 Not legal immediately after start byte F9 547 # B6 19 Not legal immediately after start byte F9 548 # B7 20 Not legal immediately after start byte F9 549 # BE 21 Not legal immediately after start byte F9 550 # BF 22 Not legal immediately after start byte F9 551 # C0-C4 1 (reject, all are overlong) 552 # C5-DF 2 Accepts any legal continuation 553 # E0 1 (reject, all are overlong) 554 # E1-EF 3 Accepts any legal continuation 555 # F0 8 (has overlongs) 556 # F1 6 (has surrogates, non-chars) 557 # F2,F4,F6 4 Accepts any legal continuation 558 # F3,F5,F7 5 (has non-chars) 559 # F8 9 (has overlongs, non-chars) 560 # F9 7 (has non-chars, non-Unicode) 561 # FA-FF 1 (reject, all are non-Unicode) 562 # 563 # Here's the I8 for enough code points so that you can figure out what's 564 # going on: 565 # 566 # U+D800: \xF1\xB6\xA0\xA0 567 # U+DFFF: \xF1\xB7\xBF\xBF 568 # U+FDD0: \xF1\xBF\xAE\xB0 569 # U+FDEF: \xF1\xBF\xAF\xAF 570 # U+FFFE: \xF1\xBF\xBF\xBE 571 # U+1FFFE: \xF3\xBF\xBF\xBE 572 # U+2FFFE: \xF5\xBF\xBF\xBE 573 # U+3FFFE: \xF7\xBF\xBF\xBE 574 # U+4FFFE: \xF8\xA9\xBF\xBF\xBE 575 # U+5FFFE: \xF8\xAB\xBF\xBF\xBE 576 # U+6FFFE: \xF8\xAD\xBF\xBF\xBE 577 # U+7FFFE: \xF8\xAF\xBF\xBF\xBE 578 # U+8FFFE: \xF8\xB1\xBF\xBF\xBE 579 # U+9FFFE: \xF8\xB3\xBF\xBF\xBE 580 # U+AFFFE: \xF8\xB5\xBF\xBF\xBE 581 # U+BFFFE: \xF8\xB7\xBF\xBF\xBE 582 # U+CFFFE: \xF8\xB9\xBF\xBF\xBE 583 # U+DFFFE: \xF8\xBB\xBF\xBF\xBE 584 # U+EFFFE: \xF8\xBD\xBF\xBF\xBE 585 # U+FFFFE: \xF8\xBF\xBF\xBF\xBE 586 # U+10FFFE: \xF9\xA1\xBF\xBF\xBE 587 # 588 # The first part of the table maps bytes to character classes to reduce 589 # the size of the transition table and create bitmasks. 590 # 591 # The second part is a transition table that maps a combination of a 592 # state of the automaton and a character class to a new state. The 593 # numbering of the original nodes is retained, but some have been split 594 # so that there are new nodes. They mean: 595 # N0 The initial state, and final accepting one. 596 # N1 One continuation byte (A0-BF) left. This is transitioned to 597 # immediately when the start byte indicates a two-byte sequence 598 # N2 Two continuation bytes left. 599 # N3 Three continuation bytes left. 600 # N4 Start byte is F0. Continuation bytes A[0-F] are illegal 601 # (overlong); the other continuations transition to N2 602 # N5 Start byte is F1. Continuation bytes B6 and B7 are illegal 603 # (surrogates); BF transitions to N9; the other continuations to 604 # N2 605 # N6 Start byte is F[357]. Continuation byte BF transitions to N12; 606 # other continuations to N2 607 # N7 Start byte is F8. Continuation bytes A[0-7] are illegal 608 # (overlong); continuations A[9BDF] and B[13579BDF] transition to 609 # N14; the other continuations to N3 610 # N8 Start byte is F9. Continuation byte A0 transitions to N3; A1 611 # to N14; the other continuation bytes are illegal. 612 # N9 Initial sequence is F1 BF. Continuation byte AE transitions to 613 # state N10; AF to N11; BF to N13; the other continuations to N1. 614 # N10 Initial sequence is F1 BF AE. Continuation bytes B0-BF are 615 # illegal (non-chars); the other continuations are legal 616 # N11 Initial sequence is F1 BF AF. Continuation bytes A0-AF are 617 # illegal (non-chars); the other continuations are legal 618 # N12 Initial sequence is F[357] BF. Continuation bytes BF 619 # transitions to N13; the other continuations to N1 620 # N13 Initial sequence is F[1357] BF BF or F8 x y BF (where x and y 621 # are something that can lead to a non-char. Continuation bytes 622 # BE and BF are illegal (non-chars); the other continuations are 623 # legal 624 # N14 Initial sequence is F8 A[9BDF]; or F8 B[13579BDF]; or F9 A1. 625 # Continuation byte BF transitions to N15; the other 626 # continuations to N2 627 # N15 Initial sequence is F8 A[9BDF] BF; or F8 B[13579BDF] BF; or 628 # F9 A1 BF. Continuation byte BF transitions to N16; the other 629 # continuations to N2 630 # N16 Initial sequence is F8 A[9BDF] BF BF; or F8 B[13579BDF] BF BF; 631 # or F9 A1 BF BF. Continuation bytes BE and BF are illegal 632 # (non-chars); the other continuations are legal 633 # 1 Reject. All transitions not mentioned above (except the single 634 # byte ones (as they are always legal) are to this state. 635 636 my $NUM_CLASSES = 23; 637 my $N0 = 0; 638 my $N1 = $N0 + $NUM_CLASSES; 639 my $N2 = $N1 + $NUM_CLASSES; 640 my $N3 = $N2 + $NUM_CLASSES; 641 my $N4 = $N3 + $NUM_CLASSES; 642 my $N5 = $N4 + $NUM_CLASSES; 643 my $N6 = $N5 + $NUM_CLASSES; 644 my $N7 = $N6 + $NUM_CLASSES; 645 my $N8 = $N7 + $NUM_CLASSES; 646 my $N9 = $N8 + $NUM_CLASSES; 647 my $N10 = $N9 + $NUM_CLASSES; 648 my $N11 = $N10 + $NUM_CLASSES; 649 my $N12 = $N11 + $NUM_CLASSES; 650 my $N13 = $N12 + $NUM_CLASSES; 651 my $N14 = $N13 + $NUM_CLASSES; 652 my $N15 = $N14 + $NUM_CLASSES; 653 654 my @strict_utf8_dfa; 655 my @i8 = ( 656 # 0 1 2 3 4 5 6 7 8 9 A B C D E F 657 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 00-0F 658 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 10-1F 659 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 20-2F 660 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 30-3F 661 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 40-4F 662 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 50-5F 663 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 60-6F 664 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 70-7F 665 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 80-8F 666 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 90-9F 667 10,11,12,12,12,12,12,12,13,14,13,14,13,14,15,16, # A0-AF 668 17,18,17,18,17,18,19,20,17,18,17,18,17,18,21,22, # B0-BF 669 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, # C0-CF 670 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, # D0-DF 671 1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, # E0-EF 672 8, 6, 4, 5, 4, 5, 4, 5, 9, 7, 1, 1, 1, 1, 1, 1, # F0-FF 673 ); 674 $strict_utf8_dfa[$i82utf[$_]] = $i8[$_] for (0 .. 255); 675 push @strict_utf8_dfa, ( 676 # Class: 677 # 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 678 0,1,$N1,$N2,$N3,$N6,$N5,$N8,$N4,$N7, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, # N0 679 1,1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # N1 680 1,1, 1, 1, 1, 1, 1, 1, 1, 1, $N1, $N1, $N1, $N1, $N1, $N1, $N1, $N1, $N1, $N1, $N1, $N1, $N1, # N2 681 1,1, 1, 1, 1, 1, 1, 1, 1, 1, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2, # N3 682 683 1,1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, $N2, $N2, $N2, $N2, $N2, $N2, # N4 684 1,1, 1, 1, 1, 1, 1, 1, 1, 1, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2, 1, 1, $N2, $N9, # N5 685 1,1, 1, 1, 1, 1, 1, 1, 1, 1, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2,$N12, # N6 686 1,1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, $N3,$N14, $N3,$N14, $N3,$N14, $N3,$N14, $N3,$N14, # N7 687 1,1, 1, 1, 1, 1, 1, 1, 1, 1, $N3,$N14, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, # N8 688 1,1, 1, 1, 1, 1, 1, 1, 1, 1, $N1, $N1, $N1, $N1, $N1,$N10,$N11, $N1, $N1, $N1, $N1, $N1,$N13, # N9 689 1,1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, # N10 690 1,1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, # N11 691 1,1, 1, 1, 1, 1, 1, 1, 1, 1, $N1, $N1, $N1, $N1, $N1, $N1, $N1, $N1, $N1, $N1, $N1, $N1,$N13, # N12 692 1,1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, # N13 693 1,1, 1, 1, 1, 1, 1, 1, 1, 1, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2,$N15, # N14 694 1,1, 1, 1, 1, 1, 1, 1, 1, 1, $N1, $N1, $N1, $N1, $N1, $N1, $N1, $N1, $N1, $N1, $N1, $N1,$N13, # N15 695 ); 696 output_table(\@strict_utf8_dfa, "PL_strict_utf8_dfa_tab", $NUM_CLASSES); 697 } 698 699 { 700 # This generates the dfa table for C9 strict UTF-8, which rejects 701 # surrogates and above Unicode, but allows non-characters,. 702 # 703 # The classes are 704 # 00-9F 0 Always legal at start 705 # A0-A1 9 Not legal immediately after start bytes F0 F8 706 # A2-A7 10 Not legal immediately after start bytes F0 F8 F9 707 # A8-AF 11 Not legal immediately after start bytes F0 F9 708 # B0-B5,B8-BF 12 Not legal immediately after start byte F9 709 # B6,B7 13 710 # C0-C4 1 (reject, all are overlong) 711 # C5-DF 2 Accepts any legal continuation 712 # E0 1 (reject, all are overlong) 713 # E1-EF 3 Accepts any legal continuation 714 # F0 6 (has overlongs) 715 # F1 5 (has surrogates) 716 # F2-F7 4 Accepts any legal continuation 717 # F8 8 (has overlongs) 718 # F9 7 (has non-Unicode) 719 # FA-FF 1 (reject, all are non-Unicode) 720 # 721 # The first part of the table maps bytes to character classes to reduce 722 # the size of the transition table and create bitmasks. 723 # 724 # The second part is a transition table that maps a combination of a 725 # state of the automaton and a character class to a new state. The 726 # numbering of the original nodes is retained, but some have been split 727 # so that there are new nodes. They mean: 728 # N0 The initial state, and final accepting one. 729 # N1 One continuation byte (A0-BF) left. This is transitioned to 730 # immediately when the start byte indicates a two-byte sequence 731 # N2 Two continuation bytes left. 732 # N3 Three continuation bytes left. 733 # N4 Start byte is F0. Continuation bytes A[0-F] are illegal 734 # (overlong); the other continuations transition to N2 735 # N5 Start byte is F1. B6 and B7 are illegal (surrogates); the 736 # other continuations transition to N2 737 # N6 Start byte is F8. Continuation bytes A[0-7] are illegal 738 # (overlong); the other continuations transition to N3 739 # N7 Start byte is F9. Continuation bytes A0 and A1 transition to 740 # N3; the other continuation bytes are illegal (non-Unicode) 741 # 1 Reject. All transitions not mentioned above (except the single 742 # byte ones (as they are always legal) are to this state. 743 744 my $NUM_CLASSES = 14; 745 my $N0 = 0; 746 my $N1 = $N0 + $NUM_CLASSES; 747 my $N2 = $N1 + $NUM_CLASSES; 748 my $N3 = $N2 + $NUM_CLASSES; 749 my $N4 = $N3 + $NUM_CLASSES; 750 my $N5 = $N4 + $NUM_CLASSES; 751 my $N6 = $N5 + $NUM_CLASSES; 752 my $N7 = $N6 + $NUM_CLASSES; 753 754 my @C9_utf8_dfa; 755 my @i8 = ( 756 # 0 1 2 3 4 5 6 7 8 9 A B C D E F 757 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 00-0F 758 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 10-1F 759 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 20-2F 760 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 30-3F 761 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 40-4F 762 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 50-5F 763 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 60-6F 764 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 70-7F 765 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 80-8F 766 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 90-9F 767 9, 9,10,10,10,10,10,10,11,11,11,11,11,11,11,11, # A0-AF 768 12,12,12,12,12,12,13,13,12,12,12,12,12,12,12,12, # B0-BF 769 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, # C0-CF 770 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, # D0-DF 771 1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, # E0-EF 772 6, 5, 4, 4, 4, 4, 4, 4, 8, 7, 1, 1, 1, 1, 1, 1, # F0-FF 773 ); 774 $C9_utf8_dfa[$i82utf[$_]] = $i8[$_] for (0 .. 255); 775 push @C9_utf8_dfa, ( 776 # Class: 777 # 0 1 2 3 4 5 6 7 8 9 10 11 12 13 778 0,1,$N1,$N2,$N3,$N5,$N4,$N7,$N6, 1, 1, 1, 1, 1, # N0 779 1,1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, # N1 780 1,1, 1, 1, 1, 1, 1, 1, 1,$N1, $N1, $N1, $N1, $N1, # N2 781 1,1, 1, 1, 1, 1, 1, 1, 1,$N2, $N2, $N2, $N2, $N2, # N3 782 783 1,1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, $N2, $N2, # N4 784 1,1, 1, 1, 1, 1, 1, 1, 1,$N2, $N2, $N2, $N2, 1, # N5 785 1,1, 1, 1, 1, 1, 1, 1, 1, 1, 1, $N3, $N3, $N3, # N6 786 1,1, 1, 1, 1, 1, 1, 1, 1,$N3, 1, 1, 1, 1, # N7 787 ); 788 output_table(\@C9_utf8_dfa, "PL_c9_utf8_dfa_tab", $NUM_CLASSES); 789 } 790 791 print $out_fh get_conditional_compile_line_end(); 792} 793 794print $out_fh "\n#endif /* PERL_EBCDIC_TABLES_H_ */\n"; 795 796read_only_bottom_close_and_rename($out_fh); 797