1use v5.16.0; 2use strict; 3use warnings; 4use integer; 5 6BEGIN { unshift @INC, '.' } 7 8require './regen/regen_lib.pl'; 9require './regen/charset_translations.pl'; 10 11# Generates the EBCDIC translation tables that were formerly hard-coded into 12# utfebcdic.h 13 14my $out_fh = open_new('ebcdic_tables.h', '>', 15 {style => '*', by => $0, }); 16 17sub get_column_headers ($$;$) { 18 my ($row_hdr_len, $field_width, $dfa_columns) = @_; 19 my $format; 20 my $final_column_format; 21 my $num_columns; 22 23 if (defined $dfa_columns) { 24 $num_columns = $dfa_columns; 25 26 # Trailing blank to correspond with commas in the rows below 27 $format = "%${field_width}d "; 28 } 29 else { # Is a regular table 30 $num_columns = 16; 31 32 # Use blanks to separate the fields 33 $format = " " x ( $field_width 34 - 2); # For the '_X' 35 $format .= "_%X "; # Again, trailing blank over the commas below 36 } 37 38 my $header = "/*" . " " x ($row_hdr_len - length "/*"); 39 40 # All but the final column 41 $header .= sprintf($format, $_) for 0 .. $num_columns - 2; 42 43 # Get rid of trailing blank, so that the final column takes up one less 44 # space so that the "*/" doesn't extend past the commas in the rows below 45 chop $header; 46 $header .= sprintf $format, $num_columns - 1; 47 48 # Again, remove trailing blank 49 chop $header; 50 51 return $header . "*/\n"; 52} 53 54sub output_table_start($$$;$) { 55 my ($out_fh, $TYPE, $name, $size) = @_; 56 57 $size = "" unless defined $size; 58 my $declaration = "EXTCONST $TYPE $name\[$size\]"; 59 print $out_fh <<EOF; 60# ifndef DOINIT 61 $declaration; 62# else 63 $declaration = { 64EOF 65} 66 67sub output_table_end($) { 68 print $out_fh "};\n# endif\n\n"; 69} 70 71sub output_table ($$;$) { 72 my $table_ref = shift; 73 my $name = shift; 74 75 # 0 => print in decimal 76 # 1 => print in hex (translates code point to code point) 77 # >= 2 => is a dfa table, like https://bjoern.hoehrmann.de/utf-8/decoder/dfa/ 78 # The number is how many columns in the part after the code point 79 # portion. 80 # 81 # code point tables in hex areasier to debug, but don't fit into 80 82 # columns 83 my $type = shift // 1; 84 85 my $print_in_hex = $type == 1; 86 my $is_dfa = ($type >= 2) ? $type : 0; 87 my $columns_after_256 = 16; 88 89 die "Requres 256 entries in table $name, got @$table_ref" 90 if ! $is_dfa && @$table_ref != 256; 91 if (! $is_dfa) { 92 die "Requres 256 entries in table $name, got @$table_ref" 93 if @$table_ref != 256; 94 } 95 else { 96 $columns_after_256 = $is_dfa; 97 98 print $out_fh <<'EOF'; 99 100/* The table below is adapted from 101 * https://bjoern.hoehrmann.de/utf-8/decoder/dfa/ 102 * See copyright notice at the beginning of this file. 103 */ 104 105EOF 106 } 107 108 # Highest number in the table 109 my $max_entry = 0; 110 $max_entry = map { $_ > $max_entry ? $_ : $max_entry } @$table_ref; 111 112 # We assume that every table has at least one two digit entry, and none 113 # are more than three digit. 114 my $field_width = ($print_in_hex) 115 ? 4 116 : (($max_entry) > 99 ? 3 : 2); 117 118 my $row_hdr_length; 119 my $node_number_field_width; 120 my $node_value_field_width; 121 122 # dfa tables have a special header for the rows in the transitions part of 123 # the table. It is longer than the regular one. 124 if ($is_dfa) { 125 my $max_node_number = ($max_entry - 256) / $columns_after_256 - 1; 126 $node_number_field_width = ($max_node_number > 9) ? 2 : 1; 127 $node_value_field_width = ($max_node_number * $columns_after_256 > 99) 128 ? 3 : 2; 129 # The header starts with this template, and adds in the number of 130 # digits needed to represent the maximum node number and its value 131 $row_hdr_length = length("/*N=*/") 132 + $node_number_field_width 133 + $node_value_field_width; 134 } 135 else { 136 $row_hdr_length = length "/*_X*/"; # Template for what the header 137 # looks like 138 } 139 140 # The table may not be representable in 8 bits. 141 my $TYPE = 'U8'; 142 $TYPE = 'U16' if grep { $_ > 255 } @$table_ref; 143 144 output_table_start $out_fh, $TYPE, $name; 145 146 # First the headers for the columns 147 print $out_fh get_column_headers($row_hdr_length, $field_width); 148 149 # Now the table body 150 my $count = @$table_ref; 151 my $last_was_nl = 1; 152 153 # Print each element individually, arranged in rows of columns 154 for my $i (0 .. $count - 1) { 155 156 # Node number for here is -1 until get into the dfa state transitions 157 my $node = ($i < 256) ? -1 : ($i - 256) / $columns_after_256; 158 159 # Print row header at beginning of each row 160 if ($last_was_nl) { 161 if ($node >= 0) { 162 printf $out_fh "/*N%-*d=%*d*/", $node_number_field_width, $node, 163 $node_value_field_width, $i - 256; 164 } 165 else { # Otherwise is regular row; print its number 166 printf $out_fh "/*%X_", $i / 16; 167 168 # These rows in a dfa table require extra space so columns 169 # will align vertically (because the Ndd=ddd requires extra 170 # space) 171 if ($is_dfa) { 172 print $out_fh " " x ( $node_number_field_width 173 + $node_value_field_width); 174 } 175 print $out_fh "*/"; 176 } 177 } 178 179 if ($print_in_hex) { 180 printf $out_fh "0x%02X", $table_ref->[$i]; 181 } 182 else { 183 printf $out_fh "%${field_width}d", $table_ref->[$i]; 184 } 185 186 print $out_fh ",", if $i < $count -1; # No comma on final entry 187 188 # Add \n if at end of row, which is 16 columns until we get to the 189 # transitions part 190 if ( ($node < 0 && $i % 16 == 15) 191 || ($node >= 0 && ($i -256) % $columns_after_256 192 == $columns_after_256 - 1)) 193 { 194 print $out_fh "\n"; 195 $last_was_nl = 1; 196 } 197 else { 198 $last_was_nl = 0; 199 } 200 } 201 202 # Print column footer 203 print $out_fh get_column_headers($row_hdr_length, $field_width, 204 ($is_dfa) ? $columns_after_256 : undef); 205 206 output_table_end($out_fh); 207} 208 209print $out_fh <<'END'; 210 211#ifndef PERL_EBCDIC_TABLES_H_ /* Guard against nested #includes */ 212#define PERL_EBCDIC_TABLES_H_ 1 213 214/* This file contains definitions for various tables used in EBCDIC handling. 215 * More info is in utfebcdic.h 216 * 217 * Some of the tables are adapted from 218 * https://bjoern.hoehrmann.de/utf-8/decoder/dfa/ 219 * which requires this copyright notice: 220 221Copyright (c) 2008-2009 Bjoern Hoehrmann <bjoern@hoehrmann.de> 222 223Permission is hereby granted, free of charge, to any person obtaining a copy of 224this software and associated documentation files (the "Software"), to deal in 225the Software without restriction, including without limitation the rights to 226use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies 227of the Software, and to permit persons to whom the Software is furnished to do 228so, subject to the following conditions: 229 230The above copyright notice and this permission notice shall be included in all 231copies or substantial portions of the Software. 232 233THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 234IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 235FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 236AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 237LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 238OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 239SOFTWARE. 240 241*/ 242END 243 244my @charsets = get_supported_code_pages(); 245shift @charsets; # ASCII is the 0th, and we don't deal with that here. 246foreach my $charset (@charsets) { 247 # we process the whole array several times, make a copy 248 my @a2e = @{get_a2n($charset)}; 249 my @e2a; 250 251 print $out_fh "\n" . get_conditional_compile_line_start($charset); 252 print $out_fh "\n"; 253 254 print $out_fh "/* Index is ASCII platform code point; value is $charset equivalent */\n"; 255 output_table(\@a2e, "PL_a2e"); 256 257 { # Construct the inverse 258 for my $i (0 .. 255) { 259 $e2a[$a2e[$i]] = $i; 260 } 261 print $out_fh "/* Index is $charset code point; value is ASCII platform equivalent */\n"; 262 output_table(\@e2a, "PL_e2a"); 263 } 264 265 my @i82utf = @{get_I8_2_utf($charset)}; 266 print $out_fh <<END; 267/* (Confusingly named) Index is $charset I8 byte; value is 268 * $charset UTF-EBCDIC equivalent */ 269END 270 output_table(\@i82utf, "PL_utf2e"); 271 272 { #Construct the inverse 273 my @utf2i8; 274 for my $i (0 .. 255) { 275 $utf2i8[$i82utf[$i]] = $i; 276 } 277 print $out_fh <<END; 278/* (Confusingly named) Index is $charset UTF-EBCDIC byte; value is 279 * $charset I8 equivalent */ 280END 281 output_table(\@utf2i8, "PL_e2utf"); 282 } 283 284 { 285 my @utf8skip; 286 287 # These are invariants or continuation bytes. 288 for my $i (0 .. 0xBF) { 289 $utf8skip[$i82utf[$i]] = 1; 290 } 291 292 # These are start bytes; The skip is the number of consecutive highest 293 # order 1-bits (up to 7) 294 for my $i (0xC0 .. 255) { 295 my $count; 296 if ($i == 0b11111111) { 297 no warnings 'once'; 298 $count = $CHARSET_TRANSLATIONS::UTF_EBCDIC_MAXBYTES; 299 } 300 elsif (($i & 0b11111110) == 0b11111110) { 301 $count= 7; 302 } 303 elsif (($i & 0b11111100) == 0b11111100) { 304 $count= 6; 305 } 306 elsif (($i & 0b11111000) == 0b11111000) { 307 $count= 5; 308 } 309 elsif (($i & 0b11110000) == 0b11110000) { 310 $count= 4; 311 } 312 elsif (($i & 0b11100000) == 0b11100000) { 313 $count= 3; 314 } 315 elsif (($i & 0b11000000) == 0b11000000) { 316 $count= 2; 317 } 318 else { 319 die "Something wrong for UTF8SKIP calculation for $i"; 320 } 321 $utf8skip[$i82utf[$i]] = $count; 322 } 323 324 print $out_fh <<END; 325/* Index is $charset UTF-EBCDIC byte; value is UTF8SKIP for start bytes 326 * (including for overlongs); 1 for continuation. Adapted from the shadow 327 * flags table in tr16. The entries marked 9 in tr16 are continuation bytes 328 * and are marked as length 1 here so that we can recover. */ 329END 330 output_table(\@utf8skip, "PL_utf8skip", 0); # The 0 means don't print 331 # in hex 332 } 333 334 use feature 'unicode_strings'; 335 336 { 337 my @lc; 338 for my $i (0 .. 255) { 339 $lc[$a2e[$i]] = $a2e[ord lc chr $i]; 340 } 341 print $out_fh 342 "/* Index is $charset code point; value is its lowercase equivalent */\n"; 343 output_table(\@lc, "PL_latin1_lc"); 344 } 345 346 { 347 my @uc; 348 for my $i (0 .. 255) { 349 my $uc = uc chr $i; 350 if (length $uc > 1 || ord $uc > 255) { 351 $uc = "\N{LATIN SMALL LETTER Y WITH DIAERESIS}"; 352 } 353 $uc[$a2e[$i]] = $a2e[ord $uc]; 354 } 355 print $out_fh <<END; 356/* Index is $charset code point; value is its uppercase equivalent. 357 * The 'mod' in the name means that codepoints whose uppercase is above 255 or 358 * longer than 1 character map to LATIN SMALL LETTER Y WITH DIARESIS */ 359END 360 output_table(\@uc, "PL_mod_latin1_uc"); 361 } 362 363 { # PL_fold 364 my @ascii_fold; 365 for my $i (0 .. 255) { # Initialise to identity map 366 $ascii_fold[$i] = $i; 367 } 368 369 # Overwrite the entries that aren't identity 370 for my $chr ('A' .. 'Z') { 371 $ascii_fold[$a2e[ord $chr]] = $a2e[ord lc $chr]; 372 } 373 for my $chr ('a' .. 'z') { 374 $ascii_fold[$a2e[ord $chr]] = $a2e[ord uc $chr]; 375 } 376 print $out_fh <<END; 377/* Index is $charset code point; For A-Z, value is a-z; for a-z, value 378 * is A-Z; all other code points map to themselves */ 379END 380 output_table(\@ascii_fold, "PL_fold"); 381 } 382 383 { 384 my @latin1_fold; 385 for my $i (0 .. 255) { 386 my $char = chr $i; 387 my $lc = lc $char; 388 389 # lc and uc adequately proxy for fold-case pairs in this 0-255 390 # range 391 my $uc = uc $char; 392 $uc = $char if length $uc > 1 || ord $uc > 255; 393 if ($lc ne $char) { 394 $latin1_fold[$a2e[$i]] = $a2e[ord $lc]; 395 } 396 elsif ($uc ne $char) { 397 $latin1_fold[$a2e[$i]] = $a2e[ord $uc]; 398 } 399 else { 400 $latin1_fold[$a2e[$i]] = $a2e[$i]; 401 } 402 } 403 print $out_fh <<END; 404/* Index is $charset code point; value is its other fold-pair equivalent 405 * (A => a; a => A, etc) in the 0-255 range. If no such equivalent, value is 406 * the code point itself */ 407END 408 output_table(\@latin1_fold, "PL_fold_latin1"); 409 } 410 411 { 412 # This generates the dfa table for perl extended UTF-8, which accepts 413 # surrogates, non-characters, and accepts start bytes up through FE 414 # (start byte FF has to be handled outside this dfa). The class numbers 415 # for start bytes are constrained so that they can be used as a shift 416 # count for masking off the leading one bits 417 # 418 # The classes are 419 # 00-9F 0 420 # A0-A1 7 Not legal immediately after start bytes F0 F8 FC 421 # FE 422 # A2-A3 8 Not legal immediately after start bytes F0 F8 FC 423 # A4-A7 9 Not legal immediately after start bytes F0 F8 424 # A8-AF 10 Not legal immediately after start bytes F0 425 # B0-BF 11 426 # C0-C4 1 427 # C5-DF 2 428 # E0 1 429 # E1-EF 3 430 # F0 12 431 # F1-F7 4 432 # F8 13 433 # F9-FB 5 434 # FC 14 435 # FD 6 436 # FE 15 437 # FF 1 438 # 439 # Here's the I8 for the code points before which overlongs occur: 440 # U+4000: \xF0\xB0\xA0\xA0 441 # U+40000: \xF8\xA8\xA0\xA0\xA0 442 # U+400000: \xFC\xA4\xA0\xA0\xA0\xA0 443 # U+4000000: \xFE\xA2\xA0\xA0\xA0\xA0\xA0 444 # 445 # The first part of the table maps bytes to character classes to reduce 446 # the size of the transition table and create bitmasks. 447 # 448 # The second part is a transition table that maps a combination of a 449 # state of the automaton and a character class to a new state. The 450 # numbering of the original nodes is retained, but some have been split 451 # so that there are new nodes. They mean: 452 # N0 The initial state, and final accepting one. 453 # N1 One continuation byte (A0-BF) left. This is transitioned to 454 # immediately when the start byte indicates a two-byte sequence 455 # N2 Two continuation bytes left. 456 # N3 Three continuation bytes left. 457 # N4 Four continuation bytes left. 458 # N5 Five continuation bytes left. 459 # N6 Start byte is F0. Continuation bytes A[0-F] are illegal 460 # (overlong); the other continuations transition to N2 461 # N7 Start byte is F8. Continuation bytes A[0-7] are illegal 462 # (overlong); the other continuations transition to N3 463 # N8 Start byte is FC. Continuation bytes A[0-3] are illegal 464 # (overlong); the other continuations transition to N4 465 # N9 Start byte is FE. Continuation bytes A[01] are illegal 466 # (overlong); the other continuations transition to N5 467 # 1 Reject. All transitions not mentioned above (except the single 468 # byte ones (as they are always legal) are to this state. 469 470 my $NUM_CLASSES = 16; 471 my $N0 = 0; 472 my $N1 = $N0 + $NUM_CLASSES; 473 my $N2 = $N1 + $NUM_CLASSES; 474 my $N3 = $N2 + $NUM_CLASSES; 475 my $N4 = $N3 + $NUM_CLASSES; 476 my $N5 = $N4 + $NUM_CLASSES; 477 my $N6 = $N5 + $NUM_CLASSES; 478 my $N7 = $N6 + $NUM_CLASSES; 479 my $N8 = $N7 + $NUM_CLASSES; 480 my $N9 = $N8 + $NUM_CLASSES; 481 my $N10 = $N9 + $NUM_CLASSES; 482 483 my @perl_extended_utf8_dfa; 484 my @i8 = ( 485 # 0 1 2 3 4 5 6 7 8 9 A B C D E F 486 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 00-0F 487 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 10-1F 488 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 20-2F 489 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 30-3F 490 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 40-4F 491 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 50-5F 492 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 60-6F 493 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 70-7F 494 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 80-8F 495 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 90-9F 496 7, 7, 8, 8, 9, 9, 9, 9,10,10,10,10,10,10,10,10, # A0-AF 497 11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11, # B0-BF 498 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, # C0-CF 499 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, # D0-DF 500 1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, # E0-EF 501 12, 4, 4, 4, 4, 4, 4, 4,13, 5, 5, 5,14, 6,15, 1, # F0-FF 502 ); 503 $perl_extended_utf8_dfa[$i82utf[$_]] = $i8[$_] for (0 .. 255); 504 push @perl_extended_utf8_dfa, ( 505 # Class: 506 # 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 507 0, 1,$N1,$N2,$N3,$N4,$N5, 1, 1, 1, 1, 1,$N6,$N7,$N8,$N9, # N0 508 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, # N1 509 1, 1, 1, 1, 1, 1, 1,$N1,$N1,$N1,$N1,$N1, 1, 1, 1, 1, # N2 510 1, 1, 1, 1, 1, 1, 1,$N2,$N2,$N2,$N2,$N2, 1, 1, 1, 1, # N3 511 1, 1, 1, 1, 1, 1, 1,$N3,$N3,$N3,$N3,$N3, 1, 1, 1, 1, # N4 512 1, 1, 1, 1, 1, 1, 1,$N4,$N4,$N4,$N4,$N4, 1, 1, 1, 1, # N5 513 514 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,$N2, 1, 1, 1, 1, # N6 515 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,$N3,$N3, 1, 1, 1, 1, # N7 516 1, 1, 1, 1, 1, 1, 1, 1, 1,$N4,$N4,$N4, 1, 1, 1, 1, # N8 517 1, 1, 1, 1, 1, 1, 1, 1,$N5,$N5,$N5,$N5, 1, 1, 1, 1, # N9 518 ); 519 output_table(\@perl_extended_utf8_dfa, "PL_extended_utf8_dfa_tab", 520 $NUM_CLASSES); 521 } 522 523 { 524 # This generates the dfa table for strict UTF-8, which rejects 525 # surrogates, non-characters, and above Unicode. 526 # 527 # The classes are 528 # 00-9F 0 Always legal at start 529 # A0 10 Not legal immediately after start bytes F0 F8 530 # A1 11 Not legal immediately after start bytes F0 F8, 531 # A2-A7 12 Not legal immediately after start bytes F0 F8 F9 532 # A8,AA,AC 13 Not legal immediately after start bytes F0 F9 533 # A9,AB,AD 14 Not legal immediately after start byte F0 534 # AE 15 Not legal immediately after start byte F0 535 # AF 16 Not legal immediately after start bytes F0 536 # B[0248AC] 17 Not legal immediately after start byte F9 537 # B[1359D] 18 Not legal immediately after start byte F9 538 # B6 19 Not legal immediately after start byte F9 539 # B7 20 Not legal immediately after start byte F9 540 # BE 21 Not legal immediately after start byte F9 541 # BF 22 Not legal immediately after start byte F9 542 # C0-C4 1 (reject, all are overlong) 543 # C5-DF 2 Accepts any legal continuation 544 # E0 1 (reject, all are overlong) 545 # E1-EF 3 Accepts any legal continuation 546 # F0 8 (has overlongs) 547 # F1 6 (has surrogates, non-chars) 548 # F2,F4,F6 4 Accepts any legal continuation 549 # F3,F5,F7 5 (has non-chars) 550 # F8 9 (has overlongs, non-chars) 551 # F9 7 (has non-chars, non-Unicode) 552 # FA-FF 1 (reject, all are non-Unicode) 553 # 554 # Here's the I8 for enough code points so that you can figure out what's 555 # going on: 556 # 557 # U+D800: \xF1\xB6\xA0\xA0 558 # U+DFFF: \xF1\xB7\xBF\xBF 559 # U+FDD0: \xF1\xBF\xAE\xB0 560 # U+FDEF: \xF1\xBF\xAF\xAF 561 # U+FFFE: \xF1\xBF\xBF\xBE 562 # U+1FFFE: \xF3\xBF\xBF\xBE 563 # U+2FFFE: \xF5\xBF\xBF\xBE 564 # U+3FFFE: \xF7\xBF\xBF\xBE 565 # U+4FFFE: \xF8\xA9\xBF\xBF\xBE 566 # U+5FFFE: \xF8\xAB\xBF\xBF\xBE 567 # U+6FFFE: \xF8\xAD\xBF\xBF\xBE 568 # U+7FFFE: \xF8\xAF\xBF\xBF\xBE 569 # U+8FFFE: \xF8\xB1\xBF\xBF\xBE 570 # U+9FFFE: \xF8\xB3\xBF\xBF\xBE 571 # U+AFFFE: \xF8\xB5\xBF\xBF\xBE 572 # U+BFFFE: \xF8\xB7\xBF\xBF\xBE 573 # U+CFFFE: \xF8\xB9\xBF\xBF\xBE 574 # U+DFFFE: \xF8\xBB\xBF\xBF\xBE 575 # U+EFFFE: \xF8\xBD\xBF\xBF\xBE 576 # U+FFFFE: \xF8\xBF\xBF\xBF\xBE 577 # U+10FFFE: \xF9\xA1\xBF\xBF\xBE 578 # 579 # The first part of the table maps bytes to character classes to reduce 580 # the size of the transition table and create bitmasks. 581 # 582 # The second part is a transition table that maps a combination of a 583 # state of the automaton and a character class to a new state. The 584 # numbering of the original nodes is retained, but some have been split 585 # so that there are new nodes. They mean: 586 # N0 The initial state, and final accepting one. 587 # N1 One continuation byte (A0-BF) left. This is transitioned to 588 # immediately when the start byte indicates a two-byte sequence 589 # N2 Two continuation bytes left. 590 # N3 Three continuation bytes left. 591 # N4 Start byte is F0. Continuation bytes A[0-F] are illegal 592 # (overlong); the other continuations transition to N2 593 # N5 Start byte is F1. Continuation bytes B6 and B7 are illegal 594 # (surrogates); BF transitions to N9; the other continuations to 595 # N2 596 # N6 Start byte is F[357]. Continuation byte BF transitions to N12; 597 # other continuations to N2 598 # N7 Start byte is F8. Continuation bytes A[0-7] are illegal 599 # (overlong); continuations A[9BDF] and B[13579BDF] transition to 600 # N14; the other continuations to N3 601 # N8 Start byte is F9. Continuation byte A0 transitions to N3; A1 602 # to N14; the other continuation bytes are illegal. 603 # N9 Initial sequence is F1 BF. Continuation byte AE transitions to 604 # state N10; AF to N11; BF to N13; the other continuations to N1. 605 # N10 Initial sequence is F1 BF AE. Continuation bytes B0-BF are 606 # illegal (non-chars); the other continuations are legal 607 # N11 Initial sequence is F1 BF AF. Continuation bytes A0-AF are 608 # illegal (non-chars); the other continuations are legal 609 # N12 Initial sequence is F[357] BF. Continuation bytes BF 610 # transitions to N13; the other continuations to N1 611 # N13 Initial sequence is F[1357] BF BF or F8 x y BF (where x and y 612 # are something that can lead to a non-char. Continuation bytes 613 # BE and BF are illegal (non-chars); the other continuations are 614 # legal 615 # N14 Initial sequence is F8 A[9BDF]; or F8 B[13579BDF]; or F9 A1. 616 # Continuation byte BF transitions to N15; the other 617 # continuations to N2 618 # N15 Initial sequence is F8 A[9BDF] BF; or F8 B[13579BDF] BF; or 619 # F9 A1 BF. Continuation byte BF transitions to N16; the other 620 # continuations to N2 621 # N16 Initial sequence is F8 A[9BDF] BF BF; or F8 B[13579BDF] BF BF; 622 # or F9 A1 BF BF. Continuation bytes BE and BF are illegal 623 # (non-chars); the other continuations are legal 624 # 1 Reject. All transitions not mentioned above (except the single 625 # byte ones (as they are always legal) are to this state. 626 627 my $NUM_CLASSES = 23; 628 my $N0 = 0; 629 my $N1 = $N0 + $NUM_CLASSES; 630 my $N2 = $N1 + $NUM_CLASSES; 631 my $N3 = $N2 + $NUM_CLASSES; 632 my $N4 = $N3 + $NUM_CLASSES; 633 my $N5 = $N4 + $NUM_CLASSES; 634 my $N6 = $N5 + $NUM_CLASSES; 635 my $N7 = $N6 + $NUM_CLASSES; 636 my $N8 = $N7 + $NUM_CLASSES; 637 my $N9 = $N8 + $NUM_CLASSES; 638 my $N10 = $N9 + $NUM_CLASSES; 639 my $N11 = $N10 + $NUM_CLASSES; 640 my $N12 = $N11 + $NUM_CLASSES; 641 my $N13 = $N12 + $NUM_CLASSES; 642 my $N14 = $N13 + $NUM_CLASSES; 643 my $N15 = $N14 + $NUM_CLASSES; 644 645 my @strict_utf8_dfa; 646 my @i8 = ( 647 # 0 1 2 3 4 5 6 7 8 9 A B C D E F 648 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 00-0F 649 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 10-1F 650 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 20-2F 651 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 30-3F 652 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 40-4F 653 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 50-5F 654 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 60-6F 655 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 70-7F 656 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 80-8F 657 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 90-9F 658 10,11,12,12,12,12,12,12,13,14,13,14,13,14,15,16, # A0-AF 659 17,18,17,18,17,18,19,20,17,18,17,18,17,18,21,22, # B0-BF 660 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, # C0-CF 661 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, # D0-DF 662 1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, # E0-EF 663 8, 6, 4, 5, 4, 5, 4, 5, 9, 7, 1, 1, 1, 1, 1, 1, # F0-FF 664 ); 665 $strict_utf8_dfa[$i82utf[$_]] = $i8[$_] for (0 .. 255); 666 push @strict_utf8_dfa, ( 667 # Class: 668 # 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 669 0,1,$N1,$N2,$N3,$N6,$N5,$N8,$N4,$N7, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, # N0 670 1,1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # N1 671 1,1, 1, 1, 1, 1, 1, 1, 1, 1, $N1, $N1, $N1, $N1, $N1, $N1, $N1, $N1, $N1, $N1, $N1, $N1, $N1, # N2 672 1,1, 1, 1, 1, 1, 1, 1, 1, 1, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2, # N3 673 674 1,1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, $N2, $N2, $N2, $N2, $N2, $N2, # N4 675 1,1, 1, 1, 1, 1, 1, 1, 1, 1, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2, 1, 1, $N2, $N9, # N5 676 1,1, 1, 1, 1, 1, 1, 1, 1, 1, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2,$N12, # N6 677 1,1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, $N3,$N14, $N3,$N14, $N3,$N14, $N3,$N14, $N3,$N14, # N7 678 1,1, 1, 1, 1, 1, 1, 1, 1, 1, $N3,$N14, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, # N8 679 1,1, 1, 1, 1, 1, 1, 1, 1, 1, $N1, $N1, $N1, $N1, $N1,$N10,$N11, $N1, $N1, $N1, $N1, $N1,$N13, # N9 680 1,1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, # N10 681 1,1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, # N11 682 1,1, 1, 1, 1, 1, 1, 1, 1, 1, $N1, $N1, $N1, $N1, $N1, $N1, $N1, $N1, $N1, $N1, $N1, $N1,$N13, # N12 683 1,1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, # N13 684 1,1, 1, 1, 1, 1, 1, 1, 1, 1, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2,$N15, # N14 685 1,1, 1, 1, 1, 1, 1, 1, 1, 1, $N1, $N1, $N1, $N1, $N1, $N1, $N1, $N1, $N1, $N1, $N1, $N1,$N13, # N15 686 ); 687 output_table(\@strict_utf8_dfa, "PL_strict_utf8_dfa_tab", $NUM_CLASSES); 688 } 689 690 { 691 # This generates the dfa table for C9 strict UTF-8, which rejects 692 # surrogates and above Unicode, but allows non-characters,. 693 # 694 # The classes are 695 # 00-9F 0 Always legal at start 696 # A0-A1 9 Not legal immediately after start bytes F0 F8 697 # A2-A7 10 Not legal immediately after start bytes F0 F8 F9 698 # A8-AF 11 Not legal immediately after start bytes F0 F9 699 # B0-B5,B8-BF 12 Not legal immediately after start byte F9 700 # B6,B7 13 701 # C0-C4 1 (reject, all are overlong) 702 # C5-DF 2 Accepts any legal continuation 703 # E0 1 (reject, all are overlong) 704 # E1-EF 3 Accepts any legal continuation 705 # F0 6 (has overlongs) 706 # F1 5 (has surrogates) 707 # F2-F7 4 Accepts any legal continuation 708 # F8 8 (has overlongs) 709 # F9 7 (has non-Unicode) 710 # FA-FF 1 (reject, all are non-Unicode) 711 # 712 # The first part of the table maps bytes to character classes to reduce 713 # the size of the transition table and create bitmasks. 714 # 715 # The second part is a transition table that maps a combination of a 716 # state of the automaton and a character class to a new state. The 717 # numbering of the original nodes is retained, but some have been split 718 # so that there are new nodes. They mean: 719 # N0 The initial state, and final accepting one. 720 # N1 One continuation byte (A0-BF) left. This is transitioned to 721 # immediately when the start byte indicates a two-byte sequence 722 # N2 Two continuation bytes left. 723 # N3 Three continuation bytes left. 724 # N4 Start byte is F0. Continuation bytes A[0-F] are illegal 725 # (overlong); the other continuations transition to N2 726 # N5 Start byte is F1. B6 and B7 are illegal (surrogates); the 727 # other continuations transition to N2 728 # N6 Start byte is F8. Continuation bytes A[0-7] are illegal 729 # (overlong); the other continuations transition to N3 730 # N7 Start byte is F9. Continuation bytes A0 and A1 transition to 731 # N3; the other continuation bytes are illegal (non-Unicode) 732 # 1 Reject. All transitions not mentioned above (except the single 733 # byte ones (as they are always legal) are to this state. 734 735 my $NUM_CLASSES = 14; 736 my $N0 = 0; 737 my $N1 = $N0 + $NUM_CLASSES; 738 my $N2 = $N1 + $NUM_CLASSES; 739 my $N3 = $N2 + $NUM_CLASSES; 740 my $N4 = $N3 + $NUM_CLASSES; 741 my $N5 = $N4 + $NUM_CLASSES; 742 my $N6 = $N5 + $NUM_CLASSES; 743 my $N7 = $N6 + $NUM_CLASSES; 744 745 my @C9_utf8_dfa; 746 my @i8 = ( 747 # 0 1 2 3 4 5 6 7 8 9 A B C D E F 748 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 00-0F 749 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 10-1F 750 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 20-2F 751 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 30-3F 752 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 40-4F 753 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 50-5F 754 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 60-6F 755 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 70-7F 756 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 80-8F 757 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 90-9F 758 9, 9,10,10,10,10,10,10,11,11,11,11,11,11,11,11, # A0-AF 759 12,12,12,12,12,12,13,13,12,12,12,12,12,12,12,12, # B0-BF 760 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, # C0-CF 761 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, # D0-DF 762 1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, # E0-EF 763 6, 5, 4, 4, 4, 4, 4, 4, 8, 7, 1, 1, 1, 1, 1, 1, # F0-FF 764 ); 765 $C9_utf8_dfa[$i82utf[$_]] = $i8[$_] for (0 .. 255); 766 push @C9_utf8_dfa, ( 767 # Class: 768 # 0 1 2 3 4 5 6 7 8 9 10 11 12 13 769 0,1,$N1,$N2,$N3,$N5,$N4,$N7,$N6, 1, 1, 1, 1, 1, # N0 770 1,1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, # N1 771 1,1, 1, 1, 1, 1, 1, 1, 1,$N1, $N1, $N1, $N1, $N1, # N2 772 1,1, 1, 1, 1, 1, 1, 1, 1,$N2, $N2, $N2, $N2, $N2, # N3 773 774 1,1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, $N2, $N2, # N4 775 1,1, 1, 1, 1, 1, 1, 1, 1,$N2, $N2, $N2, $N2, 1, # N5 776 1,1, 1, 1, 1, 1, 1, 1, 1, 1, 1, $N3, $N3, $N3, # N6 777 1,1, 1, 1, 1, 1, 1, 1, 1,$N3, 1, 1, 1, 1, # N7 778 ); 779 output_table(\@C9_utf8_dfa, "PL_c9_utf8_dfa_tab", $NUM_CLASSES); 780 } 781 782 print $out_fh get_conditional_compile_line_end(); 783} 784 785print $out_fh "\n#endif /* PERL_EBCDIC_TABLES_H_ */\n"; 786 787read_only_bottom_close_and_rename($out_fh); 788