1use v5.16.0; 2use strict; 3use warnings; 4use integer; 5 6BEGIN { unshift @INC, '.' } 7 8require './regen/regen_lib.pl'; 9require './regen/charset_translations.pl'; 10 11# Generates the EBCDIC translation tables that were formerly hard-coded into 12# utfebcdic.h 13 14my $out_fh = open_new('ebcdic_tables.h', '>', 15 {style => '*', by => $0, }); 16 17sub get_column_headers ($$;$) { 18 my ($row_hdr_len, $field_width, $dfa_columns) = @_; 19 my $format; 20 my $final_column_format; 21 my $num_columns; 22 23 if (defined $dfa_columns) { 24 $num_columns = $dfa_columns; 25 26 # Trailing blank to correspond with commas in the rows below 27 $format = "%${field_width}d "; 28 } 29 else { # Is a regular table 30 $num_columns = 16; 31 32 # Use blanks to separate the fields 33 $format = " " x ( $field_width 34 - 2); # For the '_X' 35 $format .= "_%X "; # Again, trailing blank over the commas below 36 } 37 38 my $header = "/*" . " " x ($row_hdr_len - length "/*"); 39 40 # All but the final column 41 $header .= sprintf($format, $_) for 0 .. $num_columns - 2; 42 43 # Get rid of trailing blank, so that the final column takes up one less 44 # space so that the "*/" doesn't extend past the commas in the rows below 45 chop $header; 46 $header .= sprintf $format, $num_columns - 1; 47 48 # Again, remove trailing blank 49 chop $header; 50 51 return $header . "*/\n"; 52} 53 54sub output_table_start($$$) { 55 my ($out_fh, $TYPE, $name) = @_; 56 57 my $declaration = "EXTCONST $TYPE $name\[\]"; 58 print $out_fh <<EOF; 59# ifndef DOINIT 60 $declaration; 61# else 62 $declaration = { 63EOF 64} 65 66sub output_table_end($) { 67 print $out_fh "};\n# endif\n\n"; 68} 69 70sub output_table ($$;$) { 71 my $table_ref = shift; 72 my $name = shift; 73 74 # 0 => print in decimal 75 # 1 => print in hex (translates code point to code point) 76 # >= 2 => is a dfa table, like http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ 77 # The number is how many columns in the part after the code point 78 # portion. 79 # 80 # code point tables in hex areasier to debug, but don't fit into 80 81 # columns 82 my $type = shift // 1; 83 84 my $print_in_hex = $type == 1; 85 my $is_dfa = ($type >= 2) ? $type : 0; 86 my $columns_after_256 = 16; 87 88 die "Requres 256 entries in table $name, got @$table_ref" 89 if ! $is_dfa && @$table_ref != 256; 90 if (! $is_dfa) { 91 die "Requres 256 entries in table $name, got @$table_ref" 92 if @$table_ref != 256; 93 } 94 else { 95 $columns_after_256 = $is_dfa; 96 97 print $out_fh <<'EOF'; 98 99/* The table below is adapted from 100 * http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ 101 * See copyright notice at the beginning of this file. 102 */ 103 104EOF 105 } 106 107 # Highest number in the table 108 my $max_entry = 0; 109 $max_entry = map { $_ > $max_entry ? $_ : $max_entry } @$table_ref; 110 111 # We assume that every table has at least one two digit entry, and none 112 # are more than three digit. 113 my $field_width = ($print_in_hex) 114 ? 4 115 : (($max_entry) > 99 ? 3 : 2); 116 117 my $row_hdr_length; 118 my $node_number_field_width; 119 my $node_value_field_width; 120 121 # dfa tables have a special header for the rows in the transitions part of 122 # the table. It is longer than the regular one. 123 if ($is_dfa) { 124 my $max_node_number = ($max_entry - 256) / $columns_after_256 - 1; 125 $node_number_field_width = ($max_node_number > 9) ? 2 : 1; 126 $node_value_field_width = ($max_node_number * $columns_after_256 > 99) 127 ? 3 : 2; 128 # The header starts with this template, and adds in the number of 129 # digits needed to represent the maximum node number and its value 130 $row_hdr_length = length("/*N=*/") 131 + $node_number_field_width 132 + $node_value_field_width; 133 } 134 else { 135 $row_hdr_length = length "/*_X*/"; # Template for what the header 136 # looks like 137 } 138 139 # The table may not be representable in 8 bits. 140 my $TYPE = 'U8'; 141 $TYPE = 'U16' if grep { $_ > 255 } @$table_ref; 142 143 output_table_start $out_fh, $TYPE, $name; 144 145 # First the headers for the columns 146 print $out_fh get_column_headers($row_hdr_length, $field_width); 147 148 # Now the table body 149 my $count = @$table_ref; 150 my $last_was_nl = 1; 151 152 # Print each element individually, arranged in rows of columns 153 for my $i (0 .. $count - 1) { 154 155 # Node number for here is -1 until get into the dfa state transitions 156 my $node = ($i < 256) ? -1 : ($i - 256) / $columns_after_256; 157 158 # Print row header at beginning of each row 159 if ($last_was_nl) { 160 if ($node >= 0) { 161 printf $out_fh "/*N%-*d=%*d*/", $node_number_field_width, $node, 162 $node_value_field_width, $i - 256; 163 } 164 else { # Otherwise is regular row; print its number 165 printf $out_fh "/*%X_", $i / 16; 166 167 # These rows in a dfa table require extra space so columns 168 # will align vertically (because the Ndd=ddd requires extra 169 # space) 170 if ($is_dfa) { 171 print $out_fh " " x ( $node_number_field_width 172 + $node_value_field_width); 173 } 174 print $out_fh "*/"; 175 } 176 } 177 178 if ($print_in_hex) { 179 printf $out_fh "0x%02X", $table_ref->[$i]; 180 } 181 else { 182 printf $out_fh "%${field_width}d", $table_ref->[$i]; 183 } 184 185 print $out_fh ",", if $i < $count -1; # No comma on final entry 186 187 # Add \n if at end of row, which is 16 columns until we get to the 188 # transitions part 189 if ( ($node < 0 && $i % 16 == 15) 190 || ($node >= 0 && ($i -256) % $columns_after_256 191 == $columns_after_256 - 1)) 192 { 193 print $out_fh "\n"; 194 $last_was_nl = 1; 195 } 196 else { 197 $last_was_nl = 0; 198 } 199 } 200 201 # Print column footer 202 print $out_fh get_column_headers($row_hdr_length, $field_width, 203 ($is_dfa) ? $columns_after_256 : undef); 204 205 output_table_end($out_fh); 206} 207 208print $out_fh <<'END'; 209 210#ifndef PERL_EBCDIC_TABLES_H_ /* Guard against nested #includes */ 211#define PERL_EBCDIC_TABLES_H_ 1 212 213/* This file contains definitions for various tables used in EBCDIC handling. 214 * More info is in utfebcdic.h 215 * 216 * Some of the tables are adapted from 217 * http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ 218 * which requires this copyright notice: 219 220Copyright (c) 2008-2009 Bjoern Hoehrmann <bjoern@hoehrmann.de> 221 222Permission is hereby granted, free of charge, to any person obtaining a copy of 223this software and associated documentation files (the "Software"), to deal in 224the Software without restriction, including without limitation the rights to 225use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies 226of the Software, and to permit persons to whom the Software is furnished to do 227so, subject to the following conditions: 228 229The above copyright notice and this permission notice shall be included in all 230copies or substantial portions of the Software. 231 232THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 233IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 234FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 235AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 236LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 237OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 238SOFTWARE. 239 240*/ 241END 242 243my @charsets = get_supported_code_pages(); 244shift @charsets; # ASCII is the 0th, and we don't deal with that here. 245foreach my $charset (@charsets) { 246 # we process the whole array several times, make a copy 247 my @a2e = @{get_a2n($charset)}; 248 249 print $out_fh "\n" . get_conditional_compile_line_start($charset); 250 print $out_fh "\n"; 251 252 print $out_fh "/* Index is ASCII platform code point; value is $charset equivalent */\n"; 253 output_table(\@a2e, "PL_a2e"); 254 255 { # Construct the inverse 256 my @e2a; 257 for my $i (0 .. 255) { 258 $e2a[$a2e[$i]] = $i; 259 } 260 print $out_fh "/* Index is $charset code point; value is ASCII platform equivalent */\n"; 261 output_table(\@e2a, "PL_e2a"); 262 } 263 264 my @i82utf = @{get_I8_2_utf($charset)}; 265 print $out_fh <<END; 266/* (Confusingly named) Index is $charset I8 byte; value is 267 * $charset UTF-EBCDIC equivalent */ 268END 269 output_table(\@i82utf, "PL_utf2e"); 270 271 { #Construct the inverse 272 my @utf2i8; 273 for my $i (0 .. 255) { 274 $utf2i8[$i82utf[$i]] = $i; 275 } 276 print $out_fh <<END; 277/* (Confusingly named) Index is $charset UTF-EBCDIC byte; value is 278 * $charset I8 equivalent */ 279END 280 output_table(\@utf2i8, "PL_e2utf"); 281 } 282 283 { 284 my @utf8skip; 285 286 # These are invariants or continuation bytes. 287 for my $i (0 .. 0xBF) { 288 $utf8skip[$i82utf[$i]] = 1; 289 } 290 291 # These are start bytes; The skip is the number of consecutive highest 292 # order 1-bits (up to 7) 293 for my $i (0xC0 .. 255) { 294 my $count; 295 if ($i == 0b11111111) { 296 no warnings 'once'; 297 $count = $CHARSET_TRANSLATIONS::UTF_EBCDIC_MAXBYTES; 298 } 299 elsif (($i & 0b11111110) == 0b11111110) { 300 $count= 7; 301 } 302 elsif (($i & 0b11111100) == 0b11111100) { 303 $count= 6; 304 } 305 elsif (($i & 0b11111000) == 0b11111000) { 306 $count= 5; 307 } 308 elsif (($i & 0b11110000) == 0b11110000) { 309 $count= 4; 310 } 311 elsif (($i & 0b11100000) == 0b11100000) { 312 $count= 3; 313 } 314 elsif (($i & 0b11000000) == 0b11000000) { 315 $count= 2; 316 } 317 else { 318 die "Something wrong for UTF8SKIP calculation for $i"; 319 } 320 $utf8skip[$i82utf[$i]] = $count; 321 } 322 323 print $out_fh <<END; 324/* Index is $charset UTF-EBCDIC byte; value is UTF8SKIP for start bytes 325 * (including for overlongs); 1 for continuation. Adapted from the shadow 326 * flags table in tr16. The entries marked 9 in tr16 are continuation bytes 327 * and are marked as length 1 here so that we can recover. */ 328END 329 output_table(\@utf8skip, "PL_utf8skip", 0); # The 0 means don't print 330 # in hex 331 } 332 333 use feature 'unicode_strings'; 334 335 { 336 my @lc; 337 for my $i (0 .. 255) { 338 $lc[$a2e[$i]] = $a2e[ord lc chr $i]; 339 } 340 print $out_fh 341 "/* Index is $charset code point; value is its lowercase equivalent */\n"; 342 output_table(\@lc, "PL_latin1_lc"); 343 } 344 345 { 346 my @uc; 347 for my $i (0 .. 255) { 348 my $uc = uc chr $i; 349 if (length $uc > 1 || ord $uc > 255) { 350 $uc = "\N{LATIN SMALL LETTER Y WITH DIAERESIS}"; 351 } 352 $uc[$a2e[$i]] = $a2e[ord $uc]; 353 } 354 print $out_fh <<END; 355/* Index is $charset code point; value is its uppercase equivalent. 356 * The 'mod' in the name means that codepoints whose uppercase is above 255 or 357 * longer than 1 character map to LATIN SMALL LETTER Y WITH DIARESIS */ 358END 359 output_table(\@uc, "PL_mod_latin1_uc"); 360 } 361 362 { # PL_fold 363 my @ascii_fold; 364 for my $i (0 .. 255) { # Initialise to identity map 365 $ascii_fold[$i] = $i; 366 } 367 368 # Overwrite the entries that aren't identity 369 for my $chr ('A' .. 'Z') { 370 $ascii_fold[$a2e[ord $chr]] = $a2e[ord lc $chr]; 371 } 372 for my $chr ('a' .. 'z') { 373 $ascii_fold[$a2e[ord $chr]] = $a2e[ord uc $chr]; 374 } 375 print $out_fh <<END; 376/* Index is $charset code point; For A-Z, value is a-z; for a-z, value 377 * is A-Z; all other code points map to themselves */ 378END 379 output_table(\@ascii_fold, "PL_fold"); 380 } 381 382 { 383 my @latin1_fold; 384 for my $i (0 .. 255) { 385 my $char = chr $i; 386 my $lc = lc $char; 387 388 # lc and uc adequately proxy for fold-case pairs in this 0-255 389 # range 390 my $uc = uc $char; 391 $uc = $char if length $uc > 1 || ord $uc > 255; 392 if ($lc ne $char) { 393 $latin1_fold[$a2e[$i]] = $a2e[ord $lc]; 394 } 395 elsif ($uc ne $char) { 396 $latin1_fold[$a2e[$i]] = $a2e[ord $uc]; 397 } 398 else { 399 $latin1_fold[$a2e[$i]] = $a2e[$i]; 400 } 401 } 402 print $out_fh <<END; 403/* Index is $charset code point; value is its other fold-pair equivalent 404 * (A => a; a => A, etc) in the 0-255 range. If no such equivalent, value is 405 * the code point itself */ 406END 407 output_table(\@latin1_fold, "PL_fold_latin1"); 408 } 409 410 { 411 # This generates the dfa table for perl extended UTF-8, which accepts 412 # surrogates, non-characters, and accepts start bytes up through FE 413 # (start byte FF has to be handled outside this dfa). The class numbers 414 # for start bytes are constrained so that they can be used as a shift 415 # count for masking off the leading one bits 416 # 417 # The classes are 418 # 00-9F 0 419 # A0-A1 7 Not legal immediately after start bytes F0 F8 FC 420 # FE 421 # A2-A3 8 Not legal immediately after start bytes F0 F8 FC 422 # A4-A7 9 Not legal immediately after start bytes F0 F8 423 # A8-AF 10 Not legal immediately after start bytes F0 424 # B0-BF 11 425 # C0-C4 1 426 # C5-DF 2 427 # E0 1 428 # E1-EF 3 429 # F0 12 430 # F1-F7 4 431 # F8 13 432 # F9-FB 5 433 # FC 14 434 # FD 6 435 # FE 15 436 # FF 1 437 # 438 # Here's the I8 for the code points before which overlongs occur: 439 # U+4000: \xF0\xB0\xA0\xA0 440 # U+40000: \xF8\xA8\xA0\xA0\xA0 441 # U+400000: \xFC\xA4\xA0\xA0\xA0\xA0 442 # U+4000000: \xFE\xA2\xA0\xA0\xA0\xA0\xA0 443 # 444 # The first part of the table maps bytes to character classes to reduce 445 # the size of the transition table and create bitmasks. 446 # 447 # The second part is a transition table that maps a combination of a 448 # state of the automaton and a character class to a new state. The 449 # numbering of the original nodes is retained, but some have been split 450 # so that there are new nodes. They mean: 451 # N0 The initial state, and final accepting one. 452 # N1 One continuation byte (A0-BF) left. This is transitioned to 453 # immediately when the start byte indicates a two-byte sequence 454 # N2 Two continuation bytes left. 455 # N3 Three continuation bytes left. 456 # N4 Four continuation bytes left. 457 # N5 Five continuation bytes left. 458 # N6 Start byte is F0. Continuation bytes A[0-F] are illegal 459 # (overlong); the other continuations transition to N2 460 # N7 Start byte is F8. Continuation bytes A[0-7] are illegal 461 # (overlong); the other continuations transition to N3 462 # N8 Start byte is FC. Continuation bytes A[0-3] are illegal 463 # (overlong); the other continuations transition to N4 464 # N9 Start byte is FE. Continuation bytes A[01] are illegal 465 # (overlong); the other continuations transition to N5 466 # 1 Reject. All transitions not mentioned above (except the single 467 # byte ones (as they are always legal) are to this state. 468 469 my $NUM_CLASSES = 16; 470 my $N0 = 0; 471 my $N1 = $N0 + $NUM_CLASSES; 472 my $N2 = $N1 + $NUM_CLASSES; 473 my $N3 = $N2 + $NUM_CLASSES; 474 my $N4 = $N3 + $NUM_CLASSES; 475 my $N5 = $N4 + $NUM_CLASSES; 476 my $N6 = $N5 + $NUM_CLASSES; 477 my $N7 = $N6 + $NUM_CLASSES; 478 my $N8 = $N7 + $NUM_CLASSES; 479 my $N9 = $N8 + $NUM_CLASSES; 480 my $N10 = $N9 + $NUM_CLASSES; 481 482 my @perl_extended_utf8_dfa; 483 my @i8 = ( 484 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 00-0F 485 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 10-1F 486 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 20-2F 487 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 30-3F 488 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 40-4F 489 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 50-5F 490 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 60-6F 491 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 70-7F 492 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 80-8F 493 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 90-9F 494 7, 7, 8, 8, 9, 9, 9, 9,10,10,10,10,10,10,10,10, # A0-AF 495 11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11, # B0-BF 496 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, # C0-CF 497 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, # D0-DF 498 1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, # E0-EF 499 12, 4, 4, 4, 4, 4, 4, 4,13, 5, 5, 5,14, 6,15, 1, # F0-FF 500 ); 501 $perl_extended_utf8_dfa[$i82utf[$_]] = $i8[$_] for (0 .. 255); 502 push @perl_extended_utf8_dfa, ( 503 # Class: 504 # 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 505 0, 1,$N1,$N2,$N3,$N4,$N5, 1, 1, 1, 1, 1,$N6,$N7,$N8,$N9, # N0 506 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, # N1 507 1, 1, 1, 1, 1, 1, 1,$N1,$N1,$N1,$N1,$N1, 1, 1, 1, 1, # N2 508 1, 1, 1, 1, 1, 1, 1,$N2,$N2,$N2,$N2,$N2, 1, 1, 1, 1, # N3 509 1, 1, 1, 1, 1, 1, 1,$N3,$N3,$N3,$N3,$N3, 1, 1, 1, 1, # N4 510 1, 1, 1, 1, 1, 1, 1,$N4,$N4,$N4,$N4,$N4, 1, 1, 1, 1, # N5 511 512 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,$N2, 1, 1, 1, 1, # N6 513 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,$N3,$N3, 1, 1, 1, 1, # N7 514 1, 1, 1, 1, 1, 1, 1, 1, 1,$N4,$N4,$N4, 1, 1, 1, 1, # N8 515 1, 1, 1, 1, 1, 1, 1, 1,$N5,$N5,$N5,$N5, 1, 1, 1, 1, # N9 516 ); 517 output_table(\@perl_extended_utf8_dfa, "PL_extended_utf8_dfa_tab", 518 $NUM_CLASSES); 519 } 520 521 { 522 # This generates the dfa table for strict UTF-8, which rejects 523 # surrogates, non-characters, and above Unicode. 524 # 525 # The classes are 526 # 00-9F 0 Always legal at start 527 # A0 10 Not legal immediately after start bytes F0 F8 528 # A1 11 Not legal immediately after start bytes F0 F8, 529 # A2-A7 12 Not legal immediately after start bytes F0 F8 F9 530 # A8,AA,AC 13 Not legal immediately after start bytes F0 F9 531 # A9,AB,AD 14 Not legal immediately after start byte F0 532 # AE 15 Not legal immediately after start byte F0 533 # AF 16 Not legal immediately after start bytes F0 534 # B[0248AC] 17 Not legal immediately after start byte F9 535 # B[1359D] 18 Not legal immediately after start byte F9 536 # B6 19 Not legal immediately after start byte F9 537 # B7 20 Not legal immediately after start byte F9 538 # BE 21 Not legal immediately after start byte F9 539 # BF 22 Not legal immediately after start byte F9 540 # C0-C4 1 (reject, all are overlong) 541 # C5-DF 2 Accepts any legal continuation 542 # E0 1 (reject, all are overlong) 543 # E1-EF 3 Accepts any legal continuation 544 # F0 8 (has overlongs) 545 # F1 6 (has surrogates, non-chars) 546 # F2,F4,F6 4 Accepts any legal continuation 547 # F3,F5,F7 5 (has non-chars) 548 # F8 9 (has overlongs, non-chars) 549 # F9 7 (has non-chars, non-Unicode) 550 # FA-FF 1 (reject, all are non-Unicode) 551 # 552 # Here's the I8 for enough code points so that you can figure out what's 553 # going on: 554 # 555 # U+D800: \xF1\xB6\xA0\xA0 556 # U+DFFF: \xF1\xB7\xBF\xBF 557 # U+FDD0: \xF1\xBF\xAE\xB0 558 # U+FDEF: \xF1\xBF\xAF\xAF 559 # U+FFFE: \xF1\xBF\xBF\xBE 560 # U+1FFFE: \xF3\xBF\xBF\xBE 561 # U+2FFFE: \xF5\xBF\xBF\xBE 562 # U+3FFFE: \xF7\xBF\xBF\xBE 563 # U+4FFFE: \xF8\xA9\xBF\xBF\xBE 564 # U+5FFFE: \xF8\xAB\xBF\xBF\xBE 565 # U+6FFFE: \xF8\xAD\xBF\xBF\xBE 566 # U+7FFFE: \xF8\xAF\xBF\xBF\xBE 567 # U+8FFFE: \xF8\xB1\xBF\xBF\xBE 568 # U+9FFFE: \xF8\xB3\xBF\xBF\xBE 569 # U+AFFFE: \xF8\xB5\xBF\xBF\xBE 570 # U+BFFFE: \xF8\xB7\xBF\xBF\xBE 571 # U+CFFFE: \xF8\xB9\xBF\xBF\xBE 572 # U+DFFFE: \xF8\xBB\xBF\xBF\xBE 573 # U+EFFFE: \xF8\xBD\xBF\xBF\xBE 574 # U+FFFFE: \xF8\xBF\xBF\xBF\xBE 575 # U+10FFFE: \xF9\xA1\xBF\xBF\xBE 576 # 577 # The first part of the table maps bytes to character classes to reduce 578 # the size of the transition table and create bitmasks. 579 # 580 # The second part is a transition table that maps a combination of a 581 # state of the automaton and a character class to a new state. The 582 # numbering of the original nodes is retained, but some have been split 583 # so that there are new nodes. They mean: 584 # N0 The initial state, and final accepting one. 585 # N1 One continuation byte (A0-BF) left. This is transitioned to 586 # immediately when the start byte indicates a two-byte sequence 587 # N2 Two continuation bytes left. 588 # N3 Three continuation bytes left. 589 # N4 Start byte is F0. Continuation bytes A[0-F] are illegal 590 # (overlong); the other continuations transition to N2 591 # N5 Start byte is F1. Continuation bytes B6 and B7 are illegal 592 # (surrogates); BF transitions to N9; the other continuations to 593 # N2 594 # N6 Start byte is F[357]. Continuation byte BF transitions to N12; 595 # other continuations to N2 596 # N5 Start byte is F8. Continuation bytes A[0-7] are illegal 597 # (overlong); continuations A[9BDF] and B[13579BDF] transition to 598 # N14; the other continuations to N3 599 # N8 Start byte is F9. Continuation byte A0 transitions to N3; A1 600 # to N14; the other continuation bytes are illegal. 601 # N9 Initial sequence is F1 BF. Continuation byte AE transitions to 602 # state N10; AF to N11; BF to N13; the other continuations to N1. 603 # N10 Initial sequence is F1 BF AE. Continuation bytes B0-BF are 604 # illegal (non-chars); the other continuations are legal 605 # N11 Initial sequence is F1 BF AF. Continuation bytes A0-AF are 606 # illegal (non-chars); the other continuations are legal 607 # N12 Initial sequence is F[357] BF. Continuation bytes BF 608 # transitions to N13; the other continuations to N1 609 # N13 Initial sequence is F[1357] BF BF or F8 x BF (where x is 610 # something that can lead to a non-char. Continuation bytes BE 611 # and BF are illegal (non-chars); the other continuations are 612 # legal 613 # N14 Initial sequence is F8 A[9BDF]; or F8 B[13579BDF]; or F9 A1. 614 # Continuation byte BF transitions to N13; the other 615 # continuations to N2 616 # 1 Reject. All transitions not mentioned above (except the single 617 # byte ones (as they are always legal) are to this state. 618 619 my $NUM_CLASSES = 23; 620 my $N0 = 0; 621 my $N1 = $N0 + $NUM_CLASSES; 622 my $N2 = $N1 + $NUM_CLASSES; 623 my $N3 = $N2 + $NUM_CLASSES; 624 my $N4 = $N3 + $NUM_CLASSES; 625 my $N5 = $N4 + $NUM_CLASSES; 626 my $N6 = $N5 + $NUM_CLASSES; 627 my $N7 = $N6 + $NUM_CLASSES; 628 my $N8 = $N7 + $NUM_CLASSES; 629 my $N9 = $N8 + $NUM_CLASSES; 630 my $N10 = $N9 + $NUM_CLASSES; 631 my $N11 = $N10 + $NUM_CLASSES; 632 my $N12 = $N11 + $NUM_CLASSES; 633 my $N13 = $N12 + $NUM_CLASSES; 634 my $N14 = $N13 + $NUM_CLASSES; 635 636 my @strict_utf8_dfa; 637 my @i8 = ( 638 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 00-0F 639 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 10-1F 640 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 20-2F 641 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 30-3F 642 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 40-4F 643 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 50-5F 644 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 60-6F 645 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 70-7F 646 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 80-8F 647 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 90-9F 648 10,11,12,12,12,12,12,12,13,14,13,14,13,14,15,16, # A0-AF 649 17,18,17,18,17,18,19,20,17,18,17,18,17,18,21,22, # B0-BF 650 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, # C0-CF 651 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, # D0-DF 652 1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, # E0-EF 653 8, 6, 4, 5, 4, 5, 4, 5, 9, 7, 1, 1, 1, 1, 1, 1, # F0-FF 654 ); 655 $strict_utf8_dfa[$i82utf[$_]] = $i8[$_] for (0 .. 255); 656 push @strict_utf8_dfa, ( 657 # Class: 658 # 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 659 0,1,$N1,$N2,$N3,$N6,$N5,$N8,$N4,$N7, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, # N0 660 1,1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # N1 661 1,1, 1, 1, 1, 1, 1, 1, 1, 1, $N1, $N1, $N1, $N1, $N1, $N1, $N1, $N1, $N1, $N1, $N1, $N1, $N1, # N2 662 1,1, 1, 1, 1, 1, 1, 1, 1, 1, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2, # N3 663 664 1,1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, $N2, $N2, $N2, $N2, $N2, $N2, # N4 665 1,1, 1, 1, 1, 1, 1, 1, 1, 1, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2, 1, 1, $N2, $N9, # N5 666 1,1, 1, 1, 1, 1, 1, 1, 1, 1, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2,$N12, # N6 667 1,1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, $N3,$N14, $N3,$N14, $N3,$N14, $N3,$N14, $N3,$N14, # N7 668 1,1, 1, 1, 1, 1, 1, 1, 1, 1, $N3,$N14, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, # N8 669 1,1, 1, 1, 1, 1, 1, 1, 1, 1, $N1, $N1, $N1, $N1, $N1,$N10,$N11, $N1, $N1, $N1, $N1, $N1,$N13, # N9 670 1,1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, # N10 671 1,1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, # N11 672 1,1, 1, 1, 1, 1, 1, 1, 1, 1, $N1, $N1, $N1, $N1, $N1, $N1, $N1, $N1, $N1, $N1, $N1, $N1,$N13, # N12 673 1,1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, # N13 674 1,1, 1, 1, 1, 1, 1, 1, 1, 1, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2,$N13, # N14 675 ); 676 output_table(\@strict_utf8_dfa, "PL_strict_utf8_dfa_tab", $NUM_CLASSES); 677 } 678 679 { 680 # This generates the dfa table for C9 strict UTF-8, which rejects 681 # surrogates and above Unicode, but allows non-characters,. 682 # 683 # The classes are 684 # 00-9F 0 Always legal at start 685 # A0-A1 9 Not legal immediately after start bytes F0 F8 686 # A2-A7 10 Not legal immediately after start bytes F0 F8 F9 687 # A8-AF 11 Not legal immediately after start bytes F0 F9 688 # B0-B5,B8-BF 12 Not legal immediately after start byte F9 689 # B6,B7 13 690 # C0-C4 1 (reject, all are overlong) 691 # C5-DF 2 Accepts any legal continuation 692 # E0 1 (reject, all are overlong) 693 # E1-EF 3 Accepts any legal continuation 694 # F0 6 (has overlongs) 695 # F1 5 (has surrogates) 696 # F2-F7 4 Accepts any legal continuation 697 # F8 8 (has overlongs) 698 # F9 7 (has non-Unicode) 699 # FA-FF 1 (reject, all are non-Unicode) 700 # 701 # The first part of the table maps bytes to character classes to reduce 702 # the size of the transition table and create bitmasks. 703 # 704 # The second part is a transition table that maps a combination of a 705 # state of the automaton and a character class to a new state. The 706 # numbering of the original nodes is retained, but some have been split 707 # so that there are new nodes. They mean: 708 # N0 The initial state, and final accepting one. 709 # N1 One continuation byte (A0-BF) left. This is transitioned to 710 # immediately when the start byte indicates a two-byte sequence 711 # N2 Two continuation bytes left. 712 # N3 Three continuation bytes left. 713 # N4 Start byte is F0. Continuation bytes A[0-F] are illegal 714 # (overlong); the other continuations transition to N2 715 # N5 Start byte is F1. B6 and B7 are illegal (surrogates); the 716 # other continuations transition to N2 717 # N6 Start byte is F8. Continuation bytes A[0-7] are illegal 718 # (overlong); the other continuations transition to N3 719 # N7 Start byte is F9. Continuation bytes A0 and A1 transition to 720 # N3; the other continuation bytes are illegal (non-Unicode) 721 # 1 Reject. All transitions not mentioned above (except the single 722 # byte ones (as they are always legal) are to this state. 723 724 my $NUM_CLASSES = 14; 725 my $N0 = 0; 726 my $N1 = $N0 + $NUM_CLASSES; 727 my $N2 = $N1 + $NUM_CLASSES; 728 my $N3 = $N2 + $NUM_CLASSES; 729 my $N4 = $N3 + $NUM_CLASSES; 730 my $N5 = $N4 + $NUM_CLASSES; 731 my $N6 = $N5 + $NUM_CLASSES; 732 my $N7 = $N6 + $NUM_CLASSES; 733 734 my @C9_utf8_dfa; 735 my @i8 = ( 736 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 00-0F 737 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 10-1F 738 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 20-2F 739 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 30-3F 740 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 40-4F 741 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 50-5F 742 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 60-6F 743 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 70-7F 744 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 80-8F 745 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 90-9F 746 9, 9,10,10,10,10,10,10,11,11,11,11,11,11,11,11, # A0-AF 747 12,12,12,12,12,12,13,13,12,12,12,12,12,12,12,12, # B0-BF 748 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, # C0-CF 749 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, # D0-DF 750 1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, # E0-EF 751 6, 5, 4, 4, 4, 4, 4, 4, 8, 7, 1, 1, 1, 1, 1, 1, # F0-FF 752 ); 753 $C9_utf8_dfa[$i82utf[$_]] = $i8[$_] for (0 .. 255); 754 push @C9_utf8_dfa, ( 755 # Class: 756 # 0 1 2 3 4 5 6 7 8 9 10 11 12 13 757 0,1,$N1,$N2,$N3,$N5,$N4,$N7,$N6, 1, 1, 1, 1, 1, # N0 758 1,1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, # N1 759 1,1, 1, 1, 1, 1, 1, 1, 1,$N1, $N1, $N1, $N1, $N1, # N2 760 1,1, 1, 1, 1, 1, 1, 1, 1,$N2, $N2, $N2, $N2, $N2, # N3 761 762 1,1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, $N2, $N2, # N4 763 1,1, 1, 1, 1, 1, 1, 1, 1,$N2, $N2, $N2, $N2, 1, # N5 764 1,1, 1, 1, 1, 1, 1, 1, 1, 1, 1, $N3, $N3, $N3, # N6 765 1,1, 1, 1, 1, 1, 1, 1, 1,$N3, 1, 1, 1, 1, # N7 766 ); 767 output_table(\@C9_utf8_dfa, "PL_c9_utf8_dfa_tab", $NUM_CLASSES); 768 } 769 770 print $out_fh get_conditional_compile_line_end(); 771} 772 773print $out_fh "\n#endif /* PERL_EBCDIC_TABLES_H_ */\n"; 774 775read_only_bottom_close_and_rename($out_fh); 776