xref: /openbsd/gnu/usr.bin/perl/regen/ebcdic.pl (revision 097a140d)
1use v5.16.0;
2use strict;
3use warnings;
4use integer;
5
6BEGIN { unshift @INC, '.' }
7
8require './regen/regen_lib.pl';
9require './regen/charset_translations.pl';
10
11# Generates the EBCDIC translation tables that were formerly hard-coded into
12# utfebcdic.h
13
14my $out_fh = open_new('ebcdic_tables.h', '>',
15        {style => '*', by => $0, });
16
17sub get_column_headers ($$;$) {
18    my ($row_hdr_len, $field_width, $dfa_columns) = @_;
19    my $format;
20    my $final_column_format;
21    my $num_columns;
22
23    if (defined $dfa_columns) {
24        $num_columns = $dfa_columns;
25
26        # Trailing blank to correspond with commas in the rows below
27        $format = "%${field_width}d ";
28    }
29    else {  # Is a regular table
30        $num_columns = 16;
31
32        # Use blanks to separate the fields
33        $format = " " x ( $field_width
34                        - 2);               # For the '_X'
35        $format .= "_%X ";  # Again, trailing blank over the commas below
36    }
37
38    my $header = "/*" . " " x ($row_hdr_len - length "/*");
39
40    # All but the final column
41    $header .= sprintf($format, $_) for 0 .. $num_columns - 2;
42
43     # Get rid of trailing blank, so that the final column takes up one less
44     # space so that the "*/" doesn't extend past the commas in the rows below
45    chop $header;
46    $header .= sprintf $format, $num_columns - 1;
47
48    # Again, remove trailing blank
49    chop $header;
50
51    return $header . "*/\n";
52}
53
54sub output_table_start($$$;$) {
55    my ($out_fh, $TYPE, $name, $size) = @_;
56
57    $size = "" unless defined $size;
58    my $declaration = "EXTCONST $TYPE $name\[$size\]";
59    print $out_fh <<EOF;
60#  ifndef DOINIT
61    $declaration;
62#  else
63    $declaration = {
64EOF
65}
66
67sub output_table_end($) {
68    print $out_fh "};\n#  endif\n\n";
69}
70
71sub output_table ($$;$) {
72    my $table_ref = shift;
73    my $name = shift;
74
75    # 0 => print in decimal
76    # 1 => print in hex (translates code point to code point)
77    # >= 2 => is a dfa table, like https://bjoern.hoehrmann.de/utf-8/decoder/dfa/
78    #      The number is how many columns in the part after the code point
79    #      portion.
80    #
81    # code point tables in hex areasier to debug, but don't fit into 80
82    # columns
83    my $type = shift // 1;
84
85    my $print_in_hex = $type == 1;
86    my $is_dfa = ($type >= 2) ? $type : 0;
87    my $columns_after_256 = 16;
88
89    die "Requres 256 entries in table $name, got @$table_ref"
90                                if ! $is_dfa && @$table_ref != 256;
91    if (! $is_dfa) {
92        die "Requres 256 entries in table $name, got @$table_ref"
93                                                        if @$table_ref != 256;
94    }
95    else {
96        $columns_after_256 = $is_dfa;
97
98        print $out_fh <<'EOF';
99
100/* The table below is adapted from
101 *      https://bjoern.hoehrmann.de/utf-8/decoder/dfa/
102 * See copyright notice at the beginning of this file.
103 */
104
105EOF
106    }
107
108    # Highest number in the table
109    my $max_entry = 0;
110    $max_entry = map { $_ > $max_entry ? $_ : $max_entry } @$table_ref;
111
112    # We assume that every table has at least one two digit entry, and none
113    # are more than three digit.
114    my $field_width = ($print_in_hex)
115                      ? 4
116                      : (($max_entry) > 99 ? 3 : 2);
117
118    my $row_hdr_length;
119    my $node_number_field_width;
120    my $node_value_field_width;
121
122    # dfa tables have a special header for the rows in the transitions part of
123    # the table.  It is longer than the regular one.
124    if ($is_dfa) {
125        my $max_node_number = ($max_entry - 256) / $columns_after_256 - 1;
126        $node_number_field_width = ($max_node_number > 9) ? 2 : 1;
127        $node_value_field_width = ($max_node_number * $columns_after_256 > 99)
128                                  ? 3 : 2;
129        # The header starts with this template, and adds in the number of
130        # digits needed to represent the maximum node number and its value
131        $row_hdr_length = length("/*N=*/")
132                        + $node_number_field_width
133                        + $node_value_field_width;
134    }
135    else {
136        $row_hdr_length = length "/*_X*/";  # Template for what the header
137                                            # looks like
138    }
139
140    # The table may not be representable in 8 bits.
141    my $TYPE = 'U8';
142    $TYPE = 'U16' if grep { $_ > 255 } @$table_ref;
143
144    output_table_start $out_fh, $TYPE, $name;
145
146    # First the headers for the columns
147    print $out_fh get_column_headers($row_hdr_length, $field_width);
148
149    # Now the table body
150    my $count = @$table_ref;
151    my $last_was_nl = 1;
152
153    # Print each element individually, arranged in rows of columns
154    for my $i (0 .. $count - 1) {
155
156        # Node number for here is -1 until get into the dfa state transitions
157        my $node = ($i < 256) ? -1 : ($i - 256) / $columns_after_256;
158
159        # Print row header at beginning of each row
160        if ($last_was_nl) {
161            if ($node >= 0) {
162                printf $out_fh "/*N%-*d=%*d*/", $node_number_field_width, $node,
163                                               $node_value_field_width, $i - 256;
164            }
165            else {  # Otherwise is regular row; print its number
166                printf $out_fh "/*%X_", $i / 16;
167
168                # These rows in a dfa table require extra space so columns
169                # will align vertically (because the Ndd=ddd requires extra
170                # space)
171                if ($is_dfa) {
172                    print  $out_fh " " x (  $node_number_field_width
173                                          + $node_value_field_width);
174                }
175                print  $out_fh "*/";
176            }
177        }
178
179        if ($print_in_hex) {
180            printf $out_fh "0x%02X", $table_ref->[$i];
181        }
182        else {
183            printf $out_fh "%${field_width}d", $table_ref->[$i];
184        }
185
186        print $out_fh ",", if $i < $count -1;   # No comma on final entry
187
188        # Add \n if at end of row, which is 16 columns until we get to the
189        # transitions part
190        if (   ($node < 0 && $i % 16 == 15)
191            || ($node >= 0 && ($i -256) % $columns_after_256
192                                                    == $columns_after_256 - 1))
193        {
194            print $out_fh "\n";
195            $last_was_nl = 1;
196        }
197        else {
198            $last_was_nl = 0;
199        }
200    }
201
202    # Print column footer
203    print $out_fh get_column_headers($row_hdr_length, $field_width,
204                                     ($is_dfa) ? $columns_after_256 : undef);
205
206    output_table_end($out_fh);
207}
208
209print $out_fh <<'END';
210
211#ifndef PERL_EBCDIC_TABLES_H_   /* Guard against nested #includes */
212#define PERL_EBCDIC_TABLES_H_   1
213
214/* This file contains definitions for various tables used in EBCDIC handling.
215 * More info is in utfebcdic.h
216 *
217 * Some of the tables are adapted from
218 *      https://bjoern.hoehrmann.de/utf-8/decoder/dfa/
219 * which requires this copyright notice:
220
221Copyright (c) 2008-2009 Bjoern Hoehrmann <bjoern@hoehrmann.de>
222
223Permission is hereby granted, free of charge, to any person obtaining a copy of
224this software and associated documentation files (the "Software"), to deal in
225the Software without restriction, including without limitation the rights to
226use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
227of the Software, and to permit persons to whom the Software is furnished to do
228so, subject to the following conditions:
229
230The above copyright notice and this permission notice shall be included in all
231copies or substantial portions of the Software.
232
233THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
234IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
235FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
236AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
237LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
238OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
239SOFTWARE.
240
241*/
242END
243
244my @charsets = get_supported_code_pages();
245shift @charsets;    # ASCII is the 0th, and we don't deal with that here.
246foreach my $charset (@charsets) {
247    # we process the whole array several times, make a copy
248    my @a2e = @{get_a2n($charset)};
249    my @e2a;
250
251    print $out_fh "\n" . get_conditional_compile_line_start($charset);
252    print $out_fh "\n";
253
254    print $out_fh "/* Index is ASCII platform code point; value is $charset equivalent */\n";
255    output_table(\@a2e, "PL_a2e");
256
257    { # Construct the inverse
258        for my $i (0 .. 255) {
259            $e2a[$a2e[$i]] = $i;
260        }
261        print $out_fh "/* Index is $charset code point; value is ASCII platform equivalent */\n";
262        output_table(\@e2a, "PL_e2a");
263    }
264
265    my @i82utf = @{get_I8_2_utf($charset)};
266    print $out_fh <<END;
267/* (Confusingly named) Index is $charset I8 byte; value is
268 * $charset UTF-EBCDIC equivalent */
269END
270    output_table(\@i82utf, "PL_utf2e");
271
272    { #Construct the inverse
273        my @utf2i8;
274        for my $i (0 .. 255) {
275            $utf2i8[$i82utf[$i]] = $i;
276        }
277        print $out_fh <<END;
278/* (Confusingly named) Index is $charset UTF-EBCDIC byte; value is
279 * $charset I8 equivalent */
280END
281        output_table(\@utf2i8, "PL_e2utf");
282    }
283
284    {
285        my @utf8skip;
286
287        # These are invariants or continuation bytes.
288        for my $i (0 .. 0xBF) {
289            $utf8skip[$i82utf[$i]] = 1;
290        }
291
292        # These are start bytes;  The skip is the number of consecutive highest
293        # order 1-bits (up to 7)
294        for my $i (0xC0 .. 255) {
295            my $count;
296            if ($i == 0b11111111) {
297                no warnings 'once';
298                $count = $CHARSET_TRANSLATIONS::UTF_EBCDIC_MAXBYTES;
299            }
300            elsif (($i & 0b11111110) == 0b11111110) {
301                $count= 7;
302            }
303            elsif (($i & 0b11111100) == 0b11111100) {
304                $count= 6;
305            }
306            elsif (($i & 0b11111000) == 0b11111000) {
307                $count= 5;
308            }
309            elsif (($i & 0b11110000) == 0b11110000) {
310                $count= 4;
311            }
312            elsif (($i & 0b11100000) == 0b11100000) {
313                $count= 3;
314            }
315            elsif (($i & 0b11000000) == 0b11000000) {
316                $count= 2;
317            }
318            else {
319                die "Something wrong for UTF8SKIP calculation for $i";
320            }
321            $utf8skip[$i82utf[$i]] = $count;
322        }
323
324        print $out_fh <<END;
325/* Index is $charset UTF-EBCDIC byte; value is UTF8SKIP for start bytes
326 * (including for overlongs); 1 for continuation.  Adapted from the shadow
327 * flags table in tr16.  The entries marked 9 in tr16 are continuation bytes
328 * and are marked as length 1 here so that we can recover. */
329END
330        output_table(\@utf8skip, "PL_utf8skip", 0);  # The 0 means don't print
331                                                     # in hex
332    }
333
334    use feature 'unicode_strings';
335
336    {
337        my @lc;
338        for my $i (0 .. 255) {
339            $lc[$a2e[$i]] = $a2e[ord lc chr $i];
340        }
341        print $out_fh
342        "/* Index is $charset code point; value is its lowercase equivalent */\n";
343        output_table(\@lc, "PL_latin1_lc");
344    }
345
346    {
347        my @uc;
348        for my $i (0 .. 255) {
349            my $uc = uc chr $i;
350            if (length $uc > 1 || ord $uc > 255) {
351                $uc = "\N{LATIN SMALL LETTER Y WITH DIAERESIS}";
352            }
353            $uc[$a2e[$i]] = $a2e[ord $uc];
354        }
355        print $out_fh <<END;
356/* Index is $charset code point; value is its uppercase equivalent.
357 * The 'mod' in the name means that codepoints whose uppercase is above 255 or
358 * longer than 1 character map to LATIN SMALL LETTER Y WITH DIARESIS */
359END
360        output_table(\@uc, "PL_mod_latin1_uc");
361    }
362
363    { # PL_fold
364        my @ascii_fold;
365        for my $i (0 .. 255) {  # Initialise to identity map
366            $ascii_fold[$i] = $i;
367        }
368
369        # Overwrite the entries that aren't identity
370        for my $chr ('A' .. 'Z') {
371            $ascii_fold[$a2e[ord $chr]] = $a2e[ord lc $chr];
372        }
373        for my $chr ('a' .. 'z') {
374            $ascii_fold[$a2e[ord $chr]] = $a2e[ord uc $chr];
375        }
376        print $out_fh <<END;
377/* Index is $charset code point; For A-Z, value is a-z; for a-z, value
378 * is A-Z; all other code points map to themselves */
379END
380        output_table(\@ascii_fold, "PL_fold");
381    }
382
383    {
384        my @latin1_fold;
385        for my $i (0 .. 255) {
386            my $char = chr $i;
387            my $lc = lc $char;
388
389            # lc and uc adequately proxy for fold-case pairs in this 0-255
390            # range
391            my $uc = uc $char;
392            $uc = $char if length $uc > 1 || ord $uc > 255;
393            if ($lc ne $char) {
394                $latin1_fold[$a2e[$i]] = $a2e[ord $lc];
395            }
396            elsif ($uc ne $char) {
397                $latin1_fold[$a2e[$i]] = $a2e[ord $uc];
398            }
399            else {
400                $latin1_fold[$a2e[$i]] = $a2e[$i];
401            }
402        }
403        print $out_fh <<END;
404/* Index is $charset code point; value is its other fold-pair equivalent
405 * (A => a; a => A, etc) in the 0-255 range.  If no such equivalent, value is
406 * the code point itself */
407END
408        output_table(\@latin1_fold, "PL_fold_latin1");
409    }
410
411    {
412      # This generates the dfa table for perl extended UTF-8, which accepts
413      # surrogates, non-characters, and accepts start bytes up through FE
414      # (start byte FF has to be handled outside this dfa).  The class numbers
415      # for start bytes are constrained so that they can be used as a shift
416      # count for masking off the leading one bits
417      #
418      # The classes are
419      #   00-9F           0
420      #   A0-A1           7   Not legal immediately after start bytes F0 F8 FC
421      #                       FE
422      #   A2-A3           8   Not legal immediately after start bytes F0 F8 FC
423      #   A4-A7           9   Not legal immediately after start bytes F0 F8
424      #   A8-AF          10   Not legal immediately after start bytes F0
425      #   B0-BF          11
426      #   C0-C4           1
427      #   C5-DF           2
428      #   E0              1
429      #   E1-EF           3
430      #   F0             12
431      #   F1-F7           4
432      #   F8             13
433      #   F9-FB           5
434      #   FC             14
435      #   FD              6
436      #   FE             15
437      #   FF              1
438      #
439      # Here's the I8 for the code points before which overlongs occur:
440      # U+4000:     \xF0\xB0\xA0\xA0
441      # U+40000:    \xF8\xA8\xA0\xA0\xA0
442      # U+400000:   \xFC\xA4\xA0\xA0\xA0\xA0
443      # U+4000000:  \xFE\xA2\xA0\xA0\xA0\xA0\xA0
444      #
445      # The first part of the table maps bytes to character classes to reduce
446      # the size of the transition table and create bitmasks.
447      #
448      # The second part is a transition table that maps a combination of a
449      # state of the automaton and a character class to a new state.  The
450      # numbering of the original nodes is retained, but some have been split
451      # so that there are new nodes.  They mean:
452      # N0     The initial state, and final accepting one.
453      # N1     One continuation byte (A0-BF) left.  This is transitioned to
454      #        immediately when the start byte indicates a two-byte sequence
455      # N2     Two continuation bytes left.
456      # N3     Three continuation bytes left.
457      # N4     Four continuation bytes left.
458      # N5     Five continuation bytes left.
459      # N6     Start byte is F0.  Continuation bytes A[0-F] are illegal
460      #        (overlong); the other continuations transition to N2
461      # N7     Start byte is F8.  Continuation bytes A[0-7] are illegal
462      #        (overlong); the other continuations transition to N3
463      # N8     Start byte is FC.  Continuation bytes A[0-3] are illegal
464      #        (overlong); the other continuations transition to N4
465      # N9     Start byte is FE.  Continuation bytes A[01] are illegal
466      #        (overlong); the other continuations transition to N5
467      # 1      Reject.  All transitions not mentioned above (except the single
468      #        byte ones (as they are always legal) are to this state.
469
470        my $NUM_CLASSES = 16;
471        my $N0 = 0;
472        my $N1 =  $N0 + $NUM_CLASSES;
473        my $N2 =  $N1 + $NUM_CLASSES;
474        my $N3 =  $N2 + $NUM_CLASSES;
475        my $N4 =  $N3 + $NUM_CLASSES;
476        my $N5 =  $N4 + $NUM_CLASSES;
477        my $N6 =  $N5 + $NUM_CLASSES;
478        my $N7 =  $N6 + $NUM_CLASSES;
479        my $N8 =  $N7 + $NUM_CLASSES;
480        my $N9 =  $N8 + $NUM_CLASSES;
481        my $N10 = $N9 + $NUM_CLASSES;
482
483        my @perl_extended_utf8_dfa;
484        my @i8 = (
485                # 0  1  2  3  4  5  6  7  8  9  A  B  C  D  E  F
486                  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 00-0F
487                  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 10-1F
488                  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 20-2F
489                  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 30-3F
490                  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 40-4F
491                  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 50-5F
492                  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 60-6F
493                  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 70-7F
494                  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 80-8F
495                  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 90-9F
496                  7, 7, 8, 8, 9, 9, 9, 9,10,10,10,10,10,10,10,10, # A0-AF
497                 11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11, # B0-BF
498                  1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, # C0-CF
499                  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, # D0-DF
500                  1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, # E0-EF
501                 12, 4, 4, 4, 4, 4, 4, 4,13, 5, 5, 5,14, 6,15, 1, # F0-FF
502                );
503        $perl_extended_utf8_dfa[$i82utf[$_]] = $i8[$_] for (0 .. 255);
504        push @perl_extended_utf8_dfa, (
505          # Class:
506          # 0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15
507            0,  1,$N1,$N2,$N3,$N4,$N5,  1,  1,  1,  1,  1,$N6,$N7,$N8,$N9, # N0
508            1,  1,  1,  1,  1,  1,  1,  0,  0,  0,  0,  0,  1,  1,  1,  1, # N1
509            1,  1,  1,  1,  1,  1,  1,$N1,$N1,$N1,$N1,$N1,  1,  1,  1,  1, # N2
510            1,  1,  1,  1,  1,  1,  1,$N2,$N2,$N2,$N2,$N2,  1,  1,  1,  1, # N3
511            1,  1,  1,  1,  1,  1,  1,$N3,$N3,$N3,$N3,$N3,  1,  1,  1,  1, # N4
512            1,  1,  1,  1,  1,  1,  1,$N4,$N4,$N4,$N4,$N4,  1,  1,  1,  1, # N5
513
514            1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,$N2,  1,  1,  1,  1, # N6
515            1,  1,  1,  1,  1,  1,  1,  1,  1,  1,$N3,$N3,  1,  1,  1,  1, # N7
516            1,  1,  1,  1,  1,  1,  1,  1,  1,$N4,$N4,$N4,  1,  1,  1,  1, # N8
517            1,  1,  1,  1,  1,  1,  1,  1,$N5,$N5,$N5,$N5,  1,  1,  1,  1, # N9
518        );
519        output_table(\@perl_extended_utf8_dfa, "PL_extended_utf8_dfa_tab",
520                                                                   $NUM_CLASSES);
521    }
522
523    {
524      # This generates the dfa table for strict UTF-8, which rejects
525      # surrogates, non-characters, and above Unicode.
526      #
527      # The classes are
528      #   00-9F           0   Always legal at start
529      #   A0             10   Not legal immediately after start bytes F0 F8
530      #   A1             11   Not legal immediately after start bytes F0 F8,
531      #   A2-A7          12   Not legal immediately after start bytes F0 F8 F9
532      #   A8,AA,AC       13   Not legal immediately after start bytes F0 F9
533      #   A9,AB,AD       14   Not legal immediately after start byte F0
534      #   AE             15   Not legal immediately after start byte F0
535      #   AF             16   Not legal immediately after start bytes F0
536      #   B[0248AC]      17   Not legal immediately after start byte F9
537      #   B[1359D]       18   Not legal immediately after start byte F9
538      #   B6             19   Not legal immediately after start byte F9
539      #   B7             20   Not legal immediately after start byte F9
540      #   BE             21   Not legal immediately after start byte F9
541      #   BF             22   Not legal immediately after start byte F9
542      #   C0-C4           1   (reject, all are overlong)
543      #   C5-DF           2   Accepts any legal continuation
544      #   E0              1   (reject, all are overlong)
545      #   E1-EF           3   Accepts any legal continuation
546      #   F0              8   (has overlongs)
547      #   F1              6   (has surrogates, non-chars)
548      #   F2,F4,F6        4   Accepts any legal continuation
549      #   F3,F5,F7        5   (has non-chars)
550      #   F8              9   (has overlongs, non-chars)
551      #   F9              7   (has non-chars, non-Unicode)
552      #   FA-FF           1   (reject, all are non-Unicode)
553      #
554      # Here's the I8 for enough code points so that you can figure out what's
555      # going on:
556      #
557      # U+D800: \xF1\xB6\xA0\xA0
558      # U+DFFF: \xF1\xB7\xBF\xBF
559      # U+FDD0: \xF1\xBF\xAE\xB0
560      # U+FDEF: \xF1\xBF\xAF\xAF
561      # U+FFFE: \xF1\xBF\xBF\xBE
562      # U+1FFFE: \xF3\xBF\xBF\xBE
563      # U+2FFFE: \xF5\xBF\xBF\xBE
564      # U+3FFFE: \xF7\xBF\xBF\xBE
565      # U+4FFFE: \xF8\xA9\xBF\xBF\xBE
566      # U+5FFFE: \xF8\xAB\xBF\xBF\xBE
567      # U+6FFFE: \xF8\xAD\xBF\xBF\xBE
568      # U+7FFFE: \xF8\xAF\xBF\xBF\xBE
569      # U+8FFFE: \xF8\xB1\xBF\xBF\xBE
570      # U+9FFFE: \xF8\xB3\xBF\xBF\xBE
571      # U+AFFFE: \xF8\xB5\xBF\xBF\xBE
572      # U+BFFFE: \xF8\xB7\xBF\xBF\xBE
573      # U+CFFFE: \xF8\xB9\xBF\xBF\xBE
574      # U+DFFFE: \xF8\xBB\xBF\xBF\xBE
575      # U+EFFFE: \xF8\xBD\xBF\xBF\xBE
576      # U+FFFFE: \xF8\xBF\xBF\xBF\xBE
577      # U+10FFFE: \xF9\xA1\xBF\xBF\xBE
578      #
579      # The first part of the table maps bytes to character classes to reduce
580      # the size of the transition table and create bitmasks.
581      #
582      # The second part is a transition table that maps a combination of a
583      # state of the automaton and a character class to a new state.  The
584      # numbering of the original nodes is retained, but some have been split
585      # so that there are new nodes.  They mean:
586      # N0     The initial state, and final accepting one.
587      # N1     One continuation byte (A0-BF) left.  This is transitioned to
588      #        immediately when the start byte indicates a two-byte sequence
589      # N2     Two continuation bytes left.
590      # N3     Three continuation bytes left.
591      # N4     Start byte is F0.  Continuation bytes A[0-F] are illegal
592      #        (overlong); the other continuations transition to N2
593      # N5     Start byte is F1.  Continuation bytes B6 and B7 are illegal
594      #        (surrogates); BF transitions to N9; the other continuations to
595      #        N2
596      # N6     Start byte is F[357].  Continuation byte BF transitions to N12;
597      #        other continuations to N2
598      # N7     Start byte is F8.  Continuation bytes A[0-7] are illegal
599      #        (overlong); continuations A[9BDF] and B[13579BDF] transition to
600      #        N14; the other continuations to N3
601      # N8     Start byte is F9.  Continuation byte A0 transitions to N3; A1
602      #        to N14; the other continuation bytes are illegal.
603      # N9     Initial sequence is F1 BF.  Continuation byte AE transitions to
604      #        state N10; AF to N11; BF to N13; the other continuations to N1.
605      # N10    Initial sequence is F1 BF AE.  Continuation bytes B0-BF are
606      #        illegal (non-chars); the other continuations are legal
607      # N11    Initial sequence is F1 BF AF.  Continuation bytes A0-AF are
608      #        illegal (non-chars); the other continuations are legal
609      # N12    Initial sequence is F[357] BF.  Continuation bytes BF
610      #        transitions to N13; the other continuations to N1
611      # N13    Initial sequence is F[1357] BF BF or F8 x y BF (where x and y
612      #        are something that can lead to a non-char.  Continuation bytes
613      #        BE and BF are illegal (non-chars); the other continuations are
614      #        legal
615      # N14    Initial sequence is F8 A[9BDF]; or F8 B[13579BDF]; or F9 A1.
616      #        Continuation byte BF transitions to N15; the other
617      #        continuations to N2
618      # N15    Initial sequence is F8 A[9BDF] BF; or F8 B[13579BDF] BF; or
619      #        F9 A1 BF.  Continuation byte BF transitions to N16; the other
620      #        continuations to N2
621      # N16    Initial sequence is F8 A[9BDF] BF BF; or F8 B[13579BDF] BF BF;
622      #        or F9 A1 BF BF.  Continuation bytes BE and BF are illegal
623      #        (non-chars); the other continuations are legal
624      # 1      Reject.  All transitions not mentioned above (except the single
625      #        byte ones (as they are always legal) are to this state.
626
627        my $NUM_CLASSES = 23;
628        my $N0 = 0;
629        my $N1 =  $N0 + $NUM_CLASSES;
630        my $N2 =  $N1 + $NUM_CLASSES;
631        my $N3 =  $N2 + $NUM_CLASSES;
632        my $N4 =  $N3 + $NUM_CLASSES;
633        my $N5 =  $N4 + $NUM_CLASSES;
634        my $N6 =  $N5 + $NUM_CLASSES;
635        my $N7 =  $N6 + $NUM_CLASSES;
636        my $N8 =  $N7 + $NUM_CLASSES;
637        my $N9 =  $N8 + $NUM_CLASSES;
638        my $N10 = $N9 + $NUM_CLASSES;
639        my $N11 = $N10 + $NUM_CLASSES;
640        my $N12 = $N11 + $NUM_CLASSES;
641        my $N13 = $N12 + $NUM_CLASSES;
642        my $N14 = $N13 + $NUM_CLASSES;
643        my $N15 = $N14 + $NUM_CLASSES;
644
645        my @strict_utf8_dfa;
646        my @i8 = (
647                # 0  1  2  3  4  5  6  7  8  9  A  B  C  D  E  F
648                  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 00-0F
649                  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 10-1F
650                  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 20-2F
651                  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 30-3F
652                  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 40-4F
653                  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 50-5F
654                  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 60-6F
655                  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 70-7F
656                  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 80-8F
657                  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 90-9F
658                 10,11,12,12,12,12,12,12,13,14,13,14,13,14,15,16, # A0-AF
659                 17,18,17,18,17,18,19,20,17,18,17,18,17,18,21,22, # B0-BF
660                  1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, # C0-CF
661                  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, # D0-DF
662                  1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, # E0-EF
663                  8, 6, 4, 5, 4, 5, 4, 5, 9, 7, 1, 1, 1, 1, 1, 1, # F0-FF
664                );
665        $strict_utf8_dfa[$i82utf[$_]] = $i8[$_] for (0 .. 255);
666        push @strict_utf8_dfa, (
667          # Class:
668          # 0 1   2   3   4   5   6   7   8   9   10   11   12   13   14   15   16   17   18   19   20   21   22
669            0,1,$N1,$N2,$N3,$N6,$N5,$N8,$N4,$N7,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1, # N0
670            1,1,  1,  1,  1,  1,  1,  1,  1,  1,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0, # N1
671            1,1,  1,  1,  1,  1,  1,  1,  1,  1, $N1, $N1, $N1, $N1, $N1, $N1, $N1, $N1, $N1, $N1, $N1, $N1, $N1, # N2
672            1,1,  1,  1,  1,  1,  1,  1,  1,  1, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2, # N3
673
674            1,1,  1,  1,  1,  1,  1,  1,  1,  1,   1,   1,   1,   1,   1,   1,   1, $N2, $N2, $N2, $N2, $N2, $N2, # N4
675            1,1,  1,  1,  1,  1,  1,  1,  1,  1, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2,   1,   1, $N2, $N9, # N5
676            1,1,  1,  1,  1,  1,  1,  1,  1,  1, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2,$N12, # N6
677            1,1,  1,  1,  1,  1,  1,  1,  1,  1,   1,   1,   1, $N3,$N14, $N3,$N14, $N3,$N14, $N3,$N14, $N3,$N14, # N7
678            1,1,  1,  1,  1,  1,  1,  1,  1,  1, $N3,$N14,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1, # N8
679            1,1,  1,  1,  1,  1,  1,  1,  1,  1, $N1, $N1, $N1, $N1, $N1,$N10,$N11, $N1, $N1, $N1, $N1, $N1,$N13, # N9
680            1,1,  1,  1,  1,  1,  1,  1,  1,  1,   0,   0,   0,   0,   0,   0,   0,   1,   1,   1,   1,   1,   1, # N10
681            1,1,  1,  1,  1,  1,  1,  1,  1,  1,   1,   1,   1,   1,   1,   1,   1,   0,   0,   0,   0,   0,   0, # N11
682            1,1,  1,  1,  1,  1,  1,  1,  1,  1, $N1, $N1, $N1, $N1, $N1, $N1, $N1, $N1, $N1, $N1, $N1, $N1,$N13, # N12
683            1,1,  1,  1,  1,  1,  1,  1,  1,  1,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   1,   1, # N13
684            1,1,  1,  1,  1,  1,  1,  1,  1,  1, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2,$N15, # N14
685            1,1,  1,  1,  1,  1,  1,  1,  1,  1, $N1, $N1, $N1, $N1, $N1, $N1, $N1, $N1, $N1, $N1, $N1, $N1,$N13, # N15
686        );
687        output_table(\@strict_utf8_dfa, "PL_strict_utf8_dfa_tab", $NUM_CLASSES);
688    }
689
690    {
691      # This generates the dfa table for C9 strict UTF-8, which rejects
692      # surrogates and above Unicode, but allows non-characters,.
693      #
694      # The classes are
695      #   00-9F           0   Always legal at start
696      #   A0-A1           9   Not legal immediately after start bytes F0 F8
697      #   A2-A7          10   Not legal immediately after start bytes F0 F8 F9
698      #   A8-AF          11   Not legal immediately after start bytes F0 F9
699      #   B0-B5,B8-BF    12   Not legal immediately after start byte F9
700      #   B6,B7          13
701      #   C0-C4           1   (reject, all are overlong)
702      #   C5-DF           2   Accepts any legal continuation
703      #   E0              1   (reject, all are overlong)
704      #   E1-EF           3   Accepts any legal continuation
705      #   F0              6   (has overlongs)
706      #   F1              5   (has surrogates)
707      #   F2-F7           4   Accepts any legal continuation
708      #   F8              8   (has overlongs)
709      #   F9              7   (has non-Unicode)
710      #   FA-FF           1   (reject, all are non-Unicode)
711      #
712      # The first part of the table maps bytes to character classes to reduce
713      # the size of the transition table and create bitmasks.
714      #
715      # The second part is a transition table that maps a combination of a
716      # state of the automaton and a character class to a new state.  The
717      # numbering of the original nodes is retained, but some have been split
718      # so that there are new nodes.  They mean:
719      # N0     The initial state, and final accepting one.
720      # N1     One continuation byte (A0-BF) left.  This is transitioned to
721      #        immediately when the start byte indicates a two-byte sequence
722      # N2     Two continuation bytes left.
723      # N3     Three continuation bytes left.
724      # N4     Start byte is F0.  Continuation bytes A[0-F] are illegal
725      #        (overlong); the other continuations transition to N2
726      # N5     Start byte is F1.  B6 and B7 are illegal (surrogates); the
727      #        other continuations transition to N2
728      # N6     Start byte is F8.  Continuation bytes A[0-7] are illegal
729      #        (overlong); the other continuations transition to N3
730      # N7     Start byte is F9.  Continuation bytes A0 and A1 transition to
731      #        N3; the other continuation bytes are illegal (non-Unicode)
732      # 1      Reject.  All transitions not mentioned above (except the single
733      #        byte ones (as they are always legal) are to this state.
734
735        my $NUM_CLASSES = 14;
736        my $N0 = 0;
737        my $N1 =  $N0 + $NUM_CLASSES;
738        my $N2 =  $N1 + $NUM_CLASSES;
739        my $N3 =  $N2 + $NUM_CLASSES;
740        my $N4 =  $N3 + $NUM_CLASSES;
741        my $N5 =  $N4 + $NUM_CLASSES;
742        my $N6 =  $N5 + $NUM_CLASSES;
743        my $N7 =  $N6 + $NUM_CLASSES;
744
745        my @C9_utf8_dfa;
746        my @i8 = (
747                # 0  1  2  3  4  5  6  7  8  9  A  B  C  D  E  F
748                  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 00-0F
749                  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 10-1F
750                  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 20-2F
751                  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 30-3F
752                  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 40-4F
753                  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 50-5F
754                  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 60-6F
755                  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 70-7F
756                  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 80-8F
757                  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 90-9F
758                  9, 9,10,10,10,10,10,10,11,11,11,11,11,11,11,11, # A0-AF
759                 12,12,12,12,12,12,13,13,12,12,12,12,12,12,12,12, # B0-BF
760                  1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, # C0-CF
761                  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, # D0-DF
762                  1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, # E0-EF
763                  6, 5, 4, 4, 4, 4, 4, 4, 8, 7, 1, 1, 1, 1, 1, 1, # F0-FF
764                );
765        $C9_utf8_dfa[$i82utf[$_]] = $i8[$_] for (0 .. 255);
766        push @C9_utf8_dfa, (
767          # Class:
768          # 0 1   2   3   4   5   6   7   8   9   10   11   12   13
769            0,1,$N1,$N2,$N3,$N5,$N4,$N7,$N6,  1,   1,   1,   1,   1, # N0
770            1,1,  1,  1,  1,  1,  1,  1,  1,  0,   0,   0,   0,   0, # N1
771            1,1,  1,  1,  1,  1,  1,  1,  1,$N1, $N1, $N1, $N1, $N1, # N2
772            1,1,  1,  1,  1,  1,  1,  1,  1,$N2, $N2, $N2, $N2, $N2, # N3
773
774            1,1,  1,  1,  1,  1,  1,  1,  1,  1,   1,   1, $N2, $N2, # N4
775            1,1,  1,  1,  1,  1,  1,  1,  1,$N2, $N2, $N2, $N2,   1, # N5
776            1,1,  1,  1,  1,  1,  1,  1,  1,  1,   1, $N3, $N3, $N3, # N6
777            1,1,  1,  1,  1,  1,  1,  1,  1,$N3,   1,   1,   1,   1, # N7
778        );
779        output_table(\@C9_utf8_dfa, "PL_c9_utf8_dfa_tab", $NUM_CLASSES);
780    }
781
782    print $out_fh get_conditional_compile_line_end();
783}
784
785print $out_fh "\n#endif /* PERL_EBCDIC_TABLES_H_ */\n";
786
787read_only_bottom_close_and_rename($out_fh);
788