#!/usr/bin/perl -w
use strict;
use warnings;

# WARNING: This must be kept in sync with the UTF8_MAXBYTES value in
# utfebcdic.h
$CHARSET_TRANSLATIONS::UTF_EBCDIC_MAXBYTES = 14;

# Utilities for various character set issues.  Currently handles ASCII and
# EBCDIC only.  It is trivial to add support for new EBCDIC code pages (unless
# they have identical variant character signatures as existing ones, and there
# aren't other glitches that arise): just add a mapping table to
# %ebcdic_translations and regen everything that uses this.

my %ebcdic_translations = (
    # Keys are code page name; values are arrays that map ASCII ordinals to
    # the code page's ordinals

    'EBCDIC 1047' =>
    [ 0x00, 0x01, 0x02, 0x03, 0x37, 0x2D, 0x2E, 0x2F, 0x16, 0x05, 0x15, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
      0x10, 0x11, 0x12, 0x13, 0x3C, 0x3D, 0x32, 0x26, 0x18, 0x19, 0x3F, 0x27, 0x1C, 0x1D, 0x1E, 0x1F,
      0x40, 0x5A, 0x7F, 0x7B, 0x5B, 0x6C, 0x50, 0x7D, 0x4D, 0x5D, 0x5C, 0x4E, 0x6B, 0x60, 0x4B, 0x61,
      0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7, 0xF8, 0xF9, 0x7A, 0x5E, 0x4C, 0x7E, 0x6E, 0x6F,
      0x7C, 0xC1, 0xC2, 0xC3, 0xC4, 0xC5, 0xC6, 0xC7, 0xC8, 0xC9, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6,
      0xD7, 0xD8, 0xD9, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7, 0xE8, 0xE9, 0xAD, 0xE0, 0xBD, 0x5F, 0x6D,
      0x79, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96,
      0x97, 0x98, 0x99, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7, 0xA8, 0xA9, 0xC0, 0x4F, 0xD0, 0xA1, 0x07,
      0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x06, 0x17, 0x28, 0x29, 0x2A, 0x2B, 0x2C, 0x09, 0x0A, 0x1B,
      0x30, 0x31, 0x1A, 0x33, 0x34, 0x35, 0x36, 0x08, 0x38, 0x39, 0x3A, 0x3B, 0x04, 0x14, 0x3E, 0xFF,
      0x41, 0xAA, 0x4A, 0xB1, 0x9F, 0xB2, 0x6A, 0xB5, 0xBB, 0xB4, 0x9A, 0x8A, 0xB0, 0xCA, 0xAF, 0xBC,
      0x90, 0x8F, 0xEA, 0xFA, 0xBE, 0xA0, 0xB6, 0xB3, 0x9D, 0xDA, 0x9B, 0x8B, 0xB7, 0xB8, 0xB9, 0xAB,
      0x64, 0x65, 0x62, 0x66, 0x63, 0x67, 0x9E, 0x68, 0x74, 0x71, 0x72, 0x73, 0x78, 0x75, 0x76, 0x77,
      0xAC, 0x69, 0xED, 0xEE, 0xEB, 0xEF, 0xEC, 0xBF, 0x80, 0xFD, 0xFE, 0xFB, 0xFC, 0xBA, 0xAE, 0x59,
      0x44, 0x45, 0x42, 0x46, 0x43, 0x47, 0x9C, 0x48, 0x54, 0x51, 0x52, 0x53, 0x58, 0x55, 0x56, 0x57,
      0x8C, 0x49, 0xCD, 0xCE, 0xCB, 0xCF, 0xCC, 0xE1, 0x70, 0xDD, 0xDE, 0xDB, 0xDC, 0x8D, 0x8E, 0xDF
    ],

#   'EBCDIC POSIX-BC' =>
#   [
#     0x00, 0x01, 0x02, 0x03, 0x37, 0x2D, 0x2E, 0x2F, 0x16, 0x05, 0x15, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
#     0x10, 0x11, 0x12, 0x13, 0x3C, 0x3D, 0x32, 0x26, 0x18, 0x19, 0x3F, 0x27, 0x1C, 0x1D, 0x1E, 0x1F,
#     0x40, 0x5A, 0x7F, 0x7B, 0x5B, 0x6C, 0x50, 0x7D, 0x4D, 0x5D, 0x5C, 0x4E, 0x6B, 0x60, 0x4B, 0x61,
#     0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7, 0xF8, 0xF9, 0x7A, 0x5E, 0x4C, 0x7E, 0x6E, 0x6F,
#     0x7C, 0xC1, 0xC2, 0xC3, 0xC4, 0xC5, 0xC6, 0xC7, 0xC8, 0xC9, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6,
#     0xD7, 0xD8, 0xD9, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7, 0xE8, 0xE9, 0xBB, 0xBC, 0xBD, 0x6A, 0x6D,
#     0x4A, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96,
#     0x97, 0x98, 0x99, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7, 0xA8, 0xA9, 0xFB, 0x4F, 0xFD, 0xFF, 0x07,
#     0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x06, 0x17, 0x28, 0x29, 0x2A, 0x2B, 0x2C, 0x09, 0x0A, 0x1B,
#     0x30, 0x31, 0x1A, 0x33, 0x34, 0x35, 0x36, 0x08, 0x38, 0x39, 0x3A, 0x3B, 0x04, 0x14, 0x3E, 0x5F,
#     0x41, 0xAA, 0xB0, 0xB1, 0x9F, 0xB2, 0xD0, 0xB5, 0x79, 0xB4, 0x9A, 0x8A, 0xBA, 0xCA, 0xAF, 0xA1,
#     0x90, 0x8F, 0xEA, 0xFA, 0xBE, 0xA0, 0xB6, 0xB3, 0x9D, 0xDA, 0x9B, 0x8B, 0xB7, 0xB8, 0xB9, 0xAB,
#     0x64, 0x65, 0x62, 0x66, 0x63, 0x67, 0x9E, 0x68, 0x74, 0x71, 0x72, 0x73, 0x78, 0x75, 0x76, 0x77,
#     0xAC, 0x69, 0xED, 0xEE, 0xEB, 0xEF, 0xEC, 0xBF, 0x80, 0xE0, 0xFE, 0xDD, 0xFC, 0xAD, 0xAE, 0x59,
#     0x44, 0x45, 0x42, 0x46, 0x43, 0x47, 0x9C, 0x48, 0x54, 0x51, 0x52, 0x53, 0x58, 0x55, 0x56, 0x57,
#     0x8C, 0x49, 0xCD, 0xCE, 0xCB, 0xCF, 0xCC, 0xE1, 0x70, 0xC0, 0xDE, 0xDB, 0xDC, 0x8D, 0x8E, 0xDF
#   ],

    'EBCDIC 037' =>
    [
      0x00, 0x01, 0x02, 0x03, 0x37, 0x2D, 0x2E, 0x2F, 0x16, 0x05, 0x25, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
      0x10, 0x11, 0x12, 0x13, 0x3C, 0x3D, 0x32, 0x26, 0x18, 0x19, 0x3F, 0x27, 0x1C, 0x1D, 0x1E, 0x1F,
      0x40, 0x5A, 0x7F, 0x7B, 0x5B, 0x6C, 0x50, 0x7D, 0x4D, 0x5D, 0x5C, 0x4E, 0x6B, 0x60, 0x4B, 0x61,
      0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7, 0xF8, 0xF9, 0x7A, 0x5E, 0x4C, 0x7E, 0x6E, 0x6F,
      0x7C, 0xC1, 0xC2, 0xC3, 0xC4, 0xC5, 0xC6, 0xC7, 0xC8, 0xC9, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6,
      0xD7, 0xD8, 0xD9, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7, 0xE8, 0xE9, 0xBA, 0xE0, 0xBB, 0xB0, 0x6D,
      0x79, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96,
      0x97, 0x98, 0x99, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7, 0xA8, 0xA9, 0xC0, 0x4F, 0xD0, 0xA1, 0x07,
      0x20, 0x21, 0x22, 0x23, 0x24, 0x15, 0x06, 0x17, 0x28, 0x29, 0x2A, 0x2B, 0x2C, 0x09, 0x0A, 0x1B,
      0x30, 0x31, 0x1A, 0x33, 0x34, 0x35, 0x36, 0x08, 0x38, 0x39, 0x3A, 0x3B, 0x04, 0x14, 0x3E, 0xFF,
      0x41, 0xAA, 0x4A, 0xB1, 0x9F, 0xB2, 0x6A, 0xB5, 0xBD, 0xB4, 0x9A, 0x8A, 0x5F, 0xCA, 0xAF, 0xBC,
      0x90, 0x8F, 0xEA, 0xFA, 0xBE, 0xA0, 0xB6, 0xB3, 0x9D, 0xDA, 0x9B, 0x8B, 0xB7, 0xB8, 0xB9, 0xAB,
      0x64, 0x65, 0x62, 0x66, 0x63, 0x67, 0x9E, 0x68, 0x74, 0x71, 0x72, 0x73, 0x78, 0x75, 0x76, 0x77,
      0xAC, 0x69, 0xED, 0xEE, 0xEB, 0xEF, 0xEC, 0xBF, 0x80, 0xFD, 0xFE, 0xFB, 0xFC, 0xAD, 0xAE, 0x59,
      0x44, 0x45, 0x42, 0x46, 0x43, 0x47, 0x9C, 0x48, 0x54, 0x51, 0x52, 0x53, 0x58, 0x55, 0x56, 0x57,
      0x8C, 0x49, 0xCD, 0xCE, 0xCB, 0xCF, 0xCC, 0xE1, 0x70, 0xDD, 0xDE, 0xDB, 0xDC, 0x8D, 0x8E, 0xDF
    ],
);

# Name under which the identity (ASCII/Latin1) table is stored.  The table
# itself is only created by get_supported_code_pages().
my $ascii_key = 'ASCII/Latin1';

my %I8_TO_NATIVE_UTF8;  # Maps I8 UTF to final UTF-EBCDIC
                        # See http://www.unicode.org/reports/tr16/

sub get_supported_code_pages() {
    # Returns an ordered array of the currently supported code pages,
    # including ASCII as the 0th element, 1047 as the 1st, and the others
    # sorted lexically by code page name.
    #
    # As a side effect, the first call adds the identity ASCII table to
    # %ebcdic_translations, which is what makes $ascii_key acceptable to the
    # other functions that check for the code page's existence.

    # Create an ASCII table.
    $ebcdic_translations{$ascii_key} = [ 0 .. 255 ]
                            unless exists $ebcdic_translations{$ascii_key};

    return sort {
               $a eq $ascii_key ? -1
             : $b eq $ascii_key ?  1
             : $a =~ /1047/     ? -1
             : $b =~ /1047/     ?  1
             :                    $a cmp $b
           } keys %ebcdic_translations;
}

sub get_a2n($) {
    # Returns the mapping array for ASCII to code page for the code page named
    # by the input parameter.  The return is a reference to the table held in
    # this file, not a copy.  Dies if the code page is not known.

    my $charset = shift;

    die "Unknown character set '$charset'"
                            unless exists $ebcdic_translations{$charset};

    return $ebcdic_translations{$charset};
}

sub get_I8_2_utf($) {
    # Returns the mapping array for I8 to code page UTF-EBCDIC for the code
    # page named by the input parameter.  This is Table 2 of TR16 customized
    # for the code page.  See utfebcdic.h for why, contrary to TR16, it has to
    # be code-page-specific.
    #
    # The table is generated on first request and cached per code page.  Dies
    # for ASCII, for an unknown code page, or if the code page's table maps
    # two of the code points 0 .. 0x9F to the same value.

    my $charset = shift;

    die "I8 not a valid concept for ASCII" if $charset eq $ascii_key;
    die "'$charset' unknown" unless exists $ebcdic_translations{$charset};

    # Generate the table if not already present
    if (! exists $I8_TO_NATIVE_UTF8{$charset}) {

        # Only needed by the commented-out debugging statements below
        use charnames ();

        # Build into a local array, and cache it only once it is complete, so
        # that a die() part way through can't leave a truncated table behind
        # for later callers to pick up.
        my @i8_to_native;

        # The code points not used for invariants.  Initialized to everything,
        # then entries are removed as we go along.
        my %unused_cps = map { $_ => 1 } 0 .. 255;

        # These are the invariants.  The output has them mapped to the
        # original EBCDIC code point.
        for my $i (0 .. 0x9F) {
            my $ebcdic_value = $ebcdic_translations{$charset}[$i];
            #printf "$charset: using %02x which is %02x ascii, %s\n", $ebcdic_value, $i, charnames::viacode($i);
            $i8_to_native[$i] = $ebcdic_value;

            # delete() returns undef when the entry is already gone, which
            # means an earlier $i claimed the same native code point.
            if (! defined delete $unused_cps{$ebcdic_value}) {
                die "Two code points map to $ebcdic_value; one is $i";
            }
        }

        # Put the unused code points in order
        my @unused_cps = sort { $a <=> $b } keys %unused_cps;

        # Fill in the rest of the map with these ordered code points, as TR16
        # specifies
        for my $i (0xA0 .. 255) {
            $i8_to_native[$i] = shift @unused_cps;
            #printf "$charset: filling in %02x which is %02x ascii, %s\n", $i8_to_native[$i], $i, charnames::viacode($i);
        }

        die "Left-over code points" if @unused_cps;

        $I8_TO_NATIVE_UTF8{$charset} = \@i8_to_native;
    }

    return $I8_TO_NATIVE_UTF8{$charset};
}

{   # Closure

    my $charset;    # We use these to do some error checking that the #if and
                    # #endif are matched.
    my $indent;

    sub get_conditional_compile_line_start($;$) {
        # Returns the '#if' line to put into C code to compile for the code
        # page given by the first parameter.  The second parameter, if
        # present, is the indentation level, like '# if ...'
        #
        # Dies if the previous '#if' has not been closed by a call to
        # get_conditional_compile_line_end(), or if the code page is unknown.
        # The "currently open #if" state is recorded only once the line has
        # been built successfully, so a call that dies leaves nothing open.

        if (defined $charset || defined $indent) {
            die "Missing call to get_conditional_compile_line_end()"
        }

        my $requested_charset = shift;
        my $indent_level = shift // 0;

        die "This is designed to run only on an ASCII platform"
                                                    unless ord "A" == 65;

        die "Unknown character set '$requested_charset'"
                    unless exists $ebcdic_translations{$requested_charset};

        my $requested_indent = ($indent_level == 0)
                               ? ""
                               : " " x $indent_level;

        no warnings 'qw';   # The list below deliberately contains a '#'

        # We use all the typical variant characters to construct the #if,
        # so that it is unlikely that a different code page will match
        # this #if
        my @variant_chars = qw/A \\\ [ ] { } ^ ~ ! # | $ @ `/;
        push @variant_chars, "\n";

        my $return = "";
        my $count = -1;
        for my $char (@variant_chars) {
            my $first_time = $return eq "";
            my $compare = $ebcdic_translations{$requested_charset}[ord $char];

            # What gets written between the quotes of the C character
            # constant; a real newline has to be spelled as its C escape.
            my $c_spelling = ($char eq "\n") ? '\n' : $char;
            die "Non-graphical character ord=" . ord($c_spelling)
                                        if $c_spelling !~ /[[:graph:]]/;

            $return .= " && " unless $first_time;
            $return .= "'$c_spelling' == $compare";
            $return .= " /* $requested_charset */" if $first_time;

            # A single comparison is all that is emitted for ASCII
            last if $requested_charset eq $ascii_key;

            # Continue onto a new line after the first comparison, and after
            # every fifth one from then on.
            $count++;
            $return .= " \\\n " if $first_time || $count % 5 == 0;
        }

        # Everything succeeded; only now mark the #if as open.
        $charset = $requested_charset;
        $indent = $requested_indent;

        return "#${indent}if $return\n";
    }

    sub get_conditional_compile_line_end () {
        # Returns the #endif for the currently open #if, and marks it as
        # closed.  Dies if there is no open #if, that is if
        # get_conditional_compile_line_start() hasn't been called since the
        # last call to this function.

        if (! defined $charset || ! defined $indent) {
            die "Missing call to get_conditional_compile_line_start()"
        }

        my $return = "#${indent}endif\t/* $charset */\n";
        undef $charset;
        undef $indent;
        return $return;
    }
}

sub _UTF_START_MASK($) {
    # Internal.  Returns the mask for the bits of the code point that are
    # stored in the start byte of a sequence that is $len bytes long.
    my $len = shift;
    return (0x7F >> ($len));
}

sub _UTF_START_MARK($) {
    # Internal.  Returns the fixed high-order bits (the top $len bits set) of
    # the start byte of a sequence that is $len bytes long.
    my $len = shift;
    return (0xFF & ~(0xFF >> ($len)));
}

sub cp_2_utfbytes($$) {
    # Returns a string consisting of the UTF-EBCDIC for the code page given by
    # the 2nd parameter, of the Unicode code point given by the first
    # parameter, using the UTF-MOD algorithm published in TR16.  (If the "code
    # page" is ASCII, straight UTF-8 is returned.)  Dies if the code page is
    # unknown.

    my ($ucp, $charset) = @_;

    if ($charset eq $ascii_key) {
        my $str = chr $ucp;
        utf8::upgrade($str);
        utf8::encode($str);
        return $str;
    }

    die "Unknown character set '$charset'"
                            unless exists $ebcdic_translations{$charset};

    # Code points below 0xA0 are a single byte, translated directly.
    return chr $ebcdic_translations{$charset}[$ucp] if $ucp < 0xA0;

    my $I8_2_utf = get_I8_2_utf($charset);

    # Everything from here on is at least 0xA0, and so takes multiple bytes.
    my $len = $ucp < 0x400      ? 2
            : $ucp < 0x4000     ? 3
            : $ucp < 0x40000    ? 4
            : $ucp < 0x400000   ? 5
            : $ucp < 0x4000000  ? 6
            : $ucp < 0x40000000 ? 7
            :                     $CHARSET_TRANSLATIONS::UTF_EBCDIC_MAXBYTES;

    # Work from the low-order end: each continuation byte carries 5 bits of
    # the code point, marked with 0xA0, and is then mapped through the I8
    # table.
    my @str;
    for (1 .. $len - 1) {
        unshift @str, chr $I8_2_utf->[($ucp & 0x1f) | 0xA0];
        $ucp >>= 5;
    }

    # What is left of the code point goes into the start byte.
    unshift @str, chr $I8_2_utf->[  ($ucp & _UTF_START_MASK($len))
                                  | _UTF_START_MARK($len)];

    return join "", @str;
}

1;