#!/usr/bin/perl
#
# Generate code page .c files from ftp.unicode.org descriptions
#
# Copyright 2000 Alexandre Julliard
#
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA
#

# base directory for ftp.unicode.org files
$BASEDIR = "ftp.unicode.org/Public/";
$MAPPREFIX = $BASEDIR . "MAPPINGS/";

# UnicodeData file
$UNICODEDATA = $BASEDIR . "UNIDATA/UnicodeData.txt";

# Sort keys file
$SORTKEYS = "www.unicode.org/reports/tr10/allkeys.txt";

# Defaults mapping
$DEFAULTS = "./defaults";

# Default char for undefined mappings
$DEF_CHAR = ord '?';

# One entry per generated code page; each entry is passed as the argument
# list of HANDLE_FILE.
# NOTE(review): the fields look like (code page number, mapping file below
# $MAPPREFIX, has-glyph-table flag, description) -- confirm against HANDLE_FILE.
@allfiles =
(
    [ 37, "VENDORS/MICSFT/EBCDIC/CP037.TXT", 0, "IBM EBCDIC US Canada" ],
    [ 424, "VENDORS/MISC/CP424.TXT", 0, "IBM EBCDIC Hebrew" ],
    [ 437, "VENDORS/MICSFT/PC/CP437.TXT", 1, "OEM United States" ],
    [ 500, "VENDORS/MICSFT/EBCDIC/CP500.TXT", 0, "IBM EBCDIC International" ],
    [ 737, "VENDORS/MICSFT/PC/CP737.TXT", 1, "OEM Greek 437G" ],
    [ 775, "VENDORS/MICSFT/PC/CP775.TXT", 1, "OEM Baltic" ],
    [ 850, "VENDORS/MICSFT/PC/CP850.TXT", 1, "OEM Multilingual Latin 1" ],
    [ 852, "VENDORS/MICSFT/PC/CP852.TXT", 1, "OEM Slovak Latin 2" ],
    [ 855, "VENDORS/MICSFT/PC/CP855.TXT", 1, "OEM Cyrillic" ],
    [ 856, "VENDORS/MISC/CP856.TXT", 0, "Hebrew PC" ],
    [ 857, "VENDORS/MICSFT/PC/CP857.TXT", 1, "OEM Turkish" ],
    [ 860, "VENDORS/MICSFT/PC/CP860.TXT", 1, "OEM Portuguese" ],
    [ 861, "VENDORS/MICSFT/PC/CP861.TXT", 1, "OEM Icelandic" ],
    [ 862, "VENDORS/MICSFT/PC/CP862.TXT", 1, "OEM Hebrew" ],
    [ 863, "VENDORS/MICSFT/PC/CP863.TXT", 1, "OEM Canadian French" ],
    [ 864, "VENDORS/MICSFT/PC/CP864.TXT", 0, "OEM Arabic" ],
    [ 865, "VENDORS/MICSFT/PC/CP865.TXT", 1, "OEM Nordic" ],
    [ 866, "VENDORS/MICSFT/PC/CP866.TXT", 1, "OEM Russian" ],
    [ 869, "VENDORS/MICSFT/PC/CP869.TXT", 1, "OEM Greek" ],
    [ 874, "VENDORS/MICSFT/PC/CP874.TXT", 1, "ANSI/OEM Thai" ],
    [ 875, "VENDORS/MICSFT/EBCDIC/CP875.TXT", 0, "IBM EBCDIC Greek" ],
    [ 878, "VENDORS/MISC/KOI8-R.TXT", 0, "Russian KOI8" ],
    [ 932, "VENDORS/MICSFT/WINDOWS/CP932.TXT", 0, "ANSI/OEM Japanese Shift-JIS" ],
    [ 936, "VENDORS/MICSFT/WINDOWS/CP936.TXT", 0, "ANSI/OEM Simplified Chinese GBK" ],
    [ 949, "VENDORS/MICSFT/WINDOWS/CP949.TXT", 0, "ANSI/OEM Korean Unified Hangul" ],
    [ 950, "VENDORS/MICSFT/WINDOWS/CP950.TXT", 0, "ANSI/OEM Traditional Chinese Big5" ],
    [ 1006, "VENDORS/MISC/CP1006.TXT", 0, "IBM Arabic" ],
    [ 1026, "VENDORS/MICSFT/EBCDIC/CP1026.TXT", 0, "IBM EBCDIC Latin 5 Turkish" ],
    [ 1250, "VENDORS/MICSFT/WINDOWS/CP1250.TXT", 0, "ANSI Eastern Europe" ],
    [ 1251, "VENDORS/MICSFT/WINDOWS/CP1251.TXT", 0, "ANSI Cyrillic" ],
    [ 1252, "VENDORS/MICSFT/WINDOWS/CP1252.TXT", 0, "ANSI Latin 1" ],
    [ 1253, "VENDORS/MICSFT/WINDOWS/CP1253.TXT", 0, "ANSI Greek" ],
    [ 1254, "VENDORS/MICSFT/WINDOWS/CP1254.TXT", 0, "ANSI Turkish" ],
    [ 1255, "VENDORS/MICSFT/WINDOWS/CP1255.TXT", 0, "ANSI Hebrew" ],
    [ 1256, "VENDORS/MICSFT/WINDOWS/CP1256.TXT", 0, "ANSI Arabic" ],
    [ 1257, "VENDORS/MICSFT/WINDOWS/CP1257.TXT", 0, "ANSI Baltic" ],
    [ 1258, "VENDORS/MICSFT/WINDOWS/CP1258.TXT", 0, "ANSI/OEM Viet Nam" ],
    [ 1361, "OBSOLETE/EASTASIA/KSC/JOHAB.TXT", 0, "Korean Johab" ],
    [ 10000, "VENDORS/MICSFT/MAC/ROMAN.TXT", 0, "Mac Roman" ],
    [ 10006, "VENDORS/MICSFT/MAC/GREEK.TXT", 0, "Mac Greek" ],
    [ 10007, "VENDORS/MICSFT/MAC/CYRILLIC.TXT", 0, "Mac Cyrillic" ],
    [ 10029, "VENDORS/MICSFT/MAC/LATIN2.TXT", 0, "Mac Latin 2" ],
    [ 10079, "VENDORS/MICSFT/MAC/ICELAND.TXT", 0, "Mac Icelandic" ],
    [ 10081, "VENDORS/MICSFT/MAC/TURKISH.TXT", 0, "Mac Turkish" ],
    [ 20127, undef, 0, "US-ASCII (7bit)" ],
    [ 20866, "VENDORS/MISC/KOI8-R.TXT", 0, "Russian KOI8" ],
    [ 20932, "OBSOLETE/EASTASIA/JIS/JIS0208.TXT", 0, "EUC-JP" ],
    [ 21866, "VENDORS/MISC/KOI8-U.TXT", 0, "Ukrainian KOI8" ],
    [ 28591, "ISO8859/8859-1.TXT", 0, "ISO 8859-1 Latin 1" ],
    [ 28592, "ISO8859/8859-2.TXT", 0, "ISO 8859-2 Latin 2 (East European)" ],
    [ 28593, "ISO8859/8859-3.TXT", 0, "ISO 8859-3 Latin 3 (South European)" ],
    [ 28594, "ISO8859/8859-4.TXT", 0, "ISO 8859-4 Latin 4 (Baltic old)" ],
    [ 28595, "ISO8859/8859-5.TXT", 0, "ISO 8859-5 Cyrillic" ],
    [ 28596, "ISO8859/8859-6.TXT", 0, "ISO 8859-6 Arabic" ],
    [ 28597, "ISO8859/8859-7.TXT", 0, "ISO 8859-7 Greek" ],
    [ 28598, "ISO8859/8859-8.TXT", 0, "ISO 8859-8 Hebrew" ],
    [ 28599, "ISO8859/8859-9.TXT", 0, "ISO 8859-9 Latin 5 (Turkish)" ],
    [ 28600, "ISO8859/8859-10.TXT", 0, "ISO 8859-10 Latin 6 (Nordic)" ],
    [ 28603, "ISO8859/8859-13.TXT", 0, "ISO 8859-13 Latin 7 (Baltic)" ],
    [ 28604, "ISO8859/8859-14.TXT", 0, "ISO 8859-14 Latin 8 (Celtic)" ],
    [ 28605, "ISO8859/8859-15.TXT", 0, "ISO 8859-15 Latin 9 (Euro)" ],
    [ 28606, "ISO8859/8859-16.TXT", 0, "ISO 8859-16 Latin 10 (Balkan)" ]
);


# bit flags stored in the low bits of each wctype table entry
%ctype =
(
    "upper" => 0x0001,
    "lower" => 0x0002,
    "digit" => 0x0004,
    "space" => 0x0008,
    "punct" => 0x0010,
    "cntrl" => 0x0020,
    "blank" => 0x0040,
    "xdigit" => 0x0080,
    "alpha" => 0x0100
);

# Unicode general category -> ctype flags
%categories =
(
    "Lu" => $ctype{"alpha"}|$ctype{"upper"}, # Letter, Uppercase
    "Ll" => $ctype{"alpha"}|$ctype{"lower"}, # Letter, Lowercase
    "Lt" => $ctype{"alpha"},    # Letter, Titlecase
    "Mn" => $ctype{"punct"},    # Mark, Non-Spacing
    "Mc" => $ctype{"punct"},    # Mark, Spacing Combining
    "Me" => $ctype{"punct"},    # Mark, Enclosing
    "Nd" => $ctype{"digit"},    # Number, Decimal Digit
    "Nl" => $ctype{"punct"},    # Number, Letter
    "No" => $ctype{"punct"},    # Number, Other
    "Zs" => $ctype{"space"},    # Separator, Space
    "Zl" => $ctype{"space"},    # Separator, Line
    "Zp" => $ctype{"space"},    # Separator, Paragraph
    "Cc" => $ctype{"cntrl"},    # Other, Control
    "Cf" => 0,                  # Other, Format
    "Cs" => 0,                  # Other, Surrogate
    "Co" => 0,                  # Other, Private Use
    "Cn" => 0,                  # Other, Not Assigned
    "Lm" => $ctype{"punct"},    # Letter, Modifier
    "Lo" => $ctype{"alpha"},    # Letter, Other
    "Pc" => $ctype{"punct"},    # Punctuation, Connector
    "Pd" => $ctype{"punct"},    # Punctuation, Dash
    "Ps" => $ctype{"punct"},    # Punctuation, Open
    "Pe" => $ctype{"punct"},    # Punctuation, Close
    "Pi" => $ctype{"punct"},    # Punctuation, Initial quote
    "Pf" => $ctype{"punct"},    # Punctuation, Final quote
    "Po" => $ctype{"punct"},    # Punctuation, Other
    "Sm" => $ctype{"punct"},    # Symbol, Math
    "Sc" => $ctype{"punct"},    # Symbol, Currency
    "Sk" => $ctype{"punct"},    # Symbol, Modifier
    "So" => $ctype{"punct"}     # Symbol, Other
);

# a few characters need additional categories that cannot be determined automatically
%special_categories =
(
    "xdigit" => [ ord('0')..ord('9'),ord('A')..ord('F'),ord('a')..ord('f'),
                  0xff10..0xff19, 0xff21..0xff26, 0xff41..0xff46 ],
    "space" => [ 0x09..0x0d, 0x85 ],
    "blank" => [ 0x09, 0x20, 0xa0, 0x3000, 0xfeff ],
    "cntrl" => [ 0x070f, 0x180b, 0x180c, 0x180d, 0x180e, 0x200c, 0x200d,
                 0x200e, 0x200f, 0x202a, 0x202b, 0x202c, 0x202d, 0x202e,
                 0x206a, 0x206b, 0x206c, 0x206d, 0x206e, 0x206f, 0xfeff,
                 0xfff9, 0xfffa, 0xfffb ]
);

# Unicode bidirectional class -> value stored in the high 4 bits of each
# wctype table entry (see DUMP_CTYPE_TABLES)
%directions =
(
    "L" => 1,     # Left-to-Right
    "LRE" => 11,  # Left-to-Right Embedding
    "LRO" => 11,  # Left-to-Right Override
    "R" => 2,     # Right-to-Left
    "AL" => 2,    # Right-to-Left Arabic
    "RLE" => 11,  # Right-to-Left Embedding
    "RLO" => 11,  # Right-to-Left Override
    "PDF" => 11,  # Pop Directional Format
    "EN" => 3,    # European Number
    "ES" => 4,    # European Number Separator
    "ET" => 5,    # European Number Terminator
    "AN" => 6,    # Arabic Number
    "CS" => 7,    # Common Number Separator
    "NSM" => 0,   # Non-Spacing Mark
    "BN" => 0,    # Boundary Neutral
    "B" => 8,     # Paragraph Separator
    "S" => 9,     # Segment Separator
    "WS" => 10,   # Whitespace
    "ON" => 11    # Other Neutrals
);


################################################################
# main routine

READ_DEFAULTS();
my @sortkeys = READ_SORTKEYS_FILE();
DUMP_CASE_MAPPINGS();
DUMP_SORTKEYS(@sortkeys);
DUMP_COMPOSE_TABLES();
DUMP_CTYPE_TABLES();

foreach $file (@allfiles) { HANDLE_FILE( @$file ); }

OUTPUT_CPTABLE();

exit(0);


################################################################
# read in the defaults file
#
# Resets and fills the global tables (@unicode_defaults, @unicode_aliases,
# @tolower_table, @toupper_table, @digitmap_table, @compatmap_table,
# @category_table, @direction_table, @decomp_table, @compose_table, @uniname)
# from $DEFAULTS and $UNICODEDATA.  Dies on unreadable files, malformed
# lines, duplicate defaults, unknown categories/directions or mapping loops.
sub READ_DEFAULTS
{
    @unicode_defaults = ();
    @unicode_aliases = ();
    @tolower_table = ();
    @toupper_table = ();
    @digitmap_table = ();
    @compatmap_table = ();
    @category_table = ();
    @direction_table = ();
    @decomp_table = ();
    @compose_table = ();

    # first setup a few default mappings

    open my $defaults, '<', $DEFAULTS or die "Cannot open $DEFAULTS: $!";
    print "Loading $DEFAULTS\n";
    while (my $line = <$defaults>)
    {
        next if $line =~ /^\#/;  # skip comments
        next if $line =~ /^$/;  # skip empty lines
        if ($line =~ /^(([0-9a-fA-F]+)(,[0-9a-fA-F]+)*)\s+([0-9a-fA-F]+|'.'|none)\s+(\#.*)?/)
        {
            my @src = map { hex $_ } split /,/, $1;
            my $dst = $4;
            # several comma-separated sources on one line form an alias set
            if ($#src > 0) { push @unicode_aliases, \@src; }
            next if ($dst eq "none");
            # the target is either a quoted literal char or a hex code point
            $dst = ($dst =~ /\'.\'/) ? ord substr($dst,1,1) : hex $dst;
            foreach my $code (@src)
            {
                die "Duplicate value" if defined($unicode_defaults[$code]);
                $unicode_defaults[$code] = $dst;
            }
            next;
        }
        die "Unrecognized line $line\n";
    }
    close $defaults;

    # now build mappings from the decomposition field of the Unicode database

    open my $unidata, '<', $UNICODEDATA or die "Cannot open $UNICODEDATA: $!";
    print "Loading $UNICODEDATA\n";
    my $start;  # next code point to fill inside a "<..., First>" .. "<..., Last>" pair
    while (my $line = <$unidata>)
    {
        # Decode the fields ...
        my ($code, $name, $cat, $comb, $bidi,
            $decomp, $dec, $dig, $num, $mirror,
            $oldname, $comment, $upper, $lower, $title) = split /;/, $line;

        my $src = hex $code;

        die "unknown category $cat" unless defined $categories{$cat};
        die "unknown directionality $bidi" unless defined $directions{$bidi};

        $uniname[$src] = $name;
        $category_table[$src] = $categories{$cat};
        $direction_table[$src] = $directions{$bidi};

        if ($lower ne "")
        {
            $tolower_table[$src] = hex $lower;
            $category_table[$src] |= $ctype{"upper"}|$ctype{"alpha"};
        }
        if ($upper ne "")
        {
            $toupper_table[$src] = hex $upper;
            $category_table[$src] |= $ctype{"lower"}|$ctype{"alpha"};
        }
        if ($dec ne "")
        {
            $category_table[$src] |= $ctype{"digit"};
        }
        if ($dig ne "")
        {
            # store the character code of the digit field, not its numeric value
            $digitmap_table[$src] = ord $dig;
        }

        # copy the category and direction for everything between First/Last pairs
        if ($name =~ /, First>/) { $start = $src; }
        if ($name =~ /, Last>/)
        {
            while ($start < $src)
            {
                $category_table[$start] = $category_table[$src];
                $direction_table[$start] = $direction_table[$src];
                $start++;
            }
        }

        next if $decomp eq "";  # no decomposition, skip it

        my $in_compat_range = ($src >= 0xf900 && $src < 0xfb00) ||
                              ($src >= 0xfe30 && $src < 0xfffd);
        my $dst;

        if ($decomp =~ /^<([a-zA-Z]+)>\s+([0-9a-fA-F]+)$/)
        {
            # decomposition of the form "<foo> 1234" -> use char if type is known
            my ($type, $target) = ($1, hex $2);
            if ($in_compat_range)
            {
                # Single char decomposition in the compatibility range
                $compatmap_table[$src] = $target;
            }
            next unless ($type eq "font" ||
                         $type eq "noBreak" ||
                         $type eq "circle" ||
                         $type eq "super" ||
                         $type eq "sub" ||
                         $type eq "wide" ||
                         $type eq "narrow" ||
                         $type eq "compat" ||
                         $type eq "small");
            $dst = $target;
        }
        elsif ($decomp =~ /^<compat>\s+0020\s+([0-9a-fA-F]+)/)
        {
            # decomposition "<compat> 0020 1234" -> combining accent
            $dst = hex $1;
        }
        elsif ($decomp =~ /^([0-9a-fA-F]+)/)
        {
            # decomposition contains only char values without prefix -> use first char
            $dst = hex $1;
            $category_table[$src] |= $category_table[$dst];
            # store decomposition if it contains two chars
            if ($decomp =~ /^([0-9a-fA-F]+)\s+([0-9a-fA-F]+)$/)
            {
                $decomp_table[$src] = [ hex $1, hex $2 ];
                push @compose_table, [ hex $1, hex $2, $src ];
            }
            elsif ($decomp =~ /^(<[a-z]+>\s)*([0-9a-fA-F]+)$/ && $in_compat_range)
            {
                # Single char decomposition in the compatibility range
                $compatmap_table[$src] = hex $2;
            }
        }
        else
        {
            next;
        }

        next if defined($unicode_defaults[$src]);  # may have been set in the defaults file

        # check for loops: follow the chain of defaults starting at the target
        # and make sure it never leads back to the source
        for (my $i = $dst; ; $i = $unicode_defaults[$i])
        {
            die sprintf("loop detected for %04x -> %04x",$src,$dst) if $i == $src;
            last unless defined($unicode_defaults[$i]);
        }
        $unicode_defaults[$src] = $dst;
    }
    close $unidata;

    # patch the category of some special characters

    foreach my $cat (keys %special_categories)
    {
        my $flag = $ctype{$cat};
        foreach my $i (@{$special_categories{$cat}}) { $category_table[$i] |= $flag; }
    }
}


################################################################
# parse the input file
#
# Reads the mapping file named by the first argument and fills the global
# @cp2uni, @uni2cp and @lead_bytes tables.  The first mapping seen for a
# code point wins in either direction.  Dies on an unrecognized line.
sub READ_FILE
{
    my $name = shift;
    open my $input, '<', $name or die "Cannot open $name: $!";

    while (my $line = <$input>)
    {
        next if $line =~ /^\#/;  # skip comments
        next if $line =~ /^$/;  # skip empty lines
        next if $line =~ /\x1a/;  # skip ^Z
        next if $line =~ /^0x([0-9a-fA-F]+)\s+\#UNDEFINED/;  # undefined char

        if ($line =~ /^0x([0-9a-fA-F]+)\s+\#DBCS LEAD BYTE/)
        {
            my $cp = hex $1;
            push @lead_bytes,$cp;
            $cp2uni[$cp] = 0;
            next;
        }
        if ($line =~ /^0x([0-9a-fA-F]+)\s+0x([0-9a-fA-F]+)\s+(\#.*)?/)
        {
            my $cp = hex $1;
            my $uni = hex $2;
            $cp2uni[$cp] = $uni unless defined($cp2uni[$cp]);
            $uni2cp[$uni] = $cp unless defined($uni2cp[$uni]);
            # a double-byte char whose lead byte was not declared yet
            # implicitly declares it
            if ($cp > 0xff && !defined($cp2uni[$cp >> 8]))
            {
                push @lead_bytes,$cp >> 8;
                $cp2uni[$cp >> 8] = 0;
            }
            next;
        }
        die "$name: Unrecognized line $line\n";
    }
    close $input;
}


################################################################
# fill input data for the 20127 (us-ascii) codepage
#
# The low half is an identity mapping in both directions; the high half
# decodes to the same char with the top bit cleared.
sub fill_20127_codepage()
{
    for (my $i = 0; $i < 128; $i++) { $cp2uni[$i] = $uni2cp[$i] = $i; }
    for (my $i = 128; $i < 256; $i++) { $cp2uni[$i] = $i & 0x7f; }
}

################################################################
# get a mapping including glyph chars for MB_USEGLYPHCHARS
#
# Takes a code page -> Unicode table as a list and returns a copy in which
# entries 0x01..0x1f and 0x7f are replaced by glyph characters.
sub get_glyphs_mapping(@)
{
    # Work on a copy: the elements of @_ alias the caller's arguments (here
    # an array slice), so assigning through them would rewrite the caller's
    # own table as a side effect.
    my @table = @_;

    $table[0x01] = 0x263a;  # (WHITE SMILING FACE)
    $table[0x02] = 0x263b;  # (BLACK SMILING FACE)
    $table[0x03] = 0x2665;  # (BLACK HEART SUIT)
    $table[0x04] = 0x2666;  # (BLACK DIAMOND SUIT)
    $table[0x05] = 0x2663;  # (BLACK CLUB SUIT)
    $table[0x06] = 0x2660;  # (BLACK SPADE SUIT)
    $table[0x07] = 0x2022;  # (BULLET)
    $table[0x08] = 0x25d8;  # (INVERSE BULLET)
    $table[0x09] = 0x25cb;  # (WHITE CIRCLE)
    $table[0x0a] = 0x25d9;  # (INVERSE WHITE CIRCLE)
    $table[0x0b] = 0x2642;  # (MALE SIGN)
    $table[0x0c] = 0x2640;  # (FEMALE SIGN)
    $table[0x0d] = 0x266a;  # (EIGHTH NOTE)
    $table[0x0e] = 0x266b;  # (BEAMED EIGHTH NOTES)
    $table[0x0f] = 0x263c;  # (WHITE SUN WITH RAYS)
    $table[0x10] = 0x25ba;  # (BLACK RIGHT-POINTING POINTER)
    $table[0x11] = 0x25c4;  # (BLACK LEFT-POINTING POINTER)
    $table[0x12] = 0x2195;  # (UP DOWN ARROW)
    $table[0x13] = 0x203c;  # (DOUBLE EXCLAMATION MARK)
    $table[0x14] = 0x00b6;  # (PILCROW SIGN)
    $table[0x15] = 0x00a7;  # (SECTION SIGN)
    $table[0x16] = 0x25ac;  # (BLACK RECTANGLE)
    $table[0x17] = 0x21a8;  # (UP DOWN ARROW WITH BASE)
    $table[0x18] = 0x2191;  # (UPWARDS ARROW)
    $table[0x19] = 0x2193;  # (DOWNWARDS ARROW)
    $table[0x1a] = 0x2192;  # (RIGHTWARDS ARROW)
    $table[0x1b] = 0x2190;  # (LEFTWARDS ARROW)
    $table[0x1c] = 0x221f;  # (RIGHT ANGLE)
    $table[0x1d] = 0x2194;  # (LEFT RIGHT ARROW)
    $table[0x1e] = 0x25b2;  # (BLACK UP-POINTING TRIANGLE)
    $table[0x1f] = 0x25bc;  # (BLACK DOWN-POINTING TRIANGLE)
    $table[0x7f] = 0x2302;  # (HOUSE)
    return @table;
}

################################################################
# build EUC-JP table from the JIS 0208 file
# FIXME: for proper EUC-JP we should probably read JIS 0212 too
# but this would require 3-byte DBCS characters
#
# Fills the global @cp2uni, @uni2cp and @lead_bytes tables from the file
# named by the first argument.  Dies on an unrecognized line.
sub READ_JIS0208_FILE
{
    my $name = shift;

    # ASCII chars
    for (my $i = 0x00; $i <= 0x7f; $i++)
    {
        $cp2uni[$i] = $i;
        $uni2cp[$i] = $i;
    }

    # JIS X 0201 right plane
    for (my $i = 0xa1; $i <= 0xdf; $i++)
    {
        $cp2uni[0x8e00 + $i] = 0xfec0 + $i;
        $uni2cp[0xfec0 + $i] = 0x8e00 + $i;
    }

    # lead bytes
    foreach my $i (0x8e, 0x8f, 0xa1 .. 0xfe)
    {
        push @lead_bytes,$i;
        $cp2uni[$i] = 0;
    }

    # undefined chars
    foreach my $i (0x80 .. 0x8d, 0x90 .. 0xa0, 0xff)
    {
        $cp2uni[$i] = $DEF_CHAR;
    }

    # Shift-JIS compatibility
    $uni2cp[0x00a5] = 0x5c;
    $uni2cp[0x203e] = 0x7e;

    # Fix backslash conversion
    $cp2uni[0xa1c0] = 0xff3c;
    $uni2cp[0xff3c] = 0xa1c0;

    open my $input, '<', $name or die "Cannot open $name: $!";
    while (my $line = <$input>)
    {
        next if $line =~ /^\#/;  # skip comments
        next if $line =~ /^$/;  # skip empty lines
        next if $line =~ /\x1a/;  # skip ^Z
        if ($line =~ /^0x[0-9a-fA-F]+\s+0x([0-9a-fA-F]+)\s+0x([0-9a-fA-F]+)\s+(\#.*)?/)
        {
            # the first column is ignored; the second column is offset by
            # 0x8080 to form the code page value
            my $cp = 0x8080 + hex $1;
            my $uni = hex $2;
            $cp2uni[$cp] = $uni unless defined($cp2uni[$cp]);
            $uni2cp[$uni] = $cp unless defined($uni2cp[$uni]);
            next;
        }
        die "$name: Unrecognized line $line\n";
    }
    close $input;
}


################################################################
# build the sort keys table
#
# Reads $SORTKEYS and returns a list indexed by code point (0..65535) whose
# defined entries are the four keys of that char packed into 32 bits.
# Entries for chars without a single-char key stay undefined.
sub READ_SORTKEYS_FILE
{
    # each entry is [ code point, key 1, key 2, key 3, key 4 ];
    # a code point of -1 marks an entry that was never set
    my @sortkeys = ();
    for (my $i = 0; $i < 65536; $i++) { $sortkeys[$i] = [ -1, 0, 0, 0, 0 ] };

    open my $input, '<', $SORTKEYS or die "Cannot open $SORTKEYS: $!";
    print "Loading $SORTKEYS\n";
    while (my $line = <$input>)
    {
        next if $line =~ /^\#/;  # skip comments
        next if $line =~ /^$/;  # skip empty lines
        next if $line =~ /\x1a/;  # skip ^Z
        next if $line =~ /^\@version/;  # skip @version header
        if ($line =~ /^([0-9a-fA-F]+)\s+;\s+\[([*.])([0-9a-fA-F]{4})\.([0-9a-fA-F]{4})\.([0-9a-fA-F]{4})\.([0-9a-fA-F]+)\]/)
        {
            my $uni = hex $1;
            next if $uni > 65535;
            $sortkeys[$uni] = [ $uni, hex $3, hex $4, hex $5, hex $6 ];
            next;
        }
        if ($line =~ /^([0-9a-fA-F]+\s+)+;\s+\[[*.]([0-9a-fA-F]{4})\.([0-9a-fA-F]{4})\.([0-9a-fA-F]{4})\.([0-9a-fA-F]+)\]/)
        {
            # multiple character sequence, ignored for now
            next;
        }
        die "$SORTKEYS: Unrecognized line $line\n";
    }
    close $input;

    # compress the keys to 32 bit:
    # key 1 to 16 bits, key 2 to 8 bits, key 3 to 4 bits, key 4 to 1 bit

    # Order by the four keys; ties are broken by code point so the order is
    # reproducible (entries with equal keys get equal packed values anyway).
    @sortkeys = sort { $a->[1] <=> $b->[1] or
                       $a->[2] <=> $b->[2] or
                       $a->[3] <=> $b->[3] or
                       $a->[4] <=> $b->[4] or
                       $a->[0] <=> $b->[0] } @sortkeys;

    my ($n2, $n3) = (1, 1);  # rank of key 2 within key 1, of key 3 within key 2
    my @keys = (-1, -1, -1, -1, -1 );  # keys of the previous entry
    my @flatkeys = ();

    for (my $i = 0; $i < 65536; $i++)
    {
        my @current = @{$sortkeys[$i]};
        next if $current[0] == -1;

        if ($current[1] != $keys[1])
        {
            # new key 1: restart both ranks
            @keys[1 .. 3] = @current[1 .. 3];
            $n2 = 1;
            $n3 = 1;
        }
        elsif ($current[2] != $keys[2])
        {
            # same key 1, new key 2
            @keys[2, 3] = @current[2, 3];
            $n2++;
            $n3 = 1;
            die "$SORTKEYS: key 2 does not fit in 8 bits\n" if ($n2 >= 256);
        }
        elsif ($current[3] != $keys[3])
        {
            # same key 1 and key 2, new key 3
            $keys[3] = $current[3];
            $n3++;
            die "$SORTKEYS: key 3 does not fit in 4 bits\n" if ($n3 >= 16);
        }

        # non-zero keys are replaced by their rank, zero keys stay zero
        if ($current[2]) { $current[2] = $n2; }
        if ($current[3]) { $current[3] = $n3; }
        if ($current[4]) { $current[4] = 1; }

        $flatkeys[$current[0]] = ($current[1] << 16) | ($current[2] << 8) | ($current[3] << 4) | $current[4];
    }
    return @flatkeys;
}


################################################################
# dump the sort keys table
#
# Writes collation.c.new from the list returned by READ_SORTKEYS_FILE and
# hands the result to save_file.  The table starts with 256 offsets, one
# per 256-char range, followed by a block of defaults and then one block
# per populated range.
sub DUMP_SORTKEYS
{
    my @keys = @_;

    # count the number of 256-key ranges that contain something

    my @offsets = ();
    my $ranges = 2;  # the index block and the defaults block come first
    for (my $i = 0; $i < 256; $i++) { $offsets[$i] = 256; }
    for (my $i = 0; $i < 65536; $i++)
    {
        next unless defined $keys[$i];
        $offsets[$i >> 8] = $ranges * 256;
        $ranges++;
        $i |= 255;  # jump to the end of this range
    }

    # output the range offsets

    open OUTPUT, '>', "collation.c.new" or die "Cannot create collation.c: $!";
    printf "Building collation.c\n";
    printf OUTPUT "/* Unicode collation element table */\n";
    printf OUTPUT "/* generated from %s */\n", $SORTKEYS;
    printf OUTPUT "/* DO NOT EDIT!! */\n\n";

    printf OUTPUT "const unsigned int collation_table[%d] =\n{\n", $ranges*256;
    printf OUTPUT " /* index */\n";
    printf OUTPUT "%s,\n", DUMP_ARRAY( "0x%08x", 0, @offsets );

    # output the default values

    printf OUTPUT " /* defaults */\n";
    printf OUTPUT "%s", DUMP_ARRAY( "0x%08x", 0, (0xffffffff) x 256 );

    # output all the key ranges

    for (my $i = 0; $i < 256; $i++)
    {
        next if $offsets[$i] == 256;
        printf OUTPUT ",\n /* 0x%02x00 .. 0x%02xff */\n", $i, $i;
        printf OUTPUT "%s", DUMP_ARRAY( "0x%08x", 0xffffffff, @keys[($i<<8) .. ($i<<8)+255] );
    }
    printf OUTPUT "\n};\n";
    close(OUTPUT) or die "Cannot write collation.c: $!";
    save_file("collation.c");
}


################################################################
# add default mappings once the file has been read
#
# Completes the global @uni2cp (and, for the first 256 values, @cp2uni)
# using @unicode_aliases and @unicode_defaults.
sub ADD_DEFAULT_MAPPINGS
{
    # Apply aliases

    foreach my $alias (@unicode_aliases)
    {
        my $target = undef;
        foreach my $src (@$alias)
        {
            if (defined($uni2cp[$src]))
            {
                $target = $uni2cp[$src];
                last;
            }
        }
        next unless defined($target);

        # At least one char of the alias set is defined, set the others to the same value
        foreach my $src (@$alias)
        {
            $uni2cp[$src] = $target unless defined($uni2cp[$src]);
        }
    }

    # For every src -> target mapping in the defaults table,
    # make uni2cp[src] = uni2cp[target] if uni2cp[target] is defined

    for (my $src = 0; $src < 65536; $src++)
    {
        next if defined($uni2cp[$src]);  # source has a definition already
        next unless defined($unicode_defaults[$src]);  # no default for this char
        my $target = $unicode_defaults[$src];

        # do a recursive mapping until we find a target char that is defined
        while (!defined($uni2cp[$target]) &&
               defined($unicode_defaults[$target])) { $target = $unicode_defaults[$target]; }

        if (defined($uni2cp[$target])) { $uni2cp[$src] = $uni2cp[$target]; }
    }

    # Add an identity mapping for all undefined chars

    for (my $i = 0; $i < 256; $i++)
    {
        next if defined($cp2uni[$i]);
        next if defined($uni2cp[$i]);
        $cp2uni[$i] = $uni2cp[$i] = $i;
    }
}

################################################################
# dump an array of integers
#
# Arguments: a sprintf format for one element, the value printed in place
# of undefined elements, and the elements.  Returns the formatted text with
# a line break after every 8th element and no trailing separator.
sub DUMP_ARRAY
{
    my ($format,$default,@array) = @_;
    my $ret = " ";
    my $i;
    for ($i = 0; $i < $#array; $i++)
    {
        $ret .= sprintf($format, defined $array[$i] ? $array[$i] : $default);
        $ret .= (($i % 8) != 7) ? ", " : ",\n ";
    }
    # last element, without a separator after it
    $ret .= sprintf($format, defined $array[$i] ? $array[$i] : $default);
    return $ret;
}

################################################################
# dump an SBCS mapping table
#
# Arguments: code page number, has-glyphs flag, code page name.  Prints the
# tables built from the global @cp2uni / @uni2cp to the already opened
# OUTPUT handle.
sub DUMP_SBCS_TABLE
{
    my ($codepage, $has_glyphs, $name) = @_;

    # output the ascii->unicode table

    if ($has_glyphs)
    {
        printf OUTPUT "static const WCHAR cp2uni[512] =\n";
        printf OUTPUT "{\n%s", DUMP_ARRAY( "0x%04x", $DEF_CHAR, @cp2uni[0 .. 255] );
        printf OUTPUT ",\n /* glyphs */\n%s\n};\n\n",
                      DUMP_ARRAY( "0x%04x", $DEF_CHAR, get_glyphs_mapping(@cp2uni[0 .. 255]) );
    }
    else
    {
        printf OUTPUT "static const WCHAR cp2uni[256] =\n";
        printf OUTPUT "{\n%s\n};\n\n", DUMP_ARRAY( "0x%04x", $DEF_CHAR, @cp2uni[0 .. 255] );
    }

    # count the number of unicode->ascii subtables that contain something

    my @filled = ();
    my $subtables = 1;  # the shared defaults subtable
    for (my $i = 0; $i < 65536; $i++)
    {
        next unless defined $uni2cp[$i];
        $filled[$i >> 8] = 1;
        $subtables++;
        $i |= 255;  # jump to the end of this subtable
    }

    # output all the subtables into a single array

    printf OUTPUT "static const unsigned char uni2cp_low[%d] =\n{\n", $subtables*256;
    for (my $i = 0; $i < 256; $i++)
    {
        next unless $filled[$i];
        printf OUTPUT " /* 0x%02x00 .. 0x%02xff */\n", $i, $i;
        printf OUTPUT "%s,\n", DUMP_ARRAY( "0x%02x", $DEF_CHAR, @uni2cp[($i<<8) .. ($i<<8)+255] );
    }
    printf OUTPUT " /* defaults */\n";
    printf OUTPUT "%s\n};\n\n", DUMP_ARRAY( "0x%02x", 0, ($DEF_CHAR) x 256 );

    # output a table of the offsets of the subtables in the previous array

    my $pos = 0;
    my @offsets = ();
    for (my $i = 0; $i < 256; $i++)
    {
        if ($filled[$i]) { push @offsets, $pos; $pos += 256; }
        else { push @offsets, ($subtables-1) * 256; }  # the defaults subtable is last
    }
    printf OUTPUT "static const unsigned short uni2cp_high[256] =\n";
    printf OUTPUT "{\n%s\n};\n\n", DUMP_ARRAY( "0x%04x", 0, @offsets );

    # output the code page descriptor

    printf OUTPUT "const struct sbcs_table cptable_%03d =\n{\n", $codepage;
    printf OUTPUT " { %d, 1, 0x%04x, 0x%04x, \"%s\" },\n",
                  $codepage, $DEF_CHAR, $DEF_CHAR, $name;
    printf OUTPUT " cp2uni,\n";
    if ($has_glyphs) { printf OUTPUT " cp2uni + 256,\n"; }
    else { printf OUTPUT " cp2uni,\n"; }
    printf OUTPUT " uni2cp_low,\n";
    printf OUTPUT " uni2cp_high\n};\n";
}


################################################################
# dump a DBCS mapping table
#
# Arguments: code page number, code page name.  Prints the tables built
# from the global @cp2uni / @uni2cp / @lead_bytes to the already opened
# OUTPUT handle.
sub DUMP_DBCS_TABLE
{
    my ($codepage, $name) = @_;

    # build a list of lead bytes that are actually used

    my @lblist = ();
    LBLOOP: foreach my $lead (@lead_bytes)
    {
        my $base = $lead << 8;
        foreach my $trail (0 .. 255)
        {
            next unless defined $cp2uni[$base + $trail];
            push @lblist, $lead;
            next LBLOOP;
        }
    }
    # 1 when some declared lead byte has no char at all, 0 otherwise
    my $unused = ($#lead_bytes > $#lblist) ? 1 : 0;

    # output the ascii->unicode table for the single byte chars

    printf OUTPUT "static const WCHAR cp2uni[%d] =\n", 256 * ($#lblist + 2 + $unused);
    printf OUTPUT "{\n%s,\n", DUMP_ARRAY( "0x%04x", $DEF_CHAR, @cp2uni[0 .. 255] );

    # output the default table for unused lead bytes

    if ($unused)
    {
        printf OUTPUT " /* unused lead bytes */\n";
        printf OUTPUT "%s,\n", DUMP_ARRAY( "0x%04x", 0, ($DEF_CHAR) x 256 );
    }

    # output the ascii->unicode table for each DBCS lead byte

    for (my $y = 0; $y <= $#lblist; $y++)
    {
        my $base = $lblist[$y] << 8;
        printf OUTPUT " /* lead byte %02x */\n", $lblist[$y];
        printf OUTPUT "%s", DUMP_ARRAY( "0x%04x", $DEF_CHAR, @cp2uni[$base .. $base+255] );
        printf OUTPUT "%s", ($y < $#lblist) ? ",\n" : "\n};\n\n";
    }

    # output the lead byte subtables offsets

    my @lead_offsets = (0) x 256;
    for (my $x = 0; $x <= $#lblist; $x++) { $lead_offsets[$lblist[$x]] = $x + 1; }
    if ($unused)
    {
        # increment all lead bytes offset to take into account the unused table
        foreach my $lead (@lead_bytes) { $lead_offsets[$lead]++; }
    }
    printf OUTPUT "static const unsigned char cp2uni_leadbytes[256] =\n";
    printf OUTPUT "{\n%s\n};\n\n", DUMP_ARRAY( "0x%02x", 0, @lead_offsets );

    # count the number of unicode->ascii subtables that contain something

    my @filled = ();
    my $subtables = 1;  # the shared defaults subtable
    for (my $i = 0; $i < 65536; $i++)
    {
        next unless defined $uni2cp[$i];
        $filled[$i >> 8] = 1;
        $subtables++;
        $i |= 255;  # jump to the end of this subtable
    }

    # output all the subtables into a single array

    printf OUTPUT "static const unsigned short uni2cp_low[%d] =\n{\n", $subtables*256;
    for (my $y = 0; $y < 256; $y++)
    {
        next unless $filled[$y];
        printf OUTPUT " /* 0x%02x00 .. 0x%02xff */\n", $y, $y;
        printf OUTPUT "%s,\n", DUMP_ARRAY( "0x%04x", $DEF_CHAR, @uni2cp[($y<<8) .. ($y<<8)+255] );
    }
    printf OUTPUT " /* defaults */\n";
    printf OUTPUT "%s\n};\n\n", DUMP_ARRAY( "0x%04x", 0, ($DEF_CHAR) x 256 );

    # output a table of the offsets of the subtables in the previous array

    my $pos = 0;
    my @offsets = ();
    for (my $y = 0; $y < 256; $y++)
    {
        if ($filled[$y]) { push @offsets, $pos; $pos += 256; }
        else { push @offsets, ($subtables-1) * 256; }  # the defaults subtable is last
    }
    printf OUTPUT "static const unsigned short uni2cp_high[256] =\n";
    printf OUTPUT "{\n%s\n};\n\n", DUMP_ARRAY( "0x%04x", 0, @offsets );

    # output the code page descriptor

    printf OUTPUT "const struct dbcs_table cptable_%03d =\n{\n", $codepage;
    printf OUTPUT " { %d, 2, 0x%04x, 0x%04x, \"%s\" },\n",
                  $codepage, $DEF_CHAR, $DEF_CHAR, $name;
    printf OUTPUT " cp2uni,\n";
    printf OUTPUT " cp2uni_leadbytes,\n";
    printf OUTPUT " uni2cp_low,\n";
    printf OUTPUT " uni2cp_high,\n";
    DUMP_LB_RANGES();
    printf OUTPUT "};\n";
}


################################################################
# dump the list of defined lead byte ranges
#
# Prints the first and last byte of every run of consecutive values in the
# global @lead_bytes to OUTPUT, followed by a 0x00, 0x00 terminator.
sub DUMP_LB_RANGES
{
    my @list = ();
    foreach my $lead (@lead_bytes) { $list[$lead] = 1; }
    my $on = 0;  # true while inside a run of lead bytes
    printf OUTPUT " { ";
    for (my $i = 0; $i < 256; $i++)
    {
        if ($on)
        {
            if (!defined $list[$i]) { printf OUTPUT "0x%02x, ", $i-1; $on = 0; }
        }
        else
        {
            if ($list[$i]) { printf OUTPUT "0x%02x, ", $i; $on = 1; }
        }
    }
    if ($on) { printf OUTPUT "0xff, "; }
    printf OUTPUT "0x00, 0x00 }\n";
}


################################################################
# dump the case mapping tables
#
# Writes casemap.c.new from the global lower/upper/digit/compat tables and
# hands the result to save_file.
sub DUMP_CASE_MAPPINGS
{
    open OUTPUT, '>', "casemap.c.new" or die "Cannot create casemap.c: $!";
    printf "Building casemap.c\n";
    printf OUTPUT "/* Unicode case mappings */\n";
    printf OUTPUT "/* Automatically generated; DO NOT EDIT!! */\n\n";
    printf OUTPUT "#include \"wine/unicode.h\"\n\n";

    DUMP_CASE_TABLE( "wine_casemap_lower", @tolower_table );
    DUMP_CASE_TABLE( "wine_casemap_upper", @toupper_table );
    DUMP_CASE_TABLE( "wine_digitmap", @digitmap_table );
    DUMP_CASE_TABLE( "wine_compatmap", @compatmap_table );
    close(OUTPUT) or die "Cannot write casemap.c: $!";
    save_file("casemap.c");
}


################################################################
# dump a case mapping table
#
# Arguments: the C identifier of the table, then the mapping indexed by
# code point.  Prints to the already opened OUTPUT handle a 256-entry
# index, a 256-entry block of defaults and one block per populated
# 256-char row; each stored value is the mapped char minus the source
# char, modulo 65536.
sub DUMP_CASE_TABLE
{
    my ($name,@table) = @_;
    my $i;

    # count the number of sub tables that contain something
    # also compute the low and upper populated bounds

    my @lowerbounds = ( 0, 0 );
    my @upperbounds = ( 0, 255 );
    my $index = 0;
    my @filled = ();
    for ($i = 0; $i < 65536; $i++)
    {
        next unless defined $table[$i];
        if (!defined $filled[$i >> 8])
        {
            $lowerbounds[$index] = $i & 0xff;
            $upperbounds[$index] = 0xff - $lowerbounds[$index];
            $filled[$i >> 8] = $index * 256 + 512;
            $index++;
        }
        else
        {
            $upperbounds[$index-1] = 0xff - ($i & 0xff);
        }
        $table[$i] = ($table[$i] - $i) & 0xffff;
    }

    # Collapse blocks upwards if possible
    # NOTE(review): on the first populated row $index is 0, so
    # $upperbounds[$index - 1] reads the LAST element of @upperbounds.
    # Left untouched because it determines the generated layout -- confirm
    # that this is intended.
    my $removed = 0;
    $index = 0;
    for ($i = 0; $i < 256; $i++)
    {
        next unless defined $filled[$i];
        if ($upperbounds[$index - 1] > $lowerbounds[$index])
        {
            $removed = $removed + $lowerbounds[$index];
        }
        else
        {
            $removed = $removed + $upperbounds[$index - 1];
            $lowerbounds[$index] = $upperbounds[$index - 1];
        }
        $filled[$i] = $filled[$i] - $removed;
        $index++;
    }

    # dump the table

    printf OUTPUT "const WCHAR %s[%d] =\n", $name, $index * 256 + 512 - $removed;
    printf OUTPUT "{\n /* index */\n";
    # Always emit all 256 index entries: @filled stops at the highest
    # populated row, and the declared size above assumes a full index.
    printf OUTPUT "%s,\n", DUMP_ARRAY( "0x%04x", 256, @filled[0 .. 255] );
    printf OUTPUT " /* defaults */\n";
    printf OUTPUT "%s", DUMP_ARRAY( "0x%04x", 0, (0) x 256 );
    $index = 0;
    for ($i = 0; $i < 256; $i++)
    {
        next unless $filled[$i];
        printf OUTPUT ",\n /* 0x%02x%02x .. 0x%02xff */\n", $i, $lowerbounds[$index], $i;
        printf OUTPUT "%s", DUMP_ARRAY( "0x%04x", 0,
                                        @table[($i<<8) + $lowerbounds[$index] .. ($i<<8)+255] );
        $index++;
    }
    printf OUTPUT "\n};\n";
}


################################################################
# dump the ctype tables
#
# Writes wctype.c.new from the global @category_table and @direction_table
# and hands the result to save_file.  Identical 256-entry rows are stored
# only once.  Note that @category_table is modified in place.
sub DUMP_CTYPE_TABLES
{
    open OUTPUT, '>', "wctype.c.new" or die "Cannot create wctype.c: $!";
    printf "Building wctype.c\n";
    printf OUTPUT "/* Unicode ctype tables */\n";
    printf OUTPUT "/* Automatically generated; DO NOT EDIT!! */\n\n";
    printf OUTPUT "#include \"wine/unicode.h\"\n\n";

    # the first 256 entries are the per-row offsets, the rows follow
    my @array = (0) x 256;

    # add the direction in the high 4 bits of the category
    for (my $i = 0; $i < 65536; $i++)
    {
        $category_table[$i] |= $direction_table[$i] << 12;
    }

    # try to merge table rows
    # (%sequences is deliberately left as a package global: code outside
    # this routine may share it)
    for (my $row = 0; $row < 256; $row++)
    {
        my $rowtxt = sprintf "%04x" x 256, @category_table[($row<<8)..($row<<8)+255];
        if (defined($sequences{$rowtxt}))
        {
            # reuse an existing row
            $array[$row] = $sequences{$rowtxt};
        }
        else
        {
            # create a new row
            $sequences{$rowtxt} = $array[$row] = $#array + 1;
            push @array, @category_table[($row<<8)..($row<<8)+255];
        }
    }

    printf OUTPUT "const unsigned short wine_wctype_table[%d] =\n{\n", $#array+1;
    printf OUTPUT " /* offsets */\n%s,\n", DUMP_ARRAY( "0x%04x", 0, @array[0..255] );
    printf OUTPUT " /* values */\n%s\n};\n", DUMP_ARRAY( "0x%04x", 0, @array[256..$#array] );

    close(OUTPUT) or die "Cannot write wctype.c: $!";
    save_file("wctype.c");
}


################################################################
# dump the char composition tables
sub DUMP_COMPOSE_TABLES
{
    open OUTPUT,">compose.c.new" or die "Cannot create compose.c";
    printf "Building compose.c\n";
    printf OUTPUT "/* Unicode char composition
*/\n"; 1071 printf OUTPUT "/* Automatically generated; DO NOT EDIT!! */\n\n"; 1072 printf OUTPUT "#include \"wine/unicode.h\"\n\n"; 1073 1074 ######### composition table 1075 1076 my @filled = (); 1077 foreach $i (@compose_table) 1078 { 1079 my @comp = @$i; 1080 push @{$filled[$comp[1]]}, [ $comp[0], $comp[2] ]; 1081 } 1082 1083 # count how many different second chars we have 1084 1085 for ($i = $count = 0; $i < 65536; $i++) 1086 { 1087 next unless defined $filled[$i]; 1088 $count++; 1089 } 1090 1091 # build the table of second chars and offsets 1092 1093 my $pos = $count + 1; 1094 for ($i = 0; $i < 65536; $i++) 1095 { 1096 next unless defined $filled[$i]; 1097 push @table, $i, $pos; 1098 $pos += @{$filled[$i]}; 1099 } 1100 # terminator with last position 1101 push @table, 0, $pos; 1102 printf OUTPUT "const WCHAR unicode_compose_table[0x%x] =\n{\n", 2*$pos; 1103 printf OUTPUT " /* second chars + offsets */\n%s", DUMP_ARRAY( "0x%04x", 0, @table ); 1104 1105 # build the table of first chars and mappings 1106 1107 for ($i = 0; $i < 65536; $i++) 1108 { 1109 next unless defined $filled[$i]; 1110 my @table = (); 1111 my @list = sort { $a->[0] <=> $b->[0] } @{$filled[$i]}; 1112 for ($j = 0; $j <= $#list; $j++) 1113 { 1114 push @table, $list[$j][0], $list[$j][1]; 1115 } 1116 printf OUTPUT ",\n /* 0x%04x */\n%s", $i, DUMP_ARRAY( "0x%04x", 0, @table ); 1117 } 1118 printf OUTPUT "\n};\n\nconst unsigned int unicode_compose_table_size = %d;\n\n", $count; 1119 1120 ######### decomposition table 1121 1122 # first determine all the 16-char subsets that contain something 1123 1124 my @filled = (0) x 4096; 1125 my $pos = 16*2; # for the null subset 1126 for ($i = 0; $i < 65536; $i++) 1127 { 1128 next unless defined $decomp_table[$i]; 1129 $filled[$i >> 4] = $pos; 1130 $pos += 16*2; 1131 $i |= 15; 1132 } 1133 my $total = $pos; 1134 1135 # now count the 256-char subsets that contain something 1136 1137 my @filled_idx = (256) x 256; 1138 $pos = 256 + 16; 1139 for ($i = 0; $i < 4096; 
$i++) 1140 { 1141 next unless $filled[$i]; 1142 $filled_idx[$i >> 4] = $pos; 1143 $pos += 16; 1144 $i |= 15; 1145 } 1146 my $null_offset = $pos; # null mapping 1147 $total += $pos; 1148 1149 # add the index offsets to the subsets positions 1150 1151 for ($i = 0; $i < 4096; $i++) 1152 { 1153 next unless $filled[$i]; 1154 $filled[$i] += $null_offset; 1155 } 1156 1157 # dump the main index 1158 1159 printf OUTPUT "const WCHAR unicode_decompose_table[%d] =\n", $total; 1160 printf OUTPUT "{\n /* index */\n"; 1161 printf OUTPUT "%s", DUMP_ARRAY( "0x%04x", 0, @filled_idx ); 1162 printf OUTPUT ",\n /* null sub-index */\n%s", DUMP_ARRAY( "0x%04x", 0, ($null_offset) x 16 ); 1163 1164 # dump the second-level indexes 1165 1166 for ($i = 0; $i < 256; $i++) 1167 { 1168 next unless ($filled_idx[$i] > 256); 1169 my @table = @filled[($i<<4)..($i<<4)+15]; 1170 for ($j = 0; $j < 16; $j++) { $table[$j] ||= $null_offset; } 1171 printf OUTPUT ",\n /* sub-index %02x */\n", $i; 1172 printf OUTPUT "%s", DUMP_ARRAY( "0x%04x", 0, @table ); 1173 } 1174 1175 # dump the 16-char subsets 1176 1177 printf OUTPUT ",\n /* null mapping */\n"; 1178 printf OUTPUT "%s", DUMP_ARRAY( "0x%04x", 0, (0) x 32 ); 1179 1180 for ($i = 0; $i < 4096; $i++) 1181 { 1182 next unless $filled[$i]; 1183 my @table = (0) x 32; 1184 for ($j = 0; $j < 16; $j++) 1185 { 1186 if (defined $decomp_table[($i<<4) + $j]) 1187 { 1188 $table[2 * $j] = ${$decomp_table[($i << 4) + $j]}[0]; 1189 $table[2 * $j + 1] = ${$decomp_table[($i << 4) + $j]}[1]; 1190 } 1191 } 1192 printf OUTPUT ",\n /* 0x%03x0 .. 
0x%03xf */\n", $i, $i; 1193 printf OUTPUT "%s", DUMP_ARRAY( "0x%04x", 0, @table ); 1194 } 1195 1196 printf OUTPUT "\n};\n"; 1197 close OUTPUT; 1198 save_file("compose.c"); 1199} 1200 1201 1202################################################################ 1203# read an input file and generate the corresponding .c file 1204sub HANDLE_FILE 1205{ 1206 my ($codepage,$filename,$has_glyphs,$comment) = @_; 1207 1208 @cp2uni = (); 1209 @lead_bytes = (); 1210 @uni2cp = (); 1211 1212 # symbol codepage file is special 1213 if ($codepage == 20932) { READ_JIS0208_FILE($MAPPREFIX . $filename); } 1214 elsif ($codepage == 20127) { fill_20127_codepage(); } 1215 else { READ_FILE($MAPPREFIX . $filename); } 1216 1217 # hack: 0x00a5 must map to backslash in Shift-JIS 1218 if ($codepage == 932) { $uni2cp[0x00a5] = 0x5c; } 1219 1220 ADD_DEFAULT_MAPPINGS(); 1221 1222 my $output = sprintf "c_%03d.c", $codepage; 1223 open OUTPUT,">$output.new" or die "Cannot create $output"; 1224 1225 printf "Building %s from %s (%s)\n", $output, $filename || "hardcoded data", $comment; 1226 1227 # dump all tables 1228 1229 printf OUTPUT "/* code page %03d (%s) */\n", $codepage, $comment; 1230 if ($filename) 1231 { 1232 printf OUTPUT "/* generated from %s */\n", $MAPPREFIX . $filename; 1233 printf OUTPUT "/* DO NOT EDIT!! */\n\n"; 1234 } 1235 else 1236 { 1237 printf OUTPUT "/* Automatically generated; DO NOT EDIT!! 
*/\n\n"; 1238 } 1239 printf OUTPUT "#include \"wine/unicode.h\"\n\n"; 1240 1241 if ($#lead_bytes == -1) { DUMP_SBCS_TABLE( $codepage, $has_glyphs, $comment ); } 1242 else { DUMP_DBCS_TABLE( $codepage, $comment ); } 1243 close OUTPUT; 1244 save_file($output); 1245} 1246 1247 1248################################################################ 1249# save a file if modified 1250sub save_file($) 1251{ 1252 my $file = shift; 1253 if (-f $file && !system "cmp $file $file.new >/dev/null") 1254 { 1255 unlink "$file.new"; 1256 } 1257 else 1258 { 1259 rename "$file.new", "$file"; 1260 } 1261} 1262 1263 1264################################################################ 1265# output the list of codepage tables into the cptable.c file 1266sub OUTPUT_CPTABLE 1267{ 1268 @tables_decl = (); 1269 1270 foreach $file (@allfiles) 1271 { 1272 my ($codepage,$filename,$comment) = @$file; 1273 push @tables_decl, sprintf("extern union cptable cptable_%03d;\n",$codepage); 1274 } 1275 1276 push @tables_decl, sprintf("\nstatic const union cptable * const cptables[%d] =\n{\n",$#allfiles+1); 1277 foreach $file (@allfiles) 1278 { 1279 my ($codepage,$filename,$comment) = @$file; 1280 push @tables_decl, sprintf(" &cptable_%03d,\n", $codepage); 1281 } 1282 push @tables_decl, "};"; 1283 REPLACE_IN_FILE( "cptable.c", @tables_decl ); 1284} 1285 1286################################################################ 1287# replace the contents of a file between ### cpmap ### marks 1288 1289sub REPLACE_IN_FILE 1290{ 1291 my $name = shift; 1292 my @data = @_; 1293 my @lines = (); 1294 open(FILE,$name) or die "Can't open $name"; 1295 while (<FILE>) 1296 { 1297 push @lines, $_; 1298 last if /\#\#\# cpmap begin \#\#\#/; 1299 } 1300 push @lines, @data; 1301 while (<FILE>) 1302 { 1303 if (/\#\#\# cpmap end \#\#\#/) { push @lines, "\n", $_; last; } 1304 } 1305 push @lines, <FILE>; 1306 open(FILE,">$name.new") or die "Can't modify $name"; 1307 print FILE @lines; 1308 close(FILE); 1309 save_file($name); 1310} 1311