xref: /reactos/sdk/tools/unicode/cpmap.pl (revision c2c66aff)
1#!/usr/bin/perl
2#
3# Generate code page .c files from ftp.unicode.org descriptions
4#
5# Copyright 2000 Alexandre Julliard
6#
7# This library is free software; you can redistribute it and/or
8# modify it under the terms of the GNU Lesser General Public
9# License as published by the Free Software Foundation; either
10# version 2.1 of the License, or (at your option) any later version.
11#
12# This library is distributed in the hope that it will be useful,
13# but WITHOUT ANY WARRANTY; without even the implied warranty of
14# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15# Lesser General Public License for more details.
16#
17# You should have received a copy of the GNU Lesser General Public
18# License along with this library; if not, write to the Free Software
19# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA
20#
21
# base directory for ftp.unicode.org files
$BASEDIR = "ftp.unicode.org/Public/";

# directory holding the code page mapping files
$MAPPREFIX = "${BASEDIR}MAPPINGS/";

# UnicodeData file
$UNICODEDATA = "${BASEDIR}UNIDATA/UnicodeData.txt";

# Sort keys file
$SORTKEYS = "www.unicode.org/reports/tr10/allkeys.txt";

# Defaults mapping
$DEFAULTS = "./defaults";

# Default char for undefined mappings
$DEF_CHAR = ord('?');
37
# One entry per generated code page.  Each entry is passed as an argument
# list to HANDLE_FILE by the main routine:
#   [ code page number, mapping file (undef when there is none),
#     flag, descriptive name ]
# NOTE(review): the flag is presumably the "has glyphs" switch that ends up
# in DUMP_SBCS_TABLE, and the file name presumably relative to $MAPPREFIX;
# confirm against HANDLE_FILE.
@allfiles =
(
    [ 37,    'VENDORS/MICSFT/EBCDIC/CP037.TXT',   0, 'IBM EBCDIC US Canada' ],
    [ 424,   'VENDORS/MISC/CP424.TXT',            0, 'IBM EBCDIC Hebrew' ],
    [ 437,   'VENDORS/MICSFT/PC/CP437.TXT',       1, 'OEM United States' ],
    [ 500,   'VENDORS/MICSFT/EBCDIC/CP500.TXT',   0, 'IBM EBCDIC International' ],
    [ 737,   'VENDORS/MICSFT/PC/CP737.TXT',       1, 'OEM Greek 437G' ],
    [ 775,   'VENDORS/MICSFT/PC/CP775.TXT',       1, 'OEM Baltic' ],
    [ 850,   'VENDORS/MICSFT/PC/CP850.TXT',       1, 'OEM Multilingual Latin 1' ],
    [ 852,   'VENDORS/MICSFT/PC/CP852.TXT',       1, 'OEM Slovak Latin 2' ],
    [ 855,   'VENDORS/MICSFT/PC/CP855.TXT',       1, 'OEM Cyrillic' ],
    [ 856,   'VENDORS/MISC/CP856.TXT',            0, 'Hebrew PC' ],
    [ 857,   'VENDORS/MICSFT/PC/CP857.TXT',       1, 'OEM Turkish' ],
    [ 860,   'VENDORS/MICSFT/PC/CP860.TXT',       1, 'OEM Portuguese' ],
    [ 861,   'VENDORS/MICSFT/PC/CP861.TXT',       1, 'OEM Icelandic' ],
    [ 862,   'VENDORS/MICSFT/PC/CP862.TXT',       1, 'OEM Hebrew' ],
    [ 863,   'VENDORS/MICSFT/PC/CP863.TXT',       1, 'OEM Canadian French' ],
    [ 864,   'VENDORS/MICSFT/PC/CP864.TXT',       0, 'OEM Arabic' ],
    [ 865,   'VENDORS/MICSFT/PC/CP865.TXT',       1, 'OEM Nordic' ],
    [ 866,   'VENDORS/MICSFT/PC/CP866.TXT',       1, 'OEM Russian' ],
    [ 869,   'VENDORS/MICSFT/PC/CP869.TXT',       1, 'OEM Greek' ],
    [ 874,   'VENDORS/MICSFT/PC/CP874.TXT',       1, 'ANSI/OEM Thai' ],
    [ 875,   'VENDORS/MICSFT/EBCDIC/CP875.TXT',   0, 'IBM EBCDIC Greek' ],
    [ 878,   'VENDORS/MISC/KOI8-R.TXT',           0, 'Russian KOI8' ],
    [ 932,   'VENDORS/MICSFT/WINDOWS/CP932.TXT',  0, 'ANSI/OEM Japanese Shift-JIS' ],
    [ 936,   'VENDORS/MICSFT/WINDOWS/CP936.TXT',  0, 'ANSI/OEM Simplified Chinese GBK' ],
    [ 949,   'VENDORS/MICSFT/WINDOWS/CP949.TXT',  0, 'ANSI/OEM Korean Unified Hangul' ],
    [ 950,   'VENDORS/MICSFT/WINDOWS/CP950.TXT',  0, 'ANSI/OEM Traditional Chinese Big5' ],
    [ 1006,  'VENDORS/MISC/CP1006.TXT',           0, 'IBM Arabic' ],
    [ 1026,  'VENDORS/MICSFT/EBCDIC/CP1026.TXT',  0, 'IBM EBCDIC Latin 5 Turkish' ],
    [ 1250,  'VENDORS/MICSFT/WINDOWS/CP1250.TXT', 0, 'ANSI Eastern Europe' ],
    [ 1251,  'VENDORS/MICSFT/WINDOWS/CP1251.TXT', 0, 'ANSI Cyrillic' ],
    [ 1252,  'VENDORS/MICSFT/WINDOWS/CP1252.TXT', 0, 'ANSI Latin 1' ],
    [ 1253,  'VENDORS/MICSFT/WINDOWS/CP1253.TXT', 0, 'ANSI Greek' ],
    [ 1254,  'VENDORS/MICSFT/WINDOWS/CP1254.TXT', 0, 'ANSI Turkish' ],
    [ 1255,  'VENDORS/MICSFT/WINDOWS/CP1255.TXT', 0, 'ANSI Hebrew' ],
    [ 1256,  'VENDORS/MICSFT/WINDOWS/CP1256.TXT', 0, 'ANSI Arabic' ],
    [ 1257,  'VENDORS/MICSFT/WINDOWS/CP1257.TXT', 0, 'ANSI Baltic' ],
    [ 1258,  'VENDORS/MICSFT/WINDOWS/CP1258.TXT', 0, 'ANSI/OEM Viet Nam' ],
    [ 1361,  'OBSOLETE/EASTASIA/KSC/JOHAB.TXT',   0, 'Korean Johab' ],
    [ 10000, 'VENDORS/MICSFT/MAC/ROMAN.TXT',      0, 'Mac Roman' ],
    [ 10006, 'VENDORS/MICSFT/MAC/GREEK.TXT',      0, 'Mac Greek' ],
    [ 10007, 'VENDORS/MICSFT/MAC/CYRILLIC.TXT',   0, 'Mac Cyrillic' ],
    [ 10029, 'VENDORS/MICSFT/MAC/LATIN2.TXT',     0, 'Mac Latin 2' ],
    [ 10079, 'VENDORS/MICSFT/MAC/ICELAND.TXT',    0, 'Mac Icelandic' ],
    [ 10081, 'VENDORS/MICSFT/MAC/TURKISH.TXT',    0, 'Mac Turkish' ],
    [ 20127, undef,                               0, 'US-ASCII (7bit)' ],
    [ 20866, 'VENDORS/MISC/KOI8-R.TXT',           0, 'Russian KOI8' ],
    [ 20932, 'OBSOLETE/EASTASIA/JIS/JIS0208.TXT', 0, 'EUC-JP' ],
    [ 21866, 'VENDORS/MISC/KOI8-U.TXT',           0, 'Ukrainian KOI8' ],
    [ 28591, 'ISO8859/8859-1.TXT',                0, 'ISO 8859-1 Latin 1' ],
    [ 28592, 'ISO8859/8859-2.TXT',                0, 'ISO 8859-2 Latin 2 (East European)' ],
    [ 28593, 'ISO8859/8859-3.TXT',                0, 'ISO 8859-3 Latin 3 (South European)' ],
    [ 28594, 'ISO8859/8859-4.TXT',                0, 'ISO 8859-4 Latin 4 (Baltic old)' ],
    [ 28595, 'ISO8859/8859-5.TXT',                0, 'ISO 8859-5 Cyrillic' ],
    [ 28596, 'ISO8859/8859-6.TXT',                0, 'ISO 8859-6 Arabic' ],
    [ 28597, 'ISO8859/8859-7.TXT',                0, 'ISO 8859-7 Greek' ],
    [ 28598, 'ISO8859/8859-8.TXT',                0, 'ISO 8859-8 Hebrew' ],
    [ 28599, 'ISO8859/8859-9.TXT',                0, 'ISO 8859-9 Latin 5 (Turkish)' ],
    [ 28600, 'ISO8859/8859-10.TXT',               0, 'ISO 8859-10 Latin 6 (Nordic)' ],
    [ 28603, 'ISO8859/8859-13.TXT',               0, 'ISO 8859-13 Latin 7 (Baltic)' ],
    [ 28604, 'ISO8859/8859-14.TXT',               0, 'ISO 8859-14 Latin 8 (Celtic)' ],
    [ 28605, 'ISO8859/8859-15.TXT',               0, 'ISO 8859-15 Latin 9 (Euro)' ],
    [ 28606, 'ISO8859/8859-16.TXT',               0, 'ISO 8859-16 Latin 10 (Balkan)' ]
);
103
104
# character type flags, one bit per class, OR-ed together in @category_table
%ctype =
(
    upper  => 0x0001,
    lower  => 0x0002,
    digit  => 0x0004,
    space  => 0x0008,
    punct  => 0x0010,
    cntrl  => 0x0020,
    blank  => 0x0040,
    xdigit => 0x0080,
    alpha  => 0x0100
);
117
# map each Unicode general category to the %ctype flags given to its characters
%categories =
(
    # letters
    Lu => $ctype{alpha} | $ctype{upper},  # Letter, Uppercase
    Ll => $ctype{alpha} | $ctype{lower},  # Letter, Lowercase
    Lt => $ctype{alpha},                  # Letter, Titlecase
    Lm => $ctype{punct},                  # Letter, Modifier
    Lo => $ctype{alpha},                  # Letter, Other
    # marks
    Mn => $ctype{punct},                  # Mark, Non-Spacing
    Mc => $ctype{punct},                  # Mark, Spacing Combining
    Me => $ctype{punct},                  # Mark, Enclosing
    # numbers
    Nd => $ctype{digit},                  # Number, Decimal Digit
    Nl => $ctype{punct},                  # Number, Letter
    No => $ctype{punct},                  # Number, Other
    # separators
    Zs => $ctype{space},                  # Separator, Space
    Zl => $ctype{space},                  # Separator, Line
    Zp => $ctype{space},                  # Separator, Paragraph
    # others
    Cc => $ctype{cntrl},                  # Other, Control
    Cf => 0,                              # Other, Format
    Cs => 0,                              # Other, Surrogate
    Co => 0,                              # Other, Private Use
    Cn => 0,                              # Other, Not Assigned
    # punctuation
    Pc => $ctype{punct},                  # Punctuation, Connector
    Pd => $ctype{punct},                  # Punctuation, Dash
    Ps => $ctype{punct},                  # Punctuation, Open
    Pe => $ctype{punct},                  # Punctuation, Close
    Pi => $ctype{punct},                  # Punctuation, Initial quote
    Pf => $ctype{punct},                  # Punctuation, Final quote
    Po => $ctype{punct},                  # Punctuation, Other
    # symbols
    Sm => $ctype{punct},                  # Symbol, Math
    Sc => $ctype{punct},                  # Symbol, Currency
    Sk => $ctype{punct},                  # Symbol, Modifier
    So => $ctype{punct}                   # Symbol, Other
);
151
# a few characters need additional categories that cannot be determined automatically
# (keys are %ctype flag names, values the code points that get the flag)
%special_categories =
(
    xdigit => [ 0x30 .. 0x39,      # '0' .. '9'
                0x41 .. 0x46,      # 'A' .. 'F'
                0x61 .. 0x66,      # 'a' .. 'f'
                0xff10 .. 0xff19, 0xff21 .. 0xff26, 0xff41 .. 0xff46 ],
    space  => [ 0x09 .. 0x0d, 0x85 ],
    blank  => [ 0x09, 0x20, 0xa0, 0x3000, 0xfeff ],
    cntrl  => [ 0x070f,
                0x180b .. 0x180e,
                0x200c .. 0x200f,
                0x202a .. 0x202e,
                0x206a .. 0x206f,
                0xfeff,
                0xfff9 .. 0xfffb ]
);
164
# map each bidirectional class name from UnicodeData to the numeric value
# stored in @direction_table
%directions =
(
    # strong types
    'L'   => 1,    # Left-to-Right
    'R'   => 2,    # Right-to-Left
    'AL'  => 2,    # Right-to-Left Arabic
    # number types
    'EN'  => 3,    # European Number
    'ES'  => 4,    # European Number Separator
    'ET'  => 5,    # European Number Terminator
    'AN'  => 6,    # Arabic Number
    'CS'  => 7,    # Common Number Separator
    # separators and whitespace
    'B'   => 8,    # Paragraph Separator
    'S'   => 9,    # Segment Separator
    'WS'  => 10,   # Whitespace
    # explicit formatting codes share the value of the other neutrals
    'LRE' => 11,   # Left-to-Right Embedding
    'LRO' => 11,   # Left-to-Right Override
    'RLE' => 11,   # Right-to-Left Embedding
    'RLO' => 11,   # Right-to-Left Override
    'PDF' => 11,   # Pop Directional Format
    'ON'  => 11,   # Other Neutrals
    # classes that get no direction value
    'NSM' => 0,    # Non-Spacing Mark
    'BN'  => 0     # Boundary Neutral
);
187
188
################################################################
# main routine
#
# Load the input data, write the tables that do not depend on a code
# page, then process every entry of @allfiles and write the code page
# table.  The order of the calls is significant and must be kept.

READ_DEFAULTS();
my @sortkeys = READ_SORTKEYS_FILE();
DUMP_CASE_MAPPINGS();
DUMP_SORTKEYS(@sortkeys);
DUMP_COMPOSE_TABLES();
DUMP_CTYPE_TABLES();

# each entry of @allfiles is an array ref holding the argument list
foreach $file (@allfiles)
{
    HANDLE_FILE( @{$file} );
}

OUTPUT_CPTABLE();

exit 0;
204
205
################################################################
# read in the defaults file
#
# Takes no arguments.  Resets and fills the global tables:
#   @unicode_defaults, @unicode_aliases        from the $DEFAULTS file
#   @uniname, @category_table, @direction_table, @tolower_table,
#   @toupper_table, @digitmap_table, @compatmap_table, @decomp_table,
#   @compose_table (and more @unicode_defaults) from the $UNICODEDATA file
# and finally ORs the %special_categories flags into @category_table.
# Dies if a file cannot be opened, on an unrecognized line, an unknown
# category or directionality, a duplicate default or a mapping loop.
sub READ_DEFAULTS
{
    @unicode_defaults = ();
    @unicode_aliases = ();
    @tolower_table = ();
    @toupper_table = ();
    @digitmap_table = ();
    @compatmap_table = ();
    @category_table = ();
    @direction_table = ();
    @decomp_table = ();
    @compose_table = ();

    # first setup a few default mappings

    open my $defaults_fh, '<', $DEFAULTS or die "Cannot open $DEFAULTS: $!";
    print "Loading $DEFAULTS\n";
    while (<$defaults_fh>)
    {
        next if /^\#/;  # skip comments
        next if /^$/;  # skip empty lines
        if (/^(([0-9a-fA-F]+)(,[0-9a-fA-F]+)*)\s+([0-9a-fA-F]+|'.'|none)\s+(\#.*)?/)
        {
            my @src = map hex, split /,/,$1;
            my $dst = $4;
            my $comment = $5;
            # several sources on one line form an alias set
            if ($#src > 0) { push @unicode_aliases, \@src; }
            next if ($dst eq "none");
            # the target is either a quoted character or a hex value
            $dst = ($dst =~ /\'.\'/) ? ord substr($dst,1,1) : hex $dst;
            foreach $src (@src)
            {
                die "Duplicate value" if defined($unicode_defaults[$src]);
                $unicode_defaults[$src] = $dst;
            }
            next;
        }
        die "Unrecognized line $_\n";
    }
    close $defaults_fh;

    # now build mappings from the decomposition field of the Unicode database

    open my $unicodedata_fh, '<', $UNICODEDATA or die "Cannot open $UNICODEDATA: $!";
    print "Loading $UNICODEDATA\n";
    while (<$unicodedata_fh>)
    {
        # Decode the fields ...
        ($code, $name, $cat, $comb, $bidi,
         $decomp, $dec, $dig, $num, $mirror,
         $oldname, $comment, $upper, $lower, $title) = split /;/;

        my $src = hex $code;

        die "unknown category $cat" unless defined $categories{$cat};
        die "unknown directionality $bidi" unless defined $directions{$bidi};

        $uniname[$src] = $name;
        $category_table[$src] = $categories{$cat};
        $direction_table[$src] = $directions{$bidi};

        # a char with a lowercase mapping is flagged as an uppercase letter
        if ($lower ne "")
        {
            $tolower_table[$src] = hex $lower;
            $category_table[$src] |= $ctype{"upper"}|$ctype{"alpha"};
        }
        # a char with an uppercase mapping is flagged as a lowercase letter
        if ($upper ne "")
        {
            $toupper_table[$src] = hex $upper;
            $category_table[$src] |= $ctype{"lower"}|$ctype{"alpha"};
        }
        if ($dec ne "")
        {
            $category_table[$src] |= $ctype{"digit"};
        }
        if ($dig ne "")
        {
            # stores the character code of the digit field's first char
            $digitmap_table[$src] = ord $dig;
        }

        # copy the category and direction for everything between First/Last pairs
        if ($name =~ /, First>/) { $start = $src; }
        if ($name =~ /, Last>/)
        {
            while ($start < $src)
            {
                $category_table[$start] = $category_table[$src];
                $direction_table[$start] = $direction_table[$src];
                $start++;
            }
        }

        next if $decomp eq "";  # no decomposition, skip it

        if ($decomp =~ /^<([a-zA-Z]+)>\s+([0-9a-fA-F]+)$/)
        {
            # decomposition of the form "<foo> 1234" -> use char if type is known
            if (($src >= 0xf900 && $src < 0xfb00) || ($src >= 0xfe30 && $src < 0xfffd))
            {
                # Single char decomposition in the compatibility range
                $compatmap_table[$src] = hex $2;
            }
            next unless ($1 eq "font" ||
                         $1 eq "noBreak" ||
                         $1 eq "circle" ||
                         $1 eq "super" ||
                         $1 eq "sub" ||
                         $1 eq "wide" ||
                         $1 eq "narrow" ||
                         $1 eq "compat" ||
                         $1 eq "small");
            $dst = hex $2;
        }
        elsif ($decomp =~ /^<compat>\s+0020\s+([0-9a-fA-F]+)/)
        {
            # decomposition "<compat> 0020 1234" -> combining accent
            $dst = hex $1;
        }
        elsif ($decomp =~ /^([0-9a-fA-F]+)/)
        {
            # decomposition contains only char values without prefix -> use first char
            $dst = hex $1;
            $category_table[$src] |= $category_table[$dst];
            # store decomposition if it contains two chars
            if ($decomp =~ /^([0-9a-fA-F]+)\s+([0-9a-fA-F]+)$/)
            {
                $decomp_table[$src] = [ hex $1, hex $2 ];
                push @compose_table, [ hex $1, hex $2, $src ];
            }
            elsif ($decomp =~ /^(<[a-z]+>\s)*([0-9a-fA-F]+)$/ &&
                   (($src >= 0xf900 && $src < 0xfb00) || ($src >= 0xfe30 && $src < 0xfffd)))
            {
                # Single char decomposition in the compatibility range
                $compatmap_table[$src] = hex $2;
            }
        }
        else
        {
            next;
        }

        next if defined($unicode_defaults[$src]);  # may have been set in the defaults file

        # check for loops: follow the chain of defaults starting at $dst and
        # make sure it never leads back to $src
        for ($i = $dst; ; $i = $unicode_defaults[$i])
        {
            die sprintf("loop detected for %04x -> %04x",$src,$dst) if $i == $src;
            last unless defined($unicode_defaults[$i]);
        }
        $unicode_defaults[$src] = $dst;
    }
    close $unicodedata_fh;

    # patch the category of some special characters

    foreach $cat (keys %special_categories)
    {
        my $flag = $ctype{$cat};
        foreach $i (@{$special_categories{$cat}}) { $category_table[$i] |= $flag; }
    }
}
366
367
################################################################
# parse the input file
#
# Argument: name of a code page mapping file.
# Fills the global tables @cp2uni (code page value -> Unicode),
# @uni2cp (Unicode -> code page value) and @lead_bytes (DBCS lead
# bytes).  The first mapping seen for a value wins.  Dies if the file
# cannot be opened or contains a line that is not recognized.
sub READ_FILE
{
    my $name = shift;

    # three-argument open: characters in $name are never taken as an open mode
    open my $input, '<', $name or die "Cannot open $name: $!";

    while (<$input>)
    {
        next if /^\#/;  # skip comments
        next if /^$/;  # skip empty lines
        next if /\x1a/;  # skip ^Z
        next if (/^0x([0-9a-fA-F]+)\s+\#UNDEFINED/);  # undefined char

        if (/^0x([0-9a-fA-F]+)\s+\#DBCS LEAD BYTE/)
        {
            $cp = hex $1;
            push @lead_bytes,$cp;
            $cp2uni[$cp] = 0;
            next;
        }
        if (/^0x([0-9a-fA-F]+)\s+0x([0-9a-fA-F]+)\s+(\#.*)?/)
        {
            $cp = hex $1;
            $uni = hex $2;
            $cp2uni[$cp] = $uni unless defined($cp2uni[$cp]);
            $uni2cp[$uni] = $cp unless defined($uni2cp[$uni]);
            # a double byte value implies its lead byte even when the file
            # has no explicit "DBCS LEAD BYTE" line for it
            if ($cp > 0xff && !defined($cp2uni[$cp >> 8]))
            {
                push @lead_bytes,$cp >> 8;
                $cp2uni[$cp >> 8] = 0;
            }
            next;
        }
        die "$name: Unrecognized line $_\n";
    }
    close $input;
}
405
406
################################################################
# fill input data for the 20127 (us-ascii) codepage
#
# Values 0..127 map to themselves in both @cp2uni and @uni2cp; values
# 128..255 get a @cp2uni entry with the top bit cleared and no @uni2cp entry.
sub fill_20127_codepage()
{
    foreach my $ch (0 .. 127)
    {
        $uni2cp[$ch] = $ch;
        $cp2uni[$ch] = $ch;
    }
    foreach my $ch (128 .. 255)
    {
        $cp2uni[$ch] = $ch & 0x7f;
    }
}
414
################################################################
# get a mapping including glyph chars for MB_USEGLYPHCHARS
#
# Arguments: the list of Unicode values of a code page table, indexed
# by code page value.
# Returns a copy of that list in which entries 0x01..0x1f and 0x7f are
# replaced by the glyph characters listed below.  The arguments
# themselves are left untouched.

sub get_glyphs_mapping(@)
{
    # Work on a copy: the elements of @_ alias the caller's variables, and
    # the caller passes a slice of the live @cp2uni table, which must not
    # be modified by building the glyph table.
    my @table = @_;

    $table[0x01] = 0x263a;  # (WHITE SMILING FACE)
    $table[0x02] = 0x263b;  # (BLACK SMILING FACE)
    $table[0x03] = 0x2665;  # (BLACK HEART SUIT)
    $table[0x04] = 0x2666;  # (BLACK DIAMOND SUIT)
    $table[0x05] = 0x2663;  # (BLACK CLUB SUIT)
    $table[0x06] = 0x2660;  # (BLACK SPADE SUIT)
    $table[0x07] = 0x2022;  # (BULLET)
    $table[0x08] = 0x25d8;  # (INVERSE BULLET)
    $table[0x09] = 0x25cb;  # (WHITE CIRCLE)
    $table[0x0a] = 0x25d9;  # (INVERSE WHITE CIRCLE)
    $table[0x0b] = 0x2642;  # (MALE SIGN)
    $table[0x0c] = 0x2640;  # (FEMALE SIGN)
    $table[0x0d] = 0x266a;  # (EIGHTH NOTE)
    $table[0x0e] = 0x266b;  # (BEAMED EIGHTH NOTES)
    $table[0x0f] = 0x263c;  # (WHITE SUN WITH RAYS)
    $table[0x10] = 0x25ba;  # (BLACK RIGHT-POINTING POINTER)
    $table[0x11] = 0x25c4;  # (BLACK LEFT-POINTING POINTER)
    $table[0x12] = 0x2195;  # (UP DOWN ARROW)
    $table[0x13] = 0x203c;  # (DOUBLE EXCLAMATION MARK)
    $table[0x14] = 0x00b6;  # (PILCROW SIGN)
    $table[0x15] = 0x00a7;  # (SECTION SIGN)
    $table[0x16] = 0x25ac;  # (BLACK RECTANGLE)
    $table[0x17] = 0x21a8;  # (UP DOWN ARROW WITH BASE)
    $table[0x18] = 0x2191;  # (UPWARDS ARROW)
    $table[0x19] = 0x2193;  # (DOWNWARDS ARROW)
    $table[0x1a] = 0x2192;  # (RIGHTWARDS ARROW)
    $table[0x1b] = 0x2190;  # (LEFTWARDS ARROW)
    $table[0x1c] = 0x221f;  # (RIGHT ANGLE)
    $table[0x1d] = 0x2194;  # (LEFT RIGHT ARROW)
    $table[0x1e] = 0x25b2;  # (BLACK UP-POINTING TRIANGLE)
    $table[0x1f] = 0x25bc;  # (BLACK DOWN-POINTING TRIANGLE)
    $table[0x7f] = 0x2302;  # (HOUSE)
    return @table;
}
454
################################################################
# build EUC-JP table from the JIS 0208 file
# FIXME: for proper EUC-JP we should probably read JIS 0212 too
# but this would require 3-byte DBCS characters
#
# Argument: name of the JIS 0208 mapping file.
# Fills the global tables @cp2uni, @uni2cp and @lead_bytes.  Entries set
# up before the file is read take precedence over the file contents.
# Dies if the file cannot be opened or a line is not recognized.
sub READ_JIS0208_FILE
{
    my $name = shift;

    # ASCII chars
    for ($i = 0x00; $i <= 0x7f; $i++)
    {
        $cp2uni[$i] = $i;
        $uni2cp[$i] = $i;
    }

    # JIS X 0201 right plane
    for ($i = 0xa1; $i <= 0xdf; $i++)
    {
        $cp2uni[0x8e00 + $i] = 0xfec0 + $i;
        $uni2cp[0xfec0 + $i] = 0x8e00 + $i;
    }

    # lead bytes
    foreach $i (0x8e, 0x8f, 0xa1 .. 0xfe)
    {
        push @lead_bytes,$i;
        $cp2uni[$i] = 0;
    }

    # undefined chars
    foreach $i (0x80 .. 0x8d, 0x90 .. 0xa0, 0xff)
    {
        $cp2uni[$i] = $DEF_CHAR;
    }

    # Shift-JIS compatibility
    $uni2cp[0x00a5] = 0x5c;
    $uni2cp[0x203e] = 0x7e;

    # Fix backslash conversion
    $cp2uni[0xa1c0] = 0xff3c;
    $uni2cp[0xff3c] = 0xa1c0;

    # three-argument open: characters in $name are never taken as an open mode
    open my $input, '<', $name or die "Cannot open $name: $!";
    while (<$input>)
    {
        next if /^\#/;  # skip comments
        next if /^$/;  # skip empty lines
        next if /\x1a/;  # skip ^Z
        # the first column is matched but not used; the second column
        # gives the code page value and the third the Unicode value
        if (/^0x[0-9a-fA-F]+\s+0x([0-9a-fA-F]+)\s+0x([0-9a-fA-F]+)\s+(\#.*)?/)
        {
            # set the high bit of both bytes to get the EUC value
            $cp = 0x8080 + hex $1;
            $uni = hex $2;
            $cp2uni[$cp] = $uni unless defined($cp2uni[$cp]);
            $uni2cp[$uni] = $cp unless defined($uni2cp[$uni]);
            next;
        }
        die "$name: Unrecognized line $_\n";
    }
    close $input;
}
515
516
################################################################
# build the sort keys table
#
# Reads the collation keys from the $SORTKEYS file and returns a list,
# indexed by Unicode value (0..65535), of the keys compressed to 32 bits.
# Characters without a key in the file have no entry (undef).
# Dies if the file cannot be opened, on an unrecognized line, or when
# the secondary/tertiary keys do not fit in the compressed format.
sub READ_SORTKEYS_FILE
{
    # every entry is [ char, key1, key2, key3, key4 ]; char -1 marks "no key"
    my @sortkeys = ();
    for (my $i = 0; $i < 65536; $i++) { $sortkeys[$i] = [ -1, 0, 0, 0, 0 ] };

    open my $input, '<', $SORTKEYS or die "Cannot open $SORTKEYS: $!";
    print "Loading $SORTKEYS\n";
    while (<$input>)
    {
        next if /^\#/;  # skip comments
        next if /^$/;  # skip empty lines
        next if /\x1a/;  # skip ^Z
        next if /^\@version/;  # skip @version header
        if (/^([0-9a-fA-F]+)\s+;\s+\[([*.])([0-9a-fA-F]{4})\.([0-9a-fA-F]{4})\.([0-9a-fA-F]{4})\.([0-9a-fA-F]+)\]/)
        {
            my $uni = hex $1;
            next if $uni > 65535;
            $sortkeys[$uni] = [ $uni, hex $3, hex $4, hex $5, hex $6 ];
            next;
        }
        if (/^([0-9a-fA-F]+\s+)+;\s+\[[*.]([0-9a-fA-F]{4})\.([0-9a-fA-F]{4})\.([0-9a-fA-F]{4})\.([0-9a-fA-F]+)\]/)
        {
            # multiple character sequence, ignored for now
            next;
        }
        die "$SORTKEYS: Unrecognized line $_\n";
    }
    close $input;

    # compress the keys to 32 bit:
    # key 1 to 16 bits, key 2 to 8 bits, key 3 to 4 bits, key 4 to 1 bit

    # Order by the four keys; ties are broken by the character value so that
    # the order is the same on every run (entries with identical keys yield
    # identical compressed values, so the tie order does not change the result).
    @sortkeys = sort { $a->[1] <=> $b->[1] or
                       $a->[2] <=> $b->[2] or
                       $a->[3] <=> $b->[3] or
                       $a->[4] <=> $b->[4] or
                       $a->[0] <=> $b->[0]; } @sortkeys;

    # $n2 / $n3 number the distinct key 2 / key 3 values seen so far under
    # the current key 1 (resp. key 1 and key 2), starting at 1
    my ($n2, $n3) = (1, 1);
    my @keys = (-1, -1, -1, -1, -1 );
    my @flatkeys = ();

    for (my $i = 0; $i < 65536; $i++)
    {
        my @current = @{$sortkeys[$i]};
        next if $current[0] == -1;
        if ($current[1] == $keys[1])
        {
            if ($current[2] == $keys[2])
            {
                if ($current[3] == $keys[3])
                {
                    # nothing
                }
                else
                {
                    $keys[3] = $current[3];
                    $n3++;
                    die "$SORTKEYS: too many distinct third-level keys to fit in 4 bits\n" if ($n3 >= 16);
                }
            }
            else
            {
                $keys[2] = $current[2];
                $keys[3] = $current[3];
                $n2++;
                $n3 = 1;
                die "$SORTKEYS: too many distinct second-level keys to fit in 8 bits\n" if ($n2 >= 256);
            }
        }
        else
        {
            $keys[1] = $current[1];
            $keys[2] = $current[2];
            $keys[3] = $current[3];
            $n2 = 1;
            $n3 = 1;
        }

        # zero keys stay zero, the others are replaced by their rank
        if ($current[2]) { $current[2] = $n2; }
        if ($current[3]) { $current[3] = $n3; }
        if ($current[4]) { $current[4] = 1; }

        $flatkeys[$current[0]] = ($current[1] << 16) | ($current[2] << 8) | ($current[3] << 4) | $current[4];
    }
    return @flatkeys;
}
606
607
################################################################
# dump the sort keys table
#
# Arguments: the list of compressed sort keys indexed by Unicode value
# (undef for characters without a key).
# Writes collation.c.new and hands "collation.c" to save_file().
# The generated array starts with a 256-entry index holding, for each
# high byte, the offset of its 256-entry range; high bytes without any
# key share the block of default values at offset 256.
sub DUMP_SORTKEYS
{
    my @keys = @_;

    # count the number of 256-key ranges that contain something

    my @offsets = ();
    my $ranges = 2;  # the index and the defaults come first
    for (my $i = 0; $i < 256; $i++) { $offsets[$i] = 256; }
    for (my $i = 0; $i < 65536; $i++)
    {
        next unless defined $keys[$i];
        $offsets[$i >> 8] = $ranges * 256;
        $ranges++;
        $i |= 255;  # skip to the end of this range
    }

    # output the range offsets

    open my $output, '>', "collation.c.new" or die "Cannot create collation.c.new: $!";
    printf "Building collation.c\n";
    printf $output "/* Unicode collation element table */\n";
    printf $output "/* generated from %s */\n", $SORTKEYS;
    printf $output "/* DO NOT EDIT!! */\n\n";

    printf $output "const unsigned int collation_table[%d] =\n{\n", $ranges*256;
    printf $output "    /* index */\n";
    printf $output "%s,\n", DUMP_ARRAY( "0x%08x", 0, @offsets );

    # output the default values

    printf $output "    /* defaults */\n";
    printf $output "%s", DUMP_ARRAY( "0x%08x", 0, (0xffffffff) x 256 );

    # output all the key ranges

    for (my $i = 0; $i < 256; $i++)
    {
        next if $offsets[$i] == 256;
        printf $output ",\n    /* 0x%02x00 .. 0x%02xff */\n", $i, $i;
        printf $output "%s", DUMP_ARRAY( "0x%08x", 0xffffffff, @keys[($i<<8) .. ($i<<8)+255] );
    }
    printf $output "\n};\n";
    # buffered write errors only show up when the file is closed
    close $output or die "Cannot write collation.c.new: $!";
    save_file("collation.c");
}
656
657
################################################################
# add default mappings once the file had been read
#
# Completes the global @uni2cp / @cp2uni tables in three steps, using
# @unicode_aliases and @unicode_defaults.  Existing entries are never
# overwritten.
sub ADD_DEFAULT_MAPPINGS
{
    # Step 1: within each alias set, give every char without a mapping
    # the mapping of the first char of the set that has one

    foreach $alias (@unicode_aliases)
    {
        my $target;
        foreach $src (@$alias)
        {
            next unless defined($uni2cp[$src]);
            $target = $uni2cp[$src];
            last;
        }

        # nothing to do when no char of the set is mapped
        if (defined($target))
        {
            foreach $src (@$alias)
            {
                if (!defined($uni2cp[$src])) { $uni2cp[$src] = $target; }
            }
        }
    }

    # Step 2: for every src -> target entry of the defaults table, use the
    # mapping of the target for the source when the target has one

    for ($src = 0; $src < 65536; $src++)
    {
        # only chars that are still unmapped and have a default are of interest
        if (!defined($uni2cp[$src]) && defined($unicode_defaults[$src]))
        {
            my $target = $unicode_defaults[$src];

            # follow the chain of defaults until a mapped char is found
            # or the chain ends
            until (defined($uni2cp[$target]) || !defined($unicode_defaults[$target]))
            {
                $target = $unicode_defaults[$target];
            }

            $uni2cp[$src] = $uni2cp[$target] if defined($uni2cp[$target]);
        }
    }

    # Step 3: identity mapping for the values below 256 that are unused
    # in both directions

    for ($i = 0; $i < 256; $i++)
    {
        unless (defined($cp2uni[$i]) || defined($uni2cp[$i]))
        {
            $uni2cp[$i] = $i;
            $cp2uni[$i] = $i;
        }
    }
}
709
################################################################
# dump an array of integers
#
# Arguments: a sprintf format for one element, the value to print for
# undefined elements, and the list of values.
# Returns a string with the formatted values, 8 per line, separated by
# commas and indented by 4 spaces, without a trailing comma or newline.
sub DUMP_ARRAY
{
    my ($format,$default,@array) = @_;
    # both variables must be lexical: "my $i, $ret" would declare only $i
    # and leave $ret as a package global
    my $i;
    my $ret = "    ";
    for ($i = 0; $i < $#array; $i++)
    {
        $ret .= sprintf($format, defined $array[$i] ? $array[$i] : $default);
        $ret .= (($i % 8) != 7) ? ", " : ",\n    ";
    }
    # the last element gets no separator
    $ret .= sprintf($format, defined $array[$i] ? $array[$i] : $default);
    return $ret;
}
724
################################################################
# dump an SBCS mapping table
#
# Arguments: code page number, flag telling whether a second table with
# glyph chars is wanted, descriptive name of the code page.
# Prints the C source of the tables built from the global @cp2uni and
# @uni2cp arrays to the OUTPUT file handle.
sub DUMP_SBCS_TABLE
{
    my ($codepage, $has_glyphs, $name) = @_;

    # output the ascii->unicode table, followed by the glyph variant if wanted

    my $plain = DUMP_ARRAY( "0x%04x", $DEF_CHAR, @cp2uni[0 .. 255] );
    if (!$has_glyphs)
    {
        printf OUTPUT "static const WCHAR cp2uni[256] =\n";
        printf OUTPUT "{\n%s\n};\n\n", $plain;
    }
    else
    {
        printf OUTPUT "static const WCHAR cp2uni[512] =\n";
        printf OUTPUT "{\n%s", $plain;
        my $glyphs = DUMP_ARRAY( "0x%04x", $DEF_CHAR, get_glyphs_mapping(@cp2uni[0 .. 255]) );
        printf OUTPUT ",\n    /* glyphs */\n%s\n};\n\n", $glyphs;
    }

    # find the 256-char unicode ranges that contain at least one mapping;
    # the count starts at 1 for the shared table of defaults

    my @filled = ();
    my $subtables = 1;
    my $uni = 0;
    while ($uni < 65536)
    {
        if (defined $uni2cp[$uni])
        {
            $filled[$uni >> 8] = 1;
            $subtables++;
            $uni |= 255;  # the rest of this range does not need checking
        }
        $uni++;
    }

    # output all the subtables into a single array, defaults last

    printf OUTPUT "static const unsigned char uni2cp_low[%d] =\n{\n", $subtables*256;
    foreach my $high (grep { $filled[$_] } 0 .. 255)
    {
        my $first = $high << 8;
        printf OUTPUT "    /* 0x%02x00 .. 0x%02xff */\n", $high, $high;
        printf OUTPUT "%s,\n", DUMP_ARRAY( "0x%02x", $DEF_CHAR, @uni2cp[$first .. $first+255] );
    }
    printf OUTPUT "    /* defaults */\n";
    printf OUTPUT "%s\n};\n\n", DUMP_ARRAY( "0x%02x", 0, ($DEF_CHAR) x 256 );

    # output a table of the offsets of the subtables in the previous array;
    # ranges without any mapping point to the table of defaults

    my $defaults_offset = ($subtables-1) * 256;
    my $pos = 0;
    my @offsets = ();
    foreach my $high (0 .. 255)
    {
        if (!$filled[$high])
        {
            push @offsets, $defaults_offset;
            next;
        }
        push @offsets, $pos;
        $pos += 256;
    }
    printf OUTPUT "static const unsigned short uni2cp_high[256] =\n";
    printf OUTPUT "{\n%s\n};\n\n", DUMP_ARRAY( "0x%04x", 0, @offsets );

    # output the code page descriptor

    my $glyph_table = $has_glyphs ? "    cp2uni + 256,\n" : "    cp2uni,\n";
    printf OUTPUT "const struct sbcs_table cptable_%03d =\n{\n", $codepage;
    printf OUTPUT "    { %d, 1, 0x%04x, 0x%04x, \"%s\" },\n",
                  $codepage, $DEF_CHAR, $DEF_CHAR, $name;
    printf OUTPUT "    cp2uni,\n";
    printf OUTPUT $glyph_table;
    printf OUTPUT "    uni2cp_low,\n";
    printf OUTPUT "    uni2cp_high\n};\n";
}
794
795
################################################################
# dump a DBCS mapping table
#
# Writes the C tables for a double-byte code page to the OUTPUT handle:
#   cp2uni            the single-byte table, an optional shared table of
#                     default chars for declared-but-unused lead bytes,
#                     then one 256-entry table per used lead byte
#   cp2uni_leadbytes  for each byte value, the number of the 256-entry
#                     cp2uni table to use (0 for non lead bytes)
#   uni2cp_low/high   two-level Unicode -> code page lookup
#   cptable_NNN       the descriptor referencing all of the above
#
# Reads the globals @cp2uni, @uni2cp, @lead_bytes and $DEF_CHAR.
#
# Parameters:
#   $codepage  code page number, used in the descriptor name and body
#   $name      description string stored in the descriptor
sub DUMP_DBCS_TABLE
{
    my ($codepage, $name) = @_;

    # build a list of lead bytes that are actually used

    my @lblist = ();
    foreach my $lead (@lead_bytes)
    {
        my $base = $lead << 8;
        push @lblist, $lead if grep { defined $cp2uni[$base + $_] } 0 .. 255;
    }
    my $unused = (@lead_bytes > @lblist) ? 1 : 0;

    # collect the ascii->unicode tables: single byte chars first, then
    # the default table for unused lead bytes, then each used lead byte

    my @chunks = ( sprintf( "%s", DUMP_ARRAY( "0x%04x", $DEF_CHAR, @cp2uni[0 .. 255] ) ) );
    if ($unused)
    {
        push @chunks, sprintf( "    /* unused lead bytes */\n%s",
                               DUMP_ARRAY( "0x%04x", 0, ($DEF_CHAR) x 256 ) );
    }
    foreach my $lead (@lblist)
    {
        my $base = $lead << 8;
        push @chunks, sprintf( "    /* lead byte %02x */\n%s", $lead,
                               DUMP_ARRAY( "0x%04x", $DEF_CHAR, @cp2uni[$base .. $base+255] ) );
    }

    # joining the chunks guarantees the array is closed even when no
    # lead byte turned out to be used

    printf OUTPUT "static const WCHAR cp2uni[%d] =\n", 256 * scalar(@chunks);
    printf OUTPUT "{\n%s\n};\n\n", join( ",\n", @chunks );

    # output the lead byte subtables offsets

    my @lb_offsets = (0) x 256;
    foreach my $n (0 .. $#lblist) { $lb_offsets[$lblist[$n]] = $n + 1; }
    if ($unused)
    {
        # increment all lead bytes offset to take into account the unused table
        foreach my $lead (@lead_bytes) { $lb_offsets[$lead]++; }
    }
    printf OUTPUT "static const unsigned char cp2uni_leadbytes[256] =\n";
    printf OUTPUT "{\n%s\n};\n\n", DUMP_ARRAY( "0x%02x", 0, @lb_offsets );

    # count the number of unicode->ascii subtables that contain something
    # (the count starts at 1 for the shared defaults subtable)

    my @filled = ();
    my $subtables = 1;
    for (my $i = 0; $i < 65536; $i++)
    {
        next unless defined $uni2cp[$i];
        $filled[$i >> 8] = 1;
        $subtables++;
        $i |= 255;  # skip to the end of this 256-char row
    }

    # output all the subtables into a single array

    printf OUTPUT "static const unsigned short uni2cp_low[%d] =\n{\n", $subtables*256;
    foreach my $hi (0 .. 255)
    {
        next unless $filled[$hi];
        printf OUTPUT "    /* 0x%02x00 .. 0x%02xff */\n", $hi, $hi;
        printf OUTPUT "%s,\n", DUMP_ARRAY( "0x%04x", $DEF_CHAR, @uni2cp[($hi<<8) .. ($hi<<8)+255] );
    }
    printf OUTPUT "    /* defaults */\n";
    printf OUTPUT "%s\n};\n\n", DUMP_ARRAY( "0x%04x", 0, ($DEF_CHAR) x 256 );

    # output a table of the offsets of the subtables in the previous array;
    # rows without any mapping point at the trailing defaults subtable

    my $pos = 0;
    my @high_offsets = ();
    foreach my $hi (0 .. 255)
    {
        if ($filled[$hi]) { push @high_offsets, $pos; $pos += 256; }
        else { push @high_offsets, ($subtables-1) * 256; }
    }
    printf OUTPUT "static const unsigned short uni2cp_high[256] =\n";
    printf OUTPUT "{\n%s\n};\n\n", DUMP_ARRAY( "0x%04x", 0, @high_offsets );

    # output the code page descriptor

    printf OUTPUT "const struct dbcs_table cptable_%03d =\n{\n", $codepage;
    printf OUTPUT "    { %d, 2, 0x%04x, 0x%04x, \"%s\" },\n",
                  $codepage, $DEF_CHAR, $DEF_CHAR, $name;
    printf OUTPUT "    cp2uni,\n";
    printf OUTPUT "    cp2uni_leadbytes,\n";
    printf OUTPUT "    uni2cp_low,\n";
    printf OUTPUT "    uni2cp_high,\n";
    DUMP_LB_RANGES();
    printf OUTPUT "};\n";
}
904
905
################################################################
# dump the list of defined lead byte ranges
#
# Prints one brace-enclosed C initializer line to OUTPUT holding a
# "first, last" pair for every run of consecutive values found in the
# global @lead_bytes, followed by a 0x00, 0x00 terminator pair.
sub DUMP_LB_RANGES
{
    # flag every byte value that is declared as a lead byte
    my @is_lead = ();
    $is_lead[$_] = 1 foreach @lead_bytes;

    my $ranges = "";
    my $start;   # first byte of the run being scanned, undef outside a run
    foreach my $byte (0 .. 255)
    {
        if (defined $start)
        {
            next if $is_lead[$byte];
            # the run ended on the previous byte
            $ranges .= sprintf "0x%02x, 0x%02x, ", $start, $byte - 1;
            undef $start;
        }
        elsif ($is_lead[$byte])
        {
            $start = $byte;
        }
    }

    # a run still open here extends to the last possible byte value
    $ranges .= sprintf "0x%02x, 0xff, ", $start if defined $start;

    printf OUTPUT "    { %s0x00, 0x00 }\n", $ranges;
}
929
930
################################################################
# dump the case mapping tables
#
# Writes casemap.c.new with one table each for @tolower_table,
# @toupper_table, @digitmap_table and @compatmap_table, then passes
# "casemap.c" to save_file().  Dies if the file cannot be created or
# completely written.
sub DUMP_CASE_MAPPINGS
{
    open OUTPUT, '>', "casemap.c.new" or die "Cannot create casemap.c: $!";
    printf "Building casemap.c\n";
    printf OUTPUT "/* Unicode case mappings */\n";
    printf OUTPUT "/* Automatically generated; DO NOT EDIT!! */\n\n";
    printf OUTPUT "#include \"wine/unicode.h\"\n\n";

    DUMP_CASE_TABLE( "wine_casemap_lower", @tolower_table );
    DUMP_CASE_TABLE( "wine_casemap_upper", @toupper_table );
    DUMP_CASE_TABLE( "wine_digitmap",  @digitmap_table );
    DUMP_CASE_TABLE( "wine_compatmap", @compatmap_table );

    # buffered write errors only show up here; do not let a truncated
    # file be handed to save_file()
    close OUTPUT or die "Cannot write casemap.c: $!";
    save_file("casemap.c");
}
948
949
################################################################
# dump a case mapping table
#
# Emits one WCHAR array called $name to the OUTPUT handle, laid out as
#   [0 .. 255]    index: offset of the data for each high byte
#   [256 .. 511]  defaults: 256 zero entries
#   [512 .. ]     the rows of the high bytes that contain a mapping
# Each mapping is stored as the 16-bit difference (mapped - char) & 0xffff.
# A row is dumped starting at a lower bound instead of at entry 0 when its
# leading undefined entries can be dropped (see the collapsing pass below).
#
# Parameters:
#   $name   C identifier of the generated array
#   @table  mapping indexed by character code; undefined entries have
#           no mapping (the array is copied, the caller's is untouched)
sub DUMP_CASE_TABLE
{
    my ($name,@table) = @_;
    my $i;

    # count the number of sub tables that contain something
    # also compute the low and upper populated bounds
    #   $lowerbounds[n]  low byte of the first defined entry of row n
    #   $upperbounds[n]  number of undefined entries after the last
    #                    defined entry of row n

    my @lowerbounds = ( 0, 0 );
    my @upperbounds = ( 0, 255 );
    my $index = 0;
    my @filled = ();
    for ($i = 0; $i < 65536; $i++)
    {
        next unless defined $table[$i];
        if (!defined $filled[$i >> 8])
        {
          $lowerbounds[$index] = $i & 0xff;
          $upperbounds[$index] = 0xff - $lowerbounds[$index];
          # offset the row would have without any collapsing
          $filled[$i >> 8] = $index * 256 + 512;
          $index++;
        }
        else
        {
          $upperbounds[$index-1] = 0xff - ($i & 0xff);
        }
        $table[$i] = ($table[$i] - $i) & 0xffff;
    }

    # Collapse blocks upwards if possible: a row's leading entries are
    # dropped, and its offset moved up accordingly, by at most the number
    # of trailing undefined entries recorded for the preceding slot.
    # NOTE(review): for the first row $index - 1 is -1, which Perl resolves
    # to the last element of @upperbounds; kept as is so that the generated
    # tables do not change.
    my $removed = 0;
    $index = 0;
    for ($i = 0; $i < 256; $i++)
    {
        next unless defined $filled[$i];
        if ($upperbounds[$index - 1] > $lowerbounds[$index])
        {
           $removed = $removed + $lowerbounds[$index];
        }
        else
        {
           $removed = $removed + $upperbounds[$index - 1];
           $lowerbounds[$index] = $upperbounds[$index - 1];
        }
        $filled[$i] = $filled[$i] - $removed;
        $index++;
    }

    # the index must always hold 256 entries, even when the highest rows
    # are empty and @filled is therefore shorter; empty rows point at the
    # defaults block at offset 256
    my @row_offsets = map { defined $filled[$_] ? $filled[$_] : 256 } 0 .. 255;

    # dump the table

    printf OUTPUT "const WCHAR %s[%d] =\n", $name, $index * 256 + 512 - $removed;
    printf OUTPUT "{\n    /* index */\n";
    printf OUTPUT "%s,\n", DUMP_ARRAY( "0x%04x", 256, @row_offsets );
    printf OUTPUT "    /* defaults */\n";
    printf OUTPUT "%s", DUMP_ARRAY( "0x%04x", 0, (0) x 256 );
    $index = 0;
    for ($i = 0; $i < 256; $i++)
    {
        next unless $filled[$i];
        printf OUTPUT ",\n    /* 0x%02x%02x .. 0x%02xff */\n", $i, $lowerbounds[$index], $i;
        printf OUTPUT "%s", DUMP_ARRAY( "0x%04x", 0,
                      @table[($i<<8) + $lowerbounds[$index] .. ($i<<8)+255] );
        $index++;
    }
    printf OUTPUT "\n};\n";
}
1017
1018
################################################################
# dump the ctype tables
#
# Writes wctype.c.new containing wine_wctype_table: 256 row offsets
# followed by the distinct 256-entry rows of @category_table, then passes
# "wctype.c" to save_file().  Rows with identical contents are stored
# only once.  Note that @category_table is modified in place: the
# matching @direction_table value is or'ed into bits 12 and up.
sub DUMP_CTYPE_TABLES
{
    open OUTPUT, '>', "wctype.c.new" or die "Cannot create wctype.c: $!";
    printf "Building wctype.c\n";
    printf OUTPUT "/* Unicode ctype tables */\n";
    printf OUTPUT "/* Automatically generated; DO NOT EDIT!! */\n\n";
    printf OUTPUT "#include \"wine/unicode.h\"\n\n";

    # the first 256 entries are the per-row offsets, rows get appended
    my @array = (0) x 256;

    # offset of each row already stored, keyed by its hex dump
    my %sequences = ();

    # add the direction in the high 4 bits of the category
    foreach my $i (0 .. 65535)
    {
        $category_table[$i] |= $direction_table[$i] << 12;
    }

    # try to merge table rows
    foreach my $row (0 .. 255)
    {
        my @values = @category_table[($row<<8)..($row<<8)+255];
        my $rowtxt = sprintf "%04x" x 256, @values;
        if (defined($sequences{$rowtxt}))
        {
            # reuse an existing row
            $array[$row] = $sequences{$rowtxt};
        }
        else
        {
            # create a new row at the current end of the array
            my $offset = scalar @array;
            $sequences{$rowtxt} = $offset;
            $array[$row] = $offset;
            push @array, @values;
        }
    }

    printf OUTPUT "const unsigned short wine_wctype_table[%d] =\n{\n", scalar @array;
    printf OUTPUT "    /* offsets */\n%s,\n", DUMP_ARRAY( "0x%04x", 0, @array[0..255] );
    printf OUTPUT "    /* values */\n%s\n};\n", DUMP_ARRAY( "0x%04x", 0, @array[256..$#array] );

    # buffered write errors only show up here; do not let a truncated
    # file be handed to save_file()
    close OUTPUT or die "Cannot write wctype.c: $!";
    save_file("wctype.c");
}
1062
1063
################################################################
# dump the char composition tables
#
# Writes compose.c.new and passes "compose.c" to save_file().  The file
# contains:
#   unicode_compose_table       built from @compose_table, whose entries
#                               are used as [ first, second, composed ]:
#                               a list of (second char, offset) pairs with
#                               a terminating pair, then for each second
#                               char its (first char, composed char) pairs
#                               sorted by first char
#   unicode_compose_table_size  the number of distinct second chars
#   unicode_decompose_table     built from @decomp_table: a 256-entry
#                               index, 16-entry sub-indexes, then 16-char
#                               subsets holding two values per character
sub DUMP_COMPOSE_TABLES
{
    open OUTPUT, '>', "compose.c.new" or die "Cannot create compose.c: $!";
    printf "Building compose.c\n";
    printf OUTPUT "/* Unicode char composition */\n";
    printf OUTPUT "/* Automatically generated; DO NOT EDIT!! */\n\n";
    printf OUTPUT "#include \"wine/unicode.h\"\n\n";

    ######### composition table

    # group the (first char, composed char) pairs by second char

    my @by_second = ();
    foreach my $entry (@compose_table)
    {
        my @comp = @$entry;
        push @{$by_second[$comp[1]]}, [ $comp[0], $comp[2] ];
    }

    # count how many different second chars we have

    my @seconds = grep { defined $by_second[$_] } 0 .. 65535;
    my $count = scalar @seconds;

    # build the table of second chars and offsets; offsets are counted
    # in pairs, the first mapping follows the $count + 1 header pairs

    my $pos = $count + 1;
    my @header = ();
    foreach my $second (@seconds)
    {
        push @header, $second, $pos;
        $pos += @{$by_second[$second]};
    }
    # terminator with last position
    push @header, 0, $pos;
    printf OUTPUT "const WCHAR unicode_compose_table[0x%x] =\n{\n", 2*$pos;
    printf OUTPUT "    /* second chars + offsets */\n%s", DUMP_ARRAY( "0x%04x", 0, @header );

    # build the table of first chars and mappings

    foreach my $second (@seconds)
    {
        my @pairs = map { ( $_->[0], $_->[1] ) }
                    sort { $a->[0] <=> $b->[0] } @{$by_second[$second]};
        printf OUTPUT ",\n    /* 0x%04x */\n%s", $second, DUMP_ARRAY( "0x%04x", 0, @pairs );
    }
    printf OUTPUT "\n};\n\nconst unsigned int unicode_compose_table_size = %d;\n\n", $count;

    ######### decomposition table

    # first determine all the 16-char subsets that contain something;
    # positions are relative to the null subset for now, 0 means "empty"

    my @subset_pos = (0) x 4096;
    my $data_pos = 16*2;  # for the null subset
    for (my $i = 0; $i < 65536; $i++)
    {
        next unless defined $decomp_table[$i];
        $subset_pos[$i >> 4] = $data_pos;
        $data_pos += 16*2;
        $i |= 15;  # skip to the end of this subset
    }
    my $total = $data_pos;

    # now count the 256-char subsets that contain something; 256 is the
    # position of the null sub-index that follows the main index

    my @filled_idx = (256) x 256;
    my $index_pos = 256 + 16;
    for (my $i = 0; $i < 4096; $i++)
    {
        next unless $subset_pos[$i];
        $filled_idx[$i >> 4] = $index_pos;
        $index_pos += 16;
        $i |= 15;  # skip to the end of this sub-index
    }
    my $null_offset = $index_pos;  # null mapping
    $total += $index_pos;

    # add the index offsets to the subsets positions

    foreach my $subset (0 .. 4095)
    {
        next unless $subset_pos[$subset];
        $subset_pos[$subset] += $null_offset;
    }

    # dump the main index

    printf OUTPUT "const WCHAR unicode_decompose_table[%d] =\n", $total;
    printf OUTPUT "{\n    /* index */\n";
    printf OUTPUT "%s", DUMP_ARRAY( "0x%04x", 0, @filled_idx );
    printf OUTPUT ",\n    /* null sub-index */\n%s", DUMP_ARRAY( "0x%04x", 0, ($null_offset) x 16 );

    # dump the second-level indexes

    foreach my $hi (0 .. 255)
    {
        next unless ($filled_idx[$hi] > 256);
        # empty subsets point at the null mapping
        my @sub_index = map { $_ || $null_offset } @subset_pos[($hi<<4)..($hi<<4)+15];
        printf OUTPUT ",\n    /* sub-index %02x */\n", $hi;
        printf OUTPUT "%s", DUMP_ARRAY( "0x%04x", 0, @sub_index );
    }

    # dump the 16-char subsets

    printf OUTPUT ",\n    /* null mapping */\n";
    printf OUTPUT "%s", DUMP_ARRAY( "0x%04x", 0, (0) x 32 );

    foreach my $subset (0 .. 4095)
    {
        next unless $subset_pos[$subset];
        my @values = (0) x 32;
        foreach my $j (0 .. 15)
        {
            my $decomp = $decomp_table[($subset << 4) + $j];
            next unless defined $decomp;
            $values[2 * $j] = $decomp->[0];
            $values[2 * $j + 1] = $decomp->[1];
        }
        printf OUTPUT ",\n    /* 0x%03x0 .. 0x%03xf */\n", $subset, $subset;
        printf OUTPUT "%s", DUMP_ARRAY( "0x%04x", 0, @values );
    }

    printf OUTPUT "\n};\n";

    # buffered write errors only show up here; do not let a truncated
    # file be handed to save_file()
    close OUTPUT or die "Cannot write compose.c: $!";
    save_file("compose.c");
}
1200
1201
################################################################
# read an input file and generate the corresponding .c file
#
# Resets the global @cp2uni, @lead_bytes and @uni2cp tables, fills them
# for the requested code page, writes c_NNN.c.new and passes "c_NNN.c"
# to save_file().  A DBCS table is dumped when the reader recorded any
# lead byte, an SBCS table otherwise.
#
# Parameters:
#   $codepage    code page number
#   $filename    mapping file, relative to $MAPPREFIX; a false value
#                makes the messages say the data is hardcoded
#   $has_glyphs  passed through to DUMP_SBCS_TABLE
#   $comment     description of the code page
sub HANDLE_FILE
{
    my ($codepage,$filename,$has_glyphs,$comment) = @_;

    @cp2uni = ();
    @lead_bytes = ();
    @uni2cp = ();

    # code page 20932 has its own reader and 20127 is filled in without
    # reading a file; everything else uses the generic reader
    if ($codepage == 20932) { READ_JIS0208_FILE($MAPPREFIX . $filename); }
    elsif ($codepage == 20127) { fill_20127_codepage(); }
    else { READ_FILE($MAPPREFIX . $filename); }

    # hack: 0x00a5 must map to backslash in Shift-JIS
    if ($codepage == 932) { $uni2cp[0x00a5] = 0x5c; }

    ADD_DEFAULT_MAPPINGS();

    my $output = sprintf "c_%03d.c", $codepage;
    open OUTPUT, '>', "$output.new" or die "Cannot create $output: $!";

    printf "Building %s from %s (%s)\n", $output, $filename || "hardcoded data", $comment;

    # dump all tables

    printf OUTPUT "/* code page %03d (%s) */\n", $codepage, $comment;
    if ($filename)
    {
        printf OUTPUT "/* generated from %s */\n", $MAPPREFIX . $filename;
        printf OUTPUT "/* DO NOT EDIT!! */\n\n";
    }
    else
    {
        printf OUTPUT "/* Automatically generated; DO NOT EDIT!! */\n\n";
    }
    printf OUTPUT "#include \"wine/unicode.h\"\n\n";

    if ($#lead_bytes == -1) { DUMP_SBCS_TABLE( $codepage, $has_glyphs, $comment ); }
    else { DUMP_DBCS_TABLE( $codepage, $comment ); }

    # buffered write errors only show up here; do not let a truncated
    # file be handed to save_file()
    close OUTPUT or die "Cannot write $output: $!";
    save_file($output);
}
1246
1247
################################################################
# save a file if modified
#
# Installs "$file.new" as $file.  When $file already exists as a plain
# file with the same contents, $file is left untouched and "$file.new"
# is removed instead.
#
# Parameters:
#   $file  name of the final file; the candidate is "$file.new"
#
# Dies if the new file cannot be renamed into place.
sub save_file($)
{
    my $file = shift;
    my $new = "$file.new";

    # compare in-process instead of running "cmp" through the shell, so
    # that no external tool or /dev/null is needed and file names never
    # get interpreted by a shell
    require File::Compare;

    # compare() returns 0 only when both files could be read and match
    if (-f $file && File::Compare::compare($file, $new) == 0)
    {
        unlink $new or warn "Cannot remove $new: $!\n";
    }
    else
    {
        rename $new, $file or die "Cannot rename $new to $file: $!\n";
    }
}
1262
1263
################################################################
# output the list of codepage tables into the cptable.c file
#
# Builds, from the code page number that starts each @allfiles entry,
# one extern declaration per table followed by the cptables[] array
# listing their addresses, and passes the text to REPLACE_IN_FILE()
# for "cptable.c".
sub OUTPUT_CPTABLE
{
    # only the code page number of each entry is needed here
    my @codepages = map { $_->[0] } @allfiles;

    my @tables_decl = map { sprintf("extern union cptable cptable_%03d;\n", $_) } @codepages;

    push @tables_decl, sprintf("\nstatic const union cptable * const cptables[%d] =\n{\n", scalar @codepages);
    push @tables_decl, map { sprintf("    &cptable_%03d,\n", $_) } @codepages;
    push @tables_decl, "};";
    REPLACE_IN_FILE( "cptable.c", @tables_decl );
}
1285
################################################################
# replace the contents of a file between ### cpmap ### marks
#
# Copies $name to "$name.new", keeping everything up to and including
# the line containing "### cpmap begin ###", inserting @data, an empty
# line and the line containing "### cpmap end ###", then the rest of the
# file.  The result is passed to save_file().
#
# Parameters:
#   $name  file to update
#   @data  lines to place between the two marks
#
# Dies if the file cannot be read or written, or if either mark is
# missing (continuing would append the data at the end of the file or
# drop everything after the begin mark).

sub REPLACE_IN_FILE
{
    my $name = shift;
    my @data = @_;
    my @lines = ();
    my $found;

    open(my $in, '<', $name) or die "Can't open $name: $!";

    # keep everything up to and including the begin mark
    $found = 0;
    while (my $line = <$in>)
    {
        push @lines, $line;
        if ($line =~ /\#\#\# cpmap begin \#\#\#/) { $found = 1; last; }
    }
    die "Can't find the cpmap begin mark in $name" unless $found;

    push @lines, @data;

    # skip the old contents up to the end mark
    $found = 0;
    while (my $line = <$in>)
    {
        if ($line =~ /\#\#\# cpmap end \#\#\#/) { push @lines, "\n", $line; $found = 1; last; }
    }
    die "Can't find the cpmap end mark in $name" unless $found;

    # keep the remainder of the file unchanged
    push @lines, <$in>;
    close($in);

    open(my $out, '>', "$name.new") or die "Can't modify $name: $!";
    print {$out} @lines;
    close($out) or die "Can't write $name.new: $!";
    save_file($name);
}
1311