1#!/usr/bin/env perl
2use strict;
3
# Tables collected from the input.  They are split by the width of the key
# code point so that the generated C arrays can use narrow integer types:
#   *8  - hash keyed by code points <= 0xff
#   *16 - parallel key/value arrays for code points <= 0xffff
#   *32 - parallel key/value arrays for everything above that
my (%titlecase8, %uni8_decomp);
my (@titlecase16_keys, @titlecase16_values);
my (@titlecase32_keys, @titlecase32_values);
my (@uni16_decomp_keys, @uni16_decomp_values);
my (@uni32_decomp_keys, @uni32_decomp_values);
# Multi-character decompositions: $multidecomp_offsets[i] is the index in
# @multidecomp_values at which the 0-terminated expansion of
# $multidecomp_keys[i] starts.
my (@multidecomp_keys, @multidecomp_offsets, @multidecomp_values);

# Convert one hexadecimal code point field into a number.
# Dies if the field is missing or contains anything but hex digits, so that
# malformed input is reported instead of being silently turned into 0.
# (hex() is used instead of a string eval so that input text is never
# executed as Perl code.)
sub _parse_hex {
  my ($field) = @_;
  $field = "" if (!defined($field));
  die "Error: Invalid hexadecimal code point '$field'"
    if ($field !~ /\A[0-9A-Fa-f]+\z/);
  return hex($field);
}

# Read the semicolon-separated records from STDIN or the files named on the
# command line.  Field 0 is the character's code point, field 5 its
# decomposition and field 14 its titlecase mapping.
while (my $line = <>) {
  # Strip the line terminator; also tolerates CRLF input, which would
  # otherwise leave a "\r" in the last field.
  $line =~ s/[\r\n]+\z//;
  next if ($line =~ /\A\s*\z/);

  # split() drops trailing empty fields, so $decomp and $titlecode may be undef
  my @arr = split(/;/, $line);
  my $code = _parse_hex($arr[0]);
  my $decomp = $arr[5];
  my $titlecode = $arr[14];

  if (defined($titlecode) && $titlecode ne "") {
    # titlecase mapping
    my $value = _parse_hex($titlecode);
    if ($value == $code) {
      # the same character, ignore
      # NOTE(review): the decomposition of such a character is skipped as
      # well, because the elsif branch below is never reached for it -
      # confirm that this is intended.
    } elsif ($code <= 0xff) {
      die "Error: We've assumed 8bit keys have max. 16bit values" if ($value > 0xffff);
      $titlecase8{$code} = $value;
    } elsif ($code <= 0xffff) {
      die "Error: We've assumed 16bit keys have max. 16bit values" if ($value > 0xffff);
      push @titlecase16_keys, $code;
      push @titlecase16_values, $value;
    } else {
      push @titlecase32_keys, $code;
      push @titlecase32_values, $value;
    }
  } elsif (defined($decomp) && $decomp =~ /\A(?:<[^>]*> )?(.+)\z/) {
    # decompositions; an optional leading "<tag> " is dropped
    my $decomp_codes = $1;
    if ($decomp_codes =~ /\A[0-9A-Fa-f]+\z/) {
      # unicharacter decomposition. use separate lists for this
      my $value = hex($decomp_codes);
      if ($value > 0xffffffff) {
        print STDERR "Error: We've assumed decomposition codes are max. 32bit\n";
        exit 1;
      }
      if ($code <= 0xff) {
        # the 8bit decomposition map is emitted as a uint16_t array
        die "Error: We've assumed 8bit keys have max. 16bit decomposition values" if ($value > 0xffff);
        $uni8_decomp{$code} = $value;
      } elsif ($code <= 0xffff) {
        push @uni16_decomp_keys, $code;
        push @uni16_decomp_values, $value;
      } else {
        push @uni32_decomp_keys, $code;
        push @uni32_decomp_values, $value;
      }
    } else {
      # multicharacter decomposition.
      if ($code > 0xffffffff) {
        print STDERR "Error: We've assumed multi-decomposition key codes are max. 32bit\n";
        exit 1;
      }

      # the offsets are emitted as a uint16_t array
      my $offset = scalar(@multidecomp_values);
      die "Error: We've assumed multi-decomposition offsets are max. 16bit" if ($offset > 0xffff);

      push @multidecomp_keys, $code;
      push @multidecomp_offsets, $offset;

      foreach my $dcode (split(" ", $decomp_codes)) {
        my $value = _parse_hex($dcode);
        if ($value > 0xffffffff) {
          print STDERR "Error: We've assumed decomposition codes are max. 32bit\n";
          exit 1;
        }
        push @multidecomp_values, $value;
      }
      # terminate this character's expansion
      push @multidecomp_values, 0;
    }
  }
}
74
# Print the numbers in the referenced array as a comma-separated list of
# hexadecimal literals ("0x" followed by at least five digits) on the
# currently selected output handle.  Eight entries go on each line;
# continuation lines start with a tab.  Nothing follows the final entry.
#
#   $_[0] - reference to the array of numbers to print
sub print_list {
  my ($values) = @_;

  my $final_index = $#{$values};
  for my $index (0 .. $final_index) {
    printf("0x%05x", $values->[$index]);
    next if ($index == $final_index);

    # wrap after every eighth entry, otherwise separate with a space
    my $separator = (($index + 1) % 8 == 0) ? "\n\t" : " ";
    print "," . $separator;
  }
  return;
}
93
# Emit the banner comment that opens the generated C source.  The text is
# written verbatim (single-quoted heredoc, nothing is interpolated).
print <<'END_OF_BANNER';
/* This file is automatically generated by unicodemap.pl from UnicodeData.txt

   NOTE: decompositions for characters having titlecase characters
   are not included, because we first translate everything to titlecase */
END_OF_BANNER
98
# Print a complete 256-entry table for the code points 0x00..0xff.
# A code point with a defined entry in the referenced hash is printed as its
# mapped value; every other code point is printed as itself.
#
#   $_[0] - reference to a hash mapping code point => value
sub print_map8 {
  my ($mapping) = @_;

  my @table = map {
    defined($mapping->{$_}) ? $mapping->{$_} : $_
  } (0 .. 0xff);

  print_list(\@table);
}
111
# Every generated C array, in output order.  Each entry holds the C element
# type and declarator, the routine that prints the initializer values, and
# a reference to the data handed to that routine.
my @generated_tables = (
  ['uint16_t titlecase8_map[256]',   \&print_map8, \%titlecase8],
  ['uint16_t titlecase16_keys[]',    \&print_list, \@titlecase16_keys],
  ['uint16_t titlecase16_values[]',  \&print_list, \@titlecase16_values],
  ['uint32_t titlecase32_keys[]',    \&print_list, \@titlecase32_keys],
  ['uint32_t titlecase32_values[]',  \&print_list, \@titlecase32_values],
  ['uint16_t uni8_decomp_map[256]',  \&print_map8, \%uni8_decomp],
  ['uint16_t uni16_decomp_keys[]',   \&print_list, \@uni16_decomp_keys],
  ['uint32_t uni16_decomp_values[]', \&print_list, \@uni16_decomp_values],
  ['uint32_t uni32_decomp_keys[]',   \&print_list, \@uni32_decomp_keys],
  ['uint32_t uni32_decomp_values[]', \&print_list, \@uni32_decomp_values],
  ['uint32_t multidecomp_keys[]',    \&print_list, \@multidecomp_keys],
  ['uint16_t multidecomp_offsets[]', \&print_list, \@multidecomp_offsets],
  ['uint32_t multidecomp_values[]',  \&print_list, \@multidecomp_values],
);

# Write each array as "static const <declarator> = { <values> };"
foreach my $table (@generated_tables) {
  my ($declarator, $emit_values, $data) = @{$table};

  print "static const " . $declarator . " = {\n\t";
  $emit_values->($data);
  print "\n};\n";
}
163