1#!perl
2#
3# This auxiliary script makes five header files
4# used for building XSUB of Unicode::Normalize.
5#
6# Usage:
7#    <do 'mkheader'> in perl, or <perl mkheader> in command line
8#
9# Input files:
10#    unicore/CombiningClass.pl (or unicode/CombiningClass.pl)
11#    unicore/Decomposition.pl (or unicode/Decomposition.pl)
12#
13# Output files:
14#    unfcan.h
15#    unfcpt.h
16#    unfcmb.h
17#    unfcmp.h
18#    unfexc.h
19#
20use 5.006;
21use strict;
22use warnings;
23use Carp;
24use File::Spec;
25use SelectSaver;
26
27BEGIN {
28    unless ('A' eq pack('U', 0x41)) {
29	die "Unicode::Normalize cannot stringify a Unicode code point\n";
30    }
31    unless (0x41 == unpack('U', 'A')) {
32	die "Unicode::Normalize cannot get Unicode code point\n";
33    }
34}
35
36our $PACKAGE = 'Unicode::Normalize, mkheader';
37
38our $prefix = "UNF_";
39our $structname = "${prefix}complist";
40
41# Starting in v5.20, the tables in lib/unicore are built using the platform's
42# native character set for code points 0-255.
43*pack_U = ($] ge 5.020)
44          ? sub { return pack('W*', @_).pack('U*'); } # The empty pack returns
45                                                      # an empty UTF-8 string,
46                                                      # so the effect is to
47                                                      # force the return into
48                                                      # being UTF-8.
49          : sub { return pack('U*', @_); };
50
51# %Canon and %Compat will be ($codepoint => $hexstring) after _U_stringify()
52our %Comp1st;	# $codepoint => $listname  : may be composed with a next char.
53our %CompList;	# $listname,$2nd  => $codepoint : composite
54
55##### The below part is common to mkheader and PP #####
56
57our %Combin;	# $codepoint => $number    : combination class
58our %Canon;	# $codepoint => \@codepoints : canonical decomp.
59our %Compat;	# $codepoint => \@codepoints : compat. decomp.
60our %Compos;	# $1st,$2nd  => $codepoint : composite
61our %Exclus;	# $codepoint => 1          : composition exclusions
62our %Single;	# $codepoint => 1          : singletons
63our %NonStD;	# $codepoint => 1          : non-starter decompositions
64our %Comp2nd;	# $codepoint => 1          : may be composed with a prev char.
65
66# from core Unicode database
67our $Combin = do "unicore/CombiningClass.pl"
68    || do "unicode/CombiningClass.pl"
69    || croak "$PACKAGE: CombiningClass.pl not found";
70our $Decomp = do "unicore/Decomposition.pl"
71    || do "unicode/Decomposition.pl"
72    || croak "$PACKAGE: Decomposition.pl not found";
73
74# CompositionExclusions.txt since Unicode 3.2.0.  If this ever changes, it
75# would be better to get the values from Unicode::UCD rather than hard-code
76# them here, as that will protect from having to make fixes for future
77# changes.
78our @CompEx = qw(
79    0958 0959 095A 095B 095C 095D 095E 095F 09DC 09DD 09DF 0A33 0A36
80    0A59 0A5A 0A5B 0A5E 0B5C 0B5D 0F43 0F4D 0F52 0F57 0F5C 0F69 0F76
81    0F78 0F93 0F9D 0FA2 0FA7 0FAC 0FB9 FB1D FB1F FB2A FB2B FB2C FB2D
82    FB2E FB2F FB30 FB31 FB32 FB33 FB34 FB35 FB36 FB38 FB39 FB3A FB3B
83    FB3C FB3E FB40 FB41 FB43 FB44 FB46 FB47 FB48 FB49 FB4A FB4B FB4C
84    FB4D FB4E 2ADC 1D15E 1D15F 1D160 1D161 1D162 1D163 1D164 1D1BB
85    1D1BC 1D1BD 1D1BE 1D1BF 1D1C0
86);
87
88# definition of Hangul constants
89use constant SBase  => 0xAC00;
90use constant SFinal => 0xD7A3; # SBase -1 + SCount
91use constant SCount =>  11172; # LCount * NCount
92use constant NCount =>    588; # VCount * TCount
93use constant LBase  => 0x1100;
94use constant LFinal => 0x1112;
95use constant LCount =>     19;
96use constant VBase  => 0x1161;
97use constant VFinal => 0x1175;
98use constant VCount =>     21;
99use constant TBase  => 0x11A7;
100use constant TFinal => 0x11C2;
101use constant TCount =>     28;
102
103sub decomposeHangul {
104    my $sindex = $_[0] - SBase;
105    my $lindex = int( $sindex / NCount);
106    my $vindex = int(($sindex % NCount) / TCount);
107    my $tindex =      $sindex % TCount;
108    my @ret = (
109       LBase + $lindex,
110       VBase + $vindex,
111      $tindex ? (TBase + $tindex) : (),
112    );
113    return wantarray ? @ret : pack_U(@ret);
114}
115
116########## getting full decomposition ##########
117
118## converts string "hhhh hhhh hhhh" to a numeric list
119## (hex digits separated by spaces)
120sub _getHexArray { map hex, $_[0] =~ /\G *([0-9A-Fa-f]+)/g }
121
122while ($Combin =~ /(.+)/g) {
123    my @tab = split /\t/, $1;
124    my $ini = hex $tab[0];
125    if ($tab[1] eq '') {
126	$Combin{$ini} = $tab[2];
127    } else {
128	$Combin{$_} = $tab[2] foreach $ini .. hex($tab[1]);
129    }
130}
131
132while ($Decomp =~ /(.+)/g) {
133    my @tab = split /\t/, $1;
134    my $compat = $tab[2] =~ s/<[^>]+>//;
135    my $dec = [ _getHexArray($tab[2]) ]; # decomposition
136    my $ini = hex($tab[0]); # initial decomposable character
137    my $end = $tab[1] eq '' ? $ini : hex($tab[1]);
138    # ($ini .. $end) is the range of decomposable characters.
139
140    foreach my $u ($ini .. $end) {
141	$Compat{$u} = $dec;
142	$Canon{$u} = $dec if ! $compat;
143    }
144}
145
146for my $s (@CompEx) {
147    my $u = hex $s;
148    next if !$Canon{$u}; # not assigned
149    next if $u == 0xFB1D && !$Canon{0x1D15E}; # 3.0.1 before Corrigendum #2
150    $Exclus{$u} = 1;
151}
152
153foreach my $u (keys %Canon) {
154    my $dec = $Canon{$u};
155
156    if (@$dec == 2) {
157	if ($Combin{ $dec->[0] }) {
158	    $NonStD{$u} = 1;
159	} else {
160	    $Compos{ $dec->[0] }{ $dec->[1] } = $u;
161	    $Comp2nd{ $dec->[1] } = 1 if ! $Exclus{$u};
162	}
163    } elsif (@$dec == 1) {
164	$Single{$u} = 1;
165    } else {
166	my $h = sprintf '%04X', $u;
167	croak("Weird Canonical Decomposition of U+$h");
168    }
169}
170
171# modern HANGUL JUNGSEONG and HANGUL JONGSEONG jamo
172foreach my $j (0x1161..0x1175, 0x11A8..0x11C2) {
173    $Comp2nd{$j} = 1;
174}
175
176sub getCanonList {
177    my @src = @_;
178    my @dec = map {
179	(SBase <= $_ && $_ <= SFinal) ? decomposeHangul($_)
180	    : $Canon{$_} ? @{ $Canon{$_} } : $_
181		} @src;
182    return join(" ",@src) eq join(" ",@dec) ? @dec : getCanonList(@dec);
183    # condition @src == @dec is not ok.
184}
185
186sub getCompatList {
187    my @src = @_;
188    my @dec = map {
189	(SBase <= $_ && $_ <= SFinal) ? decomposeHangul($_)
190	    : $Compat{$_} ? @{ $Compat{$_} } : $_
191		} @src;
192    return join(" ",@src) eq join(" ",@dec) ? @dec : getCompatList(@dec);
193    # condition @src == @dec is not ok.
194}
195
196# exhaustive decomposition
197foreach my $key (keys %Canon) {
198    $Canon{$key}  = [ getCanonList($key) ];
199}
200
201# exhaustive decomposition
202foreach my $key (keys %Compat) {
203    $Compat{$key} = [ getCompatList($key) ];
204}
205
206##### The above part is common to mkheader and PP #####
207
208foreach my $comp1st (keys %Compos) {
209    my $listname = sprintf("${structname}_%06x", $comp1st);
210		# %04x is bad since it'd place _3046 after _1d157.
211    $Comp1st{$comp1st} = $listname;
212    my $rh1st = $Compos{$comp1st};
213
214    foreach my $comp2nd (keys %$rh1st) {
215	my $uc = $rh1st->{$comp2nd};
216	$CompList{$listname}{$comp2nd} = $uc;
217    }
218}
219
220sub split_into_char {
221    use bytes;
222    my $uni = shift;
223    my $len = length($uni);
224    my @ary;
225    for(my $i = 0; $i < $len; ++$i) {
226	push @ary, ord(substr($uni,$i,1));
227    }
228    return @ary;
229}
230
231sub _U_stringify {
232    sprintf '"%s"', join '',
233	map sprintf("\\x%02x", $_), split_into_char(pack_U(@_));
234}
235
236foreach my $hash (\%Canon, \%Compat) {
237    foreach my $key (keys %$hash) {
238	$hash->{$key} = _U_stringify( @{ $hash->{$key} } );
239    }
240}
241
242########## writing header files ##########
243
244my @boolfunc = (
245    {
246	name => "Exclusion",
247	type => "bool",
248	hash => \%Exclus,
249    },
250    {
251	name => "Singleton",
252	type => "bool",
253	hash => \%Single,
254    },
255    {
256	name => "NonStDecomp",
257	type => "bool",
258	hash => \%NonStD,
259    },
260    {
261	name => "Comp2nd",
262	type => "bool",
263	hash => \%Comp2nd,
264    },
265);
266
267my $orig_fh = SelectSaver->new;
268{
269
270my $file = "unfexc.h";
271open FH, ">$file" or croak "$PACKAGE: $file can't be made";
272binmode FH; select FH;
273
274    print << 'EOF';
275/*
276 * This file is auto-generated by mkheader.
277 * Any changes here will be lost!
278 */
279EOF
280
281foreach my $tbl (@boolfunc) {
282    my @temp = sort {$a <=> $b} keys %{$tbl->{hash}};
283    my $type = $tbl->{type};
284    my $name = $tbl->{name};
285    print "$type is$name (UV uv)\n{\nreturn\n\t";
286
287    while (@temp) {
288	my $cur = shift @temp;
289	if (@temp && $cur + 1 == $temp[0]) {
290	    print "($cur <= uv && uv <= ";
291	    while (@temp && $cur + 1 == $temp[0]) {
292		$cur = shift @temp;
293	    }
294	    print "$cur)";
295	    print "\n\t|| " if @temp;
296	} else {
297	    print "uv == $cur";
298	    print "\n\t|| " if @temp;
299	}
300    }
301    print "\n\t? TRUE : FALSE;\n}\n\n";
302}
303
304close FH;
305
306####################################
307
308my $compinit =
309    "typedef struct { UV nextchar; UV composite; } $structname;\n\n";
310
311foreach my $i (sort keys %CompList) {
312    $compinit .= "$structname $i [] = {\n";
313    $compinit .= join ",\n",
314	map sprintf("\t{ %d, %d }", $_, $CompList{$i}{$_}),
315	    sort {$a <=> $b } keys %{ $CompList{$i} };
316    $compinit .= ",\n{0,0}\n};\n\n"; # with sentinel
317}
318
319my @tripletable = (
320    {
321	file => "unfcmb",
322	name => "combin",
323	type => "STDCHAR",
324	hash => \%Combin,
325	null =>  0,
326    },
327    {
328	file => "unfcan",
329	name => "canon",
330	type => "char*",
331	hash => \%Canon,
332	null => "NULL",
333    },
334    {
335	file => "unfcpt",
336	name => "compat",
337	type => "char*",
338	hash => \%Compat,
339	null => "NULL",
340    },
341    {
342	file => "unfcmp",
343	name => "compos",
344	type => "$structname *",
345	hash => \%Comp1st,
346	null => "NULL",
347	init => $compinit,
348    },
349);
350
351foreach my $tbl (@tripletable) {
352    my $file = "$tbl->{file}.h";
353    my $head = "${prefix}$tbl->{name}";
354    my $type = $tbl->{type};
355    my $hash = $tbl->{hash};
356    my $null = $tbl->{null};
357    my $init = $tbl->{init};
358
359    open FH, ">$file" or croak "$PACKAGE: $file can't be made";
360    binmode FH; select FH;
361    my %val;
362
363    print FH << 'EOF';
364/*
365 * This file is auto-generated by mkheader.
366 * Any changes here will be lost!
367 */
368EOF
369
370    print $init if defined $init;
371
372    foreach my $uv (keys %$hash) {
373	croak sprintf("a Unicode code point 0x%04X over 0x10FFFF.", $uv)
374	    unless $uv <= 0x10FFFF;
375	my @c = unpack 'CCCC', pack 'N', $uv;
376	$val{ $c[1] }{ $c[2] }{ $c[3] } = $hash->{$uv};
377    }
378
379    foreach my $p (sort { $a <=> $b } keys %val) {
380	next if ! $val{ $p };
381	for (my $r = 0; $r < 256; $r++) {
382	    next if ! $val{ $p }{ $r };
383	    printf "static $type ${head}_%02x_%02x [256] = {\n", $p, $r;
384	    for (my $c = 0; $c < 256; $c++) {
385		print "\t", defined $val{$p}{$r}{$c}
386		    ? "($type)".$val{$p}{$r}{$c}
387		    : $null;
388		print ','  if $c != 255;
389		print "\n" if $c % 8 == 7;
390	    }
391	    print "};\n\n";
392	}
393    }
394    foreach my $p (sort { $a <=> $b } keys %val) {
395	next if ! $val{ $p };
396	printf "static $type* ${head}_%02x [256] = {\n", $p;
397	for (my $r = 0; $r < 256; $r++) {
398	    print $val{ $p }{ $r }
399		? sprintf("${head}_%02x_%02x", $p, $r)
400		: "NULL";
401	    print ','  if $r != 255;
402	    print "\n" if $val{ $p }{ $r } || ($r+1) % 8 == 0;
403	}
404	print "};\n\n";
405    }
406    print "static $type** $head [] = {\n";
407    for (my $p = 0; $p <= 0x10; $p++) {
408	print $val{ $p } ? sprintf("${head}_%02x", $p) : "NULL";
409	print ','  if $p != 0x10;
410	print "\n";
411    }
412    print "};\n\n";
413    close FH;
414}
415
416}   # End of block for SelectSaver
417
4181;
419__END__
420