1#!perl 2# 3# This auxiliary script makes five header files 4# used for building XSUB of Unicode::Normalize. 5# 6# Usage: 7# <do 'mkheader'> in perl, or <perl mkheader> in command line 8# 9# Input files: 10# unicore/CombiningClass.pl (or unicode/CombiningClass.pl) 11# unicore/Decomposition.pl (or unicode/Decomposition.pl) 12# 13# Output files: 14# unfcan.h 15# unfcpt.h 16# unfcmb.h 17# unfcmp.h 18# unfexc.h 19# 20use 5.006; 21use strict; 22use warnings; 23use Carp; 24use File::Spec; 25use SelectSaver; 26 27our $PACKAGE = 'Unicode::Normalize, mkheader'; 28 29our $prefix = "UNF_"; 30our $structname = "${prefix}complist"; 31 32# Starting in v5.20, the tables in lib/unicore are built using the platform's 33# native character set for code points 0-255. But in v5.35, pack U stopped 34# trying to compensate 35*pack_U = ($] ge 5.020 && $] lt 5.035) 36 ? sub { return pack('U*', map { utf8::unicode_to_native($_) } @_); } 37 : sub { return pack('U*', @_); }; 38 39# %Canon and %Compat will be ($codepoint => $hexstring) after _U_stringify() 40our %Comp1st; # $codepoint => $listname : may be composed with a next char. 41our %CompList; # $listname,$2nd => $codepoint : composite 42 43##### The below part is common to mkheader and PP ##### 44 45our %Combin; # $codepoint => $number : combination class 46our %Canon; # $codepoint => \@codepoints : canonical decomp. 47our %Compat; # $codepoint => \@codepoints : compat. decomp. 48our %Compos; # $1st,$2nd => $codepoint : composite 49our %Exclus; # $codepoint => 1 : composition exclusions 50our %Single; # $codepoint => 1 : singletons 51our %NonStD; # $codepoint => 1 : non-starter decompositions 52our %Comp2nd; # $codepoint => 1 : may be composed with a prev char. 53 54# from core Unicode database 55our $Combin = do "unicore/CombiningClass.pl" 56 || do "unicode/CombiningClass.pl" 57 || croak "$PACKAGE: CombiningClass.pl not found"; 58our $Decomp = do "unicore/Decomposition.pl" 59 || do "unicode/Decomposition.pl" 60 || croak "$PACKAGE: Decomposition.pl not found"; 61 62# CompositionExclusions.txt since Unicode 3.2.0. If this ever changes, it 63# would be better to get the values from Unicode::UCD rather than hard-code 64# them here, as that will protect from having to make fixes for future 65# changes. 66our @CompEx = qw( 67 0958 0959 095A 095B 095C 095D 095E 095F 09DC 09DD 09DF 0A33 0A36 68 0A59 0A5A 0A5B 0A5E 0B5C 0B5D 0F43 0F4D 0F52 0F57 0F5C 0F69 0F76 69 0F78 0F93 0F9D 0FA2 0FA7 0FAC 0FB9 FB1D FB1F FB2A FB2B FB2C FB2D 70 FB2E FB2F FB30 FB31 FB32 FB33 FB34 FB35 FB36 FB38 FB39 FB3A FB3B 71 FB3C FB3E FB40 FB41 FB43 FB44 FB46 FB47 FB48 FB49 FB4A FB4B FB4C 72 FB4D FB4E 2ADC 1D15E 1D15F 1D160 1D161 1D162 1D163 1D164 1D1BB 73 1D1BC 1D1BD 1D1BE 1D1BF 1D1C0 74); 75 76# definition of Hangul constants 77use constant SBase => 0xAC00; 78use constant SFinal => 0xD7A3; # SBase -1 + SCount 79use constant SCount => 11172; # LCount * NCount 80use constant NCount => 588; # VCount * TCount 81use constant LBase => 0x1100; 82use constant LFinal => 0x1112; 83use constant LCount => 19; 84use constant VBase => 0x1161; 85use constant VFinal => 0x1175; 86use constant VCount => 21; 87use constant TBase => 0x11A7; 88use constant TFinal => 0x11C2; 89use constant TCount => 28; 90 91sub decomposeHangul { 92 my $sindex = $_[0] - SBase; 93 my $lindex = int( $sindex / NCount); 94 my $vindex = int(($sindex % NCount) / TCount); 95 my $tindex = $sindex % TCount; 96 my @ret = ( 97 LBase + $lindex, 98 VBase + $vindex, 99 $tindex ? (TBase + $tindex) : (), 100 ); 101 return wantarray ? @ret : pack_U(@ret); 102} 103 104########## getting full decomposition ########## 105 106## converts string "hhhh hhhh hhhh" to a numeric list 107## (hex digits separated by spaces) 108sub _getHexArray { map hex, $_[0] =~ /\G *([0-9A-Fa-f]+)/g } 109 110while ($Combin =~ /(.+)/g) { 111 my @tab = split /\t/, $1; 112 my $ini = hex $tab[0]; 113 if ($tab[1] eq '') { 114 $Combin{$ini} = $tab[2]; 115 } else { 116 $Combin{$_} = $tab[2] foreach $ini .. hex($tab[1]); 117 } 118} 119 120while ($Decomp =~ /(.+)/g) { 121 my @tab = split /\t/, $1; 122 my $compat = $tab[2] =~ s/<[^>]+>//; 123 my $dec = [ _getHexArray($tab[2]) ]; # decomposition 124 my $ini = hex($tab[0]); # initial decomposable character 125 my $end = $tab[1] eq '' ? $ini : hex($tab[1]); 126 # ($ini .. $end) is the range of decomposable characters. 127 128 foreach my $u ($ini .. $end) { 129 $Compat{$u} = $dec; 130 $Canon{$u} = $dec if ! $compat; 131 } 132} 133 134for my $s (@CompEx) { 135 my $u = hex $s; 136 next if !$Canon{$u}; # not assigned 137 next if $u == 0xFB1D && !$Canon{0x1D15E}; # 3.0.1 before Corrigendum #2 138 $Exclus{$u} = 1; 139} 140 141foreach my $u (keys %Canon) { 142 my $dec = $Canon{$u}; 143 144 if (@$dec == 2) { 145 if ($Combin{ $dec->[0] }) { 146 $NonStD{$u} = 1; 147 } else { 148 $Compos{ $dec->[0] }{ $dec->[1] } = $u; 149 $Comp2nd{ $dec->[1] } = 1 if ! $Exclus{$u}; 150 } 151 } elsif (@$dec == 1) { 152 $Single{$u} = 1; 153 } else { 154 my $h = sprintf '%04X', $u; 155 croak("Weird Canonical Decomposition of U+$h"); 156 } 157} 158 159# modern HANGUL JUNGSEONG and HANGUL JONGSEONG jamo 160foreach my $j (0x1161..0x1175, 0x11A8..0x11C2) { 161 $Comp2nd{$j} = 1; 162} 163 164sub getCanonList { 165 my @src = @_; 166 my @dec = map { 167 (SBase <= $_ && $_ <= SFinal) ? decomposeHangul($_) 168 : $Canon{$_} ? @{ $Canon{$_} } : $_ 169 } @src; 170 return join(" ",@src) eq join(" ",@dec) ? @dec : getCanonList(@dec); 171 # condition @src == @dec is not ok. 172} 173 174sub getCompatList { 175 my @src = @_; 176 my @dec = map { 177 (SBase <= $_ && $_ <= SFinal) ? decomposeHangul($_) 178 : $Compat{$_} ? @{ $Compat{$_} } : $_ 179 } @src; 180 return join(" ",@src) eq join(" ",@dec) ? @dec : getCompatList(@dec); 181 # condition @src == @dec is not ok. 182} 183 184# exhaustive decomposition 185foreach my $key (keys %Canon) { 186 $Canon{$key} = [ getCanonList($key) ]; 187} 188 189# exhaustive decomposition 190foreach my $key (keys %Compat) { 191 $Compat{$key} = [ getCompatList($key) ]; 192} 193 194##### The above part is common to mkheader and PP ##### 195 196foreach my $comp1st (keys %Compos) { 197 my $listname = sprintf("${structname}_%06x", $comp1st); 198 # %04x is bad since it'd place _3046 after _1d157. 199 $Comp1st{$comp1st} = $listname; 200 my $rh1st = $Compos{$comp1st}; 201 202 foreach my $comp2nd (keys %$rh1st) { 203 my $uc = $rh1st->{$comp2nd}; 204 $CompList{$listname}{$comp2nd} = $uc; 205 } 206} 207 208sub split_into_char { 209 use bytes; 210 my $uni = shift; 211 my $len = length($uni); 212 my @ary; 213 for(my $i = 0; $i < $len; ++$i) { 214 push @ary, ord(substr($uni,$i,1)); 215 } 216 return @ary; 217} 218 219sub _U_stringify { 220 sprintf '"%s"', join '', 221 map sprintf("\\x%02x", $_), split_into_char(pack_U(@_)); 222} 223 224foreach my $hash (\%Canon, \%Compat) { 225 foreach my $key (keys %$hash) { 226 $hash->{$key} = _U_stringify( @{ $hash->{$key} } ); 227 } 228} 229 230########## writing header files ########## 231 232my @boolfunc = ( 233 { 234 name => "Exclusion", 235 type => "bool", 236 hash => \%Exclus, 237 }, 238 { 239 name => "Singleton", 240 type => "bool", 241 hash => \%Single, 242 }, 243 { 244 name => "NonStDecomp", 245 type => "bool", 246 hash => \%NonStD, 247 }, 248 { 249 name => "Comp2nd", 250 type => "bool", 251 hash => \%Comp2nd, 252 }, 253); 254 255my $orig_fh = SelectSaver->new; 256{ 257 258my $file = "unfexc.h"; 259open FH, ">$file" or croak "$PACKAGE: $file can't be made"; 260binmode FH; select FH; 261 262 print << 'EOF'; 263/* 264 * This file is auto-generated by mkheader. 265 * Any changes here will be lost! 266 */ 267EOF 268 269foreach my $tbl (@boolfunc) { 270 my @temp = sort {$a <=> $b} keys %{$tbl->{hash}}; 271 my $type = $tbl->{type}; 272 my $name = $tbl->{name}; 273 print "$type is$name (UV uv)\n{\nreturn\n\t"; 274 275 while (@temp) { 276 my $cur = shift @temp; 277 if (@temp && $cur + 1 == $temp[0]) { 278 print "($cur <= uv && uv <= "; 279 while (@temp && $cur + 1 == $temp[0]) { 280 $cur = shift @temp; 281 } 282 print "$cur)"; 283 print "\n\t|| " if @temp; 284 } else { 285 print "uv == $cur"; 286 print "\n\t|| " if @temp; 287 } 288 } 289 print "\n\t? TRUE : FALSE;\n}\n\n"; 290} 291 292close FH; 293 294#################################### 295 296my $compinit = 297 "typedef struct { UV nextchar; UV composite; } $structname;\n\n"; 298 299foreach my $i (sort keys %CompList) { 300 $compinit .= "$structname $i [] = {\n"; 301 $compinit .= join ",\n", 302 map sprintf("\t{ %d, %d }", $_, $CompList{$i}{$_}), 303 sort {$a <=> $b } keys %{ $CompList{$i} }; 304 $compinit .= ",\n{0,0}\n};\n\n"; # with sentinel 305} 306 307my @tripletable = ( 308 { 309 file => "unfcmb", 310 name => "combin", 311 type => "STDCHAR", 312 hash => \%Combin, 313 null => 0, 314 }, 315 { 316 file => "unfcan", 317 name => "canon", 318 type => "char*", 319 hash => \%Canon, 320 null => "NULL", 321 }, 322 { 323 file => "unfcpt", 324 name => "compat", 325 type => "char*", 326 hash => \%Compat, 327 null => "NULL", 328 }, 329 { 330 file => "unfcmp", 331 name => "compos", 332 type => "$structname *", 333 hash => \%Comp1st, 334 null => "NULL", 335 init => $compinit, 336 }, 337); 338 339foreach my $tbl (@tripletable) { 340 my $file = "$tbl->{file}.h"; 341 my $head = "${prefix}$tbl->{name}"; 342 my $type = $tbl->{type}; 343 my $hash = $tbl->{hash}; 344 my $null = $tbl->{null}; 345 my $init = $tbl->{init}; 346 347 open FH, ">$file" or croak "$PACKAGE: $file can't be made"; 348 binmode FH; select FH; 349 my %val; 350 351 print FH << 'EOF'; 352/* 353 * This file is auto-generated by mkheader. 354 * Any changes here will be lost! 355 */ 356EOF 357 358 print $init if defined $init; 359 360 foreach my $uv (keys %$hash) { 361 croak sprintf("a Unicode code point 0x%04X over 0x10FFFF.", $uv) 362 unless $uv <= 0x10FFFF; 363 my @c = unpack 'CCCC', pack 'N', $uv; 364 $val{ $c[1] }{ $c[2] }{ $c[3] } = $hash->{$uv}; 365 } 366 367 foreach my $p (sort { $a <=> $b } keys %val) { 368 next if ! $val{ $p }; 369 for (my $r = 0; $r < 256; $r++) { 370 next if ! $val{ $p }{ $r }; 371 printf "static $type ${head}_%02x_%02x [256] = {\n", $p, $r; 372 for (my $c = 0; $c < 256; $c++) { 373 print "\t", defined $val{$p}{$r}{$c} 374 ? "($type)".$val{$p}{$r}{$c} 375 : $null; 376 print ',' if $c != 255; 377 print "\n" if $c % 8 == 7; 378 } 379 print "};\n\n"; 380 } 381 } 382 foreach my $p (sort { $a <=> $b } keys %val) { 383 next if ! $val{ $p }; 384 printf "static $type* ${head}_%02x [256] = {\n", $p; 385 for (my $r = 0; $r < 256; $r++) { 386 print $val{ $p }{ $r } 387 ? sprintf("${head}_%02x_%02x", $p, $r) 388 : "NULL"; 389 print ',' if $r != 255; 390 print "\n" if $val{ $p }{ $r } || ($r+1) % 8 == 0; 391 } 392 print "};\n\n"; 393 } 394 print "static $type** $head [] = {\n"; 395 for (my $p = 0; $p <= 0x10; $p++) { 396 print $val{ $p } ? sprintf("${head}_%02x", $p) : "NULL"; 397 print ',' if $p != 0x10; 398 print "\n"; 399 } 400 print "};\n\n"; 401 close FH; 402} 403 404} # End of block for SelectSaver 405 4061; 407__END__ 408