1#!perl 2# 3# This auxiliary script makes five header files 4# used for building XSUB of Unicode::Normalize. 5# 6# Usage: 7# <do 'mkheader'> in perl, or <perl mkheader> in command line 8# 9# Input files: 10# unicore/CombiningClass.pl (or unicode/CombiningClass.pl) 11# unicore/Decomposition.pl (or unicode/Decomposition.pl) 12# 13# Output files: 14# unfcan.h 15# unfcpt.h 16# unfcmb.h 17# unfcmp.h 18# unfexc.h 19# 20use 5.006; 21use strict; 22use warnings; 23use Carp; 24use File::Spec; 25use SelectSaver; 26 27BEGIN { 28 unless ('A' eq pack('U', 0x41)) { 29 die "Unicode::Normalize cannot stringify a Unicode code point\n"; 30 } 31 unless (0x41 == unpack('U', 'A')) { 32 die "Unicode::Normalize cannot get Unicode code point\n"; 33 } 34} 35 36our $PACKAGE = 'Unicode::Normalize, mkheader'; 37 38our $prefix = "UNF_"; 39our $structname = "${prefix}complist"; 40 41# Starting in v5.20, the tables in lib/unicore are built using the platform's 42# native character set for code points 0-255. 43*pack_U = ($] ge 5.020) 44 ? sub { return pack('W*', @_).pack('U*'); } # The empty pack returns 45 # an empty UTF-8 string, 46 # so the effect is to 47 # force the return into 48 # being UTF-8. 49 : sub { return pack('U*', @_); }; 50 51# %Canon and %Compat will be ($codepoint => $hexstring) after _U_stringify() 52our %Comp1st; # $codepoint => $listname : may be composed with a next char. 53our %CompList; # $listname,$2nd => $codepoint : composite 54 55##### The below part is common to mkheader and PP ##### 56 57our %Combin; # $codepoint => $number : combination class 58our %Canon; # $codepoint => \@codepoints : canonical decomp. 59our %Compat; # $codepoint => \@codepoints : compat. decomp. 60our %Compos; # $1st,$2nd => $codepoint : composite 61our %Exclus; # $codepoint => 1 : composition exclusions 62our %Single; # $codepoint => 1 : singletons 63our %NonStD; # $codepoint => 1 : non-starter decompositions 64our %Comp2nd; # $codepoint => 1 : may be composed with a prev char. 65 66# from core Unicode database 67our $Combin = do "unicore/CombiningClass.pl" 68 || do "unicode/CombiningClass.pl" 69 || croak "$PACKAGE: CombiningClass.pl not found"; 70our $Decomp = do "unicore/Decomposition.pl" 71 || do "unicode/Decomposition.pl" 72 || croak "$PACKAGE: Decomposition.pl not found"; 73 74# CompositionExclusions.txt since Unicode 3.2.0. If this ever changes, it 75# would be better to get the values from Unicode::UCD rather than hard-code 76# them here, as that will protect from having to make fixes for future 77# changes. 78our @CompEx = qw( 79 0958 0959 095A 095B 095C 095D 095E 095F 09DC 09DD 09DF 0A33 0A36 80 0A59 0A5A 0A5B 0A5E 0B5C 0B5D 0F43 0F4D 0F52 0F57 0F5C 0F69 0F76 81 0F78 0F93 0F9D 0FA2 0FA7 0FAC 0FB9 FB1D FB1F FB2A FB2B FB2C FB2D 82 FB2E FB2F FB30 FB31 FB32 FB33 FB34 FB35 FB36 FB38 FB39 FB3A FB3B 83 FB3C FB3E FB40 FB41 FB43 FB44 FB46 FB47 FB48 FB49 FB4A FB4B FB4C 84 FB4D FB4E 2ADC 1D15E 1D15F 1D160 1D161 1D162 1D163 1D164 1D1BB 85 1D1BC 1D1BD 1D1BE 1D1BF 1D1C0 86); 87 88# definition of Hangul constants 89use constant SBase => 0xAC00; 90use constant SFinal => 0xD7A3; # SBase -1 + SCount 91use constant SCount => 11172; # LCount * NCount 92use constant NCount => 588; # VCount * TCount 93use constant LBase => 0x1100; 94use constant LFinal => 0x1112; 95use constant LCount => 19; 96use constant VBase => 0x1161; 97use constant VFinal => 0x1175; 98use constant VCount => 21; 99use constant TBase => 0x11A7; 100use constant TFinal => 0x11C2; 101use constant TCount => 28; 102 103sub decomposeHangul { 104 my $sindex = $_[0] - SBase; 105 my $lindex = int( $sindex / NCount); 106 my $vindex = int(($sindex % NCount) / TCount); 107 my $tindex = $sindex % TCount; 108 my @ret = ( 109 LBase + $lindex, 110 VBase + $vindex, 111 $tindex ? (TBase + $tindex) : (), 112 ); 113 return wantarray ? @ret : pack_U(@ret); 114} 115 116########## getting full decomposition ########## 117 118## converts string "hhhh hhhh hhhh" to a numeric list 119## (hex digits separated by spaces) 120sub _getHexArray { map hex, $_[0] =~ /\G *([0-9A-Fa-f]+)/g } 121 122while ($Combin =~ /(.+)/g) { 123 my @tab = split /\t/, $1; 124 my $ini = hex $tab[0]; 125 if ($tab[1] eq '') { 126 $Combin{$ini} = $tab[2]; 127 } else { 128 $Combin{$_} = $tab[2] foreach $ini .. hex($tab[1]); 129 } 130} 131 132while ($Decomp =~ /(.+)/g) { 133 my @tab = split /\t/, $1; 134 my $compat = $tab[2] =~ s/<[^>]+>//; 135 my $dec = [ _getHexArray($tab[2]) ]; # decomposition 136 my $ini = hex($tab[0]); # initial decomposable character 137 my $end = $tab[1] eq '' ? $ini : hex($tab[1]); 138 # ($ini .. $end) is the range of decomposable characters. 139 140 foreach my $u ($ini .. $end) { 141 $Compat{$u} = $dec; 142 $Canon{$u} = $dec if ! $compat; 143 } 144} 145 146for my $s (@CompEx) { 147 my $u = hex $s; 148 next if !$Canon{$u}; # not assigned 149 next if $u == 0xFB1D && !$Canon{0x1D15E}; # 3.0.1 before Corrigendum #2 150 $Exclus{$u} = 1; 151} 152 153foreach my $u (keys %Canon) { 154 my $dec = $Canon{$u}; 155 156 if (@$dec == 2) { 157 if ($Combin{ $dec->[0] }) { 158 $NonStD{$u} = 1; 159 } else { 160 $Compos{ $dec->[0] }{ $dec->[1] } = $u; 161 $Comp2nd{ $dec->[1] } = 1 if ! $Exclus{$u}; 162 } 163 } elsif (@$dec == 1) { 164 $Single{$u} = 1; 165 } else { 166 my $h = sprintf '%04X', $u; 167 croak("Weird Canonical Decomposition of U+$h"); 168 } 169} 170 171# modern HANGUL JUNGSEONG and HANGUL JONGSEONG jamo 172foreach my $j (0x1161..0x1175, 0x11A8..0x11C2) { 173 $Comp2nd{$j} = 1; 174} 175 176sub getCanonList { 177 my @src = @_; 178 my @dec = map { 179 (SBase <= $_ && $_ <= SFinal) ? decomposeHangul($_) 180 : $Canon{$_} ? @{ $Canon{$_} } : $_ 181 } @src; 182 return join(" ",@src) eq join(" ",@dec) ? @dec : getCanonList(@dec); 183 # condition @src == @dec is not ok. 184} 185 186sub getCompatList { 187 my @src = @_; 188 my @dec = map { 189 (SBase <= $_ && $_ <= SFinal) ? decomposeHangul($_) 190 : $Compat{$_} ? @{ $Compat{$_} } : $_ 191 } @src; 192 return join(" ",@src) eq join(" ",@dec) ? @dec : getCompatList(@dec); 193 # condition @src == @dec is not ok. 194} 195 196# exhaustive decomposition 197foreach my $key (keys %Canon) { 198 $Canon{$key} = [ getCanonList($key) ]; 199} 200 201# exhaustive decomposition 202foreach my $key (keys %Compat) { 203 $Compat{$key} = [ getCompatList($key) ]; 204} 205 206##### The above part is common to mkheader and PP ##### 207 208foreach my $comp1st (keys %Compos) { 209 my $listname = sprintf("${structname}_%06x", $comp1st); 210 # %04x is bad since it'd place _3046 after _1d157. 211 $Comp1st{$comp1st} = $listname; 212 my $rh1st = $Compos{$comp1st}; 213 214 foreach my $comp2nd (keys %$rh1st) { 215 my $uc = $rh1st->{$comp2nd}; 216 $CompList{$listname}{$comp2nd} = $uc; 217 } 218} 219 220sub split_into_char { 221 use bytes; 222 my $uni = shift; 223 my $len = length($uni); 224 my @ary; 225 for(my $i = 0; $i < $len; ++$i) { 226 push @ary, ord(substr($uni,$i,1)); 227 } 228 return @ary; 229} 230 231sub _U_stringify { 232 sprintf '"%s"', join '', 233 map sprintf("\\x%02x", $_), split_into_char(pack_U(@_)); 234} 235 236foreach my $hash (\%Canon, \%Compat) { 237 foreach my $key (keys %$hash) { 238 $hash->{$key} = _U_stringify( @{ $hash->{$key} } ); 239 } 240} 241 242########## writing header files ########## 243 244my @boolfunc = ( 245 { 246 name => "Exclusion", 247 type => "bool", 248 hash => \%Exclus, 249 }, 250 { 251 name => "Singleton", 252 type => "bool", 253 hash => \%Single, 254 }, 255 { 256 name => "NonStDecomp", 257 type => "bool", 258 hash => \%NonStD, 259 }, 260 { 261 name => "Comp2nd", 262 type => "bool", 263 hash => \%Comp2nd, 264 }, 265); 266 267my $orig_fh = SelectSaver->new; 268{ 269 270my $file = "unfexc.h"; 271open FH, ">$file" or croak "$PACKAGE: $file can't be made"; 272binmode FH; select FH; 273 274 print << 'EOF'; 275/* 276 * This file is auto-generated by mkheader. 277 * Any changes here will be lost! 278 */ 279EOF 280 281foreach my $tbl (@boolfunc) { 282 my @temp = sort {$a <=> $b} keys %{$tbl->{hash}}; 283 my $type = $tbl->{type}; 284 my $name = $tbl->{name}; 285 print "$type is$name (UV uv)\n{\nreturn\n\t"; 286 287 while (@temp) { 288 my $cur = shift @temp; 289 if (@temp && $cur + 1 == $temp[0]) { 290 print "($cur <= uv && uv <= "; 291 while (@temp && $cur + 1 == $temp[0]) { 292 $cur = shift @temp; 293 } 294 print "$cur)"; 295 print "\n\t|| " if @temp; 296 } else { 297 print "uv == $cur"; 298 print "\n\t|| " if @temp; 299 } 300 } 301 print "\n\t? TRUE : FALSE;\n}\n\n"; 302} 303 304close FH; 305 306#################################### 307 308my $compinit = 309 "typedef struct { UV nextchar; UV composite; } $structname;\n\n"; 310 311foreach my $i (sort keys %CompList) { 312 $compinit .= "$structname $i [] = {\n"; 313 $compinit .= join ",\n", 314 map sprintf("\t{ %d, %d }", $_, $CompList{$i}{$_}), 315 sort {$a <=> $b } keys %{ $CompList{$i} }; 316 $compinit .= ",\n{0,0}\n};\n\n"; # with sentinel 317} 318 319my @tripletable = ( 320 { 321 file => "unfcmb", 322 name => "combin", 323 type => "STDCHAR", 324 hash => \%Combin, 325 null => 0, 326 }, 327 { 328 file => "unfcan", 329 name => "canon", 330 type => "char*", 331 hash => \%Canon, 332 null => "NULL", 333 }, 334 { 335 file => "unfcpt", 336 name => "compat", 337 type => "char*", 338 hash => \%Compat, 339 null => "NULL", 340 }, 341 { 342 file => "unfcmp", 343 name => "compos", 344 type => "$structname *", 345 hash => \%Comp1st, 346 null => "NULL", 347 init => $compinit, 348 }, 349); 350 351foreach my $tbl (@tripletable) { 352 my $file = "$tbl->{file}.h"; 353 my $head = "${prefix}$tbl->{name}"; 354 my $type = $tbl->{type}; 355 my $hash = $tbl->{hash}; 356 my $null = $tbl->{null}; 357 my $init = $tbl->{init}; 358 359 open FH, ">$file" or croak "$PACKAGE: $file can't be made"; 360 binmode FH; select FH; 361 my %val; 362 363 print FH << 'EOF'; 364/* 365 * This file is auto-generated by mkheader. 366 * Any changes here will be lost! 367 */ 368EOF 369 370 print $init if defined $init; 371 372 foreach my $uv (keys %$hash) { 373 croak sprintf("a Unicode code point 0x%04X over 0x10FFFF.", $uv) 374 unless $uv <= 0x10FFFF; 375 my @c = unpack 'CCCC', pack 'N', $uv; 376 $val{ $c[1] }{ $c[2] }{ $c[3] } = $hash->{$uv}; 377 } 378 379 foreach my $p (sort { $a <=> $b } keys %val) { 380 next if ! $val{ $p }; 381 for (my $r = 0; $r < 256; $r++) { 382 next if ! $val{ $p }{ $r }; 383 printf "static $type ${head}_%02x_%02x [256] = {\n", $p, $r; 384 for (my $c = 0; $c < 256; $c++) { 385 print "\t", defined $val{$p}{$r}{$c} 386 ? "($type)".$val{$p}{$r}{$c} 387 : $null; 388 print ',' if $c != 255; 389 print "\n" if $c % 8 == 7; 390 } 391 print "};\n\n"; 392 } 393 } 394 foreach my $p (sort { $a <=> $b } keys %val) { 395 next if ! $val{ $p }; 396 printf "static $type* ${head}_%02x [256] = {\n", $p; 397 for (my $r = 0; $r < 256; $r++) { 398 print $val{ $p }{ $r } 399 ? sprintf("${head}_%02x_%02x", $p, $r) 400 : "NULL"; 401 print ',' if $r != 255; 402 print "\n" if $val{ $p }{ $r } || ($r+1) % 8 == 0; 403 } 404 print "};\n\n"; 405 } 406 print "static $type** $head [] = {\n"; 407 for (my $p = 0; $p <= 0x10; $p++) { 408 print $val{ $p } ? sprintf("${head}_%02x", $p) : "NULL"; 409 print ',' if $p != 0x10; 410 print "\n"; 411 } 412 print "};\n\n"; 413 close FH; 414} 415 416} # End of block for SelectSaver 417 4181; 419__END__ 420