1#! /usr/local/bin/perl -w 2# $Id: generate_normalize_data.pl,v 1.1 2003/06/04 00:27:55 marka Exp $ 3# 4# Copyright (c) 2000,2001 Japan Network Information Center. 5# All rights reserved. 6# 7# By using this file, you agree to the terms and conditions set forth bellow. 8# 9# LICENSE TERMS AND CONDITIONS 10# 11# The following License Terms and Conditions apply, unless a different 12# license is obtained from Japan Network Information Center ("JPNIC"), 13# a Japanese association, Kokusai-Kougyou-Kanda Bldg 6F, 2-3-4 Uchi-Kanda, 14# Chiyoda-ku, Tokyo 101-0047, Japan. 15# 16# 1. Use, Modification and Redistribution (including distribution of any 17# modified or derived work) in source and/or binary forms is permitted 18# under this License Terms and Conditions. 19# 20# 2. Redistribution of source code must retain the copyright notices as they 21# appear in each source code file, this License Terms and Conditions. 22# 23# 3. Redistribution in binary form must reproduce the Copyright Notice, 24# this License Terms and Conditions, in the documentation and/or other 25# materials provided with the distribution. For the purposes of binary 26# distribution the "Copyright Notice" refers to the following language: 27# "Copyright (c) 2000-2002 Japan Network Information Center. All rights reserved." 28# 29# 4. The name of JPNIC may not be used to endorse or promote products 30# derived from this Software without specific prior written approval of 31# JPNIC. 32# 33# 5. Disclaimer/Limitation of Liability: THIS SOFTWARE IS PROVIDED BY JPNIC 34# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 35# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A 36# PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL JPNIC BE LIABLE 37# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 38# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 39# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR 40# BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, 41# WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR 42# OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF 43# ADVISED OF THE POSSIBILITY OF SUCH DAMAGES. 44# 45 46# 47# Generate lib/unicodedata.c from UnicodeData.txt, 48# CompositionExclusions-1.txt, SpecialCasing.txt and CaseFolding.txt, 49# all of them available from ftp://ftp.unicode.org/Public/UNIDATA/. 50# 51 52use strict; 53use lib qw(.); 54 55use Getopt::Long; 56use UCD; 57use SparseMap; 58 59use constant UCS_MAX => 0x110000; 60use constant END_BIT => 0x80000000; 61 62my $DECOMP_COMPAT_BIT = 0x8000; 63 64my $CASEMAP_FINAL_BIT = 0x1; 65my $CASEMAP_NONFINAL_BIT = 0x2; 66my $CASEMAP_LAST_BIT = 0x10; 67 68my $LETTER_BIT = 1; 69my $NSPMARK_BIT = 2; 70 71(my $myid = '$Id: generate_normalize_data.pl,v 1.1 2003/06/04 00:27:55 marka Exp $') =~ s/\$([^\$]+)\$/\$-$1-\$/; 72 73my @default_bits = (9, 7, 5); 74#my @default_bits = (7, 7, 7); 75my @canon_class_bits = @default_bits; 76my @decomp_bits = @default_bits; 77my @comp_bits = @default_bits; 78my @folding_bits = @default_bits; 79my @casemap_bits = @default_bits; 80my @casemap_ctx_bits = @default_bits; 81 82my $prefix = ''; 83my $dir = '.'; 84my $unicodedatafile = 'UnicodeData.txt'; 85my $exclusionfile = 'CompositionExclusions.txt'; 86my $specialcasefile = 'SpecialCasing.txt'; 87my $casefoldingfile = 'CaseFolding.txt'; 88my $verbose; 89 90GetOptions('dir|d=s' => \$dir, 91 'unicodedata|u=s' => \$unicodedatafile, 92 'exclude|e=s' => \$exclusionfile, 93 'specialcase|s=s' => \$specialcasefile, 94 'casefold|c=s' => \$casefoldingfile, 95 'prefix|p=s' => \$prefix, 96 'verbose|v' => \$verbose, 97) or usage(); 98 99foreach my $r (\$unicodedatafile, \$exclusionfile, 100 \$specialcasefile, \$casefoldingfile) { 101 $$r = "$dir/$$r" unless $$r =~ m|^/|; 102} 103 104my %exclusions; 105my %lower_special; 106my %upper_special; 107 108my @decomp_data; 109my @comp_data; 110my @toupper_data; 111my @tolower_data; 112my @folding_data; 113 114# 115# Create Mapping/Bitmap objects. 116# 117 118# canonical class 119my $canon_class = SparseMap::Int->new(BITS => [@canon_class_bits], 120 MAX => UCS_MAX, 121 MAPALL => 1, 122 DEFAULT => 0); 123 124# canonical/compatibility decomposition 125my $decomp = SparseMap::Int->new(BITS => [@decomp_bits], 126 MAX => UCS_MAX, 127 MAPALL => 1, 128 DEFAULT => 0); 129 130# canonical composition 131my $comp = SparseMap::Int->new(BITS => [@comp_bits], 132 MAX => UCS_MAX, 133 MAPALL => 1, 134 DEFAULT => 0); 135 136# uppercase/lowercase 137my $upper = SparseMap::Int->new(BITS => [@casemap_bits], 138 MAX => UCS_MAX, 139 MAPALL => 1, 140 DEFAULT => 0); 141my $lower = SparseMap::Int->new(BITS => [@casemap_bits], 142 MAX => UCS_MAX, 143 MAPALL => 1, 144 DEFAULT => 0); 145 146# final/nonfinal context 147my $casemap_ctx = SparseMap::Int->new(BITS => [@casemap_ctx_bits], 148 MAX => UCS_MAX, 149 MAPALL => 1, 150 DEFAULT => 0); 151 152# casefolding 153my $folding = SparseMap::Int->new(BITS => [@folding_bits], 154 MAX => UCS_MAX, 155 MAPALL => 1, 156 DEFAULT => 0); 157 158# 159# Read datafiles. 160# 161 162read_exclusion_file(); 163read_specialcasing_file(); 164read_unicodedata_file(); 165read_casefolding_file(); 166 167print_header(); 168print_canon_class(); 169print_composition(); 170print_decomposition(); 171print_casemap(); 172print_casemap_context(); 173print_casefolding(); 174 175exit; 176 177sub usage { 178 print STDERR <<"END"; 179Usage: $0 [options..] 180 options: 181 -d DIR directory where Unicode Character Data files resides [./] 182 -u FILE name of the UnicodeData file [UnicodeData.txt] 183 -e FILE name of the CompositionExclusion file [CompositionExclusions-1.txt] 184 -s FILE name of the SpecialCasing file [SpecialCasing.txt] 185 -c FILE name of the CaseFolding file [CaseFolding.txt] 186END 187 exit 1; 188} 189 190# 191# read_exclusion_file -- read CompositionExclusions-1.txt. 192# 193sub read_exclusion_file { 194 open EXCLUDE, $exclusionfile or die "cannot open $exclusionfile: $!\n"; 195 while ($_ = UCD::CompositionExclusions::getline(\*EXCLUDE)) { 196 my %data = UCD::CompositionExclusions::parseline($_); 197 $exclusions{$data{CODE}} = 1; 198 } 199 close EXCLUDE; 200} 201 202# 203# read_specialcasing_file -- read SpecialCasing.txt 204# 205sub read_specialcasing_file { 206 open SPCASE, $specialcasefile or die "cannot open $specialcasefile: $!\n"; 207 while ($_ = UCD::SpecialCasing::getline(\*SPCASE)) { 208 my %data = UCD::SpecialCasing::parseline($_); 209 my $code = $data{CODE}; 210 my $lower = $data{LOWER}; 211 my $upper = $data{UPPER}; 212 my $cond = $data{CONDITION} || ''; 213 214 next unless $cond eq '' or $cond =~ /^(NON_)?FINAL/; 215 216 if (defined $cond && (@$lower > 1 || $lower->[0] != $code) 217 or @$lower > 1 or $lower->[0] != $code) { 218 $lower_special{$code} = [$lower, $cond]; 219 } 220 if (defined $cond && (@$upper > 1 || $upper->[0] != $code) 221 or @$upper > 1 or $upper->[0] != $code) { 222 $upper_special{$code} = [$upper, $cond]; 223 } 224 } 225 close SPCASE; 226} 227 228# 229# read_unicodedata_file -- read UnicodeData.txt 230# 231sub read_unicodedata_file { 232 open UCD, $unicodedatafile or die "cannot open $unicodedatafile: $!\n"; 233 234 @decomp_data = (0); 235 @toupper_data = (0); 236 @tolower_data = (0); 237 238 my @comp_cand; # canonical composition candidates 239 my %nonstarter; 240 241 while ($_ = UCD::UnicodeData::getline(\*UCD)) { 242 my %data = UCD::UnicodeData::parseline($_); 243 my $code = $data{CODE}; 244 245 # combining class 246 if ($data{CLASS} > 0) { 247 $nonstarter{$code} = 1; 248 $canon_class->add($code, $data{CLASS}); 249 } 250 251 # uppercasing 252 if (exists $upper_special{$code} or defined $data{UPPER}) { 253 my $offset = @toupper_data; 254 my @casedata; 255 256 $upper->add($code, $offset); 257 if (exists $upper_special{$code}) { 258 push @casedata, $upper_special{$code}; 259 } 260 if (defined $data{UPPER}) { 261 push @casedata, $data{UPPER}; 262 } 263 push @toupper_data, casemap_data(@casedata); 264 } 265 266 # lowercasing 267 if (exists $lower_special{$code} or defined $data{LOWER}) { 268 my $offset = @tolower_data; 269 my @casedata; 270 271 $lower->add($code, $offset); 272 if (exists $lower_special{$code}) { 273 push @casedata, $lower_special{$code}; 274 } 275 if (defined $data{LOWER}) { 276 push @casedata, $data{LOWER}; 277 } 278 push @tolower_data, casemap_data(@casedata); 279 } 280 281 # composition/decomposition 282 if ($data{DECOMP}) { 283 my ($tag, @decomp) = @{$data{DECOMP}}; 284 my $offset = @decomp_data; 285 286 # composition 287 if ($tag eq '' and @decomp > 1 and not exists $exclusions{$code}) { 288 # canonical composition candidate 289 push @comp_cand, [$code, @decomp]; 290 } 291 292 # decomposition 293 if ($tag ne '') { 294 # compatibility decomposition 295 $offset |= $DECOMP_COMPAT_BIT; 296 } 297 $decomp->add($code, $offset); 298 push @decomp_data, @decomp; 299 $decomp_data[-1] |= END_BIT; 300 301 } 302 303 # final/nonfinal context 304 if ($data{CATEGORY} =~ /L[ult]/) { 305 $casemap_ctx->add($code, $LETTER_BIT); 306 } elsif ($data{CATEGORY} eq 'Mn') { 307 $casemap_ctx->add($code, $NSPMARK_BIT); 308 } 309 } 310 close UCD; 311 312 # Eliminate composition candidates whose decomposition starts with 313 # a non-starter. 314 @comp_cand = grep {not exists $nonstarter{$_->[1]}} @comp_cand; 315 316 @comp_data = ([0, 0, 0]); 317 my $last_code = -1; 318 my $last_offset = @comp_data; 319 for my $r (sort {$a->[1] <=> $b->[1] || $a->[2] <=> $b->[2]} @comp_cand) { 320 if ($r->[1] != $last_code) { 321 $comp->add($last_code, 322 ($last_offset | ((@comp_data - $last_offset)<<16))) 323 unless $last_code == -1; 324 $last_code = $r->[1]; 325 $last_offset = @comp_data; 326 } 327 push @comp_data, $r; 328 } 329 $comp->add($last_code, 330 ($last_offset | ((@comp_data - $last_offset)<<16))); 331} 332 333sub casemap_data { 334 my @data = @_; 335 my @result = (); 336 while (@data > 0) { 337 my $r = shift @data; 338 my $flag = 0; 339 if (ref $r) { 340 if ($r->[1] eq 'FINAL') { 341 $flag |= $CASEMAP_FINAL_BIT; 342 } elsif ($r->[1] eq 'NON_FINAL') { 343 $flag |= $CASEMAP_NONFINAL_BIT; 344 } elsif ($r->[1] ne '') { 345 die "unknown condition \"", $r->[1], "\"\n"; 346 } 347 } 348 $flag |= $CASEMAP_LAST_BIT if @data == 0; 349 push @result, $flag; 350 push @result, (ref $r) ? @{$r->[0]} : $r; 351 $result[-1] |= END_BIT; 352 } 353 @result; 354} 355 356# 357# read_casefolding_file -- read CaseFolding.txt 358# 359sub read_casefolding_file { 360 open FOLD, $casefoldingfile or die "cannto open $casefoldingfile: $!\n"; 361 362 # dummy. 363 @folding_data = (0); 364 365 while ($_ = UCD::CaseFolding::getline(\*FOLD)) { 366 my %data = UCD::CaseFolding::parseline($_); 367 368 $folding->add($data{CODE}, scalar(@folding_data)); 369 push @folding_data, @{$data{MAP}}; 370 $folding_data[-1] |= END_BIT; 371 } 372 close FOLD; 373} 374 375sub print_header { 376 print <<"END"; 377/* \$Id\$ */ 378/* $myid */ 379/* 380 * Do not edit this file! 381 * This file is generated from UnicodeData.txt, CompositionExclusions-1.txt, 382 * SpecialCasing.txt and CaseFolding.txt. 383 */ 384 385END 386} 387 388# 389# print_canon_class -- generate data for canonical class 390# 391sub print_canon_class { 392 $canon_class->fix(); 393 print STDERR "** cannon_class\n", $canon_class->stat() if $verbose; 394 395 print <<"END"; 396 397/* 398 * Canonical Class 399 */ 400 401END 402 print_bits("CANON_CLASS", @canon_class_bits); 403 print "\n"; 404 print $canon_class->cprog(NAME => "${prefix}canon_class"); 405} 406 407# 408# print_composition -- generate data for canonical composition 409# 410sub print_composition { 411 $comp->fix(); 412 print STDERR "** composition\n", $comp->stat() if $verbose; 413 414 print <<"END"; 415 416/* 417 * Canonical Composition 418 */ 419 420END 421 print_bits("CANON_COMPOSE", @comp_bits); 422 print "\n"; 423 print $comp->cprog(NAME => "${prefix}compose"); 424 print <<"END"; 425 426static const struct composition ${prefix}compose_seq[] = { 427END 428 my $i = 0; 429 foreach my $r (@comp_data) { 430 if ($i % 2 == 0) { 431 print "\n" if $i != 0; 432 print "\t"; 433 } 434 printf "{ 0x%08x, 0x%08x }, ", $r->[2], $r->[0]; 435 $i++; 436 } 437 print "\n};\n\n"; 438} 439 440# 441# print_decomposition -- generate data for canonical/compatibility 442# decomposition 443# 444sub print_decomposition { 445 $decomp->fix(); 446 print STDERR "** decomposition\n", $decomp->stat() if $verbose; 447 448 print <<"END"; 449 450/* 451 * Canonical/Compatibility Decomposition 452 */ 453 454END 455 print_bits("DECOMP", @decomp_bits); 456 print "#define DECOMP_COMPAT\t$DECOMP_COMPAT_BIT\n\n"; 457 458 print $decomp->cprog(NAME => "${prefix}decompose"); 459 460 print "static const unsigned long ${prefix}decompose_seq[] = {\n"; 461 print_ulseq(@decomp_data); 462 print "};\n\n"; 463} 464 465# 466# print_casemap -- generate data for case mapping 467# 468sub print_casemap { 469 $upper->fix(); 470 $lower->fix(); 471 print STDERR "** upper mapping\n", $upper->stat() if $verbose; 472 print STDERR "** lower mapping\n", $lower->stat() if $verbose; 473 474 print <<"END"; 475 476/* 477 * Lowercase <-> Uppercase mapping 478 */ 479 480/* 481 * Flags for special case mapping. 482 */ 483#define CMF_FINAL $CASEMAP_FINAL_BIT 484#define CMF_NONFINAL $CASEMAP_NONFINAL_BIT 485#define CMF_LAST $CASEMAP_LAST_BIT 486#define CMF_CTXDEP (CMF_FINAL|CMF_NONFINAL) 487 488END 489 print_bits("CASEMAP", @casemap_bits); 490 print "\n"; 491 print $upper->cprog(NAME => "${prefix}toupper"); 492 print $lower->cprog(NAME => "${prefix}tolower"); 493 494 print "static const unsigned long ${prefix}toupper_seq[] = {\n"; 495 print_ulseq(@toupper_data); 496 print "};\n\n"; 497 498 print "static const unsigned long ${prefix}tolower_seq[] = {\n"; 499 print_ulseq(@tolower_data); 500 print "};\n\n"; 501} 502 503# 504# print_casefolding -- generate data for case folding 505# 506sub print_casefolding { 507 $folding->fix(); 508 print STDERR "** case folding\n", $folding->stat() if $verbose; 509 510 print <<"END"; 511 512/* 513 * Case Folding 514 */ 515 516END 517 print_bits("CASE_FOLDING", @folding_bits); 518 print "\n"; 519 print $folding->cprog(NAME => "${prefix}case_folding"); 520 521 print "static const unsigned long ${prefix}case_folding_seq[] = {\n"; 522 print_ulseq(@folding_data); 523 print "};\n\n"; 524} 525 526# 527# print_casemap_context -- gerarate data for determining context 528# (final/non-final) 529# 530sub print_casemap_context { 531 $casemap_ctx->fix(); 532 print STDERR "** casemap context\n", $casemap_ctx->stat() if $verbose; 533 534 print <<"END"; 535 536/* 537 * Cased characters and non-spacing marks (for casemap context) 538 */ 539 540END 541 542 print_bits("CASEMAP_CTX", @casemap_ctx_bits); 543 print <<"END"; 544 545#define CTX_CASED $LETTER_BIT 546#define CTX_NSM $NSPMARK_BIT 547 548END 549 print $casemap_ctx->cprog(NAME => "${prefix}casemap_ctx"); 550} 551 552sub sprint_composition_hash { 553 my $i = 0; 554 my $s = ''; 555 foreach my $r (@_) { 556 if ($i % 2 == 0) { 557 $s .= "\n" if $i != 0; 558 $s .= "\t"; 559 } 560 $s .= sprintf "{0x%04x, 0x%04x, 0x%04x}, ", @{$r}; 561 $i++; 562 } 563 $s; 564} 565 566sub print_bits { 567 my $prefix = shift; 568 my $i = 0; 569 foreach my $bit (@_) { 570 print "#define ${prefix}_BITS_$i\t$bit\n"; 571 $i++; 572 } 573} 574 575sub print_ulseq { 576 my $i = 0; 577 foreach my $v (@_) { 578 if ($i % 4 == 0) { 579 print "\n" if $i != 0; 580 print "\t"; 581 } 582 printf "0x%08x, ", $v; 583 $i++; 584 } 585 print "\n"; 586} 587