1#! /usr/local/bin/perl -w
2# $Id: generate_normalize_data.pl,v 1.1 2003/06/04 00:27:55 marka Exp $
3#
4# Copyright (c) 2000,2001 Japan Network Information Center.
5# All rights reserved.
6#
7# By using this file, you agree to the terms and conditions set forth bellow.
8#
9# 			LICENSE TERMS AND CONDITIONS
10#
11# The following License Terms and Conditions apply, unless a different
12# license is obtained from Japan Network Information Center ("JPNIC"),
13# a Japanese association, Kokusai-Kougyou-Kanda Bldg 6F, 2-3-4 Uchi-Kanda,
14# Chiyoda-ku, Tokyo 101-0047, Japan.
15#
16# 1. Use, Modification and Redistribution (including distribution of any
17#    modified or derived work) in source and/or binary forms is permitted
18#    under this License Terms and Conditions.
19#
20# 2. Redistribution of source code must retain the copyright notices as they
21#    appear in each source code file, this License Terms and Conditions.
22#
23# 3. Redistribution in binary form must reproduce the Copyright Notice,
24#    this License Terms and Conditions, in the documentation and/or other
25#    materials provided with the distribution.  For the purposes of binary
26#    distribution the "Copyright Notice" refers to the following language:
27#    "Copyright (c) 2000-2002 Japan Network Information Center.  All rights reserved."
28#
29# 4. The name of JPNIC may not be used to endorse or promote products
30#    derived from this Software without specific prior written approval of
31#    JPNIC.
32#
33# 5. Disclaimer/Limitation of Liability: THIS SOFTWARE IS PROVIDED BY JPNIC
34#    "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
35#    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
36#    PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL JPNIC BE LIABLE
37#    FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
38#    CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
39#    SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
40#    BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
41#    WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
42#    OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
43#    ADVISED OF THE POSSIBILITY OF SUCH DAMAGES.
44#
45
46#
47# Generate lib/unicodedata.c from UnicodeData.txt,
48# CompositionExclusions-1.txt, SpecialCasing.txt and CaseFolding.txt,
49# all of them available from ftp://ftp.unicode.org/Public/UNIDATA/.
50#
51
52use strict;
53use lib qw(.);
54
55use Getopt::Long;
56use UCD;
57use SparseMap;
58
59use constant UCS_MAX => 0x110000;
60use constant END_BIT => 0x80000000;
61
62my $DECOMP_COMPAT_BIT = 0x8000;
63
64my $CASEMAP_FINAL_BIT = 0x1;
65my $CASEMAP_NONFINAL_BIT = 0x2;
66my $CASEMAP_LAST_BIT = 0x10;
67
68my $LETTER_BIT = 1;
69my $NSPMARK_BIT = 2;
70
71(my $myid = '$Id: generate_normalize_data.pl,v 1.1 2003/06/04 00:27:55 marka Exp $') =~ s/\$([^\$]+)\$/\$-$1-\$/;
72
73my @default_bits = (9, 7, 5);
74#my @default_bits = (7, 7, 7);
75my @canon_class_bits = @default_bits;
76my @decomp_bits = @default_bits;
77my @comp_bits = @default_bits;
78my @folding_bits = @default_bits;
79my @casemap_bits = @default_bits;
80my @casemap_ctx_bits = @default_bits;
81
82my $prefix = '';
83my $dir = '.';
84my $unicodedatafile = 'UnicodeData.txt';
85my $exclusionfile = 'CompositionExclusions.txt';
86my $specialcasefile = 'SpecialCasing.txt';
87my $casefoldingfile = 'CaseFolding.txt';
88my $verbose;
89
90GetOptions('dir|d=s' => \$dir,
91	   'unicodedata|u=s' => \$unicodedatafile,
92	   'exclude|e=s' => \$exclusionfile,
93	   'specialcase|s=s' => \$specialcasefile,
94	   'casefold|c=s' => \$casefoldingfile,
95	   'prefix|p=s' => \$prefix,
96	   'verbose|v' => \$verbose,
97) or usage();
98
99foreach my $r (\$unicodedatafile, \$exclusionfile,
100	       \$specialcasefile, \$casefoldingfile) {
101    $$r = "$dir/$$r" unless $$r =~ m|^/|;
102}
103
104my %exclusions;
105my %lower_special;
106my %upper_special;
107
108my @decomp_data;
109my @comp_data;
110my @toupper_data;
111my @tolower_data;
112my @folding_data;
113
114#
115# Create Mapping/Bitmap objects.
116#
117
118# canonical class
119my $canon_class = SparseMap::Int->new(BITS => [@canon_class_bits],
120				     MAX => UCS_MAX,
121				     MAPALL => 1,
122				     DEFAULT => 0);
123
124# canonical/compatibility decomposition
125my $decomp = SparseMap::Int->new(BITS => [@decomp_bits],
126				 MAX => UCS_MAX,
127				 MAPALL => 1,
128				 DEFAULT => 0);
129
130# canonical composition
131my $comp = SparseMap::Int->new(BITS => [@comp_bits],
132			       MAX => UCS_MAX,
133			       MAPALL => 1,
134			       DEFAULT => 0);
135
136# uppercase/lowercase
137my $upper = SparseMap::Int->new(BITS => [@casemap_bits],
138			        MAX => UCS_MAX,
139			        MAPALL => 1,
140			        DEFAULT => 0);
141my $lower = SparseMap::Int->new(BITS => [@casemap_bits],
142			        MAX => UCS_MAX,
143			        MAPALL => 1,
144			        DEFAULT => 0);
145
146# final/nonfinal context
147my $casemap_ctx = SparseMap::Int->new(BITS => [@casemap_ctx_bits],
148				      MAX => UCS_MAX,
149				      MAPALL => 1,
150				      DEFAULT => 0);
151
152# casefolding
153my $folding = SparseMap::Int->new(BITS => [@folding_bits],
154				  MAX => UCS_MAX,
155				  MAPALL => 1,
156				  DEFAULT => 0);
157
158#
159# Read datafiles.
160#
161
162read_exclusion_file();
163read_specialcasing_file();
164read_unicodedata_file();
165read_casefolding_file();
166
167print_header();
168print_canon_class();
169print_composition();
170print_decomposition();
171print_casemap();
172print_casemap_context();
173print_casefolding();
174
175exit;
176
177sub usage {
178    print STDERR <<"END";
179Usage: $0 [options..]
180  options:
181    -d DIR  directory where Unicode Character Data files resides [./]
182    -u FILE name of the UnicodeData file [UnicodeData.txt]
183    -e FILE name of the CompositionExclusion file [CompositionExclusions-1.txt]
184    -s FILE name of the SpecialCasing file [SpecialCasing.txt]
185    -c FILE name of the CaseFolding file [CaseFolding.txt]
186END
187    exit 1;
188}
189
190#
191# read_exclusion_file -- read CompositionExclusions-1.txt.
192#
193sub read_exclusion_file {
194    open EXCLUDE, $exclusionfile   or die "cannot open $exclusionfile: $!\n";
195    while ($_ = UCD::CompositionExclusions::getline(\*EXCLUDE)) {
196	my %data = UCD::CompositionExclusions::parseline($_);
197	$exclusions{$data{CODE}} = 1;
198    }
199    close EXCLUDE;
200}
201
202#
203# read_specialcasing_file -- read SpecialCasing.txt
204#
205sub read_specialcasing_file {
206    open SPCASE, $specialcasefile or die "cannot open $specialcasefile: $!\n";
207    while ($_ = UCD::SpecialCasing::getline(\*SPCASE)) {
208	my %data = UCD::SpecialCasing::parseline($_);
209	my $code = $data{CODE};
210	my $lower = $data{LOWER};
211	my $upper = $data{UPPER};
212	my $cond = $data{CONDITION} || '';
213
214	next unless $cond eq '' or $cond =~ /^(NON_)?FINAL/;
215
216	if (defined $cond && (@$lower > 1 || $lower->[0] != $code)
217	    or @$lower > 1 or $lower->[0] != $code) {
218	    $lower_special{$code} = [$lower, $cond];
219	}
220	if (defined $cond && (@$upper > 1 || $upper->[0] != $code)
221	    or @$upper > 1 or $upper->[0] != $code) {
222	    $upper_special{$code} = [$upper, $cond];
223	}
224    }
225    close SPCASE;
226}
227
228#
229# read_unicodedata_file -- read UnicodeData.txt
230#
231sub read_unicodedata_file {
232    open UCD, $unicodedatafile or die "cannot open $unicodedatafile: $!\n";
233
234    @decomp_data = (0);
235    @toupper_data = (0);
236    @tolower_data = (0);
237
238    my @comp_cand;	# canonical composition candidates
239    my %nonstarter;
240
241    while ($_ = UCD::UnicodeData::getline(\*UCD)) {
242	my %data = UCD::UnicodeData::parseline($_);
243	my $code = $data{CODE};
244
245	# combining class
246	if ($data{CLASS} > 0) {
247	    $nonstarter{$code} = 1;
248	    $canon_class->add($code, $data{CLASS});
249	}
250
251	# uppercasing
252	if (exists $upper_special{$code} or defined $data{UPPER}) {
253	    my $offset = @toupper_data;
254	    my @casedata;
255
256	    $upper->add($code, $offset);
257	    if (exists $upper_special{$code}) {
258		push @casedata, $upper_special{$code};
259	    }
260	    if (defined $data{UPPER}) {
261		push @casedata, $data{UPPER};
262	    }
263	    push @toupper_data, casemap_data(@casedata);
264	}
265
266	# lowercasing
267	if (exists $lower_special{$code} or defined $data{LOWER}) {
268	    my $offset = @tolower_data;
269	    my @casedata;
270
271	    $lower->add($code, $offset);
272	    if (exists $lower_special{$code}) {
273		push @casedata, $lower_special{$code};
274	    }
275	    if (defined $data{LOWER}) {
276		push @casedata, $data{LOWER};
277	    }
278	    push @tolower_data, casemap_data(@casedata);
279	}
280
281	# composition/decomposition
282	if ($data{DECOMP}) {
283	    my ($tag, @decomp) = @{$data{DECOMP}};
284	    my $offset = @decomp_data;
285
286	    # composition
287	    if ($tag eq '' and @decomp > 1 and not exists $exclusions{$code}) {
288		# canonical composition candidate
289		push @comp_cand, [$code, @decomp];
290	    }
291
292	    # decomposition
293	    if ($tag ne '') {
294		# compatibility decomposition
295		$offset |= $DECOMP_COMPAT_BIT;
296	    }
297	    $decomp->add($code, $offset);
298	    push @decomp_data, @decomp;
299	    $decomp_data[-1] |= END_BIT;
300
301	}
302
303	# final/nonfinal context
304	if ($data{CATEGORY} =~ /L[ult]/) {
305	    $casemap_ctx->add($code, $LETTER_BIT);
306	} elsif ($data{CATEGORY} eq 'Mn') {
307	    $casemap_ctx->add($code, $NSPMARK_BIT);
308	}
309    }
310    close UCD;
311
312    # Eliminate composition candidates whose decomposition starts with
313    # a non-starter.
314    @comp_cand = grep {not exists $nonstarter{$_->[1]}} @comp_cand;
315
316    @comp_data = ([0, 0, 0]);
317    my $last_code = -1;
318    my $last_offset = @comp_data;
319    for my $r (sort {$a->[1] <=> $b->[1] || $a->[2] <=> $b->[2]} @comp_cand) {
320	if ($r->[1] != $last_code) {
321	    $comp->add($last_code,
322		       ($last_offset | ((@comp_data - $last_offset)<<16)))
323		unless $last_code == -1;
324	    $last_code = $r->[1];
325	    $last_offset = @comp_data;
326	}
327	push @comp_data, $r;
328    }
329    $comp->add($last_code,
330	       ($last_offset | ((@comp_data - $last_offset)<<16)));
331}
332
333sub casemap_data {
334    my @data = @_;
335    my @result = ();
336    while (@data > 0) {
337	my $r = shift @data;
338	my $flag = 0;
339	if (ref $r) {
340	    if ($r->[1] eq 'FINAL') {
341		$flag |= $CASEMAP_FINAL_BIT;
342	    } elsif ($r->[1] eq 'NON_FINAL') {
343		$flag |= $CASEMAP_NONFINAL_BIT;
344	    } elsif ($r->[1] ne '') {
345		die "unknown condition \"", $r->[1], "\"\n";
346	    }
347	}
348	$flag |= $CASEMAP_LAST_BIT if @data == 0;
349	push @result, $flag;
350	push @result, (ref $r) ? @{$r->[0]} : $r;
351	$result[-1] |= END_BIT;
352    }
353    @result;
354}
355
356#
357# read_casefolding_file -- read CaseFolding.txt
358#
359sub read_casefolding_file {
360    open FOLD, $casefoldingfile or die "cannto open $casefoldingfile: $!\n";
361
362    # dummy.
363    @folding_data = (0);
364
365    while ($_ = UCD::CaseFolding::getline(\*FOLD)) {
366	my %data = UCD::CaseFolding::parseline($_);
367
368	$folding->add($data{CODE}, scalar(@folding_data));
369	push @folding_data, @{$data{MAP}};
370	$folding_data[-1] |= END_BIT;
371    }
372    close FOLD;
373}
374
375sub print_header {
376    print <<"END";
377/* \$Id\$ */
378/* $myid */
379/*
380 * Do not edit this file!
381 * This file is generated from UnicodeData.txt, CompositionExclusions-1.txt,
382 * SpecialCasing.txt and CaseFolding.txt.
383 */
384
385END
386}
387
388#
389# print_canon_class -- generate data for canonical class
390#
391sub print_canon_class {
392    $canon_class->fix();
393    print STDERR "** cannon_class\n", $canon_class->stat() if $verbose;
394
395    print <<"END";
396
397/*
398 * Canonical Class
399 */
400
401END
402    print_bits("CANON_CLASS", @canon_class_bits);
403    print "\n";
404    print $canon_class->cprog(NAME => "${prefix}canon_class");
405}
406
407#
408# print_composition -- generate data for canonical composition
409#
410sub print_composition {
411    $comp->fix();
412    print STDERR "** composition\n", $comp->stat() if $verbose;
413
414    print <<"END";
415
416/*
417 * Canonical Composition
418 */
419
420END
421    print_bits("CANON_COMPOSE", @comp_bits);
422    print "\n";
423    print $comp->cprog(NAME => "${prefix}compose");
424    print <<"END";
425
426static const struct composition ${prefix}compose_seq[] = {
427END
428    my $i = 0;
429    foreach my $r (@comp_data) {
430	if ($i % 2 == 0) {
431	    print "\n" if $i != 0;
432	    print "\t";
433	}
434	printf "{ 0x%08x, 0x%08x }, ", $r->[2], $r->[0];
435	$i++;
436    }
437    print "\n};\n\n";
438}
439
440#
441# print_decomposition -- generate data for canonical/compatibility
442# decomposition
443#
444sub print_decomposition {
445    $decomp->fix();
446    print STDERR "** decomposition\n", $decomp->stat() if $verbose;
447
448    print <<"END";
449
450/*
451 * Canonical/Compatibility Decomposition
452 */
453
454END
455    print_bits("DECOMP", @decomp_bits);
456    print "#define DECOMP_COMPAT\t$DECOMP_COMPAT_BIT\n\n";
457
458    print $decomp->cprog(NAME => "${prefix}decompose");
459
460    print "static const unsigned long ${prefix}decompose_seq[] = {\n";
461    print_ulseq(@decomp_data);
462    print "};\n\n";
463}
464
465#
466# print_casemap -- generate data for case mapping
467#
468sub print_casemap {
469    $upper->fix();
470    $lower->fix();
471    print STDERR "** upper mapping\n", $upper->stat() if $verbose;
472    print STDERR "** lower mapping\n", $lower->stat() if $verbose;
473
474    print <<"END";
475
476/*
477 * Lowercase <-> Uppercase mapping
478 */
479
480/*
481 * Flags for special case mapping.
482 */
483#define CMF_FINAL	$CASEMAP_FINAL_BIT
484#define CMF_NONFINAL	$CASEMAP_NONFINAL_BIT
485#define CMF_LAST	$CASEMAP_LAST_BIT
486#define CMF_CTXDEP	(CMF_FINAL|CMF_NONFINAL)
487
488END
489    print_bits("CASEMAP", @casemap_bits);
490    print "\n";
491    print $upper->cprog(NAME => "${prefix}toupper");
492    print $lower->cprog(NAME => "${prefix}tolower");
493
494    print "static const unsigned long ${prefix}toupper_seq[] = {\n";
495    print_ulseq(@toupper_data);
496    print "};\n\n";
497
498    print "static const unsigned long ${prefix}tolower_seq[] = {\n";
499    print_ulseq(@tolower_data);
500    print "};\n\n";
501}
502
503#
504# print_casefolding -- generate data for case folding
505#
506sub print_casefolding {
507    $folding->fix();
508    print STDERR "** case folding\n", $folding->stat() if $verbose;
509
510    print <<"END";
511
512/*
513 * Case Folding
514 */
515
516END
517    print_bits("CASE_FOLDING", @folding_bits);
518    print "\n";
519    print $folding->cprog(NAME => "${prefix}case_folding");
520
521    print "static const unsigned long ${prefix}case_folding_seq[] = {\n";
522    print_ulseq(@folding_data);
523    print "};\n\n";
524}
525
526#
527# print_casemap_context -- gerarate data for determining context
528# (final/non-final)
529#
530sub print_casemap_context {
531    $casemap_ctx->fix();
532    print STDERR "** casemap context\n", $casemap_ctx->stat() if $verbose;
533
534    print <<"END";
535
536/*
537 * Cased characters and non-spacing marks (for casemap context)
538 */
539
540END
541
542    print_bits("CASEMAP_CTX", @casemap_ctx_bits);
543    print <<"END";
544
545#define CTX_CASED	$LETTER_BIT
546#define CTX_NSM		$NSPMARK_BIT
547
548END
549    print $casemap_ctx->cprog(NAME => "${prefix}casemap_ctx");
550}
551
552sub sprint_composition_hash {
553    my $i = 0;
554    my $s = '';
555    foreach my $r (@_) {
556	if ($i % 2 == 0) {
557	    $s .= "\n" if $i != 0;
558	    $s .= "\t";
559	}
560	$s .= sprintf "{0x%04x, 0x%04x, 0x%04x}, ", @{$r};
561	$i++;
562    }
563    $s;
564}
565
566sub print_bits {
567    my $prefix = shift;
568    my $i = 0;
569    foreach my $bit (@_) {
570	print "#define ${prefix}_BITS_$i\t$bit\n";
571	$i++;
572    }
573}
574
575sub print_ulseq {
576    my $i = 0;
577    foreach my $v (@_) {
578	if ($i % 4 == 0) {
579	    print "\n" if $i != 0;
580	    print "\t";
581	}
582	printf "0x%08x, ", $v;
583	$i++;
584    }
585    print "\n";
586}
587