1# Id: UCD.pm,v 1.1 2003/06/04 00:27:53 marka Exp
2#
3# Copyright (c) 2000,2001 Japan Network Information Center.
4# All rights reserved.
5#
6# By using this file, you agree to the terms and conditions set forth bellow.
7#
8# 			LICENSE TERMS AND CONDITIONS
9#
10# The following License Terms and Conditions apply, unless a different
11# license is obtained from Japan Network Information Center ("JPNIC"),
12# a Japanese association, Kokusai-Kougyou-Kanda Bldg 6F, 2-3-4 Uchi-Kanda,
13# Chiyoda-ku, Tokyo 101-0047, Japan.
14#
15# 1. Use, Modification and Redistribution (including distribution of any
16#    modified or derived work) in source and/or binary forms is permitted
17#    under this License Terms and Conditions.
18#
19# 2. Redistribution of source code must retain the copyright notices as they
20#    appear in each source code file, this License Terms and Conditions.
21#
22# 3. Redistribution in binary form must reproduce the Copyright Notice,
23#    this License Terms and Conditions, in the documentation and/or other
24#    materials provided with the distribution.  For the purposes of binary
25#    distribution the "Copyright Notice" refers to the following language:
26#    "Copyright (c) 2000-2002 Japan Network Information Center.  All rights reserved."
27#
28# 4. The name of JPNIC may not be used to endorse or promote products
29#    derived from this Software without specific prior written approval of
30#    JPNIC.
31#
32# 5. Disclaimer/Limitation of Liability: THIS SOFTWARE IS PROVIDED BY JPNIC
33#    "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
34#    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
35#    PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL JPNIC BE LIABLE
36#    FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
37#    CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
38#    SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
39#    BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
40#    WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
41#    OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
42#    ADVISED OF THE POSSIBILITY OF SUCH DAMAGES.
43#
44
45package UCD;
46
47#
48# UCD.pm -- parser for Unicode Character Database files.
49#
50# This file is an aggregation of the following modules, each of which
51# provides a parser for a specific data file of UCD.
52#	UCD::UnicodeData		-- for UnicodeData.txt
53#	UCD::CaseFolding		-- for CaseFolding.txt
54#	UCD::SpecialCasing		-- for SpecialCasing.txt
55#	UCD::CompositionExclusions	-- for CompositionExclusions-1.txt
56#
57# Each module provides two subroutines:
58#
59#   $line = getline(\*HANDLE);
60#	reads next non-comment line from HANDLE, and returns it.
61#	undef will be returned upon EOF.
62#
63#   %fields = parse($line);
64#	parses a line and extract fields, and returns a list of
65#	field name and its value, suitable for assignment to a hash.
66#
67
68package UCD::UnicodeData;
69
70use strict;
71use Carp;
72
73sub getline {
74    my $fh = shift;
75    my $s = <$fh>;
76    $s =~ s/\r?\n$// if $s;
77    $s;
78}
79
80sub parseline {
81    my $s = shift;
82
83    my @f = split /;/, $s, -1;
84    return (CODE     => hex($f[0]),
85	    NAME     => $f[1],
86	    CATEGORY => $f[2],
87	    CLASS    => $f[3]+0,
88	    BIDI     => $f[4],
89	    DECOMP   => dcmap($f[5]),
90	    DECIMAL  => dvalue($f[6]),
91	    DIGIT    => dvalue($f[7]),
92	    NUMERIC  => dvalue($f[8]),
93	    MIRRORED => $f[9] eq 'Y',
94	    NAME10   => $f[10],
95	    COMMENT  => $f[11],
96	    UPPER    => ucode($f[12]),
97	    LOWER    => ucode($f[13]),
98	    TITLE    => ucode($f[14]));
99}
100
101sub dcmap {
102    my $v = shift;
103    return undef if $v eq '';
104    $v =~ /^(?:(<[^>]+>)\s*)?(\S.*)/
105	or croak "invalid decomposition mapping \"$v\"";
106    my $tag = $1 || '';
107    [$tag, map {hex($_)} split(' ', $2)];
108}
109
110sub ucode {
111    my $v = shift;
112    return undef if $v eq '';
113    hex($v);
114}
115
116sub dvalue {
117    my $v = shift;
118    return undef if $v eq '';
119    $v;
120}
121
122package UCD::CaseFolding;
123
124use strict;
125
126sub getline {
127    my $fh = shift;
128    while (defined(my $s = <$fh>)) {
129	next if $s =~ /^\#/;
130	next if $s =~ /^\s*$/;
131	$s =~ s/\r?\n$//;
132	return $s;
133    }
134    undef;
135}
136
137sub parseline {
138    my $s = shift;
139    my @f = split /;\s*/, $s, -1;
140    return (CODE => hex($f[0]),
141	    TYPE => $f[1],
142	    MAP  => [map(hex, split ' ', $f[2])],
143	   );
144}
145
146package UCD::SpecialCasing;
147
148use strict;
149
150sub getline {
151    my $fh = shift;
152    while (defined(my $s = <$fh>)) {
153	next if $s =~ /^\#/;
154	next if $s =~ /^\s*$/;
155	$s =~ s/\r?\n$//;
156	return $s;
157    }
158    undef;
159}
160
161sub parseline {
162    my $s = shift;
163
164    my @f = split /;\s*/, $s, -1;
165    my $cond = (@f > 5) ? $f[4] : undef;
166    return (CODE => hex($f[0]),
167	    LOWER => [map(hex, split ' ', $f[1])],
168	    TITLE => [map(hex, split ' ', $f[2])],
169	    UPPER => [map(hex, split ' ', $f[3])],
170	    CONDITION => $cond);
171}
172
173package UCD::CompositionExclusions;
174
175use strict;
176
177sub getline {
178    my $fh = shift;
179    while (defined(my $s = <$fh>)) {
180	next if $s =~ /^\#/;
181	next if $s =~ /^\s*$/;
182	$s =~ s/\r?\n$//;
183	return $s;
184    }
185    undef;
186}
187
188sub parseline {
189    my $s = shift;
190    m/^[0-9A-Fa-f]+/;
191    return (CODE => hex($&));
192}
193
1941;
195