1# Id: UCD.pm,v 1.1 2003/06/04 00:27:53 marka Exp 2# 3# Copyright (c) 2000,2001 Japan Network Information Center. 4# All rights reserved. 5# 6# By using this file, you agree to the terms and conditions set forth bellow. 7# 8# LICENSE TERMS AND CONDITIONS 9# 10# The following License Terms and Conditions apply, unless a different 11# license is obtained from Japan Network Information Center ("JPNIC"), 12# a Japanese association, Kokusai-Kougyou-Kanda Bldg 6F, 2-3-4 Uchi-Kanda, 13# Chiyoda-ku, Tokyo 101-0047, Japan. 14# 15# 1. Use, Modification and Redistribution (including distribution of any 16# modified or derived work) in source and/or binary forms is permitted 17# under this License Terms and Conditions. 18# 19# 2. Redistribution of source code must retain the copyright notices as they 20# appear in each source code file, this License Terms and Conditions. 21# 22# 3. Redistribution in binary form must reproduce the Copyright Notice, 23# this License Terms and Conditions, in the documentation and/or other 24# materials provided with the distribution. For the purposes of binary 25# distribution the "Copyright Notice" refers to the following language: 26# "Copyright (c) 2000-2002 Japan Network Information Center. All rights reserved." 27# 28# 4. The name of JPNIC may not be used to endorse or promote products 29# derived from this Software without specific prior written approval of 30# JPNIC. 31# 32# 5. Disclaimer/Limitation of Liability: THIS SOFTWARE IS PROVIDED BY JPNIC 33# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 34# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A 35# PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL JPNIC BE LIABLE 36# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 37# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 38# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR 39# BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, 40# WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR 41# OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF 42# ADVISED OF THE POSSIBILITY OF SUCH DAMAGES. 43# 44 45package UCD; 46 47# 48# UCD.pm -- parser for Unicode Character Database files. 49# 50# This file is an aggregation of the following modules, each of which 51# provides a parser for a specific data file of UCD. 52# UCD::UnicodeData -- for UnicodeData.txt 53# UCD::CaseFolding -- for CaseFolding.txt 54# UCD::SpecialCasing -- for SpecialCasing.txt 55# UCD::CompositionExclusions -- for CompositionExclusions-1.txt 56# 57# Each module provides two subroutines: 58# 59# $line = getline(\*HANDLE); 60# reads next non-comment line from HANDLE, and returns it. 61# undef will be returned upon EOF. 62# 63# %fields = parse($line); 64# parses a line and extract fields, and returns a list of 65# field name and its value, suitable for assignment to a hash. 66# 67 68package UCD::UnicodeData; 69 70use strict; 71use Carp; 72 73sub getline { 74 my $fh = shift; 75 my $s = <$fh>; 76 $s =~ s/\r?\n$// if $s; 77 $s; 78} 79 80sub parseline { 81 my $s = shift; 82 83 my @f = split /;/, $s, -1; 84 return (CODE => hex($f[0]), 85 NAME => $f[1], 86 CATEGORY => $f[2], 87 CLASS => $f[3]+0, 88 BIDI => $f[4], 89 DECOMP => dcmap($f[5]), 90 DECIMAL => dvalue($f[6]), 91 DIGIT => dvalue($f[7]), 92 NUMERIC => dvalue($f[8]), 93 MIRRORED => $f[9] eq 'Y', 94 NAME10 => $f[10], 95 COMMENT => $f[11], 96 UPPER => ucode($f[12]), 97 LOWER => ucode($f[13]), 98 TITLE => ucode($f[14])); 99} 100 101sub dcmap { 102 my $v = shift; 103 return undef if $v eq ''; 104 $v =~ /^(?:(<[^>]+>)\s*)?(\S.*)/ 105 or croak "invalid decomposition mapping \"$v\""; 106 my $tag = $1 || ''; 107 [$tag, map {hex($_)} split(' ', $2)]; 108} 109 110sub ucode { 111 my $v = shift; 112 return undef if $v eq ''; 113 hex($v); 114} 115 116sub dvalue { 117 my $v = shift; 118 return undef if $v eq ''; 119 $v; 120} 121 122package UCD::CaseFolding; 123 124use strict; 125 126sub getline { 127 my $fh = shift; 128 while (defined(my $s = <$fh>)) { 129 next if $s =~ /^\#/; 130 next if $s =~ /^\s*$/; 131 $s =~ s/\r?\n$//; 132 return $s; 133 } 134 undef; 135} 136 137sub parseline { 138 my $s = shift; 139 my @f = split /;\s*/, $s, -1; 140 return (CODE => hex($f[0]), 141 TYPE => $f[1], 142 MAP => [map(hex, split ' ', $f[2])], 143 ); 144} 145 146package UCD::SpecialCasing; 147 148use strict; 149 150sub getline { 151 my $fh = shift; 152 while (defined(my $s = <$fh>)) { 153 next if $s =~ /^\#/; 154 next if $s =~ /^\s*$/; 155 $s =~ s/\r?\n$//; 156 return $s; 157 } 158 undef; 159} 160 161sub parseline { 162 my $s = shift; 163 164 my @f = split /;\s*/, $s, -1; 165 my $cond = (@f > 5) ? $f[4] : undef; 166 return (CODE => hex($f[0]), 167 LOWER => [map(hex, split ' ', $f[1])], 168 TITLE => [map(hex, split ' ', $f[2])], 169 UPPER => [map(hex, split ' ', $f[3])], 170 CONDITION => $cond); 171} 172 173package UCD::CompositionExclusions; 174 175use strict; 176 177sub getline { 178 my $fh = shift; 179 while (defined(my $s = <$fh>)) { 180 next if $s =~ /^\#/; 181 next if $s =~ /^\s*$/; 182 $s =~ s/\r?\n$//; 183 return $s; 184 } 185 undef; 186} 187 188sub parseline { 189 my $s = shift; 190 m/^[0-9A-Fa-f]+/; 191 return (CODE => hex($&)); 192} 193 1941; 195