1#!/usr/bin/env perl -wC 2 3# 4# $FreeBSD$ 5# 6 7use strict; 8use XML::Parser; 9use Tie::IxHash; 10use Data::Dumper; 11use Getopt::Long; 12use Digest::SHA qw(sha1_hex); 13 14 15if ($#ARGV < 2) { 16 print "Usage: $0 --cldr=<cldrdir> --unidata=<unidatadir> --etc=<etcdir> --input=<inputfile> --output=<outputfile>\n"; 17 exit(1); 18} 19 20my @filter = (); 21 22my $CLDRDIR = undef; 23my $UNIDATADIR = undef; 24my $ETCDIR = undef; 25my $TYPE = undef; 26my $INPUT = undef; 27my $OUTPUT = undef; 28 29my $result = GetOptions ( 30 "cldr=s" => \$CLDRDIR, 31 "unidata=s" => \$UNIDATADIR, 32 "etc=s" => \$ETCDIR, 33 "type=s" => \$TYPE, 34 "input=s" => \$INPUT, 35 "output=s" => \$OUTPUT, 36 ); 37 38my %ucd = (); 39my %utf8map = (); 40my %utf8aliases = (); 41get_unidata($UNIDATADIR); 42get_utf8map("$CLDRDIR/posix/UTF-8.cm"); 43convert($INPUT, $OUTPUT); 44 45############################ 46 47sub get_unidata { 48 my $directory = shift; 49 50 open(FIN, "$directory/UnicodeData.txt") 51 or die("Cannot open $directory/UnicodeData.txt");; 52 my @lines = <FIN>; 53 chomp(@lines); 54 close(FIN); 55 56 foreach my $l (@lines) { 57 my @a = split(/;/, $l); 58 59 $ucd{code2name}{"$a[0]"} = $a[1]; # Unicode name 60 $ucd{name2code}{"$a[1]"} = $a[0]; # Unicode code 61 } 62} 63 64sub get_utf8map { 65 my $file = shift; 66 67 open(FIN, $file); 68 my @lines = <FIN>; 69 close(FIN); 70 chomp(@lines); 71 72 my $prev_k = undef; 73 my $prev_v = ""; 74 my $incharmap = 0; 75 foreach my $l (@lines) { 76 $l =~ s/\r//; 77 next if ($l =~ /^\#/); 78 next if ($l eq ""); 79 80 if ($l eq "CHARMAP") { 81 $incharmap = 1; 82 next; 83 } 84 85 next if (!$incharmap); 86 last if ($l eq "END CHARMAP"); 87 88 $l =~ /^<([^\s]+)>\s+(.*)/; 89 my $k = $1; 90 my $v = $2; 91 $k =~ s/_/ /g; # unicode char string 92 $v =~ s/\\x//g; # UTF-8 char code 93 $utf8map{$k} = $v; 94 95 $utf8aliases{$k} = $prev_k if ($prev_v eq $v); 96 97 $prev_v = $v; 98 $prev_k = $k; 99 } 100} 101 102sub decode_cldr { 103 my $s = shift; 104 105 my $v = $utf8map{$s}; 106 $v = $utf8aliases{$s} if (!defined $v); 107 die "Cannot convert $s" if (!defined $v); 108 109 return pack("C", hex($v)) if (length($v) == 2); 110 return pack("CC", hex(substr($v, 0, 2)), hex(substr($v, 2, 2))) 111 if (length($v) == 4); 112 return pack("CCC", hex(substr($v, 0, 2)), hex(substr($v, 2, 2)), 113 hex(substr($v, 4, 2))) if (length($v) == 6); 114 print STDERR "Cannot convert $s\n"; 115 return "length = " . length($v); 116} 117 118sub convert { 119 my $IN = shift; 120 my $OUT = shift; 121 122 open(FIN, "$IN"); 123 open(FOUT, ">$OUT"); 124 125# print Dumper(%utf8map); 126 127 my $l; 128 while (defined ($l = <FIN>)) { 129 chomp($l); 130 131 if ($l =~ /^#/) { 132 print FOUT $l, "\n"; 133 next; 134 } 135 136 while ($l =~ /^(.*?)<(.*?)>(.*)$/) { 137 $l = $1 . decode_cldr($2) . $3; 138 } 139 print FOUT $l, "\n"; 140 } 141 142 close(FOUT); 143 close(FIN); 144} 145