#! /usr/local/bin/perl
#
#
# This file and its contents are supplied under the terms of the
# Common Development and Distribution License ("CDDL"), version 1.0.
# You may only use this file in accordance with the terms of version
# 1.0 of the CDDL.
#
# A full copy of the text of the CDDL should have accompanied this
# source. A copy is of the CDDL is also available via the Internet
# at http://www.illumos.org/license/CDDL.
#

#
# Copyright 2010 Nexenta Systems, Inc. All rights reserved.
# Copyright 2015 John Marino <draco@marino.st>
#

# This converts MAPPING files to localedef character maps
# suitable for use with the UTF-8 derived localedef data.
#
# Usage: convert_map.pl MAPPING-FILE CODESET
#
# The UTF-8 character map is read from "$ETCDIR/final-maps/map.UTF-8"
# ("etc" is used when ETCDIR is not set in the environment) and the
# resulting character map is written to standard output.

use strict;
use warnings;

# UTF-8 escape string => symbolic name(s); several names for the same
# value are stored newline-separated.
my %unames;

# Symbolic name => UTF-8 escape string.  Filled in by load_utf8_cm();
# nothing else in this file reads it.
my %uvalues;

# Legacy code value (first column of the mapping file, as text) =>
# UTF-8 escape string(s); several values are stored space-separated.
my %map;

# Value printed as <mb_cur_max> for the multibyte codesets known to
# this script; every other codeset is given 1.
my %max_mb_for = (
	'SJIS'   => 2,
	'eucCN'  => 2,
	'eucJP'  => 3,
	'eucKR'  => 2,
	'GBK'    => 2,
	'GB2312' => 2,
	'Big5'   => 2,
);

#
# Encode the code point $ucs as UTF-8 and return the bytes written out
# as a string of "\xHH" escapes (upper-case hex digits), lead byte first.
# Code points above 0x1fffff use the historical 5- and 6-byte forms.
#
sub ucs_to_utf8
{
	my $ucs = shift;

	# Single byte: no lead/continuation structure at all.
	return sprintf("\\x%02X", $ucs) if ($ucs <= 0x7f);

	# Number of continuation bytes and the marker bits of the lead byte.
	my ($ntrail, $lead) =
	      $ucs <= 0x7ff      ? (1, 0xc0)
	    : $ucs <= 0xffff     ? (2, 0xe0)
	    : $ucs <= 0x1fffff   ? (3, 0xf0)
	    : $ucs <= 0x03ffffff ? (4, 0xf8)
	    :                      (5, 0xfc);

	# Continuation bytes carry six payload bits each; peel them off the
	# low end so the string is built from the last byte backwards.
	my $utf8 = "";
	for (1 .. $ntrail) {
		$utf8 = sprintf("\\x%02X", ($ucs & 0x3f) | 0x80).$utf8;
		$ucs >>= 6;
	}

	# Whatever is left belongs in the lead byte.
	return (sprintf("\\x%02X", $ucs | $lead).$utf8);
}

#
# Read the UTF-8 character map $file into %unames and %uvalues.
#
# This is not a general purpose Character Map parser, but it's good
# enough for the stock one supplied with CLDR.  Comment lines, blank
# lines and the CHARMAP / END CHARMAP markers are skipped; on every
# other line the first field is the name and the second the value.
# Dies if the file cannot be opened.
#
sub load_utf8_cm
{
	my $file = shift;

	open(my $fh, '<', $file) || die "$!: open: $file";

	while (my $line = <$fh>) {
		next if ($line =~ /^#/);
		next if ($line =~ /^\s*$/);
		next if ($line =~ /^\s*CHARMAP\s*$/);
		next if ($line =~ /^\s*END\s*CHARMAP\s*$/);
		chomp($line);

		my @words = split(/\s+/, $line);
		my ($name, $utf8val) = @words[0, 1];

		# A line without a value field cannot name anything that is
		# ever looked up, so there is nothing to record for it.
		next if (!defined($utf8val));

		if (defined($unames{$utf8val})) {
			$unames{$utf8val} .= "\n" .$name;
		} else {
			$unames{$utf8val} = $name;
		}
		$uvalues{$name} = $utf8val;
	}
	close($fh);
	return;
}

#
# Read the mapping file $file into %map.
#
# Comment lines, blank lines and the line shapes matched by the
# patterns below are skipped.  On every other line the first field is
# the legacy code value and the second the code point in hex; the code
# point is stored in its ucs_to_utf8() form.
# Dies if the file cannot be opened.
#
sub load_map
{
	my $file = shift;

	open(my $fh, '<', $file) || die "$!: open: $file";

	while (my $line = <$fh>) {
		next if ($line =~ /^#/);
		next if ($line =~ /^\s*$/);
		next if ($line =~ /^0x..\+0x../);
		next if ($line =~ /^0x[0-9A-F]{4}\t0x[0-9A-F]{4} 0x[0-9A-F]{4}/);
		next if ($line =~ /^0x[0-9A-F]{2}\s+#/);
		next if ($line =~ /# ... NO MAPPING .../);
		chomp($line);

		my @words = split(/\s+/, $line);
		my ($val, $ucs) = @words[0, 1];
		$ucs = "" if (!defined($ucs));
		$ucs =~ s/^\\x[0]*//;

		# hex() stops at the first character that is not a hex digit.
		# This script has always relied on that leniency, so keep it
		# quiet rather than change which lines are accepted.
		my $code = do { no warnings 'digit'; hex($ucs) };
		my $utf8 = ucs_to_utf8($code);

		if (defined($map{$val})) {
			$map{$val} .= " ".$utf8;
		} else {
			$map{$val} = $utf8;
		}
	}
	close($fh);
	return;
}

#
# Turn the hex string $val (e.g. "0x8140") into a string of "\xhh"
# escapes (lower-case hex digits), one per byte, most significant byte
# first.  A value of zero gives "\x00".
#
sub mb_str
{
	my $val = shift;
	my $str = "";
	$val = hex($val);

	if ($val == 0) {
		return ("\\x00");
	}
	while ($val) {
		$str = sprintf("\\x%02x", $val & 0xff).$str;
		$val >>= 8;
	}
	return ($str);
}

#
# Build one CHARMAP line: $name, tab padding, $bytes and a newline.
# The padding aims at the column used for short names, but at least one
# tab is always emitted: without that floor a name of 64 characters or
# more would run straight into the value.
#
sub charmap_line
{
	my ($name, $bytes) = @_;

	my $ntabs = int((64 - length($name) + 7) / 8);
	$ntabs = 1 if ($ntabs < 1);

	return ($name.("\t" x $ntabs).$bytes."\n");
}

#
# Script body: load both maps, then print the character map for the
# given mapping file and codeset name to standard output.
#
sub main
{
	my ($mf, $codeset) = @_;

	if (!defined($mf) || !defined($codeset)) {
		die "usage: $0 mapping-file codeset\n";
	}

	my $etcdir = (exists $ENV{'ETCDIR'}) ? $ENV{'ETCDIR'} : "etc";
	load_utf8_cm("${etcdir}/final-maps/map.UTF-8");
	load_map($mf);

	my $max_mb = exists($max_mb_for{$codeset}) ? $max_mb_for{$codeset} : 1;

	print("<code_set_name> \"$codeset\"\n");
	print("<mb_cur_min> 1\n");
	print("<mb_cur_max> $max_mb\n");

	print("CHARMAP\n");
	foreach my $val (sort(keys(%map))) {
		foreach my $utf8 (split(/ /, $map{$val})) {
			# Code points with no name in the UTF-8 map produce
			# no output line.
			my $names = $unames{$utf8};
			next if (!defined($names));

			foreach my $name (sort(split(/\n/, $names))) {
				print charmap_line($name, mb_str($val));
			}
		}
	}
	print "END CHARMAP\n";
	return;
}

# Run the conversion only when executed as a script, so that the
# subroutines above can be loaded on their own.
main(@ARGV) unless caller;