#! /usr/local/bin/perl
#
# This file and its contents are supplied under the terms of the
# Common Development and Distribution License ("CDDL"), version 1.0.
# You may only use this file in accordance with the terms of version
# 1.0 of the CDDL.
#
# A full copy of the text of the CDDL should have accompanied this
# source. A copy is of the CDDL is also available via the Internet
# at http://www.illumos.org/license/CDDL.
#

#
# Copyright 2010 Nexenta Systems, Inc. All rights reserved.
# Copyright 2015 John Marino <draco@marino.st>
#

# This converts MAPPING files to localedef character maps
# suitable for use with the UTF-8 derived localedef data.
#
# Usage: <this script> MAPPING-FILE CODESET
#
# The UTF-8 character map is read from etc/final-maps/map.UTF-8 (relative
# to the current directory) and the resulting charmap is written to
# standard output.

use strict;
use warnings;

#
# ucs_to_utf8(UCS)
#
# Encode the numeric code point UCS as UTF-8 and return the bytes spelled
# out as text, each one in the form "\xHH" (backslash, 'x', two upper-case
# hex digits), e.g. 0x20AC yields the 12-character string \xE2\x82\xAC.
# The original (up to six byte) UTF-8 scheme is used, so 5- and 6-byte
# sequences are produced for values above 0x1fffff.
#
sub ucs_to_utf8
{
    my $ucs = shift;

    # Single-byte (ASCII) values carry no marker bits at all.
    return (sprintf("\\x%02X", $ucs)) if ($ucs <= 0x7f);

    # Number of continuation bytes and the marker bits of the lead byte.
    # The 6-byte form needs 0xfc; 0xf8 is the marker of the 5-byte form.
    my ($ntrail, $lead) =
        ($ucs <= 0x7ff)      ? (1, 0xc0) :
        ($ucs <= 0xffff)     ? (2, 0xe0) :
        ($ucs <= 0x1fffff)   ? (3, 0xf0) :
        ($ucs <= 0x03ffffff) ? (4, 0xf8) :
                               (5, 0xfc);

    # Peel off six payload bits at a time, least significant first, and
    # prepend each continuation byte so the result ends up in wire order.
    my $utf8 = "";
    for (1 .. $ntrail) {
        $utf8 = sprintf("\\x%02X", ($ucs & 0x3f) | 0x80) . $utf8;
        $ucs >>= 6;
    }

    return (sprintf("\\x%02X", $ucs | $lead) . $utf8);
}

# UTF-8 byte string => character name(s); several names sharing one value
# are stored newline-separated.
my %unames;
# Character name => UTF-8 byte string.
my %uvalues;

#
# load_utf8_cm(FILE)
#
# Read the UTF-8 character map FILE and fill %unames and %uvalues.
# Dies if FILE cannot be opened.
#
# This is not a general purpose Character Map parser, but it's good enough
# for the stock one supplied with CLDR.
#
sub load_utf8_cm
{
    my $file = shift;

    open(my $fh, '<', $file) or die "cannot open $file: $!";

    while (my $line = <$fh>) {
        next if ($line =~ /^#/);
        next if ($line =~ /^\s*$/);
        next if ($line =~ /^\s*CHARMAP\s*$/);
        next if ($line =~ /^\s*END\s*CHARMAP\s*$/);
        chomp($line);

        my ($name, $utf8val) = split /\s+/, $line;
        # A line without a value column cannot contribute a mapping.
        next unless (defined($utf8val));

        if (defined($unames{$utf8val})) {
            $unames{$utf8val} .= "\n" . $name;
        } else {
            $unames{$utf8val} = $name;
        }
        $uvalues{$name} = $utf8val;
    }
    close($fh);
}

# Codeset value (first column of the MAPPING file, as written) => the
# UTF-8 byte string(s) it maps to; multiple targets are space-separated.
my %map;

#
# load_map(FILE)
#
# Read the MAPPING file FILE and fill %map.  The first column is used as
# the key and the second column is read as a hexadecimal code point and
# converted with ucs_to_utf8().  Dies if FILE cannot be opened.
#
sub load_map
{
    my $file = shift;

    open(my $fh, '<', $file) or die "cannot open $file: $!";

    while (my $line = <$fh>) {
        # Skip comments, blank lines and the entry shapes this converter
        # does not handle.
        next if ($line =~ /^#/);
        next if ($line =~ /^\s*$/);
        next if ($line =~ /^0x..\+0x../);
        next if ($line =~ /^0x[0-9A-F]{4}\t0x[0-9A-F]{4} 0x[0-9A-F]{4}/);
        next if ($line =~ /^0x[0-9A-F]{2}\s+#/);
        next if ($line =~ /# ... NO MAPPING .../);
        chomp($line);

        my ($val, $ucs) = split /\s+/, $line;
        # Without a second column there is nothing to map to.
        next unless (defined($ucs));

        $ucs =~ s/^\\x[0]*//;
        my $utf8 = ucs_to_utf8(hex($ucs));

        if (defined($map{$val})) {
            $map{$val} .= " " . $utf8;
        } else {
            $map{$val} = $utf8;
        }
    }
    close($fh);
}

#
# mb_str(HEX)
#
# Turn the hexadecimal string HEX (e.g. "0x8140") into its byte sequence
# written as "\xhh" escapes with lower-case hex digits, most significant
# byte first.  A value of zero yields "\x00".
#
sub mb_str
{
    my $val = hex(shift);

    return ("\\x00") if ($val == 0);

    my $str = "";
    while ($val) {
        $str = sprintf("\\x%02x", $val & 0xff) . $str;
        $val >>= 8;
    }
    return ($str);
}

# <mb_cur_max> for the multi-byte codesets; anything not listed gets 1.
my %max_mb_for = (
    "SJIS"      => 2,
    "eucCN"     => 2,
    "eucJP"     => 3,
    "eucKR"     => 2,
    "GBK"       => 2,
    "GB2312"    => 2,
    "Big5"      => 2,
    "Big5HKSCS" => 2,
);

#
# main()
#
# Command-line driver: takes the MAPPING file and the codeset name from
# @ARGV, loads both tables and prints the charmap to standard output.
#
sub main
{
    my $mf = shift(@ARGV);
    my $codeset = shift(@ARGV);

    die "usage: $0 <mapping-file> <codeset>\n"
        unless (defined($mf) && defined($codeset));

    load_utf8_cm("etc/final-maps/map.UTF-8");
    load_map($mf);

    my $max_mb = exists($max_mb_for{$codeset}) ? $max_mb_for{$codeset} : 1;

    print("<code_set_name> \"$codeset\"\n");
    print("<mb_cur_min> 1\n");
    print("<mb_cur_max> $max_mb\n");

    print("CHARMAP\n");
    foreach my $val (sort(keys(%map))) {
        foreach my $utf8 (split / /, $map{$val}) {
            my $names = $unames{$utf8};
            # No name known for this UTF-8 sequence: nothing to emit.
            next unless (defined($names));

            foreach my $name (sort(split /\n/, $names)) {
                # Pad with tabs towards a common value column.  Very
                # long names would make the count zero or negative, so
                # always emit at least one separating tab.
                my $nt = int((64 - length($name) + 7) / 8);
                $nt = 1 if ($nt < 1);
                print $name, "\t" x $nt, mb_str($val), "\n";
            }
        }
    }
    print "END CHARMAP\n";
}

# Run the driver only when executed as a script, so the helpers above can
# be loaded (e.g. by a test) without touching files or @ARGV.
main() unless caller;