1#!/usr/bin/perl -w 2 3=head1 NAME 4 5umap - map between different character sets 6 7=head1 SYNOPSIS 8 9 umap [options] <before>:<after> 10 11=head1 DESCRIPTION 12 13The I<umap> script acts as a filter between different encodings and 14character sets. 15 16The following options are recognized: 17 18=over 4 19 20=item --list [charset] 21 22Without argument list all character sets recognized. With a specified 23character set list the mapping between this set and Unicode. 24 25=item --strict 26 27Do the stict mapping between the character sets. The default is to 28not translate unmapped character. With I<--stict> we will remove 29unmapped characters or use the default specified with I<--def8> or 30I<--def16>. 31 32=item --def8=<charcode> 33 34Set the default 8-bit code for unmapped chars. 35 36=item --def16=<charcode> 37 38Set the default 16-bit code for unmapped chars. 39 40=item --verbose 41 42Generate more verbose output. 43 44=item --version 45 46Print the version number of this program and quit. 47 48=item --help 49 50Print the usage message. 51 52=back 53 54=head1 SEE ALSO 55 56L<Unicode::String>, 57L<Unicode::Map8>, 58recode(1) 59 60=head1 COPYRIGHT 61 62Copyright 1998 Gisle Aas. 63 64This is free software; you can redistribute it and/or 65modify it under the same terms as Perl itself. 66 67=cut 68 69 70use strict; 71use Getopt::Long qw(GetOptions); 72 73my $VERSION = "1.05"; 74 75my $list; 76my $strict; 77my $verbose; 78my $def8; 79my $def16; 80my $before; 81my $after; 82 83GetOptions('version' => \&print_version, 84 'help' => \&usage, 85 'list:s' => \$list, 86 'verbose' => \$verbose, 87 'strict!' => \$strict, 88 'def8=i' => \$def8, 89 'def16=i' => \$def16, 90 ) || usage (); 91 92 93if (defined $list) { 94 if (length($list)) { 95 list_charset($list); 96 } else { 97 list_charsets(); 98 } 99 exit; 100} 101 102# Try to extract $before/$after from the remaining arguments 103$before = shift || $ENV{UMAP_BEFORE} || "latin1"; 104if (!@ARGV && $before =~ s/([^\\]):/$1\0/) { 105 ($before, $after) = split('\0', $before, 2); 106} 107unless ($after) { 108 $after = shift || $ENV{UMAP_AFTER} || "utf8"; 109} 110for ($before, $after) { 111 s/\\:/:/g; 112} 113usage() if @ARGV; 114 115print STDERR "$before --> $after\n" if $verbose; 116 117 118#------------------------------------------------------------------ 119package MySpace; # use a new namespace 120 121use Unicode::String 2.00 qw(ucs4 ucs2 utf16 utf7 utf8); 122 123my $bsub = \&{$before}; 124 125unless (defined(&$bsub)) { 126 require Unicode::Map8; 127 my $map = Unicode::Map8->new($before); 128 die "Don't know about charset '$before'\n" unless $map; 129 $map->nostrict unless $strict; 130 $map->default_to16($def16) if defined($def16); 131 no strict 'refs'; 132 *{$before} = sub { $map->tou($_[0]); }; 133} 134 135if ($after =~ /^(ucs[24]|utf16|utf[78])$/) { 136 *out = sub { print $_[0]->$after(); }; 137} elsif ($after eq "hex") { 138 *out = sub { 139 my $hex = $_[0]->hex; 140 $hex =~ s/U\+000a\s*/U+000a\n/g; 141 print $hex; 142 }; 143} elsif ($after eq "uname") { 144 require Unicode::CharName; 145 *out = sub { 146 for ($_[0]->unpack) { 147 printf "U+%04X %s\n", $_, Unicode::CharName::uname($_) || ""; 148 } 149 }; 150} else { 151 require Unicode::Map8; 152 my $map = Unicode::Map8->new($after); 153 die "Don't know about charset '$after'\n" unless $map; 154 $map->nostrict unless $strict; 155 $map->default_to8($def8) if defined($def8); 156 #*out = sub { print $map->to8(${$_[0]}); }; 157 *out = sub { print $map->to8(${$_[0]}); }; 158} 159 160if (-t STDIN || $before =~ /^utf[78]$/) { 161 # must read a line at the time (should not break encoded chars) 162 my $line; 163 while (defined($line = <STDIN>)) { 164 out(&$bsub($line)); 165 } 166} else { 167 my $n; 168 my $buf; 169 # must read buffers which are multiples of 4 bytes (ucs4) 170 while ( $n = read(STDIN, $buf, 512)) { 171 #print "$n bytes read\n"; 172 out(&$bsub($buf)); 173 } 174} 175 176 177#------------------------------------------------------------------ 178package main; 179 180sub list_charset 181{ 182 require Unicode::Map8; 183 require Unicode::CharName; 184 185 my($charset, $format) = @_; 186 my $m = Unicode::Map8->new($charset); 187 die "Don't know about charset $charset\n" unless $m; 188 189 my @res8; 190 my %map16; 191 for (my $i = 0; $i < 256; $i++) { 192 my $u = $m->to_char16($i); 193 if ($u == Unicode::Map8::NOCHAR()) { 194 push(@res8, sprintf "# 0x%02X unmapped\n", $i) if $verbose; 195 } else { 196 push(@res8, sprintf "0x%02X 0x%04X # %s\n", $i, $u, 197 Unicode::CharName::uname($u)); 198 $map16{$u} = $i; 199 } 200 } 201 202 my @res16; 203 my @blocks; 204 for (my $block = 0; $block < 256; $block++) { 205 next if $m->_empty_block($block); 206 push(@blocks, $block); 207 for (my $i = 0; $i < 256; $i++) { 208 my $u = $block*256 + $i; 209 my $c = $m->to_char8($u); 210 next if $c == Unicode::Map8::NOCHAR(); 211 next if exists $map16{$u} && $map16{$u} == $c; 212 push(@res16, sprintf "0x%02X 0x%04X # %s\n", $c, $u, 213 Unicode::CharName::uname($u)); 214 } 215 } 216 217 print "# Mapping for '$charset'\n"; 218 print "#\n"; 219 printf "# %d allocated blocks", scalar(@blocks); 220 if (@blocks > 1 || $blocks[0] != 0) { 221 print " (", join(", ", map "#".($_+1), @blocks), ")"; 222 } 223 print "\n"; 224 print "#\n"; 225 print @res8; 226 227 if (@res16) { 228 print "\n# Extra 16-bit to 8-bit mappings\n"; 229 print @res16; 230 } 231} 232 233 234sub list_charsets 235{ 236 require Unicode::Map8; 237 my %set = ( 238 ucs4 => {}, 239 ucs2 => {utf16 => 1}, 240 utf7 => {}, 241 utf8 => {}, 242 ); 243 if (opendir(DIR, $Unicode::Map8::MAPS_DIR)) { 244 my $f; 245 while (defined($f = readdir(DIR))) { 246 next unless -f "$Unicode::Map8::MAPS_DIR/$f"; 247 $f =~ s/\.(?:bin|txt)$//; 248 $set{$f} = {} if Unicode::Map8->new($f); 249 } 250 } 251 252 my $avoid_warning = keys %Unicode::Map8::ALIASES; 253 while ( my($alias, $charset) = each %Unicode::Map8::ALIASES) { 254 if (exists $set{$charset}) { 255 $set{$charset}{$alias} = 1; 256 } else { 257 warn "$charset does not seem to exist (aliased as $alias)\n"; 258 } 259 } 260 261 for (sort keys %set) { 262 print "$_"; 263 if (%{$set{$_}}) { 264 print " ", join(" ", sort keys %{$set{$_}}); 265 } 266 print "\n"; 267 } 268} 269 270 271sub print_version 272{ 273 require Unicode::Map8; 274 my $avoid_warning = $Unicode::Map8::VERSION; 275 print <<"EOT"; 276This is umap version $VERSION (Unicode-Map8-$Unicode::Map8::VERSION) 277 278Copyright 1998, Gisle Aas. 279 280This program is free software; you can redistribute it and/or 281modify it under the same terms as Perl itself. 282EOT 283 exit 0; 284} 285 286 287sub usage 288{ 289 (my $progname = $0) =~ s,.*/,,; 290 die "Usage:\t$progname [options] <before>:<after> 291The options are: 292 --list [charset] list character sets 293 --strict use the strict mapping 294 --def8 <code> default 8-bit code for unmapped chars 295 --def16 <code> default 16-bit code for unmapped chars 296 --version print version number and quit 297"; 298} 299