1#!/usr/bin/perl -w
2
3=head1 NAME
4
5umap - map between different character sets
6
7=head1 SYNOPSIS
8
9 umap [options] <before>:<after>
10
11=head1 DESCRIPTION
12
13The I<umap> script acts as a filter between different encodings and
14character sets.
15
16The following options are recognized:
17
18=over 4
19
20=item --list [charset]
21
22Without argument list all character sets recognized.  With a specified
23character set list the mapping between this set and Unicode.
24
25=item --strict
26
27Do the stict mapping between the character sets.  The default is to
28not translate unmapped character.  With I<--stict> we will remove
29unmapped characters or use the default specified with I<--def8> or
30I<--def16>.
31
32=item --def8=<charcode>
33
34Set the default 8-bit code for unmapped chars.
35
36=item --def16=<charcode>
37
38Set the default 16-bit code for unmapped chars.
39
40=item --verbose
41
42Generate more verbose output.
43
44=item --version
45
46Print the version number of this program and quit.
47
48=item --help
49
50Print the usage message.
51
52=back
53
54=head1 SEE ALSO
55
56L<Unicode::String>,
57L<Unicode::Map8>,
58recode(1)
59
60=head1 COPYRIGHT
61
62Copyright 1998 Gisle Aas.
63
64This is free software; you can redistribute it and/or
65modify it under the same terms as Perl itself.
66
67=cut
68
69
70use strict;
71use Getopt::Long  qw(GetOptions);
72
73my $VERSION = "1.05";
74
75my $list;
76my $strict;
77my $verbose;
78my $def8;
79my $def16;
80my $before;
81my $after;
82
83GetOptions('version' => \&print_version,
84	   'help'    => \&usage,
85	   'list:s'  => \$list,
86	   'verbose' => \$verbose,
87	   'strict!' => \$strict,
88	   'def8=i'  => \$def8,
89	   'def16=i' => \$def16,
90	  ) || usage ();
91
92
93if (defined $list) {
94    if (length($list)) {
95	list_charset($list);
96    } else {
97	list_charsets();
98    }
99    exit;
100}
101
102# Try to extract $before/$after from the remaining arguments
103$before = shift || $ENV{UMAP_BEFORE} || "latin1";
104if (!@ARGV && $before =~ s/([^\\]):/$1\0/) {
105    ($before, $after) = split('\0', $before, 2);
106}
107unless ($after) {
108    $after  = shift || $ENV{UMAP_AFTER}  || "utf8";
109}
110for ($before, $after) {
111    s/\\:/:/g;
112}
113usage() if @ARGV;
114
115print STDERR "$before --> $after\n" if $verbose;
116
117
118#------------------------------------------------------------------
119package MySpace;  # use a new namespace
120
121use Unicode::String 2.00 qw(ucs4 ucs2 utf16 utf7 utf8);
122
123my $bsub = \&{$before};
124
125unless (defined(&$bsub)) {
126    require Unicode::Map8;
127    my $map = Unicode::Map8->new($before);
128    die "Don't know about charset '$before'\n" unless $map;
129    $map->nostrict unless $strict;
130    $map->default_to16($def16) if defined($def16);
131    no strict 'refs';
132    *{$before} = sub {	$map->tou($_[0]); };
133}
134
135if ($after =~ /^(ucs[24]|utf16|utf[78])$/) {
136    *out = sub { print $_[0]->$after(); };
137} elsif ($after eq "hex") {
138    *out = sub {
139	my $hex = $_[0]->hex;
140	$hex =~ s/U\+000a\s*/U+000a\n/g;
141	print $hex;
142    };
143} elsif ($after eq "uname") {
144    require Unicode::CharName;
145    *out = sub {
146	for ($_[0]->unpack) {
147	    printf "U+%04X   %s\n", $_, Unicode::CharName::uname($_) || "";
148	}
149    };
150} else {
151    require Unicode::Map8;
152    my $map = Unicode::Map8->new($after);
153    die "Don't know about charset '$after'\n" unless $map;
154    $map->nostrict unless $strict;
155    $map->default_to8($def8) if defined($def8);
156    #*out = sub { print $map->to8(${$_[0]}); };
157    *out = sub { print $map->to8(${$_[0]}); };
158}
159
160if (-t STDIN || $before =~ /^utf[78]$/) {
161    # must read a line at the time (should not break encoded chars)
162    my $line;
163    while (defined($line = <STDIN>)) {
164	out(&$bsub($line));
165    }
166} else {
167    my $n;
168    my $buf;
169    # must read buffers which are multiples of 4 bytes (ucs4)
170    while ( $n = read(STDIN, $buf, 512)) {
171	#print "$n bytes read\n";
172	out(&$bsub($buf));
173    }
174}
175
176
177#------------------------------------------------------------------
178package main;
179
180sub list_charset
181{
182    require Unicode::Map8;
183    require Unicode::CharName;
184
185    my($charset, $format) = @_;
186    my $m = Unicode::Map8->new($charset);
187    die "Don't know about charset $charset\n" unless $m;
188
189    my @res8;
190    my %map16;
191    for (my $i = 0; $i < 256; $i++) {
192	my $u = $m->to_char16($i);
193	if ($u == Unicode::Map8::NOCHAR()) {
194	    push(@res8, sprintf "# 0x%02X unmapped\n", $i) if $verbose;
195	} else {
196	    push(@res8, sprintf "0x%02X 0x%04X   # %s\n", $i, $u,
197		                               Unicode::CharName::uname($u));
198	    $map16{$u} = $i;
199	}
200    }
201
202    my @res16;
203    my @blocks;
204    for (my $block = 0; $block < 256; $block++) {
205	next if $m->_empty_block($block);
206	push(@blocks, $block);
207	for (my $i = 0; $i < 256; $i++) {
208	    my $u = $block*256 + $i;
209	    my $c = $m->to_char8($u);
210	    next if $c == Unicode::Map8::NOCHAR();
211	    next if exists $map16{$u} && $map16{$u} == $c;
212	    push(@res16, sprintf "0x%02X 0x%04X   # %s\n", $c, $u,
213		                                Unicode::CharName::uname($u));
214	}
215    }
216
217    print "# Mapping for '$charset'\n";
218    print "#\n";
219    printf "# %d allocated blocks", scalar(@blocks);
220    if (@blocks > 1 || $blocks[0] != 0) {
221	print " (", join(", ", map  "#".($_+1), @blocks), ")";
222    }
223    print "\n";
224    print "#\n";
225    print @res8;
226
227    if (@res16) {
228	print "\n# Extra 16-bit to 8-bit mappings\n";
229	print @res16;
230    }
231}
232
233
234sub list_charsets
235{
236    require Unicode::Map8;
237    my %set = (
238	       ucs4 => {},
239	       ucs2 => {utf16 => 1},
240	       utf7 => {},
241	       utf8 => {},
242	      );
243    if (opendir(DIR, $Unicode::Map8::MAPS_DIR)) {
244	my $f;
245	while (defined($f = readdir(DIR))) {
246	    next unless -f "$Unicode::Map8::MAPS_DIR/$f";
247	    $f =~ s/\.(?:bin|txt)$//;
248	    $set{$f} = {} if Unicode::Map8->new($f);
249	}
250    }
251
252    my $avoid_warning = keys %Unicode::Map8::ALIASES;
253    while ( my($alias, $charset) = each %Unicode::Map8::ALIASES) {
254	if (exists $set{$charset}) {
255	    $set{$charset}{$alias} = 1;
256	} else {
257	    warn "$charset does not seem to exist (aliased as $alias)\n";
258	}
259    }
260
261    for (sort keys %set) {
262	print "$_";
263	if (%{$set{$_}}) {
264	    print " ", join(" ", sort keys %{$set{$_}});
265	}
266	print "\n";
267    }
268}
269
270
271sub print_version
272{
273    require Unicode::Map8;
274    my $avoid_warning = $Unicode::Map8::VERSION;
275    print <<"EOT";
276This is umap version $VERSION (Unicode-Map8-$Unicode::Map8::VERSION)
277
278Copyright 1998, Gisle Aas.
279
280This program is free software; you can redistribute it and/or
281modify it under the same terms as Perl itself.
282EOT
283    exit 0;
284}
285
286
287sub usage
288{
289    (my $progname = $0) =~ s,.*/,,;
290    die "Usage:\t$progname [options] <before>:<after>
291The options are:
292  --list [charset]    list character sets
293  --strict            use the strict mapping
294  --def8 <code>       default 8-bit code for unmapped chars
295  --def16 <code>      default 16-bit code for unmapped chars
296  --version           print version number and quit
297";
298}
299