xref: /openbsd/gnu/usr.bin/perl/cpan/Encode/bin/piconv (revision 09467b48)
1#!./perl
2# $Id: piconv,v 2.8 2016/08/04 03:15:58 dankogai Exp $
3#
4BEGIN { pop @INC if $INC[-1] eq '.' }
5use 5.8.0;
6use strict;
7use Encode ;
8use Encode::Alias;
9my %Scheme =  map {$_ => 1} qw(from_to decode_encode perlio);
10
11use File::Basename;
12my $name = basename($0);
13
14use Getopt::Long qw(:config no_ignore_case);
15
16my %Opt;
17
18help()
19    unless
20      GetOptions(\%Opt,
21         'from|f=s',
22         'to|t=s',
23         'list|l',
24         'string|s=s',
25         'check|C=i',
26         'c',
27         'perlqq|p',
28         'htmlcref',
29         'xmlcref',
30         'debug|D',
31         'scheme|S=s',
32         'resolve|r=s',
33         'help',
34         );
35
36$Opt{help} and help();
37$Opt{list} and list_encodings();
38my $locale = $ENV{LC_CTYPE} || $ENV{LC_ALL} || $ENV{LANG};
39defined $Opt{resolve} and resolve_encoding($Opt{resolve});
40$Opt{from} || $Opt{to} || help();
41my $from = $Opt{from} || $locale or help("from_encoding unspecified");
42my $to   = $Opt{to}   || $locale or help("to_encoding unspecified");
43$Opt{string} and Encode::from_to($Opt{string}, $from, $to) and print $Opt{string} and exit;
44my $scheme = do {
45    if (defined $Opt{scheme}) {
46	if (!exists $Scheme{$Opt{scheme}}) {
47	    warn "Unknown scheme '$Opt{scheme}', fallback to 'from_to'.\n";
48	    'from_to';
49	} else {
50	    $Opt{scheme};
51	}
52    } else {
53	'from_to';
54    }
55};
56
57$Opt{check} ||= $Opt{c};
58$Opt{perlqq}   and $Opt{check} = Encode::PERLQQ;
59$Opt{htmlcref} and $Opt{check} = Encode::HTMLCREF;
60$Opt{xmlcref}  and $Opt{check} = Encode::XMLCREF;
61
62my $efrom = Encode->getEncoding($from) || die "Unknown encoding '$from'";
63my $eto   = Encode->getEncoding($to)   || die "Unknown encoding '$to'";
64
65my $cfrom = $efrom->name;
66my $cto   = $eto->name;
67
68if ($Opt{debug}){
69    print <<"EOT";
70Scheme: $scheme
71From:   $from => $cfrom
72To:     $to => $cto
73EOT
74}
75
76my %use_bom =
77  map { $_ => 1 } qw/UTF-16 UTF-16BE UTF-16LE UTF-32 UTF-32BE UTF-32LE/;
78
79# we do not use <> (or ARGV) for the sake of binmode()
80@ARGV or push @ARGV, \*STDIN;
81
82unless ( $scheme eq 'perlio' ) {
83    binmode STDOUT;
84    my $need2slurp = $use_bom{ $eto } || $use_bom{ $efrom };
85    for my $argv (@ARGV) {
86        my $ifh = ref $argv ? $argv : undef;
87	$ifh or open $ifh, "<", $argv or warn "Can't open $argv: $!" and next;
88        $ifh or open $ifh, "<", $argv or next;
89        binmode $ifh;
90        if ( $scheme eq 'from_to' ) {    # default
91	    if ($need2slurp){
92		local $/;
93		$_ = <$ifh>;
94		Encode::from_to( $_, $from, $to, $Opt{check} );
95		print;
96	    }else{
97		while (<$ifh>) {
98		    Encode::from_to( $_, $from, $to, $Opt{check} );
99		    print;
100		}
101	    }
102        }
103        elsif ( $scheme eq 'decode_encode' ) {    # step-by-step
104	    if ($need2slurp){
105		local $/;
106		$_ = <$ifh>;
107                my $decoded = decode( $from, $_, $Opt{check} );
108                my $encoded = encode( $to, $decoded );
109                print $encoded;
110	    }else{
111		while (<$ifh>) {
112		    my $decoded = decode( $from, $_, $Opt{check} );
113		    my $encoded = encode( $to, $decoded );
114		    print $encoded;
115		}
116	    }
117	}
118	else {                                    # won't reach
119            die "$name: unknown scheme: $scheme";
120        }
121    }
122}
123else {
124
125    # NI-S favorite
126    binmode STDOUT => "raw:encoding($to)";
127    for my $argv (@ARGV) {
128        my $ifh = ref $argv ? $argv : undef;
129	$ifh or open $ifh, "<", $argv or warn "Can't open $argv: $!" and next;
130        $ifh or open $ifh, "<", $argv or next;
131        binmode $ifh => "raw:encoding($from)";
132        print while (<$ifh>);
133    }
134}
135
136sub list_encodings {
137    print join( "\n", Encode->encodings(":all") ), "\n";
138    exit 0;
139}
140
141sub resolve_encoding {
142    if ( my $alias = Encode::resolve_alias( $_[0] ) ) {
143        print $alias, "\n";
144        exit 0;
145    }
146    else {
147        warn "$name: $_[0] is not known to Encode\n";
148        exit 1;
149    }
150}
151
152sub help {
153    my $message = shift;
154    $message and print STDERR "$name error: $message\n";
155    print STDERR <<"EOT";
156$name [-f from_encoding] [-t to_encoding]
157      [-p|--perlqq|--htmlcref|--xmlcref] [-C N|-c] [-D] [-S scheme]
158      [-s string|file...]
159$name -l
160$name -r encoding_alias
161$name -h
162Common options:
163  -l,--list
164     lists all available encodings
165  -r,--resolve encoding_alias
166    resolve encoding to its (Encode) canonical name
167  -f,--from from_encoding
168     when omitted, the current locale will be used
169  -t,--to to_encoding
170     when omitted, the current locale will be used
171  -s,--string string
172     "string" will be the input instead of STDIN or files
173The following are mainly of interest to Encode hackers:
174  -C N | -c           check the validity of the input
175  -D,--debug          show debug information
176  -S,--scheme scheme  use the scheme for conversion
177Those are handy when you can only see ASCII characters:
178  -p,--perlqq         transliterate characters missing in encoding to \\x{HHHH}
179                      where HHHH is the hexadecimal Unicode code point
180  --htmlcref          transliterate characters missing in encoding to &#NNN;
181                      where NNN is the decimal Unicode code point
182  --xmlcref           transliterate characters missing in encoding to &#xHHHH;
183                      where HHHH is the hexadecimal Unicode code point
184
185EOT
186    exit;
187}
188
189__END__
190
191=head1 NAME
192
193piconv -- iconv(1), reinvented in perl
194
195=head1 SYNOPSIS
196
197  piconv [-f from_encoding] [-t to_encoding]
198         [-p|--perlqq|--htmlcref|--xmlcref] [-C N|-c] [-D] [-S scheme]
199         [-s string|file...]
200  piconv -l
201  piconv -r encoding_alias
202  piconv -h
203
204=head1 DESCRIPTION
205
206B<piconv> is perl version of B<iconv>, a character encoding converter
207widely available for various Unixen today.  This script was primarily
208a technology demonstrator for Perl 5.8.0, but you can use piconv in the
209place of iconv for virtually any case.
210
211piconv converts the character encoding of either STDIN or files
212specified in the argument and prints out to STDOUT.
213
214Here is the list of options.  Some options can be in short format (-f)
215or long (--from) one.
216
217=over 4
218
219=item -f,--from I<from_encoding>
220
221Specifies the encoding you are converting from.  Unlike B<iconv>,
222this option can be omitted.  In such cases, the current locale is used.
223
224=item -t,--to I<to_encoding>
225
226Specifies the encoding you are converting to.  Unlike B<iconv>,
227this option can be omitted.  In such cases, the current locale is used.
228
229Therefore, when both -f and -t are omitted, B<piconv> just acts
230like B<cat>.
231
232=item -s,--string I<string>
233
234uses I<string> instead of file for the source of text.
235
236=item -l,--list
237
238Lists all available encodings, one per line, in case-insensitive
239order.  Note that only the canonical names are listed; many aliases
240exist.  For example, the names are case-insensitive, and many standard
241and common aliases work, such as "latin1" for "ISO-8859-1", or "ibm850"
242instead of "cp850", or "winlatin1" for "cp1252".  See L<Encode::Supported>
243for a full discussion.
244
245=item -r,--resolve I<encoding_alias>
246
247Resolve I<encoding_alias> to Encode canonical encoding name.
248
249=item -C,--check I<N>
250
251Check the validity of the stream if I<N> = 1.  When I<N> = -1, something
252interesting happens when it encounters an invalid character.
253
254=item -c
255
256Same as C<-C 1>.
257
258=item -p,--perlqq
259
260Transliterate characters missing in encoding to \x{HHHH} where HHHH is the
261hexadecimal Unicode code point.
262
263=item --htmlcref
264
265Transliterate characters missing in encoding to &#NNN; where NNN is the
266decimal Unicode code point.
267
268=item --xmlcref
269
270Transliterate characters missing in encoding to &#xHHHH; where HHHH is the
271hexadecimal Unicode code point.
272
273=item -h,--help
274
275Show usage.
276
277=item -D,--debug
278
279Invokes debugging mode.  Primarily for Encode hackers.
280
281=item -S,--scheme I<scheme>
282
283Selects which scheme is to be used for conversion.  Available schemes
284are as follows:
285
286=over 4
287
288=item from_to
289
290Uses Encode::from_to for conversion.  This is the default.
291
292=item decode_encode
293
294Input strings are decode()d then encode()d.  A straight two-step
295implementation.
296
297=item perlio
298
299The new perlIO layer is used.  NI-S' favorite.
300
301You should use this option if you are using UTF-16 and others which
302linefeed is not $/.
303
304=back
305
306Like the I<-D> option, this is also for Encode hackers.
307
308=back
309
310=head1 SEE ALSO
311
312L<iconv(1)>
313L<locale(3)>
314L<Encode>
315L<Encode::Supported>
316L<Encode::Alias>
317L<PerlIO>
318
319=cut
320