1#!./perl 2# $Id: piconv,v 2.8 2016/08/04 03:15:58 dankogai Exp $ 3# 4BEGIN { pop @INC if $INC[-1] eq '.' } 5use 5.8.0; 6use strict; 7use Encode ; 8use Encode::Alias; 9my %Scheme = map {$_ => 1} qw(from_to decode_encode perlio); 10 11use File::Basename; 12my $name = basename($0); 13 14use Getopt::Long qw(:config no_ignore_case); 15 16my %Opt; 17 18help() 19 unless 20 GetOptions(\%Opt, 21 'from|f=s', 22 'to|t=s', 23 'list|l', 24 'string|s=s', 25 'check|C=i', 26 'c', 27 'perlqq|p', 28 'htmlcref', 29 'xmlcref', 30 'debug|D', 31 'scheme|S=s', 32 'resolve|r=s', 33 'help', 34 ); 35 36$Opt{help} and help(); 37$Opt{list} and list_encodings(); 38my $locale = $ENV{LC_CTYPE} || $ENV{LC_ALL} || $ENV{LANG}; 39defined $Opt{resolve} and resolve_encoding($Opt{resolve}); 40$Opt{from} || $Opt{to} || help(); 41my $from = $Opt{from} || $locale or help("from_encoding unspecified"); 42my $to = $Opt{to} || $locale or help("to_encoding unspecified"); 43$Opt{string} and Encode::from_to($Opt{string}, $from, $to) and print $Opt{string} and exit; 44my $scheme = do { 45 if (defined $Opt{scheme}) { 46 if (!exists $Scheme{$Opt{scheme}}) { 47 warn "Unknown scheme '$Opt{scheme}', fallback to 'from_to'.\n"; 48 'from_to'; 49 } else { 50 $Opt{scheme}; 51 } 52 } else { 53 'from_to'; 54 } 55}; 56 57$Opt{check} ||= $Opt{c}; 58$Opt{perlqq} and $Opt{check} = Encode::PERLQQ; 59$Opt{htmlcref} and $Opt{check} = Encode::HTMLCREF; 60$Opt{xmlcref} and $Opt{check} = Encode::XMLCREF; 61 62my $efrom = Encode->getEncoding($from) || die "Unknown encoding '$from'"; 63my $eto = Encode->getEncoding($to) || die "Unknown encoding '$to'"; 64 65my $cfrom = $efrom->name; 66my $cto = $eto->name; 67 68if ($Opt{debug}){ 69 print <<"EOT"; 70Scheme: $scheme 71From: $from => $cfrom 72To: $to => $cto 73EOT 74} 75 76my %use_bom = 77 map { $_ => 1 } qw/UTF-16 UTF-16BE UTF-16LE UTF-32 UTF-32BE UTF-32LE/; 78 79# we do not use <> (or ARGV) for the sake of binmode() 80@ARGV or push @ARGV, \*STDIN; 81 82unless ( $scheme eq 'perlio' ) { 83 binmode STDOUT; 84 my $need2slurp = $use_bom{ $eto } || $use_bom{ $efrom }; 85 for my $argv (@ARGV) { 86 my $ifh = ref $argv ? $argv : undef; 87 $ifh or open $ifh, "<", $argv or warn "Can't open $argv: $!" and next; 88 $ifh or open $ifh, "<", $argv or next; 89 binmode $ifh; 90 if ( $scheme eq 'from_to' ) { # default 91 if ($need2slurp){ 92 local $/; 93 $_ = <$ifh>; 94 Encode::from_to( $_, $from, $to, $Opt{check} ); 95 print; 96 }else{ 97 while (<$ifh>) { 98 Encode::from_to( $_, $from, $to, $Opt{check} ); 99 print; 100 } 101 } 102 } 103 elsif ( $scheme eq 'decode_encode' ) { # step-by-step 104 if ($need2slurp){ 105 local $/; 106 $_ = <$ifh>; 107 my $decoded = decode( $from, $_, $Opt{check} ); 108 my $encoded = encode( $to, $decoded ); 109 print $encoded; 110 }else{ 111 while (<$ifh>) { 112 my $decoded = decode( $from, $_, $Opt{check} ); 113 my $encoded = encode( $to, $decoded ); 114 print $encoded; 115 } 116 } 117 } 118 else { # won't reach 119 die "$name: unknown scheme: $scheme"; 120 } 121 } 122} 123else { 124 125 # NI-S favorite 126 binmode STDOUT => "raw:encoding($to)"; 127 for my $argv (@ARGV) { 128 my $ifh = ref $argv ? $argv : undef; 129 $ifh or open $ifh, "<", $argv or warn "Can't open $argv: $!" and next; 130 $ifh or open $ifh, "<", $argv or next; 131 binmode $ifh => "raw:encoding($from)"; 132 print while (<$ifh>); 133 } 134} 135 136sub list_encodings { 137 print join( "\n", Encode->encodings(":all") ), "\n"; 138 exit 0; 139} 140 141sub resolve_encoding { 142 if ( my $alias = Encode::resolve_alias( $_[0] ) ) { 143 print $alias, "\n"; 144 exit 0; 145 } 146 else { 147 warn "$name: $_[0] is not known to Encode\n"; 148 exit 1; 149 } 150} 151 152sub help { 153 my $message = shift; 154 $message and print STDERR "$name error: $message\n"; 155 print STDERR <<"EOT"; 156$name [-f from_encoding] [-t to_encoding] 157 [-p|--perlqq|--htmlcref|--xmlcref] [-C N|-c] [-D] [-S scheme] 158 [-s string|file...] 159$name -l 160$name -r encoding_alias 161$name -h 162Common options: 163 -l,--list 164 lists all available encodings 165 -r,--resolve encoding_alias 166 resolve encoding to its (Encode) canonical name 167 -f,--from from_encoding 168 when omitted, the current locale will be used 169 -t,--to to_encoding 170 when omitted, the current locale will be used 171 -s,--string string 172 "string" will be the input instead of STDIN or files 173The following are mainly of interest to Encode hackers: 174 -C N | -c check the validity of the input 175 -D,--debug show debug information 176 -S,--scheme scheme use the scheme for conversion 177Those are handy when you can only see ASCII characters: 178 -p,--perlqq transliterate characters missing in encoding to \\x{HHHH} 179 where HHHH is the hexadecimal Unicode code point 180 --htmlcref transliterate characters missing in encoding to &#NNN; 181 where NNN is the decimal Unicode code point 182 --xmlcref transliterate characters missing in encoding to &#xHHHH; 183 where HHHH is the hexadecimal Unicode code point 184 185EOT 186 exit; 187} 188 189__END__ 190 191=head1 NAME 192 193piconv -- iconv(1), reinvented in perl 194 195=head1 SYNOPSIS 196 197 piconv [-f from_encoding] [-t to_encoding] 198 [-p|--perlqq|--htmlcref|--xmlcref] [-C N|-c] [-D] [-S scheme] 199 [-s string|file...] 200 piconv -l 201 piconv -r encoding_alias 202 piconv -h 203 204=head1 DESCRIPTION 205 206B<piconv> is perl version of B<iconv>, a character encoding converter 207widely available for various Unixen today. This script was primarily 208a technology demonstrator for Perl 5.8.0, but you can use piconv in the 209place of iconv for virtually any case. 210 211piconv converts the character encoding of either STDIN or files 212specified in the argument and prints out to STDOUT. 213 214Here is the list of options. Some options can be in short format (-f) 215or long (--from) one. 216 217=over 4 218 219=item -f,--from I<from_encoding> 220 221Specifies the encoding you are converting from. Unlike B<iconv>, 222this option can be omitted. In such cases, the current locale is used. 223 224=item -t,--to I<to_encoding> 225 226Specifies the encoding you are converting to. Unlike B<iconv>, 227this option can be omitted. In such cases, the current locale is used. 228 229Therefore, when both -f and -t are omitted, B<piconv> just acts 230like B<cat>. 231 232=item -s,--string I<string> 233 234uses I<string> instead of file for the source of text. 235 236=item -l,--list 237 238Lists all available encodings, one per line, in case-insensitive 239order. Note that only the canonical names are listed; many aliases 240exist. For example, the names are case-insensitive, and many standard 241and common aliases work, such as "latin1" for "ISO-8859-1", or "ibm850" 242instead of "cp850", or "winlatin1" for "cp1252". See L<Encode::Supported> 243for a full discussion. 244 245=item -r,--resolve I<encoding_alias> 246 247Resolve I<encoding_alias> to Encode canonical encoding name. 248 249=item -C,--check I<N> 250 251Check the validity of the stream if I<N> = 1. When I<N> = -1, something 252interesting happens when it encounters an invalid character. 253 254=item -c 255 256Same as C<-C 1>. 257 258=item -p,--perlqq 259 260Transliterate characters missing in encoding to \x{HHHH} where HHHH is the 261hexadecimal Unicode code point. 262 263=item --htmlcref 264 265Transliterate characters missing in encoding to &#NNN; where NNN is the 266decimal Unicode code point. 267 268=item --xmlcref 269 270Transliterate characters missing in encoding to &#xHHHH; where HHHH is the 271hexadecimal Unicode code point. 272 273=item -h,--help 274 275Show usage. 276 277=item -D,--debug 278 279Invokes debugging mode. Primarily for Encode hackers. 280 281=item -S,--scheme I<scheme> 282 283Selects which scheme is to be used for conversion. Available schemes 284are as follows: 285 286=over 4 287 288=item from_to 289 290Uses Encode::from_to for conversion. This is the default. 291 292=item decode_encode 293 294Input strings are decode()d then encode()d. A straight two-step 295implementation. 296 297=item perlio 298 299The new perlIO layer is used. NI-S' favorite. 300 301You should use this option if you are using UTF-16 and others which 302linefeed is not $/. 303 304=back 305 306Like the I<-D> option, this is also for Encode hackers. 307 308=back 309 310=head1 SEE ALSO 311 312L<iconv(1)> 313L<locale(3)> 314L<Encode> 315L<Encode::Supported> 316L<Encode::Alias> 317L<PerlIO> 318 319=cut 320