1#!./perl 2# $Id: piconv,v 2.6 2014/03/28 02:37:42 dankogai Exp $ 3# 4use 5.8.0; 5use strict; 6use Encode ; 7use Encode::Alias; 8my %Scheme = map {$_ => 1} qw(from_to decode_encode perlio); 9 10use File::Basename; 11my $name = basename($0); 12 13use Getopt::Long qw(:config no_ignore_case); 14 15my %Opt; 16 17help() 18 unless 19 GetOptions(\%Opt, 20 'from|f=s', 21 'to|t=s', 22 'list|l', 23 'string|s=s', 24 'check|C=i', 25 'c', 26 'perlqq|p', 27 'htmlcref', 28 'xmlcref', 29 'debug|D', 30 'scheme|S=s', 31 'resolve|r=s', 32 'help', 33 ); 34 35$Opt{help} and help(); 36$Opt{list} and list_encodings(); 37my $locale = $ENV{LC_CTYPE} || $ENV{LC_ALL} || $ENV{LANG}; 38defined $Opt{resolve} and resolve_encoding($Opt{resolve}); 39$Opt{from} || $Opt{to} || help(); 40my $from = $Opt{from} || $locale or help("from_encoding unspecified"); 41my $to = $Opt{to} || $locale or help("to_encoding unspecified"); 42$Opt{string} and Encode::from_to($Opt{string}, $from, $to) and print $Opt{string} and exit; 43my $scheme = do { 44 if (defined $Opt{scheme}) { 45 if (!exists $Scheme{$Opt{scheme}}) { 46 warn "Unknown scheme '$Opt{scheme}', fallback to 'from_to'.\n"; 47 'from_to'; 48 } else { 49 $Opt{scheme}; 50 } 51 } else { 52 'from_to'; 53 } 54}; 55 56$Opt{check} ||= $Opt{c}; 57$Opt{perlqq} and $Opt{check} = Encode::PERLQQ; 58$Opt{htmlcref} and $Opt{check} = Encode::HTMLCREF; 59$Opt{xmlcref} and $Opt{check} = Encode::XMLCREF; 60 61if ($Opt{debug}){ 62 my $cfrom = Encode->getEncoding($from)->name; 63 my $cto = Encode->getEncoding($to)->name; 64 print <<"EOT"; 65Scheme: $scheme 66From: $from => $cfrom 67To: $to => $cto 68EOT 69} 70 71my %use_bom = 72 map { $_ => 1 } qw/UTF-16 UTF-16BE UTF-16LE UTF-32 UTF-32BE UTF-32LE/; 73 74# we do not use <> (or ARGV) for the sake of binmode() 75@ARGV or push @ARGV, \*STDIN; 76 77unless ( $scheme eq 'perlio' ) { 78 binmode STDOUT; 79 my $need2slurp = $use_bom{ find_encoding($to)->name } 80 || $use_bom{ find_encoding($from)->name }; 81 for my $argv (@ARGV) { 82 my $ifh = ref $argv ? $argv : undef; 83 $ifh or open $ifh, "<", $argv or warn "Can't open $argv: $!" and next; 84 $ifh or open $ifh, "<", $argv or next; 85 binmode $ifh; 86 if ( $scheme eq 'from_to' ) { # default 87 if ($need2slurp){ 88 local $/; 89 $_ = <$ifh>; 90 Encode::from_to( $_, $from, $to, $Opt{check} ); 91 print; 92 }else{ 93 while (<$ifh>) { 94 Encode::from_to( $_, $from, $to, $Opt{check} ); 95 print; 96 } 97 } 98 } 99 elsif ( $scheme eq 'decode_encode' ) { # step-by-step 100 if ($need2slurp){ 101 local $/; 102 $_ = <$ifh>; 103 my $decoded = decode( $from, $_, $Opt{check} ); 104 my $encoded = encode( $to, $decoded ); 105 print $encoded; 106 }else{ 107 while (<$ifh>) { 108 my $decoded = decode( $from, $_, $Opt{check} ); 109 my $encoded = encode( $to, $decoded ); 110 print $encoded; 111 } 112 } 113 } 114 else { # won't reach 115 die "$name: unknown scheme: $scheme"; 116 } 117 } 118} 119else { 120 121 # NI-S favorite 122 binmode STDOUT => "raw:encoding($to)"; 123 for my $argv (@ARGV) { 124 my $ifh = ref $argv ? $argv : undef; 125 $ifh or open $ifh, "<", $argv or warn "Can't open $argv: $!" and next; 126 $ifh or open $ifh, "<", $argv or next; 127 binmode $ifh => "raw:encoding($from)"; 128 print while (<$ifh>); 129 } 130} 131 132sub list_encodings { 133 print join( "\n", Encode->encodings(":all") ), "\n"; 134 exit 0; 135} 136 137sub resolve_encoding { 138 if ( my $alias = Encode::resolve_alias( $_[0] ) ) { 139 print $alias, "\n"; 140 exit 0; 141 } 142 else { 143 warn "$name: $_[0] is not known to Encode\n"; 144 exit 1; 145 } 146} 147 148sub help { 149 my $message = shift; 150 $message and print STDERR "$name error: $message\n"; 151 print STDERR <<"EOT"; 152$name [-f from_encoding] [-t to_encoding] 153 [-p|--perlqq|--htmlcref|--xmlcref] [-C N|-c] [-D] [-S scheme] 154 [-s string|file...] 155$name -l 156$name -r encoding_alias 157$name -h 158Common options: 159 -l,--list 160 lists all available encodings 161 -r,--resolve encoding_alias 162 resolve encoding to its (Encode) canonical name 163 -f,--from from_encoding 164 when omitted, the current locale will be used 165 -t,--to to_encoding 166 when omitted, the current locale will be used 167 -s,--string string 168 "string" will be the input instead of STDIN or files 169The following are mainly of interest to Encode hackers: 170 -C N | -c check the validity of the input 171 -D,--debug show debug information 172 -S,--scheme scheme use the scheme for conversion 173Those are handy when you can only see ASCII characters: 174 -p,--perlqq transliterate characters missing in encoding to \\x{HHHH} 175 where HHHH is the hexadecimal Unicode code point 176 --htmlcref transliterate characters missing in encoding to &#NNN; 177 where NNN is the decimal Unicode code point 178 --xmlcref transliterate characters missing in encoding to &#xHHHH; 179 where HHHH is the hexadecimal Unicode code point 180 181EOT 182 exit; 183} 184 185__END__ 186 187=head1 NAME 188 189piconv -- iconv(1), reinvented in perl 190 191=head1 SYNOPSIS 192 193 piconv [-f from_encoding] [-t to_encoding] 194 [-p|--perlqq|--htmlcref|--xmlcref] [-C N|-c] [-D] [-S scheme] 195 [-s string|file...] 196 piconv -l 197 piconv -r encoding_alias 198 piconv -h 199 200=head1 DESCRIPTION 201 202B<piconv> is perl version of B<iconv>, a character encoding converter 203widely available for various Unixen today. This script was primarily 204a technology demonstrator for Perl 5.8.0, but you can use piconv in the 205place of iconv for virtually any case. 206 207piconv converts the character encoding of either STDIN or files 208specified in the argument and prints out to STDOUT. 209 210Here is the list of options. Some options can be in short format (-f) 211or long (--from) one. 212 213=over 4 214 215=item -f,--from I<from_encoding> 216 217Specifies the encoding you are converting from. Unlike B<iconv>, 218this option can be omitted. In such cases, the current locale is used. 219 220=item -t,--to I<to_encoding> 221 222Specifies the encoding you are converting to. Unlike B<iconv>, 223this option can be omitted. In such cases, the current locale is used. 224 225Therefore, when both -f and -t are omitted, B<piconv> just acts 226like B<cat>. 227 228=item -s,--string I<string> 229 230uses I<string> instead of file for the source of text. 231 232=item -l,--list 233 234Lists all available encodings, one per line, in case-insensitive 235order. Note that only the canonical names are listed; many aliases 236exist. For example, the names are case-insensitive, and many standard 237and common aliases work, such as "latin1" for "ISO-8859-1", or "ibm850" 238instead of "cp850", or "winlatin1" for "cp1252". See L<Encode::Supported> 239for a full discussion. 240 241=item -r,--resolve I<encoding_alias> 242 243Resolve I<encoding_alias> to Encode canonical encoding name. 244 245=item -C,--check I<N> 246 247Check the validity of the stream if I<N> = 1. When I<N> = -1, something 248interesting happens when it encounters an invalid character. 249 250=item -c 251 252Same as C<-C 1>. 253 254=item -p,--perlqq 255 256Transliterate characters missing in encoding to \x{HHHH} where HHHH is the 257hexadecimal Unicode code point. 258 259=item --htmlcref 260 261Transliterate characters missing in encoding to &#NNN; where NNN is the 262decimal Unicode code point. 263 264=item --xmlcref 265 266Transliterate characters missing in encoding to &#xHHHH; where HHHH is the 267hexadecimal Unicode code point. 268 269=item -h,--help 270 271Show usage. 272 273=item -D,--debug 274 275Invokes debugging mode. Primarily for Encode hackers. 276 277=item -S,--scheme I<scheme> 278 279Selects which scheme is to be used for conversion. Available schemes 280are as follows: 281 282=over 4 283 284=item from_to 285 286Uses Encode::from_to for conversion. This is the default. 287 288=item decode_encode 289 290Input strings are decode()d then encode()d. A straight two-step 291implementation. 292 293=item perlio 294 295The new perlIO layer is used. NI-S' favorite. 296 297You should use this option if you are using UTF-16 and others which 298linefeed is not $/. 299 300=back 301 302Like the I<-D> option, this is also for Encode hackers. 303 304=back 305 306=head1 SEE ALSO 307 308L<iconv(1)> 309L<locale(3)> 310L<Encode> 311L<Encode::Supported> 312L<Encode::Alias> 313L<PerlIO> 314 315=cut 316