1# $Id: encoding.pm,v 2.8 2009/02/15 17:44:13 dankogai Exp $ 2package encoding; 3our $VERSION = '2.6_01'; 4 5use Encode; 6use strict; 7use warnings; 8 9sub DEBUG () { 0 } 10 11BEGIN { 12 if ( ord("A") == 193 ) { 13 require Carp; 14 Carp::croak("encoding: pragma does not support EBCDIC platforms"); 15 } 16} 17 18our $HAS_PERLIO = 0; 19eval { require PerlIO::encoding }; 20unless ($@) { 21 $HAS_PERLIO = ( PerlIO::encoding->VERSION >= 0.02 ); 22} 23 24sub _exception { 25 my $name = shift; 26 $] > 5.008 and return 0; # 5.8.1 or higher then no 27 my %utfs = map { $_ => 1 } 28 qw(utf8 UCS-2BE UCS-2LE UTF-16 UTF-16BE UTF-16LE 29 UTF-32 UTF-32BE UTF-32LE); 30 $utfs{$name} or return 0; # UTFs or no 31 require Config; 32 Config->import(); 33 our %Config; 34 return $Config{perl_patchlevel} ? 0 : 1 # maintperl then no 35} 36 37sub in_locale { $^H & ( $locale::hint_bits || 0 ) } 38 39sub _get_locale_encoding { 40 my $locale_encoding; 41 42 # I18N::Langinfo isn't available everywhere 43 eval { 44 require I18N::Langinfo; 45 I18N::Langinfo->import(qw(langinfo CODESET)); 46 $locale_encoding = langinfo( CODESET() ); 47 }; 48 49 my $country_language; 50 51 no warnings 'uninitialized'; 52 53 if ( (not $locale_encoding) && in_locale() ) { 54 if ( $ENV{LC_ALL} =~ /^([^.]+)\.([^.@]+)(@.*)?$/ ) { 55 ( $country_language, $locale_encoding ) = ( $1, $2 ); 56 } 57 elsif ( $ENV{LANG} =~ /^([^.]+)\.([^.@]+)(@.*)?$/ ) { 58 ( $country_language, $locale_encoding ) = ( $1, $2 ); 59 } 60 61 # LANGUAGE affects only LC_MESSAGES only on glibc 62 } 63 elsif ( not $locale_encoding ) { 64 if ( $ENV{LC_ALL} =~ /\butf-?8\b/i 65 || $ENV{LANG} =~ /\butf-?8\b/i ) 66 { 67 $locale_encoding = 'utf8'; 68 } 69 70 # Could do more heuristics based on the country and language 71 # parts of LC_ALL and LANG (the parts before the dot (if any)), 72 # since we have Locale::Country and Locale::Language available. 73 # TODO: get a database of Language -> Encoding mappings 74 # (the Estonian database at http://www.eki.ee/letter/ 75 # would be excellent!) --jhi 76 } 77 if ( defined $locale_encoding 78 && lc($locale_encoding) eq 'euc' 79 && defined $country_language ) 80 { 81 if ( $country_language =~ /^ja_JP|japan(?:ese)?$/i ) { 82 $locale_encoding = 'euc-jp'; 83 } 84 elsif ( $country_language =~ /^ko_KR|korean?$/i ) { 85 $locale_encoding = 'euc-kr'; 86 } 87 elsif ( $country_language =~ /^zh_CN|chin(?:a|ese)$/i ) { 88 $locale_encoding = 'euc-cn'; 89 } 90 elsif ( $country_language =~ /^zh_TW|taiwan(?:ese)?$/i ) { 91 $locale_encoding = 'euc-tw'; 92 } 93 else { 94 require Carp; 95 Carp::croak( 96 "encoding: Locale encoding '$locale_encoding' too ambiguous" 97 ); 98 } 99 } 100 101 return $locale_encoding; 102} 103 104sub import { 105 my $class = shift; 106 my $name = shift; 107 if ( $name eq ':_get_locale_encoding' ) { # used by lib/open.pm 108 my $caller = caller(); 109 { 110 no strict 'refs'; 111 *{"${caller}::_get_locale_encoding"} = \&_get_locale_encoding; 112 } 113 return; 114 } 115 $name = _get_locale_encoding() if $name eq ':locale'; 116 my %arg = @_; 117 $name = $ENV{PERL_ENCODING} unless defined $name; 118 my $enc = find_encoding($name); 119 unless ( defined $enc ) { 120 require Carp; 121 Carp::croak("encoding: Unknown encoding '$name'"); 122 } 123 $name = $enc->name; # canonize 124 unless ( $arg{Filter} ) { 125 DEBUG and warn "_exception($name) = ", _exception($name); 126 _exception($name) or ${^ENCODING} = $enc; 127 $HAS_PERLIO or return 1; 128 } 129 else { 130 defined( ${^ENCODING} ) and undef ${^ENCODING}; 131 132 # implicitly 'use utf8' 133 require utf8; # to fetch $utf8::hint_bits; 134 $^H |= $utf8::hint_bits; 135 eval { 136 require Filter::Util::Call; 137 Filter::Util::Call->import; 138 filter_add( 139 sub { 140 my $status = filter_read(); 141 if ( $status > 0 ) { 142 $_ = $enc->decode( $_, 1 ); 143 DEBUG and warn $_; 144 } 145 $status; 146 } 147 ); 148 }; 149 $@ eq '' and DEBUG and warn "Filter installed"; 150 } 151 defined ${^UNICODE} and ${^UNICODE} != 0 and return 1; 152 for my $h (qw(STDIN STDOUT)) { 153 if ( $arg{$h} ) { 154 unless ( defined find_encoding( $arg{$h} ) ) { 155 require Carp; 156 Carp::croak( 157 "encoding: Unknown encoding for $h, '$arg{$h}'"); 158 } 159 eval { binmode( $h, ":raw :encoding($arg{$h})" ) }; 160 } 161 else { 162 unless ( exists $arg{$h} ) { 163 eval { 164 no warnings 'uninitialized'; 165 binmode( $h, ":raw :encoding($name)" ); 166 }; 167 } 168 } 169 if ($@) { 170 require Carp; 171 Carp::croak($@); 172 } 173 } 174 return 1; # I doubt if we need it, though 175} 176 177sub unimport { 178 no warnings; 179 undef ${^ENCODING}; 180 if ($HAS_PERLIO) { 181 binmode( STDIN, ":raw" ); 182 binmode( STDOUT, ":raw" ); 183 } 184 else { 185 binmode(STDIN); 186 binmode(STDOUT); 187 } 188 if ( $INC{"Filter/Util/Call.pm"} ) { 189 eval { filter_del() }; 190 } 191} 192 1931; 194__END__ 195 196=pod 197 198=head1 NAME 199 200encoding - allows you to write your script in non-ascii or non-utf8 201 202=head1 SYNOPSIS 203 204 use encoding "greek"; # Perl like Greek to you? 205 use encoding "euc-jp"; # Jperl! 206 207 # or you can even do this if your shell supports your native encoding 208 209 perl -Mencoding=latin2 -e'...' # Feeling centrally European? 210 perl -Mencoding=euc-kr -e'...' # Or Korean? 211 212 # more control 213 214 # A simple euc-cn => utf-8 converter 215 use encoding "euc-cn", STDOUT => "utf8"; while(<>){print}; 216 217 # "no encoding;" supported (but not scoped!) 218 no encoding; 219 220 # an alternate way, Filter 221 use encoding "euc-jp", Filter=>1; 222 # now you can use kanji identifiers -- in euc-jp! 223 224 # switch on locale - 225 # note that this probably means that unless you have a complete control 226 # over the environments the application is ever going to be run, you should 227 # NOT use the feature of encoding pragma allowing you to write your script 228 # in any recognized encoding because changing locale settings will wreck 229 # the script; you can of course still use the other features of the pragma. 230 use encoding ':locale'; 231 232=head1 ABSTRACT 233 234Let's start with a bit of history: Perl 5.6.0 introduced Unicode 235support. You could apply C<substr()> and regexes even to complex CJK 236characters -- so long as the script was written in UTF-8. But back 237then, text editors that supported UTF-8 were still rare and many users 238instead chose to write scripts in legacy encodings, giving up a whole 239new feature of Perl 5.6. 240 241Rewind to the future: starting from perl 5.8.0 with the B<encoding> 242pragma, you can write your script in any encoding you like (so long 243as the C<Encode> module supports it) and still enjoy Unicode support. 244This pragma achieves that by doing the following: 245 246=over 247 248=item * 249 250Internally converts all literals (C<q//,qq//,qr//,qw///, qx//>) from 251the encoding specified to utf8. In Perl 5.8.1 and later, literals in 252C<tr///> and C<DATA> pseudo-filehandle are also converted. 253 254=item * 255 256Changing PerlIO layers of C<STDIN> and C<STDOUT> to the encoding 257 specified. 258 259=back 260 261=head2 Literal Conversions 262 263You can write code in EUC-JP as follows: 264 265 my $Rakuda = "\xF1\xD1\xF1\xCC"; # Camel in Kanji 266 #<-char-><-char-> # 4 octets 267 s/\bCamel\b/$Rakuda/; 268 269And with C<use encoding "euc-jp"> in effect, it is the same thing as 270the code in UTF-8: 271 272 my $Rakuda = "\x{99F1}\x{99DD}"; # two Unicode Characters 273 s/\bCamel\b/$Rakuda/; 274 275=head2 PerlIO layers for C<STD(IN|OUT)> 276 277The B<encoding> pragma also modifies the filehandle layers of 278STDIN and STDOUT to the specified encoding. Therefore, 279 280 use encoding "euc-jp"; 281 my $message = "Camel is the symbol of perl.\n"; 282 my $Rakuda = "\xF1\xD1\xF1\xCC"; # Camel in Kanji 283 $message =~ s/\bCamel\b/$Rakuda/; 284 print $message; 285 286Will print "\xF1\xD1\xF1\xCC is the symbol of perl.\n", 287not "\x{99F1}\x{99DD} is the symbol of perl.\n". 288 289You can override this by giving extra arguments; see below. 290 291=head2 Implicit upgrading for byte strings 292 293By default, if strings operating under byte semantics and strings 294with Unicode character data are concatenated, the new string will 295be created by decoding the byte strings as I<ISO 8859-1 (Latin-1)>. 296 297The B<encoding> pragma changes this to use the specified encoding 298instead. For example: 299 300 use encoding 'utf8'; 301 my $string = chr(20000); # a Unicode string 302 utf8::encode($string); # now it's a UTF-8 encoded byte string 303 # concatenate with another Unicode string 304 print length($string . chr(20000)); 305 306Will print C<2>, because C<$string> is upgraded as UTF-8. Without 307C<use encoding 'utf8';>, it will print C<4> instead, since C<$string> 308is three octets when interpreted as Latin-1. 309 310=head2 Side effects 311 312If the C<encoding> pragma is in scope then the lengths returned are 313calculated from the length of C<$/> in Unicode characters, which is not 314always the same as the length of C<$/> in the native encoding. 315 316This pragma affects utf8::upgrade, but not utf8::downgrade. 317 318=head1 FEATURES THAT REQUIRE 5.8.1 319 320Some of the features offered by this pragma requires perl 5.8.1. Most 321of these are done by Inaba Hiroto. Any other features and changes 322are good for 5.8.0. 323 324=over 325 326=item "NON-EUC" doublebyte encodings 327 328Because perl needs to parse script before applying this pragma, such 329encodings as Shift_JIS and Big-5 that may contain '\' (BACKSLASH; 330\x5c) in the second byte fails because the second byte may 331accidentally escape the quoting character that follows. Perl 5.8.1 332or later fixes this problem. 333 334=item tr// 335 336C<tr//> was overlooked by Perl 5 porters when they released perl 5.8.0 337See the section below for details. 338 339=item DATA pseudo-filehandle 340 341Another feature that was overlooked was C<DATA>. 342 343=back 344 345=head1 USAGE 346 347=over 4 348 349=item use encoding [I<ENCNAME>] ; 350 351Sets the script encoding to I<ENCNAME>. And unless ${^UNICODE} 352exists and non-zero, PerlIO layers of STDIN and STDOUT are set to 353":encoding(I<ENCNAME>)". 354 355Note that STDERR WILL NOT be changed. 356 357Also note that non-STD file handles remain unaffected. Use C<use 358open> or C<binmode> to change layers of those. 359 360If no encoding is specified, the environment variable L<PERL_ENCODING> 361is consulted. If no encoding can be found, the error C<Unknown encoding 362'I<ENCNAME>'> will be thrown. 363 364=item use encoding I<ENCNAME> [ STDIN =E<gt> I<ENCNAME_IN> ...] ; 365 366You can also individually set encodings of STDIN and STDOUT via the 367C<< STDIN => I<ENCNAME> >> form. In this case, you cannot omit the 368first I<ENCNAME>. C<< STDIN => undef >> turns the IO transcoding 369completely off. 370 371When ${^UNICODE} exists and non-zero, these options will completely 372ignored. ${^UNICODE} is a variable introduced in perl 5.8.1. See 373L<perlrun> see L<perlvar/"${^UNICODE}"> and L<perlrun/"-C"> for 374details (perl 5.8.1 and later). 375 376=item use encoding I<ENCNAME> Filter=E<gt>1; 377 378This turns the encoding pragma into a source filter. While the 379default approach just decodes interpolated literals (in qq() and 380qr()), this will apply a source filter to the entire source code. See 381L</"The Filter Option"> below for details. 382 383=item no encoding; 384 385Unsets the script encoding. The layers of STDIN, STDOUT are 386reset to ":raw" (the default unprocessed raw stream of bytes). 387 388=back 389 390=head1 The Filter Option 391 392The magic of C<use encoding> is not applied to the names of 393identifiers. In order to make C<${"\x{4eba}"}++> ($human++, where human 394is a single Han ideograph) work, you still need to write your script 395in UTF-8 -- or use a source filter. That's what 'Filter=>1' does. 396 397What does this mean? Your source code behaves as if it is written in 398UTF-8 with 'use utf8' in effect. So even if your editor only supports 399Shift_JIS, for example, you can still try examples in Chapter 15 of 400C<Programming Perl, 3rd Ed.>. For instance, you can use UTF-8 401identifiers. 402 403This option is significantly slower and (as of this writing) non-ASCII 404identifiers are not very stable WITHOUT this option and with the 405source code written in UTF-8. 406 407=head2 Filter-related changes at Encode version 1.87 408 409=over 410 411=item * 412 413The Filter option now sets STDIN and STDOUT like non-filter options. 414And C<< STDIN=>I<ENCODING> >> and C<< STDOUT=>I<ENCODING> >> work like 415non-filter version. 416 417=item * 418 419C<use utf8> is implicitly declared so you no longer have to C<use 420utf8> to C<${"\x{4eba}"}++>. 421 422=back 423 424=head1 CAVEATS 425 426=head2 NOT SCOPED 427 428The pragma is a per script, not a per block lexical. Only the last 429C<use encoding> or C<no encoding> matters, and it affects 430B<the whole script>. However, the <no encoding> pragma is supported and 431B<use encoding> can appear as many times as you want in a given script. 432The multiple use of this pragma is discouraged. 433 434By the same reason, the use this pragma inside modules is also 435discouraged (though not as strongly discouraged as the case above. 436See below). 437 438If you still have to write a module with this pragma, be very careful 439of the load order. See the codes below; 440 441 # called module 442 package Module_IN_BAR; 443 use encoding "bar"; 444 # stuff in "bar" encoding here 445 1; 446 447 # caller script 448 use encoding "foo" 449 use Module_IN_BAR; 450 # surprise! use encoding "bar" is in effect. 451 452The best way to avoid this oddity is to use this pragma RIGHT AFTER 453other modules are loaded. i.e. 454 455 use Module_IN_BAR; 456 use encoding "foo"; 457 458=head2 DO NOT MIX MULTIPLE ENCODINGS 459 460Notice that only literals (string or regular expression) having only 461legacy code points are affected: if you mix data like this 462 463 \xDF\x{100} 464 465the data is assumed to be in (Latin 1 and) Unicode, not in your native 466encoding. In other words, this will match in "greek": 467 468 "\xDF" =~ /\x{3af}/ 469 470but this will not 471 472 "\xDF\x{100}" =~ /\x{3af}\x{100}/ 473 474since the C<\xDF> (ISO 8859-7 GREEK SMALL LETTER IOTA WITH TONOS) on 475the left will B<not> be upgraded to C<\x{3af}> (Unicode GREEK SMALL 476LETTER IOTA WITH TONOS) because of the C<\x{100}> on the left. You 477should not be mixing your legacy data and Unicode in the same string. 478 479This pragma also affects encoding of the 0x80..0xFF code point range: 480normally characters in that range are left as eight-bit bytes (unless 481they are combined with characters with code points 0x100 or larger, 482in which case all characters need to become UTF-8 encoded), but if 483the C<encoding> pragma is present, even the 0x80..0xFF range always 484gets UTF-8 encoded. 485 486After all, the best thing about this pragma is that you don't have to 487resort to \x{....} just to spell your name in a native encoding. 488So feel free to put your strings in your encoding in quotes and 489regexes. 490 491=head2 tr/// with ranges 492 493The B<encoding> pragma works by decoding string literals in 494C<q//,qq//,qr//,qw///, qx//> and so forth. In perl 5.8.0, this 495does not apply to C<tr///>. Therefore, 496 497 use encoding 'euc-jp'; 498 #.... 499 $kana =~ tr/\xA4\xA1-\xA4\xF3/\xA5\xA1-\xA5\xF3/; 500 # -------- -------- -------- -------- 501 502Does not work as 503 504 $kana =~ tr/\x{3041}-\x{3093}/\x{30a1}-\x{30f3}/; 505 506=over 507 508=item Legend of characters above 509 510 utf8 euc-jp charnames::viacode() 511 ----------------------------------------- 512 \x{3041} \xA4\xA1 HIRAGANA LETTER SMALL A 513 \x{3093} \xA4\xF3 HIRAGANA LETTER N 514 \x{30a1} \xA5\xA1 KATAKANA LETTER SMALL A 515 \x{30f3} \xA5\xF3 KATAKANA LETTER N 516 517=back 518 519This counterintuitive behavior has been fixed in perl 5.8.1. 520 521=head3 workaround to tr///; 522 523In perl 5.8.0, you can work around as follows; 524 525 use encoding 'euc-jp'; 526 # .... 527 eval qq{ \$kana =~ tr/\xA4\xA1-\xA4\xF3/\xA5\xA1-\xA5\xF3/ }; 528 529Note the C<tr//> expression is surrounded by C<qq{}>. The idea behind 530is the same as classic idiom that makes C<tr///> 'interpolate'. 531 532 tr/$from/$to/; # wrong! 533 eval qq{ tr/$from/$to/ }; # workaround. 534 535Nevertheless, in case of B<encoding> pragma even C<q//> is affected so 536C<tr///> not being decoded was obviously against the will of Perl5 537Porters so it has been fixed in Perl 5.8.1 or later. 538 539=head1 EXAMPLE - Greekperl 540 541 use encoding "iso 8859-7"; 542 543 # \xDF in ISO 8859-7 (Greek) is \x{3af} in Unicode. 544 545 $a = "\xDF"; 546 $b = "\x{100}"; 547 548 printf "%#x\n", ord($a); # will print 0x3af, not 0xdf 549 550 $c = $a . $b; 551 552 # $c will be "\x{3af}\x{100}", not "\x{df}\x{100}". 553 554 # chr() is affected, and ... 555 556 print "mega\n" if ord(chr(0xdf)) == 0x3af; 557 558 # ... ord() is affected by the encoding pragma ... 559 560 print "tera\n" if ord(pack("C", 0xdf)) == 0x3af; 561 562 # ... as are eq and cmp ... 563 564 print "peta\n" if "\x{3af}" eq pack("C", 0xdf); 565 print "exa\n" if "\x{3af}" cmp pack("C", 0xdf) == 0; 566 567 # ... but pack/unpack C are not affected, in case you still 568 # want to go back to your native encoding 569 570 print "zetta\n" if unpack("C", (pack("C", 0xdf))) == 0xdf; 571 572=head1 KNOWN PROBLEMS 573 574=over 575 576=item literals in regex that are longer than 127 bytes 577 578For native multibyte encodings (either fixed or variable length), 579the current implementation of the regular expressions may introduce 580recoding errors for regular expression literals longer than 127 bytes. 581 582=item EBCDIC 583 584The encoding pragma is not supported on EBCDIC platforms. 585(Porters who are willing and able to remove this limitation are 586welcome.) 587 588=item format 589 590This pragma doesn't work well with format because PerlIO does not 591get along very well with it. When format contains non-ascii 592characters it prints funny or gets "wide character warnings". 593To understand it, try the code below. 594 595 # Save this one in utf8 596 # replace *non-ascii* with a non-ascii string 597 my $camel; 598 format STDOUT = 599 *non-ascii*@>>>>>>> 600 $camel 601 . 602 $camel = "*non-ascii*"; 603 binmode(STDOUT=>':encoding(utf8)'); # bang! 604 write; # funny 605 print $camel, "\n"; # fine 606 607Without binmode this happens to work but without binmode, print() 608fails instead of write(). 609 610At any rate, the very use of format is questionable when it comes to 611unicode characters since you have to consider such things as character 612width (i.e. double-width for ideographs) and directions (i.e. BIDI for 613Arabic and Hebrew). 614 615=item Thread safety 616 617C<use encoding ...> is not thread-safe (i.e., do not use in threaded 618applications). 619 620=back 621 622=head2 The Logic of :locale 623 624The logic of C<:locale> is as follows: 625 626=over 4 627 628=item 1. 629 630If the platform supports the langinfo(CODESET) interface, the codeset 631returned is used as the default encoding for the open pragma. 632 633=item 2. 634 635If 1. didn't work but we are under the locale pragma, the environment 636variables LC_ALL and LANG (in that order) are matched for encodings 637(the part after C<.>, if any), and if any found, that is used 638as the default encoding for the open pragma. 639 640=item 3. 641 642If 1. and 2. didn't work, the environment variables LC_ALL and LANG 643(in that order) are matched for anything looking like UTF-8, and if 644any found, C<:utf8> is used as the default encoding for the open 645pragma. 646 647=back 648 649If your locale environment variables (LC_ALL, LC_CTYPE, LANG) 650contain the strings 'UTF-8' or 'UTF8' (case-insensitive matching), 651the default encoding of your STDIN, STDOUT, and STDERR, and of 652B<any subsequent file open>, is UTF-8. 653 654=head1 HISTORY 655 656This pragma first appeared in Perl 5.8.0. For features that require 6575.8.1 and better, see above. 658 659The C<:locale> subpragma was implemented in 2.01, or Perl 5.8.6. 660 661=head1 SEE ALSO 662 663L<perlunicode>, L<Encode>, L<open>, L<Filter::Util::Call>, 664 665Ch. 15 of C<Programming Perl (3rd Edition)> 666by Larry Wall, Tom Christiansen, Jon Orwant; 667O'Reilly & Associates; ISBN 0-596-00027-8 668 669=cut 670