# $Id: encoding.pm,v 3.00 2020/04/19 10:56:28 dankogai Exp $
package encoding;
our $VERSION = sprintf "%d.%02d", q$Revision: 3.00 $ =~ /(\d+)/g;

use Encode;
use strict;
use warnings;
use Config;

use constant {
    # True when the PERL_ENCODE_DEBUG environment variable is set.
    DEBUG => !!$ENV{PERL_ENCODE_DEBUG},
    # True when PerlIO::encoding 0.02 or later can be loaded.
    HAS_PERLIO => eval { require PerlIO::encoding; PerlIO::encoding->VERSION(0.02) },
    PERL_5_21_7 => $^V && $^V ge v5.21.7, # lexically scoped
};

# _exception($name)
#
# Returns 1 when the script encoding must NOT be installed for $name,
# otherwise 0.  On any perl newer than 5.008 this is always 0.  On 5.8.0
# it is 1 only for the UTF family of encodings listed below, and only
# when $Config{perl_patchlevel} is false.
sub _exception {
    my $name = shift;
    $] > 5.008 and return 0;    # 5.8.1 or higher then no
    my %utfs = map { $_ => 1 }
      qw(utf8 UCS-2BE UCS-2LE UTF-16 UTF-16BE UTF-16LE
      UTF-32 UTF-32BE UTF-32LE);
    $utfs{$name} or return 0;    # UTFs or no
    require Config;
    Config->import();
    our %Config;
    return $Config{perl_patchlevel} ? 0 : 1    # maintperl then no
}

# in_locale()
#
# Returns the bits of $^H that overlap $locale::hint_bits (0 when
# $locale::hint_bits is unset or zero).
sub in_locale { $^H & ( $locale::hint_bits || 0 ) }

# _get_locale_encoding()
#
# Works out an encoding name from the runtime environment and returns it,
# or undef when nothing could be determined.  Sources are tried in order:
#
#   1. On MSWin32: the console output code page, then the ANSI code page.
#   2. I18N::Langinfo's CODESET, canonicalized through find_encoding().
#   3. The "language.encoding" form of the LC_CTYPE locale reported by
#      POSIX::setlocale(), with a bare "euc" refined by country/language.
#
# Every probe runs inside eval, so a missing module or a failing call
# simply moves on to the next source.
sub _get_locale_encoding {
    my $locale_encoding;

    if ($^O eq 'MSWin32') {
        # Flat list of (file to require => function to call) pairs,
        # consumed two elements at a time.
        my @tries = (
            # First try to get the OutputCP. This will work only if we
            # are attached to a console
            'Win32.pm' => 'Win32::GetConsoleOutputCP',
            'Win32/Console.pm' => 'Win32::Console::OutputCP',
            # If above failed, this means that we are a GUI app
            # Let's assume that the ANSI codepage is what matters
            'Win32.pm' => 'Win32::GetACP',
        );
        while (@tries) {
            my $cp = eval {
                require $tries[0];
                no strict 'refs';
                &{$tries[1]}()
            };
            if ($cp) {
                if ($cp == 65001) { # Code page for UTF-8
                    $locale_encoding = 'UTF-8';
                } else {
                    $locale_encoding = 'cp' . $cp;
                }
                return $locale_encoding;
            }
            splice(@tries, 0, 2)
        }
    }

    # I18N::Langinfo isn't available everywhere
    $locale_encoding = eval {
        require I18N::Langinfo;
        find_encoding(
            I18N::Langinfo::langinfo( I18N::Langinfo::CODESET() )
        )->name
    };
    return $locale_encoding if defined $locale_encoding;

    eval {
        require POSIX;
        # Get the current locale
        # Remember that MSVCRT impl is quite different from Unixes
        my $locale = POSIX::setlocale(POSIX::LC_CTYPE());
        # Guard against an undefined result so that the match below cannot
        # raise an "uninitialized value" warning.
        if ( defined $locale
            and $locale =~ /^([^.]+)\.([^.@]+)(?:@.*)?$/ )
        {
            my $country_language;
            ( $country_language, $locale_encoding ) = ( $1, $2 );

            # Could do more heuristics based on the country and language
            # since we have Locale::Country and Locale::Language available.
            # TODO: get a database of Language -> Encoding mappings
            # (the Estonian database at http://www.eki.ee/letter/
            # would be excellent!) --jhi
            if (lc($locale_encoding) eq 'euc') {
                if ( $country_language =~ /^ja_JP|japan(?:ese)?$/i ) {
                    $locale_encoding = 'euc-jp';
                }
                elsif ( $country_language =~ /^ko_KR|korean?$/i ) {
                    $locale_encoding = 'euc-kr';
                }
                elsif ( $country_language =~ /^zh_CN|chin(?:a|ese)$/i ) {
                    $locale_encoding = 'euc-cn';
                }
                elsif ( $country_language =~ /^zh_TW|taiwan(?:ese)?$/i ) {
                    $locale_encoding = 'euc-tw';
                }
                else {
                    # NOTE: this croak is raised inside the surrounding
                    # eval, so it is trapped there; the function then
                    # falls through and returns the bare 'euc' name.
                    require Carp;
                    Carp::croak(
                        "encoding: Locale encoding '$locale_encoding' too ambiguous"
                    );
                }
            }
        }
    };

    return $locale_encoding;
}

# import($class, $name, %arg)
#
# Implements "use encoding NAME, ...".
#
#   $name  Encoding name.  ':locale' asks _get_locale_encoding() for it;
#          ':_get_locale_encoding' only exports that helper into the
#          caller's package and returns.
#   %arg   Filter => true  selects the source-filter mode;
#          STDIN / STDOUT  select a per-handle encoding (a key that
#          exists with a false value leaves that handle untouched).
#
# Croaks on EBCDIC platforms, when no encoding is given, when an encoding
# is unknown to Encode, and - in non-Filter mode - on perl 5.25.3 or later
# unless $Config{usecperl} is set.  Returns 1 on success.
sub import {

    if ( ord("A") == 193 ) {
        require Carp;
        Carp::croak("encoding: pragma does not support EBCDIC platforms");
    }

    # Deprecation message for perl 5.17+ (unless usecperl), or 0 for none.
    my $deprecate =
        ($] >= 5.017 and !$Config{usecperl})
        ? "Use of the encoding pragma is deprecated" : 0;

    my $class = shift;
    my $name  = shift;
    if (!$name){
        require Carp;
        Carp::croak("encoding: no encoding specified.");
    }
    if ( $name eq ':_get_locale_encoding' ) {    # used by lib/open.pm
        my $caller = caller();
        {
            no strict 'refs';
            *{"${caller}::_get_locale_encoding"} = \&_get_locale_encoding;
        }
        return;
    }
    $name = _get_locale_encoding() if $name eq ':locale';
    BEGIN { strict->unimport('hashpairs') if $] >= 5.027 and $^V =~ /c$/; }
    my %arg = @_;
    # _get_locale_encoding() may have returned undef; fall back to the
    # PERL_ENCODING environment variable in that case.
    $name = $ENV{PERL_ENCODING} unless defined $name;
    my $enc = find_encoding($name);
    unless ( defined $enc ) {
        require Carp;
        Carp::croak("encoding: Unknown encoding '$name'");
    }
    $name = $enc->name;    # canonize
    unless ( $arg{Filter} ) {
        if ($] >= 5.025003 and !$Config{usecperl}) {
            require Carp;
            Carp::croak("The encoding pragma is no longer supported. Check cperl");
        }
        warnings::warnif("deprecated",$deprecate) if $deprecate;

        DEBUG and warn "_exception($name) = ", _exception($name);
        if (! _exception($name)) {
            if (!PERL_5_21_7) {
                ${^ENCODING} = $enc;
            }
            else {
                # Starting with 5.21.7, this pragma uses a shadow variable
                # designed explicitly for it, ${^E_NCODING}, to enforce
                # lexical scope; instead of ${^ENCODING}.
                $^H{'encoding'} = 1;
                ${^E_NCODING} = $enc;
            }
        }
        # Without PerlIO::encoding the STDIN/STDOUT layers cannot be set.
        if (! HAS_PERLIO ) {
            return 1;
        }
    }
    else {
        warnings::warnif("deprecated",$deprecate) if $deprecate;

        defined( ${^ENCODING} ) and undef ${^ENCODING};
        undef ${^E_NCODING} if PERL_5_21_7;

        # implicitly 'use utf8'
        require utf8;      # to fetch $utf8::hint_bits;
        $^H |= $utf8::hint_bits;

        require Filter::Util::Call;
        Filter::Util::Call->import;
        # Decode each chunk of source read by the filter with $enc.
        filter_add(
            sub {
                my $status = filter_read();
                if ( $status > 0 ) {
                    $_ = $enc->decode( $_, 1 );
                    DEBUG and warn $_;
                }
                $status;
            }
        );
    }
    # A non-zero ${^UNICODE} leaves the STDIN/STDOUT layers alone.
    defined ${^UNICODE} and ${^UNICODE} != 0 and return 1;
    for my $h (qw(STDIN STDOUT)) {
        if ( $arg{$h} ) {
            unless ( defined find_encoding( $arg{$h} ) ) {
                require Carp;
                Carp::croak(
                    "encoding: Unknown encoding for $h, '$arg{$h}'");
            }
            binmode( $h, ":raw :encoding($arg{$h})" );
        }
        else {
            # Key absent: use the script encoding.  Key present but false
            # (e.g. STDIN => undef): leave the handle as it is.
            unless ( exists $arg{$h} ) {
                no warnings 'uninitialized';
                binmode( $h, ":raw :encoding($name)" );
            }
        }
    }
    return 1;    # I doubt if we need it, though
}

# unimport()
#
# Implements "no encoding": clears ${^ENCODING} (and ${^E_NCODING} on
# 5.21.7+), resets STDIN and STDOUT with binmode, and, if
# Filter::Util::Call has been loaded, attempts filter_del() inside eval.
sub unimport {
    no warnings;
    undef ${^ENCODING};
    undef ${^E_NCODING} if PERL_5_21_7;
    if (HAS_PERLIO) {
        binmode( STDIN,  ":raw" );
        binmode( STDOUT, ":raw" );
    }
    else {
        binmode(STDIN);
        binmode(STDOUT);
    }
    if ( $INC{"Filter/Util/Call.pm"} ) {
        eval { filter_del() };
    }
}

1;
__END__

=pod

=head1 NAME

encoding - allows you to write your script in non-ASCII and non-UTF-8

=head1 WARNING

This module has been deprecated since perl v5.18.  See L</DESCRIPTION> and
L</BUGS>.

=head1 SYNOPSIS

  use encoding "greek";  # Perl like Greek to you?
  use encoding "euc-jp"; # Jperl!

  # or you can even do this if your shell supports your native encoding

  perl -Mencoding=latin2 -e'...' # Feeling centrally European?
  perl -Mencoding=euc-kr -e'...' # Or Korean?

  # more control

  # A simple euc-cn => utf-8 converter
  use encoding "euc-cn", STDOUT => "utf8";  while(<>){print};

  # "no encoding;" supported
  no encoding;

  # an alternate way, Filter
  use encoding "euc-jp", Filter=>1;
  # now you can use kanji identifiers -- in euc-jp!

  # encode based on the current locale - specialized purposes only;
  # fraught with danger!!
  use encoding ':locale';

=head1 DESCRIPTION

This pragma is used to enable a Perl script to be written in encodings that
aren't strictly ASCII nor UTF-8.  It translates all or portions of the Perl
program script from a given encoding into UTF-8, and changes the PerlIO layers
of C<STDIN> and C<STDOUT> to the encoding specified.

This pragma dates from the days when UTF-8-enabled editors were uncommon.  But
that was long ago, and the need for it is greatly diminished.  That, coupled
with the fact that it doesn't work with threads, along with other problems,
(see L</BUGS>) have led to its being deprecated.  It is planned to remove this
pragma in a future Perl version.  New code should be written in UTF-8, and the
C<use utf8> pragma used instead (see L<perluniintro> and L<utf8> for details).
Old code should be converted to UTF-8, via something like the recipe in the
L</SYNOPSIS> (though this simple approach may require manual adjustments
afterwards).

If UTF-8 is not an option, it is recommended that one use a simple source
filter, such as that provided by L<Filter::Encoding> on CPAN or this
pragma's own C<Filter> option (see below).

The only legitimate use of this pragma is almost certainly just one per file,
near the top, with file scope, as the file is likely going to only be written
in one encoding.  Further restrictions apply in Perls before v5.22 (see
L</Prior to Perl v5.22>).

There are two basic modes of operation (plus turning it off):

=over 4

=item C<use encoding ['I<ENCNAME>'] ;>

Please note: This mode of operation is no longer supported as of Perl
v5.26.

This is the normal operation.  It translates various literals encountered in
the Perl source file from the encoding I<ENCNAME> into UTF-8, and similarly
converts character code points.  This is used when the script is a combination
of ASCII (for the variable names and punctuation, I<etc>), but the literal
data is in the specified encoding.

I<ENCNAME> is optional.  If omitted, the encoding specified in the environment
variable L<C<PERL_ENCODING>|perlrun/PERL_ENCODING> is used.  If this isn't
set, or the resolved-to encoding is not known to C<L<Encode>>, the error
C<Unknown encoding 'I<ENCNAME>'> will be thrown.

Starting in Perl v5.8.6 (C<Encode> version 2.0.1), I<ENCNAME> may be the
name C<:locale>.
This is for very specialized applications, and is documented
in L</The C<:locale> sub-pragma> below.

The literals that are converted are C<q//, qq//, qr//, qw///, qx//>, and
starting in v5.8.1, C<tr///>.  Operations that do conversions include C<chr>,
C<ord>, C<utf8::upgrade> (but not C<utf8::downgrade>), and C<chomp>.

Also starting in v5.8.1, the C<DATA> pseudo-filehandle is translated from the
encoding into UTF-8.

For example, you can write code in EUC-JP as follows:

  my $Rakuda = "\xF1\xD1\xF1\xCC"; # Camel in Kanji
               #<-char-><-char->   # 4 octets
  s/\bCamel\b/$Rakuda/;

And with C<use encoding "euc-jp"> in effect, it is the same thing as
that code in UTF-8:

  my $Rakuda = "\x{99F1}\x{99DD}"; # two Unicode Characters
  s/\bCamel\b/$Rakuda/;

See L</EXAMPLE - Greekperl> below for a more complete example.

Unless C<${^UNICODE}> (available starting in v5.8.2) exists and is non-zero, the
PerlIO layers of C<STDIN> and C<STDOUT> are set to "C<:encoding(I<ENCNAME>)>".
Therefore,

  use encoding "euc-jp";
  my $message = "Camel is the symbol of perl.\n";
  my $Rakuda = "\xF1\xD1\xF1\xCC"; # Camel in Kanji
  $message =~ s/\bCamel\b/$Rakuda/;
  print $message;

will print

 "\xF1\xD1\xF1\xCC is the symbol of perl.\n"

not

 "\x{99F1}\x{99DD} is the symbol of perl.\n"

You can override this by giving extra arguments; see below.

Note that C<STDERR> WILL NOT be changed, regardless.

Also note that non-STD file handles remain unaffected.  Use C<use
open> or C<binmode> to change the layers of those.

=item C<use encoding I<ENCNAME>, Filter=E<gt>1;>

This operates as above, but the C<Filter> argument with a non-zero
value causes the entire script, and not just literals, to be translated from
the encoding into UTF-8.  This allows identifiers in the source to be in that
encoding as well.  (Problems may occur if the encoding is not a superset of
ASCII; imagine all your semi-colons being translated into something
different.)  One can use this form to make

 ${"\x{4eba}"}++

work.  (This is equivalent to C<$I<human>++>, where I<human> is a single Han
ideograph).

This effectively means that your source code behaves as if it were written in
UTF-8 with C<'use utf8'> in effect.
So even if your editor only supports
Shift_JIS, for example, you can still try examples in Chapter 15 of
C<Programming Perl, 3rd Ed.>.

This option is significantly slower than the other one.

=item C<no encoding;>

Unsets the script encoding.  The layers of C<STDIN>, C<STDOUT> are
reset to "C<:raw>" (the default unprocessed raw stream of bytes).

=back

=head1 OPTIONS

=head2 Setting C<STDIN> and/or C<STDOUT> individually

The encodings of C<STDIN> and C<STDOUT> are individually settable by parameters to
the pragma:

 use encoding 'euc-tw', STDIN => 'greek'  ...;

In this case, you cannot omit the first I<ENCNAME>.  C<< STDIN => undef >>
turns the I/O transcoding completely off for that filehandle.

When C<${^UNICODE}> (available starting in v5.8.2) exists and is non-zero,
these options will be completely ignored.  See L<perlvar/C<${^UNICODE}>> and
L<"C<-C>" in perlrun|perlrun/-C [numberE<sol>list]> for details.

=head2 The C<:locale> sub-pragma

Starting in v5.8.6, the encoding name may be C<:locale>.  This means that the
encoding is taken from the current locale, and not hard-coded by the pragma.
Since a script really can only be encoded in exactly one encoding, this option
is dangerous.
It makes sense only if the script itself is written in ASCII,
and all the possible locales that will be in use when the script is executed
are supersets of ASCII.  That means that the script itself doesn't get
changed, but the I/O handles have the specified encoding added, and the
operations like C<chr> and C<ord> use that encoding.

The logic of finding which locale C<:locale> uses is as follows:

=over 4

=item 1.

If the platform supports the C<langinfo(CODESET)> interface, the codeset
returned is used as the default encoding for the open pragma.

=item 2.

If 1. didn't work but we are under the locale pragma, the environment
variables C<LC_ALL> and C<LANG> (in that order) are matched for encodings
(the part after "C<.>", if any), and if any found, that is used
as the default encoding for the open pragma.

=item 3.

If 1. and 2. didn't work, the environment variables C<LC_ALL> and C<LANG>
(in that order) are matched for anything looking like UTF-8, and if
any found, C<:utf8> is used as the default encoding for the open
pragma.

=back

If your locale environment variables (C<LC_ALL>, C<LC_CTYPE>, C<LANG>)
contain the strings 'UTF-8' or 'UTF8' (case-insensitive matching),
the default encoding of your C<STDIN>, C<STDOUT>, and C<STDERR>, and of
B<any subsequent file open>, is UTF-8.

=head1 CAVEATS

=head2 SIDE EFFECTS

=over

=item *

If the C<encoding> pragma is in scope then the lengths returned are
calculated from the length of C<$/> in Unicode characters, which is not
always the same as the length of C<$/> in the native encoding.

=item *

Without this pragma, if strings operating under byte semantics and strings
with Unicode character data are concatenated, the new string will
be created by decoding the byte strings as I<ISO 8859-1 (Latin-1)>.

The B<encoding> pragma changes this to use the specified encoding
instead.  For example:

    use encoding 'utf8';
    my $string = chr(20000); # a Unicode string
    utf8::encode($string);   # now it's a UTF-8 encoded byte string
    # concatenate with another Unicode string
    print length($string . chr(20000));

Will print C<2>, because C<$string> is upgraded as UTF-8.  Without
C<use encoding 'utf8';>, it will print C<4> instead, since C<$string>
is three octets when interpreted as Latin-1.

=back

=head2 DO NOT MIX MULTIPLE ENCODINGS

Notice that only literals (string or regular expression) having only
legacy code points are affected: if you mix data like this

    \x{100}\xDF
    \xDF\x{100}

the data is assumed to be in (Latin 1 and) Unicode, not in your native
encoding.  In other words, this will match in "greek":

    "\xDF" =~ /\x{3af}/

but this will not

    "\xDF\x{100}" =~ /\x{3af}\x{100}/

since the C<\xDF> (ISO 8859-7 GREEK SMALL LETTER IOTA WITH TONOS) on
the left will B<not> be upgraded to C<\x{3af}> (Unicode GREEK SMALL
LETTER IOTA WITH TONOS) because of the C<\x{100}> on the left.  You
should not be mixing your legacy data and Unicode in the same string.

This pragma also affects encoding of the 0x80..0xFF code point range:
normally characters in that range are left as eight-bit bytes (unless
they are combined with characters with code points 0x100 or larger,
in which case all characters need to become UTF-8 encoded), but if
the C<encoding> pragma is present, even the 0x80..0xFF range always
gets UTF-8 encoded.

After all, the best thing about this pragma is that you don't have to
resort to \x{....} just to spell your name in a native encoding.
So feel free to put your strings in your encoding in quotes and
regexes.

=head2 Prior to Perl v5.22

The pragma was a per script, not a per block lexical.  Only the last
C<use encoding> or C<no encoding> mattered, and it affected
B<the whole script>.  However, the C<no encoding> pragma was supported and
C<use encoding> could appear as many times as you want in a given script
(though only the last was effective).

Since the scope wasn't lexical, other modules' use of C<chr>, C<ord>, I<etc.>
were affected.  This leads to spooky, incorrect action at a distance that is
hard to debug.

This means you would have to be very careful of the load order:

  # called module
  package Module_IN_BAR;
  use encoding "bar";
  # stuff in "bar" encoding here
  1;

  # caller script
  use encoding "foo";
  use Module_IN_BAR;
  # surprise! use encoding "bar" is in effect.

The best way to avoid this oddity is to use this pragma RIGHT AFTER
other modules are loaded.  i.e.

  use Module_IN_BAR;
  use encoding "foo";

=head2 Prior to Encode version 1.87

=over

=item *

C<STDIN> and C<STDOUT> were not set under the filter option.
And C<< STDIN=>I<ENCODING> >> and C<< STDOUT=>I<ENCODING> >> didn't work like
non-filter version.

=item *

C<use utf8> wasn't implicitly declared so you have to C<use utf8> to do

 ${"\x{4eba}"}++

=back

=head2 Prior to Perl v5.8.1

=over

=item "NON-EUC" doublebyte encodings

Because perl needs to parse the script before applying this pragma, such
encodings as Shift_JIS and Big-5 that may contain C<'\'> (BACKSLASH;
C<\x5c>) in the second byte fail because the second byte may
accidentally escape the quoting character that follows.

=item C<tr///>

The B<encoding> pragma works by decoding string literals in
C<q//,qq//,qr//,qw///, qx//> and so forth.  In perl v5.8.0, this
does not apply to C<tr///>.  Therefore,

    use encoding 'euc-jp';
    #....
    $kana =~ tr/\xA4\xA1-\xA4\xF3/\xA5\xA1-\xA5\xF3/;
    #           -------- -------- -------- --------

Does not work as

    $kana =~ tr/\x{3041}-\x{3093}/\x{30a1}-\x{30f3}/;

=over

=item Legend of characters above

  utf8     euc-jp   charnames::viacode()
  -----------------------------------------
  \x{3041} \xA4\xA1 HIRAGANA LETTER SMALL A
  \x{3093} \xA4\xF3 HIRAGANA LETTER N
  \x{30a1} \xA5\xA1 KATAKANA LETTER SMALL A
  \x{30f3} \xA5\xF3 KATAKANA LETTER N

=back

This counterintuitive behavior has been fixed in perl v5.8.1.

In perl v5.8.0, you can work around this as follows:

    use encoding 'euc-jp';
    #  ....
    eval qq{ \$kana =~ tr/\xA4\xA1-\xA4\xF3/\xA5\xA1-\xA5\xF3/ };

Note the C<tr//> expression is surrounded by C<qq{}>.  The idea behind
this is the same as the classic idiom that makes C<tr///> 'interpolate':

   tr/$from/$to/;            # wrong!
   eval qq{ tr/$from/$to/ }; # workaround.

=back

=head1 EXAMPLE - Greekperl

    use encoding "iso 8859-7";

    # \xDF in ISO 8859-7 (Greek) is \x{3af} in Unicode.

    $a = "\xDF";
    $b = "\x{100}";

    printf "%#x\n", ord($a); # will print 0x3af, not 0xdf

    $c = $a . $b;

    # $c will be "\x{3af}\x{100}", not "\x{df}\x{100}".

    # chr() is affected, and ...

    print "mega\n"  if ord(chr(0xdf)) == 0x3af;

    # ... ord() is affected by the encoding pragma ...

    print "tera\n" if ord(pack("C", 0xdf)) == 0x3af;

    # ... as are eq and cmp ...

    print "peta\n" if "\x{3af}" eq  pack("C", 0xdf);
    print "exa\n"  if "\x{3af}" cmp pack("C", 0xdf) == 0;

    # ... but pack/unpack C are not affected, in case you still
    # want to go back to your native encoding

    print "zetta\n" if unpack("C", (pack("C", 0xdf))) == 0xdf;

=head1 BUGS

=over

=item Thread safety

C<use encoding ...> is not thread-safe (i.e., do not use in threaded
applications).

=item Can't be used by more than one module in a single program.

Only one encoding is allowed.  If you combine modules in a program that have
different encodings, only one will be actually used.

=item Other modules using C<STDIN> and C<STDOUT> get the encoded stream

They may be expecting something completely different.

=item literals in regex that are longer than 127 bytes

For native multibyte encodings (either fixed or variable length),
the current implementation of the regular expressions may introduce
recoding errors for regular expression literals longer than 127 bytes.

=item EBCDIC

The encoding pragma is not supported on EBCDIC platforms.

=item C<format>

This pragma doesn't work well with C<format> because PerlIO does not
get along very well with it.  When C<format> contains non-ASCII
characters it prints funny or gets "wide character warnings".
To understand it, try the code below.

    # Save this one in utf8
    # replace *non-ascii* with a non-ascii string
    my $camel;
    format STDOUT =
    *non-ascii*@>>>>>>>
    $camel
    .
    $camel = "*non-ascii*";
    binmode(STDOUT=>':encoding(utf8)'); # bang!
    write;              # funny
    print $camel, "\n"; # fine

Without binmode this happens to work but without binmode, print()
fails instead of write().

At any rate, the very use of C<format> is questionable when it comes to
unicode characters since you have to consider such things as character
width (i.e. double-width for ideographs) and directions (i.e. BIDI for
Arabic and Hebrew).

=item See also L</CAVEATS>

=back

=head1 HISTORY

This pragma first appeared in Perl v5.8.0.  It has been enhanced in later
releases as specified above.

=head1 SEE ALSO

L<perlunicode>, L<Encode>, L<open>, L<Filter::Util::Call>,

Ch. 15 of C<Programming Perl (3rd Edition)>
by Larry Wall, Tom Christiansen, Jon Orwant;
O'Reilly & Associates; ISBN 0-596-00027-8

=cut