cpan/Encode/encoding.pm

*56d68f1eSafresh1# $Id: encoding.pm,v 3.00 2020/04/19 10:56:28 dankogai Exp $
b39c5158Smillertpackage encoding;
*56d68f1eSafresh1our $VERSION = sprintf "%d.%02d", q$Revision: 3.00 $ =~ /(\d+)/g;
b39c5158Smillert
b39c5158Smillertuse Encode;
b39c5158Smillertuse strict;
b39c5158Smillertuse warnings;
9f11ffb7Safresh1use Config;
b39c5158Smillert
# Compile-time switches used throughout this module.
#
#   DEBUG        - true when the PERL_ENCODE_DEBUG environment variable is
#                  set to a true value; enables diagnostic warn() calls.
#   HAS_PERLIO   - true when PerlIO::encoding (version 0.02 or later) can
#                  be loaded; undef otherwise.
#   PERL_5_21_7  - true on perl v5.21.7 and later, where the pragma stores
#                  its encoding in ${^E_NCODING} to get lexical scoping.
#
# The eval is forced into scalar context on purpose: inside this hash
# constructor it would otherwise be evaluated in list context, and a failed
# eval returns an EMPTY LIST there.  That would shift every following
# key/value pair by one (HAS_PERLIO would become the string 'PERL_5_21_7'
# and PERL_5_21_7 would never be defined).
use constant {
    DEBUG       => !!$ENV{PERL_ENCODE_DEBUG},
    HAS_PERLIO  => scalar(
        eval { require PerlIO::encoding; PerlIO::encoding->VERSION(0.02) }
    ),
    PERL_5_21_7 => scalar( $^V && $^V ge v5.21.7 ),    # lexically scoped
};
b39c5158Smillert
# _exception($name)
#
# Decide whether the named encoding must NOT be installed as the script
# encoding.  Returns 1 only when all of the following hold:
#   * the running perl is not newer than 5.008 ($] <= 5.008),
#   * $name is one of the UTF/UCS names listed below, and
#   * %Config has no true 'perl_patchlevel' entry.
# In every other case it returns 0.
sub _exception {
    my ($encoding_name) = @_;

    # 5.8.1 or higher: never an exception.
    return 0 if $] > 5.008;

    my @utf_names = qw(utf8 UCS-2BE UCS-2LE UTF-16 UTF-16BE UTF-16LE
      UTF-32 UTF-32BE UTF-32LE);
    my %is_utf;
    @is_utf{@utf_names} = (1) x @utf_names;

    # Only the UTF family is affected.
    return 0 unless $is_utf{$encoding_name};

    require Config;
    Config->import();
    our %Config;

    # A maintperl build (perl_patchlevel set) is not an exception either.
    return 0 if $Config{perl_patchlevel};
    return 1;
}
b39c5158Smillert
# in_locale()
#
# Returns the bits of $^H that overlap $locale::hint_bits, i.e. a true
# value when those hint bits are set.  When $locale::hint_bits is unset
# or false the mask is 0 and so is the result.
sub in_locale {
    my $locale_mask = $locale::hint_bits || 0;
    return $^H & $locale_mask;
}
b39c5158Smillert
# _get_locale_encoding()
#
# Work out an encoding name from the environment the process runs in.
# Three strategies are tried in order; the first one that yields a value
# wins:
#   1. On MSWin32 only: ask Win32 / Win32::Console for a code page and
#      map it to 'UTF-8' (code page 65001) or "cp<N>".
#   2. I18N::Langinfo's CODESET, canonicalised through find_encoding().
#   3. The encoding part of the LC_CTYPE locale string reported by
#      POSIX::setlocale(), with a bare 'euc' refined by country/language.
# Returns the encoding name, or undef when nothing could be determined.
sub _get_locale_encoding {

    if ( $^O eq 'MSWin32' ) {

        # Each probe is [ file to require, fully qualified function ].
        my @probes = (

            # The console output code page; only works when a console
            # is attached.
            [ 'Win32.pm'         => 'Win32::GetConsoleOutputCP' ],
            [ 'Win32/Console.pm' => 'Win32::Console::OutputCP' ],

            # No console means a GUI application: assume the ANSI code
            # page is the one that matters.
            [ 'Win32.pm' => 'Win32::GetACP' ],
        );

        for my $probe (@probes) {
            my ( $file, $function ) = @$probe;
            my $cp = eval {
                require $file;
                no strict 'refs';
                $function->();
            };
            next unless $cp;

            # 65001 is the code page number for UTF-8.
            return $cp == 65001 ? 'UTF-8' : 'cp' . $cp;
        }
    }

    # I18N::Langinfo isn't available everywhere, hence the eval.  An
    # unknown codeset makes find_encoding() return undef, the ->name call
    # die, and the eval yield undef.
    my $locale_encoding = eval {
        require I18N::Langinfo;
        my $codeset = I18N::Langinfo::langinfo( I18N::Langinfo::CODESET() );
        find_encoding($codeset)->name;
    };
    return $locale_encoding if defined $locale_encoding;

    eval {
        require POSIX;

        # Query (not set) the current LC_CTYPE locale.  Remember that the
        # MSVCRT implementation is quite different from the Unix ones.
        my $locale = POSIX::setlocale( POSIX::LC_CTYPE() );
        return unless $locale =~ /^([^.]+)\.([^.@]+)(?:@.*)?$/;

        ( my $country_language, $locale_encoding ) = ( $1, $2 );

        # Anything other than a bare 'euc' is taken as it stands.
        return unless lc($locale_encoding) eq 'euc';

        # TODO (--jhi): more heuristics based on country and language
        # would be possible with a Language -> Encoding database (the
        # Estonian one at http://www.eki.ee/letter/ would be excellent).
        my @euc_variants = (
            [ qr/^ja_JP|japan(?:ese)?$/i  => 'euc-jp' ],
            [ qr/^ko_KR|korean?$/i        => 'euc-kr' ],
            [ qr/^zh_CN|chin(?:a|ese)$/i  => 'euc-cn' ],
            [ qr/^zh_TW|taiwan(?:ese)?$/i => 'euc-tw' ],
        );
        for my $variant (@euc_variants) {
            my ( $pattern, $resolved ) = @$variant;
            if ( $country_language =~ $pattern ) {
                $locale_encoding = $resolved;
                return;
            }
        }

        # NOTE: this croak is raised inside the surrounding eval, so it is
        # trapped there; $locale_encoding keeps the value it was given
        # above and that is what the function returns.
        require Carp;
        Carp::croak(
            "encoding: Locale encoding '$locale_encoding' too ambiguous"
        );
    };

    return $locale_encoding;
}
b39c5158Smillert
# import($class, $name, %arg)
#
# Implements "use encoding NAME, ...".
#
#   $name  - an encoding name known to Encode, or one of the pseudo-names
#            ':locale' (derive the name via _get_locale_encoding) and
#            ':_get_locale_encoding' (only export that helper into the
#            caller and return).
#   %arg   - Filter => true  selects source-filter mode;
#            STDIN / STDOUT  give per-handle encodings.
#
# Croaks on EBCDIC platforms, when no name is given, when a name is not
# known to Encode, and - in non-Filter mode - on perl 5.25.3 and later
# unless $Config{usecperl} is set.  Returns 1 otherwise (nothing for the
# ':_get_locale_encoding' case).
sub import {

    # 193 is ord("A") on EBCDIC.
    if ( ord("A") == 193 ) {
        require Carp;
        Carp::croak("encoding: pragma does not support EBCDIC platforms");
    }

    # False (0) when no deprecation warning applies, else the message.
    my $deprecation_msg = 0;
    if ( $] >= 5.017 and !$Config{usecperl} ) {
        $deprecation_msg = "Use of the encoding pragma is deprecated";
    }

    my ( $class, $name, @pairs ) = @_;
    unless ($name) {
        require Carp;
        Carp::croak("encoding: no encoding specified.");
    }

    # used by lib/open.pm
    if ( $name eq ':_get_locale_encoding' ) {
        my $caller = caller();
        no strict 'refs';
        *{"${caller}::_get_locale_encoding"} = \&_get_locale_encoding;
        return;
    }

    if ( $name eq ':locale' ) {
        $name = _get_locale_encoding();
    }
    BEGIN { strict->unimport('hashpairs') if $] >= 5.027 and $^V =~ /c$/; }
    my %arg = @pairs;

    # Reached with an undefined name only when ':locale' found nothing.
    defined $name or $name = $ENV{PERL_ENCODING};

    my $enc = find_encoding($name);
    if ( !defined $enc ) {
        require Carp;
        Carp::croak("encoding: Unknown encoding '$name'");
    }
    $name = $enc->name;    # canonize

    if ( $arg{Filter} ) {

        # Source-filter mode: the whole script is decoded to UTF-8.
        warnings::warnif( "deprecated", $deprecation_msg )
          if $deprecation_msg;

        undef ${^ENCODING} if defined ${^ENCODING};
        PERL_5_21_7 and undef ${^E_NCODING};

        # implicitly 'use utf8'
        require utf8;      # to fetch $utf8::hint_bits;
        $^H |= $utf8::hint_bits;

        require Filter::Util::Call;
        Filter::Util::Call->import;
        filter_add(
            sub {
                my $read_status = filter_read();
                if ( $read_status > 0 ) {
                    $_ = $enc->decode( $_, 1 );
                    DEBUG and warn $_;
                }
                return $read_status;
            }
        );
    }
    else {

        # Literal-translation mode.
        if ( $] >= 5.025003 and !$Config{usecperl} ) {
            require Carp;
            Carp::croak("The encoding pragma is no longer supported. Check cperl");
        }
        warnings::warnif( "deprecated", $deprecation_msg )
          if $deprecation_msg;

        DEBUG and warn "_exception($name) = ", _exception($name);
        unless ( _exception($name) ) {
            if (PERL_5_21_7) {

                # Starting with 5.21.7, this pragma uses a shadow variable
                # designed explicitly for it, ${^E_NCODING}, to enforce
                # lexical scope; instead of ${^ENCODING}.
                $^H{'encoding'} = 1;
                ${^E_NCODING} = $enc;
            }
            else {
                ${^ENCODING} = $enc;
            }
        }

        # Without PerlIO::encoding the standard handles are left alone.
        return 1 unless HAS_PERLIO;
    }

    # A non-zero ${^UNICODE} takes precedence over the handle settings.
    return 1 if defined( ${^UNICODE} ) && ${^UNICODE} != 0;

    for my $h (qw(STDIN STDOUT)) {
        if ( $arg{$h} ) {

            # An explicit, true per-handle encoding was requested.
            if ( !defined find_encoding( $arg{$h} ) ) {
                require Carp;
                Carp::croak(
                    "encoding: Unknown encoding for $h, '$arg{$h}'");
            }
            binmode( $h, ":raw :encoding($arg{$h})" );
        }
        elsif ( !exists $arg{$h} ) {

            # Handle not mentioned at all: use the script encoding.  A key
            # that exists with a false value leaves the handle untouched.
            no warnings 'uninitialized';
            binmode( $h, ":raw :encoding($name)" );
        }
    }
    return 1;
}
b39c5158Smillert
# unimport()
#
# Implements "no encoding": clears the script-encoding variables, puts
# STDIN and STDOUT back to raw/binary mode and, if Filter::Util::Call has
# been loaded, attempts to remove the source filter.
sub unimport {
    no warnings;

    undef ${^ENCODING};
    PERL_5_21_7 and undef ${^E_NCODING};

    for my $std_handle ( \*STDIN, \*STDOUT ) {
        if (HAS_PERLIO) {
            binmode( $std_handle, ":raw" );
        }
        else {
            binmode($std_handle);
        }
    }

    # Any failure of filter_del() is deliberately ignored by the eval.
    eval { filter_del() } if $INC{"Filter/Util/Call.pm"};
}
b39c5158Smillert
b39c5158Smillert1;
b39c5158Smillert__END__
b39c5158Smillert
b39c5158Smillert=pod
b39c5158Smillert
b39c5158Smillert=head1 NAME
b39c5158Smillert
b8851fccSafresh1encoding - allows you to write your script in non-ASCII and non-UTF-8
b39c5158Smillert
e9ce3842Safresh1=head1 WARNING
e9ce3842Safresh1
b8851fccSafresh1This module has been deprecated since perl v5.18.  See L</DESCRIPTION> and
b8851fccSafresh1L</BUGS>.
e5157e49Safresh1
b39c5158Smillert=head1 SYNOPSIS
b39c5158Smillert
b39c5158Smillert  use encoding "greek";  # Perl like Greek to you?
b39c5158Smillert  use encoding "euc-jp"; # Jperl!
b39c5158Smillert
b39c5158Smillert  # or you can even do this if your shell supports your native encoding
b39c5158Smillert
b39c5158Smillert  perl -Mencoding=latin2 -e'...' # Feeling centrally European?
b39c5158Smillert  perl -Mencoding=euc-kr -e'...' # Or Korean?
b39c5158Smillert
b39c5158Smillert  # more control
b39c5158Smillert
b39c5158Smillert  # A simple euc-cn => utf-8 converter
b39c5158Smillert  use encoding "euc-cn", STDOUT => "utf8";  while(<>){print};
b39c5158Smillert
b8851fccSafresh1  # "no encoding;" supported
b39c5158Smillert  no encoding;
b39c5158Smillert
b39c5158Smillert  # an alternate way, Filter
b39c5158Smillert  use encoding "euc-jp", Filter=>1;
b39c5158Smillert  # now you can use kanji identifiers -- in euc-jp!
b39c5158Smillert
b8851fccSafresh1  # encode based on the current locale - specialized purposes only;
b8851fccSafresh1  # fraught with danger!!
b39c5158Smillert  use encoding ':locale';
b39c5158Smillert
b8851fccSafresh1=head1 DESCRIPTION
b39c5158Smillert
b8851fccSafresh1This pragma is used to enable a Perl script to be written in encodings that
b8851fccSafresh1aren't strictly ASCII nor UTF-8.  It translates all or portions of the Perl
b8851fccSafresh1program script from a given encoding into UTF-8, and changes the PerlIO layers
b8851fccSafresh1of C<STDIN> and C<STDOUT> to the encoding specified.
b39c5158Smillert
b8851fccSafresh1This pragma dates from the days when UTF-8-enabled editors were uncommon.  But
b8851fccSafresh1that was long ago, and the need for it is greatly diminished.  That, coupled
b8851fccSafresh1with the fact that it doesn't work with threads, along with other problems,
b8851fccSafresh1(see L</BUGS>) have led to its being deprecated.  It is planned to remove this
b8851fccSafresh1pragma in a future Perl version.  New code should be written in UTF-8, and the
b8851fccSafresh1C<use utf8> pragma used instead (see L<perluniintro> and L<utf8> for details).
b8851fccSafresh1Old code should be converted to UTF-8, via something like the recipe in the
b8851fccSafresh1L</SYNOPSIS> (though this simple approach may require manual adjustments
b8851fccSafresh1afterwards).
b39c5158Smillert
9f11ffb7Safresh1If UTF-8 is not an option, it is recommended that one use a simple source
9f11ffb7Safresh1filter, such as that provided by L<Filter::Encoding> on CPAN or this
9f11ffb7Safresh1pragma's own C<Filter> option (see below).
9f11ffb7Safresh1
b8851fccSafresh1The only legitimate use of this pragma is almost certainly just one per file,
b8851fccSafresh1near the top, with file scope, as the file is likely going to only be written
b8851fccSafresh1in one encoding.  Further restrictions apply in Perls before v5.22 (see
b8851fccSafresh1L</Prior to Perl v5.22>).
b39c5158Smillert
There are two basic modes of operation (plus turning it off):
b39c5158Smillert
b8851fccSafresh1=over 4
b39c5158Smillert
b8851fccSafresh1=item C<use encoding ['I<ENCNAME>'] ;>
b39c5158Smillert
9f11ffb7Safresh1Please note: This mode of operation is no longer supported as of Perl
9f11ffb7Safresh1v5.26.
9f11ffb7Safresh1
b8851fccSafresh1This is the normal operation.  It translates various literals encountered in
b8851fccSafresh1the Perl source file from the encoding I<ENCNAME> into UTF-8, and similarly
b8851fccSafresh1converts character code points.  This is used when the script is a combination
b8851fccSafresh1of ASCII (for the variable names and punctuation, I<etc>), but the literal
b8851fccSafresh1data is in the specified encoding.
b39c5158Smillert
b8851fccSafresh1I<ENCNAME> is optional.  If omitted, the encoding specified in the environment
b8851fccSafresh1variable L<C<PERL_ENCODING>|perlrun/PERL_ENCODING> is used.  If this isn't
b8851fccSafresh1set, or the resolved-to encoding is not known to C<L<Encode>>, the error
b8851fccSafresh1C<Unknown encoding 'I<ENCNAME>'> will be thrown.
b39c5158Smillert
b8851fccSafresh1Starting in Perl v5.8.6 (C<Encode> version 2.0.1), I<ENCNAME> may be the
b8851fccSafresh1name C<:locale>.  This is for very specialized applications, and is documented
b8851fccSafresh1in L</The C<:locale> sub-pragma> below.
b39c5158Smillert
The literals that are converted are C<q//, qq//, qr//, qw//, qx//>, and
b8851fccSafresh1starting in v5.8.1, C<tr///>.  Operations that do conversions include C<chr>,
b8851fccSafresh1C<ord>, C<utf8::upgrade> (but not C<utf8::downgrade>), and C<chomp>.
b8851fccSafresh1
b8851fccSafresh1Also starting in v5.8.1, the C<DATA> pseudo-filehandle is translated from the
b8851fccSafresh1encoding into UTF-8.
b8851fccSafresh1
b8851fccSafresh1For example, you can write code in EUC-JP as follows:
b39c5158Smillert
b39c5158Smillert  my $Rakuda = "\xF1\xD1\xF1\xCC"; # Camel in Kanji
b39c5158Smillert               #<-char-><-char->   # 4 octets
b39c5158Smillert  s/\bCamel\b/$Rakuda/;
b39c5158Smillert
b39c5158SmillertAnd with C<use encoding "euc-jp"> in effect, it is the same thing as
b8851fccSafresh1that code in UTF-8:
b39c5158Smillert
b39c5158Smillert  my $Rakuda = "\x{99F1}\x{99DD}"; # two Unicode Characters
b39c5158Smillert  s/\bCamel\b/$Rakuda/;
b39c5158Smillert
b8851fccSafresh1See L</EXAMPLE> below for a more complete example.
b39c5158Smillert
b8851fccSafresh1Unless C<${^UNICODE}> (available starting in v5.8.2) exists and is non-zero, the
b8851fccSafresh1PerlIO layers of C<STDIN> and C<STDOUT> are set to "C<:encoding(I<ENCNAME>)>".
b8851fccSafresh1Therefore,
b39c5158Smillert
b39c5158Smillert  use encoding "euc-jp";
b39c5158Smillert  my $message = "Camel is the symbol of perl.\n";
b39c5158Smillert  my $Rakuda = "\xF1\xD1\xF1\xCC"; # Camel in Kanji
b39c5158Smillert  $message =~ s/\bCamel\b/$Rakuda/;
b39c5158Smillert  print $message;
b39c5158Smillert
b8851fccSafresh1will print
b8851fccSafresh1
b8851fccSafresh1 "\xF1\xD1\xF1\xCC is the symbol of perl.\n"
b8851fccSafresh1
b8851fccSafresh1not
b8851fccSafresh1
b8851fccSafresh1 "\x{99F1}\x{99DD} is the symbol of perl.\n"
b39c5158Smillert
b39c5158SmillertYou can override this by giving extra arguments; see below.
b39c5158Smillert
b8851fccSafresh1Note that C<STDERR> WILL NOT be changed, regardless.
b39c5158Smillert
b8851fccSafresh1Also note that non-STD file handles remain unaffected.  Use C<use
b8851fccSafresh1open> or C<binmode> to change the layers of those.
b8851fccSafresh1
9f11ffb7Safresh1=item C<use encoding I<ENCNAME>, Filter=E<gt>1;>
b8851fccSafresh1
b8851fccSafresh1This operates as above, but the C<Filter> argument with a non-zero
b8851fccSafresh1value causes the entire script, and not just literals, to be translated from
b8851fccSafresh1the encoding into UTF-8.  This allows identifiers in the source to be in that
b8851fccSafresh1encoding as well.  (Problems may occur if the encoding is not a superset of
b8851fccSafresh1ASCII; imagine all your semi-colons being translated into something
b8851fccSafresh1different.)  One can use this form to make
b8851fccSafresh1
b8851fccSafresh1 ${"\x{4eba}"}++
b8851fccSafresh1
b8851fccSafresh1work.  (This is equivalent to C<$I<human>++>, where I<human> is a single Han
b8851fccSafresh1ideograph).
b8851fccSafresh1
b8851fccSafresh1This effectively means that your source code behaves as if it were written in
UTF-8 with C<use utf8> in effect.  So even if your editor only supports
b8851fccSafresh1Shift_JIS, for example, you can still try examples in Chapter 15 of
b8851fccSafresh1C<Programming Perl, 3rd Ed.>.
b8851fccSafresh1
b8851fccSafresh1This option is significantly slower than the other one.
b8851fccSafresh1
b8851fccSafresh1=item C<no encoding;>
b8851fccSafresh1
b8851fccSafresh1Unsets the script encoding. The layers of C<STDIN>, C<STDOUT> are
b8851fccSafresh1reset to "C<:raw>" (the default unprocessed raw stream of bytes).
b8851fccSafresh1
b8851fccSafresh1=back
b8851fccSafresh1
b8851fccSafresh1=head1 OPTIONS
b8851fccSafresh1
b8851fccSafresh1=head2 Setting C<STDIN> and/or C<STDOUT> individually
b8851fccSafresh1
b8851fccSafresh1The encodings of C<STDIN> and C<STDOUT> are individually settable by parameters to
b8851fccSafresh1the pragma:
b8851fccSafresh1
b8851fccSafresh1 use encoding 'euc-tw', STDIN => 'greek'  ...;
b8851fccSafresh1
b8851fccSafresh1In this case, you cannot omit the first I<ENCNAME>.  C<< STDIN => undef >>
b8851fccSafresh1turns the I/O transcoding completely off for that filehandle.
b8851fccSafresh1
b8851fccSafresh1When C<${^UNICODE}> (available starting in v5.8.2) exists and is non-zero,
b8851fccSafresh1these options will be completely ignored.  See L<perlvar/C<${^UNICODE}>> and
b8851fccSafresh1L<"C<-C>" in perlrun|perlrun/-C [numberE<sol>list]> for details.
b8851fccSafresh1
b8851fccSafresh1=head2 The C<:locale> sub-pragma
b8851fccSafresh1
b8851fccSafresh1Starting in v5.8.6, the encoding name may be C<:locale>.  This means that the
b8851fccSafresh1encoding is taken from the current locale, and not hard-coded by the pragma.
b8851fccSafresh1Since a script really can only be encoded in exactly one encoding, this option
b8851fccSafresh1is dangerous.  It makes sense only if the script itself is written in ASCII,
b8851fccSafresh1and all the possible locales that will be in use when the script is executed
b8851fccSafresh1are supersets of ASCII.  That means that the script itself doesn't get
b8851fccSafresh1changed, but the I/O handles have the specified encoding added, and the
b8851fccSafresh1operations like C<chr> and C<ord> use that encoding.
b8851fccSafresh1
b8851fccSafresh1The logic of finding which locale C<:locale> uses is as follows:
b8851fccSafresh1
b8851fccSafresh1=over 4
b8851fccSafresh1
b8851fccSafresh1=item 1.
b8851fccSafresh1
b8851fccSafresh1If the platform supports the C<langinfo(CODESET)> interface, the codeset
b8851fccSafresh1returned is used as the default encoding for the open pragma.
b8851fccSafresh1
b8851fccSafresh1=item 2.
b8851fccSafresh1
b8851fccSafresh1If 1. didn't work but we are under the locale pragma, the environment
b8851fccSafresh1variables C<LC_ALL> and C<LANG> (in that order) are matched for encodings
b8851fccSafresh1(the part after "C<.>", if any), and if any found, that is used
b8851fccSafresh1as the default encoding for the open pragma.
b8851fccSafresh1
b8851fccSafresh1=item 3.
b8851fccSafresh1
b8851fccSafresh1If 1. and 2. didn't work, the environment variables C<LC_ALL> and C<LANG>
b8851fccSafresh1(in that order) are matched for anything looking like UTF-8, and if
b8851fccSafresh1any found, C<:utf8> is used as the default encoding for the open
b8851fccSafresh1pragma.
b8851fccSafresh1
b8851fccSafresh1=back
b8851fccSafresh1
b8851fccSafresh1If your locale environment variables (C<LC_ALL>, C<LC_CTYPE>, C<LANG>)
b8851fccSafresh1contain the strings 'UTF-8' or 'UTF8' (case-insensitive matching),
b8851fccSafresh1the default encoding of your C<STDIN>, C<STDOUT>, and C<STDERR>, and of
b8851fccSafresh1B<any subsequent file open>, is UTF-8.
b8851fccSafresh1
b8851fccSafresh1=head1 CAVEATS
b8851fccSafresh1
b8851fccSafresh1=head2 SIDE EFFECTS
b8851fccSafresh1
b8851fccSafresh1=over
b8851fccSafresh1
b8851fccSafresh1=item *
b8851fccSafresh1
If the C<encoding> pragma is in scope then the lengths returned by C<chomp>
are calculated from the length of C<$/> in Unicode characters, which is not
always the same as the length of C<$/> in the native encoding.
b8851fccSafresh1
b8851fccSafresh1=item *
b8851fccSafresh1
b8851fccSafresh1Without this pragma, if strings operating under byte semantics and strings
b39c5158Smillertwith Unicode character data are concatenated, the new string will
b39c5158Smillertbe created by decoding the byte strings as I<ISO 8859-1 (Latin-1)>.
b39c5158Smillert
b39c5158SmillertThe B<encoding> pragma changes this to use the specified encoding
b39c5158Smillertinstead.  For example:
b39c5158Smillert
b39c5158Smillert    use encoding 'utf8';
b39c5158Smillert    my $string = chr(20000); # a Unicode string
b39c5158Smillert    utf8::encode($string);   # now it's a UTF-8 encoded byte string
b39c5158Smillert    # concatenate with another Unicode string
b39c5158Smillert    print length($string . chr(20000));
b39c5158Smillert
b39c5158SmillertWill print C<2>, because C<$string> is upgraded as UTF-8.  Without
b39c5158SmillertC<use encoding 'utf8';>, it will print C<4> instead, since C<$string>
b39c5158Smillertis three octets when interpreted as Latin-1.
b39c5158Smillert
b39c5158Smillert=back
b39c5158Smillert
b39c5158Smillert=head2 DO NOT MIX MULTIPLE ENCODINGS
b39c5158Smillert
b39c5158SmillertNotice that only literals (string or regular expression) having only
b39c5158Smillertlegacy code points are affected: if you mix data like this
b39c5158Smillert
b8851fccSafresh1    \x{100}\xDF
b39c5158Smillert    \xDF\x{100}
b39c5158Smillert
b39c5158Smillertthe data is assumed to be in (Latin 1 and) Unicode, not in your native
b39c5158Smillertencoding.  In other words, this will match in "greek":
b39c5158Smillert
b39c5158Smillert    "\xDF" =~ /\x{3af}/
b39c5158Smillert
b39c5158Smillertbut this will not
b39c5158Smillert
b39c5158Smillert    "\xDF\x{100}" =~ /\x{3af}\x{100}/
b39c5158Smillert
b39c5158Smillertsince the C<\xDF> (ISO 8859-7 GREEK SMALL LETTER IOTA WITH TONOS) on
b39c5158Smillertthe left will B<not> be upgraded to C<\x{3af}> (Unicode GREEK SMALL
b39c5158SmillertLETTER IOTA WITH TONOS) because of the C<\x{100}> on the left.  You
b39c5158Smillertshould not be mixing your legacy data and Unicode in the same string.
b39c5158Smillert
b39c5158SmillertThis pragma also affects encoding of the 0x80..0xFF code point range:
b39c5158Smillertnormally characters in that range are left as eight-bit bytes (unless
b39c5158Smillertthey are combined with characters with code points 0x100 or larger,
b39c5158Smillertin which case all characters need to become UTF-8 encoded), but if
b39c5158Smillertthe C<encoding> pragma is present, even the 0x80..0xFF range always
b39c5158Smillertgets UTF-8 encoded.
b39c5158Smillert
b39c5158SmillertAfter all, the best thing about this pragma is that you don't have to
b39c5158Smillertresort to \x{....} just to spell your name in a native encoding.
b39c5158SmillertSo feel free to put your strings in your encoding in quotes and
b39c5158Smillertregexes.
b39c5158Smillert
b8851fccSafresh1=head2 Prior to Perl v5.22
b8851fccSafresh1
b8851fccSafresh1The pragma was a per script, not a per block lexical.  Only the last
b8851fccSafresh1C<use encoding> or C<no encoding> mattered, and it affected
b8851fccSafresh1B<the whole script>.  However, the C<no encoding> pragma was supported and
b8851fccSafresh1C<use encoding> could appear as many times as you want in a given script
b8851fccSafresh1(though only the last was effective).
b8851fccSafresh1
b8851fccSafresh1Since the scope wasn't lexical, other modules' use of C<chr>, C<ord>, I<etc.>
b8851fccSafresh1were affected.  This leads to spooky, incorrect action at a distance that is
b8851fccSafresh1hard to debug.
b8851fccSafresh1
b8851fccSafresh1This means you would have to be very careful of the load order:
b8851fccSafresh1
b8851fccSafresh1  # called module
b8851fccSafresh1  package Module_IN_BAR;
b8851fccSafresh1  use encoding "bar";
b8851fccSafresh1  # stuff in "bar" encoding here
b8851fccSafresh1  1;
b8851fccSafresh1
b8851fccSafresh1  # caller script
  use encoding "foo";
b8851fccSafresh1  use Module_IN_BAR;
b8851fccSafresh1  # surprise! use encoding "bar" is in effect.
b8851fccSafresh1
b8851fccSafresh1The best way to avoid this oddity is to use this pragma RIGHT AFTER
b8851fccSafresh1other modules are loaded.  i.e.
b8851fccSafresh1
b8851fccSafresh1  use Module_IN_BAR;
b8851fccSafresh1  use encoding "foo";
b8851fccSafresh1
b8851fccSafresh1=head2 Prior to Encode version 1.87
b8851fccSafresh1
b8851fccSafresh1=over
b8851fccSafresh1
b8851fccSafresh1=item *
b8851fccSafresh1
b8851fccSafresh1C<STDIN> and C<STDOUT> were not set under the filter option.
And C<< STDIN=>I<ENCODING> >> and C<< STDOUT=>I<ENCODING> >> didn't work like
the non-filter version.
b8851fccSafresh1
b8851fccSafresh1=item *
b8851fccSafresh1
b8851fccSafresh1C<use utf8> wasn't implicitly declared so you have to C<use utf8> to do
b8851fccSafresh1
b8851fccSafresh1 ${"\x{4eba}"}++
b8851fccSafresh1
b8851fccSafresh1=back
b8851fccSafresh1
b8851fccSafresh1=head2 Prior to Perl v5.8.1
b8851fccSafresh1
b8851fccSafresh1=over
b8851fccSafresh1
b8851fccSafresh1=item "NON-EUC" doublebyte encodings
b8851fccSafresh1
b8851fccSafresh1Because perl needs to parse the script before applying this pragma, such
b8851fccSafresh1encodings as Shift_JIS and Big-5 that may contain C<'\'> (BACKSLASH;
b8851fccSafresh1C<\x5c>) in the second byte fail because the second byte may
b8851fccSafresh1accidentally escape the quoting character that follows.
b8851fccSafresh1
b8851fccSafresh1=item C<tr///>
b39c5158Smillert
b39c5158SmillertThe B<encoding> pragma works by decoding string literals in
C<q//, qq//, qr//, qw//, qx//> and so forth.  In perl v5.8.0, this
b39c5158Smillertdoes not apply to C<tr///>.  Therefore,
b39c5158Smillert
b39c5158Smillert  use encoding 'euc-jp';
b39c5158Smillert  #....
b39c5158Smillert  $kana =~ tr/\xA4\xA1-\xA4\xF3/\xA5\xA1-\xA5\xF3/;
b39c5158Smillert  #           -------- -------- -------- --------
b39c5158Smillert
b39c5158SmillertDoes not work as
b39c5158Smillert
b39c5158Smillert  $kana =~ tr/\x{3041}-\x{3093}/\x{30a1}-\x{30f3}/;
b39c5158Smillert
b39c5158Smillert=over
b39c5158Smillert
b39c5158Smillert=item Legend of characters above
b39c5158Smillert
b39c5158Smillert  utf8     euc-jp   charnames::viacode()
b39c5158Smillert  -----------------------------------------
b39c5158Smillert  \x{3041} \xA4\xA1 HIRAGANA LETTER SMALL A
b39c5158Smillert  \x{3093} \xA4\xF3 HIRAGANA LETTER N
b39c5158Smillert  \x{30a1} \xA5\xA1 KATAKANA LETTER SMALL A
b39c5158Smillert  \x{30f3} \xA5\xF3 KATAKANA LETTER N
b39c5158Smillert
b39c5158Smillert=back
b39c5158Smillert
b8851fccSafresh1This counterintuitive behavior has been fixed in perl v5.8.1.
b39c5158Smillert
b8851fccSafresh1In perl v5.8.0, you can work around this as follows;
b39c5158Smillert
b39c5158Smillert  use encoding 'euc-jp';
b39c5158Smillert  #  ....
b39c5158Smillert  eval qq{ \$kana =~ tr/\xA4\xA1-\xA4\xF3/\xA5\xA1-\xA5\xF3/ };
b39c5158Smillert
b39c5158SmillertNote the C<tr//> expression is surrounded by C<qq{}>.  The idea behind
b8851fccSafresh1this is the same as the classic idiom that makes C<tr///> 'interpolate':
b39c5158Smillert
b39c5158Smillert   tr/$from/$to/;            # wrong!
b39c5158Smillert   eval qq{ tr/$from/$to/ }; # workaround.
b39c5158Smillert
b8851fccSafresh1=back
b39c5158Smillert
b39c5158Smillert=head1 EXAMPLE - Greekperl
b39c5158Smillert
b39c5158Smillert    use encoding "iso 8859-7";
b39c5158Smillert
b39c5158Smillert    # \xDF in ISO 8859-7 (Greek) is \x{3af} in Unicode.
b39c5158Smillert
b39c5158Smillert    $a = "\xDF";
b39c5158Smillert    $b = "\x{100}";
b39c5158Smillert
b39c5158Smillert    printf "%#x\n", ord($a); # will print 0x3af, not 0xdf
b39c5158Smillert
b39c5158Smillert    $c = $a . $b;
b39c5158Smillert
b39c5158Smillert    # $c will be "\x{3af}\x{100}", not "\x{df}\x{100}".
b39c5158Smillert
b39c5158Smillert    # chr() is affected, and ...
b39c5158Smillert
b39c5158Smillert    print "mega\n"  if ord(chr(0xdf)) == 0x3af;
b39c5158Smillert
b39c5158Smillert    # ... ord() is affected by the encoding pragma ...
b39c5158Smillert
b39c5158Smillert    print "tera\n" if ord(pack("C", 0xdf)) == 0x3af;
b39c5158Smillert
b39c5158Smillert    # ... as are eq and cmp ...
b39c5158Smillert
b39c5158Smillert    print "peta\n" if "\x{3af}" eq  pack("C", 0xdf);
    print "exa\n"  if ("\x{3af}" cmp pack("C", 0xdf)) == 0;
b39c5158Smillert
b39c5158Smillert    # ... but pack/unpack C are not affected, in case you still
b39c5158Smillert    # want to go back to your native encoding
b39c5158Smillert
b39c5158Smillert    print "zetta\n" if unpack("C", (pack("C", 0xdf))) == 0xdf;
b39c5158Smillert
b8851fccSafresh1=head1 BUGS
b39c5158Smillert
b39c5158Smillert=over
b39c5158Smillert
b8851fccSafresh1=item Thread safety
b8851fccSafresh1
b8851fccSafresh1C<use encoding ...> is not thread-safe (i.e., do not use in threaded
b8851fccSafresh1applications).
b8851fccSafresh1
b8851fccSafresh1=item Can't be used by more than one module in a single program.
b8851fccSafresh1
b8851fccSafresh1Only one encoding is allowed.  If you combine modules in a program that have
b8851fccSafresh1different encodings, only one will be actually used.
b8851fccSafresh1
b8851fccSafresh1=item Other modules using C<STDIN> and C<STDOUT> get the encoded stream
b8851fccSafresh1
b8851fccSafresh1They may be expecting something completely different.
b8851fccSafresh1
b39c5158Smillert=item literals in regex that are longer than 127 bytes
b39c5158Smillert
b39c5158SmillertFor native multibyte encodings (either fixed or variable length),
b39c5158Smillertthe current implementation of the regular expressions may introduce
b39c5158Smillertrecoding errors for regular expression literals longer than 127 bytes.
b39c5158Smillert
b39c5158Smillert=item EBCDIC
b39c5158Smillert
b39c5158SmillertThe encoding pragma is not supported on EBCDIC platforms.
b39c5158Smillert
b8851fccSafresh1=item C<format>
b39c5158Smillert
b8851fccSafresh1This pragma doesn't work well with C<format> because PerlIO does not
b8851fccSafresh1get along very well with it.  When C<format> contains non-ASCII
b39c5158Smillertcharacters it prints funny or gets "wide character warnings".
b39c5158SmillertTo understand it, try the code below.
b39c5158Smillert
b39c5158Smillert  # Save this one in utf8
b39c5158Smillert  # replace *non-ascii* with a non-ascii string
b39c5158Smillert  my $camel;
b39c5158Smillert  format STDOUT =
b39c5158Smillert  *non-ascii*@>>>>>>>
b39c5158Smillert  $camel
b39c5158Smillert  .
b39c5158Smillert  $camel = "*non-ascii*";
b39c5158Smillert  binmode(STDOUT=>':encoding(utf8)'); # bang!
b39c5158Smillert  write;              # funny
b39c5158Smillert  print $camel, "\n"; # fine
b39c5158Smillert
b39c5158SmillertWithout binmode this happens to work but without binmode, print()
b39c5158Smillertfails instead of write().
b39c5158Smillert
b8851fccSafresh1At any rate, the very use of C<format> is questionable when it comes to
b39c5158Smillertunicode characters since you have to consider such things as character
b39c5158Smillertwidth (i.e. double-width for ideographs) and directions (i.e. BIDI for
b39c5158SmillertArabic and Hebrew).
b39c5158Smillert
b8851fccSafresh1=item See also L</CAVEATS>
b39c5158Smillert
b39c5158Smillert=back
b39c5158Smillert
b39c5158Smillert=head1 HISTORY
b39c5158Smillert
b8851fccSafresh1This pragma first appeared in Perl v5.8.0.  It has been enhanced in later
b8851fccSafresh1releases as specified above.
b39c5158Smillert
b39c5158Smillert=head1 SEE ALSO
b39c5158Smillert
b39c5158SmillertL<perlunicode>, L<Encode>, L<open>, L<Filter::Util::Call>,
b39c5158Smillert
b39c5158SmillertCh. 15 of C<Programming Perl (3rd Edition)>
b39c5158Smillertby Larry Wall, Tom Christiansen, Jon Orwant;
b39c5158SmillertO'Reilly & Associates; ISBN 0-596-00027-8
b39c5158Smillert
b39c5158Smillert=cut