xref: /openbsd/gnu/usr.bin/perl/cpan/Encode/encoding.pm (revision 56d68f1e)
1*56d68f1eSafresh1# $Id: encoding.pm,v 3.00 2020/04/19 10:56:28 dankogai Exp $
2b39c5158Smillertpackage encoding;
3*56d68f1eSafresh1our $VERSION = sprintf "%d.%02d", q$Revision: 3.00 $ =~ /(\d+)/g;
4b39c5158Smillert
5b39c5158Smillertuse Encode;
6b39c5158Smillertuse strict;
7b39c5158Smillertuse warnings;
89f11ffb7Safresh1use Config;
9b39c5158Smillert
# Compile-time feature flags used throughout this package.
#   DEBUG       - true when the PERL_ENCODE_DEBUG environment variable is set.
#   HAS_PERLIO  - true when PerlIO::encoding 0.02 or later can be loaded.
#   PERL_5_21_7 - true when running on perl 5.21.7 or later.
use constant {
    DEBUG => !!$ENV{PERL_ENCODE_DEBUG},

    # scalar() matters here: a failing eval yields an empty list in list
    # context, which would drop this value and mispair every key after it.
    HAS_PERLIO => scalar(
        eval {
            require PerlIO::encoding;
            PerlIO::encoding->VERSION(0.02);
        }
    ),
    PERL_5_21_7 => $^V && $^V ge v5.21.7, # lexically scoped
};
15b39c5158Smillert
# Tell import() whether the encoding object must NOT be installed for $name.
# Returns 1 only when all of the following hold: the running perl is not
# newer than 5.008, $name is one of the UTF family names listed below, and
# %Config has no perl_patchlevel entry.  Returns 0 in every other case.
sub _exception {
    my ($name) = @_;

    # 5.8.1 or higher: never an exception.
    return 0 if $] > 5.008;

    my @utf_names = qw(utf8 UCS-2BE UCS-2LE UTF-16 UTF-16BE UTF-16LE
      UTF-32 UTF-32BE UTF-32LE);
    my %utfs;
    @utfs{@utf_names} = (1) x @utf_names;

    # Only the UTF encodings can be exceptions.
    return 0 unless $utfs{$name};

    require Config;
    Config->import();
    our %Config;

    # A patched (maintenance) perl is not an exception either.
    return 0 if $Config{perl_patchlevel};
    return 1;
}
28b39c5158Smillert
# Return the bits of $^H selected by $locale::hint_bits; the mask falls
# back to 0 when $locale::hint_bits is unset or false.
sub in_locale {
    my $locale_mask = $locale::hint_bits || 0;
    return $^H & $locale_mask;
}
30b39c5158Smillert
# Best-effort lookup of the encoding implied by the current locale.
#
# Strategy, in order:
#   1. On MSWin32, ask Win32 / Win32::Console for a code page and return
#      'UTF-8' for code page 65001 or "cp<N>" for any other non-zero value.
#   2. Ask I18N::Langinfo for the CODESET and return the canonical name that
#      find_encoding() resolves it to.
#   3. Parse the LC_CTYPE locale string reported by POSIX::setlocale() and
#      use the part after the "." as the encoding, expanding a bare "euc"
#      according to the language/country part.
#
# Returns the encoding name, or undef when nothing could be determined.
# Takes no arguments.
sub _get_locale_encoding {
    my $locale_encoding;

    if ( $^O eq 'MSWin32' ) {

        # [ file to require, fully qualified function to call ], in order.
        my @tries = (
            # First try to get the OutputCP. This will work only if we
            # are attached to a console
            [ 'Win32.pm'         => 'Win32::GetConsoleOutputCP' ],
            [ 'Win32/Console.pm' => 'Win32::Console::OutputCP' ],
            # If above failed, this means that we are a GUI app
            # Let's assume that the ANSI codepage is what matters
            [ 'Win32.pm'         => 'Win32::GetACP' ],
        );
        for my $try (@tries) {
            my ( $module_file, $function ) = @$try;

            # A missing module or function just means "try the next one".
            my $cp = eval {
                require $module_file;
                no strict 'refs';
                &{$function}();
            };
            next unless $cp;

            # 65001 is the code page for UTF-8.
            return $cp == 65001 ? 'UTF-8' : 'cp' . $cp;
        }
    }

    # I18N::Langinfo isn't available everywhere
    $locale_encoding = eval {
        require I18N::Langinfo;
        find_encoding(
            I18N::Langinfo::langinfo( I18N::Langinfo::CODESET() )
        )->name;
    };
    return $locale_encoding if defined $locale_encoding;

    eval {
        require POSIX;

        # Get the current locale
        # Remember that MSVCRT impl is quite different from Unixes
        my $locale = POSIX::setlocale( POSIX::LC_CTYPE() );

        # setlocale() can return undef; guard so the match below does not
        # emit an "uninitialized value" warning.
        if ( defined $locale
            and $locale =~ /^([^.]+)\.([^.@]+)(?:@.*)?$/ )
        {
            my $country_language;
            ( $country_language, $locale_encoding ) = ( $1, $2 );

            # Could do more heuristics based on the country and language
            # since we have Locale::Country and Locale::Language available.
            # TODO: get a database of Language -> Encoding mappings
            # (the Estonian database at http://www.eki.ee/letter/
            # would be excellent!) --jhi
            #
            # NOTE(review): in the patterns below "^" applies only to the
            # first alternative and "$" only to the last one; kept as is.
            if ( lc($locale_encoding) eq 'euc' ) {
                if ( $country_language =~ /^ja_JP|japan(?:ese)?$/i ) {
                    $locale_encoding = 'euc-jp';
                }
                elsif ( $country_language =~ /^ko_KR|korean?$/i ) {
                    $locale_encoding = 'euc-kr';
                }
                elsif ( $country_language =~ /^zh_CN|chin(?:a|ese)$/i ) {
                    $locale_encoding = 'euc-cn';
                }
                elsif ( $country_language =~ /^zh_TW|taiwan(?:ese)?$/i ) {
                    $locale_encoding = 'euc-tw';
                }
                else {
                    # NOTE(review): this croak runs inside the enclosing
                    # eval and $@ is never inspected, so the message does
                    # not reach the caller; the unexpanded value is
                    # returned instead.
                    require Carp;
                    Carp::croak(
                        "encoding: Locale encoding '$locale_encoding' too ambiguous"
                    );
                }
            }
        }
    };

    return $locale_encoding;
}
110b39c5158Smillert
# Implements "use encoding NAME, OPTIONS".
#
# Arguments (after the class name):
#   NAME    - encoding name, ':locale', or the private request
#             ':_get_locale_encoding' (used by lib/open.pm), which only
#             exports _get_locale_encoding() into the caller and returns.
#   OPTIONS - key/value pairs; Filter, STDIN and STDOUT are consulted.
#
# Croaks on EBCDIC platforms, when no NAME is given, when NAME (or a
# STDIN/STDOUT value) is unknown to find_encoding(), and - without the
# Filter option - on perl 5.25.3 or later unless $Config{usecperl} is set.
# Otherwise returns 1 (or an empty return for ':_get_locale_encoding').
sub import {

    # "A" is code point 193 only on EBCDIC platforms.
    if ( ord("A") == 193 ) {
        require Carp;
        Carp::croak("encoding: pragma does not support EBCDIC platforms");
    }

    my $class = shift;
    my $name  = shift;

    # Deprecation text, or 0 when no deprecation warning applies.
    my $deprecation_msg = 0;
    if ( $] >= 5.017 and !$Config{usecperl} ) {
        $deprecation_msg = "Use of the encoding pragma is deprecated";
    }

    unless ($name) {
        require Carp;
        Carp::croak("encoding: no encoding specified.");
    }

    if ( $name eq ':_get_locale_encoding' ) {    # used by lib/open.pm
        my $caller = caller();
        no strict 'refs';
        *{"${caller}::_get_locale_encoding"} = \&_get_locale_encoding;
        return;
    }

    if ( $name eq ':locale' ) {
        $name = _get_locale_encoding();
    }
    BEGIN { strict->unimport('hashpairs') if $] >= 5.027 and $^V =~ /c$/; }
    my %arg = @_;

    # $name was checked for truth above, so it can only be undefined here
    # when the ':locale' lookup came back empty.
    defined $name or $name = $ENV{PERL_ENCODING};

    my $enc = find_encoding($name);
    if ( !defined $enc ) {
        require Carp;
        Carp::croak("encoding: Unknown encoding '$name'");
    }
    $name = $enc->name;    # canonize

    if ( $arg{Filter} ) {

        # Source-filter mode: drop any installed encoding object and decode
        # the source text itself instead.
        warnings::warnif( "deprecated", $deprecation_msg )
          if $deprecation_msg;

        undef ${^ENCODING} if defined ${^ENCODING};
        undef ${^E_NCODING} if PERL_5_21_7;

        # implicitly 'use utf8'
        require utf8;      # to fetch $utf8::hint_bits;
        $^H |= $utf8::hint_bits;

        require Filter::Util::Call;
        Filter::Util::Call->import;
        filter_add(
            sub {
                my $read_status = filter_read();
                return $read_status unless $read_status > 0;
                $_ = $enc->decode( $_, 1 );
                warn $_ if DEBUG;
                return $read_status;
            }
        );
    }
    else {
        if ( $] >= 5.025003 and !$Config{usecperl} ) {
            require Carp;
            Carp::croak("The encoding pragma is no longer supported. Check cperl");
        }
        warnings::warnif( "deprecated", $deprecation_msg )
          if $deprecation_msg;

        warn "_exception($name) = ", _exception($name) if DEBUG;
        unless ( _exception($name) ) {
            if (PERL_5_21_7) {
                # Starting with 5.21.7, this pragma uses a shadow variable
                # designed explicitly for it, ${^E_NCODING}, to enforce
                # lexical scope; instead of ${^ENCODING}.
                $^H{'encoding'} = 1;
                ${^E_NCODING} = $enc;
            }
            else {
                ${^ENCODING} = $enc;
            }
        }

        # Without PerlIO::encoding there are no layers to set up.
        return 1 unless HAS_PERLIO;
    }

    # A non-zero ${^UNICODE} leaves the standard handles alone.
    return 1 if defined ${^UNICODE} and ${^UNICODE} != 0;

    for my $h (qw(STDIN STDOUT)) {
        if ( $arg{$h} ) {

            # Explicit per-handle encoding.
            if ( !defined find_encoding( $arg{$h} ) ) {
                require Carp;
                Carp::croak(
                    "encoding: Unknown encoding for $h, '$arg{$h}'");
            }
            binmode( $h, ":raw :encoding($arg{$h})" );
        }
        elsif ( !exists $arg{$h} ) {

            # No option given for this handle: use the main encoding.  A
            # key that is present but false leaves the handle untouched.
            no warnings 'uninitialized';
            binmode( $h, ":raw :encoding($name)" );
        }
    }
    return 1;    # I doubt if we need it, though
}
212b39c5158Smillert
# Implements "no encoding": clears the encoding variables, puts STDIN and
# STDOUT back into raw/binary mode, and removes the source filter if
# Filter::Util::Call has been loaded.
sub unimport {
    no warnings;
    undef ${^ENCODING};
    undef ${^E_NCODING} if PERL_5_21_7;

    for my $std_handle ( \*STDIN, \*STDOUT ) {
        if (HAS_PERLIO) {
            binmode( $std_handle, ":raw" );
        }
        else {
            binmode($std_handle);
        }
    }

    # filter_del() is only meaningful when the filter module is loaded;
    # any failure from it is deliberately ignored.
    if ( $INC{"Filter/Util/Call.pm"} ) {
        eval { filter_del() };
    }
}
229b39c5158Smillert
230b39c5158Smillert1;
231b39c5158Smillert__END__
232b39c5158Smillert
233b39c5158Smillert=pod
234b39c5158Smillert
235b39c5158Smillert=head1 NAME
236b39c5158Smillert
237b8851fccSafresh1encoding - allows you to write your script in non-ASCII and non-UTF-8
238b39c5158Smillert
239e9ce3842Safresh1=head1 WARNING
240e9ce3842Safresh1
241b8851fccSafresh1This module has been deprecated since perl v5.18.  See L</DESCRIPTION> and
242b8851fccSafresh1L</BUGS>.
243e5157e49Safresh1
244b39c5158Smillert=head1 SYNOPSIS
245b39c5158Smillert
246b39c5158Smillert  use encoding "greek";  # Perl like Greek to you?
247b39c5158Smillert  use encoding "euc-jp"; # Jperl!
248b39c5158Smillert
249b39c5158Smillert  # or you can even do this if your shell supports your native encoding
250b39c5158Smillert
251b39c5158Smillert  perl -Mencoding=latin2 -e'...' # Feeling centrally European?
252b39c5158Smillert  perl -Mencoding=euc-kr -e'...' # Or Korean?
253b39c5158Smillert
254b39c5158Smillert  # more control
255b39c5158Smillert
256b39c5158Smillert  # A simple euc-cn => utf-8 converter
257b39c5158Smillert  use encoding "euc-cn", STDOUT => "utf8";  while(<>){print};
258b39c5158Smillert
259b8851fccSafresh1  # "no encoding;" supported
260b39c5158Smillert  no encoding;
261b39c5158Smillert
262b39c5158Smillert  # an alternate way, Filter
263b39c5158Smillert  use encoding "euc-jp", Filter=>1;
264b39c5158Smillert  # now you can use kanji identifiers -- in euc-jp!
265b39c5158Smillert
266b8851fccSafresh1  # encode based on the current locale - specialized purposes only;
267b8851fccSafresh1  # fraught with danger!!
268b39c5158Smillert  use encoding ':locale';
269b39c5158Smillert
270b8851fccSafresh1=head1 DESCRIPTION
271b39c5158Smillert
272b8851fccSafresh1This pragma is used to enable a Perl script to be written in encodings that
273b8851fccSafresh1aren't strictly ASCII nor UTF-8.  It translates all or portions of the Perl
274b8851fccSafresh1program script from a given encoding into UTF-8, and changes the PerlIO layers
275b8851fccSafresh1of C<STDIN> and C<STDOUT> to the encoding specified.
276b39c5158Smillert
277b8851fccSafresh1This pragma dates from the days when UTF-8-enabled editors were uncommon.  But
278b8851fccSafresh1that was long ago, and the need for it is greatly diminished.  That, coupled
279b8851fccSafresh1with the fact that it doesn't work with threads, along with other problems,
280b8851fccSafresh1(see L</BUGS>) have led to its being deprecated.  It is planned to remove this
281b8851fccSafresh1pragma in a future Perl version.  New code should be written in UTF-8, and the
282b8851fccSafresh1C<use utf8> pragma used instead (see L<perluniintro> and L<utf8> for details).
283b8851fccSafresh1Old code should be converted to UTF-8, via something like the recipe in the
284b8851fccSafresh1L</SYNOPSIS> (though this simple approach may require manual adjustments
285b8851fccSafresh1afterwards).
286b39c5158Smillert
2879f11ffb7Safresh1If UTF-8 is not an option, it is recommended that one use a simple source
2889f11ffb7Safresh1filter, such as that provided by L<Filter::Encoding> on CPAN or this
2899f11ffb7Safresh1pragma's own C<Filter> option (see below).
2909f11ffb7Safresh1
291b8851fccSafresh1The only legitimate use of this pragma is almost certainly just one per file,
292b8851fccSafresh1near the top, with file scope, as the file is likely going to only be written
293b8851fccSafresh1in one encoding.  Further restrictions apply in Perls before v5.22 (see
294b8851fccSafresh1L</Prior to Perl v5.22>).
295b39c5158Smillert
There are two basic modes of operation (plus turning it off):
297b39c5158Smillert
298b8851fccSafresh1=over 4
299b39c5158Smillert
300b8851fccSafresh1=item C<use encoding ['I<ENCNAME>'] ;>
301b39c5158Smillert
3029f11ffb7Safresh1Please note: This mode of operation is no longer supported as of Perl
3039f11ffb7Safresh1v5.26.
3049f11ffb7Safresh1
305b8851fccSafresh1This is the normal operation.  It translates various literals encountered in
306b8851fccSafresh1the Perl source file from the encoding I<ENCNAME> into UTF-8, and similarly
307b8851fccSafresh1converts character code points.  This is used when the script is a combination
308b8851fccSafresh1of ASCII (for the variable names and punctuation, I<etc>), but the literal
309b8851fccSafresh1data is in the specified encoding.
310b39c5158Smillert
311b8851fccSafresh1I<ENCNAME> is optional.  If omitted, the encoding specified in the environment
312b8851fccSafresh1variable L<C<PERL_ENCODING>|perlrun/PERL_ENCODING> is used.  If this isn't
313b8851fccSafresh1set, or the resolved-to encoding is not known to C<L<Encode>>, the error
314b8851fccSafresh1C<Unknown encoding 'I<ENCNAME>'> will be thrown.
315b39c5158Smillert
316b8851fccSafresh1Starting in Perl v5.8.6 (C<Encode> version 2.0.1), I<ENCNAME> may be the
317b8851fccSafresh1name C<:locale>.  This is for very specialized applications, and is documented
318b8851fccSafresh1in L</The C<:locale> sub-pragma> below.
319b39c5158Smillert
The literals that are converted are C<q//, qq//, qr//, qw//, qx//>, and
321b8851fccSafresh1starting in v5.8.1, C<tr///>.  Operations that do conversions include C<chr>,
322b8851fccSafresh1C<ord>, C<utf8::upgrade> (but not C<utf8::downgrade>), and C<chomp>.
323b8851fccSafresh1
324b8851fccSafresh1Also starting in v5.8.1, the C<DATA> pseudo-filehandle is translated from the
325b8851fccSafresh1encoding into UTF-8.
326b8851fccSafresh1
327b8851fccSafresh1For example, you can write code in EUC-JP as follows:
328b39c5158Smillert
329b39c5158Smillert  my $Rakuda = "\xF1\xD1\xF1\xCC"; # Camel in Kanji
330b39c5158Smillert               #<-char-><-char->   # 4 octets
331b39c5158Smillert  s/\bCamel\b/$Rakuda/;
332b39c5158Smillert
333b39c5158SmillertAnd with C<use encoding "euc-jp"> in effect, it is the same thing as
334b8851fccSafresh1that code in UTF-8:
335b39c5158Smillert
336b39c5158Smillert  my $Rakuda = "\x{99F1}\x{99DD}"; # two Unicode Characters
337b39c5158Smillert  s/\bCamel\b/$Rakuda/;
338b39c5158Smillert
339b8851fccSafresh1See L</EXAMPLE> below for a more complete example.
340b39c5158Smillert
341b8851fccSafresh1Unless C<${^UNICODE}> (available starting in v5.8.2) exists and is non-zero, the
342b8851fccSafresh1PerlIO layers of C<STDIN> and C<STDOUT> are set to "C<:encoding(I<ENCNAME>)>".
343b8851fccSafresh1Therefore,
344b39c5158Smillert
345b39c5158Smillert  use encoding "euc-jp";
346b39c5158Smillert  my $message = "Camel is the symbol of perl.\n";
347b39c5158Smillert  my $Rakuda = "\xF1\xD1\xF1\xCC"; # Camel in Kanji
348b39c5158Smillert  $message =~ s/\bCamel\b/$Rakuda/;
349b39c5158Smillert  print $message;
350b39c5158Smillert
351b8851fccSafresh1will print
352b8851fccSafresh1
353b8851fccSafresh1 "\xF1\xD1\xF1\xCC is the symbol of perl.\n"
354b8851fccSafresh1
355b8851fccSafresh1not
356b8851fccSafresh1
357b8851fccSafresh1 "\x{99F1}\x{99DD} is the symbol of perl.\n"
358b39c5158Smillert
359b39c5158SmillertYou can override this by giving extra arguments; see below.
360b39c5158Smillert
361b8851fccSafresh1Note that C<STDERR> WILL NOT be changed, regardless.
362b39c5158Smillert
363b8851fccSafresh1Also note that non-STD file handles remain unaffected.  Use C<use
364b8851fccSafresh1open> or C<binmode> to change the layers of those.
365b8851fccSafresh1
3669f11ffb7Safresh1=item C<use encoding I<ENCNAME>, Filter=E<gt>1;>
367b8851fccSafresh1
368b8851fccSafresh1This operates as above, but the C<Filter> argument with a non-zero
369b8851fccSafresh1value causes the entire script, and not just literals, to be translated from
370b8851fccSafresh1the encoding into UTF-8.  This allows identifiers in the source to be in that
371b8851fccSafresh1encoding as well.  (Problems may occur if the encoding is not a superset of
372b8851fccSafresh1ASCII; imagine all your semi-colons being translated into something
373b8851fccSafresh1different.)  One can use this form to make
374b8851fccSafresh1
375b8851fccSafresh1 ${"\x{4eba}"}++
376b8851fccSafresh1
377b8851fccSafresh1work.  (This is equivalent to C<$I<human>++>, where I<human> is a single Han
378b8851fccSafresh1ideograph).
379b8851fccSafresh1
380b8851fccSafresh1This effectively means that your source code behaves as if it were written in
UTF-8 with C<use utf8> in effect.  So even if your editor only supports
382b8851fccSafresh1Shift_JIS, for example, you can still try examples in Chapter 15 of
383b8851fccSafresh1C<Programming Perl, 3rd Ed.>.
384b8851fccSafresh1
385b8851fccSafresh1This option is significantly slower than the other one.
386b8851fccSafresh1
387b8851fccSafresh1=item C<no encoding;>
388b8851fccSafresh1
389b8851fccSafresh1Unsets the script encoding. The layers of C<STDIN>, C<STDOUT> are
390b8851fccSafresh1reset to "C<:raw>" (the default unprocessed raw stream of bytes).
391b8851fccSafresh1
392b8851fccSafresh1=back
393b8851fccSafresh1
394b8851fccSafresh1=head1 OPTIONS
395b8851fccSafresh1
396b8851fccSafresh1=head2 Setting C<STDIN> and/or C<STDOUT> individually
397b8851fccSafresh1
398b8851fccSafresh1The encodings of C<STDIN> and C<STDOUT> are individually settable by parameters to
399b8851fccSafresh1the pragma:
400b8851fccSafresh1
401b8851fccSafresh1 use encoding 'euc-tw', STDIN => 'greek'  ...;
402b8851fccSafresh1
403b8851fccSafresh1In this case, you cannot omit the first I<ENCNAME>.  C<< STDIN => undef >>
404b8851fccSafresh1turns the I/O transcoding completely off for that filehandle.
405b8851fccSafresh1
406b8851fccSafresh1When C<${^UNICODE}> (available starting in v5.8.2) exists and is non-zero,
407b8851fccSafresh1these options will be completely ignored.  See L<perlvar/C<${^UNICODE}>> and
408b8851fccSafresh1L<"C<-C>" in perlrun|perlrun/-C [numberE<sol>list]> for details.
409b8851fccSafresh1
410b8851fccSafresh1=head2 The C<:locale> sub-pragma
411b8851fccSafresh1
412b8851fccSafresh1Starting in v5.8.6, the encoding name may be C<:locale>.  This means that the
413b8851fccSafresh1encoding is taken from the current locale, and not hard-coded by the pragma.
414b8851fccSafresh1Since a script really can only be encoded in exactly one encoding, this option
415b8851fccSafresh1is dangerous.  It makes sense only if the script itself is written in ASCII,
416b8851fccSafresh1and all the possible locales that will be in use when the script is executed
417b8851fccSafresh1are supersets of ASCII.  That means that the script itself doesn't get
418b8851fccSafresh1changed, but the I/O handles have the specified encoding added, and the
419b8851fccSafresh1operations like C<chr> and C<ord> use that encoding.
420b8851fccSafresh1
421b8851fccSafresh1The logic of finding which locale C<:locale> uses is as follows:
422b8851fccSafresh1
423b8851fccSafresh1=over 4
424b8851fccSafresh1
425b8851fccSafresh1=item 1.
426b8851fccSafresh1
427b8851fccSafresh1If the platform supports the C<langinfo(CODESET)> interface, the codeset
428b8851fccSafresh1returned is used as the default encoding for the open pragma.
429b8851fccSafresh1
430b8851fccSafresh1=item 2.
431b8851fccSafresh1
432b8851fccSafresh1If 1. didn't work but we are under the locale pragma, the environment
433b8851fccSafresh1variables C<LC_ALL> and C<LANG> (in that order) are matched for encodings
434b8851fccSafresh1(the part after "C<.>", if any), and if any found, that is used
435b8851fccSafresh1as the default encoding for the open pragma.
436b8851fccSafresh1
437b8851fccSafresh1=item 3.
438b8851fccSafresh1
439b8851fccSafresh1If 1. and 2. didn't work, the environment variables C<LC_ALL> and C<LANG>
440b8851fccSafresh1(in that order) are matched for anything looking like UTF-8, and if
441b8851fccSafresh1any found, C<:utf8> is used as the default encoding for the open
442b8851fccSafresh1pragma.
443b8851fccSafresh1
444b8851fccSafresh1=back
445b8851fccSafresh1
446b8851fccSafresh1If your locale environment variables (C<LC_ALL>, C<LC_CTYPE>, C<LANG>)
447b8851fccSafresh1contain the strings 'UTF-8' or 'UTF8' (case-insensitive matching),
448b8851fccSafresh1the default encoding of your C<STDIN>, C<STDOUT>, and C<STDERR>, and of
449b8851fccSafresh1B<any subsequent file open>, is UTF-8.
450b8851fccSafresh1
451b8851fccSafresh1=head1 CAVEATS
452b8851fccSafresh1
453b8851fccSafresh1=head2 SIDE EFFECTS
454b8851fccSafresh1
455b8851fccSafresh1=over
456b8851fccSafresh1
457b8851fccSafresh1=item *
458b8851fccSafresh1
If the C<encoding> pragma is in scope then the lengths returned by C<chomp>
are calculated from the length of C<$/> in Unicode characters, which is not
always the same as the length of C<$/> in the native encoding.
462b8851fccSafresh1
463b8851fccSafresh1=item *
464b8851fccSafresh1
465b8851fccSafresh1Without this pragma, if strings operating under byte semantics and strings
466b39c5158Smillertwith Unicode character data are concatenated, the new string will
467b39c5158Smillertbe created by decoding the byte strings as I<ISO 8859-1 (Latin-1)>.
468b39c5158Smillert
469b39c5158SmillertThe B<encoding> pragma changes this to use the specified encoding
470b39c5158Smillertinstead.  For example:
471b39c5158Smillert
472b39c5158Smillert    use encoding 'utf8';
473b39c5158Smillert    my $string = chr(20000); # a Unicode string
474b39c5158Smillert    utf8::encode($string);   # now it's a UTF-8 encoded byte string
475b39c5158Smillert    # concatenate with another Unicode string
476b39c5158Smillert    print length($string . chr(20000));
477b39c5158Smillert
478b39c5158SmillertWill print C<2>, because C<$string> is upgraded as UTF-8.  Without
479b39c5158SmillertC<use encoding 'utf8';>, it will print C<4> instead, since C<$string>
480b39c5158Smillertis three octets when interpreted as Latin-1.
481b39c5158Smillert
482b39c5158Smillert=back
483b39c5158Smillert
484b39c5158Smillert=head2 DO NOT MIX MULTIPLE ENCODINGS
485b39c5158Smillert
486b39c5158SmillertNotice that only literals (string or regular expression) having only
487b39c5158Smillertlegacy code points are affected: if you mix data like this
488b39c5158Smillert
489b8851fccSafresh1    \x{100}\xDF
490b39c5158Smillert    \xDF\x{100}
491b39c5158Smillert
492b39c5158Smillertthe data is assumed to be in (Latin 1 and) Unicode, not in your native
493b39c5158Smillertencoding.  In other words, this will match in "greek":
494b39c5158Smillert
495b39c5158Smillert    "\xDF" =~ /\x{3af}/
496b39c5158Smillert
497b39c5158Smillertbut this will not
498b39c5158Smillert
499b39c5158Smillert    "\xDF\x{100}" =~ /\x{3af}\x{100}/
500b39c5158Smillert
since the C<\xDF> (ISO 8859-7 GREEK SMALL LETTER IOTA WITH TONOS) on
the left will B<not> be upgraded to C<\x{3af}> (Unicode GREEK SMALL
LETTER IOTA WITH TONOS) because of the C<\x{100}> in the same string.  You
should not be mixing your legacy data and Unicode in the same string.
505b39c5158Smillert
506b39c5158SmillertThis pragma also affects encoding of the 0x80..0xFF code point range:
507b39c5158Smillertnormally characters in that range are left as eight-bit bytes (unless
508b39c5158Smillertthey are combined with characters with code points 0x100 or larger,
509b39c5158Smillertin which case all characters need to become UTF-8 encoded), but if
510b39c5158Smillertthe C<encoding> pragma is present, even the 0x80..0xFF range always
511b39c5158Smillertgets UTF-8 encoded.
512b39c5158Smillert
513b39c5158SmillertAfter all, the best thing about this pragma is that you don't have to
514b39c5158Smillertresort to \x{....} just to spell your name in a native encoding.
515b39c5158SmillertSo feel free to put your strings in your encoding in quotes and
516b39c5158Smillertregexes.
517b39c5158Smillert
518b8851fccSafresh1=head2 Prior to Perl v5.22
519b8851fccSafresh1
520b8851fccSafresh1The pragma was a per script, not a per block lexical.  Only the last
521b8851fccSafresh1C<use encoding> or C<no encoding> mattered, and it affected
522b8851fccSafresh1B<the whole script>.  However, the C<no encoding> pragma was supported and
523b8851fccSafresh1C<use encoding> could appear as many times as you want in a given script
524b8851fccSafresh1(though only the last was effective).
525b8851fccSafresh1
526b8851fccSafresh1Since the scope wasn't lexical, other modules' use of C<chr>, C<ord>, I<etc.>
527b8851fccSafresh1were affected.  This leads to spooky, incorrect action at a distance that is
528b8851fccSafresh1hard to debug.
529b8851fccSafresh1
530b8851fccSafresh1This means you would have to be very careful of the load order:
531b8851fccSafresh1
532b8851fccSafresh1  # called module
533b8851fccSafresh1  package Module_IN_BAR;
534b8851fccSafresh1  use encoding "bar";
535b8851fccSafresh1  # stuff in "bar" encoding here
536b8851fccSafresh1  1;
537b8851fccSafresh1
538b8851fccSafresh1  # caller script
  use encoding "foo";
540b8851fccSafresh1  use Module_IN_BAR;
541b8851fccSafresh1  # surprise! use encoding "bar" is in effect.
542b8851fccSafresh1
543b8851fccSafresh1The best way to avoid this oddity is to use this pragma RIGHT AFTER
544b8851fccSafresh1other modules are loaded.  i.e.
545b8851fccSafresh1
546b8851fccSafresh1  use Module_IN_BAR;
547b8851fccSafresh1  use encoding "foo";
548b8851fccSafresh1
549b8851fccSafresh1=head2 Prior to Encode version 1.87
550b8851fccSafresh1
551b8851fccSafresh1=over
552b8851fccSafresh1
553b8851fccSafresh1=item *
554b8851fccSafresh1
C<STDIN> and C<STDOUT> were not set under the filter option.
And C<< STDIN=>I<ENCODING> >> and C<< STDOUT=>I<ENCODING> >> didn't work as
they do in the non-filter version.
558b8851fccSafresh1
559b8851fccSafresh1=item *
560b8851fccSafresh1
561b8851fccSafresh1C<use utf8> wasn't implicitly declared so you have to C<use utf8> to do
562b8851fccSafresh1
563b8851fccSafresh1 ${"\x{4eba}"}++
564b8851fccSafresh1
565b8851fccSafresh1=back
566b8851fccSafresh1
567b8851fccSafresh1=head2 Prior to Perl v5.8.1
568b8851fccSafresh1
569b8851fccSafresh1=over
570b8851fccSafresh1
571b8851fccSafresh1=item "NON-EUC" doublebyte encodings
572b8851fccSafresh1
573b8851fccSafresh1Because perl needs to parse the script before applying this pragma, such
574b8851fccSafresh1encodings as Shift_JIS and Big-5 that may contain C<'\'> (BACKSLASH;
575b8851fccSafresh1C<\x5c>) in the second byte fail because the second byte may
576b8851fccSafresh1accidentally escape the quoting character that follows.
577b8851fccSafresh1
578b8851fccSafresh1=item C<tr///>
579b39c5158Smillert
580b39c5158SmillertThe B<encoding> pragma works by decoding string literals in
C<q//, qq//, qr//, qw//, qx//> and so forth.  In perl v5.8.0, this
582b39c5158Smillertdoes not apply to C<tr///>.  Therefore,
583b39c5158Smillert
584b39c5158Smillert  use encoding 'euc-jp';
585b39c5158Smillert  #....
586b39c5158Smillert  $kana =~ tr/\xA4\xA1-\xA4\xF3/\xA5\xA1-\xA5\xF3/;
587b39c5158Smillert  #           -------- -------- -------- --------
588b39c5158Smillert
589b39c5158SmillertDoes not work as
590b39c5158Smillert
591b39c5158Smillert  $kana =~ tr/\x{3041}-\x{3093}/\x{30a1}-\x{30f3}/;
592b39c5158Smillert
593b39c5158Smillert=over
594b39c5158Smillert
595b39c5158Smillert=item Legend of characters above
596b39c5158Smillert
597b39c5158Smillert  utf8     euc-jp   charnames::viacode()
598b39c5158Smillert  -----------------------------------------
599b39c5158Smillert  \x{3041} \xA4\xA1 HIRAGANA LETTER SMALL A
600b39c5158Smillert  \x{3093} \xA4\xF3 HIRAGANA LETTER N
601b39c5158Smillert  \x{30a1} \xA5\xA1 KATAKANA LETTER SMALL A
602b39c5158Smillert  \x{30f3} \xA5\xF3 KATAKANA LETTER N
603b39c5158Smillert
604b39c5158Smillert=back
605b39c5158Smillert
606b8851fccSafresh1This counterintuitive behavior has been fixed in perl v5.8.1.
607b39c5158Smillert
In perl v5.8.0, you can work around this as follows:
609b39c5158Smillert
610b39c5158Smillert  use encoding 'euc-jp';
611b39c5158Smillert  #  ....
612b39c5158Smillert  eval qq{ \$kana =~ tr/\xA4\xA1-\xA4\xF3/\xA5\xA1-\xA5\xF3/ };
613b39c5158Smillert
614b39c5158SmillertNote the C<tr//> expression is surrounded by C<qq{}>.  The idea behind
615b8851fccSafresh1this is the same as the classic idiom that makes C<tr///> 'interpolate':
616b39c5158Smillert
617b39c5158Smillert   tr/$from/$to/;            # wrong!
618b39c5158Smillert   eval qq{ tr/$from/$to/ }; # workaround.
619b39c5158Smillert
620b8851fccSafresh1=back
621b39c5158Smillert
622b39c5158Smillert=head1 EXAMPLE - Greekperl
623b39c5158Smillert
624b39c5158Smillert    use encoding "iso 8859-7";
625b39c5158Smillert
626b39c5158Smillert    # \xDF in ISO 8859-7 (Greek) is \x{3af} in Unicode.
627b39c5158Smillert
628b39c5158Smillert    $a = "\xDF";
629b39c5158Smillert    $b = "\x{100}";
630b39c5158Smillert
631b39c5158Smillert    printf "%#x\n", ord($a); # will print 0x3af, not 0xdf
632b39c5158Smillert
633b39c5158Smillert    $c = $a . $b;
634b39c5158Smillert
635b39c5158Smillert    # $c will be "\x{3af}\x{100}", not "\x{df}\x{100}".
636b39c5158Smillert
637b39c5158Smillert    # chr() is affected, and ...
638b39c5158Smillert
639b39c5158Smillert    print "mega\n"  if ord(chr(0xdf)) == 0x3af;
640b39c5158Smillert
641b39c5158Smillert    # ... ord() is affected by the encoding pragma ...
642b39c5158Smillert
643b39c5158Smillert    print "tera\n" if ord(pack("C", 0xdf)) == 0x3af;
644b39c5158Smillert
645b39c5158Smillert    # ... as are eq and cmp ...
646b39c5158Smillert
647b39c5158Smillert    print "peta\n" if "\x{3af}" eq  pack("C", 0xdf);
648b39c5158Smillert    print "exa\n"  if ("\x{3af}" cmp pack("C", 0xdf)) == 0;
649b39c5158Smillert
650b39c5158Smillert    # ... but pack/unpack C are not affected, in case you still
651b39c5158Smillert    # want to go back to your native encoding
652b39c5158Smillert
653b39c5158Smillert    print "zetta\n" if unpack("C", (pack("C", 0xdf))) == 0xdf;
654b39c5158Smillert
655b8851fccSafresh1=head1 BUGS
656b39c5158Smillert
657b39c5158Smillert=over
658b39c5158Smillert
659b8851fccSafresh1=item Thread safety
660b8851fccSafresh1
661b8851fccSafresh1C<use encoding ...> is not thread-safe (i.e., do not use in threaded
662b8851fccSafresh1applications).
663b8851fccSafresh1
664b8851fccSafresh1=item Can't be used by more than one module in a single program.
665b8851fccSafresh1
666b8851fccSafresh1Only one encoding is allowed.  If you combine modules in a program that have
667b8851fccSafresh1different encodings, only one will actually be used.
668b8851fccSafresh1
669b8851fccSafresh1=item Other modules using C<STDIN> and C<STDOUT> get the encoded stream
670b8851fccSafresh1
671b8851fccSafresh1They may be expecting something completely different.
672b8851fccSafresh1
673b39c5158Smillert=item literals in regex that are longer than 127 bytes
674b39c5158Smillert
675b39c5158SmillertFor native multibyte encodings (either fixed or variable length),
676b39c5158Smillertthe current implementation of the regular expressions may introduce
677b39c5158Smillertrecoding errors for regular expression literals longer than 127 bytes.
678b39c5158Smillert
679b39c5158Smillert=item EBCDIC
680b39c5158Smillert
681b39c5158SmillertThe encoding pragma is not supported on EBCDIC platforms.
682b39c5158Smillert
683b8851fccSafresh1=item C<format>
684b39c5158Smillert
685b8851fccSafresh1This pragma doesn't work well with C<format> because PerlIO does not
686b8851fccSafresh1get along very well with it.  When C<format> contains non-ASCII
687b39c5158Smillertcharacters, it prints funny or triggers "Wide character" warnings.
688b39c5158SmillertTo understand it, try the code below.
689b39c5158Smillert
690b39c5158Smillert  # Save this one in utf8
691b39c5158Smillert  # replace *non-ascii* with a non-ascii string
692b39c5158Smillert  my $camel;
693b39c5158Smillert  format STDOUT =
694b39c5158Smillert  *non-ascii*@>>>>>>>
695b39c5158Smillert  $camel
696b39c5158Smillert  .
697b39c5158Smillert  $camel = "*non-ascii*";
698b39c5158Smillert  binmode(STDOUT=>':encoding(utf8)'); # bang!
699b39c5158Smillert  write;              # funny
700b39c5158Smillert  print $camel, "\n"; # fine
701b39c5158Smillert
702b39c5158SmillertWithout binmode this happens to work, but then print()
703b39c5158Smillertfails instead of write().
704b39c5158Smillert
705b8851fccSafresh1At any rate, the very use of C<format> is questionable when it comes to
706b39c5158SmillertUnicode characters since you have to consider such things as character
707b39c5158Smillertwidth (e.g. double-width for ideographs) and directions (e.g. BIDI for
708b39c5158SmillertArabic and Hebrew).
709b39c5158Smillert
710b8851fccSafresh1=item See also L</CAVEATS>
711b39c5158Smillert
712b39c5158Smillert=back
713b39c5158Smillert
714b39c5158Smillert=head1 HISTORY
715b39c5158Smillert
716b8851fccSafresh1This pragma first appeared in Perl v5.8.0.  It has been enhanced in later
717b8851fccSafresh1releases as specified above.
718b39c5158Smillert
719b39c5158Smillert=head1 SEE ALSO
720b39c5158Smillert
721b39c5158SmillertL<perlunicode>, L<Encode>, L<open>, L<Filter::Util::Call>,
722b39c5158Smillert
723b39c5158SmillertCh. 15 of C<Programming Perl (3rd Edition)>
724b39c5158Smillertby Larry Wall, Tom Christiansen, Jon Orwant;
725b39c5158SmillertO'Reilly & Associates; ISBN 0-596-00027-8
726b39c5158Smillert
727b39c5158Smillert=cut
728