1package ExtUtils::MakeMaker::Locale; 2 3use strict; 4use warnings; 5our $VERSION = "7.64"; 6$VERSION =~ tr/_//d; 7 8use base 'Exporter'; 9our @EXPORT_OK = qw( 10 decode_argv env 11 $ENCODING_LOCALE $ENCODING_LOCALE_FS 12 $ENCODING_CONSOLE_IN $ENCODING_CONSOLE_OUT 13); 14 15use Encode (); 16use Encode::Alias (); 17 18our $ENCODING_LOCALE; 19our $ENCODING_LOCALE_FS; 20our $ENCODING_CONSOLE_IN; 21our $ENCODING_CONSOLE_OUT; 22 23sub DEBUG () { 0 } 24 25sub _init { 26 if ($^O eq "MSWin32") { 27 unless ($ENCODING_LOCALE) { 28 # Try to obtain what the Windows ANSI code page is 29 eval { 30 unless (defined &GetConsoleCP) { 31 require Win32; 32 # manually "import" it since Win32->import refuses 33 *GetConsoleCP = sub { &Win32::GetConsoleCP } if defined &Win32::GetConsoleCP; 34 } 35 unless (defined &GetConsoleCP) { 36 require Win32::API; 37 Win32::API->Import('kernel32', 'int GetConsoleCP()'); 38 } 39 if (defined &GetConsoleCP) { 40 my $cp = GetConsoleCP(); 41 $ENCODING_LOCALE = "cp$cp" if $cp; 42 } 43 }; 44 } 45 46 unless ($ENCODING_CONSOLE_IN) { 47 # only test one since set together 48 unless (defined &GetInputCP) { 49 eval { 50 require Win32; 51 eval { 52 local $SIG{__WARN__} = sub {} if ( "$]" < 5.014 ); # suppress deprecation warning for inherited AUTOLOAD of Win32::GetConsoleCP() 53 Win32::GetConsoleCP(); 54 }; 55 # manually "import" it since Win32->import refuses 56 *GetInputCP = sub { &Win32::GetConsoleCP } if defined &Win32::GetConsoleCP; 57 *GetOutputCP = sub { &Win32::GetConsoleOutputCP } if defined &Win32::GetConsoleOutputCP; 58 }; 59 unless (defined &GetInputCP) { 60 eval { 61 # try Win32::Console module for codepage to use 62 require Win32::Console; 63 *GetInputCP = sub { &Win32::Console::InputCP } 64 if defined &Win32::Console::InputCP; 65 *GetOutputCP = sub { &Win32::Console::OutputCP } 66 if defined &Win32::Console::OutputCP; 67 }; 68 } 69 unless (defined &GetInputCP) { 70 # final fallback 71 *GetInputCP = *GetOutputCP = sub { 72 # another fallback that could work is: 73 # reg query HKLM\System\CurrentControlSet\Control\Nls\CodePage /v ACP 74 ((qx(chcp) || '') =~ /^Active code page: (\d+)/) 75 ? $1 : (); 76 }; 77 } 78 } 79 my $cp = GetInputCP(); 80 $ENCODING_CONSOLE_IN = "cp$cp" if $cp; 81 $cp = GetOutputCP(); 82 $ENCODING_CONSOLE_OUT = "cp$cp" if $cp; 83 } 84 } 85 86 unless ($ENCODING_LOCALE) { 87 eval { 88 require I18N::Langinfo; 89 $ENCODING_LOCALE = I18N::Langinfo::langinfo(I18N::Langinfo::CODESET()); 90 91 # Workaround of Encode < v2.25. The "646" encoding alias was 92 # introduced in Encode-2.25, but we don't want to require that version 93 # quite yet. Should avoid the CPAN testers failure reported from 94 # openbsd-4.7/perl-5.10.0 combo. 95 $ENCODING_LOCALE = "ascii" if $ENCODING_LOCALE eq "646"; 96 97 # https://rt.cpan.org/Ticket/Display.html?id=66373 98 $ENCODING_LOCALE = "hp-roman8" if $^O eq "hpux" && $ENCODING_LOCALE eq "roman8"; 99 }; 100 $ENCODING_LOCALE ||= $ENCODING_CONSOLE_IN; 101 } 102 103 # Workaround of Encode < v2.71 for "cp65000" and "cp65001" 104 # The "cp65000" and "cp65001" aliases were added in [Encode v2.71](https://github.com/dankogai/p5-encode/commit/7874bd95aa10967a3b5dbae333d16bcd703ac6c6) 105 # via commit <https://github.com/dankogai/p5-encode/commit/84b9c1101d5251d37e226f80d1c6781718779047>. 106 # This will avoid test failures for Win32 machines using the UTF-7 or UTF-8 code pages. 107 $ENCODING_LOCALE = 'UTF-7' if $ENCODING_LOCALE && lc($ENCODING_LOCALE) eq "cp65000"; 108 $ENCODING_LOCALE = 'utf-8-strict' if $ENCODING_LOCALE && lc($ENCODING_LOCALE) eq "cp65001"; 109 110 if ($^O eq "darwin") { 111 $ENCODING_LOCALE_FS ||= "UTF-8"; 112 } 113 114 # final fallback 115 $ENCODING_LOCALE ||= $^O eq "MSWin32" ? "cp1252" : "UTF-8"; 116 $ENCODING_LOCALE_FS ||= $ENCODING_LOCALE; 117 $ENCODING_CONSOLE_IN ||= $ENCODING_LOCALE; 118 $ENCODING_CONSOLE_OUT ||= $ENCODING_CONSOLE_IN; 119 120 unless (Encode::find_encoding($ENCODING_LOCALE)) { 121 my $foundit; 122 if (lc($ENCODING_LOCALE) eq "gb18030") { 123 eval { 124 require Encode::HanExtra; 125 }; 126 if ($@) { 127 die "Need Encode::HanExtra to be installed to support locale codeset ($ENCODING_LOCALE), stopped"; 128 } 129 $foundit++ if Encode::find_encoding($ENCODING_LOCALE); 130 } 131 die "The locale codeset ($ENCODING_LOCALE) isn't one that perl can decode, stopped" 132 unless $foundit; 133 134 } 135 136 # use Data::Dump; ddx $ENCODING_LOCALE, $ENCODING_LOCALE_FS, $ENCODING_CONSOLE_IN, $ENCODING_CONSOLE_OUT; 137} 138 139_init(); 140Encode::Alias::define_alias(sub { 141 no strict 'refs'; 142 no warnings 'once'; 143 return ${"ENCODING_" . uc(shift)}; 144}, "locale"); 145 146sub _flush_aliases { 147 no strict 'refs'; 148 for my $a (sort keys %Encode::Alias::Alias) { 149 if (defined ${"ENCODING_" . uc($a)}) { 150 delete $Encode::Alias::Alias{$a}; 151 warn "Flushed alias cache for $a" if DEBUG; 152 } 153 } 154} 155 156sub reinit { 157 $ENCODING_LOCALE = shift; 158 $ENCODING_LOCALE_FS = shift; 159 $ENCODING_CONSOLE_IN = $ENCODING_LOCALE; 160 $ENCODING_CONSOLE_OUT = $ENCODING_LOCALE; 161 _init(); 162 _flush_aliases(); 163} 164 165sub decode_argv { 166 die if defined wantarray; 167 for (@ARGV) { 168 $_ = Encode::decode(locale => $_, @_); 169 } 170} 171 172sub env { 173 my $k = Encode::encode(locale => shift); 174 my $old = $ENV{$k}; 175 if (@_) { 176 my $v = shift; 177 if (defined $v) { 178 $ENV{$k} = Encode::encode(locale => $v); 179 } 180 else { 181 delete $ENV{$k}; 182 } 183 } 184 return Encode::decode(locale => $old) if defined wantarray; 185} 186 1871; 188 189__END__ 190 191=head1 NAME 192 193ExtUtils::MakeMaker::Locale - bundled Encode::Locale 194 195=head1 SYNOPSIS 196 197 use Encode::Locale; 198 use Encode; 199 200 $string = decode(locale => $bytes); 201 $bytes = encode(locale => $string); 202 203 if (-t) { 204 binmode(STDIN, ":encoding(console_in)"); 205 binmode(STDOUT, ":encoding(console_out)"); 206 binmode(STDERR, ":encoding(console_out)"); 207 } 208 209 # Processing file names passed in as arguments 210 my $uni_filename = decode(locale => $ARGV[0]); 211 open(my $fh, "<", encode(locale_fs => $uni_filename)) 212 || die "Can't open '$uni_filename': $!"; 213 binmode($fh, ":encoding(locale)"); 214 ... 215 216=head1 DESCRIPTION 217 218In many applications it's wise to let Perl use Unicode for the strings it 219processes. Most of the interfaces Perl has to the outside world are still byte 220based. Programs therefore need to decode byte strings that enter the program 221from the outside and encode them again on the way out. 222 223The POSIX locale system is used to specify both the language conventions 224requested by the user and the preferred character set to consume and 225output. The C<Encode::Locale> module looks up the charset and encoding (called 226a CODESET in the locale jargon) and arranges for the L<Encode> module to know 227this encoding under the name "locale". It means bytes obtained from the 228environment can be converted to Unicode strings by calling C<< 229Encode::encode(locale => $bytes) >> and converted back again with C<< 230Encode::decode(locale => $string) >>. 231 232Where file systems interfaces pass file names in and out of the program we also 233need care. The trend is for operating systems to use a fixed file encoding 234that don't actually depend on the locale; and this module determines the most 235appropriate encoding for file names. The L<Encode> module will know this 236encoding under the name "locale_fs". For traditional Unix systems this will 237be an alias to the same encoding as "locale". 238 239For programs running in a terminal window (called a "Console" on some systems) 240the "locale" encoding is usually a good choice for what to expect as input and 241output. Some systems allows us to query the encoding set for the terminal and 242C<Encode::Locale> will do that if available and make these encodings known 243under the C<Encode> aliases "console_in" and "console_out". For systems where 244we can't determine the terminal encoding these will be aliased as the same 245encoding as "locale". The advice is to use "console_in" for input known to 246come from the terminal and "console_out" for output to the terminal. 247 248In addition to arranging for various Encode aliases the following functions and 249variables are provided: 250 251=over 252 253=item decode_argv( ) 254 255=item decode_argv( Encode::FB_CROAK ) 256 257This will decode the command line arguments to perl (the C<@ARGV> array) in-place. 258 259The function will by default replace characters that can't be decoded by 260"\x{FFFD}", the Unicode replacement character. 261 262Any argument provided is passed as CHECK to underlying Encode::decode() call. 263Pass the value C<Encode::FB_CROAK> to have the decoding croak if not all the 264command line arguments can be decoded. See L<Encode/"Handling Malformed Data"> 265for details on other options for CHECK. 266 267=item env( $uni_key ) 268 269=item env( $uni_key => $uni_value ) 270 271Interface to get/set environment variables. Returns the current value as a 272Unicode string. The $uni_key and $uni_value arguments are expected to be 273Unicode strings as well. Passing C<undef> as $uni_value deletes the 274environment variable named $uni_key. 275 276The returned value will have the characters that can't be decoded replaced by 277"\x{FFFD}", the Unicode replacement character. 278 279There is no interface to request alternative CHECK behavior as for 280decode_argv(). If you need that you need to call encode/decode yourself. 281For example: 282 283 my $key = Encode::encode(locale => $uni_key, Encode::FB_CROAK); 284 my $uni_value = Encode::decode(locale => $ENV{$key}, Encode::FB_CROAK); 285 286=item reinit( ) 287 288=item reinit( $encoding ) 289 290Reinitialize the encodings from the locale. You want to call this function if 291you changed anything in the environment that might influence the locale. 292 293This function will croak if the determined encoding isn't recognized by 294the Encode module. 295 296With argument force $ENCODING_... variables to set to the given value. 297 298=item $ENCODING_LOCALE 299 300The encoding name determined to be suitable for the current locale. 301L<Encode> know this encoding as "locale". 302 303=item $ENCODING_LOCALE_FS 304 305The encoding name determined to be suitable for file system interfaces 306involving file names. 307L<Encode> know this encoding as "locale_fs". 308 309=item $ENCODING_CONSOLE_IN 310 311=item $ENCODING_CONSOLE_OUT 312 313The encodings to be used for reading and writing output to the a console. 314L<Encode> know these encodings as "console_in" and "console_out". 315 316=back 317 318=head1 NOTES 319 320This table summarizes the mapping of the encodings set up 321by the C<Encode::Locale> module: 322 323 Encode | | | 324 Alias | Windows | Mac OS X | POSIX 325 ------------+---------+--------------+------------ 326 locale | ANSI | nl_langinfo | nl_langinfo 327 locale_fs | ANSI | UTF-8 | nl_langinfo 328 console_in | OEM | nl_langinfo | nl_langinfo 329 console_out | OEM | nl_langinfo | nl_langinfo 330 331=head2 Windows 332 333Windows has basically 2 sets of APIs. A wide API (based on passing UTF-16 334strings) and a byte based API based a character set called ANSI. The 335regular Perl interfaces to the OS currently only uses the ANSI APIs. 336Unfortunately ANSI is not a single character set. 337 338The encoding that corresponds to ANSI varies between different editions of 339Windows. For many western editions of Windows ANSI corresponds to CP-1252 340which is a character set similar to ISO-8859-1. Conceptually the ANSI 341character set is a similar concept to the POSIX locale CODESET so this module 342figures out what the ANSI code page is and make this available as 343$ENCODING_LOCALE and the "locale" Encoding alias. 344 345Windows systems also operate with another byte based character set. 346It's called the OEM code page. This is the encoding that the Console 347takes as input and output. It's common for the OEM code page to 348differ from the ANSI code page. 349 350=head2 Mac OS X 351 352On Mac OS X the file system encoding is always UTF-8 while the locale 353can otherwise be set up as normal for POSIX systems. 354 355File names on Mac OS X will at the OS-level be converted to 356NFD-form. A file created by passing a NFC-filename will come 357in NFD-form from readdir(). See L<Unicode::Normalize> for details 358of NFD/NFC. 359 360Actually, Apple does not follow the Unicode NFD standard since not all 361character ranges are decomposed. The claim is that this avoids problems with 362round trip conversions from old Mac text encodings. See L<Encode::UTF8Mac> for 363details. 364 365=head2 POSIX (Linux and other Unixes) 366 367File systems might vary in what encoding is to be used for 368filenames. Since this module has no way to actually figure out 369what the is correct it goes with the best guess which is to 370assume filenames are encoding according to the current locale. 371Users are advised to always specify UTF-8 as the locale charset. 372 373=head1 SEE ALSO 374 375L<I18N::Langinfo>, L<Encode>, L<Term::Encoding> 376 377=head1 AUTHOR 378 379Copyright 2010 Gisle Aas <gisle@aas.no>. 380 381This library is free software; you can redistribute it and/or 382modify it under the same terms as Perl itself. 383 384=cut 385