#
# $Id: Encode.pm,v 3.21 2024/02/25 22:17:32 dankogai Exp $
#
package Encode;
use strict;
use warnings;
use constant DEBUG => !!$ENV{PERL_ENCODE_DEBUG};
our $VERSION;
BEGIN {
    $VERSION = sprintf "%d.%02d", q$Revision: 3.21 $ =~ /(\d+)/g;
    require XSLoader;
    XSLoader::load( __PACKAGE__, $VERSION );
}

use Exporter 5.57 'import';

use Carp ();
our @CARP_NOT = qw(Encode::Encoder);

# Public, encouraged API is exported by default

our @EXPORT = qw(
  decode decode_utf8 encode encode_utf8 str2bytes bytes2str
  encodings find_encoding find_mime_encoding clone_encoding
);
our @FB_FLAGS = qw(
  DIE_ON_ERR WARN_ON_ERR RETURN_ON_ERR LEAVE_SRC
  PERLQQ HTMLCREF XMLCREF STOP_AT_PARTIAL
);
our @FB_CONSTS = qw(
  FB_DEFAULT FB_CROAK FB_QUIET FB_WARN
  FB_PERLQQ FB_HTMLCREF FB_XMLCREF
);
our @EXPORT_OK = (
    qw(
      _utf8_off _utf8_on define_encoding from_to is_16bit is_8bit
      is_utf8 perlio_ok resolve_alias utf8_downgrade utf8_upgrade
      ),
    @FB_FLAGS, @FB_CONSTS,
);

our %EXPORT_TAGS = (
    all          => [ @EXPORT,    @EXPORT_OK ],
    default      => [ @EXPORT ],
    fallbacks    => [ @FB_CONSTS ],
    fallback_all => [ @FB_CONSTS, @FB_FLAGS ],
);

# Documentation moved after __END__ for speed - NI-S

our $ON_EBCDIC = ( ord("A") == 193 );

use Encode::Alias ();
use Encode::MIME::Name;

use Storable;

# Make a %Encoding package variable to allow a certain amount of cheating
our %Encoding;
our %ExtModule;
require Encode::Config;
#  See
#  https://bugzilla.redhat.com/show_bug.cgi?id=435505#c2
#  to find why sig handlers inside eval{} are disabled.
eval {
    local $SIG{__DIE__};
    local $SIG{__WARN__};
    local @INC = @INC;
    pop @INC if @INC && $INC[-1] eq '.';
    require Encode::ConfigLocal;
};

# encodings( [ ":all" | MODULE ... ] )
#
# Returns the names of known encodings, sorted case-insensitively.
# Normally called as a class method, so the first real argument is $_[1].
#   - ":all"  : names from both %Encoding and %ExtModule.
#   - else    : names from %Encoding, plus every %ExtModule entry whose
#               module equals one of the arguments ("Encode::" is prepended
#               to arguments that contain no "::").
# The names "Internal", "Unicode" and "Guess" are never returned.
sub encodings {
    my %enc;
    my $arg = $_[1] || '';
    if ( $arg eq ":all" ) {
        %enc = ( %Encoding, %ExtModule );
    }
    else {
        %enc = %Encoding;
        for my $mod ( map { m/::/ ? $_ : "Encode::$_" } @_ ) {
            DEBUG and warn $mod;
            for my $enc ( keys %ExtModule ) {
                $ExtModule{$enc} eq $mod and $enc{$enc} = $mod;
            }
        }
    }
    return sort { lc $a cmp lc $b }
      grep { !/^(?:Internal|Unicode|Guess)$/o } keys %enc;
}

# perlio_ok( ENCODING )
#
# ENCODING is either an encoding object (any reference) or a name that is
# looked up with find_encoding().  Returns whatever the object's own
# perlio_ok() method returns, or 0 when the encoding is unknown or the
# object has no such method.
sub perlio_ok {
    my $obj = ref( $_[0] ) ? $_[0] : find_encoding( $_[0] );

    # find_encoding() yields undef for an unknown name; report "not ok"
    # rather than dying on a method call on undef.
    defined $obj or return 0;
    $obj->can("perlio_ok") and return $obj->perlio_ok();
    return 0;    # safety net
}

# define_encoding( $obj, $name [, @aliases] )
#
# Registers $obj in %Encoding under $name, aliases the lowercased name to
# it when that differs from $name, and defines every further argument as an
# alias.  The object's class is appended (once) to the @CARP_NOT lists of
# Encode and Encode::Encoding.  Returns $obj.
sub define_encoding {
    my $obj  = shift;
    my $name = shift;
    $Encoding{$name} = $obj;
    my $lc = lc($name);
    define_alias( $lc => $obj ) unless $lc eq $name;
    while (@_) {
        my $alias = shift;
        define_alias( $alias, $obj );
    }
    my $class = ref($obj);
    push @Encode::CARP_NOT, $class
      unless grep { $_ eq $class } @Encode::CARP_NOT;
    push @Encode::Encoding::CARP_NOT, $class
      unless grep { $_ eq $class } @Encode::Encoding::CARP_NOT;
    return $obj;
}

# $class->getEncoding( $name [, $skip_external] )
#
# Resolves $name to an encoding object.  Lookup order:
#   1. $name itself, if it is a reference whose object can('renew');
#   2. %Encoding under $name, then under lc($name);
#   3. $class->find_alias() for $name, then for lc($name);
#   4. unless $skip_external is true, the module listed in %ExtModule for
#      $name or lc($name) is loaded and %Encoding is consulted again.
# All whitespace is removed from $name first.  Returns nothing (undef in
# scalar context) when $name is undefined or cannot be resolved.
sub getEncoding {
    my ( $class, $name, $skip_external ) = @_;

    defined($name) or return;

    $name =~ s/\s+//g;    # https://rt.cpan.org/Ticket/Display.html?id=65796

    ref($name) && $name->can('renew') and return $name;
    exists $Encoding{$name} and return $Encoding{$name};
    my $lc = lc $name;
    exists $Encoding{$lc} and return $Encoding{$lc};

    my $oc = $class->find_alias($name);
    defined($oc) and return $oc;
    $lc ne $name and $oc = $class->find_alias($lc);
    defined($oc) and return $oc;

    unless ($skip_external) {
        if ( my $mod = $ExtModule{$name} || $ExtModule{$lc} ) {
            $mod =~ s,::,/,g;
            $mod .= '.pm';

            # Best effort: a module that fails to load simply leaves the
            # encoding undefined, and we fall through to the bare return.
            eval { require $mod; };
            exists $Encoding{$name} and return $Encoding{$name};

            # The module may have been found through the lowercase key, in
            # which case the encoding it registered is keyed by $lc.
            exists $Encoding{$lc} and return $Encoding{$lc};
        }
    }
    return;
}

# HACK: These two functions must be defined in Encode and because of
# cyclic dependency between Encode and Encode::Alias, Exporter does not work
sub find_alias {
    goto &Encode::Alias::find_alias;
}

sub define_alias {
    goto &Encode::Alias::define_alias;
}

# find_encoding( $name [, $skip_external] )
#
# Function-style front end to Encode->getEncoding().
sub find_encoding($;$) {
    my ( $name, $skip_external ) = @_;
    return __PACKAGE__->getEncoding( $name, $skip_external );
}

# find_mime_encoding( $mime_name [, $skip_external] )
#
# Translates $mime_name with Encode::MIME::Name::get_encode_name() and
# looks the result up with find_encoding().
sub find_mime_encoding($;$) {
    my ( $mime_name, $skip_external ) = @_;
    my $name = Encode::MIME::Name::get_encode_name($mime_name);
    return find_encoding( $name, $skip_external );
}

# resolve_alias( $name )
#
# Returns the name() of the encoding object found for $name, or nothing
# when no encoding is found.
sub resolve_alias($) {
    my $obj = find_encoding(shift);
    defined $obj and return $obj->name;
    return;
}

# clone_encoding( $name )
#
# Returns a deep copy (Storable::dclone) of the encoding object found for
# $name, or nothing when the lookup does not yield a reference.
sub clone_encoding($) {
    my $obj = find_encoding(shift);
    ref $obj or return;
    return Storable::dclone($obj);
}

# onBOOT is not defined in this file; presumably it is supplied by the XS
# code loaded through XSLoader above -- TODO confirm against Encode.xs.
onBOOT;

# On EBCDIC platforms, register an "Encode::UTF_EBCDIC" object under the
# name 'Unicode'.  decode() maps each character's ordinal through
# utf8::unicode_to_native(); encode() maps it through
# utf8::native_to_unicode().  In both, a true $chk empties the caller's
# source argument ($_[1] aliases it).
if ($ON_EBCDIC) {
    package Encode::UTF_EBCDIC;
    use parent 'Encode::Encoding';
    my $obj = bless { Name => "UTF_EBCDIC" } => "Encode::UTF_EBCDIC";
    Encode::define_encoding( $obj, 'Unicode' );

    sub decode {
        my ( undef, $str, $chk ) = @_;
        my $res = '';
        for ( my $i = 0 ; $i < length($str) ; $i++ ) {
            $res .=
              chr(
                utf8::unicode_to_native( ord( substr( $str, $i, 1 ) ) )
              );
        }
        $_[1] = '' if $chk;
        return $res;
    }

    sub encode {
        my ( undef, $str, $chk ) = @_;
        my $res = '';
        for ( my $i = 0 ; $i < length($str) ; $i++ ) {
            $res .=
              chr(
                utf8::native_to_unicode( ord( substr( $str, $i, 1 ) ) )
              );
        }
        $_[1] = '' if $chk;
        return $res;
    }
}

{
    # https://rt.cpan.org/Public/Bug/Display.html?id=103253
    package Encode::XS;
    use parent 'Encode::Encoding';
}

{
    package Encode::utf8;
    use parent 'Encode::Encoding';
    my %obj = (
        'utf8'         => { Name => 'utf8' },
        'utf-8-strict' => { Name => 'utf-8-strict', strict_utf8 => 1 }
    );
    for ( keys %obj ) {
        bless $obj{$_} => __PACKAGE__;
        Encode::define_encoding( $obj{$_} => $_ );
    }

    # cat_decode( $obj, $dst, $src, $pos, $trm, $chk )
    #
    # Searches $src, byte-wise and starting at $pos, for the terminator
    # $trm.  When found, everything from $pos through the end of the
    # terminator is appended to $dst, $pos is moved just past it and 1 is
    # returned.  Otherwise the rest of $src is appended to $dst, $pos is set
    # to length($src) and '' is returned.  $dst and $pos are updated in the
    # caller through references to the aliased @_ elements.
    sub cat_decode {
        # ($obj, $dst, $src, $pos, $trm, $chk)
        # currently ignores $chk
        my ( undef, undef, undef, $pos, $trm ) = @_;
        my ( $rdst, $rsrc, $rpos ) = \@_[ 1, 2, 3 ];
        use bytes;
        if ( ( my $npos = index( $$rsrc, $trm, $pos ) ) >= 0 ) {
            $$rdst .=
              substr( $$rsrc, $pos, $npos - $pos + length($trm) );
            $$rpos = $npos + length($trm);
            return 1;
        }
        $$rdst .= substr( $$rsrc, $pos );
        $$rpos = length($$rsrc);
        return '';
    }
}

1;

__END__

=head1 NAME

Encode - character encodings in Perl

=head1 SYNOPSIS

    use Encode qw(decode encode);
    $characters = decode('UTF-8', $octets, Encode::FB_CROAK);
    $octets = encode('UTF-8', $characters, Encode::FB_CROAK);

=head2 Table of Contents

Encode consists of a collection of modules whose details are too extensive
to fit in one document. This one itself explains the top-level APIs
and general topics at a glance. For other topics and more details,
see the documentation for these modules:

=over 2

=item L<Encode::Alias> - Alias definitions to encodings

=item L<Encode::Encoding> - Encode Implementation Base Class

=item L<Encode::Supported> - List of Supported Encodings

=item L<Encode::CN> - Simplified Chinese Encodings

=item L<Encode::JP> - Japanese Encodings

=item L<Encode::KR> - Korean Encodings

=item L<Encode::TW> - Traditional Chinese Encodings

=back

=head1 DESCRIPTION

The C<Encode> module provides the interface between Perl strings
and the rest of the system. Perl strings are sequences of
I<characters>.

The repertoire of characters that Perl can represent is a superset of those
defined by the Unicode Consortium. On most platforms the ordinal
values of a character as returned by C<ord(I<S>)> is the I<Unicode
codepoint> for that character. The exceptions are platforms where
the legacy encoding is some variant of EBCDIC rather than a superset
of ASCII; see L<perlebcdic>.

During recent history, data is moved around a computer in 8-bit chunks,
often called "bytes" but also known as "octets" in standards documents.
Perl is widely used to manipulate data of many types: not only strings of
characters representing human or computer languages, but also "binary"
data, being the machine's representation of numbers, pixels in an image, or
just about anything.

When Perl is processing "binary data", the programmer wants Perl to
process "sequences of bytes". This is not a problem for Perl: because a
byte has 256 possible values, it easily fits in Perl's much larger
"logical character".

This document mostly explains the I<how>. L<perlunitut> and L<perlunifaq>
explain the I<why>.

=head2 TERMINOLOGY

=head3 character

A character in the range 0 .. 2**32-1 (or more);
what Perl's strings are made of.

=head3 byte

A character in the range 0..255;
a special case of a Perl character.

=head3 octet

8 bits of data, with ordinal values 0..255;
term for bytes passed to or from a non-Perl context, such as a disk file,
standard I/O stream, database, command-line argument, environment variable,
socket etc.

=head1 THE PERL ENCODING API

=head2 Basic methods

=head3 encode

  $octets = encode(ENCODING, STRING[, CHECK])

Encodes the scalar value I<STRING> from Perl's internal form into
I<ENCODING> and returns a sequence of octets. I<ENCODING> can be either a
canonical name or an alias. For encoding names and aliases, see
L</"Defining Aliases">. For CHECK, see L</"Handling Malformed Data">.

B<CAVEAT>: the input scalar I<STRING> might be modified in-place depending
on what is set in CHECK. See L</LEAVE_SRC> if you want your inputs to be
left unchanged.

For example, to convert a string from Perl's internal format into
ISO-8859-1, also known as Latin1:

  $octets = encode("iso-8859-1", $string);

B<CAVEAT>: When you run C<$octets = encode("UTF-8", $string)>, then
$octets I<might not be equal to> $string.
Though both contain the
same data, the UTF8 flag for $octets is I<always> off. When you
encode anything, the UTF8 flag on the result is always off, even when it
contains a completely valid UTF-8 string. See L</"The UTF8 flag"> below.

If the $string is C<undef>, then C<undef> is returned.

C<str2bytes> may be used as an alias for C<encode>.

=head3 decode

  $string = decode(ENCODING, OCTETS[, CHECK])

This function returns the string that results from decoding the scalar
value I<OCTETS>, assumed to be a sequence of octets in I<ENCODING>, into
Perl's internal form. As with encode(),
I<ENCODING> can be either a canonical name or an alias. For encoding names
and aliases, see L</"Defining Aliases">; for I<CHECK>, see L</"Handling
Malformed Data">.

B<CAVEAT>: the input scalar I<OCTETS> might be modified in-place depending
on what is set in CHECK. See L</LEAVE_SRC> if you want your inputs to be
left unchanged.

For example, to convert ISO-8859-1 data into a string in Perl's
internal format:

  $string = decode("iso-8859-1", $octets);

B<CAVEAT>: When you run C<$string = decode("UTF-8", $octets)>, then $string
I<might not be equal to> $octets. Though both contain the same data, the
UTF8 flag for $string is on. See L</"The UTF8 flag">
below.

If the $octets is C<undef>, then C<undef> is returned.

C<bytes2str> may be used as an alias for C<decode>.

=head3 find_encoding

  [$obj =] find_encoding(ENCODING)

Returns the I<encoding object> corresponding to I<ENCODING>. Returns
C<undef> if no matching I<ENCODING> is found. The returned object is
what does the actual encoding or decoding.

  $string = decode($name, $bytes);

is in fact

  $string = do {
      $obj = find_encoding($name);
      croak qq(encoding "$name" not found) unless ref $obj;
      $obj->decode($bytes);
  };

with more error checking.

You can therefore save time by reusing this object as follows:

  my $enc = find_encoding("iso-8859-1");
  while(<>) {
      my $string = $enc->decode($_);
      ... # now do something with $string;
  }

Besides L</decode> and L</encode>, other methods are
available as well. For instance, C<name()> returns the canonical
name of the encoding object.

  find_encoding("latin1")->name; # iso-8859-1

See L<Encode::Encoding> for details.

=head3 find_mime_encoding

  [$obj =] find_mime_encoding(MIME_ENCODING)

Returns the I<encoding object> corresponding to I<MIME_ENCODING>. Acts
the same as C<find_encoding()>, but the C<mime_name()> of the returned
object must match I<MIME_ENCODING>. So, unlike C<find_encoding()>,
canonical names and aliases are not used when searching for the object.

  find_mime_encoding("utf8"); # returns undef because "utf8" is not a valid MIME_ENCODING
  find_mime_encoding("utf-8"); # returns encode object "utf-8-strict"
  find_mime_encoding("UTF-8"); # same as "utf-8" because MIME_ENCODING is case insensitive
  find_mime_encoding("utf-8-strict"); # returns undef because "utf-8-strict" is not a valid MIME_ENCODING

=head3 from_to

  [$length =] from_to($octets, FROM_ENC, TO_ENC [, CHECK])

Converts I<in-place> data between two encodings. The data in $octets
must be encoded as octets and I<not> as characters in Perl's internal
format. For example, to convert ISO-8859-1 data into Microsoft's CP1250
encoding:

  from_to($octets, "iso-8859-1", "cp1250");

and to convert it back:

  from_to($octets, "cp1250", "iso-8859-1");

Because the conversion happens in place, the data to be
converted cannot be a string constant: it must be a scalar variable.

C<from_to()> returns the length of the converted string in octets on success,
and C<undef> on error.

B<CAVEAT>: The following operations may look the same, but are not:

  from_to($data, "iso-8859-1", "UTF-8"); #1
  $data = decode("iso-8859-1", $data); #2

Both #1 and #2 make $data consist of a completely valid UTF-8 string,
but only #2 turns the UTF8 flag on. #1 is equivalent to:

  $data = encode("UTF-8", decode("iso-8859-1", $data));

See L</"The UTF8 flag"> below.

Also note that:

  from_to($octets, $from, $to, $check);

is equivalent to:

  $octets = encode($to, decode($from, $octets), $check);

Yes, it does I<not> respect the $check during decoding. It is
deliberately done that way. If you need minute control, use C<decode>
followed by C<encode> as follows:

  $octets = encode($to, decode($from, $octets, $check_from), $check_to);

=head3 encode_utf8

  $octets = encode_utf8($string);

B<WARNING>: L<This function can produce invalid UTF-8!|/UTF-8 vs. utf8 vs. UTF8>
Do not use it for data exchange.
Unless you want Perl's older "lax" mode, prefer
C<$octets = encode("UTF-8", $string)>.

Equivalent to C<$octets = encode("utf8", $string)>. The characters in
$string are encoded in Perl's internal format, and the result is returned
as a sequence of octets. Because all possible characters in Perl have a
(loose, not strict) utf8 representation, this function cannot fail.

=head3 decode_utf8

  $string = decode_utf8($octets [, CHECK]);

B<WARNING>: L<This function accepts invalid UTF-8!|/UTF-8 vs. utf8 vs. UTF8>
Do not use it for data exchange.
Unless you want Perl's older "lax" mode, prefer
C<$string = decode("UTF-8", $octets [, CHECK])>.

Equivalent to C<$string = decode("utf8", $octets [, CHECK])>.
The sequence of octets represented by $octets is decoded
from (loose, not strict) utf8 into a sequence of logical characters.
Because not all sequences of octets are valid utf8, even in this loose
(not strict) sense, it is quite possible for this function to fail.
For CHECK, see L</"Handling Malformed Data">.

B<CAVEAT>: the input I<$octets> might be modified in-place depending on
what is set in CHECK. See L</LEAVE_SRC> if you want your inputs to be
left unchanged.

=head2 Listing available encodings

  use Encode;
  @list = Encode->encodings();

Returns a list of canonical names of available encodings that have already
been loaded. To get a list of all available encodings including those that
have not yet been loaded, say:

  @all_encodings = Encode->encodings(":all");

Or you can give the name of a specific module:

  @with_jp = Encode->encodings("Encode::JP");

When "C<::>" is not in the name, "C<Encode::>" is assumed.

  @ebcdic = Encode->encodings("EBCDIC");

To find out in detail which encodings are supported by this package,
see L<Encode::Supported>.

=head2 Defining Aliases

To add a new alias to a given encoding, use:

  use Encode;
  use Encode::Alias;
  define_alias(NEWNAME => ENCODING);

After that, I<NEWNAME> can be used as an alias for I<ENCODING>.
I<ENCODING> may be either the name of an encoding or an
I<encoding object>.

Before you do that, first make sure the alias is nonexistent using
C<resolve_alias()>, which returns the canonical name thereof.
For example:

  Encode::resolve_alias("latin1") eq "iso-8859-1" # true
  Encode::resolve_alias("iso-8859-12") # false; nonexistent
  Encode::resolve_alias($name) eq $name # true if $name is canonical

C<resolve_alias()> does not need C<use Encode::Alias>; it can be
imported via C<use Encode qw(resolve_alias)>.

See L<Encode::Alias> for details.

=head2 Finding IANA Character Set Registry names

The canonical name of a given encoding does not necessarily agree with
IANA Character Set Registry, commonly seen as C<< Content-Type:
text/plain; charset=I<WHATEVER> >>. For most cases, the canonical name
works, but sometimes it does not, most notably with "utf-8-strict".

As of C<Encode> version 2.21, a new method C<mime_name()> is therefore added.

  use Encode;
  my $enc = find_encoding("UTF-8");
  warn $enc->name; # utf-8-strict
  warn $enc->mime_name; # UTF-8

See also: L<Encode::Encoding>

=head1 Encoding via PerlIO

If your perl supports C<PerlIO> (which is the default), you can use a
C<PerlIO> layer to decode and encode directly via a filehandle. The
following two examples are fully identical in functionality:

  ### Version 1 via PerlIO
  open(INPUT, "< :encoding(shiftjis)", $infile)
      || die "Can't open < $infile for reading: $!";
  open(OUTPUT, "> :encoding(euc-jp)", $outfile)
      || die "Can't open > $outfile for writing: $!";
  while (<INPUT>) { # auto decodes $_
      print OUTPUT; # auto encodes $_
  }
  close(INPUT) || die "can't close $infile: $!";
  close(OUTPUT) || die "can't close $outfile: $!";

  ### Version 2 via from_to()
  open(INPUT, "< :raw", $infile)
      || die "Can't open < $infile for reading: $!";
  open(OUTPUT, "> :raw", $outfile)
      || die "Can't open > $outfile for writing: $!";

  while (<INPUT>) {
      from_to($_, "shiftjis", "euc-jp", 1); # switch encoding
      print OUTPUT; # emit raw (but properly encoded) data
  }
  close(INPUT) || die "can't close $infile: $!";
  close(OUTPUT) || die "can't close $outfile: $!";

In the first version above, you let the appropriate encoding layer
handle the conversion. In the second, you explicitly translate
from one encoding to the other.

Unfortunately, it may be that encodings are not C<PerlIO>-savvy.
You can check
to see whether your encoding is supported by C<PerlIO> by invoking the
C<perlio_ok> method on it:

  Encode::perlio_ok("hz"); # false
  find_encoding("euc-cn")->perlio_ok; # true wherever PerlIO is available

  use Encode qw(perlio_ok); # imported upon request
  perlio_ok("euc-jp")

Fortunately, all encodings that come with C<Encode> core are C<PerlIO>-savvy
except for C<hz> and C<ISO-2022-kr>. For the gory details, see
L<Encode::Encoding> and L<Encode::PerlIO>.

=head1 Handling Malformed Data

The optional I<CHECK> argument tells C<Encode> what to do when
encountering malformed data. Without I<CHECK>, C<Encode::FB_DEFAULT>
(== 0) is assumed.

As of version 2.12, C<Encode> supports coderef values for C<CHECK>;
see below.

B<NOTE:> Not all encodings support this feature.
Some encodings ignore the I<CHECK> argument. For example,
L<Encode::Unicode> ignores I<CHECK> and it always croaks on error.

=head2 List of I<CHECK> values

=head3 FB_DEFAULT

  CHECK = Encode::FB_DEFAULT ( == 0)

If I<CHECK> is 0, encoding and decoding replace any malformed character
with a I<substitution character>. When you encode, I<SUBCHAR> is used.
When you decode, the Unicode REPLACEMENT CHARACTER, code point U+FFFD, is
used. If the data is supposed to be UTF-8, an optional lexical warning of
warning category C<"utf8"> is given.

=head3 FB_CROAK

  CHECK = Encode::FB_CROAK ( == 1)

If I<CHECK> is 1, methods immediately die with an error
message. Therefore, when I<CHECK> is 1, you should trap
exceptions with C<eval{}>, unless you really want to let it C<die>.

=head3 FB_QUIET

  CHECK = Encode::FB_QUIET

If I<CHECK> is set to C<Encode::FB_QUIET>, encoding and decoding immediately
return the portion of the data that has been processed so far when an
error occurs.
The data argument is overwritten with everything
after that point; that is, the unprocessed portion of the data. This is
handy when you have to call C<decode> repeatedly in the case where your
source data may contain partial multi-byte character sequences,
(that is, you are reading with a fixed-width buffer). Here's some sample
code to do exactly that:

  my($buffer, $string) = ("", "");
  while (read($fh, $buffer, 256, length($buffer))) {
      $string .= decode($encoding, $buffer, Encode::FB_QUIET);
      # $buffer now contains the unprocessed partial character
  }

=head3 FB_WARN

  CHECK = Encode::FB_WARN

This is the same as C<FB_QUIET> above, except that instead of being silent
on errors, it issues a warning. This is handy for when you are debugging.

B<CAVEAT>: All warnings from Encode module are reported, independently of
L<pragma warnings|warnings> settings. If you want to follow the settings of
lexical warnings configured by L<pragma warnings|warnings>, then also
append the check value C<Encode::ONLY_PRAGMA_WARNINGS>. This value is
available since Encode version 2.99.

=head3 FB_PERLQQ FB_HTMLCREF FB_XMLCREF

=over 2

=item perlqq mode (I<CHECK> = Encode::FB_PERLQQ)

=item HTML charref mode (I<CHECK> = Encode::FB_HTMLCREF)

=item XML charref mode (I<CHECK> = Encode::FB_XMLCREF)

=back

For encodings that are implemented by the C<Encode::XS> module, C<CHECK> C<==>
C<Encode::FB_PERLQQ> puts C<encode> and C<decode> into C<perlqq> fallback mode.

When you decode, C<\xI<HH>> is inserted for a malformed character, where
I<HH> is the hex representation of the octet that could not be decoded to
utf8. When you encode, C<\x{I<HHHH>}> will be inserted, where I<HHHH> is
the Unicode code point (in any number of hex digits) of the character that
cannot be found in the character repertoire of the encoding.

The HTML/XML character reference modes are about the same. In place of
C<\x{I<HHHH>}>, HTML uses C<&#I<NNN>;> where I<NNN> is a decimal number, and
XML uses C<&#xI<HHHH>;> where I<HHHH> is the hexadecimal number.

In C<Encode> 2.10 or later, C<LEAVE_SRC> is also implied.

=head3 The bitmask

These modes are all actually set via a bitmask. Here is how the C<FB_I<XXX>>
constants are laid out. You can import the C<FB_I<XXX>> constants via
C<use Encode qw(:fallbacks)>, and you can import the generic bitmask
constants via C<use Encode qw(:fallback_all)>.

                     FB_DEFAULT FB_CROAK FB_QUIET FB_WARN  FB_PERLQQ
 DIE_ON_ERR    0x0001             X
 WARN_ON_ERR   0x0002                               X
 RETURN_ON_ERR 0x0004                      X        X
 LEAVE_SRC     0x0008                                        X
 PERLQQ        0x0100                                        X
 HTMLCREF      0x0200
 XMLCREF       0x0400

=head3 LEAVE_SRC

  Encode::LEAVE_SRC

If the C<Encode::LEAVE_SRC> bit is I<not> set but I<CHECK> is set, then the
source string to encode() or decode() will be overwritten in place.
If you're not interested in this, then bitwise-OR it with the bitmask.

=head2 coderef for CHECK

As of C<Encode> 2.12, C<CHECK> can also be a code reference which takes the
ordinal value of the unmapped character as an argument and returns
octets that represent the fallback character. For instance:

  $ascii = encode("ascii", $utf8, sub{ sprintf "<U+%04X>", shift });

Acts like C<FB_PERLQQ> but U+I<XXXX> is used instead of C<\x{I<XXXX>}>.

Fallback for C<decode> must return decoded string (sequence of characters)
and takes a list of ordinal values as its arguments.
So for
example if you wish to decode octets as UTF-8, and use ISO-8859-15 as
a fallback for bytes that are not valid UTF-8, you could write

  $str = decode 'UTF-8', $octets, sub {
      my $tmp = join '', map chr, @_;
      return decode 'ISO-8859-15', $tmp;
  };

=head1 Defining Encodings

To define a new encoding, use:

  use Encode qw(define_encoding);
  define_encoding($object, CANONICAL_NAME [, alias...]);

I<CANONICAL_NAME> will be associated with I<$object>. The object
should provide the interface described in L<Encode::Encoding>.
If more than two arguments are provided, additional
arguments are considered aliases for I<$object>.

See L<Encode::Encoding> for details.

=head1 The UTF8 flag

Before the introduction of Unicode support in Perl, the C<eq> operator
just compared the strings represented by two scalars. Beginning with
Perl 5.8, C<eq> compares two strings with simultaneous consideration of
I<the UTF8 flag>. To explain why we made it so, I quote from page 402 of
I<Programming Perl, 3rd ed.>

=over 2

=item Goal #1:

Old byte-oriented programs should not spontaneously break on the old
byte-oriented data they used to work on.

=item Goal #2:

Old byte-oriented programs should magically start working on the new
character-oriented data when appropriate.

=item Goal #3:

Programs should run just as fast in the new character-oriented mode
as in the old byte-oriented mode.

=item Goal #4:

Perl should remain one language, rather than forking into a
byte-oriented Perl and a character-oriented Perl.

=back

When I<Programming Perl, 3rd ed.> was written, not even Perl 5.6.0 had been
born yet; many features documented in the book remained unimplemented for a
long time. Perl 5.8 corrected much of this, and the introduction of the
UTF8 flag is one of them.
You can think of there being two fundamentally
different kinds of strings and string-operations in Perl: one a
byte-oriented mode for when the internal UTF8 flag is off, and the other a
character-oriented mode for when the internal UTF8 flag is on.

This UTF8 flag is not visible in Perl scripts, exactly for the same reason
you cannot (or rather, you I<don't have to>) see whether a scalar contains
a string, an integer, or a floating-point number. But you can still peek
and poke these if you will. See the next section.

=head2 Messing with Perl's Internals

The following API uses parts of Perl's internals in the current
implementation. As such, they are efficient but may change in a future
release.

=head3 is_utf8

  is_utf8(STRING [, CHECK])

[INTERNAL] Tests whether the UTF8 flag is turned on in the I<STRING>.
If I<CHECK> is true, also checks whether I<STRING> contains well-formed
UTF-8. Returns true if successful, false otherwise.

Typically only necessary for debugging and testing. Don't use this flag as
a marker to distinguish character and binary data; that should be decided
for each variable when you write your code.

B<CAVEAT>: If I<STRING> has UTF8 flag set, it does B<NOT> mean that
I<STRING> is UTF-8 encoded and vice-versa.

As of Perl 5.8.1, L<utf8> also has the C<utf8::is_utf8> function.

=head3 _utf8_on

  _utf8_on(STRING)

[INTERNAL] Turns the I<STRING>'s internal UTF8 flag B<on>. The I<STRING>
is I<not> checked for containing only well-formed UTF-8. Do not use this
unless you I<know with absolute certainty> that the STRING holds only
well-formed UTF-8. Returns the previous state of the UTF8 flag (so please
don't treat the return value as indicating success or failure), or C<undef>
if I<STRING> is not a string.

B<NOTE>: For security reasons, this function does not work on tainted values.

=head3 _utf8_off

  _utf8_off(STRING)

[INTERNAL] Turns the I<STRING>'s internal UTF8 flag B<off>. Do not use
frivolously. Returns the previous state of the UTF8 flag, or C<undef> if
I<STRING> is not a string. Do not treat the return value as indicative of
success or failure, because that isn't what it means: it is only the
previous setting.

B<NOTE>: For security reasons, this function does not work on tainted values.

=head1 UTF-8 vs. utf8 vs. UTF8

  ....We now view strings not as sequences of bytes, but as sequences
  of numbers in the range 0 .. 2**32-1 (or in the case of 64-bit
  computers, 0 .. 2**64-1) -- Programming Perl, 3rd ed.

That has historically been Perl's notion of UTF-8, as that is how UTF-8 was
first conceived by Ken Thompson when he invented it. However, thanks to
later revisions to the applicable standards, official UTF-8 is now rather
stricter than that. For example, its range is much narrower (0 .. 0x10_FFFF
to cover only 21 bits instead of 32 or 64 bits) and some sequences
are not allowed, like those used in surrogate pairs, the 32 non-character
code points 0xFDD0 .. 0xFDEF, the last two code points in I<any> plane
(0xI<XX>_FFFE and 0xI<XX>_FFFF), all non-shortest encodings, etc.

The former default in which Perl would always use a loose interpretation of
UTF-8 has now been overruled:

  From: Larry Wall <larry@wall.org>
  Date: December 04, 2004 11:51:58 JST
  To: perl-unicode@perl.org
  Subject: Re: Make Encode.pm support the real UTF-8
  Message-Id: <20041204025158.GA28754@wall.org>

  On Fri, Dec 03, 2004 at 10:12:12PM +0000, Tim Bunce wrote:
  : I've no problem with 'utf8' being perl's unrestricted uft8 encoding,
  : but "UTF-8" is the name of the standard and should give the
  : corresponding behaviour.

  For what it's worth, that's how I've always kept them straight in my
  head.

  Also for what it's worth, Perl 6 will mostly default to strict but
  make it easy to switch back to lax.

  Larry

Got that? As of Perl 5.8.7, B<"UTF-8"> means UTF-8 in its current
sense, which is conservative and strict and security-conscious, whereas
B<"utf8"> means UTF-8 in its former sense, which was liberal and loose and
lax. C<Encode> version 2.10 or later thus groks this subtle but critically
important distinction between C<"UTF-8"> and C<"utf8">.

  encode("utf8", "\x{FFFF_FFFF}", 1); # okay
  encode("UTF-8", "\x{FFFF_FFFF}", 1); # croaks

This distinction is also important for decoding. In the following,
C<$s> stores character U+200000, which exceeds UTF-8's allowed range.
C<$s> thus stores an invalid Unicode code point:

  $s = decode("utf8", "\xf8\x88\x80\x80\x80");

C<"UTF-8">, by contrast, will either coerce the input to something valid:

  $s = decode("UTF-8", "\xf8\x88\x80\x80\x80"); # U+FFFD

.. or croak:

  decode("UTF-8", "\xf8\x88\x80\x80\x80", FB_CROAK|LEAVE_SRC);

In the C<Encode> module, C<"UTF-8"> is actually an alias for the canonical
name C<"utf-8-strict">. That hyphen between the C<"UTF"> and the C<"8"> is
critical; without it, C<Encode> goes "liberal" and (perhaps overly-)permissive:

  find_encoding("UTF-8")->name # is 'utf-8-strict'
  find_encoding("utf-8")->name # ditto. names are case insensitive
  find_encoding("utf_8")->name # ditto. "_" are treated as "-"
  find_encoding("UTF8")->name # is 'utf8'.

Perl's internal UTF8 flag is called "UTF8", without a hyphen. It indicates
whether a string is internally encoded as "utf8", also without a hyphen.

=head1 SEE ALSO

L<Encode::Encoding>,
L<Encode::Supported>,
L<Encode::PerlIO>,
L<encoding>,
L<perlebcdic>,
L<perlfunc/open>,
L<perlunicode>, L<perluniintro>, L<perlunifaq>, L<perlunitut>,
L<utf8>,
the Perl Unicode Mailing List L<http://lists.perl.org/list/perl-unicode.html>

=head1 MAINTAINER

This project was originated by the late Nick Ing-Simmons and later
maintained by Dan Kogai I<< <dankogai@cpan.org> >>. See AUTHORS
for a full list of people involved. For any questions, send mail to
I<< <perl-unicode@perl.org> >> so that we can all share.

While Dan Kogai retains the copyright as a maintainer, credit
should go to all those involved. See AUTHORS for a list of those
who submitted code to the project.

=head1 COPYRIGHT

Copyright 2002-2014 Dan Kogai I<< <dankogai@cpan.org> >>.

This library is free software; you can redistribute it and/or modify
it under the same terms as Perl itself.

=cut