1package Encode::Alias; 2use strict; 3use warnings; 4our $VERSION = do { my @r = ( q$Revision: 2.25 $ =~ /\d+/g ); sprintf "%d." . "%02d" x $#r, @r }; 5use constant DEBUG => !!$ENV{PERL_ENCODE_DEBUG}; 6 7use Exporter 'import'; 8 9# Public, encouraged API is exported by default 10 11our @EXPORT = 12 qw ( 13 define_alias 14 find_alias 15); 16 17our @Alias; # ordered matching list 18our %Alias; # cached known aliases 19 20sub find_alias { 21 my $class = shift; 22 my $find = shift; 23 unless ( exists $Alias{$find} ) { 24 $Alias{$find} = undef; # Recursion guard 25 for ( my $i = 0 ; $i < @Alias ; $i += 2 ) { 26 my $alias = $Alias[$i]; 27 my $val = $Alias[ $i + 1 ]; 28 my $new; 29 if ( ref($alias) eq 'Regexp' && $find =~ $alias ) { 30 DEBUG and warn "eval $val"; 31 $new = eval $val; 32 DEBUG and $@ and warn "$val, $@"; 33 } 34 elsif ( ref($alias) eq 'CODE' ) { 35 DEBUG and warn "$alias", "->", "($find)"; 36 $new = $alias->($find); 37 } 38 elsif ( lc($find) eq lc($alias) ) { 39 $new = $val; 40 } 41 if ( defined($new) ) { 42 next if $new eq $find; # avoid (direct) recursion on bugs 43 DEBUG and warn "$alias, $new"; 44 my $enc = 45 ( ref($new) ) ? $new : Encode::find_encoding($new); 46 if ($enc) { 47 $Alias{$find} = $enc; 48 last; 49 } 50 } 51 } 52 53 # case insensitive search when canonical is not in all lowercase 54 # RT ticket #7835 55 unless ( $Alias{$find} ) { 56 my $lcfind = lc($find); 57 for my $name ( keys %Encode::Encoding, keys %Encode::ExtModule ) 58 { 59 $lcfind eq lc($name) or next; 60 $Alias{$find} = Encode::find_encoding($name); 61 DEBUG and warn "$find => $name"; 62 } 63 } 64 } 65 if (DEBUG) { 66 my $name; 67 if ( my $e = $Alias{$find} ) { 68 $name = $e->name; 69 } 70 else { 71 $name = ""; 72 } 73 warn "find_alias($class, $find)->name = $name"; 74 } 75 return $Alias{$find}; 76} 77 78sub define_alias { 79 while (@_) { 80 my $alias = shift; 81 my $name = shift; 82 unshift( @Alias, $alias => $name ) # newer one has precedence 83 if defined $alias; 84 if ( ref($alias) ) { 85 86 # clear %Alias cache to allow overrides 87 my @a = keys %Alias; 88 for my $k (@a) { 89 if ( ref($alias) eq 'Regexp' && $k =~ $alias ) { 90 DEBUG and warn "delete \$Alias\{$k\}"; 91 delete $Alias{$k}; 92 } 93 elsif ( ref($alias) eq 'CODE' && $alias->($k) ) { 94 DEBUG and warn "delete \$Alias\{$k\}"; 95 delete $Alias{$k}; 96 } 97 } 98 } 99 elsif (defined $alias) { 100 DEBUG and warn "delete \$Alias\{$alias\}"; 101 delete $Alias{$alias}; 102 } 103 elsif (DEBUG) { 104 require Carp; 105 Carp::croak("undef \$alias"); 106 } 107 } 108} 109 110# HACK: Encode must be used after define_alias is declarated as Encode calls define_alias 111use Encode (); 112 113# Allow latin-1 style names as well 114# 0 1 2 3 4 5 6 7 8 9 10 115our @Latin2iso = ( 0, 1, 2, 3, 4, 9, 10, 13, 14, 15, 16 ); 116 117# Allow winlatin1 style names as well 118our %Winlatin2cp = ( 119 'latin1' => 1252, 120 'latin2' => 1250, 121 'cyrillic' => 1251, 122 'greek' => 1253, 123 'turkish' => 1254, 124 'hebrew' => 1255, 125 'arabic' => 1256, 126 'baltic' => 1257, 127 'vietnamese' => 1258, 128); 129 130init_aliases(); 131 132sub undef_aliases { 133 @Alias = (); 134 %Alias = (); 135} 136 137sub init_aliases { 138 undef_aliases(); 139 140 # Try all-lower-case version should all else fails 141 define_alias( qr/^(.*)$/ => '"\L$1"' ); 142 143 # UTF/UCS stuff 144 define_alias( qr/^(unicode-1-1-)?UTF-?7$/i => '"UTF-7"' ); 145 define_alias( qr/^UCS-?2-?LE$/i => '"UCS-2LE"' ); 146 define_alias( 147 qr/^UCS-?2-?(BE)?$/i => '"UCS-2BE"', 148 qr/^UCS-?4-?(BE|LE|)?$/i => 'uc("UTF-32$1")', 149 qr/^iso-10646-1$/i => '"UCS-2BE"' 150 ); 151 define_alias( 152 qr/^UTF-?(16|32)-?BE$/i => '"UTF-$1BE"', 153 qr/^UTF-?(16|32)-?LE$/i => '"UTF-$1LE"', 154 qr/^UTF-?(16|32)$/i => '"UTF-$1"', 155 ); 156 157 # ASCII 158 define_alias( qr/^(?:US-?)ascii$/i => '"ascii"' ); 159 define_alias( 'C' => 'ascii' ); 160 define_alias( qr/\b(?:ISO[-_]?)?646(?:[-_]?US)?$/i => '"ascii"' ); 161 162 # Allow variants of iso-8859-1 etc. 163 define_alias( qr/\biso[-_]?(\d+)[-_](\d+)$/i => '"iso-$1-$2"' ); 164 165 # ISO-8859-8-I => ISO-8859-8 166 # https://en.wikipedia.org/wiki/ISO-8859-8-I 167 define_alias( qr/\biso[-_]8859[-_]8[-_]I$/i => '"iso-8859-8"' ); 168 169 # At least HP-UX has these. 170 define_alias( qr/\biso8859(\d+)$/i => '"iso-8859-$1"' ); 171 172 # More HP stuff. 173 define_alias( 174 qr/\b(?:hp-)?(arabic|greek|hebrew|kana|roman|thai|turkish)8$/i => 175 '"${1}8"' ); 176 177 # The Official name of ASCII. 178 define_alias( qr/\bANSI[-_]?X3\.4[-_]?1968$/i => '"ascii"' ); 179 180 # This is a font issue, not an encoding issue. 181 # (The currency symbol of the Latin 1 upper half 182 # has been redefined as the euro symbol.) 183 define_alias( qr/^(.+)\@euro$/i => '"$1"' ); 184 185 define_alias( qr/\b(?:iso[-_]?)?latin[-_]?(\d+)$/i => 186'defined $Encode::Alias::Latin2iso[$1] ? "iso-8859-$Encode::Alias::Latin2iso[$1]" : undef' 187 ); 188 189 define_alias( 190 qr/\bwin(latin[12]|cyrillic|baltic|greek|turkish| 191 hebrew|arabic|baltic|vietnamese)$/ix => 192 '"cp" . $Encode::Alias::Winlatin2cp{lc($1)}' 193 ); 194 195 # Common names for non-latin preferred MIME names 196 define_alias( 197 'ascii' => 'US-ascii', 198 'cyrillic' => 'iso-8859-5', 199 'arabic' => 'iso-8859-6', 200 'greek' => 'iso-8859-7', 201 'hebrew' => 'iso-8859-8', 202 'thai' => 'iso-8859-11', 203 ); 204 # RT #20781 205 define_alias(qr/\btis-?620\b/i => '"iso-8859-11"'); 206 207 # At least AIX has IBM-NNN (surprisingly...) instead of cpNNN. 208 # And Microsoft has their own naming (again, surprisingly). 209 # And windows-* is registered in IANA! 210 define_alias( 211 qr/\b(?:cp|ibm|ms|windows)[-_ ]?(\d{2,4})$/i => '"cp$1"' ); 212 213 # Sometimes seen with a leading zero. 214 # define_alias( qr/\bcp037\b/i => '"cp37"'); 215 216 # Mac Mappings 217 # predefined in *.ucm; unneeded 218 # define_alias( qr/\bmacIcelandic$/i => '"macIceland"'); 219 define_alias( qr/^(?:x[_-])?mac[_-](.*)$/i => '"mac$1"' ); 220 # http://rt.cpan.org/Ticket/Display.html?id=36326 221 define_alias( qr/^macintosh$/i => '"MacRoman"' ); 222 # https://rt.cpan.org/Ticket/Display.html?id=78125 223 define_alias( qr/^macce$/i => '"MacCentralEurRoman"' ); 224 # Ououououou. gone. They are different! 225 # define_alias( qr/\bmacRomanian$/i => '"macRumanian"'); 226 227 # Standardize on the dashed versions. 228 define_alias( qr/\bkoi8[\s\-_]*([ru])$/i => '"koi8-$1"' ); 229 230 unless ($Encode::ON_EBCDIC) { 231 232 # for Encode::CN 233 define_alias( qr/\beuc.*cn$/i => '"euc-cn"' ); 234 define_alias( qr/\bcn.*euc$/i => '"euc-cn"' ); 235 236 # define_alias( qr/\bGB[- ]?(\d+)$/i => '"euc-cn"' ) 237 # CP936 doesn't have vendor-addon for GBK, so they're identical. 238 define_alias( qr/^gbk$/i => '"cp936"' ); 239 240 # This fixes gb2312 vs. euc-cn confusion, practically 241 define_alias( qr/\bGB[-_ ]?2312(?!-?raw)/i => '"euc-cn"' ); 242 243 # for Encode::JP 244 define_alias( qr/\bjis$/i => '"7bit-jis"' ); 245 define_alias( qr/\beuc.*jp$/i => '"euc-jp"' ); 246 define_alias( qr/\bjp.*euc$/i => '"euc-jp"' ); 247 define_alias( qr/\bujis$/i => '"euc-jp"' ); 248 define_alias( qr/\bshift.*jis$/i => '"shiftjis"' ); 249 define_alias( qr/\bsjis$/i => '"shiftjis"' ); 250 define_alias( qr/\bwindows-31j$/i => '"cp932"' ); 251 252 # for Encode::KR 253 define_alias( qr/\beuc.*kr$/i => '"euc-kr"' ); 254 define_alias( qr/\bkr.*euc$/i => '"euc-kr"' ); 255 256 # This fixes ksc5601 vs. euc-kr confusion, practically 257 define_alias( qr/(?:x-)?uhc$/i => '"cp949"' ); 258 define_alias( qr/(?:x-)?windows-949$/i => '"cp949"' ); 259 define_alias( qr/\bks_c_5601-1987$/i => '"cp949"' ); 260 261 # for Encode::TW 262 define_alias( qr/\bbig-?5$/i => '"big5-eten"' ); 263 define_alias( qr/\bbig5-?et(?:en)?$/i => '"big5-eten"' ); 264 define_alias( qr/\btca[-_]?big5$/i => '"big5-eten"' ); 265 define_alias( qr/\bbig5-?hk(?:scs)?$/i => '"big5-hkscs"' ); 266 define_alias( qr/\bhk(?:scs)?[-_]?big5$/i => '"big5-hkscs"' ); 267 } 268 269 # https://github.com/dankogai/p5-encode/issues/37 270 define_alias(qr/cp65000/i => '"UTF-7"'); 271 define_alias(qr/cp65001/i => '"utf-8-strict"'); 272 273 # utf8 is blessed :) 274 define_alias( qr/\bUTF-8$/i => '"utf-8-strict"' ); 275 276 # At last, Map white space and _ to '-' 277 define_alias( qr/^([^\s_]+)[\s_]+([^\s_]*)$/i => '"$1-$2"' ); 278} 279 2801; 281__END__ 282 283# TODO: HP-UX '8' encodings arabic8 greek8 hebrew8 kana8 thai8 turkish8 284# TODO: HP-UX '15' encodings japanese15 korean15 roi15 285# TODO: Cyrillic encoding ISO-IR-111 (useful?) 286# TODO: Armenian encoding ARMSCII-8 287# TODO: Hebrew encoding ISO-8859-8-1 288# TODO: Thai encoding TCVN 289# TODO: Vietnamese encodings VPS 290# TODO: Mac Asian+African encodings: Arabic Armenian Bengali Burmese 291# ChineseSimp ChineseTrad Devanagari Ethiopic ExtArabic 292# Farsi Georgian Gujarati Gurmukhi Hebrew Japanese 293# Kannada Khmer Korean Laotian Malayalam Mongolian 294# Oriya Sinhalese Symbol Tamil Telugu Tibetan Vietnamese 295 296=head1 NAME 297 298Encode::Alias - alias definitions to encodings 299 300=head1 SYNOPSIS 301 302 use Encode; 303 use Encode::Alias; 304 define_alias( "newName" => ENCODING); 305 define_alias( qr/.../ => ENCODING); 306 define_alias( sub { return ENCODING if ...; } ); 307 308=head1 DESCRIPTION 309 310Allows newName to be used as an alias for ENCODING. ENCODING may be 311either the name of an encoding or an encoding object (as described 312in L<Encode>). 313 314Currently the first argument to define_alias() can be specified in the 315following ways: 316 317=over 4 318 319=item As a simple string. 320 321=item As a qr// compiled regular expression, e.g.: 322 323 define_alias( qr/^iso8859-(\d+)$/i => '"iso-8859-$1"' ); 324 325In this case, if I<ENCODING> is not a reference, it is C<eval>-ed 326in order to allow C<$1> etc. to be substituted. The example is one 327way to alias names as used in X11 fonts to the MIME names for the 328iso-8859-* family. Note the double quotes inside the single quotes. 329 330(or, you don't have to do this yourself because this example is predefined) 331 332If you are using a regex here, you have to use the quotes as shown or 333it won't work. Also note that regex handling is tricky even for the 334experienced. Use this feature with caution. 335 336=item As a code reference, e.g.: 337 338 define_alias( sub {shift =~ /^iso8859-(\d+)$/i ? "iso-8859-$1" : undef } ); 339 340The same effect as the example above in a different way. The coderef 341takes the alias name as an argument and returns a canonical name on 342success or undef if not. Note the second argument is ignored if provided. 343Use this with even more caution than the regex version. 344 345=back 346 347=head3 Changes in code reference aliasing 348 349As of Encode 1.87, the older form 350 351 define_alias( sub { return /^iso8859-(\d+)$/i ? "iso-8859-$1" : undef } ); 352 353no longer works. 354 355Encode up to 1.86 internally used "local $_" to implement this older 356form. But consider the code below; 357 358 use Encode; 359 $_ = "eeeee" ; 360 while (/(e)/g) { 361 my $utf = decode('aliased-encoding-name', $1); 362 print "position:",pos,"\n"; 363 } 364 365Prior to Encode 1.86 this fails because of "local $_". 366 367=head2 Alias overloading 368 369You can override predefined aliases by simply applying define_alias(). 370The new alias is always evaluated first, and when necessary, 371define_alias() flushes the internal cache to make the new definition 372available. 373 374 # redirect SHIFT_JIS to MS/IBM Code Page 932, which is a 375 # superset of SHIFT_JIS 376 377 define_alias( qr/shift.*jis$/i => '"cp932"' ); 378 define_alias( qr/sjis$/i => '"cp932"' ); 379 380If you want to zap all predefined aliases, you can use 381 382 Encode::Alias->undef_aliases; 383 384to do so. And 385 386 Encode::Alias->init_aliases; 387 388gets the factory settings back. 389 390Note that define_alias() will not be able to override the canonical name 391of encodings. Encodings are first looked up by canonical name before 392potential aliases are tried. 393 394=head1 SEE ALSO 395 396L<Encode>, L<Encode::Supported> 397 398=cut 399 400