1package Encode::Alias; 2use strict; 3use warnings; 4our $VERSION = do { my @r = ( q$Revision: 2.24 $ =~ /\d+/g ); sprintf "%d." . "%02d" x $#r, @r }; 5use constant DEBUG => !!$ENV{PERL_ENCODE_DEBUG}; 6 7use Exporter 'import'; 8 9# Public, encouraged API is exported by default 10 11our @EXPORT = 12 qw ( 13 define_alias 14 find_alias 15); 16 17our @Alias; # ordered matching list 18our %Alias; # cached known aliases 19 20sub find_alias { 21 my $class = shift; 22 my $find = shift; 23 unless ( exists $Alias{$find} ) { 24 $Alias{$find} = undef; # Recursion guard 25 for ( my $i = 0 ; $i < @Alias ; $i += 2 ) { 26 my $alias = $Alias[$i]; 27 my $val = $Alias[ $i + 1 ]; 28 my $new; 29 if ( ref($alias) eq 'Regexp' && $find =~ $alias ) { 30 DEBUG and warn "eval $val"; 31 $new = eval $val; 32 DEBUG and $@ and warn "$val, $@"; 33 } 34 elsif ( ref($alias) eq 'CODE' ) { 35 DEBUG and warn "$alias", "->", "($find)"; 36 $new = $alias->($find); 37 } 38 elsif ( lc($find) eq lc($alias) ) { 39 $new = $val; 40 } 41 if ( defined($new) ) { 42 next if $new eq $find; # avoid (direct) recursion on bugs 43 DEBUG and warn "$alias, $new"; 44 my $enc = 45 ( ref($new) ) ? $new : Encode::find_encoding($new); 46 if ($enc) { 47 $Alias{$find} = $enc; 48 last; 49 } 50 } 51 } 52 53 # case insensitive search when canonical is not in all lowercase 54 # RT ticket #7835 55 unless ( $Alias{$find} ) { 56 my $lcfind = lc($find); 57 for my $name ( keys %Encode::Encoding, keys %Encode::ExtModule ) 58 { 59 $lcfind eq lc($name) or next; 60 $Alias{$find} = Encode::find_encoding($name); 61 DEBUG and warn "$find => $name"; 62 } 63 } 64 } 65 if (DEBUG) { 66 my $name; 67 if ( my $e = $Alias{$find} ) { 68 $name = $e->name; 69 } 70 else { 71 $name = ""; 72 } 73 warn "find_alias($class, $find)->name = $name"; 74 } 75 return $Alias{$find}; 76} 77 78sub define_alias { 79 while (@_) { 80 my $alias = shift; 81 my $name = shift; 82 unshift( @Alias, $alias => $name ) # newer one has precedence 83 if defined $alias; 84 if ( ref($alias) ) { 85 86 # clear %Alias cache to allow overrides 87 my @a = keys %Alias; 88 for my $k (@a) { 89 if ( ref($alias) eq 'Regexp' && $k =~ $alias ) { 90 DEBUG and warn "delete \$Alias\{$k\}"; 91 delete $Alias{$k}; 92 } 93 elsif ( ref($alias) eq 'CODE' && $alias->($k) ) { 94 DEBUG and warn "delete \$Alias\{$k\}"; 95 delete $Alias{$k}; 96 } 97 } 98 } 99 elsif (defined $alias) { 100 DEBUG and warn "delete \$Alias\{$alias\}"; 101 delete $Alias{$alias}; 102 } 103 elsif (DEBUG) { 104 require Carp; 105 Carp::croak("undef \$alias"); 106 } 107 } 108} 109 110# HACK: Encode must be used after define_alias is declarated as Encode calls define_alias 111use Encode (); 112 113# Allow latin-1 style names as well 114# 0 1 2 3 4 5 6 7 8 9 10 115our @Latin2iso = ( 0, 1, 2, 3, 4, 9, 10, 13, 14, 15, 16 ); 116 117# Allow winlatin1 style names as well 118our %Winlatin2cp = ( 119 'latin1' => 1252, 120 'latin2' => 1250, 121 'cyrillic' => 1251, 122 'greek' => 1253, 123 'turkish' => 1254, 124 'hebrew' => 1255, 125 'arabic' => 1256, 126 'baltic' => 1257, 127 'vietnamese' => 1258, 128); 129 130init_aliases(); 131 132sub undef_aliases { 133 @Alias = (); 134 %Alias = (); 135} 136 137sub init_aliases { 138 undef_aliases(); 139 140 # Try all-lower-case version should all else fails 141 define_alias( qr/^(.*)$/ => '"\L$1"' ); 142 143 # UTF/UCS stuff 144 define_alias( qr/^(unicode-1-1-)?UTF-?7$/i => '"UTF-7"' ); 145 define_alias( qr/^UCS-?2-?LE$/i => '"UCS-2LE"' ); 146 define_alias( 147 qr/^UCS-?2-?(BE)?$/i => '"UCS-2BE"', 148 qr/^UCS-?4-?(BE|LE|)?$/i => 'uc("UTF-32$1")', 149 qr/^iso-10646-1$/i => '"UCS-2BE"' 150 ); 151 define_alias( 152 qr/^UTF-?(16|32)-?BE$/i => '"UTF-$1BE"', 153 qr/^UTF-?(16|32)-?LE$/i => '"UTF-$1LE"', 154 qr/^UTF-?(16|32)$/i => '"UTF-$1"', 155 ); 156 157 # ASCII 158 define_alias( qr/^(?:US-?)ascii$/i => '"ascii"' ); 159 define_alias( 'C' => 'ascii' ); 160 define_alias( qr/\b(?:ISO[-_]?)?646(?:[-_]?US)?$/i => '"ascii"' ); 161 162 # Allow variants of iso-8859-1 etc. 163 define_alias( qr/\biso[-_]?(\d+)[-_](\d+)$/i => '"iso-$1-$2"' ); 164 165 # At least HP-UX has these. 166 define_alias( qr/\biso8859(\d+)$/i => '"iso-8859-$1"' ); 167 168 # More HP stuff. 169 define_alias( 170 qr/\b(?:hp-)?(arabic|greek|hebrew|kana|roman|thai|turkish)8$/i => 171 '"${1}8"' ); 172 173 # The Official name of ASCII. 174 define_alias( qr/\bANSI[-_]?X3\.4[-_]?1968$/i => '"ascii"' ); 175 176 # This is a font issue, not an encoding issue. 177 # (The currency symbol of the Latin 1 upper half 178 # has been redefined as the euro symbol.) 179 define_alias( qr/^(.+)\@euro$/i => '"$1"' ); 180 181 define_alias( qr/\b(?:iso[-_]?)?latin[-_]?(\d+)$/i => 182'defined $Encode::Alias::Latin2iso[$1] ? "iso-8859-$Encode::Alias::Latin2iso[$1]" : undef' 183 ); 184 185 define_alias( 186 qr/\bwin(latin[12]|cyrillic|baltic|greek|turkish| 187 hebrew|arabic|baltic|vietnamese)$/ix => 188 '"cp" . $Encode::Alias::Winlatin2cp{lc($1)}' 189 ); 190 191 # Common names for non-latin preferred MIME names 192 define_alias( 193 'ascii' => 'US-ascii', 194 'cyrillic' => 'iso-8859-5', 195 'arabic' => 'iso-8859-6', 196 'greek' => 'iso-8859-7', 197 'hebrew' => 'iso-8859-8', 198 'thai' => 'iso-8859-11', 199 ); 200 # RT #20781 201 define_alias(qr/\btis-?620\b/i => '"iso-8859-11"'); 202 203 # At least AIX has IBM-NNN (surprisingly...) instead of cpNNN. 204 # And Microsoft has their own naming (again, surprisingly). 205 # And windows-* is registered in IANA! 206 define_alias( 207 qr/\b(?:cp|ibm|ms|windows)[-_ ]?(\d{2,4})$/i => '"cp$1"' ); 208 209 # Sometimes seen with a leading zero. 210 # define_alias( qr/\bcp037\b/i => '"cp37"'); 211 212 # Mac Mappings 213 # predefined in *.ucm; unneeded 214 # define_alias( qr/\bmacIcelandic$/i => '"macIceland"'); 215 define_alias( qr/^(?:x[_-])?mac[_-](.*)$/i => '"mac$1"' ); 216 # http://rt.cpan.org/Ticket/Display.html?id=36326 217 define_alias( qr/^macintosh$/i => '"MacRoman"' ); 218 # https://rt.cpan.org/Ticket/Display.html?id=78125 219 define_alias( qr/^macce$/i => '"MacCentralEurRoman"' ); 220 # Ououououou. gone. They are different! 221 # define_alias( qr/\bmacRomanian$/i => '"macRumanian"'); 222 223 # Standardize on the dashed versions. 224 define_alias( qr/\bkoi8[\s\-_]*([ru])$/i => '"koi8-$1"' ); 225 226 unless ($Encode::ON_EBCDIC) { 227 228 # for Encode::CN 229 define_alias( qr/\beuc.*cn$/i => '"euc-cn"' ); 230 define_alias( qr/\bcn.*euc$/i => '"euc-cn"' ); 231 232 # define_alias( qr/\bGB[- ]?(\d+)$/i => '"euc-cn"' ) 233 # CP936 doesn't have vendor-addon for GBK, so they're identical. 234 define_alias( qr/^gbk$/i => '"cp936"' ); 235 236 # This fixes gb2312 vs. euc-cn confusion, practically 237 define_alias( qr/\bGB[-_ ]?2312(?!-?raw)/i => '"euc-cn"' ); 238 239 # for Encode::JP 240 define_alias( qr/\bjis$/i => '"7bit-jis"' ); 241 define_alias( qr/\beuc.*jp$/i => '"euc-jp"' ); 242 define_alias( qr/\bjp.*euc$/i => '"euc-jp"' ); 243 define_alias( qr/\bujis$/i => '"euc-jp"' ); 244 define_alias( qr/\bshift.*jis$/i => '"shiftjis"' ); 245 define_alias( qr/\bsjis$/i => '"shiftjis"' ); 246 define_alias( qr/\bwindows-31j$/i => '"cp932"' ); 247 248 # for Encode::KR 249 define_alias( qr/\beuc.*kr$/i => '"euc-kr"' ); 250 define_alias( qr/\bkr.*euc$/i => '"euc-kr"' ); 251 252 # This fixes ksc5601 vs. euc-kr confusion, practically 253 define_alias( qr/(?:x-)?uhc$/i => '"cp949"' ); 254 define_alias( qr/(?:x-)?windows-949$/i => '"cp949"' ); 255 define_alias( qr/\bks_c_5601-1987$/i => '"cp949"' ); 256 257 # for Encode::TW 258 define_alias( qr/\bbig-?5$/i => '"big5-eten"' ); 259 define_alias( qr/\bbig5-?et(?:en)?$/i => '"big5-eten"' ); 260 define_alias( qr/\btca[-_]?big5$/i => '"big5-eten"' ); 261 define_alias( qr/\bbig5-?hk(?:scs)?$/i => '"big5-hkscs"' ); 262 define_alias( qr/\bhk(?:scs)?[-_]?big5$/i => '"big5-hkscs"' ); 263 } 264 265 # https://github.com/dankogai/p5-encode/issues/37 266 define_alias(qr/cp65000/i => '"UTF-7"'); 267 define_alias(qr/cp65001/i => '"utf-8-strict"'); 268 269 # utf8 is blessed :) 270 define_alias( qr/\bUTF-8$/i => '"utf-8-strict"' ); 271 272 # At last, Map white space and _ to '-' 273 define_alias( qr/^([^\s_]+)[\s_]+([^\s_]*)$/i => '"$1-$2"' ); 274} 275 2761; 277__END__ 278 279# TODO: HP-UX '8' encodings arabic8 greek8 hebrew8 kana8 thai8 turkish8 280# TODO: HP-UX '15' encodings japanese15 korean15 roi15 281# TODO: Cyrillic encoding ISO-IR-111 (useful?) 282# TODO: Armenian encoding ARMSCII-8 283# TODO: Hebrew encoding ISO-8859-8-1 284# TODO: Thai encoding TCVN 285# TODO: Vietnamese encodings VPS 286# TODO: Mac Asian+African encodings: Arabic Armenian Bengali Burmese 287# ChineseSimp ChineseTrad Devanagari Ethiopic ExtArabic 288# Farsi Georgian Gujarati Gurmukhi Hebrew Japanese 289# Kannada Khmer Korean Laotian Malayalam Mongolian 290# Oriya Sinhalese Symbol Tamil Telugu Tibetan Vietnamese 291 292=head1 NAME 293 294Encode::Alias - alias definitions to encodings 295 296=head1 SYNOPSIS 297 298 use Encode; 299 use Encode::Alias; 300 define_alias( "newName" => ENCODING); 301 define_alias( qr/.../ => ENCODING); 302 define_alias( sub { return ENCODING if ...; } ); 303 304=head1 DESCRIPTION 305 306Allows newName to be used as an alias for ENCODING. ENCODING may be 307either the name of an encoding or an encoding object (as described 308in L<Encode>). 309 310Currently the first argument to define_alias() can be specified in the 311following ways: 312 313=over 4 314 315=item As a simple string. 316 317=item As a qr// compiled regular expression, e.g.: 318 319 define_alias( qr/^iso8859-(\d+)$/i => '"iso-8859-$1"' ); 320 321In this case, if I<ENCODING> is not a reference, it is C<eval>-ed 322in order to allow C<$1> etc. to be substituted. The example is one 323way to alias names as used in X11 fonts to the MIME names for the 324iso-8859-* family. Note the double quotes inside the single quotes. 325 326(or, you don't have to do this yourself because this example is predefined) 327 328If you are using a regex here, you have to use the quotes as shown or 329it won't work. Also note that regex handling is tricky even for the 330experienced. Use this feature with caution. 331 332=item As a code reference, e.g.: 333 334 define_alias( sub {shift =~ /^iso8859-(\d+)$/i ? "iso-8859-$1" : undef } ); 335 336The same effect as the example above in a different way. The coderef 337takes the alias name as an argument and returns a canonical name on 338success or undef if not. Note the second argument is ignored if provided. 339Use this with even more caution than the regex version. 340 341=back 342 343=head3 Changes in code reference aliasing 344 345As of Encode 1.87, the older form 346 347 define_alias( sub { return /^iso8859-(\d+)$/i ? "iso-8859-$1" : undef } ); 348 349no longer works. 350 351Encode up to 1.86 internally used "local $_" to implement this older 352form. But consider the code below; 353 354 use Encode; 355 $_ = "eeeee" ; 356 while (/(e)/g) { 357 my $utf = decode('aliased-encoding-name', $1); 358 print "position:",pos,"\n"; 359 } 360 361Prior to Encode 1.86 this fails because of "local $_". 362 363=head2 Alias overloading 364 365You can override predefined aliases by simply applying define_alias(). 366The new alias is always evaluated first, and when necessary, 367define_alias() flushes the internal cache to make the new definition 368available. 369 370 # redirect SHIFT_JIS to MS/IBM Code Page 932, which is a 371 # superset of SHIFT_JIS 372 373 define_alias( qr/shift.*jis$/i => '"cp932"' ); 374 define_alias( qr/sjis$/i => '"cp932"' ); 375 376If you want to zap all predefined aliases, you can use 377 378 Encode::Alias->undef_aliases; 379 380to do so. And 381 382 Encode::Alias->init_aliases; 383 384gets the factory settings back. 385 386Note that define_alias() will not be able to override the canonical name 387of encodings. Encodings are first looked up by canonical name before 388potential aliases are tried. 389 390=head1 SEE ALSO 391 392L<Encode>, L<Encode::Supported> 393 394=cut 395 396