1package Encode::Alias;
2use strict;
3use warnings;
4our $VERSION = do { my @r = ( q$Revision: 2.24 $ =~ /\d+/g ); sprintf "%d." . "%02d" x $#r, @r };
5use constant DEBUG => !!$ENV{PERL_ENCODE_DEBUG};
6
7use Exporter 'import';
8
9# Public, encouraged API is exported by default
10
11our @EXPORT =
12  qw (
13  define_alias
14  find_alias
15);
16
17our @Alias;    # ordered matching list
18our %Alias;    # cached known aliases
19
20sub find_alias {
21    my $class = shift;
22    my $find  = shift;
23    unless ( exists $Alias{$find} ) {
24        $Alias{$find} = undef;    # Recursion guard
25        for ( my $i = 0 ; $i < @Alias ; $i += 2 ) {
26            my $alias = $Alias[$i];
27            my $val   = $Alias[ $i + 1 ];
28            my $new;
29            if ( ref($alias) eq 'Regexp' && $find =~ $alias ) {
30                DEBUG and warn "eval $val";
31                $new = eval $val;
32                DEBUG and $@ and warn "$val, $@";
33            }
34            elsif ( ref($alias) eq 'CODE' ) {
35                DEBUG and warn "$alias", "->", "($find)";
36                $new = $alias->($find);
37            }
38            elsif ( lc($find) eq lc($alias) ) {
39                $new = $val;
40            }
41            if ( defined($new) ) {
42                next if $new eq $find;    # avoid (direct) recursion on bugs
43                DEBUG and warn "$alias, $new";
44                my $enc =
45                  ( ref($new) ) ? $new : Encode::find_encoding($new);
46                if ($enc) {
47                    $Alias{$find} = $enc;
48                    last;
49                }
50            }
51        }
52
53        # case insensitive search when canonical is not in all lowercase
54        # RT ticket #7835
55        unless ( $Alias{$find} ) {
56            my $lcfind = lc($find);
57            for my $name ( keys %Encode::Encoding, keys %Encode::ExtModule )
58            {
59                $lcfind eq lc($name) or next;
60                $Alias{$find} = Encode::find_encoding($name);
61                DEBUG and warn "$find => $name";
62            }
63        }
64    }
65    if (DEBUG) {
66        my $name;
67        if ( my $e = $Alias{$find} ) {
68            $name = $e->name;
69        }
70        else {
71            $name = "";
72        }
73        warn "find_alias($class, $find)->name = $name";
74    }
75    return $Alias{$find};
76}
77
78sub define_alias {
79    while (@_) {
80        my $alias = shift;
81        my $name = shift;
82        unshift( @Alias, $alias => $name )    # newer one has precedence
83            if defined $alias;
84        if ( ref($alias) ) {
85
86            # clear %Alias cache to allow overrides
87            my @a = keys %Alias;
88            for my $k (@a) {
89                if ( ref($alias) eq 'Regexp' && $k =~ $alias ) {
90                    DEBUG and warn "delete \$Alias\{$k\}";
91                    delete $Alias{$k};
92                }
93                elsif ( ref($alias) eq 'CODE' && $alias->($k) ) {
94                    DEBUG and warn "delete \$Alias\{$k\}";
95                    delete $Alias{$k};
96                }
97            }
98        }
99        elsif (defined $alias) {
100            DEBUG and warn "delete \$Alias\{$alias\}";
101            delete $Alias{$alias};
102        }
103        elsif (DEBUG) {
104            require Carp;
105            Carp::croak("undef \$alias");
106        }
107    }
108}
109
110# HACK: Encode must be used after define_alias is declarated as Encode calls define_alias
111use Encode ();
112
113# Allow latin-1 style names as well
114# 0  1  2  3  4  5   6   7   8   9  10
115our @Latin2iso = ( 0, 1, 2, 3, 4, 9, 10, 13, 14, 15, 16 );
116
117# Allow winlatin1 style names as well
118our %Winlatin2cp = (
119    'latin1'     => 1252,
120    'latin2'     => 1250,
121    'cyrillic'   => 1251,
122    'greek'      => 1253,
123    'turkish'    => 1254,
124    'hebrew'     => 1255,
125    'arabic'     => 1256,
126    'baltic'     => 1257,
127    'vietnamese' => 1258,
128);
129
130init_aliases();
131
132sub undef_aliases {
133    @Alias = ();
134    %Alias = ();
135}
136
137sub init_aliases {
138    undef_aliases();
139
140    # Try all-lower-case version should all else fails
141    define_alias( qr/^(.*)$/ => '"\L$1"' );
142
143    # UTF/UCS stuff
144    define_alias( qr/^(unicode-1-1-)?UTF-?7$/i     => '"UTF-7"' );
145    define_alias( qr/^UCS-?2-?LE$/i => '"UCS-2LE"' );
146    define_alias(
147        qr/^UCS-?2-?(BE)?$/i    => '"UCS-2BE"',
148        qr/^UCS-?4-?(BE|LE|)?$/i => 'uc("UTF-32$1")',
149        qr/^iso-10646-1$/i      => '"UCS-2BE"'
150    );
151    define_alias(
152        qr/^UTF-?(16|32)-?BE$/i => '"UTF-$1BE"',
153        qr/^UTF-?(16|32)-?LE$/i => '"UTF-$1LE"',
154        qr/^UTF-?(16|32)$/i     => '"UTF-$1"',
155    );
156
157    # ASCII
158    define_alias( qr/^(?:US-?)ascii$/i       => '"ascii"' );
159    define_alias( 'C'                        => 'ascii' );
160    define_alias( qr/\b(?:ISO[-_]?)?646(?:[-_]?US)?$/i => '"ascii"' );
161
162    # Allow variants of iso-8859-1 etc.
163    define_alias( qr/\biso[-_]?(\d+)[-_](\d+)$/i => '"iso-$1-$2"' );
164
165    # At least HP-UX has these.
166    define_alias( qr/\biso8859(\d+)$/i => '"iso-8859-$1"' );
167
168    # More HP stuff.
169    define_alias(
170        qr/\b(?:hp-)?(arabic|greek|hebrew|kana|roman|thai|turkish)8$/i =>
171          '"${1}8"' );
172
173    # The Official name of ASCII.
174    define_alias( qr/\bANSI[-_]?X3\.4[-_]?1968$/i => '"ascii"' );
175
176    # This is a font issue, not an encoding issue.
177    # (The currency symbol of the Latin 1 upper half
178    #  has been redefined as the euro symbol.)
179    define_alias( qr/^(.+)\@euro$/i => '"$1"' );
180
181    define_alias( qr/\b(?:iso[-_]?)?latin[-_]?(\d+)$/i =>
182'defined $Encode::Alias::Latin2iso[$1] ? "iso-8859-$Encode::Alias::Latin2iso[$1]" : undef'
183    );
184
185    define_alias(
186        qr/\bwin(latin[12]|cyrillic|baltic|greek|turkish|
187             hebrew|arabic|baltic|vietnamese)$/ix =>
188          '"cp" . $Encode::Alias::Winlatin2cp{lc($1)}'
189    );
190
191    # Common names for non-latin preferred MIME names
192    define_alias(
193        'ascii'    => 'US-ascii',
194        'cyrillic' => 'iso-8859-5',
195        'arabic'   => 'iso-8859-6',
196        'greek'    => 'iso-8859-7',
197        'hebrew'   => 'iso-8859-8',
198        'thai'     => 'iso-8859-11',
199    );
200    # RT #20781
201    define_alias(qr/\btis-?620\b/i  => '"iso-8859-11"');
202
203    # At least AIX has IBM-NNN (surprisingly...) instead of cpNNN.
204    # And Microsoft has their own naming (again, surprisingly).
205    # And windows-* is registered in IANA!
206    define_alias(
207        qr/\b(?:cp|ibm|ms|windows)[-_ ]?(\d{2,4})$/i => '"cp$1"' );
208
209    # Sometimes seen with a leading zero.
210    # define_alias( qr/\bcp037\b/i => '"cp37"');
211
212    # Mac Mappings
213    # predefined in *.ucm; unneeded
214    # define_alias( qr/\bmacIcelandic$/i => '"macIceland"');
215    define_alias( qr/^(?:x[_-])?mac[_-](.*)$/i => '"mac$1"' );
216    # http://rt.cpan.org/Ticket/Display.html?id=36326
217    define_alias( qr/^macintosh$/i => '"MacRoman"' );
218    # https://rt.cpan.org/Ticket/Display.html?id=78125
219    define_alias( qr/^macce$/i => '"MacCentralEurRoman"' );
220    # Ououououou. gone.  They are different!
221    # define_alias( qr/\bmacRomanian$/i => '"macRumanian"');
222
223    # Standardize on the dashed versions.
224    define_alias( qr/\bkoi8[\s\-_]*([ru])$/i => '"koi8-$1"' );
225
226    unless ($Encode::ON_EBCDIC) {
227
228        # for Encode::CN
229        define_alias( qr/\beuc.*cn$/i => '"euc-cn"' );
230        define_alias( qr/\bcn.*euc$/i => '"euc-cn"' );
231
232        # define_alias( qr/\bGB[- ]?(\d+)$/i => '"euc-cn"' )
233        # CP936 doesn't have vendor-addon for GBK, so they're identical.
234        define_alias( qr/^gbk$/i => '"cp936"' );
235
236        # This fixes gb2312 vs. euc-cn confusion, practically
237        define_alias( qr/\bGB[-_ ]?2312(?!-?raw)/i => '"euc-cn"' );
238
239        # for Encode::JP
240        define_alias( qr/\bjis$/i         => '"7bit-jis"' );
241        define_alias( qr/\beuc.*jp$/i     => '"euc-jp"' );
242        define_alias( qr/\bjp.*euc$/i     => '"euc-jp"' );
243        define_alias( qr/\bujis$/i        => '"euc-jp"' );
244        define_alias( qr/\bshift.*jis$/i  => '"shiftjis"' );
245        define_alias( qr/\bsjis$/i        => '"shiftjis"' );
246        define_alias( qr/\bwindows-31j$/i => '"cp932"' );
247
248        # for Encode::KR
249        define_alias( qr/\beuc.*kr$/i => '"euc-kr"' );
250        define_alias( qr/\bkr.*euc$/i => '"euc-kr"' );
251
252        # This fixes ksc5601 vs. euc-kr confusion, practically
253        define_alias( qr/(?:x-)?uhc$/i         => '"cp949"' );
254        define_alias( qr/(?:x-)?windows-949$/i => '"cp949"' );
255        define_alias( qr/\bks_c_5601-1987$/i   => '"cp949"' );
256
257        # for Encode::TW
258        define_alias( qr/\bbig-?5$/i              => '"big5-eten"' );
259        define_alias( qr/\bbig5-?et(?:en)?$/i     => '"big5-eten"' );
260        define_alias( qr/\btca[-_]?big5$/i        => '"big5-eten"' );
261        define_alias( qr/\bbig5-?hk(?:scs)?$/i    => '"big5-hkscs"' );
262        define_alias( qr/\bhk(?:scs)?[-_]?big5$/i => '"big5-hkscs"' );
263    }
264
265    # https://github.com/dankogai/p5-encode/issues/37
266    define_alias(qr/cp65000/i => '"UTF-7"');
267    define_alias(qr/cp65001/i => '"utf-8-strict"');
268
269    # utf8 is blessed :)
270    define_alias( qr/\bUTF-8$/i => '"utf-8-strict"' );
271
272    # At last, Map white space and _ to '-'
273    define_alias( qr/^([^\s_]+)[\s_]+([^\s_]*)$/i => '"$1-$2"' );
274}
275
2761;
277__END__
278
279# TODO: HP-UX '8' encodings arabic8 greek8 hebrew8 kana8 thai8 turkish8
280# TODO: HP-UX '15' encodings japanese15 korean15 roi15
281# TODO: Cyrillic encoding ISO-IR-111 (useful?)
282# TODO: Armenian encoding ARMSCII-8
283# TODO: Hebrew encoding ISO-8859-8-1
284# TODO: Thai encoding TCVN
285# TODO: Vietnamese encodings VPS
286# TODO: Mac Asian+African encodings: Arabic Armenian Bengali Burmese
287#       ChineseSimp ChineseTrad Devanagari Ethiopic ExtArabic
288#       Farsi Georgian Gujarati Gurmukhi Hebrew Japanese
289#       Kannada Khmer Korean Laotian Malayalam Mongolian
290#       Oriya Sinhalese Symbol Tamil Telugu Tibetan Vietnamese
291
292=head1 NAME
293
294Encode::Alias - alias definitions to encodings
295
296=head1 SYNOPSIS
297
298  use Encode;
299  use Encode::Alias;
300  define_alias( "newName" => ENCODING);
301  define_alias( qr/.../ => ENCODING);
302  define_alias( sub { return ENCODING if ...; } );
303
304=head1 DESCRIPTION
305
306Allows newName to be used as an alias for ENCODING. ENCODING may be
307either the name of an encoding or an encoding object (as described
308in L<Encode>).
309
310Currently the first argument to define_alias() can be specified in the
311following ways:
312
313=over 4
314
315=item As a simple string.
316
317=item As a qr// compiled regular expression, e.g.:
318
319  define_alias( qr/^iso8859-(\d+)$/i => '"iso-8859-$1"' );
320
321In this case, if I<ENCODING> is not a reference, it is C<eval>-ed
322in order to allow C<$1> etc. to be substituted.  The example is one
323way to alias names as used in X11 fonts to the MIME names for the
324iso-8859-* family.  Note the double quotes inside the single quotes.
325
326(or, you don't have to do this yourself because this example is predefined)
327
328If you are using a regex here, you have to use the quotes as shown or
329it won't work.  Also note that regex handling is tricky even for the
330experienced.  Use this feature with caution.
331
332=item As a code reference, e.g.:
333
334  define_alias( sub {shift =~ /^iso8859-(\d+)$/i ? "iso-8859-$1" : undef } );
335
336The same effect as the example above in a different way.  The coderef
337takes the alias name as an argument and returns a canonical name on
338success or undef if not.  Note the second argument is ignored if provided.
339Use this with even more caution than the regex version.
340
341=back
342
343=head3 Changes in code reference aliasing
344
345As of Encode 1.87, the older form
346
347  define_alias( sub { return  /^iso8859-(\d+)$/i ? "iso-8859-$1" : undef } );
348
349no longer works.
350
351Encode up to 1.86 internally used "local $_" to implement this older
352form.  But consider the code below;
353
354  use Encode;
355  $_ = "eeeee" ;
356  while (/(e)/g) {
357    my $utf = decode('aliased-encoding-name', $1);
358    print "position:",pos,"\n";
359  }
360
361Prior to Encode 1.86 this fails because of "local $_".
362
363=head2 Alias overloading
364
365You can override predefined aliases by simply applying define_alias().
366The new alias is always evaluated first, and when necessary,
367define_alias() flushes the internal cache to make the new definition
368available.
369
370  # redirect SHIFT_JIS to MS/IBM Code Page 932, which is a
371  # superset of SHIFT_JIS
372
373  define_alias( qr/shift.*jis$/i  => '"cp932"' );
374  define_alias( qr/sjis$/i        => '"cp932"' );
375
376If you want to zap all predefined aliases, you can use
377
378  Encode::Alias->undef_aliases;
379
380to do so.  And
381
382  Encode::Alias->init_aliases;
383
384gets the factory settings back.
385
386Note that define_alias() will not be able to override the canonical name
387of encodings. Encodings are first looked up by canonical name before
388potential aliases are tried.
389
390=head1 SEE ALSO
391
392L<Encode>, L<Encode::Supported>
393
394=cut
395
396