package Encode::HanConvert;
use 5.006;
use strict;

our ($VERSION, @EXPORT, @EXPORT_OK);

$VERSION = '0.35';
@EXPORT = qw(
    big5_to_gb trad_to_simp big5_to_simp gb_to_trad big5_to_trad gb_to_simp
    gb_to_big5 simp_to_trad simp_to_big5 trad_to_gb trad_to_big5 simp_to_gb
);

@EXPORT_OK = qw(simple trad);

use base 'Exporter';

# Load the pure-Perl converter and let it import into this package.
# Dies with the underlying error if the fallback cannot be loaded either.
sub _load_perl_converter {
    eval {
        # NOTE(review): $^W is presumably switched off to silence redefinition
        # warnings caused by the import -- confirm against
        # Encode::HanConvert::Perl.
        local $^W;
        require Encode::HanConvert::Perl;
        Encode::HanConvert::Perl->import;
        1;
    } or die "Can't load Perl-based Converter: $@";
    return 1;
}

# A string eval is required here: "use" runs at compile time, so this is the
# only way to attempt the Encode import at run time without dying when
# Encode is unavailable.
if (eval "use Encode qw|encode decode from_to encode_utf8 decode_utf8|; 1") {
    # Encode is available: prefer the XS implementation, falling back to the
    # pure-Perl converter when the XS object cannot be loaded.
    require XSLoader;
    eval { XSLoader::load(__PACKAGE__, $VERSION) }
        or _load_perl_converter();
}
else {
    # No usable Encode: the pure-Perl converter is the only option.
    _load_perl_converter();
}

# Calling convention shared by every conversion function below:
#   * $_[0] aliases the caller's argument.
#   * In scalar or list context the converted string is returned and the
#     caller's variable is left untouched.
#   * In void context the caller's variable is overwritten in place.

# Big5 octets to GBK octets.
sub big5_to_gb ($) {
    local $^W; # shuts Encode::HZ's redefine warnings up
    require Encode::CN;

    # from_to() converts in place, so work on a localized copy of $_[0]
    # whenever the caller wants a return value.
    local $_[0] = $_[0] if defined wantarray;
    from_to($_[0], 'big5-simp' => 'gbk');
    return $_[0];
}

# GBK octets to Big5 octets.
sub gb_to_big5 ($) {
    require Encode::TW;

    # Same in-place protection as big5_to_gb.
    local $_[0] = $_[0] if defined wantarray;
    from_to($_[0], 'gbk-trad' => 'big5');
    return $_[0];
}

# Traditional to Simplified, between Perl Unicode strings: the string is
# turned into UTF-8 octets and decoded through the 'trad-simp' encoding.
sub trad_to_simp ($) {
    return decode('trad-simp', encode_utf8($_[0]))
        if (defined wantarray);
    $_[0] = decode('trad-simp', encode_utf8($_[0]));
}

# Simplified to Traditional, between Perl Unicode strings, through the
# 'simp-trad' encoding.
sub simp_to_trad ($) {
    return decode('simp-trad', encode_utf8($_[0]))
        if (defined wantarray);
    $_[0] = decode('simp-trad', encode_utf8($_[0]));
}

# Big5 octets to a Simplified Chinese Unicode string.
sub big5_to_simp ($) {
    return decode('big5-simp', $_[0]) if (defined wantarray);
    $_[0] = decode('big5-simp', $_[0]);
}

# Simplified Chinese Unicode string to Big5 octets.
sub simp_to_big5 ($) {
    return encode('big5-simp', $_[0]) if (defined wantarray);
    $_[0] = encode('big5-simp', $_[0]);
}

# GBK octets to a Traditional Chinese Unicode string.
sub gb_to_trad ($) {
    return decode('gbk-trad', $_[0]) if (defined wantarray);
    $_[0] = decode('gbk-trad', $_[0]);
}

# Traditional Chinese Unicode string to GBK octets.
sub trad_to_gb ($) {
    return encode('gbk-trad', $_[0]) if (defined wantarray);
    $_[0] = encode('gbk-trad', $_[0]);
}

# For completeness' sake...
# The four functions below use the plain 'big5' and 'gbk' encodings, so they
# only encode or decode; no Traditional/Simplified mapping is applied.

# Big5 octets to a Unicode string.
sub big5_to_trad ($) {
    require Encode::TW;
    return decode('big5', $_[0]) if (defined wantarray);
    $_[0] = decode('big5', $_[0]);
}

# Unicode string to Big5 octets.
sub trad_to_big5 ($) {
    require Encode::TW;
    return encode('big5', $_[0]) if (defined wantarray);
    $_[0] = encode('big5', $_[0]);
}

# GBK octets to a Unicode string.
sub gb_to_simp ($) {
    local $^W;
    require Encode::CN;
    return decode('gbk', $_[0]) if (defined wantarray);
    $_[0] = decode('gbk', $_[0]);
}

# Unicode string to GBK octets.
sub simp_to_gb ($) {
    local $^W;
    require Encode::CN;
    return encode('gbk', $_[0]) if (defined wantarray);
    $_[0] = encode('gbk', $_[0]);
}

# Lingua::ZH::HanConvert drop-in replacement -- not exported by default
#
# Each wrapper's only statement is the call itself, so the caller's context
# (including void context, for in-place conversion) reaches the wrapped
# function, and $_[0] still aliases the caller's variable.

sub trad { simp_to_trad($_[0]) };
sub simple { trad_to_simp($_[0]) };

1;

__END__

=head1 NAME

Encode::HanConvert - Traditional and Simplified Chinese mappings

=head1 VERSION

This document describes version 0.35 of Encode::HanConvert, released
January 27, 2009.

=head1 SYNOPSIS

As command line utilities:

B<b2g.pl> [ B<-p> ] [ B<-u> ] [ I<inputfile> ...] > I<outputfile>

B<g2b.pl> [ B<-p> ] [ B<-u> ] [ I<inputfile> ...]
> I<outputfile>

In your program:

    # The XS-based implementation needs Encode.pm 1.41;
    # otherwise, autoloads the Perl-based Encode::HanConvert::Perl
    use Encode::HanConvert;

    # Conversion between Chinese encodings
    $gbk  = big5_to_gb($big5);      # Big5 to GBK
    $big5 = gb_to_big5($gbk);       # GBK to Big5

    # Conversion between Perl's Unicode strings
    $simp = trad_to_simp($trad);    # Traditional to Simplified
    $trad = simp_to_trad($simp);    # Simplified to Traditional

    # Conversion between Chinese encoding and Unicode strings
    $simp = big5_to_simp($big5);    # Big5 to Simplified
    $big5 = simp_to_big5($simp);    # Simplified to Big5
    $trad = gb_to_trad($gbk);       # GBK to Traditional
    $gbk  = trad_to_gb($trad);      # Traditional to GBK

    # For completeness' sake... (no conversion, just encode/decode)
    $simp = gb_to_simp($gbk);       # GBK to Simplified
    $gbk  = simp_to_gb($simp);      # Simplified to GBK
    $trad = big5_to_trad($big5);    # Big5 to Traditional
    $big5 = trad_to_big5($trad);    # Traditional to Big5

    # All functions may be used in void context to transform $_[0]
    big5_to_gb($string);            # convert $string from Big5 to GBK

    # Drop-in replacement functions for Lingua::ZH::HanConvert
    use Encode::HanConvert qw(trad simple); # not exported by default

    $simp = simple($trad);          # Traditional to Simplified
    $trad = trad($simp);            # Simplified to Traditional

=head1 DESCRIPTION

This module is an attempt to solve the most common problems that occur in
Traditional vs. Simplified Chinese conversion, in an efficient,
flexible way, without resorting to external tools or modules.

If you are using perl 5.7.2 or earlier, all Unicode-related functions
are disabled, and B<Encode::HanConvert::Perl> is automagically loaded
and used instead.  In that case, please consult L<Encode::HanConvert::Perl>
instead.

After installing this module, you'll have two additional encoding
formats: C<big5-simp> maps I<Big5> into Unicode's Simplified Chinese
(and vice versa), and C<gbk-trad> maps I<GBK> (also known as I<CP936>)
into Unicode's Traditional Chinese and back.

The module exports various C<xxx_to_yyy> functions by default, where
xxx and yyy are one of C<big5>, C<gb> (i.e. GBK/CP936), C<simp>
(simplified Chinese unicode), or C<trad> (traditional Chinese unicode).

You may also import C<simple> and C<trad>, which are aliases for
C<trad_to_simp> and C<simp_to_trad> respectively; this is provided as a
drop-in replacement for programs using L<Lingua::ZH::HanConvert>.

Since this is built on L<Encode>'s architecture, you may also use
the line discipline syntax to perform the conversion implicitly
(before 5.7.3, you need to use 'cp936' in place of 'gbk'):

    require Encode::CN;
    open BIG5, '<:encoding(big5-simp)', 'big5.txt'; # as simplified
    open EUC, '>:encoding(gbk)', 'gbk.txt';         # as gbk
    print EUC <BIG5>;

    require Encode::TW;
    open EUC, '<:encoding(gbk-trad)', 'gbk.txt';    # as traditional
    open BIG5, '>:encoding(big5)', 'big5.txt';      # as big-5
    print BIG5 <EUC>;

Or, more interestingly:

=for encoding big5

    use encoding 'big5-simp';
    print "����"; # prints simplified Chinese in unicode

=head1 COMPARISON

Although the L<Lingua::ZH::HanConvert> module already provides mapping
between Simplified and Traditional Unicode characters, it depends on
other modules (L<Text::Iconv> or L<Encode>) to provide the necessary
mapping with B<Big5> and B<GBK> encodings.

Also, L<Encode::HanConvert> loads up much faster:

    0.04 real    0.03 user    0.01 sys # Encode::HanConvert
    0.19 real    0.18 user    0.00 sys # Encode::HanConvert::Perl
    1.68 real    1.66 user    0.01 sys # Lingua::ZH::HanConvert (v0.12)

The difference in actual conversion is much more significant.
Taking the trad => simp conversion of a 5 MB text as an example:

     0.77 real    0.25 user    0.00 sys # iconv | b2g | iconv
     0.64 real    0.59 user    0.04 sys # Encode::HanConvert b2g.pl -u
    13.79 real   13.69 user    0.02 sys # Lingua::ZH::HanConvert trad2simp (v0.12)

The C<b2g> above refers to Yeung and Lee's I<HanZi Converter>, a C-based
program that maps big5 to gb2312 and back; C<iconv> refers to GNU
libiconv.  If you don't mind the overhead of calling an external process,
their results are nearly identical to this module's; however, their map
falls short on rarely-used characters and box-drawing symbols.

=head1 CAVEATS

Please note that from version 0.03 and above, this module supports the
more expressive range B<GBK> instead of B<EUC-CN>.  This may cause
incompatibilities with older fonts.  Programs using an earlier version
of this module should rename C<euc-cn-trad> into C<gbk-trad>; sorry for
the inconvenience.

This module does not preserve one-to-many mappings; it blindly chooses
the most frequently used substitutions, instead of presenting the user
multiple choices.  This can be remedied by a dictionary-based post
processor that restores the correct character.

As of version 0.05, the mapping from Big5 to GBK is I<complete>: All
displayable Big5 characters are mapped, although substitute characters
are used where there are no directly corresponding characters.

However, there are numerous GBK characters without Big5 counterparts:
C<grep �� map/g2b_map.txt> from the distribution directory should show
all of them.  Any help on completing this mapping is very much appreciated.

=head1 ACKNOWLEDGEMENTS

The conversion table used in this module comes from various sources,
including B<Lingua::ZH::HanConvert> by David Chan, B<hc> by Ricky
Yeung & Fung F. Lee, and B<Doggy Chinese Big5-GB Conversion Master>
from Doggy Digital Creative Inc.
(L<http://www.miniasp.com/>), Rei-Li
Chen (rexchen), Unicode consortium's Unicode Character Database
(L<http://www.unicode.org/ucd/>), as well as mappings used in Microsoft Word
2000, Far East edition.

The F<*.ucm> files are checked against test files generated by GNU
libiconv with kind permission from Bruno Haible.

Kudos to Nick Ing-Simmons, Dan Kogai and Jarkko Hietaniemi for
showing me how to use B<Encode> and PerlIO.  Thanks!

=head1 SEE ALSO

L<Encode::HanConvert::Perl>, L<Encode>, L<Lingua::ZH::HanConvert>,
L<Text::Iconv>

The L<b2g.pl> and L<g2b.pl> utilities installed with this module.

=head1 AUTHORS

Audrey Tang E<lt>cpan@audreyt.orgE<gt>,
Kuang-che Wu E<lt>kcwu@csie.orgE<gt>.

=head1 COPYRIGHT

Copyright 2002-2009 by Audrey Tang E<lt>cpan@audreyt.orgE<gt>.
Copyright 2006 by Kuang-che Wu E<lt>kcwu@csie.orgE<gt>.

This program is free software; you can redistribute it and/or
modify it under the same terms as Perl itself.

See L<http://www.perl.com/perl/misc/Artistic.html>

=cut