1package Lingua::Han::Utils; 2 3use warnings; 4use strict; 5use base 'Exporter'; 6use vars qw/$VERSION @EXPORT_OK/; 7$VERSION = '0.13'; 8@EXPORT_OK = qw/Unihan_value csplit cdecode csubstr clength/; 9 10use Encode; 11use Encode::Detect::CJK qw(detect); 12 13sub cdecode { 14 my $word = shift; 15 my $encoding = detect($word); 16 $encoding = 'cp936' if $encoding eq 'iso-8859-1'; # hard fix 17 $word = decode($encoding, $word); 18 return $word; 19} 20 21sub Unihan_value { 22 my $word = shift; 23 $word = cdecode($word) unless Encode::is_utf8($word); 24 my @unihan = map { uc sprintf("%x",$_) } unpack ("U*", $word); 25 return wantarray?@unihan:(join('', @unihan)); 26} 27 28sub csplit { 29 my $word = shift; 30 my $encoding = detect($word); 31 my @return_words; 32 my @code = Unihan_value($word); 33 foreach my $code (@code) { 34 my $value = pack("U*", hex $code); 35 $value = encode($encoding, $value); 36 push @return_words, $value if ($value); 37 } 38 return wantarray?@return_words:(join('', @return_words)); 39} 40 41sub csubstr { 42 my ($word, $offset, $len) = @_; 43 my @words = csplit($word); 44 $len = scalar @words - $offset unless ($len); 45 @words = splice(@words, $offset, $len); 46 return wantarray?@words:(join('', @words)); 47} 48 49sub clength { 50 my $word = shift; 51 my @words = csplit($word); 52 return scalar @words; 53} 54 551; 56__END__ 57=encoding utf8 58 59=head1 NAME 60 61Lingua::Han::Utils - The utility tools of Chinese character(HanZi) 62 63=head1 SYNOPSIS 64 65 use Lingua::Han::Utils qw/Unihan_value csplit cdecode csubstr clength/; 66 67 # cdecode 68 # the same as decode('cp936', $word) in ASCII editing mode 69 # and decode('utf8', $word) in Unicode editing mode 70 my $word = cdecode($word); 71 72 # Unihan_value 73 # return the first field of Unihan.txt on unicode.org 74 my $word = "我"; 75 my $unihan = Unihan_value($word); # return '6211' 76 my $words = "爱你"; 77 my @unihan = Unihan_value($word); # return (7231, 4F60) 78 my $unihan = Unihan_value($word); # return 72314F60 79 80 # csplit 81 # split the Chinese characters into an array 82 my $words = "我爱你"; 83 my @words = csplit($words); # return ("我", "爱", "你") 84 85 # csubstr 86 # treat the Chinese characters as one 87 # so it's the same as splice(csplit($words), $offset, $length) 88 my $words = "我爱你啊"; 89 my @words = csubstr($words, 1, 2); # return ("爱", "你") 90 my @words = csubstr($words, 1); # return ("爱", "你", "啊") 91 my $words = csubstr($words, 1, 2); # 爱你 92 93 # clength 94 # treat the Chinese character as one 95 my $words = "我爱你"; 96 print clength($words); # 3 97 98=head1 EXPORT 99 100Nothing is exported by default. 101 102=head1 EXPORT_OK 103 104=over 4 105 106=item cdecode 107 108use L<Encode::Guess> to decode the character. It behavers like: decode('cp936', $word) under ASCII editing mode and decode('utf8', $word) under Unicode editing mode. 109 110=item Unihan_value 111 112the first field of Unihan.txt is the Unicode scalar value as U+[x]xxxx, we return the [x]xxxx. 113 114=item csplit 115 116split the Chinese characters into an array, English words can be mixed in. 117 118=item csubstr(WORD, OFFSET, LENGTH) 119 120treat the Chinese character as one word, substr it. 121 122(BE CAFEFUL! it's NOT lvalue, we cann't use csubstr($word, 2, 3) = $REPLACEMENT) 123 124if no LENGTH is specified, substr form OFFSET to END. 125 126=item clength 127 128treat the Chinese character as one word(length 1). 129 130=back 131 132=head1 DOCUMENT 133 134a Chinese version of document can be found @ L<http://www.fayland.org/journal/Lingua-Han-Utils.html> 135 136=head1 AUTHOR 137 138Fayland Lam, C<< <fayland at gmail.com> >> 139 140=head1 BUGS 141 142Please report any bugs or feature requests to 143C<bug-lingua-han-utils at rt.cpan.org>, or through the web interface at 144L<http://rt.cpan.org/NoAuth/ReportBug.html?Queue=Lingua-Han-Utils>. 145I will be notified, and then you'll automatically be notified of progress on 146your bug as I make changes. 147 148=head1 SUPPORT 149 150You can find documentation for this module with the perldoc command. 151 152 perldoc Lingua::Han::Utils 153 154You can also look for information at: 155 156=over 4 157 158=item * AnnoCPAN: Annotated CPAN documentation 159 160L<http://annocpan.org/dist/Lingua-Han-Utils> 161 162=item * CPAN Ratings 163 164L<http://cpanratings.perl.org/d/Lingua-Han-Utils> 165 166=item * RT: CPAN's request tracker 167 168L<http://rt.cpan.org/NoAuth/Bugs.html?Dist=Lingua-Han-Utils> 169 170=item * Search CPAN 171 172L<http://search.cpan.org/dist/Lingua-Han-Utils> 173 174=back 175 176=head1 ACKNOWLEDGEMENTS 177 178the wonderful L<Encode::Guess> 179 180=head1 COPYRIGHT & LICENSE 181 182Copyright 2005 Fayland Lam, all rights reserved. 183 184This program is free software; you can redistribute it and/or modify it 185under the same terms as Perl itself. 186