1package Lingua::Han::Utils;
2
3use warnings;
4use strict;
5use base 'Exporter';
6use vars qw/$VERSION @EXPORT_OK/;
7$VERSION = '0.13';
8@EXPORT_OK = qw/Unihan_value csplit cdecode csubstr clength/;
9
10use Encode;
11use Encode::Detect::CJK qw(detect);
12
13sub cdecode {
14	my $word = shift;
15	my $encoding = detect($word);
16	$encoding = 'cp936' if $encoding eq 'iso-8859-1'; # hard fix
17	$word = decode($encoding, $word);
18	return $word;
19}
20
21sub Unihan_value {
22	my $word = shift;
23	$word = cdecode($word) unless Encode::is_utf8($word);
24	my @unihan = map { uc sprintf("%x",$_) } unpack ("U*", $word);
25	return wantarray?@unihan:(join('', @unihan));
26}
27
28sub csplit {
29	my $word = shift;
30	my $encoding = detect($word);
31	my @return_words;
32	my @code = Unihan_value($word);
33	foreach my $code (@code) {
34		my $value = pack("U*", hex $code);
35		$value = encode($encoding, $value);
36		push @return_words, $value if ($value);
37	}
38	return wantarray?@return_words:(join('', @return_words));
39}
40
41sub csubstr {
42	my ($word, $offset, $len) = @_;
43	my @words = csplit($word);
44	$len = scalar @words - $offset unless ($len);
45	@words = splice(@words, $offset, $len);
46	return wantarray?@words:(join('', @words));
47}
48
49sub clength {
50	my $word = shift;
51	my @words = csplit($word);
52	return scalar @words;
53}
54
551;
56__END__
57=encoding utf8
58
59=head1 NAME
60
61Lingua::Han::Utils - The utility tools of Chinese character(HanZi)
62
63=head1 SYNOPSIS
64
65    use Lingua::Han::Utils qw/Unihan_value csplit cdecode csubstr clength/;
66
67    # cdecode
68    # the same as decode('cp936', $word) in ASCII editing mode
69    #         and decode('utf8', $word) in Unicode editing mode
70    my $word = cdecode($word);
71
72    # Unihan_value
73    # return the first field of Unihan.txt on unicode.org
74    my $word = "我";
75    my $unihan = Unihan_value($word); # return '6211'
76    my $words = "爱你";
77    my @unihan = Unihan_value($word); # return (7231, 4F60)
78    my $unihan = Unihan_value($word); # return 72314F60
79
80    # csplit
81    # split the Chinese characters into an array
82    my $words = "我爱你";
83    my @words = csplit($words); # return ("我", "爱", "你")
84
85    # csubstr
86    # treat the Chinese characters as one
87    # so it's the same as splice(csplit($words), $offset, $length)
88    my $words = "我爱你啊";
89    my @words = csubstr($words, 1, 2); # return ("爱", "你")
90    my @words = csubstr($words, 1); # return ("爱", "你", "啊")
91    my $words = csubstr($words, 1, 2); # 爱你
92
93    # clength
94    # treat the Chinese character as one
95    my $words = "我爱你";
96    print clength($words); # 3
97
98=head1 EXPORT
99
100Nothing is exported by default.
101
102=head1 EXPORT_OK
103
104=over 4
105
106=item cdecode
107
108use L<Encode::Guess> to decode the character. It behavers like: decode('cp936', $word) under ASCII editing mode and decode('utf8', $word) under Unicode editing mode.
109
110=item Unihan_value
111
112the first field of Unihan.txt is the Unicode scalar value as U+[x]xxxx, we return the [x]xxxx.
113
114=item csplit
115
116split the Chinese characters into an array, English words can be mixed in.
117
118=item csubstr(WORD, OFFSET, LENGTH)
119
120treat the Chinese character as one word, substr it.
121
122(BE CAFEFUL! it's NOT lvalue, we cann't use csubstr($word, 2, 3) = $REPLACEMENT)
123
124if no LENGTH is specified, substr form OFFSET to END.
125
126=item clength
127
128treat the Chinese character as one word(length 1).
129
130=back
131
132=head1 DOCUMENT
133
134a Chinese version of document can be found @ L<http://www.fayland.org/journal/Lingua-Han-Utils.html>
135
136=head1 AUTHOR
137
138Fayland Lam, C<< <fayland at gmail.com> >>
139
140=head1 BUGS
141
142Please report any bugs or feature requests to
143C<bug-lingua-han-utils at rt.cpan.org>, or through the web interface at
144L<http://rt.cpan.org/NoAuth/ReportBug.html?Queue=Lingua-Han-Utils>.
145I will be notified, and then you'll automatically be notified of progress on
146your bug as I make changes.
147
148=head1 SUPPORT
149
150You can find documentation for this module with the perldoc command.
151
152    perldoc Lingua::Han::Utils
153
154You can also look for information at:
155
156=over 4
157
158=item * AnnoCPAN: Annotated CPAN documentation
159
160L<http://annocpan.org/dist/Lingua-Han-Utils>
161
162=item * CPAN Ratings
163
164L<http://cpanratings.perl.org/d/Lingua-Han-Utils>
165
166=item * RT: CPAN's request tracker
167
168L<http://rt.cpan.org/NoAuth/Bugs.html?Dist=Lingua-Han-Utils>
169
170=item * Search CPAN
171
172L<http://search.cpan.org/dist/Lingua-Han-Utils>
173
174=back
175
176=head1 ACKNOWLEDGEMENTS
177
178the wonderful L<Encode::Guess>
179
180=head1 COPYRIGHT & LICENSE
181
182Copyright 2005 Fayland Lam, all rights reserved.
183
184This program is free software; you can redistribute it and/or modify it
185under the same terms as Perl itself.
186