1##---------------------------------------------------------------------------##
2##  File:
3##	$Id: Encode.pm,v 1.2 2003/03/05 22:17:15 ehood Exp $
4##  Author:
5##      Earl Hood       earl@earlhood.com
6##  Description:
7##	POD after __END__.
8##---------------------------------------------------------------------------##
9##    Copyright (C) 2002	Earl Hood, earl@earlhood.com
10##
11##    This program is free software; you can redistribute it and/or modify
12##    it under the terms of the GNU General Public License as published by
13##    the Free Software Foundation; either version 2 of the License, or
14##    (at your option) any later version.
15##
16##    This program is distributed in the hope that it will be useful,
17##    but WITHOUT ANY WARRANTY; without even the implied warranty of
18##    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
19##    GNU General Public License for more details.
20##
21##    You should have received a copy of the GNU General Public License
22##    along with this program; if not, write to the Free Software
23##    Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
24##    02111-1307, USA
25##---------------------------------------------------------------------------##
26
27package MHonArc::UTF8::Encode;
28
29use strict;
30use Encode;
31use MHonArc::CharMaps;
32
33our $VERSION = '2.6.24';
34
35##---------------------------------------------------------------------------##
36
37sub clip {
38    my $str      = \shift;    # Prevent unnecessary copy.
39    my $len      = shift;     # Clip length
40    my $is_html  = shift;     # If entity references should be considered
41    my $has_tags = shift;     # If html tags should be stripped
42
43    my $u = Encode::decode('utf8', $$str);
44
45    if (!$is_html) {
46        return substr($u, 0, $len);
47    }
48
49    my $text = Encode::decode('utf8', '');
50    my $subtext;
51    my $html_len = length($u);
52    my ($pos, $sublen, $real_len, $semi);
53    my $er_len = 0;
54
55    for ($pos = 0, $sublen = $len; $pos < $html_len;) {
56        $subtext = substr($u, $pos, $sublen);
57        $pos += $sublen;
58
59        # strip tags
60        if ($has_tags) {
61            # Strip full tags
62            $subtext =~ s/<[^>]*>//g;
63            # Check if clipped part of a tag
64            if ($subtext =~ s/<[^>]*\Z//) {
65                my $gt = index($u, '>', $pos);
66                $pos = ($gt < 0) ? $html_len : ($gt + 1);
67            }
68        }
69
70        # check for clipped entity reference
71        if (($pos < $html_len) && ($subtext =~ /\&[^;]*\Z/)) {
72            my $semi = index($u, ';', $pos);
73            if ($semi < 0) {
74                # malformed entity reference
75                $subtext .= substr($u, $pos);
76                $pos = $html_len;
77            } else {
78                $subtext .= substr($u, $pos, $semi - $pos + 1);
79                $pos = $semi + 1;
80            }
81        }
82
83        # compute entity reference lengths to determine "real" character
84        # count and not raw character count.
85        while ($subtext =~ /(\&[^;]+);/g) {
86            $er_len += length($1);
87        }
88
89        $text .= $subtext;
90
91        # done if we have enough
92        $real_len = length($text) - $er_len;
93        if ($real_len >= $len) {
94            last;
95        }
96        $sublen = $len - (length($text) - $er_len);
97    }
98    Encode::encode('utf8', $text);
99}
100
101sub to_utf8 {
102    my $charset = lc $_[1];
103    return $_[0]
104        if ($charset eq 'us-ascii'
105        || $charset eq 'utf-8'
106        || $charset eq 'utf8');
107    my $text = $_[0];
108    my $text_r = ref($text) ? $text : \$text;
109    eval { Encode::from_to($$text_r, $charset, 'utf8'); };
110    if ($@) {
111        # fallback implementation.
112        require MHonArc::UTF8::MhaEncode;
113        return MHonArc::UTF8::MhaEncode::to_utf8($text_r, $charset);
114    }
115    $$text_r;
116}
117
118sub str2sgml {
119    my $text    = shift;
120    my $charset = lc shift;
121    my $text_r  = ref($text) ? $text : \$text;
122
123    if ($charset eq 'us-ascii') {
124        if ($$text_r =~ /[\x80-\xFF]/) {
125            $charset = 'iso-8859-1';
126        } else {
127            $$text_r =~ s/([$HTMLSpecials])/$HTMLSpecials{$1}/go;
128            return $$text_r;
129        }
130    }
131    if ($charset eq 'utf-8' || $charset eq 'utf8') {
132        $$text_r =~ s/([$HTMLSpecials])/$HTMLSpecials{$1}/go;
133        return $$text_r;
134    }
135    eval {
136        Encode::from_to($$text_r, $charset, 'utf8');
137        $$text_r =~ s/([$HTMLSpecials])/$HTMLSpecials{$1}/go;
138    };
139    if ($@) {
140        # fallback implementation.
141        require MHonArc::UTF8::MhaEncode;
142        return MHonArc::UTF8::MhaEncode::str2sgml($text_r, $charset);
143    }
144    $$text_r;
145}
146
147##---------------------------------------------------------------------------##
1481;
149__END__
150
151=head1 NAME
152
153MHonArc::UTF8::Encode - UTF-8 Encode-based routines for MHonArc
154
155=head1 SYNOPSIS
156
157  use MHonArc::UTF8::Encode;
158
159=head1 DESCRIPTION
160
161MHonArc::UTF8::Encode provides UTF-8 related routines for use in MHonArc
162by use Perl's v5.8, or later, Encode module.
163
164This module is generally not accessed directly since it is used by
165MHonArc::UTF8 when determining what encoding routines it can use based
166on your perl installation.
167
168=head1 FUNCTIONS
169
170=over
171
172=item C<to_utf8($data, $from_charset, $to_charset)>
173
174Converts C<$data> encoded in C<$from_charset> into UTF-8.
175C<$to_charset> is ignored since it assumed to be C<utf-8>.
176
177=item C<str2sgml($data, $charset)>
178
179All data passed in is converted to utf-8 with HTML specials
180converted into entity references.
181
182=item C<clip($text, $clip_len, $is_html, $has_tags)>
183
184Clip C<$text> to C<$clip_len> number of characters.
185
186=back
187
188=head1 SEE ALSO
189
190L<MHonArc::UTF8|MHonArc::UTF8>
191
192=head1 VERSION
193
194C<$Id: Encode.pm,v 1.2 2003/03/05 22:17:15 ehood Exp $>
195
196=head1 AUTHOR
197
198Earl Hood, earl@earlhood.com
199
200MHonArc comes with ABSOLUTELY NO WARRANTY and MHonArc may be copied only
201under the terms of the GNU General Public License, which may be found in
202the MHonArc distribution.
203
204=cut
205
206