1##---------------------------------------------------------------------------## 2## File: 3## $Id: Encode.pm,v 1.2 2003/03/05 22:17:15 ehood Exp $ 4## Author: 5## Earl Hood earl@earlhood.com 6## Description: 7## POD after __END__. 8##---------------------------------------------------------------------------## 9## Copyright (C) 2002 Earl Hood, earl@earlhood.com 10## 11## This program is free software; you can redistribute it and/or modify 12## it under the terms of the GNU General Public License as published by 13## the Free Software Foundation; either version 2 of the License, or 14## (at your option) any later version. 15## 16## This program is distributed in the hope that it will be useful, 17## but WITHOUT ANY WARRANTY; without even the implied warranty of 18## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 19## GNU General Public License for more details. 20## 21## You should have received a copy of the GNU General Public License 22## along with this program; if not, write to the Free Software 23## Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 24## 02111-1307, USA 25##---------------------------------------------------------------------------## 26 27package MHonArc::UTF8::Encode; 28 29use strict; 30use Encode; 31use MHonArc::CharMaps; 32 33our $VERSION = '2.6.24'; 34 35##---------------------------------------------------------------------------## 36 37sub clip { 38 my $str = \shift; # Prevent unnecessary copy. 39 my $len = shift; # Clip length 40 my $is_html = shift; # If entity references should be considered 41 my $has_tags = shift; # If html tags should be stripped 42 43 my $u = Encode::decode('utf8', $$str); 44 45 if (!$is_html) { 46 return substr($u, 0, $len); 47 } 48 49 my $text = Encode::decode('utf8', ''); 50 my $subtext; 51 my $html_len = length($u); 52 my ($pos, $sublen, $real_len, $semi); 53 my $er_len = 0; 54 55 for ($pos = 0, $sublen = $len; $pos < $html_len;) { 56 $subtext = substr($u, $pos, $sublen); 57 $pos += $sublen; 58 59 # strip tags 60 if ($has_tags) { 61 # Strip full tags 62 $subtext =~ s/<[^>]*>//g; 63 # Check if clipped part of a tag 64 if ($subtext =~ s/<[^>]*\Z//) { 65 my $gt = index($u, '>', $pos); 66 $pos = ($gt < 0) ? $html_len : ($gt + 1); 67 } 68 } 69 70 # check for clipped entity reference 71 if (($pos < $html_len) && ($subtext =~ /\&[^;]*\Z/)) { 72 my $semi = index($u, ';', $pos); 73 if ($semi < 0) { 74 # malformed entity reference 75 $subtext .= substr($u, $pos); 76 $pos = $html_len; 77 } else { 78 $subtext .= substr($u, $pos, $semi - $pos + 1); 79 $pos = $semi + 1; 80 } 81 } 82 83 # compute entity reference lengths to determine "real" character 84 # count and not raw character count. 85 while ($subtext =~ /(\&[^;]+);/g) { 86 $er_len += length($1); 87 } 88 89 $text .= $subtext; 90 91 # done if we have enough 92 $real_len = length($text) - $er_len; 93 if ($real_len >= $len) { 94 last; 95 } 96 $sublen = $len - (length($text) - $er_len); 97 } 98 Encode::encode('utf8', $text); 99} 100 101sub to_utf8 { 102 my $charset = lc $_[1]; 103 return $_[0] 104 if ($charset eq 'us-ascii' 105 || $charset eq 'utf-8' 106 || $charset eq 'utf8'); 107 my $text = $_[0]; 108 my $text_r = ref($text) ? $text : \$text; 109 eval { Encode::from_to($$text_r, $charset, 'utf8'); }; 110 if ($@) { 111 # fallback implementation. 112 require MHonArc::UTF8::MhaEncode; 113 return MHonArc::UTF8::MhaEncode::to_utf8($text_r, $charset); 114 } 115 $$text_r; 116} 117 118sub str2sgml { 119 my $text = shift; 120 my $charset = lc shift; 121 my $text_r = ref($text) ? $text : \$text; 122 123 if ($charset eq 'us-ascii') { 124 if ($$text_r =~ /[\x80-\xFF]/) { 125 $charset = 'iso-8859-1'; 126 } else { 127 $$text_r =~ s/([$HTMLSpecials])/$HTMLSpecials{$1}/go; 128 return $$text_r; 129 } 130 } 131 if ($charset eq 'utf-8' || $charset eq 'utf8') { 132 $$text_r =~ s/([$HTMLSpecials])/$HTMLSpecials{$1}/go; 133 return $$text_r; 134 } 135 eval { 136 Encode::from_to($$text_r, $charset, 'utf8'); 137 $$text_r =~ s/([$HTMLSpecials])/$HTMLSpecials{$1}/go; 138 }; 139 if ($@) { 140 # fallback implementation. 141 require MHonArc::UTF8::MhaEncode; 142 return MHonArc::UTF8::MhaEncode::str2sgml($text_r, $charset); 143 } 144 $$text_r; 145} 146 147##---------------------------------------------------------------------------## 1481; 149__END__ 150 151=head1 NAME 152 153MHonArc::UTF8::Encode - UTF-8 Encode-based routines for MHonArc 154 155=head1 SYNOPSIS 156 157 use MHonArc::UTF8::Encode; 158 159=head1 DESCRIPTION 160 161MHonArc::UTF8::Encode provides UTF-8 related routines for use in MHonArc 162by use Perl's v5.8, or later, Encode module. 163 164This module is generally not accessed directly since it is used by 165MHonArc::UTF8 when determining what encoding routines it can use based 166on your perl installation. 167 168=head1 FUNCTIONS 169 170=over 171 172=item C<to_utf8($data, $from_charset, $to_charset)> 173 174Converts C<$data> encoded in C<$from_charset> into UTF-8. 175C<$to_charset> is ignored since it assumed to be C<utf-8>. 176 177=item C<str2sgml($data, $charset)> 178 179All data passed in is converted to utf-8 with HTML specials 180converted into entity references. 181 182=item C<clip($text, $clip_len, $is_html, $has_tags)> 183 184Clip C<$text> to C<$clip_len> number of characters. 185 186=back 187 188=head1 SEE ALSO 189 190L<MHonArc::UTF8|MHonArc::UTF8> 191 192=head1 VERSION 193 194C<$Id: Encode.pm,v 1.2 2003/03/05 22:17:15 ehood Exp $> 195 196=head1 AUTHOR 197 198Earl Hood, earl@earlhood.com 199 200MHonArc comes with ABSOLUTELY NO WARRANTY and MHonArc may be copied only 201under the terms of the GNU General Public License, which may be found in 202the MHonArc distribution. 203 204=cut 205 206