1# 2# -*- Perl -*- 3# $Id: html.pl,v 1.32.4.19 2008-05-09 07:51:00 opengl2772 Exp $ 4# Copyright (C) 1997-1999 Satoru Takabayashi All rights reserved. 5# Copyright (C) 2000-2008 Namazu Project All rights reserved. 6# This is free software with ABSOLUTELY NO WARRANTY. 7# 8# This program is free software; you can redistribute it and/or modify 9# it under the terms of the GNU General Public License as published by 10# the Free Software Foundation; either versions 2, or (at your option) 11# any later version. 12# 13# This program is distributed in the hope that it will be useful 14# but WITHOUT ANY WARRANTY; without even the implied warranty of 15# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 16# GNU General Public License for more details. 17# 18# You should have received a copy of the GNU General Public License 19# along with this program; if not, write to the Free Software 20# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 21# 02111-1307, USA 22# 23# This file must be encoded in EUC-JP encoding 24# 25 26package html; 27use strict; 28require 'util.pl'; 29require 'gfilter.pl'; 30 31my $EMBEDDED_FILE = '\.(asp|jsp|php[3s]?|phtml)(?:\.gz)?'; 32 33sub mediatype() { 34 return ('text/html'); 35} 36 37sub status() { 38 return 'yes'; 39} 40 41sub recursive() { 42 return 0; 43} 44 45sub pre_codeconv() { 46 return 1; 47} 48 49sub post_codeconv () { 50 return 0; 51} 52 53sub add_magic ($) { 54 return; 55} 56 57sub filter ($$$$$) { 58 my ($orig_cfile, $cont, $weighted_str, $headings, $fields) 59 = @_; 60 my $cfile = defined $orig_cfile ? $$orig_cfile : ''; 61 62 util::vprint("Processing html file ...\n"); 63 64 if ($var::Opt{'robotexclude'}) { 65 my $err = isexcluded($cont); 66 return $err if $err; 67 } 68 69 if ($cfile =~ /($EMBEDDED_FILE)$/o) { 70 embedded_filter($cont); 71 } 72 73 html_filter($cont, $weighted_str, $fields, $headings); 74 75 gfilter::line_adjust_filter($cont); 76 gfilter::line_adjust_filter($weighted_str); 77 gfilter::white_space_adjust_filter($cont); 78 gfilter::show_filter_debug_info($cont, $weighted_str, 79 $fields, $headings); 80 return undef; 81} 82 83# Check wheter or not the given URI is excluded. 84sub isexcluded ($) { 85 my ($contref) = @_; 86 my $err = undef; 87 88 if ($$contref =~ /META\s+NAME\s*=\s*([\'\"]?)ROBOTS\1\s+[^>]* 89 CONTENT\s*=\s*([\'\"]?).*?(NOINDEX|NONE).*?\2[^>]*>/ix) #" 90 { 91 $err = _("is excluded because of <meta name=\"robots\" ...>"); 92 } 93 return $err; 94} 95 96 97sub html_filter ($$$$) { 98 my ($contref, $weighted_str, $fields, $headings) = @_; 99 100 html::escape_lt_gt($contref); 101 $fields->{'title'} = html::get_title($contref, $weighted_str); 102 html::get_author($contref, $fields); 103 html::get_meta_tags($contref, $weighted_str, $fields); 104# html::get_img_alt($contref); 105 html::get_alt_attr($contref) if "ALT" =~ /^(?:$conf::HTML_ATTRIBUTES)$/io; 106 html::get_table_summary($contref) if "SUMMARY" =~ /^(?:$conf::HTML_ATTRIBUTES)$/io; 107 html::get_title_attr($contref) if "TITLE" =~ /^(?:$conf::HTML_ATTRIBUTES)$/io; 108 html::normalize_html_element($contref); 109 html::erase_above_body($contref); 110 html::remove_comments($contref); 111 html::weight_element($contref, $weighted_str, $headings); 112 html::remove_html_elements($contref); 113 # restore entities of each content. 114 html::decode_entity($contref); 115 html::decode_entity($weighted_str); 116 html::decode_entity($headings); 117 for my $key (keys %{$fields}) { 118 html::decode_entity(\$fields->{$key}); 119 } 120} 121 122# Get rid of HTML-embedded codes 123sub embedded_filter ($) { 124 my ($contref) = @_; 125 126 # handle with ASP,JSP,PHP,VBScript,JScript and JavaScript. 127 $$contref =~ s/<%.*?%>//gs; 128 $$contref =~ s/<\?.*?\?>//gs; 129 $$contref =~ s/<asp:.*?\/>//gs; 130 $$contref =~ s/<jsp:.*?\/>//gs; 131 $$contref =~ s/<script.*?<\/script>//igs; 132} 133 134# Convert independent < > s into entity references for escaping. 135# Substitute twice for safe. 136sub escape_lt_gt ($) { 137 my ($contref) = @_; 138 139 $$contref =~ s/\s<\s/ < /g; 140 $$contref =~ s/\s>\s/ > /g; 141 $$contref =~ s/\s<\s/ < /g; 142 $$contref =~ s/\s>\s/ > /g; 143} 144 145sub get_author ($$) { 146 my ($contref, $fields) = @_; 147 148 # <META NAME="AUTHOR" CONTENT="author"> 149 # <LINK REV=MADE HREF="mailto:ccsatoru@vega.aichi-u.ac.jp"> 150 151 if ($$contref =~ m!<META\s[^>]*?NAME=([\"\']?)AUTHOR\1\s[^>]*?CONTENT=([\"\']?)(.*?)\2\s*/?>!is) { 152 $fields->{'author'} = $3; 153 } elsif ($$contref =~ m!<LINK\s[^>]*?HREF=([\"\']?)mailto:(.*?)\1\s*/?>!i) { 154 $fields->{'author'} = $2; 155 } elsif ($$contref =~ m!<ADDRESS[^>]*>(.*?)</ADDRESS>!is) { 156 my $tmp = $1; 157# $tmp =~ s/\s//g; 158 if ($tmp =~ /\b([\w\.\-]+\@[\w\.\-]+(?:\.[\w\.\-]+)+)\b/) { 159 $fields->{'author'} = $1; 160 } 161 } 162} 163 164 165# Get title from <title>..</title> 166# It's okay to exits two or more <title>...</TITLE>. 167# First one will be retrieved. 168sub get_title ($$) { 169 my ($contref, $weighted_str) = @_; 170 my $title = ''; 171 172 if ($$contref =~ s!<TITLE[^>]*>(.*?)</TITLE>!!is) { 173 $title = $1; 174 $title =~ s/\s+/ /g; 175 $title =~ s/^\s+//; 176 $title =~ s/\s+$//; 177 my $weight = $conf::Weight{'html'}->{'title'}; 178 $$weighted_str .= "\x7f$weight\x7f$title\x7f/$weight\x7f\n"; 179 } else { 180 $title = $conf::NO_TITLE; 181 } 182 183 return $title; 184} 185 186# get foo bar from <META NAME="keywords|description" CONTENT="foo bar"> 187sub get_meta_tags ($$$) { 188 my ($contref, $weighted_str, $fields) = @_; 189 190 # <meta name="keywords" content="foo bar baz"> 191 192 my $weight = $conf::Weight{'metakey'}; 193 $$weighted_str .= "\x7f$weight\x7f$3\x7f/$weight\x7f\n" 194 if $$contref =~ /<meta\s+name\s*=\s*([\'\"]?) #" 195 keywords\1\s+[^>]*content\s*=\s*([\'\"]?)([^>]*?)\2[^>]*>/ix; #" 196 197 # <meta name="description" content="foo bar baz"> 198 $$weighted_str .= "\x7f$weight\x7f$3\x7f/$weight\x7f\n" 199 if $$contref =~ /<meta\s+name\s*=\s*([\'\"]?)description #" 200 \1\s+[^>]*content\s*=\s*([\'\"]?)([^>]*?)\2[^>]*>/ix; #" 201 202 if ($var::Opt{'meta'}) { 203 my @keys = split '\|', $conf::META_TAGS; 204 for my $key (@keys) { 205 if ($key !~ m/^author$/i) { 206 my $quotekey = quotemeta($key); 207 while ($$contref =~ /<meta\s+name\s*=\s*([\'\"]?)$quotekey #" 208 \1\s+[^>]*content\s*=\s*([\'\"]?)([^>]*?)\2[^>]*>/gix) 209 { 210 $fields->{$key} .= $3 . " "; 211 } 212 util::dprint("meta: $key: $fields->{$key}\n") 213 if defined $fields->{$key}; 214 } 215 } 216 } 217} 218 219# Get foo from <IMG ... ALT="foo"> 220# It's not to handle HTML strictly. 221sub get_img_alt ($) { 222 my ($contref) = @_; 223 224 $$contref =~ s/(<IMG[^>]*)\s+ALT\s*=\s*([\"\'])(.*?)\2([^>]*>)/ $3 $1$4/gi; 225 $$contref =~ s/(<IMG[^>]*)\s+ALT\s*=\s*([^\"\'\s>]*)([^>]*>)/ $2 $1$3/gi; 226} 227 228# Get foo from <XXX ... ALT="foo"> 229# It's not to handle HTML strictly. 230sub get_alt_attr ($) { 231 my ($contref) = @_; 232 233 $$contref =~ s/(<[A-Z]+[^>]*)\s+ALT\s*=\s*([\"\'])(.*?)\2([^>]*>)/ $3 $1$4/gi; 234 $$contref =~ s/(<[A-Z]+[^>]*)\s+ALT\s*=\s*([^\"\'\s>]*)([^>]*>)/ $2 $1$3/gi; 235} 236 237# Get foo from <TABLE ... SUMMARY="foo"> 238sub get_table_summary ($) { 239 my ($contref) = @_; 240 241 $$contref =~ s/(<TABLE[^>]*)\s+SUMMARY\s*=\s*([\"\'])(.*?)\2([^>]*>)/ $3 $1$4/gi; 242 $$contref =~ s/(<TABLE[^>]*)\s+SUMMARY\s*=\s*([^\"\'\s>]*)([^>]*>)/ $2 $1$3/gi; 243} 244 245# Get foo from <XXX ... TITLE="foo"> 246sub get_title_attr ($) { 247 my ($contref) = @_; 248 249 $$contref =~ s/(<[A-Z]+[^>]*)\s+TITLE\s*=\s*([\"\'])(.*?)\2([^>]*>)/ $3 $1$4/gi; 250 $$contref =~ s/(<[A-Z]+[^>]*)\s+TITLE\s*=\s*([^\"\'\s>]*)([^>]*>)/ $2 $1$3/gi; 251} 252 253# Normalize elements like: <A HREF...> -> <A> 254sub normalize_html_element ($) { 255 my ($contref) = @_; 256 257 $$contref =~ s/<([!\w]+)\s+[^>]*>/<$1>/g; 258} 259 260# Remove contents above <body>. 261sub erase_above_body ($) { 262 my ($contref) = @_; 263 264 $$contref =~ s/^.*<BODY[^>]*>//is; 265} 266 267 268# remove all comments. it's not perfect but almost works. 269sub remove_comments ($) { 270 my ($contref) = @_; 271 272 # remove all comments 273 $$contref =~ s/<!--.*?-->//gs; 274} 275 276# Weight a score of a keyword in a given text using %conf::Weight hash. 277# This process make the text be surround by temporary tags 278# \x7fXX\x7f and \x7f/XX\x7f. XX represents score. 279# Sort keys of %conf::Weight for processing <a> first. 280# Because <a> has a tendency to be inside of other tags. 281# Thus, it does'not processing for nexted tags strictly. 282# Moreover, it does special processing for <h[1-6]> for summarization. 283sub weight_element ($$$ ) { 284 my ($contref, $weighted_str, $headings) = @_; 285 286 for my $element (sort keys(%{$conf::Weight{'html'}})) { 287 my $tmp = ""; 288 $$contref =~ s!<($element)>(.*?)</$element>!weight_element_sub($1, $2, \$tmp)!gies; 289 $$headings .= $tmp if $element =~ /^H[1-6]$/i && ! $var::Opt{'noheadabst'} 290 && $tmp; 291 my $weight = $element =~ /^H[1-6]$/i && ! $var::Opt{'noheadabst'} ? 292 $conf::Weight{'html'}->{$element} : $conf::Weight{'html'}->{$element} - 1; 293 $$weighted_str .= "\x7f$weight\x7f$tmp\x7f/$weight\x7f\n" if $tmp; 294 } 295} 296 297sub weight_element_sub ($$$) { 298 my ($element, $text, $tmp) = @_; 299 300 my $space = element_space($element); 301 $text =~ s/<[^>]*>//g; 302 $$tmp .= "$text " if (length($text)) < $conf::INVALID_LENG; 303 $element =~ /^H[1-6]$/i && ! $var::Opt{'noheadabst'} ? " " : "$space$text$space"; 304} 305 306 307# determine whether a given element should be delete or be substituted with space 308sub element_space ($) { 309 $_[0] =~ /^($conf::NON_SEPARATION_ELEMENTS)$/io ? "" : " "; 310} 311 312# remove all HTML elements. it's not perfect but almost works. 313sub remove_html_elements ($) { 314 my ($contref) = @_; 315 316 # remove Office Markup <o:></o:>, <![]> 317 $$contref =~ s#</?([A-Z]\w*):.*?>|<(!)\[.*?\]\s*/?>#element_space($1||$2)#gsixe; 318 319 # remove all elements 320 $$contref =~ s!</?([A-Z]\w*)(?:\s+[A-Z]\w*(?:\s*=\s*(?:(["']).*?\2|[\w\-.]+))?)*\s*/?>!element_space($1)!gsixe; 321 322} 323 324# Decode a numberd entity. Exclude an invalid number. 325sub decode_numbered_entity ($) { 326 my ($num) = @_; 327 328 # FIXME: very ad hoc. (ISO-8859-1) 329 return "" 330 if (($num >= 0 && $num <= 31) || ($num >= 127 && $num <= 159) || 331 ($num >= 256)); 332 return "" 333 if $num >=127 && util::islang('ja'); 334 sprintf ("%c",$num); 335} 336 337 338# Decode an entity. Ignore characters of right half of ISO-8859-1. 339# Because it can't be handled in EUC encoding. 340# This function provides sequential entities like: " < > 341sub decode_entity ($) { 342 my ($text) = @_; 343 344 return unless defined($$text); 345 346 $$text =~ s/&#(\d{2,3})[;\s]/decode_numbered_entity($1)/ge; 347 $$text =~ s/&#x([\da-f]+)[;\s]/decode_numbered_entity(hex($1))/gei; 348 $$text =~ s/"[;\s]/\"/g; #" 349 $$text =~ s/&[;\s]/&/g; 350 $$text =~ s/<[;\s]/</g; 351 $$text =~ s/>[;\s]/>/g; 352 $$text =~ s/ [;\s]/ /g; 353} 354 355 356# encode entities: only '<', '>', and '&' 357sub encode_entity ($) { 358 my ($tmp) = @_; 359 360 $$tmp =~ s/&/&/g; # & should be processed first 361 $$tmp =~ s/</</g; 362 $$tmp =~ s/>/>/g; 363 $$tmp; 364} 365 3661; 367