1#
2# -*- Perl -*-
3# $Id: html.pl,v 1.32.4.19 2008-05-09 07:51:00 opengl2772 Exp $
4# Copyright (C) 1997-1999 Satoru Takabayashi All rights reserved.
5# Copyright (C) 2000-2008 Namazu Project All rights reserved.
6#     This is free software with ABSOLUTELY NO WARRANTY.
7#
8#  This program is free software; you can redistribute it and/or modify
9#  it under the terms of the GNU General Public License as published by
10#  the Free Software Foundation; either versions 2, or (at your option)
11#  any later version.
12#
13#  This program is distributed in the hope that it will be useful
14#  but WITHOUT ANY WARRANTY; without even the implied warranty of
15#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16#  GNU General Public License for more details.
17#
18#  You should have received a copy of the GNU General Public License
19#  along with this program; if not, write to the Free Software
20#  Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
21#  02111-1307, USA
22#
23#  This file must be encoded in EUC-JP encoding
24#
25
26package html;
27use strict;
28require 'util.pl';
29require 'gfilter.pl';
30
31my $EMBEDDED_FILE = '\.(asp|jsp|php[3s]?|phtml)(?:\.gz)?';
32
33sub mediatype() {
34    return ('text/html');
35}
36
37sub status() {
38    return 'yes';
39}
40
41sub recursive() {
42    return 0;
43}
44
45sub pre_codeconv() {
46    return 1;
47}
48
49sub post_codeconv () {
50    return 0;
51}
52
53sub add_magic ($) {
54    return;
55}
56
57sub filter ($$$$$) {
58    my ($orig_cfile, $cont, $weighted_str, $headings, $fields)
59      = @_;
60    my $cfile = defined $orig_cfile ? $$orig_cfile : '';
61
62    util::vprint("Processing html file ...\n");
63
64    if ($var::Opt{'robotexclude'}) {
65	my $err = isexcluded($cont);
66	return $err if $err;
67    }
68
69    if ($cfile =~ /($EMBEDDED_FILE)$/o) {
70       embedded_filter($cont);
71    }
72
73    html_filter($cont, $weighted_str, $fields, $headings);
74
75    gfilter::line_adjust_filter($cont);
76    gfilter::line_adjust_filter($weighted_str);
77    gfilter::white_space_adjust_filter($cont);
78    gfilter::show_filter_debug_info($cont, $weighted_str,
79			   $fields, $headings);
80    return undef;
81}
82
83# Check wheter or not the given URI is excluded.
84sub isexcluded ($) {
85    my ($contref) = @_;
86    my $err = undef;
87
88    if ($$contref =~ /META\s+NAME\s*=\s*([\'\"]?)ROBOTS\1\s+[^>]*
89	CONTENT\s*=\s*([\'\"]?).*?(NOINDEX|NONE).*?\2[^>]*>/ix)  #"
90    {
91	$err = _("is excluded because of <meta name=\"robots\" ...>");
92    }
93    return $err;
94}
95
96
97sub html_filter ($$$$) {
98    my ($contref, $weighted_str, $fields, $headings) = @_;
99
100    html::escape_lt_gt($contref);
101    $fields->{'title'} = html::get_title($contref, $weighted_str);
102    html::get_author($contref, $fields);
103    html::get_meta_tags($contref, $weighted_str, $fields);
104#    html::get_img_alt($contref);
105    html::get_alt_attr($contref) if "ALT" =~ /^(?:$conf::HTML_ATTRIBUTES)$/io;
106    html::get_table_summary($contref) if "SUMMARY" =~ /^(?:$conf::HTML_ATTRIBUTES)$/io;
107    html::get_title_attr($contref) if "TITLE" =~ /^(?:$conf::HTML_ATTRIBUTES)$/io;
108    html::normalize_html_element($contref);
109    html::erase_above_body($contref);
110    html::remove_comments($contref);
111    html::weight_element($contref, $weighted_str, $headings);
112    html::remove_html_elements($contref);
113    # restore entities of each content.
114    html::decode_entity($contref);
115    html::decode_entity($weighted_str);
116    html::decode_entity($headings);
117    for my $key (keys %{$fields}) {
118	html::decode_entity(\$fields->{$key});
119    }
120}
121
122# Get rid of HTML-embedded codes
123sub embedded_filter ($) {
124    my ($contref) = @_;
125
126    # handle with ASP,JSP,PHP,VBScript,JScript and JavaScript.
127    $$contref =~ s/<%.*?%>//gs;
128    $$contref =~ s/<\?.*?\?>//gs;
129    $$contref =~ s/<asp:.*?\/>//gs;
130    $$contref =~ s/<jsp:.*?\/>//gs;
131    $$contref =~ s/<script.*?<\/script>//igs;
132}
133
134# Convert independent < > s into entity references for escaping.
135# Substitute twice for safe.
136sub escape_lt_gt ($) {
137    my ($contref) = @_;
138
139    $$contref =~ s/\s<\s/ &lt; /g;
140    $$contref =~ s/\s>\s/ &gt; /g;
141    $$contref =~ s/\s<\s/ &lt; /g;
142    $$contref =~ s/\s>\s/ &gt; /g;
143}
144
145sub get_author ($$) {
146    my ($contref, $fields) = @_;
147
148    # <META NAME="AUTHOR" CONTENT="author">
149    # <LINK REV=MADE HREF="mailto:ccsatoru@vega.aichi-u.ac.jp">
150
151    if ($$contref =~ m!<META\s[^>]*?NAME=([\"\']?)AUTHOR\1\s[^>]*?CONTENT=([\"\']?)(.*?)\2\s*/?>!is) {
152        $fields->{'author'} = $3;
153    } elsif ($$contref =~ m!<LINK\s[^>]*?HREF=([\"\']?)mailto:(.*?)\1\s*/?>!i) {
154        $fields->{'author'} = $2;
155    } elsif ($$contref =~ m!<ADDRESS[^>]*>(.*?)</ADDRESS>!is) {
156	my $tmp = $1;
157#	$tmp =~ s/\s//g;
158	if ($tmp =~ /\b([\w\.\-]+\@[\w\.\-]+(?:\.[\w\.\-]+)+)\b/) {
159	    $fields->{'author'} = $1;
160	}
161    }
162}
163
164
165# Get title from <title>..</title>
166# It's okay to exits two or more <title>...</TITLE>.
167# First one will be retrieved.
168sub get_title ($$) {
169    my ($contref, $weighted_str) = @_;
170    my $title = '';
171
172    if ($$contref =~ s!<TITLE[^>]*>(.*?)</TITLE>!!is) {
173	$title = $1;
174	$title =~ s/\s+/ /g;
175	$title =~ s/^\s+//;
176	$title =~ s/\s+$//;
177	my $weight = $conf::Weight{'html'}->{'title'};
178	$$weighted_str .= "\x7f$weight\x7f$title\x7f/$weight\x7f\n";
179    } else {
180	$title = $conf::NO_TITLE;
181    }
182
183    return $title;
184}
185
186# get foo bar from <META NAME="keywords|description" CONTENT="foo bar">
187sub get_meta_tags ($$$) {
188    my ($contref, $weighted_str, $fields) = @_;
189
190    # <meta name="keywords" content="foo bar baz">
191
192    my $weight = $conf::Weight{'metakey'};
193    $$weighted_str .= "\x7f$weight\x7f$3\x7f/$weight\x7f\n"
194	if $$contref =~ /<meta\s+name\s*=\s*([\'\"]?) #"
195	    keywords\1\s+[^>]*content\s*=\s*([\'\"]?)([^>]*?)\2[^>]*>/ix; #"
196
197    # <meta name="description" content="foo bar baz">
198    $$weighted_str .= "\x7f$weight\x7f$3\x7f/$weight\x7f\n"
199	if $$contref =~ /<meta\s+name\s*=\s*([\'\"]?)description #"
200	    \1\s+[^>]*content\s*=\s*([\'\"]?)([^>]*?)\2[^>]*>/ix; #"
201
202    if ($var::Opt{'meta'}) {
203        my @keys = split '\|', $conf::META_TAGS;
204        for my $key (@keys) {
205            if ($key !~ m/^author$/i) {
206                my $quotekey = quotemeta($key);
207                while ($$contref =~ /<meta\s+name\s*=\s*([\'\"]?)$quotekey #"
208                    \1\s+[^>]*content\s*=\s*([\'\"]?)([^>]*?)\2[^>]*>/gix)
209                {
210                    $fields->{$key} .= $3 . " ";
211                }
212                util::dprint("meta: $key: $fields->{$key}\n")
213                    if defined $fields->{$key};
214            }
215        }
216    }
217}
218
219# Get foo from <IMG ... ALT="foo">
220# It's not to handle HTML strictly.
221sub get_img_alt ($) {
222    my ($contref) = @_;
223
224    $$contref =~ s/(<IMG[^>]*)\s+ALT\s*=\s*([\"\'])(.*?)\2([^>]*>)/ $3 $1$4/gi;
225    $$contref =~ s/(<IMG[^>]*)\s+ALT\s*=\s*([^\"\'\s>]*)([^>]*>)/ $2 $1$3/gi;
226}
227
228# Get foo from <XXX ... ALT="foo">
229# It's not to handle HTML strictly.
230sub get_alt_attr ($) {
231    my ($contref) = @_;
232
233    $$contref =~ s/(<[A-Z]+[^>]*)\s+ALT\s*=\s*([\"\'])(.*?)\2([^>]*>)/ $3 $1$4/gi;
234    $$contref =~ s/(<[A-Z]+[^>]*)\s+ALT\s*=\s*([^\"\'\s>]*)([^>]*>)/ $2 $1$3/gi;
235}
236
237# Get foo from <TABLE ... SUMMARY="foo">
238sub get_table_summary ($) {
239    my ($contref) = @_;
240
241    $$contref =~ s/(<TABLE[^>]*)\s+SUMMARY\s*=\s*([\"\'])(.*?)\2([^>]*>)/ $3 $1$4/gi;
242    $$contref =~ s/(<TABLE[^>]*)\s+SUMMARY\s*=\s*([^\"\'\s>]*)([^>]*>)/ $2 $1$3/gi;
243}
244
245# Get foo from <XXX ... TITLE="foo">
246sub get_title_attr ($) {
247    my ($contref) = @_;
248
249    $$contref =~ s/(<[A-Z]+[^>]*)\s+TITLE\s*=\s*([\"\'])(.*?)\2([^>]*>)/ $3 $1$4/gi;
250    $$contref =~ s/(<[A-Z]+[^>]*)\s+TITLE\s*=\s*([^\"\'\s>]*)([^>]*>)/ $2 $1$3/gi;
251}
252
253# Normalize elements like: <A HREF...> -> <A>
254sub normalize_html_element ($) {
255    my ($contref) = @_;
256
257    $$contref =~ s/<([!\w]+)\s+[^>]*>/<$1>/g;
258}
259
260# Remove contents above <body>.
261sub erase_above_body ($) {
262    my ($contref) = @_;
263
264    $$contref =~ s/^.*<BODY[^>]*>//is;
265}
266
267
268# remove all comments. it's not perfect but almost works.
269sub remove_comments ($) {
270    my ($contref) = @_;
271
272    # remove all comments
273    $$contref =~ s/<!--.*?-->//gs;
274}
275
276# Weight a score of a keyword in a given text using %conf::Weight hash.
277# This process make the text be surround by temporary tags
278# \x7fXX\x7f and \x7f/XX\x7f. XX represents score.
279# Sort keys of %conf::Weight for processing <a> first.
280# Because <a> has a tendency to be inside of other tags.
281# Thus, it does'not processing for nexted tags strictly.
282# Moreover, it does special processing for <h[1-6]> for summarization.
283sub weight_element ($$$ ) {
284    my ($contref, $weighted_str, $headings) = @_;
285
286    for my $element (sort keys(%{$conf::Weight{'html'}})) {
287	my $tmp = "";
288	$$contref =~ s!<($element)>(.*?)</$element>!weight_element_sub($1, $2, \$tmp)!gies;
289	$$headings .= $tmp if $element =~ /^H[1-6]$/i && ! $var::Opt{'noheadabst'}
290	    && $tmp;
291	my $weight = $element =~ /^H[1-6]$/i && ! $var::Opt{'noheadabst'} ?
292	    $conf::Weight{'html'}->{$element} : $conf::Weight{'html'}->{$element} - 1;
293	$$weighted_str .= "\x7f$weight\x7f$tmp\x7f/$weight\x7f\n" if $tmp;
294    }
295}
296
297sub weight_element_sub ($$$) {
298    my ($element, $text, $tmp) = @_;
299
300    my $space = element_space($element);
301    $text =~ s/<[^>]*>//g;
302    $$tmp .= "$text " if (length($text)) < $conf::INVALID_LENG;
303    $element =~ /^H[1-6]$/i && ! $var::Opt{'noheadabst'}  ? " " : "$space$text$space";
304}
305
306
307# determine whether a given element should be delete or be substituted with space
308sub element_space ($) {
309    $_[0] =~ /^($conf::NON_SEPARATION_ELEMENTS)$/io ? "" : " ";
310}
311
312# remove all HTML elements. it's not perfect but almost works.
313sub remove_html_elements ($) {
314    my ($contref) = @_;
315
316    # remove Office Markup <o:></o:>, <![]>
317    $$contref =~ s#</?([A-Z]\w*):.*?>|<(!)\[.*?\]\s*/?>#element_space($1||$2)#gsixe;
318
319    # remove all elements
320    $$contref =~ s!</?([A-Z]\w*)(?:\s+[A-Z]\w*(?:\s*=\s*(?:(["']).*?\2|[\w\-.]+))?)*\s*/?>!element_space($1)!gsixe;
321
322}
323
324# Decode a numberd entity. Exclude an invalid number.
325sub decode_numbered_entity ($) {
326    my ($num) = @_;
327
328    # FIXME: very ad hoc. (ISO-8859-1)
329    return ""
330        if (($num >= 0 && $num <= 31) || ($num >= 127 && $num <= 159) ||
331        ($num >= 256));
332    return ""
333	if $num >=127 && util::islang('ja');
334    sprintf ("%c",$num);
335}
336
337
338# Decode an entity. Ignore characters of right half of ISO-8859-1.
339# Because it can't be handled in EUC encoding.
340# This function provides sequential entities like: &quot &lt &gt;
341sub decode_entity ($) {
342    my ($text) = @_;
343
344    return unless defined($$text);
345
346    $$text =~ s/&#(\d{2,3})[;\s]/decode_numbered_entity($1)/ge;
347    $$text =~ s/&#x([\da-f]+)[;\s]/decode_numbered_entity(hex($1))/gei;
348    $$text =~ s/&quot[;\s]/\"/g; #"
349    $$text =~ s/&amp[;\s]/&/g;
350    $$text =~ s/&lt[;\s]/</g;
351    $$text =~ s/&gt[;\s]/>/g;
352    $$text =~ s/&nbsp[;\s]/ /g;
353}
354
355
356# encode entities: only '<', '>', and '&'
357sub encode_entity ($) {
358    my ($tmp) = @_;
359
360    $$tmp =~ s/&/&amp;/g;    # &amp; should be processed first
361    $$tmp =~ s/</&lt;/g;
362    $$tmp =~ s/>/&gt;/g;
363    $$tmp;
364}
365
3661;
367