1#
2# -*- Perl -*-
3# $Id: pipermail.pl,v 1.3.2.11 2008-05-09 07:36:13 opengl2772 Exp $
4# Copyright (C) 2004-2008 Namazu Project All rights reserved.
5#
6#     This is free software with ABSOLUTELY NO WARRANTY.
7#
8#  This program is free software; you can redistribute it and/or modify
9#  it under the terms of the GNU General Public License as published by
10#  the Free Software Foundation; either versions 2, or (at your option)
11#  any later version.
12#
13#  This program is distributed in the hope that it will be useful
14#  but WITHOUT ANY WARRANTY; without even the implied warranty of
15#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16#  GNU General Public License for more details.
17#
18#  You should have received a copy of the GNU General Public License
19#  along with this program; if not, write to the Free Software
20#  Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
21#  02111-1307, USA
22#
23#  This file must be encoded in EUC-JP encoding
24#
25
26package pipermail;
27use strict;
28require 'util.pl';
29require 'gfilter.pl';
30require 'html.pl';
31require 'mailnews.pl';
32
33#
34# This pattern specifies pipermail's file names.
35#
36my $PIPERMAIL_MESSAGE_FILE = '\d{6}\.html';
37
38sub mediatype() {
39    return ('text/html; x-type=pipermail');
40}
41
42sub status() {
43    # The check of a dependence filter.
44    return 'no' if (html::status() ne 'yes');
45    return 'no' if (mailnews::status() ne 'yes');
46
47    return 'yes';
48}
49
50sub recursive() {
51    return 0;
52}
53
54sub pre_codeconv() {
55    return 0;
56}
57
58sub post_codeconv () {
59    return 0;
60}
61
62sub add_magic ($) {
63    return;
64}
65
66sub filter ($$$$$) {
67    my ($orig_cfile, $contref, $weighted_str, $headings, $fields)
68      = @_;
69    my $cfile = defined $orig_cfile ? $$orig_cfile : '';
70    my $dummy_weighted_str;
71    my %dummy_fields = ();
72
73    util::vprint("Processing pipermail file ...\n");
74
75    if ($$contref !~ m/<!--beginarticle-->/s ||
76    $$contref !~ m/<!--endarticle-->/s) {
77        return $$orig_cfile . " is not a Pipermail message file! skipped."; # error
78    }
79
80    unless ($cfile =~ /($PIPERMAIL_MESSAGE_FILE)$/o)
81    {
82        return $$orig_cfile . " is not a Pipermail message file! skipped."; # error
83    }
84
85    pipermail_filter($contref, $weighted_str, $fields);
86    html::html_filter($contref, \$dummy_weighted_str, \%dummy_fields, $headings);
87
88    $$contref =~ s/^\s+//;
89    mailnews::uuencode_filter($contref);
90    mailnews::mailnews_filter($contref, \$dummy_weighted_str, \%dummy_fields);
91    mailnews::mailnews_citation_filter($contref, \$dummy_weighted_str);
92
93    gfilter::line_adjust_filter($contref);
94    gfilter::line_adjust_filter($weighted_str);
95    gfilter::white_space_adjust_filter($contref);
96    gfilter::show_filter_debug_info($contref, $weighted_str,
97                           $fields, $headings);
98    return undef;
99}
100
101# Assume a normal message files by pipermail (mailman edition)
102sub pipermail_filter ($$$) {
103    my ($contref, $weighted_str, $fields) = @_;
104
105    # Strip off end-matter
106    $$contref =~ s/<!--endarticle-->.*//s;
107
108    my $pos = index($$contref, '<!--beginarticle-->');
109    if ($pos > 0) {
110        my $head = substr($$contref, 0, $pos);
111
112        util::vprint("Looking at header: " . $head . "\n");
113        if ($head =~
114        m!<h1>(.*?)</h1>\s*<b>(.*?)\s*</b>(?:\s*<a\s+.*?href=.*?>(.*?)\s*</a>)?\s*<br>\s*<i>(.*?)</i>!is) {
115            {
116                my $title = $1;
117                decode_entity(\$title);
118                $title = uncommentize($title);
119                # codeconv::toeuc(\$title);
120                codeconv::codeconv_document(\$title);
121
122                1  while ($title =~ s/\A\s*(re|sv|fwd|fw|aw)[\[\]\d]*[:>-]+\s*//i);
123                $title =~ s/\A\s*\[[^\]]+\]\s*//;
124                1  while ($title =~ s/\A\s*(re|sv|fwd|fw|aw)[\[\]\d]*[:>-]+\s*//i);
125                html::decode_entity(\$title);
126                $fields->{'title'} = $title;
127
128                my $weight = $conf::Weight{'html'}->{'title'};
129                $$weighted_str .= "\x7f$weight\x7f$title\x7f/$weight\x7f\n";
130            }
131
132            {
133                my $from = $2;
134                if (defined $3) {
135                    my $email = $3;
136#                    $email =~ s/ at /@/s;    # no spam
137#                    $email =~ s/ @ /@/s;     # no spam
138                    $from .= " <$email>";
139                }
140                html::decode_entity(\$from);
141                # codeconv::toeuc(\$from);
142                codeconv::codeconv_document(\$from);
143                $fields->{'from'} = $from;
144            }
145
146            {
147                my $date = $4;
148                html::decode_entity(\$date);
149                # codeconv::toeuc(\$date);
150                codeconv::codeconv_document(\$date);
151                if (util::islang("ja")) {
152                    if ($date =~ m/(\d{4})ǯ\s*(\d{1,2})��\s*(\d{1,2})��\s+\(.*\)\s+(\d{2}:\d{2}:\d{2})\s+(\w+)/s) {
153                        my @month = (
154                            "Jan", "Feb", "Mar", "Apr",
155                            "May", "Jun", "Jul", "Aug",
156                            "Sep", "Oct", "Nov", "Dec"
157                        );
158                        my $m = $month[$2 - 1];
159                        $date = "Mon $m $3 $4 $5 $1";
160                    }
161                }
162                my $err = time_to_rfc822time(\$date);
163                $fields->{'date'} = $date unless(defined $err);
164            }
165        }
166    }
167
168    $$contref =~ s/<head>(.*)?<\/head>//si;
169    $$contref =~ s/<h1>.*<!--beginarticle-->//si;
170    # codeconv::toeuc($contref);
171    codeconv::codeconv_document($contref);
172    $$contref =~ s/ at /@/s;
173    if (util::islang("ja")) {
174        $$contref =~ s/ @ /@/s;
175    }
176}
177
178sub uncommentize {
179    my($txt) = $_[0];
180    $txt =~ s/&#(\d+);/pack("C",$1)/ge;
181    $txt;
182}
183
184# Decode an entity. Ignore characters of right half of ISO-8859-1.
185# Because it can't be handled in EUC encoding.
186# This function provides sequential entities like: &quot &lt &gt;
187sub decode_entity ($) {
188    my ($text) = @_;
189
190    return unless defined($$text);
191
192    $$text =~ s/&quot[;\s]/\"/g;
193    $$text =~ s/&amp[;\s]/&/g;
194    $$text =~ s/&lt[;\s]/</g;
195    $$text =~ s/&gt[;\s]/>/g;
196    $$text =~ s/&nbsp[;\s]/ /g;
197}
198
199my %wday_names = (
200    "Sun" => 0, "Mon" => 1, "Tue" => 2, "Wed" => 3,
201    "Thu" => 4, "Fri" => 5, "Sat" => 6
202);
203my %month_names = (
204    "Jan" =>  0, "Feb" =>  1, "Mar" =>  2, "Apr" =>  3,
205    "May" =>  4, "Jun" =>  5, "Jul" =>  6, "Aug" =>  7,
206    "Sep" =>  8, "Oct" =>  9, "Nov" => 10, "Dec" => 11
207);
208my $re_wday  =  join '|', keys %wday_names;
209my $re_month =  join '|', keys %month_names;
210my $re_day   =  '(?:0?[1-9]|[12][0-9]|3[01])';
211my $re_year  =  '(?:\d{4}|\d{2})';  # allow 2 digit fomrat
212my $re_hour  =  '(?:[01][0-9]|2[0-3])';
213my $re_min   =  '(?:[0-5][0-9])';
214my $re_sec   =  '(?:[0-5][0-9])';
215
216sub time_to_rfc822time ($) {
217    my ($conf) = @_;
218
219    if ($$conf =~ /
220        ^\s*
221        ($re_wday)\s+                         # a day of the week
222        ($re_month)\s+                        # name of month
223        ($re_day)\s+                          # a day of the month
224        ($re_hour):($re_min):($re_sec)\s+     # HH:MM:SS
225        (?:([^\s]*)\s+)?                      # (timezone)
226        (\d{4})\s*                            # year
227        /x)
228    {
229        my ($week, $month, $day, $hour, $min, $sec, $timezone, $year) =
230            ($1, $2, $3, $4, $5, $6, $7, $8);
231
232        $timezone = gettimezone() unless(defined $timezone);
233        $timezone = time::normalize_rfc822timezone($timezone);
234
235        my $mon = $month_names{$month};
236        $week = ("Sun", "Mon", "Tue", "Wed", "Thu", "Fri", "Sat")
237                                [time::getwday($year, $mon + 1, $day)];
238
239        $$conf = sprintf("%s, %2.2d %s %d %2.2d:%2.2d:%2.2d %s\n",
240            $week, $day, $month, $year, $hour, $min, $sec, $timezone);
241
242        return undef;
243    }
244
245    return "Illegal format.";
246}
247
2481;
249