1# 2# -*- Perl -*- 3# $Id: pipermail.pl,v 1.3.2.11 2008-05-09 07:36:13 opengl2772 Exp $ 4# Copyright (C) 2004-2008 Namazu Project All rights reserved. 5# 6# This is free software with ABSOLUTELY NO WARRANTY. 7# 8# This program is free software; you can redistribute it and/or modify 9# it under the terms of the GNU General Public License as published by 10# the Free Software Foundation; either versions 2, or (at your option) 11# any later version. 12# 13# This program is distributed in the hope that it will be useful 14# but WITHOUT ANY WARRANTY; without even the implied warranty of 15# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 16# GNU General Public License for more details. 17# 18# You should have received a copy of the GNU General Public License 19# along with this program; if not, write to the Free Software 20# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 21# 02111-1307, USA 22# 23# This file must be encoded in EUC-JP encoding 24# 25 26package pipermail; 27use strict; 28require 'util.pl'; 29require 'gfilter.pl'; 30require 'html.pl'; 31require 'mailnews.pl'; 32 33# 34# This pattern specifies pipermail's file names. 35# 36my $PIPERMAIL_MESSAGE_FILE = '\d{6}\.html'; 37 38sub mediatype() { 39 return ('text/html; x-type=pipermail'); 40} 41 42sub status() { 43 # The check of a dependence filter. 44 return 'no' if (html::status() ne 'yes'); 45 return 'no' if (mailnews::status() ne 'yes'); 46 47 return 'yes'; 48} 49 50sub recursive() { 51 return 0; 52} 53 54sub pre_codeconv() { 55 return 0; 56} 57 58sub post_codeconv () { 59 return 0; 60} 61 62sub add_magic ($) { 63 return; 64} 65 66sub filter ($$$$$) { 67 my ($orig_cfile, $contref, $weighted_str, $headings, $fields) 68 = @_; 69 my $cfile = defined $orig_cfile ? $$orig_cfile : ''; 70 my $dummy_weighted_str; 71 my %dummy_fields = (); 72 73 util::vprint("Processing pipermail file ...\n"); 74 75 if ($$contref !~ m/<!--beginarticle-->/s || 76 $$contref !~ m/<!--endarticle-->/s) { 77 return $$orig_cfile . " is not a Pipermail message file! skipped."; # error 78 } 79 80 unless ($cfile =~ /($PIPERMAIL_MESSAGE_FILE)$/o) 81 { 82 return $$orig_cfile . " is not a Pipermail message file! skipped."; # error 83 } 84 85 pipermail_filter($contref, $weighted_str, $fields); 86 html::html_filter($contref, \$dummy_weighted_str, \%dummy_fields, $headings); 87 88 $$contref =~ s/^\s+//; 89 mailnews::uuencode_filter($contref); 90 mailnews::mailnews_filter($contref, \$dummy_weighted_str, \%dummy_fields); 91 mailnews::mailnews_citation_filter($contref, \$dummy_weighted_str); 92 93 gfilter::line_adjust_filter($contref); 94 gfilter::line_adjust_filter($weighted_str); 95 gfilter::white_space_adjust_filter($contref); 96 gfilter::show_filter_debug_info($contref, $weighted_str, 97 $fields, $headings); 98 return undef; 99} 100 101# Assume a normal message files by pipermail (mailman edition) 102sub pipermail_filter ($$$) { 103 my ($contref, $weighted_str, $fields) = @_; 104 105 # Strip off end-matter 106 $$contref =~ s/<!--endarticle-->.*//s; 107 108 my $pos = index($$contref, '<!--beginarticle-->'); 109 if ($pos > 0) { 110 my $head = substr($$contref, 0, $pos); 111 112 util::vprint("Looking at header: " . $head . "\n"); 113 if ($head =~ 114 m!<h1>(.*?)</h1>\s*<b>(.*?)\s*</b>(?:\s*<a\s+.*?href=.*?>(.*?)\s*</a>)?\s*<br>\s*<i>(.*?)</i>!is) { 115 { 116 my $title = $1; 117 decode_entity(\$title); 118 $title = uncommentize($title); 119 # codeconv::toeuc(\$title); 120 codeconv::codeconv_document(\$title); 121 122 1 while ($title =~ s/\A\s*(re|sv|fwd|fw|aw)[\[\]\d]*[:>-]+\s*//i); 123 $title =~ s/\A\s*\[[^\]]+\]\s*//; 124 1 while ($title =~ s/\A\s*(re|sv|fwd|fw|aw)[\[\]\d]*[:>-]+\s*//i); 125 html::decode_entity(\$title); 126 $fields->{'title'} = $title; 127 128 my $weight = $conf::Weight{'html'}->{'title'}; 129 $$weighted_str .= "\x7f$weight\x7f$title\x7f/$weight\x7f\n"; 130 } 131 132 { 133 my $from = $2; 134 if (defined $3) { 135 my $email = $3; 136# $email =~ s/ at /@/s; # no spam 137# $email =~ s/ @ /@/s; # no spam 138 $from .= " <$email>"; 139 } 140 html::decode_entity(\$from); 141 # codeconv::toeuc(\$from); 142 codeconv::codeconv_document(\$from); 143 $fields->{'from'} = $from; 144 } 145 146 { 147 my $date = $4; 148 html::decode_entity(\$date); 149 # codeconv::toeuc(\$date); 150 codeconv::codeconv_document(\$date); 151 if (util::islang("ja")) { 152 if ($date =~ m/(\d{4})ǯ\s*(\d{1,2})��\s*(\d{1,2})��\s+\(.*\)\s+(\d{2}:\d{2}:\d{2})\s+(\w+)/s) { 153 my @month = ( 154 "Jan", "Feb", "Mar", "Apr", 155 "May", "Jun", "Jul", "Aug", 156 "Sep", "Oct", "Nov", "Dec" 157 ); 158 my $m = $month[$2 - 1]; 159 $date = "Mon $m $3 $4 $5 $1"; 160 } 161 } 162 my $err = time_to_rfc822time(\$date); 163 $fields->{'date'} = $date unless(defined $err); 164 } 165 } 166 } 167 168 $$contref =~ s/<head>(.*)?<\/head>//si; 169 $$contref =~ s/<h1>.*<!--beginarticle-->//si; 170 # codeconv::toeuc($contref); 171 codeconv::codeconv_document($contref); 172 $$contref =~ s/ at /@/s; 173 if (util::islang("ja")) { 174 $$contref =~ s/ @ /@/s; 175 } 176} 177 178sub uncommentize { 179 my($txt) = $_[0]; 180 $txt =~ s/&#(\d+);/pack("C",$1)/ge; 181 $txt; 182} 183 184# Decode an entity. Ignore characters of right half of ISO-8859-1. 185# Because it can't be handled in EUC encoding. 186# This function provides sequential entities like: " < > 187sub decode_entity ($) { 188 my ($text) = @_; 189 190 return unless defined($$text); 191 192 $$text =~ s/"[;\s]/\"/g; 193 $$text =~ s/&[;\s]/&/g; 194 $$text =~ s/<[;\s]/</g; 195 $$text =~ s/>[;\s]/>/g; 196 $$text =~ s/ [;\s]/ /g; 197} 198 199my %wday_names = ( 200 "Sun" => 0, "Mon" => 1, "Tue" => 2, "Wed" => 3, 201 "Thu" => 4, "Fri" => 5, "Sat" => 6 202); 203my %month_names = ( 204 "Jan" => 0, "Feb" => 1, "Mar" => 2, "Apr" => 3, 205 "May" => 4, "Jun" => 5, "Jul" => 6, "Aug" => 7, 206 "Sep" => 8, "Oct" => 9, "Nov" => 10, "Dec" => 11 207); 208my $re_wday = join '|', keys %wday_names; 209my $re_month = join '|', keys %month_names; 210my $re_day = '(?:0?[1-9]|[12][0-9]|3[01])'; 211my $re_year = '(?:\d{4}|\d{2})'; # allow 2 digit fomrat 212my $re_hour = '(?:[01][0-9]|2[0-3])'; 213my $re_min = '(?:[0-5][0-9])'; 214my $re_sec = '(?:[0-5][0-9])'; 215 216sub time_to_rfc822time ($) { 217 my ($conf) = @_; 218 219 if ($$conf =~ / 220 ^\s* 221 ($re_wday)\s+ # a day of the week 222 ($re_month)\s+ # name of month 223 ($re_day)\s+ # a day of the month 224 ($re_hour):($re_min):($re_sec)\s+ # HH:MM:SS 225 (?:([^\s]*)\s+)? # (timezone) 226 (\d{4})\s* # year 227 /x) 228 { 229 my ($week, $month, $day, $hour, $min, $sec, $timezone, $year) = 230 ($1, $2, $3, $4, $5, $6, $7, $8); 231 232 $timezone = gettimezone() unless(defined $timezone); 233 $timezone = time::normalize_rfc822timezone($timezone); 234 235 my $mon = $month_names{$month}; 236 $week = ("Sun", "Mon", "Tue", "Wed", "Thu", "Fri", "Sat") 237 [time::getwday($year, $mon + 1, $day)]; 238 239 $$conf = sprintf("%s, %2.2d %s %d %2.2d:%2.2d:%2.2d %s\n", 240 $week, $day, $month, $year, $hour, $min, $sec, $timezone); 241 242 return undef; 243 } 244 245 return "Illegal format."; 246} 247 2481; 249