1#!/usr/bin/perl 2##---------------------------------------------------------------------------## 3## File: 4## mhn2mbox 5## Version: 6## 0.38 Nov 28 12:36:27 EST 2002 7## Author: 8## Anthony W anthonyw@albany.net 9## Description: 10## A utility for converting MHonArc html archives into pseudo mbox 11## format. 12## Usage: 13## mhn2mbox /path/to/mhonarc/archives [ your-output-file ] 14## 15##---------------------------------------------------------------------------## 16## Copyright (C) 2000 AnthonyW anthonyw@albany.net 17## 18## This program is free software; you can redistribute it and/or modify 19## it under the terms of the GNU General Public License as published by 20## the Free Software Foundation; either version 2 of the License, or 21## (at your option) any later version. 22## 23## This program is distributed in the hope that it will be useful, 24## but WITHOUT ANY WARRANTY; without even the implied warranty of 25## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 26## GNU General Public License for more details. 27## 28## You should have received a copy of the GNU General Public License 29## along with this program; if not, write to the Free Software 30## Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 31## 02111-1307, USA 32##---------------------------------------------------------------------------## 33 34# Library where MHonArc libs 35# are located. CHANGE THIS TO MATCH YOUR SITE 36use lib '/usr/lib/perl5/site_perl/5.6.0'; 37 38use HTML::Entities; 39require 'mhamain.pl'; 40require 'base64.pl'; 41use strict; 42 43my $NoArgs = 1; 44my $USAGE = "Usage: $0 html_dir [output_file]\n"; 45my $HTML_DIR = shift || die $USAGE; 46my $OUTPUT_FILE = shift || '-'; # write to STDOUT by default 47my $debug = $ENV{'CMD_DEBUG'}; 48 49print STDERR "HTML_DIR=$HTML_DIR\n" if $debug; 50 51MAIN: { 52 53 mhonarc::initialize(); 54 #mhonarc::get_resources(); 55 print STDERR "MHonArc initialized.\n" if $debug; 56 require 'ewhutil.pl'; 57 require 'mhtime.pl'; 58 require 'mhutil.pl'; 59 60 local (*DIR); 61 62 print STDERR "Reading $HTML_DIR.\n" if $debug; 63 opendir(DIR, $HTML_DIR) || die qq/Unable to open "$HTML_DIR": $!/; 64 my @files = grep(/^msg/i, readdir(DIR)); 65 closedir(DIR); 66 67 open(MBOXFILE, ">$OUTPUT_FILE") 68 || die qq/Unable to open "$OUTPUT_FILE": $!/; 69 70 foreach (sort @files) { 71 my $name = "$HTML_DIR/" . $_; 72 print STDERR "working on: $name\n" if $debug; 73 &load_data($name); 74 print MBOXFILE "\n\n"; # sometimes necessary to seperate messages 75 } 76 77 close(MBOXFILE); 78} 79 80##---------------------------------------------------------------------------## 81## load_data: Function to read information from the headers of the html 82## files produced by mhonarc. Adapted from mhmsgfile.pl 83## 84sub load_data { 85 86 my $addendum = my $contype = my $index = my $datestr = ""; 87 my $from_addr = my $email_addr = my $binfile = ""; 88 my $description = my $docname = my $boundary = ""; 89 my $filename = shift; # Name of file to read 90 local (*MSGFILE); 91 my @Derived = (); 92 my @bodytext = (); 93 my @array = (); 94 95 if (!open(MSGFILE, $filename)) { 96 warn qq/Warning: Unable to open "$filename": $!\n/; 97 return 0; 98 } 99 my $href = parse_data(\*MSGFILE); 100 close(MSGFILE); 101 102 my $date = $href->{'date'}[0]; 103 104# $day[$wday].', '.$d2[$mday].' '.$month[$mon].' '.($year+1900).' '.$d2[$hour].':'.$d2[$min].':'.$d2[$sec].' GMT'; 105 106 ## Determine date of message 107 if (($date =~ /\S/) && (@array = mhonarc::parse_date($date))) { 108 $index = mhonarc::get_time_from_date(@array[1 .. $#array]); 109 } else { 110 $index = time; 111 $date = mhonarc::time2str("", $index, 1) unless $date =~ /\S/; 112 } 113 114 if (defined($href->{'from-r13'})) { 115 $from_addr = mhonarc::mrot13($href->{'from-r13'}[0]); 116 $email_addr = &extract_email_address($from_addr); 117 #$email_addr = mhonarc::extract_email_address($from_addr); 118 print MBOXFILE "From $email_addr $date\n"; 119 print MBOXFILE "From: $from_addr\n"; 120 } elsif (defined($href->{'from'})) { 121 $from_addr = $href->{'from'}[0]; 122 $email_addr = &extract_email_address($from_addr); 123 #$email_addr = mhonarc::extract_email_address($from_addr); 124 print MBOXFILE "From $email_addr $date\n"; 125 print MBOXFILE "From: $from_addr\n"; 126 } else { 127 print STDERR "WARNING: From Anonymous\n" if $debug; 128 $from_addr = 'Anonymous'; 129 $email_addr = mhonarc::extract_email_address($from_addr); 130 print MBOXFILE "From $email_addr $date\n"; 131 print MBOXFILE "From: $from_addr\n"; 132 } 133 134 print MBOXFILE "Date: $date\n"; 135 136 if (defined($href->{'msgtoheader'})) { 137 print MBOXFILE "To: $href->{'msgtoheader'}[0]\n"; 138 } 139 if (defined($href->{'subject'})) { 140 print MBOXFILE "Subject: $href->{'subject'}[0]\n"; 141 } 142 143 if (defined($href->{'reference'})) { 144 print MBOXFILE "In-Reply-To: <$href->{'reference'}[0]>\n"; 145 } 146 147 if (defined($href->{'message-id'})) { 148 print MBOXFILE "Message-ID: <$href->{'message-id'}[0]>\n"; 149 } 150 151 print MBOXFILE "MIME-Version: 1.0\n"; 152 153 if (defined($href->{'content-type'})) { 154 $contype = $href->{'content-type'}[0]; 155 } elsif (defined($href->{'contenttype'})) { # older versions 156 $contype = $href->{'contenttype'}[0]; 157 } 158 159 if (defined($href->{'msgbodytext'})) { 160 push(@bodytext, @{$href->{'msgbodytext'}}); 161 } 162 163 if ($contype =~ /multipart/i) { 164 $boundary = join("", $$, '.', time, '.', $contype); 165 166 if (defined($href->{'derived'})) { 167 168 print MBOXFILE 169 "Content-Type: $contype; boundary=\"Boundary..$boundary\"\n"; 170 push(@Derived, @{$href->{'derived'}}); 171 print STDERR "Attachments: ", join(',', @Derived), "\n" if $debug; 172 pop(@bodytext); 173 174 foreach $binfile (reverse @Derived) { 175 $description = pop(@bodytext); 176 $docname = $binfile; 177 $addendum .= "\n--Boundary..$boundary\n"; 178 $addendum .= 179 "Content-Type: application\/octet-stream\; name=\"$docname\"\n"; 180 $addendum .= "Content-Transfer-Encoding: base64\n"; 181 $addendum .= 182 "Content-Disposition: attachment\; filename=\"$docname\"\n"; 183 $addendum .= "Content-Description: \"$description\"\n\n"; 184 $addendum .= join("", mime_encode("$HTML_DIR/$binfile")); 185 } 186 187 print MBOXFILE "\n--Boundary..$boundary\n"; 188 print MBOXFILE "Content-Type: text/plain\n"; 189 print MBOXFILE "Content-Transfer-Encoding: 7bit\n"; 190 print MBOXFILE join("\n", @bodytext); 191 print MBOXFILE "\n"; 192 print MBOXFILE "$addendum"; 193 print MBOXFILE "--Boundary..$boundary--\n\n"; 194 195 } else { 196 197 print MBOXFILE "Content-Type: text\/plain\n"; 198 print MBOXFILE join("\n", @bodytext); 199 200 } 201 } else { 202 203 print MBOXFILE "Content-Type: $contype\n\n"; 204 print MBOXFILE join("\n", @bodytext); 205 206 } 207} 208 209##---------------------------------------------------------------------------## 210## parse_data(): Function to parse the initial comment 211## declarations of a MHonArc message file into a hash. A refernce 212## to resulting hash is returned. Keys are the field names, and 213## values are arrays of field values. Adapted from mhmsgfile.pl 214## 215sub parse_data { 216 my $fh = shift; # An open filehandle 217 my $start = "true"; 218 my $head = "false"; 219 my $subj = "false"; 220 my $tail = "false"; 221 my $body = "false"; 222 my ($field, $value); 223 my $AddrExp = '[^()<>@,;:\/\s"\'&|]+@[^()<>@,;:\/\s"\'&|]+'; 224 my %field = (); 225 local ($_); 226 227 while (<$fh>) { 228 229 if (/^<!--X-Head-End-->/) { 230 $start = "false"; 231 next; 232 } 233 if (/^<!--X-Head-of-Message/) { 234 $head = "true"; 235 next; 236 } 237 if (/^<!--X-Body-Begin-->/) { 238 $subj = "true"; 239 } 240 if (/^<!--X-Body-of-Message/) { 241 last if s/^<!--X-Body-of-Message-End//; 242 $body = "true"; 243 $head = "false"; 244 next; 245 } 246 if ($start eq "true") { 247 next unless s/^<!--X-//; # Skip non-field lines 248 chomp; 249 s/ -->$//; 250 s/<a(.*?)href="(.*)"(.*?)>(.*)<\/a>/$7/ig; 251 s/</</g; 252 s/>/>/g; 253 s/"/"/g; 254 ($field, $value) = split(/: /, $_, 2); 255 push(@{$field{lc $field}}, mhonarc::uncommentize($value)); 256 next; 257 } 258 if ($head eq "true") { 259 if (/^<li><em>To<.*?>:/i) { 260 s/<\/li>//ig; 261 s/<\/ul>//ig; 262 s/<a( | )href="(.*)">(.*)<\/a>/$3/ig; 263 s/</</g; 264 s/>/>/g; 265 s/"/"/g; 266 chomp; 267 ($field, $value) = split(/: /, $_, 2); 268 $field = "msgtoheader"; 269 push(@{$field{lc $field}}, $value); 270 $head = "false"; 271 } 272 next; 273 } 274 if ($body eq "true") { 275 # Extract URLs 276 chomp; 277 next if /<!--X-Body-of-Message-->/; 278 next if /^<(.*?)ul>$/i; 279 280 $_ = decode_html($_); 281 282 s/<a(.*?)href="(.*)"(.*?)>(.*)<\/a>/$4/ig; 283 $field = "msgbodytext"; 284 $value = $_; 285 chomp; 286 push(@{$field{lc $field}}, $value); 287 next; 288 } 289 } 290 \%field; 291} 292 293##---------------------------------------------------------------------------## 294## subroutine to base64 encode a file 295##---------------------------------------------------------------------------## 296 297sub mime_encode { 298 my $file = shift; 299 my @encoded_data = (); 300 local $/; # enable data slurp 301 open(MMENCODE, "$file"); 302 print STDERR "MIME-encoding: $file\n" if $debug; 303 @encoded_data = base64::b64encode(<MMENCODE>); 304 close MMENCODE; 305 return @encoded_data; 306} 307 308##---------------------------------------------------------------------------## 309## Subroutine to remove html tags from a string 310##---------------------------------------------------------------------------## 311sub decode_html { 312 313 s/<[^>]*>//gs; # from page 716 of the "Perl Cookbook" 314 $_ = decode_entities($_); 315 return $_; 316} 317 318##---------------------------------------------------------------------------## 319## Subroutine to extract email addresses. Taken from mhutil.pl 320##---------------------------------------------------------------------------## 321sub extract_email_address { 322 323 ## Regexp for address/msg-id detection (looks like cussing in cartoons) 324 my $AddrExp = '[^()<>@,;:\/\s"\'&|]+@[^()<>@,;:\/\s"\'&|]+'; 325 return '' unless defined $_[0]; 326 my $str = shift; 327 328 if ($str =~ /($AddrExp)/o) { 329 return $1; 330 } 331 if ($str =~ /<(\S+)>/) { 332 return $1; 333 } 334 if ($str =~ s/\([^\)]+\)//) { 335 $str =~ /\s*(\S+)\s*/; 336 return $1; 337 } 338 $str =~ /\s*(\S+)\s*/; 339 return $1; 340} 341 342##---------------------------------------------------------------------------## 343 3441; 345