1#!/usr/bin/perl
2##---------------------------------------------------------------------------##
3##  File:
4##      mhn2mbox
5##  Version:
6##      0.38 Nov 28 12:36:27 EST 2002
7##  Author:
8##      Anthony W       anthonyw@albany.net
9##  Description:
10##      A utility for converting MHonArc html archives into pseudo mbox
11##      format.
12##  Usage:
13##      mhn2mbox /path/to/mhonarc/archives [ your-output-file ]
14##
15##---------------------------------------------------------------------------##
16##    Copyright (C) 2000        AnthonyW anthonyw@albany.net
17##
18##    This program is free software; you can redistribute it and/or modify
19##    it under the terms of the GNU General Public License as published by
20##    the Free Software Foundation; either version 2 of the License, or
21##    (at your option) any later version.
22##
23##    This program is distributed in the hope that it will be useful,
24##    but WITHOUT ANY WARRANTY; without even the implied warranty of
25##    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
26##    GNU General Public License for more details.
27##
28##    You should have received a copy of the GNU General Public License
29##    along with this program; if not, write to the Free Software
30##    Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
31##    02111-1307, USA
32##---------------------------------------------------------------------------##
33
34# Library where MHonArc libs
35# are located. CHANGE THIS TO MATCH YOUR SITE
36use lib '/usr/lib/perl5/site_perl/5.6.0';
37
38use HTML::Entities;
39require 'mhamain.pl';
40require 'base64.pl';
41use strict;
42
43my $NoArgs   = 1;
44my $USAGE    = "Usage: $0 html_dir [output_file]\n";
45my $HTML_DIR = shift || die $USAGE;
46my $OUTPUT_FILE = shift || '-';        # write to STDOUT by default
47my $debug       = $ENV{'CMD_DEBUG'};
48
49print STDERR "HTML_DIR=$HTML_DIR\n" if $debug;
50
51MAIN: {
52
53    mhonarc::initialize();
54    #mhonarc::get_resources();
55    print STDERR "MHonArc initialized.\n" if $debug;
56    require 'ewhutil.pl';
57    require 'mhtime.pl';
58    require 'mhutil.pl';
59
60    local (*DIR);
61
62    print STDERR "Reading $HTML_DIR.\n" if $debug;
63    opendir(DIR, $HTML_DIR) || die qq/Unable to open "$HTML_DIR": $!/;
64    my @files = grep(/^msg/i, readdir(DIR));
65    closedir(DIR);
66
67    open(MBOXFILE, ">$OUTPUT_FILE")
68        || die qq/Unable to open "$OUTPUT_FILE": $!/;
69
70    foreach (sort @files) {
71        my $name = "$HTML_DIR/" . $_;
72        print STDERR "working on: $name\n" if $debug;
73        &load_data($name);
74        print MBOXFILE "\n\n";    # sometimes necessary to seperate messages
75    }
76
77    close(MBOXFILE);
78}
79
80##---------------------------------------------------------------------------##
81##      load_data: Function to read information from the headers of the html
82## files produced by mhonarc. Adapted from mhmsgfile.pl
83##
84sub load_data {
85
86    my $addendum = my $contype = my $index = my $datestr = "";
87    my $from_addr   = my $email_addr = my $binfile  = "";
88    my $description = my $docname    = my $boundary = "";
89    my $filename = shift;    # Name of file to read
90    local (*MSGFILE);
91    my @Derived  = ();
92    my @bodytext = ();
93    my @array    = ();
94
95    if (!open(MSGFILE, $filename)) {
96        warn qq/Warning: Unable to open "$filename": $!\n/;
97        return 0;
98    }
99    my $href = parse_data(\*MSGFILE);
100    close(MSGFILE);
101
102    my $date = $href->{'date'}[0];
103
104# $day[$wday].', '.$d2[$mday].' '.$month[$mon].' '.($year+1900).' '.$d2[$hour].':'.$d2[$min].':'.$d2[$sec].' GMT';
105
106    ## Determine date of message
107    if (($date =~ /\S/) && (@array = mhonarc::parse_date($date))) {
108        $index = mhonarc::get_time_from_date(@array[1 .. $#array]);
109    } else {
110        $index = time;
111        $date = mhonarc::time2str("", $index, 1) unless $date =~ /\S/;
112    }
113
114    if (defined($href->{'from-r13'})) {
115        $from_addr  = mhonarc::mrot13($href->{'from-r13'}[0]);
116        $email_addr = &extract_email_address($from_addr);
117        #$email_addr = mhonarc::extract_email_address($from_addr);
118        print MBOXFILE "From $email_addr $date\n";
119        print MBOXFILE "From: $from_addr\n";
120    } elsif (defined($href->{'from'})) {
121        $from_addr  = $href->{'from'}[0];
122        $email_addr = &extract_email_address($from_addr);
123        #$email_addr = mhonarc::extract_email_address($from_addr);
124        print MBOXFILE "From $email_addr $date\n";
125        print MBOXFILE "From: $from_addr\n";
126    } else {
127        print STDERR "WARNING: From Anonymous\n" if $debug;
128        $from_addr  = 'Anonymous';
129        $email_addr = mhonarc::extract_email_address($from_addr);
130        print MBOXFILE "From $email_addr $date\n";
131        print MBOXFILE "From: $from_addr\n";
132    }
133
134    print MBOXFILE "Date: $date\n";
135
136    if (defined($href->{'msgtoheader'})) {
137        print MBOXFILE "To: $href->{'msgtoheader'}[0]\n";
138    }
139    if (defined($href->{'subject'})) {
140        print MBOXFILE "Subject: $href->{'subject'}[0]\n";
141    }
142
143    if (defined($href->{'reference'})) {
144        print MBOXFILE "In-Reply-To: <$href->{'reference'}[0]>\n";
145    }
146
147    if (defined($href->{'message-id'})) {
148        print MBOXFILE "Message-ID: <$href->{'message-id'}[0]>\n";
149    }
150
151    print MBOXFILE "MIME-Version: 1.0\n";
152
153    if (defined($href->{'content-type'})) {
154        $contype = $href->{'content-type'}[0];
155    } elsif (defined($href->{'contenttype'})) {    # older versions
156        $contype = $href->{'contenttype'}[0];
157    }
158
159    if (defined($href->{'msgbodytext'})) {
160        push(@bodytext, @{$href->{'msgbodytext'}});
161    }
162
163    if ($contype =~ /multipart/i) {
164        $boundary = join("", $$, '.', time, '.', $contype);
165
166        if (defined($href->{'derived'})) {
167
168            print MBOXFILE
169                "Content-Type: $contype; boundary=\"Boundary..$boundary\"\n";
170            push(@Derived, @{$href->{'derived'}});
171            print STDERR "Attachments: ", join(',', @Derived), "\n" if $debug;
172            pop(@bodytext);
173
174            foreach $binfile (reverse @Derived) {
175                $description = pop(@bodytext);
176                $docname     = $binfile;
177                $addendum .= "\n--Boundary..$boundary\n";
178                $addendum .=
179                    "Content-Type: application\/octet-stream\; name=\"$docname\"\n";
180                $addendum .= "Content-Transfer-Encoding: base64\n";
181                $addendum .=
182                    "Content-Disposition: attachment\; filename=\"$docname\"\n";
183                $addendum .= "Content-Description: \"$description\"\n\n";
184                $addendum .= join("", mime_encode("$HTML_DIR/$binfile"));
185            }
186
187            print MBOXFILE "\n--Boundary..$boundary\n";
188            print MBOXFILE "Content-Type: text/plain\n";
189            print MBOXFILE "Content-Transfer-Encoding: 7bit\n";
190            print MBOXFILE join("\n", @bodytext);
191            print MBOXFILE "\n";
192            print MBOXFILE "$addendum";
193            print MBOXFILE "--Boundary..$boundary--\n\n";
194
195        } else {
196
197            print MBOXFILE "Content-Type: text\/plain\n";
198            print MBOXFILE join("\n", @bodytext);
199
200        }
201    } else {
202
203        print MBOXFILE "Content-Type: $contype\n\n";
204        print MBOXFILE join("\n", @bodytext);
205
206    }
207}
208
209##---------------------------------------------------------------------------##
210##      parse_data(): Function to parse the initial comment
211##      declarations of a MHonArc message file into a hash.  A refernce
212##      to resulting hash is returned.  Keys are the field names, and
213##      values are arrays of field values. Adapted from mhmsgfile.pl
214##
215sub parse_data {
216    my $fh    = shift;     # An open filehandle
217    my $start = "true";
218    my $head  = "false";
219    my $subj  = "false";
220    my $tail  = "false";
221    my $body  = "false";
222    my ($field, $value);
223    my $AddrExp = '[^()<>@,;:\/\s"\'&|]+@[^()<>@,;:\/\s"\'&|]+';
224    my %field   = ();
225    local ($_);
226
227    while (<$fh>) {
228
229        if (/^<!--X-Head-End-->/) {
230            $start = "false";
231            next;
232        }
233        if (/^<!--X-Head-of-Message/) {
234            $head = "true";
235            next;
236        }
237        if (/^<!--X-Body-Begin-->/) {
238            $subj = "true";
239        }
240        if (/^<!--X-Body-of-Message/) {
241            last if s/^<!--X-Body-of-Message-End//;
242            $body = "true";
243            $head = "false";
244            next;
245        }
246        if ($start eq "true") {
247            next unless s/^<!--X-//;    # Skip non-field lines
248            chomp;
249            s/ -->$//;
250            s/<a(.*?)href="(.*)"(.*?)>(.*)<\/a>/$7/ig;
251            s/&lt;/</g;
252            s/&gt;/>/g;
253            s/&quot;/"/g;
254            ($field, $value) = split(/: /, $_, 2);
255            push(@{$field{lc $field}}, mhonarc::uncommentize($value));
256            next;
257        }
258        if ($head eq "true") {
259            if (/^<li><em>To<.*?>:/i) {
260                s/<\/li>//ig;
261                s/<\/ul>//ig;
262                s/<a( |  )href="(.*)">(.*)<\/a>/$3/ig;
263                s/&lt;/</g;
264                s/&gt;/>/g;
265                s/&quot;/"/g;
266                chomp;
267                ($field, $value) = split(/: /, $_, 2);
268                $field = "msgtoheader";
269                push(@{$field{lc $field}}, $value);
270                $head = "false";
271            }
272            next;
273        }
274        if ($body eq "true") {
275            # Extract URLs
276            chomp;
277            next if /<!--X-Body-of-Message-->/;
278            next if /^<(.*?)ul>$/i;
279
280            $_ = decode_html($_);
281
282            s/<a(.*?)href="(.*)"(.*?)>(.*)<\/a>/$4/ig;
283            $field = "msgbodytext";
284            $value = $_;
285            chomp;
286            push(@{$field{lc $field}}, $value);
287            next;
288        }
289    }
290    \%field;
291}
292
293##---------------------------------------------------------------------------##
294##  subroutine to base64 encode a file
295##---------------------------------------------------------------------------##
296
297sub mime_encode {
298    my $file         = shift;
299    my @encoded_data = ();
300    local $/;    # enable data slurp
301    open(MMENCODE, "$file");
302    print STDERR "MIME-encoding: $file\n" if $debug;
303    @encoded_data = base64::b64encode(<MMENCODE>);
304    close MMENCODE;
305    return @encoded_data;
306}
307
308##---------------------------------------------------------------------------##
309## Subroutine to remove html tags from a string
310##---------------------------------------------------------------------------##
311sub decode_html {
312
313    s/<[^>]*>//gs;    # from page 716 of the "Perl Cookbook"
314    $_ = decode_entities($_);
315    return $_;
316}
317
318##---------------------------------------------------------------------------##
319## Subroutine to extract email addresses. Taken from mhutil.pl
320##---------------------------------------------------------------------------##
321sub extract_email_address {
322
323    ##  Regexp for address/msg-id detection (looks like cussing in cartoons)
324    my $AddrExp = '[^()<>@,;:\/\s"\'&|]+@[^()<>@,;:\/\s"\'&|]+';
325    return '' unless defined $_[0];
326    my $str = shift;
327
328    if ($str =~ /($AddrExp)/o) {
329        return $1;
330    }
331    if ($str =~ /<(\S+)>/) {
332        return $1;
333    }
334    if ($str =~ s/\([^\)]+\)//) {
335        $str =~ /\s*(\S+)\s*/;
336        return $1;
337    }
338    $str =~ /\s*(\S+)\s*/;
339    return $1;
340}
341
342##---------------------------------------------------------------------------##
343
3441;
345