1#!/usr/bin/perl -w
2# Copyright Abandoned 1998 TCX DataKonsult AB & Monty Program KB & Detron HB
3# This file is public domain and comes with NO WARRANTY of any kind
4#
5# This program is brought to you by Janne-Petteri Koilo with the
6# administration of Michael Widenius.
7#
8# Rewritten with a lot of bug fixes by Jani Tolonen and Thimble Smith
9# 15.12.2000
10#
11# This program takes your mails and puts them into your database. It ignores
12# messages with the same from, date and message text.
13# You can use mail-files that are compressed or gzipped and ends with
14# -.gz or -.Z.
15
16use DBI;
17use Getopt::Long;
18
19$| = 1;
20$VER = "3.0";
21
22$opt_help          = 0;
23$opt_version       = 0;
24$opt_debug         = 0;
25$opt_host          = undef();
26$opt_port          = undef();
27$opt_socket        = undef();
28$opt_db            = "mail";
29$opt_user          = undef();
30$opt_password      = undef();
31$opt_max_mail_size = 65536;
32$opt_create        = 0;
33$opt_test          = 0;
34$opt_no_path       = 0;
35$opt_stop_on_error = 0;
36$opt_stdin         = 0;
37
38my ($dbh, $progname, $mail_no_from_f, $mail_no_txt_f, $mail_too_big,
39    $mail_forwarded, $mail_duplicates, $mail_no_subject_f, $mail_inserted);
40
41$mail_no_from_f = $mail_no_txt_f = $mail_too_big = $mail_forwarded =
42$mail_duplicates = $mail_no_subject_f = $mail_inserted = 0;
43$mail_fixed=0;
44
45#
46# Remove the following message-ends from message
47#
48@remove_tail= (
49"\n-*\nSend a mail to .*\n.*\n.*\$",
50"\n-*\nPlease check .*\n.*\n\nTo unsubscribe, .*\n.*\n.*\nIf you have a broken.*\n.*\n.*\$",
51"\n-*\nPlease check .*\n(.*\n){1,3}\nTo unsubscribe.*\n.*\n.*\$",
52"\n-*\nPlease check .*\n.*\n\nTo unsubscribe.*\n.*\$",
53"\n-*\nTo request this thread.*\nTo unsubscribe.*\n.*\.*\n.*\$",
54"\n -*\n.*Send a mail to.*\n.*\n.*unsubscribe.*\$",
55"\n-*\nTo request this thread.*\n\nTo unsubscribe.*\n.*\$"
56);
57
58# Generate regexp to remove tails where the unsubscribed is quoted
59{
60  my (@tmp, $tail);
61  @tmp=();
62  foreach $tail (@remove_tail)
63  {
64    $tail =~ s/\n/\n[> ]*/g;
65    push(@tmp, $tail);
66  }
67  push @remove_tail,@tmp;
68}
69
70my %months = ('Jan' => 1, 'Feb' => 2, 'Mar' => 3, 'Apr' => 4, 'May' => 5,
71	      'Jun' => 6, 'Jul' => 7, 'Aug' => 8, 'Sep' => 9, 'Oct' => 10,
72	      'Nov' => 11, 'Dec' => 12);
73
74$progname = $0;
75$progname =~ s/.*[\/]//;
76
77main();
78
79####
80#### main sub routine
81####
82
83sub main
84{
85  my ($connect_arg, @args, $ignored, @defops, $i);
86
87  if (defined(my_which("my_print_defaults")))
88  {
89    @defops = `my_print_defaults mail_to_db`;
90    chop @defops;
91    splice @ARGV, 0, 0, @defops;
92  }
93  else
94  {
95    print "WARNING: No command 'my_print_defaults' found; unable to read\n";
96    print "the my.cnf file. This command is available from the latest MySQL\n";
97    print "distribution.\n";
98  }
99  GetOptions("help","version","host=s","port=i","socket=s","db=s",
100	     "user=s","password=s","max_mail_size=i","create","test",
101	     "no_path","debug","stop_on_error","stdin")
102  || die "Wrong option! See $progname --help\n";
103
104  usage($VER) if ($opt_help || $opt_version ||
105		  (!$ARGV[0] && !$opt_create && !$opt_stdin));
106
107  # Check that the given inbox files exist and are regular files
108  for ($i = 0; ! $opt_stdin && defined($ARGV[$i]); $i++)
109  {
110    die "FATAL: Can't find inbox file: $ARGV[$i]\n" if (! -f $ARGV[$i]);
111  }
112
113  $connect_arg = "DBI:mysql:";
114  push @args, "database=$opt_db" if defined($opt_db);
115  push @args, "host=$opt_host" if defined($opt_host);
116  push @args, "port=$opt_port" if defined($opt_port);
117  push @args, "mysql_socket=$opt_socket" if defined($opt_socket);
118  push @args, "mysql_read_default_group=mail_to_db";
119  $connect_arg .= join ';', @args;
120  $dbh = DBI->connect("$connect_arg", $opt_user, $opt_password,
121		     { PrintError => 0})
122  || die "Couldn't connect: $DBI::errstr\n";
123
124  die "You must specify the database; use --db=" if (!defined($opt_db));
125
126  create_table($dbh) if ($opt_create);
127
128  if ($opt_stdin)
129  {
130    open(FILE, "-");
131    process_mail_file($dbh, "READ-FROM-STDIN");
132  }
133  else
134  {
135    foreach (@ARGV)
136    {
137      # Check if the file is compressed
138      if (/^(.*)\.(gz|Z)$/)
139      {
140	open(FILE, "zcat $_ |");
141	process_mail_file($dbh, $1);
142      }
143      else
144      {
145	open(FILE, $_);
146	process_mail_file($dbh, $_);
147      }
148    }
149  }
150  $dbh->disconnect if (!$opt_test);
151
152  $ignored = ($mail_no_from_f + $mail_no_subject_f + $mail_no_txt_f +
153	      $mail_too_big + $mail_duplicates + $mail_fixed);
154  print "################################ Mail Report #################################\n\n";
155  print "Mails inserted:\t\t\t\t\t$mail_inserted\n";
156  print "---------------                                ";
157  print "=" . "=" x length("$mail_inserted") . "=\n\n";
158  if ($ignored)
159  {
160    print "Ignored mails\n";
161    print "-------------\n";
162    if ($mail_no_from_f)
163    {
164      print "Reason: mail without \"From:\" -field:\t\t$mail_no_from_f\n";
165    }
166    else
167    {
168      print "";
169    }
170    if ($mail_no_txt_f)
171    {
172      print "Reason: mail without message:\t\t\t$mail_no_txt_f\n";
173    }
174    else
175    {
176      print "";
177    }
178    if ($mail_no_subject_f)
179    {
180      print "Reason: mail without subject:\t\t\t$mail_no_subject_f\n";
181    }
182    else
183    {
184      print "";
185    }
186    if ($mail_too_big)
187    {
188      print "Reason: mail too big, over $opt_max_mail_size bytes:\t\t";
189      print $mail_too_big;
190      print " (see --max_mail_size=#)\n";
191    }
192    else
193    {
194      print "";
195    }
196    if ($mail_duplicates)
197    {
198      print "Reason: duplicate mail, or in db already:\t$mail_duplicates\n";
199    }
200    else
201    {
202      print "";
203    }
204    if ($mail_fixed)
205    {
206      print "Reason: mail was an unsubscribe - mail:\t\t$mail_fixed\n";
207    }
208    else
209    {
210      print "";
211    }
212    print "                                               ";
213    print "=" . "=" x length("$ignored") . "=\n";
214    print "Total number of ignored mails:\t\t\t$ignored\n\n";
215  }
216  print "Total number of mails:\t\t\t\t";
217  print $mail_inserted + $ignored;
218  print " (OK: ";
219  print sprintf("%.1f", ($mail_inserted + $ignored) ? (($mail_inserted / ($mail_inserted+$ignored)) * 100) : 0.0);
220  print "% Ignored: ";
221  print sprintf("%.1f", ($mail_inserted + $ignored) ? (($ignored / ($mail_inserted + $ignored)) * 100) : 0);
222  print "%)\n";
223  print "################################ End Report ##################################\n";
224  exit(0);
225}
226
227####
228#### table creation
229####
230
231sub create_table
232{
233  my ($dbh)= @_;
234  my ($sth, $query);
235
236  $query= <<EOF;
237CREATE TABLE my_mail
238(
239 mail_id MEDIUMINT UNSIGNED NOT NULL auto_increment,
240 message_id VARCHAR(255),
241 in_reply_to VARCHAR(255),
242 date DATETIME NOT NULL,
243 time_zone VARCHAR(20),
244 mail_from VARCHAR(120) NOT NULL,
245 reply VARCHAR(120),
246 mail_to TEXT,
247 cc TEXT,
248 sbj VARCHAR(200),
249 txt MEDIUMTEXT NOT NULL,
250 file VARCHAR(64) NOT NULL,
251 hash INTEGER NOT NULL,
252 KEY (mail_id),
253 KEY (message_id),
254 KEY (in_reply_to),
255 PRIMARY KEY (mail_from, date, hash))
256 ENGINE=MyISAM COMMENT=''
257EOF
258  $sth = $dbh->prepare($query) or die $DBI::errstr;
259  $sth->execute() or die "Couldn't create table: $DBI::errstr\n";
260}
261
262####
263#### inbox processing. Can be either a real file, or standard input.
264####
265
266sub process_mail_file
267{
268  my ($dbh, $file_name) = @_;
269  my (%values, $type, $check);
270
271  $file_name =~ s/.*[\/]// if ($opt_no_path);
272
273  %values = ();
274  $type = "";
275  $check = 0;
276  while (<FILE>)
277  {
278    chop;
279    chop if (substr($_, -1, 1) eq "\r");
280    if ($type ne "message")
281    {
282      if (/^Reply-To:\s*(.*)/i)
283      {
284	$type = "reply";
285	$values{$type} = $1;
286      }
287      elsif (/^From: (.*)/i)
288      {
289	$type = "from";
290	$values{$type} = $1;
291      }
292      elsif (/^To: (.*)/i)
293      {
294	$type = "to";
295	$values{$type} = $1;
296      }
297      elsif (/^Cc: (.*)/i)
298      {
299	$type = "cc";
300	$values{$type} = $1;
301      }
302      elsif (/^Subject: (.*)/i)
303      {
304	$type = "subject";
305	$values{$type} = $1;
306      }
307      elsif (/^Message-Id:\s*(.*)/i)
308      {
309	$type = "message_id";
310	s/^\s*(<.*>)\s*/$1/;
311	$values{$type} = $1;
312      }
313      elsif (/^In-Reply-To:\s*(.*)/i)
314      {
315	$type = "in_reply_to";
316	s/^\s*(<.*>)\s*/$1/;
317	$values{$type} = $1;
318      }
319      elsif (/^Date: (.*)/i)
320      {
321	date_parser($1, \%values, $file_name);
322	$type = "rubbish";
323      }
324      # Catch those fields that we don't or can't handle (yet)
325      elsif (/^[\w\W-]+:/)
326      {
327	$type = "rubbish";
328      }
329      elsif ($_ eq "")
330      {
331	$type = "message";
332	$values{$type} = "";
333      }
334      else
335      {
336	s/^\s*/ /;
337	if ($type eq 'message_id' || $type eq 'in_reply_to')
338	{
339	  s/^\s*(<.*>)\s*/$1/;
340	}
341	$values{$type} .= $_;
342      }
343    }
344    elsif ($check != 0 && $_ ne "") # in case of forwarded messages
345    {
346      $values{$type} .= "\n" . $_;
347      $check--;
348    }
349    elsif (/^From .* \d\d:\d\d:\d\d\s\d\d\d\d/ ||
350           /^From .* \d\d\d\d\s\d\d:\d\d:\d\d/)
351    {
352      $values{'hash'} = checksum("$values{'message'}");
353      update_table($dbh, $file_name, \%values);
354      %values = ();
355      $type = "";
356      $check = 0;
357    }
358    elsif (/-* forwarded message .*-*/i) # in case of forwarded messages
359    {
360      $values{$type} .= "\n" . $_;
361      $check++;
362      $mail_forwarded++;
363    }
364    else
365    {
366      $values{$type} .= "\n" . $_;
367    }
368  }
369  if (defined($values{'message'}))
370  {
371    $values{'hash'} = checksum("$values{'message'}");
372    update_table($dbh, $file_name, \%values);
373  }
374}
375
376####
377#### get date and timezone
378####
379
380sub date_parser
381{
382  my ($date_raw, $values, $file_name, $tmp) = @_;
383
384  # If you ever need to change this test, be especially careful with
385  # the timezone; it may be just a number (-0600), or just a name (EET), or
386  # both (-0600 (EET), or -0600 (EET GMT)), or without parenthesis: GMT.
387  # You probably should use a 'greedy' regexp in the end
388  $date_raw =~ /^\D*(\d{1,2})\s+(\w+)\s+(\d{2,4})\s+(\d+:\d+)(:\d+)?\s*(\S+.*)?/;
389
390  if (!defined($1) || !defined($2) || !defined($3) || !defined($4) ||
391      !defined($months{$2}))
392  {
393    if ($opt_debug || $opt_stop_on_error)
394    {
395      print "FAILED: date_parser: 1: $1 2: $2 3: $3 4: $4 5: $5\n";
396      print "months{2}: $months{$2}\n";
397      print "date_raw: $date_raw\n";
398      print "Inbox filename: $file_name\n";
399    }
400    exit(1) if ($opt_stop_on_error);
401    $values->{'date'} = "";
402    $values->{'time_zone'} = "";
403    return;
404  }
405  $tmp = $3 . "-" . $months{$2} . "-" . "$1 $4";
406  $tmp.= defined($5) ? $5 : ":00";
407  $values->{'date'} = $tmp;
408  print "INSERTING DATE: $tmp\n" if ($opt_debug);
409  $values->{'time_zone'} = $6;
410}
411
412####
413#### Insert to table
414####
415
416sub update_table
417{
418  my($dbh, $file_name, $values) = @_;
419  my($q, $tail, $message);
420
421  if (!defined($values->{'subject'}) || !defined($values->{'to'}))
422  {
423    $mail_no_subject_f++;
424    return;			# Ignore these
425  }
426  $message = $values->{'message'};
427  $message =~ s/^\s*//; # removes whitespaces from the beginning
428
429 restart:
430  $message =~ s/[\s\n>]*$//; # removes whitespaces and '>' from the end
431  $values->{'message'} = $message;
432  foreach $tail (@remove_tail)
433  {
434    $message =~ s/$tail//;
435  }
436  if ($message ne $values->{'message'})
437  {
438    $message =~ s/\s*$//; # removes whitespaces from the end
439    $mail_fixed++;
440    goto restart;	  # Some mails may have duplicated messages
441  }
442
443  $q = "INSERT INTO my_mail (";
444  $q.= "mail_id,";
445  $q.= "message_id,";
446  $q.= "in_reply_to,";
447  $q.= "date,";
448  $q.= "time_zone,";
449  $q.= "mail_from,";
450  $q.= "reply,";
451  $q.= "mail_to,";
452  $q.= "cc,";
453  $q.= "sbj,";
454  $q.= "txt,";
455  $q.= "file,";
456  $q.= "hash";
457  $q.= ") VALUES (";
458  $q.= "NULL,";
459  $q.= (defined($values->{'message_id'}) ?
460	$dbh->quote($values->{'message_id'}) : "NULL");
461  $q.= ",";
462  $q.= (defined($values->{'in_reply_to'}) ?
463	$dbh->quote($values->{'in_reply_to'}) : "NULL");
464  $q.= ",";
465  $q.= "'" . $values->{'date'} . "',";
466  $q.= (defined($values->{'time_zone'}) ?
467	$dbh->quote($values->{'time_zone'}) : "NULL");
468  $q.= ",";
469  $q.= defined($values->{'from'}) ? $dbh->quote($values->{'from'}) : "NULL";
470  $q.= ",";
471  $q.= defined($values->{'reply'}) ? $dbh->quote($values->{'reply'}) : "NULL";
472  $q.= ",";
473  $q.= defined($values->{'to'}) ? $dbh->quote($values->{'to'}) : "NULL";
474  $q.= ",";
475  $q.= defined($values->{'cc'}) ? $dbh->quote($values->{'cc'}) : "NULL";
476  $q.= ",";
477  $q.= $dbh->quote($values->{'subject'});
478  $q.= ",";
479  $q.= $dbh->quote($message);
480  $q.= ",";
481  $q.= $dbh->quote($file_name);
482  $q.= ",";
483  $q.= "'" . $values->{'hash'} . "'";
484  $q.= ")";
485
486  # Don't insert mails bigger than $opt_max_mail_size
487  if (length($message) > $opt_max_mail_size)
488  {
489    $mail_too_big++;
490  }
491  # Don't insert mails without 'From' field
492  elsif (!defined($values->{'from'}) || $values->{'from'} eq "")
493  {
494    $mail_no_from_f++;
495  }
496  elsif ($opt_test)
497  {
498    print "$q\n";
499    $mail_inserted++;
500  }
501  # Don't insert mails without the 'message'
502  elsif ($message eq "")
503  {
504    $mail_no_txt_f++;
505  }
506  elsif ($dbh->do($q))
507  {
508    $mail_inserted++;
509  }
510  # This should never happen. This means that the above q failed,
511  # but it wasn't because of a duplicate mail entry
512  elsif (!($DBI::errstr =~ /Duplicate entry /))
513  {
514    die "FATAL: Got error :$DBI::errstr\nAttempted query was: $q\n";
515  }
516  else
517  {
518    $mail_duplicates++;
519    print "Duplicate mail: query: $q\n" if ($opt_debug);
520  }
521  $q = "";
522}
523
524####
525#### In case you have two identical messages we wanted to identify them
526#### and remove additionals;  We do this by calculating a hash number of the
527#### message and ignoring messages with the same from, date and hash.
528#### This function calculates a simple 32 bit hash value for the message.
529####
530
531sub checksum
532{
533  my ($txt)= @_;
534  my ($crc, $i, $count);
535  $count = length($txt);
536  for ($crc = $i = 0; $i < $count ; $i++)
537  {
538    $crc = (($crc << 1) + (ord (substr ($txt, $i, 1)))) +
539      (($crc & (1 << 30)) ? 1 : 0);
540    $crc &= ((1 << 31) -1);
541  }
542  return $crc;
543}
544
545####
546#### my_which is used, because we can't assume that every system has the
547#### which -command. my_which can take only one argument at a time.
548#### Return values: requested system command with the first found path,
549#### or undefined, if not found.
550####
551
552sub my_which
553{
554  my ($command) = @_;
555  my (@paths, $path);
556
557  return $command if (-f $command && -x $command);
558  @paths = split(':', $ENV{'PATH'});
559  foreach $path (@paths)
560  {
561    $path = "." if ($path eq "");
562    $path .= "/$command";
563    return $path if (-f $path && -x $path);
564  }
565  return undef();
566}
567
568####
569#### usage and version
570####
571
572sub usage
573{
574  my ($VER)= @_;
575
576  if ($opt_version)
577  {
578    print "$progname version $VER\n";
579  }
580  else
581  {
582    print <<EOF;
583$progname version $VER
584
585Description: Insert mails from inbox file(s) into a table. This program
586can read group [mail_to_db] from the my.cnf file. You may want to have db
587and table set there at least.
588
589Usage: $progname [options] file1 [file2 file3 ...]
590or:    $progname [options] --create [file1 file2...]
591or:    cat inbox | $progname [options] --stdin
592
593The last example can be used to read mails from standard input and can
594useful when inserting mails to database via a program 'on-the-fly'.
595The filename will be 'READ-FROM-STDIN' in this case.
596
597Options:
598--help             Show this help and exit.
599--version          Show the version number and exit.
600--debug            Print some extra information during the run.
601--host=...         Hostname to be used.
602--port=#           TCP/IP port to be used with connection.
603--socket=...       MySQL UNIX socket to be used with connection.
604--db=...           Database to be used.
605--user=...         Username for connecting.
606--password=...     Password for the user.
607--stdin            Read mails from stdin.
608--max_mail_size=#  Maximum size of a mail in bytes.
609                   Beware of the downside letting this variable be too big;
610                   you may easily end up inserting a lot of attached
611                   binary files (like MS Word documents etc), which take
612                   space, make the database slower and are not really
613                   searchable anyway. (Default $opt_max_mail_size)
614--create           Create the mails table. This can be done with the first run.
615--test		   Dry run. Print the queries and the result as it would be.
616--no_path          When inserting the file name, leave out any paths of
617                   the name.
618--stop_on_error    Stop the run, if an unexpected, but not fatal error occurs
619                   during the run. Without this option some fields may get
620                   unwanted values. --debug will also report about these.
621EOF
622  }
623  exit(0);
624}
625