1#!/usr/bin/perl -w 2# Copyright Abandoned 1998 TCX DataKonsult AB & Monty Program KB & Detron HB 3# This file is public domain and comes with NO WARRANTY of any kind 4# 5# This program is brought to you by Janne-Petteri Koilo with the 6# administration of Michael Widenius. 7# 8# Rewritten with a lot of bug fixes by Jani Tolonen and Thimble Smith 9# 15.12.2000 10# 11# This program takes your mails and puts them into your database. It ignores 12# messages with the same from, date and message text. 13# You can use mail-files that are compressed or gzipped and ends with 14# -.gz or -.Z. 15 16use DBI; 17use Getopt::Long; 18 19$| = 1; 20$VER = "3.0"; 21 22$opt_help = 0; 23$opt_version = 0; 24$opt_debug = 0; 25$opt_host = undef(); 26$opt_port = undef(); 27$opt_socket = undef(); 28$opt_db = "mail"; 29$opt_user = undef(); 30$opt_password = undef(); 31$opt_max_mail_size = 65536; 32$opt_create = 0; 33$opt_test = 0; 34$opt_no_path = 0; 35$opt_stop_on_error = 0; 36$opt_stdin = 0; 37 38my ($dbh, $progname, $mail_no_from_f, $mail_no_txt_f, $mail_too_big, 39 $mail_forwarded, $mail_duplicates, $mail_no_subject_f, $mail_inserted); 40 41$mail_no_from_f = $mail_no_txt_f = $mail_too_big = $mail_forwarded = 42$mail_duplicates = $mail_no_subject_f = $mail_inserted = 0; 43$mail_fixed=0; 44 45# 46# Remove the following message-ends from message 47# 48@remove_tail= ( 49"\n-*\nSend a mail to .*\n.*\n.*\$", 50"\n-*\nPlease check .*\n.*\n\nTo unsubscribe, .*\n.*\n.*\nIf you have a broken.*\n.*\n.*\$", 51"\n-*\nPlease check .*\n(.*\n){1,3}\nTo unsubscribe.*\n.*\n.*\$", 52"\n-*\nPlease check .*\n.*\n\nTo unsubscribe.*\n.*\$", 53"\n-*\nTo request this thread.*\nTo unsubscribe.*\n.*\.*\n.*\$", 54"\n -*\n.*Send a mail to.*\n.*\n.*unsubscribe.*\$", 55"\n-*\nTo request this thread.*\n\nTo unsubscribe.*\n.*\$" 56); 57 58# Generate regexp to remove tails where the unsubscribed is quoted 59{ 60 my (@tmp, $tail); 61 @tmp=(); 62 foreach $tail (@remove_tail) 63 { 64 $tail =~ s/\n/\n[> ]*/g; 65 push(@tmp, $tail); 66 } 67 push @remove_tail,@tmp; 68} 69 70my %months = ('Jan' => 1, 'Feb' => 2, 'Mar' => 3, 'Apr' => 4, 'May' => 5, 71 'Jun' => 6, 'Jul' => 7, 'Aug' => 8, 'Sep' => 9, 'Oct' => 10, 72 'Nov' => 11, 'Dec' => 12); 73 74$progname = $0; 75$progname =~ s/.*[\/]//; 76 77main(); 78 79#### 80#### main sub routine 81#### 82 83sub main 84{ 85 my ($connect_arg, @args, $ignored, @defops, $i); 86 87 if (defined(my_which("my_print_defaults"))) 88 { 89 @defops = `my_print_defaults mail_to_db`; 90 chop @defops; 91 splice @ARGV, 0, 0, @defops; 92 } 93 else 94 { 95 print "WARNING: No command 'my_print_defaults' found; unable to read\n"; 96 print "the my.cnf file. This command is available from the latest MySQL\n"; 97 print "distribution.\n"; 98 } 99 GetOptions("help","version","host=s","port=i","socket=s","db=s", 100 "user=s","password=s","max_mail_size=i","create","test", 101 "no_path","debug","stop_on_error","stdin") 102 || die "Wrong option! See $progname --help\n"; 103 104 usage($VER) if ($opt_help || $opt_version || 105 (!$ARGV[0] && !$opt_create && !$opt_stdin)); 106 107 # Check that the given inbox files exist and are regular files 108 for ($i = 0; ! $opt_stdin && defined($ARGV[$i]); $i++) 109 { 110 die "FATAL: Can't find inbox file: $ARGV[$i]\n" if (! -f $ARGV[$i]); 111 } 112 113 $connect_arg = "DBI:mysql:"; 114 push @args, "database=$opt_db" if defined($opt_db); 115 push @args, "host=$opt_host" if defined($opt_host); 116 push @args, "port=$opt_port" if defined($opt_port); 117 push @args, "mysql_socket=$opt_socket" if defined($opt_socket); 118 push @args, "mysql_read_default_group=mail_to_db"; 119 $connect_arg .= join ';', @args; 120 $dbh = DBI->connect("$connect_arg", $opt_user, $opt_password, 121 { PrintError => 0}) 122 || die "Couldn't connect: $DBI::errstr\n"; 123 124 die "You must specify the database; use --db=" if (!defined($opt_db)); 125 126 create_table($dbh) if ($opt_create); 127 128 if ($opt_stdin) 129 { 130 open(FILE, "-"); 131 process_mail_file($dbh, "READ-FROM-STDIN"); 132 } 133 else 134 { 135 foreach (@ARGV) 136 { 137 # Check if the file is compressed 138 if (/^(.*)\.(gz|Z)$/) 139 { 140 open(FILE, "zcat $_ |"); 141 process_mail_file($dbh, $1); 142 } 143 else 144 { 145 open(FILE, $_); 146 process_mail_file($dbh, $_); 147 } 148 } 149 } 150 $dbh->disconnect if (!$opt_test); 151 152 $ignored = ($mail_no_from_f + $mail_no_subject_f + $mail_no_txt_f + 153 $mail_too_big + $mail_duplicates + $mail_fixed); 154 print "################################ Mail Report #################################\n\n"; 155 print "Mails inserted:\t\t\t\t\t$mail_inserted\n"; 156 print "--------------- "; 157 print "=" . "=" x length("$mail_inserted") . "=\n\n"; 158 if ($ignored) 159 { 160 print "Ignored mails\n"; 161 print "-------------\n"; 162 if ($mail_no_from_f) 163 { 164 print "Reason: mail without \"From:\" -field:\t\t$mail_no_from_f\n"; 165 } 166 else 167 { 168 print ""; 169 } 170 if ($mail_no_txt_f) 171 { 172 print "Reason: mail without message:\t\t\t$mail_no_txt_f\n"; 173 } 174 else 175 { 176 print ""; 177 } 178 if ($mail_no_subject_f) 179 { 180 print "Reason: mail without subject:\t\t\t$mail_no_subject_f\n"; 181 } 182 else 183 { 184 print ""; 185 } 186 if ($mail_too_big) 187 { 188 print "Reason: mail too big, over $opt_max_mail_size bytes:\t\t"; 189 print $mail_too_big; 190 print " (see --max_mail_size=#)\n"; 191 } 192 else 193 { 194 print ""; 195 } 196 if ($mail_duplicates) 197 { 198 print "Reason: duplicate mail, or in db already:\t$mail_duplicates\n"; 199 } 200 else 201 { 202 print ""; 203 } 204 if ($mail_fixed) 205 { 206 print "Reason: mail was an unsubscribe - mail:\t\t$mail_fixed\n"; 207 } 208 else 209 { 210 print ""; 211 } 212 print " "; 213 print "=" . "=" x length("$ignored") . "=\n"; 214 print "Total number of ignored mails:\t\t\t$ignored\n\n"; 215 } 216 print "Total number of mails:\t\t\t\t"; 217 print $mail_inserted + $ignored; 218 print " (OK: "; 219 print sprintf("%.1f", ($mail_inserted + $ignored) ? (($mail_inserted / ($mail_inserted+$ignored)) * 100) : 0.0); 220 print "% Ignored: "; 221 print sprintf("%.1f", ($mail_inserted + $ignored) ? (($ignored / ($mail_inserted + $ignored)) * 100) : 0); 222 print "%)\n"; 223 print "################################ End Report ##################################\n"; 224 exit(0); 225} 226 227#### 228#### table creation 229#### 230 231sub create_table 232{ 233 my ($dbh)= @_; 234 my ($sth, $query); 235 236 $query= <<EOF; 237CREATE TABLE my_mail 238( 239 mail_id MEDIUMINT UNSIGNED NOT NULL auto_increment, 240 message_id VARCHAR(255), 241 in_reply_to VARCHAR(255), 242 date DATETIME NOT NULL, 243 time_zone VARCHAR(20), 244 mail_from VARCHAR(120) NOT NULL, 245 reply VARCHAR(120), 246 mail_to TEXT, 247 cc TEXT, 248 sbj VARCHAR(200), 249 txt MEDIUMTEXT NOT NULL, 250 file VARCHAR(64) NOT NULL, 251 hash INTEGER NOT NULL, 252 KEY (mail_id), 253 KEY (message_id), 254 KEY (in_reply_to), 255 PRIMARY KEY (mail_from, date, hash)) 256 ENGINE=MyISAM COMMENT='' 257EOF 258 $sth = $dbh->prepare($query) or die $DBI::errstr; 259 $sth->execute() or die "Couldn't create table: $DBI::errstr\n"; 260} 261 262#### 263#### inbox processing. Can be either a real file, or standard input. 264#### 265 266sub process_mail_file 267{ 268 my ($dbh, $file_name) = @_; 269 my (%values, $type, $check); 270 271 $file_name =~ s/.*[\/]// if ($opt_no_path); 272 273 %values = (); 274 $type = ""; 275 $check = 0; 276 while (<FILE>) 277 { 278 chop; 279 chop if (substr($_, -1, 1) eq "\r"); 280 if ($type ne "message") 281 { 282 if (/^Reply-To:\s*(.*)/i) 283 { 284 $type = "reply"; 285 $values{$type} = $1; 286 } 287 elsif (/^From: (.*)/i) 288 { 289 $type = "from"; 290 $values{$type} = $1; 291 } 292 elsif (/^To: (.*)/i) 293 { 294 $type = "to"; 295 $values{$type} = $1; 296 } 297 elsif (/^Cc: (.*)/i) 298 { 299 $type = "cc"; 300 $values{$type} = $1; 301 } 302 elsif (/^Subject: (.*)/i) 303 { 304 $type = "subject"; 305 $values{$type} = $1; 306 } 307 elsif (/^Message-Id:\s*(.*)/i) 308 { 309 $type = "message_id"; 310 s/^\s*(<.*>)\s*/$1/; 311 $values{$type} = $1; 312 } 313 elsif (/^In-Reply-To:\s*(.*)/i) 314 { 315 $type = "in_reply_to"; 316 s/^\s*(<.*>)\s*/$1/; 317 $values{$type} = $1; 318 } 319 elsif (/^Date: (.*)/i) 320 { 321 date_parser($1, \%values, $file_name); 322 $type = "rubbish"; 323 } 324 # Catch those fields that we don't or can't handle (yet) 325 elsif (/^[\w\W-]+:/) 326 { 327 $type = "rubbish"; 328 } 329 elsif ($_ eq "") 330 { 331 $type = "message"; 332 $values{$type} = ""; 333 } 334 else 335 { 336 s/^\s*/ /; 337 if ($type eq 'message_id' || $type eq 'in_reply_to') 338 { 339 s/^\s*(<.*>)\s*/$1/; 340 } 341 $values{$type} .= $_; 342 } 343 } 344 elsif ($check != 0 && $_ ne "") # in case of forwarded messages 345 { 346 $values{$type} .= "\n" . $_; 347 $check--; 348 } 349 elsif (/^From .* \d\d:\d\d:\d\d\s\d\d\d\d/ || 350 /^From .* \d\d\d\d\s\d\d:\d\d:\d\d/) 351 { 352 $values{'hash'} = checksum("$values{'message'}"); 353 update_table($dbh, $file_name, \%values); 354 %values = (); 355 $type = ""; 356 $check = 0; 357 } 358 elsif (/-* forwarded message .*-*/i) # in case of forwarded messages 359 { 360 $values{$type} .= "\n" . $_; 361 $check++; 362 $mail_forwarded++; 363 } 364 else 365 { 366 $values{$type} .= "\n" . $_; 367 } 368 } 369 if (defined($values{'message'})) 370 { 371 $values{'hash'} = checksum("$values{'message'}"); 372 update_table($dbh, $file_name, \%values); 373 } 374} 375 376#### 377#### get date and timezone 378#### 379 380sub date_parser 381{ 382 my ($date_raw, $values, $file_name, $tmp) = @_; 383 384 # If you ever need to change this test, be especially careful with 385 # the timezone; it may be just a number (-0600), or just a name (EET), or 386 # both (-0600 (EET), or -0600 (EET GMT)), or without parenthesis: GMT. 387 # You probably should use a 'greedy' regexp in the end 388 $date_raw =~ /^\D*(\d{1,2})\s+(\w+)\s+(\d{2,4})\s+(\d+:\d+)(:\d+)?\s*(\S+.*)?/; 389 390 if (!defined($1) || !defined($2) || !defined($3) || !defined($4) || 391 !defined($months{$2})) 392 { 393 if ($opt_debug || $opt_stop_on_error) 394 { 395 print "FAILED: date_parser: 1: $1 2: $2 3: $3 4: $4 5: $5\n"; 396 print "months{2}: $months{$2}\n"; 397 print "date_raw: $date_raw\n"; 398 print "Inbox filename: $file_name\n"; 399 } 400 exit(1) if ($opt_stop_on_error); 401 $values->{'date'} = ""; 402 $values->{'time_zone'} = ""; 403 return; 404 } 405 $tmp = $3 . "-" . $months{$2} . "-" . "$1 $4"; 406 $tmp.= defined($5) ? $5 : ":00"; 407 $values->{'date'} = $tmp; 408 print "INSERTING DATE: $tmp\n" if ($opt_debug); 409 $values->{'time_zone'} = $6; 410} 411 412#### 413#### Insert to table 414#### 415 416sub update_table 417{ 418 my($dbh, $file_name, $values) = @_; 419 my($q, $tail, $message); 420 421 if (!defined($values->{'subject'}) || !defined($values->{'to'})) 422 { 423 $mail_no_subject_f++; 424 return; # Ignore these 425 } 426 $message = $values->{'message'}; 427 $message =~ s/^\s*//; # removes whitespaces from the beginning 428 429 restart: 430 $message =~ s/[\s\n>]*$//; # removes whitespaces and '>' from the end 431 $values->{'message'} = $message; 432 foreach $tail (@remove_tail) 433 { 434 $message =~ s/$tail//; 435 } 436 if ($message ne $values->{'message'}) 437 { 438 $message =~ s/\s*$//; # removes whitespaces from the end 439 $mail_fixed++; 440 goto restart; # Some mails may have duplicated messages 441 } 442 443 $q = "INSERT INTO my_mail ("; 444 $q.= "mail_id,"; 445 $q.= "message_id,"; 446 $q.= "in_reply_to,"; 447 $q.= "date,"; 448 $q.= "time_zone,"; 449 $q.= "mail_from,"; 450 $q.= "reply,"; 451 $q.= "mail_to,"; 452 $q.= "cc,"; 453 $q.= "sbj,"; 454 $q.= "txt,"; 455 $q.= "file,"; 456 $q.= "hash"; 457 $q.= ") VALUES ("; 458 $q.= "NULL,"; 459 $q.= (defined($values->{'message_id'}) ? 460 $dbh->quote($values->{'message_id'}) : "NULL"); 461 $q.= ","; 462 $q.= (defined($values->{'in_reply_to'}) ? 463 $dbh->quote($values->{'in_reply_to'}) : "NULL"); 464 $q.= ","; 465 $q.= "'" . $values->{'date'} . "',"; 466 $q.= (defined($values->{'time_zone'}) ? 467 $dbh->quote($values->{'time_zone'}) : "NULL"); 468 $q.= ","; 469 $q.= defined($values->{'from'}) ? $dbh->quote($values->{'from'}) : "NULL"; 470 $q.= ","; 471 $q.= defined($values->{'reply'}) ? $dbh->quote($values->{'reply'}) : "NULL"; 472 $q.= ","; 473 $q.= defined($values->{'to'}) ? $dbh->quote($values->{'to'}) : "NULL"; 474 $q.= ","; 475 $q.= defined($values->{'cc'}) ? $dbh->quote($values->{'cc'}) : "NULL"; 476 $q.= ","; 477 $q.= $dbh->quote($values->{'subject'}); 478 $q.= ","; 479 $q.= $dbh->quote($message); 480 $q.= ","; 481 $q.= $dbh->quote($file_name); 482 $q.= ","; 483 $q.= "'" . $values->{'hash'} . "'"; 484 $q.= ")"; 485 486 # Don't insert mails bigger than $opt_max_mail_size 487 if (length($message) > $opt_max_mail_size) 488 { 489 $mail_too_big++; 490 } 491 # Don't insert mails without 'From' field 492 elsif (!defined($values->{'from'}) || $values->{'from'} eq "") 493 { 494 $mail_no_from_f++; 495 } 496 elsif ($opt_test) 497 { 498 print "$q\n"; 499 $mail_inserted++; 500 } 501 # Don't insert mails without the 'message' 502 elsif ($message eq "") 503 { 504 $mail_no_txt_f++; 505 } 506 elsif ($dbh->do($q)) 507 { 508 $mail_inserted++; 509 } 510 # This should never happen. This means that the above q failed, 511 # but it wasn't because of a duplicate mail entry 512 elsif (!($DBI::errstr =~ /Duplicate entry /)) 513 { 514 die "FATAL: Got error :$DBI::errstr\nAttempted query was: $q\n"; 515 } 516 else 517 { 518 $mail_duplicates++; 519 print "Duplicate mail: query: $q\n" if ($opt_debug); 520 } 521 $q = ""; 522} 523 524#### 525#### In case you have two identical messages we wanted to identify them 526#### and remove additionals; We do this by calculating a hash number of the 527#### message and ignoring messages with the same from, date and hash. 528#### This function calculates a simple 32 bit hash value for the message. 529#### 530 531sub checksum 532{ 533 my ($txt)= @_; 534 my ($crc, $i, $count); 535 $count = length($txt); 536 for ($crc = $i = 0; $i < $count ; $i++) 537 { 538 $crc = (($crc << 1) + (ord (substr ($txt, $i, 1)))) + 539 (($crc & (1 << 30)) ? 1 : 0); 540 $crc &= ((1 << 31) -1); 541 } 542 return $crc; 543} 544 545#### 546#### my_which is used, because we can't assume that every system has the 547#### which -command. my_which can take only one argument at a time. 548#### Return values: requested system command with the first found path, 549#### or undefined, if not found. 550#### 551 552sub my_which 553{ 554 my ($command) = @_; 555 my (@paths, $path); 556 557 return $command if (-f $command && -x $command); 558 @paths = split(':', $ENV{'PATH'}); 559 foreach $path (@paths) 560 { 561 $path = "." if ($path eq ""); 562 $path .= "/$command"; 563 return $path if (-f $path && -x $path); 564 } 565 return undef(); 566} 567 568#### 569#### usage and version 570#### 571 572sub usage 573{ 574 my ($VER)= @_; 575 576 if ($opt_version) 577 { 578 print "$progname version $VER\n"; 579 } 580 else 581 { 582 print <<EOF; 583$progname version $VER 584 585Description: Insert mails from inbox file(s) into a table. This program 586can read group [mail_to_db] from the my.cnf file. You may want to have db 587and table set there at least. 588 589Usage: $progname [options] file1 [file2 file3 ...] 590or: $progname [options] --create [file1 file2...] 591or: cat inbox | $progname [options] --stdin 592 593The last example can be used to read mails from standard input and can 594useful when inserting mails to database via a program 'on-the-fly'. 595The filename will be 'READ-FROM-STDIN' in this case. 596 597Options: 598--help Show this help and exit. 599--version Show the version number and exit. 600--debug Print some extra information during the run. 601--host=... Hostname to be used. 602--port=# TCP/IP port to be used with connection. 603--socket=... MySQL UNIX socket to be used with connection. 604--db=... Database to be used. 605--user=... Username for connecting. 606--password=... Password for the user. 607--stdin Read mails from stdin. 608--max_mail_size=# Maximum size of a mail in bytes. 609 Beware of the downside letting this variable be too big; 610 you may easily end up inserting a lot of attached 611 binary files (like MS Word documents etc), which take 612 space, make the database slower and are not really 613 searchable anyway. (Default $opt_max_mail_size) 614--create Create the mails table. This can be done with the first run. 615--test Dry run. Print the queries and the result as it would be. 616--no_path When inserting the file name, leave out any paths of 617 the name. 618--stop_on_error Stop the run, if an unexpected, but not fatal error occurs 619 during the run. Without this option some fields may get 620 unwanted values. --debug will also report about these. 621EOF 622 } 623 exit(0); 624} 625