#!/usr/bin/perl -w

# split-corpora: read messages from one or more mail folders (directories
# or mbox files) and distribute them round-robin into N mbox-format output
# files named "<prefix>.1" .. "<prefix>.N".
#
# Options:
#   -n num_buckets     number of output files (default 2)
#   -p outfile_prefix  output file name prefix (default "bucket")
#   -l max_messages    stop after writing this many messages
#   -h                 print usage and exit

use FindBin;
use lib "$FindBin::Bin/../lib";

use strict;

use Mail::SpamAssassin::ArchiveIterator;
use Getopt::Std;
use FileHandle;

###########

# Print a one-line usage summary to STDERR and exit with status 1.
sub usage {
  print STDERR "split-corpora [-n num_buckets] [-p outfile_prefix] ".
               "[-l max_messages] ".
               "folder1 ....\n";
  exit(1);
} # usage()

###########

our ($opt_n, $opt_p, $opt_h, $opt_l);

# getopts() (not getopt()) is the Getopt::Std entry point that understands
# the "x:" syntax: n, p and l take a value, h is a plain boolean flag.
# With getopt() every listed character would be treated as taking an
# argument, so "-h" would swallow the next command-line word.
getopts('n:p:l:h') or usage();

usage() if ($opt_h);

my $num_buckets = $opt_n || 2;
my $prefix = $opt_p || "bucket";
my @IN_FILES = @ARGV;

usage() if (@IN_FILES == 0);

# Reject values that would otherwise surface later as obscure failures
# (no buckets opened, "Illegal modulus zero" in wanted(), ...).
usage() if ($num_buckets !~ /^\d+$/ || $num_buckets < 1);
usage() if (defined $opt_l && $opt_l !~ /^\d+$/);

# Build ArchiveIterator target specs of the form "class:format:location".
# Every input is labelled "ham"; this script never looks at the class.
my @targets = ();
foreach my $folder (@IN_FILES) {
  if (-d $folder) {
    push (@targets, "ham:dir:$folder");
  } else {
    push (@targets, "ham:mbox:$folder");
  }
}

# Open one output file per bucket, truncating any existing file.
my @bucket_fhs = ();
foreach my $bucket (1 .. $num_buckets) {
  my $bucket_fh = FileHandle->new();

  # Pass the mode separately so characters in the prefix can never be
  # interpreted as part of the open mode.
  if (!$bucket_fh->open("$prefix.$bucket", '>')) {
    die "Could not open '$prefix.$bucket' for writing: $!\n";
  }

  push(@bucket_fhs, $bucket_fh);
} # foreach my $bucket (1 .. $num_buckets)

# Index into @bucket_fhs of the bucket that receives the next message.
my $current_bucket = 0;

# NOTE(review): 'opt_all' presumably tells the iterator not to skip any
# messages (e.g. oversized ones) -- confirm against ArchiveIterator docs.
my $iter = Mail::SpamAssassin::ArchiveIterator->new({
        'opt_all' => 1,
  });

# wanted() handles each message; the second callback is a deliberate no-op.
$iter->set_functions(\&wanted, sub { });

# Number of messages written so far.
my $messagecount = 0;

# wanted() dies with 'HITLIMIT' to abort the scan once -l is reached;
# that is the only exception swallowed here, anything else is re-thrown.
eval {
  $iter->run(@targets);
};
if ($@) { die $@ unless ($@ =~ /HITLIMIT/); }

# Buffered write errors (disk full, etc.) only surface at close time,
# so the result of close() must be checked.
foreach my $bucket (1 .. scalar(@bucket_fhs)) {
  if (!$bucket_fhs[$bucket - 1]->close()) {
    die "Could not close '$prefix.$bucket': $!\n";
  }
}
if ($opt_l && $messagecount < $opt_l) {
  warn "warning: only found $messagecount messages instead of $opt_l\n";
}

#############################################

# Per-message callback for the ArchiveIterator.
#
# Only the fourth argument is used: a reference to an array holding the
# message text, which is written to the current bucket.  Dies with
# 'HITLIMIT' (caught by the caller) once -l messages have been written.
sub wanted {
  my (undef, undef, undef, $data_ref) = @_;

  # Stop before writing message number max_messages + 1.
  if ($opt_l && $messagecount >= $opt_l) { die 'HITLIMIT'; }
  $messagecount++;

  # Make sure message can be used for outputting mbox format: if it does
  # not already start with an mbox "From " separator line, add a dummy one.
  if ($data_ref->[0] !~ /^From \S+ +... ... /) {
    unshift(@$data_ref, "From abc\@xyz.com Mon Jan 1 00:00:00 2000\n");
  }

  $bucket_fhs[$current_bucket]->print( join("", @$data_ref) )
    or die "Could not write to '$prefix." . ($current_bucket + 1) . "': $!\n";

  # Round-robin to the next bucket.
  $current_bucket = ($current_bucket + 1) % $num_buckets;
} # wanted()