1#!/usr/bin/perl -w
2
3use FindBin;
4use lib "$FindBin::Bin/../lib";
5
6use strict;
7
8use Mail::SpamAssassin::ArchiveIterator;
9use Getopt::Std;
10use FileHandle;
11
12###########
13
# usage(): write the one-line command synopsis to STDERR and terminate the
# script with exit status 1.  This sub never returns to its caller.
sub usage {
  my @synopsis = (
    'split-corpora',
    '[-n num_buckets]',
    '[-p outfile_prefix]',
    '[-l max_messages]',
    'folder1',
    '....',
  );

  print STDERR join(' ', @synopsis), "\n";
  exit(1);
} # usage()
20
21###########
22
# Command-line switches, filled in by Getopt::Std::getopts() below:
#   -n num      number of bucket files to create (default 2)
#   -p prefix   bucket file name prefix (default "bucket")
#   -l max      message limit, checked in wanted()
#   -h          print the usage line and exit
our ($opt_n, $opt_p, $opt_h, $opt_l);

# getopts() (not getopt()) is the function that understands the "x:"
# notation: n, p and l take a value while h is a plain flag.  getopt() treats
# every character in its argument as value-taking, so "-h" would swallow the
# following argument.  getopts() returns false when it meets an unknown
# switch, which is handled as a usage error.
getopts('n:p:l:h') or usage();

usage() if ($opt_h);

my $num_buckets = $opt_n || 2;
my $prefix      = $opt_p || "bucket";
my @IN_FILES    = @ARGV;

# The bucket count drives a "1 .. $num_buckets" loop and a modulo, so it has
# to be a positive whole number; the message limit has to be a whole number.
usage() if ($num_buckets !~ /^\d+$/ || $num_buckets < 1);
usage() if (defined $opt_l && $opt_l !~ /^\d+$/);

# At least one input folder is required.
usage() if (@IN_FILES == 0);
34
# Turn every input path into an ArchiveIterator target string of the form
# "ham:<format>:<path>": directories are tagged "dir", anything else "mbox".
my @targets = map {
  my $format = (-d $_) ? 'dir' : 'mbox';
  "ham:$format:$_";
} @IN_FILES;
43
# Open one output file per bucket, named "<prefix>.1" .. "<prefix>.<n>".
# The handles are stored in @bucket_fhs in bucket order (index 0 = bucket 1).
my @bucket_fhs = ();
foreach my $bucket (1 .. $num_buckets) {
  my $bucket_file = "$prefix.$bucket";
  my $bucket_fh   = FileHandle->new();

  # The mode is passed separately from the file name so that characters in
  # the user-supplied prefix can never be interpreted as part of the mode.
  if (!$bucket_fh->open($bucket_file, '>')) {
    die "Could not open '$bucket_file' for writing: $!\n";
  }

  push(@bucket_fhs, $bucket_fh);
} # foreach my $bucket (1 .. $num_buckets)
54
# Index into @bucket_fhs of the bucket that receives the next message;
# wanted() advances it round-robin.
my $current_bucket = 0;

# NOTE(review): 'opt_all' presumably tells ArchiveIterator not to skip any
# messages -- confirm against the ArchiveIterator documentation.
my $iter = Mail::SpamAssassin::ArchiveIterator->new({
        'opt_all' => 1,
  });

# wanted() is registered as the first callback; the second one is a no-op.
$iter->set_functions(\&wanted, sub { });

# Message counter maintained by wanted() when a -l limit is in effect.
my $messagecount = 0;

# wanted() aborts the scan by dying with 'HITLIMIT' once the -l limit is
# reached; that is an expected way to stop, any other error is re-thrown.
eval {
  $iter->run(@targets);
};
if ($@) { die $@ unless ($@ =~ /HITLIMIT/); }

# Buffered write errors (for example a full disk) only show up when the
# handle is closed, so a failing close must not go unnoticed.
foreach my $idx (0 .. $#bucket_fhs) {
  $bucket_fhs[$idx]->close()
    or die "Could not close '$prefix." . ($idx + 1) . "' after writing: $!\n";
}
if ($opt_l && $messagecount < $opt_l) {
  warn "warning: only found $messagecount messages instead of $opt_l\n";
}
75
76#############################################
77
# wanted(): per-message callback registered with the ArchiveIterator.
#
# Arguments (positional): the first three are not used here; the fourth is a
# reference to an array holding the lines of the message.
#
# Appends the message, in mbox format, to the current bucket file and then
# moves on to the next bucket in round-robin order.
#
# Dies with 'HITLIMIT' once -l max_messages messages have been written; the
# caller treats that particular error as a normal end of the scan.
sub wanted {
  my (undef, undef, undef, $data_ref) = @_;

  # Check the limit before counting, so that exactly $opt_l messages are
  # written and $messagecount equals the number of messages written.
  if ($opt_l && $messagecount >= $opt_l) { die 'HITLIMIT'; }
  $messagecount++;

  # Make sure message can be used for outputting mbox format: prepend a
  # placeholder "From " separator line when the message does not start
  # with one (an empty message gets the separator as well).
  if (!defined $data_ref->[0] || $data_ref->[0] !~ /^From \S+ +... ... /) {
    unshift(@$data_ref, "From abc\@xyz.com Mon Jan  1 00:00:00 2000\n");
  }

  $bucket_fhs[$current_bucket]->print( join("", @$data_ref) );

  $current_bucket = ($current_bucket + 1) % $num_buckets;
} # wanted()
92
93