1 ///###////////////////////////////////////////////////////////////////////////
2 //
3 // Burton Computer Corporation
4 // http://www.burton-computer.com
5 // http://www.cooldevtools.com
6 // $Id: Command_auto_train.cc 272 2007-01-06 19:37:27Z brian $
7 //
8 // Copyright (C) 2007 Burton Computer Corporation
9 // ALL RIGHTS RESERVED
10 //
11 // This program is open source software; you can redistribute it
12 // and/or modify it under the terms of the Q Public License (QPL)
13 // version 1.0. Use of this software in whole or in part, including
14 // linking it (modified or unmodified) into other programs is
15 // subject to the terms of the QPL.
16 //
17 // This program is distributed in the hope that it will be useful,
18 // but WITHOUT ANY WARRANTY; without even the implied warranty of
19 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
20 // Q Public License for more details.
21 //
22 // You should have received a copy of the Q Public License
23 // along with this program; see the file LICENSE.txt.  If not, visit
24 // the Burton Computer Corporation or CoolDevTools web site
25 // QPL pages at:
26 //
27 //    http://www.burton-computer.com/qpl.html
28 //    http://www.cooldevtools.com/qpl.html
29 //
30 
31 #include <stdexcept>
32 #include "AutoTrainMailMessageReader.h"
33 #include "AutoPurger.h"
34 #include "ParserConfig.h"
35 #include "IstreamCharReader.h"
36 #include "TraditionalMailMessageParser.h"
37 #include "LineReader.h"
38 #include "MD5Digester.h"
39 #include "MailMessageDigester.h"
40 #include "Command_spam.h"
41 #include "WordData.h"
42 #include "RegularExpression.h"
43 #include "SpamFilter.h"
44 #include "FrequencyDB.h"
45 #include "CommandConfig.h"
46 #include "ConfigManager.h"
47 #include "SpamFilter.h"
48 #include "MailMessageReaderFactory.h"
49 #include "Command_auto_train.h"
50 
createAutoTrainCommand()51 const Ref<AbstractCommand> Command_auto_train::createAutoTrainCommand()
52 {
53   return Ref<AbstractCommand>(new Command_auto_train("auto-train",
54                                                      "auto-train    Builds database using train mode.",
55                                                      "auto-train GOOD filename... SPAM filename...\n"
56                                                      "    Attempts to efficiently build a database from all of the named\n"
57                                                      "    files.  You may specify one or more file of each type.  Prior to\n"
58                                                      "    each set of file names you must include the word SPAM or GOOD to\n"
59                                                      "    indicate what type of mail is contained in the files which follow\n"
60                                                      "    on the command line.\n"
61                                                      "\n"
62                                                      "    The case of the SPAM and GOOD keywords is important.  Any number of\n"
63                                                      "    file names can be specified between the keywords.  The command line\n"
64                                                      "    format is very flexible.  You can even use a find command in\n"
65                                                      "    backticks to process whole directory trees of files. For example:\n"
66                                                      "\n"
67                                                      "      spamprobe auto-train SPAM spams/* GOOD `find hams -type f`\n"
68                                                      "\n"
69                                                      "    SpamProbe pre-scans the files to determine how many emails of each\n"
70                                                      "    type exist and then trains on hams and spams in a random sequence\n"
71                                                      "    that balances the inflow of each type so that the train command can\n"
72                                                      "    work most effectively.  For example if you had 400 hams and 400\n"
73                                                      "    spams, auto-train will generally process one spam, then one ham,\n"
74                                                      "    etc.  If you had 4000 spams and 400 hams then auto-train will\n"
75                                                      "    generally process 10 spams, then one ham, etc.\n"
76                                                      "\n"
77                                                      "    Since this command will likely take a long time to run it is often\n"
78                                                      "    desireable to use it with the -v option to see progress information\n"
79                                                      "    as the messages are processed.\n"
80                                                      "\n"
81                                                      "      spamprobe -v auto-train SPAM spams/* GOOD hams/* \n",
82                                                      false));
83 }
84 
createAutoLearnCommand()85 const Ref<AbstractCommand> Command_auto_train::createAutoLearnCommand()
86 {
87   return Ref<AbstractCommand>(new Command_auto_train("auto-learn",
88                                                      "auto-learn    Builds database using learn mode.",
89                                                      "auto-learn GOOD filename... SPAM filename...\n"
90                                                      "    Similar to auto-train but adds all messages in all specified files to\n"
91                                                      "    the database, even if they are easily determined to be good or spam.\n"
92                                                      "    Generally speaking auto-train should be used instead of auto-learn.\n",
93                                                      true));
94 }
95 
logMessage(const ConfigManager & config,SpamFilter & filter,const string & type,bool correct,Message & message)96 void Command_auto_train::logMessage(const ConfigManager &config,
97                                     SpamFilter &filter,
98                                     const string &type,
99                                     bool correct,
100                                     Message &message)
101 {
102   string subject;
103 
104   const string::size_type MAX_SUBJECT_LENGTH = 20;
105   message.getHeader("subject", subject);
106   if (subject.length() > MAX_SUBJECT_LENGTH) {
107     subject.erase(MAX_SUBJECT_LENGTH, string::npos);
108     subject += "...";
109   }
110 
111   cout << type;
112 
113   if (correct) {
114     cout << " PASS";
115   } else {
116     cout << " FAIL";
117   }
118 
119   cout << " DIGEST " << message.getDigest()
120        << " SUBJECT " << subject;
121 
122   cout << endl;
123 
124   if (config.commandConfig()->shouldShowTerms() && message.getTopTokenCount() > 0) {
125     printTerms(filter.getDB(), cout, message, "    ");
126   }
127 }
128 
argsOK(const ConfigManager & config)129 bool Command_auto_train::argsOK(const ConfigManager &config)
130 {
131   const int num_args = config.commandConfig()->numArgs();
132   bool answer = true;
133   for (int i = 0; i < num_args; ++i) {
134     string arg(config.commandConfig()->arg(i));
135     if (arg != "LOG" && arg != "SPAM" && arg != "GOOD" && !MailMessageReaderFactory::isAcceptableFile(File(arg))) {
136       cerr << "error: " << arg << " is not a file" << endl;
137       answer = false;
138     }
139   }
140 
141   return answer;
142 }
143 
execute(const ConfigManager & config,SpamFilter & filter)144 int Command_auto_train::execute(const ConfigManager &config,
145                                 SpamFilter &filter)
146 {
147   openDatabase(config, filter);
148 
149   Ptr<AutoTrainMailMessageReader> mail_reader(new AutoTrainMailMessageReader());
150   bool is_spam_file = false;
151   bool should_log = false;
152   for (int i = 0, limit = config.commandConfig()->numArgs(); i < limit; ++i) {
153     string arg(config.commandConfig()->arg(i));
154     if (arg == "LOG") {
155       should_log = true;
156     } else if (arg == "SPAM") {
157       is_spam_file = true;
158     } else if (arg == "GOOD") {
159       is_spam_file = false;
160     } else {
161       File file(arg);
162       if (!MailMessageReaderFactory::isAcceptableFile(file)) {
163         throw runtime_error(string("file does not exist: ") + arg);
164       }
165       mail_reader->addMailboxFile(is_spam_file, arg);
166     }
167   }
168 
169   Ref<AbstractMessageCommand> good_command;
170   Ref<AbstractMessageCommand> spam_command;
171 
172   if (m_isLearn) {
173     good_command = Command_spam::createGoodCommand();
174     spam_command = Command_spam::createSpamCommand();
175   } else {
176     good_command = Command_spam::createTrainGoodCommand();
177     spam_command = Command_spam::createTrainSpamCommand();
178   }
179 
180   bool is_message_spam = false;
181   int message_num = 0;
182 
183   ParserConfig *parser_config = config.parserConfig();
184 
185   TraditionalMailMessageParser parser(parser_config);
186   MailMessageDigester digester;
187 
188   AutoPurger purger(config, filter);
189   int cumulative_message_count = 0;
190   Ptr<MailMessage> mail_message;
191   mail_message.set(mail_reader->readMessage());
192   while (mail_message.isNotNull()) {
193       Ptr<Message> msg(parser.parseMailMessage(mail_message.get()));
194       msg->setSource(mail_message.release());
195       digester.assignDigestToMessage(msg.get(), msg->source(), parser_config->spamprobeFieldName());
196 
197       SpamFilter::Score score;
198       if (should_log) {
199         score = filter.scoreMessage(*msg);
200       }
201       bool scored_as_spam = should_log && filter.scoreMessage(*msg).isSpam();
202       bool is_spam = mail_reader->messageWasSpam();
203       if (should_log) {
204         logMessage(config, filter, is_spam ? "SPAM" : "GOOD", score.isSpam() == is_spam, *msg);
205       }
206       if (is_spam) {
207         spam_command->processMessage(config, filter, &mail_reader->messageFile(), *msg, message_num, is_spam);
208       } else {
209         good_command->processMessage(config, filter, &mail_reader->messageFile(), *msg, message_num, is_spam);
210       }
211       purger.processedMessage();
212 
213       mail_message.set(mail_reader->readMessage());
214   }
215   purger.finish();
216   return 0;
217 }
218