1 ///###////////////////////////////////////////////////////////////////////////
2 //
3 // Burton Computer Corporation
4 // http://www.burton-computer.com
5 // http://www.cooldevtools.com
6 // $Id: Command_auto_train.cc 272 2007-01-06 19:37:27Z brian $
7 //
8 // Copyright (C) 2007 Burton Computer Corporation
9 // ALL RIGHTS RESERVED
10 //
11 // This program is open source software; you can redistribute it
12 // and/or modify it under the terms of the Q Public License (QPL)
13 // version 1.0. Use of this software in whole or in part, including
14 // linking it (modified or unmodified) into other programs is
15 // subject to the terms of the QPL.
16 //
17 // This program is distributed in the hope that it will be useful,
18 // but WITHOUT ANY WARRANTY; without even the implied warranty of
19 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20 // Q Public License for more details.
21 //
22 // You should have received a copy of the Q Public License
23 // along with this program; see the file LICENSE.txt. If not, visit
24 // the Burton Computer Corporation or CoolDevTools web site
25 // QPL pages at:
26 //
27 // http://www.burton-computer.com/qpl.html
28 // http://www.cooldevtools.com/qpl.html
29 //
30
31 #include <stdexcept>
32 #include "AutoTrainMailMessageReader.h"
33 #include "AutoPurger.h"
34 #include "ParserConfig.h"
35 #include "IstreamCharReader.h"
36 #include "TraditionalMailMessageParser.h"
37 #include "LineReader.h"
38 #include "MD5Digester.h"
39 #include "MailMessageDigester.h"
40 #include "Command_spam.h"
41 #include "WordData.h"
42 #include "RegularExpression.h"
43 #include "SpamFilter.h"
44 #include "FrequencyDB.h"
45 #include "CommandConfig.h"
46 #include "ConfigManager.h"
47 #include "SpamFilter.h"
48 #include "MailMessageReaderFactory.h"
49 #include "Command_auto_train.h"
50
createAutoTrainCommand()51 const Ref<AbstractCommand> Command_auto_train::createAutoTrainCommand()
52 {
53 return Ref<AbstractCommand>(new Command_auto_train("auto-train",
54 "auto-train Builds database using train mode.",
55 "auto-train GOOD filename... SPAM filename...\n"
56 " Attempts to efficiently build a database from all of the named\n"
57 " files. You may specify one or more file of each type. Prior to\n"
58 " each set of file names you must include the word SPAM or GOOD to\n"
59 " indicate what type of mail is contained in the files which follow\n"
60 " on the command line.\n"
61 "\n"
62 " The case of the SPAM and GOOD keywords is important. Any number of\n"
63 " file names can be specified between the keywords. The command line\n"
64 " format is very flexible. You can even use a find command in\n"
65 " backticks to process whole directory trees of files. For example:\n"
66 "\n"
67 " spamprobe auto-train SPAM spams/* GOOD `find hams -type f`\n"
68 "\n"
69 " SpamProbe pre-scans the files to determine how many emails of each\n"
70 " type exist and then trains on hams and spams in a random sequence\n"
71 " that balances the inflow of each type so that the train command can\n"
72 " work most effectively. For example if you had 400 hams and 400\n"
73 " spams, auto-train will generally process one spam, then one ham,\n"
74 " etc. If you had 4000 spams and 400 hams then auto-train will\n"
75 " generally process 10 spams, then one ham, etc.\n"
76 "\n"
77 " Since this command will likely take a long time to run it is often\n"
78 " desireable to use it with the -v option to see progress information\n"
79 " as the messages are processed.\n"
80 "\n"
81 " spamprobe -v auto-train SPAM spams/* GOOD hams/* \n",
82 false));
83 }
84
createAutoLearnCommand()85 const Ref<AbstractCommand> Command_auto_train::createAutoLearnCommand()
86 {
87 return Ref<AbstractCommand>(new Command_auto_train("auto-learn",
88 "auto-learn Builds database using learn mode.",
89 "auto-learn GOOD filename... SPAM filename...\n"
90 " Similar to auto-train but adds all messages in all specified files to\n"
91 " the database, even if they are easily determined to be good or spam.\n"
92 " Generally speaking auto-train should be used instead of auto-learn.\n",
93 true));
94 }
95
logMessage(const ConfigManager & config,SpamFilter & filter,const string & type,bool correct,Message & message)96 void Command_auto_train::logMessage(const ConfigManager &config,
97 SpamFilter &filter,
98 const string &type,
99 bool correct,
100 Message &message)
101 {
102 string subject;
103
104 const string::size_type MAX_SUBJECT_LENGTH = 20;
105 message.getHeader("subject", subject);
106 if (subject.length() > MAX_SUBJECT_LENGTH) {
107 subject.erase(MAX_SUBJECT_LENGTH, string::npos);
108 subject += "...";
109 }
110
111 cout << type;
112
113 if (correct) {
114 cout << " PASS";
115 } else {
116 cout << " FAIL";
117 }
118
119 cout << " DIGEST " << message.getDigest()
120 << " SUBJECT " << subject;
121
122 cout << endl;
123
124 if (config.commandConfig()->shouldShowTerms() && message.getTopTokenCount() > 0) {
125 printTerms(filter.getDB(), cout, message, " ");
126 }
127 }
128
argsOK(const ConfigManager & config)129 bool Command_auto_train::argsOK(const ConfigManager &config)
130 {
131 const int num_args = config.commandConfig()->numArgs();
132 bool answer = true;
133 for (int i = 0; i < num_args; ++i) {
134 string arg(config.commandConfig()->arg(i));
135 if (arg != "LOG" && arg != "SPAM" && arg != "GOOD" && !MailMessageReaderFactory::isAcceptableFile(File(arg))) {
136 cerr << "error: " << arg << " is not a file" << endl;
137 answer = false;
138 }
139 }
140
141 return answer;
142 }
143
execute(const ConfigManager & config,SpamFilter & filter)144 int Command_auto_train::execute(const ConfigManager &config,
145 SpamFilter &filter)
146 {
147 openDatabase(config, filter);
148
149 Ptr<AutoTrainMailMessageReader> mail_reader(new AutoTrainMailMessageReader());
150 bool is_spam_file = false;
151 bool should_log = false;
152 for (int i = 0, limit = config.commandConfig()->numArgs(); i < limit; ++i) {
153 string arg(config.commandConfig()->arg(i));
154 if (arg == "LOG") {
155 should_log = true;
156 } else if (arg == "SPAM") {
157 is_spam_file = true;
158 } else if (arg == "GOOD") {
159 is_spam_file = false;
160 } else {
161 File file(arg);
162 if (!MailMessageReaderFactory::isAcceptableFile(file)) {
163 throw runtime_error(string("file does not exist: ") + arg);
164 }
165 mail_reader->addMailboxFile(is_spam_file, arg);
166 }
167 }
168
169 Ref<AbstractMessageCommand> good_command;
170 Ref<AbstractMessageCommand> spam_command;
171
172 if (m_isLearn) {
173 good_command = Command_spam::createGoodCommand();
174 spam_command = Command_spam::createSpamCommand();
175 } else {
176 good_command = Command_spam::createTrainGoodCommand();
177 spam_command = Command_spam::createTrainSpamCommand();
178 }
179
180 bool is_message_spam = false;
181 int message_num = 0;
182
183 ParserConfig *parser_config = config.parserConfig();
184
185 TraditionalMailMessageParser parser(parser_config);
186 MailMessageDigester digester;
187
188 AutoPurger purger(config, filter);
189 int cumulative_message_count = 0;
190 Ptr<MailMessage> mail_message;
191 mail_message.set(mail_reader->readMessage());
192 while (mail_message.isNotNull()) {
193 Ptr<Message> msg(parser.parseMailMessage(mail_message.get()));
194 msg->setSource(mail_message.release());
195 digester.assignDigestToMessage(msg.get(), msg->source(), parser_config->spamprobeFieldName());
196
197 SpamFilter::Score score;
198 if (should_log) {
199 score = filter.scoreMessage(*msg);
200 }
201 bool scored_as_spam = should_log && filter.scoreMessage(*msg).isSpam();
202 bool is_spam = mail_reader->messageWasSpam();
203 if (should_log) {
204 logMessage(config, filter, is_spam ? "SPAM" : "GOOD", score.isSpam() == is_spam, *msg);
205 }
206 if (is_spam) {
207 spam_command->processMessage(config, filter, &mail_reader->messageFile(), *msg, message_num, is_spam);
208 } else {
209 good_command->processMessage(config, filter, &mail_reader->messageFile(), *msg, message_num, is_spam);
210 }
211 purger.processedMessage();
212
213 mail_message.set(mail_reader->readMessage());
214 }
215 purger.finish();
216 return 0;
217 }
218