1 /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
2 /* This Source Code Form is subject to the terms of the Mozilla Public
3 * License, v. 2.0. If a copy of the MPL was not distributed with this
4 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
5
6 #include "nsBayesianFilter.h"
7 #include "nsIInputStream.h"
8 #include "nsIStreamListener.h"
9 #include "nsNetUtil.h"
10 #include "nsQuickSort.h"
11 #include "nsIMsgMessageService.h"
12 #include "nsMsgUtils.h" // for GetMessageServiceFromURI
13 #include "prnetdb.h"
14 #include "nsIMsgWindow.h"
15 #include "mozilla/Logging.h"
16 #include "nsAppDirectoryServiceDefs.h"
17 #include "nsUnicharUtils.h"
18 #include "nsDirectoryServiceUtils.h"
19 #include "nsIMIMEHeaderParam.h"
20 #include "nsNetCID.h"
21 #include "nsMsgMimeCID.h"
22 #include "nsIMsgMailNewsUrl.h"
23 #include "nsIMimeMiscStatus.h"
24 #include "nsIPrefService.h"
25 #include "nsIPrefBranch.h"
26 #include "nsIStringEnumerator.h"
27 #include "nsIObserverService.h"
28 #include "nsIChannel.h"
29 #include "nsDependentSubstring.h"
30 #include "nsMemory.h"
31
32 #include "mozilla/ArenaAllocatorExtensions.h" // for ArenaStrdup
33
34 using namespace mozilla;
35
36 // needed to mark attachment flag on the db hdr
37 #include "nsIMsgHdr.h"
38
39 // needed to strip html out of the body
40 #include "nsLayoutCID.h"
41 #include "nsIParserUtils.h"
42 #include "nsIDocumentEncoder.h"
43
44 #include "nsIncompleteGamma.h"
45 #include <math.h>
46 #include <prmem.h>
47 #include "nsIMsgTraitService.h"
48 #include "mozilla/Services.h"
49 #include "mozilla/Attributes.h"
50 #include <cstdlib> // for std::abs(int/long)
51 #include <cmath> // for std::abs(float/double)
52
// Log module for this filter; enable with MOZ_LOG="BayesianFilter:5".
static mozilla::LazyLogModule BayesianFilterLogModule("BayesianFilter");

// Default junk-score cutoff; the effective value is read from a pref.
#define kDefaultJunkThreshold .99  // we override this value via a pref
// Characters that separate words when tokenizing message bodies and headers.
static const char* kBayesianFilterTokenDelimiters = " \t\n\r\f.";
static unsigned int kMinLengthForToken =
    3;  // lower bound on the number of characters in a word before we treat it
        // as a token
static unsigned int kMaxLengthForToken =
    12;  // upper bound on the number of characters in a word to be declared as
         // a token

// Marker sendmail inserts into Received headers it suspects are spoofed.
#define FORGED_RECEIVED_HEADER_HINT "may be forged"_ns

// Fallbacks for platforms whose <math.h> lacks these POSIX constants.
#ifndef M_LN2
#  define M_LN2 0.69314718055994530942
#endif

#ifndef M_E
#  define M_E 2.7182818284590452354
#endif
73
// provide base implementation of hash lookup of a string
struct BaseToken : public PLDHashEntryHdr {
  // Arena-owned, null-terminated word this hash entry represents
  // (storage lives in TokenHash::mWordPool).
  const char* mWord;
};
78
// token for a particular message
// mCount, mAnalysisLink are initialized to zero by the hash code
struct Token : public BaseToken {
  uint32_t mCount;         // occurrences of this word in the current message
  uint32_t mAnalysisLink;  // index in mAnalysisStore of the AnalysisPerToken
                           // object for the first trait for this token
  // Helper to support Tokenizer::copyTokens()
  // Shallow copy: mWord still points at the source token's arena storage.
  void clone(const Token& other) {
    mWord = other.mWord;
    mCount = other.mCount;
    mAnalysisLink = other.mAnalysisLink;
  }
};
92
// token stored in a training file for a group of messages
// mTraitLink is initialized to 0 by the hash code
struct CorpusToken : public BaseToken {
  uint32_t mTraitLink;  // index in mTraitStore of the TraitPerToken
                        // object for the first trait for this token
                        // (0 means no trait data yet)
};
99
// set the value of a TraitPerToken object
// mNextLink of 0 marks the end of a token's per-trait linked list.
TraitPerToken::TraitPerToken(uint32_t aTraitId, uint32_t aCount)
    : mId(aTraitId), mCount(aCount), mNextLink(0) {}

// shorthand representations of trait ids for junk and good
static const uint32_t kJunkTrait = nsIJunkMailPlugin::JUNK_TRAIT;
static const uint32_t kGoodTrait = nsIJunkMailPlugin::GOOD_TRAIT;
107
// set the value of an AnalysisPerToken object
// mNextLink of 0 marks the end of a token's per-trait analysis list.
AnalysisPerToken::AnalysisPerToken(uint32_t aTraitIndex, double aDistance,
                                   double aProbability)
    : mTraitIndex(aTraitIndex),
      mDistance(aDistance),
      mProbability(aProbability),
      mNextLink(0) {}

// the initial size of the AnalysisPerToken linked list storage
const uint32_t kAnalysisStoreCapacity = 2048;

// the initial size of the TraitPerToken linked list storage
const uint32_t kTraitStoreCapacity = 16384;

// Size of Auto arrays representing per trait information
const uint32_t kTraitAutoCapacity = 10;
124
// Lightweight forward-only iterator over every entry of a token hash table.
TokenEnumeration::TokenEnumeration(PLDHashTable* table)
    : mIterator(table->Iter()) {}

// True while nextToken() still has entries to hand out.
inline bool TokenEnumeration::hasMoreTokens() { return !mIterator.Done(); }

// Returns the current entry and advances the iterator.
inline BaseToken* TokenEnumeration::nextToken() {
  auto token = static_cast<BaseToken*>(mIterator.Get());
  mIterator.Next();
  return token;
}
135
// member variables
// Hash/match on the C-string key; move/clear stubs suffice because entries
// hold only POD members.
static const PLDHashTableOps gTokenTableOps = {
    PLDHashTable::HashStringKey, PLDHashTable::MatchStringKey,
    PLDHashTable::MoveEntryStub, PLDHashTable::ClearEntryStub, nullptr};

TokenHash::TokenHash(uint32_t aEntrySize)
    : mTokenTable(&gTokenTableOps, aEntrySize, 128) {
  // aEntrySize is sizeof(Token) or sizeof(CorpusToken), depending on use.
  mEntrySize = aEntrySize;
}
145
TokenHash::~TokenHash() {}

nsresult TokenHash::clearTokens() {
  // we re-use the tokenizer when classifying multiple messages,
  // so this gets called after every message classification.
  mTokenTable.ClearAndPrepareForLength(128);
  mWordPool.Clear();  // frees the arena backing every entry's mWord pointer
  return NS_OK;
}

// Copies |len| bytes of |word| into the arena; the returned storage lives
// until clearTokens() or destruction.
char* TokenHash::copyWord(const char* word, uint32_t len) {
  return ArenaStrdup(Substring(word, len), mWordPool);
}
159
get(const char * word)160 inline BaseToken* TokenHash::get(const char* word) {
161 PLDHashEntryHdr* entry = mTokenTable.Search(word);
162 if (entry) return static_cast<BaseToken*>(entry);
163 return NULL;
164 }
165
// Inserts |word| into the table (or finds the existing entry). On first
// insertion the key is copied into the arena so it outlives the caller's
// buffer. Returns nullptr on null/empty input or allocation failure.
BaseToken* TokenHash::add(const char* word) {
  if (!word || !*word) {
    NS_ERROR("Trying to add a null word");
    return nullptr;
  }

  MOZ_LOG(BayesianFilterLogModule, LogLevel::Debug, ("add word: %s", word));

  // Fallible add: entry (and token) may be nullptr under memory pressure.
  PLDHashEntryHdr* entry = mTokenTable.Add(word, mozilla::fallible);
  BaseToken* token = static_cast<BaseToken*>(entry);
  if (token) {
    if (token->mWord == nullptr) {
      // Newly created entry: make a durable copy of the key.
      uint32_t len = strlen(word);
      NS_ASSERTION(len != 0, "adding zero length word to tokenizer");
      if (!len)
        MOZ_LOG(BayesianFilterLogModule, LogLevel::Debug,
                ("adding zero length word to tokenizer"));
      token->mWord = copyWord(word, len);
      NS_ASSERTION(token->mWord, "copyWord failed");
      if (!token->mWord) {
        MOZ_LOG(BayesianFilterLogModule, LogLevel::Error,
                ("copyWord failed: %s (%d)", word, len));
        // Drop the half-initialized entry so the table stays consistent.
        mTokenTable.RawRemove(entry);
        return nullptr;
      }
    }
  }
  return token;
}
195
// Number of distinct tokens currently stored in the table.
inline uint32_t TokenHash::countTokens() { return mTokenTable.EntryCount(); }

// Iterator over every token currently in the table.
inline TokenEnumeration TokenHash::getTokens() {
  return TokenEnumeration(&mTokenTable);
}
201
// Reads the "mailnews.bayesian_spam_filter." pref branch to configure body
// and header delimiters, per-header tokenization overrides, iframe handling,
// and the maximum token length. Falls back to compiled-in defaults whenever
// a pref is missing or the pref service is unavailable.
Tokenizer::Tokenizer()
    : TokenHash(sizeof(Token)),
      mBodyDelimiters(kBayesianFilterTokenDelimiters),
      mHeaderDelimiters(kBayesianFilterTokenDelimiters),
      mCustomHeaderTokenization(false),
      mMaxLengthForToken(kMaxLengthForToken),
      mIframeToDiv(false) {
  nsresult rv;
  nsCOMPtr<nsIPrefService> prefs =
      do_GetService(NS_PREFSERVICE_CONTRACTID, &rv);
  NS_ENSURE_SUCCESS_VOID(rv);

  nsCOMPtr<nsIPrefBranch> prefBranch;
  rv = prefs->GetBranch("mailnews.bayesian_spam_filter.",
                        getter_AddRefs(prefBranch));
  NS_ENSURE_SUCCESS_VOID(rv);  // no branch defined, just use defaults

  /*
   * RSS feeds store their summary as alternate content of an iframe. But due
   * to bug 365953, this is not seen by the serializer. As a workaround, allow
   * the tokenizer to replace the iframe with div for tokenization.
   */
  rv = prefBranch->GetBoolPref("iframe_to_div", &mIframeToDiv);
  if (NS_FAILED(rv)) mIframeToDiv = false;

  /*
   * the list of delimiters used to tokenize the message and body
   * defaults to the value in kBayesianFilterTokenDelimiters, but may be
   * set with the following preferences for the body and header
   * separately.
   *
   * \t, \n, \v, \f, \r, and \\ will be escaped to their normal
   * C-library values, all other two-letter combinations beginning with \
   * will be ignored.
   */

  prefBranch->GetCharPref("body_delimiters", mBodyDelimiters);
  if (!mBodyDelimiters.IsEmpty())
    UnescapeCString(mBodyDelimiters);
  else  // prefBranch empties the result when it fails :(
    mBodyDelimiters.Assign(kBayesianFilterTokenDelimiters);

  prefBranch->GetCharPref("header_delimiters", mHeaderDelimiters);
  if (!mHeaderDelimiters.IsEmpty())
    UnescapeCString(mHeaderDelimiters);
  else
    mHeaderDelimiters.Assign(kBayesianFilterTokenDelimiters);

  /*
   * Extensions may wish to enable or disable tokenization of certain headers.
   * Define any headers to enable/disable in a string preference like this:
   * "mailnews.bayesian_spam_filter.tokenizeheader.headername"
   *
   * where "headername" is the header to tokenize. For example, to tokenize the
   * header "x-spam-status" use the preference:
   *
   * "mailnews.bayesian_spam_filter.tokenizeheader.x-spam-status"
   *
   * The value of the string preference will be interpreted in one of
   * four ways, depending on the value:
   *
   * If "false" then do not tokenize that header
   * If "full" then add the entire header value as a token,
   * without breaking up into subtokens using delimiters
   * If "standard" then tokenize the header using as delimiters the current
   * value of the generic header delimiters
   * Any other string is interpreted as a list of delimiters to use to parse
   * the header. \t, \n, \v, \f, \r, and \\ will be escaped to their normal
   * C-library values, all other two-letter combinations beginning with \
   * will be ignored.
   *
   * Header names in the preference should be all lower case
   *
   * Extensions may also set the maximum length of a token (default is
   * kMaxLengthForToken) by setting the int preference:
   * "mailnews.bayesian_spam_filter.maxlengthfortoken"
   */

  nsTArray<nsCString> headers;

  // get customized maximum token length
  int32_t maxLengthForToken;
  rv = prefBranch->GetIntPref("maxlengthfortoken", &maxLengthForToken);
  mMaxLengthForToken =
      NS_SUCCEEDED(rv) ? uint32_t(maxLengthForToken) : kMaxLengthForToken;

  rv = prefs->GetBranch("mailnews.bayesian_spam_filter.tokenizeheader.",
                        getter_AddRefs(prefBranch));
  if (NS_SUCCEEDED(rv)) rv = prefBranch->GetChildList("", headers);

  if (NS_SUCCEEDED(rv)) {
    mCustomHeaderTokenization = true;
    for (auto& header : headers) {
      nsCString value;
      prefBranch->GetCharPref(header.get(), value);
      if (value.EqualsLiteral("false")) {
        mDisabledHeaders.AppendElement(header);
        continue;
      }
      // mEnabledHeaders and mEnabledHeadersDelimiters are parallel arrays:
      // entry i's delimiter encoding describes how to tokenize header i.
      mEnabledHeaders.AppendElement(header);
      if (value.EqualsLiteral("standard"))
        value.SetIsVoid(true);  // Void means use default delimiter
      else if (value.EqualsLiteral("full"))
        value.Truncate();  // Empty means add full header
      else
        UnescapeCString(value);
      mEnabledHeadersDelimiters.AppendElement(value);
    }
  }
}
312
Tokenizer::~Tokenizer() {}

// Typed convenience wrapper around TokenHash::get().
inline Token* Tokenizer::get(const char* word) {
  return static_cast<Token*>(TokenHash::get(word));
}
318
add(const char * word,uint32_t count)319 Token* Tokenizer::add(const char* word, uint32_t count) {
320 MOZ_LOG(BayesianFilterLogModule, LogLevel::Debug,
321 ("add word: %s (count=%d)", word, count));
322
323 Token* token = static_cast<Token*>(TokenHash::add(word));
324 if (token) {
325 token->mCount += count; // hash code initializes this to zero
326 MOZ_LOG(BayesianFilterLogModule, LogLevel::Debug,
327 ("adding word to tokenizer: %s (count=%d) (mCount=%d)", word, count,
328 token->mCount));
329 }
330 return token;
331 }
332
// Returns true if |word| is a decimal integer: an optional single leading
// '-' followed by one or more digits. An empty string or a bare "-" is not
// a number (the original code accepted both; callers pre-filter empty and
// short words, so this only tightens the edge cases).
static bool isDecimalNumber(const char* word) {
  const char* p = word;
  if (*p == '-') ++p;
  if (!*p) return false;  // "" and "-" carry no digits
  for (; *p; ++p) {
    if (!isdigit((unsigned char)*p)) return false;
  }
  return true;
}
342
// True when every byte of |word| is 7-bit ASCII (<= 0x7F).
static bool isASCII(const char* word) {
  for (const unsigned char* cursor = (const unsigned char*)word; *cursor;
       ++cursor) {
    if (*cursor > 127) return false;
  }
  return true;
}
351
isUpperCase(char c)352 inline bool isUpperCase(char c) { return ('A' <= c) && (c <= 'Z'); }
353
toLowerCase(char * str)354 static char* toLowerCase(char* str) {
355 char c, *p = str;
356 while ((c = *p++)) {
357 if (isUpperCase(c)) p[-1] = c + ('a' - 'A');
358 }
359 return str;
360 }
361
// Adds header-derived tokens of the form "<prefix>:<value>". |aValue| is
// lower-cased in place. When aTokenizeValue is false the whole value becomes
// one token; otherwise it is split on |aDelimiters| (or the configured
// header delimiters when aDelimiters is null), skipping words that are too
// short, purely numeric, or non-ASCII.
void Tokenizer::addTokenForHeader(const char* aTokenPrefix, nsACString& aValue,
                                  bool aTokenizeValue,
                                  const char* aDelimiters) {
  if (aValue.Length()) {
    ToLowerCase(aValue);
    if (!aTokenizeValue) {
      nsCString tmpStr;
      tmpStr.Assign(aTokenPrefix);
      tmpStr.Append(':');
      tmpStr.Append(aValue);

      add(tmpStr.get());
    } else {
      char* word;
      // NS_strtok mutates its buffer, so tokenize a local copy.
      nsCString str(aValue);
      char* next = str.BeginWriting();
      const char* delimiters =
          !aDelimiters ? mHeaderDelimiters.get() : aDelimiters;
      while ((word = NS_strtok(delimiters, &next)) != NULL) {
        if (strlen(word) < kMinLengthForToken) continue;
        if (isDecimalNumber(word)) continue;
        if (isASCII(word)) {
          nsCString tmpStr;
          tmpStr.Assign(aTokenPrefix);
          tmpStr.Append(':');
          tmpStr.Append(word);
          add(tmpStr.get());
        }
      }
    }
  }
}
394
tokenizeAttachment(const char * aContentType,const char * aFileName)395 void Tokenizer::tokenizeAttachment(const char* aContentType,
396 const char* aFileName) {
397 nsAutoCString contentType;
398 nsAutoCString fileName;
399 fileName.Assign(aFileName);
400 contentType.Assign(aContentType);
401
402 // normalize the content type and the file name
403 ToLowerCase(fileName);
404 ToLowerCase(contentType);
405 addTokenForHeader("attachment/filename", fileName);
406
407 addTokenForHeader("attachment/content-type", contentType);
408 }
409
// Walks the parallel header name/value enumerators and generates tokens.
// Custom per-header prefs (mEnabledHeaders / mDisabledHeaders) take
// precedence; otherwise a built-in policy keyed on the first letter of the
// (lower-cased) header name decides whether and how each header contributes
// tokens.
void Tokenizer::tokenizeHeaders(nsIUTF8StringEnumerator* aHeaderNames,
                                nsIUTF8StringEnumerator* aHeaderValues) {
  nsCString headerValue;
  nsAutoCString
      headerName;  // we'll be normalizing all header names to lower case
  bool hasMore;

  while (NS_SUCCEEDED(aHeaderNames->HasMore(&hasMore)) && hasMore) {
    aHeaderNames->GetNext(headerName);
    ToLowerCase(headerName);
    aHeaderValues->GetNext(headerValue);

    bool headerProcessed = false;
    if (mCustomHeaderTokenization) {
      // Process any exceptions set from preferences
      for (uint32_t i = 0; i < mEnabledHeaders.Length(); i++)
        if (headerName.Equals(mEnabledHeaders[i])) {
          if (mEnabledHeadersDelimiters[i].IsVoid())
            // tokenize with standard delimiters for all headers
            addTokenForHeader(headerName.get(), headerValue, true);
          else if (mEnabledHeadersDelimiters[i].IsEmpty())
            // do not break the header into tokens
            addTokenForHeader(headerName.get(), headerValue);
          else
            // use the delimiter in mEnabledHeadersDelimiters
            addTokenForHeader(headerName.get(), headerValue, true,
                              mEnabledHeadersDelimiters[i].get());
          headerProcessed = true;
          break;  // we found the header, no need to look for more custom values
        }

      for (uint32_t i = 0; i < mDisabledHeaders.Length(); i++) {
        if (headerName.Equals(mDisabledHeaders[i])) {
          headerProcessed = true;
          break;
        }
      }

      if (headerProcessed) continue;
    }

    // Built-in policy, dispatched on the header's first character.
    switch (headerName.First()) {
      case 'c':
        if (headerName.EqualsLiteral("content-type")) {
          nsresult rv;
          nsCOMPtr<nsIMIMEHeaderParam> mimehdrpar =
              do_GetService(NS_MIMEHEADERPARAM_CONTRACTID, &rv);
          if (NS_FAILED(rv)) break;

          // extract the charset parameter
          nsCString parameterValue;
          mimehdrpar->GetParameterInternal(headerValue.get(), "charset",
                                           nullptr, nullptr,
                                           getter_Copies(parameterValue));
          addTokenForHeader("charset", parameterValue);

          // create a token containing just the content type
          mimehdrpar->GetParameterInternal(headerValue.get(), "type", nullptr,
                                           nullptr,
                                           getter_Copies(parameterValue));
          if (!parameterValue.Length())
            mimehdrpar->GetParameterInternal(
                headerValue.get(), nullptr /* use first unnamed param */,
                nullptr, nullptr, getter_Copies(parameterValue));
          addTokenForHeader("content-type/type", parameterValue);

          // XXX: should we add a token for the entire content-type header as
          // well or just these parts we have extracted?
        }
        break;
      case 'r':
        if (headerName.EqualsLiteral("received")) {
          // look for the string "may be forged" in the received headers.
          // sendmail sometimes adds this hint This does not compile on linux
          // yet. Need to figure out why. Commenting out for now if
          // (FindInReadable(FORGED_RECEIVED_HEADER_HINT, headerValue))
          // addTokenForHeader(headerName.get(), FORGED_RECEIVED_HEADER_HINT);
        }

        // leave out reply-to
        break;
      case 's':
        if (headerName.EqualsLiteral("subject")) {
          // we want to tokenize the subject
          addTokenForHeader(headerName.get(), headerValue, true);
        }

        // important: leave out sender field. Too strong of an indicator
        break;
      case 'x':  // (2) X-Mailer / user-agent works best if it is untokenized,
                 // just fold the case and any leading/trailing white space
        // all headers beginning with x-mozilla are being changed by us, so
        // ignore
        if (StringBeginsWith(headerName, "x-mozilla"_ns)) break;
        // fall through
        [[fallthrough]];
      case 'u':
        addTokenForHeader(headerName.get(), headerValue);
        break;
      default:
        addTokenForHeader(headerName.get(), headerValue);
        break;
    }  // end switch
  }
}
515
// Adds one lower-cased ASCII word as a token. Words within the configured
// length bounds are added directly. Over-long words are split into
// "email name:" / "email addr:" tokens when they look like an address;
// otherwise a "skip:" token keyed on first letter and rounded length is
// recorded instead.
void Tokenizer::tokenize_ascii_word(char* aWord) {
  // always deal with normalized lower case strings
  toLowerCase(aWord);
  uint32_t wordLength = strlen(aWord);

  // if the wordLength is within our accepted token limit, then add it
  if (wordLength >= kMinLengthForToken && wordLength <= mMaxLengthForToken)
    add(aWord);
  else if (wordLength > mMaxLengthForToken) {
    // don't skip over the word if it looks like an email address,
    // there is value in adding tokens for addresses
    nsDependentCString word(aWord,
                            wordLength);  // CHEAP, no allocation occurs here...

    // XXX: i think the 40 byte check is just for perf reasons...if the email
    // address is longer than that then forget about it.
    // Address heuristic: exactly one '@', at least one '.', total < 40 bytes.
    const char* atSign = strchr(aWord, '@');
    if (wordLength < 40 && strchr(aWord, '.') && atSign &&
        !strchr(atSign + 1, '@')) {
      uint32_t numBytesToSep = atSign - aWord;
      if (numBytesToSep <
          wordLength - 1)  // if the @ sign is the last character, it must not
                           // be an email address
      {
        // split the john@foo.com into john and foo.com, treat them as separate
        // tokens
        nsCString emailNameToken;
        emailNameToken.AssignLiteral("email name:");
        emailNameToken.Append(Substring(word, 0, numBytesToSep++));
        add(emailNameToken.get());
        nsCString emailAddrToken;
        emailAddrToken.AssignLiteral("email addr:");
        emailAddrToken.Append(
            Substring(word, numBytesToSep, wordLength - numBytesToSep));
        add(emailAddrToken.get());
        return;
      }
    }

    // there is value in generating a token indicating the number
    // of characters we are skipping. We'll round to the nearest 10
    nsCString skipToken;
    skipToken.AssignLiteral("skip:");
    skipToken.Append(word[0]);
    skipToken.Append(' ');
    skipToken.AppendInt((wordLength / 10) * 10);
    add(skipToken.get());
  }
}
565
// one subtract and one conditional jump should be faster than two conditional
// jump on most recent system.
#define IN_RANGE(x, low, high) ((uint16_t)((x) - (low)) <= (high) - (low))

#define IS_JA_HIRAGANA(x) IN_RANGE(x, 0x3040, 0x309F)
// swapping the range using xor operation to reduce conditional jump.
#define IS_JA_KATAKANA(x) \
  (IN_RANGE(x ^ 0x0004, 0x30A0, 0x30FE) || (IN_RANGE(x, 0xFF66, 0xFF9F)))
#define IS_JA_KANJI(x) \
  (IN_RANGE(x, 0x2E80, 0x2FDF) || IN_RANGE(x, 0x4E00, 0x9FAF))
#define IS_JA_KUTEN(x) (((x) == 0x3001) || ((x) == 0xFF64) || ((x) == 0xFF0E))
#define IS_JA_TOUTEN(x) (((x) == 0x3002) || ((x) == 0xFF61) || ((x) == 0xFF0C))
#define IS_JA_SPACE(x) ((x) == 0x3000)
#define IS_JA_FWLATAIN(x) IN_RANGE(x, 0xFF01, 0xFF5E)
#define IS_JA_FWNUMERAL(x) IN_RANGE(x, 0xFF10, 0xFF19)

#define IS_JAPANESE_SPECIFIC(x) \
  (IN_RANGE(x, 0x3040, 0x30FF) || IN_RANGE(x, 0xFF01, 0xFF9F))

// Character classes used to segment Japanese text runs. Note that
// getCharClass() below only ever returns a subset of these values
// (space/kigou/kanji-adjacent classes like ascii are declared but unused
// by it).
enum char_class {
  others = 0,
  space,
  hiragana,
  katakana,
  kanji,
  kuten,
  touten,
  kigou,
  fwlatain,
  ascii
};
597
getCharClass(char16_t c)598 static char_class getCharClass(char16_t c) {
599 char_class charClass = others;
600
601 if (IS_JA_HIRAGANA(c))
602 charClass = hiragana;
603 else if (IS_JA_KATAKANA(c))
604 charClass = katakana;
605 else if (IS_JA_KANJI(c))
606 charClass = kanji;
607 else if (IS_JA_KUTEN(c))
608 charClass = kuten;
609 else if (IS_JA_TOUTEN(c))
610 charClass = touten;
611 else if (IS_JA_FWLATAIN(c))
612 charClass = fwlatain;
613
614 return charClass;
615 }
616
isJapanese(const char * word)617 static bool isJapanese(const char* word) {
618 nsString text = NS_ConvertUTF8toUTF16(word);
619 char16_t* p = (char16_t*)text.get();
620 char16_t c;
621
622 // it is japanese chunk if it contains any hiragana or katakana.
623 while ((c = *p++))
624 if (IS_JAPANESE_SPECIFIC(c)) return true;
625
626 return false;
627 }
628
isFWNumeral(const char16_t * p1,const char16_t * p2)629 static bool isFWNumeral(const char16_t* p1, const char16_t* p2) {
630 for (; p1 < p2; p1++)
631 if (!IS_JA_FWNUMERAL(*p1)) return false;
632
633 return true;
634 }
635
// The japanese tokenizer was added as part of Bug #277354
// Splits |chunk| at every character-class boundary (see getCharClass) and
// adds each non-numeric run as a token prefixed with "JA:".
// NOTE(review): the final run — the one terminated by the end of the string
// rather than a class change — is never emitted, because the loop exits
// before adding it. Presumably a long-standing quirk; confirm against
// upstream before changing, since a fix would alter trained corpus data.
void Tokenizer::tokenize_japanese_word(char* chunk) {
  MOZ_LOG(BayesianFilterLogModule, LogLevel::Debug,
          ("entering tokenize_japanese_word(%s)", chunk));

  nsString srcStr = NS_ConvertUTF8toUTF16(chunk);
  const char16_t* p1 = srcStr.get();
  const char16_t* p2 = p1;
  if (!*p2) return;

  char_class cc = getCharClass(*p2);
  while (*(++p2)) {
    if (cc == getCharClass(*p2)) continue;

    // Class changed: [p1, p2) is a complete run of a single class.
    nsCString token = NS_ConvertUTF16toUTF8(p1, p2 - p1);
    if ((!isDecimalNumber(token.get())) && (!isFWNumeral(p1, p2))) {
      nsCString tmpStr;
      tmpStr.AppendLiteral("JA:");
      tmpStr.Append(token);
      add(tmpStr.get());
    }

    cc = getCharClass(*p2);
    p1 = p2;
  }
}
662
stripHTML(const nsAString & inString,nsAString & outString)663 nsresult Tokenizer::stripHTML(const nsAString& inString, nsAString& outString) {
664 uint32_t flags = nsIDocumentEncoder::OutputLFLineBreak |
665 nsIDocumentEncoder::OutputNoScriptContent |
666 nsIDocumentEncoder::OutputNoFramesContent |
667 nsIDocumentEncoder::OutputBodyOnly;
668 nsCOMPtr<nsIParserUtils> utils = do_GetService(NS_PARSERUTILS_CONTRACTID);
669 return utils->ConvertToPlainText(inString, flags, 80, outString);
670 }
671
// Copied from nsSemanticUnitScanner.cpp which was removed in bug 1368418.
// Finds the next semantic unit (word) in |text| starting at |pos| and writes
// its bounds to |begin|/|end|. Han characters are returned one code unit at
// a time; space/punctuation runs are skipped by recursing on the following
// break position. *_retval is false when no further unit is available.
nsresult Tokenizer::ScannerNext(const char16_t* text, int32_t length,
                                int32_t pos, bool isLastBuffer, int32_t* begin,
                                int32_t* end, bool* _retval) {
  // Lazily create the word breaker on first use; it is reused across calls.
  if (!mWordBreaker) {
    mWordBreaker = mozilla::intl::WordBreaker::Create();
  }

  // if we reach the end, just return
  if (pos >= length) {
    *begin = pos;
    *end = pos;
    *_retval = false;
    return NS_OK;
  }

  mozilla::intl::WordBreakClass char_class =
      mozilla::intl::WordBreaker::GetClass(text[pos]);

  // If we are in Chinese mode, return one Han letter at a time.
  // We should not do this if we are in Japanese or Korean mode.
  if (mozilla::intl::kWbClassHanLetter == char_class) {
    *begin = pos;
    *end = pos + 1;
    *_retval = true;
    return NS_OK;
  }

  int32_t next;
  // Find the next "word".
  next = mWordBreaker->NextWord(text, (uint32_t)length, (uint32_t)pos);

  // If we don't have enough text to make decision, return.
  if (next == NS_WORDBREAKER_NEED_MORE_TEXT) {
    *begin = pos;
    *end = isLastBuffer ? length : pos;
    *_retval = isLastBuffer;
    return NS_OK;
  }

  // If what we got is space or punct, look at the next break.
  if (char_class == mozilla::intl::kWbClassSpace ||
      char_class == mozilla::intl::kWbClassPunct) {
    // If the next "word" is not letters,
    // call itself recursively with the new pos.
    return ScannerNext(text, length, next, isLastBuffer, begin, end, _retval);
  }

  // For the rest, return.
  *begin = pos;
  *end = next;
  *_retval = true;
  return NS_OK;
}
726
// Top-level body tokenizer: strips HTML, normalizes full-width spaces, then
// splits on mBodyDelimiters. ASCII words, Japanese chunks, and other
// non-ASCII words each go through their own tokenization path.
void Tokenizer::tokenize(const char* aText) {
  MOZ_LOG(BayesianFilterLogModule, LogLevel::Debug, ("tokenize: %s", aText));

  // strip out HTML tags before we begin processing
  // uggh but first we have to blow up our string into UCS2
  // since that's what the document encoder wants. UTF8/UCS2, I wish we all
  // spoke the same language here..
  nsString text = NS_ConvertUTF8toUTF16(aText);
  nsString strippedUCS2;

  // RSS feeds store their summary information as an iframe. But due to
  // bug 365953, we can't see those in the plaintext serializer. As a
  // workaround, allow an option to replace iframe with div in the message
  // text. We disable by default, since most people won't be applying bayes
  // to RSS

  if (mIframeToDiv) {
    text.ReplaceSubstring(u"<iframe"_ns, u"<div"_ns);
    text.ReplaceSubstring(u"/iframe>"_ns, u"/div>"_ns);
  }

  stripHTML(text, strippedUCS2);

  // convert 0x3000(full width space) into 0x0020
  char16_t* substr_start = strippedUCS2.BeginWriting();
  char16_t* substr_end = strippedUCS2.EndWriting();
  while (substr_start != substr_end) {
    if (*substr_start == 0x3000) *substr_start = 0x0020;
    ++substr_start;
  }

  nsCString strippedStr = NS_ConvertUTF16toUTF8(strippedUCS2);
  char* strippedText = strippedStr.BeginWriting();
  MOZ_LOG(BayesianFilterLogModule, LogLevel::Debug,
          ("tokenize stripped html: %s", strippedText));

  // NS_strtok mutates strippedText in place as it walks the delimiters.
  char* word;
  char* next = strippedText;
  while ((word = NS_strtok(mBodyDelimiters.get(), &next)) != NULL) {
    if (!*word) continue;
    if (isDecimalNumber(word)) continue;
    if (isASCII(word))
      tokenize_ascii_word(word);
    else if (isJapanese(word))
      tokenize_japanese_word(word);
    else {
      nsresult rv;
      // Convert this word from UTF-8 into UCS2.
      NS_ConvertUTF8toUTF16 uword(word);
      ToLowerCase(uword);
      const char16_t* utext = uword.get();
      int32_t len = uword.Length(), pos = 0, begin, end;
      bool gotUnit;
      // Walk the word one semantic unit at a time, adding each unit.
      while (pos < len) {
        rv = ScannerNext(utext, len, pos, true, &begin, &end, &gotUnit);
        if (NS_SUCCEEDED(rv) && gotUnit) {
          NS_ConvertUTF16toUTF8 utfUnit(utext + begin, end - begin);
          add(utfUnit.get());
          // Advance to end of current unit.
          pos = end;
        } else {
          break;
        }
      }
    }
  }
}
794
795 // helper function to un-escape \n, \t, etc from a CString
UnescapeCString(nsCString & aCString)796 void Tokenizer::UnescapeCString(nsCString& aCString) {
797 nsAutoCString result;
798
799 const char* readEnd = aCString.EndReading();
800 result.SetLength(aCString.Length());
801 char* writeStart = result.BeginWriting();
802 char* writeIter = writeStart;
803
804 bool inEscape = false;
805 for (const char* readIter = aCString.BeginReading(); readIter != readEnd;
806 readIter++) {
807 if (!inEscape) {
808 if (*readIter == '\\')
809 inEscape = true;
810 else
811 *(writeIter++) = *readIter;
812 } else {
813 inEscape = false;
814 switch (*readIter) {
815 case '\\':
816 *(writeIter++) = '\\';
817 break;
818 case 't':
819 *(writeIter++) = '\t';
820 break;
821 case 'n':
822 *(writeIter++) = '\n';
823 break;
824 case 'v':
825 *(writeIter++) = '\v';
826 break;
827 case 'f':
828 *(writeIter++) = '\f';
829 break;
830 case 'r':
831 *(writeIter++) = '\r';
832 break;
833 default:
834 // all other escapes are ignored
835 break;
836 }
837 }
838 }
839 result.Truncate(writeIter - writeStart);
840 aCString.Assign(result);
841 }
842
copyTokens()843 Token* Tokenizer::copyTokens() {
844 uint32_t count = countTokens();
845 if (count > 0) {
846 Token* tokens = new Token[count];
847 if (tokens) {
848 Token* tp = tokens;
849 TokenEnumeration e(&mTokenTable);
850 while (e.hasMoreTokens()) {
851 Token* src = static_cast<Token*>(e.nextToken());
852 tp->clone(*src);
853 ++tp;
854 }
855 }
856 return tokens;
857 }
858 return NULL;
859 }
860
// Abstract consumer of a tokenized message: subclasses implement
// analyzeTokens() to classify or train on the token set.
class TokenAnalyzer {
 public:
  virtual ~TokenAnalyzer() {}

  // Invoked once the message has been fully tokenized.
  virtual void analyzeTokens(Tokenizer& tokenizer) = 0;
  // Listener to notify when processing of this message completes.
  void setTokenListener(nsIStreamListener* aTokenListener) {
    mTokenListener = aTokenListener;
  }

  // Remembers which message URI the tokens came from.
  void setSource(const nsACString& sourceURI) { mTokenSource = sourceURI; }

  nsCOMPtr<nsIStreamListener> mTokenListener;
  nsCString mTokenSource;
};
875
/**
 * This class downloads the raw content of an email message, buffering until
 * complete segments are seen, that is until a linefeed is seen, although
 * any of the valid token separators would do. This could be a further
 * refinement.
 */
class TokenStreamListener : public nsIStreamListener, nsIMsgHeaderSink {
 public:
  NS_DECL_ISUPPORTS
  NS_DECL_NSIREQUESTOBSERVER
  NS_DECL_NSISTREAMLISTENER
  NS_DECL_NSIMSGHEADERSINK

  explicit TokenStreamListener(TokenAnalyzer* analyzer);

 protected:
  virtual ~TokenStreamListener();
  TokenAnalyzer* mAnalyzer;  // owned; deleted in the destructor
  char* mBuffer;             // segment buffer, allocated in OnStartRequest
  uint32_t mBufferSize;
  uint32_t mLeftOverCount;  // bytes carried over from the previous chunk
  Tokenizer mTokenizer;
  bool mSetAttachmentFlag;
};
900
// Default size of the streaming segment buffer.
const uint32_t kBufferSize = 16384;

TokenStreamListener::TokenStreamListener(TokenAnalyzer* analyzer)
    : mAnalyzer(analyzer),  // takes ownership of |analyzer|
      mBuffer(NULL),        // allocated lazily in OnStartRequest
      mBufferSize(kBufferSize),
      mLeftOverCount(0),
      mSetAttachmentFlag(false) {}

TokenStreamListener::~TokenStreamListener() {
  delete[] mBuffer;
  delete mAnalyzer;
}
914
NS_IMPL_ISUPPORTS(TokenStreamListener, nsIRequestObserver, nsIStreamListener,
                  nsIMsgHeaderSink)

// nsIMsgHeaderSink: feed every header name/value pair to the tokenizer.
NS_IMETHODIMP TokenStreamListener::ProcessHeaders(
    nsIUTF8StringEnumerator* aHeaderNames,
    nsIUTF8StringEnumerator* aHeaderValues, bool dontCollectAddress) {
  mTokenizer.tokenizeHeaders(aHeaderNames, aHeaderValues);
  return NS_OK;
}

// nsIMsgHeaderSink: tokenize the attachment's content type and display name;
// the url/uri/external-attachment arguments are not used for classification.
NS_IMETHODIMP TokenStreamListener::HandleAttachment(
    const char* contentType, const nsACString& url, const char16_t* displayName,
    const nsACString& uri, bool aIsExternalAttachment) {
  mTokenizer.tokenizeAttachment(contentType,
                                NS_ConvertUTF16toUTF8(displayName).get());
  return NS_OK;
}
932
// nsIMsgHeaderSink stub: attachment fields are not used for classification.
NS_IMETHODIMP TokenStreamListener::AddAttachmentField(const char* field,
                                                      const char* value) {
  return NS_OK;
}
937
OnEndAllAttachments()938 NS_IMETHODIMP TokenStreamListener::OnEndAllAttachments() { return NS_OK; }
939
// nsIMsgHeaderSink stub: end-of-download handling happens in OnStopRequest.
NS_IMETHODIMP TokenStreamListener::OnEndMsgDownload(nsIMsgMailNewsUrl* url) {
  return NS_OK;
}
943
// nsIMsgHeaderSink stub: remote content is irrelevant to tokenization.
NS_IMETHODIMP TokenStreamListener::OnMsgHasRemoteContent(nsIMsgDBHdr* aMsgHdr,
                                                         nsIURI* aContentURI,
                                                         bool aCanOverride) {
  return NS_OK;
}
949
// nsIMsgHeaderSink stub: headers are delivered through ProcessHeaders.
NS_IMETHODIMP TokenStreamListener::OnEndMsgHeaders(nsIMsgMailNewsUrl* url) {
  return NS_OK;
}
953
// nsIMsgHeaderSink stub: no security info tracked; out-param left untouched.
NS_IMETHODIMP TokenStreamListener::GetSecurityInfo(
    nsISupports** aSecurityInfo) {
  return NS_OK;
}
// nsIMsgHeaderSink stub: security info is ignored.
NS_IMETHODIMP TokenStreamListener::SetSecurityInfo(nsISupports* aSecurityInfo) {
  return NS_OK;
}
961
// Not supported by this sink.
NS_IMETHODIMP TokenStreamListener::GetDummyMsgHeader(nsIMsgDBHdr** aMsgDBHdr) {
  return NS_ERROR_NOT_IMPLEMENTED;
}
965
ResetProperties()966 NS_IMETHODIMP TokenStreamListener::ResetProperties() { return NS_OK; }
967
// Not supported by this sink.
NS_IMETHODIMP TokenStreamListener::GetProperties(
    nsIWritablePropertyBag2** aProperties) {
  return NS_ERROR_NOT_IMPLEMENTED;
}
972
973 /* void onStartRequest (in nsIRequest aRequest); */
OnStartRequest(nsIRequest * aRequest)974 NS_IMETHODIMP TokenStreamListener::OnStartRequest(nsIRequest* aRequest) {
975 mLeftOverCount = 0;
976 if (!mBuffer) {
977 mBuffer = new char[mBufferSize];
978 NS_ENSURE_TRUE(mBuffer, NS_ERROR_OUT_OF_MEMORY);
979 }
980
981 // get the url for the channel and set our nsIMsgHeaderSink on it so we get
982 // notified about the headers and attachments
983
984 nsCOMPtr<nsIChannel> channel(do_QueryInterface(aRequest));
985 if (channel) {
986 nsCOMPtr<nsIURI> uri;
987 channel->GetURI(getter_AddRefs(uri));
988 nsCOMPtr<nsIMsgMailNewsUrl> mailUrl = do_QueryInterface(uri);
989 if (mailUrl)
990 mailUrl->SetMsgHeaderSink(static_cast<nsIMsgHeaderSink*>(this));
991 }
992
993 return NS_OK;
994 }
995
/* void onDataAvailable (in nsIRequest aRequest, in nsIInputStream aInputStream,
 * in unsigned long long aOffset, in unsigned long aCount); */
NS_IMETHODIMP TokenStreamListener::OnDataAvailable(nsIRequest* aRequest,
                                                   nsIInputStream* aInputStream,
                                                   uint64_t aOffset,
                                                   uint32_t aCount) {
  nsresult rv = NS_OK;

  while (aCount > 0) {
    // Read at most enough to leave room for the NUL terminator appended
    // below; new data goes after any carried-over partial segment.
    uint32_t readCount, totalCount = (aCount + mLeftOverCount);
    if (totalCount >= mBufferSize) {
      readCount = mBufferSize - mLeftOverCount - 1;
    } else {
      readCount = aCount;
    }

    // mBuffer is supposed to be allocated in onStartRequest. But something
    // is causing that to not happen, so as a last-ditch attempt we'll
    // do it here.
    if (!mBuffer) {
      mBuffer = new char[mBufferSize];
      NS_ENSURE_TRUE(mBuffer, NS_ERROR_OUT_OF_MEMORY);
    }

    char* buffer = mBuffer;
    rv = aInputStream->Read(buffer + mLeftOverCount, readCount, &readCount);
    if (NS_FAILED(rv)) break;

    if (readCount == 0) {
      rv = NS_ERROR_UNEXPECTED;
      NS_WARNING("failed to tokenize");
      break;
    }

    aCount -= readCount;

    /* consume the tokens up to the last legal token delimiter in the buffer. */
    totalCount = (readCount + mLeftOverCount);
    buffer[totalCount] = '\0';
    // Scan backwards for the last body delimiter: everything before it forms
    // complete tokens, the remainder is carried over to the next iteration.
    char* lastDelimiter = NULL;
    char* scan = buffer + totalCount;
    while (scan > buffer) {
      if (strchr(mTokenizer.mBodyDelimiters.get(), *--scan)) {
        lastDelimiter = scan;
        break;
      }
    }

    if (lastDelimiter) {
      *lastDelimiter = '\0';
      mTokenizer.tokenize(buffer);

      // Shift the unconsumed tail to the front of the buffer.
      uint32_t consumedCount = 1 + (lastDelimiter - buffer);
      mLeftOverCount = totalCount - consumedCount;
      if (mLeftOverCount)
        memmove(buffer, buffer + consumedCount, mLeftOverCount);
    } else {
      /* didn't find a delimiter, keep the whole buffer around. */
      mLeftOverCount = totalCount;
      // Double the buffer once the carried-over data fills half of it, so a
      // long delimiter-free run keeps making progress.
      if (totalCount >= (mBufferSize / 2)) {
        uint32_t newBufferSize = mBufferSize * 2;
        char* newBuffer = new char[newBufferSize];
        NS_ENSURE_TRUE(newBuffer, NS_ERROR_OUT_OF_MEMORY);
        memcpy(newBuffer, mBuffer, mLeftOverCount);
        delete[] mBuffer;
        mBuffer = newBuffer;
        mBufferSize = newBufferSize;
      }
    }
  }

  return rv;
}
1069
1070 /* void onStopRequest (in nsIRequest aRequest, in nsresult aStatusCode); */
OnStopRequest(nsIRequest * aRequest,nsresult aStatusCode)1071 NS_IMETHODIMP TokenStreamListener::OnStopRequest(nsIRequest* aRequest,
1072 nsresult aStatusCode) {
1073 if (mLeftOverCount) {
1074 /* assume final buffer is complete. */
1075 mBuffer[mLeftOverCount] = '\0';
1076 mTokenizer.tokenize(mBuffer);
1077 }
1078
1079 /* finally, analyze the tokenized message. */
1080 MOZ_LOG(BayesianFilterLogModule, LogLevel::Debug,
1081 ("analyze the tokenized message"));
1082 if (mAnalyzer) mAnalyzer->analyzeTokens(mTokenizer);
1083
1084 return NS_OK;
1085 }
1086
1087 /* Implementation file */
1088
// nsISupports implementation for the filter itself; weak-reference support
// is needed for the observer-service registration in Init().
NS_IMPL_ISUPPORTS(nsBayesianFilter, nsIMsgFilterPlugin, nsIJunkMailPlugin,
                  nsIMsgCorpus, nsISupportsWeakReference, nsIObserver)
1091
1092 nsBayesianFilter::nsBayesianFilter() : mTrainingDataDirty(false) {
1093 int32_t junkThreshold = 0;
1094 nsresult rv;
1095 nsCOMPtr<nsIPrefBranch> pPrefBranch(
1096 do_GetService(NS_PREFSERVICE_CONTRACTID, &rv));
1097 if (pPrefBranch)
1098 pPrefBranch->GetIntPref("mail.adaptivefilters.junk_threshold",
1099 &junkThreshold);
1100
1101 mJunkProbabilityThreshold = (static_cast<double>(junkThreshold)) / 100.0;
1102 if (mJunkProbabilityThreshold == 0 || mJunkProbabilityThreshold >= 1)
1103 mJunkProbabilityThreshold = kDefaultJunkThreshold;
1104
1105 MOZ_LOG(BayesianFilterLogModule, LogLevel::Warning,
1106 ("junk probability threshold: %f", mJunkProbabilityThreshold));
1107
1108 mCorpus.readTrainingData();
1109
1110 // get parameters for training data flushing, from the prefs
1111
1112 nsCOMPtr<nsIPrefBranch> prefBranch;
1113
1114 nsCOMPtr<nsIPrefService> prefs =
1115 do_GetService(NS_PREFSERVICE_CONTRACTID, &rv);
1116 NS_ASSERTION(NS_SUCCEEDED(rv), "failed accessing preferences service");
1117 rv = prefs->GetBranch(nullptr, getter_AddRefs(prefBranch));
1118 NS_ASSERTION(NS_SUCCEEDED(rv), "failed getting preferences branch");
1119
1120 rv = prefBranch->GetIntPref(
1121 "mailnews.bayesian_spam_filter.flush.minimum_interval",
1122 &mMinFlushInterval);
1123 // it is not a good idea to allow a minimum interval of under 1 second
1124 if (NS_FAILED(rv) || (mMinFlushInterval <= 1000))
1125 mMinFlushInterval = DEFAULT_MIN_INTERVAL_BETWEEN_WRITES;
1126
1127 rv = prefBranch->GetIntPref("mailnews.bayesian_spam_filter.junk_maxtokens",
1128 &mMaximumTokenCount);
1129 if (NS_FAILED(rv))
1130 mMaximumTokenCount = 0; // which means do not limit token counts
1131 MOZ_LOG(BayesianFilterLogModule, LogLevel::Warning,
1132 ("maximum junk tokens: %d", mMaximumTokenCount));
1133
1134 mTimer = do_CreateInstance(NS_TIMER_CONTRACTID, &rv);
1135 NS_ASSERTION(
1136 NS_SUCCEEDED(rv),
1137 "unable to create a timer; training data will only be written on exit");
1138
1139 // the timer is not used on object construction, since for
1140 // the time being there are no dirying messages
1141
1142 // give a default capacity to the memory structure used to store
1143 // per-message/per-trait token data
1144 mAnalysisStore.SetCapacity(kAnalysisStoreCapacity);
1145
1146 // dummy 0th element. Index 0 means "end of list" so we need to
1147 // start from 1
1148 AnalysisPerToken analysisPT(0, 0.0, 0.0);
1149 mAnalysisStore.AppendElement(analysisPT);
1150 mNextAnalysisIndex = 1;
1151 }
1152
Init()1153 nsresult nsBayesianFilter::Init() {
1154 nsCOMPtr<nsIObserverService> observerService =
1155 mozilla::services::GetObserverService();
1156 if (observerService)
1157 observerService->AddObserver(this, "profile-before-change", true);
1158 return NS_OK;
1159 }
1160
TimerCallback(nsITimer * aTimer,void * aClosure)1161 void nsBayesianFilter::TimerCallback(nsITimer* aTimer, void* aClosure) {
1162 // we will flush the training data to disk after enough time has passed
1163 // since the first time a message has been classified after the last flush
1164
1165 nsBayesianFilter* filter = static_cast<nsBayesianFilter*>(aClosure);
1166 filter->mCorpus.writeTrainingData(filter->mMaximumTokenCount);
1167 filter->mTrainingDataDirty = false;
1168 }
1169
nsBayesianFilter::~nsBayesianFilter() {
  if (mTimer) {
    mTimer->Cancel();
    mTimer = nullptr;
  }
  // call shutdown when we are going away in case we need
  // to flush the training set to disk
  Shutdown();
}
1179
// this object is used for one call to classifyMessage or classifyMessages().
// So if we're classifying multiple messages, this object will be used for each
// message. It's going to hold a reference to itself, basically, to stay in
// memory.
class MessageClassifier : public TokenAnalyzer {
 public:
  // full classifier with arbitrary traits
  MessageClassifier(nsBayesianFilter* aFilter,
                    nsIJunkMailClassificationListener* aJunkListener,
                    nsIMsgTraitClassificationListener* aTraitListener,
                    nsIMsgTraitDetailListener* aDetailListener,
                    const nsTArray<uint32_t>& aProTraits,
                    const nsTArray<uint32_t>& aAntiTraits,
                    nsIMsgWindow* aMsgWindow,
                    const nsTArray<nsCString>& aMessageURIs)
      : mFilter(aFilter),
        mJunkMailPlugin(aFilter),
        mJunkListener(aJunkListener),
        mTraitListener(aTraitListener),
        mDetailListener(aDetailListener),
        mProTraits(aProTraits.Clone()),
        mAntiTraits(aAntiTraits.Clone()),
        mMsgWindow(aMsgWindow),
        mMessageURIs(aMessageURIs.Clone()),
        mCurMessageToClassify(0) {
    // traits are processed as parallel pro/anti pairs
    MOZ_ASSERT(aProTraits.Length() == aAntiTraits.Length());
  }

  // junk-only classifier
  MessageClassifier(nsBayesianFilter* aFilter,
                    nsIJunkMailClassificationListener* aJunkListener,
                    nsIMsgWindow* aMsgWindow,
                    const nsTArray<nsCString>& aMessageURIs)
      : mFilter(aFilter),
        mJunkMailPlugin(aFilter),
        mJunkListener(aJunkListener),
        mTraitListener(nullptr),
        mDetailListener(nullptr),
        mMsgWindow(aMsgWindow),
        mMessageURIs(aMessageURIs.Clone()),
        mCurMessageToClassify(0) {
    // junk-only mode uses the fixed (junk, good) trait pair
    mProTraits.AppendElement(kJunkTrait);
    mAntiTraits.AppendElement(kGoodTrait);
  }

  virtual ~MessageClassifier() {}

  // Called once a message has been fully tokenized: classify it, release the
  // tokens, then move on to the next message in the batch.
  virtual void analyzeTokens(Tokenizer& tokenizer) {
    mFilter->classifyMessage(tokenizer, mTokenSource, mProTraits, mAntiTraits,
                             mJunkListener, mTraitListener, mDetailListener);
    tokenizer.clearTokens();
    classifyNextMessage();
  }

  // Start tokenization of the next URI, or — when the batch is done — notify
  // the listeners with empty/null arguments and drop the self-keeping ref.
  virtual void classifyNextMessage() {
    if (++mCurMessageToClassify < mMessageURIs.Length()) {
      MOZ_LOG(BayesianFilterLogModule, LogLevel::Warning,
              ("classifyNextMessage(%s)",
               mMessageURIs[mCurMessageToClassify].get()));
      mFilter->tokenizeMessage(mMessageURIs[mCurMessageToClassify], mMsgWindow,
                               this);
    } else {
      // call all listeners with null parameters to signify end of batch
      if (mJunkListener)
        mJunkListener->OnMessageClassified(EmptyCString(),
                                           nsIJunkMailPlugin::UNCLASSIFIED, 0);
      if (mTraitListener) {
        nsTArray<uint32_t> nullTraits;
        nsTArray<uint32_t> nullPercents;
        mTraitListener->OnMessageTraitsClassified(EmptyCString(), nullTraits,
                                                  nullPercents);
      }
      mTokenListener =
          nullptr;  // this breaks the circular ref that keeps this object alive
                    // so we will be destroyed as a result.
    }
  }

 private:
  nsBayesianFilter* mFilter;  // not owned; kept alive via mJunkMailPlugin ref
  nsCOMPtr<nsIJunkMailPlugin> mJunkMailPlugin;
  nsCOMPtr<nsIJunkMailClassificationListener> mJunkListener;
  nsCOMPtr<nsIMsgTraitClassificationListener> mTraitListener;
  nsCOMPtr<nsIMsgTraitDetailListener> mDetailListener;
  nsTArray<uint32_t> mProTraits;
  nsTArray<uint32_t> mAntiTraits;
  nsCOMPtr<nsIMsgWindow> mMsgWindow;
  nsTArray<nsCString> mMessageURIs;
  uint32_t mCurMessageToClassify;  // 0-based index
};
1269
/**
 * Stream the message at aMessageURI through aAnalyzer's token listener.
 * The listener receives the (converted) message data and performs the
 * actual tokenization; the analyzer is told its source URI first.
 */
nsresult nsBayesianFilter::tokenizeMessage(const nsACString& aMessageURI,
                                           nsIMsgWindow* aMsgWindow,
                                           TokenAnalyzer* aAnalyzer) {
  nsCOMPtr<nsIMsgMessageService> msgService;
  nsresult rv =
      GetMessageServiceFromURI(aMessageURI, getter_AddRefs(msgService));
  NS_ENSURE_SUCCESS(rv, rv);

  aAnalyzer->setSource(aMessageURI);
  nsCOMPtr<nsIURI> dummyNull;
  return msgService->StreamMessage(
      aMessageURI, aAnalyzer->mTokenListener, aMsgWindow, nullptr,
      true /* convert data */, "filter"_ns, false, getter_AddRefs(dummyNull));
}
1284
// a TraitAnalysis is the per-token representation of the statistical
// calculations, basically created to group information that is then
// sorted by mDistance
struct TraitAnalysis {
  uint32_t mTokenIndex;
  double mDistance;
  double mProbability;
};

// comparator required to sort an nsTArray
class compareTraitAnalysis {
 public:
  // Two analyses are equal when neither distance differs from the other.
  bool Equals(const TraitAnalysis& a, const TraitAnalysis& b) const {
    return !(a.mDistance != b.mDistance);
  }
  // Order ascending by distance from the neutral 0.5 probability.
  bool LessThan(const TraitAnalysis& a, const TraitAnalysis& b) const {
    if (a.mDistance < b.mDistance) return true;
    return false;
  }
};
1304
// Return the larger of two doubles.
inline double dmax(double x, double y) {
  if (x > y) return x;
  return y;
}
// Return the smaller of two doubles.
inline double dmin(double x, double y) {
  if (x < y) return x;
  return y;
}
1307
// Chi square functions are implemented by an incomplete gamma function.
// Note that chi2P's callers multiply the arguments by 2 but chi2P
// divides them by 2 again. Inlining chi2P gives the compiler a
// chance to notice this.

// Both chi2P and nsIncompleteGammaP set *error negative on domain
// errors and nsIncompleteGammaP sets it positive on internal errors.
// This may be useful but the chi2P callers treat any error as fatal.

// Note that converting unsigned ints to floating point can be slow on
// some platforms (like Intel) so use signed quantities for the numeric
// routines.

// @param chi2  chi-squared statistic; must be >= 0
// @param nu    degrees of freedom; must be > 0
// @param error set negative on domain error (return value is then a dummy)
static inline double chi2P(double chi2, double nu, int32_t* error) {
  // domain checks; set error and return a dummy value
  if (chi2 < 0.0 || nu <= 0.0) {
    *error = -1;
    return 0.0;
  }
  // reversing the arguments is intentional
  return nsIncompleteGammaP(nu / 2.0, chi2 / 2.0, error);
}
1329
/**
 * Core classification. For each pro/anti trait pair, per-token corpus
 * statistics are turned into token probabilities, the strongest clues are
 * combined via two one-sided chi-squared scores (S for "anti", H for "pro"),
 * and the resulting probability is reported to the listeners. The junk trait
 * additionally drives the legacy nsIJunkMailClassificationListener callback.
 */
void nsBayesianFilter::classifyMessage(
    Tokenizer& tokenizer, const nsACString& messageURI,
    nsTArray<uint32_t>& aProTraits, nsTArray<uint32_t>& aAntiTraits,
    nsIJunkMailClassificationListener* listener,
    nsIMsgTraitClassificationListener* aTraitListener,
    nsIMsgTraitDetailListener* aDetailListener) {
  Token* tokens = tokenizer.copyTokens();
  uint32_t tokenCount;
  if (!tokens) {
    // This can happen with problems with UTF conversion
    NS_ERROR("Trying to classify a null or invalid message");
    tokenCount = 0;
    // don't return so that we still call the listeners
  } else {
    tokenCount = tokenizer.countTokens();
  }

  if (aProTraits.Length() != aAntiTraits.Length()) {
    NS_ERROR("Each Pro trait needs a matching Anti trait");
    return;
  }

  /* this part is similar to the Graham algorithm with some adjustments. */
  uint32_t traitCount = aProTraits.Length();

  // pro message counts per trait index
  AutoTArray<uint32_t, kTraitAutoCapacity> numProMessages;
  // anti message counts per trait index
  AutoTArray<uint32_t, kTraitAutoCapacity> numAntiMessages;
  // array of pro aliases per trait index
  AutoTArray<nsTArray<uint32_t>, kTraitAutoCapacity> proAliasArrays;
  // array of anti aliases per trait index
  AutoTArray<nsTArray<uint32_t>, kTraitAutoCapacity> antiAliasArrays;
  // construct the outgoing listener arrays
  AutoTArray<uint32_t, kTraitAutoCapacity> traits;
  AutoTArray<uint32_t, kTraitAutoCapacity> percents;
  if (traitCount > kTraitAutoCapacity) {
    traits.SetCapacity(traitCount);
    percents.SetCapacity(traitCount);
    numProMessages.SetCapacity(traitCount);
    numAntiMessages.SetCapacity(traitCount);
    proAliasArrays.SetCapacity(traitCount);
    antiAliasArrays.SetCapacity(traitCount);
  }

  nsresult rv;
  nsCOMPtr<nsIMsgTraitService> traitService(
      do_GetService("@mozilla.org/msg-trait-service;1", &rv));
  if (NS_FAILED(rv)) {
    NS_ERROR("Failed to get trait service");
    MOZ_LOG(BayesianFilterLogModule, LogLevel::Error,
            ("Failed to get trait service"));
  }

  // get aliases and message counts for the pro and anti traits
  for (uint32_t traitIndex = 0; traitIndex < traitCount; traitIndex++) {
    nsresult rv;

    // pro trait
    nsTArray<uint32_t> proAliases;
    uint32_t proTrait = aProTraits[traitIndex];
    if (traitService) {
      rv = traitService->GetAliases(proTrait, proAliases);
      if (NS_FAILED(rv)) {
        NS_ERROR("trait service failed to get aliases");
        MOZ_LOG(BayesianFilterLogModule, LogLevel::Error,
                ("trait service failed to get aliases"));
      }
    }
    proAliasArrays.AppendElement(proAliases.Clone());
    // the effective message count is the trait's count plus its aliases'
    uint32_t proMessageCount = mCorpus.getMessageCount(proTrait);
    for (uint32_t aliasIndex = 0; aliasIndex < proAliases.Length();
         aliasIndex++)
      proMessageCount += mCorpus.getMessageCount(proAliases[aliasIndex]);
    numProMessages.AppendElement(proMessageCount);

    // anti trait
    nsTArray<uint32_t> antiAliases;
    uint32_t antiTrait = aAntiTraits[traitIndex];
    if (traitService) {
      rv = traitService->GetAliases(antiTrait, antiAliases);
      if (NS_FAILED(rv)) {
        NS_ERROR("trait service failed to get aliases");
        MOZ_LOG(BayesianFilterLogModule, LogLevel::Error,
                ("trait service failed to get aliases"));
      }
    }
    antiAliasArrays.AppendElement(antiAliases.Clone());
    uint32_t antiMessageCount = mCorpus.getMessageCount(antiTrait);
    for (uint32_t aliasIndex = 0; aliasIndex < antiAliases.Length();
         aliasIndex++)
      antiMessageCount += mCorpus.getMessageCount(antiAliases[aliasIndex]);
    numAntiMessages.AppendElement(antiMessageCount);
  }

  // compute and store a per-token probability for every (token, trait) pair
  for (uint32_t i = 0; i < tokenCount; ++i) {
    Token& token = tokens[i];
    CorpusToken* t = mCorpus.get(token.mWord);
    if (!t) continue;
    for (uint32_t traitIndex = 0; traitIndex < traitCount; traitIndex++) {
      uint32_t iProCount = mCorpus.getTraitCount(t, aProTraits[traitIndex]);
      // add in any counts for aliases to proTrait
      for (uint32_t aliasIndex = 0;
           aliasIndex < proAliasArrays[traitIndex].Length(); aliasIndex++)
        iProCount +=
            mCorpus.getTraitCount(t, proAliasArrays[traitIndex][aliasIndex]);
      double proCount = static_cast<double>(iProCount);

      uint32_t iAntiCount = mCorpus.getTraitCount(t, aAntiTraits[traitIndex]);
      // add in any counts for aliases to antiTrait
      for (uint32_t aliasIndex = 0;
           aliasIndex < antiAliasArrays[traitIndex].Length(); aliasIndex++)
        iAntiCount +=
            mCorpus.getTraitCount(t, antiAliasArrays[traitIndex][aliasIndex]);
      double antiCount = static_cast<double>(iAntiCount);

      double prob, denom;
      // Prevent a divide by zero error by setting defaults for prob

      // If there are no matching tokens at all, ignore.
      if (antiCount == 0.0 && proCount == 0.0) continue;
      // if only anti match, set probability to 0%
      if (proCount == 0.0) prob = 0.0;
      // if only pro match, set probability to 100%
      else if (antiCount == 0.0)
        prob = 1.0;
      // not really needed, but just to be sure check the denom as well
      else if ((denom = proCount * numAntiMessages[traitIndex] +
                        antiCount * numProMessages[traitIndex]) == 0.0)
        continue;
      else
        prob = (proCount * numAntiMessages[traitIndex]) / denom;

      // smooth the probability toward 0.5 when n (observations) is small
      double n = proCount + antiCount;
      prob = (0.225 + n * prob) / (.45 + n);
      double distance = std::abs(prob - 0.5);
      // only near-decisive tokens (far from neutral) are kept as clues
      if (distance >= .1) {
        mozilla::DebugOnly<nsresult> rv =
            setAnalysis(token, traitIndex, distance, prob);
        NS_ASSERTION(NS_SUCCEEDED(rv), "Problem in setAnalysis");
      }
    }
  }

  for (uint32_t traitIndex = 0; traitIndex < traitCount; traitIndex++) {
    AutoTArray<TraitAnalysis, 1024> traitAnalyses;
    // copy valid tokens into an array to sort
    for (uint32_t tokenIndex = 0; tokenIndex < tokenCount; tokenIndex++) {
      uint32_t storeIndex = getAnalysisIndex(tokens[tokenIndex], traitIndex);
      if (storeIndex) {
        TraitAnalysis ta = {tokenIndex, mAnalysisStore[storeIndex].mDistance,
                            mAnalysisStore[storeIndex].mProbability};
        traitAnalyses.AppendElement(ta);
      }
    }

    // sort the array by the distances
    traitAnalyses.Sort(compareTraitAnalysis());
    uint32_t count = traitAnalyses.Length();
    uint32_t first, last = count;
    // only the kMaxTokens strongest clues participate in the final score
    const uint32_t kMaxTokens = 150;
    first = (count > kMaxTokens) ? count - kMaxTokens : 0;

    // Setup the arrays to save details if needed
    nsTArray<double> sArray;
    nsTArray<double> hArray;
    uint32_t usedTokenCount = (count > kMaxTokens) ? kMaxTokens : count;
    if (aDetailListener) {
      sArray.SetCapacity(usedTokenCount);
      hArray.SetCapacity(usedTokenCount);
    }

    // S accumulates products of (1 - p), H products of p; exponents are
    // tracked separately via frexp to avoid floating-point underflow.
    double H = 1.0, S = 1.0;
    int32_t Hexp = 0, Sexp = 0;
    uint32_t goodclues = 0;
    int e;

    // index from end to analyze most significant first
    for (uint32_t ip1 = last; ip1 != first; --ip1) {
      TraitAnalysis& ta = traitAnalyses[ip1 - 1];
      if (ta.mDistance > 0.0) {
        goodclues++;
        double value = ta.mProbability;
        S *= (1.0 - value);
        H *= value;
        if (S < 1e-200) {
          S = frexp(S, &e);
          Sexp += e;
        }
        if (H < 1e-200) {
          H = frexp(H, &e);
          Hexp += e;
        }
        MOZ_LOG(BayesianFilterLogModule, LogLevel::Warning,
                ("token probability (%s) is %f", tokens[ta.mTokenIndex].mWord,
                 ta.mProbability));
      }
      if (aDetailListener) {
        // record the running log-scores for the detail report below
        sArray.AppendElement(log(S) + Sexp * M_LN2);
        hArray.AppendElement(log(H) + Hexp * M_LN2);
      }
    }

    S = log(S) + Sexp * M_LN2;
    H = log(H) + Hexp * M_LN2;

    // combine the two chi-squared tail probabilities into one score
    double prob;
    if (goodclues > 0) {
      int32_t chi_error;
      S = chi2P(-2.0 * S, 2.0 * goodclues, &chi_error);
      if (!chi_error) H = chi2P(-2.0 * H, 2.0 * goodclues, &chi_error);
      // if any error toss the entire calculation
      if (!chi_error)
        prob = (S - H + 1.0) / 2.0;
      else
        prob = 0.5;
    } else
      prob = 0.5;

    if (aDetailListener) {
      // Prepare output arrays
      nsTArray<uint32_t> tokenPercents(usedTokenCount);
      nsTArray<uint32_t> runningPercents(usedTokenCount);
      nsTArray<nsString> tokenStrings(usedTokenCount);

      // replay the combination clue-by-clue to report running percentages
      double clueCount = 1.0;
      for (uint32_t tokenIndex = 0; tokenIndex < usedTokenCount; tokenIndex++) {
        TraitAnalysis& ta = traitAnalyses[last - 1 - tokenIndex];
        int32_t chi_error;
        S = chi2P(-2.0 * sArray[tokenIndex], 2.0 * clueCount, &chi_error);
        if (!chi_error)
          H = chi2P(-2.0 * hArray[tokenIndex], 2.0 * clueCount, &chi_error);
        clueCount += 1.0;
        double runningProb;
        if (!chi_error)
          runningProb = (S - H + 1.0) / 2.0;
        else
          runningProb = 0.5;
        runningPercents.AppendElement(
            static_cast<uint32_t>(runningProb * 100. + .5));
        tokenPercents.AppendElement(
            static_cast<uint32_t>(ta.mProbability * 100. + .5));
        tokenStrings.AppendElement(
            NS_ConvertUTF8toUTF16(tokens[ta.mTokenIndex].mWord));
      }

      aDetailListener->OnMessageTraitDetails(messageURI, aProTraits[traitIndex],
                                             tokenStrings, tokenPercents,
                                             runningPercents);
    }

    uint32_t proPercent = static_cast<uint32_t>(prob * 100. + .5);

    // directly classify junk to maintain backwards compatibility
    if (aProTraits[traitIndex] == kJunkTrait) {
      bool isJunk = (prob >= mJunkProbabilityThreshold);
      MOZ_LOG(BayesianFilterLogModule, LogLevel::Info,
              ("%s is junk probability = (%f) HAM SCORE:%f SPAM SCORE:%f",
               PromiseFlatCString(messageURI).get(), prob, H, S));

      // the algorithm in "A Plan For Spam" assumes that you have a large good
      // corpus and a large junk corpus.
      // that won't be the case with users who first use the junk mail trait
      // so, we do certain things to encourage them to train.
      //
      // if there are no good tokens, assume the message is junk
      // this will "encourage" the user to train
      // and if there are no bad tokens, assume the message is not junk
      // this will also "encourage" the user to train
      // see bug #194238

      if (listener && !mCorpus.getMessageCount(kGoodTrait))
        isJunk = true;
      else if (listener && !mCorpus.getMessageCount(kJunkTrait))
        isJunk = false;

      if (listener)
        listener->OnMessageClassified(
            messageURI,
            isJunk ? nsMsgJunkStatus(nsIJunkMailPlugin::JUNK)
                   : nsMsgJunkStatus(nsIJunkMailPlugin::GOOD),
            proPercent);
    }

    if (aTraitListener) {
      traits.AppendElement(aProTraits[traitIndex]);
      percents.AppendElement(proPercent);
    }
  }

  if (aTraitListener)
    aTraitListener->OnMessageTraitsClassified(messageURI, traits, percents);

  delete[] tokens;
  // reuse mAnalysisStore without clearing memory
  mNextAnalysisIndex = 1;
  // but shrink it back to the default size
  if (mAnalysisStore.Length() > kAnalysisStoreCapacity)
    mAnalysisStore.RemoveElementsAt(
        kAnalysisStoreCapacity,
        mAnalysisStore.Length() - kAnalysisStoreCapacity);
  mAnalysisStore.Compact();
}
1633
classifyMessage(Tokenizer & tokens,const nsACString & messageURI,nsIJunkMailClassificationListener * aJunkListener)1634 void nsBayesianFilter::classifyMessage(
1635 Tokenizer& tokens, const nsACString& messageURI,
1636 nsIJunkMailClassificationListener* aJunkListener) {
1637 AutoTArray<uint32_t, 1> proTraits;
1638 AutoTArray<uint32_t, 1> antiTraits;
1639 proTraits.AppendElement(kJunkTrait);
1640 antiTraits.AppendElement(kGoodTrait);
1641 classifyMessage(tokens, messageURI, proTraits, antiTraits, aJunkListener,
1642 nullptr, nullptr);
1643 }
1644
1645 NS_IMETHODIMP
Observe(nsISupports * aSubject,const char * aTopic,const char16_t * someData)1646 nsBayesianFilter::Observe(nsISupports* aSubject, const char* aTopic,
1647 const char16_t* someData) {
1648 if (!strcmp(aTopic, "profile-before-change")) Shutdown();
1649 return NS_OK;
1650 }
1651
1652 /* void shutdown (); */
Shutdown()1653 NS_IMETHODIMP nsBayesianFilter::Shutdown() {
1654 if (mTrainingDataDirty) mCorpus.writeTrainingData(mMaximumTokenCount);
1655 mTrainingDataDirty = false;
1656
1657 return NS_OK;
1658 }
1659
/* readonly attribute boolean shouldDownloadAllHeaders; */
NS_IMETHODIMP nsBayesianFilter::GetShouldDownloadAllHeaders(
    bool* aShouldDownloadAllHeaders) {
  // bayesian filters work on the whole msg body currently.
  *aShouldDownloadAllHeaders = false;
  return NS_OK;
}
1667
/* void classifyMessage (in string aMsgURL, in nsIJunkMailClassificationListener
 * aListener); */
NS_IMETHODIMP nsBayesianFilter::ClassifyMessage(
    const nsACString& aMessageURL, nsIMsgWindow* aMsgWindow,
    nsIJunkMailClassificationListener* aListener) {
  // Single-message convenience path: build a one-element batch.
  AutoTArray<nsCString, 1> urls = {PromiseFlatCString(aMessageURL)};
  MessageClassifier* analyzer =
      new MessageClassifier(this, aListener, aMsgWindow, urls);
  NS_ENSURE_TRUE(analyzer, NS_ERROR_OUT_OF_MEMORY);
  // The stream listener takes ownership of the analyzer (deletes it in its
  // destructor); setTokenListener wires up the circular ref that keeps the
  // pair alive for the duration of the classification.
  TokenStreamListener* tokenListener = new TokenStreamListener(analyzer);
  NS_ENSURE_TRUE(tokenListener, NS_ERROR_OUT_OF_MEMORY);
  analyzer->setTokenListener(tokenListener);
  return tokenizeMessage(aMessageURL, aMsgWindow, analyzer);
}
1682
1683 /* void classifyMessages(in Array<ACString> aMsgURIs,
1684 * in nsIMsgWindow aMsgWindow,
1685 * in nsIJunkMailClassificationListener aListener); */
ClassifyMessages(const nsTArray<nsCString> & aMsgURLs,nsIMsgWindow * aMsgWindow,nsIJunkMailClassificationListener * aListener)1686 NS_IMETHODIMP nsBayesianFilter::ClassifyMessages(
1687 const nsTArray<nsCString>& aMsgURLs, nsIMsgWindow* aMsgWindow,
1688 nsIJunkMailClassificationListener* aListener) {
1689 TokenAnalyzer* analyzer =
1690 new MessageClassifier(this, aListener, aMsgWindow, aMsgURLs);
1691 NS_ENSURE_TRUE(analyzer, NS_ERROR_OUT_OF_MEMORY);
1692 TokenStreamListener* tokenListener = new TokenStreamListener(analyzer);
1693 NS_ENSURE_TRUE(tokenListener, NS_ERROR_OUT_OF_MEMORY);
1694 analyzer->setTokenListener(tokenListener);
1695 return tokenizeMessage(aMsgURLs[0], aMsgWindow, analyzer);
1696 }
1697
/**
 * Record (distance, probability) for a (token, trait) pair.
 *
 * Analyses live in mAnalysisStore as singly-linked chains: a token's
 * mAnalysisLink is the store index of its first AnalysisPerToken, and each
 * entry's mNextLink points to the next trait's entry for the same token
 * (index 0 terminates the chain — hence the dummy 0th store element).
 * Returns NS_ERROR_FAILURE if a chain exceeds maxLinks (treated as
 * corruption) or if the next store slot is out of sequence.
 */
nsresult nsBayesianFilter::setAnalysis(Token& token, uint32_t aTraitIndex,
                                       double aDistance, double aProbability) {
  uint32_t nextLink = token.mAnalysisLink;
  uint32_t lastLink = 0;
  uint32_t linkCount = 0, maxLinks = 100;

  // try to find an existing element. Limit the search to maxLinks
  // as a precaution
  for (linkCount = 0; nextLink && linkCount < maxLinks; linkCount++) {
    AnalysisPerToken& rAnalysis = mAnalysisStore[nextLink];
    if (rAnalysis.mTraitIndex == aTraitIndex) {
      // found an existing entry for this trait: update it in place
      rAnalysis.mDistance = aDistance;
      rAnalysis.mProbability = aProbability;
      return NS_OK;
    }
    lastLink = nextLink;
    nextLink = rAnalysis.mNextLink;
  }
  if (linkCount >= maxLinks) return NS_ERROR_FAILURE;

  // trait does not exist, so add it

  AnalysisPerToken analysis(aTraitIndex, aDistance, aProbability);
  if (mAnalysisStore.Length() == mNextAnalysisIndex)
    mAnalysisStore.InsertElementAt(mNextAnalysisIndex, analysis);
  else if (mAnalysisStore.Length() > mNextAnalysisIndex)
    // slot left over from an earlier classification: overwrite it
    mAnalysisStore.ReplaceElementsAt(mNextAnalysisIndex, 1, analysis);
  else  // we can only insert at the end of the array
    return NS_ERROR_FAILURE;

  if (lastLink)
    // the token had at least one link, so update the last link to point to
    // the new item
    mAnalysisStore[lastLink].mNextLink = mNextAnalysisIndex;
  else
    // need to update the token's first link
    token.mAnalysisLink = mNextAnalysisIndex;
  mNextAnalysisIndex++;
  return NS_OK;
}
1738
getAnalysisIndex(Token & token,uint32_t aTraitIndex)1739 uint32_t nsBayesianFilter::getAnalysisIndex(Token& token,
1740 uint32_t aTraitIndex) {
1741 uint32_t nextLink;
1742 uint32_t linkCount = 0, maxLinks = 100;
1743 for (nextLink = token.mAnalysisLink; nextLink && linkCount < maxLinks;
1744 linkCount++) {
1745 AnalysisPerToken& rAnalysis = mAnalysisStore[nextLink];
1746 if (rAnalysis.mTraitIndex == aTraitIndex) return nextLink;
1747 nextLink = rAnalysis.mNextLink;
1748 }
1749 NS_ASSERTION(linkCount < maxLinks, "corrupt analysis store");
1750
1751 // Trait not found, indicate by zero
1752 return 0;
1753 }
1754
ClassifyTraitsInMessage(const nsACString & aMsgURI,const nsTArray<uint32_t> & aProTraits,const nsTArray<uint32_t> & aAntiTraits,nsIMsgTraitClassificationListener * aTraitListener,nsIMsgWindow * aMsgWindow,nsIJunkMailClassificationListener * aJunkListener)1755 NS_IMETHODIMP nsBayesianFilter::ClassifyTraitsInMessage(
1756 const nsACString& aMsgURI, const nsTArray<uint32_t>& aProTraits,
1757 const nsTArray<uint32_t>& aAntiTraits,
1758 nsIMsgTraitClassificationListener* aTraitListener, nsIMsgWindow* aMsgWindow,
1759 nsIJunkMailClassificationListener* aJunkListener) {
1760 AutoTArray<nsCString, 1> uris = {PromiseFlatCString(aMsgURI)};
1761 return ClassifyTraitsInMessages(uris, aProTraits, aAntiTraits, aTraitListener,
1762 aMsgWindow, aJunkListener);
1763 }
1764
ClassifyTraitsInMessages(const nsTArray<nsCString> & aMsgURIs,const nsTArray<uint32_t> & aProTraits,const nsTArray<uint32_t> & aAntiTraits,nsIMsgTraitClassificationListener * aTraitListener,nsIMsgWindow * aMsgWindow,nsIJunkMailClassificationListener * aJunkListener)1765 NS_IMETHODIMP nsBayesianFilter::ClassifyTraitsInMessages(
1766 const nsTArray<nsCString>& aMsgURIs, const nsTArray<uint32_t>& aProTraits,
1767 const nsTArray<uint32_t>& aAntiTraits,
1768 nsIMsgTraitClassificationListener* aTraitListener, nsIMsgWindow* aMsgWindow,
1769 nsIJunkMailClassificationListener* aJunkListener) {
1770 MOZ_ASSERT(aProTraits.Length() == aAntiTraits.Length());
1771 MessageClassifier* analyzer =
1772 new MessageClassifier(this, aJunkListener, aTraitListener, nullptr,
1773 aProTraits, aAntiTraits, aMsgWindow, aMsgURIs);
1774
1775 TokenStreamListener* tokenListener = new TokenStreamListener(analyzer);
1776
1777 analyzer->setTokenListener(tokenListener);
1778 return tokenizeMessage(aMsgURIs[0], aMsgWindow, analyzer);
1779 }
1780
// TokenAnalyzer used for (re)training: once a message has been tokenized,
// it applies the old->new classification change to the filter's corpus via
// nsBayesianFilter::observeMessage().
class MessageObserver : public TokenAnalyzer {
 public:
  // Clones both trait arrays, so the caller's arrays need not outlive this
  // observer.
  MessageObserver(nsBayesianFilter* filter,
                  const nsTArray<uint32_t>& aOldClassifications,
                  const nsTArray<uint32_t>& aNewClassifications,
                  nsIJunkMailClassificationListener* aJunkListener,
                  nsIMsgTraitClassificationListener* aTraitListener)
      : mFilter(filter),
        mJunkMailPlugin(filter),
        mJunkListener(aJunkListener),
        mTraitListener(aTraitListener),
        mOldClassifications(aOldClassifications.Clone()),
        mNewClassifications(aNewClassifications.Clone()) {}

  // Called when tokenization completes: feed the token set and the
  // classification change into the filter.
  virtual void analyzeTokens(Tokenizer& tokenizer) {
    mFilter->observeMessage(tokenizer, mTokenSource, mOldClassifications,
                            mNewClassifications, mJunkListener, mTraitListener);
    // release reference to listener, which will allow us to go away as well.
    mTokenListener = nullptr;
  }

 private:
  nsBayesianFilter* mFilter;  // raw pointer to the owning filter
  // NOTE(review): presumably this COM reference to the same filter object
  // keeps it alive for the duration of the observation — verify.
  nsCOMPtr<nsIJunkMailPlugin> mJunkMailPlugin;
  nsCOMPtr<nsIJunkMailClassificationListener> mJunkListener;
  nsCOMPtr<nsIMsgTraitClassificationListener> mTraitListener;
  nsTArray<uint32_t> mOldClassifications;  // traits to untrain
  nsTArray<uint32_t> mNewClassifications;  // traits to train
};
1810
SetMsgTraitClassification(const nsACString & aMsgURI,const nsTArray<uint32_t> & aOldTraits,const nsTArray<uint32_t> & aNewTraits,nsIMsgTraitClassificationListener * aTraitListener,nsIMsgWindow * aMsgWindow,nsIJunkMailClassificationListener * aJunkListener)1811 NS_IMETHODIMP nsBayesianFilter::SetMsgTraitClassification(
1812 const nsACString& aMsgURI, const nsTArray<uint32_t>& aOldTraits,
1813 const nsTArray<uint32_t>& aNewTraits,
1814 nsIMsgTraitClassificationListener* aTraitListener, nsIMsgWindow* aMsgWindow,
1815 nsIJunkMailClassificationListener* aJunkListener) {
1816 MessageObserver* analyzer = new MessageObserver(
1817 this, aOldTraits, aNewTraits, aJunkListener, aTraitListener);
1818 NS_ENSURE_TRUE(analyzer, NS_ERROR_OUT_OF_MEMORY);
1819
1820 TokenStreamListener* tokenListener = new TokenStreamListener(analyzer);
1821 NS_ENSURE_TRUE(tokenListener, NS_ERROR_OUT_OF_MEMORY);
1822
1823 analyzer->setTokenListener(tokenListener);
1824 return tokenizeMessage(aMsgURI, aMsgWindow, analyzer);
1825 }
1826
1827 // set new message classifications for a message
observeMessage(Tokenizer & tokenizer,const nsACString & messageURL,nsTArray<uint32_t> & oldClassifications,nsTArray<uint32_t> & newClassifications,nsIJunkMailClassificationListener * aJunkListener,nsIMsgTraitClassificationListener * aTraitListener)1828 void nsBayesianFilter::observeMessage(
1829 Tokenizer& tokenizer, const nsACString& messageURL,
1830 nsTArray<uint32_t>& oldClassifications,
1831 nsTArray<uint32_t>& newClassifications,
1832 nsIJunkMailClassificationListener* aJunkListener,
1833 nsIMsgTraitClassificationListener* aTraitListener) {
1834 bool trainingDataWasDirty = mTrainingDataDirty;
1835
1836 // Uhoh...if the user is re-training then the message may already be
1837 // classified and we are classifying it again with the same classification.
1838 // the old code would have removed the tokens for this message then added them
1839 // back. But this really hurts the message occurrence count for tokens if you
1840 // just removed training.dat and are re-training. See Bug #237095 for more
1841 // details. What can we do here? Well we can skip the token removal step if
1842 // the classifications are the same and assume the user is just re-training.
1843 // But this then allows users to re-classify the same message on the same
1844 // training set over and over again leading to data skew. But that's all I can
1845 // think to do right now to address this.....
1846 uint32_t oldLength = oldClassifications.Length();
1847 for (uint32_t index = 0; index < oldLength; index++) {
1848 uint32_t trait = oldClassifications.ElementAt(index);
1849 // skip removing if trait is also in the new set
1850 if (newClassifications.Contains(trait)) continue;
1851 // remove the tokens from the token set it is currently in
1852 uint32_t messageCount;
1853 messageCount = mCorpus.getMessageCount(trait);
1854 if (messageCount > 0) {
1855 mCorpus.setMessageCount(trait, messageCount - 1);
1856 mCorpus.forgetTokens(tokenizer, trait, 1);
1857 mTrainingDataDirty = true;
1858 }
1859 }
1860
1861 nsMsgJunkStatus newClassification = nsIJunkMailPlugin::UNCLASSIFIED;
1862 uint32_t junkPercent =
1863 0; // 0 here is no possibility of meeting the classification
1864 uint32_t newLength = newClassifications.Length();
1865 for (uint32_t index = 0; index < newLength; index++) {
1866 uint32_t trait = newClassifications.ElementAt(index);
1867 mCorpus.setMessageCount(trait, mCorpus.getMessageCount(trait) + 1);
1868 mCorpus.rememberTokens(tokenizer, trait, 1);
1869 mTrainingDataDirty = true;
1870
1871 if (aJunkListener) {
1872 if (trait == kJunkTrait) {
1873 junkPercent = nsIJunkMailPlugin::IS_SPAM_SCORE;
1874 newClassification = nsIJunkMailPlugin::JUNK;
1875 } else if (trait == kGoodTrait) {
1876 junkPercent = nsIJunkMailPlugin::IS_HAM_SCORE;
1877 newClassification = nsIJunkMailPlugin::GOOD;
1878 }
1879 }
1880 }
1881
1882 if (aJunkListener)
1883 aJunkListener->OnMessageClassified(messageURL, newClassification,
1884 junkPercent);
1885
1886 if (aTraitListener) {
1887 // construct the outgoing listener arrays
1888 AutoTArray<uint32_t, kTraitAutoCapacity> traits;
1889 AutoTArray<uint32_t, kTraitAutoCapacity> percents;
1890 uint32_t newLength = newClassifications.Length();
1891 if (newLength > kTraitAutoCapacity) {
1892 traits.SetCapacity(newLength);
1893 percents.SetCapacity(newLength);
1894 }
1895 traits.AppendElements(newClassifications);
1896 for (uint32_t index = 0; index < newLength; index++)
1897 percents.AppendElement(100); // This is 100 percent, or certainty
1898 aTraitListener->OnMessageTraitsClassified(messageURL, traits, percents);
1899 }
1900
1901 if (mTrainingDataDirty && !trainingDataWasDirty && (mTimer != nullptr)) {
1902 // if training data became dirty just now, schedule flush
1903 // mMinFlushInterval msec from now
1904 MOZ_LOG(BayesianFilterLogModule, LogLevel::Debug,
1905 ("starting training data flush timer %i msec", mMinFlushInterval));
1906 mTimer->InitWithNamedFuncCallback(
1907 nsBayesianFilter::TimerCallback, this, mMinFlushInterval,
1908 nsITimer::TYPE_ONE_SHOT, "nsBayesianFilter::TimerCallback");
1909 }
1910 }
1911
GetUserHasClassified(bool * aResult)1912 NS_IMETHODIMP nsBayesianFilter::GetUserHasClassified(bool* aResult) {
1913 *aResult = ((mCorpus.getMessageCount(kGoodTrait) +
1914 mCorpus.getMessageCount(kJunkTrait)) &&
1915 mCorpus.countTokens());
1916 return NS_OK;
1917 }
1918
1919 // Set message classification (only allows junk and good)
SetMessageClassification(const nsACString & aMsgURL,nsMsgJunkStatus aOldClassification,nsMsgJunkStatus aNewClassification,nsIMsgWindow * aMsgWindow,nsIJunkMailClassificationListener * aListener)1920 NS_IMETHODIMP nsBayesianFilter::SetMessageClassification(
1921 const nsACString& aMsgURL, nsMsgJunkStatus aOldClassification,
1922 nsMsgJunkStatus aNewClassification, nsIMsgWindow* aMsgWindow,
1923 nsIJunkMailClassificationListener* aListener) {
1924 AutoTArray<uint32_t, 1> oldClassifications;
1925 AutoTArray<uint32_t, 1> newClassifications;
1926
1927 // convert between classifications and trait
1928 if (aOldClassification == nsIJunkMailPlugin::JUNK)
1929 oldClassifications.AppendElement(kJunkTrait);
1930 else if (aOldClassification == nsIJunkMailPlugin::GOOD)
1931 oldClassifications.AppendElement(kGoodTrait);
1932 if (aNewClassification == nsIJunkMailPlugin::JUNK)
1933 newClassifications.AppendElement(kJunkTrait);
1934 else if (aNewClassification == nsIJunkMailPlugin::GOOD)
1935 newClassifications.AppendElement(kGoodTrait);
1936
1937 MessageObserver* analyzer = new MessageObserver(
1938 this, oldClassifications, newClassifications, aListener, nullptr);
1939 NS_ENSURE_TRUE(analyzer, NS_ERROR_OUT_OF_MEMORY);
1940
1941 TokenStreamListener* tokenListener = new TokenStreamListener(analyzer);
1942 NS_ENSURE_TRUE(tokenListener, NS_ERROR_OUT_OF_MEMORY);
1943
1944 analyzer->setTokenListener(tokenListener);
1945 return tokenizeMessage(aMsgURL, aMsgWindow, analyzer);
1946 }
1947
// Forward to the corpus: wipes the in-memory tokens and message counts and
// removes the persisted training.dat / traits.dat files.
NS_IMETHODIMP nsBayesianFilter::ResetTrainingData() {
  return mCorpus.resetTrainingData();
}
1951
DetailMessage(const nsACString & aMsgURI,uint32_t aProTrait,uint32_t aAntiTrait,nsIMsgTraitDetailListener * aDetailListener,nsIMsgWindow * aMsgWindow)1952 NS_IMETHODIMP nsBayesianFilter::DetailMessage(
1953 const nsACString& aMsgURI, uint32_t aProTrait, uint32_t aAntiTrait,
1954 nsIMsgTraitDetailListener* aDetailListener, nsIMsgWindow* aMsgWindow) {
1955 AutoTArray<uint32_t, 1> proTraits = {aProTrait};
1956 AutoTArray<uint32_t, 1> antiTraits = {aAntiTrait};
1957 AutoTArray<nsCString, 1> uris = {PromiseFlatCString(aMsgURI)};
1958
1959 MessageClassifier* analyzer =
1960 new MessageClassifier(this, nullptr, nullptr, aDetailListener, proTraits,
1961 antiTraits, aMsgWindow, uris);
1962 NS_ENSURE_TRUE(analyzer, NS_ERROR_OUT_OF_MEMORY);
1963
1964 TokenStreamListener* tokenListener = new TokenStreamListener(analyzer);
1965 NS_ENSURE_TRUE(tokenListener, NS_ERROR_OUT_OF_MEMORY);
1966
1967 analyzer->setTokenListener(tokenListener);
1968 return tokenizeMessage(aMsgURI, aMsgWindow, analyzer);
1969 }
1970
1971 // nsIMsgCorpus implementation
1972
CorpusCounts(uint32_t aTrait,uint32_t * aMessageCount,uint32_t * aTokenCount)1973 NS_IMETHODIMP nsBayesianFilter::CorpusCounts(uint32_t aTrait,
1974 uint32_t* aMessageCount,
1975 uint32_t* aTokenCount) {
1976 NS_ENSURE_ARG_POINTER(aTokenCount);
1977 *aTokenCount = mCorpus.countTokens();
1978 if (aTrait && aMessageCount) *aMessageCount = mCorpus.getMessageCount(aTrait);
1979 return NS_OK;
1980 }
1981
// Forward to the corpus: zero out all counts recorded for |aTrait|.
NS_IMETHODIMP nsBayesianFilter::ClearTrait(uint32_t aTrait) {
  return mCorpus.ClearTrait(aTrait);
}
1985
// Forward to the corpus: merge (aIsAdd) or subtract the corpus data in
// |aFile|, remapping trait ids via the parallel aFromTraits/aToTraits
// arrays.
NS_IMETHODIMP
nsBayesianFilter::UpdateData(nsIFile* aFile, bool aIsAdd,
                             const nsTArray<uint32_t>& aFromTraits,
                             const nsTArray<uint32_t>& aToTraits) {
  MOZ_ASSERT(aFromTraits.Length() == aToTraits.Length());
  return mCorpus.UpdateData(aFile, aIsAdd, aFromTraits, aToTraits);
}
1993
1994 NS_IMETHODIMP
GetTokenCount(const nsACString & aWord,uint32_t aTrait,uint32_t * aCount)1995 nsBayesianFilter::GetTokenCount(const nsACString& aWord, uint32_t aTrait,
1996 uint32_t* aCount) {
1997 NS_ENSURE_ARG_POINTER(aCount);
1998 CorpusToken* t = mCorpus.get(PromiseFlatCString(aWord).get());
1999 uint32_t count = mCorpus.getTraitCount(t, aTrait);
2000 *aCount = count;
2001 return NS_OK;
2002 }
2003
2004 /* Corpus Store */
2005
2006 /*
2007 Format of the training file for version 1:
2008 [0xFEEDFACE]
2009 [number good messages][number bad messages]
2010 [number good tokens]
2011 [count][length of word]word
2012 ...
2013 [number bad tokens]
2014 [count][length of word]word
2015 ...
2016
2017 Format of the trait file for version 1:
2018 [0xFCA93601] (the 01 is the version)
2019 for each trait to write
2020 [id of trait to write] (0 means end of list)
2021 [number of messages per trait]
2022 for each token with non-zero count
2023 [count]
2024 [length of word]word
2025 */
2026
// Initialize the corpus hash and the trait store. Slot 0 of mTraitStore is
// a dummy element because a link value of 0 marks the end of a trait chain.
CorpusStore::CorpusStore()
    : TokenHash(sizeof(CorpusToken)),
      mNextTraitIndex(1)  // skip 0 since index=0 will mean end of linked list
{
  // Resolve <profile>/training.dat up front for later reads and writes.
  getTrainingFile(getter_AddRefs(mTrainingFile));
  mTraitStore.SetCapacity(kTraitStoreCapacity);
  TraitPerToken traitPT(0, 0);
  mTraitStore.AppendElement(traitPT);  // dummy 0th element
}
2036
~CorpusStore()2037 CorpusStore::~CorpusStore() {}
2038
writeUInt32(FILE * stream,uint32_t value)2039 inline int writeUInt32(FILE* stream, uint32_t value) {
2040 value = PR_htonl(value);
2041 return fwrite(&value, sizeof(uint32_t), 1, stream);
2042 }
2043
readUInt32(FILE * stream,uint32_t * value)2044 inline int readUInt32(FILE* stream, uint32_t* value) {
2045 int n = fread(value, sizeof(uint32_t), 1, stream);
2046 if (n == 1) {
2047 *value = PR_ntohl(*value);
2048 }
2049 return n;
2050 }
2051
forgetTokens(Tokenizer & aTokenizer,uint32_t aTraitId,uint32_t aCount)2052 void CorpusStore::forgetTokens(Tokenizer& aTokenizer, uint32_t aTraitId,
2053 uint32_t aCount) {
2054 // if we are forgetting the tokens for a message, should only
2055 // subtract 1 from the occurrence count for that token in the training set
2056 // because we assume we only bumped the training set count once per messages
2057 // containing the token.
2058 TokenEnumeration tokens = aTokenizer.getTokens();
2059 while (tokens.hasMoreTokens()) {
2060 CorpusToken* token = static_cast<CorpusToken*>(tokens.nextToken());
2061 remove(token->mWord, aTraitId, aCount);
2062 }
2063 }
2064
rememberTokens(Tokenizer & aTokenizer,uint32_t aTraitId,uint32_t aCount)2065 void CorpusStore::rememberTokens(Tokenizer& aTokenizer, uint32_t aTraitId,
2066 uint32_t aCount) {
2067 TokenEnumeration tokens = aTokenizer.getTokens();
2068 while (tokens.hasMoreTokens()) {
2069 CorpusToken* token = static_cast<CorpusToken*>(tokens.nextToken());
2070 if (!token) {
2071 NS_ERROR("null token");
2072 continue;
2073 }
2074 add(token->mWord, aTraitId, aCount);
2075 }
2076 }
2077
writeTokens(FILE * stream,bool shrink,uint32_t aTraitId)2078 bool CorpusStore::writeTokens(FILE* stream, bool shrink, uint32_t aTraitId) {
2079 uint32_t tokenCount = countTokens();
2080 uint32_t newTokenCount = 0;
2081
2082 // calculate the tokens for this trait to write
2083
2084 TokenEnumeration tokens = getTokens();
2085 for (uint32_t i = 0; i < tokenCount; ++i) {
2086 CorpusToken* token = static_cast<CorpusToken*>(tokens.nextToken());
2087 uint32_t count = getTraitCount(token, aTraitId);
2088 // Shrinking the token database is accomplished by dividing all token counts
2089 // by 2. If shrinking, we'll ignore counts < 2, otherwise only ignore counts
2090 // of < 1
2091 if ((shrink && count > 1) || (!shrink && count)) newTokenCount++;
2092 }
2093
2094 if (writeUInt32(stream, newTokenCount) != 1) return false;
2095
2096 if (newTokenCount > 0) {
2097 TokenEnumeration tokens = getTokens();
2098 for (uint32_t i = 0; i < tokenCount; ++i) {
2099 CorpusToken* token = static_cast<CorpusToken*>(tokens.nextToken());
2100 uint32_t wordCount = getTraitCount(token, aTraitId);
2101 if (shrink) wordCount /= 2;
2102 if (!wordCount) continue; // Don't output zero count words
2103 if (writeUInt32(stream, wordCount) != 1) return false;
2104 uint32_t tokenLength = strlen(token->mWord);
2105 if (writeUInt32(stream, tokenLength) != 1) return false;
2106 if (fwrite(token->mWord, tokenLength, 1, stream) != 1) return false;
2107 }
2108 }
2109 return true;
2110 }
2111
// Read token records ([count][word length][word]) from |stream| and merge
// them into the corpus for trait |aTraitId|: added when |aIsAdd| is true,
// subtracted otherwise. |fileSize| bounds the reads so a truncated or
// corrupt file cannot make us read past the end of the data.
// Returns false on structurally bad input; a short read mid-stream simply
// stops early and still returns true.
bool CorpusStore::readTokens(FILE* stream, int64_t fileSize, uint32_t aTraitId,
                             bool aIsAdd) {
  uint32_t tokenCount;
  if (readUInt32(stream, &tokenCount) != 1) return false;

  int64_t fpos = ftell(stream);
  if (fpos < 0) return false;

  uint32_t bufferSize = 4096;
  char* buffer = new char[bufferSize];
  if (!buffer) return false;

  for (uint32_t i = 0; i < tokenCount; ++i) {
    uint32_t count;
    if (readUInt32(stream, &count) != 1) break;
    uint32_t size;
    if (readUInt32(stream, &size) != 1) break;
    fpos += 8;  // account for the two uint32s just read
    if (fpos + size > fileSize) {
      // word would extend past the end of the file: corrupt data
      delete[] buffer;
      return false;
    }
    if (size >= bufferSize) {
      // grow the buffer by powers of two until the word plus its
      // terminating NUL fits
      delete[] buffer;
      while (size >= bufferSize) {
        bufferSize *= 2;
        if (bufferSize == 0) return false;  // overflow guard
      }
      buffer = new char[bufferSize];
      if (!buffer) return false;
    }
    if (fread(buffer, size, 1, stream) != 1) break;
    fpos += size;
    buffer[size] = '\0';  // safe: size < bufferSize is guaranteed above
    if (aIsAdd)
      add(buffer, aTraitId, count);
    else
      remove(buffer, aTraitId, count);
  }

  delete[] buffer;

  return true;
}
2156
getTrainingFile(nsIFile ** aTrainingFile)2157 nsresult CorpusStore::getTrainingFile(nsIFile** aTrainingFile) {
2158 // should we cache the profile manager's directory?
2159 nsCOMPtr<nsIFile> profileDir;
2160
2161 nsresult rv = NS_GetSpecialDirectory(NS_APP_USER_PROFILE_50_DIR,
2162 getter_AddRefs(profileDir));
2163 NS_ENSURE_SUCCESS(rv, rv);
2164 rv = profileDir->Append(u"training.dat"_ns);
2165 NS_ENSURE_SUCCESS(rv, rv);
2166
2167 return profileDir->QueryInterface(NS_GET_IID(nsIFile), (void**)aTrainingFile);
2168 }
2169
getTraitFile(nsIFile ** aTraitFile)2170 nsresult CorpusStore::getTraitFile(nsIFile** aTraitFile) {
2171 // should we cache the profile manager's directory?
2172 nsCOMPtr<nsIFile> profileDir;
2173
2174 nsresult rv = NS_GetSpecialDirectory(NS_APP_USER_PROFILE_50_DIR,
2175 getter_AddRefs(profileDir));
2176 NS_ENSURE_SUCCESS(rv, rv);
2177
2178 rv = profileDir->Append(u"traits.dat"_ns);
2179 NS_ENSURE_SUCCESS(rv, rv);
2180
2181 return profileDir->QueryInterface(NS_GET_IID(nsIFile), (void**)aTraitFile);
2182 }
2183
// Magic number identifying training.dat (0xFEEDFACE as raw bytes on disk).
static const char kMagicCookie[] = {'\xFE', '\xED', '\xFA', '\xCE'};

// random string used to identify trait file and version (last byte is version)
static const char kTraitCookie[] = {'\xFC', '\xA9', '\x36', '\x01'};
2188
writeTrainingData(uint32_t aMaximumTokenCount)2189 void CorpusStore::writeTrainingData(uint32_t aMaximumTokenCount) {
2190 MOZ_LOG(BayesianFilterLogModule, LogLevel::Debug,
2191 ("writeTrainingData() entered"));
2192 if (!mTrainingFile) return;
2193
2194 /*
2195 * For backwards compatibility, write the good and junk tokens to
2196 * training.dat; additional traits are added to a different file
2197 */
2198
2199 // open the file, and write out training data
2200 FILE* stream;
2201 nsresult rv = mTrainingFile->OpenANSIFileDesc("wb", &stream);
2202 if (NS_FAILED(rv)) return;
2203
2204 // If the number of tokens exceeds our limit, set the shrink flag
2205 bool shrink = false;
2206 if ((aMaximumTokenCount > 0) && // if 0, do not limit tokens
2207 (countTokens() > aMaximumTokenCount)) {
2208 shrink = true;
2209 MOZ_LOG(BayesianFilterLogModule, LogLevel::Warning,
2210 ("shrinking token data file"));
2211 }
2212
2213 // We implement shrink by dividing counts by two
2214 uint32_t shrinkFactor = shrink ? 2 : 1;
2215
2216 if (!((fwrite(kMagicCookie, sizeof(kMagicCookie), 1, stream) == 1) &&
2217 (writeUInt32(stream, getMessageCount(kGoodTrait) / shrinkFactor)) &&
2218 (writeUInt32(stream, getMessageCount(kJunkTrait) / shrinkFactor)) &&
2219 writeTokens(stream, shrink, kGoodTrait) &&
2220 writeTokens(stream, shrink, kJunkTrait))) {
2221 NS_WARNING("failed to write training data.");
2222 fclose(stream);
2223 // delete the training data file, since it is potentially corrupt.
2224 mTrainingFile->Remove(false);
2225 } else {
2226 fclose(stream);
2227 }
2228
2229 /*
2230 * Write the remaining data to a second file traits.dat
2231 */
2232
2233 if (!mTraitFile) {
2234 getTraitFile(getter_AddRefs(mTraitFile));
2235 if (!mTraitFile) return;
2236 }
2237
2238 // open the file, and write out training data
2239 rv = mTraitFile->OpenANSIFileDesc("wb", &stream);
2240 if (NS_FAILED(rv)) return;
2241
2242 uint32_t numberOfTraits = mMessageCounts.Length();
2243 bool error;
2244 while (1) // break on error or done
2245 {
2246 if ((error = (fwrite(kTraitCookie, sizeof(kTraitCookie), 1, stream) != 1)))
2247 break;
2248
2249 for (uint32_t index = 0; index < numberOfTraits; index++) {
2250 uint32_t trait = mMessageCountsId[index];
2251 if (trait == 1 || trait == 2)
2252 continue; // junk traits are stored in training.dat
2253 if ((error = (writeUInt32(stream, trait) != 1))) break;
2254 if ((error = (writeUInt32(stream, mMessageCounts[index] / shrinkFactor) !=
2255 1)))
2256 break;
2257 if ((error = !writeTokens(stream, shrink, trait))) break;
2258 }
2259 break;
2260 }
2261 // we add a 0 at the end to represent end of trait list
2262 error = writeUInt32(stream, 0) != 1;
2263
2264 fclose(stream);
2265 if (error) {
2266 NS_WARNING("failed to write trait data.");
2267 // delete the trait data file, since it is probably corrupt.
2268 mTraitFile->Remove(false);
2269 }
2270
2271 if (shrink) {
2272 // We'll clear the tokens, and read them back in from the file.
2273 // Yes this is slower than in place, but this is a rare event.
2274
2275 if (countTokens()) {
2276 clearTokens();
2277 for (uint32_t index = 0; index < numberOfTraits; index++)
2278 mMessageCounts[index] = 0;
2279 }
2280
2281 readTrainingData();
2282 }
2283 }
2284
readTrainingData()2285 void CorpusStore::readTrainingData() {
2286 /*
2287 * To maintain backwards compatibility, good and junk traits
2288 * are stored in a file "training.dat"
2289 */
2290 if (!mTrainingFile) return;
2291
2292 bool exists;
2293 nsresult rv = mTrainingFile->Exists(&exists);
2294 if (NS_FAILED(rv) || !exists) return;
2295
2296 FILE* stream;
2297 rv = mTrainingFile->OpenANSIFileDesc("rb", &stream);
2298 if (NS_FAILED(rv)) return;
2299
2300 int64_t fileSize;
2301 rv = mTrainingFile->GetFileSize(&fileSize);
2302 if (NS_FAILED(rv)) return;
2303
2304 // FIXME: should make sure that the tokenizers are empty.
2305 char cookie[4];
2306 uint32_t goodMessageCount = 0, junkMessageCount = 0;
2307 if (!((fread(cookie, sizeof(cookie), 1, stream) == 1) &&
2308 (memcmp(cookie, kMagicCookie, sizeof(cookie)) == 0) &&
2309 (readUInt32(stream, &goodMessageCount) == 1) &&
2310 (readUInt32(stream, &junkMessageCount) == 1) &&
2311 readTokens(stream, fileSize, kGoodTrait, true) &&
2312 readTokens(stream, fileSize, kJunkTrait, true))) {
2313 NS_WARNING("failed to read training data.");
2314 MOZ_LOG(BayesianFilterLogModule, LogLevel::Error,
2315 ("failed to read training data."));
2316 }
2317 setMessageCount(kGoodTrait, goodMessageCount);
2318 setMessageCount(kJunkTrait, junkMessageCount);
2319
2320 fclose(stream);
2321
2322 /*
2323 * Additional traits are stored in traits.dat
2324 */
2325
2326 if (!mTraitFile) {
2327 getTraitFile(getter_AddRefs(mTraitFile));
2328 if (!mTraitFile) return;
2329 }
2330
2331 rv = mTraitFile->Exists(&exists);
2332 if (NS_FAILED(rv) || !exists) return;
2333
2334 nsTArray<uint32_t> empty;
2335 rv = UpdateData(mTraitFile, true, empty, empty);
2336
2337 if (NS_FAILED(rv)) {
2338 NS_WARNING("failed to read training data.");
2339 MOZ_LOG(BayesianFilterLogModule, LogLevel::Error,
2340 ("failed to read training data."));
2341 }
2342 return;
2343 }
2344
resetTrainingData()2345 nsresult CorpusStore::resetTrainingData() {
2346 // clear out our in memory training tokens...
2347 if (countTokens()) clearTokens();
2348
2349 uint32_t length = mMessageCounts.Length();
2350 for (uint32_t index = 0; index < length; index++) mMessageCounts[index] = 0;
2351
2352 if (mTrainingFile) mTrainingFile->Remove(false);
2353 if (mTraitFile) mTraitFile->Remove(false);
2354 return NS_OK;
2355 }
2356
// Hash lookup of |word| in the corpus; result may be null when the word has
// never been added (callers check — see remove()).
inline CorpusToken* CorpusStore::get(const char* word) {
  return static_cast<CorpusToken*>(TokenHash::get(word));
}
2360
// Adjust the count stored on |token| for |aTraitId| by |aCountChange|
// (which may be negative). Counts live in mTraitStore as a singly-linked
// chain per token (threaded through mNextLink; index 0 terminates a chain).
// A new chain entry is created only for a positive change; counts are
// clamped at zero rather than going negative.
nsresult CorpusStore::updateTrait(CorpusToken* token, uint32_t aTraitId,
                                  int32_t aCountChange) {
  NS_ENSURE_ARG_POINTER(token);
  uint32_t nextLink = token->mTraitLink;
  uint32_t lastLink = 0;

  uint32_t linkCount, maxLinks = 100;  // sanity check
  for (linkCount = 0; nextLink && linkCount < maxLinks; linkCount++) {
    TraitPerToken& traitPT = mTraitStore[nextLink];
    if (traitPT.mId == aTraitId) {
      // be careful with signed versus unsigned issues here
      if (static_cast<int32_t>(traitPT.mCount) + aCountChange > 0)
        traitPT.mCount += aCountChange;
      else
        traitPT.mCount = 0;  // clamp: never store a negative count
      // we could delete zero count traits here, but let's not. It's rare
      // anyway.
      return NS_OK;
    }
    lastLink = nextLink;
    nextLink = traitPT.mNextLink;
  }
  // chain longer than the sanity bound: treat as corrupt
  if (linkCount >= maxLinks) return NS_ERROR_FAILURE;

  // trait does not exist, so add it

  if (aCountChange > 0)  // don't set a negative count
  {
    TraitPerToken traitPT(aTraitId, aCountChange);
    // mNextTraitIndex is where the new entry goes: append when it is one
    // past the end of the store, overwrite when a slot already exists there.
    if (mTraitStore.Length() == mNextTraitIndex)
      mTraitStore.InsertElementAt(mNextTraitIndex, traitPT);
    else if (mTraitStore.Length() > mNextTraitIndex)
      mTraitStore.ReplaceElementsAt(mNextTraitIndex, 1, traitPT);
    else
      return NS_ERROR_FAILURE;
    if (lastLink)
      // the token had a parent, so update it
      mTraitStore[lastLink].mNextLink = mNextTraitIndex;
    else
      // need to update the token's root link
      token->mTraitLink = mNextTraitIndex;
    mNextTraitIndex++;
  }
  return NS_OK;
}
2406
getTraitCount(CorpusToken * token,uint32_t aTraitId)2407 uint32_t CorpusStore::getTraitCount(CorpusToken* token, uint32_t aTraitId) {
2408 uint32_t nextLink;
2409 if (!token || !(nextLink = token->mTraitLink)) return 0;
2410
2411 uint32_t linkCount, maxLinks = 100; // sanity check
2412 for (linkCount = 0; nextLink && linkCount < maxLinks; linkCount++) {
2413 TraitPerToken& traitPT = mTraitStore[nextLink];
2414 if (traitPT.mId == aTraitId) return traitPT.mCount;
2415 nextLink = traitPT.mNextLink;
2416 }
2417 NS_ASSERTION(linkCount < maxLinks, "Corrupt trait count store");
2418
2419 // trait not found (or error), so count is zero
2420 return 0;
2421 }
2422
add(const char * word,uint32_t aTraitId,uint32_t aCount)2423 CorpusToken* CorpusStore::add(const char* word, uint32_t aTraitId,
2424 uint32_t aCount) {
2425 CorpusToken* token = static_cast<CorpusToken*>(TokenHash::add(word));
2426 if (token) {
2427 MOZ_LOG(BayesianFilterLogModule, LogLevel::Debug,
2428 ("adding word to corpus store: %s (Trait=%d) (deltaCount=%d)", word,
2429 aTraitId, aCount));
2430 updateTrait(token, aTraitId, aCount);
2431 }
2432 return token;
2433 }
2434
remove(const char * word,uint32_t aTraitId,uint32_t aCount)2435 void CorpusStore::remove(const char* word, uint32_t aTraitId, uint32_t aCount) {
2436 MOZ_LOG(BayesianFilterLogModule, LogLevel::Debug,
2437 ("remove word: %s (TraitId=%d) (Count=%d)", word, aTraitId, aCount));
2438 CorpusToken* token = get(word);
2439 if (token) updateTrait(token, aTraitId, -static_cast<int32_t>(aCount));
2440 }
2441
getMessageCount(uint32_t aTraitId)2442 uint32_t CorpusStore::getMessageCount(uint32_t aTraitId) {
2443 size_t index = mMessageCountsId.IndexOf(aTraitId);
2444 if (index == mMessageCountsId.NoIndex) return 0;
2445 return mMessageCounts.ElementAt(index);
2446 }
2447
setMessageCount(uint32_t aTraitId,uint32_t aCount)2448 void CorpusStore::setMessageCount(uint32_t aTraitId, uint32_t aCount) {
2449 size_t index = mMessageCountsId.IndexOf(aTraitId);
2450 if (index == mMessageCountsId.NoIndex) {
2451 mMessageCounts.AppendElement(aCount);
2452 mMessageCountsId.AppendElement(aTraitId);
2453 } else {
2454 mMessageCounts[index] = aCount;
2455 }
2456 }
2457
// Merge (aIsAdd) or subtract corpus data from a traits.dat-format |aFile|.
// aFromTraits/aToTraits are parallel arrays remapping trait ids found in
// the file onto local trait ids; empty arrays mean "use file ids as-is".
nsresult CorpusStore::UpdateData(nsIFile* aFile, bool aIsAdd,
                                 const nsTArray<uint32_t>& aFromTraits,
                                 const nsTArray<uint32_t>& aToTraits) {
  NS_ENSURE_ARG_POINTER(aFile);
  MOZ_ASSERT(aFromTraits.Length() == aToTraits.Length());

  int64_t fileSize;
  nsresult rv = aFile->GetFileSize(&fileSize);
  NS_ENSURE_SUCCESS(rv, rv);

  FILE* stream;
  rv = aFile->OpenANSIFileDesc("rb", &stream);
  NS_ENSURE_SUCCESS(rv, rv);

  bool error;
  do  // break on error or done
  {
    // the file must start with the trait cookie (magic bytes + version)
    char cookie[4];
    if ((error = (fread(cookie, sizeof(cookie), 1, stream) != 1))) break;

    if ((error = memcmp(cookie, kTraitCookie, sizeof(cookie)))) break;

    // each record is [trait id][message count][token data]; id 0 ends the
    // list
    uint32_t fileTrait;
    while (!(error = (readUInt32(stream, &fileTrait) != 1)) && fileTrait) {
      uint32_t count;
      if ((error = (readUInt32(stream, &count) != 1))) break;

      uint32_t localTrait = fileTrait;
      // remap the trait
      for (uint32_t i = 0; i < aFromTraits.Length(); i++) {
        if (aFromTraits[i] == fileTrait) localTrait = aToTraits[i];
      }

      // adjust the per-trait message count, clamping at zero on subtract
      uint32_t messageCount = getMessageCount(localTrait);
      if (aIsAdd)
        messageCount += count;
      else if (count > messageCount)
        messageCount = 0;
      else
        messageCount -= count;
      setMessageCount(localTrait, messageCount);

      if ((error = !readTokens(stream, fileSize, localTrait, aIsAdd))) break;
    }
    break;
  } while (0);

  fclose(stream);

  if (error) return NS_ERROR_FAILURE;
  return NS_OK;
}
2510
ClearTrait(uint32_t aTrait)2511 nsresult CorpusStore::ClearTrait(uint32_t aTrait) {
2512 // clear message counts
2513 setMessageCount(aTrait, 0);
2514
2515 TokenEnumeration tokens = getTokens();
2516 while (tokens.hasMoreTokens()) {
2517 CorpusToken* token = static_cast<CorpusToken*>(tokens.nextToken());
2518 int32_t wordCount = static_cast<int32_t>(getTraitCount(token, aTrait));
2519 updateTrait(token, aTrait, -wordCount);
2520 }
2521 return NS_OK;
2522 }
2523