1 /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
2 /* This Source Code Form is subject to the terms of the Mozilla Public
3  * License, v. 2.0. If a copy of the MPL was not distributed with this
4  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
5 
6 #include "nsBayesianFilter.h"
7 #include "nsIInputStream.h"
8 #include "nsIStreamListener.h"
9 #include "nsNetUtil.h"
10 #include "nsQuickSort.h"
11 #include "nsIMsgMessageService.h"
12 #include "nsMsgUtils.h"  // for GetMessageServiceFromURI
13 #include "prnetdb.h"
14 #include "nsIMsgWindow.h"
15 #include "mozilla/Logging.h"
16 #include "nsAppDirectoryServiceDefs.h"
17 #include "nsUnicharUtils.h"
18 #include "nsDirectoryServiceUtils.h"
19 #include "nsIMIMEHeaderParam.h"
20 #include "nsNetCID.h"
21 #include "nsMsgMimeCID.h"
22 #include "nsIMsgMailNewsUrl.h"
23 #include "nsIMimeMiscStatus.h"
24 #include "nsIPrefService.h"
25 #include "nsIPrefBranch.h"
26 #include "nsIStringEnumerator.h"
27 #include "nsIObserverService.h"
28 #include "nsIChannel.h"
29 #include "nsDependentSubstring.h"
30 #include "nsMemory.h"
31 
32 #include "mozilla/ArenaAllocatorExtensions.h"  // for ArenaStrdup
33 
34 using namespace mozilla;
35 
36 // needed to mark attachment flag on the db hdr
37 #include "nsIMsgHdr.h"
38 
39 // needed to strip html out of the body
40 #include "nsLayoutCID.h"
41 #include "nsIParserUtils.h"
42 #include "nsIDocumentEncoder.h"
43 
44 #include "nsIncompleteGamma.h"
45 #include <math.h>
46 #include <prmem.h>
47 #include "nsIMsgTraitService.h"
48 #include "mozilla/Services.h"
49 #include "mozilla/Attributes.h"
50 #include <cstdlib>  // for std::abs(int/long)
51 #include <cmath>    // for std::abs(float/double)
52 
53 static mozilla::LazyLogModule BayesianFilterLogModule("BayesianFilter");
54 
55 #define kDefaultJunkThreshold .99  // we override this value via a pref
56 static const char* kBayesianFilterTokenDelimiters = " \t\n\r\f.";
57 static unsigned int kMinLengthForToken =
58     3;  // lower bound on the number of characters in a word before we treat it
59         // as a token
60 static unsigned int kMaxLengthForToken =
61     12;  // upper bound on the number of characters in a word to be declared as
62          // a token
63 
64 #define FORGED_RECEIVED_HEADER_HINT "may be forged"_ns
65 
66 #ifndef M_LN2
67 #  define M_LN2 0.69314718055994530942
68 #endif
69 
70 #ifndef M_E
71 #  define M_E 2.7182818284590452354
72 #endif
73 
74 // provide base implementation of hash lookup of a string
75 struct BaseToken : public PLDHashEntryHdr {
76   const char* mWord;
77 };
78 
79 // token for a particular message
80 // mCount, mAnalysisLink are initialized to zero by the hash code
81 struct Token : public BaseToken {
82   uint32_t mCount;
83   uint32_t mAnalysisLink;  // index in mAnalysisStore of the AnalysisPerToken
84                            // object for the first trait for this token
85   // Helper to support Tokenizer::copyTokens()
cloneToken86   void clone(const Token& other) {
87     mWord = other.mWord;
88     mCount = other.mCount;
89     mAnalysisLink = other.mAnalysisLink;
90   }
91 };
92 
93 // token stored in a training file for a group of messages
94 // mTraitLink is initialized to 0 by the hash code
95 struct CorpusToken : public BaseToken {
96   uint32_t mTraitLink;  // index in mTraitStore of the TraitPerToken
97                         // object for the first trait for this token
98 };
99 
100 // set the value of a TraitPerToken object
TraitPerToken(uint32_t aTraitId,uint32_t aCount)101 TraitPerToken::TraitPerToken(uint32_t aTraitId, uint32_t aCount)
102     : mId(aTraitId), mCount(aCount), mNextLink(0) {}
103 
104 // shorthand representations of trait ids for junk and good
105 static const uint32_t kJunkTrait = nsIJunkMailPlugin::JUNK_TRAIT;
106 static const uint32_t kGoodTrait = nsIJunkMailPlugin::GOOD_TRAIT;
107 
108 // set the value of an AnalysisPerToken object
AnalysisPerToken(uint32_t aTraitIndex,double aDistance,double aProbability)109 AnalysisPerToken::AnalysisPerToken(uint32_t aTraitIndex, double aDistance,
110                                    double aProbability)
111     : mTraitIndex(aTraitIndex),
112       mDistance(aDistance),
113       mProbability(aProbability),
114       mNextLink(0) {}
115 
// Initial size of the storage backing the AnalysisPerToken linked lists.
const uint32_t kAnalysisStoreCapacity = 2048;

// Initial size of the storage backing the TraitPerToken linked lists.
const uint32_t kTraitStoreCapacity = 16384;

// Inline capacity of the Auto arrays that hold per-trait information.
const uint32_t kTraitAutoCapacity = 10;
124 
TokenEnumeration(PLDHashTable * table)125 TokenEnumeration::TokenEnumeration(PLDHashTable* table)
126     : mIterator(table->Iter()) {}
127 
hasMoreTokens()128 inline bool TokenEnumeration::hasMoreTokens() { return !mIterator.Done(); }
129 
nextToken()130 inline BaseToken* TokenEnumeration::nextToken() {
131   auto token = static_cast<BaseToken*>(mIterator.Get());
132   mIterator.Next();
133   return token;
134 }
135 
136 // member variables
137 static const PLDHashTableOps gTokenTableOps = {
138     PLDHashTable::HashStringKey, PLDHashTable::MatchStringKey,
139     PLDHashTable::MoveEntryStub, PLDHashTable::ClearEntryStub, nullptr};
140 
TokenHash(uint32_t aEntrySize)141 TokenHash::TokenHash(uint32_t aEntrySize)
142     : mTokenTable(&gTokenTableOps, aEntrySize, 128) {
143   mEntrySize = aEntrySize;
144 }
145 
~TokenHash()146 TokenHash::~TokenHash() {}
147 
clearTokens()148 nsresult TokenHash::clearTokens() {
149   // we re-use the tokenizer when classifying multiple messages,
150   // so this gets called after every message classification.
151   mTokenTable.ClearAndPrepareForLength(128);
152   mWordPool.Clear();
153   return NS_OK;
154 }
155 
copyWord(const char * word,uint32_t len)156 char* TokenHash::copyWord(const char* word, uint32_t len) {
157   return ArenaStrdup(Substring(word, len), mWordPool);
158 }
159 
get(const char * word)160 inline BaseToken* TokenHash::get(const char* word) {
161   PLDHashEntryHdr* entry = mTokenTable.Search(word);
162   if (entry) return static_cast<BaseToken*>(entry);
163   return NULL;
164 }
165 
add(const char * word)166 BaseToken* TokenHash::add(const char* word) {
167   if (!word || !*word) {
168     NS_ERROR("Trying to add a null word");
169     return nullptr;
170   }
171 
172   MOZ_LOG(BayesianFilterLogModule, LogLevel::Debug, ("add word: %s", word));
173 
174   PLDHashEntryHdr* entry = mTokenTable.Add(word, mozilla::fallible);
175   BaseToken* token = static_cast<BaseToken*>(entry);
176   if (token) {
177     if (token->mWord == NULL) {
178       uint32_t len = strlen(word);
179       NS_ASSERTION(len != 0, "adding zero length word to tokenizer");
180       if (!len)
181         MOZ_LOG(BayesianFilterLogModule, LogLevel::Debug,
182                 ("adding zero length word to tokenizer"));
183       token->mWord = copyWord(word, len);
184       NS_ASSERTION(token->mWord, "copyWord failed");
185       if (!token->mWord) {
186         MOZ_LOG(BayesianFilterLogModule, LogLevel::Error,
187                 ("copyWord failed: %s (%d)", word, len));
188         mTokenTable.RawRemove(entry);
189         return NULL;
190       }
191     }
192   }
193   return token;
194 }
195 
countTokens()196 inline uint32_t TokenHash::countTokens() { return mTokenTable.EntryCount(); }
197 
getTokens()198 inline TokenEnumeration TokenHash::getTokens() {
199   return TokenEnumeration(&mTokenTable);
200 }
201 
Tokenizer()202 Tokenizer::Tokenizer()
203     : TokenHash(sizeof(Token)),
204       mBodyDelimiters(kBayesianFilterTokenDelimiters),
205       mHeaderDelimiters(kBayesianFilterTokenDelimiters),
206       mCustomHeaderTokenization(false),
207       mMaxLengthForToken(kMaxLengthForToken),
208       mIframeToDiv(false) {
209   nsresult rv;
210   nsCOMPtr<nsIPrefService> prefs =
211       do_GetService(NS_PREFSERVICE_CONTRACTID, &rv);
212   NS_ENSURE_SUCCESS_VOID(rv);
213 
214   nsCOMPtr<nsIPrefBranch> prefBranch;
215   rv = prefs->GetBranch("mailnews.bayesian_spam_filter.",
216                         getter_AddRefs(prefBranch));
217   NS_ENSURE_SUCCESS_VOID(rv);  // no branch defined, just use defaults
218 
219   /*
220    * RSS feeds store their summary as alternate content of an iframe. But due
221    * to bug 365953, this is not seen by the serializer. As a workaround, allow
222    * the tokenizer to replace the iframe with div for tokenization.
223    */
224   rv = prefBranch->GetBoolPref("iframe_to_div", &mIframeToDiv);
225   if (NS_FAILED(rv)) mIframeToDiv = false;
226 
227   /*
228    * the list of delimiters used to tokenize the message and body
229    * defaults to the value in kBayesianFilterTokenDelimiters, but may be
230    * set with the following preferences for the body and header
231    * separately.
232    *
233    * \t, \n, \v, \f, \r, and \\ will be escaped to their normal
234    * C-library values, all other two-letter combinations beginning with \
235    * will be ignored.
236    */
237 
238   prefBranch->GetCharPref("body_delimiters", mBodyDelimiters);
239   if (!mBodyDelimiters.IsEmpty())
240     UnescapeCString(mBodyDelimiters);
241   else  // prefBranch empties the result when it fails :(
242     mBodyDelimiters.Assign(kBayesianFilterTokenDelimiters);
243 
244   prefBranch->GetCharPref("header_delimiters", mHeaderDelimiters);
245   if (!mHeaderDelimiters.IsEmpty())
246     UnescapeCString(mHeaderDelimiters);
247   else
248     mHeaderDelimiters.Assign(kBayesianFilterTokenDelimiters);
249 
250   /*
251    * Extensions may wish to enable or disable tokenization of certain headers.
252    * Define any headers to enable/disable in a string preference like this:
253    *   "mailnews.bayesian_spam_filter.tokenizeheader.headername"
254    *
255    * where "headername" is the header to tokenize. For example, to tokenize the
256    * header "x-spam-status" use the preference:
257    *
258    *   "mailnews.bayesian_spam_filter.tokenizeheader.x-spam-status"
259    *
260    * The value of the string preference will be interpreted in one of
261    * four ways, depending on the value:
262    *
263    *   If "false" then do not tokenize that header
264    *   If "full" then add the entire header value as a token,
265    *     without breaking up into subtokens using delimiters
266    *   If "standard" then tokenize the header using as delimiters the current
267    *     value of the generic header delimiters
268    *   Any other string is interpreted as a list of delimiters to use to parse
269    *     the header. \t, \n, \v, \f, \r, and \\ will be escaped to their normal
270    *     C-library values, all other two-letter combinations beginning with \
271    *     will be ignored.
272    *
273    * Header names in the preference should be all lower case
274    *
275    * Extensions may also set the maximum length of a token (default is
276    * kMaxLengthForToken) by setting the int preference:
277    *   "mailnews.bayesian_spam_filter.maxlengthfortoken"
278    */
279 
280   nsTArray<nsCString> headers;
281 
282   // get customized maximum token length
283   int32_t maxLengthForToken;
284   rv = prefBranch->GetIntPref("maxlengthfortoken", &maxLengthForToken);
285   mMaxLengthForToken =
286       NS_SUCCEEDED(rv) ? uint32_t(maxLengthForToken) : kMaxLengthForToken;
287 
288   rv = prefs->GetBranch("mailnews.bayesian_spam_filter.tokenizeheader.",
289                         getter_AddRefs(prefBranch));
290   if (NS_SUCCEEDED(rv)) rv = prefBranch->GetChildList("", headers);
291 
292   if (NS_SUCCEEDED(rv)) {
293     mCustomHeaderTokenization = true;
294     for (auto& header : headers) {
295       nsCString value;
296       prefBranch->GetCharPref(header.get(), value);
297       if (value.EqualsLiteral("false")) {
298         mDisabledHeaders.AppendElement(header);
299         continue;
300       }
301       mEnabledHeaders.AppendElement(header);
302       if (value.EqualsLiteral("standard"))
303         value.SetIsVoid(true);  // Void means use default delimiter
304       else if (value.EqualsLiteral("full"))
305         value.Truncate();  // Empty means add full header
306       else
307         UnescapeCString(value);
308       mEnabledHeadersDelimiters.AppendElement(value);
309     }
310   }
311 }
312 
~Tokenizer()313 Tokenizer::~Tokenizer() {}
314 
get(const char * word)315 inline Token* Tokenizer::get(const char* word) {
316   return static_cast<Token*>(TokenHash::get(word));
317 }
318 
add(const char * word,uint32_t count)319 Token* Tokenizer::add(const char* word, uint32_t count) {
320   MOZ_LOG(BayesianFilterLogModule, LogLevel::Debug,
321           ("add word: %s (count=%d)", word, count));
322 
323   Token* token = static_cast<Token*>(TokenHash::add(word));
324   if (token) {
325     token->mCount += count;  // hash code initializes this to zero
326     MOZ_LOG(BayesianFilterLogModule, LogLevel::Debug,
327             ("adding word to tokenizer: %s (count=%d) (mCount=%d)", word, count,
328              token->mCount));
329   }
330   return token;
331 }
332 
// Returns true when |word| consists solely of decimal digits, optionally
// preceded by a single '-'. Note that the empty string and a lone "-" also
// count as numbers, since they contain no non-digit character.
static bool isDecimalNumber(const char* word) {
  const char* cursor = word;
  if (*cursor == '-') {
    ++cursor;
  }
  for (; *cursor != '\0'; ++cursor) {
    if (!isdigit(static_cast<unsigned char>(*cursor))) {
      return false;
    }
  }
  return true;
}
342 
// Returns true when every byte of the NUL-terminated |word| is in the 7-bit
// ASCII range (0..127). The empty string is ASCII.
static bool isASCII(const char* word) {
  for (const unsigned char* cursor =
           reinterpret_cast<const unsigned char*>(word);
       *cursor != 0; ++cursor) {
    if (*cursor & 0x80) {
      return false;
    }
  }
  return true;
}
351 
// True for the ASCII capital letters 'A'..'Z' only.
inline bool isUpperCase(char c) { return c >= 'A' && c <= 'Z'; }
353 
toLowerCase(char * str)354 static char* toLowerCase(char* str) {
355   char c, *p = str;
356   while ((c = *p++)) {
357     if (isUpperCase(c)) p[-1] = c + ('a' - 'A');
358   }
359   return str;
360 }
361 
addTokenForHeader(const char * aTokenPrefix,nsACString & aValue,bool aTokenizeValue,const char * aDelimiters)362 void Tokenizer::addTokenForHeader(const char* aTokenPrefix, nsACString& aValue,
363                                   bool aTokenizeValue,
364                                   const char* aDelimiters) {
365   if (aValue.Length()) {
366     ToLowerCase(aValue);
367     if (!aTokenizeValue) {
368       nsCString tmpStr;
369       tmpStr.Assign(aTokenPrefix);
370       tmpStr.Append(':');
371       tmpStr.Append(aValue);
372 
373       add(tmpStr.get());
374     } else {
375       char* word;
376       nsCString str(aValue);
377       char* next = str.BeginWriting();
378       const char* delimiters =
379           !aDelimiters ? mHeaderDelimiters.get() : aDelimiters;
380       while ((word = NS_strtok(delimiters, &next)) != NULL) {
381         if (strlen(word) < kMinLengthForToken) continue;
382         if (isDecimalNumber(word)) continue;
383         if (isASCII(word)) {
384           nsCString tmpStr;
385           tmpStr.Assign(aTokenPrefix);
386           tmpStr.Append(':');
387           tmpStr.Append(word);
388           add(tmpStr.get());
389         }
390       }
391     }
392   }
393 }
394 
tokenizeAttachment(const char * aContentType,const char * aFileName)395 void Tokenizer::tokenizeAttachment(const char* aContentType,
396                                    const char* aFileName) {
397   nsAutoCString contentType;
398   nsAutoCString fileName;
399   fileName.Assign(aFileName);
400   contentType.Assign(aContentType);
401 
402   // normalize the content type and the file name
403   ToLowerCase(fileName);
404   ToLowerCase(contentType);
405   addTokenForHeader("attachment/filename", fileName);
406 
407   addTokenForHeader("attachment/content-type", contentType);
408 }
409 
tokenizeHeaders(nsIUTF8StringEnumerator * aHeaderNames,nsIUTF8StringEnumerator * aHeaderValues)410 void Tokenizer::tokenizeHeaders(nsIUTF8StringEnumerator* aHeaderNames,
411                                 nsIUTF8StringEnumerator* aHeaderValues) {
412   nsCString headerValue;
413   nsAutoCString
414       headerName;  // we'll be normalizing all header names to lower case
415   bool hasMore;
416 
417   while (NS_SUCCEEDED(aHeaderNames->HasMore(&hasMore)) && hasMore) {
418     aHeaderNames->GetNext(headerName);
419     ToLowerCase(headerName);
420     aHeaderValues->GetNext(headerValue);
421 
422     bool headerProcessed = false;
423     if (mCustomHeaderTokenization) {
424       // Process any exceptions set from preferences
425       for (uint32_t i = 0; i < mEnabledHeaders.Length(); i++)
426         if (headerName.Equals(mEnabledHeaders[i])) {
427           if (mEnabledHeadersDelimiters[i].IsVoid())
428             // tokenize with standard delimiters for all headers
429             addTokenForHeader(headerName.get(), headerValue, true);
430           else if (mEnabledHeadersDelimiters[i].IsEmpty())
431             // do not break the header into tokens
432             addTokenForHeader(headerName.get(), headerValue);
433           else
434             // use the delimiter in mEnabledHeadersDelimiters
435             addTokenForHeader(headerName.get(), headerValue, true,
436                               mEnabledHeadersDelimiters[i].get());
437           headerProcessed = true;
438           break;  // we found the header, no need to look for more custom values
439         }
440 
441       for (uint32_t i = 0; i < mDisabledHeaders.Length(); i++) {
442         if (headerName.Equals(mDisabledHeaders[i])) {
443           headerProcessed = true;
444           break;
445         }
446       }
447 
448       if (headerProcessed) continue;
449     }
450 
451     switch (headerName.First()) {
452       case 'c':
453         if (headerName.EqualsLiteral("content-type")) {
454           nsresult rv;
455           nsCOMPtr<nsIMIMEHeaderParam> mimehdrpar =
456               do_GetService(NS_MIMEHEADERPARAM_CONTRACTID, &rv);
457           if (NS_FAILED(rv)) break;
458 
459           // extract the charset parameter
460           nsCString parameterValue;
461           mimehdrpar->GetParameterInternal(headerValue.get(), "charset",
462                                            nullptr, nullptr,
463                                            getter_Copies(parameterValue));
464           addTokenForHeader("charset", parameterValue);
465 
466           // create a token containing just the content type
467           mimehdrpar->GetParameterInternal(headerValue.get(), "type", nullptr,
468                                            nullptr,
469                                            getter_Copies(parameterValue));
470           if (!parameterValue.Length())
471             mimehdrpar->GetParameterInternal(
472                 headerValue.get(), nullptr /* use first unnamed param */,
473                 nullptr, nullptr, getter_Copies(parameterValue));
474           addTokenForHeader("content-type/type", parameterValue);
475 
476           // XXX: should we add a token for the entire content-type header as
477           // well or just these parts we have extracted?
478         }
479         break;
480       case 'r':
481         if (headerName.EqualsLiteral("received")) {
482           // look for the string "may be forged" in the received headers.
483           // sendmail sometimes adds this hint This does not compile on linux
484           // yet. Need to figure out why. Commenting out for now if
485           // (FindInReadable(FORGED_RECEIVED_HEADER_HINT, headerValue))
486           //   addTokenForHeader(headerName.get(), FORGED_RECEIVED_HEADER_HINT);
487         }
488 
489         // leave out reply-to
490         break;
491       case 's':
492         if (headerName.EqualsLiteral("subject")) {
493           // we want to tokenize the subject
494           addTokenForHeader(headerName.get(), headerValue, true);
495         }
496 
497         // important: leave out sender field. Too strong of an indicator
498         break;
499       case 'x':  // (2) X-Mailer / user-agent works best if it is untokenized,
500                  // just fold the case and any leading/trailing white space
501         // all headers beginning with x-mozilla are being changed by us, so
502         // ignore
503         if (StringBeginsWith(headerName, "x-mozilla"_ns)) break;
504         // fall through
505         [[fallthrough]];
506       case 'u':
507         addTokenForHeader(headerName.get(), headerValue);
508         break;
509       default:
510         addTokenForHeader(headerName.get(), headerValue);
511         break;
512     }  // end switch
513   }
514 }
515 
tokenize_ascii_word(char * aWord)516 void Tokenizer::tokenize_ascii_word(char* aWord) {
517   // always deal with normalized lower case strings
518   toLowerCase(aWord);
519   uint32_t wordLength = strlen(aWord);
520 
521   // if the wordLength is within our accepted token limit, then add it
522   if (wordLength >= kMinLengthForToken && wordLength <= mMaxLengthForToken)
523     add(aWord);
524   else if (wordLength > mMaxLengthForToken) {
525     // don't skip over the word if it looks like an email address,
526     // there is value in adding tokens for addresses
527     nsDependentCString word(aWord,
528                             wordLength);  // CHEAP, no allocation occurs here...
529 
530     // XXX: i think the 40 byte check is just for perf reasons...if the email
531     // address is longer than that then forget about it.
532     const char* atSign = strchr(aWord, '@');
533     if (wordLength < 40 && strchr(aWord, '.') && atSign &&
534         !strchr(atSign + 1, '@')) {
535       uint32_t numBytesToSep = atSign - aWord;
536       if (numBytesToSep <
537           wordLength - 1)  // if the @ sign is the last character, it must not
538                            // be an email address
539       {
540         // split the john@foo.com into john and foo.com, treat them as separate
541         // tokens
542         nsCString emailNameToken;
543         emailNameToken.AssignLiteral("email name:");
544         emailNameToken.Append(Substring(word, 0, numBytesToSep++));
545         add(emailNameToken.get());
546         nsCString emailAddrToken;
547         emailAddrToken.AssignLiteral("email addr:");
548         emailAddrToken.Append(
549             Substring(word, numBytesToSep, wordLength - numBytesToSep));
550         add(emailAddrToken.get());
551         return;
552       }
553     }
554 
555     // there is value in generating a token indicating the number
556     // of characters we are skipping. We'll round to the nearest 10
557     nsCString skipToken;
558     skipToken.AssignLiteral("skip:");
559     skipToken.Append(word[0]);
560     skipToken.Append(' ');
561     skipToken.AppendInt((wordLength / 10) * 10);
562     add(skipToken.get());
563   }
564 }
565 
// Range test on a 16-bit code unit: true when low <= x <= high.
// One subtract and one conditional jump should be faster than two conditional
// jumps on most recent systems. Values below |low| wrap around to a large
// uint16_t and therefore fail the comparison.
#define IN_RANGE(x, low, high) ((uint16_t)((x) - (low)) <= (high) - (low))

#define IS_JA_HIRAGANA(x) IN_RANGE(x, 0x3040, 0x309F)
// Swapping the range using an xor operation to reduce conditional jumps: the
// xor moves U+30FB out of the tested range while keeping U+30FC..U+30FF in.
// The argument is parenthesized so that expressions such as (a | b) are
// xor-ed as a whole.
#define IS_JA_KATAKANA(x) \
  (IN_RANGE((x) ^ 0x0004, 0x30A0, 0x30FE) || (IN_RANGE(x, 0xFF66, 0xFF9F)))
#define IS_JA_KANJI(x) \
  (IN_RANGE(x, 0x2E80, 0x2FDF) || IN_RANGE(x, 0x4E00, 0x9FAF))
// NOTE(review): U+3001 is the ideographic comma and U+3002 the ideographic
// full stop, so the KUTEN/TOUTEN names look swapped. The visible code only
// compares classes for equality, so the sets are kept as they are.
#define IS_JA_KUTEN(x) (((x) == 0x3001) || ((x) == 0xFF64) || ((x) == 0xFF0E))
#define IS_JA_TOUTEN(x) (((x) == 0x3002) || ((x) == 0xFF61) || ((x) == 0xFF0C))
#define IS_JA_SPACE(x) ((x) == 0x3000)
#define IS_JA_FWLATAIN(x) IN_RANGE(x, 0xFF01, 0xFF5E)
#define IS_JA_FWNUMERAL(x) IN_RANGE(x, 0xFF10, 0xFF19)

// Code units that mark a chunk as Japanese: U+3040..U+30FF and
// U+FF01..U+FF9F.
#define IS_JAPANESE_SPECIFIC(x) \
  (IN_RANGE(x, 0x3040, 0x30FF) || IN_RANGE(x, 0xFF01, 0xFF9F))
584 
// Character classes used to split Japanese text into runs. Only a subset is
// produced by getCharClass(); the numeric values are spelled out but are the
// same ones the compiler assigned implicitly before.
enum char_class {
  others = 0,    // anything not covered by another class
  space = 1,
  hiragana = 2,
  katakana = 3,
  kanji = 4,
  kuten = 5,
  touten = 6,
  kigou = 7,
  fwlatain = 8,  // full-width latin
  ascii = 9
};
597 
getCharClass(char16_t c)598 static char_class getCharClass(char16_t c) {
599   char_class charClass = others;
600 
601   if (IS_JA_HIRAGANA(c))
602     charClass = hiragana;
603   else if (IS_JA_KATAKANA(c))
604     charClass = katakana;
605   else if (IS_JA_KANJI(c))
606     charClass = kanji;
607   else if (IS_JA_KUTEN(c))
608     charClass = kuten;
609   else if (IS_JA_TOUTEN(c))
610     charClass = touten;
611   else if (IS_JA_FWLATAIN(c))
612     charClass = fwlatain;
613 
614   return charClass;
615 }
616 
isJapanese(const char * word)617 static bool isJapanese(const char* word) {
618   nsString text = NS_ConvertUTF8toUTF16(word);
619   char16_t* p = (char16_t*)text.get();
620   char16_t c;
621 
622   // it is japanese chunk if it contains any hiragana or katakana.
623   while ((c = *p++))
624     if (IS_JAPANESE_SPECIFIC(c)) return true;
625 
626   return false;
627 }
628 
isFWNumeral(const char16_t * p1,const char16_t * p2)629 static bool isFWNumeral(const char16_t* p1, const char16_t* p2) {
630   for (; p1 < p2; p1++)
631     if (!IS_JA_FWNUMERAL(*p1)) return false;
632 
633   return true;
634 }
635 
636 // The japanese tokenizer was added as part of Bug #277354
tokenize_japanese_word(char * chunk)637 void Tokenizer::tokenize_japanese_word(char* chunk) {
638   MOZ_LOG(BayesianFilterLogModule, LogLevel::Debug,
639           ("entering tokenize_japanese_word(%s)", chunk));
640 
641   nsString srcStr = NS_ConvertUTF8toUTF16(chunk);
642   const char16_t* p1 = srcStr.get();
643   const char16_t* p2 = p1;
644   if (!*p2) return;
645 
646   char_class cc = getCharClass(*p2);
647   while (*(++p2)) {
648     if (cc == getCharClass(*p2)) continue;
649 
650     nsCString token = NS_ConvertUTF16toUTF8(p1, p2 - p1);
651     if ((!isDecimalNumber(token.get())) && (!isFWNumeral(p1, p2))) {
652       nsCString tmpStr;
653       tmpStr.AppendLiteral("JA:");
654       tmpStr.Append(token);
655       add(tmpStr.get());
656     }
657 
658     cc = getCharClass(*p2);
659     p1 = p2;
660   }
661 }
662 
stripHTML(const nsAString & inString,nsAString & outString)663 nsresult Tokenizer::stripHTML(const nsAString& inString, nsAString& outString) {
664   uint32_t flags = nsIDocumentEncoder::OutputLFLineBreak |
665                    nsIDocumentEncoder::OutputNoScriptContent |
666                    nsIDocumentEncoder::OutputNoFramesContent |
667                    nsIDocumentEncoder::OutputBodyOnly;
668   nsCOMPtr<nsIParserUtils> utils = do_GetService(NS_PARSERUTILS_CONTRACTID);
669   return utils->ConvertToPlainText(inString, flags, 80, outString);
670 }
671 
672 // Copied from nsSemanticUnitScanner.cpp which was removed in bug 1368418.
ScannerNext(const char16_t * text,int32_t length,int32_t pos,bool isLastBuffer,int32_t * begin,int32_t * end,bool * _retval)673 nsresult Tokenizer::ScannerNext(const char16_t* text, int32_t length,
674                                 int32_t pos, bool isLastBuffer, int32_t* begin,
675                                 int32_t* end, bool* _retval) {
676   if (!mWordBreaker) {
677     mWordBreaker = mozilla::intl::WordBreaker::Create();
678   }
679 
680   // if we reach the end, just return
681   if (pos >= length) {
682     *begin = pos;
683     *end = pos;
684     *_retval = false;
685     return NS_OK;
686   }
687 
688   mozilla::intl::WordBreakClass char_class =
689       mozilla::intl::WordBreaker::GetClass(text[pos]);
690 
691   // If we are in Chinese mode, return one Han letter at a time.
692   // We should not do this if we are in Japanese or Korean mode.
693   if (mozilla::intl::kWbClassHanLetter == char_class) {
694     *begin = pos;
695     *end = pos + 1;
696     *_retval = true;
697     return NS_OK;
698   }
699 
700   int32_t next;
701   // Find the next "word".
702   next = mWordBreaker->NextWord(text, (uint32_t)length, (uint32_t)pos);
703 
704   // If we don't have enough text to make decision, return.
705   if (next == NS_WORDBREAKER_NEED_MORE_TEXT) {
706     *begin = pos;
707     *end = isLastBuffer ? length : pos;
708     *_retval = isLastBuffer;
709     return NS_OK;
710   }
711 
712   // If what we got is space or punct, look at the next break.
713   if (char_class == mozilla::intl::kWbClassSpace ||
714       char_class == mozilla::intl::kWbClassPunct) {
715     // If the next "word" is not letters,
716     // call itself recursively with the new pos.
717     return ScannerNext(text, length, next, isLastBuffer, begin, end, _retval);
718   }
719 
720   // For the rest, return.
721   *begin = pos;
722   *end = next;
723   *_retval = true;
724   return NS_OK;
725 }
726 
tokenize(const char * aText)727 void Tokenizer::tokenize(const char* aText) {
728   MOZ_LOG(BayesianFilterLogModule, LogLevel::Debug, ("tokenize: %s", aText));
729 
730   // strip out HTML tags before we begin processing
731   // uggh but first we have to blow up our string into UCS2
732   // since that's what the document encoder wants. UTF8/UCS2, I wish we all
733   // spoke the same language here..
734   nsString text = NS_ConvertUTF8toUTF16(aText);
735   nsString strippedUCS2;
736 
737   // RSS feeds store their summary information as an iframe. But due to
738   // bug 365953, we can't see those in the plaintext serializer. As a
739   // workaround, allow an option to replace iframe with div in the message
740   // text. We disable by default, since most people won't be applying bayes
741   // to RSS
742 
743   if (mIframeToDiv) {
744     text.ReplaceSubstring(u"<iframe"_ns, u"<div"_ns);
745     text.ReplaceSubstring(u"/iframe>"_ns, u"/div>"_ns);
746   }
747 
748   stripHTML(text, strippedUCS2);
749 
750   // convert 0x3000(full width space) into 0x0020
751   char16_t* substr_start = strippedUCS2.BeginWriting();
752   char16_t* substr_end = strippedUCS2.EndWriting();
753   while (substr_start != substr_end) {
754     if (*substr_start == 0x3000) *substr_start = 0x0020;
755     ++substr_start;
756   }
757 
758   nsCString strippedStr = NS_ConvertUTF16toUTF8(strippedUCS2);
759   char* strippedText = strippedStr.BeginWriting();
760   MOZ_LOG(BayesianFilterLogModule, LogLevel::Debug,
761           ("tokenize stripped html: %s", strippedText));
762 
763   char* word;
764   char* next = strippedText;
765   while ((word = NS_strtok(mBodyDelimiters.get(), &next)) != NULL) {
766     if (!*word) continue;
767     if (isDecimalNumber(word)) continue;
768     if (isASCII(word))
769       tokenize_ascii_word(word);
770     else if (isJapanese(word))
771       tokenize_japanese_word(word);
772     else {
773       nsresult rv;
774       // Convert this word from UTF-8 into UCS2.
775       NS_ConvertUTF8toUTF16 uword(word);
776       ToLowerCase(uword);
777       const char16_t* utext = uword.get();
778       int32_t len = uword.Length(), pos = 0, begin, end;
779       bool gotUnit;
780       while (pos < len) {
781         rv = ScannerNext(utext, len, pos, true, &begin, &end, &gotUnit);
782         if (NS_SUCCEEDED(rv) && gotUnit) {
783           NS_ConvertUTF16toUTF8 utfUnit(utext + begin, end - begin);
784           add(utfUnit.get());
785           // Advance to end of current unit.
786           pos = end;
787         } else {
788           break;
789         }
790       }
791     }
792   }
793 }
794 
795 // helper function to un-escape \n, \t, etc from a CString
UnescapeCString(nsCString & aCString)796 void Tokenizer::UnescapeCString(nsCString& aCString) {
797   nsAutoCString result;
798 
799   const char* readEnd = aCString.EndReading();
800   result.SetLength(aCString.Length());
801   char* writeStart = result.BeginWriting();
802   char* writeIter = writeStart;
803 
804   bool inEscape = false;
805   for (const char* readIter = aCString.BeginReading(); readIter != readEnd;
806        readIter++) {
807     if (!inEscape) {
808       if (*readIter == '\\')
809         inEscape = true;
810       else
811         *(writeIter++) = *readIter;
812     } else {
813       inEscape = false;
814       switch (*readIter) {
815         case '\\':
816           *(writeIter++) = '\\';
817           break;
818         case 't':
819           *(writeIter++) = '\t';
820           break;
821         case 'n':
822           *(writeIter++) = '\n';
823           break;
824         case 'v':
825           *(writeIter++) = '\v';
826           break;
827         case 'f':
828           *(writeIter++) = '\f';
829           break;
830         case 'r':
831           *(writeIter++) = '\r';
832           break;
833         default:
834           // all other escapes are ignored
835           break;
836       }
837     }
838   }
839   result.Truncate(writeIter - writeStart);
840   aCString.Assign(result);
841 }
842 
copyTokens()843 Token* Tokenizer::copyTokens() {
844   uint32_t count = countTokens();
845   if (count > 0) {
846     Token* tokens = new Token[count];
847     if (tokens) {
848       Token* tp = tokens;
849       TokenEnumeration e(&mTokenTable);
850       while (e.hasMoreTokens()) {
851         Token* src = static_cast<Token*>(e.nextToken());
852         tp->clone(*src);
853         ++tp;
854       }
855     }
856     return tokens;
857   }
858   return NULL;
859 }
860 
861 class TokenAnalyzer {
862  public:
~TokenAnalyzer()863   virtual ~TokenAnalyzer() {}
864 
865   virtual void analyzeTokens(Tokenizer& tokenizer) = 0;
setTokenListener(nsIStreamListener * aTokenListener)866   void setTokenListener(nsIStreamListener* aTokenListener) {
867     mTokenListener = aTokenListener;
868   }
869 
setSource(const nsACString & sourceURI)870   void setSource(const nsACString& sourceURI) { mTokenSource = sourceURI; }
871 
872   nsCOMPtr<nsIStreamListener> mTokenListener;
873   nsCString mTokenSource;
874 };
875 
876 /**
877  * This class downloads the raw content of an email message, buffering until
878  * complete segments are seen, that is until a linefeed is seen, although
879  * any of the valid token separators would do. This could be a further
880  * refinement.
881  */
882 class TokenStreamListener : public nsIStreamListener, nsIMsgHeaderSink {
883  public:
884   NS_DECL_ISUPPORTS
885   NS_DECL_NSIREQUESTOBSERVER
886   NS_DECL_NSISTREAMLISTENER
887   NS_DECL_NSIMSGHEADERSINK
888 
889   explicit TokenStreamListener(TokenAnalyzer* analyzer);
890 
891  protected:
892   virtual ~TokenStreamListener();
893   TokenAnalyzer* mAnalyzer;
894   char* mBuffer;
895   uint32_t mBufferSize;
896   uint32_t mLeftOverCount;
897   Tokenizer mTokenizer;
898   bool mSetAttachmentFlag;
899 };
900 
// Initial size, in bytes, of a TokenStreamListener's read buffer (16 KiB).
static constexpr uint32_t kBufferSize = 16 * 1024;
902 
TokenStreamListener(TokenAnalyzer * analyzer)903 TokenStreamListener::TokenStreamListener(TokenAnalyzer* analyzer)
904     : mAnalyzer(analyzer),
905       mBuffer(NULL),
906       mBufferSize(kBufferSize),
907       mLeftOverCount(0),
908       mSetAttachmentFlag(false) {}
909 
~TokenStreamListener()910 TokenStreamListener::~TokenStreamListener() {
911   delete[] mBuffer;
912   delete mAnalyzer;
913 }
914 
// XPCOM boilerplate: nsISupports implementation for TokenStreamListener,
// exposing the three interfaces listed below.
// clang-format off
NS_IMPL_ISUPPORTS(TokenStreamListener,
                  nsIRequestObserver,
                  nsIStreamListener,
                  nsIMsgHeaderSink)
// clang-format on
917 
918 NS_IMETHODIMP TokenStreamListener::ProcessHeaders(
919     nsIUTF8StringEnumerator* aHeaderNames,
920     nsIUTF8StringEnumerator* aHeaderValues, bool dontCollectAddress) {
921   mTokenizer.tokenizeHeaders(aHeaderNames, aHeaderValues);
922   return NS_OK;
923 }
924 
HandleAttachment(const char * contentType,const nsACString & url,const char16_t * displayName,const nsACString & uri,bool aIsExternalAttachment)925 NS_IMETHODIMP TokenStreamListener::HandleAttachment(
926     const char* contentType, const nsACString& url, const char16_t* displayName,
927     const nsACString& uri, bool aIsExternalAttachment) {
928   mTokenizer.tokenizeAttachment(contentType,
929                                 NS_ConvertUTF16toUTF8(displayName).get());
930   return NS_OK;
931 }
932 
AddAttachmentField(const char * field,const char * value)933 NS_IMETHODIMP TokenStreamListener::AddAttachmentField(const char* field,
934                                                       const char* value) {
935   return NS_OK;
936 }
937 
OnEndAllAttachments()938 NS_IMETHODIMP TokenStreamListener::OnEndAllAttachments() { return NS_OK; }
939 
OnEndMsgDownload(nsIMsgMailNewsUrl * url)940 NS_IMETHODIMP TokenStreamListener::OnEndMsgDownload(nsIMsgMailNewsUrl* url) {
941   return NS_OK;
942 }
943 
OnMsgHasRemoteContent(nsIMsgDBHdr * aMsgHdr,nsIURI * aContentURI,bool aCanOverride)944 NS_IMETHODIMP TokenStreamListener::OnMsgHasRemoteContent(nsIMsgDBHdr* aMsgHdr,
945                                                          nsIURI* aContentURI,
946                                                          bool aCanOverride) {
947   return NS_OK;
948 }
949 
OnEndMsgHeaders(nsIMsgMailNewsUrl * url)950 NS_IMETHODIMP TokenStreamListener::OnEndMsgHeaders(nsIMsgMailNewsUrl* url) {
951   return NS_OK;
952 }
953 
GetSecurityInfo(nsISupports ** aSecurityInfo)954 NS_IMETHODIMP TokenStreamListener::GetSecurityInfo(
955     nsISupports** aSecurityInfo) {
956   return NS_OK;
957 }
SetSecurityInfo(nsISupports * aSecurityInfo)958 NS_IMETHODIMP TokenStreamListener::SetSecurityInfo(nsISupports* aSecurityInfo) {
959   return NS_OK;
960 }
961 
GetDummyMsgHeader(nsIMsgDBHdr ** aMsgDBHdr)962 NS_IMETHODIMP TokenStreamListener::GetDummyMsgHeader(nsIMsgDBHdr** aMsgDBHdr) {
963   return NS_ERROR_NOT_IMPLEMENTED;
964 }
965 
ResetProperties()966 NS_IMETHODIMP TokenStreamListener::ResetProperties() { return NS_OK; }
967 
GetProperties(nsIWritablePropertyBag2 ** aProperties)968 NS_IMETHODIMP TokenStreamListener::GetProperties(
969     nsIWritablePropertyBag2** aProperties) {
970   return NS_ERROR_NOT_IMPLEMENTED;
971 }
972 
973 /* void onStartRequest (in nsIRequest aRequest); */
OnStartRequest(nsIRequest * aRequest)974 NS_IMETHODIMP TokenStreamListener::OnStartRequest(nsIRequest* aRequest) {
975   mLeftOverCount = 0;
976   if (!mBuffer) {
977     mBuffer = new char[mBufferSize];
978     NS_ENSURE_TRUE(mBuffer, NS_ERROR_OUT_OF_MEMORY);
979   }
980 
981   // get the url for the channel and set our nsIMsgHeaderSink on it so we get
982   // notified about the headers and attachments
983 
984   nsCOMPtr<nsIChannel> channel(do_QueryInterface(aRequest));
985   if (channel) {
986     nsCOMPtr<nsIURI> uri;
987     channel->GetURI(getter_AddRefs(uri));
988     nsCOMPtr<nsIMsgMailNewsUrl> mailUrl = do_QueryInterface(uri);
989     if (mailUrl)
990       mailUrl->SetMsgHeaderSink(static_cast<nsIMsgHeaderSink*>(this));
991   }
992 
993   return NS_OK;
994 }
995 
996 /* void onDataAvailable (in nsIRequest aRequest, in nsIInputStream aInputStream,
997  * in unsigned long long aOffset, in unsigned long aCount); */
OnDataAvailable(nsIRequest * aRequest,nsIInputStream * aInputStream,uint64_t aOffset,uint32_t aCount)998 NS_IMETHODIMP TokenStreamListener::OnDataAvailable(nsIRequest* aRequest,
999                                                    nsIInputStream* aInputStream,
1000                                                    uint64_t aOffset,
1001                                                    uint32_t aCount) {
1002   nsresult rv = NS_OK;
1003 
1004   while (aCount > 0) {
1005     uint32_t readCount, totalCount = (aCount + mLeftOverCount);
1006     if (totalCount >= mBufferSize) {
1007       readCount = mBufferSize - mLeftOverCount - 1;
1008     } else {
1009       readCount = aCount;
1010     }
1011 
1012     // mBuffer is supposed to be allocated in onStartRequest. But something
1013     // is causing that to not happen, so as a last-ditch attempt we'll
1014     // do it here.
1015     if (!mBuffer) {
1016       mBuffer = new char[mBufferSize];
1017       NS_ENSURE_TRUE(mBuffer, NS_ERROR_OUT_OF_MEMORY);
1018     }
1019 
1020     char* buffer = mBuffer;
1021     rv = aInputStream->Read(buffer + mLeftOverCount, readCount, &readCount);
1022     if (NS_FAILED(rv)) break;
1023 
1024     if (readCount == 0) {
1025       rv = NS_ERROR_UNEXPECTED;
1026       NS_WARNING("failed to tokenize");
1027       break;
1028     }
1029 
1030     aCount -= readCount;
1031 
1032     /* consume the tokens up to the last legal token delimiter in the buffer. */
1033     totalCount = (readCount + mLeftOverCount);
1034     buffer[totalCount] = '\0';
1035     char* lastDelimiter = NULL;
1036     char* scan = buffer + totalCount;
1037     while (scan > buffer) {
1038       if (strchr(mTokenizer.mBodyDelimiters.get(), *--scan)) {
1039         lastDelimiter = scan;
1040         break;
1041       }
1042     }
1043 
1044     if (lastDelimiter) {
1045       *lastDelimiter = '\0';
1046       mTokenizer.tokenize(buffer);
1047 
1048       uint32_t consumedCount = 1 + (lastDelimiter - buffer);
1049       mLeftOverCount = totalCount - consumedCount;
1050       if (mLeftOverCount)
1051         memmove(buffer, buffer + consumedCount, mLeftOverCount);
1052     } else {
1053       /* didn't find a delimiter, keep the whole buffer around. */
1054       mLeftOverCount = totalCount;
1055       if (totalCount >= (mBufferSize / 2)) {
1056         uint32_t newBufferSize = mBufferSize * 2;
1057         char* newBuffer = new char[newBufferSize];
1058         NS_ENSURE_TRUE(newBuffer, NS_ERROR_OUT_OF_MEMORY);
1059         memcpy(newBuffer, mBuffer, mLeftOverCount);
1060         delete[] mBuffer;
1061         mBuffer = newBuffer;
1062         mBufferSize = newBufferSize;
1063       }
1064     }
1065   }
1066 
1067   return rv;
1068 }
1069 
1070 /* void onStopRequest (in nsIRequest aRequest, in nsresult aStatusCode); */
OnStopRequest(nsIRequest * aRequest,nsresult aStatusCode)1071 NS_IMETHODIMP TokenStreamListener::OnStopRequest(nsIRequest* aRequest,
1072                                                  nsresult aStatusCode) {
1073   if (mLeftOverCount) {
1074     /* assume final buffer is complete. */
1075     mBuffer[mLeftOverCount] = '\0';
1076     mTokenizer.tokenize(mBuffer);
1077   }
1078 
1079   /* finally, analyze the tokenized message. */
1080   MOZ_LOG(BayesianFilterLogModule, LogLevel::Debug,
1081           ("analyze the tokenized message"));
1082   if (mAnalyzer) mAnalyzer->analyzeTokens(mTokenizer);
1083 
1084   return NS_OK;
1085 }
1086 
1087 /* Implementation file */
1088 
// XPCOM boilerplate: nsISupports implementation for nsBayesianFilter,
// exposing the interfaces listed below.
// clang-format off
NS_IMPL_ISUPPORTS(nsBayesianFilter,
                  nsIMsgFilterPlugin,
                  nsIJunkMailPlugin,
                  nsIMsgCorpus,
                  nsISupportsWeakReference,
                  nsIObserver)
// clang-format on
1091 
1092 nsBayesianFilter::nsBayesianFilter() : mTrainingDataDirty(false) {
1093   int32_t junkThreshold = 0;
1094   nsresult rv;
1095   nsCOMPtr<nsIPrefBranch> pPrefBranch(
1096       do_GetService(NS_PREFSERVICE_CONTRACTID, &rv));
1097   if (pPrefBranch)
1098     pPrefBranch->GetIntPref("mail.adaptivefilters.junk_threshold",
1099                             &junkThreshold);
1100 
1101   mJunkProbabilityThreshold = (static_cast<double>(junkThreshold)) / 100.0;
1102   if (mJunkProbabilityThreshold == 0 || mJunkProbabilityThreshold >= 1)
1103     mJunkProbabilityThreshold = kDefaultJunkThreshold;
1104 
1105   MOZ_LOG(BayesianFilterLogModule, LogLevel::Warning,
1106           ("junk probability threshold: %f", mJunkProbabilityThreshold));
1107 
1108   mCorpus.readTrainingData();
1109 
1110   // get parameters for training data flushing, from the prefs
1111 
1112   nsCOMPtr<nsIPrefBranch> prefBranch;
1113 
1114   nsCOMPtr<nsIPrefService> prefs =
1115       do_GetService(NS_PREFSERVICE_CONTRACTID, &rv);
1116   NS_ASSERTION(NS_SUCCEEDED(rv), "failed accessing preferences service");
1117   rv = prefs->GetBranch(nullptr, getter_AddRefs(prefBranch));
1118   NS_ASSERTION(NS_SUCCEEDED(rv), "failed getting preferences branch");
1119 
1120   rv = prefBranch->GetIntPref(
1121       "mailnews.bayesian_spam_filter.flush.minimum_interval",
1122       &mMinFlushInterval);
1123   // it is not a good idea to allow a minimum interval of under 1 second
1124   if (NS_FAILED(rv) || (mMinFlushInterval <= 1000))
1125     mMinFlushInterval = DEFAULT_MIN_INTERVAL_BETWEEN_WRITES;
1126 
1127   rv = prefBranch->GetIntPref("mailnews.bayesian_spam_filter.junk_maxtokens",
1128                               &mMaximumTokenCount);
1129   if (NS_FAILED(rv))
1130     mMaximumTokenCount = 0;  // which means do not limit token counts
1131   MOZ_LOG(BayesianFilterLogModule, LogLevel::Warning,
1132           ("maximum junk tokens: %d", mMaximumTokenCount));
1133 
1134   mTimer = do_CreateInstance(NS_TIMER_CONTRACTID, &rv);
1135   NS_ASSERTION(
1136       NS_SUCCEEDED(rv),
1137       "unable to create a timer; training data will only be written on exit");
1138 
1139   // the timer is not used on object construction, since for
1140   // the time being there are no dirying messages
1141 
1142   // give a default capacity to the memory structure used to store
1143   // per-message/per-trait token data
1144   mAnalysisStore.SetCapacity(kAnalysisStoreCapacity);
1145 
1146   // dummy 0th element. Index 0 means "end of list" so we need to
1147   // start from 1
1148   AnalysisPerToken analysisPT(0, 0.0, 0.0);
1149   mAnalysisStore.AppendElement(analysisPT);
1150   mNextAnalysisIndex = 1;
1151 }
1152 
Init()1153 nsresult nsBayesianFilter::Init() {
1154   nsCOMPtr<nsIObserverService> observerService =
1155       mozilla::services::GetObserverService();
1156   if (observerService)
1157     observerService->AddObserver(this, "profile-before-change", true);
1158   return NS_OK;
1159 }
1160 
TimerCallback(nsITimer * aTimer,void * aClosure)1161 void nsBayesianFilter::TimerCallback(nsITimer* aTimer, void* aClosure) {
1162   // we will flush the training data to disk after enough time has passed
1163   // since the first time a message has been classified after the last flush
1164 
1165   nsBayesianFilter* filter = static_cast<nsBayesianFilter*>(aClosure);
1166   filter->mCorpus.writeTrainingData(filter->mMaximumTokenCount);
1167   filter->mTrainingDataDirty = false;
1168 }
1169 
~nsBayesianFilter()1170 nsBayesianFilter::~nsBayesianFilter() {
1171   if (mTimer) {
1172     mTimer->Cancel();
1173     mTimer = nullptr;
1174   }
1175   // call shutdown when we are going away in case we need
1176   // to flush the training set to disk
1177   Shutdown();
1178 }
1179 
1180 // this object is used for one call to classifyMessage or classifyMessages().
1181 // So if we're classifying multiple messages, this object will be used for each
1182 // message. It's going to hold a reference to itself, basically, to stay in
1183 // memory.
1184 class MessageClassifier : public TokenAnalyzer {
1185  public:
1186   // full classifier with arbitrary traits
MessageClassifier(nsBayesianFilter * aFilter,nsIJunkMailClassificationListener * aJunkListener,nsIMsgTraitClassificationListener * aTraitListener,nsIMsgTraitDetailListener * aDetailListener,const nsTArray<uint32_t> & aProTraits,const nsTArray<uint32_t> & aAntiTraits,nsIMsgWindow * aMsgWindow,const nsTArray<nsCString> & aMessageURIs)1187   MessageClassifier(nsBayesianFilter* aFilter,
1188                     nsIJunkMailClassificationListener* aJunkListener,
1189                     nsIMsgTraitClassificationListener* aTraitListener,
1190                     nsIMsgTraitDetailListener* aDetailListener,
1191                     const nsTArray<uint32_t>& aProTraits,
1192                     const nsTArray<uint32_t>& aAntiTraits,
1193                     nsIMsgWindow* aMsgWindow,
1194                     const nsTArray<nsCString>& aMessageURIs)
1195       : mFilter(aFilter),
1196         mJunkMailPlugin(aFilter),
1197         mJunkListener(aJunkListener),
1198         mTraitListener(aTraitListener),
1199         mDetailListener(aDetailListener),
1200         mProTraits(aProTraits.Clone()),
1201         mAntiTraits(aAntiTraits.Clone()),
1202         mMsgWindow(aMsgWindow),
1203         mMessageURIs(aMessageURIs.Clone()),
1204         mCurMessageToClassify(0) {
1205     MOZ_ASSERT(aProTraits.Length() == aAntiTraits.Length());
1206   }
1207 
1208   // junk-only classifier
MessageClassifier(nsBayesianFilter * aFilter,nsIJunkMailClassificationListener * aJunkListener,nsIMsgWindow * aMsgWindow,const nsTArray<nsCString> & aMessageURIs)1209   MessageClassifier(nsBayesianFilter* aFilter,
1210                     nsIJunkMailClassificationListener* aJunkListener,
1211                     nsIMsgWindow* aMsgWindow,
1212                     const nsTArray<nsCString>& aMessageURIs)
1213       : mFilter(aFilter),
1214         mJunkMailPlugin(aFilter),
1215         mJunkListener(aJunkListener),
1216         mTraitListener(nullptr),
1217         mDetailListener(nullptr),
1218         mMsgWindow(aMsgWindow),
1219         mMessageURIs(aMessageURIs.Clone()),
1220         mCurMessageToClassify(0) {
1221     mProTraits.AppendElement(kJunkTrait);
1222     mAntiTraits.AppendElement(kGoodTrait);
1223   }
1224 
~MessageClassifier()1225   virtual ~MessageClassifier() {}
analyzeTokens(Tokenizer & tokenizer)1226   virtual void analyzeTokens(Tokenizer& tokenizer) {
1227     mFilter->classifyMessage(tokenizer, mTokenSource, mProTraits, mAntiTraits,
1228                              mJunkListener, mTraitListener, mDetailListener);
1229     tokenizer.clearTokens();
1230     classifyNextMessage();
1231   }
1232 
classifyNextMessage()1233   virtual void classifyNextMessage() {
1234     if (++mCurMessageToClassify < mMessageURIs.Length()) {
1235       MOZ_LOG(BayesianFilterLogModule, LogLevel::Warning,
1236               ("classifyNextMessage(%s)",
1237                mMessageURIs[mCurMessageToClassify].get()));
1238       mFilter->tokenizeMessage(mMessageURIs[mCurMessageToClassify], mMsgWindow,
1239                                this);
1240     } else {
1241       // call all listeners with null parameters to signify end of batch
1242       if (mJunkListener)
1243         mJunkListener->OnMessageClassified(EmptyCString(),
1244                                            nsIJunkMailPlugin::UNCLASSIFIED, 0);
1245       if (mTraitListener) {
1246         nsTArray<uint32_t> nullTraits;
1247         nsTArray<uint32_t> nullPercents;
1248         mTraitListener->OnMessageTraitsClassified(EmptyCString(), nullTraits,
1249                                                   nullPercents);
1250       }
1251       mTokenListener =
1252           nullptr;  // this breaks the circular ref that keeps this object alive
1253                     // so we will be destroyed as a result.
1254     }
1255   }
1256 
1257  private:
1258   nsBayesianFilter* mFilter;
1259   nsCOMPtr<nsIJunkMailPlugin> mJunkMailPlugin;
1260   nsCOMPtr<nsIJunkMailClassificationListener> mJunkListener;
1261   nsCOMPtr<nsIMsgTraitClassificationListener> mTraitListener;
1262   nsCOMPtr<nsIMsgTraitDetailListener> mDetailListener;
1263   nsTArray<uint32_t> mProTraits;
1264   nsTArray<uint32_t> mAntiTraits;
1265   nsCOMPtr<nsIMsgWindow> mMsgWindow;
1266   nsTArray<nsCString> mMessageURIs;
1267   uint32_t mCurMessageToClassify;  // 0-based index
1268 };
1269 
tokenizeMessage(const nsACString & aMessageURI,nsIMsgWindow * aMsgWindow,TokenAnalyzer * aAnalyzer)1270 nsresult nsBayesianFilter::tokenizeMessage(const nsACString& aMessageURI,
1271                                            nsIMsgWindow* aMsgWindow,
1272                                            TokenAnalyzer* aAnalyzer) {
1273   nsCOMPtr<nsIMsgMessageService> msgService;
1274   nsresult rv =
1275       GetMessageServiceFromURI(aMessageURI, getter_AddRefs(msgService));
1276   NS_ENSURE_SUCCESS(rv, rv);
1277 
1278   aAnalyzer->setSource(aMessageURI);
1279   nsCOMPtr<nsIURI> dummyNull;
1280   return msgService->StreamMessage(
1281       aMessageURI, aAnalyzer->mTokenListener, aMsgWindow, nullptr,
1282       true /* convert data */, "filter"_ns, false, getter_AddRefs(dummyNull));
1283 }
1284 
// A TraitAnalysis is the per-token record of the statistical calculations.
// It exists mainly to group the values so that they can be sorted by
// mDistance. It is a plain aggregate; keep the member order, since it is
// brace-initialized positionally.
struct TraitAnalysis {
  // Index of the token in the copied token array.
  uint32_t mTokenIndex;
  // Sort key: the distance stored for this token.
  double mDistance;
  // Probability stored for this token.
  double mProbability;
};
1293 
1294 // comparator required to sort an nsTArray
1295 class compareTraitAnalysis {
1296  public:
Equals(const TraitAnalysis & a,const TraitAnalysis & b) const1297   bool Equals(const TraitAnalysis& a, const TraitAnalysis& b) const {
1298     return a.mDistance == b.mDistance;
1299   }
LessThan(const TraitAnalysis & a,const TraitAnalysis & b) const1300   bool LessThan(const TraitAnalysis& a, const TraitAnalysis& b) const {
1301     return a.mDistance < b.mDistance;
1302   }
1303 };
1304 
// Larger of two doubles. Returns y unless x compares strictly greater, so y
// is also the result when the comparison is false because of a NaN.
inline double dmax(double x, double y) {
  if (x > y) {
    return x;
  }
  return y;
}
// Smaller of two doubles. Returns y unless x compares strictly less, so y is
// also the result when the comparison is false because of a NaN.
inline double dmin(double x, double y) {
  if (x < y) {
    return x;
  }
  return y;
}
1307 
1308 // Chi square functions are implemented by an incomplete gamma function.
1309 // Note that chi2P's callers multiply the arguments by 2 but chi2P
1310 // divides them by 2 again. Inlining chi2P gives the compiler a
1311 // chance to notice this.
1312 
1313 // Both chi2P and nsIncompleteGammaP set *error negative on domain
1314 // errors and nsIncompleteGammaP sets it posivive on internal errors.
1315 // This may be useful but the chi2P callers treat any error as fatal.
1316 
1317 // Note that converting unsigned ints to floating point can be slow on
1318 // some platforms (like Intel) so use signed quantities for the numeric
1319 // routines.
chi2P(double chi2,double nu,int32_t * error)1320 static inline double chi2P(double chi2, double nu, int32_t* error) {
1321   // domain checks; set error and return a dummy value
1322   if (chi2 < 0.0 || nu <= 0.0) {
1323     *error = -1;
1324     return 0.0;
1325   }
1326   // reversing the arguments is intentional
1327   return nsIncompleteGammaP(nu / 2.0, chi2 / 2.0, error);
1328 }
1329 
classifyMessage(Tokenizer & tokenizer,const nsACString & messageURI,nsTArray<uint32_t> & aProTraits,nsTArray<uint32_t> & aAntiTraits,nsIJunkMailClassificationListener * listener,nsIMsgTraitClassificationListener * aTraitListener,nsIMsgTraitDetailListener * aDetailListener)1330 void nsBayesianFilter::classifyMessage(
1331     Tokenizer& tokenizer, const nsACString& messageURI,
1332     nsTArray<uint32_t>& aProTraits, nsTArray<uint32_t>& aAntiTraits,
1333     nsIJunkMailClassificationListener* listener,
1334     nsIMsgTraitClassificationListener* aTraitListener,
1335     nsIMsgTraitDetailListener* aDetailListener) {
1336   Token* tokens = tokenizer.copyTokens();
1337   uint32_t tokenCount;
1338   if (!tokens) {
1339     // This can happen with problems with UTF conversion
1340     NS_ERROR("Trying to classify a null or invalid message");
1341     tokenCount = 0;
1342     // don't return so that we still call the listeners
1343   } else {
1344     tokenCount = tokenizer.countTokens();
1345   }
1346 
1347   if (aProTraits.Length() != aAntiTraits.Length()) {
1348     NS_ERROR("Each Pro trait needs a matching Anti trait");
1349     return;
1350   }
1351 
1352   /* this part is similar to the Graham algorithm with some adjustments. */
1353   uint32_t traitCount = aProTraits.Length();
1354 
1355   // pro message counts per trait index
1356   AutoTArray<uint32_t, kTraitAutoCapacity> numProMessages;
1357   // anti message counts per trait index
1358   AutoTArray<uint32_t, kTraitAutoCapacity> numAntiMessages;
1359   // array of pro aliases per trait index
1360   AutoTArray<nsTArray<uint32_t>, kTraitAutoCapacity> proAliasArrays;
1361   // array of anti aliases per trait index
1362   AutoTArray<nsTArray<uint32_t>, kTraitAutoCapacity> antiAliasArrays;
1363   // construct the outgoing listener arrays
1364   AutoTArray<uint32_t, kTraitAutoCapacity> traits;
1365   AutoTArray<uint32_t, kTraitAutoCapacity> percents;
1366   if (traitCount > kTraitAutoCapacity) {
1367     traits.SetCapacity(traitCount);
1368     percents.SetCapacity(traitCount);
1369     numProMessages.SetCapacity(traitCount);
1370     numAntiMessages.SetCapacity(traitCount);
1371     proAliasArrays.SetCapacity(traitCount);
1372     antiAliasArrays.SetCapacity(traitCount);
1373   }
1374 
1375   nsresult rv;
1376   nsCOMPtr<nsIMsgTraitService> traitService(
1377       do_GetService("@mozilla.org/msg-trait-service;1", &rv));
1378   if (NS_FAILED(rv)) {
1379     NS_ERROR("Failed to get trait service");
1380     MOZ_LOG(BayesianFilterLogModule, LogLevel::Error,
1381             ("Failed to get trait service"));
1382   }
1383 
1384   // get aliases and message counts for the pro and anti traits
1385   for (uint32_t traitIndex = 0; traitIndex < traitCount; traitIndex++) {
1386     nsresult rv;
1387 
1388     // pro trait
1389     nsTArray<uint32_t> proAliases;
1390     uint32_t proTrait = aProTraits[traitIndex];
1391     if (traitService) {
1392       rv = traitService->GetAliases(proTrait, proAliases);
1393       if (NS_FAILED(rv)) {
1394         NS_ERROR("trait service failed to get aliases");
1395         MOZ_LOG(BayesianFilterLogModule, LogLevel::Error,
1396                 ("trait service failed to get aliases"));
1397       }
1398     }
1399     proAliasArrays.AppendElement(proAliases.Clone());
1400     uint32_t proMessageCount = mCorpus.getMessageCount(proTrait);
1401     for (uint32_t aliasIndex = 0; aliasIndex < proAliases.Length();
1402          aliasIndex++)
1403       proMessageCount += mCorpus.getMessageCount(proAliases[aliasIndex]);
1404     numProMessages.AppendElement(proMessageCount);
1405 
1406     // anti trait
1407     nsTArray<uint32_t> antiAliases;
1408     uint32_t antiTrait = aAntiTraits[traitIndex];
1409     if (traitService) {
1410       rv = traitService->GetAliases(antiTrait, antiAliases);
1411       if (NS_FAILED(rv)) {
1412         NS_ERROR("trait service failed to get aliases");
1413         MOZ_LOG(BayesianFilterLogModule, LogLevel::Error,
1414                 ("trait service failed to get aliases"));
1415       }
1416     }
1417     antiAliasArrays.AppendElement(antiAliases.Clone());
1418     uint32_t antiMessageCount = mCorpus.getMessageCount(antiTrait);
1419     for (uint32_t aliasIndex = 0; aliasIndex < antiAliases.Length();
1420          aliasIndex++)
1421       antiMessageCount += mCorpus.getMessageCount(antiAliases[aliasIndex]);
1422     numAntiMessages.AppendElement(antiMessageCount);
1423   }
1424 
1425   for (uint32_t i = 0; i < tokenCount; ++i) {
1426     Token& token = tokens[i];
1427     CorpusToken* t = mCorpus.get(token.mWord);
1428     if (!t) continue;
1429     for (uint32_t traitIndex = 0; traitIndex < traitCount; traitIndex++) {
1430       uint32_t iProCount = mCorpus.getTraitCount(t, aProTraits[traitIndex]);
1431       // add in any counts for aliases to proTrait
1432       for (uint32_t aliasIndex = 0;
1433            aliasIndex < proAliasArrays[traitIndex].Length(); aliasIndex++)
1434         iProCount +=
1435             mCorpus.getTraitCount(t, proAliasArrays[traitIndex][aliasIndex]);
1436       double proCount = static_cast<double>(iProCount);
1437 
1438       uint32_t iAntiCount = mCorpus.getTraitCount(t, aAntiTraits[traitIndex]);
1439       // add in any counts for aliases to antiTrait
1440       for (uint32_t aliasIndex = 0;
1441            aliasIndex < antiAliasArrays[traitIndex].Length(); aliasIndex++)
1442         iAntiCount +=
1443             mCorpus.getTraitCount(t, antiAliasArrays[traitIndex][aliasIndex]);
1444       double antiCount = static_cast<double>(iAntiCount);
1445 
1446       double prob, denom;
1447       // Prevent a divide by zero error by setting defaults for prob
1448 
1449       // If there are no matching tokens at all, ignore.
1450       if (antiCount == 0.0 && proCount == 0.0) continue;
1451       // if only anti match, set probability to 0%
1452       if (proCount == 0.0) prob = 0.0;
1453       // if only pro match, set probability to 100%
1454       else if (antiCount == 0.0)
1455         prob = 1.0;
1456       // not really needed, but just to be sure check the denom as well
1457       else if ((denom = proCount * numAntiMessages[traitIndex] +
1458                         antiCount * numProMessages[traitIndex]) == 0.0)
1459         continue;
1460       else
1461         prob = (proCount * numAntiMessages[traitIndex]) / denom;
1462 
1463       double n = proCount + antiCount;
1464       prob = (0.225 + n * prob) / (.45 + n);
1465       double distance = std::abs(prob - 0.5);
1466       if (distance >= .1) {
1467         mozilla::DebugOnly<nsresult> rv =
1468             setAnalysis(token, traitIndex, distance, prob);
1469         NS_ASSERTION(NS_SUCCEEDED(rv), "Problem in setAnalysis");
1470       }
1471     }
1472   }
1473 
1474   for (uint32_t traitIndex = 0; traitIndex < traitCount; traitIndex++) {
1475     AutoTArray<TraitAnalysis, 1024> traitAnalyses;
1476     // copy valid tokens into an array to sort
1477     for (uint32_t tokenIndex = 0; tokenIndex < tokenCount; tokenIndex++) {
1478       uint32_t storeIndex = getAnalysisIndex(tokens[tokenIndex], traitIndex);
1479       if (storeIndex) {
1480         TraitAnalysis ta = {tokenIndex, mAnalysisStore[storeIndex].mDistance,
1481                             mAnalysisStore[storeIndex].mProbability};
1482         traitAnalyses.AppendElement(ta);
1483       }
1484     }
1485 
1486     // sort the array by the distances
1487     traitAnalyses.Sort(compareTraitAnalysis());
1488     uint32_t count = traitAnalyses.Length();
1489     uint32_t first, last = count;
1490     const uint32_t kMaxTokens = 150;
1491     first = (count > kMaxTokens) ? count - kMaxTokens : 0;
1492 
1493     // Setup the arrays to save details if needed
1494     nsTArray<double> sArray;
1495     nsTArray<double> hArray;
1496     uint32_t usedTokenCount = (count > kMaxTokens) ? kMaxTokens : count;
1497     if (aDetailListener) {
1498       sArray.SetCapacity(usedTokenCount);
1499       hArray.SetCapacity(usedTokenCount);
1500     }
1501 
1502     double H = 1.0, S = 1.0;
1503     int32_t Hexp = 0, Sexp = 0;
1504     uint32_t goodclues = 0;
1505     int e;
1506 
1507     // index from end to analyze most significant first
1508     for (uint32_t ip1 = last; ip1 != first; --ip1) {
1509       TraitAnalysis& ta = traitAnalyses[ip1 - 1];
1510       if (ta.mDistance > 0.0) {
1511         goodclues++;
1512         double value = ta.mProbability;
1513         S *= (1.0 - value);
1514         H *= value;
1515         if (S < 1e-200) {
1516           S = frexp(S, &e);
1517           Sexp += e;
1518         }
1519         if (H < 1e-200) {
1520           H = frexp(H, &e);
1521           Hexp += e;
1522         }
1523         MOZ_LOG(BayesianFilterLogModule, LogLevel::Warning,
1524                 ("token probability (%s) is %f", tokens[ta.mTokenIndex].mWord,
1525                  ta.mProbability));
1526       }
1527       if (aDetailListener) {
1528         sArray.AppendElement(log(S) + Sexp * M_LN2);
1529         hArray.AppendElement(log(H) + Hexp * M_LN2);
1530       }
1531     }
1532 
1533     S = log(S) + Sexp * M_LN2;
1534     H = log(H) + Hexp * M_LN2;
1535 
1536     double prob;
1537     if (goodclues > 0) {
1538       int32_t chi_error;
1539       S = chi2P(-2.0 * S, 2.0 * goodclues, &chi_error);
1540       if (!chi_error) H = chi2P(-2.0 * H, 2.0 * goodclues, &chi_error);
1541       // if any error toss the entire calculation
1542       if (!chi_error)
1543         prob = (S - H + 1.0) / 2.0;
1544       else
1545         prob = 0.5;
1546     } else
1547       prob = 0.5;
1548 
1549     if (aDetailListener) {
1550       // Prepare output arrays
1551       nsTArray<uint32_t> tokenPercents(usedTokenCount);
1552       nsTArray<uint32_t> runningPercents(usedTokenCount);
1553       nsTArray<nsString> tokenStrings(usedTokenCount);
1554 
1555       double clueCount = 1.0;
1556       for (uint32_t tokenIndex = 0; tokenIndex < usedTokenCount; tokenIndex++) {
1557         TraitAnalysis& ta = traitAnalyses[last - 1 - tokenIndex];
1558         int32_t chi_error;
1559         S = chi2P(-2.0 * sArray[tokenIndex], 2.0 * clueCount, &chi_error);
1560         if (!chi_error)
1561           H = chi2P(-2.0 * hArray[tokenIndex], 2.0 * clueCount, &chi_error);
1562         clueCount += 1.0;
1563         double runningProb;
1564         if (!chi_error)
1565           runningProb = (S - H + 1.0) / 2.0;
1566         else
1567           runningProb = 0.5;
1568         runningPercents.AppendElement(
1569             static_cast<uint32_t>(runningProb * 100. + .5));
1570         tokenPercents.AppendElement(
1571             static_cast<uint32_t>(ta.mProbability * 100. + .5));
1572         tokenStrings.AppendElement(
1573             NS_ConvertUTF8toUTF16(tokens[ta.mTokenIndex].mWord));
1574       }
1575 
1576       aDetailListener->OnMessageTraitDetails(messageURI, aProTraits[traitIndex],
1577                                              tokenStrings, tokenPercents,
1578                                              runningPercents);
1579     }
1580 
1581     uint32_t proPercent = static_cast<uint32_t>(prob * 100. + .5);
1582 
1583     // directly classify junk to maintain backwards compatibility
1584     if (aProTraits[traitIndex] == kJunkTrait) {
1585       bool isJunk = (prob >= mJunkProbabilityThreshold);
1586       MOZ_LOG(BayesianFilterLogModule, LogLevel::Info,
1587               ("%s is junk probability = (%f)  HAM SCORE:%f SPAM SCORE:%f",
1588                PromiseFlatCString(messageURI).get(), prob, H, S));
1589 
1590       // the algorithm in "A Plan For Spam" assumes that you have a large good
1591       // corpus and a large junk corpus.
1592       // that won't be the case with users who first use the junk mail trait
1593       // so, we do certain things to encourage them to train.
1594       //
1595       // if there are no good tokens, assume the message is junk
1596       // this will "encourage" the user to train
1597       // and if there are no bad tokens, assume the message is not junk
1598       // this will also "encourage" the user to train
1599       // see bug #194238
1600 
1601       if (listener && !mCorpus.getMessageCount(kGoodTrait))
1602         isJunk = true;
1603       else if (listener && !mCorpus.getMessageCount(kJunkTrait))
1604         isJunk = false;
1605 
1606       if (listener)
1607         listener->OnMessageClassified(
1608             messageURI,
1609             isJunk ? nsMsgJunkStatus(nsIJunkMailPlugin::JUNK)
1610                    : nsMsgJunkStatus(nsIJunkMailPlugin::GOOD),
1611             proPercent);
1612     }
1613 
1614     if (aTraitListener) {
1615       traits.AppendElement(aProTraits[traitIndex]);
1616       percents.AppendElement(proPercent);
1617     }
1618   }
1619 
1620   if (aTraitListener)
1621     aTraitListener->OnMessageTraitsClassified(messageURI, traits, percents);
1622 
1623   delete[] tokens;
1624   // reuse mAnalysisStore without clearing memory
1625   mNextAnalysisIndex = 1;
1626   // but shrink it back to the default size
1627   if (mAnalysisStore.Length() > kAnalysisStoreCapacity)
1628     mAnalysisStore.RemoveElementsAt(
1629         kAnalysisStoreCapacity,
1630         mAnalysisStore.Length() - kAnalysisStoreCapacity);
1631   mAnalysisStore.Compact();
1632 }
1633 
classifyMessage(Tokenizer & tokens,const nsACString & messageURI,nsIJunkMailClassificationListener * aJunkListener)1634 void nsBayesianFilter::classifyMessage(
1635     Tokenizer& tokens, const nsACString& messageURI,
1636     nsIJunkMailClassificationListener* aJunkListener) {
1637   AutoTArray<uint32_t, 1> proTraits;
1638   AutoTArray<uint32_t, 1> antiTraits;
1639   proTraits.AppendElement(kJunkTrait);
1640   antiTraits.AppendElement(kGoodTrait);
1641   classifyMessage(tokens, messageURI, proTraits, antiTraits, aJunkListener,
1642                   nullptr, nullptr);
1643 }
1644 
1645 NS_IMETHODIMP
Observe(nsISupports * aSubject,const char * aTopic,const char16_t * someData)1646 nsBayesianFilter::Observe(nsISupports* aSubject, const char* aTopic,
1647                           const char16_t* someData) {
1648   if (!strcmp(aTopic, "profile-before-change")) Shutdown();
1649   return NS_OK;
1650 }
1651 
1652 /* void shutdown (); */
Shutdown()1653 NS_IMETHODIMP nsBayesianFilter::Shutdown() {
1654   if (mTrainingDataDirty) mCorpus.writeTrainingData(mMaximumTokenCount);
1655   mTrainingDataDirty = false;
1656 
1657   return NS_OK;
1658 }
1659 
/* readonly attribute boolean shouldDownloadAllHeaders; */
// Attribute getter: unconditionally stores false in the out-parameter and
// returns NS_OK. The out-parameter is written without a null check.
NS_IMETHODIMP nsBayesianFilter::GetShouldDownloadAllHeaders(
    bool* aShouldDownloadAllHeaders) {
  // bayesian filters work on the whole msg body currently.
  *aShouldDownloadAllHeaders = false;
  return NS_OK;
}
1667 
1668 /* void classifyMessage (in string aMsgURL, in nsIJunkMailClassificationListener
1669  * aListener); */
ClassifyMessage(const nsACString & aMessageURL,nsIMsgWindow * aMsgWindow,nsIJunkMailClassificationListener * aListener)1670 NS_IMETHODIMP nsBayesianFilter::ClassifyMessage(
1671     const nsACString& aMessageURL, nsIMsgWindow* aMsgWindow,
1672     nsIJunkMailClassificationListener* aListener) {
1673   AutoTArray<nsCString, 1> urls = {PromiseFlatCString(aMessageURL)};
1674   MessageClassifier* analyzer =
1675       new MessageClassifier(this, aListener, aMsgWindow, urls);
1676   NS_ENSURE_TRUE(analyzer, NS_ERROR_OUT_OF_MEMORY);
1677   TokenStreamListener* tokenListener = new TokenStreamListener(analyzer);
1678   NS_ENSURE_TRUE(tokenListener, NS_ERROR_OUT_OF_MEMORY);
1679   analyzer->setTokenListener(tokenListener);
1680   return tokenizeMessage(aMessageURL, aMsgWindow, analyzer);
1681 }
1682 
1683 /* void classifyMessages(in Array<ACString> aMsgURIs,
1684  *                       in nsIMsgWindow aMsgWindow,
1685  *                       in nsIJunkMailClassificationListener aListener); */
ClassifyMessages(const nsTArray<nsCString> & aMsgURLs,nsIMsgWindow * aMsgWindow,nsIJunkMailClassificationListener * aListener)1686 NS_IMETHODIMP nsBayesianFilter::ClassifyMessages(
1687     const nsTArray<nsCString>& aMsgURLs, nsIMsgWindow* aMsgWindow,
1688     nsIJunkMailClassificationListener* aListener) {
1689   TokenAnalyzer* analyzer =
1690       new MessageClassifier(this, aListener, aMsgWindow, aMsgURLs);
1691   NS_ENSURE_TRUE(analyzer, NS_ERROR_OUT_OF_MEMORY);
1692   TokenStreamListener* tokenListener = new TokenStreamListener(analyzer);
1693   NS_ENSURE_TRUE(tokenListener, NS_ERROR_OUT_OF_MEMORY);
1694   analyzer->setTokenListener(tokenListener);
1695   return tokenizeMessage(aMsgURLs[0], aMsgWindow, analyzer);
1696 }
1697 
setAnalysis(Token & token,uint32_t aTraitIndex,double aDistance,double aProbability)1698 nsresult nsBayesianFilter::setAnalysis(Token& token, uint32_t aTraitIndex,
1699                                        double aDistance, double aProbability) {
1700   uint32_t nextLink = token.mAnalysisLink;
1701   uint32_t lastLink = 0;
1702   uint32_t linkCount = 0, maxLinks = 100;
1703 
1704   // try to find an existing element. Limit the search to maxLinks
1705   // as a precaution
1706   for (linkCount = 0; nextLink && linkCount < maxLinks; linkCount++) {
1707     AnalysisPerToken& rAnalysis = mAnalysisStore[nextLink];
1708     if (rAnalysis.mTraitIndex == aTraitIndex) {
1709       rAnalysis.mDistance = aDistance;
1710       rAnalysis.mProbability = aProbability;
1711       return NS_OK;
1712     }
1713     lastLink = nextLink;
1714     nextLink = rAnalysis.mNextLink;
1715   }
1716   if (linkCount >= maxLinks) return NS_ERROR_FAILURE;
1717 
1718   // trait does not exist, so add it
1719 
1720   AnalysisPerToken analysis(aTraitIndex, aDistance, aProbability);
1721   if (mAnalysisStore.Length() == mNextAnalysisIndex)
1722     mAnalysisStore.InsertElementAt(mNextAnalysisIndex, analysis);
1723   else if (mAnalysisStore.Length() > mNextAnalysisIndex)
1724     mAnalysisStore.ReplaceElementsAt(mNextAnalysisIndex, 1, analysis);
1725   else  // we can only insert at the end of the array
1726     return NS_ERROR_FAILURE;
1727 
1728   if (lastLink)
1729     // the token had at least one link, so update the last link to point to
1730     // the new item
1731     mAnalysisStore[lastLink].mNextLink = mNextAnalysisIndex;
1732   else
1733     // need to update the token's first link
1734     token.mAnalysisLink = mNextAnalysisIndex;
1735   mNextAnalysisIndex++;
1736   return NS_OK;
1737 }
1738 
getAnalysisIndex(Token & token,uint32_t aTraitIndex)1739 uint32_t nsBayesianFilter::getAnalysisIndex(Token& token,
1740                                             uint32_t aTraitIndex) {
1741   uint32_t nextLink;
1742   uint32_t linkCount = 0, maxLinks = 100;
1743   for (nextLink = token.mAnalysisLink; nextLink && linkCount < maxLinks;
1744        linkCount++) {
1745     AnalysisPerToken& rAnalysis = mAnalysisStore[nextLink];
1746     if (rAnalysis.mTraitIndex == aTraitIndex) return nextLink;
1747     nextLink = rAnalysis.mNextLink;
1748   }
1749   NS_ASSERTION(linkCount < maxLinks, "corrupt analysis store");
1750 
1751   // Trait not found, indicate by zero
1752   return 0;
1753 }
1754 
// Single-message form of ClassifyTraitsInMessages(): wraps aMsgURI in a
// one-element array and forwards every other argument unchanged. Returns
// whatever ClassifyTraitsInMessages() returns.
NS_IMETHODIMP nsBayesianFilter::ClassifyTraitsInMessage(
    const nsACString& aMsgURI, const nsTArray<uint32_t>& aProTraits,
    const nsTArray<uint32_t>& aAntiTraits,
    nsIMsgTraitClassificationListener* aTraitListener, nsIMsgWindow* aMsgWindow,
    nsIJunkMailClassificationListener* aJunkListener) {
  AutoTArray<nsCString, 1> uris = {PromiseFlatCString(aMsgURI)};
  return ClassifyTraitsInMessages(uris, aProTraits, aAntiTraits, aTraitListener,
                                  aMsgWindow, aJunkListener);
}
1764 
ClassifyTraitsInMessages(const nsTArray<nsCString> & aMsgURIs,const nsTArray<uint32_t> & aProTraits,const nsTArray<uint32_t> & aAntiTraits,nsIMsgTraitClassificationListener * aTraitListener,nsIMsgWindow * aMsgWindow,nsIJunkMailClassificationListener * aJunkListener)1765 NS_IMETHODIMP nsBayesianFilter::ClassifyTraitsInMessages(
1766     const nsTArray<nsCString>& aMsgURIs, const nsTArray<uint32_t>& aProTraits,
1767     const nsTArray<uint32_t>& aAntiTraits,
1768     nsIMsgTraitClassificationListener* aTraitListener, nsIMsgWindow* aMsgWindow,
1769     nsIJunkMailClassificationListener* aJunkListener) {
1770   MOZ_ASSERT(aProTraits.Length() == aAntiTraits.Length());
1771   MessageClassifier* analyzer =
1772       new MessageClassifier(this, aJunkListener, aTraitListener, nullptr,
1773                             aProTraits, aAntiTraits, aMsgWindow, aMsgURIs);
1774 
1775   TokenStreamListener* tokenListener = new TokenStreamListener(analyzer);
1776 
1777   analyzer->setTokenListener(tokenListener);
1778   return tokenizeMessage(aMsgURIs[0], aMsgWindow, analyzer);
1779 }
1780 
// TokenAnalyzer used for training: once a message has been tokenized it hands
// the tokens, together with the message's old and new classifications, to
// nsBayesianFilter::observeMessage().
class MessageObserver : public TokenAnalyzer {
 public:
  // Stores the filter and both listeners, and takes private copies (Clone())
  // of the old and new classification arrays. The listeners are only
  // forwarded to observeMessage(), which null-checks each of them.
  MessageObserver(nsBayesianFilter* filter,
                  const nsTArray<uint32_t>& aOldClassifications,
                  const nsTArray<uint32_t>& aNewClassifications,
                  nsIJunkMailClassificationListener* aJunkListener,
                  nsIMsgTraitClassificationListener* aTraitListener)
      : mFilter(filter),
        mJunkMailPlugin(filter),
        mJunkListener(aJunkListener),
        mTraitListener(aTraitListener),
        mOldClassifications(aOldClassifications.Clone()),
        mNewClassifications(aNewClassifications.Clone()) {}

  // Passes the tokenized message (identified by mTokenSource) and the stored
  // classifications and listeners to the filter for training.
  virtual void analyzeTokens(Tokenizer& tokenizer) {
    mFilter->observeMessage(tokenizer, mTokenSource, mOldClassifications,
                            mNewClassifications, mJunkListener, mTraitListener);
    // release reference to listener, which will allow us to go away as well.
    mTokenListener = nullptr;
  }

 private:
  // Raw pointer used to call observeMessage().
  nsBayesianFilter* mFilter;
  // Reference-counted pointer to the same filter object; not otherwise used
  // in this class. Presumably it keeps the filter alive for the lifetime of
  // the observer -- TODO confirm.
  nsCOMPtr<nsIJunkMailPlugin> mJunkMailPlugin;
  nsCOMPtr<nsIJunkMailClassificationListener> mJunkListener;
  nsCOMPtr<nsIMsgTraitClassificationListener> mTraitListener;
  // Traits the message had before / should have after training.
  nsTArray<uint32_t> mOldClassifications;
  nsTArray<uint32_t> mNewClassifications;
};
1810 
SetMsgTraitClassification(const nsACString & aMsgURI,const nsTArray<uint32_t> & aOldTraits,const nsTArray<uint32_t> & aNewTraits,nsIMsgTraitClassificationListener * aTraitListener,nsIMsgWindow * aMsgWindow,nsIJunkMailClassificationListener * aJunkListener)1811 NS_IMETHODIMP nsBayesianFilter::SetMsgTraitClassification(
1812     const nsACString& aMsgURI, const nsTArray<uint32_t>& aOldTraits,
1813     const nsTArray<uint32_t>& aNewTraits,
1814     nsIMsgTraitClassificationListener* aTraitListener, nsIMsgWindow* aMsgWindow,
1815     nsIJunkMailClassificationListener* aJunkListener) {
1816   MessageObserver* analyzer = new MessageObserver(
1817       this, aOldTraits, aNewTraits, aJunkListener, aTraitListener);
1818   NS_ENSURE_TRUE(analyzer, NS_ERROR_OUT_OF_MEMORY);
1819 
1820   TokenStreamListener* tokenListener = new TokenStreamListener(analyzer);
1821   NS_ENSURE_TRUE(tokenListener, NS_ERROR_OUT_OF_MEMORY);
1822 
1823   analyzer->setTokenListener(tokenListener);
1824   return tokenizeMessage(aMsgURI, aMsgWindow, analyzer);
1825 }
1826 
1827 // set new message classifications for a message
observeMessage(Tokenizer & tokenizer,const nsACString & messageURL,nsTArray<uint32_t> & oldClassifications,nsTArray<uint32_t> & newClassifications,nsIJunkMailClassificationListener * aJunkListener,nsIMsgTraitClassificationListener * aTraitListener)1828 void nsBayesianFilter::observeMessage(
1829     Tokenizer& tokenizer, const nsACString& messageURL,
1830     nsTArray<uint32_t>& oldClassifications,
1831     nsTArray<uint32_t>& newClassifications,
1832     nsIJunkMailClassificationListener* aJunkListener,
1833     nsIMsgTraitClassificationListener* aTraitListener) {
1834   bool trainingDataWasDirty = mTrainingDataDirty;
1835 
1836   // Uhoh...if the user is re-training then the message may already be
1837   // classified and we are classifying it again with the same classification.
1838   // the old code would have removed the tokens for this message then added them
1839   // back. But this really hurts the message occurrence count for tokens if you
1840   // just removed training.dat and are re-training. See Bug #237095 for more
1841   // details. What can we do here? Well we can skip the token removal step if
1842   // the classifications are the same and assume the user is just re-training.
1843   // But this then allows users to re-classify the same message on the same
1844   // training set over and over again leading to data skew. But that's all I can
1845   // think to do right now to address this.....
1846   uint32_t oldLength = oldClassifications.Length();
1847   for (uint32_t index = 0; index < oldLength; index++) {
1848     uint32_t trait = oldClassifications.ElementAt(index);
1849     // skip removing if trait is also in the new set
1850     if (newClassifications.Contains(trait)) continue;
1851     // remove the tokens from the token set it is currently in
1852     uint32_t messageCount;
1853     messageCount = mCorpus.getMessageCount(trait);
1854     if (messageCount > 0) {
1855       mCorpus.setMessageCount(trait, messageCount - 1);
1856       mCorpus.forgetTokens(tokenizer, trait, 1);
1857       mTrainingDataDirty = true;
1858     }
1859   }
1860 
1861   nsMsgJunkStatus newClassification = nsIJunkMailPlugin::UNCLASSIFIED;
1862   uint32_t junkPercent =
1863       0;  // 0 here is no possibility of meeting the classification
1864   uint32_t newLength = newClassifications.Length();
1865   for (uint32_t index = 0; index < newLength; index++) {
1866     uint32_t trait = newClassifications.ElementAt(index);
1867     mCorpus.setMessageCount(trait, mCorpus.getMessageCount(trait) + 1);
1868     mCorpus.rememberTokens(tokenizer, trait, 1);
1869     mTrainingDataDirty = true;
1870 
1871     if (aJunkListener) {
1872       if (trait == kJunkTrait) {
1873         junkPercent = nsIJunkMailPlugin::IS_SPAM_SCORE;
1874         newClassification = nsIJunkMailPlugin::JUNK;
1875       } else if (trait == kGoodTrait) {
1876         junkPercent = nsIJunkMailPlugin::IS_HAM_SCORE;
1877         newClassification = nsIJunkMailPlugin::GOOD;
1878       }
1879     }
1880   }
1881 
1882   if (aJunkListener)
1883     aJunkListener->OnMessageClassified(messageURL, newClassification,
1884                                        junkPercent);
1885 
1886   if (aTraitListener) {
1887     // construct the outgoing listener arrays
1888     AutoTArray<uint32_t, kTraitAutoCapacity> traits;
1889     AutoTArray<uint32_t, kTraitAutoCapacity> percents;
1890     uint32_t newLength = newClassifications.Length();
1891     if (newLength > kTraitAutoCapacity) {
1892       traits.SetCapacity(newLength);
1893       percents.SetCapacity(newLength);
1894     }
1895     traits.AppendElements(newClassifications);
1896     for (uint32_t index = 0; index < newLength; index++)
1897       percents.AppendElement(100);  // This is 100 percent, or certainty
1898     aTraitListener->OnMessageTraitsClassified(messageURL, traits, percents);
1899   }
1900 
1901   if (mTrainingDataDirty && !trainingDataWasDirty && (mTimer != nullptr)) {
1902     // if training data became dirty just now, schedule flush
1903     // mMinFlushInterval msec from now
1904     MOZ_LOG(BayesianFilterLogModule, LogLevel::Debug,
1905             ("starting training data flush timer %i msec", mMinFlushInterval));
1906     mTimer->InitWithNamedFuncCallback(
1907         nsBayesianFilter::TimerCallback, this, mMinFlushInterval,
1908         nsITimer::TYPE_ONE_SHOT, "nsBayesianFilter::TimerCallback");
1909   }
1910 }
1911 
GetUserHasClassified(bool * aResult)1912 NS_IMETHODIMP nsBayesianFilter::GetUserHasClassified(bool* aResult) {
1913   *aResult = ((mCorpus.getMessageCount(kGoodTrait) +
1914                mCorpus.getMessageCount(kJunkTrait)) &&
1915               mCorpus.countTokens());
1916   return NS_OK;
1917 }
1918 
1919 // Set message classification (only allows junk and good)
SetMessageClassification(const nsACString & aMsgURL,nsMsgJunkStatus aOldClassification,nsMsgJunkStatus aNewClassification,nsIMsgWindow * aMsgWindow,nsIJunkMailClassificationListener * aListener)1920 NS_IMETHODIMP nsBayesianFilter::SetMessageClassification(
1921     const nsACString& aMsgURL, nsMsgJunkStatus aOldClassification,
1922     nsMsgJunkStatus aNewClassification, nsIMsgWindow* aMsgWindow,
1923     nsIJunkMailClassificationListener* aListener) {
1924   AutoTArray<uint32_t, 1> oldClassifications;
1925   AutoTArray<uint32_t, 1> newClassifications;
1926 
1927   // convert between classifications and trait
1928   if (aOldClassification == nsIJunkMailPlugin::JUNK)
1929     oldClassifications.AppendElement(kJunkTrait);
1930   else if (aOldClassification == nsIJunkMailPlugin::GOOD)
1931     oldClassifications.AppendElement(kGoodTrait);
1932   if (aNewClassification == nsIJunkMailPlugin::JUNK)
1933     newClassifications.AppendElement(kJunkTrait);
1934   else if (aNewClassification == nsIJunkMailPlugin::GOOD)
1935     newClassifications.AppendElement(kGoodTrait);
1936 
1937   MessageObserver* analyzer = new MessageObserver(
1938       this, oldClassifications, newClassifications, aListener, nullptr);
1939   NS_ENSURE_TRUE(analyzer, NS_ERROR_OUT_OF_MEMORY);
1940 
1941   TokenStreamListener* tokenListener = new TokenStreamListener(analyzer);
1942   NS_ENSURE_TRUE(tokenListener, NS_ERROR_OUT_OF_MEMORY);
1943 
1944   analyzer->setTokenListener(tokenListener);
1945   return tokenizeMessage(aMsgURL, aMsgWindow, analyzer);
1946 }
1947 
// Forwards to CorpusStore::resetTrainingData() and returns its result.
NS_IMETHODIMP nsBayesianFilter::ResetTrainingData() {
  return mCorpus.resetTrainingData();
}
1951 
DetailMessage(const nsACString & aMsgURI,uint32_t aProTrait,uint32_t aAntiTrait,nsIMsgTraitDetailListener * aDetailListener,nsIMsgWindow * aMsgWindow)1952 NS_IMETHODIMP nsBayesianFilter::DetailMessage(
1953     const nsACString& aMsgURI, uint32_t aProTrait, uint32_t aAntiTrait,
1954     nsIMsgTraitDetailListener* aDetailListener, nsIMsgWindow* aMsgWindow) {
1955   AutoTArray<uint32_t, 1> proTraits = {aProTrait};
1956   AutoTArray<uint32_t, 1> antiTraits = {aAntiTrait};
1957   AutoTArray<nsCString, 1> uris = {PromiseFlatCString(aMsgURI)};
1958 
1959   MessageClassifier* analyzer =
1960       new MessageClassifier(this, nullptr, nullptr, aDetailListener, proTraits,
1961                             antiTraits, aMsgWindow, uris);
1962   NS_ENSURE_TRUE(analyzer, NS_ERROR_OUT_OF_MEMORY);
1963 
1964   TokenStreamListener* tokenListener = new TokenStreamListener(analyzer);
1965   NS_ENSURE_TRUE(tokenListener, NS_ERROR_OUT_OF_MEMORY);
1966 
1967   analyzer->setTokenListener(tokenListener);
1968   return tokenizeMessage(aMsgURI, aMsgWindow, analyzer);
1969 }
1970 
1971 // nsIMsgCorpus implementation
1972 
CorpusCounts(uint32_t aTrait,uint32_t * aMessageCount,uint32_t * aTokenCount)1973 NS_IMETHODIMP nsBayesianFilter::CorpusCounts(uint32_t aTrait,
1974                                              uint32_t* aMessageCount,
1975                                              uint32_t* aTokenCount) {
1976   NS_ENSURE_ARG_POINTER(aTokenCount);
1977   *aTokenCount = mCorpus.countTokens();
1978   if (aTrait && aMessageCount) *aMessageCount = mCorpus.getMessageCount(aTrait);
1979   return NS_OK;
1980 }
1981 
// Forwards to CorpusStore::ClearTrait() for aTrait and returns its result.
NS_IMETHODIMP nsBayesianFilter::ClearTrait(uint32_t aTrait) {
  return mCorpus.ClearTrait(aTrait);
}
1985 
// Forwards to CorpusStore::UpdateData() with all arguments unchanged and
// returns its result. aFromTraits and aToTraits are parallel arrays and must
// have the same length (checked with MOZ_ASSERT).
NS_IMETHODIMP
nsBayesianFilter::UpdateData(nsIFile* aFile, bool aIsAdd,
                             const nsTArray<uint32_t>& aFromTraits,
                             const nsTArray<uint32_t>& aToTraits) {
  MOZ_ASSERT(aFromTraits.Length() == aToTraits.Length());
  return mCorpus.UpdateData(aFile, aIsAdd, aFromTraits, aToTraits);
}
1993 
1994 NS_IMETHODIMP
GetTokenCount(const nsACString & aWord,uint32_t aTrait,uint32_t * aCount)1995 nsBayesianFilter::GetTokenCount(const nsACString& aWord, uint32_t aTrait,
1996                                 uint32_t* aCount) {
1997   NS_ENSURE_ARG_POINTER(aCount);
1998   CorpusToken* t = mCorpus.get(PromiseFlatCString(aWord).get());
1999   uint32_t count = mCorpus.getTraitCount(t, aTrait);
2000   *aCount = count;
2001   return NS_OK;
2002 }
2003 
2004 /* Corpus Store */
2005 
2006 /*
2007     Format of the training file for version 1:
2008     [0xFEEDFACE]
2009     [number good messages][number bad messages]
2010     [number good tokens]
2011     [count][length of word]word
2012     ...
2013     [number bad tokens]
2014     [count][length of word]word
2015     ...
2016 
2017      Format of the trait file for version 1:
2018     [0xFCA93601]  (the 01 is the version)
2019     for each trait to write
2020       [id of trait to write] (0 means end of list)
2021       [number of messages per trait]
2022       for each token with non-zero count
2023         [count]
2024         [length of word]word
2025 */
2026 
CorpusStore()2027 CorpusStore::CorpusStore()
2028     : TokenHash(sizeof(CorpusToken)),
2029       mNextTraitIndex(1)  // skip 0 since index=0 will mean end of linked list
2030 {
2031   getTrainingFile(getter_AddRefs(mTrainingFile));
2032   mTraitStore.SetCapacity(kTraitStoreCapacity);
2033   TraitPerToken traitPT(0, 0);
2034   mTraitStore.AppendElement(traitPT);  // dummy 0th element
2035 }
2036 
~CorpusStore()2037 CorpusStore::~CorpusStore() {}
2038 
writeUInt32(FILE * stream,uint32_t value)2039 inline int writeUInt32(FILE* stream, uint32_t value) {
2040   value = PR_htonl(value);
2041   return fwrite(&value, sizeof(uint32_t), 1, stream);
2042 }
2043 
readUInt32(FILE * stream,uint32_t * value)2044 inline int readUInt32(FILE* stream, uint32_t* value) {
2045   int n = fread(value, sizeof(uint32_t), 1, stream);
2046   if (n == 1) {
2047     *value = PR_ntohl(*value);
2048   }
2049   return n;
2050 }
2051 
forgetTokens(Tokenizer & aTokenizer,uint32_t aTraitId,uint32_t aCount)2052 void CorpusStore::forgetTokens(Tokenizer& aTokenizer, uint32_t aTraitId,
2053                                uint32_t aCount) {
2054   // if we are forgetting the tokens for a message, should only
2055   // subtract 1 from the occurrence count for that token in the training set
2056   // because we assume we only bumped the training set count once per messages
2057   // containing the token.
2058   TokenEnumeration tokens = aTokenizer.getTokens();
2059   while (tokens.hasMoreTokens()) {
2060     CorpusToken* token = static_cast<CorpusToken*>(tokens.nextToken());
2061     remove(token->mWord, aTraitId, aCount);
2062   }
2063 }
2064 
rememberTokens(Tokenizer & aTokenizer,uint32_t aTraitId,uint32_t aCount)2065 void CorpusStore::rememberTokens(Tokenizer& aTokenizer, uint32_t aTraitId,
2066                                  uint32_t aCount) {
2067   TokenEnumeration tokens = aTokenizer.getTokens();
2068   while (tokens.hasMoreTokens()) {
2069     CorpusToken* token = static_cast<CorpusToken*>(tokens.nextToken());
2070     if (!token) {
2071       NS_ERROR("null token");
2072       continue;
2073     }
2074     add(token->mWord, aTraitId, aCount);
2075   }
2076 }
2077 
writeTokens(FILE * stream,bool shrink,uint32_t aTraitId)2078 bool CorpusStore::writeTokens(FILE* stream, bool shrink, uint32_t aTraitId) {
2079   uint32_t tokenCount = countTokens();
2080   uint32_t newTokenCount = 0;
2081 
2082   // calculate the tokens for this trait to write
2083 
2084   TokenEnumeration tokens = getTokens();
2085   for (uint32_t i = 0; i < tokenCount; ++i) {
2086     CorpusToken* token = static_cast<CorpusToken*>(tokens.nextToken());
2087     uint32_t count = getTraitCount(token, aTraitId);
2088     // Shrinking the token database is accomplished by dividing all token counts
2089     // by 2. If shrinking, we'll ignore counts < 2, otherwise only ignore counts
2090     // of < 1
2091     if ((shrink && count > 1) || (!shrink && count)) newTokenCount++;
2092   }
2093 
2094   if (writeUInt32(stream, newTokenCount) != 1) return false;
2095 
2096   if (newTokenCount > 0) {
2097     TokenEnumeration tokens = getTokens();
2098     for (uint32_t i = 0; i < tokenCount; ++i) {
2099       CorpusToken* token = static_cast<CorpusToken*>(tokens.nextToken());
2100       uint32_t wordCount = getTraitCount(token, aTraitId);
2101       if (shrink) wordCount /= 2;
2102       if (!wordCount) continue;  // Don't output zero count words
2103       if (writeUInt32(stream, wordCount) != 1) return false;
2104       uint32_t tokenLength = strlen(token->mWord);
2105       if (writeUInt32(stream, tokenLength) != 1) return false;
2106       if (fwrite(token->mWord, tokenLength, 1, stream) != 1) return false;
2107     }
2108   }
2109   return true;
2110 }
2111 
readTokens(FILE * stream,int64_t fileSize,uint32_t aTraitId,bool aIsAdd)2112 bool CorpusStore::readTokens(FILE* stream, int64_t fileSize, uint32_t aTraitId,
2113                              bool aIsAdd) {
2114   uint32_t tokenCount;
2115   if (readUInt32(stream, &tokenCount) != 1) return false;
2116 
2117   int64_t fpos = ftell(stream);
2118   if (fpos < 0) return false;
2119 
2120   uint32_t bufferSize = 4096;
2121   char* buffer = new char[bufferSize];
2122   if (!buffer) return false;
2123 
2124   for (uint32_t i = 0; i < tokenCount; ++i) {
2125     uint32_t count;
2126     if (readUInt32(stream, &count) != 1) break;
2127     uint32_t size;
2128     if (readUInt32(stream, &size) != 1) break;
2129     fpos += 8;
2130     if (fpos + size > fileSize) {
2131       delete[] buffer;
2132       return false;
2133     }
2134     if (size >= bufferSize) {
2135       delete[] buffer;
2136       while (size >= bufferSize) {
2137         bufferSize *= 2;
2138         if (bufferSize == 0) return false;
2139       }
2140       buffer = new char[bufferSize];
2141       if (!buffer) return false;
2142     }
2143     if (fread(buffer, size, 1, stream) != 1) break;
2144     fpos += size;
2145     buffer[size] = '\0';
2146     if (aIsAdd)
2147       add(buffer, aTraitId, count);
2148     else
2149       remove(buffer, aTraitId, count);
2150   }
2151 
2152   delete[] buffer;
2153 
2154   return true;
2155 }
2156 
getTrainingFile(nsIFile ** aTrainingFile)2157 nsresult CorpusStore::getTrainingFile(nsIFile** aTrainingFile) {
2158   // should we cache the profile manager's directory?
2159   nsCOMPtr<nsIFile> profileDir;
2160 
2161   nsresult rv = NS_GetSpecialDirectory(NS_APP_USER_PROFILE_50_DIR,
2162                                        getter_AddRefs(profileDir));
2163   NS_ENSURE_SUCCESS(rv, rv);
2164   rv = profileDir->Append(u"training.dat"_ns);
2165   NS_ENSURE_SUCCESS(rv, rv);
2166 
2167   return profileDir->QueryInterface(NS_GET_IID(nsIFile), (void**)aTrainingFile);
2168 }
2169 
getTraitFile(nsIFile ** aTraitFile)2170 nsresult CorpusStore::getTraitFile(nsIFile** aTraitFile) {
2171   // should we cache the profile manager's directory?
2172   nsCOMPtr<nsIFile> profileDir;
2173 
2174   nsresult rv = NS_GetSpecialDirectory(NS_APP_USER_PROFILE_50_DIR,
2175                                        getter_AddRefs(profileDir));
2176   NS_ENSURE_SUCCESS(rv, rv);
2177 
2178   rv = profileDir->Append(u"traits.dat"_ns);
2179   NS_ENSURE_SUCCESS(rv, rv);
2180 
2181   return profileDir->QueryInterface(NS_GET_IID(nsIFile), (void**)aTraitFile);
2182 }
2183 
// leading marker (0xFEEDFACE) written at the start of training.dat
static const char kMagicCookie[] = {'\xFE', '\xED', '\xFA', '\xCE'};

// random string used to identify trait file and version (last byte is version)
static const char kTraitCookie[] = {'\xFC', '\xA9', '\x36', '\x01'};
2188 
writeTrainingData(uint32_t aMaximumTokenCount)2189 void CorpusStore::writeTrainingData(uint32_t aMaximumTokenCount) {
2190   MOZ_LOG(BayesianFilterLogModule, LogLevel::Debug,
2191           ("writeTrainingData() entered"));
2192   if (!mTrainingFile) return;
2193 
2194   /*
2195    * For backwards compatibility, write the good and junk tokens to
2196    * training.dat; additional traits are added to a different file
2197    */
2198 
2199   // open the file, and write out training data
2200   FILE* stream;
2201   nsresult rv = mTrainingFile->OpenANSIFileDesc("wb", &stream);
2202   if (NS_FAILED(rv)) return;
2203 
2204   // If the number of tokens exceeds our limit, set the shrink flag
2205   bool shrink = false;
2206   if ((aMaximumTokenCount > 0) &&  // if 0, do not limit tokens
2207       (countTokens() > aMaximumTokenCount)) {
2208     shrink = true;
2209     MOZ_LOG(BayesianFilterLogModule, LogLevel::Warning,
2210             ("shrinking token data file"));
2211   }
2212 
2213   // We implement shrink by dividing counts by two
2214   uint32_t shrinkFactor = shrink ? 2 : 1;
2215 
2216   if (!((fwrite(kMagicCookie, sizeof(kMagicCookie), 1, stream) == 1) &&
2217         (writeUInt32(stream, getMessageCount(kGoodTrait) / shrinkFactor)) &&
2218         (writeUInt32(stream, getMessageCount(kJunkTrait) / shrinkFactor)) &&
2219         writeTokens(stream, shrink, kGoodTrait) &&
2220         writeTokens(stream, shrink, kJunkTrait))) {
2221     NS_WARNING("failed to write training data.");
2222     fclose(stream);
2223     // delete the training data file, since it is potentially corrupt.
2224     mTrainingFile->Remove(false);
2225   } else {
2226     fclose(stream);
2227   }
2228 
2229   /*
2230    * Write the remaining data to a second file traits.dat
2231    */
2232 
2233   if (!mTraitFile) {
2234     getTraitFile(getter_AddRefs(mTraitFile));
2235     if (!mTraitFile) return;
2236   }
2237 
2238   // open the file, and write out training data
2239   rv = mTraitFile->OpenANSIFileDesc("wb", &stream);
2240   if (NS_FAILED(rv)) return;
2241 
2242   uint32_t numberOfTraits = mMessageCounts.Length();
2243   bool error;
2244   while (1)  // break on error or done
2245   {
2246     if ((error = (fwrite(kTraitCookie, sizeof(kTraitCookie), 1, stream) != 1)))
2247       break;
2248 
2249     for (uint32_t index = 0; index < numberOfTraits; index++) {
2250       uint32_t trait = mMessageCountsId[index];
2251       if (trait == 1 || trait == 2)
2252         continue;  // junk traits are stored in training.dat
2253       if ((error = (writeUInt32(stream, trait) != 1))) break;
2254       if ((error = (writeUInt32(stream, mMessageCounts[index] / shrinkFactor) !=
2255                     1)))
2256         break;
2257       if ((error = !writeTokens(stream, shrink, trait))) break;
2258     }
2259     break;
2260   }
2261   // we add a 0 at the end to represent end of trait list
2262   error = writeUInt32(stream, 0) != 1;
2263 
2264   fclose(stream);
2265   if (error) {
2266     NS_WARNING("failed to write trait data.");
2267     // delete the trait data file, since it is probably corrupt.
2268     mTraitFile->Remove(false);
2269   }
2270 
2271   if (shrink) {
2272     // We'll clear the tokens, and read them back in from the file.
2273     // Yes this is slower than in place, but this is a rare event.
2274 
2275     if (countTokens()) {
2276       clearTokens();
2277       for (uint32_t index = 0; index < numberOfTraits; index++)
2278         mMessageCounts[index] = 0;
2279     }
2280 
2281     readTrainingData();
2282   }
2283 }
2284 
readTrainingData()2285 void CorpusStore::readTrainingData() {
2286   /*
2287    * To maintain backwards compatibility, good and junk traits
2288    * are stored in a file "training.dat"
2289    */
2290   if (!mTrainingFile) return;
2291 
2292   bool exists;
2293   nsresult rv = mTrainingFile->Exists(&exists);
2294   if (NS_FAILED(rv) || !exists) return;
2295 
2296   FILE* stream;
2297   rv = mTrainingFile->OpenANSIFileDesc("rb", &stream);
2298   if (NS_FAILED(rv)) return;
2299 
2300   int64_t fileSize;
2301   rv = mTrainingFile->GetFileSize(&fileSize);
2302   if (NS_FAILED(rv)) return;
2303 
2304   // FIXME:  should make sure that the tokenizers are empty.
2305   char cookie[4];
2306   uint32_t goodMessageCount = 0, junkMessageCount = 0;
2307   if (!((fread(cookie, sizeof(cookie), 1, stream) == 1) &&
2308         (memcmp(cookie, kMagicCookie, sizeof(cookie)) == 0) &&
2309         (readUInt32(stream, &goodMessageCount) == 1) &&
2310         (readUInt32(stream, &junkMessageCount) == 1) &&
2311         readTokens(stream, fileSize, kGoodTrait, true) &&
2312         readTokens(stream, fileSize, kJunkTrait, true))) {
2313     NS_WARNING("failed to read training data.");
2314     MOZ_LOG(BayesianFilterLogModule, LogLevel::Error,
2315             ("failed to read training data."));
2316   }
2317   setMessageCount(kGoodTrait, goodMessageCount);
2318   setMessageCount(kJunkTrait, junkMessageCount);
2319 
2320   fclose(stream);
2321 
2322   /*
2323    * Additional traits are stored in traits.dat
2324    */
2325 
2326   if (!mTraitFile) {
2327     getTraitFile(getter_AddRefs(mTraitFile));
2328     if (!mTraitFile) return;
2329   }
2330 
2331   rv = mTraitFile->Exists(&exists);
2332   if (NS_FAILED(rv) || !exists) return;
2333 
2334   nsTArray<uint32_t> empty;
2335   rv = UpdateData(mTraitFile, true, empty, empty);
2336 
2337   if (NS_FAILED(rv)) {
2338     NS_WARNING("failed to read training data.");
2339     MOZ_LOG(BayesianFilterLogModule, LogLevel::Error,
2340             ("failed to read training data."));
2341   }
2342   return;
2343 }
2344 
resetTrainingData()2345 nsresult CorpusStore::resetTrainingData() {
2346   // clear out our in memory training tokens...
2347   if (countTokens()) clearTokens();
2348 
2349   uint32_t length = mMessageCounts.Length();
2350   for (uint32_t index = 0; index < length; index++) mMessageCounts[index] = 0;
2351 
2352   if (mTrainingFile) mTrainingFile->Remove(false);
2353   if (mTraitFile) mTraitFile->Remove(false);
2354   return NS_OK;
2355 }
2356 
get(const char * word)2357 inline CorpusToken* CorpusStore::get(const char* word) {
2358   return static_cast<CorpusToken*>(TokenHash::get(word));
2359 }
2360 
updateTrait(CorpusToken * token,uint32_t aTraitId,int32_t aCountChange)2361 nsresult CorpusStore::updateTrait(CorpusToken* token, uint32_t aTraitId,
2362                                   int32_t aCountChange) {
2363   NS_ENSURE_ARG_POINTER(token);
2364   uint32_t nextLink = token->mTraitLink;
2365   uint32_t lastLink = 0;
2366 
2367   uint32_t linkCount, maxLinks = 100;  // sanity check
2368   for (linkCount = 0; nextLink && linkCount < maxLinks; linkCount++) {
2369     TraitPerToken& traitPT = mTraitStore[nextLink];
2370     if (traitPT.mId == aTraitId) {
2371       // be careful with signed versus unsigned issues here
2372       if (static_cast<int32_t>(traitPT.mCount) + aCountChange > 0)
2373         traitPT.mCount += aCountChange;
2374       else
2375         traitPT.mCount = 0;
2376       // we could delete zero count traits here, but let's not. It's rare
2377       // anyway.
2378       return NS_OK;
2379     }
2380     lastLink = nextLink;
2381     nextLink = traitPT.mNextLink;
2382   }
2383   if (linkCount >= maxLinks) return NS_ERROR_FAILURE;
2384 
2385   // trait does not exist, so add it
2386 
2387   if (aCountChange > 0)  // don't set a negative count
2388   {
2389     TraitPerToken traitPT(aTraitId, aCountChange);
2390     if (mTraitStore.Length() == mNextTraitIndex)
2391       mTraitStore.InsertElementAt(mNextTraitIndex, traitPT);
2392     else if (mTraitStore.Length() > mNextTraitIndex)
2393       mTraitStore.ReplaceElementsAt(mNextTraitIndex, 1, traitPT);
2394     else
2395       return NS_ERROR_FAILURE;
2396     if (lastLink)
2397       // the token had a parent, so update it
2398       mTraitStore[lastLink].mNextLink = mNextTraitIndex;
2399     else
2400       // need to update the token's root link
2401       token->mTraitLink = mNextTraitIndex;
2402     mNextTraitIndex++;
2403   }
2404   return NS_OK;
2405 }
2406 
getTraitCount(CorpusToken * token,uint32_t aTraitId)2407 uint32_t CorpusStore::getTraitCount(CorpusToken* token, uint32_t aTraitId) {
2408   uint32_t nextLink;
2409   if (!token || !(nextLink = token->mTraitLink)) return 0;
2410 
2411   uint32_t linkCount, maxLinks = 100;  // sanity check
2412   for (linkCount = 0; nextLink && linkCount < maxLinks; linkCount++) {
2413     TraitPerToken& traitPT = mTraitStore[nextLink];
2414     if (traitPT.mId == aTraitId) return traitPT.mCount;
2415     nextLink = traitPT.mNextLink;
2416   }
2417   NS_ASSERTION(linkCount < maxLinks, "Corrupt trait count store");
2418 
2419   // trait not found (or error), so count is zero
2420   return 0;
2421 }
2422 
add(const char * word,uint32_t aTraitId,uint32_t aCount)2423 CorpusToken* CorpusStore::add(const char* word, uint32_t aTraitId,
2424                               uint32_t aCount) {
2425   CorpusToken* token = static_cast<CorpusToken*>(TokenHash::add(word));
2426   if (token) {
2427     MOZ_LOG(BayesianFilterLogModule, LogLevel::Debug,
2428             ("adding word to corpus store: %s (Trait=%d) (deltaCount=%d)", word,
2429              aTraitId, aCount));
2430     updateTrait(token, aTraitId, aCount);
2431   }
2432   return token;
2433 }
2434 
remove(const char * word,uint32_t aTraitId,uint32_t aCount)2435 void CorpusStore::remove(const char* word, uint32_t aTraitId, uint32_t aCount) {
2436   MOZ_LOG(BayesianFilterLogModule, LogLevel::Debug,
2437           ("remove word: %s (TraitId=%d) (Count=%d)", word, aTraitId, aCount));
2438   CorpusToken* token = get(word);
2439   if (token) updateTrait(token, aTraitId, -static_cast<int32_t>(aCount));
2440 }
2441 
getMessageCount(uint32_t aTraitId)2442 uint32_t CorpusStore::getMessageCount(uint32_t aTraitId) {
2443   size_t index = mMessageCountsId.IndexOf(aTraitId);
2444   if (index == mMessageCountsId.NoIndex) return 0;
2445   return mMessageCounts.ElementAt(index);
2446 }
2447 
setMessageCount(uint32_t aTraitId,uint32_t aCount)2448 void CorpusStore::setMessageCount(uint32_t aTraitId, uint32_t aCount) {
2449   size_t index = mMessageCountsId.IndexOf(aTraitId);
2450   if (index == mMessageCountsId.NoIndex) {
2451     mMessageCounts.AppendElement(aCount);
2452     mMessageCountsId.AppendElement(aTraitId);
2453   } else {
2454     mMessageCounts[index] = aCount;
2455   }
2456 }
2457 
UpdateData(nsIFile * aFile,bool aIsAdd,const nsTArray<uint32_t> & aFromTraits,const nsTArray<uint32_t> & aToTraits)2458 nsresult CorpusStore::UpdateData(nsIFile* aFile, bool aIsAdd,
2459                                  const nsTArray<uint32_t>& aFromTraits,
2460                                  const nsTArray<uint32_t>& aToTraits) {
2461   NS_ENSURE_ARG_POINTER(aFile);
2462   MOZ_ASSERT(aFromTraits.Length() == aToTraits.Length());
2463 
2464   int64_t fileSize;
2465   nsresult rv = aFile->GetFileSize(&fileSize);
2466   NS_ENSURE_SUCCESS(rv, rv);
2467 
2468   FILE* stream;
2469   rv = aFile->OpenANSIFileDesc("rb", &stream);
2470   NS_ENSURE_SUCCESS(rv, rv);
2471 
2472   bool error;
2473   do  // break on error or done
2474   {
2475     char cookie[4];
2476     if ((error = (fread(cookie, sizeof(cookie), 1, stream) != 1))) break;
2477 
2478     if ((error = memcmp(cookie, kTraitCookie, sizeof(cookie)))) break;
2479 
2480     uint32_t fileTrait;
2481     while (!(error = (readUInt32(stream, &fileTrait) != 1)) && fileTrait) {
2482       uint32_t count;
2483       if ((error = (readUInt32(stream, &count) != 1))) break;
2484 
2485       uint32_t localTrait = fileTrait;
2486       // remap the trait
2487       for (uint32_t i = 0; i < aFromTraits.Length(); i++) {
2488         if (aFromTraits[i] == fileTrait) localTrait = aToTraits[i];
2489       }
2490 
2491       uint32_t messageCount = getMessageCount(localTrait);
2492       if (aIsAdd)
2493         messageCount += count;
2494       else if (count > messageCount)
2495         messageCount = 0;
2496       else
2497         messageCount -= count;
2498       setMessageCount(localTrait, messageCount);
2499 
2500       if ((error = !readTokens(stream, fileSize, localTrait, aIsAdd))) break;
2501     }
2502     break;
2503   } while (0);
2504 
2505   fclose(stream);
2506 
2507   if (error) return NS_ERROR_FAILURE;
2508   return NS_OK;
2509 }
2510 
ClearTrait(uint32_t aTrait)2511 nsresult CorpusStore::ClearTrait(uint32_t aTrait) {
2512   // clear message counts
2513   setMessageCount(aTrait, 0);
2514 
2515   TokenEnumeration tokens = getTokens();
2516   while (tokens.hasMoreTokens()) {
2517     CorpusToken* token = static_cast<CorpusToken*>(tokens.nextToken());
2518     int32_t wordCount = static_cast<int32_t>(getTraitCount(token, aTrait));
2519     updateTrait(token, aTrait, -wordCount);
2520   }
2521   return NS_OK;
2522 }
2523