1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ 2 /* This Source Code Form is subject to the terms of the Mozilla Public 3 * License, v. 2.0. If a copy of the MPL was not distributed with this 4 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ 5 6 #ifndef nsBayesianFilter_h__ 7 #define nsBayesianFilter_h__ 8 9 #include <stdio.h> 10 #include "nsCOMPtr.h" 11 #include "nsIMsgFilterPlugin.h" 12 #include "PLDHashTable.h" 13 #include "nsITimer.h" 14 #include "nsTArray.h" 15 #include "nsString.h" 16 #include "nsWeakReference.h" 17 #include "nsIObserver.h" 18 #include "mozilla/intl/WordBreaker.h" 19 20 #include "mozilla/ArenaAllocator.h" 21 22 #define DEFAULT_MIN_INTERVAL_BETWEEN_WRITES 15 * 60 * 1000 23 24 struct Token; 25 class TokenEnumeration; 26 class TokenAnalyzer; 27 class nsIMsgWindow; 28 class nsIUTF8StringEnumerator; 29 struct BaseToken; 30 struct CorpusToken; 31 32 /** 33 * Helper class to enumerate Token objects in a PLDHashTable 34 * safely and without copying (see bugzilla #174859). The 35 * enumeration is safe to use until an Add() 36 * or Remove() is performed on the table. 37 */ 38 class TokenEnumeration { 39 public: 40 explicit TokenEnumeration(PLDHashTable* table); 41 bool hasMoreTokens(); 42 BaseToken* nextToken(); 43 44 private: 45 PLDHashTable::Iterator mIterator; 46 }; 47 48 // A trait is some aspect of a message, like being junk or tagged as 49 // Personal, that the statistical classifier should track. The Trait 50 // structure is a per-token representation of information pertaining to 51 // a message trait. 52 // 53 // Traits per token are maintained as a linked list. 54 // 55 struct TraitPerToken { 56 uint32_t mId; // identifying number for a trait 57 uint32_t mCount; // count of messages with this token and trait 58 uint32_t mNextLink; // index in mTraitStore for the next trait, or 0 59 // for none 60 TraitPerToken(uint32_t aId, uint32_t aCount); // inititializer 61 }; 62 63 // An Analysis is the statistical results for a particular message, a 64 // particular token, and for a particular pair of trait/antitrait, that 65 // is then used in subsequent analysis to score the message. 66 // 67 // Analyses per token are maintained as a linked list. 68 // 69 struct AnalysisPerToken { 70 uint32_t mTraitIndex; // index representing a protrait/antitrait pair. 71 // So if we are analyzing 3 different traits, then 72 // the first trait is 0, the second 1, etc. 73 double mDistance; // absolute value of mProbability - 0.5 74 double mProbability; // relative indicator of match of trait to token 75 uint32_t mNextLink; // index in mAnalysisStore for the Analysis object 76 // for the next trait index, or 0 for none. 77 // initializer 78 AnalysisPerToken(uint32_t aTraitIndex, double aDistance, double aProbability); 79 }; 80 81 class TokenHash { 82 public: 83 virtual ~TokenHash(); 84 /** 85 * Clears out the previous message tokens. 86 */ 87 nsresult clearTokens(); 88 uint32_t countTokens(); 89 TokenEnumeration getTokens(); 90 BaseToken* add(const char* word); 91 92 protected: 93 explicit TokenHash(uint32_t entrySize); 94 mozilla::ArenaAllocator<16384, 2> mWordPool; 95 uint32_t mEntrySize; 96 PLDHashTable mTokenTable; 97 char* copyWord(const char* word, uint32_t len); 98 BaseToken* get(const char* word); 99 }; 100 101 class Tokenizer : public TokenHash { 102 public: 103 Tokenizer(); 104 ~Tokenizer(); 105 106 Token* get(const char* word); 107 108 // The training set keeps an occurrence count on each word. This count 109 // is supposed to count the # of messages it occurs in. 110 // When add/remove is called while tokenizing a message and NOT the training 111 // set, 112 // 113 Token* add(const char* word, uint32_t count = 1); 114 115 Token* copyTokens(); 116 117 void tokenize(const char* text); 118 119 /** 120 * Creates specific tokens based on the mime headers for the message being 121 * tokenized 122 */ 123 void tokenizeHeaders(nsIUTF8StringEnumerator* aHeaderNames, 124 nsIUTF8StringEnumerator* aHeaderValues); 125 126 void tokenizeAttachment(const char* aContentType, const char* aFileName); 127 128 nsCString mBodyDelimiters; // delimiters for body tokenization 129 nsCString mHeaderDelimiters; // delimiters for header tokenization 130 131 // arrays of extra headers to tokenize / to not tokenize 132 nsTArray<nsCString> mEnabledHeaders; 133 nsTArray<nsCString> mDisabledHeaders; 134 // Delimiters used in tokenizing a particular header. 135 // Parallel array to mEnabledHeaders 136 nsTArray<nsCString> mEnabledHeadersDelimiters; 137 bool mCustomHeaderTokenization; // Are there any preference-set tokenization 138 // customizations? 139 uint32_t mMaxLengthForToken; // maximum length of a token 140 // should we convert iframe to div during tokenization? 141 bool mIframeToDiv; 142 143 private: 144 void tokenize_ascii_word(char* word); 145 void tokenize_japanese_word(char* chunk); 146 inline void addTokenForHeader(const char* aTokenPrefix, nsACString& aValue, 147 bool aTokenizeValue = false, 148 const char* aDelimiters = nullptr); 149 nsresult stripHTML(const nsAString& inString, nsAString& outString); 150 // helper function to escape \n, \t, etc from a CString 151 void UnescapeCString(nsCString& aCString); 152 nsresult ScannerNext(const char16_t* text, int32_t length, int32_t pos, 153 bool isLastBuffer, int32_t* begin, int32_t* end, 154 bool* _retval); 155 RefPtr<mozilla::intl::WordBreaker> mWordBreaker; 156 }; 157 158 /** 159 * Implements storage of a collection of message tokens and counts for 160 * a corpus of classified messages 161 */ 162 163 class CorpusStore : public TokenHash { 164 public: 165 CorpusStore(); 166 ~CorpusStore(); 167 168 /** 169 * retrieve the token structure for a particular string 170 * 171 * @param word the character representation of the token 172 * 173 * @return token structure containing counts, null if not found 174 */ 175 CorpusToken* get(const char* word); 176 177 /** 178 * add tokens to the storage, or increment counts if already exists. 179 * 180 * @param aTokenizer tokenizer for the list of tokens to remember 181 * @param aTraitId id for the trait whose counts will be remembered 182 * @param aCount number of new messages represented by the token list 183 */ 184 void rememberTokens(Tokenizer& aTokenizer, uint32_t aTraitId, 185 uint32_t aCount); 186 187 /** 188 * decrement counts for tokens in the storage, removing if all counts 189 * are zero 190 * 191 * @param aTokenizer tokenizer for the list of tokens to forget 192 * @param aTraitId id for the trait whose counts will be removed 193 * @param aCount number of messages represented by the token list 194 */ 195 void forgetTokens(Tokenizer& aTokenizer, uint32_t aTraitId, uint32_t aCount); 196 197 /** 198 * write the corpus information to file storage 199 * 200 * @param aMaximumTokenCount prune tokens if number of tokens exceeds 201 * this value. == 0 for no pruning 202 */ 203 void writeTrainingData(uint32_t aMaximumTokenCount); 204 205 /** 206 * read the corpus information from file storage 207 */ 208 void readTrainingData(); 209 210 /** 211 * delete the local corpus storage file and data 212 */ 213 nsresult resetTrainingData(); 214 215 /** 216 * get the count of messages whose tokens are stored that are associated 217 * with a trait 218 * 219 * @param aTraitId identifier for the trait 220 * @return number of messages for that trait 221 */ 222 uint32_t getMessageCount(uint32_t aTraitId); 223 224 /** 225 * set the count of messages whose tokens are stored that are associated 226 * with a trait 227 * 228 * @param aTraitId identifier for the trait 229 * @param aCount number of messages for that trait 230 */ 231 void setMessageCount(uint32_t aTraitId, uint32_t aCount); 232 233 /** 234 * get the count of messages associated with a particular token and trait 235 * 236 * @param token the token string and associated counts 237 * @param aTraitId identifier for the trait 238 */ 239 uint32_t getTraitCount(CorpusToken* token, uint32_t aTraitId); 240 241 /** 242 * Add (or remove) data from a particular file to the corpus data. 243 * 244 * @param aFile the file with the data, in the format: 245 * 246 * Format of the trait file for version 1: 247 * [0xFCA93601] (the 01 is the version) 248 * for each trait to write: 249 * [id of trait to write] (0 means end of list) 250 * [number of messages per trait] 251 * for each token with non-zero count 252 * [count] 253 * [length of word]word 254 * 255 * @param aIsAdd should the data be added, or removed? true if adding, 256 * else removing. 257 * 258 * @param aFromTraits array of trait ids used in aFile. If aFile contains 259 * trait ids that are not in this array, they are not 260 * remapped, but assumed to be local trait ids. 261 * 262 * @param aToTraits array of trait ids, corresponding to elements of 263 * aFromTraits, that represent the local trait ids to be 264 * used in storing data from aFile into the local corpus. 265 * 266 */ 267 nsresult UpdateData(nsIFile* aFile, bool aIsAdd, 268 const nsTArray<uint32_t>& aFromTraits, 269 const nsTArray<uint32_t>& aToTraits); 270 271 /** 272 * remove all counts (message and tokens) for a trait id 273 * 274 * @param aTrait trait id for the trait to remove 275 */ 276 nsresult ClearTrait(uint32_t aTrait); 277 278 protected: 279 /** 280 * return the local corpus storage file for junk traits 281 */ 282 nsresult getTrainingFile(nsIFile** aFile); 283 284 /** 285 * return the local corpus storage file for non-junk traits 286 */ 287 nsresult getTraitFile(nsIFile** aFile); 288 289 /** 290 * read token strings from the data file 291 * 292 * @param stream file stream with token data 293 * @param fileSize file size 294 * @param aTraitId id for the trait whose counts will be read 295 * @param aIsAdd true to add the counts, false to remove them 296 * 297 * @return true if successful, false if error 298 */ 299 bool readTokens(FILE* stream, int64_t fileSize, uint32_t aTraitId, 300 bool aIsAdd); 301 302 /** 303 * write token strings to the data file 304 */ 305 bool writeTokens(FILE* stream, bool shrink, uint32_t aTraitId); 306 307 /** 308 * remove counts for a token string 309 */ 310 void remove(const char* word, uint32_t aTraitId, uint32_t aCount); 311 312 /** 313 * add counts for a token string, adding the token string if new 314 */ 315 CorpusToken* add(const char* word, uint32_t aTraitId, uint32_t aCount); 316 317 /** 318 * change counts in a trait in the traits array, adding the trait if needed 319 */ 320 nsresult updateTrait(CorpusToken* token, uint32_t aTraitId, 321 int32_t aCountChange); 322 nsCOMPtr<nsIFile> mTrainingFile; // file used to store junk training data 323 nsCOMPtr<nsIFile> mTraitFile; // file used to store non-junk 324 // training data 325 nsTArray<TraitPerToken> mTraitStore; // memory for linked-list of counts 326 uint32_t mNextTraitIndex; // index in mTraitStore to first empty 327 // TraitPerToken 328 nsTArray<uint32_t> mMessageCounts; // count of messages per trait 329 // represented in the store 330 nsTArray<uint32_t> mMessageCountsId; // Parallel array to mMessageCounts, 331 // with the corresponding trait ID 332 }; 333 334 class nsBayesianFilter : public nsIJunkMailPlugin, 335 nsIMsgCorpus, 336 nsIObserver, 337 nsSupportsWeakReference { 338 public: 339 NS_DECL_ISUPPORTS 340 NS_DECL_NSIMSGFILTERPLUGIN 341 NS_DECL_NSIJUNKMAILPLUGIN 342 NS_DECL_NSIMSGCORPUS 343 NS_DECL_NSIOBSERVER 344 345 nsBayesianFilter(); 346 347 nsresult Init(); 348 349 nsresult tokenizeMessage(const nsACString& messageURI, 350 nsIMsgWindow* aMsgWindow, TokenAnalyzer* analyzer); 351 void classifyMessage(Tokenizer& tokens, const nsACString& messageURI, 352 nsIJunkMailClassificationListener* listener); 353 354 void classifyMessage(Tokenizer& tokenizer, const nsACString& messageURI, 355 nsTArray<uint32_t>& aProTraits, 356 nsTArray<uint32_t>& aAntiTraits, 357 nsIJunkMailClassificationListener* listener, 358 nsIMsgTraitClassificationListener* aTraitListener, 359 nsIMsgTraitDetailListener* aDetailListener); 360 361 void observeMessage(Tokenizer& tokens, const nsACString& messageURI, 362 nsTArray<uint32_t>& oldClassifications, 363 nsTArray<uint32_t>& newClassifications, 364 nsIJunkMailClassificationListener* listener, 365 nsIMsgTraitClassificationListener* aTraitListener); 366 367 protected: 368 virtual ~nsBayesianFilter(); 369 370 static void TimerCallback(nsITimer* aTimer, void* aClosure); 371 372 CorpusStore mCorpus; 373 double mJunkProbabilityThreshold; 374 int32_t mMaximumTokenCount; 375 bool mTrainingDataDirty; 376 int32_t mMinFlushInterval; // in milliseconds, must be positive 377 // and not too close to 0 378 nsCOMPtr<nsITimer> mTimer; 379 380 // index in mAnalysisStore for first empty AnalysisPerToken 381 uint32_t mNextAnalysisIndex; 382 // memory for linked list of AnalysisPerToken objects 383 nsTArray<AnalysisPerToken> mAnalysisStore; 384 /** 385 * Determine the location in mAnalysisStore where the AnalysisPerToken 386 * object for a particular token and trait is stored 387 */ 388 uint32_t getAnalysisIndex(Token& token, uint32_t aTraitIndex); 389 /** 390 * Set the value of the AnalysisPerToken object for a particular 391 * token and trait 392 */ 393 nsresult setAnalysis(Token& token, uint32_t aTraitIndex, double aDistance, 394 double aProbability); 395 }; 396 397 #endif // _nsBayesianFilter_h__ 398