1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2 /* This Source Code Form is subject to the terms of the Mozilla Public
3  * License, v. 2.0. If a copy of the MPL was not distributed with this
4  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
5 
6 #ifndef nsBayesianFilter_h__
7 #define nsBayesianFilter_h__
8 
9 #include <stdio.h>
10 #include "nsCOMPtr.h"
11 #include "nsIMsgFilterPlugin.h"
12 #include "PLDHashTable.h"
13 #include "nsITimer.h"
14 #include "nsTArray.h"
15 #include "nsString.h"
16 #include "nsWeakReference.h"
17 #include "nsIObserver.h"
18 #include "mozilla/intl/WordBreaker.h"
19 
20 #include "mozilla/ArenaAllocator.h"
21 
// Default minimum interval between writes: 15 * 60 * 1000, i.e. 15 minutes
// when the value is read as milliseconds.
// The replacement list is parenthesized so the macro always expands to a
// single operand (e.g. `x / DEFAULT_MIN_INTERVAL_BETWEEN_WRITES` divides by
// the full product instead of by 15 only).
#define DEFAULT_MIN_INTERVAL_BETWEEN_WRITES (15 * 60 * 1000)
23 
// Forward declarations for types this header only names through pointers,
// references or smart pointers.
struct BaseToken;
struct CorpusToken;
struct Token;
class TokenAnalyzer;
class TokenEnumeration;
// nsIFile is used by CorpusStore below (nsIFile*, nsIFile**,
// nsCOMPtr<nsIFile>); declare it here so this header does not depend on
// another include happening to provide it.
class nsIFile;
class nsIMsgWindow;
class nsIUTF8StringEnumerator;
31 
32 /**
33  * Helper class to enumerate Token objects in a PLDHashTable
34  * safely and without copying (see bugzilla #174859). The
35  * enumeration is safe to use until an Add()
36  * or Remove() is performed on the table.
37  */
38 class TokenEnumeration {
39  public:
40   explicit TokenEnumeration(PLDHashTable* table);
41   bool hasMoreTokens();
42   BaseToken* nextToken();
43 
44  private:
45   PLDHashTable::Iterator mIterator;
46 };
47 
// A "trait" is some aspect of a message that the statistical classifier
// should track, for example being junk or being tagged as Personal.
// TraitPerToken is the per-token record of information about one such
// message trait.
//
// The traits recorded for a token are kept as a linked list, chained
// through mNextLink.
//
struct TraitPerToken {
  // Initializer from a trait id and a count.
  TraitPerToken(uint32_t aId, uint32_t aCount);

  // Identifying number for the trait.
  uint32_t mId;
  // Count of messages with this token and this trait.
  uint32_t mCount;
  // Index in mTraitStore for the next trait, or 0 for none.
  uint32_t mNextLink;
};
62 
// An Analysis holds the statistical results for one message, one token and
// one trait/antitrait pair. Those results are used by later analysis steps
// to score the message.
//
// The analyses recorded for a token are kept as a linked list, chained
// through mNextLink.
//
struct AnalysisPerToken {
  // Initializer from a trait index, a distance and a probability.
  AnalysisPerToken(uint32_t aTraitIndex, double aDistance, double aProbability);

  // Index representing a protrait/antitrait pair. When 3 different traits
  // are being analyzed, the first trait is 0, the second 1, and so on.
  uint32_t mTraitIndex;
  // Absolute value of mProbability - 0.5.
  double mDistance;
  // Relative indicator of match of trait to token.
  double mProbability;
  // Index in mAnalysisStore for the Analysis object for the next trait
  // index, or 0 for none.
  uint32_t mNextLink;
};
80 
81 class TokenHash {
82  public:
83   virtual ~TokenHash();
84   /**
85    * Clears out the previous message tokens.
86    */
87   nsresult clearTokens();
88   uint32_t countTokens();
89   TokenEnumeration getTokens();
90   BaseToken* add(const char* word);
91 
92  protected:
93   explicit TokenHash(uint32_t entrySize);
94   mozilla::ArenaAllocator<16384, 2> mWordPool;
95   uint32_t mEntrySize;
96   PLDHashTable mTokenTable;
97   char* copyWord(const char* word, uint32_t len);
98   BaseToken* get(const char* word);
99 };
100 
101 class Tokenizer : public TokenHash {
102  public:
103   Tokenizer();
104   ~Tokenizer();
105 
106   Token* get(const char* word);
107 
108   // The training set keeps an occurrence count on each word. This count
109   // is supposed to count the # of messages it occurs in.
110   // When add/remove is called while tokenizing a message and NOT the training
111   // set,
112   //
113   Token* add(const char* word, uint32_t count = 1);
114 
115   Token* copyTokens();
116 
117   void tokenize(const char* text);
118 
119   /**
120    *  Creates specific tokens based on the mime headers for the message being
121    * tokenized
122    */
123   void tokenizeHeaders(nsIUTF8StringEnumerator* aHeaderNames,
124                        nsIUTF8StringEnumerator* aHeaderValues);
125 
126   void tokenizeAttachment(const char* aContentType, const char* aFileName);
127 
128   nsCString mBodyDelimiters;    // delimiters for body tokenization
129   nsCString mHeaderDelimiters;  // delimiters for header tokenization
130 
131   // arrays of extra headers to tokenize / to not tokenize
132   nsTArray<nsCString> mEnabledHeaders;
133   nsTArray<nsCString> mDisabledHeaders;
134   // Delimiters used in tokenizing a particular header.
135   // Parallel array to mEnabledHeaders
136   nsTArray<nsCString> mEnabledHeadersDelimiters;
137   bool mCustomHeaderTokenization;  // Are there any preference-set tokenization
138                                    // customizations?
139   uint32_t mMaxLengthForToken;     // maximum length of a token
140   // should we convert iframe to div during tokenization?
141   bool mIframeToDiv;
142 
143  private:
144   void tokenize_ascii_word(char* word);
145   void tokenize_japanese_word(char* chunk);
146   inline void addTokenForHeader(const char* aTokenPrefix, nsACString& aValue,
147                                 bool aTokenizeValue = false,
148                                 const char* aDelimiters = nullptr);
149   nsresult stripHTML(const nsAString& inString, nsAString& outString);
150   // helper function to escape \n, \t, etc from a CString
151   void UnescapeCString(nsCString& aCString);
152   nsresult ScannerNext(const char16_t* text, int32_t length, int32_t pos,
153                        bool isLastBuffer, int32_t* begin, int32_t* end,
154                        bool* _retval);
155   RefPtr<mozilla::intl::WordBreaker> mWordBreaker;
156 };
157 
158 /**
159  * Implements storage of a collection of message tokens and counts for
160  * a corpus of classified messages
161  */
162 
163 class CorpusStore : public TokenHash {
164  public:
165   CorpusStore();
166   ~CorpusStore();
167 
168   /**
169    * retrieve the token structure for a particular string
170    *
171    * @param word  the character representation of the token
172    *
173    * @return      token structure containing counts, null if not found
174    */
175   CorpusToken* get(const char* word);
176 
177   /**
178    * add tokens to the storage, or increment counts if already exists.
179    *
180    * @param aTokenizer tokenizer for the list of tokens to remember
181    * @param aTraitId   id for the trait whose counts will be remembered
182    * @param aCount     number of new messages represented by the token list
183    */
184   void rememberTokens(Tokenizer& aTokenizer, uint32_t aTraitId,
185                       uint32_t aCount);
186 
187   /**
188    * decrement counts for tokens in the storage, removing if all counts
189    * are zero
190    *
191    * @param aTokenizer tokenizer for the list of tokens to forget
192    * @param aTraitId   id for the trait whose counts will be removed
193    * @param aCount     number of messages represented by the token list
194    */
195   void forgetTokens(Tokenizer& aTokenizer, uint32_t aTraitId, uint32_t aCount);
196 
197   /**
198    * write the corpus information to file storage
199    *
200    * @param aMaximumTokenCount  prune tokens if number of tokens exceeds
201    *                            this value.  == 0  for no pruning
202    */
203   void writeTrainingData(uint32_t aMaximumTokenCount);
204 
205   /**
206    * read the corpus information from file storage
207    */
208   void readTrainingData();
209 
210   /**
211    * delete the local corpus storage file and data
212    */
213   nsresult resetTrainingData();
214 
215   /**
216    * get the count of messages whose tokens are stored that are associated
217    * with a trait
218    *
219    * @param aTraitId  identifier for the trait
220    * @return          number of messages for that trait
221    */
222   uint32_t getMessageCount(uint32_t aTraitId);
223 
224   /**
225    * set the count of messages whose tokens are stored that are associated
226    * with a trait
227    *
228    * @param aTraitId  identifier for the trait
229    * @param aCount    number of messages for that trait
230    */
231   void setMessageCount(uint32_t aTraitId, uint32_t aCount);
232 
233   /**
234    * get the count of messages associated with a particular token and trait
235    *
236    * @param  token     the token string and associated counts
237    * @param  aTraitId  identifier for the trait
238    */
239   uint32_t getTraitCount(CorpusToken* token, uint32_t aTraitId);
240 
241   /**
242    * Add (or remove) data from a particular file to the corpus data.
243    *
244    * @param aFile       the file with the data, in the format:
245    *
246    *                    Format of the trait file for version 1:
247    *                    [0xFCA93601]  (the 01 is the version)
248    *                    for each trait to write:
249    *                    [id of trait to write] (0 means end of list)
250    *                    [number of messages per trait]
251    *                    for each token with non-zero count
252    *                    [count]
253    *                    [length of word]word
254    *
255    * @param aIsAdd      should the data be added, or removed? true if adding,
256    *                    else removing.
257    *
258    * @param aFromTraits array of trait ids used in aFile. If aFile contains
259    *                    trait ids that are not in this array, they are not
260    *                    remapped, but assumed to be local trait ids.
261    *
262    * @param aToTraits   array of trait ids, corresponding to elements of
263    *                    aFromTraits, that represent the local trait ids to be
264    *                    used in storing data from aFile into the local corpus.
265    *
266    */
267   nsresult UpdateData(nsIFile* aFile, bool aIsAdd,
268                       const nsTArray<uint32_t>& aFromTraits,
269                       const nsTArray<uint32_t>& aToTraits);
270 
271   /**
272    * remove all counts (message and tokens) for a trait id
273    *
274    * @param aTrait  trait id for the trait to remove
275    */
276   nsresult ClearTrait(uint32_t aTrait);
277 
278  protected:
279   /**
280    * return the local corpus storage file for junk traits
281    */
282   nsresult getTrainingFile(nsIFile** aFile);
283 
284   /**
285    * return the local corpus storage file for non-junk traits
286    */
287   nsresult getTraitFile(nsIFile** aFile);
288 
289   /**
290    * read token strings from the data file
291    *
292    * @param stream     file stream with token data
293    * @param fileSize   file size
294    * @param aTraitId   id for the trait whose counts will be read
295    * @param aIsAdd     true to add the counts, false to remove them
296    *
297    * @return           true if successful, false if error
298    */
299   bool readTokens(FILE* stream, int64_t fileSize, uint32_t aTraitId,
300                   bool aIsAdd);
301 
302   /**
303    * write token strings to the data file
304    */
305   bool writeTokens(FILE* stream, bool shrink, uint32_t aTraitId);
306 
307   /**
308    * remove counts for a token string
309    */
310   void remove(const char* word, uint32_t aTraitId, uint32_t aCount);
311 
312   /**
313    * add counts for a token string, adding the token string if new
314    */
315   CorpusToken* add(const char* word, uint32_t aTraitId, uint32_t aCount);
316 
317   /**
318    * change counts in a trait in the traits array, adding the trait if needed
319    */
320   nsresult updateTrait(CorpusToken* token, uint32_t aTraitId,
321                        int32_t aCountChange);
322   nsCOMPtr<nsIFile> mTrainingFile;      // file used to store junk training data
323   nsCOMPtr<nsIFile> mTraitFile;         // file used to store non-junk
324                                         // training data
325   nsTArray<TraitPerToken> mTraitStore;  // memory for linked-list of counts
326   uint32_t mNextTraitIndex;             // index in mTraitStore to first empty
327                                         // TraitPerToken
328   nsTArray<uint32_t> mMessageCounts;    // count of messages per trait
329                                         // represented in the store
330   nsTArray<uint32_t> mMessageCountsId;  // Parallel array to mMessageCounts,
331                                         // with the corresponding trait ID
332 };
333 
334 class nsBayesianFilter : public nsIJunkMailPlugin,
335                          nsIMsgCorpus,
336                          nsIObserver,
337                          nsSupportsWeakReference {
338  public:
339   NS_DECL_ISUPPORTS
340   NS_DECL_NSIMSGFILTERPLUGIN
341   NS_DECL_NSIJUNKMAILPLUGIN
342   NS_DECL_NSIMSGCORPUS
343   NS_DECL_NSIOBSERVER
344 
345   nsBayesianFilter();
346 
347   nsresult Init();
348 
349   nsresult tokenizeMessage(const nsACString& messageURI,
350                            nsIMsgWindow* aMsgWindow, TokenAnalyzer* analyzer);
351   void classifyMessage(Tokenizer& tokens, const nsACString& messageURI,
352                        nsIJunkMailClassificationListener* listener);
353 
354   void classifyMessage(Tokenizer& tokenizer, const nsACString& messageURI,
355                        nsTArray<uint32_t>& aProTraits,
356                        nsTArray<uint32_t>& aAntiTraits,
357                        nsIJunkMailClassificationListener* listener,
358                        nsIMsgTraitClassificationListener* aTraitListener,
359                        nsIMsgTraitDetailListener* aDetailListener);
360 
361   void observeMessage(Tokenizer& tokens, const nsACString& messageURI,
362                       nsTArray<uint32_t>& oldClassifications,
363                       nsTArray<uint32_t>& newClassifications,
364                       nsIJunkMailClassificationListener* listener,
365                       nsIMsgTraitClassificationListener* aTraitListener);
366 
367  protected:
368   virtual ~nsBayesianFilter();
369 
370   static void TimerCallback(nsITimer* aTimer, void* aClosure);
371 
372   CorpusStore mCorpus;
373   double mJunkProbabilityThreshold;
374   int32_t mMaximumTokenCount;
375   bool mTrainingDataDirty;
376   int32_t mMinFlushInterval;  // in milliseconds, must be positive
377                               // and not too close to 0
378   nsCOMPtr<nsITimer> mTimer;
379 
380   // index in mAnalysisStore for first empty AnalysisPerToken
381   uint32_t mNextAnalysisIndex;
382   // memory for linked list of AnalysisPerToken objects
383   nsTArray<AnalysisPerToken> mAnalysisStore;
384   /**
385    * Determine the location in mAnalysisStore where the AnalysisPerToken
386    * object for a particular token and trait is stored
387    */
388   uint32_t getAnalysisIndex(Token& token, uint32_t aTraitIndex);
389   /**
390    * Set the value of the AnalysisPerToken object for a particular
391    * token and trait
392    */
393   nsresult setAnalysis(Token& token, uint32_t aTraitIndex, double aDistance,
394                        double aProbability);
395 };
396 
#endif  // nsBayesianFilter_h__
398