1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 //
5 // This class handles the process of extracting all of the features from a
// page and computing a phishiness score.  The basic steps are:
7 //  - Run each feature extractor over the page, building up a FeatureMap of
8 //    feature -> value.
9 //  - SHA-256 hash all of the feature names in the map so that they match the
10 //    supplied model.
11 //  - Hand the hashed map off to a Scorer, which computes the probability that
12 //    the page is phishy.
13 //  - If the page is phishy, run the supplied callback.
14 //
15 // For more details, see phishing_*_feature_extractor.h, scorer.h, and
16 // client_model.proto.
17 
18 #ifndef CHROME_RENDERER_SAFE_BROWSING_PHISHING_CLASSIFIER_H_
19 #define CHROME_RENDERER_SAFE_BROWSING_PHISHING_CLASSIFIER_H_
20 
21 #include <stdint.h>
22 
23 #include <memory>
24 #include <set>
25 
26 #include "base/callback.h"
27 #include "base/macros.h"
28 #include "base/memory/weak_ptr.h"
29 #include "base/strings/string16.h"
30 
31 namespace content {
32 class RenderFrame;
33 }
34 
35 namespace safe_browsing {
36 class ClientPhishingRequest;
37 class FeatureExtractorClock;
38 class FeatureMap;
39 class PhishingDOMFeatureExtractor;
40 class PhishingTermFeatureExtractor;
41 class PhishingUrlFeatureExtractor;
42 class Scorer;
43 
44 class PhishingClassifier {
45  public:
46   // Callback to be run when phishing classification finishes. The verdict
47   // is a ClientPhishingRequest which contains the verdict computed by the
48   // classifier as well as the extracted features.  If the verdict.is_phishing()
49   // is true, the page is considered phishy by the client-side model,
50   // and the browser should ping back to get a final verdict.  The
51   // verdict.client_score() is set to kInvalidScore if classification failed.
52   typedef base::OnceCallback<void(const ClientPhishingRequest& /* verdict */)>
53       DoneCallback;
54 
55   static const float kInvalidScore;
56 
57   // Creates a new PhishingClassifier object that will operate on
58   // |render_view|.  |clock| is used to time feature extractor operations, and
59   // the PhishingClassifier takes ownership of this object.  Note that the
60   // classifier will not be 'ready' until set_phishing_scorer() is called.
61   PhishingClassifier(content::RenderFrame* render_frame,
62                      FeatureExtractorClock* clock);
63   virtual ~PhishingClassifier();
64 
65   // Sets a scorer for the classifier to use in computing the phishiness score.
66   // This must live at least as long as the PhishingClassifier.  The caller is
67   // expected to cancel any pending classification before setting a phishing
68   // scorer.
69   void set_phishing_scorer(const Scorer* scorer);
70 
71   // Returns true if the classifier is ready to classify pages, i.e. it
72   // has had a scorer set via set_phishing_scorer().
73   bool is_ready() const;
74 
75   // Called by the RenderView when a page has finished loading.  This begins
76   // the feature extraction and scoring process. |page_text| should contain
77   // the plain text of a web page, including any subframes, as returned by
78   // RenderView::CaptureText().  |page_text| is owned by the caller, and must
79   // not be destroyed until either |done_callback| is run or
80   // CancelPendingClassification() is called.
81   //
82   // To avoid blocking the render thread for too long, phishing classification
83   // may run in several chunks of work, posting a task to the current
84   // MessageLoop to continue processing.  Once the scoring process is complete,
85   // |done_callback| is run on the current thread.  PhishingClassifier takes
86   // ownership of the callback.
87   //
88   // It is an error to call BeginClassification if the classifier is not yet
89   // ready.
90   virtual void BeginClassification(const base::string16* page_text,
91                                    DoneCallback callback);
92 
93   // Called by the RenderView (on the render thread) when a page is unloading
94   // or the RenderView is being destroyed.  This cancels any extraction that
95   // is in progress.  It is an error to call CancelPendingClassification if
96   // the classifier is not yet ready.
97   virtual void CancelPendingClassification();
98 
99  private:
100   // Any score equal to or above this value is considered phishy.
101   static const float kPhishyThreshold;
102 
103   // Begins the feature extraction process, by extracting URL features and
104   // beginning DOM feature extraction.
105   void BeginFeatureExtraction();
106 
107   // Callback to be run when DOM feature extraction is complete.
108   // If it was successful, begins term feature extraction, otherwise
109   // runs the DoneCallback with a non-phishy verdict.
110   void DOMExtractionFinished(bool success);
111 
112   // Callback to be run when term feature extraction is complete.
113   // If it was successful, computes a score and runs the DoneCallback.
114   // If extraction was unsuccessful, runs the DoneCallback with a
115   // non-phishy verdict.
116   void TermExtractionFinished(bool success);
117 
118   // Helper to verify that there is no pending phishing classification.  Dies
119   // in debug builds if the state is not as expected.  This is a no-op in
120   // release builds.
121   void CheckNoPendingClassification();
122 
123   // Helper method to run the DoneCallback and clear the state.
124   void RunCallback(const ClientPhishingRequest& verdict);
125 
126   // Helper to run the DoneCallback when feature extraction has failed.
127   // This always signals a non-phishy verdict for the page, with kInvalidScore.
128   void RunFailureCallback();
129 
130   // Clears the current state of the PhishingClassifier.
131   void Clear();
132 
133   content::RenderFrame* render_frame_;  // owns us
134   const Scorer* scorer_;  // owned by the caller
135   std::unique_ptr<FeatureExtractorClock> clock_;
136   std::unique_ptr<PhishingUrlFeatureExtractor> url_extractor_;
137   std::unique_ptr<PhishingDOMFeatureExtractor> dom_extractor_;
138   std::unique_ptr<PhishingTermFeatureExtractor> term_extractor_;
139 
140   // State for any in-progress extraction.
141   std::unique_ptr<FeatureMap> features_;
142   std::unique_ptr<std::set<uint32_t>> shingle_hashes_;
143   const base::string16* page_text_;  // owned by the caller
144   DoneCallback done_callback_;
145 
146   // Used in scheduling BeginFeatureExtraction tasks.
147   // These pointers are invalidated if classification is cancelled.
148   base::WeakPtrFactory<PhishingClassifier> weak_factory_{this};
149 
150   DISALLOW_COPY_AND_ASSIGN(PhishingClassifier);
151 };
152 
153 }  // namespace safe_browsing
154 
155 #endif  // CHROME_RENDERER_SAFE_BROWSING_PHISHING_CLASSIFIER_H_
156