1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style license that can be 3 // found in the LICENSE file. 4 // 5 // This class handles the process of extracting all of the features from a 6 // page and computing a phishyness score. The basic steps are: 7 // - Run each feature extractor over the page, building up a FeatureMap of 8 // feature -> value. 9 // - SHA-256 hash all of the feature names in the map so that they match the 10 // supplied model. 11 // - Hand the hashed map off to a Scorer, which computes the probability that 12 // the page is phishy. 13 // - If the page is phishy, run the supplied callback. 14 // 15 // For more details, see phishing_*_feature_extractor.h, scorer.h, and 16 // client_model.proto. 17 18 #ifndef CHROME_RENDERER_SAFE_BROWSING_PHISHING_CLASSIFIER_H_ 19 #define CHROME_RENDERER_SAFE_BROWSING_PHISHING_CLASSIFIER_H_ 20 21 #include <stdint.h> 22 23 #include <memory> 24 #include <set> 25 26 #include "base/callback.h" 27 #include "base/macros.h" 28 #include "base/memory/weak_ptr.h" 29 #include "base/strings/string16.h" 30 31 namespace content { 32 class RenderFrame; 33 } 34 35 namespace safe_browsing { 36 class ClientPhishingRequest; 37 class FeatureExtractorClock; 38 class FeatureMap; 39 class PhishingDOMFeatureExtractor; 40 class PhishingTermFeatureExtractor; 41 class PhishingUrlFeatureExtractor; 42 class Scorer; 43 44 class PhishingClassifier { 45 public: 46 // Callback to be run when phishing classification finishes. The verdict 47 // is a ClientPhishingRequest which contains the verdict computed by the 48 // classifier as well as the extracted features. If the verdict.is_phishing() 49 // is true, the page is considered phishy by the client-side model, 50 // and the browser should ping back to get a final verdict. The 51 // verdict.client_score() is set to kInvalidScore if classification failed. 52 typedef base::OnceCallback<void(const ClientPhishingRequest& /* verdict */)> 53 DoneCallback; 54 55 static const float kInvalidScore; 56 57 // Creates a new PhishingClassifier object that will operate on 58 // |render_view|. |clock| is used to time feature extractor operations, and 59 // the PhishingClassifier takes ownership of this object. Note that the 60 // classifier will not be 'ready' until set_phishing_scorer() is called. 61 PhishingClassifier(content::RenderFrame* render_frame, 62 FeatureExtractorClock* clock); 63 virtual ~PhishingClassifier(); 64 65 // Sets a scorer for the classifier to use in computing the phishiness score. 66 // This must live at least as long as the PhishingClassifier. The caller is 67 // expected to cancel any pending classification before setting a phishing 68 // scorer. 69 void set_phishing_scorer(const Scorer* scorer); 70 71 // Returns true if the classifier is ready to classify pages, i.e. it 72 // has had a scorer set via set_phishing_scorer(). 73 bool is_ready() const; 74 75 // Called by the RenderView when a page has finished loading. This begins 76 // the feature extraction and scoring process. |page_text| should contain 77 // the plain text of a web page, including any subframes, as returned by 78 // RenderView::CaptureText(). |page_text| is owned by the caller, and must 79 // not be destroyed until either |done_callback| is run or 80 // CancelPendingClassification() is called. 81 // 82 // To avoid blocking the render thread for too long, phishing classification 83 // may run in several chunks of work, posting a task to the current 84 // MessageLoop to continue processing. Once the scoring process is complete, 85 // |done_callback| is run on the current thread. PhishingClassifier takes 86 // ownership of the callback. 87 // 88 // It is an error to call BeginClassification if the classifier is not yet 89 // ready. 90 virtual void BeginClassification(const base::string16* page_text, 91 DoneCallback callback); 92 93 // Called by the RenderView (on the render thread) when a page is unloading 94 // or the RenderView is being destroyed. This cancels any extraction that 95 // is in progress. It is an error to call CancelPendingClassification if 96 // the classifier is not yet ready. 97 virtual void CancelPendingClassification(); 98 99 private: 100 // Any score equal to or above this value is considered phishy. 101 static const float kPhishyThreshold; 102 103 // Begins the feature extraction process, by extracting URL features and 104 // beginning DOM feature extraction. 105 void BeginFeatureExtraction(); 106 107 // Callback to be run when DOM feature extraction is complete. 108 // If it was successful, begins term feature extraction, otherwise 109 // runs the DoneCallback with a non-phishy verdict. 110 void DOMExtractionFinished(bool success); 111 112 // Callback to be run when term feature extraction is complete. 113 // If it was successful, computes a score and runs the DoneCallback. 114 // If extraction was unsuccessful, runs the DoneCallback with a 115 // non-phishy verdict. 116 void TermExtractionFinished(bool success); 117 118 // Helper to verify that there is no pending phishing classification. Dies 119 // in debug builds if the state is not as expected. This is a no-op in 120 // release builds. 121 void CheckNoPendingClassification(); 122 123 // Helper method to run the DoneCallback and clear the state. 124 void RunCallback(const ClientPhishingRequest& verdict); 125 126 // Helper to run the DoneCallback when feature extraction has failed. 127 // This always signals a non-phishy verdict for the page, with kInvalidScore. 128 void RunFailureCallback(); 129 130 // Clears the current state of the PhishingClassifier. 131 void Clear(); 132 133 content::RenderFrame* render_frame_; // owns us 134 const Scorer* scorer_; // owned by the caller 135 std::unique_ptr<FeatureExtractorClock> clock_; 136 std::unique_ptr<PhishingUrlFeatureExtractor> url_extractor_; 137 std::unique_ptr<PhishingDOMFeatureExtractor> dom_extractor_; 138 std::unique_ptr<PhishingTermFeatureExtractor> term_extractor_; 139 140 // State for any in-progress extraction. 141 std::unique_ptr<FeatureMap> features_; 142 std::unique_ptr<std::set<uint32_t>> shingle_hashes_; 143 const base::string16* page_text_; // owned by the caller 144 DoneCallback done_callback_; 145 146 // Used in scheduling BeginFeatureExtraction tasks. 147 // These pointers are invalidated if classification is cancelled. 148 base::WeakPtrFactory<PhishingClassifier> weak_factory_{this}; 149 150 DISALLOW_COPY_AND_ASSIGN(PhishingClassifier); 151 }; 152 153 } // namespace safe_browsing 154 155 #endif // CHROME_RENDERER_SAFE_BROWSING_PHISHING_CLASSIFIER_H_ 156