1 // Copyright (c) 2011 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 //
5 // PhishingDOMFeatureExtractor handles computing DOM-based features for the
6 // client-side phishing detection model.  These include the presence of various
7 // types of elements, ratios of external and secure links, and tokens for
8 // external domains linked to.
9 
10 #ifndef CHROME_RENDERER_SAFE_BROWSING_PHISHING_DOM_FEATURE_EXTRACTOR_H_
11 #define CHROME_RENDERER_SAFE_BROWSING_PHISHING_DOM_FEATURE_EXTRACTOR_H_
12 
13 #include <memory>
14 #include <string>
15 
16 #include "base/callback.h"
17 #include "base/macros.h"
18 #include "base/memory/weak_ptr.h"
19 #include "third_party/blink/public/web/web_document.h"
20 
21 class GURL;
22 
23 namespace blink {
24 class WebElement;
25 }
26 
27 namespace safe_browsing {
28 class FeatureExtractorClock;
29 class FeatureMap;
30 
31 class PhishingDOMFeatureExtractor {
32  public:
33   // Callback to be run when feature extraction finishes.  The callback
34   // argument is true if extraction was successful, false otherwise.
35   typedef base::OnceCallback<void(bool)> DoneCallback;
36 
37   // Creates a PhishingDOMFeatureExtractor instance.
38   // |clock| is used for timing feature extractor operations, and may be
39   // mocked for testing.  The caller maintains ownership of the clock.
40   explicit PhishingDOMFeatureExtractor(FeatureExtractorClock* clock);
41   virtual ~PhishingDOMFeatureExtractor();
42 
43   // Begins extracting features into the given FeatureMap for the page.
44   // To avoid blocking the render thread for too long, the feature extractor
45   // may run in several chunks of work, posting a task to the current
46   // MessageLoop to continue processing.  Once feature extraction is complete,
47   // |done_callback| is run on the current thread.  PhishingDOMFeatureExtractor
48   // takes ownership of the callback.
49   void ExtractFeatures(blink::WebDocument document,
50                        FeatureMap* features,
51                        DoneCallback done_callback);
52 
53   // Cancels any pending feature extraction.  The DoneCallback will not be run.
54   // Must be called if there is a feature extraction in progress when the page
55   // is unloaded or the PhishingDOMFeatureExtractor is destroyed.
56   void CancelPendingExtraction();
57 
58  private:
59   struct FrameData;
60   struct PageFeatureState;
61 
62   // The maximum amount of wall time that we will spend on a single extraction
63   // iteration before pausing to let other MessageLoop tasks run.
64   static const int kMaxTimePerChunkMs;
65 
66   // The number of elements that we will process before checking to see whether
67   // kMaxTimePerChunkMs has elapsed.  Since checking the current time can be
68   // slow, we don't do this on every element processed.
69   static const int kClockCheckGranularity;
70 
71   // The maximum total amount of time that the feature extractor will run
72   // before giving up on the current page.
73   static const int kMaxTotalTimeMs;
74 
75   // Does the actual work of ExtractFeatures.  ExtractFeaturesWithTimeout runs
76   // until a predefined maximum amount of time has elapsed, then posts a task
77   // to the current MessageLoop to continue extraction.  When extraction
78   // finishes, calls RunCallback().
79   void ExtractFeaturesWithTimeout();
80 
81   // Handlers for the various HTML elements that we compute features for.
82   // Since some of the features (such as ratios) cannot be computed until
83   // feature extraction is finished, these handlers do not add to the feature
84   // map directly.  Instead, they update the values in the PageFeatureState.
85   void HandleLink(const blink::WebElement& element);
86   void HandleForm(const blink::WebElement& element);
87   void HandleImage(const blink::WebElement& element);
88   void HandleInput(const blink::WebElement& element);
89   void HandleScript(const blink::WebElement& element);
90 
91   // Helper to verify that there is no pending feature extraction.  Dies in
92   // debug builds if the state is not as expected.  This is a no-op in release
93   // builds.
94   void CheckNoPendingExtraction();
95 
96   // Runs |done_callback_| and then clears all internal state.
97   void RunCallback(bool success);
98 
99   // Clears all internal feature extraction state.
100   void Clear();
101 
102   // Called after advancing |cur_document_| to update the state in
103   // |cur_frame_data_|.
104   void ResetFrameData();
105 
106   // Returns the next document in frame-traversal order from cur_document_.
107   // If there are no more documents, returns a null WebDocument.
108   blink::WebDocument GetNextDocument();
109 
110   // Given a URL, checks whether the domain is different from the domain of
111   // the current frame's URL.  If so, stores the domain in |domain| and returns
112   // true, otherwise returns false.
113   virtual bool IsExternalDomain(const GURL& url, std::string* domain) const;
114 
115   // Given a partial URL, extend it to a full url based on the current frame's
116   // URL.
117   virtual blink::WebURL CompleteURL(const blink::WebElement& element,
118                                     const blink::WebString& partial_url);
119 
120   // Called once all frames have been processed to compute features from the
121   // PageFeatureState and add them to |features_|.  See features.h for a
122   // description of which features are computed.
123   void InsertFeatures();
124 
125 
126   // Non-owned pointer to our clock.
127   FeatureExtractorClock* clock_;
128 
129   // The output parameters from the most recent call to ExtractFeatures().
130   FeatureMap* features_;  // The caller keeps ownership of this.
131   DoneCallback done_callback_;
132 
133   // The current (sub-)document that we are processing.  May be a null document
134   // (isNull()) if we are not currently extracting features.
135   blink::WebDocument cur_document_;
136 
137   // Stores extra state for |cur_document_| that will be persisted until we
138   // advance to the next frame.
139   std::unique_ptr<FrameData> cur_frame_data_;
140 
141   // Stores the intermediate data used to create features.  This data is
142   // accumulated across all frames in the RenderView.
143   std::unique_ptr<PageFeatureState> page_feature_state_;
144 
145   // Used in scheduling ExtractFeaturesWithTimeout tasks.
146   // These pointers are invalidated if extraction is cancelled.
147   base::WeakPtrFactory<PhishingDOMFeatureExtractor> weak_factory_{this};
148 
149   DISALLOW_COPY_AND_ASSIGN(PhishingDOMFeatureExtractor);
150 };
151 
152 }  // namespace safe_browsing
153 
154 #endif  // CHROME_RENDERER_SAFE_BROWSING_PHISHING_DOM_FEATURE_EXTRACTOR_H_
155