1 // Copyright (c) 2011 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 #include "components/safe_browsing/content/renderer/phishing_classifier/phishing_classifier.h"
6 
7 #include <memory>
8 #include <string>
9 #include <utility>
10 
11 #include "base/bind.h"
12 #include "base/callback.h"
13 #include "base/compiler_specific.h"
14 #include "base/location.h"
15 #include "base/metrics/histogram_functions.h"
16 #include "base/metrics/histogram_macros.h"
17 #include "base/single_thread_task_runner.h"
18 #include "base/strings/string_util.h"
19 #include "base/task/task_traits.h"
20 #include "base/task/thread_pool.h"
21 #include "base/threading/thread_task_runner_handle.h"
22 #include "cc/paint/skia_paint_canvas.h"
23 #include "components/paint_preview/common/paint_preview_tracker.h"
24 #include "components/safe_browsing/buildflags.h"
25 #include "components/safe_browsing/content/renderer/phishing_classifier/features.h"
26 #include "components/safe_browsing/content/renderer/phishing_classifier/phishing_dom_feature_extractor.h"
27 #include "components/safe_browsing/content/renderer/phishing_classifier/phishing_term_feature_extractor.h"
28 #include "components/safe_browsing/content/renderer/phishing_classifier/phishing_url_feature_extractor.h"
29 #include "components/safe_browsing/content/renderer/phishing_classifier/scorer.h"
30 #include "components/safe_browsing/core/proto/csd.pb.h"
31 #include "content/public/renderer/render_frame.h"
32 #include "content/public/renderer/render_thread.h"
33 #include "crypto/sha2.h"
34 #include "skia/ext/legacy_display_globals.h"
35 #include "third_party/blink/public/platform/web_url.h"
36 #include "third_party/blink/public/platform/web_url_request.h"
37 #include "third_party/blink/public/web/web_document.h"
38 #include "third_party/blink/public/web/web_document_loader.h"
39 #include "third_party/blink/public/web/web_local_frame.h"
40 #include "third_party/blink/public/web/web_view.h"
41 #include "ui/gfx/geometry/rect_conversions.h"
42 #include "url/gurl.h"
43 
44 namespace safe_browsing {
45 
46 const float PhishingClassifier::kInvalidScore = -1.0;
47 const float PhishingClassifier::kPhishyThreshold = 0.5;
48 
PhishingClassifier(content::RenderFrame * render_frame)49 PhishingClassifier::PhishingClassifier(content::RenderFrame* render_frame)
50     : render_frame_(render_frame), scorer_(nullptr) {
51   Clear();
52 }
53 
~PhishingClassifier()54 PhishingClassifier::~PhishingClassifier() {
55   // The RenderView should have called CancelPendingClassification() before
56   // we are destroyed.
57   DCHECK(done_callback_.is_null());
58   DCHECK(!page_text_);
59 }
60 
set_phishing_scorer(const Scorer * scorer)61 void PhishingClassifier::set_phishing_scorer(const Scorer* scorer) {
62   DCHECK(done_callback_.is_null());
63   DCHECK(!page_text_);
64   scorer_ = scorer;
65   if (scorer_) {
66     url_extractor_ = std::make_unique<PhishingUrlFeatureExtractor>();
67     dom_extractor_ = std::make_unique<PhishingDOMFeatureExtractor>();
68     term_extractor_ = std::make_unique<PhishingTermFeatureExtractor>(
69         &scorer_->page_terms(), &scorer_->page_words(),
70         scorer_->max_words_per_term(), scorer_->murmurhash3_seed(),
71         scorer_->max_shingles_per_page(), scorer_->shingle_size());
72   } else {
73     // We're disabling client-side phishing detection, so tear down all
74     // of the relevant objects.
75     url_extractor_.reset();
76     dom_extractor_.reset();
77     term_extractor_.reset();
78   }
79 }
80 
is_ready() const81 bool PhishingClassifier::is_ready() const {
82   return !!scorer_;
83 }
84 
BeginClassification(const base::string16 * page_text,DoneCallback done_callback)85 void PhishingClassifier::BeginClassification(const base::string16* page_text,
86                                              DoneCallback done_callback) {
87   DCHECK(is_ready());
88 
89   // The RenderView should have called CancelPendingClassification() before
90   // starting a new classification, so DCHECK this.
91   DCHECK(done_callback_.is_null());
92   DCHECK(!page_text_);
93   // However, in an opt build, we will go ahead and clean up the pending
94   // classification so that we can start in a known state.
95   CancelPendingClassification();
96 
97   page_text_ = page_text;
98   done_callback_ = std::move(done_callback);
99 
100   // For consistency, we always want to invoke the DoneCallback
101   // asynchronously, rather than directly from this method.  To ensure that
102   // this is the case, post a task to begin feature extraction on the next
103   // iteration of the message loop.
104   base::ThreadTaskRunnerHandle::Get()->PostTask(
105       FROM_HERE, base::BindOnce(&PhishingClassifier::BeginFeatureExtraction,
106                                 weak_factory_.GetWeakPtr()));
107 }
108 
BeginFeatureExtraction()109 void PhishingClassifier::BeginFeatureExtraction() {
110   blink::WebLocalFrame* frame = render_frame_->GetWebFrame();
111 
112   // Check whether the URL is one that we should classify.
113   // Currently, we only classify http/https URLs that are GET requests.
114   GURL url(frame->GetDocument().Url());
115   if (!url.SchemeIsHTTPOrHTTPS()) {
116     RunFailureCallback();
117     return;
118   }
119 
120   blink::WebDocumentLoader* document_loader = frame->GetDocumentLoader();
121   if (!document_loader || document_loader->HttpMethod().Ascii() != "GET") {
122     RunFailureCallback();
123     return;
124   }
125 
126   features_.reset(new FeatureMap);
127   if (!url_extractor_->ExtractFeatures(url, features_.get())) {
128     RunFailureCallback();
129     return;
130   }
131 
132   // DOM feature extraction can take awhile, so it runs asynchronously
133   // in several chunks of work and invokes the callback when finished.
134   dom_extractor_->ExtractFeatures(
135       frame->GetDocument(), features_.get(),
136       base::BindOnce(&PhishingClassifier::DOMExtractionFinished,
137                      base::Unretained(this)));
138 }
139 
CancelPendingClassification()140 void PhishingClassifier::CancelPendingClassification() {
141   // Note that cancelling the feature extractors is simply a no-op if they
142   // were not running.
143   DCHECK(is_ready());
144   dom_extractor_->CancelPendingExtraction();
145   term_extractor_->CancelPendingExtraction();
146   weak_factory_.InvalidateWeakPtrs();
147   Clear();
148 }
149 
DOMExtractionFinished(bool success)150 void PhishingClassifier::DOMExtractionFinished(bool success) {
151   shingle_hashes_.reset(new std::set<uint32_t>);
152   if (success) {
153     // Term feature extraction can take awhile, so it runs asynchronously
154     // in several chunks of work and invokes the callback when finished.
155     term_extractor_->ExtractFeatures(
156         page_text_, features_.get(), shingle_hashes_.get(),
157         base::BindOnce(&PhishingClassifier::TermExtractionFinished,
158                        base::Unretained(this)));
159   } else {
160     RunFailureCallback();
161   }
162 }
163 
TermExtractionFinished(bool success)164 void PhishingClassifier::TermExtractionFinished(bool success) {
165   if (success) {
166 #if BUILDFLAG(FULL_SAFE_BROWSING)
167     ExtractVisualFeatures();
168 #else
169     VisualExtractionFinished(true);
170 #endif
171   } else {
172     RunFailureCallback();
173   }
174 }
175 
ExtractVisualFeatures()176 void PhishingClassifier::ExtractVisualFeatures() {
177   DCHECK(content::RenderThread::IsMainThread());
178   base::TimeTicks start_time = base::TimeTicks::Now();
179 
180   blink::WebLocalFrame* frame = render_frame_->GetWebFrame();
181   gfx::SizeF viewport_size = frame->View()->VisualViewportSize();
182   gfx::Rect bounds = ToEnclosingRect(gfx::RectF(viewport_size));
183   bitmap_ = std::make_unique<SkBitmap>();
184   // Use the Rec. 2020 color space, in case the user input is wide-gamut.
185   sk_sp<SkColorSpace> rec2020 = SkColorSpace::MakeRGB(
186       {2.22222f, 0.909672f, 0.0903276f, 0.222222f, 0.0812429f, 0, 0},
187       SkNamedGamut::kRec2020);
188   SkImageInfo bitmap_info = SkImageInfo::Make(
189       bounds.width(), bounds.height(), SkColorType::kRGBA_8888_SkColorType,
190       SkAlphaType::kUnpremul_SkAlphaType, rec2020);
191   if (!bitmap_->tryAllocPixels(bitmap_info))
192     return VisualExtractionFinished(/*success=*/false);
193   SkCanvas sk_canvas(*bitmap_, skia::LegacyDisplayGlobals::GetSkSurfaceProps());
194   cc::SkiaPaintCanvas cc_canvas(&sk_canvas);
195   auto tracker = std::make_unique<paint_preview::PaintPreviewTracker>(
196       base::UnguessableToken::Create(), frame->GetEmbeddingToken(),
197       /*is_main_frame=*/true);
198   cc_canvas.SetPaintPreviewTracker(tracker.get());
199   VisualExtractionFinished(frame->CapturePaintPreview(
200       bounds, &cc_canvas, /*include_linked_destinations=*/false));
201   base::UmaHistogramTimes("SBClientPhishing.VisualFeatureTime",
202                           base::TimeTicks::Now() - start_time);
203 }
204 
VisualExtractionFinished(bool success)205 void PhishingClassifier::VisualExtractionFinished(bool success) {
206   DCHECK(content::RenderThread::IsMainThread());
207   if (!success) {
208     RunFailureCallback();
209     return;
210   }
211 
212   blink::WebLocalFrame* main_frame = render_frame_->GetWebFrame();
213 
214   // Hash all of the features so that they match the model, then compute
215   // the score.
216   FeatureMap hashed_features;
217   std::unique_ptr<ClientPhishingRequest> verdict =
218       std::make_unique<ClientPhishingRequest>();
219   verdict->set_model_version(scorer_->model_version());
220   verdict->set_url(main_frame->GetDocument().Url().GetString().Utf8());
221   for (const auto& it : features_->features()) {
222     bool result = hashed_features.AddRealFeature(
223         crypto::SHA256HashString(it.first), it.second);
224     DCHECK(result);
225     ClientPhishingRequest::Feature* feature = verdict->add_feature_map();
226     feature->set_name(it.first);
227     feature->set_value(it.second);
228   }
229   for (const auto& it : *shingle_hashes_) {
230     verdict->add_shingle_hashes(it);
231   }
232   float score = static_cast<float>(scorer_->ComputeScore(hashed_features));
233   verdict->set_client_score(score);
234   verdict->set_is_phishing(score >= scorer_->threshold_probability());
235 
236 #if BUILDFLAG(FULL_SAFE_BROWSING)
237   visual_matching_start_ = base::TimeTicks::Now();
238   scorer_->GetMatchingVisualTargets(
239       *bitmap_, std::move(verdict),
240       base::BindOnce(&PhishingClassifier::OnVisualTargetsMatched,
241                      weak_factory_.GetWeakPtr()));
242 #else
243   RunCallback(*verdict);
244 #endif
245 }
246 
OnVisualTargetsMatched(std::unique_ptr<ClientPhishingRequest> verdict)247 void PhishingClassifier::OnVisualTargetsMatched(
248     std::unique_ptr<ClientPhishingRequest> verdict) {
249   DCHECK(content::RenderThread::IsMainThread());
250   if (!verdict->vision_match().empty()) {
251     verdict->set_is_phishing(true);
252   }
253   base::UmaHistogramTimes("SBClientPhishing.VisualComparisonTime",
254                           base::TimeTicks::Now() - visual_matching_start_);
255 
256   RunCallback(*verdict);
257 }
258 
RunCallback(const ClientPhishingRequest & verdict)259 void PhishingClassifier::RunCallback(const ClientPhishingRequest& verdict) {
260   std::move(done_callback_).Run(verdict);
261   Clear();
262 }
263 
RunFailureCallback()264 void PhishingClassifier::RunFailureCallback() {
265   ClientPhishingRequest verdict;
266   // In this case we're not guaranteed to have a valid URL.  Just set it
267   // to the empty string to make sure we have a valid protocol buffer.
268   verdict.set_url("");
269   verdict.set_client_score(kInvalidScore);
270   verdict.set_is_phishing(false);
271   RunCallback(verdict);
272 }
273 
Clear()274 void PhishingClassifier::Clear() {
275   page_text_ = nullptr;
276   done_callback_.Reset();
277   features_.reset(nullptr);
278   shingle_hashes_.reset(nullptr);
279   bitmap_.reset(nullptr);
280 }
281 
282 }  // namespace safe_browsing
283