1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 #include "components/safe_browsing/content/renderer/threat_dom_details.h"
6 
7 #include <algorithm>
8 #include <map>
9 #include <string>
10 #include <unordered_set>
11 
12 #include "base/bind.h"
13 #include "base/compiler_specific.h"
14 #include "base/metrics/field_trial_params.h"
15 #include "base/metrics/histogram_macros.h"
16 #include "base/strings/string_piece.h"
17 #include "base/strings/string_split.h"
18 #include "base/strings/stringprintf.h"
19 #include "components/safe_browsing/core/features.h"
20 #include "content/public/renderer/render_frame.h"
21 #include "third_party/blink/public/platform/web_string.h"
22 #include "third_party/blink/public/web/web_document.h"
23 #include "third_party/blink/public/web/web_element.h"
24 #include "third_party/blink/public/web/web_element_collection.h"
25 #include "third_party/blink/public/web/web_frame.h"
26 #include "third_party/blink/public/web/web_local_frame.h"
27 
28 namespace safe_browsing {
29 
30 // A map for keeping track of the identity of DOM Elements, used to generate
31 // unique IDs for each element and lookup elements IDs by parent Element, to
32 // maintain proper parent/child relationships.
33 // They key is a WebNode from the DOM, which is basically a pointer so can be
34 // copied into the map when inserting new elements.
35 // The values are indices into the resource vector, and are used to retrieve IPC
36 // messages generated by ThreatDOMDetails.
37 using ElementToNodeMap = std::map<blink::WebNode, size_t>;
38 
// Name of the field trial param that holds the comma-separated list of tags
// and attributes to collect.
const char kTagAndAttributeParamName[] = "tag_attribute_csv";
41 
42 namespace {
43 
44 // Predicate used to search |tag_and_attributes_list_| by tag_name.
45 class TagNameIs {
46  public:
TagNameIs(const std::string & tag)47   explicit TagNameIs(const std::string& tag) : tag_(tag) {}
operator ()(const TagAndAttributesItem & tag_and_attribute)48   bool operator()(const TagAndAttributesItem& tag_and_attribute) {
49     return tag_ == tag_and_attribute.tag_name;
50   }
51 
52  private:
53   std::string tag_;
54 };
55 
GetDefaultTagAndAttributeList(std::vector<TagAndAttributesItem> * tag_and_attributes_list)56 void GetDefaultTagAndAttributeList(
57     std::vector<TagAndAttributesItem>* tag_and_attributes_list) {
58   tag_and_attributes_list->clear();
59   // These entries must be sorted by tag name.
60   bool should_capture_js =
61       base::FeatureList::IsEnabled(kCaptureInlineJavascriptForGoogleAds);
62   if (should_capture_js)
63     tag_and_attributes_list->push_back(TagAndAttributesItem("a", {"onclick"}));
64   // These entries must be sorted by tag name.
65   // These tags are related to identifying Google ads.
66   tag_and_attributes_list->push_back(
67       TagAndAttributesItem("div", {"data-google-query-id", "id"}));
68   tag_and_attributes_list->push_back(TagAndAttributesItem("iframe", {"id"}));
69   if (should_capture_js)
70     tag_and_attributes_list->push_back(
71         TagAndAttributesItem("img", {"onclick"}));
72 }
73 
ParseTagAndAttributeParams(std::vector<TagAndAttributesItem> * tag_and_attributes_list)74 void ParseTagAndAttributeParams(
75     std::vector<TagAndAttributesItem>* tag_and_attributes_list) {
76   DCHECK(tag_and_attributes_list);
77   // If the feature is disabled we just use the default list. Otherwise the list
78   // from the Finch param will be the one used.
79   if (!base::FeatureList::IsEnabled(kThreatDomDetailsTagAndAttributeFeature)) {
80     GetDefaultTagAndAttributeList(tag_and_attributes_list);
81     return;
82   }
83   tag_and_attributes_list->clear();
84   const std::string& tag_attribute_csv_param =
85       base::GetFieldTrialParamValueByFeature(
86           kThreatDomDetailsTagAndAttributeFeature, kTagAndAttributeParamName);
87   if (tag_attribute_csv_param.empty()) {
88     return;
89   }
90 
91   std::vector<std::string> split =
92       base::SplitString(tag_attribute_csv_param, ",", base::TRIM_WHITESPACE,
93                         base::SPLIT_WANT_NONEMPTY);
94   // If we don't have the right number of pairs in the csv then don't bother
95   // parsing further.
96   if (split.size() % 2 != 0) {
97     return;
98   }
99   for (size_t i = 0; i < split.size(); i += 2) {
100     const std::string& tag_name = split[i];
101     const std::string& attribute = split[i + 1];
102     auto item_iter =
103         std::find_if(tag_and_attributes_list->begin(),
104                      tag_and_attributes_list->end(), TagNameIs(tag_name));
105     if (item_iter == tag_and_attributes_list->end()) {
106       TagAndAttributesItem item;
107       item.tag_name = tag_name;
108       item.attributes.push_back(attribute);
109       tag_and_attributes_list->push_back(item);
110     } else {
111       item_iter->attributes.push_back(attribute);
112     }
113   }
114 
115   std::sort(tag_and_attributes_list->begin(), tag_and_attributes_list->end(),
116             [](const TagAndAttributesItem& a, const TagAndAttributesItem& b) {
117               return a.tag_name < b.tag_name;
118             });
119 }
120 
GetNodeForElement(const blink::WebNode & element,const safe_browsing::ElementToNodeMap & element_to_node_map,std::vector<mojom::ThreatDOMDetailsNodePtr> * resources)121 mojom::ThreatDOMDetailsNode* GetNodeForElement(
122     const blink::WebNode& element,
123     const safe_browsing::ElementToNodeMap& element_to_node_map,
124     std::vector<mojom::ThreatDOMDetailsNodePtr>* resources) {
125   DCHECK_GT(element_to_node_map.count(element), 0u);
126   size_t resource_index = element_to_node_map.at(element);
127   return (*resources)[resource_index].get();
128 }
129 
TruncateAttributeString(const std::string & input)130 std::string TruncateAttributeString(const std::string& input) {
131   if (input.length() <= ThreatDOMDetails::kMaxAttributeStringLength) {
132     return input;
133   }
134 
135   std::string truncated;
136   base::TruncateUTF8ToByteSize(
137       input, ThreatDOMDetails::kMaxAttributeStringLength - 3, &truncated);
138   truncated.append("...");
139   return truncated;
140 }
141 
142 // Handler for the various HTML elements that we extract URLs from.
HandleElement(const blink::WebElement & element,const std::vector<TagAndAttributesItem> & tag_and_attributes_list,mojom::ThreatDOMDetailsNode * summary_node,std::vector<mojom::ThreatDOMDetailsNodePtr> * resources,safe_browsing::ElementToNodeMap * element_to_node_map)143 void HandleElement(
144     const blink::WebElement& element,
145     const std::vector<TagAndAttributesItem>& tag_and_attributes_list,
146     mojom::ThreatDOMDetailsNode* summary_node,
147     std::vector<mojom::ThreatDOMDetailsNodePtr>* resources,
148     safe_browsing::ElementToNodeMap* element_to_node_map) {
149   // Retrieve the link and resolve the link in case it's relative.
150   blink::WebURL full_url =
151       element.GetDocument().CompleteURL(element.GetAttribute("src"));
152 
153   const GURL& child_url = GURL(full_url);
154   if (!child_url.is_empty() && child_url.is_valid()) {
155     summary_node->children.push_back(child_url);
156   }
157 
158   mojom::ThreatDOMDetailsNodePtr child_node =
159       mojom::ThreatDOMDetailsNode::New();
160   child_node->url = child_url;
161   child_node->tag_name = element.TagName().Utf8();
162   child_node->parent = summary_node->url;
163   // The body of an iframe may be in a different renderer. Look up the routing
164   // ID of the local or remote frame and store it with the iframe node. If this
165   // element is not a frame then the result of the lookup will be null.
166   blink::WebFrame* subframe = blink::WebFrame::FromFrameOwnerElement(element);
167   if (subframe) {
168     child_node->child_frame_routing_id =
169         content::RenderFrame::GetRoutingIdForWebFrame(subframe);
170   }
171   if (base::FeatureList::IsEnabled(kCaptureInlineJavascriptForGoogleAds) &&
172       child_node->tag_name == "SCRIPT") {
173     child_node->inner_html = element.TextContent().Utf8();
174   }
175   // Populate the element's attributes, but only collect the ones that are
176   // configured in the finch study.
177   const auto& tag_attribute_iter = std::find_if(
178       tag_and_attributes_list.begin(), tag_and_attributes_list.end(),
179       TagNameIs(base::ToLowerASCII(child_node->tag_name)));
180   if (tag_attribute_iter != tag_and_attributes_list.end()) {
181     const std::vector<std::string> attributes_to_collect =
182         tag_attribute_iter->attributes;
183     for (const std::string& attribute : attributes_to_collect) {
184       blink::WebString attr_webstring = blink::WebString::FromASCII(attribute);
185       if (!element.HasAttribute(attr_webstring)) {
186         continue;
187       }
188       mojom::AttributeNameValuePtr attribute_name_value =
189           mojom::AttributeNameValue::New(
190               attribute, TruncateAttributeString(
191                              element.GetAttribute(attr_webstring).Ascii()));
192       child_node->attributes.push_back(std::move(attribute_name_value));
193       if (child_node->attributes.size() == ThreatDOMDetails::kMaxAttributes) {
194         break;
195       }
196     }
197   }
198 
199   // Update the ID mapping. First generate the ID for the current node.
200   // Then, if its parent is available, set the current node's parent ID, and
201   // also update the parent's children with the current node's ID.
202   const int child_id = static_cast<int>(element_to_node_map->size()) + 1;
203   child_node->node_id = child_id;
204   blink::WebNode cur_parent_element = element.ParentNode();
205   while (!cur_parent_element.IsNull()) {
206     if (element_to_node_map->count(cur_parent_element) > 0) {
207       mojom::ThreatDOMDetailsNode* parent_node = GetNodeForElement(
208           cur_parent_element, *element_to_node_map, resources);
209       child_node->parent_node_id = parent_node->node_id;
210       parent_node->child_node_ids.push_back(child_id);
211 
212       // TODO(lpz): Consider also updating the URL-level parent/child mapping
213       // here. Eg: child_node.parent=parent_node.url, and
214       // parent_node.children.push_back(child_url).
215       break;
216     } else {
217       // It's possible that the direct parent of this node wasn't handled, so it
218       // isn't represented in |element_to_node_map|. Try walking up the
219       // hierarchy to see if a parent further up was handled.
220       cur_parent_element = cur_parent_element.ParentNode();
221     }
222   }
223   // Add the child node to the list of resources.
224   resources->push_back(std::move(child_node));
225   // .. and remember which index it was inserted at so we can look it up later.
226   (*element_to_node_map)[element] = resources->size() - 1;
227 }
228 
ShouldHandleElement(const blink::WebElement & element,const std::vector<TagAndAttributesItem> & tag_and_attributes_list)229 bool ShouldHandleElement(
230     const blink::WebElement& element,
231     const std::vector<TagAndAttributesItem>& tag_and_attributes_list) {
232   // Resources with a SRC are always handled.
233   if ((element.HasHTMLTagName("iframe") || element.HasHTMLTagName("frame") ||
234        element.HasHTMLTagName("embed") || element.HasHTMLTagName("script")) &&
235       element.HasAttribute("src")) {
236     return true;
237   }
238   if (base::FeatureList::IsEnabled(kCaptureInlineJavascriptForGoogleAds) &&
239       element.HasHTMLTagName("script")) {
240     return true;
241   }
242   std::string tag_name_lower = base::ToLowerASCII(element.TagName().Ascii());
243   const auto& tag_attribute_iter =
244       std::find_if(tag_and_attributes_list.begin(),
245                    tag_and_attributes_list.end(), TagNameIs(tag_name_lower));
246   if (tag_attribute_iter == tag_and_attributes_list.end()) {
247     return false;
248   }
249 
250   const std::vector<std::string>& valid_attributes =
251       tag_attribute_iter->attributes;
252   for (const std::string& attribute : valid_attributes) {
253     if (element.HasAttribute(blink::WebString::FromASCII(attribute))) {
254       return true;
255     }
256   }
257   return false;
258 }
259 
260 }  // namespace
261 
TagAndAttributesItem()262 TagAndAttributesItem::TagAndAttributesItem() {}
263 
TagAndAttributesItem(const std::string & tag_name_param,const std::vector<std::string> & attributes_param)264 TagAndAttributesItem::TagAndAttributesItem(
265     const std::string& tag_name_param,
266     const std::vector<std::string>& attributes_param)
267     : tag_name(tag_name_param), attributes(attributes_param) {}
268 
TagAndAttributesItem(const TagAndAttributesItem & item)269 TagAndAttributesItem::TagAndAttributesItem(const TagAndAttributesItem& item)
270     : tag_name(item.tag_name), attributes(item.attributes) {}
271 
~TagAndAttributesItem()272 TagAndAttributesItem::~TagAndAttributesItem() {}
273 
274 uint32_t ThreatDOMDetails::kMaxNodes = 500;
275 uint32_t ThreatDOMDetails::kMaxAttributes = 100;
276 uint32_t ThreatDOMDetails::kMaxAttributeStringLength = 100;
277 
278 // static
Create(content::RenderFrame * render_frame,service_manager::BinderRegistry * registry)279 ThreatDOMDetails* ThreatDOMDetails::Create(
280     content::RenderFrame* render_frame,
281     service_manager::BinderRegistry* registry) {
282   // Private constructor and public static Create() method to facilitate
283   // stubbing out this class for binary-size reduction purposes.
284   return new ThreatDOMDetails(render_frame, registry);
285 }
286 
OnThreatReporterReceiver(mojo::PendingReceiver<mojom::ThreatReporter> receiver)287 void ThreatDOMDetails::OnThreatReporterReceiver(
288     mojo::PendingReceiver<mojom::ThreatReporter> receiver) {
289   threat_reporter_receivers_.Add(this, std::move(receiver));
290 }
291 
ThreatDOMDetails(content::RenderFrame * render_frame,service_manager::BinderRegistry * registry)292 ThreatDOMDetails::ThreatDOMDetails(content::RenderFrame* render_frame,
293                                    service_manager::BinderRegistry* registry)
294     : content::RenderFrameObserver(render_frame) {
295   ParseTagAndAttributeParams(&tag_and_attributes_list_);
296   // Base::Unretained() is safe here because both the registry and the
297   // ThreatDOMDetails are scoped to the same render frame.
298   registry->AddInterface(base::BindRepeating(
299       &ThreatDOMDetails::OnThreatReporterReceiver, base::Unretained(this)));
300 }
301 
~ThreatDOMDetails()302 ThreatDOMDetails::~ThreatDOMDetails() {}
303 
GetThreatDOMDetails(GetThreatDOMDetailsCallback callback)304 void ThreatDOMDetails::GetThreatDOMDetails(
305     GetThreatDOMDetailsCallback callback) {
306   std::vector<mojom::ThreatDOMDetailsNodePtr> resources;
307   ExtractResources(&resources);
308   // Notify the browser.
309   std::move(callback).Run(std::move(resources));
310 }
311 
ExtractResources(std::vector<mojom::ThreatDOMDetailsNodePtr> * resources)312 void ThreatDOMDetails::ExtractResources(
313     std::vector<mojom::ThreatDOMDetailsNodePtr>* resources) {
314   blink::WebLocalFrame* frame = render_frame()->GetWebFrame();
315   if (!frame)
316     return;
317   mojom::ThreatDOMDetailsNodePtr details_node =
318       mojom::ThreatDOMDetailsNode::New();
319   blink::WebDocument document = frame->GetDocument();
320   details_node->url = GURL(document.Url());
321   if (document.IsNull()) {
322     // Nothing in this frame. Just report its URL.
323     resources->push_back(std::move(details_node));
324     return;
325   }
326 
327   ElementToNodeMap element_to_node_map;
328   blink::WebElementCollection elements = document.All();
329   blink::WebElement element = elements.FirstItem();
330   for (; !element.IsNull(); element = elements.NextItem()) {
331     if (ShouldHandleElement(element, tag_and_attributes_list_)) {
332       HandleElement(element, tag_and_attributes_list_, details_node.get(),
333                     resources, &element_to_node_map);
334       if (resources->size() >= kMaxNodes) {
335         // We have reached kMaxNodes, exit early.
336         break;
337       }
338     }
339   }
340   resources->push_back(std::move(details_node));
341 }
342 
OnDestruct()343 void ThreatDOMDetails::OnDestruct() {
344   delete this;
345 }
346 
347 }  // namespace safe_browsing
348