// Copyright 2016 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
4 
5 #ifndef COMPONENTS_SAFE_BROWSING_CORE_DB_V4_PROTOCOL_MANAGER_UTIL_H_
6 #define COMPONENTS_SAFE_BROWSING_CORE_DB_V4_PROTOCOL_MANAGER_UTIL_H_
7 
// A class that implements the stateless methods used by the GetHashUpdate and
// GetFullHash stubby calls made by Chrome using the SafeBrowsing V4 protocol.
10 
#include <functional>
#include <initializer_list>
#include <memory>
#include <ostream>
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <vector>

#include "base/containers/flat_set.h"
#include "base/files/file_path.h"
#include "base/gtest_prod_util.h"
#include "base/macros.h"
#include "base/strings/string_piece.h"
#include "base/time/time.h"
#include "components/safe_browsing/core/common/safe_browsing_prefs.h"
#include "components/safe_browsing/core/db/safebrowsing.pb.h"
#include "url/gurl.h"
26 
// Forward declarations: this header only refers to these types through
// pointers, so their full definitions are not needed here.
namespace net {
class HttpRequestHeaders;
class IPAddress;
}  // namespace net
31 
32 namespace safe_browsing {
33 
34 
// The size of the hash prefix, in bytes. It should be between 4 and 32 (full
// hash), inclusive.
using PrefixSize = size_t;

// The minimum expected size (in bytes) of a hash-prefix.
constexpr PrefixSize kMinHashPrefixLength = 4;

// The maximum expected size (in bytes) of a hash-prefix. This represents the
// length of a SHA256 hash.
constexpr PrefixSize kMaxHashPrefixLength = 32;

// A hash prefix sent by the SafeBrowsing PVer4 service.
using HashPrefix = std::string;

// A full SHA256 hash. Shares its underlying type with HashPrefix, so the two
// are interchangeable as far as the compiler is concerned.
using FullHash = HashPrefix;
51 
52 using ListUpdateRequest = FetchThreatListUpdatesRequest::ListUpdateRequest;
53 using ListUpdateResponse = FetchThreatListUpdatesResponse::ListUpdateResponse;
54 
55 void SetSbV4UrlPrefixForTesting(const char* url_prefix);
56 
// Config passed to the constructor of a V4 protocol manager.
struct V4ProtocolConfig {
  // The safe browsing client name sent in each request.
  std::string client_name;

  // Disable auto-updates using a command line switch.
  bool disable_auto_update;

  // The Google API key.
  std::string key_param;

  // Current product version sent in each request.
  std::string version;

  // A config must always be built from explicit values; there is no
  // meaningful default.
  V4ProtocolConfig() = delete;
  V4ProtocolConfig(const std::string& client_name,
                   bool disable_auto_update,
                   const std::string& key_param,
                   const std::string& version);
  V4ProtocolConfig(const V4ProtocolConfig& other);
  ~V4ProtocolConfig();
};
81 
82 // Get the v4 protocol config struct with a given client name, and ability to
83 // enable/disable database auto update.
84 V4ProtocolConfig GetV4ProtocolConfig(const std::string& client_name,
85                                      bool disable_auto_update);
86 
87 // Returns the URL to use for sending threat reports and other Safe Browsing
88 // hits back to Safe Browsing service.
89 std::string GetReportUrl(
90     const V4ProtocolConfig& config,
91     const std::string& method,
92     const ExtendedReportingLevel* reporting_level = nullptr,
93     const bool is_enhanced_protection = false);
94 
// Different types of threats that SafeBrowsing protects against. This is the
// type that's returned to the clients of SafeBrowsing in Chromium.
// Enumerators have no explicit values, so their numeric values follow
// declaration order; append new entries rather than reordering.
// GENERATED_JAVA_ENUM_PACKAGE: org.chromium.components.safe_browsing
// GENERATED_JAVA_PREFIX_TO_STRIP: SB_THREAT_TYPE_
enum SBThreatType {
  // This type can be used for lists that can be checked synchronously so a
  // client callback isn't required, or for whitelists.
  SB_THREAT_TYPE_UNUSED,

  // No threat at all.
  SB_THREAT_TYPE_SAFE,

  // The URL is being used for phishing.
  SB_THREAT_TYPE_URL_PHISHING,

  // The URL hosts malware.
  SB_THREAT_TYPE_URL_MALWARE,

  // The URL hosts unwanted programs.
  SB_THREAT_TYPE_URL_UNWANTED,

  // The download URL is malware.
  SB_THREAT_TYPE_URL_BINARY_MALWARE,

  // Url detected by the client-side phishing model.  Note that unlike the
  // above values, this does not correspond to a downloaded list.
  SB_THREAT_TYPE_URL_CLIENT_SIDE_PHISHING,

  // The Chrome extension or app (given by its ID) is malware.
  SB_THREAT_TYPE_EXTENSION,

  // Url detected by the client-side malware IP list. This IP list is part
  // of the client side detection model.
  SB_THREAT_TYPE_URL_CLIENT_SIDE_MALWARE,

  // Url leads to a blacklisted resource script. Note that no warnings should be
  // shown on this threat type, but an incident report might be sent.
  SB_THREAT_TYPE_BLACKLISTED_RESOURCE,

  // Url abuses a permission API.
  SB_THREAT_TYPE_API_ABUSE,

  // Activation patterns for the Subresource Filter.
  SB_THREAT_TYPE_SUBRESOURCE_FILTER,

  // CSD Phishing whitelist.  This "threat" means a URL matched the whitelist.
  SB_THREAT_TYPE_CSD_WHITELIST,

  // DEPRECATED. Url detected by password protection service.
  DEPRECATED_SB_THREAT_TYPE_URL_PASSWORD_PROTECTION_PHISHING,

  // Saved password reuse detected on low reputation page.
  SB_THREAT_TYPE_SAVED_PASSWORD_REUSE,

  // Chrome signed in and syncing gaia password reuse detected on low reputation
  // page.
  SB_THREAT_TYPE_SIGNED_IN_SYNC_PASSWORD_REUSE,

  // Chrome signed in non syncing gaia password reuse detected on low reputation
  // page.
  SB_THREAT_TYPE_SIGNED_IN_NON_SYNC_PASSWORD_REUSE,

  // A Google ad that caused a blocked autoredirect was collected.
  SB_THREAT_TYPE_BLOCKED_AD_REDIRECT,

  // A sample of an ad was collected.
  SB_THREAT_TYPE_AD_SAMPLE,

  // A report of Google ad that caused a blocked popup was collected.
  SB_THREAT_TYPE_BLOCKED_AD_POPUP,

  // The page loaded a resource from the Suspicious Site list.
  SB_THREAT_TYPE_SUSPICIOUS_SITE,

  // Enterprise password reuse detected on low reputation page.
  SB_THREAT_TYPE_ENTERPRISE_PASSWORD_REUSE,

  // Potential billing detected.
  SB_THREAT_TYPE_BILLING,

  // Off-market APK file downloaded, which could be potentially dangerous.
  SB_THREAT_TYPE_APK_DOWNLOAD,

  // Match found in the local high-confidence allowlist.
  SB_THREAT_TYPE_HIGH_CONFIDENCE_ALLOWLIST,
};
181 
182 using SBThreatTypeSet = base::flat_set<SBThreatType>;
183 
184 // Return true if |set| only contains types that are valid for CheckBrowseUrl().
185 // Intended for use in DCHECK().
186 bool SBThreatTypeSetIsValidForCheckBrowseUrl(const SBThreatTypeSet& set);
187 
188 // Shorthand for creating an SBThreatTypeSet from a list of SBThreatTypes. Use
189 // like CreateSBThreatTypeSet({SB_THREAT_TYPE_URL_PHISHING,
190 //                             SB_THREAT_TYPE_URL_MALWARE})
CreateSBThreatTypeSet(std::initializer_list<SBThreatType> set)191 inline SBThreatTypeSet CreateSBThreatTypeSet(
192     std::initializer_list<SBThreatType> set) {
193   return SBThreatTypeSet(set);
194 }
195 
196 // The information required to uniquely identify each list the client is
197 // interested in maintaining and downloading from the SafeBrowsing servers.
198 // For example, for digests of Malware binaries on Windows:
199 // platform_type = WINDOWS,
200 // threat_entry_type = EXECUTABLE,
201 // threat_type = MALWARE
202 class ListIdentifier {
203  public:
204   ListIdentifier(PlatformType platform_type,
205                  ThreatEntryType threat_entry_type,
206                  ThreatType threat_type);
207   explicit ListIdentifier(const ListUpdateResponse&);
208 
209   bool operator==(const ListIdentifier& other) const;
210   bool operator!=(const ListIdentifier& other) const;
211   size_t hash() const;
212 
platform_type()213   PlatformType platform_type() const { return platform_type_; }
threat_entry_type()214   ThreatEntryType threat_entry_type() const { return threat_entry_type_; }
threat_type()215   ThreatType threat_type() const { return threat_type_; }
216 
217  private:
218   PlatformType platform_type_;
219   ThreatEntryType threat_entry_type_;
220   ThreatType threat_type_;
221 
222   ListIdentifier() = delete;
223 };
224 
225 std::ostream& operator<<(std::ostream& os, const ListIdentifier& id);
226 
227 PlatformType GetCurrentPlatformType();
228 ListIdentifier GetCertCsdDownloadWhitelistId();
229 ListIdentifier GetChromeExtMalwareId();
230 ListIdentifier GetChromeUrlApiId();
231 ListIdentifier GetChromeUrlClientIncidentId();
232 ListIdentifier GetIpMalwareId();
233 ListIdentifier GetUrlBillingId();
234 ListIdentifier GetUrlCsdDownloadWhitelistId();
235 ListIdentifier GetUrlCsdWhitelistId();
236 ListIdentifier GetUrlHighConfidenceAllowlistId();
237 ListIdentifier GetUrlMalBinId();
238 ListIdentifier GetUrlMalwareId();
239 ListIdentifier GetUrlSocEngId();
240 ListIdentifier GetUrlSubresourceFilterId();
241 ListIdentifier GetUrlSuspiciousSiteId();
242 ListIdentifier GetUrlUwsId();
243 
244 // Returns the basename of the store file, without the ".store" extension.
245 std::string GetUmaSuffixForStore(const base::FilePath& file_path);
246 
247 // Represents the state of each store.
248 using StoreStateMap = std::unordered_map<ListIdentifier, std::string>;
249 
250 // Sever response, parsed in vector form.
251 using ParsedServerResponse = std::vector<std::unique_ptr<ListUpdateResponse>>;
252 
253 // Holds the hash prefix and the store that it matched in.
254 struct StoreAndHashPrefix {
255  public:
256   ListIdentifier list_id;
257   HashPrefix hash_prefix;
258 
259   StoreAndHashPrefix(ListIdentifier list_id, const HashPrefix& hash_prefix);
260   ~StoreAndHashPrefix();
261 
262   bool operator==(const StoreAndHashPrefix& other) const;
263   bool operator!=(const StoreAndHashPrefix& other) const;
264   size_t hash() const;
265 
266  private:
267   StoreAndHashPrefix() = delete;
268 };
269 
270 // Used to track the hash prefix and the store in which a full hash's prefix
271 // matched.
272 using StoreAndHashPrefixes = std::vector<StoreAndHashPrefix>;
273 
// Enumerate failures for histogramming purposes.  DO NOT CHANGE THE
// ORDERING OF THESE VALUES.
enum V4OperationResult {
  // 200 response code means that the server recognized the request.
  STATUS_200 = 0,

  // Subset of successful responses where the response body wasn't parsable.
  PARSE_ERROR = 1,

  // Operation request failed (network error).
  NETWORK_ERROR = 2,

  // Operation request returned HTTP result code other than 200.
  HTTP_ERROR = 3,

  // Operation attempted during error backoff, no request sent.
  BACKOFF_ERROR = 4,

  // Operation attempted before min wait duration elapsed, no request sent.
  MIN_WAIT_DURATION_ERROR = 5,

  // Identical operation already pending.
  ALREADY_PENDING_ERROR = 6,

  // Memory space for histograms is determined by the max.  ALWAYS
  // ADD NEW VALUES BEFORE THIS ONE.
  OPERATION_RESULT_MAX = 7
};
302 
303 // A class that provides static methods related to the Pver4 protocol.
304 class V4ProtocolManagerUtil {
305  public:
306   // Canonicalizes url as per Google Safe Browsing Specification.
307   // See: https://developers.google.com/safe-browsing/v4/urls-hashing
308   static void CanonicalizeUrl(const GURL& url,
309                               std::string* canonicalized_hostname,
310                               std::string* canonicalized_path,
311                               std::string* canonicalized_query);
312 
313   // This method returns the host suffix combinations from the hostname in the
314   // URL, as described here:
315   // https://developers.google.com/safe-browsing/v4/urls-hashing
316   static void GenerateHostVariantsToCheck(const std::string& host,
317                                           std::vector<std::string>* hosts);
318 
319   // This method returns the path prefix combinations from the path in the
320   // URL, as described here:
321   // https://developers.google.com/safe-browsing/v4/urls-hashing
322   static void GeneratePathVariantsToCheck(const std::string& path,
323                                           const std::string& query,
324                                           std::vector<std::string>* paths);
325 
326   // Given a URL, returns all the patterns we need to check.
327   static void GeneratePatternsToCheck(const GURL& url,
328                                       std::vector<std::string>* urls);
329 
330   // Returns a FullHash for the basic host+path pattern for a given URL after
331   // canonicalization. Not intended for general use.
332   static FullHash GetFullHash(const GURL& url);
333 
334   // Generates a Pver4 request URL and sets the appropriate header values.
335   // |request_base64| is the serialized request protocol buffer encoded in
336   // base 64.
337   // |method_name| is the name of the method to call, as specified in the proto,
338   // |config| is an instance of V4ProtocolConfig that stores the client config,
339   // |gurl| is set to the value of the PVer4 request URL,
340   // |headers| is populated with the appropriate header values.
341   static void GetRequestUrlAndHeaders(const std::string& request_base64,
342                                       const std::string& method_name,
343                                       const V4ProtocolConfig& config,
344                                       GURL* gurl,
345                                       net::HttpRequestHeaders* headers);
346 
347   // Worker function for calculating the backoff times.
348   // |multiplier| is doubled for each consecutive error after the
349   // first, and |error_count| is incremented with each call.
350   // Backoff interval is MIN(((2^(n-1))*15 minutes) * (RAND + 1), 24 hours)
351   // where n is the number of consecutive errors.
352   static base::TimeDelta GetNextBackOffInterval(size_t* error_count,
353                                                 size_t* multiplier);
354 
355   // Record HTTP response code when there's no error in fetching an HTTP
356   // request, and the error code, when there is.
357   // |metric_name| is the name of the UMA metric to record the response code or
358   // error code against, |net_error| represents the net error code of the HTTP
359   // request, and |response code| represents the HTTP response code received
360   // from the server.
361   static void RecordHttpResponseOrErrorCode(const char* metric_name,
362                                             int net_error,
363                                             int response_code);
364 
365   // Generate the set of FullHashes to check for |url|.
366   static void UrlToFullHashes(const GURL& url,
367                               std::vector<FullHash>* full_hashes);
368 
369   static bool FullHashToHashPrefix(const FullHash& full_hash,
370                                    PrefixSize prefix_size,
371                                    HashPrefix* hash_prefix);
372 
373   static bool FullHashToSmallestHashPrefix(const FullHash& full_hash,
374                                            HashPrefix* hash_prefix);
375 
376   static bool FullHashMatchesHashPrefix(const FullHash& full_hash,
377                                         const HashPrefix& hash_prefix);
378 
379   static void SetClientInfoFromConfig(ClientInfo* client_info,
380                                       const V4ProtocolConfig& config);
381 
382   static bool GetIPV6AddressFromString(const std::string& ip_address,
383                                        net::IPAddress* address);
384 
385   // Converts a IPV4 or IPV6 address in |ip_address| to the SHA1 hash of the
386   // corresponding packed IPV6 address in |hashed_encoded_ip|, and adds an
387   // extra byte containing the value 128 at the end. This is done to match the
388   // server implementation for calculating the hash prefix of an IP address.
389   static bool IPAddressToEncodedIPV6Hash(const std::string& ip_address,
390                                          FullHash* hashed_encoded_ip);
391 
392   // Stores the client state values for each of the lists in |store_state_map|
393   // into |list_client_states|.
394   static void GetListClientStatesFromStoreStateMap(
395       const std::unique_ptr<StoreStateMap>& store_state_map,
396       std::vector<std::string>* list_client_states);
397 
398  private:
V4ProtocolManagerUtil()399   V4ProtocolManagerUtil() {}
400 
401   FRIEND_TEST_ALL_PREFIXES(V4ProtocolManagerUtilTest, TestBackOffLogic);
402   FRIEND_TEST_ALL_PREFIXES(V4ProtocolManagerUtilTest,
403                            TestGetRequestUrlAndUpdateHeaders);
404   FRIEND_TEST_ALL_PREFIXES(V4ProtocolManagerUtilTest, UrlParsing);
405   FRIEND_TEST_ALL_PREFIXES(V4ProtocolManagerUtilTest, CanonicalizeUrl);
406 
407   // Composes a URL using |prefix|, |method| (e.g.: encodedFullHashes).
408   // |request_base64|, |client_id|, |version| and |key_param|. |prefix|
409   // should contain the entire url prefix including scheme, host and path.
410   static std::string ComposeUrl(const std::string& prefix,
411                                 const std::string& method,
412                                 const std::string& request_base64,
413                                 const std::string& key_param);
414 
415   // Sets the HTTP headers expected by a standard PVer4 request.
416   static void UpdateHeaders(net::HttpRequestHeaders* headers);
417 
418   // Given a URL, returns all the hosts we need to check.  They are returned
419   // in order of size (i.e. b.c is first, then a.b.c).
420   static void GenerateHostsToCheck(const GURL& url,
421                                    std::vector<std::string>* hosts);
422 
423   // Given a URL, returns all the paths we need to check.
424   static void GeneratePathsToCheck(const GURL& url,
425                                    std::vector<std::string>* paths);
426 
427   static std::string RemoveConsecutiveChars(base::StringPiece str,
428                                             const char c);
429 
430   DISALLOW_COPY_AND_ASSIGN(V4ProtocolManagerUtil);
431 };
432 
433 using StoresToCheck = std::unordered_set<ListIdentifier>;
434 
435 }  // namespace safe_browsing
436 
437 namespace std {
438 
439 template <>
440 struct hash<safe_browsing::PlatformType> {
441   std::size_t operator()(const safe_browsing::PlatformType& p) const {
442     return std::hash<unsigned int>()(p);
443   }
444 };
445 
446 template <>
447 struct hash<safe_browsing::ThreatEntryType> {
448   std::size_t operator()(const safe_browsing::ThreatEntryType& tet) const {
449     return std::hash<unsigned int>()(tet);
450   }
451 };
452 
453 template <>
454 struct hash<safe_browsing::ThreatType> {
455   std::size_t operator()(const safe_browsing::ThreatType& tt) const {
456     return std::hash<unsigned int>()(tt);
457   }
458 };
459 
460 template <>
461 struct hash<safe_browsing::ListIdentifier> {
462   std::size_t operator()(const safe_browsing::ListIdentifier& id) const {
463     return id.hash();
464   }
465 };
466 
467 }  // namespace std
468 
469 #endif  // COMPONENTS_SAFE_BROWSING_CORE_DB_V4_PROTOCOL_MANAGER_UTIL_H_
470