1 /*
2  * Copyright (C) 2004, 2007, 2008, 2011, 2012 Apple Inc. All rights reserved.
3  * Copyright (C) 2012 Research In Motion Limited. All rights reserved.
4  * Copyright (C) 2008, 2009, 2011 Google Inc. All rights reserved.
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions
8  * are met:
9  * 1. Redistributions of source code must retain the above copyright
10  *    notice, this list of conditions and the following disclaimer.
11  * 2. Redistributions in binary form must reproduce the above copyright
12  *    notice, this list of conditions and the following disclaimer in the
13  *    documentation and/or other materials provided with the distribution.
14  *
15  * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
16  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
18  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE COMPUTER, INC. OR
19  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
23  * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26  */
27 
28 #include "third_party/blink/renderer/platform/weborigin/kurl.h"
29 
30 #include <algorithm>
31 
32 #include "third_party/blink/renderer/platform/weborigin/known_ports.h"
33 #include "third_party/blink/renderer/platform/wtf/math_extras.h"
34 #include "third_party/blink/renderer/platform/wtf/std_lib_extras.h"
35 #include "third_party/blink/renderer/platform/wtf/text/string_hash.h"
36 #include "third_party/blink/renderer/platform/wtf/text/string_statics.h"
37 #include "third_party/blink/renderer/platform/wtf/text/string_utf8_adaptor.h"
38 #include "third_party/blink/renderer/platform/wtf/text/text_encoding.h"
39 #include "third_party/blink/renderer/platform/wtf/thread_specific.h"
40 #include "url/gurl.h"
41 #include "url/url_util.h"
42 #ifndef NDEBUG
43 #include <stdio.h>
44 #endif
45 
46 namespace blink {
47 
48 #if DCHECK_IS_ON()
AssertProtocolIsGood(const StringView protocol)49 static void AssertProtocolIsGood(const StringView protocol) {
50   DCHECK(protocol != "");
51   for (size_t i = 0; i < protocol.length(); ++i) {
52     LChar c = protocol.Characters8()[i];
53     DCHECK(c > ' ' && c < 0x7F && !(c >= 'A' && c <= 'Z'));
54   }
55 }
56 #endif
57 
58 // Note: You must ensure that |spec| is a valid canonicalized URL before calling
59 // this function.
AsURLChar8Subtle(const String & spec)60 static const char* AsURLChar8Subtle(const String& spec) {
61   DCHECK(spec.Is8Bit());
62   // characters8 really return characters in Latin-1, but because we
63   // canonicalize URL strings, we know that everything before the fragment
64   // identifier will actually be ASCII, which means this cast is safe as long as
65   // you don't look at the fragment component.
66   return reinterpret_cast<const char*>(spec.Characters8());
67 }
68 
69 // Returns the characters for the given string, or a pointer to a static empty
70 // string if the input string is null. This will always ensure we have a non-
71 // null character pointer since ReplaceComponents has special meaning for null.
CharactersOrEmpty(const StringUTF8Adaptor & string)72 static const char* CharactersOrEmpty(const StringUTF8Adaptor& string) {
73   static const char kZero = 0;
74   return string.data() ? string.data() : &kZero;
75 }
76 
// Returns true if |c| is an ASCII letter, i.e. a character that may start a
// URL scheme.
static bool IsSchemeFirstChar(char c) {
  const bool is_lower = 'a' <= c && c <= 'z';
  const bool is_upper = 'A' <= c && c <= 'Z';
  return is_lower || is_upper;
}
80 
IsSchemeChar(char c)81 static bool IsSchemeChar(char c) {
82   return IsSchemeFirstChar(c) || (c >= '0' && c <= '9') || c == '.' ||
83          c == '-' || c == '+';
84 }
85 
IsUnicodeEncoding(const WTF::TextEncoding * encoding)86 static bool IsUnicodeEncoding(const WTF::TextEncoding* encoding) {
87   return encoding->EncodingForFormSubmission() == UTF8Encoding();
88 }
89 
90 namespace {
91 
92 class KURLCharsetConverter final : public url::CharsetConverter {
93   DISALLOW_NEW();
94 
95  public:
96   // The encoding parameter may be 0, but in this case the object must not be
97   // called.
KURLCharsetConverter(const WTF::TextEncoding * encoding)98   explicit KURLCharsetConverter(const WTF::TextEncoding* encoding)
99       : encoding_(encoding) {}
100 
ConvertFromUTF16(const base::char16 * input,int input_length,url::CanonOutput * output)101   void ConvertFromUTF16(const base::char16* input,
102                         int input_length,
103                         url::CanonOutput* output) override {
104     std::string encoded = encoding_->Encode(
105         String(input, input_length), WTF::kURLEncodedEntitiesForUnencodables);
106     output->Append(encoded.c_str(), static_cast<int>(encoded.length()));
107   }
108 
109  private:
110   const WTF::TextEncoding* encoding_;
111 };
112 
113 }  // namespace
114 
IsValidProtocol(const String & protocol)115 bool IsValidProtocol(const String& protocol) {
116   // RFC3986: ALPHA *( ALPHA / DIGIT / "+" / "-" / "." )
117   if (protocol.IsEmpty())
118     return false;
119   if (!IsSchemeFirstChar(protocol[0]))
120     return false;
121   unsigned protocol_length = protocol.length();
122   for (unsigned i = 1; i < protocol_length; i++) {
123     if (!IsSchemeChar(protocol[i]))
124       return false;
125   }
126   return true;
127 }
128 
StrippedForUseAsReferrer() const129 String KURL::StrippedForUseAsReferrer() const {
130   if (!ProtocolIsInHTTPFamily())
131     return String();
132 
133   if (parsed_.username.is_nonempty() || parsed_.password.is_nonempty() ||
134       parsed_.ref.is_valid()) {
135     KURL referrer(*this);
136     referrer.SetUser(String());
137     referrer.SetPass(String());
138     referrer.RemoveFragmentIdentifier();
139     return referrer.GetString();
140   }
141   return GetString();
142 }
143 
StrippedForUseAsHref() const144 String KURL::StrippedForUseAsHref() const {
145   if (parsed_.username.is_nonempty() || parsed_.password.is_nonempty()) {
146     KURL href(*this);
147     href.SetUser(String());
148     href.SetPass(String());
149     return href.GetString();
150   }
151   return GetString();
152 }
153 
IsLocalFile() const154 bool KURL::IsLocalFile() const {
155   // Including feed here might be a bad idea since drag and drop uses this check
156   // and including feed would allow feeds to potentially let someone's blog
157   // read the contents of the clipboard on a drag, even without a drop.
158   // Likewise with using the FrameLoader::shouldTreatURLAsLocal() function.
159   return ProtocolIs("file");
160 }
161 
ProtocolIsJavaScript(const String & url)162 bool ProtocolIsJavaScript(const String& url) {
163   return ProtocolIs(url, "javascript");
164 }
165 
BlankURL()166 const KURL& BlankURL() {
167   DEFINE_THREAD_SAFE_STATIC_LOCAL(ThreadSpecific<KURL>, static_blank_url, ());
168   KURL& blank_url = *static_blank_url;
169   if (blank_url.IsNull())
170     blank_url = KURL(AtomicString("about:blank"));
171   return blank_url;
172 }
173 
IsAboutBlankURL() const174 bool KURL::IsAboutBlankURL() const {
175   return *this == BlankURL();
176 }
177 
SrcdocURL()178 const KURL& SrcdocURL() {
179   DEFINE_THREAD_SAFE_STATIC_LOCAL(ThreadSpecific<KURL>, static_srcdoc_url, ());
180   KURL& srcdoc_url = *static_srcdoc_url;
181   if (srcdoc_url.IsNull())
182     srcdoc_url = KURL(AtomicString("about:srcdoc"));
183   return srcdoc_url;
184 }
185 
IsAboutSrcdocURL() const186 bool KURL::IsAboutSrcdocURL() const {
187   return *this == SrcdocURL();
188 }
189 
NullURL()190 const KURL& NullURL() {
191   DEFINE_THREAD_SAFE_STATIC_LOCAL(ThreadSpecific<KURL>, static_null_url, ());
192   return *static_null_url;
193 }
194 
ElidedString() const195 String KURL::ElidedString() const {
196   if (GetString().length() <= 1024)
197     return GetString();
198 
199   return GetString().Left(511) + "..." + GetString().Right(510);
200 }
201 
KURL()202 KURL::KURL() : is_valid_(false), protocol_is_in_http_family_(false) {}
203 
204 // Initializes with a string representing an absolute URL. No encoding
205 // information is specified. This generally happens when a KURL is converted
206 // to a string and then converted back. In this case, the URL is already
207 // canonical and in proper escaped form so needs no encoding. We treat it as
208 // UTF-8 just in case.
KURL(const String & url)209 KURL::KURL(const String& url) {
210   if (!url.IsNull())
211     Init(NullURL(), url, nullptr);
212   else {
213     // WebCore expects us to preserve the nullness of strings when this
214     // constructor is used. In all other cases, it expects a non-null
215     // empty string, which is what Init() will create.
216     is_valid_ = false;
217     protocol_is_in_http_family_ = false;
218   }
219 }
220 
221 // Initializes with a GURL. This is used to covert from a GURL to a KURL.
KURL(const GURL & gurl)222 KURL::KURL(const GURL& gurl) {
223   Init(NullURL() /* base */, String(gurl.spec().c_str()) /* relative */,
224        nullptr /* query_encoding */);
225 }
226 
CreateIsolated(const String & url)227 KURL KURL::CreateIsolated(const String& url) {
228   // FIXME: We should be able to skip this extra copy and created an
229   // isolated KURL more efficiently.
230   return KURL(url).Copy();
231 }
232 
233 // Constructs a new URL given a base URL and a possibly relative input URL.
234 // This assumes UTF-8 encoding.
KURL(const KURL & base,const String & relative)235 KURL::KURL(const KURL& base, const String& relative) {
236   Init(base, relative, nullptr);
237 }
238 
239 // Constructs a new URL given a base URL and a possibly relative input URL.
240 // Any query portion of the relative URL will be encoded in the given encoding.
KURL(const KURL & base,const String & relative,const WTF::TextEncoding & encoding)241 KURL::KURL(const KURL& base,
242            const String& relative,
243            const WTF::TextEncoding& encoding) {
244   Init(base, relative, &encoding.EncodingForFormSubmission());
245 }
246 
KURL(const AtomicString & canonical_string,const url::Parsed & parsed,bool is_valid)247 KURL::KURL(const AtomicString& canonical_string,
248            const url::Parsed& parsed,
249            bool is_valid)
250     : is_valid_(is_valid),
251       protocol_is_in_http_family_(false),
252       parsed_(parsed),
253       string_(canonical_string) {
254   InitProtocolMetadata();
255   InitInnerURL();
256 }
257 
// Copy constructor. The inner URL, when present, is duplicated through Copy()
// so the new object owns its own instance.
KURL::KURL(const KURL& other)
    : is_valid_(other.is_valid_),
      protocol_is_in_http_family_(other.protocol_is_in_http_family_),
      protocol_(other.protocol_),
      parsed_(other.parsed_),
      string_(other.string_) {
  if (!other.inner_url_)
    return;
  inner_url_ = std::make_unique<KURL>(other.inner_url_->Copy());
}
267 
// Defaulted destructor, defined out of line in this file.
KURL::~KURL() = default;
269 
// Copy assignment. Mirrors the copy constructor: plain members are assigned,
// and the inner URL is either duplicated through Copy() or cleared.
KURL& KURL::operator=(const KURL& other) {
  is_valid_ = other.is_valid_;
  protocol_is_in_http_family_ = other.protocol_is_in_http_family_;
  protocol_ = other.protocol_;
  parsed_ = other.parsed_;
  string_ = other.string_;

  if (!other.inner_url_) {
    inner_url_.reset();
    return *this;
  }
  inner_url_ = std::make_unique<KURL>(other.inner_url_->Copy());
  return *this;
}
282 
Copy() const283 KURL KURL::Copy() const {
284   KURL result;
285   result.is_valid_ = is_valid_;
286   result.protocol_is_in_http_family_ = protocol_is_in_http_family_;
287   result.protocol_ = protocol_.IsolatedCopy();
288   result.parsed_ = parsed_;
289   result.string_ = string_.IsolatedCopy();
290   if (inner_url_)
291     result.inner_url_ = std::make_unique<KURL>(inner_url_->Copy());
292   return result;
293 }
294 
IsNull() const295 bool KURL::IsNull() const {
296   return string_.IsNull();
297 }
298 
IsEmpty() const299 bool KURL::IsEmpty() const {
300   return string_.IsEmpty();
301 }
302 
IsValid() const303 bool KURL::IsValid() const {
304   return is_valid_;
305 }
306 
HasPort() const307 bool KURL::HasPort() const {
308   return HostEnd() < PathStart();
309 }
310 
ProtocolIsJavaScript() const311 bool KURL::ProtocolIsJavaScript() const {
312   return ComponentStringView(parsed_.scheme) == "javascript";
313 }
314 
ProtocolIsInHTTPFamily() const315 bool KURL::ProtocolIsInHTTPFamily() const {
316   return protocol_is_in_http_family_;
317 }
318 
HasPath() const319 bool KURL::HasPath() const {
320   // Note that http://www.google.com/" has a path, the path is "/". This can
321   // return false only for invalid or nonstandard URLs.
322   return parsed_.path.len >= 0;
323 }
324 
LastPathComponent() const325 String KURL::LastPathComponent() const {
326   if (!is_valid_)
327     return StringViewForInvalidComponent().ToString();
328   DCHECK(!string_.IsNull());
329 
330   // When the output ends in a slash, WebCore has different expectations than
331   // the GoogleURL library. For "/foo/bar/" the library will return the empty
332   // string, but WebCore wants "bar".
333   url::Component path = parsed_.path;
334   if (path.len > 0 && string_[path.end() - 1] == '/')
335     path.len--;
336 
337   url::Component file;
338   if (string_.Is8Bit())
339     url::ExtractFileName(AsURLChar8Subtle(string_), path, &file);
340   else
341     url::ExtractFileName(string_.Characters16(), path, &file);
342 
343   // Bug: https://bugs.webkit.org/show_bug.cgi?id=21015 this function returns
344   // a null string when the path is empty, which we duplicate here.
345   if (!file.is_nonempty())
346     return String();
347   return ComponentString(file);
348 }
349 
Protocol() const350 String KURL::Protocol() const {
351   DCHECK_EQ(ComponentString(parsed_.scheme), protocol_);
352   return protocol_;
353 }
354 
Host() const355 String KURL::Host() const {
356   return ComponentString(parsed_.host);
357 }
358 
Port() const359 uint16_t KURL::Port() const {
360   if (!is_valid_ || parsed_.port.len <= 0)
361     return 0;
362   DCHECK(!string_.IsNull());
363   int port = string_.Is8Bit()
364                  ? url::ParsePort(AsURLChar8Subtle(string_), parsed_.port)
365                  : url::ParsePort(string_.Characters16(), parsed_.port);
366   DCHECK_NE(port, url::PORT_UNSPECIFIED);  // Checked port.len <= 0 already.
367   DCHECK_NE(port, url::PORT_INVALID);      // Checked is_valid_ already.
368 
369   return static_cast<uint16_t>(port);
370 }
371 
372 // TODO(csharrison): Migrate pass() and user() to return a StringView. Most
373 // consumers just need to know if the string is empty.
374 
Pass() const375 String KURL::Pass() const {
376   // Bug: https://bugs.webkit.org/show_bug.cgi?id=21015 this function returns
377   // a null string when the password is empty, which we duplicate here.
378   if (!parsed_.password.is_nonempty())
379     return String();
380   return ComponentString(parsed_.password);
381 }
382 
User() const383 String KURL::User() const {
384   return ComponentString(parsed_.username);
385 }
386 
FragmentIdentifier() const387 String KURL::FragmentIdentifier() const {
388   // Empty but present refs ("foo.com/bar#") should result in the empty
389   // string, which componentString will produce. Nonexistent refs
390   // should be the null string.
391   if (!parsed_.ref.is_valid())
392     return String();
393   return ComponentString(parsed_.ref);
394 }
395 
HasFragmentIdentifier() const396 bool KURL::HasFragmentIdentifier() const {
397   return parsed_.ref.len >= 0;
398 }
399 
BaseAsString() const400 String KURL::BaseAsString() const {
401   // FIXME: There is probably a more efficient way to do this?
402   return string_.Left(PathAfterLastSlash());
403 }
404 
Query() const405 String KURL::Query() const {
406   if (parsed_.query.len >= 0)
407     return ComponentString(parsed_.query);
408 
409   // Bug: https://bugs.webkit.org/show_bug.cgi?id=21015 this function returns
410   // an empty string when the query is empty rather than a null (not sure
411   // which is right).
412   // Returns a null if the query is not specified, instead of empty.
413   if (parsed_.query.is_valid())
414     return g_empty_string;
415   return String();
416 }
417 
GetPath() const418 String KURL::GetPath() const {
419   return ComponentString(parsed_.path);
420 }
421 
SetProtocol(const String & protocol)422 bool KURL::SetProtocol(const String& protocol) {
423   // Firefox and IE remove everything after the first ':'.
424   wtf_size_t separator_position = protocol.find(':');
425   String new_protocol = protocol.Substring(0, separator_position);
426   StringUTF8Adaptor new_protocol_utf8(new_protocol);
427 
428   // If KURL is given an invalid scheme, it returns failure without modifying
429   // the URL at all. This is in contrast to most other setters which modify
430   // the URL and set "m_isValid."
431   url::RawCanonOutputT<char> canon_protocol;
432   url::Component protocol_component;
433   if (!url::CanonicalizeScheme(new_protocol_utf8.data(),
434                                url::Component(0, new_protocol_utf8.size()),
435                                &canon_protocol, &protocol_component) ||
436       !protocol_component.is_nonempty())
437     return false;
438 
439   url::Replacements<char> replacements;
440   replacements.SetScheme(CharactersOrEmpty(new_protocol_utf8),
441                          url::Component(0, new_protocol_utf8.size()));
442   ReplaceComponents(replacements);
443 
444   // isValid could be false but we still return true here. This is because
445   // WebCore or JS scripts can build up a URL by setting individual
446   // components, and a JS exception is based on the return value of this
447   // function. We want to throw the exception and stop the script only when
448   // its trying to set a bad protocol, and not when it maybe just hasn't
449   // finished building up its final scheme.
450   return true;
451 }
452 
SetHost(const String & host)453 void KURL::SetHost(const String& host) {
454   StringUTF8Adaptor host_utf8(host);
455   url::Replacements<char> replacements;
456   replacements.SetHost(CharactersOrEmpty(host_utf8),
457                        url::Component(0, host_utf8.size()));
458   ReplaceComponents(replacements);
459 }
460 
// Extracts a port number, as a string, from |value| starting at |port_start|.
//
// The port is the run of ASCII digits beginning at |port_start|; anything
// after it is ignored. Leading zeros are stripped but one digit is always
// kept, so "008080junk" yields "8080" and "000" yields "0". When there is no
// digit at |port_start| at all, "0" is returned.
static String ParsePortFromStringPosition(const String& value,
                                          unsigned port_start) {
  const unsigned length = value.length();

  // Find the end of the digit run. The bounds check comes first so the string
  // is never indexed past its end.
  unsigned port_end = port_start;
  while (port_end < length && IsASCIIDigit(value[port_end]))
    ++port_end;

  // Strip leading zeros while at least two digits remain. Written as
  // |port_start + 1 < port_end| so the comparison cannot wrap around when
  // |port_end| is 0.
  while (port_start + 1 < port_end && value[port_start] == '0')
    ++port_start;

  // Required for backwards compat.
  // https://www.w3.org/Bugs/Public/show_bug.cgi?id=23463
  if (port_start == port_end)
    return "0";

  return value.Substring(port_start, port_end - port_start);
}
478 
SetHostAndPort(const String & host_and_port)479 void KURL::SetHostAndPort(const String& host_and_port) {
480   // This method intentionally does very sloppy parsing for backwards
481   // compatibility. See https://url.spec.whatwg.org/#host-state for what we
482   // theoretically should be doing.
483 
484   // This logic for handling IPv6 addresses is adapted from ParseServerInfo in
485   // //url/third_party/mozilla/url_parse.cc. There's a slight behaviour
486   // difference for compatibility with the tests: the first colon after the
487   // address is considered to start the port, instead of the last.
488   wtf_size_t ipv6_terminator = host_and_port.ReverseFind(']');
489   if (ipv6_terminator == kNotFound) {
490     ipv6_terminator =
491         host_and_port.StartsWith('[') ? host_and_port.length() : 0;
492   }
493 
494   wtf_size_t colon = host_and_port.find(':', ipv6_terminator);
495 
496   if (colon == 0)
497     return;
498 
499   if (colon == kNotFound) {
500     // |host_and_port| does not include a port, so only overwrite the host.
501     url::Replacements<char> replacements;
502     StringUTF8Adaptor host_utf8(host_and_port);
503     replacements.SetHost(CharactersOrEmpty(host_utf8),
504                          url::Component(0, host_utf8.size()));
505     ReplaceComponents(replacements);
506     return;
507   }
508 
509   String host = host_and_port.Substring(0, colon);
510   String port = ParsePortFromStringPosition(host_and_port, colon + 1);
511 
512   StringUTF8Adaptor host_utf8(host);
513   StringUTF8Adaptor port_utf8(port);
514 
515   url::Replacements<char> replacements;
516   replacements.SetHost(CharactersOrEmpty(host_utf8),
517                        url::Component(0, host_utf8.size()));
518   replacements.SetPort(CharactersOrEmpty(port_utf8),
519                        url::Component(0, port_utf8.size()));
520   ReplaceComponents(replacements);
521 }
522 
RemovePort()523 void KURL::RemovePort() {
524   if (!HasPort())
525     return;
526   url::Replacements<char> replacements;
527   replacements.ClearPort();
528   ReplaceComponents(replacements);
529 }
530 
SetPort(const String & port)531 void KURL::SetPort(const String& port) {
532   String parsed_port = ParsePortFromStringPosition(port, 0);
533   SetPort(parsed_port.ToUInt());
534 }
535 
SetPort(uint16_t port)536 void KURL::SetPort(uint16_t port) {
537   if (IsDefaultPortForProtocol(port, Protocol())) {
538     RemovePort();
539     return;
540   }
541 
542   String port_string = String::Number(port);
543   DCHECK(port_string.Is8Bit());
544 
545   url::Replacements<char> replacements;
546   replacements.SetPort(reinterpret_cast<const char*>(port_string.Characters8()),
547                        url::Component(0, port_string.length()));
548   ReplaceComponents(replacements);
549 }
550 
SetUser(const String & user)551 void KURL::SetUser(const String& user) {
552   // This function is commonly called to clear the username, which we
553   // normally don't have, so we optimize this case.
554   if (user.IsEmpty() && !parsed_.username.is_valid())
555     return;
556 
557   // The canonicalizer will clear any usernames that are empty, so we
558   // don't have to explicitly call ClearUsername() here.
559   StringUTF8Adaptor user_utf8(user);
560   url::Replacements<char> replacements;
561   replacements.SetUsername(CharactersOrEmpty(user_utf8),
562                            url::Component(0, user_utf8.size()));
563   ReplaceComponents(replacements);
564 }
565 
SetPass(const String & pass)566 void KURL::SetPass(const String& pass) {
567   // This function is commonly called to clear the password, which we
568   // normally don't have, so we optimize this case.
569   if (pass.IsEmpty() && !parsed_.password.is_valid())
570     return;
571 
572   // The canonicalizer will clear any passwords that are empty, so we
573   // don't have to explicitly call ClearUsername() here.
574   StringUTF8Adaptor pass_utf8(pass);
575   url::Replacements<char> replacements;
576   replacements.SetPassword(CharactersOrEmpty(pass_utf8),
577                            url::Component(0, pass_utf8.size()));
578   ReplaceComponents(replacements);
579 }
580 
SetFragmentIdentifier(const String & fragment)581 void KURL::SetFragmentIdentifier(const String& fragment) {
582   // This function is commonly called to clear the ref, which we
583   // normally don't have, so we optimize this case.
584   if (fragment.IsNull() && !parsed_.ref.is_valid())
585     return;
586 
587   StringUTF8Adaptor fragment_utf8(fragment);
588 
589   url::Replacements<char> replacements;
590   if (fragment.IsNull()) {
591     replacements.ClearRef();
592   } else {
593     replacements.SetRef(CharactersOrEmpty(fragment_utf8),
594                         url::Component(0, fragment_utf8.size()));
595   }
596   ReplaceComponents(replacements);
597 }
598 
RemoveFragmentIdentifier()599 void KURL::RemoveFragmentIdentifier() {
600   url::Replacements<char> replacements;
601   replacements.ClearRef();
602   ReplaceComponents(replacements);
603 }
604 
SetQuery(const String & query)605 void KURL::SetQuery(const String& query) {
606   StringUTF8Adaptor query_utf8(query);
607   url::Replacements<char> replacements;
608   if (query.IsNull()) {
609     // KURL.cpp sets to null to clear any query.
610     replacements.ClearQuery();
611   } else if (query.length() > 0 && query[0] == '?') {
612     // WebCore expects the query string to begin with a question mark, but
613     // GoogleURL doesn't. So we trim off the question mark when setting.
614     replacements.SetQuery(CharactersOrEmpty(query_utf8),
615                           url::Component(1, query_utf8.size() - 1));
616   } else {
617     // When set with the empty string or something that doesn't begin with
618     // a question mark, KURL.cpp will add a question mark for you. The only
619     // way this isn't compatible is if you call this function with an empty
620     // string. KURL.cpp will leave a '?' with nothing following it in the
621     // URL, whereas we'll clear it.
622     // FIXME We should eliminate this difference.
623     replacements.SetQuery(CharactersOrEmpty(query_utf8),
624                           url::Component(0, query_utf8.size()));
625   }
626   ReplaceComponents(replacements);
627 }
628 
SetPath(const String & path)629 void KURL::SetPath(const String& path) {
630   // Empty paths will be canonicalized to "/", so we don't have to worry
631   // about calling ClearPath().
632   StringUTF8Adaptor path_utf8(path);
633   url::Replacements<char> replacements;
634   replacements.SetPath(CharactersOrEmpty(path_utf8),
635                        url::Component(0, path_utf8.size()));
636   ReplaceComponents(replacements);
637 }
638 
// Decodes the URL escape sequences in |string| according to |mode| and
// returns the result, stored as an 8-bit string when possible.
String DecodeURLEscapeSequences(const String& string, DecodeURLMode mode) {
  StringUTF8Adaptor utf8_input(string);

  url::RawCanonOutputT<base::char16> decoded;
  url::DecodeURLEscapeSequences(utf8_input.data(), utf8_input.size(), mode,
                                &decoded);

  UChar* const decoded_chars = reinterpret_cast<UChar*>(decoded.data());
  return StringImpl::Create8BitIfPossible(decoded_chars, decoded.length());
}
647 
// Encodes |not_encoded_string| as UTF-8 and escapes it with
// url::EncodeURIComponent, then turns every "%2F" back into '/'.
String EncodeWithURLEscapeSequences(const String& not_encoded_string) {
  const std::string utf8_bytes =
      UTF8Encoding().Encode(not_encoded_string, WTF::kNoUnencodables);
  const int input_length = utf8_bytes.length();

  // Make room for three output characters per input byte up front.
  const int reserved_length = input_length * 3;
  url::RawCanonOutputT<char> escaped_buffer;
  if (escaped_buffer.capacity() < reserved_length)
    escaped_buffer.Resize(reserved_length);

  url::EncodeURIComponent(utf8_bytes.c_str(), input_length, &escaped_buffer);
  String result(escaped_buffer.data(),
                static_cast<unsigned>(escaped_buffer.length()));
  // Unescape '/'; it's safe and much prettier.
  result.Replace("%2F", "/");
  return result;
}
663 
IsHierarchical() const664 bool KURL::IsHierarchical() const {
665   if (string_.IsNull() || !parsed_.scheme.is_nonempty())
666     return false;
667   return string_.Is8Bit()
668              ? url::IsStandard(AsURLChar8Subtle(string_), parsed_.scheme)
669              : url::IsStandard(string_.Characters16(), parsed_.scheme);
670 }
671 
EqualIgnoringFragmentIdentifier(const KURL & a,const KURL & b)672 bool EqualIgnoringFragmentIdentifier(const KURL& a, const KURL& b) {
673   // Compute the length of each URL without its ref. Note that the reference
674   // begin (if it exists) points to the character *after* the '#', so we need
675   // to subtract one.
676   int a_length = a.string_.length();
677   if (a.parsed_.ref.len >= 0)
678     a_length = a.parsed_.ref.begin - 1;
679 
680   int b_length = b.string_.length();
681   if (b.parsed_.ref.len >= 0)
682     b_length = b.parsed_.ref.begin - 1;
683 
684   if (a_length != b_length)
685     return false;
686 
687   const String& a_string = a.string_;
688   const String& b_string = b.string_;
689   // FIXME: Abstraction this into a function in WTFString.h.
690   for (int i = 0; i < a_length; ++i) {
691     if (a_string[i] != b_string[i])
692       return false;
693   }
694   return true;
695 }
696 
HostStart() const697 unsigned KURL::HostStart() const {
698   return parsed_.CountCharactersBefore(url::Parsed::HOST, false);
699 }
700 
HostEnd() const701 unsigned KURL::HostEnd() const {
702   return parsed_.CountCharactersBefore(url::Parsed::PORT, true);
703 }
704 
PathStart() const705 unsigned KURL::PathStart() const {
706   return parsed_.CountCharactersBefore(url::Parsed::PATH, false);
707 }
708 
PathEnd() const709 unsigned KURL::PathEnd() const {
710   return parsed_.CountCharactersBefore(url::Parsed::QUERY, true);
711 }
712 
PathAfterLastSlash() const713 unsigned KURL::PathAfterLastSlash() const {
714   if (string_.IsNull())
715     return 0;
716   if (!is_valid_ || !parsed_.path.is_valid())
717     return parsed_.CountCharactersBefore(url::Parsed::PATH, false);
718   url::Component filename;
719   if (string_.Is8Bit())
720     url::ExtractFileName(AsURLChar8Subtle(string_), parsed_.path, &filename);
721   else
722     url::ExtractFileName(string_.Characters16(), parsed_.path, &filename);
723   return filename.begin;
724 }
725 
ProtocolIs(const String & url,const char * protocol)726 bool ProtocolIs(const String& url, const char* protocol) {
727 #if DCHECK_IS_ON()
728   AssertProtocolIsGood(protocol);
729 #endif
730   if (url.IsNull())
731     return false;
732   if (url.Is8Bit()) {
733     return url::FindAndCompareScheme(AsURLChar8Subtle(url), url.length(),
734                                      protocol, nullptr);
735   }
736   return url::FindAndCompareScheme(url.Characters16(), url.length(), protocol,
737                                    nullptr);
738 }
739 
Init(const KURL & base,const String & relative,const WTF::TextEncoding * query_encoding)740 void KURL::Init(const KURL& base,
741                 const String& relative,
742                 const WTF::TextEncoding* query_encoding) {
743   // As a performance optimization, we do not use the charset converter
744   // if encoding is UTF-8 or other Unicode encodings. Note that this is
745   // per HTML5 2.5.3 (resolving URL). The URL canonicalizer will be more
746   // efficient with no charset converter object because it can do UTF-8
747   // internally with no extra copies.
748 
749   StringUTF8Adaptor base_utf8(base.GetString());
750 
751   // We feel free to make the charset converter object every time since it's
752   // just a wrapper around a reference.
753   KURLCharsetConverter charset_converter_object(query_encoding);
754   KURLCharsetConverter* charset_converter =
755       (!query_encoding || IsUnicodeEncoding(query_encoding))
756           ? nullptr
757           : &charset_converter_object;
758 
759   // Clamp to int max to avoid overflow.
760   url::RawCanonOutputT<char> output;
761   if (!relative.IsNull() && relative.Is8Bit()) {
762     StringUTF8Adaptor relative_utf8(relative);
763     is_valid_ = url::ResolveRelative(base_utf8.data(), base_utf8.size(),
764                                      base.parsed_, relative_utf8.data(),
765                                      clampTo<int>(relative_utf8.size()),
766                                      charset_converter, &output, &parsed_);
767   } else {
768     is_valid_ = url::ResolveRelative(base_utf8.data(), base_utf8.size(),
769                                      base.parsed_, relative.Characters16(),
770                                      clampTo<int>(relative.length()),
771                                      charset_converter, &output, &parsed_);
772   }
773 
774   // AtomicString::fromUTF8 will re-hash the raw output and check the
775   // AtomicStringTable (addWithTranslator) for the string. This can be very
776   // expensive for large URLs. However, since many URLs are generated from
777   // existing AtomicStrings (which already have their hashes computed), this
778   // fast path is used if the input string is already canonicalized.
779   //
780   // Because this optimization does not apply to non-AtomicStrings, explicitly
781   // check that the input is Atomic before moving forward with it. If we mark
782   // non-Atomic input as Atomic here, we will render the (const) input string
783   // thread unsafe.
784   if (!relative.IsNull() && relative.Impl()->IsAtomic() &&
785       StringView(output.data(), static_cast<unsigned>(output.length())) ==
786           relative) {
787     string_ = relative;
788   } else {
789     string_ = AtomicString::FromUTF8(output.data(), output.length());
790   }
791 
792   InitProtocolMetadata();
793   InitInnerURL();
794   DCHECK(!::blink::ProtocolIsJavaScript(string_) || ProtocolIsJavaScript());
795 }
796 
InitInnerURL()797 void KURL::InitInnerURL() {
798   if (!is_valid_) {
799     inner_url_.reset();
800     return;
801   }
802   if (url::Parsed* inner_parsed = parsed_.inner_parsed()) {
803     inner_url_ = std::make_unique<KURL>(
804         string_.Substring(inner_parsed->scheme.begin,
805                           inner_parsed->Length() - inner_parsed->scheme.begin));
806   } else {
807     inner_url_.reset();
808   }
809 }
810 
InitProtocolMetadata()811 void KURL::InitProtocolMetadata() {
812   if (!is_valid_) {
813     protocol_is_in_http_family_ = false;
814     protocol_ = ComponentString(parsed_.scheme);
815     return;
816   }
817 
818   DCHECK(!string_.IsNull());
819   StringView protocol = ComponentStringView(parsed_.scheme);
820   protocol_is_in_http_family_ = true;
821   if (protocol == WTF::g_https_atom) {
822     protocol_ = WTF::g_https_atom;
823   } else if (protocol == WTF::g_http_atom) {
824     protocol_ = WTF::g_http_atom;
825   } else {
826     protocol_ = protocol.ToAtomicString();
827     protocol_is_in_http_family_ = false;
828   }
829   DCHECK_EQ(protocol_, protocol_.DeprecatedLower());
830 }
831 
ProtocolIs(const StringView protocol) const832 bool KURL::ProtocolIs(const StringView protocol) const {
833 #if DCHECK_IS_ON()
834   AssertProtocolIsGood(protocol);
835 #endif
836 
837   // JavaScript URLs are "valid" and should be executed even if KURL decides
838   // they are invalid.  The free function protocolIsJavaScript() should be used
839   // instead.
840   // FIXME: Chromium code needs to be fixed for this assert to be enabled.
841   // DCHECK(strcmp(protocol, "javascript"));
842   return protocol_ == protocol;
843 }
844 
StringViewForInvalidComponent() const845 StringView KURL::StringViewForInvalidComponent() const {
846   return string_.IsNull() ? StringView() : StringView(StringImpl::empty_);
847 }
848 
ComponentStringView(const url::Component & component) const849 StringView KURL::ComponentStringView(const url::Component& component) const {
850   if (!is_valid_ || component.len <= 0)
851     return StringViewForInvalidComponent();
852   // begin and len are in terms of bytes which do not match
853   // if string() is UTF-16 and input contains non-ASCII characters.
854   // However, the only part in urlString that can contain non-ASCII
855   // characters is 'ref' at the end of the string. In that case,
856   // begin will always match the actual value and len (in terms of
857   // byte) will be longer than what's needed by 'mid'. However, mid
858   // truncates len to avoid go past the end of a string so that we can
859   // get away without doing anything here.
860 
861   int max_length = GetString().length() - component.begin;
862   return StringView(GetString(), component.begin,
863                     component.len > max_length ? max_length : component.len);
864 }
865 
ComponentString(const url::Component & component) const866 String KURL::ComponentString(const url::Component& component) const {
867   return ComponentStringView(component).ToString();
868 }
869 
870 template <typename CHAR>
ReplaceComponents(const url::Replacements<CHAR> & replacements)871 void KURL::ReplaceComponents(const url::Replacements<CHAR>& replacements) {
872   url::RawCanonOutputT<char> output;
873   url::Parsed new_parsed;
874 
875   StringUTF8Adaptor utf8(string_);
876   is_valid_ =
877       url::ReplaceComponents(utf8.data(), utf8.size(), parsed_, replacements,
878                              nullptr, &output, &new_parsed);
879 
880   parsed_ = new_parsed;
881   string_ = AtomicString::FromUTF8(output.data(), output.length());
882   InitProtocolMetadata();
883 }
884 
IsSafeToSendToAnotherThread() const885 bool KURL::IsSafeToSendToAnotherThread() const {
886   return string_.IsSafeToSendToAnotherThread() &&
887          (!inner_url_ || inner_url_->IsSafeToSendToAnotherThread());
888 }
889 
operator GURL() const890 KURL::operator GURL() const {
891   StringUTF8Adaptor utf8(string_);
892   return GURL(utf8.data(), utf8.size(), parsed_, is_valid_);
893 }
operator ==(const KURL & a,const KURL & b)894 bool operator==(const KURL& a, const KURL& b) {
895   return a.GetString() == b.GetString();
896 }
897 
operator ==(const KURL & a,const String & b)898 bool operator==(const KURL& a, const String& b) {
899   return a.GetString() == b;
900 }
901 
operator ==(const String & a,const KURL & b)902 bool operator==(const String& a, const KURL& b) {
903   return a == b.GetString();
904 }
905 
operator !=(const KURL & a,const KURL & b)906 bool operator!=(const KURL& a, const KURL& b) {
907   return a.GetString() != b.GetString();
908 }
909 
operator !=(const KURL & a,const String & b)910 bool operator!=(const KURL& a, const String& b) {
911   return a.GetString() != b;
912 }
913 
operator !=(const String & a,const KURL & b)914 bool operator!=(const String& a, const KURL& b) {
915   return a != b.GetString();
916 }
917 
operator <<(std::ostream & os,const KURL & url)918 std::ostream& operator<<(std::ostream& os, const KURL& url) {
919   return os << url.GetString();
920 }
921 
922 }  // namespace blink
923