1 /*
2 * Copyright (C) 2004, 2007, 2008, 2011, 2012 Apple Inc. All rights reserved.
3 * Copyright (C) 2012 Research In Motion Limited. All rights reserved.
4 * Copyright (C) 2008, 2009, 2011 Google Inc. All rights reserved.
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
8 * are met:
9 * 1. Redistributions of source code must retain the above copyright
10 * notice, this list of conditions and the following disclaimer.
11 * 2. Redistributions in binary form must reproduce the above copyright
12 * notice, this list of conditions and the following disclaimer in the
13 * documentation and/or other materials provided with the distribution.
14 *
15 * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
16 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
18 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE COMPUTER, INC. OR
19 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
23 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 */
27
28 #include "third_party/blink/renderer/platform/weborigin/kurl.h"
29
30 #include <algorithm>
31
32 #include "third_party/blink/renderer/platform/weborigin/known_ports.h"
33 #include "third_party/blink/renderer/platform/wtf/math_extras.h"
34 #include "third_party/blink/renderer/platform/wtf/std_lib_extras.h"
35 #include "third_party/blink/renderer/platform/wtf/text/string_hash.h"
36 #include "third_party/blink/renderer/platform/wtf/text/string_statics.h"
37 #include "third_party/blink/renderer/platform/wtf/text/string_utf8_adaptor.h"
38 #include "third_party/blink/renderer/platform/wtf/text/text_encoding.h"
39 #include "third_party/blink/renderer/platform/wtf/thread_specific.h"
40 #include "url/gurl.h"
41 #include "url/url_util.h"
42 #ifndef NDEBUG
43 #include <stdio.h>
44 #endif
45
46 namespace blink {
47
48 #if DCHECK_IS_ON()
AssertProtocolIsGood(const StringView protocol)49 static void AssertProtocolIsGood(const StringView protocol) {
50 DCHECK(protocol != "");
51 for (size_t i = 0; i < protocol.length(); ++i) {
52 LChar c = protocol.Characters8()[i];
53 DCHECK(c > ' ' && c < 0x7F && !(c >= 'A' && c <= 'Z'));
54 }
55 }
56 #endif
57
58 // Note: You must ensure that |spec| is a valid canonicalized URL before calling
59 // this function.
AsURLChar8Subtle(const String & spec)60 static const char* AsURLChar8Subtle(const String& spec) {
61 DCHECK(spec.Is8Bit());
62 // characters8 really return characters in Latin-1, but because we
63 // canonicalize URL strings, we know that everything before the fragment
64 // identifier will actually be ASCII, which means this cast is safe as long as
65 // you don't look at the fragment component.
66 return reinterpret_cast<const char*>(spec.Characters8());
67 }
68
69 // Returns the characters for the given string, or a pointer to a static empty
70 // string if the input string is null. This will always ensure we have a non-
71 // null character pointer since ReplaceComponents has special meaning for null.
CharactersOrEmpty(const StringUTF8Adaptor & string)72 static const char* CharactersOrEmpty(const StringUTF8Adaptor& string) {
73 static const char kZero = 0;
74 return string.data() ? string.data() : &kZero;
75 }
76
// Returns true if |c| is an ASCII letter (a-z or A-Z), i.e. a character that
// is accepted as the first character of a scheme.
static bool IsSchemeFirstChar(char c) {
  const bool is_lowercase = 'a' <= c && c <= 'z';
  const bool is_uppercase = 'A' <= c && c <= 'Z';
  return is_lowercase || is_uppercase;
}
80
IsSchemeChar(char c)81 static bool IsSchemeChar(char c) {
82 return IsSchemeFirstChar(c) || (c >= '0' && c <= '9') || c == '.' ||
83 c == '-' || c == '+';
84 }
85
IsUnicodeEncoding(const WTF::TextEncoding * encoding)86 static bool IsUnicodeEncoding(const WTF::TextEncoding* encoding) {
87 return encoding->EncodingForFormSubmission() == UTF8Encoding();
88 }
89
90 namespace {
91
92 class KURLCharsetConverter final : public url::CharsetConverter {
93 DISALLOW_NEW();
94
95 public:
96 // The encoding parameter may be 0, but in this case the object must not be
97 // called.
KURLCharsetConverter(const WTF::TextEncoding * encoding)98 explicit KURLCharsetConverter(const WTF::TextEncoding* encoding)
99 : encoding_(encoding) {}
100
ConvertFromUTF16(const base::char16 * input,int input_length,url::CanonOutput * output)101 void ConvertFromUTF16(const base::char16* input,
102 int input_length,
103 url::CanonOutput* output) override {
104 std::string encoded = encoding_->Encode(
105 String(input, input_length), WTF::kURLEncodedEntitiesForUnencodables);
106 output->Append(encoded.c_str(), static_cast<int>(encoded.length()));
107 }
108
109 private:
110 const WTF::TextEncoding* encoding_;
111 };
112
113 } // namespace
114
IsValidProtocol(const String & protocol)115 bool IsValidProtocol(const String& protocol) {
116 // RFC3986: ALPHA *( ALPHA / DIGIT / "+" / "-" / "." )
117 if (protocol.IsEmpty())
118 return false;
119 if (!IsSchemeFirstChar(protocol[0]))
120 return false;
121 unsigned protocol_length = protocol.length();
122 for (unsigned i = 1; i < protocol_length; i++) {
123 if (!IsSchemeChar(protocol[i]))
124 return false;
125 }
126 return true;
127 }
128
StrippedForUseAsReferrer() const129 String KURL::StrippedForUseAsReferrer() const {
130 if (!ProtocolIsInHTTPFamily())
131 return String();
132
133 if (parsed_.username.is_nonempty() || parsed_.password.is_nonempty() ||
134 parsed_.ref.is_valid()) {
135 KURL referrer(*this);
136 referrer.SetUser(String());
137 referrer.SetPass(String());
138 referrer.RemoveFragmentIdentifier();
139 return referrer.GetString();
140 }
141 return GetString();
142 }
143
StrippedForUseAsHref() const144 String KURL::StrippedForUseAsHref() const {
145 if (parsed_.username.is_nonempty() || parsed_.password.is_nonempty()) {
146 KURL href(*this);
147 href.SetUser(String());
148 href.SetPass(String());
149 return href.GetString();
150 }
151 return GetString();
152 }
153
IsLocalFile() const154 bool KURL::IsLocalFile() const {
155 // Including feed here might be a bad idea since drag and drop uses this check
156 // and including feed would allow feeds to potentially let someone's blog
157 // read the contents of the clipboard on a drag, even without a drop.
158 // Likewise with using the FrameLoader::shouldTreatURLAsLocal() function.
159 return ProtocolIs("file");
160 }
161
ProtocolIsJavaScript(const String & url)162 bool ProtocolIsJavaScript(const String& url) {
163 return ProtocolIs(url, "javascript");
164 }
165
BlankURL()166 const KURL& BlankURL() {
167 DEFINE_THREAD_SAFE_STATIC_LOCAL(ThreadSpecific<KURL>, static_blank_url, ());
168 KURL& blank_url = *static_blank_url;
169 if (blank_url.IsNull())
170 blank_url = KURL(AtomicString("about:blank"));
171 return blank_url;
172 }
173
IsAboutBlankURL() const174 bool KURL::IsAboutBlankURL() const {
175 return *this == BlankURL();
176 }
177
SrcdocURL()178 const KURL& SrcdocURL() {
179 DEFINE_THREAD_SAFE_STATIC_LOCAL(ThreadSpecific<KURL>, static_srcdoc_url, ());
180 KURL& srcdoc_url = *static_srcdoc_url;
181 if (srcdoc_url.IsNull())
182 srcdoc_url = KURL(AtomicString("about:srcdoc"));
183 return srcdoc_url;
184 }
185
IsAboutSrcdocURL() const186 bool KURL::IsAboutSrcdocURL() const {
187 return *this == SrcdocURL();
188 }
189
NullURL()190 const KURL& NullURL() {
191 DEFINE_THREAD_SAFE_STATIC_LOCAL(ThreadSpecific<KURL>, static_null_url, ());
192 return *static_null_url;
193 }
194
ElidedString() const195 String KURL::ElidedString() const {
196 if (GetString().length() <= 1024)
197 return GetString();
198
199 return GetString().Left(511) + "..." + GetString().Right(510);
200 }
201
KURL()202 KURL::KURL() : is_valid_(false), protocol_is_in_http_family_(false) {}
203
204 // Initializes with a string representing an absolute URL. No encoding
205 // information is specified. This generally happens when a KURL is converted
206 // to a string and then converted back. In this case, the URL is already
207 // canonical and in proper escaped form so needs no encoding. We treat it as
208 // UTF-8 just in case.
KURL(const String & url)209 KURL::KURL(const String& url) {
210 if (!url.IsNull())
211 Init(NullURL(), url, nullptr);
212 else {
213 // WebCore expects us to preserve the nullness of strings when this
214 // constructor is used. In all other cases, it expects a non-null
215 // empty string, which is what Init() will create.
216 is_valid_ = false;
217 protocol_is_in_http_family_ = false;
218 }
219 }
220
221 // Initializes with a GURL. This is used to covert from a GURL to a KURL.
KURL(const GURL & gurl)222 KURL::KURL(const GURL& gurl) {
223 Init(NullURL() /* base */, String(gurl.spec().c_str()) /* relative */,
224 nullptr /* query_encoding */);
225 }
226
CreateIsolated(const String & url)227 KURL KURL::CreateIsolated(const String& url) {
228 // FIXME: We should be able to skip this extra copy and created an
229 // isolated KURL more efficiently.
230 return KURL(url).Copy();
231 }
232
233 // Constructs a new URL given a base URL and a possibly relative input URL.
234 // This assumes UTF-8 encoding.
KURL(const KURL & base,const String & relative)235 KURL::KURL(const KURL& base, const String& relative) {
236 Init(base, relative, nullptr);
237 }
238
239 // Constructs a new URL given a base URL and a possibly relative input URL.
240 // Any query portion of the relative URL will be encoded in the given encoding.
KURL(const KURL & base,const String & relative,const WTF::TextEncoding & encoding)241 KURL::KURL(const KURL& base,
242 const String& relative,
243 const WTF::TextEncoding& encoding) {
244 Init(base, relative, &encoding.EncodingForFormSubmission());
245 }
246
KURL(const AtomicString & canonical_string,const url::Parsed & parsed,bool is_valid)247 KURL::KURL(const AtomicString& canonical_string,
248 const url::Parsed& parsed,
249 bool is_valid)
250 : is_valid_(is_valid),
251 protocol_is_in_http_family_(false),
252 parsed_(parsed),
253 string_(canonical_string) {
254 InitProtocolMetadata();
255 InitInnerURL();
256 }
257
// Copy constructor. All value members are copied from |other|. When |other|
// has an inner URL, a new KURL is allocated from inner_url_->Copy() so the two
// objects do not share one.
KURL::KURL(const KURL& other)
    : is_valid_(other.is_valid_),
      protocol_is_in_http_family_(other.protocol_is_in_http_family_),
      protocol_(other.protocol_),
      parsed_(other.parsed_),
      string_(other.string_) {
  if (!other.inner_url_)
    return;
  inner_url_ = std::make_unique<KURL>(other.inner_url_->Copy());
}

// The destructor is compiler-generated.
KURL::~KURL() = default;
269
// Copy assignment. All value members are copied from |other|. The inner URL is
// replaced by a fresh copy of |other|'s inner URL, or reset when |other| has
// none.
KURL& KURL::operator=(const KURL& other) {
  is_valid_ = other.is_valid_;
  protocol_is_in_http_family_ = other.protocol_is_in_http_family_;
  protocol_ = other.protocol_;
  parsed_ = other.parsed_;
  string_ = other.string_;

  if (!other.inner_url_) {
    inner_url_.reset();
    return *this;
  }
  inner_url_ = std::make_unique<KURL>(other.inner_url_->Copy());
  return *this;
}
282
Copy() const283 KURL KURL::Copy() const {
284 KURL result;
285 result.is_valid_ = is_valid_;
286 result.protocol_is_in_http_family_ = protocol_is_in_http_family_;
287 result.protocol_ = protocol_.IsolatedCopy();
288 result.parsed_ = parsed_;
289 result.string_ = string_.IsolatedCopy();
290 if (inner_url_)
291 result.inner_url_ = std::make_unique<KURL>(inner_url_->Copy());
292 return result;
293 }
294
IsNull() const295 bool KURL::IsNull() const {
296 return string_.IsNull();
297 }
298
IsEmpty() const299 bool KURL::IsEmpty() const {
300 return string_.IsEmpty();
301 }
302
IsValid() const303 bool KURL::IsValid() const {
304 return is_valid_;
305 }
306
HasPort() const307 bool KURL::HasPort() const {
308 return HostEnd() < PathStart();
309 }
310
ProtocolIsJavaScript() const311 bool KURL::ProtocolIsJavaScript() const {
312 return ComponentStringView(parsed_.scheme) == "javascript";
313 }
314
ProtocolIsInHTTPFamily() const315 bool KURL::ProtocolIsInHTTPFamily() const {
316 return protocol_is_in_http_family_;
317 }
318
HasPath() const319 bool KURL::HasPath() const {
320 // Note that http://www.google.com/" has a path, the path is "/". This can
321 // return false only for invalid or nonstandard URLs.
322 return parsed_.path.len >= 0;
323 }
324
LastPathComponent() const325 String KURL::LastPathComponent() const {
326 if (!is_valid_)
327 return StringViewForInvalidComponent().ToString();
328 DCHECK(!string_.IsNull());
329
330 // When the output ends in a slash, WebCore has different expectations than
331 // the GoogleURL library. For "/foo/bar/" the library will return the empty
332 // string, but WebCore wants "bar".
333 url::Component path = parsed_.path;
334 if (path.len > 0 && string_[path.end() - 1] == '/')
335 path.len--;
336
337 url::Component file;
338 if (string_.Is8Bit())
339 url::ExtractFileName(AsURLChar8Subtle(string_), path, &file);
340 else
341 url::ExtractFileName(string_.Characters16(), path, &file);
342
343 // Bug: https://bugs.webkit.org/show_bug.cgi?id=21015 this function returns
344 // a null string when the path is empty, which we duplicate here.
345 if (!file.is_nonempty())
346 return String();
347 return ComponentString(file);
348 }
349
Protocol() const350 String KURL::Protocol() const {
351 DCHECK_EQ(ComponentString(parsed_.scheme), protocol_);
352 return protocol_;
353 }
354
Host() const355 String KURL::Host() const {
356 return ComponentString(parsed_.host);
357 }
358
Port() const359 uint16_t KURL::Port() const {
360 if (!is_valid_ || parsed_.port.len <= 0)
361 return 0;
362 DCHECK(!string_.IsNull());
363 int port = string_.Is8Bit()
364 ? url::ParsePort(AsURLChar8Subtle(string_), parsed_.port)
365 : url::ParsePort(string_.Characters16(), parsed_.port);
366 DCHECK_NE(port, url::PORT_UNSPECIFIED); // Checked port.len <= 0 already.
367 DCHECK_NE(port, url::PORT_INVALID); // Checked is_valid_ already.
368
369 return static_cast<uint16_t>(port);
370 }
371
372 // TODO(csharrison): Migrate pass() and user() to return a StringView. Most
373 // consumers just need to know if the string is empty.
374
Pass() const375 String KURL::Pass() const {
376 // Bug: https://bugs.webkit.org/show_bug.cgi?id=21015 this function returns
377 // a null string when the password is empty, which we duplicate here.
378 if (!parsed_.password.is_nonempty())
379 return String();
380 return ComponentString(parsed_.password);
381 }
382
User() const383 String KURL::User() const {
384 return ComponentString(parsed_.username);
385 }
386
FragmentIdentifier() const387 String KURL::FragmentIdentifier() const {
388 // Empty but present refs ("foo.com/bar#") should result in the empty
389 // string, which componentString will produce. Nonexistent refs
390 // should be the null string.
391 if (!parsed_.ref.is_valid())
392 return String();
393 return ComponentString(parsed_.ref);
394 }
395
HasFragmentIdentifier() const396 bool KURL::HasFragmentIdentifier() const {
397 return parsed_.ref.len >= 0;
398 }
399
BaseAsString() const400 String KURL::BaseAsString() const {
401 // FIXME: There is probably a more efficient way to do this?
402 return string_.Left(PathAfterLastSlash());
403 }
404
Query() const405 String KURL::Query() const {
406 if (parsed_.query.len >= 0)
407 return ComponentString(parsed_.query);
408
409 // Bug: https://bugs.webkit.org/show_bug.cgi?id=21015 this function returns
410 // an empty string when the query is empty rather than a null (not sure
411 // which is right).
412 // Returns a null if the query is not specified, instead of empty.
413 if (parsed_.query.is_valid())
414 return g_empty_string;
415 return String();
416 }
417
GetPath() const418 String KURL::GetPath() const {
419 return ComponentString(parsed_.path);
420 }
421
SetProtocol(const String & protocol)422 bool KURL::SetProtocol(const String& protocol) {
423 // Firefox and IE remove everything after the first ':'.
424 wtf_size_t separator_position = protocol.find(':');
425 String new_protocol = protocol.Substring(0, separator_position);
426 StringUTF8Adaptor new_protocol_utf8(new_protocol);
427
428 // If KURL is given an invalid scheme, it returns failure without modifying
429 // the URL at all. This is in contrast to most other setters which modify
430 // the URL and set "m_isValid."
431 url::RawCanonOutputT<char> canon_protocol;
432 url::Component protocol_component;
433 if (!url::CanonicalizeScheme(new_protocol_utf8.data(),
434 url::Component(0, new_protocol_utf8.size()),
435 &canon_protocol, &protocol_component) ||
436 !protocol_component.is_nonempty())
437 return false;
438
439 url::Replacements<char> replacements;
440 replacements.SetScheme(CharactersOrEmpty(new_protocol_utf8),
441 url::Component(0, new_protocol_utf8.size()));
442 ReplaceComponents(replacements);
443
444 // isValid could be false but we still return true here. This is because
445 // WebCore or JS scripts can build up a URL by setting individual
446 // components, and a JS exception is based on the return value of this
447 // function. We want to throw the exception and stop the script only when
448 // its trying to set a bad protocol, and not when it maybe just hasn't
449 // finished building up its final scheme.
450 return true;
451 }
452
SetHost(const String & host)453 void KURL::SetHost(const String& host) {
454 StringUTF8Adaptor host_utf8(host);
455 url::Replacements<char> replacements;
456 replacements.SetHost(CharactersOrEmpty(host_utf8),
457 url::Component(0, host_utf8.size()));
458 ReplaceComponents(replacements);
459 }
460
// Extracts the run of ASCII digits in |value| that starts at |port_start| and
// returns it with leading zeros removed, always keeping at least one digit.
// "008080junk" is treated as port "8080" and "000" as "0". When there is no
// digit at |port_start|, "0" is returned.
static String ParsePortFromStringPosition(const String& value,
                                          unsigned port_start) {
  const unsigned length = value.length();

  // Check the bound before indexing so the string is never read at or past
  // |length|.
  unsigned port_end = port_start;
  while (port_end < length && IsASCIIDigit(value[port_end]))
    ++port_end;

  // Strip leading zeros but keep the final digit. Written as
  // |port_start + 1 < port_end| so that |port_end == 0| cannot underflow.
  while (port_start + 1 < port_end && value[port_start] == '0')
    ++port_start;

  // Required for backwards compat.
  // https://www.w3.org/Bugs/Public/show_bug.cgi?id=23463
  if (port_start == port_end)
    return "0";

  return value.Substring(port_start, port_end - port_start);
}
478
SetHostAndPort(const String & host_and_port)479 void KURL::SetHostAndPort(const String& host_and_port) {
480 // This method intentionally does very sloppy parsing for backwards
481 // compatibility. See https://url.spec.whatwg.org/#host-state for what we
482 // theoretically should be doing.
483
484 // This logic for handling IPv6 addresses is adapted from ParseServerInfo in
485 // //url/third_party/mozilla/url_parse.cc. There's a slight behaviour
486 // difference for compatibility with the tests: the first colon after the
487 // address is considered to start the port, instead of the last.
488 wtf_size_t ipv6_terminator = host_and_port.ReverseFind(']');
489 if (ipv6_terminator == kNotFound) {
490 ipv6_terminator =
491 host_and_port.StartsWith('[') ? host_and_port.length() : 0;
492 }
493
494 wtf_size_t colon = host_and_port.find(':', ipv6_terminator);
495
496 if (colon == 0)
497 return;
498
499 if (colon == kNotFound) {
500 // |host_and_port| does not include a port, so only overwrite the host.
501 url::Replacements<char> replacements;
502 StringUTF8Adaptor host_utf8(host_and_port);
503 replacements.SetHost(CharactersOrEmpty(host_utf8),
504 url::Component(0, host_utf8.size()));
505 ReplaceComponents(replacements);
506 return;
507 }
508
509 String host = host_and_port.Substring(0, colon);
510 String port = ParsePortFromStringPosition(host_and_port, colon + 1);
511
512 StringUTF8Adaptor host_utf8(host);
513 StringUTF8Adaptor port_utf8(port);
514
515 url::Replacements<char> replacements;
516 replacements.SetHost(CharactersOrEmpty(host_utf8),
517 url::Component(0, host_utf8.size()));
518 replacements.SetPort(CharactersOrEmpty(port_utf8),
519 url::Component(0, port_utf8.size()));
520 ReplaceComponents(replacements);
521 }
522
RemovePort()523 void KURL::RemovePort() {
524 if (!HasPort())
525 return;
526 url::Replacements<char> replacements;
527 replacements.ClearPort();
528 ReplaceComponents(replacements);
529 }
530
SetPort(const String & port)531 void KURL::SetPort(const String& port) {
532 String parsed_port = ParsePortFromStringPosition(port, 0);
533 SetPort(parsed_port.ToUInt());
534 }
535
SetPort(uint16_t port)536 void KURL::SetPort(uint16_t port) {
537 if (IsDefaultPortForProtocol(port, Protocol())) {
538 RemovePort();
539 return;
540 }
541
542 String port_string = String::Number(port);
543 DCHECK(port_string.Is8Bit());
544
545 url::Replacements<char> replacements;
546 replacements.SetPort(reinterpret_cast<const char*>(port_string.Characters8()),
547 url::Component(0, port_string.length()));
548 ReplaceComponents(replacements);
549 }
550
SetUser(const String & user)551 void KURL::SetUser(const String& user) {
552 // This function is commonly called to clear the username, which we
553 // normally don't have, so we optimize this case.
554 if (user.IsEmpty() && !parsed_.username.is_valid())
555 return;
556
557 // The canonicalizer will clear any usernames that are empty, so we
558 // don't have to explicitly call ClearUsername() here.
559 StringUTF8Adaptor user_utf8(user);
560 url::Replacements<char> replacements;
561 replacements.SetUsername(CharactersOrEmpty(user_utf8),
562 url::Component(0, user_utf8.size()));
563 ReplaceComponents(replacements);
564 }
565
SetPass(const String & pass)566 void KURL::SetPass(const String& pass) {
567 // This function is commonly called to clear the password, which we
568 // normally don't have, so we optimize this case.
569 if (pass.IsEmpty() && !parsed_.password.is_valid())
570 return;
571
572 // The canonicalizer will clear any passwords that are empty, so we
573 // don't have to explicitly call ClearUsername() here.
574 StringUTF8Adaptor pass_utf8(pass);
575 url::Replacements<char> replacements;
576 replacements.SetPassword(CharactersOrEmpty(pass_utf8),
577 url::Component(0, pass_utf8.size()));
578 ReplaceComponents(replacements);
579 }
580
SetFragmentIdentifier(const String & fragment)581 void KURL::SetFragmentIdentifier(const String& fragment) {
582 // This function is commonly called to clear the ref, which we
583 // normally don't have, so we optimize this case.
584 if (fragment.IsNull() && !parsed_.ref.is_valid())
585 return;
586
587 StringUTF8Adaptor fragment_utf8(fragment);
588
589 url::Replacements<char> replacements;
590 if (fragment.IsNull()) {
591 replacements.ClearRef();
592 } else {
593 replacements.SetRef(CharactersOrEmpty(fragment_utf8),
594 url::Component(0, fragment_utf8.size()));
595 }
596 ReplaceComponents(replacements);
597 }
598
RemoveFragmentIdentifier()599 void KURL::RemoveFragmentIdentifier() {
600 url::Replacements<char> replacements;
601 replacements.ClearRef();
602 ReplaceComponents(replacements);
603 }
604
SetQuery(const String & query)605 void KURL::SetQuery(const String& query) {
606 StringUTF8Adaptor query_utf8(query);
607 url::Replacements<char> replacements;
608 if (query.IsNull()) {
609 // KURL.cpp sets to null to clear any query.
610 replacements.ClearQuery();
611 } else if (query.length() > 0 && query[0] == '?') {
612 // WebCore expects the query string to begin with a question mark, but
613 // GoogleURL doesn't. So we trim off the question mark when setting.
614 replacements.SetQuery(CharactersOrEmpty(query_utf8),
615 url::Component(1, query_utf8.size() - 1));
616 } else {
617 // When set with the empty string or something that doesn't begin with
618 // a question mark, KURL.cpp will add a question mark for you. The only
619 // way this isn't compatible is if you call this function with an empty
620 // string. KURL.cpp will leave a '?' with nothing following it in the
621 // URL, whereas we'll clear it.
622 // FIXME We should eliminate this difference.
623 replacements.SetQuery(CharactersOrEmpty(query_utf8),
624 url::Component(0, query_utf8.size()));
625 }
626 ReplaceComponents(replacements);
627 }
628
SetPath(const String & path)629 void KURL::SetPath(const String& path) {
630 // Empty paths will be canonicalized to "/", so we don't have to worry
631 // about calling ClearPath().
632 StringUTF8Adaptor path_utf8(path);
633 url::Replacements<char> replacements;
634 replacements.SetPath(CharactersOrEmpty(path_utf8),
635 url::Component(0, path_utf8.size()));
636 ReplaceComponents(replacements);
637 }
638
// Converts |string| to UTF-8, decodes its URL escape sequences with
// url::DecodeURLEscapeSequences() using |mode|, and returns the UTF-16 output
// as a String built with StringImpl::Create8BitIfPossible().
String DecodeURLEscapeSequences(const String& string, DecodeURLMode mode) {
  StringUTF8Adaptor input_utf8(string);

  url::RawCanonOutputT<base::char16> decoded;
  url::DecodeURLEscapeSequences(input_utf8.data(), input_utf8.size(), mode,
                                &decoded);

  UChar* decoded_chars = reinterpret_cast<UChar*>(decoded.data());
  return StringImpl::Create8BitIfPossible(decoded_chars, decoded.length());
}
647
// Encodes |not_encoded_string| as UTF-8, percent-escapes it with
// url::EncodeURIComponent(), and then turns every "%2F" back into "/".
String EncodeWithURLEscapeSequences(const String& not_encoded_string) {
  const std::string utf8_bytes =
      UTF8Encoding().Encode(not_encoded_string, WTF::kNoUnencodables);
  const int byte_count = utf8_bytes.length();

  // Make sure the buffer holds at least three output characters per input
  // byte before encoding.
  url::RawCanonOutputT<char> escaped_buffer;
  const int wanted_capacity = byte_count * 3;
  if (escaped_buffer.capacity() < wanted_capacity)
    escaped_buffer.Resize(wanted_capacity);

  url::EncodeURIComponent(utf8_bytes.c_str(), byte_count, &escaped_buffer);

  String result(escaped_buffer.data(),
                static_cast<unsigned>(escaped_buffer.length()));
  // Unescape '/'; it's safe and much prettier.
  result.Replace("%2F", "/");
  return result;
}
663
IsHierarchical() const664 bool KURL::IsHierarchical() const {
665 if (string_.IsNull() || !parsed_.scheme.is_nonempty())
666 return false;
667 return string_.Is8Bit()
668 ? url::IsStandard(AsURLChar8Subtle(string_), parsed_.scheme)
669 : url::IsStandard(string_.Characters16(), parsed_.scheme);
670 }
671
EqualIgnoringFragmentIdentifier(const KURL & a,const KURL & b)672 bool EqualIgnoringFragmentIdentifier(const KURL& a, const KURL& b) {
673 // Compute the length of each URL without its ref. Note that the reference
674 // begin (if it exists) points to the character *after* the '#', so we need
675 // to subtract one.
676 int a_length = a.string_.length();
677 if (a.parsed_.ref.len >= 0)
678 a_length = a.parsed_.ref.begin - 1;
679
680 int b_length = b.string_.length();
681 if (b.parsed_.ref.len >= 0)
682 b_length = b.parsed_.ref.begin - 1;
683
684 if (a_length != b_length)
685 return false;
686
687 const String& a_string = a.string_;
688 const String& b_string = b.string_;
689 // FIXME: Abstraction this into a function in WTFString.h.
690 for (int i = 0; i < a_length; ++i) {
691 if (a_string[i] != b_string[i])
692 return false;
693 }
694 return true;
695 }
696
HostStart() const697 unsigned KURL::HostStart() const {
698 return parsed_.CountCharactersBefore(url::Parsed::HOST, false);
699 }
700
HostEnd() const701 unsigned KURL::HostEnd() const {
702 return parsed_.CountCharactersBefore(url::Parsed::PORT, true);
703 }
704
PathStart() const705 unsigned KURL::PathStart() const {
706 return parsed_.CountCharactersBefore(url::Parsed::PATH, false);
707 }
708
PathEnd() const709 unsigned KURL::PathEnd() const {
710 return parsed_.CountCharactersBefore(url::Parsed::QUERY, true);
711 }
712
PathAfterLastSlash() const713 unsigned KURL::PathAfterLastSlash() const {
714 if (string_.IsNull())
715 return 0;
716 if (!is_valid_ || !parsed_.path.is_valid())
717 return parsed_.CountCharactersBefore(url::Parsed::PATH, false);
718 url::Component filename;
719 if (string_.Is8Bit())
720 url::ExtractFileName(AsURLChar8Subtle(string_), parsed_.path, &filename);
721 else
722 url::ExtractFileName(string_.Characters16(), parsed_.path, &filename);
723 return filename.begin;
724 }
725
ProtocolIs(const String & url,const char * protocol)726 bool ProtocolIs(const String& url, const char* protocol) {
727 #if DCHECK_IS_ON()
728 AssertProtocolIsGood(protocol);
729 #endif
730 if (url.IsNull())
731 return false;
732 if (url.Is8Bit()) {
733 return url::FindAndCompareScheme(AsURLChar8Subtle(url), url.length(),
734 protocol, nullptr);
735 }
736 return url::FindAndCompareScheme(url.Characters16(), url.length(), protocol,
737 nullptr);
738 }
739
Init(const KURL & base,const String & relative,const WTF::TextEncoding * query_encoding)740 void KURL::Init(const KURL& base,
741 const String& relative,
742 const WTF::TextEncoding* query_encoding) {
743 // As a performance optimization, we do not use the charset converter
744 // if encoding is UTF-8 or other Unicode encodings. Note that this is
745 // per HTML5 2.5.3 (resolving URL). The URL canonicalizer will be more
746 // efficient with no charset converter object because it can do UTF-8
747 // internally with no extra copies.
748
749 StringUTF8Adaptor base_utf8(base.GetString());
750
751 // We feel free to make the charset converter object every time since it's
752 // just a wrapper around a reference.
753 KURLCharsetConverter charset_converter_object(query_encoding);
754 KURLCharsetConverter* charset_converter =
755 (!query_encoding || IsUnicodeEncoding(query_encoding))
756 ? nullptr
757 : &charset_converter_object;
758
759 // Clamp to int max to avoid overflow.
760 url::RawCanonOutputT<char> output;
761 if (!relative.IsNull() && relative.Is8Bit()) {
762 StringUTF8Adaptor relative_utf8(relative);
763 is_valid_ = url::ResolveRelative(base_utf8.data(), base_utf8.size(),
764 base.parsed_, relative_utf8.data(),
765 clampTo<int>(relative_utf8.size()),
766 charset_converter, &output, &parsed_);
767 } else {
768 is_valid_ = url::ResolveRelative(base_utf8.data(), base_utf8.size(),
769 base.parsed_, relative.Characters16(),
770 clampTo<int>(relative.length()),
771 charset_converter, &output, &parsed_);
772 }
773
774 // AtomicString::fromUTF8 will re-hash the raw output and check the
775 // AtomicStringTable (addWithTranslator) for the string. This can be very
776 // expensive for large URLs. However, since many URLs are generated from
777 // existing AtomicStrings (which already have their hashes computed), this
778 // fast path is used if the input string is already canonicalized.
779 //
780 // Because this optimization does not apply to non-AtomicStrings, explicitly
781 // check that the input is Atomic before moving forward with it. If we mark
782 // non-Atomic input as Atomic here, we will render the (const) input string
783 // thread unsafe.
784 if (!relative.IsNull() && relative.Impl()->IsAtomic() &&
785 StringView(output.data(), static_cast<unsigned>(output.length())) ==
786 relative) {
787 string_ = relative;
788 } else {
789 string_ = AtomicString::FromUTF8(output.data(), output.length());
790 }
791
792 InitProtocolMetadata();
793 InitInnerURL();
794 DCHECK(!::blink::ProtocolIsJavaScript(string_) || ProtocolIsJavaScript());
795 }
796
InitInnerURL()797 void KURL::InitInnerURL() {
798 if (!is_valid_) {
799 inner_url_.reset();
800 return;
801 }
802 if (url::Parsed* inner_parsed = parsed_.inner_parsed()) {
803 inner_url_ = std::make_unique<KURL>(
804 string_.Substring(inner_parsed->scheme.begin,
805 inner_parsed->Length() - inner_parsed->scheme.begin));
806 } else {
807 inner_url_.reset();
808 }
809 }
810
InitProtocolMetadata()811 void KURL::InitProtocolMetadata() {
812 if (!is_valid_) {
813 protocol_is_in_http_family_ = false;
814 protocol_ = ComponentString(parsed_.scheme);
815 return;
816 }
817
818 DCHECK(!string_.IsNull());
819 StringView protocol = ComponentStringView(parsed_.scheme);
820 protocol_is_in_http_family_ = true;
821 if (protocol == WTF::g_https_atom) {
822 protocol_ = WTF::g_https_atom;
823 } else if (protocol == WTF::g_http_atom) {
824 protocol_ = WTF::g_http_atom;
825 } else {
826 protocol_ = protocol.ToAtomicString();
827 protocol_is_in_http_family_ = false;
828 }
829 DCHECK_EQ(protocol_, protocol_.DeprecatedLower());
830 }
831
ProtocolIs(const StringView protocol) const832 bool KURL::ProtocolIs(const StringView protocol) const {
833 #if DCHECK_IS_ON()
834 AssertProtocolIsGood(protocol);
835 #endif
836
837 // JavaScript URLs are "valid" and should be executed even if KURL decides
838 // they are invalid. The free function protocolIsJavaScript() should be used
839 // instead.
840 // FIXME: Chromium code needs to be fixed for this assert to be enabled.
841 // DCHECK(strcmp(protocol, "javascript"));
842 return protocol_ == protocol;
843 }
844
StringViewForInvalidComponent() const845 StringView KURL::StringViewForInvalidComponent() const {
846 return string_.IsNull() ? StringView() : StringView(StringImpl::empty_);
847 }
848
ComponentStringView(const url::Component & component) const849 StringView KURL::ComponentStringView(const url::Component& component) const {
850 if (!is_valid_ || component.len <= 0)
851 return StringViewForInvalidComponent();
852 // begin and len are in terms of bytes which do not match
853 // if string() is UTF-16 and input contains non-ASCII characters.
854 // However, the only part in urlString that can contain non-ASCII
855 // characters is 'ref' at the end of the string. In that case,
856 // begin will always match the actual value and len (in terms of
857 // byte) will be longer than what's needed by 'mid'. However, mid
858 // truncates len to avoid go past the end of a string so that we can
859 // get away without doing anything here.
860
861 int max_length = GetString().length() - component.begin;
862 return StringView(GetString(), component.begin,
863 component.len > max_length ? max_length : component.len);
864 }
865
ComponentString(const url::Component & component) const866 String KURL::ComponentString(const url::Component& component) const {
867 return ComponentStringView(component).ToString();
868 }
869
870 template <typename CHAR>
ReplaceComponents(const url::Replacements<CHAR> & replacements)871 void KURL::ReplaceComponents(const url::Replacements<CHAR>& replacements) {
872 url::RawCanonOutputT<char> output;
873 url::Parsed new_parsed;
874
875 StringUTF8Adaptor utf8(string_);
876 is_valid_ =
877 url::ReplaceComponents(utf8.data(), utf8.size(), parsed_, replacements,
878 nullptr, &output, &new_parsed);
879
880 parsed_ = new_parsed;
881 string_ = AtomicString::FromUTF8(output.data(), output.length());
882 InitProtocolMetadata();
883 }
884
IsSafeToSendToAnotherThread() const885 bool KURL::IsSafeToSendToAnotherThread() const {
886 return string_.IsSafeToSendToAnotherThread() &&
887 (!inner_url_ || inner_url_->IsSafeToSendToAnotherThread());
888 }
889
operator GURL() const890 KURL::operator GURL() const {
891 StringUTF8Adaptor utf8(string_);
892 return GURL(utf8.data(), utf8.size(), parsed_, is_valid_);
893 }
operator ==(const KURL & a,const KURL & b)894 bool operator==(const KURL& a, const KURL& b) {
895 return a.GetString() == b.GetString();
896 }
897
operator ==(const KURL & a,const String & b)898 bool operator==(const KURL& a, const String& b) {
899 return a.GetString() == b;
900 }
901
operator ==(const String & a,const KURL & b)902 bool operator==(const String& a, const KURL& b) {
903 return a == b.GetString();
904 }
905
operator !=(const KURL & a,const KURL & b)906 bool operator!=(const KURL& a, const KURL& b) {
907 return a.GetString() != b.GetString();
908 }
909
operator !=(const KURL & a,const String & b)910 bool operator!=(const KURL& a, const String& b) {
911 return a.GetString() != b;
912 }
913
operator !=(const String & a,const KURL & b)914 bool operator!=(const String& a, const KURL& b) {
915 return a != b.GetString();
916 }
917
operator <<(std::ostream & os,const KURL & url)918 std::ostream& operator<<(std::ostream& os, const KURL& url) {
919 return os << url.GetString();
920 }
921
922 } // namespace blink
923