1 // Copyright 2013 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 #include "base/logging.h"
6 #include "url/url_canon.h"
7 #include "url/url_canon_internal.h"
8 
9 namespace url {
10 
11 namespace {
12 
13 // For reference, here's what IE supports:
14 // Key: 0 (disallowed: failure if present in the input)
15 //      + (allowed either escaped or unescaped, and unmodified)
16 //      U (allowed escaped or unescaped but always unescaped if present in
17 //         escaped form)
18 //      E (allowed escaped or unescaped but always escaped if present in
19 //         unescaped form)
20 //      % (only allowed escaped in the input, will be unmodified).
21 //      I left blank alpha numeric characters.
22 //
23 //    00 01 02 03 04 05 06 07 08 09 0a 0b 0c 0d 0e 0f
24 //    -----------------------------------------------
25 // 0   0  E  E  E  E  E  E  E  E  E  E  E  E  E  E  E
26 // 1   E  E  E  E  E  E  E  E  E  E  E  E  E  E  E  E
27 // 2   E  +  E  E  +  E  +  +  +  +  +  +  +  U  U  0
28 // 3                                 %  %  E  +  E  0  <-- Those are  : ; < = > ?
29 // 4   %
30 // 5                                    U  0  U  U  U  <-- Those are  [ \ ] ^ _
31 // 6   E                                               <-- That's  `
32 // 7                                    E  E  E  U  E  <-- Those are { | } ~ (UNPRINTABLE)
33 //
34 // NOTE: I didn't actually test all the control characters. Some may be
35 // disallowed in the input, but they are all accepted escaped except for 0.
36 // I also didn't test if characters affecting HTML parsing are allowed
37 // unescaped, e.g. (") or (#), which would indicate the beginning of the path.
38 // Surprisingly, space is accepted in the input and always escaped.
39 
// This table lists the canonical version of all characters we allow in the
// input, with 0 indicating it is disallowed. We use the magic kEsc value to
// indicate that this character should be escaped. We are a little more
// restrictive than IE, but less restrictive than Firefox.
//
// Note that we disallow the % character. We will allow it when part of an
// escape sequence, of course, but this disallows "%25". Even though IE allows
// it, allowing it would put us in a funny state. If there was an invalid
// escape sequence like "%zz", we'll add "%25zz" to the output and fail.
// Allowing percents means we'll succeed a second time, so validity would change
// based on how many times you run the canonicalizer. We prefer to always report
// the same validity, so reject this.
// Marker stored in kHostCharLookup for a character that is accepted in the
// input but is written percent-escaped in the output.
constexpr unsigned char kEsc = 0xff;

// Indexed by 7-bit character code. Each entry is one of:
//   0     - the character is rejected,
//   kEsc  - the character is accepted and emitted percent-escaped,
//   other - the byte to emit (upper-case ASCII letters yield lower case).
constexpr unsigned char kHostCharLookup[0x80] = {
    // 0x00 - 0x1f: control characters, every one rejected.
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    // 0x20 - 0x27: space ! " # $ % & '
    kEsc, kEsc, kEsc, kEsc, kEsc, 0, kEsc, kEsc,
    // 0x28 - 0x2f: ( ) * + , - . /
    kEsc, kEsc, kEsc, '+', kEsc, '-', '.', 0,
    // 0x30 - 0x37: 0 1 2 3 4 5 6 7
    '0', '1', '2', '3', '4', '5', '6', '7',
    // 0x38 - 0x3f: 8 9 : ; < = > ?
    '8', '9', ':', 0, kEsc, kEsc, kEsc, 0,
    // 0x40 - 0x47: @ A B C D E F G
    kEsc, 'a', 'b', 'c', 'd', 'e', 'f', 'g',
    // 0x48 - 0x4f: H I J K L M N O
    'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o',
    // 0x50 - 0x57: P Q R S T U V W
    'p', 'q', 'r', 's', 't', 'u', 'v', 'w',
    // 0x58 - 0x5f: X Y Z [ (backslash) ] ^ _
    'x', 'y', 'z', '[', 0, ']', 0, '_',
    // 0x60 - 0x67: ` a b c d e f g
    kEsc, 'a', 'b', 'c', 'd', 'e', 'f', 'g',
    // 0x68 - 0x6f: h i j k l m n o
    'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o',
    // 0x70 - 0x77: p q r s t u v w
    'p', 'q', 'r', 's', 't', 'u', 'v', 'w',
    // 0x78 - 0x7f: x y z { | } ~ DEL
    'x', 'y', 'z', kEsc, kEsc, kEsc, 0, 0,
};
69 
// Longest fully-qualified domain name permitted by RFC 1034.
constexpr int kMaxHostLength = 253;

// Upper bound on the length of a host handed to IDN processing. It is five
// times the RFC 1034 limit because UTS#46 normalization can shrink a string,
// so an over-long input may still end up within the limit. The factor is a
// heuristic: UTS#46 can remove an arbitrary number of characters (for example
// U+00AD SOFT HYPHEN), so pathological inputs can still exceed it, but it
// should cover normally-encountered, non-abusive host names.
constexpr int kMaxHostBufferLength = 5 * kMaxHostLength;
80 
81 const int kTempHostBufferLen = 1024;
82 typedef RawCanonOutputT<char, kTempHostBufferLen> StackBuffer;
83 typedef RawCanonOutputT<base::char16, kTempHostBufferLen> StackBufferW;
84 
85 // Scans a host name and fills in the output flags according to what we find.
86 // |has_non_ascii| will be true if there are any non-7-bit characters, and
87 // |has_escaped| will be true if there is a percent sign.
88 template<typename CHAR, typename UCHAR>
ScanHostname(const CHAR * spec,const Component & host,bool * has_non_ascii,bool * has_escaped)89 void ScanHostname(const CHAR* spec,
90                   const Component& host,
91                   bool* has_non_ascii,
92                   bool* has_escaped) {
93   int end = host.end();
94   *has_non_ascii = false;
95   *has_escaped = false;
96   for (int i = host.begin; i < end; i++) {
97     if (static_cast<UCHAR>(spec[i]) >= 0x80)
98       *has_non_ascii = true;
99     else if (spec[i] == '%')
100       *has_escaped = true;
101   }
102 }
103 
104 // Canonicalizes a host name that is entirely 8-bit characters (even though
105 // the type holding them may be 16 bits. Escaped characters will be unescaped.
106 // Non-7-bit characters (for example, UTF-8) will be passed unchanged.
107 //
108 // The |*has_non_ascii| flag will be true if there are non-7-bit characters in
109 // the output.
110 //
111 // This function is used in two situations:
112 //
113 //  * When the caller knows there is no non-ASCII or percent escaped
114 //    characters. This is what DoHost does. The result will be a completely
115 //    canonicalized host since we know nothing weird can happen (escaped
116 //    characters could be unescaped to non-7-bit, so they have to be treated
117 //    with suspicion at this point). It does not use the |has_non_ascii| flag.
118 //
119 //  * When the caller has an 8-bit string that may need unescaping.
120 //    DoComplexHost calls us this situation to do unescaping and validation.
121 //    After this, it may do other IDN operations depending on the value of the
122 //    |*has_non_ascii| flag.
123 //
124 // The return value indicates if the output is a potentially valid host name.
125 template<typename INCHAR, typename OUTCHAR>
DoSimpleHost(const INCHAR * host,int host_len,CanonOutputT<OUTCHAR> * output,bool * has_non_ascii)126 bool DoSimpleHost(const INCHAR* host,
127                   int host_len,
128                   CanonOutputT<OUTCHAR>* output,
129                   bool* has_non_ascii) {
130   *has_non_ascii = false;
131 
132   bool success = true;
133   for (int i = 0; i < host_len; ++i) {
134     unsigned int source = host[i];
135     if (source == '%') {
136       // Unescape first, if possible.
137       // Source will be used only if decode operation was successful.
138       if (!DecodeEscaped(host, &i, host_len, &source)) {
139         // Invalid escaped character. There is nothing that can make this
140         // host valid. We append an escaped percent so the URL looks reasonable
141         // and mark as failed.
142         AppendEscapedChar('%', output);
143         success = false;
144         continue;
145       }
146     }
147 
148     if (source < 0x80) {
149       // We have ASCII input, we can use our lookup table.
150       unsigned char replacement = kHostCharLookup[source];
151       if (!replacement) {
152         // Invalid character, add it as percent-escaped and mark as failed.
153         AppendEscapedChar(source, output);
154         success = false;
155       } else if (replacement == kEsc) {
156         // This character is valid but should be escaped.
157         AppendEscapedChar(source, output);
158       } else {
159         // Common case, the given character is valid in a hostname, the lookup
160         // table tells us the canonical representation of that character (lower
161         // cased).
162         output->push_back(replacement);
163       }
164     } else {
165       // It's a non-ascii char. Just push it to the output.
166       // In case where we have char16 input, and char output it's safe to
167       // cast char16->char only if input string was converted to ASCII.
168       output->push_back(static_cast<OUTCHAR>(source));
169       *has_non_ascii = true;
170     }
171   }
172   return success;
173 }
174 
175 // Canonicalizes a host that requires IDN conversion. Returns true on success
DoIDNHost(const base::char16 * src,int src_len,CanonOutput * output)176 bool DoIDNHost(const base::char16* src, int src_len, CanonOutput* output) {
177   int original_output_len = output->length();  // So we can rewind below.
178 
179   // We need to escape URL before doing IDN conversion, since punicode strings
180   // cannot be escaped after they are created.
181   RawCanonOutputW<kTempHostBufferLen> url_escaped_host;
182   bool has_non_ascii;
183   DoSimpleHost(src, src_len, &url_escaped_host, &has_non_ascii);
184   if (url_escaped_host.length() > kMaxHostBufferLength) {
185     AppendInvalidNarrowString(src, 0, src_len, output);
186     return false;
187   }
188 
189   StackBufferW wide_output;
190   if (!IDNToASCII(url_escaped_host.data(),
191                   url_escaped_host.length(),
192                   &wide_output)) {
193     // Some error, give up. This will write some reasonable looking
194     // representation of the string to the output.
195     AppendInvalidNarrowString(src, 0, src_len, output);
196     return false;
197   }
198 
199   // Now we check the ASCII output like a normal host. It will also handle
200   // unescaping. Although we unescaped everything before this function call, if
201   // somebody does %00 as fullwidth, ICU will convert this to ASCII.
202   bool success = DoSimpleHost(wide_output.data(),
203                               wide_output.length(),
204                               output, &has_non_ascii);
205   if (has_non_ascii) {
206     // ICU generated something that DoSimpleHost didn't think looked like
207     // ASCII. This is quite rare, but ICU might convert some characters to
208     // percent signs which might generate new escape sequences which might in
209     // turn be invalid. An example is U+FE6A "small percent" which ICU will
210     // name prep into an ASCII percent and then we can interpret the following
211     // characters as escaped characters.
212     //
213     // If DoSimpleHost didn't think the output was ASCII, just escape the
214     // thing we gave ICU and give up. DoSimpleHost will have handled a further
215     // level of escaping from ICU for simple ASCII cases (i.e. if ICU generates
216     // a new escaped ASCII sequence like "%41" we'll unescape it) but it won't
217     // do more (like handle escaped non-ASCII sequences). Handling the escaped
218     // ASCII isn't strictly necessary, but DoSimpleHost handles this case
219     // anyway so we handle it/
220     output->set_length(original_output_len);
221     AppendInvalidNarrowString(wide_output.data(), 0, wide_output.length(),
222                               output);
223     return false;
224   }
225   return success;
226 }
227 
228 // 8-bit convert host to its ASCII version: this converts the UTF-8 input to
229 // UTF-16. The has_escaped flag should be set if the input string requires
230 // unescaping.
DoComplexHost(const char * host,int host_len,bool has_non_ascii,bool has_escaped,CanonOutput * output)231 bool DoComplexHost(const char* host, int host_len,
232                    bool has_non_ascii, bool has_escaped, CanonOutput* output) {
233   // Save the current position in the output. We may write stuff and rewind it
234   // below, so we need to know where to rewind to.
235   int begin_length = output->length();
236 
237   // Points to the UTF-8 data we want to convert. This will either be the
238   // input or the unescaped version written to |*output| if necessary.
239   const char* utf8_source;
240   int utf8_source_len;
241   if (has_escaped) {
242     // Unescape before converting to UTF-16 for IDN. We write this into the
243     // output because it most likely does not require IDNization, and we can
244     // save another huge stack buffer. It will be replaced below if it requires
245     // IDN. This will also update our non-ASCII flag so we know whether the
246     // unescaped input requires IDN.
247     if (!DoSimpleHost(host, host_len, output, &has_non_ascii)) {
248       // Error with some escape sequence. We'll call the current output
249       // complete. DoSimpleHost will have written some "reasonable" output.
250       return false;
251     }
252 
253     // Unescaping may have left us with ASCII input, in which case the
254     // unescaped version we wrote to output is complete.
255     if (!has_non_ascii) {
256       return true;
257     }
258 
259     // Save the pointer into the data was just converted (it may be appended to
260     // other data in the output buffer).
261     utf8_source = &output->data()[begin_length];
262     utf8_source_len = output->length() - begin_length;
263   } else {
264     // We don't need to unescape, use input for IDNization later. (We know the
265     // input has non-ASCII, or the simple version would have been called
266     // instead of us.)
267     utf8_source = host;
268     utf8_source_len = host_len;
269   }
270 
271   // Non-ASCII input requires IDN, convert to UTF-16 and do the IDN conversion.
272   // Above, we may have used the output to write the unescaped values to, so
273   // we have to rewind it to where we started after we convert it to UTF-16.
274   StackBufferW utf16;
275   if (!ConvertUTF8ToUTF16(utf8_source, utf8_source_len, &utf16)) {
276     // In this error case, the input may or may not be the output.
277     StackBuffer utf8;
278     for (int i = 0; i < utf8_source_len; i++)
279       utf8.push_back(utf8_source[i]);
280     output->set_length(begin_length);
281     AppendInvalidNarrowString(utf8.data(), 0, utf8.length(), output);
282     return false;
283   }
284   output->set_length(begin_length);
285 
286   // This will call DoSimpleHost which will do normal ASCII canonicalization
287   // and also check for IP addresses in the outpt.
288   return DoIDNHost(utf16.data(), utf16.length(), output);
289 }
290 
291 // UTF-16 convert host to its ASCII version. The set up is already ready for
292 // the backend, so we just pass through. The has_escaped flag should be set if
293 // the input string requires unescaping.
DoComplexHost(const base::char16 * host,int host_len,bool has_non_ascii,bool has_escaped,CanonOutput * output)294 bool DoComplexHost(const base::char16* host, int host_len,
295                    bool has_non_ascii, bool has_escaped, CanonOutput* output) {
296   if (has_escaped) {
297     // Yikes, we have escaped characters with wide input. The escaped
298     // characters should be interpreted as UTF-8. To solve this problem,
299     // we convert to UTF-8, unescape, then convert back to UTF-16 for IDN.
300     //
301     // We don't bother to optimize the conversion in the ASCII case (which
302     // *could* just be a copy) and use the UTF-8 path, because it should be
303     // very rare that host names have escaped characters, and it is relatively
304     // fast to do the conversion anyway.
305     StackBuffer utf8;
306     if (!ConvertUTF16ToUTF8(host, host_len, &utf8)) {
307       AppendInvalidNarrowString(host, 0, host_len, output);
308       return false;
309     }
310 
311     // Once we convert to UTF-8, we can use the 8-bit version of the complex
312     // host handling code above.
313     return DoComplexHost(utf8.data(), utf8.length(), has_non_ascii,
314                          has_escaped, output);
315   }
316 
317   // No unescaping necessary, we can safely pass the input to ICU. This
318   // function will only get called if we either have escaped or non-ascii
319   // input, so it's safe to just use ICU now. Even if the input is ASCII,
320   // this function will do the right thing (just slower than we could).
321   return DoIDNHost(host, host_len, output);
322 }
323 
324 template <typename CHAR, typename UCHAR>
DoHostSubstring(const CHAR * spec,const Component & host,CanonOutput * output)325 bool DoHostSubstring(const CHAR* spec,
326                      const Component& host,
327                      CanonOutput* output) {
328   bool has_non_ascii, has_escaped;
329   ScanHostname<CHAR, UCHAR>(spec, host, &has_non_ascii, &has_escaped);
330 
331   if (has_non_ascii || has_escaped) {
332     return DoComplexHost(&spec[host.begin], host.len, has_non_ascii,
333                          has_escaped, output);
334   }
335 
336   const bool success =
337       DoSimpleHost(&spec[host.begin], host.len, output, &has_non_ascii);
338   DCHECK(!has_non_ascii);
339   return success;
340 }
341 
342 template <typename CHAR, typename UCHAR>
DoHost(const CHAR * spec,const Component & host,CanonOutput * output,CanonHostInfo * host_info)343 void DoHost(const CHAR* spec,
344             const Component& host,
345             CanonOutput* output,
346             CanonHostInfo* host_info) {
347   if (host.len <= 0) {
348     // Empty hosts don't need anything.
349     host_info->family = CanonHostInfo::NEUTRAL;
350     host_info->out_host = Component();
351     return;
352   }
353 
354   // Keep track of output's initial length, so we can rewind later.
355   const int output_begin = output->length();
356 
357   if (DoHostSubstring<CHAR, UCHAR>(spec, host, output)) {
358     // After all the other canonicalization, check if we ended up with an IP
359     // address. IP addresses are small, so writing into this temporary buffer
360     // should not cause an allocation.
361     RawCanonOutput<64> canon_ip;
362     CanonicalizeIPAddress(output->data(),
363                           MakeRange(output_begin, output->length()),
364                           &canon_ip, host_info);
365 
366     // If we got an IPv4/IPv6 address, copy the canonical form back to the
367     // real buffer. Otherwise, it's a hostname or broken IP, in which case
368     // we just leave it in place.
369     if (host_info->IsIPAddress()) {
370       output->set_length(output_begin);
371       output->Append(canon_ip.data(), canon_ip.length());
372     }
373   } else {
374     // Canonicalization failed. Set BROKEN to notify the caller.
375     host_info->family = CanonHostInfo::BROKEN;
376   }
377 
378   host_info->out_host = MakeRange(output_begin, output->length());
379 }
380 
381 }  // namespace
382 
CanonicalizeHost(const char * spec,const Component & host,CanonOutput * output,Component * out_host)383 bool CanonicalizeHost(const char* spec,
384                       const Component& host,
385                       CanonOutput* output,
386                       Component* out_host) {
387   CanonHostInfo host_info;
388   DoHost<char, unsigned char>(spec, host, output, &host_info);
389   *out_host = host_info.out_host;
390   return (host_info.family != CanonHostInfo::BROKEN);
391 }
392 
CanonicalizeHost(const base::char16 * spec,const Component & host,CanonOutput * output,Component * out_host)393 bool CanonicalizeHost(const base::char16* spec,
394                       const Component& host,
395                       CanonOutput* output,
396                       Component* out_host) {
397   CanonHostInfo host_info;
398   DoHost<base::char16, base::char16>(spec, host, output, &host_info);
399   *out_host = host_info.out_host;
400   return (host_info.family != CanonHostInfo::BROKEN);
401 }
402 
CanonicalizeHostVerbose(const char * spec,const Component & host,CanonOutput * output,CanonHostInfo * host_info)403 void CanonicalizeHostVerbose(const char* spec,
404                              const Component& host,
405                              CanonOutput* output,
406                              CanonHostInfo* host_info) {
407   DoHost<char, unsigned char>(spec, host, output, host_info);
408 }
409 
CanonicalizeHostVerbose(const base::char16 * spec,const Component & host,CanonOutput * output,CanonHostInfo * host_info)410 void CanonicalizeHostVerbose(const base::char16* spec,
411                              const Component& host,
412                              CanonOutput* output,
413                              CanonHostInfo* host_info) {
414   DoHost<base::char16, base::char16>(spec, host, output, host_info);
415 }
416 
CanonicalizeHostSubstring(const char * spec,const Component & host,CanonOutput * output)417 bool CanonicalizeHostSubstring(const char* spec,
418                                const Component& host,
419                                CanonOutput* output) {
420   return DoHostSubstring<char, unsigned char>(spec, host, output);
421 }
422 
CanonicalizeHostSubstring(const base::char16 * spec,const Component & host,CanonOutput * output)423 bool CanonicalizeHostSubstring(const base::char16* spec,
424                                const Component& host,
425                                CanonOutput* output) {
426   return DoHostSubstring<base::char16, base::char16>(spec, host, output);
427 }
428 
429 }  // namespace url
430