1 // Copyright 2013-2014 The rust-url developers.
2 //
3 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
4 // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
5 // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
6 // option. This file may not be copied, modified, or distributed
7 // except according to those terms.
8 
9 //! [*Unicode IDNA Compatibility Processing*
10 //! (Unicode Technical Standard #46)](http://www.unicode.org/reports/tr46/)
11 
12 use self::Mapping::*;
13 use punycode;
14 use std::cmp::Ordering::{Equal, Greater, Less};
15 use unicode_bidi::{bidi_class, BidiClass};
16 use unicode_normalization::char::is_combining_mark;
17 use unicode_normalization::UnicodeNormalization;
18 
19 include!("uts46_mapping_table.rs");
20 
/// Prefix that marks a domain label as Punycode-encoded.
const PUNYCODE_PREFIX: &str = "xn--";
22 
/// Location of a replacement string inside `STRING_TABLE`.
///
/// The 16-bit start offset is deliberately kept as two separate `u8` halves:
/// that gives the struct an alignment of 1, so it packs tightly into the
/// `Mapping` enum.
#[derive(Debug)]
struct StringTableSlice {
    /// Low byte of the start offset.
    byte_start_lo: u8,
    /// High byte of the start offset.
    byte_start_hi: u8,
    /// Length of the slice, in bytes.
    byte_len: u8,
}
31 
decode_slice(slice: &StringTableSlice) -> &'static str32 fn decode_slice(slice: &StringTableSlice) -> &'static str {
33     let lo = slice.byte_start_lo as usize;
34     let hi = slice.byte_start_hi as usize;
35     let start = (hi << 8) | lo;
36     let len = slice.byte_len as usize;
37     &STRING_TABLE[start..(start + len)]
38 }
39 
40 #[repr(u8)]
41 #[derive(Debug)]
42 enum Mapping {
43     Valid,
44     Ignored,
45     Mapped(StringTableSlice),
46     Deviation(StringTableSlice),
47     Disallowed,
48     DisallowedStd3Valid,
49     DisallowedStd3Mapped(StringTableSlice),
50 }
51 
/// A run of consecutive code points; both `from` and `to` are inclusive.
struct Range {
    /// First code point of the run.
    from: char,
    /// Last code point of the run.
    to: char,
}
56 
find_char(codepoint: char) -> &'static Mapping57 fn find_char(codepoint: char) -> &'static Mapping {
58     let r = TABLE.binary_search_by(|ref range| {
59         if codepoint > range.to {
60             Less
61         } else if codepoint < range.from {
62             Greater
63         } else {
64             Equal
65         }
66     });
67     r.ok()
68         .map(|i| {
69             const SINGLE_MARKER: u16 = 1 << 15;
70 
71             let x = INDEX_TABLE[i];
72             let single = (x & SINGLE_MARKER) != 0;
73             let offset = !SINGLE_MARKER & x;
74 
75             if single {
76                 &MAPPING_TABLE[offset as usize]
77             } else {
78                 &MAPPING_TABLE[(offset + (codepoint as u16 - TABLE[i].from as u16)) as usize]
79             }
80         })
81         .unwrap()
82 }
83 
map_char(codepoint: char, config: Config, output: &mut String, errors: &mut Vec<Error>)84 fn map_char(codepoint: char, config: Config, output: &mut String, errors: &mut Vec<Error>) {
85     match *find_char(codepoint) {
86         Mapping::Valid => output.push(codepoint),
87         Mapping::Ignored => {}
88         Mapping::Mapped(ref slice) => output.push_str(decode_slice(slice)),
89         Mapping::Deviation(ref slice) => {
90             if config.transitional_processing {
91                 output.push_str(decode_slice(slice))
92             } else {
93                 output.push(codepoint)
94             }
95         }
96         Mapping::Disallowed => {
97             errors.push(Error::DissallowedCharacter);
98             output.push(codepoint);
99         }
100         Mapping::DisallowedStd3Valid => {
101             if config.use_std3_ascii_rules {
102                 errors.push(Error::DissallowedByStd3AsciiRules);
103             }
104             output.push(codepoint)
105         }
106         Mapping::DisallowedStd3Mapped(ref slice) => {
107             if config.use_std3_ascii_rules {
108                 errors.push(Error::DissallowedMappedInStd3);
109             }
110             output.push_str(decode_slice(slice))
111         }
112     }
113 }
114 
115 // http://tools.ietf.org/html/rfc5893#section-2
passes_bidi(label: &str, is_bidi_domain: bool) -> bool116 fn passes_bidi(label: &str, is_bidi_domain: bool) -> bool {
117     // Rule 0: Bidi Rules apply to Bidi Domain Names: a name with at least one RTL label.  A label
118     // is RTL if it contains at least one character of bidi class R, AL or AN.
119     if !is_bidi_domain {
120         return true;
121     }
122 
123     let mut chars = label.chars();
124     let first_char_class = match chars.next() {
125         Some(c) => bidi_class(c),
126         None => return true, // empty string
127     };
128 
129     match first_char_class {
130         // LTR label
131         BidiClass::L => {
132             // Rule 5
133             loop {
134                 match chars.next() {
135                     Some(c) => {
136                         if !matches!(
137                             bidi_class(c),
138                             BidiClass::L
139                                 | BidiClass::EN
140                                 | BidiClass::ES
141                                 | BidiClass::CS
142                                 | BidiClass::ET
143                                 | BidiClass::ON
144                                 | BidiClass::BN
145                                 | BidiClass::NSM
146                         ) {
147                             return false;
148                         }
149                     }
150                     None => {
151                         break;
152                     }
153                 }
154             }
155 
156             // Rule 6
157             // must end in L or EN followed by 0 or more NSM
158             let mut rev_chars = label.chars().rev();
159             let mut last_non_nsm = rev_chars.next();
160             loop {
161                 match last_non_nsm {
162                     Some(c) if bidi_class(c) == BidiClass::NSM => {
163                         last_non_nsm = rev_chars.next();
164                         continue;
165                     }
166                     _ => {
167                         break;
168                     }
169                 }
170             }
171             match last_non_nsm {
172                 Some(c) if bidi_class(c) == BidiClass::L || bidi_class(c) == BidiClass::EN => {}
173                 Some(_) => {
174                     return false;
175                 }
176                 _ => {}
177             }
178         }
179 
180         // RTL label
181         BidiClass::R | BidiClass::AL => {
182             let mut found_en = false;
183             let mut found_an = false;
184 
185             // Rule 2
186             loop {
187                 match chars.next() {
188                     Some(c) => {
189                         let char_class = bidi_class(c);
190 
191                         if char_class == BidiClass::EN {
192                             found_en = true;
193                         }
194                         if char_class == BidiClass::AN {
195                             found_an = true;
196                         }
197 
198                         if !matches!(
199                             char_class,
200                             BidiClass::R
201                                 | BidiClass::AL
202                                 | BidiClass::AN
203                                 | BidiClass::EN
204                                 | BidiClass::ES
205                                 | BidiClass::CS
206                                 | BidiClass::ET
207                                 | BidiClass::ON
208                                 | BidiClass::BN
209                                 | BidiClass::NSM
210                         ) {
211                             return false;
212                         }
213                     }
214                     None => {
215                         break;
216                     }
217                 }
218             }
219             // Rule 3
220             let mut rev_chars = label.chars().rev();
221             let mut last = rev_chars.next();
222             loop {
223                 // must end in L or EN followed by 0 or more NSM
224                 match last {
225                     Some(c) if bidi_class(c) == BidiClass::NSM => {
226                         last = rev_chars.next();
227                         continue;
228                     }
229                     _ => {
230                         break;
231                     }
232                 }
233             }
234             match last {
235                 Some(c)
236                     if matches!(
237                         bidi_class(c),
238                         BidiClass::R | BidiClass::AL | BidiClass::EN | BidiClass::AN
239                     ) => {}
240                 _ => {
241                     return false;
242                 }
243             }
244 
245             // Rule 4
246             if found_an && found_en {
247                 return false;
248             }
249         }
250 
251         // Rule 1: Should start with L or R/AL
252         _ => {
253             return false;
254         }
255     }
256 
257     return true;
258 }
259 
260 /// http://www.unicode.org/reports/tr46/#Validity_Criteria
validate_full(label: &str, is_bidi_domain: bool, config: Config, errors: &mut Vec<Error>)261 fn validate_full(label: &str, is_bidi_domain: bool, config: Config, errors: &mut Vec<Error>) {
262     // V1: Must be in NFC form.
263     if label.nfc().ne(label.chars()) {
264         errors.push(Error::ValidityCriteria);
265     } else {
266         validate(label, is_bidi_domain, config, errors);
267     }
268 }
269 
validate(label: &str, is_bidi_domain: bool, config: Config, errors: &mut Vec<Error>)270 fn validate(label: &str, is_bidi_domain: bool, config: Config, errors: &mut Vec<Error>) {
271     let first_char = label.chars().next();
272     if first_char == None {
273         // Empty string, pass
274     }
275     // V2: No U+002D HYPHEN-MINUS in both third and fourth positions.
276     //
277     // NOTE: Spec says that the label must not contain a HYPHEN-MINUS character in both the
278     // third and fourth positions. But nobody follows this criteria. See the spec issue below:
279     // https://github.com/whatwg/url/issues/53
280 
281     // V3: neither begin nor end with a U+002D HYPHEN-MINUS
282     else if config.check_hyphens && (label.starts_with("-") || label.ends_with("-")) {
283         errors.push(Error::ValidityCriteria);
284     }
285     // V4: not contain a U+002E FULL STOP
286     //
287     // Here, label can't contain '.' since the input is from .split('.')
288 
289     // V5: not begin with a GC=Mark
290     else if is_combining_mark(first_char.unwrap()) {
291         errors.push(Error::ValidityCriteria);
292     }
293     // V6: Check against Mapping Table
294     else if label.chars().any(|c| match *find_char(c) {
295         Mapping::Valid => false,
296         Mapping::Deviation(_) => config.transitional_processing,
297         Mapping::DisallowedStd3Valid => config.use_std3_ascii_rules,
298         _ => true,
299     }) {
300         errors.push(Error::ValidityCriteria);
301     }
302     // V7: ContextJ rules
303     //
304     // TODO: Implement rules and add *CheckJoiners* flag.
305 
306     // V8: Bidi rules
307     //
308     // TODO: Add *CheckBidi* flag
309     else if !passes_bidi(label, is_bidi_domain) {
310         errors.push(Error::ValidityCriteria);
311     }
312 }
313 
314 /// http://www.unicode.org/reports/tr46/#Processing
processing(domain: &str, config: Config, errors: &mut Vec<Error>) -> String315 fn processing(domain: &str, config: Config, errors: &mut Vec<Error>) -> String {
316     let mut mapped = String::with_capacity(domain.len());
317     for c in domain.chars() {
318         map_char(c, config, &mut mapped, errors)
319     }
320     let mut normalized = String::with_capacity(mapped.len());
321     normalized.extend(mapped.nfc());
322 
323     // Find out if it's a Bidi Domain Name
324     //
325     // First, check for literal bidi chars
326     let mut is_bidi_domain = domain
327         .chars()
328         .any(|c| matches!(bidi_class(c), BidiClass::R | BidiClass::AL | BidiClass::AN));
329     if !is_bidi_domain {
330         // Then check for punycode-encoded bidi chars
331         for label in normalized.split('.') {
332             if label.starts_with(PUNYCODE_PREFIX) {
333                 match punycode::decode_to_string(&label[PUNYCODE_PREFIX.len()..]) {
334                     Some(decoded_label) => {
335                         if decoded_label.chars().any(|c| {
336                             matches!(bidi_class(c), BidiClass::R | BidiClass::AL | BidiClass::AN)
337                         }) {
338                             is_bidi_domain = true;
339                         }
340                     }
341                     None => {
342                         is_bidi_domain = true;
343                     }
344                 }
345             }
346         }
347     }
348 
349     let mut validated = String::new();
350     let mut first = true;
351     for label in normalized.split('.') {
352         if !first {
353             validated.push('.');
354         }
355         first = false;
356         if label.starts_with(PUNYCODE_PREFIX) {
357             match punycode::decode_to_string(&label[PUNYCODE_PREFIX.len()..]) {
358                 Some(decoded_label) => {
359                     let config = config.transitional_processing(false);
360                     validate_full(&decoded_label, is_bidi_domain, config, errors);
361                     validated.push_str(&decoded_label)
362                 }
363                 None => errors.push(Error::PunycodeError),
364             }
365         } else {
366             // `normalized` is already `NFC` so we can skip that check
367             validate(label, is_bidi_domain, config, errors);
368             validated.push_str(label)
369         }
370     }
371     validated
372 }
373 
/// Options for UTS #46 processing.
///
/// Values are built from `Config::default()` and adjusted with the setter
/// methods of the same names as the fields.
#[derive(Clone, Copy)]
pub struct Config {
    /// Report characters that are only disallowed under STD3 ASCII rules.
    use_std3_ascii_rules: bool,
    /// Replace deviation characters by their mapping instead of keeping them.
    transitional_processing: bool,
    /// Check the result of `to_ascii` against the DNS length limits.
    verify_dns_length: bool,
    /// Reject labels that begin or end with a hyphen-minus.
    check_hyphens: bool,
}
381 
382 /// The defaults are that of https://url.spec.whatwg.org/#idna
383 impl Default for Config {
default() -> Self384     fn default() -> Self {
385         Config {
386             use_std3_ascii_rules: false,
387             transitional_processing: false,
388             check_hyphens: false,
389             // check_bidi: true,
390             // check_joiners: true,
391 
392             // Only use for to_ascii, not to_unicode
393             verify_dns_length: false,
394         }
395     }
396 }
397 
398 impl Config {
399     #[inline]
use_std3_ascii_rules(mut self, value: bool) -> Self400     pub fn use_std3_ascii_rules(mut self, value: bool) -> Self {
401         self.use_std3_ascii_rules = value;
402         self
403     }
404 
405     #[inline]
transitional_processing(mut self, value: bool) -> Self406     pub fn transitional_processing(mut self, value: bool) -> Self {
407         self.transitional_processing = value;
408         self
409     }
410 
411     #[inline]
verify_dns_length(mut self, value: bool) -> Self412     pub fn verify_dns_length(mut self, value: bool) -> Self {
413         self.verify_dns_length = value;
414         self
415     }
416 
417     #[inline]
check_hyphens(mut self, value: bool) -> Self418     pub fn check_hyphens(mut self, value: bool) -> Self {
419         self.check_hyphens = value;
420         self
421     }
422 
423     /// http://www.unicode.org/reports/tr46/#ToASCII
to_ascii(self, domain: &str) -> Result<String, Errors>424     pub fn to_ascii(self, domain: &str) -> Result<String, Errors> {
425         let mut errors = Vec::new();
426         let mut result = String::new();
427         let mut first = true;
428         for label in processing(domain, self, &mut errors).split('.') {
429             if !first {
430                 result.push('.');
431             }
432             first = false;
433             if label.is_ascii() {
434                 result.push_str(label);
435             } else {
436                 match punycode::encode_str(label) {
437                     Some(x) => {
438                         result.push_str(PUNYCODE_PREFIX);
439                         result.push_str(&x);
440                     }
441                     None => errors.push(Error::PunycodeError),
442                 }
443             }
444         }
445 
446         if self.verify_dns_length {
447             let domain = if result.ends_with(".") {
448                 &result[..result.len() - 1]
449             } else {
450                 &*result
451             };
452             if domain.len() < 1 || domain.split('.').any(|label| label.len() < 1) {
453                 errors.push(Error::TooShortForDns)
454             }
455             if domain.len() > 253 || domain.split('.').any(|label| label.len() > 63) {
456                 errors.push(Error::TooLongForDns)
457             }
458         }
459         if errors.is_empty() {
460             Ok(result)
461         } else {
462             Err(Errors(errors))
463         }
464     }
465 
466     /// http://www.unicode.org/reports/tr46/#ToUnicode
to_unicode(self, domain: &str) -> (String, Result<(), Errors>)467     pub fn to_unicode(self, domain: &str) -> (String, Result<(), Errors>) {
468         let mut errors = Vec::new();
469         let domain = processing(domain, self, &mut errors);
470         let errors = if errors.is_empty() {
471             Ok(())
472         } else {
473             Err(Errors(errors))
474         };
475         (domain, errors)
476     }
477 }
478 
/// A single problem recorded during processing.
///
/// The `Dissallowed` spelling in three variant names is kept as-is because
/// other code in this file refers to them by these names.
#[derive(PartialEq, Eq, Clone, Copy, Debug)]
enum Error {
    /// A label could not be Punycode-decoded or -encoded.
    PunycodeError,
    /// A label failed one of the validity criteria.
    ValidityCriteria,
    /// A character that is valid except under STD3 ASCII rules was found
    /// while those rules were enabled.
    DissallowedByStd3AsciiRules,
    /// A character that is mapped except under STD3 ASCII rules was found
    /// while those rules were enabled.
    DissallowedMappedInStd3,
    /// A disallowed character was found.
    DissallowedCharacter,
    /// The ASCII result or one of its labels exceeds the DNS length limit.
    TooLongForDns,
    /// The ASCII result or one of its labels is empty.
    TooShortForDns,
}
489 
490 /// Errors recorded during UTS #46 processing.
491 ///
492 /// This is opaque for now, only indicating the presence of at least one error.
493 /// More details may be exposed in the future.
494 #[derive(Debug)]
495 pub struct Errors(Vec<Error>);
496