1 // Copyright 2013-2014 The rust-url developers.
2 //
3 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
4 // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
5 // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
6 // option. This file may not be copied, modified, or distributed
7 // except according to those terms.
8
9 //! [*Unicode IDNA Compatibility Processing*
10 //! (Unicode Technical Standard #46)](http://www.unicode.org/reports/tr46/)
11
12 use self::Mapping::*;
13 use crate::punycode;
14 use std::{error::Error as StdError, fmt};
15 use unicode_bidi::{bidi_class, BidiClass};
16 use unicode_normalization::char::is_combining_mark;
17 use unicode_normalization::{is_nfc, UnicodeNormalization};
18
19 include!("uts46_mapping_table.rs");
20
/// Prefix identifying a Punycode-encoded label: it is stripped before decoding a label and
/// prepended when a non-ASCII label is encoded.
const PUNYCODE_PREFIX: &str = "xn--";
22
/// Location of a replacement string inside `STRING_TABLE`.
///
/// The start offset is kept as two separate `u8` halves (instead of one `u16`) so that the
/// structure has an alignment of 1 and thus packs better into the `Mapping` enum.
#[derive(Debug)]
struct StringTableSlice {
    /// Low 8 bits of the starting byte offset.
    byte_start_lo: u8,
    /// High 8 bits of the starting byte offset.
    byte_start_hi: u8,
    /// Length of the slice, in bytes.
    byte_len: u8,
}
31
decode_slice(slice: &StringTableSlice) -> &'static str32 fn decode_slice(slice: &StringTableSlice) -> &'static str {
33 let lo = slice.byte_start_lo as usize;
34 let hi = slice.byte_start_hi as usize;
35 let start = (hi << 8) | lo;
36 let len = slice.byte_len as usize;
37 &STRING_TABLE[start..(start + len)]
38 }
39
40 #[repr(u8)]
41 #[derive(Debug)]
42 enum Mapping {
43 Valid,
44 Ignored,
45 Mapped(StringTableSlice),
46 Deviation(StringTableSlice),
47 Disallowed,
48 DisallowedStd3Valid,
49 DisallowedStd3Mapped(StringTableSlice),
50 DisallowedIdna2008,
51 }
52
find_char(codepoint: char) -> &'static Mapping53 fn find_char(codepoint: char) -> &'static Mapping {
54 let idx = match TABLE.binary_search_by_key(&codepoint, |&val| val.0) {
55 Ok(idx) => idx,
56 Err(idx) => idx - 1,
57 };
58
59 const SINGLE_MARKER: u16 = 1 << 15;
60
61 let (base, x) = TABLE[idx];
62 let single = (x & SINGLE_MARKER) != 0;
63 let offset = !SINGLE_MARKER & x;
64
65 if single {
66 &MAPPING_TABLE[offset as usize]
67 } else {
68 &MAPPING_TABLE[(offset + (codepoint as u16 - base as u16)) as usize]
69 }
70 }
71
72 struct Mapper<'a> {
73 chars: std::str::Chars<'a>,
74 config: Config,
75 errors: &'a mut Errors,
76 slice: Option<std::str::Chars<'static>>,
77 }
78
79 impl<'a> Iterator for Mapper<'a> {
80 type Item = char;
81
next(&mut self) -> Option<Self::Item>82 fn next(&mut self) -> Option<Self::Item> {
83 loop {
84 if let Some(s) = &mut self.slice {
85 match s.next() {
86 Some(c) => return Some(c),
87 None => {
88 self.slice = None;
89 }
90 }
91 }
92
93 let codepoint = self.chars.next()?;
94 if let '.' | '-' | 'a'..='z' | '0'..='9' = codepoint {
95 return Some(codepoint);
96 }
97
98 return Some(match *find_char(codepoint) {
99 Mapping::Valid => codepoint,
100 Mapping::Ignored => continue,
101 Mapping::Mapped(ref slice) => {
102 self.slice = Some(decode_slice(slice).chars());
103 continue;
104 }
105 Mapping::Deviation(ref slice) => {
106 if self.config.transitional_processing {
107 self.slice = Some(decode_slice(slice).chars());
108 continue;
109 } else {
110 codepoint
111 }
112 }
113 Mapping::Disallowed => {
114 self.errors.disallowed_character = true;
115 codepoint
116 }
117 Mapping::DisallowedStd3Valid => {
118 if self.config.use_std3_ascii_rules {
119 self.errors.disallowed_by_std3_ascii_rules = true;
120 };
121 codepoint
122 }
123 Mapping::DisallowedStd3Mapped(ref slice) => {
124 if self.config.use_std3_ascii_rules {
125 self.errors.disallowed_mapped_in_std3 = true;
126 };
127 self.slice = Some(decode_slice(slice).chars());
128 continue;
129 }
130 Mapping::DisallowedIdna2008 => {
131 if self.config.use_idna_2008_rules {
132 self.errors.disallowed_in_idna_2008 = true;
133 }
134 codepoint
135 }
136 });
137 }
138 }
139 }
140
141 // http://tools.ietf.org/html/rfc5893#section-2
passes_bidi(label: &str, is_bidi_domain: bool) -> bool142 fn passes_bidi(label: &str, is_bidi_domain: bool) -> bool {
143 // Rule 0: Bidi Rules apply to Bidi Domain Names: a name with at least one RTL label. A label
144 // is RTL if it contains at least one character of bidi class R, AL or AN.
145 if !is_bidi_domain {
146 return true;
147 }
148
149 let mut chars = label.chars();
150 let first_char_class = match chars.next() {
151 Some(c) => bidi_class(c),
152 None => return true, // empty string
153 };
154
155 match first_char_class {
156 // LTR label
157 BidiClass::L => {
158 // Rule 5
159 while let Some(c) = chars.next() {
160 if !matches!(
161 bidi_class(c),
162 BidiClass::L
163 | BidiClass::EN
164 | BidiClass::ES
165 | BidiClass::CS
166 | BidiClass::ET
167 | BidiClass::ON
168 | BidiClass::BN
169 | BidiClass::NSM
170 ) {
171 return false;
172 }
173 }
174
175 // Rule 6
176 // must end in L or EN followed by 0 or more NSM
177 let mut rev_chars = label.chars().rev();
178 let mut last_non_nsm = rev_chars.next();
179 loop {
180 match last_non_nsm {
181 Some(c) if bidi_class(c) == BidiClass::NSM => {
182 last_non_nsm = rev_chars.next();
183 continue;
184 }
185 _ => {
186 break;
187 }
188 }
189 }
190 match last_non_nsm {
191 Some(c) if bidi_class(c) == BidiClass::L || bidi_class(c) == BidiClass::EN => {}
192 Some(_) => {
193 return false;
194 }
195 _ => {}
196 }
197 }
198
199 // RTL label
200 BidiClass::R | BidiClass::AL => {
201 let mut found_en = false;
202 let mut found_an = false;
203
204 // Rule 2
205 for c in chars {
206 let char_class = bidi_class(c);
207 if char_class == BidiClass::EN {
208 found_en = true;
209 } else if char_class == BidiClass::AN {
210 found_an = true;
211 }
212
213 if !matches!(
214 char_class,
215 BidiClass::R
216 | BidiClass::AL
217 | BidiClass::AN
218 | BidiClass::EN
219 | BidiClass::ES
220 | BidiClass::CS
221 | BidiClass::ET
222 | BidiClass::ON
223 | BidiClass::BN
224 | BidiClass::NSM
225 ) {
226 return false;
227 }
228 }
229 // Rule 3
230 let mut rev_chars = label.chars().rev();
231 let mut last = rev_chars.next();
232 loop {
233 // must end in L or EN followed by 0 or more NSM
234 match last {
235 Some(c) if bidi_class(c) == BidiClass::NSM => {
236 last = rev_chars.next();
237 continue;
238 }
239 _ => {
240 break;
241 }
242 }
243 }
244 match last {
245 Some(c)
246 if matches!(
247 bidi_class(c),
248 BidiClass::R | BidiClass::AL | BidiClass::EN | BidiClass::AN
249 ) => {}
250 _ => {
251 return false;
252 }
253 }
254
255 // Rule 4
256 if found_an && found_en {
257 return false;
258 }
259 }
260
261 // Rule 1: Should start with L or R/AL
262 _ => {
263 return false;
264 }
265 }
266
267 true
268 }
269
270 /// Check the validity criteria for the given label
271 ///
272 /// V1 (NFC) and V8 (Bidi) are checked inside `processing()` to prevent doing duplicate work.
273 ///
274 /// http://www.unicode.org/reports/tr46/#Validity_Criteria
check_validity(label: &str, config: Config, errors: &mut Errors)275 fn check_validity(label: &str, config: Config, errors: &mut Errors) {
276 let first_char = label.chars().next();
277 if first_char == None {
278 // Empty string, pass
279 return;
280 }
281
282 // V2: No U+002D HYPHEN-MINUS in both third and fourth positions.
283 //
284 // NOTE: Spec says that the label must not contain a HYPHEN-MINUS character in both the
285 // third and fourth positions. But nobody follows this criteria. See the spec issue below:
286 // https://github.com/whatwg/url/issues/53
287
288 // V3: neither begin nor end with a U+002D HYPHEN-MINUS
289 if config.check_hyphens && (label.starts_with('-') || label.ends_with('-')) {
290 errors.check_hyphens = true;
291 return;
292 }
293
294 // V4: not contain a U+002E FULL STOP
295 //
296 // Here, label can't contain '.' since the input is from .split('.')
297
298 // V5: not begin with a GC=Mark
299 if is_combining_mark(first_char.unwrap()) {
300 errors.start_combining_mark = true;
301 return;
302 }
303
304 // V6: Check against Mapping Table
305 if label.chars().any(|c| match *find_char(c) {
306 Mapping::Valid | Mapping::DisallowedIdna2008 => false,
307 Mapping::Deviation(_) => config.transitional_processing,
308 Mapping::DisallowedStd3Valid => config.use_std3_ascii_rules,
309 _ => true,
310 }) {
311 errors.invalid_mapping = true;
312 return;
313 }
314
315 // V7: ContextJ rules
316 //
317 // TODO: Implement rules and add *CheckJoiners* flag.
318
319 // V8: Bidi rules are checked inside `processing()`
320 }
321
322 /// http://www.unicode.org/reports/tr46/#Processing
323 #[allow(clippy::manual_strip)] // introduced in 1.45, MSRV is 1.36
processing( domain: &str, config: Config, normalized: &mut String, output: &mut String, ) -> Errors324 fn processing(
325 domain: &str,
326 config: Config,
327 normalized: &mut String,
328 output: &mut String,
329 ) -> Errors {
330 // Weed out the simple cases: only allow all lowercase ASCII characters and digits where none
331 // of the labels start with PUNYCODE_PREFIX and labels don't start or end with hyphen.
332 let (mut prev, mut simple, mut puny_prefix) = ('?', !domain.is_empty(), 0);
333 for c in domain.chars() {
334 if c == '.' {
335 if prev == '-' {
336 simple = false;
337 break;
338 }
339 puny_prefix = 0;
340 continue;
341 } else if puny_prefix == 0 && c == '-' {
342 simple = false;
343 break;
344 } else if puny_prefix < 5 {
345 if c == ['x', 'n', '-', '-'][puny_prefix] {
346 puny_prefix += 1;
347 if puny_prefix == 4 {
348 simple = false;
349 break;
350 }
351 } else {
352 puny_prefix = 5;
353 }
354 }
355 if !c.is_ascii_lowercase() && !c.is_ascii_digit() {
356 simple = false;
357 break;
358 }
359 prev = c;
360 }
361
362 if simple {
363 output.push_str(domain);
364 return Errors::default();
365 }
366
367 normalized.clear();
368 let mut errors = Errors::default();
369 let offset = output.len();
370
371 let iter = Mapper {
372 chars: domain.chars(),
373 config,
374 errors: &mut errors,
375 slice: None,
376 };
377
378 normalized.extend(iter.nfc());
379
380 let mut decoder = punycode::Decoder::default();
381 let non_transitional = config.transitional_processing(false);
382 let (mut first, mut has_bidi_labels) = (true, false);
383 for label in normalized.split('.') {
384 if !first {
385 output.push('.');
386 }
387 first = false;
388 if label.starts_with(PUNYCODE_PREFIX) {
389 match decoder.decode(&label[PUNYCODE_PREFIX.len()..]) {
390 Ok(decode) => {
391 let start = output.len();
392 output.extend(decode);
393 let decoded_label = &output[start..];
394
395 if !has_bidi_labels {
396 has_bidi_labels |= is_bidi_domain(decoded_label);
397 }
398
399 if !errors.is_err() {
400 if !is_nfc(&decoded_label) {
401 errors.nfc = true;
402 } else {
403 check_validity(decoded_label, non_transitional, &mut errors);
404 }
405 }
406 }
407 Err(()) => {
408 has_bidi_labels = true;
409 errors.punycode = true;
410 }
411 }
412 } else {
413 if !has_bidi_labels {
414 has_bidi_labels |= is_bidi_domain(label);
415 }
416
417 // `normalized` is already `NFC` so we can skip that check
418 check_validity(label, config, &mut errors);
419 output.push_str(label)
420 }
421 }
422
423 for label in output[offset..].split('.') {
424 // V8: Bidi rules
425 //
426 // TODO: Add *CheckBidi* flag
427 if !passes_bidi(label, has_bidi_labels) {
428 errors.check_bidi = true;
429 break;
430 }
431 }
432
433 errors
434 }
435
436 #[derive(Default)]
437 pub struct Idna {
438 config: Config,
439 normalized: String,
440 output: String,
441 }
442
443 impl Idna {
new(config: Config) -> Self444 pub fn new(config: Config) -> Self {
445 Self {
446 config,
447 normalized: String::new(),
448 output: String::new(),
449 }
450 }
451
452 /// http://www.unicode.org/reports/tr46/#ToASCII
453 #[allow(clippy::wrong_self_convention)]
to_ascii<'a>(&'a mut self, domain: &str, out: &mut String) -> Result<(), Errors>454 pub fn to_ascii<'a>(&'a mut self, domain: &str, out: &mut String) -> Result<(), Errors> {
455 let mut errors = processing(domain, self.config, &mut self.normalized, &mut self.output);
456
457 let mut first = true;
458 for label in self.output.split('.') {
459 if !first {
460 out.push('.');
461 }
462 first = false;
463
464 if label.is_ascii() {
465 out.push_str(label);
466 } else {
467 let offset = out.len();
468 out.push_str(PUNYCODE_PREFIX);
469 if let Err(()) = punycode::encode_into(label.chars(), out) {
470 errors.punycode = true;
471 out.truncate(offset);
472 }
473 }
474 }
475
476 if self.config.verify_dns_length {
477 let domain = if out.ends_with('.') {
478 &out[..out.len() - 1]
479 } else {
480 &*out
481 };
482 if domain.is_empty() || domain.split('.').any(|label| label.is_empty()) {
483 errors.too_short_for_dns = true;
484 }
485 if domain.len() > 253 || domain.split('.').any(|label| label.len() > 63) {
486 errors.too_long_for_dns = true;
487 }
488 }
489
490 errors.into()
491 }
492
493 /// http://www.unicode.org/reports/tr46/#ToUnicode
494 #[allow(clippy::wrong_self_convention)]
to_unicode<'a>(&'a mut self, domain: &str, out: &mut String) -> Result<(), Errors>495 pub fn to_unicode<'a>(&'a mut self, domain: &str, out: &mut String) -> Result<(), Errors> {
496 processing(domain, self.config, &mut self.normalized, out).into()
497 }
498 }
499
/// Flags controlling UTS #46 processing.
///
/// Each flag has a builder-style setter of the same name.
#[derive(Clone, Copy, Debug)]
pub struct Config {
    /// Report code points that are only disallowed under STD3 ASCII rules.
    use_std3_ascii_rules: bool,
    /// Apply the mapping of deviation code points instead of keeping them.
    transitional_processing: bool,
    /// Check DNS length limits in `to_ascii`.
    verify_dns_length: bool,
    /// Reject labels that begin or end with a hyphen.
    check_hyphens: bool,
    /// Report code points that are disallowed in IDNA 2008.
    use_idna_2008_rules: bool,
}
508
509 /// The defaults are that of https://url.spec.whatwg.org/#idna
510 impl Default for Config {
default() -> Self511 fn default() -> Self {
512 Config {
513 use_std3_ascii_rules: false,
514 transitional_processing: false,
515 check_hyphens: false,
516 // check_bidi: true,
517 // check_joiners: true,
518
519 // Only use for to_ascii, not to_unicode
520 verify_dns_length: false,
521 use_idna_2008_rules: false,
522 }
523 }
524 }
525
526 impl Config {
527 #[inline]
use_std3_ascii_rules(mut self, value: bool) -> Self528 pub fn use_std3_ascii_rules(mut self, value: bool) -> Self {
529 self.use_std3_ascii_rules = value;
530 self
531 }
532
533 #[inline]
transitional_processing(mut self, value: bool) -> Self534 pub fn transitional_processing(mut self, value: bool) -> Self {
535 self.transitional_processing = value;
536 self
537 }
538
539 #[inline]
verify_dns_length(mut self, value: bool) -> Self540 pub fn verify_dns_length(mut self, value: bool) -> Self {
541 self.verify_dns_length = value;
542 self
543 }
544
545 #[inline]
check_hyphens(mut self, value: bool) -> Self546 pub fn check_hyphens(mut self, value: bool) -> Self {
547 self.check_hyphens = value;
548 self
549 }
550
551 #[inline]
use_idna_2008_rules(mut self, value: bool) -> Self552 pub fn use_idna_2008_rules(mut self, value: bool) -> Self {
553 self.use_idna_2008_rules = value;
554 self
555 }
556
557 /// http://www.unicode.org/reports/tr46/#ToASCII
to_ascii(self, domain: &str) -> Result<String, Errors>558 pub fn to_ascii(self, domain: &str) -> Result<String, Errors> {
559 let mut result = String::new();
560 let mut codec = Idna::new(self);
561 codec.to_ascii(domain, &mut result).map(|()| result)
562 }
563
564 /// http://www.unicode.org/reports/tr46/#ToUnicode
to_unicode(self, domain: &str) -> (String, Result<(), Errors>)565 pub fn to_unicode(self, domain: &str) -> (String, Result<(), Errors>) {
566 let mut codec = Idna::new(self);
567 let mut out = String::with_capacity(domain.len());
568 let result = codec.to_unicode(domain, &mut out);
569 (out, result)
570 }
571 }
572
is_bidi_domain(s: &str) -> bool573 fn is_bidi_domain(s: &str) -> bool {
574 for c in s.chars() {
575 if c.is_ascii_graphic() {
576 continue;
577 }
578 match bidi_class(c) {
579 BidiClass::R | BidiClass::AL | BidiClass::AN => return true,
580 _ => {}
581 }
582 }
583 false
584 }
585
/// Errors recorded during UTS #46 processing.
///
/// This is opaque for now, indicating what types of errors have been encountered at least once.
/// More details may be exposed in the future.
#[derive(Default)]
pub struct Errors {
    /// A label could not be Punycode-decoded or Punycode-encoded.
    punycode: bool,
    /// A label begins or ends with a hyphen while `check_hyphens` is enabled.
    check_hyphens: bool,
    /// A label failed the Bidi rules.
    check_bidi: bool,
    /// A label begins with a combining mark.
    start_combining_mark: bool,
    /// A label contains a code point whose mapping-table status is not acceptable.
    invalid_mapping: bool,
    /// A decoded Punycode label is not in NFC.
    nfc: bool,
    /// The input contains a code point disallowed by STD3 ASCII rules (kept as is).
    disallowed_by_std3_ascii_rules: bool,
    /// The input contains a mapped code point disallowed by STD3 ASCII rules.
    disallowed_mapped_in_std3: bool,
    /// The input contains a disallowed code point.
    disallowed_character: bool,
    /// The ASCII domain or one of its labels exceeds the DNS length limits.
    too_long_for_dns: bool,
    /// The ASCII domain or one of its labels is empty.
    too_short_for_dns: bool,
    /// The input contains a code point disallowed in IDNA 2008.
    disallowed_in_idna_2008: bool,
}
605
606 impl Errors {
is_err(&self) -> bool607 fn is_err(&self) -> bool {
608 let Errors {
609 punycode,
610 check_hyphens,
611 check_bidi,
612 start_combining_mark,
613 invalid_mapping,
614 nfc,
615 disallowed_by_std3_ascii_rules,
616 disallowed_mapped_in_std3,
617 disallowed_character,
618 too_long_for_dns,
619 too_short_for_dns,
620 disallowed_in_idna_2008,
621 } = *self;
622 punycode
623 || check_hyphens
624 || check_bidi
625 || start_combining_mark
626 || invalid_mapping
627 || nfc
628 || disallowed_by_std3_ascii_rules
629 || disallowed_mapped_in_std3
630 || disallowed_character
631 || too_long_for_dns
632 || too_short_for_dns
633 || disallowed_in_idna_2008
634 }
635 }
636
637 impl fmt::Debug for Errors {
fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result638 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
639 let Errors {
640 punycode,
641 check_hyphens,
642 check_bidi,
643 start_combining_mark,
644 invalid_mapping,
645 nfc,
646 disallowed_by_std3_ascii_rules,
647 disallowed_mapped_in_std3,
648 disallowed_character,
649 too_long_for_dns,
650 too_short_for_dns,
651 disallowed_in_idna_2008,
652 } = *self;
653
654 let fields = [
655 ("punycode", punycode),
656 ("check_hyphens", check_hyphens),
657 ("check_bidi", check_bidi),
658 ("start_combining_mark", start_combining_mark),
659 ("invalid_mapping", invalid_mapping),
660 ("nfc", nfc),
661 (
662 "disallowed_by_std3_ascii_rules",
663 disallowed_by_std3_ascii_rules,
664 ),
665 ("disallowed_mapped_in_std3", disallowed_mapped_in_std3),
666 ("disallowed_character", disallowed_character),
667 ("too_long_for_dns", too_long_for_dns),
668 ("too_short_for_dns", too_short_for_dns),
669 ("disallowed_in_idna_2008", disallowed_in_idna_2008),
670 ];
671
672 let mut empty = true;
673 f.write_str("Errors { ")?;
674 for (name, val) in &fields {
675 if *val {
676 if !empty {
677 f.write_str(", ")?;
678 }
679 f.write_str(*name)?;
680 empty = false;
681 }
682 }
683
684 if !empty {
685 f.write_str(" }")
686 } else {
687 f.write_str("}")
688 }
689 }
690 }
691
692 impl From<Errors> for Result<(), Errors> {
from(e: Errors) -> Result<(), Errors>693 fn from(e: Errors) -> Result<(), Errors> {
694 if !e.is_err() {
695 Ok(())
696 } else {
697 Err(e)
698 }
699 }
700 }
701
702 impl StdError for Errors {}
703
704 impl fmt::Display for Errors {
fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result705 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
706 fmt::Debug::fmt(self, f)
707 }
708 }
709
#[cfg(test)]
mod tests {
    use super::{find_char, Mapping};

    /// The characters that `Mapper` returns without a table lookup (`-`, `.`, ASCII digits
    /// and ASCII lowercase letters) must all be `Valid` in the mapping table.
    #[test]
    fn mapping_fast_path() {
        for c in ['-', '.']
            .iter()
            .copied()
            .chain((b'0'..=b'9').map(char::from))
            .chain((b'a'..=b'z').map(char::from))
        {
            assert_matches!(find_char(c), &Mapping::Valid);
        }
    }
}
729