1 // Copyright 2013-2014 The rust-url developers.
2 //
3 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
4 // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
5 // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
6 // option. This file may not be copied, modified, or distributed
7 // except according to those terms.
8
9 //! [*Unicode IDNA Compatibility Processing*
10 //! (Unicode Technical Standard #46)](http://www.unicode.org/reports/tr46/)
11
12 use self::Mapping::*;
13 use punycode;
14 use std::cmp::Ordering::{Equal, Greater, Less};
15 use unicode_bidi::{bidi_class, BidiClass};
16 use unicode_normalization::char::is_combining_mark;
17 use unicode_normalization::UnicodeNormalization;
18
19 include!("uts46_mapping_table.rs");
20
/// ACE prefix: a label that starts with this string is treated as Punycode-encoded,
/// and the remainder of the label is handed to the Punycode decoder.
const PUNYCODE_PREFIX: &'static str = "xn--";
22
/// Compact reference to a substring of `STRING_TABLE`; resolved by `decode_slice`.
#[derive(Debug)]
struct StringTableSlice {
    // Store these as separate fields so the structure will have an
    // alignment of 1 and thus pack better into the Mapping enum, below.
    /// Low byte of the 16-bit start offset into `STRING_TABLE`.
    byte_start_lo: u8,
    /// High byte of the 16-bit start offset into `STRING_TABLE`.
    byte_start_hi: u8,
    /// Length of the referenced substring, in bytes.
    byte_len: u8,
}
31
decode_slice(slice: &StringTableSlice) -> &'static str32 fn decode_slice(slice: &StringTableSlice) -> &'static str {
33 let lo = slice.byte_start_lo as usize;
34 let hi = slice.byte_start_hi as usize;
35 let start = (hi << 8) | lo;
36 let len = slice.byte_len as usize;
37 &STRING_TABLE[start..(start + len)]
38 }
39
/// Status of a code point in the mapping table, as consumed by `map_char` and `validate`.
#[repr(u8)]
#[derive(Debug)]
enum Mapping {
    /// The code point is copied to the output unchanged.
    Valid,
    /// The code point is dropped from the output.
    Ignored,
    /// The code point is replaced by the referenced string.
    Mapped(StringTableSlice),
    /// Replaced by the referenced string under transitional processing,
    /// otherwise copied unchanged.
    Deviation(StringTableSlice),
    /// An error is recorded; the code point is still copied to the output.
    Disallowed,
    /// Copied unchanged; an error is recorded only when STD3 ASCII rules are in use.
    DisallowedStd3Valid,
    /// Replaced by the referenced string; an error is recorded only when
    /// STD3 ASCII rules are in use.
    DisallowedStd3Mapped(StringTableSlice),
}
51
/// A range of code points in `TABLE`. `find_char` treats both ends as part of
/// the range.
struct Range {
    /// First code point of the range.
    from: char,
    /// Last code point of the range.
    to: char,
}
56
find_char(codepoint: char) -> &'static Mapping57 fn find_char(codepoint: char) -> &'static Mapping {
58 let r = TABLE.binary_search_by(|ref range| {
59 if codepoint > range.to {
60 Less
61 } else if codepoint < range.from {
62 Greater
63 } else {
64 Equal
65 }
66 });
67 r.ok()
68 .map(|i| {
69 const SINGLE_MARKER: u16 = 1 << 15;
70
71 let x = INDEX_TABLE[i];
72 let single = (x & SINGLE_MARKER) != 0;
73 let offset = !SINGLE_MARKER & x;
74
75 if single {
76 &MAPPING_TABLE[offset as usize]
77 } else {
78 &MAPPING_TABLE[(offset + (codepoint as u16 - TABLE[i].from as u16)) as usize]
79 }
80 })
81 .unwrap()
82 }
83
map_char(codepoint: char, config: Config, output: &mut String, errors: &mut Vec<Error>)84 fn map_char(codepoint: char, config: Config, output: &mut String, errors: &mut Vec<Error>) {
85 match *find_char(codepoint) {
86 Mapping::Valid => output.push(codepoint),
87 Mapping::Ignored => {}
88 Mapping::Mapped(ref slice) => output.push_str(decode_slice(slice)),
89 Mapping::Deviation(ref slice) => {
90 if config.transitional_processing {
91 output.push_str(decode_slice(slice))
92 } else {
93 output.push(codepoint)
94 }
95 }
96 Mapping::Disallowed => {
97 errors.push(Error::DissallowedCharacter);
98 output.push(codepoint);
99 }
100 Mapping::DisallowedStd3Valid => {
101 if config.use_std3_ascii_rules {
102 errors.push(Error::DissallowedByStd3AsciiRules);
103 }
104 output.push(codepoint)
105 }
106 Mapping::DisallowedStd3Mapped(ref slice) => {
107 if config.use_std3_ascii_rules {
108 errors.push(Error::DissallowedMappedInStd3);
109 }
110 output.push_str(decode_slice(slice))
111 }
112 }
113 }
114
115 // http://tools.ietf.org/html/rfc5893#section-2
passes_bidi(label: &str, is_bidi_domain: bool) -> bool116 fn passes_bidi(label: &str, is_bidi_domain: bool) -> bool {
117 // Rule 0: Bidi Rules apply to Bidi Domain Names: a name with at least one RTL label. A label
118 // is RTL if it contains at least one character of bidi class R, AL or AN.
119 if !is_bidi_domain {
120 return true;
121 }
122
123 let mut chars = label.chars();
124 let first_char_class = match chars.next() {
125 Some(c) => bidi_class(c),
126 None => return true, // empty string
127 };
128
129 match first_char_class {
130 // LTR label
131 BidiClass::L => {
132 // Rule 5
133 loop {
134 match chars.next() {
135 Some(c) => {
136 if !matches!(
137 bidi_class(c),
138 BidiClass::L
139 | BidiClass::EN
140 | BidiClass::ES
141 | BidiClass::CS
142 | BidiClass::ET
143 | BidiClass::ON
144 | BidiClass::BN
145 | BidiClass::NSM
146 ) {
147 return false;
148 }
149 }
150 None => {
151 break;
152 }
153 }
154 }
155
156 // Rule 6
157 // must end in L or EN followed by 0 or more NSM
158 let mut rev_chars = label.chars().rev();
159 let mut last_non_nsm = rev_chars.next();
160 loop {
161 match last_non_nsm {
162 Some(c) if bidi_class(c) == BidiClass::NSM => {
163 last_non_nsm = rev_chars.next();
164 continue;
165 }
166 _ => {
167 break;
168 }
169 }
170 }
171 match last_non_nsm {
172 Some(c) if bidi_class(c) == BidiClass::L || bidi_class(c) == BidiClass::EN => {}
173 Some(_) => {
174 return false;
175 }
176 _ => {}
177 }
178 }
179
180 // RTL label
181 BidiClass::R | BidiClass::AL => {
182 let mut found_en = false;
183 let mut found_an = false;
184
185 // Rule 2
186 loop {
187 match chars.next() {
188 Some(c) => {
189 let char_class = bidi_class(c);
190
191 if char_class == BidiClass::EN {
192 found_en = true;
193 }
194 if char_class == BidiClass::AN {
195 found_an = true;
196 }
197
198 if !matches!(
199 char_class,
200 BidiClass::R
201 | BidiClass::AL
202 | BidiClass::AN
203 | BidiClass::EN
204 | BidiClass::ES
205 | BidiClass::CS
206 | BidiClass::ET
207 | BidiClass::ON
208 | BidiClass::BN
209 | BidiClass::NSM
210 ) {
211 return false;
212 }
213 }
214 None => {
215 break;
216 }
217 }
218 }
219 // Rule 3
220 let mut rev_chars = label.chars().rev();
221 let mut last = rev_chars.next();
222 loop {
223 // must end in L or EN followed by 0 or more NSM
224 match last {
225 Some(c) if bidi_class(c) == BidiClass::NSM => {
226 last = rev_chars.next();
227 continue;
228 }
229 _ => {
230 break;
231 }
232 }
233 }
234 match last {
235 Some(c)
236 if matches!(
237 bidi_class(c),
238 BidiClass::R | BidiClass::AL | BidiClass::EN | BidiClass::AN
239 ) => {}
240 _ => {
241 return false;
242 }
243 }
244
245 // Rule 4
246 if found_an && found_en {
247 return false;
248 }
249 }
250
251 // Rule 1: Should start with L or R/AL
252 _ => {
253 return false;
254 }
255 }
256
257 return true;
258 }
259
260 /// http://www.unicode.org/reports/tr46/#Validity_Criteria
validate_full(label: &str, is_bidi_domain: bool, config: Config, errors: &mut Vec<Error>)261 fn validate_full(label: &str, is_bidi_domain: bool, config: Config, errors: &mut Vec<Error>) {
262 // V1: Must be in NFC form.
263 if label.nfc().ne(label.chars()) {
264 errors.push(Error::ValidityCriteria);
265 } else {
266 validate(label, is_bidi_domain, config, errors);
267 }
268 }
269
validate(label: &str, is_bidi_domain: bool, config: Config, errors: &mut Vec<Error>)270 fn validate(label: &str, is_bidi_domain: bool, config: Config, errors: &mut Vec<Error>) {
271 let first_char = label.chars().next();
272 if first_char == None {
273 // Empty string, pass
274 }
275 // V2: No U+002D HYPHEN-MINUS in both third and fourth positions.
276 //
277 // NOTE: Spec says that the label must not contain a HYPHEN-MINUS character in both the
278 // third and fourth positions. But nobody follows this criteria. See the spec issue below:
279 // https://github.com/whatwg/url/issues/53
280
281 // V3: neither begin nor end with a U+002D HYPHEN-MINUS
282 else if config.check_hyphens && (label.starts_with("-") || label.ends_with("-")) {
283 errors.push(Error::ValidityCriteria);
284 }
285 // V4: not contain a U+002E FULL STOP
286 //
287 // Here, label can't contain '.' since the input is from .split('.')
288
289 // V5: not begin with a GC=Mark
290 else if is_combining_mark(first_char.unwrap()) {
291 errors.push(Error::ValidityCriteria);
292 }
293 // V6: Check against Mapping Table
294 else if label.chars().any(|c| match *find_char(c) {
295 Mapping::Valid => false,
296 Mapping::Deviation(_) => config.transitional_processing,
297 Mapping::DisallowedStd3Valid => config.use_std3_ascii_rules,
298 _ => true,
299 }) {
300 errors.push(Error::ValidityCriteria);
301 }
302 // V7: ContextJ rules
303 //
304 // TODO: Implement rules and add *CheckJoiners* flag.
305
306 // V8: Bidi rules
307 //
308 // TODO: Add *CheckBidi* flag
309 else if !passes_bidi(label, is_bidi_domain) {
310 errors.push(Error::ValidityCriteria);
311 }
312 }
313
314 /// http://www.unicode.org/reports/tr46/#Processing
processing(domain: &str, config: Config, errors: &mut Vec<Error>) -> String315 fn processing(domain: &str, config: Config, errors: &mut Vec<Error>) -> String {
316 let mut mapped = String::with_capacity(domain.len());
317 for c in domain.chars() {
318 map_char(c, config, &mut mapped, errors)
319 }
320 let mut normalized = String::with_capacity(mapped.len());
321 normalized.extend(mapped.nfc());
322
323 // Find out if it's a Bidi Domain Name
324 //
325 // First, check for literal bidi chars
326 let mut is_bidi_domain = domain
327 .chars()
328 .any(|c| matches!(bidi_class(c), BidiClass::R | BidiClass::AL | BidiClass::AN));
329 if !is_bidi_domain {
330 // Then check for punycode-encoded bidi chars
331 for label in normalized.split('.') {
332 if label.starts_with(PUNYCODE_PREFIX) {
333 match punycode::decode_to_string(&label[PUNYCODE_PREFIX.len()..]) {
334 Some(decoded_label) => {
335 if decoded_label.chars().any(|c| {
336 matches!(bidi_class(c), BidiClass::R | BidiClass::AL | BidiClass::AN)
337 }) {
338 is_bidi_domain = true;
339 }
340 }
341 None => {
342 is_bidi_domain = true;
343 }
344 }
345 }
346 }
347 }
348
349 let mut validated = String::new();
350 let mut first = true;
351 for label in normalized.split('.') {
352 if !first {
353 validated.push('.');
354 }
355 first = false;
356 if label.starts_with(PUNYCODE_PREFIX) {
357 match punycode::decode_to_string(&label[PUNYCODE_PREFIX.len()..]) {
358 Some(decoded_label) => {
359 let config = config.transitional_processing(false);
360 validate_full(&decoded_label, is_bidi_domain, config, errors);
361 validated.push_str(&decoded_label)
362 }
363 None => errors.push(Error::PunycodeError),
364 }
365 } else {
366 // `normalized` is already `NFC` so we can skip that check
367 validate(label, is_bidi_domain, config, errors);
368 validated.push_str(label)
369 }
370 }
371 validated
372 }
373
/// Options for UTS #46 processing, set through the builder-style methods of the
/// same names and consumed by `to_ascii` / `to_unicode`.
#[derive(Clone, Copy, Debug)]
pub struct Config {
    /// When set, characters whose mapping is disallowed under STD3 ASCII rules
    /// are reported as errors.
    use_std3_ascii_rules: bool,
    /// When set, deviation characters are replaced by their mapping instead of
    /// being kept as they are.
    transitional_processing: bool,
    /// When set, `to_ascii` additionally checks the DNS length limits on its result.
    verify_dns_length: bool,
    /// When set, a label that begins or ends with `-` is reported as invalid.
    check_hyphens: bool,
}
381
382 /// The defaults are that of https://url.spec.whatwg.org/#idna
383 impl Default for Config {
default() -> Self384 fn default() -> Self {
385 Config {
386 use_std3_ascii_rules: false,
387 transitional_processing: false,
388 check_hyphens: false,
389 // check_bidi: true,
390 // check_joiners: true,
391
392 // Only use for to_ascii, not to_unicode
393 verify_dns_length: false,
394 }
395 }
396 }
397
398 impl Config {
399 #[inline]
use_std3_ascii_rules(mut self, value: bool) -> Self400 pub fn use_std3_ascii_rules(mut self, value: bool) -> Self {
401 self.use_std3_ascii_rules = value;
402 self
403 }
404
405 #[inline]
transitional_processing(mut self, value: bool) -> Self406 pub fn transitional_processing(mut self, value: bool) -> Self {
407 self.transitional_processing = value;
408 self
409 }
410
411 #[inline]
verify_dns_length(mut self, value: bool) -> Self412 pub fn verify_dns_length(mut self, value: bool) -> Self {
413 self.verify_dns_length = value;
414 self
415 }
416
417 #[inline]
check_hyphens(mut self, value: bool) -> Self418 pub fn check_hyphens(mut self, value: bool) -> Self {
419 self.check_hyphens = value;
420 self
421 }
422
423 /// http://www.unicode.org/reports/tr46/#ToASCII
to_ascii(self, domain: &str) -> Result<String, Errors>424 pub fn to_ascii(self, domain: &str) -> Result<String, Errors> {
425 let mut errors = Vec::new();
426 let mut result = String::new();
427 let mut first = true;
428 for label in processing(domain, self, &mut errors).split('.') {
429 if !first {
430 result.push('.');
431 }
432 first = false;
433 if label.is_ascii() {
434 result.push_str(label);
435 } else {
436 match punycode::encode_str(label) {
437 Some(x) => {
438 result.push_str(PUNYCODE_PREFIX);
439 result.push_str(&x);
440 }
441 None => errors.push(Error::PunycodeError),
442 }
443 }
444 }
445
446 if self.verify_dns_length {
447 let domain = if result.ends_with(".") {
448 &result[..result.len() - 1]
449 } else {
450 &*result
451 };
452 if domain.len() < 1 || domain.split('.').any(|label| label.len() < 1) {
453 errors.push(Error::TooShortForDns)
454 }
455 if domain.len() > 253 || domain.split('.').any(|label| label.len() > 63) {
456 errors.push(Error::TooLongForDns)
457 }
458 }
459 if errors.is_empty() {
460 Ok(result)
461 } else {
462 Err(Errors(errors))
463 }
464 }
465
466 /// http://www.unicode.org/reports/tr46/#ToUnicode
to_unicode(self, domain: &str) -> (String, Result<(), Errors>)467 pub fn to_unicode(self, domain: &str) -> (String, Result<(), Errors>) {
468 let mut errors = Vec::new();
469 let domain = processing(domain, self, &mut errors);
470 let errors = if errors.is_empty() {
471 Ok(())
472 } else {
473 Err(Errors(errors))
474 };
475 (domain, errors)
476 }
477 }
478
/// The individual problems that processing can record; collected into `Errors`.
#[derive(PartialEq, Eq, Clone, Copy, Debug)]
enum Error {
    /// Punycode decoding or encoding of a label failed.
    PunycodeError,
    /// A label failed one of the validity criteria checks.
    ValidityCriteria,
    /// A character mapped as `DisallowedStd3Valid` was seen while STD3 ASCII rules are in use.
    DissallowedByStd3AsciiRules,
    /// A character mapped as `DisallowedStd3Mapped` was seen while STD3 ASCII rules are in use.
    DissallowedMappedInStd3,
    /// A character mapped as `Disallowed` was seen.
    DissallowedCharacter,
    /// DNS length verification: the name is longer than 253 bytes or a label is
    /// longer than 63 bytes.
    TooLongForDns,
    /// DNS length verification: the name or one of its labels is empty.
    TooShortForDns,
}
489
/// Errors recorded during UTS #46 processing.
///
/// Returned by `Config::to_ascii` and `Config::to_unicode` when at least one
/// problem was recorded.
///
/// This is opaque for now, only indicating the presence of at least one error.
/// More details may be exposed in the future.
#[derive(Debug)]
pub struct Errors(Vec<Error>);
496