1 // Copyright 2013 The rust-url developers.
2 //
3 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
4 // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
5 // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
6 // option. This file may not be copied, modified, or distributed
7 // except according to those terms.
8 
9 //! Punycode ([RFC 3492](http://tools.ietf.org/html/rfc3492)) implementation.
10 //!
11 //! Since Punycode fundamentally works on unicode code points,
12 //! `encode` and `decode` take and return slices and vectors of `char`.
13 //! `encode_str` and `decode_to_string` provide convenience wrappers
14 //! that convert from and to Rust’s UTF-8 based `str` and `String` types.
15 
16 use std::char;
17 use std::u32;
18 
19 // Bootstring parameters for Punycode
20 static BASE: u32 = 36;
21 static T_MIN: u32 = 1;
22 static T_MAX: u32 = 26;
23 static SKEW: u32 = 38;
24 static DAMP: u32 = 700;
25 static INITIAL_BIAS: u32 = 72;
26 static INITIAL_N: u32 = 0x80;
27 static DELIMITER: char = '-';
28 
29 #[inline]
adapt(mut delta: u32, num_points: u32, first_time: bool) -> u3230 fn adapt(mut delta: u32, num_points: u32, first_time: bool) -> u32 {
31     delta /= if first_time { DAMP } else { 2 };
32     delta += delta / num_points;
33     let mut k = 0;
34     while delta > ((BASE - T_MIN) * T_MAX) / 2 {
35         delta /= BASE - T_MIN;
36         k += BASE;
37     }
38     k + (((BASE - T_MIN + 1) * delta) / (delta + SKEW))
39 }
40 
41 /// Convert Punycode to an Unicode `String`.
42 ///
43 /// This is a convenience wrapper around `decode`.
44 #[inline]
decode_to_string(input: &str) -> Option<String>45 pub fn decode_to_string(input: &str) -> Option<String> {
46     decode(input).map(|chars| chars.into_iter().collect())
47 }
48 
49 /// Convert Punycode to Unicode.
50 ///
51 /// Return None on malformed input or overflow.
52 /// Overflow can only happen on inputs that take more than
53 /// 63 encoded bytes, the DNS limit on domain name labels.
decode(input: &str) -> Option<Vec<char>>54 pub fn decode(input: &str) -> Option<Vec<char>> {
55     // Handle "basic" (ASCII) code points.
56     // They are encoded as-is before the last delimiter, if any.
57     let (mut output, input) = match input.rfind(DELIMITER) {
58         None => (Vec::new(), input),
59         Some(position) => (
60             input[..position].chars().collect(),
61             if position > 0 {
62                 &input[position + 1..]
63             } else {
64                 input
65             },
66         ),
67     };
68     let mut code_point = INITIAL_N;
69     let mut bias = INITIAL_BIAS;
70     let mut i = 0;
71     let mut iter = input.bytes();
72     loop {
73         let previous_i = i;
74         let mut weight = 1;
75         let mut k = BASE;
76         let mut byte = match iter.next() {
77             None => break,
78             Some(byte) => byte,
79         };
80         // Decode a generalized variable-length integer into delta,
81         // which gets added to i.
82         loop {
83             let digit = match byte {
84                 byte @ b'0'..=b'9' => byte - b'0' + 26,
85                 byte @ b'A'..=b'Z' => byte - b'A',
86                 byte @ b'a'..=b'z' => byte - b'a',
87                 _ => return None,
88             } as u32;
89             if digit > (u32::MAX - i) / weight {
90                 return None; // Overflow
91             }
92             i += digit * weight;
93             let t = if k <= bias {
94                 T_MIN
95             } else if k >= bias + T_MAX {
96                 T_MAX
97             } else {
98                 k - bias
99             };
100             if digit < t {
101                 break;
102             }
103             if weight > u32::MAX / (BASE - t) {
104                 return None; // Overflow
105             }
106             weight *= BASE - t;
107             k += BASE;
108             byte = match iter.next() {
109                 None => return None, // End of input before the end of this delta
110                 Some(byte) => byte,
111             };
112         }
113         let length = output.len() as u32;
114         bias = adapt(i - previous_i, length + 1, previous_i == 0);
115         if i / (length + 1) > u32::MAX - code_point {
116             return None; // Overflow
117         }
118         // i was supposed to wrap around from length+1 to 0,
119         // incrementing code_point each time.
120         code_point += i / (length + 1);
121         i %= length + 1;
122         let c = match char::from_u32(code_point) {
123             Some(c) => c,
124             None => return None,
125         };
126         output.insert(i as usize, c);
127         i += 1;
128     }
129     Some(output)
130 }
131 
132 /// Convert an Unicode `str` to Punycode.
133 ///
134 /// This is a convenience wrapper around `encode`.
135 #[inline]
encode_str(input: &str) -> Option<String>136 pub fn encode_str(input: &str) -> Option<String> {
137     encode(&input.chars().collect::<Vec<char>>())
138 }
139 
140 /// Convert Unicode to Punycode.
141 ///
142 /// Return None on overflow, which can only happen on inputs that would take more than
143 /// 63 encoded bytes, the DNS limit on domain name labels.
encode(input: &[char]) -> Option<String>144 pub fn encode(input: &[char]) -> Option<String> {
145     // Handle "basic" (ASCII) code points. They are encoded as-is.
146     let output_bytes = input
147         .iter()
148         .filter_map(|&c| if c.is_ascii() { Some(c as u8) } else { None })
149         .collect();
150     let mut output = unsafe { String::from_utf8_unchecked(output_bytes) };
151     let basic_length = output.len() as u32;
152     if basic_length > 0 {
153         output.push_str("-")
154     }
155     let mut code_point = INITIAL_N;
156     let mut delta = 0;
157     let mut bias = INITIAL_BIAS;
158     let mut processed = basic_length;
159     let input_length = input.len() as u32;
160     while processed < input_length {
161         // All code points < code_point have been handled already.
162         // Find the next larger one.
163         let min_code_point = input
164             .iter()
165             .map(|&c| c as u32)
166             .filter(|&c| c >= code_point)
167             .min()
168             .unwrap();
169         if min_code_point - code_point > (u32::MAX - delta) / (processed + 1) {
170             return None; // Overflow
171         }
172         // Increase delta to advance the decoder’s <code_point,i> state to <min_code_point,0>
173         delta += (min_code_point - code_point) * (processed + 1);
174         code_point = min_code_point;
175         for &c in input {
176             let c = c as u32;
177             if c < code_point {
178                 delta += 1;
179                 if delta == 0 {
180                     return None; // Overflow
181                 }
182             }
183             if c == code_point {
184                 // Represent delta as a generalized variable-length integer:
185                 let mut q = delta;
186                 let mut k = BASE;
187                 loop {
188                     let t = if k <= bias {
189                         T_MIN
190                     } else if k >= bias + T_MAX {
191                         T_MAX
192                     } else {
193                         k - bias
194                     };
195                     if q < t {
196                         break;
197                     }
198                     let value = t + ((q - t) % (BASE - t));
199                     output.push(value_to_digit(value));
200                     q = (q - t) / (BASE - t);
201                     k += BASE;
202                 }
203                 output.push(value_to_digit(q));
204                 bias = adapt(delta, processed + 1, processed == basic_length);
205                 delta = 0;
206                 processed += 1;
207             }
208         }
209         delta += 1;
210         code_point += 1;
211     }
212     Some(output)
213 }
214 
215 #[inline]
value_to_digit(value: u32) -> char216 fn value_to_digit(value: u32) -> char {
217     match value {
218         0..=25 => (value as u8 + 'a' as u8) as char, // a..z
219         26..=35 => (value as u8 - 26 + '0' as u8) as char, // 0..9
220         _ => panic!(),
221     }
222 }
223