1 // Copyright 2013 The rust-url developers.
2 //
3 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
4 // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
5 // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
6 // option. This file may not be copied, modified, or distributed
7 // except according to those terms.
8 
9 //! Punycode ([RFC 3492](http://tools.ietf.org/html/rfc3492)) implementation.
10 //!
11 //! Since Punycode fundamentally works on unicode code points,
12 //! `encode` and `decode` take and return slices and vectors of `char`.
13 //! `encode_str` and `decode_to_string` provide convenience wrappers
14 //! that convert from and to Rust’s UTF-8 based `str` and `String` types.
15 
16 use std::char;
17 use std::u32;
18 
19 // Bootstring parameters for Punycode
20 static BASE: u32 = 36;
21 static T_MIN: u32 = 1;
22 static T_MAX: u32 = 26;
23 static SKEW: u32 = 38;
24 static DAMP: u32 = 700;
25 static INITIAL_BIAS: u32 = 72;
26 static INITIAL_N: u32 = 0x80;
27 static DELIMITER: char = '-';
28 
29 #[inline]
adapt(mut delta: u32, num_points: u32, first_time: bool) -> u3230 fn adapt(mut delta: u32, num_points: u32, first_time: bool) -> u32 {
31     delta /= if first_time { DAMP } else { 2 };
32     delta += delta / num_points;
33     let mut k = 0;
34     while delta > ((BASE - T_MIN) * T_MAX) / 2 {
35         delta /= BASE - T_MIN;
36         k += BASE;
37     }
38     k + (((BASE - T_MIN + 1) * delta) / (delta + SKEW))
39 }
40 
41 /// Convert Punycode to an Unicode `String`.
42 ///
43 /// This is a convenience wrapper around `decode`.
44 #[inline]
decode_to_string(input: &str) -> Option<String>45 pub fn decode_to_string(input: &str) -> Option<String> {
46     decode(input).map(|chars| chars.into_iter().collect())
47 }
48 
49 /// Convert Punycode to Unicode.
50 ///
51 /// Return None on malformed input or overflow.
52 /// Overflow can only happen on inputs that take more than
53 /// 63 encoded bytes, the DNS limit on domain name labels.
decode(input: &str) -> Option<Vec<char>>54 pub fn decode(input: &str) -> Option<Vec<char>> {
55     Some(Decoder::default().decode(input).ok()?.collect())
56 }
57 
58 #[derive(Default)]
59 pub(crate) struct Decoder {
60     insertions: Vec<(usize, char)>,
61 }
62 
63 impl Decoder {
64     /// Split the input iterator and return a Vec with insertions of encoded characters
decode<'a>(&'a mut self, input: &'a str) -> Result<Decode<'a>, ()>65     pub(crate) fn decode<'a>(&'a mut self, input: &'a str) -> Result<Decode<'a>, ()> {
66         self.insertions.clear();
67         // Handle "basic" (ASCII) code points.
68         // They are encoded as-is before the last delimiter, if any.
69         let (base, input) = match input.rfind(DELIMITER) {
70             None => ("", input),
71             Some(position) => (
72                 &input[..position],
73                 if position > 0 {
74                     &input[position + 1..]
75                 } else {
76                     input
77                 },
78             ),
79         };
80 
81         if !base.is_ascii() {
82             return Err(());
83         }
84 
85         let base_len = base.len();
86         let mut length = base_len as u32;
87         let mut code_point = INITIAL_N;
88         let mut bias = INITIAL_BIAS;
89         let mut i = 0;
90         let mut iter = input.bytes();
91         loop {
92             let previous_i = i;
93             let mut weight = 1;
94             let mut k = BASE;
95             let mut byte = match iter.next() {
96                 None => break,
97                 Some(byte) => byte,
98             };
99 
100             // Decode a generalized variable-length integer into delta,
101             // which gets added to i.
102             loop {
103                 let digit = match byte {
104                     byte @ b'0'..=b'9' => byte - b'0' + 26,
105                     byte @ b'A'..=b'Z' => byte - b'A',
106                     byte @ b'a'..=b'z' => byte - b'a',
107                     _ => return Err(()),
108                 } as u32;
109                 if digit > (u32::MAX - i) / weight {
110                     return Err(()); // Overflow
111                 }
112                 i += digit * weight;
113                 let t = if k <= bias {
114                     T_MIN
115                 } else if k >= bias + T_MAX {
116                     T_MAX
117                 } else {
118                     k - bias
119                 };
120                 if digit < t {
121                     break;
122                 }
123                 if weight > u32::MAX / (BASE - t) {
124                     return Err(()); // Overflow
125                 }
126                 weight *= BASE - t;
127                 k += BASE;
128                 byte = match iter.next() {
129                     None => return Err(()), // End of input before the end of this delta
130                     Some(byte) => byte,
131                 };
132             }
133 
134             bias = adapt(i - previous_i, length + 1, previous_i == 0);
135             if i / (length + 1) > u32::MAX - code_point {
136                 return Err(()); // Overflow
137             }
138 
139             // i was supposed to wrap around from length+1 to 0,
140             // incrementing code_point each time.
141             code_point += i / (length + 1);
142             i %= length + 1;
143             let c = match char::from_u32(code_point) {
144                 Some(c) => c,
145                 None => return Err(()),
146             };
147 
148             // Move earlier insertions farther out in the string
149             for (idx, _) in &mut self.insertions {
150                 if *idx >= i as usize {
151                     *idx += 1;
152                 }
153             }
154             self.insertions.push((i as usize, c));
155             length += 1;
156             i += 1;
157         }
158 
159         self.insertions.sort_by_key(|(i, _)| *i);
160         Ok(Decode {
161             base: base.chars(),
162             insertions: &self.insertions,
163             inserted: 0,
164             position: 0,
165             len: base_len + self.insertions.len(),
166         })
167     }
168 }
169 
170 pub(crate) struct Decode<'a> {
171     base: std::str::Chars<'a>,
172     pub(crate) insertions: &'a [(usize, char)],
173     inserted: usize,
174     position: usize,
175     len: usize,
176 }
177 
178 impl<'a> Iterator for Decode<'a> {
179     type Item = char;
180 
next(&mut self) -> Option<Self::Item>181     fn next(&mut self) -> Option<Self::Item> {
182         loop {
183             match self.insertions.get(self.inserted) {
184                 Some((pos, c)) if *pos == self.position => {
185                     self.inserted += 1;
186                     self.position += 1;
187                     return Some(*c);
188                 }
189                 _ => {}
190             }
191             if let Some(c) = self.base.next() {
192                 self.position += 1;
193                 return Some(c);
194             } else if self.inserted >= self.insertions.len() {
195                 return None;
196             }
197         }
198     }
199 
size_hint(&self) -> (usize, Option<usize>)200     fn size_hint(&self) -> (usize, Option<usize>) {
201         let len = self.len - self.position;
202         (len, Some(len))
203     }
204 }
205 
206 impl<'a> ExactSizeIterator for Decode<'a> {
len(&self) -> usize207     fn len(&self) -> usize {
208         self.len - self.position
209     }
210 }
211 
212 /// Convert an Unicode `str` to Punycode.
213 ///
214 /// This is a convenience wrapper around `encode`.
215 #[inline]
encode_str(input: &str) -> Option<String>216 pub fn encode_str(input: &str) -> Option<String> {
217     let mut buf = String::with_capacity(input.len());
218     encode_into(input.chars(), &mut buf).ok().map(|()| buf)
219 }
220 
221 /// Convert Unicode to Punycode.
222 ///
223 /// Return None on overflow, which can only happen on inputs that would take more than
224 /// 63 encoded bytes, the DNS limit on domain name labels.
encode(input: &[char]) -> Option<String>225 pub fn encode(input: &[char]) -> Option<String> {
226     let mut buf = String::with_capacity(input.len());
227     encode_into(input.iter().copied(), &mut buf)
228         .ok()
229         .map(|()| buf)
230 }
231 
encode_into<I>(input: I, output: &mut String) -> Result<(), ()> where I: Iterator<Item = char> + Clone,232 pub(crate) fn encode_into<I>(input: I, output: &mut String) -> Result<(), ()>
233 where
234     I: Iterator<Item = char> + Clone,
235 {
236     // Handle "basic" (ASCII) code points. They are encoded as-is.
237     let (mut input_length, mut basic_length) = (0, 0);
238     for c in input.clone() {
239         input_length += 1;
240         if c.is_ascii() {
241             output.push(c);
242             basic_length += 1;
243         }
244     }
245 
246     if basic_length > 0 {
247         output.push('-')
248     }
249     let mut code_point = INITIAL_N;
250     let mut delta = 0;
251     let mut bias = INITIAL_BIAS;
252     let mut processed = basic_length;
253     while processed < input_length {
254         // All code points < code_point have been handled already.
255         // Find the next larger one.
256         let min_code_point = input
257             .clone()
258             .map(|c| c as u32)
259             .filter(|&c| c >= code_point)
260             .min()
261             .unwrap();
262         if min_code_point - code_point > (u32::MAX - delta) / (processed + 1) {
263             return Err(()); // Overflow
264         }
265         // Increase delta to advance the decoder’s <code_point,i> state to <min_code_point,0>
266         delta += (min_code_point - code_point) * (processed + 1);
267         code_point = min_code_point;
268         for c in input.clone() {
269             let c = c as u32;
270             if c < code_point {
271                 delta += 1;
272                 if delta == 0 {
273                     return Err(()); // Overflow
274                 }
275             }
276             if c == code_point {
277                 // Represent delta as a generalized variable-length integer:
278                 let mut q = delta;
279                 let mut k = BASE;
280                 loop {
281                     let t = if k <= bias {
282                         T_MIN
283                     } else if k >= bias + T_MAX {
284                         T_MAX
285                     } else {
286                         k - bias
287                     };
288                     if q < t {
289                         break;
290                     }
291                     let value = t + ((q - t) % (BASE - t));
292                     output.push(value_to_digit(value));
293                     q = (q - t) / (BASE - t);
294                     k += BASE;
295                 }
296                 output.push(value_to_digit(q));
297                 bias = adapt(delta, processed + 1, processed == basic_length);
298                 delta = 0;
299                 processed += 1;
300             }
301         }
302         delta += 1;
303         code_point += 1;
304     }
305     Ok(())
306 }
307 
308 #[inline]
value_to_digit(value: u32) -> char309 fn value_to_digit(value: u32) -> char {
310     match value {
311         0..=25 => (value as u8 + b'a') as char,       // a..z
312         26..=35 => (value as u8 - 26 + b'0') as char, // 0..9
313         _ => panic!(),
314     }
315 }
316