1 // Copyright 2013 Simon Sapin.
2 //
3 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
4 // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
5 // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
6 // option. This file may not be copied, modified, or distributed
7 // except according to those terms.
8 
9 //! Punycode ([RFC 3492](http://tools.ietf.org/html/rfc3492)) implementation.
10 //!
11 //! Since Punycode fundamentally works on unicode code points,
12 //! `encode` and `decode` take and return slices and vectors of `char`.
13 //! `encode_str` and `decode_to_string` provide convenience wrappers
14 //! that convert from and to Rust’s UTF-8 based `str` and `String` types.
15 
16 use std::u32;
17 use std::char;
18 use std::ascii::AsciiExt;
19 
20 // Bootstring parameters for Punycode
21 static BASE: u32 = 36;
22 static T_MIN: u32 = 1;
23 static T_MAX: u32 = 26;
24 static SKEW: u32 = 38;
25 static DAMP: u32 = 700;
26 static INITIAL_BIAS: u32 = 72;
27 static INITIAL_N: u32 = 0x80;
28 static DELIMITER: char = '-';
29 
30 
31 #[inline]
adapt(mut delta: u32, num_points: u32, first_time: bool) -> u3232 fn adapt(mut delta: u32, num_points: u32, first_time: bool) -> u32 {
33     delta /= if first_time { DAMP } else { 2 };
34     delta += delta / num_points;
35     let mut k = 0;
36     while delta > ((BASE - T_MIN) * T_MAX) / 2 {
37         delta /= BASE - T_MIN;
38         k += BASE;
39     }
40     k + (((BASE - T_MIN + 1) * delta) / (delta + SKEW))
41 }
42 
43 
44 /// Convert Punycode to an Unicode `String`.
45 ///
46 /// This is a convenience wrapper around `decode`.
47 #[inline]
decode_to_string(input: &str) -> Option<String>48 pub fn decode_to_string(input: &str) -> Option<String> {
49     decode(input).map(|chars| chars.into_iter().collect())
50 }
51 
52 
53 /// Convert Punycode to Unicode.
54 ///
55 /// Return None on malformed input or overflow.
56 /// Overflow can only happen on inputs that take more than
57 /// 63 encoded bytes, the DNS limit on domain name labels.
decode(input: &str) -> Option<Vec<char>>58 pub fn decode(input: &str) -> Option<Vec<char>> {
59     // Handle "basic" (ASCII) code points.
60     // They are encoded as-is before the last delimiter, if any.
61     let (mut output, input) = match input.rfind(DELIMITER) {
62         None => (Vec::new(), input),
63         Some(position) => (
64             input[..position].chars().collect(),
65             if position > 0 { &input[position + 1..] } else { input }
66         )
67     };
68     let mut code_point = INITIAL_N;
69     let mut bias = INITIAL_BIAS;
70     let mut i = 0;
71     let mut iter = input.bytes();
72     loop {
73         let previous_i = i;
74         let mut weight = 1;
75         let mut k = BASE;
76         let mut byte = match iter.next() {
77             None => break,
78             Some(byte) => byte,
79         };
80         // Decode a generalized variable-length integer into delta,
81         // which gets added to i.
82         loop {
83             let digit = match byte {
84                 byte @ b'0' ... b'9' => byte - b'0' + 26,
85                 byte @ b'A' ... b'Z' => byte - b'A',
86                 byte @ b'a' ... b'z' => byte - b'a',
87                 _ => return None
88             } as u32;
89             if digit > (u32::MAX - i) / weight {
90                 return None  // Overflow
91             }
92             i += digit * weight;
93             let t = if k <= bias { T_MIN }
94                     else if k >= bias + T_MAX { T_MAX }
95                     else { k - bias };
96             if digit < t {
97                 break
98             }
99             if weight > u32::MAX / (BASE - t) {
100                 return None  // Overflow
101             }
102             weight *= BASE - t;
103             k += BASE;
104             byte = match iter.next() {
105                 None => return None,  // End of input before the end of this delta
106                 Some(byte) => byte,
107             };
108         }
109         let length = output.len() as u32;
110         bias = adapt(i - previous_i, length + 1, previous_i == 0);
111         if i / (length + 1) > u32::MAX - code_point {
112             return None  // Overflow
113         }
114         // i was supposed to wrap around from length+1 to 0,
115         // incrementing code_point each time.
116         code_point += i / (length + 1);
117         i %= length + 1;
118         let c = match char::from_u32(code_point) {
119             Some(c) => c,
120             None => return None
121         };
122         output.insert(i as usize, c);
123         i += 1;
124     }
125     Some(output)
126 }
127 
128 
129 /// Convert an Unicode `str` to Punycode.
130 ///
131 /// This is a convenience wrapper around `encode`.
132 #[inline]
encode_str(input: &str) -> Option<String>133 pub fn encode_str(input: &str) -> Option<String> {
134     encode(&input.chars().collect::<Vec<char>>())
135 }
136 
137 
138 /// Convert Unicode to Punycode.
139 ///
140 /// Return None on overflow, which can only happen on inputs that would take more than
141 /// 63 encoded bytes, the DNS limit on domain name labels.
encode(input: &[char]) -> Option<String>142 pub fn encode(input: &[char]) -> Option<String> {
143     // Handle "basic" (ASCII) code points. They are encoded as-is.
144     let output_bytes = input.iter().filter_map(|&c|
145         if c.is_ascii() { Some(c as u8) } else { None }
146     ).collect();
147     let mut output = unsafe { String::from_utf8_unchecked(output_bytes) };
148     let basic_length = output.len() as u32;
149     if basic_length > 0 {
150         output.push_str("-")
151     }
152     let mut code_point = INITIAL_N;
153     let mut delta = 0;
154     let mut bias = INITIAL_BIAS;
155     let mut processed = basic_length;
156     let input_length = input.len() as u32;
157     while processed < input_length {
158         // All code points < code_point have been handled already.
159         // Find the next larger one.
160         let min_code_point = input.iter().map(|&c| c as u32)
161                                   .filter(|&c| c >= code_point).min().unwrap();
162         if min_code_point - code_point > (u32::MAX - delta) / (processed + 1) {
163             return None  // Overflow
164         }
165         // Increase delta to advance the decoder’s <code_point,i> state to <min_code_point,0>
166         delta += (min_code_point - code_point) * (processed + 1);
167         code_point = min_code_point;
168         for &c in input {
169             let c = c as u32;
170             if c < code_point {
171                 delta += 1;
172                 if delta == 0 {
173                     return None  // Overflow
174                 }
175             }
176             if c == code_point {
177                 // Represent delta as a generalized variable-length integer:
178                 let mut q = delta;
179                 let mut k = BASE;
180                 loop {
181                     let t = if k <= bias { T_MIN }
182                             else if k >= bias + T_MAX { T_MAX }
183                             else { k - bias };
184                     if q < t {
185                         break
186                     }
187                     let value = t + ((q - t) % (BASE - t));
188                     value_to_digit(value, &mut output);
189                     q = (q - t) / (BASE - t);
190                     k += BASE;
191                 }
192                 value_to_digit(q, &mut output);
193                 bias = adapt(delta, processed + 1, processed == basic_length);
194                 delta = 0;
195                 processed += 1;
196             }
197         }
198         delta += 1;
199         code_point += 1;
200     }
201     Some(output)
202 }
203 
204 
205 #[inline]
value_to_digit(value: u32, output: &mut String)206 fn value_to_digit(value: u32, output: &mut String) {
207     let code_point = match value {
208         0 ... 25 => value + 0x61,  // a..z
209         26 ... 35 => value - 26 + 0x30,  // 0..9
210         _ => panic!()
211     };
212     unsafe { output.as_mut_vec().push(code_point as u8) }
213 }
214