1 // Copyright 2013 The rust-url developers.
2 //
3 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
4 // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
5 // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
6 // option. This file may not be copied, modified, or distributed
7 // except according to those terms.
8
9 //! Punycode ([RFC 3492](http://tools.ietf.org/html/rfc3492)) implementation.
10 //!
//! Since Punycode fundamentally works on Unicode code points,
12 //! `encode` and `decode` take and return slices and vectors of `char`.
13 //! `encode_str` and `decode_to_string` provide convenience wrappers
14 //! that convert from and to Rust’s UTF-8 based `str` and `String` types.
15
16 use std::u32;
17 use std::char;
18 #[allow(unused_imports, deprecated)]
19 use std::ascii::AsciiExt;
20
// Bootstring parameters for Punycode (RFC 3492, section 5).

/// Number of distinct digit values (`a`-`z` followed by `0`-`9`).
const BASE: u32 = 36;
/// Smallest threshold a digit position can have.
const T_MIN: u32 = 1;
/// Largest threshold a digit position can have.
const T_MAX: u32 = 26;
/// Skew term used when recomputing the bias.
const SKEW: u32 = 38;
/// Divisor applied to the very first delta when recomputing the bias.
const DAMP: u32 = 700;
/// Bias in effect before any delta has been processed.
const INITIAL_BIAS: u32 = 72;
/// Code point at which encoding and decoding start: the first non-ASCII one.
const INITIAL_N: u32 = 0x80;
/// Separates the literal ASCII prefix from the encoded deltas.
const DELIMITER: char = '-';
30
31
32 #[inline]
adapt(mut delta: u32, num_points: u32, first_time: bool) -> u3233 fn adapt(mut delta: u32, num_points: u32, first_time: bool) -> u32 {
34 delta /= if first_time { DAMP } else { 2 };
35 delta += delta / num_points;
36 let mut k = 0;
37 while delta > ((BASE - T_MIN) * T_MAX) / 2 {
38 delta /= BASE - T_MIN;
39 k += BASE;
40 }
41 k + (((BASE - T_MIN + 1) * delta) / (delta + SKEW))
42 }
43
44
45 /// Convert Punycode to an Unicode `String`.
46 ///
47 /// This is a convenience wrapper around `decode`.
48 #[inline]
decode_to_string(input: &str) -> Option<String>49 pub fn decode_to_string(input: &str) -> Option<String> {
50 decode(input).map(|chars| chars.into_iter().collect())
51 }
52
53
54 /// Convert Punycode to Unicode.
55 ///
56 /// Return None on malformed input or overflow.
57 /// Overflow can only happen on inputs that take more than
58 /// 63 encoded bytes, the DNS limit on domain name labels.
decode(input: &str) -> Option<Vec<char>>59 pub fn decode(input: &str) -> Option<Vec<char>> {
60 // Handle "basic" (ASCII) code points.
61 // They are encoded as-is before the last delimiter, if any.
62 let (mut output, input) = match input.rfind(DELIMITER) {
63 None => (Vec::new(), input),
64 Some(position) => (
65 input[..position].chars().collect(),
66 if position > 0 { &input[position + 1..] } else { input }
67 )
68 };
69 let mut code_point = INITIAL_N;
70 let mut bias = INITIAL_BIAS;
71 let mut i = 0;
72 let mut iter = input.bytes();
73 loop {
74 let previous_i = i;
75 let mut weight = 1;
76 let mut k = BASE;
77 let mut byte = match iter.next() {
78 None => break,
79 Some(byte) => byte,
80 };
81 // Decode a generalized variable-length integer into delta,
82 // which gets added to i.
83 loop {
84 let digit = match byte {
85 byte @ b'0' ... b'9' => byte - b'0' + 26,
86 byte @ b'A' ... b'Z' => byte - b'A',
87 byte @ b'a' ... b'z' => byte - b'a',
88 _ => return None
89 } as u32;
90 if digit > (u32::MAX - i) / weight {
91 return None // Overflow
92 }
93 i += digit * weight;
94 let t = if k <= bias { T_MIN }
95 else if k >= bias + T_MAX { T_MAX }
96 else { k - bias };
97 if digit < t {
98 break
99 }
100 if weight > u32::MAX / (BASE - t) {
101 return None // Overflow
102 }
103 weight *= BASE - t;
104 k += BASE;
105 byte = match iter.next() {
106 None => return None, // End of input before the end of this delta
107 Some(byte) => byte,
108 };
109 }
110 let length = output.len() as u32;
111 bias = adapt(i - previous_i, length + 1, previous_i == 0);
112 if i / (length + 1) > u32::MAX - code_point {
113 return None // Overflow
114 }
115 // i was supposed to wrap around from length+1 to 0,
116 // incrementing code_point each time.
117 code_point += i / (length + 1);
118 i %= length + 1;
119 let c = match char::from_u32(code_point) {
120 Some(c) => c,
121 None => return None
122 };
123 output.insert(i as usize, c);
124 i += 1;
125 }
126 Some(output)
127 }
128
129
130 /// Convert an Unicode `str` to Punycode.
131 ///
132 /// This is a convenience wrapper around `encode`.
133 #[inline]
encode_str(input: &str) -> Option<String>134 pub fn encode_str(input: &str) -> Option<String> {
135 encode(&input.chars().collect::<Vec<char>>())
136 }
137
138
139 /// Convert Unicode to Punycode.
140 ///
141 /// Return None on overflow, which can only happen on inputs that would take more than
142 /// 63 encoded bytes, the DNS limit on domain name labels.
encode(input: &[char]) -> Option<String>143 pub fn encode(input: &[char]) -> Option<String> {
144 // Handle "basic" (ASCII) code points. They are encoded as-is.
145 let output_bytes = input.iter().filter_map(|&c|
146 if c.is_ascii() { Some(c as u8) } else { None }
147 ).collect();
148 let mut output = unsafe { String::from_utf8_unchecked(output_bytes) };
149 let basic_length = output.len() as u32;
150 if basic_length > 0 {
151 output.push_str("-")
152 }
153 let mut code_point = INITIAL_N;
154 let mut delta = 0;
155 let mut bias = INITIAL_BIAS;
156 let mut processed = basic_length;
157 let input_length = input.len() as u32;
158 while processed < input_length {
159 // All code points < code_point have been handled already.
160 // Find the next larger one.
161 let min_code_point = input.iter().map(|&c| c as u32)
162 .filter(|&c| c >= code_point).min().unwrap();
163 if min_code_point - code_point > (u32::MAX - delta) / (processed + 1) {
164 return None // Overflow
165 }
166 // Increase delta to advance the decoder’s <code_point,i> state to <min_code_point,0>
167 delta += (min_code_point - code_point) * (processed + 1);
168 code_point = min_code_point;
169 for &c in input {
170 let c = c as u32;
171 if c < code_point {
172 delta += 1;
173 if delta == 0 {
174 return None // Overflow
175 }
176 }
177 if c == code_point {
178 // Represent delta as a generalized variable-length integer:
179 let mut q = delta;
180 let mut k = BASE;
181 loop {
182 let t = if k <= bias { T_MIN }
183 else if k >= bias + T_MAX { T_MAX }
184 else { k - bias };
185 if q < t {
186 break
187 }
188 let value = t + ((q - t) % (BASE - t));
189 output.push(value_to_digit(value));
190 q = (q - t) / (BASE - t);
191 k += BASE;
192 }
193 output.push(value_to_digit(q));
194 bias = adapt(delta, processed + 1, processed == basic_length);
195 delta = 0;
196 processed += 1;
197 }
198 }
199 delta += 1;
200 code_point += 1;
201 }
202 Some(output)
203 }
204
205
/// Map a Punycode digit value to its lowercase ASCII character.
///
/// Values `0..=25` map to `'a'..='z'` and values `26..=35` map to `'0'..='9'`.
///
/// # Panics
///
/// Panics if `value` is greater than 35.
#[inline]
fn value_to_digit(value: u32) -> char {
    match value {
        0..=25 => (value as u8 + b'a') as char,       // a..z
        26..=35 => (value as u8 - 26 + b'0') as char, // 0..9
        _ => panic!("invalid Punycode digit value: {}", value),
    }
}
214