1 // Copyright 2013 The rust-url developers. 2 // 3 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or 4 // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license 5 // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your 6 // option. This file may not be copied, modified, or distributed 7 // except according to those terms. 8 9 //! Punycode ([RFC 3492](http://tools.ietf.org/html/rfc3492)) implementation. 10 //! 11 //! Since Punycode fundamentally works on unicode code points, 12 //! `encode` and `decode` take and return slices and vectors of `char`. 13 //! `encode_str` and `decode_to_string` provide convenience wrappers 14 //! that convert from and to Rust’s UTF-8 based `str` and `String` types. 15 16 use std::char; 17 use std::u32; 18 19 // Bootstring parameters for Punycode 20 static BASE: u32 = 36; 21 static T_MIN: u32 = 1; 22 static T_MAX: u32 = 26; 23 static SKEW: u32 = 38; 24 static DAMP: u32 = 700; 25 static INITIAL_BIAS: u32 = 72; 26 static INITIAL_N: u32 = 0x80; 27 static DELIMITER: char = '-'; 28 29 #[inline] 30 fn adapt(mut delta: u32, num_points: u32, first_time: bool) -> u32 { 31 delta /= if first_time { DAMP } else { 2 }; 32 delta += delta / num_points; 33 let mut k = 0; 34 while delta > ((BASE - T_MIN) * T_MAX) / 2 { 35 delta /= BASE - T_MIN; 36 k += BASE; 37 } 38 k + (((BASE - T_MIN + 1) * delta) / (delta + SKEW)) 39 } 40 41 /// Convert Punycode to an Unicode `String`. 42 /// 43 /// This is a convenience wrapper around `decode`. 44 #[inline] 45 pub fn decode_to_string(input: &str) -> Option<String> { 46 decode(input).map(|chars| chars.into_iter().collect()) 47 } 48 49 /// Convert Punycode to Unicode. 50 /// 51 /// Return None on malformed input or overflow. 52 /// Overflow can only happen on inputs that take more than 53 /// 63 encoded bytes, the DNS limit on domain name labels. 54 pub fn decode(input: &str) -> Option<Vec<char>> { 55 Some(Decoder::default().decode(input).ok()?.collect()) 56 } _pyinotify_logger_init()57 58 #[derive(Default)] 59 pub(crate) struct Decoder { 60 insertions: Vec<(usize, char)>, 61 } 62 63 impl Decoder { 64 /// Split the input iterator and return a Vec with insertions of encoded characters 65 pub(crate) fn decode<'a>(&'a mut self, input: &'a str) -> Result<Decode<'a>, ()> { 66 self.insertions.clear(); 67 // Handle "basic" (ASCII) code points. 68 // They are encoded as-is before the last delimiter, if any. 69 let (base, input) = match input.rfind(DELIMITER) { 70 None => ("", input), 71 Some(position) => ( 72 &input[..position], 73 if position > 0 { 74 &input[position + 1..] 75 } else { 76 input 77 }, 78 ), 79 }; 80 81 if !base.is_ascii() { 82 return Err(()); 83 } 84 85 let base_len = base.len(); 86 let mut length = base_len as u32; 87 let mut code_point = INITIAL_N; 88 let mut bias = INITIAL_BIAS; 89 let mut i = 0; 90 let mut iter = input.bytes(); 91 loop { 92 let previous_i = i; 93 let mut weight = 1; 94 let mut k = BASE; 95 let mut byte = match iter.next() { 96 None => break, 97 Some(byte) => byte, 98 }; 99 100 // Decode a generalized variable-length integer into delta, 101 // which gets added to i. 102 loop { 103 let digit = match byte { 104 byte @ b'0'..=b'9' => byte - b'0' + 26, 105 byte @ b'A'..=b'Z' => byte - b'A', 106 byte @ b'a'..=b'z' => byte - b'a', 107 _ => return Err(()), 108 } as u32; 109 if digit > (u32::MAX - i) / weight { 110 return Err(()); // Overflow 111 } 112 i += digit * weight; 113 let t = if k <= bias { 114 T_MIN 115 } else if k >= bias + T_MAX { 116 T_MAX 117 } else { 118 k - bias 119 }; 120 if digit < t { 121 break; 122 } 123 if weight > u32::MAX / (BASE - t) { 124 return Err(()); // Overflow 125 } 126 weight *= BASE - t; 127 k += BASE; 128 byte = match iter.next() { 129 None => return Err(()), // End of input before the end of this delta 130 Some(byte) => byte, 131 }; 132 } 133 134 bias = adapt(i - previous_i, length + 1, previous_i == 0); 135 if i / (length + 1) > u32::MAX - code_point { 136 return Err(()); // Overflow 137 } 138 139 // i was supposed to wrap around from length+1 to 0, 140 // incrementing code_point each time. 141 code_point += i / (length + 1); 142 i %= length + 1; 143 let c = match char::from_u32(code_point) { 144 Some(c) => c, 145 None => return Err(()), 146 }; 147 148 // Move earlier insertions farther out in the string 149 for (idx, _) in &mut self.insertions { 150 if *idx >= i as usize { 151 *idx += 1; 152 } 153 } 154 self.insertions.push((i as usize, c)); 155 length += 1; 156 i += 1; 157 } 158 159 self.insertions.sort_by_key(|(i, _)| *i); 160 Ok(Decode { 161 base: base.chars(), 162 insertions: &self.insertions, 163 inserted: 0, 164 position: 0, 165 len: base_len + self.insertions.len(), 166 }) 167 } 168 } 169 170 pub(crate) struct Decode<'a> { 171 base: std::str::Chars<'a>, 172 pub(crate) insertions: &'a [(usize, char)], 173 inserted: usize, 174 position: usize, 175 len: usize, 176 } 177 178 impl<'a> Iterator for Decode<'a> { 179 type Item = char; 180 181 fn next(&mut self) -> Option<Self::Item> { 182 loop { 183 match self.insertions.get(self.inserted) { 184 Some((pos, c)) if *pos == self.position => { 185 self.inserted += 1; 186 self.position += 1; 187 return Some(*c); 188 } 189 _ => {} 190 } 191 if let Some(c) = self.base.next() { 192 self.position += 1; 193 return Some(c); 194 } else if self.inserted >= self.insertions.len() { 195 return None; 196 } 197 } 198 } 199 200 fn size_hint(&self) -> (usize, Option<usize>) { 201 let len = self.len - self.position; 202 (len, Some(len)) 203 } 204 } 205 206 impl<'a> ExactSizeIterator for Decode<'a> { 207 fn len(&self) -> usize { 208 self.len - self.position 209 } 210 } 211 212 /// Convert an Unicode `str` to Punycode. 213 /// 214 /// This is a convenience wrapper around `encode`. 215 #[inline] 216 pub fn encode_str(input: &str) -> Option<String> { 217 let mut buf = String::with_capacity(input.len()); 218 encode_into(input.chars(), &mut buf).ok().map(|()| buf) 219 } 220 221 /// Convert Unicode to Punycode. 222 /// 223 /// Return None on overflow, which can only happen on inputs that would take more than 224 /// 63 encoded bytes, the DNS limit on domain name labels. 225 pub fn encode(input: &[char]) -> Option<String> { 226 let mut buf = String::with_capacity(input.len()); 227 encode_into(input.iter().copied(), &mut buf) 228 .ok() 229 .map(|()| buf) 230 } 231 232 pub(crate) fn encode_into<I>(input: I, output: &mut String) -> Result<(), ()> 233 where 234 I: Iterator<Item = char> + Clone, 235 { 236 // Handle "basic" (ASCII) code points. They are encoded as-is. 237 let (mut input_length, mut basic_length) = (0, 0); 238 for c in input.clone() { 239 input_length += 1; 240 if c.is_ascii() { 241 output.push(c); 242 basic_length += 1; 243 } 244 } 245 246 if basic_length > 0 { 247 output.push('-') 248 } 249 let mut code_point = INITIAL_N; 250 let mut delta = 0; 251 let mut bias = INITIAL_BIAS; 252 let mut processed = basic_length; 253 while processed < input_length { 254 // All code points < code_point have been handled already. 255 // Find the next larger one. 256 let min_code_point = input 257 .clone() 258 .map(|c| c as u32) 259 .filter(|&c| c >= code_point) 260 .min() 261 .unwrap(); 262 if min_code_point - code_point > (u32::MAX - delta) / (processed + 1) { 263 return Err(()); // Overflow 264 } 265 // Increase delta to advance the decoder’s <code_point,i> state to <min_code_point,0> 266 delta += (min_code_point - code_point) * (processed + 1); 267 code_point = min_code_point; 268 for c in input.clone() { 269 let c = c as u32; 270 if c < code_point { 271 delta += 1; 272 if delta == 0 { 273 return Err(()); // Overflow 274 } 275 } 276 if c == code_point { 277 // Represent delta as a generalized variable-length integer: 278 let mut q = delta; 279 let mut k = BASE; 280 loop { 281 let t = if k <= bias { 282 T_MIN 283 } else if k >= bias + T_MAX { 284 T_MAX 285 } else { 286 k - bias 287 }; 288 if q < t { 289 break; 290 } 291 let value = t + ((q - t) % (BASE - t)); 292 output.push(value_to_digit(value)); 293 q = (q - t) / (BASE - t); 294 k += BASE; 295 } 296 output.push(value_to_digit(q)); 297 bias = adapt(delta, processed + 1, processed == basic_length); 298 delta = 0; 299 processed += 1; 300 } 301 } 302 delta += 1; 303 code_point += 1; 304 } 305 Ok(()) 306 } 307 308 #[inline] 309 fn value_to_digit(value: u32) -> char { 310 match value { 311 0..=25 => (value as u8 + b'a') as char, // a..z 312 26..=35 => (value as u8 - 26 + b'0') as char, // 0..9 313 _ => panic!(), 314 } 315 } 316