1 // This is a part of rust-encoding. 2 // Copyright (c) 2013-2015, Kang Seonghoon. 3 // See README.md and LICENSE.txt for details. 4 5 //! An interface for retrieving an encoding (or a set of encodings) from a string/numeric label. 6 7 use all; 8 use types::EncodingRef; whatwg_index(name, comments)9 10 /// Returns an encoding from given label, defined in the WHATWG Encoding standard, if any. 11 /// Implements "get an encoding" algorithm: http://encoding.spec.whatwg.org/#concept-encoding-get 12 pub fn encoding_from_whatwg_label(label: &str) -> Option<EncodingRef> { 13 let label = label.trim_matches(&[' ', '\n', '\r', '\t', '\x0C'][..]); 14 let label: String = 15 label.chars().map(|c| match c { 'A'...'Z' => (c as u8 + 32) as char, _ => c }).collect(); 16 match &label[..] { 17 "unicode-1-1-utf-8" | 18 "utf-8" | 19 "utf8" => 20 Some(all::UTF_8 as EncodingRef), 21 "866" | 22 "cp866" | 23 "csibm866" | 24 "ibm866" => 25 Some(all::IBM866 as EncodingRef), 26 "csisolatin2" | 27 "iso-8859-2" | 28 "iso-ir-101" | 29 "iso8859-2" | 30 "iso88592" | 31 "iso_8859-2" | 32 "iso_8859-2:1987" | 33 "l2" | 34 "latin2" => 35 Some(all::ISO_8859_2 as EncodingRef), 36 "csisolatin3" | 37 "iso-8859-3" | 38 "iso-ir-109" | 39 "iso8859-3" | 40 "iso88593" | 41 "iso_8859-3" | 42 "iso_8859-3:1988" | 43 "l3" | 44 "latin3" => 45 Some(all::ISO_8859_3 as EncodingRef), 46 "csisolatin4" | 47 "iso-8859-4" | 48 "iso-ir-110" | 49 "iso8859-4" | 50 "iso88594" | 51 "iso_8859-4" | 52 "iso_8859-4:1988" | 53 "l4" | 54 "latin4" => 55 Some(all::ISO_8859_4 as EncodingRef), 56 "csisolatincyrillic" | 57 "cyrillic" | 58 "iso-8859-5" | 59 "iso-ir-144" | 60 "iso8859-5" | 61 "iso88595" | 62 "iso_8859-5" | 63 "iso_8859-5:1988" => 64 Some(all::ISO_8859_5 as EncodingRef), 65 "arabic" | 66 "asmo-708" | 67 "csiso88596e" | 68 "csiso88596i" | 69 "csisolatinarabic" | 70 "ecma-114" | 71 "iso-8859-6" | 72 "iso-8859-6-e" | 73 "iso-8859-6-i" | 74 "iso-ir-127" | 75 "iso8859-6" | 76 "iso88596" | 77 "iso_8859-6" | 78 "iso_8859-6:1987" => 79 Some(all::ISO_8859_6 as EncodingRef), 80 "csisolatingreek" | 81 "ecma-118" | 82 "elot_928" | 83 "greek" | 84 "greek8" | 85 "iso-8859-7" | 86 "iso-ir-126" | 87 "iso8859-7" | 88 "iso88597" | 89 "iso_8859-7" | 90 "iso_8859-7:1987" | 91 "sun_eu_greek" => 92 Some(all::ISO_8859_7 as EncodingRef), 93 "csiso88598e" | 94 "csisolatinhebrew" | 95 "hebrew" | 96 "iso-8859-8" | 97 "iso-8859-8-e" | 98 "iso-ir-138" | 99 "iso8859-8" | 100 "iso88598" | 101 "iso_8859-8" | 102 "iso_8859-8:1988" | 103 "visual" => 104 Some(all::ISO_8859_8 as EncodingRef), 105 "csiso88598i" | 106 "iso-8859-8-i" | 107 "logical" => 108 Some(all::whatwg::ISO_8859_8_I as EncodingRef), 109 "csisolatin6" | 110 "iso-8859-10" | 111 "iso-ir-157" | 112 "iso8859-10" | 113 "iso885910" | 114 "l6" | 115 "latin6" => 116 Some(all::ISO_8859_10 as EncodingRef), 117 "iso-8859-13" | 118 "iso8859-13" | 119 "iso885913" => 120 Some(all::ISO_8859_13 as EncodingRef), 121 "iso-8859-14" | 122 "iso8859-14" | 123 "iso885914" => 124 Some(all::ISO_8859_14 as EncodingRef), 125 "csisolatin9" | 126 "iso-8859-15" | 127 "iso8859-15" | 128 "iso885915" | 129 "iso_8859-15" | 130 "l9" => 131 Some(all::ISO_8859_15 as EncodingRef), 132 "iso-8859-16" => 133 Some(all::ISO_8859_16 as EncodingRef), 134 "cskoi8r" | 135 "koi" | 136 "koi8" | 137 "koi8-r" | 138 "koi8_r" => 139 Some(all::KOI8_R as EncodingRef), 140 "koi8-u" => 141 Some(all::KOI8_U as EncodingRef), 142 "csmacintosh" | 143 "mac" | 144 "macintosh" | 145 "x-mac-roman" => 146 Some(all::MAC_ROMAN as EncodingRef), 147 "dos-874" | 148 "iso-8859-11" | 149 "iso8859-11" | 150 "iso885911" | 151 "tis-620" | 152 "windows-874" => 153 Some(all::WINDOWS_874 as EncodingRef), 154 "cp1250" | 155 "windows-1250" | 156 "x-cp1250" => 157 Some(all::WINDOWS_1250 as EncodingRef), 158 "cp1251" | 159 "windows-1251" | 160 "x-cp1251" => 161 Some(all::WINDOWS_1251 as EncodingRef), 162 "ansi_x3.4-1968" | 163 "ascii" | 164 "cp1252" | 165 "cp819" | 166 "csisolatin1" | 167 "ibm819" | 168 "iso-8859-1" | 169 "iso-ir-100" | 170 "iso8859-1" | 171 "iso88591" | 172 "iso_8859-1" | 173 "iso_8859-1:1987" | 174 "l1" | 175 "latin1" | 176 "us-ascii" | 177 "windows-1252" | 178 "x-cp1252" => 179 Some(all::WINDOWS_1252 as EncodingRef), 180 "cp1253" | 181 "windows-1253" | 182 "x-cp1253" => 183 Some(all::WINDOWS_1253 as EncodingRef), 184 "cp1254" | 185 "csisolatin5" | 186 "iso-8859-9" | 187 "iso-ir-148" | 188 "iso8859-9" | 189 "iso88599" | 190 "iso_8859-9" | 191 "iso_8859-9:1989" | 192 "l5" | 193 "latin5" | 194 "windows-1254" | 195 "x-cp1254" => 196 Some(all::WINDOWS_1254 as EncodingRef), 197 "cp1255" | 198 "windows-1255" | 199 "x-cp1255" => 200 Some(all::WINDOWS_1255 as EncodingRef), 201 "cp1256" | 202 "windows-1256" | 203 "x-cp1256" => 204 Some(all::WINDOWS_1256 as EncodingRef), 205 "cp1257" | 206 "windows-1257" | 207 "x-cp1257" => 208 Some(all::WINDOWS_1257 as EncodingRef), 209 "cp1258" | 210 "windows-1258" | 211 "x-cp1258" => 212 Some(all::WINDOWS_1258 as EncodingRef), 213 "x-mac-cyrillic" | 214 "x-mac-ukrainian" => 215 Some(all::MAC_CYRILLIC as EncodingRef), 216 "chinese" | 217 "csgb2312" | 218 "csiso58gb231280" | 219 "gb2312" | 220 "gb_2312" | 221 "gb_2312-80" | 222 "gbk" | 223 "iso-ir-58" | 224 "x-gbk" => 225 Some(all::GBK as EncodingRef), 226 "gb18030" => 227 Some(all::GB18030 as EncodingRef), 228 "big5" | 229 "big5-hkscs" | 230 "cn-big5" | 231 "csbig5" | 232 "x-x-big5" => 233 Some(all::BIG5_2003 as EncodingRef), 234 "cseucpkdfmtjapanese" | 235 "euc-jp" | 236 "x-euc-jp" => 237 Some(all::EUC_JP as EncodingRef), 238 "csiso2022jp" | 239 "iso-2022-jp" => 240 Some(all::ISO_2022_JP as EncodingRef), 241 "csshiftjis" | 242 "ms_kanji" | 243 "shift-jis" | 244 "shift_jis" | 245 "sjis" | 246 "windows-31j" | 247 "x-sjis" => 248 Some(all::WINDOWS_31J as EncodingRef), 249 "cseuckr" | 250 "csksc56011987" | 251 "euc-kr" | 252 "iso-ir-149" | 253 "korean" | 254 "ks_c_5601-1987" | 255 "ks_c_5601-1989" | 256 "ksc5601" | 257 "ksc_5601" | 258 "windows-949" => 259 Some(all::WINDOWS_949 as EncodingRef), 260 "csiso2022kr" | 261 "hz-gb-2312" | 262 "iso-2022-kr" | 263 "iso-2022-cn" | 264 "iso-2022-cn-ext" => 265 Some(all::whatwg::REPLACEMENT as EncodingRef), 266 "utf-16be" => 267 Some(all::UTF_16BE as EncodingRef), 268 "utf-16" | 269 "utf-16le" => 270 Some(all::UTF_16LE as EncodingRef), 271 "x-user-defined" => 272 Some(all::whatwg::X_USER_DEFINED as EncodingRef), 273 _ => None 274 } 275 } 276 277 /// Returns an encoding from Windows code page number. 278 /// http://msdn.microsoft.com/en-us/library/windows/desktop/dd317756%28v=vs.85%29.aspx 279 /// Sometimes it can return a *superset* of the requested encoding, e.g. for several CJK encodings. 280 pub fn encoding_from_windows_code_page(cp: usize) -> Option<EncodingRef> { 281 match cp { 282 65001 => Some(all::UTF_8 as EncodingRef), 283 866 => Some(all::IBM866 as EncodingRef), 284 28591 => Some(all::ISO_8859_1 as EncodingRef), 285 28592 => Some(all::ISO_8859_2 as EncodingRef), 286 28593 => Some(all::ISO_8859_3 as EncodingRef), 287 28594 => Some(all::ISO_8859_4 as EncodingRef), 288 28595 => Some(all::ISO_8859_5 as EncodingRef), 289 28596 => Some(all::ISO_8859_6 as EncodingRef), 290 28597 => Some(all::ISO_8859_7 as EncodingRef), 291 28598 => Some(all::ISO_8859_8 as EncodingRef), 292 38598 => Some(all::whatwg::ISO_8859_8_I as EncodingRef), 293 28603 => Some(all::ISO_8859_13 as EncodingRef), 294 28605 => Some(all::ISO_8859_15 as EncodingRef), 295 20866 => Some(all::KOI8_R as EncodingRef), 296 21866 => Some(all::KOI8_U as EncodingRef), 297 10000 => Some(all::MAC_ROMAN as EncodingRef), 298 874 => Some(all::WINDOWS_874 as EncodingRef), 299 1250 => Some(all::WINDOWS_1250 as EncodingRef), 300 1251 => Some(all::WINDOWS_1251 as EncodingRef), 301 1252 => Some(all::WINDOWS_1252 as EncodingRef), 302 1253 => Some(all::WINDOWS_1253 as EncodingRef), 303 1254 => Some(all::WINDOWS_1254 as EncodingRef), 304 1255 => Some(all::WINDOWS_1255 as EncodingRef), 305 1256 => Some(all::WINDOWS_1256 as EncodingRef), 306 1257 => Some(all::WINDOWS_1257 as EncodingRef), 307 1258 => Some(all::WINDOWS_1258 as EncodingRef), 308 1259 => Some(all::MAC_CYRILLIC as EncodingRef), 309 936 | 54936 => Some(all::GB18030 as EncodingRef), // XXX technically wrong 310 52936 => Some(all::HZ as EncodingRef), 311 950 => Some(all::BIG5_2003 as EncodingRef), 312 20932 => Some(all::EUC_JP as EncodingRef), 313 50220 => Some(all::ISO_2022_JP as EncodingRef), 314 932 => Some(all::WINDOWS_31J as EncodingRef), 315 949 => Some(all::WINDOWS_949 as EncodingRef), 316 1201 => Some(all::UTF_16BE as EncodingRef), 317 1200 => Some(all::UTF_16LE as EncodingRef), 318 _ => None 319 } 320 } 321 322 #[cfg(test)] 323 mod tests { 324 extern crate test; 325 use all; 326 use super::encoding_from_whatwg_label; 327 328 #[test] 329 fn test_encoding_from_whatwg_label() { 330 assert!(encoding_from_whatwg_label("utf-8").is_some()); 331 assert!(encoding_from_whatwg_label("UTF-8").is_some()); 332 assert!(encoding_from_whatwg_label("\t\n\x0C\r utf-8\t\n\x0C\r ").is_some()); 333 assert!(encoding_from_whatwg_label("\u{A0}utf-8").is_none(), 334 "Non-ASCII whitespace should not be trimmed"); 335 assert!(encoding_from_whatwg_label("greek").is_some()); 336 assert!(encoding_from_whatwg_label("gree\u{212A}").is_none(), 337 "Case-insensitive matching should be ASCII only. Kelvin sign does not match k."); 338 339 // checks if the `whatwg_name` method returns the label that resolves back to that encoding 340 for encoding in all::encodings() { 341 if let Some(whatwg_name) = encoding.whatwg_name() { 342 if whatwg_name == "replacement" { continue; } 343 assert_eq!(encoding_from_whatwg_label(whatwg_name).and_then(|e| e.whatwg_name()), 344 Some(whatwg_name)); 345 } 346 } 347 } 348 349 #[bench] 350 fn bench_encoding_from_whatwg_label(bencher: &mut test::Bencher) { 351 bencher.iter(|| test::black_box({ 352 encoding_from_whatwg_label("iso-8859-bazinga") 353 })) 354 } 355 } 356 357