1 // This is a part of rust-encoding.
2 // Copyright (c) 2013-2015, Kang Seonghoon.
3 // See README.md and LICENSE.txt for details.
4 
5 //! An interface for retrieving an encoding (or a set of encodings) from a string/numeric label.
6 
7 use all;
8 use types::EncodingRef;
9 
10 /// Returns an encoding from given label, defined in the WHATWG Encoding standard, if any.
11 /// Implements "get an encoding" algorithm: http://encoding.spec.whatwg.org/#concept-encoding-get
encoding_from_whatwg_label(label: &str) -> Option<EncodingRef>12 pub fn encoding_from_whatwg_label(label: &str) -> Option<EncodingRef> {
13     let label = label.trim_matches(&[' ', '\n', '\r', '\t', '\x0C'][..]);
14     let label: String =
15         label.chars().map(|c| match c { 'A'...'Z' => (c as u8 + 32) as char, _ => c }).collect();
16     match &label[..] {
17         "unicode-1-1-utf-8" |
18         "utf-8" |
19         "utf8" =>
20             Some(all::UTF_8 as EncodingRef),
21         "866" |
22         "cp866" |
23         "csibm866" |
24         "ibm866" =>
25             Some(all::IBM866 as EncodingRef),
26         "csisolatin2" |
27         "iso-8859-2" |
28         "iso-ir-101" |
29         "iso8859-2" |
30         "iso88592" |
31         "iso_8859-2" |
32         "iso_8859-2:1987" |
33         "l2" |
34         "latin2" =>
35             Some(all::ISO_8859_2 as EncodingRef),
36         "csisolatin3" |
37         "iso-8859-3" |
38         "iso-ir-109" |
39         "iso8859-3" |
40         "iso88593" |
41         "iso_8859-3" |
42         "iso_8859-3:1988" |
43         "l3" |
44         "latin3" =>
45             Some(all::ISO_8859_3 as EncodingRef),
46         "csisolatin4" |
47         "iso-8859-4" |
48         "iso-ir-110" |
49         "iso8859-4" |
50         "iso88594" |
51         "iso_8859-4" |
52         "iso_8859-4:1988" |
53         "l4" |
54         "latin4" =>
55             Some(all::ISO_8859_4 as EncodingRef),
56         "csisolatincyrillic" |
57         "cyrillic" |
58         "iso-8859-5" |
59         "iso-ir-144" |
60         "iso8859-5" |
61         "iso88595" |
62         "iso_8859-5" |
63         "iso_8859-5:1988" =>
64             Some(all::ISO_8859_5 as EncodingRef),
65         "arabic" |
66         "asmo-708" |
67         "csiso88596e" |
68         "csiso88596i" |
69         "csisolatinarabic" |
70         "ecma-114" |
71         "iso-8859-6" |
72         "iso-8859-6-e" |
73         "iso-8859-6-i" |
74         "iso-ir-127" |
75         "iso8859-6" |
76         "iso88596" |
77         "iso_8859-6" |
78         "iso_8859-6:1987" =>
79             Some(all::ISO_8859_6 as EncodingRef),
80         "csisolatingreek" |
81         "ecma-118" |
82         "elot_928" |
83         "greek" |
84         "greek8" |
85         "iso-8859-7" |
86         "iso-ir-126" |
87         "iso8859-7" |
88         "iso88597" |
89         "iso_8859-7" |
90         "iso_8859-7:1987" |
91         "sun_eu_greek" =>
92             Some(all::ISO_8859_7 as EncodingRef),
93         "csiso88598e" |
94         "csisolatinhebrew" |
95         "hebrew" |
96         "iso-8859-8" |
97         "iso-8859-8-e" |
98         "iso-ir-138" |
99         "iso8859-8" |
100         "iso88598" |
101         "iso_8859-8" |
102         "iso_8859-8:1988" |
103         "visual" =>
104             Some(all::ISO_8859_8 as EncodingRef),
105         "csiso88598i" |
106         "iso-8859-8-i" |
107         "logical" =>
108             Some(all::whatwg::ISO_8859_8_I as EncodingRef),
109         "csisolatin6" |
110         "iso-8859-10" |
111         "iso-ir-157" |
112         "iso8859-10" |
113         "iso885910" |
114         "l6" |
115         "latin6" =>
116             Some(all::ISO_8859_10 as EncodingRef),
117         "iso-8859-13" |
118         "iso8859-13" |
119         "iso885913" =>
120             Some(all::ISO_8859_13 as EncodingRef),
121         "iso-8859-14" |
122         "iso8859-14" |
123         "iso885914" =>
124             Some(all::ISO_8859_14 as EncodingRef),
125         "csisolatin9" |
126         "iso-8859-15" |
127         "iso8859-15" |
128         "iso885915" |
129         "iso_8859-15" |
130         "l9" =>
131             Some(all::ISO_8859_15 as EncodingRef),
132         "iso-8859-16" =>
133             Some(all::ISO_8859_16 as EncodingRef),
134         "cskoi8r" |
135         "koi" |
136         "koi8" |
137         "koi8-r" |
138         "koi8_r" =>
139             Some(all::KOI8_R as EncodingRef),
140         "koi8-u" =>
141             Some(all::KOI8_U as EncodingRef),
142         "csmacintosh" |
143         "mac" |
144         "macintosh" |
145         "x-mac-roman" =>
146             Some(all::MAC_ROMAN as EncodingRef),
147         "dos-874" |
148         "iso-8859-11" |
149         "iso8859-11" |
150         "iso885911" |
151         "tis-620" |
152         "windows-874" =>
153             Some(all::WINDOWS_874 as EncodingRef),
154         "cp1250" |
155         "windows-1250" |
156         "x-cp1250" =>
157             Some(all::WINDOWS_1250 as EncodingRef),
158         "cp1251" |
159         "windows-1251" |
160         "x-cp1251" =>
161             Some(all::WINDOWS_1251 as EncodingRef),
162         "ansi_x3.4-1968" |
163         "ascii" |
164         "cp1252" |
165         "cp819" |
166         "csisolatin1" |
167         "ibm819" |
168         "iso-8859-1" |
169         "iso-ir-100" |
170         "iso8859-1" |
171         "iso88591" |
172         "iso_8859-1" |
173         "iso_8859-1:1987" |
174         "l1" |
175         "latin1" |
176         "us-ascii" |
177         "windows-1252" |
178         "x-cp1252" =>
179             Some(all::WINDOWS_1252 as EncodingRef),
180         "cp1253" |
181         "windows-1253" |
182         "x-cp1253" =>
183             Some(all::WINDOWS_1253 as EncodingRef),
184         "cp1254" |
185         "csisolatin5" |
186         "iso-8859-9" |
187         "iso-ir-148" |
188         "iso8859-9" |
189         "iso88599" |
190         "iso_8859-9" |
191         "iso_8859-9:1989" |
192         "l5" |
193         "latin5" |
194         "windows-1254" |
195         "x-cp1254" =>
196             Some(all::WINDOWS_1254 as EncodingRef),
197         "cp1255" |
198         "windows-1255" |
199         "x-cp1255" =>
200             Some(all::WINDOWS_1255 as EncodingRef),
201         "cp1256" |
202         "windows-1256" |
203         "x-cp1256" =>
204             Some(all::WINDOWS_1256 as EncodingRef),
205         "cp1257" |
206         "windows-1257" |
207         "x-cp1257" =>
208             Some(all::WINDOWS_1257 as EncodingRef),
209         "cp1258" |
210         "windows-1258" |
211         "x-cp1258" =>
212             Some(all::WINDOWS_1258 as EncodingRef),
213         "x-mac-cyrillic" |
214         "x-mac-ukrainian" =>
215             Some(all::MAC_CYRILLIC as EncodingRef),
216         "chinese" |
217         "csgb2312" |
218         "csiso58gb231280" |
219         "gb2312" |
220         "gb_2312" |
221         "gb_2312-80" |
222         "gbk" |
223         "iso-ir-58" |
224         "x-gbk" =>
225             Some(all::GBK as EncodingRef),
226         "gb18030" =>
227             Some(all::GB18030 as EncodingRef),
228         "big5" |
229         "big5-hkscs" |
230         "cn-big5" |
231         "csbig5" |
232         "x-x-big5" =>
233             Some(all::BIG5_2003 as EncodingRef),
234         "cseucpkdfmtjapanese" |
235         "euc-jp" |
236         "x-euc-jp" =>
237             Some(all::EUC_JP as EncodingRef),
238         "csiso2022jp" |
239         "iso-2022-jp" =>
240             Some(all::ISO_2022_JP as EncodingRef),
241         "csshiftjis" |
242         "ms_kanji" |
243         "shift-jis" |
244         "shift_jis" |
245         "sjis" |
246         "windows-31j" |
247         "x-sjis" =>
248             Some(all::WINDOWS_31J as EncodingRef),
249         "cseuckr" |
250         "csksc56011987" |
251         "euc-kr" |
252         "iso-ir-149" |
253         "korean" |
254         "ks_c_5601-1987" |
255         "ks_c_5601-1989" |
256         "ksc5601" |
257         "ksc_5601" |
258         "windows-949" =>
259             Some(all::WINDOWS_949 as EncodingRef),
260         "csiso2022kr" |
261         "hz-gb-2312" |
262         "iso-2022-kr" |
263         "iso-2022-cn" |
264         "iso-2022-cn-ext" =>
265             Some(all::whatwg::REPLACEMENT as EncodingRef),
266         "utf-16be" =>
267             Some(all::UTF_16BE as EncodingRef),
268         "utf-16" |
269         "utf-16le" =>
270             Some(all::UTF_16LE as EncodingRef),
271         "x-user-defined" =>
272             Some(all::whatwg::X_USER_DEFINED as EncodingRef),
273         _ => None
274     }
275 }
276 
277 /// Returns an encoding from Windows code page number.
278 /// http://msdn.microsoft.com/en-us/library/windows/desktop/dd317756%28v=vs.85%29.aspx
279 /// Sometimes it can return a *superset* of the requested encoding, e.g. for several CJK encodings.
encoding_from_windows_code_page(cp: usize) -> Option<EncodingRef>280 pub fn encoding_from_windows_code_page(cp: usize) -> Option<EncodingRef> {
281     match cp {
282         65001 => Some(all::UTF_8 as EncodingRef),
283         866 => Some(all::IBM866 as EncodingRef),
284         28591 => Some(all::ISO_8859_1 as EncodingRef),
285         28592 => Some(all::ISO_8859_2 as EncodingRef),
286         28593 => Some(all::ISO_8859_3 as EncodingRef),
287         28594 => Some(all::ISO_8859_4 as EncodingRef),
288         28595 => Some(all::ISO_8859_5 as EncodingRef),
289         28596 => Some(all::ISO_8859_6 as EncodingRef),
290         28597 => Some(all::ISO_8859_7 as EncodingRef),
291         28598 => Some(all::ISO_8859_8 as EncodingRef),
292         38598 => Some(all::whatwg::ISO_8859_8_I as EncodingRef),
293         28603 => Some(all::ISO_8859_13 as EncodingRef),
294         28605 => Some(all::ISO_8859_15 as EncodingRef),
295         20866 => Some(all::KOI8_R as EncodingRef),
296         21866 => Some(all::KOI8_U as EncodingRef),
297         10000 => Some(all::MAC_ROMAN as EncodingRef),
298         874 => Some(all::WINDOWS_874 as EncodingRef),
299         1250 => Some(all::WINDOWS_1250 as EncodingRef),
300         1251 => Some(all::WINDOWS_1251 as EncodingRef),
301         1252 => Some(all::WINDOWS_1252 as EncodingRef),
302         1253 => Some(all::WINDOWS_1253 as EncodingRef),
303         1254 => Some(all::WINDOWS_1254 as EncodingRef),
304         1255 => Some(all::WINDOWS_1255 as EncodingRef),
305         1256 => Some(all::WINDOWS_1256 as EncodingRef),
306         1257 => Some(all::WINDOWS_1257 as EncodingRef),
307         1258 => Some(all::WINDOWS_1258 as EncodingRef),
308         1259 => Some(all::MAC_CYRILLIC as EncodingRef),
309         936 | 54936 => Some(all::GB18030 as EncodingRef), // XXX technically wrong
310         52936 => Some(all::HZ as EncodingRef),
311         950 => Some(all::BIG5_2003 as EncodingRef),
312         20932 => Some(all::EUC_JP as EncodingRef),
313         50220 => Some(all::ISO_2022_JP as EncodingRef),
314         932 => Some(all::WINDOWS_31J as EncodingRef),
315         949 => Some(all::WINDOWS_949 as EncodingRef),
316         1201 => Some(all::UTF_16BE as EncodingRef),
317         1200 => Some(all::UTF_16LE as EncodingRef),
318         _ => None
319     }
320 }
321 
#[cfg(test)]
mod tests {
    extern crate test;
    use all;
    use super::encoding_from_whatwg_label;

    /// Exercises label normalization (ASCII-only trimming and case folding) and checks
    /// that each encoding's WHATWG name resolves back to an encoding with the same name.
    #[test]
    fn test_encoding_from_whatwg_label() {
        // Exact, upper-case and whitespace-padded spellings of the same label.
        assert!(encoding_from_whatwg_label("utf-8").is_some());
        assert!(encoding_from_whatwg_label("UTF-8").is_some());
        assert!(encoding_from_whatwg_label("\t\n\x0C\r utf-8\t\n\x0C\r ").is_some());
        assert!(encoding_from_whatwg_label("\u{A0}utf-8").is_none(),
                "Non-ASCII whitespace should not be trimmed");

        assert!(encoding_from_whatwg_label("greek").is_some());
        assert!(encoding_from_whatwg_label("gree\u{212A}").is_none(),
                "Case-insensitive matching should be ASCII only. Kelvin sign does not match k.");

        // checks if the `whatwg_name` method returns the label that resolves back to that encoding
        for encoding in all::encodings() {
            // Encodings without a WHATWG name, and the "replacement" name, are skipped.
            let name = match encoding.whatwg_name() {
                Some(name) if name != "replacement" => name,
                _ => continue,
            };
            let round_tripped = encoding_from_whatwg_label(name).and_then(|e| e.whatwg_name());
            assert_eq!(round_tripped, Some(name));
        }
    }

    /// Benchmarks a lookup with a label that matches no entry in the table.
    #[bench]
    fn bench_encoding_from_whatwg_label(bencher: &mut test::Bencher) {
        bencher.iter(|| {
            let found = encoding_from_whatwg_label("iso-8859-bazinga");
            test::black_box(found)
        })
    }
}
356 
357