1 // This is a part of rust-encoding.
2 // Copyright (c) 2013-2015, Kang Seonghoon.
3 // See README.md and LICENSE.txt for details.
4 //
5 // Portions Copyright (c) 2008-2009 Bjoern Hoehrmann <bjoern@hoehrmann.de>
6 //
7 // Permission is hereby granted, free of charge, to any person obtaining a copy
8 // of this software and associated documentation files (the "Software"), to deal
9 // in the Software without restriction, including without limitation the rights
10 // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
11 // copies of the Software, and to permit persons to whom the Software is
12 // furnished to do so, subject to the following conditions:
13 //
14 // The above copyright notice and this permission notice shall be included in
15 // all copies or substantial portions of the Software.
16 //
17 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
20 // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21 // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22 // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
23 // SOFTWARE.
24 
25 //! UTF-8, the universal encoding.
26 
27 use std::{str, mem};
28 use std::convert::Into;
29 use types::*;
30 
/**
 * UTF-8 (UCS Transformation Format, 8-bit).
 *
 * This is a Unicode encoding compatible with ASCII (ISO/IEC 646:US)
 * and able to represent all Unicode codepoints uniquely and unambiguously.
 * It has a variable-length design,
 * where one codepoint may use 1 (up to U+007F), 2 (up to U+07FF), 3 (up to U+FFFF)
 * or 4 bytes (up to U+10FFFF) depending on its value.
 * The first byte of a sequence is distinct from the other "continuation" bytes of the sequence,
 * making UTF-8 self-synchronizing and easy to handle.
 * It has a fixed endianness, and can be lexicographically sorted by codepoints.
 *
 * The UTF-8 scanner used by this module is heavily based on Bjoern Hoehrmann's
 * [Flexible and Economical UTF-8 Decoder](http://bjoern.hoehrmann.de/utf-8/decoder/dfa/).
 *
 * The type itself is a stateless unit struct; it only hands out encoders and decoders.
 */
#[derive(Clone, Copy)]
pub struct UTF8Encoding;
48 
49 impl Encoding for UTF8Encoding {
name(&self) -> &'static str50     fn name(&self) -> &'static str { "utf-8" }
whatwg_name(&self) -> Option<&'static str>51     fn whatwg_name(&self) -> Option<&'static str> { Some("utf-8") }
raw_encoder(&self) -> Box<RawEncoder>52     fn raw_encoder(&self) -> Box<RawEncoder> { UTF8Encoder::new() }
raw_decoder(&self) -> Box<RawDecoder>53     fn raw_decoder(&self) -> Box<RawDecoder> { UTF8Decoder::new() }
54 }
55 
56 /// An encoder for UTF-8.
57 #[derive(Clone, Copy)]
58 pub struct UTF8Encoder;
59 
60 impl UTF8Encoder {
new() -> Box<RawEncoder>61     pub fn new() -> Box<RawEncoder> { Box::new(UTF8Encoder) }
62 }
63 
64 impl RawEncoder for UTF8Encoder {
from_self(&self) -> Box<RawEncoder>65     fn from_self(&self) -> Box<RawEncoder> { UTF8Encoder::new() }
is_ascii_compatible(&self) -> bool66     fn is_ascii_compatible(&self) -> bool { true }
67 
raw_feed(&mut self, input: &str, output: &mut ByteWriter) -> (usize, Option<CodecError>)68     fn raw_feed(&mut self, input: &str, output: &mut ByteWriter) -> (usize, Option<CodecError>) {
69         let input: &[u8] = input.as_bytes();
70         assert!(str::from_utf8(input).is_ok());
71         output.write_bytes(input);
72         (input.len(), None)
73     }
74 
raw_finish(&mut self, _output: &mut ByteWriter) -> Option<CodecError>75     fn raw_finish(&mut self, _output: &mut ByteWriter) -> Option<CodecError> {
76         None
77     }
78 }
79 
80 /// A decoder for UTF-8.
81 #[derive(Clone, Copy)]
82 pub struct UTF8Decoder {
83     queuelen: usize,
84     queue: [u8; 4],
85     state: u8,
86 }
87 
88 impl UTF8Decoder {
new() -> Box<RawDecoder>89     pub fn new() -> Box<RawDecoder> {
90         Box::new(UTF8Decoder { queuelen: 0, queue: [0; 4], state: INITIAL_STATE })
91     }
92 }
93 
// Maps every byte value to one of twelve character categories (0-11).
// The category, added to the current state, indexes `STATE_TRANSITIONS`.
static CHAR_CATEGORY: [u8; 256] = [
    //  0 (00-7F): one byte sequence
    //  1 (80-8F): continuation byte
    //  2 (C2-DF): start of two byte sequence
    //  3 (E1-EC,EE-EF): start of three byte sequence, next byte unrestricted
    //  4 (ED): start of three byte sequence, next byte restricted to non-surrogates (80-9F)
    //  5 (F4): start of four byte sequence, next byte restricted to U+10FFFF (80-8F)
    //  6 (F1-F3): start of four byte sequence, next byte unrestricted
    //  7 (A0-BF): continuation byte
    //  8 (C0-C1,F5-FF): invalid (overlong or out-of-range) start of multi byte sequences
    //  9 (90-9F): continuation byte
    // 10 (E0): start of three byte sequence, next byte restricted to non-overlong (A0-BF)
    // 11 (F0): start of four byte sequence, next byte restricted to non-overlong (90-BF)

     0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
     0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
     0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
     0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
     1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,  9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,
     7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,  7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
     8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2,  2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
    10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8,
];

// Transition table of the scanner: `STATE_TRANSITIONS[state + category]` is the next state.
// A state is the index of the first entry of its row, and every row has one entry per
// character category. The row of state 86 (indices 86-97) deliberately overlaps the tail
// of the row of state 84, which is why the table has 110 entries.
// In the row comments, `'` marks the current position, `cc` a continuation byte still
// expected and `xx` an offending byte.
static STATE_TRANSITIONS: [u8; 110] = [
     0,98,12,24,48,84,72,98,98,98,36,60,       //  0: '??
    86, 0,86,86,86,86,86, 0,86, 0,86,86,       // 12: .. 'cc
    86,12,86,86,86,86,86,12,86,12,86,86,       // 24: .. 'cc cc
    86,86,86,86,86,86,86,12,86,86,86,86,       // 36: .. 'cc(A0-BF) cc
    86,12,86,86,86,86,86,86,86,12,86,86,       // 48: .. 'cc(80-9F) cc
    86,86,86,86,86,86,86,24,86,24,86,86,       // 60: .. 'cc(90-BF) cc cc
    86,24,86,86,86,86,86,24,86,24,86,86,       // 72: .. 'cc cc cc
    86,24,86,86,86,86,86,86,86,86,86,86,86,86, // 84: .. 'cc(80-8F) cc cc
       // 86,86,86,86,86,86,86,86,86,86,86,86, // 86: .. xx '..
          98,98,98,98,98,98,98,98,98,98,98,98, // 98: xx '..
];

// Scanner states with a special meaning. They are plain compile-time values,
// hence `const` rather than `static`.

// State before any byte has been seen; identical to the accepting state.
const INITIAL_STATE: u8 = 0;
// State reached right after a complete, valid sequence.
const ACCEPT_STATE: u8 = 0;
// Rejection where the offending byte itself belongs to the invalid sequence.
const REJECT_STATE: u8 = 98;
// Rejection where the offending byte does not belong to the invalid sequence
// and has to be scanned again as the start of the next one.
const REJECT_STATE_WITH_BACKUP: u8 = 86;

// True for both rejecting states (86 and 98); no other state is numerically this large.
macro_rules! is_reject_state(($state:expr) => ($state >= REJECT_STATE_WITH_BACKUP));
// Advances the scanner from `$state` by one input byte `$ch`.
macro_rules! next_state(($state:expr, $ch:expr) => (
    STATE_TRANSITIONS[($state + CHAR_CATEGORY[$ch as usize]) as usize]
));
140 
141 impl RawDecoder for UTF8Decoder {
from_self(&self) -> Box<RawDecoder>142     fn from_self(&self) -> Box<RawDecoder> { UTF8Decoder::new() }
is_ascii_compatible(&self) -> bool143     fn is_ascii_compatible(&self) -> bool { true }
144 
raw_feed(&mut self, input: &[u8], output: &mut StringWriter) -> (usize, Option<CodecError>)145     fn raw_feed(&mut self, input: &[u8], output: &mut StringWriter) -> (usize, Option<CodecError>) {
146         output.writer_hint(input.len());
147 
148         fn write_bytes(output: &mut StringWriter, bytes: &[u8]) {
149             output.write_str(unsafe {mem::transmute(bytes)});
150         }
151 
152         let mut state = self.state;
153         let mut processed = 0;
154         let mut offset = 0;
155 
156         // optimization: if we are in the initial state, quickly skip to the first non-MSB-set byte.
157         if state == INITIAL_STATE {
158             let first_msb = input.iter().position(|&ch| ch >= 0x80).unwrap_or(input.len());
159             offset += first_msb;
160             processed += first_msb;
161         }
162 
163         for (i, &ch) in input[offset..].iter().enumerate() {
164             state = next_state!(state, ch);
165             if state == ACCEPT_STATE {
166                 processed = i + offset + 1;
167             } else if is_reject_state!(state) {
168                 let upto = if state == REJECT_STATE {i + offset + 1} else {i + offset};
169                 self.state = INITIAL_STATE;
170                 if processed > 0 && self.queuelen > 0 { // flush `queue` outside the problem
171                     write_bytes(output, &self.queue[0..self.queuelen]);
172                 }
173                 self.queuelen = 0;
174                 write_bytes(output, &input[0..processed]);
175                 return (processed, Some(CodecError {
176                     upto: upto as isize, cause: "invalid sequence".into()
177                 }));
178             }
179         }
180 
181         self.state = state;
182         if processed > 0 && self.queuelen > 0 { // flush `queue`
183             write_bytes(output, &self.queue[0..self.queuelen]);
184             self.queuelen = 0;
185         }
186         write_bytes(output, &input[0..processed]);
187         if processed < input.len() {
188             let morequeuelen = input.len() - processed;
189             for i in 0..morequeuelen {
190                 self.queue[self.queuelen + i] = input[processed + i];
191             }
192             self.queuelen += morequeuelen;
193         }
194         (processed, None)
195     }
196 
raw_finish(&mut self, _output: &mut StringWriter) -> Option<CodecError>197     fn raw_finish(&mut self, _output: &mut StringWriter) -> Option<CodecError> {
198         let state = self.state;
199         let queuelen = self.queuelen;
200         self.state = INITIAL_STATE;
201         self.queuelen = 0;
202         if state != ACCEPT_STATE {
203             Some(CodecError { upto: 0, cause: "incomplete sequence".into() })
204         } else {
205             assert!(queuelen == 0);
206             None
207         }
208     }
209 }
210 
211 /// Almost equivalent to `std::str::from_utf8`.
212 /// This function is provided for the fair benchmark against the stdlib's UTF-8 conversion
213 /// functions, as rust-encoding always allocates a new string.
from_utf8<'a>(input: &'a [u8]) -> Option<&'a str>214 pub fn from_utf8<'a>(input: &'a [u8]) -> Option<&'a str> {
215     let mut iter = input.iter();
216     let mut state;
217 
218     macro_rules! return_as_whole(() => (return Some(unsafe {mem::transmute(input)})));
219 
220     // optimization: if we are in the initial state, quickly skip to the first non-MSB-set byte.
221     loop {
222         match iter.next() {
223             Some(&ch) if ch < 0x80 => {}
224             Some(&ch) => {
225                 state = next_state!(INITIAL_STATE, ch);
226                 break;
227             }
228             None => { return_as_whole!(); }
229         }
230     }
231 
232     for &ch in iter {
233         state = next_state!(state, ch);
234         if is_reject_state!(state) { return None; }
235     }
236     if state != ACCEPT_STATE { return None; }
237     return_as_whole!();
238 }
239 
240 #[cfg(test)]
241 mod tests {
242     // portions of these tests are adopted from Markus Kuhn's UTF-8 decoder capability and
243     // stress test: <http://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt>.
244 
245     use super::{UTF8Encoding, from_utf8};
246     use std::str;
247     use testutils;
248     use types::*;
249 
    /// Decodes well-formed input of every sequence length, one decoder per section,
    /// including empty feeds in between.
    #[test]
    fn test_valid() {
        // one byte
        let mut d = UTF8Encoding.raw_decoder();
        assert_feed_ok!(d, [0x41], [], "A");
        assert_feed_ok!(d, [0x42, 0x43], [], "BC");
        assert_feed_ok!(d, [], [], "");
        assert_feed_ok!(d, [0x44, 0x45, 0x46], [], "DEF");
        assert_finish_ok!(d, "");

        // two bytes
        let mut d = UTF8Encoding.raw_decoder();
        assert_feed_ok!(d, [0xc2, 0xa2], [], "\u{a2}");
        assert_feed_ok!(d, [0xc2, 0xac, 0xc2, 0xa9], [], "\u{ac}\u{0a9}");
        assert_feed_ok!(d, [], [], "");
        assert_feed_ok!(d, [0xd5, 0xa1, 0xd5, 0xb5, 0xd5, 0xa2, 0xd5, 0xb8, 0xd6, 0x82,
                            0xd5, 0xa2, 0xd5, 0xa5, 0xd5, 0xb6], [],
                        "\u{561}\u{0575}\u{562}\u{578}\u{582}\u{562}\u{565}\u{576}");
        assert_finish_ok!(d, "");

        // three bytes
        // (the last non-empty feed of this section consists of two-byte sequences)
        let mut d = UTF8Encoding.raw_decoder();
        assert_feed_ok!(d, [0xed, 0x92, 0x89], [], "\u{d489}");
        assert_feed_ok!(d, [0xe6, 0xbc, 0xa2, 0xe5, 0xad, 0x97], [], "\u{6f22}\u{5b57}");
        assert_feed_ok!(d, [], [], "");
        assert_feed_ok!(d, [0xc9, 0x99, 0xc9, 0x94, 0xc9, 0x90], [], "\u{259}\u{0254}\u{250}");
        assert_finish_ok!(d, "");

        // four bytes
        let mut d = UTF8Encoding.raw_decoder();
        assert_feed_ok!(d, [0xf0, 0x90, 0x82, 0x82], [], "\u{10082}");
        assert_feed_ok!(d, [], [], "");
        assert_finish_ok!(d, "");

        // we don't test encoders as it is largely a no-op.
    }
286 
    /// Decodes the smallest and largest codepoint of each sequence length,
    /// plus the codepoints on either side of the surrogate range.
    #[test]
    fn test_valid_boundary() {
        let mut d = UTF8Encoding.raw_decoder();
        assert_feed_ok!(d, [0x00], [], "\x00");
        assert_finish_ok!(d, "");

        let mut d = UTF8Encoding.raw_decoder();
        assert_feed_ok!(d, [0x7f], [], "\x7f");
        assert_finish_ok!(d, "");

        let mut d = UTF8Encoding.raw_decoder();
        assert_feed_ok!(d, [0xc2, 0x80], [], "\u{80}");
        assert_finish_ok!(d, "");

        let mut d = UTF8Encoding.raw_decoder();
        assert_feed_ok!(d, [0xdf, 0xbf], [], "\u{7ff}");
        assert_finish_ok!(d, "");

        let mut d = UTF8Encoding.raw_decoder();
        assert_feed_ok!(d, [0xe0, 0xa0, 0x80], [], "\u{800}");
        assert_finish_ok!(d, "");

        let mut d = UTF8Encoding.raw_decoder();
        assert_feed_ok!(d, [0xed, 0x9f, 0xbf], [], "\u{d7ff}");
        assert_finish_ok!(d, "");

        let mut d = UTF8Encoding.raw_decoder();
        assert_feed_ok!(d, [0xee, 0x80, 0x80], [], "\u{e000}");
        assert_finish_ok!(d, "");

        let mut d = UTF8Encoding.raw_decoder();
        assert_feed_ok!(d, [0xef, 0xbf, 0xbf], [], "\u{ffff}");
        assert_finish_ok!(d, "");

        let mut d = UTF8Encoding.raw_decoder();
        assert_feed_ok!(d, [0xf0, 0x90, 0x80, 0x80], [], "\u{10000}");
        assert_finish_ok!(d, "");

        let mut d = UTF8Encoding.raw_decoder();
        assert_feed_ok!(d, [0xf4, 0x8f, 0xbf, 0xbf], [], "\u{10ffff}");
        assert_finish_ok!(d, "");
    }
329 
    /// Feeds valid sequences split across several feeds; output must only appear once
    /// a sequence has been completed.
    // NOTE(review): the macro arguments look like (decoder, consumed bytes, bytes left
    // pending, expected output) -- confirm against the macro definitions.
    #[test]
    fn test_valid_partial() {
        let mut d = UTF8Encoding.raw_decoder();
        assert_feed_ok!(d, [], [0xf0], "");
        assert_feed_ok!(d, [], [0x90], "");
        assert_feed_ok!(d, [], [0x82], "");
        assert_feed_ok!(d, [0x82], [0xed], "\u{10082}");
        assert_feed_ok!(d, [0x92, 0x89], [], "\u{d489}");
        assert_finish_ok!(d, "");

        let mut d = UTF8Encoding.raw_decoder();
        assert_feed_ok!(d, [], [0xc2], "");
        assert_feed_ok!(d, [0xa9, 0x20], [], "\u{a9}\u{020}");
        assert_finish_ok!(d, "");
    }
345 
    /// Every continuation byte (80-BF) appearing where a sequence should start is an error,
    /// whether it is alone or followed by more continuation bytes.
    #[test]
    fn test_invalid_continuation() {
        for c in 0x80..0xc0 {
            let mut d = UTF8Encoding.raw_decoder();
            assert_feed_err!(d, [], [c], [], "");
            assert_finish_ok!(d, "");

            let mut d = UTF8Encoding.raw_decoder();
            assert_feed_err!(d, [], [c], [c], "");
            assert_finish_ok!(d, "");

            let mut d = UTF8Encoding.raw_decoder();
            assert_feed_err!(d, [], [c], [c, c], "");
            assert_finish_ok!(d, "");
        }
    }
362 
    /// Encoded surrogates (lead byte ED followed by A0-BF) are rejected.
    #[test]
    fn test_invalid_surrogate() {
        // surrogates should fail at the second byte.

        let mut d = UTF8Encoding.raw_decoder();
        assert_feed_err!(d, [], [0xed], [0xa0, 0x80], "");
        assert_finish_ok!(d, "");

        let mut d = UTF8Encoding.raw_decoder();
        assert_feed_err!(d, [], [0xed], [0xad, 0xbf], "");
        assert_finish_ok!(d, "");

        let mut d = UTF8Encoding.raw_decoder();
        assert_feed_err!(d, [], [0xed], [0xae, 0x80], "");
        assert_finish_ok!(d, "");

        let mut d = UTF8Encoding.raw_decoder();
        assert_feed_err!(d, [], [0xed], [0xaf, 0xbf], "");
        assert_finish_ok!(d, "");

        let mut d = UTF8Encoding.raw_decoder();
        assert_feed_err!(d, [], [0xed], [0xb0, 0x80], "");
        assert_finish_ok!(d, "");

        let mut d = UTF8Encoding.raw_decoder();
        assert_feed_err!(d, [], [0xed], [0xbe, 0x80], "");
        assert_finish_ok!(d, "");

        let mut d = UTF8Encoding.raw_decoder();
        assert_feed_err!(d, [], [0xed], [0xbf, 0xbf], "");
        assert_finish_ok!(d, "");
    }
395 
    /// The first value past the Unicode range (lead byte F4 followed by 90) is rejected.
    #[test]
    fn test_invalid_boundary() {
        // as with surrogates, should fail at the second byte.
        let mut d = UTF8Encoding.raw_decoder();
        assert_feed_err!(d, [], [0xf4], [0x90, 0x90, 0x90], ""); // U+110000
        assert_finish_ok!(d, "");
    }
403 
    /// Bytes F5-FF can never start a sequence: the feed fails at once and the decoder
    /// is left clean, so finishing right afterwards succeeds.
    #[test]
    fn test_invalid_start_immediate_test_finish() {
        for c in 0xf5..0x100 {
            // the range is not typed as `u8` because its end, 0x100, does not fit in one
            let c = c as u8;
            let mut d = UTF8Encoding.raw_decoder();
            assert_feed_err!(d, [], [c], [], "");
            assert_finish_ok!(d, "");
        }
    }
413 
    /// After an invalid start byte (F5-FF) a following space decodes normally,
    /// whether it arrives in the same feed or in the next one.
    #[test]
    fn test_invalid_start_followed_by_space() {
        for c in 0xf5..0x100 {
            let c = c as u8;

            let mut d = UTF8Encoding.raw_decoder();
            assert_feed_err!(d, [], [c], [0x20], "");
            assert_finish_ok!(d, "");

            let mut d = UTF8Encoding.raw_decoder();
            assert_feed_err!(d, [], [c], [], "");
            assert_feed_ok!(d, [0x20], [], "\x20");
            assert_finish_ok!(d, "");
        }
    }
429 
    /// A valid lead byte (C2-F4) with nothing after it is not an error while feeding,
    /// but finishing the stream at that point is.
    #[test]
    fn test_invalid_lone_start_immediate_test_finish() {
        for c in 0xc2..0xf5 {
            let mut d = UTF8Encoding.raw_decoder();
            assert_feed_ok!(d, [], [c], ""); // wait for cont. bytes
            assert_finish_err!(d, "");
        }
    }
438 
    /// A valid lead byte (C2-F4) followed by a space is an error that leaves the space
    /// unconsumed, whether both arrive in one feed or in two.
    #[test]
    fn test_invalid_lone_start_followed_by_space() {
        for c in 0xc2..0xf5 {
            let mut d = UTF8Encoding.raw_decoder();
            assert_feed_err!(d, [], [c], [0x20], "");
            assert_finish_ok!(d, "");

            let mut d = UTF8Encoding.raw_decoder();
            assert_feed_ok!(d, [], [c], ""); // wait for cont. bytes
            assert_feed_err!(d, [], [], [0x20], "");
            assert_finish_ok!(d, "");
        }
    }
452 
    /// A lead byte of a three- or four-byte sequence (E0-F4) plus one valid continuation
    /// byte, then a space: an error for every way of splitting the bytes across feeds.
    #[test]
    fn test_invalid_incomplete_three_byte_seq_followed_by_space() {
        for b in 0xe0..0xf5 {
            // E0 and F0 do not accept 80 as their second byte (overlong), so use A0 there
            let c = if b == 0xe0 || b == 0xf0 {0xa0} else {0x80};

            let mut d = UTF8Encoding.raw_decoder();
            assert_feed_err!(d, [], [b, c], [0x20], "");
            assert_finish_ok!(d, "");

            let mut d = UTF8Encoding.raw_decoder();
            assert_feed_ok!(d, [], [b, c], ""); // wait for cont. bytes
            assert_feed_err!(d, [], [], [0x20], "");
            assert_finish_ok!(d, "");

            let mut d = UTF8Encoding.raw_decoder();
            assert_feed_ok!(d, [], [b], ""); // wait for cont. bytes
            assert_feed_err!(d, [], [c], [0x20], "");
            assert_finish_ok!(d, "");

            let mut d = UTF8Encoding.raw_decoder();
            assert_feed_ok!(d, [], [b], ""); // wait for cont. bytes
            assert_feed_ok!(d, [], [c], ""); // wait for cont. bytes
            assert_feed_err!(d, [], [], [0x20], "");
            assert_finish_ok!(d, "");
        }
    }
479 
    /// A four-byte lead (F0-F4) plus two valid continuation bytes, then a space:
    /// an error for every way of splitting the bytes across feeds.
    #[test]
    fn test_invalid_incomplete_four_byte_seq_followed_by_space() {
        for a in 0xf0..0xf5 {
            // F0 does not accept 80 as its second byte (overlong), so use A0 there
            let b = if a == 0xf0 {0xa0} else {0x80};
            let c = 0x80;

            let mut d = UTF8Encoding.raw_decoder();
            assert_feed_err!(d, [], [a, b, c], [0x20], "");
            assert_finish_ok!(d, "");

            let mut d = UTF8Encoding.raw_decoder();
            assert_feed_ok!(d, [], [a], ""); // wait for cont. bytes
            assert_feed_ok!(d, [], [b], ""); // wait for cont. bytes
            assert_feed_ok!(d, [], [c], ""); // wait for cont. bytes
            assert_feed_err!(d, [], [], [0x20], "");
            assert_finish_ok!(d, "");

            let mut d = UTF8Encoding.raw_decoder();
            assert_feed_ok!(d, [], [a, b], ""); // wait for cont. bytes
            assert_feed_err!(d, [], [c], [0x20], "");
            assert_finish_ok!(d, "");

            let mut d = UTF8Encoding.raw_decoder();
            assert_feed_ok!(d, [], [a, b, c], ""); // wait for cont. bytes
            assert_feed_err!(d, [], [], [0x20], "");
            assert_finish_ok!(d, "");
        }
    }
508 
    /// A surplus continuation byte after a complete sequence is an error of its own;
    /// the complete sequence before it is still decoded.
    #[test]
    fn test_invalid_too_many_cont_bytes() {
        let mut d = UTF8Encoding.raw_decoder();
        assert_feed_err!(d, [0xc2, 0x80], [0x80], [], "\u{80}");
        assert_finish_ok!(d, "");

        let mut d = UTF8Encoding.raw_decoder();
        assert_feed_err!(d, [0xe0, 0xa0, 0x80], [0x80], [], "\u{800}");
        assert_finish_ok!(d, "");

        let mut d = UTF8Encoding.raw_decoder();
        assert_feed_err!(d, [0xf0, 0x90, 0x80, 0x80], [0x80], [], "\u{10000}");
        assert_finish_ok!(d, "");

        // no continuation byte is consumed after 5/6-byte sequence starters and FE/FF
        let mut d = UTF8Encoding.raw_decoder();
        assert_feed_err!(d, [], [0xf8], [0x88, 0x80, 0x80, 0x80, 0x80], "");
        assert_finish_ok!(d, "");

        let mut d = UTF8Encoding.raw_decoder();
        assert_feed_err!(d, [], [0xfc], [0x84, 0x80, 0x80, 0x80, 0x80, 0x80], "");
        assert_finish_ok!(d, "");

        let mut d = UTF8Encoding.raw_decoder();
        assert_feed_err!(d, [], [0xfe], [0x80], "");
        assert_finish_ok!(d, "");

        let mut d = UTF8Encoding.raw_decoder();
        assert_feed_err!(d, [], [0xff], [0x80], "");
        assert_finish_ok!(d, "");
    }
540 
    /// Same as `test_invalid_too_many_cont_bytes`, but with the input split so that the
    /// sequence is completed, and the surplus byte met, in a later feed.
    #[test]
    fn test_invalid_too_many_cont_bytes_partial() {
        let mut d = UTF8Encoding.raw_decoder();
        assert_feed_ok!(d, [], [0xc2], "");
        assert_feed_err!(d, [0x80], [0x80], [], "\u{80}");
        assert_finish_ok!(d, "");

        let mut d = UTF8Encoding.raw_decoder();
        assert_feed_ok!(d, [], [0xe0, 0xa0], "");
        assert_feed_err!(d, [0x80], [0x80], [], "\u{800}");
        assert_finish_ok!(d, "");

        let mut d = UTF8Encoding.raw_decoder();
        assert_feed_ok!(d, [], [0xf0, 0x90, 0x80], "");
        assert_feed_err!(d, [0x80], [0x80], [], "\u{10000}");
        assert_finish_ok!(d, "");

        // no continuation byte is consumed after 5/6-byte sequence starters and FE/FF
        let mut d = UTF8Encoding.raw_decoder();
        assert_feed_err!(d, [], [0xf8], [], "");
        assert_feed_err!(d, [], [0x88], [0x80, 0x80, 0x80, 0x80], "");
        assert_finish_ok!(d, "");

        let mut d = UTF8Encoding.raw_decoder();
        assert_feed_err!(d, [], [0xfc], [], "");
        assert_feed_err!(d, [], [0x84], [0x80, 0x80, 0x80, 0x80, 0x80], "");
        assert_finish_ok!(d, "");

        let mut d = UTF8Encoding.raw_decoder();
        assert_feed_err!(d, [], [0xfe], [], "");
        assert_feed_err!(d, [], [0x80], [], "");
        assert_finish_ok!(d, "");

        let mut d = UTF8Encoding.raw_decoder();
        assert_feed_err!(d, [], [0xff], [], "");
        assert_feed_err!(d, [], [0x80], [], "");
        assert_finish_ok!(d, "");
    }
579 
    /// Overlong encodings of U+0000 in two, three and four bytes are rejected.
    #[test]
    fn test_invalid_overlong_minimal() {
        let mut d = UTF8Encoding.raw_decoder();
        assert_feed_err!(d, [], [0xc0], [0x80], "");
        assert_finish_ok!(d, "");

        let mut d = UTF8Encoding.raw_decoder();
        assert_feed_err!(d, [], [0xe0], [0x80, 0x80], "");
        assert_finish_ok!(d, "");

        let mut d = UTF8Encoding.raw_decoder();
        assert_feed_err!(d, [], [0xf0], [0x80, 0x80, 0x80], "");
        assert_finish_ok!(d, "");
    }
594 
    /// The largest overlong encodings in two, three and four bytes
    /// (of U+007F, U+07FF and U+FFFF respectively) are rejected.
    #[test]
    fn test_invalid_overlong_maximal() {
        let mut d = UTF8Encoding.raw_decoder();
        assert_feed_err!(d, [], [0xc1], [0xbf], "");
        assert_finish_ok!(d, "");

        let mut d = UTF8Encoding.raw_decoder();
        assert_feed_err!(d, [], [0xe0], [0x9f, 0xbf], "");
        assert_finish_ok!(d, "");

        let mut d = UTF8Encoding.raw_decoder();
        assert_feed_err!(d, [], [0xf0], [0x8f, 0xbf, 0xbf], "");
        assert_finish_ok!(d, "");
    }
609 
    /// A failed finish resets the decoder: the pending lead byte is forgotten and the
    /// same decoder then handles a new stream correctly.
    #[test]
    fn test_feed_after_finish() {
        let mut d = UTF8Encoding.raw_decoder();
        assert_feed_ok!(d, [0xc2, 0x80], [0xc2], "\u{80}");
        assert_finish_err!(d, "");
        assert_feed_ok!(d, [0xc2, 0x80], [], "\u{80}");
        assert_finish_ok!(d, "");
    }
618 
    /// `from_utf8` of this module must agree with `std::str::from_utf8` on the
    /// ASCII, Korean and invalid sample texts from `testutils`.
    #[test]
    fn test_correct_from_utf8() {
        let s = testutils::ASCII_TEXT.as_bytes();
        assert_eq!(from_utf8(s), str::from_utf8(s).ok());

        let s = testutils::KOREAN_TEXT.as_bytes();
        assert_eq!(from_utf8(s), str::from_utf8(s).ok());

        let s = testutils::INVALID_UTF8_TEXT;
        assert_eq!(from_utf8(s), str::from_utf8(s).ok());
    }
630 
    // Benchmarks on `testutils::ASCII_TEXT`: this crate's encoder, decoder and
    // `from_utf8`, next to the standard library's conversions for comparison.
    mod bench_ascii {
        extern crate test;
        use super::super::{UTF8Encoding, from_utf8};
        use std::str;
        use testutils;
        use types::*;

        #[bench]
        fn bench_encode(bencher: &mut test::Bencher) {
            let s = testutils::ASCII_TEXT;
            // lets the harness report throughput per byte of input
            bencher.bytes = s.len() as u64;
            bencher.iter(|| test::black_box({
                UTF8Encoding.encode(s, EncoderTrap::Strict)
            }))
        }

        #[bench]
        fn bench_decode(bencher: &mut test::Bencher) {
            let s = testutils::ASCII_TEXT.as_bytes();
            bencher.bytes = s.len() as u64;
            bencher.iter(|| test::black_box({
                UTF8Encoding.decode(s, DecoderTrap::Strict)
            }))
        }

        #[bench]
        fn bench_from_utf8(bencher: &mut test::Bencher) {
            let s = testutils::ASCII_TEXT.as_bytes();
            bencher.bytes = s.len() as u64;
            bencher.iter(|| test::black_box({
                from_utf8(s)
            }))
        }

        #[bench] // for the comparison
        fn bench_stdlib_from_utf8(bencher: &mut test::Bencher) {
            let s = testutils::ASCII_TEXT.as_bytes();
            bencher.bytes = s.len() as u64;
            bencher.iter(|| test::black_box({
                str::from_utf8(s)
            }))
        }

        #[bench] // for the comparison
        fn bench_stdlib_from_utf8_lossy(bencher: &mut test::Bencher) {
            let s = testutils::ASCII_TEXT.as_bytes();
            bencher.bytes = s.len() as u64;
            bencher.iter(|| test::black_box({
                String::from_utf8_lossy(s)
            }))
        }
    }
683 
    // why Korean? it has an excellent mix of multibyte sequences and ASCII sequences
    // unlike other CJK scripts, so it reflects a practical use case a bit better.
    //
    // The benchmarks mirror those of `bench_ascii`, run on `testutils::KOREAN_TEXT`.
    mod bench_korean {
        extern crate test;
        use super::super::{UTF8Encoding, from_utf8};
        use std::str;
        use testutils;
        use types::*;

        #[bench]
        fn bench_encode(bencher: &mut test::Bencher) {
            let s = testutils::KOREAN_TEXT;
            // lets the harness report throughput per byte of input
            bencher.bytes = s.len() as u64;
            bencher.iter(|| test::black_box({
                UTF8Encoding.encode(s, EncoderTrap::Strict)
            }))
        }

        #[bench]
        fn bench_decode(bencher: &mut test::Bencher) {
            let s = testutils::KOREAN_TEXT.as_bytes();
            bencher.bytes = s.len() as u64;
            bencher.iter(|| test::black_box({
                UTF8Encoding.decode(s, DecoderTrap::Strict)
            }))
        }

        #[bench]
        fn bench_from_utf8(bencher: &mut test::Bencher) {
            let s = testutils::KOREAN_TEXT.as_bytes();
            bencher.bytes = s.len() as u64;
            bencher.iter(|| test::black_box({
                from_utf8(s)
            }))
        }

        #[bench] // for the comparison
        fn bench_stdlib_from_utf8(bencher: &mut test::Bencher) {
            let s = testutils::KOREAN_TEXT.as_bytes();
            bencher.bytes = s.len() as u64;
            bencher.iter(|| test::black_box({
                str::from_utf8(s)
            }))
        }

        #[bench] // for the comparison
        fn bench_stdlib_from_utf8_lossy(bencher: &mut test::Bencher) {
            let s = testutils::KOREAN_TEXT.as_bytes();
            bencher.bytes = s.len() as u64;
            bencher.iter(|| test::black_box({
                String::from_utf8_lossy(s)
            }))
        }
    }
738 
    // Benchmarks on `testutils::INVALID_UTF8_TEXT`: decoding with the `Replace` trap,
    // next to the failing strict conversions and the standard library's lossy conversion.
    mod bench_lossy_invalid {
        extern crate test;
        use super::super::{UTF8Encoding, from_utf8};
        use std::str;
        use testutils;
        use types::*;
        use types::DecoderTrap::Replace as DecodeReplace;

        #[bench]
        fn bench_decode_replace(bencher: &mut test::Bencher) {
            let s = testutils::INVALID_UTF8_TEXT;
            // lets the harness report throughput per byte of input
            bencher.bytes = s.len() as u64;
            bencher.iter(|| test::black_box({
                UTF8Encoding.decode(s, DecodeReplace)
            }))
        }

        #[bench] // for the comparison
        fn bench_from_utf8_failing(bencher: &mut test::Bencher) {
            let s = testutils::INVALID_UTF8_TEXT;
            bencher.bytes = s.len() as u64;
            bencher.iter(|| test::black_box({
                from_utf8(s)
            }))
        }

        #[bench] // for the comparison
        fn bench_stdlib_from_utf8_failing(bencher: &mut test::Bencher) {
            let s = testutils::INVALID_UTF8_TEXT;
            bencher.bytes = s.len() as u64;
            bencher.iter(|| test::black_box({
                str::from_utf8(s)
            }))
        }

        #[bench] // for the comparison
        fn bench_stdlib_from_utf8_lossy(bencher: &mut test::Bencher) {
            let s = testutils::INVALID_UTF8_TEXT;
            bencher.bytes = s.len() as u64;
            bencher.iter(|| test::black_box({
                String::from_utf8_lossy(s)
            }))
        }
    }
783 
    // The same benchmarks as `bench_lossy_invalid`, run on the data returned by
    // `testutils::get_external_bench_data()`.
    mod bench_lossy_external {
        extern crate test;
        use super::super::{UTF8Encoding, from_utf8};
        use std::str;
        use testutils;
        use types::*;
        use types::DecoderTrap::Replace as DecodeReplace;

        #[bench]
        fn bench_decode_replace(bencher: &mut test::Bencher) {
            let s = testutils::get_external_bench_data();
            // lets the harness report throughput per byte of input
            bencher.bytes = s.len() as u64;
            bencher.iter(|| test::black_box({
                UTF8Encoding.decode(&s, DecodeReplace)
            }))
        }

        #[bench] // for the comparison
        fn bench_from_utf8_failing(bencher: &mut test::Bencher) {
            let s = testutils::get_external_bench_data();
            bencher.bytes = s.len() as u64;
            bencher.iter(|| test::black_box({
                from_utf8(&s)
            }))
        }

        #[bench] // for the comparison
        fn bench_stdlib_from_utf8_failing(bencher: &mut test::Bencher) {
            let s = testutils::get_external_bench_data();
            bencher.bytes = s.len() as u64;
            bencher.iter(|| test::black_box({
                str::from_utf8(&s)
            }))
        }

        #[bench] // for the comparison
        fn bench_stdlib_from_utf8_lossy(bencher: &mut test::Bencher) {
            let s = testutils::get_external_bench_data();
            bencher.bytes = s.len() as u64;
            bencher.iter(|| test::black_box({
                String::from_utf8_lossy(&s)
            }))
        }
    }
828 }
829