1 // This is a part of rust-encoding.
2 // Copyright (c) 2013-2015, Kang Seonghoon.
3 // See README.md and LICENSE.txt for details.
4 //
5 // Portions Copyright (c) 2008-2009 Bjoern Hoehrmann <bjoern@hoehrmann.de>
6 //
7 // Permission is hereby granted, free of charge, to any person obtaining a copy
8 // of this software and associated documentation files (the "Software"), to deal
9 // in the Software without restriction, including without limitation the rights
10 // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
11 // copies of the Software, and to permit persons to whom the Software is
12 // furnished to do so, subject to the following conditions:
13 //
14 // The above copyright notice and this permission notice shall be included in
15 // all copies or substantial portions of the Software.
16 //
17 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
20 // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21 // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22 // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
23 // SOFTWARE.
24
25 //! UTF-8, the universal encoding.
26
27 use std::{str, mem};
28 use std::convert::Into;
29 use types::*;
30
/**
 * UTF-8 (UCS Transformation Format, 8-bit).
 *
 * This is a Unicode encoding compatible with ASCII (ISO/IEC 646:US)
 * and able to represent all Unicode codepoints uniquely and unambiguously.
 * It has a variable-length design,
 * where one codepoint may use 1 (up to U+007F), 2 (up to U+07FF), 3 (up to U+FFFF)
 * or 4 bytes (up to U+10FFFF) depending on its value.
 * The first byte of the sequence is distinct from other "continuation" bytes of the sequence,
 * making UTF-8 self-synchronizable and easy to handle.
 * It has a fixed endianness, and can be lexicographically sorted by codepoints.
 *
 * The UTF-8 scanner used by this module is heavily based on Bjoern Hoehrmann's
 * [Flexible and Economical UTF-8 Decoder](http://bjoern.hoehrmann.de/utf-8/decoder/dfa/).
 */
#[derive(Clone, Copy)]
pub struct UTF8Encoding;
48
49 impl Encoding for UTF8Encoding {
name(&self) -> &'static str50 fn name(&self) -> &'static str { "utf-8" }
whatwg_name(&self) -> Option<&'static str>51 fn whatwg_name(&self) -> Option<&'static str> { Some("utf-8") }
raw_encoder(&self) -> Box<RawEncoder>52 fn raw_encoder(&self) -> Box<RawEncoder> { UTF8Encoder::new() }
raw_decoder(&self) -> Box<RawDecoder>53 fn raw_decoder(&self) -> Box<RawDecoder> { UTF8Decoder::new() }
54 }
55
56 /// An encoder for UTF-8.
57 #[derive(Clone, Copy)]
58 pub struct UTF8Encoder;
59
60 impl UTF8Encoder {
new() -> Box<RawEncoder>61 pub fn new() -> Box<RawEncoder> { Box::new(UTF8Encoder) }
62 }
63
64 impl RawEncoder for UTF8Encoder {
from_self(&self) -> Box<RawEncoder>65 fn from_self(&self) -> Box<RawEncoder> { UTF8Encoder::new() }
is_ascii_compatible(&self) -> bool66 fn is_ascii_compatible(&self) -> bool { true }
67
raw_feed(&mut self, input: &str, output: &mut ByteWriter) -> (usize, Option<CodecError>)68 fn raw_feed(&mut self, input: &str, output: &mut ByteWriter) -> (usize, Option<CodecError>) {
69 let input: &[u8] = input.as_bytes();
70 assert!(str::from_utf8(input).is_ok());
71 output.write_bytes(input);
72 (input.len(), None)
73 }
74
raw_finish(&mut self, _output: &mut ByteWriter) -> Option<CodecError>75 fn raw_finish(&mut self, _output: &mut ByteWriter) -> Option<CodecError> {
76 None
77 }
78 }
79
80 /// A decoder for UTF-8.
81 #[derive(Clone, Copy)]
82 pub struct UTF8Decoder {
83 queuelen: usize,
84 queue: [u8; 4],
85 state: u8,
86 }
87
88 impl UTF8Decoder {
new() -> Box<RawDecoder>89 pub fn new() -> Box<RawDecoder> {
90 Box::new(UTF8Decoder { queuelen: 0, queue: [0; 4], state: INITIAL_STATE })
91 }
92 }
93
/// Maps every byte value (the index) to one of the twelve character categories below.
/// `next_state!` adds the category to the current state to index `STATE_TRANSITIONS`.
static CHAR_CATEGORY: [u8; 256] = [
    //  0 (00-7F): one byte sequence
    //  1 (80-8F): continuation byte
    //  2 (C2-DF): start of two byte sequence
    //  3 (E1-EC,EE-EF): start of three byte sequence, next byte unrestricted
    //  4 (ED): start of three byte sequence, next byte restricted to non-surrogates (80-9F)
    //  5 (F4): start of four byte sequence, next byte restricted to up to U+10FFFF (80-8F)
    //  6 (F1-F3): start of four byte sequence, next byte unrestricted
    //  7 (A0-BF): continuation byte
    //  8 (C0-C1,F5-FF): invalid (overlong or out-of-range) start of multi byte sequences
    //  9 (90-9F): continuation byte
    // 10 (E0): start of three byte sequence, next byte restricted to non-overlong (A0-BF)
    // 11 (F0): start of four byte sequence, next byte restricted to non-overlong (90-BF)

    // each row below covers 32 consecutive byte values, starting from 00.
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,
    7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
    8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
    10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8,
];
117
/// The transition table of the scanner.
///
/// A state is the offset of its row in this table, and a row has one entry per character
/// category (see `CHAR_CATEGORY`), so the next state is `STATE_TRANSITIONS[state + category]`.
///
/// The row for state 84 is written with 14 entries: its last 12 entries (offsets 86-97)
/// double as the row for state 86, which is therefore only shown as a comment.
/// Both state 86 and state 98 map every category back to themselves.
///
/// NOTE(review): in the row comments, `'` presumably marks the scanner position, `cc` a
/// continuation byte that is still expected and `xx` the offending byte -- confirm.
static STATE_TRANSITIONS: [u8; 110] = [
     0,98,12,24,48,84,72,98,98,98,36,60,       //  0: '??
    86, 0,86,86,86,86,86, 0,86, 0,86,86,       // 12: .. 'cc
    86,12,86,86,86,86,86,12,86,12,86,86,       // 24: .. 'cc cc
    86,86,86,86,86,86,86,12,86,86,86,86,       // 36: .. 'cc(A0-BF) cc
    86,12,86,86,86,86,86,86,86,12,86,86,       // 48: .. 'cc(80-9F) cc
    86,86,86,86,86,86,86,24,86,24,86,86,       // 60: .. 'cc(90-BF) cc cc
    86,24,86,86,86,86,86,24,86,24,86,86,       // 72: .. 'cc cc cc
    86,24,86,86,86,86,86,86,86,86,86,86,86,86, // 84: .. 'cc(80-8F) cc cc
       // 86,86,86,86,86,86,86,86,86,86,86,86, // 86: .. xx '..
    98,98,98,98,98,98,98,98,98,98,98,98,       // 98: xx '..
];
130
/// The state a fresh (or reset) scanner starts in.
static INITIAL_STATE: u8 = 0;
/// The state reached right after a complete sequence; it has the same value as `INITIAL_STATE`.
static ACCEPT_STATE: u8 = 0;
/// The reject state for which the reported error range includes the offending byte.
static REJECT_STATE: u8 = 98;
/// The reject state for which the reported error range stops before the offending byte.
static REJECT_STATE_WITH_BACKUP: u8 = 86;
135
// true for both reject states: every other state used by the transition table
// is smaller than `REJECT_STATE_WITH_BACKUP`, and `REJECT_STATE` is larger.
macro_rules! is_reject_state(($state:expr) => ($state >= REJECT_STATE_WITH_BACKUP));
// looks up the state following `$state` once the byte `$ch` has been consumed:
// the category of the byte selects the entry within the row starting at `$state`.
macro_rules! next_state(($state:expr, $ch:expr) => (
    STATE_TRANSITIONS[($state + CHAR_CATEGORY[$ch as usize]) as usize]
));
140
impl RawDecoder for UTF8Decoder {
    /// Returns a new decoder in its initial state; nothing is copied from `self`.
    fn from_self(&self) -> Box<RawDecoder> { UTF8Decoder::new() }
    /// Always `true`.
    fn is_ascii_compatible(&self) -> bool { true }

    /// Runs the scanner over `input` and writes every completed sequence to `output`.
    ///
    /// The first element of the result is the offset in `input` just past the last byte at
    /// which the scanner was in the accept state. Without an error, the bytes after that
    /// offset are kept in `self.queue` and the scanner state is saved for the next call.
    ///
    /// At the first rejected byte the decoder is reset to the initial state with an empty
    /// queue and a `CodecError` with the cause "invalid sequence" is returned; its `upto`
    /// includes the rejected byte for `REJECT_STATE` and excludes it otherwise.
    fn raw_feed(&mut self, input: &[u8], output: &mut StringWriter) -> (usize, Option<CodecError>) {
        output.writer_hint(input.len());

        // NOTE(review): the bytes are reinterpreted as `&str` without any check, and the queue
        // and `input[0..processed]` are written by separate calls. when a sequence straddles
        // two `raw_feed` calls, each `write_str` call receives only a part of that sequence;
        // confirm that every `StringWriter` implementation tolerates this.
        fn write_bytes(output: &mut StringWriter, bytes: &[u8]) {
            output.write_str(unsafe {mem::transmute(bytes)});
        }

        let mut state = self.state;
        // the offset in `input` just past the last byte known to end a complete sequence.
        let mut processed = 0;
        // the offset in `input` at which the state machine starts scanning.
        let mut offset = 0;

        // optimization: if we are in the initial state, quickly skip to the first non-MSB-set byte.
        if state == INITIAL_STATE {
            let first_msb = input.iter().position(|&ch| ch >= 0x80).unwrap_or(input.len());
            offset += first_msb;
            processed += first_msb;
        }

        for (i, &ch) in input[offset..].iter().enumerate() {
            state = next_state!(state, ch);
            if state == ACCEPT_STATE {
                processed = i + offset + 1;
            } else if is_reject_state!(state) {
                // `REJECT_STATE` covers the current byte, the other reject state stops before it.
                let upto = if state == REJECT_STATE {i + offset + 1} else {i + offset};
                self.state = INITIAL_STATE;
                // with `processed == 0` the queued bytes belong to the rejected sequence
                // and are dropped without being written.
                if processed > 0 && self.queuelen > 0 { // flush `queue` outside the problem
                    write_bytes(output, &self.queue[0..self.queuelen]);
                }
                self.queuelen = 0;
                write_bytes(output, &input[0..processed]);
                return (processed, Some(CodecError {
                    upto: upto as isize, cause: "invalid sequence".into()
                }));
            }
        }

        self.state = state;
        // the queued bytes are completed by this input only if an accept state was reached.
        if processed > 0 && self.queuelen > 0 { // flush `queue`
            write_bytes(output, &self.queue[0..self.queuelen]);
            self.queuelen = 0;
        }
        write_bytes(output, &input[0..processed]);
        // keep the trailing bytes of a still incomplete sequence for the next call.
        if processed < input.len() {
            let morequeuelen = input.len() - processed;
            for i in 0..morequeuelen {
                self.queue[self.queuelen + i] = input[processed + i];
            }
            self.queuelen += morequeuelen;
        }
        (processed, None)
    }

    /// Resets the decoder to the initial state with an empty queue.
    ///
    /// Returns a `CodecError` with the cause "incomplete sequence" and `upto` of zero
    /// if the scanner was not in the accept state, and `None` otherwise.
    /// Nothing is written to the output.
    ///
    /// # Panics
    ///
    /// Panics if the scanner was in the accept state while the queue was not empty.
    fn raw_finish(&mut self, _output: &mut StringWriter) -> Option<CodecError> {
        let state = self.state;
        let queuelen = self.queuelen;
        self.state = INITIAL_STATE;
        self.queuelen = 0;
        if state != ACCEPT_STATE {
            Some(CodecError { upto: 0, cause: "incomplete sequence".into() })
        } else {
            assert!(queuelen == 0);
            None
        }
    }
}
210
211 /// Almost equivalent to `std::str::from_utf8`.
212 /// This function is provided for the fair benchmark against the stdlib's UTF-8 conversion
213 /// functions, as rust-encoding always allocates a new string.
from_utf8<'a>(input: &'a [u8]) -> Option<&'a str>214 pub fn from_utf8<'a>(input: &'a [u8]) -> Option<&'a str> {
215 let mut iter = input.iter();
216 let mut state;
217
218 macro_rules! return_as_whole(() => (return Some(unsafe {mem::transmute(input)})));
219
220 // optimization: if we are in the initial state, quickly skip to the first non-MSB-set byte.
221 loop {
222 match iter.next() {
223 Some(&ch) if ch < 0x80 => {}
224 Some(&ch) => {
225 state = next_state!(INITIAL_STATE, ch);
226 break;
227 }
228 None => { return_as_whole!(); }
229 }
230 }
231
232 for &ch in iter {
233 state = next_state!(state, ch);
234 if is_reject_state!(state) { return None; }
235 }
236 if state != ACCEPT_STATE { return None; }
237 return_as_whole!();
238 }
239
240 #[cfg(test)]
241 mod tests {
242 // portions of these tests are adopted from Markus Kuhn's UTF-8 decoder capability and
243 // stress test: <http://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt>.
244
245 use super::{UTF8Encoding, from_utf8};
246 use std::str;
247 use testutils;
248 use types::*;
249
/// Feeds well-formed sequences of each length to a decoder, one fresh decoder per group.
#[test]
fn test_valid() {
    // one byte
    let mut d = UTF8Encoding.raw_decoder();
    assert_feed_ok!(d, [0x41], [], "A");
    assert_feed_ok!(d, [0x42, 0x43], [], "BC");
    assert_feed_ok!(d, [], [], "");
    assert_feed_ok!(d, [0x44, 0x45, 0x46], [], "DEF");
    assert_finish_ok!(d, "");

    // two bytes
    let mut d = UTF8Encoding.raw_decoder();
    assert_feed_ok!(d, [0xc2, 0xa2], [], "\u{a2}");
    assert_feed_ok!(d, [0xc2, 0xac, 0xc2, 0xa9], [], "\u{ac}\u{0a9}");
    assert_feed_ok!(d, [], [], "");
    assert_feed_ok!(d, [0xd5, 0xa1, 0xd5, 0xb5, 0xd5, 0xa2, 0xd5, 0xb8, 0xd6, 0x82,
                        0xd5, 0xa2, 0xd5, 0xa5, 0xd5, 0xb6], [],
                    "\u{561}\u{0575}\u{562}\u{578}\u{582}\u{562}\u{565}\u{576}");
    assert_finish_ok!(d, "");

    // three bytes
    let mut d = UTF8Encoding.raw_decoder();
    assert_feed_ok!(d, [0xed, 0x92, 0x89], [], "\u{d489}");
    assert_feed_ok!(d, [0xe6, 0xbc, 0xa2, 0xe5, 0xad, 0x97], [], "\u{6f22}\u{5b57}");
    assert_feed_ok!(d, [], [], "");
    // (the next input consists of two byte sequences, fed to the same decoder.)
    assert_feed_ok!(d, [0xc9, 0x99, 0xc9, 0x94, 0xc9, 0x90], [], "\u{259}\u{0254}\u{250}");
    assert_finish_ok!(d, "");

    // four bytes
    let mut d = UTF8Encoding.raw_decoder();
    assert_feed_ok!(d, [0xf0, 0x90, 0x82, 0x82], [], "\u{10082}");
    assert_feed_ok!(d, [], [], "");
    assert_finish_ok!(d, "");

    // we don't test encoders as it is largely a no-op.
}
286
/// Feeds the codepoints at the edges of each sequence length (and around the surrogate
/// range), each to a fresh decoder.
#[test]
fn test_valid_boundary() {
    let mut d = UTF8Encoding.raw_decoder();
    assert_feed_ok!(d, [0x00], [], "\x00");
    assert_finish_ok!(d, "");

    let mut d = UTF8Encoding.raw_decoder();
    assert_feed_ok!(d, [0x7f], [], "\x7f");
    assert_finish_ok!(d, "");

    let mut d = UTF8Encoding.raw_decoder();
    assert_feed_ok!(d, [0xc2, 0x80], [], "\u{80}");
    assert_finish_ok!(d, "");

    let mut d = UTF8Encoding.raw_decoder();
    assert_feed_ok!(d, [0xdf, 0xbf], [], "\u{7ff}");
    assert_finish_ok!(d, "");

    let mut d = UTF8Encoding.raw_decoder();
    assert_feed_ok!(d, [0xe0, 0xa0, 0x80], [], "\u{800}");
    assert_finish_ok!(d, "");

    let mut d = UTF8Encoding.raw_decoder();
    assert_feed_ok!(d, [0xed, 0x9f, 0xbf], [], "\u{d7ff}");
    assert_finish_ok!(d, "");

    let mut d = UTF8Encoding.raw_decoder();
    assert_feed_ok!(d, [0xee, 0x80, 0x80], [], "\u{e000}");
    assert_finish_ok!(d, "");

    let mut d = UTF8Encoding.raw_decoder();
    assert_feed_ok!(d, [0xef, 0xbf, 0xbf], [], "\u{ffff}");
    assert_finish_ok!(d, "");

    let mut d = UTF8Encoding.raw_decoder();
    assert_feed_ok!(d, [0xf0, 0x90, 0x80, 0x80], [], "\u{10000}");
    assert_finish_ok!(d, "");

    let mut d = UTF8Encoding.raw_decoder();
    assert_feed_ok!(d, [0xf4, 0x8f, 0xbf, 0xbf], [], "\u{10ffff}");
    assert_finish_ok!(d, "");
}
329
/// Feeds well-formed sequences that are split across several feed calls.
#[test]
fn test_valid_partial() {
    let mut d = UTF8Encoding.raw_decoder();
    assert_feed_ok!(d, [], [0xf0], "");
    assert_feed_ok!(d, [], [0x90], "");
    assert_feed_ok!(d, [], [0x82], "");
    assert_feed_ok!(d, [0x82], [0xed], "\u{10082}");
    assert_feed_ok!(d, [0x92, 0x89], [], "\u{d489}");
    assert_finish_ok!(d, "");

    let mut d = UTF8Encoding.raw_decoder();
    assert_feed_ok!(d, [], [0xc2], "");
    assert_feed_ok!(d, [0xa9, 0x20], [], "\u{a9}\u{020}");
    assert_finish_ok!(d, "");
}
345
/// Feeds every continuation byte (80-BF) as the first byte of the input,
/// alone and followed by one or two copies of itself.
#[test]
fn test_invalid_continuation() {
    for c in 0x80..0xc0 {
        let mut d = UTF8Encoding.raw_decoder();
        assert_feed_err!(d, [], [c], [], "");
        assert_finish_ok!(d, "");

        let mut d = UTF8Encoding.raw_decoder();
        assert_feed_err!(d, [], [c], [c], "");
        assert_finish_ok!(d, "");

        let mut d = UTF8Encoding.raw_decoder();
        assert_feed_err!(d, [], [c], [c, c], "");
        assert_finish_ok!(d, "");
    }
}
362
/// Feeds three byte sequences that start with ED and continue with a byte in A0-BF.
#[test]
fn test_invalid_surrogate() {
    // surrogates should fail at the second byte.

    let mut d = UTF8Encoding.raw_decoder();
    assert_feed_err!(d, [], [0xed], [0xa0, 0x80], "");
    assert_finish_ok!(d, "");

    let mut d = UTF8Encoding.raw_decoder();
    assert_feed_err!(d, [], [0xed], [0xad, 0xbf], "");
    assert_finish_ok!(d, "");

    let mut d = UTF8Encoding.raw_decoder();
    assert_feed_err!(d, [], [0xed], [0xae, 0x80], "");
    assert_finish_ok!(d, "");

    let mut d = UTF8Encoding.raw_decoder();
    assert_feed_err!(d, [], [0xed], [0xaf, 0xbf], "");
    assert_finish_ok!(d, "");

    let mut d = UTF8Encoding.raw_decoder();
    assert_feed_err!(d, [], [0xed], [0xb0, 0x80], "");
    assert_finish_ok!(d, "");

    let mut d = UTF8Encoding.raw_decoder();
    assert_feed_err!(d, [], [0xed], [0xbe, 0x80], "");
    assert_finish_ok!(d, "");

    let mut d = UTF8Encoding.raw_decoder();
    assert_feed_err!(d, [], [0xed], [0xbf, 0xbf], "");
    assert_finish_ok!(d, "");
}
395
/// Feeds the four byte sequence for the first codepoint past U+10FFFF.
#[test]
fn test_invalid_boundary() {
    // as with surrogates, should fail at the second byte.
    let mut d = UTF8Encoding.raw_decoder();
    assert_feed_err!(d, [], [0xf4], [0x90, 0x90, 0x90], ""); // U+110000
    assert_finish_ok!(d, "");
}
403
/// Feeds each of the bytes F5-FF alone and finishes the decoder right after.
#[test]
fn test_invalid_start_immediate_test_finish() {
    // the range is iterated with a wider integer type, as 0x100 does not fit in `u8`.
    for c in 0xf5..0x100 {
        let c = c as u8;
        let mut d = UTF8Encoding.raw_decoder();
        assert_feed_err!(d, [], [c], [], "");
        assert_finish_ok!(d, "");
    }
}
413
/// Feeds each of the bytes F5-FF followed by a space,
/// with the space either in the same input or in the next one.
#[test]
fn test_invalid_start_followed_by_space() {
    // the range is iterated with a wider integer type, as 0x100 does not fit in `u8`.
    for c in 0xf5..0x100 {
        let c = c as u8;

        let mut d = UTF8Encoding.raw_decoder();
        assert_feed_err!(d, [], [c], [0x20], "");
        assert_finish_ok!(d, "");

        let mut d = UTF8Encoding.raw_decoder();
        assert_feed_err!(d, [], [c], [], "");
        assert_feed_ok!(d, [0x20], [], "\x20");
        assert_finish_ok!(d, "");
    }
}
429
/// Feeds each of the start bytes C2-F4 alone and finishes the decoder right after,
/// so that the error only shows up at the finish.
#[test]
fn test_invalid_lone_start_immediate_test_finish() {
    for c in 0xc2..0xf5 {
        let mut d = UTF8Encoding.raw_decoder();
        assert_feed_ok!(d, [], [c], ""); // wait for cont. bytes
        assert_finish_err!(d, "");
    }
}
438
/// Feeds each of the start bytes C2-F4 followed by a space instead of a continuation byte,
/// with the space either in the same input or in the next one.
#[test]
fn test_invalid_lone_start_followed_by_space() {
    for c in 0xc2..0xf5 {
        let mut d = UTF8Encoding.raw_decoder();
        assert_feed_err!(d, [], [c], [0x20], "");
        assert_finish_ok!(d, "");

        let mut d = UTF8Encoding.raw_decoder();
        assert_feed_ok!(d, [], [c], ""); // wait for cont. bytes
        assert_feed_err!(d, [], [], [0x20], "");
        assert_finish_ok!(d, "");
    }
}
452
/// Feeds a start byte in E0-F4 and one continuation byte followed by a space,
/// for every way of splitting these three bytes into inputs.
#[test]
fn test_invalid_incomplete_three_byte_seq_followed_by_space() {
    for b in 0xe0..0xf5 {
        // A0 is an acceptable second byte after both E0 and F0, while 80 is not.
        let c = if b == 0xe0 || b == 0xf0 {0xa0} else {0x80};

        let mut d = UTF8Encoding.raw_decoder();
        assert_feed_err!(d, [], [b, c], [0x20], "");
        assert_finish_ok!(d, "");

        let mut d = UTF8Encoding.raw_decoder();
        assert_feed_ok!(d, [], [b, c], ""); // wait for cont. bytes
        assert_feed_err!(d, [], [], [0x20], "");
        assert_finish_ok!(d, "");

        let mut d = UTF8Encoding.raw_decoder();
        assert_feed_ok!(d, [], [b], ""); // wait for cont. bytes
        assert_feed_err!(d, [], [c], [0x20], "");
        assert_finish_ok!(d, "");

        let mut d = UTF8Encoding.raw_decoder();
        assert_feed_ok!(d, [], [b], ""); // wait for cont. bytes
        assert_feed_ok!(d, [], [c], ""); // wait for cont. bytes
        assert_feed_err!(d, [], [], [0x20], "");
        assert_finish_ok!(d, "");
    }
}
479
/// Feeds a start byte in F0-F4 and two continuation bytes followed by a space,
/// for several ways of splitting these four bytes into inputs.
#[test]
fn test_invalid_incomplete_four_byte_seq_followed_by_space() {
    for a in 0xf0..0xf5 {
        // A0 is an acceptable second byte after F0, while 80 is not.
        let b = if a == 0xf0 {0xa0} else {0x80};
        let c = 0x80;

        let mut d = UTF8Encoding.raw_decoder();
        assert_feed_err!(d, [], [a, b, c], [0x20], "");
        assert_finish_ok!(d, "");

        let mut d = UTF8Encoding.raw_decoder();
        assert_feed_ok!(d, [], [a], ""); // wait for cont. bytes
        assert_feed_ok!(d, [], [b], ""); // wait for cont. bytes
        assert_feed_ok!(d, [], [c], ""); // wait for cont. bytes
        assert_feed_err!(d, [], [], [0x20], "");
        assert_finish_ok!(d, "");

        let mut d = UTF8Encoding.raw_decoder();
        assert_feed_ok!(d, [], [a, b], ""); // wait for cont. bytes
        assert_feed_err!(d, [], [c], [0x20], "");
        assert_finish_ok!(d, "");

        let mut d = UTF8Encoding.raw_decoder();
        assert_feed_ok!(d, [], [a, b, c], ""); // wait for cont. bytes
        assert_feed_err!(d, [], [], [0x20], "");
        assert_finish_ok!(d, "");
    }
}
508
/// Feeds complete sequences followed by one continuation byte too many,
/// and the invalid start bytes F8, FC, FE and FF followed by continuation bytes.
#[test]
fn test_invalid_too_many_cont_bytes() {
    let mut d = UTF8Encoding.raw_decoder();
    assert_feed_err!(d, [0xc2, 0x80], [0x80], [], "\u{80}");
    assert_finish_ok!(d, "");

    let mut d = UTF8Encoding.raw_decoder();
    assert_feed_err!(d, [0xe0, 0xa0, 0x80], [0x80], [], "\u{800}");
    assert_finish_ok!(d, "");

    let mut d = UTF8Encoding.raw_decoder();
    assert_feed_err!(d, [0xf0, 0x90, 0x80, 0x80], [0x80], [], "\u{10000}");
    assert_finish_ok!(d, "");

    // no continuation byte is consumed after 5/6-byte sequence starters and FE/FF
    let mut d = UTF8Encoding.raw_decoder();
    assert_feed_err!(d, [], [0xf8], [0x88, 0x80, 0x80, 0x80, 0x80], "");
    assert_finish_ok!(d, "");

    let mut d = UTF8Encoding.raw_decoder();
    assert_feed_err!(d, [], [0xfc], [0x84, 0x80, 0x80, 0x80, 0x80, 0x80], "");
    assert_finish_ok!(d, "");

    let mut d = UTF8Encoding.raw_decoder();
    assert_feed_err!(d, [], [0xfe], [0x80], "");
    assert_finish_ok!(d, "");

    let mut d = UTF8Encoding.raw_decoder();
    assert_feed_err!(d, [], [0xff], [0x80], "");
    assert_finish_ok!(d, "");
}
540
/// The same inputs as `test_invalid_too_many_cont_bytes`, split across two feed calls.
#[test]
fn test_invalid_too_many_cont_bytes_partial() {
    let mut d = UTF8Encoding.raw_decoder();
    assert_feed_ok!(d, [], [0xc2], "");
    assert_feed_err!(d, [0x80], [0x80], [], "\u{80}");
    assert_finish_ok!(d, "");

    let mut d = UTF8Encoding.raw_decoder();
    assert_feed_ok!(d, [], [0xe0, 0xa0], "");
    assert_feed_err!(d, [0x80], [0x80], [], "\u{800}");
    assert_finish_ok!(d, "");

    let mut d = UTF8Encoding.raw_decoder();
    assert_feed_ok!(d, [], [0xf0, 0x90, 0x80], "");
    assert_feed_err!(d, [0x80], [0x80], [], "\u{10000}");
    assert_finish_ok!(d, "");

    // no continuation byte is consumed after 5/6-byte sequence starters and FE/FF
    let mut d = UTF8Encoding.raw_decoder();
    assert_feed_err!(d, [], [0xf8], [], "");
    assert_feed_err!(d, [], [0x88], [0x80, 0x80, 0x80, 0x80], "");
    assert_finish_ok!(d, "");

    let mut d = UTF8Encoding.raw_decoder();
    assert_feed_err!(d, [], [0xfc], [], "");
    assert_feed_err!(d, [], [0x84], [0x80, 0x80, 0x80, 0x80, 0x80], "");
    assert_finish_ok!(d, "");

    let mut d = UTF8Encoding.raw_decoder();
    assert_feed_err!(d, [], [0xfe], [], "");
    assert_feed_err!(d, [], [0x80], [], "");
    assert_finish_ok!(d, "");

    let mut d = UTF8Encoding.raw_decoder();
    assert_feed_err!(d, [], [0xff], [], "");
    assert_feed_err!(d, [], [0x80], [], "");
    assert_finish_ok!(d, "");
}
579
/// Feeds the two, three and four byte forms whose payload bits are all zero.
#[test]
fn test_invalid_overlong_minimal() {
    let mut d = UTF8Encoding.raw_decoder();
    assert_feed_err!(d, [], [0xc0], [0x80], "");
    assert_finish_ok!(d, "");

    let mut d = UTF8Encoding.raw_decoder();
    assert_feed_err!(d, [], [0xe0], [0x80, 0x80], "");
    assert_finish_ok!(d, "");

    let mut d = UTF8Encoding.raw_decoder();
    assert_feed_err!(d, [], [0xf0], [0x80, 0x80, 0x80], "");
    assert_finish_ok!(d, "");
}
594
/// Feeds the largest overlong two, three and four byte forms.
#[test]
fn test_invalid_overlong_maximal() {
    let mut d = UTF8Encoding.raw_decoder();
    assert_feed_err!(d, [], [0xc1], [0xbf], "");
    assert_finish_ok!(d, "");

    let mut d = UTF8Encoding.raw_decoder();
    assert_feed_err!(d, [], [0xe0], [0x9f, 0xbf], "");
    assert_finish_ok!(d, "");

    let mut d = UTF8Encoding.raw_decoder();
    assert_feed_err!(d, [], [0xf0], [0x8f, 0xbf, 0xbf], "");
    assert_finish_ok!(d, "");
}
609
/// Checks that a decoder can be fed again after a finish that reported an error:
/// the lone C2 left over from the first input must not affect the second one.
#[test]
fn test_feed_after_finish() {
    let mut d = UTF8Encoding.raw_decoder();
    assert_feed_ok!(d, [0xc2, 0x80], [0xc2], "\u{80}");
    assert_finish_err!(d, "");
    assert_feed_ok!(d, [0xc2, 0x80], [], "\u{80}");
    assert_finish_ok!(d, "");
}
618
/// Checks that `from_utf8` of this module agrees with `std::str::from_utf8`
/// on the three sample texts from `testutils`.
#[test]
fn test_correct_from_utf8() {
    let s = testutils::ASCII_TEXT.as_bytes();
    assert_eq!(from_utf8(s), str::from_utf8(s).ok());

    let s = testutils::KOREAN_TEXT.as_bytes();
    assert_eq!(from_utf8(s), str::from_utf8(s).ok());

    let s = testutils::INVALID_UTF8_TEXT;
    assert_eq!(from_utf8(s), str::from_utf8(s).ok());
}
630
// benchmarks over `testutils::ASCII_TEXT`: the encoder, the decoder and `from_utf8`
// of this module, next to the stdlib conversions for the comparison.
mod bench_ascii {
    extern crate test;
    use super::super::{UTF8Encoding, from_utf8};
    use std::str;
    use testutils;
    use types::*;

    #[bench]
    fn bench_encode(bencher: &mut test::Bencher) {
        let s = testutils::ASCII_TEXT;
        bencher.bytes = s.len() as u64;
        bencher.iter(|| test::black_box({
            UTF8Encoding.encode(s, EncoderTrap::Strict)
        }))
    }

    #[bench]
    fn bench_decode(bencher: &mut test::Bencher) {
        let s = testutils::ASCII_TEXT.as_bytes();
        bencher.bytes = s.len() as u64;
        bencher.iter(|| test::black_box({
            UTF8Encoding.decode(s, DecoderTrap::Strict)
        }))
    }

    #[bench]
    fn bench_from_utf8(bencher: &mut test::Bencher) {
        let s = testutils::ASCII_TEXT.as_bytes();
        bencher.bytes = s.len() as u64;
        bencher.iter(|| test::black_box({
            from_utf8(s)
        }))
    }

    #[bench] // for the comparison
    fn bench_stdlib_from_utf8(bencher: &mut test::Bencher) {
        let s = testutils::ASCII_TEXT.as_bytes();
        bencher.bytes = s.len() as u64;
        bencher.iter(|| test::black_box({
            str::from_utf8(s)
        }))
    }

    #[bench] // for the comparison
    fn bench_stdlib_from_utf8_lossy(bencher: &mut test::Bencher) {
        let s = testutils::ASCII_TEXT.as_bytes();
        bencher.bytes = s.len() as u64;
        bencher.iter(|| test::black_box({
            String::from_utf8_lossy(s)
        }))
    }
}
683
// benchmarks over `testutils::KOREAN_TEXT`, with the same set of functions as `bench_ascii`.
//
// why Korean? it has an excellent mix of multibyte sequences and ASCII sequences
// unlike other CJK scripts, so it reflects a practical use case a bit better.
mod bench_korean {
    extern crate test;
    use super::super::{UTF8Encoding, from_utf8};
    use std::str;
    use testutils;
    use types::*;

    #[bench]
    fn bench_encode(bencher: &mut test::Bencher) {
        let s = testutils::KOREAN_TEXT;
        bencher.bytes = s.len() as u64;
        bencher.iter(|| test::black_box({
            UTF8Encoding.encode(s, EncoderTrap::Strict)
        }))
    }

    #[bench]
    fn bench_decode(bencher: &mut test::Bencher) {
        let s = testutils::KOREAN_TEXT.as_bytes();
        bencher.bytes = s.len() as u64;
        bencher.iter(|| test::black_box({
            UTF8Encoding.decode(s, DecoderTrap::Strict)
        }))
    }

    #[bench]
    fn bench_from_utf8(bencher: &mut test::Bencher) {
        let s = testutils::KOREAN_TEXT.as_bytes();
        bencher.bytes = s.len() as u64;
        bencher.iter(|| test::black_box({
            from_utf8(s)
        }))
    }

    #[bench] // for the comparison
    fn bench_stdlib_from_utf8(bencher: &mut test::Bencher) {
        let s = testutils::KOREAN_TEXT.as_bytes();
        bencher.bytes = s.len() as u64;
        bencher.iter(|| test::black_box({
            str::from_utf8(s)
        }))
    }

    #[bench] // for the comparison
    fn bench_stdlib_from_utf8_lossy(bencher: &mut test::Bencher) {
        let s = testutils::KOREAN_TEXT.as_bytes();
        bencher.bytes = s.len() as u64;
        bencher.iter(|| test::black_box({
            String::from_utf8_lossy(s)
        }))
    }
}
738
// benchmarks over `testutils::INVALID_UTF8_TEXT`: decoding with the replacement trap
// and `from_utf8` of this module, next to the stdlib conversions for the comparison.
mod bench_lossy_invalid {
    extern crate test;
    use super::super::{UTF8Encoding, from_utf8};
    use std::str;
    use testutils;
    use types::*;
    use types::DecoderTrap::Replace as DecodeReplace;

    #[bench]
    fn bench_decode_replace(bencher: &mut test::Bencher) {
        let s = testutils::INVALID_UTF8_TEXT;
        bencher.bytes = s.len() as u64;
        bencher.iter(|| test::black_box({
            UTF8Encoding.decode(s, DecodeReplace)
        }))
    }

    #[bench] // for the comparison
    fn bench_from_utf8_failing(bencher: &mut test::Bencher) {
        let s = testutils::INVALID_UTF8_TEXT;
        bencher.bytes = s.len() as u64;
        bencher.iter(|| test::black_box({
            from_utf8(s)
        }))
    }

    #[bench] // for the comparison
    fn bench_stdlib_from_utf8_failing(bencher: &mut test::Bencher) {
        let s = testutils::INVALID_UTF8_TEXT;
        bencher.bytes = s.len() as u64;
        bencher.iter(|| test::black_box({
            str::from_utf8(s)
        }))
    }

    #[bench] // for the comparison
    fn bench_stdlib_from_utf8_lossy(bencher: &mut test::Bencher) {
        let s = testutils::INVALID_UTF8_TEXT;
        bencher.bytes = s.len() as u64;
        bencher.iter(|| test::black_box({
            String::from_utf8_lossy(s)
        }))
    }
}
783
// the same benchmarks as `bench_lossy_invalid`, over the data returned by
// `testutils::get_external_bench_data` (fetched anew by each benchmark).
mod bench_lossy_external {
    extern crate test;
    use super::super::{UTF8Encoding, from_utf8};
    use std::str;
    use testutils;
    use types::*;
    use types::DecoderTrap::Replace as DecodeReplace;

    #[bench]
    fn bench_decode_replace(bencher: &mut test::Bencher) {
        let s = testutils::get_external_bench_data();
        bencher.bytes = s.len() as u64;
        bencher.iter(|| test::black_box({
            UTF8Encoding.decode(&s, DecodeReplace)
        }))
    }

    #[bench] // for the comparison
    fn bench_from_utf8_failing(bencher: &mut test::Bencher) {
        let s = testutils::get_external_bench_data();
        bencher.bytes = s.len() as u64;
        bencher.iter(|| test::black_box({
            from_utf8(&s)
        }))
    }

    #[bench] // for the comparison
    fn bench_stdlib_from_utf8_failing(bencher: &mut test::Bencher) {
        let s = testutils::get_external_bench_data();
        bencher.bytes = s.len() as u64;
        bencher.iter(|| test::black_box({
            str::from_utf8(&s)
        }))
    }

    #[bench] // for the comparison
    fn bench_stdlib_from_utf8_lossy(bencher: &mut test::Bencher) {
        let s = testutils::get_external_bench_data();
        bencher.bytes = s.len() as u64;
        bencher.iter(|| test::black_box({
            String::from_utf8_lossy(&s)
        }))
    }
}
828 }
829