1 //! A table-driven UTF-8 Parser
2 //!
3 //! This module implements a table-driven UTF-8 parser which should
4 //! theoretically contain the minimal number of branches (1). The only branch is
5 //! on the `Action` returned from unpacking a transition.
6 #![cfg_attr(all(feature = "nightly", test), feature(test))]
7 #![no_std]
8 
9 use core::char;
10 
11 mod types;
12 
13 use types::{Action, State};
14 
/// Handles codepoint and invalid sequence events from the parser.
///
/// A `Receiver` is handed to `Parser::advance`, which invokes one of these
/// callbacks when the byte it was given completes a codepoint or is found to
/// be part of an invalid sequence.
pub trait Receiver {
    /// Called whenever a codepoint is parsed successfully.
    ///
    /// The argument is the decoded character.
    fn codepoint(&mut self, _: char);

    /// Called when an invalid sequence is detected.
    fn invalid_sequence(&mut self);
}
23 
/// A parser for UTF-8 characters.
///
/// Repeatedly call `advance` with bytes; decoded characters and invalid
/// sequences are reported to the `Receiver` passed to that call.
#[derive(Default)]
pub struct Parser {
    // Bits of the code point accumulated so far for the sequence in progress;
    // reset to 0 after a codepoint is emitted or an invalid sequence is reported.
    point: u32,
    // Current state, replaced on every call to `advance`.
    state: State,
}
32 
/// Continuation bytes are masked with this value.
///
/// Keeps the low six bits of the byte and discards the top two.
const CONTINUATION_MASK: u8 = 0b0011_1111;
35 
36 impl Parser {
37     /// Create a new Parser
new() -> Parser38     pub fn new() -> Parser {
39         Parser { point: 0, state: State::Ground }
40     }
41 
42     /// Advance the parser
43     ///
44     /// The provider receiver will be called whenever a codepoint is completed or an invalid
45     /// sequence is detected.
advance<R>(&mut self, receiver: &mut R, byte: u8) where R: Receiver,46     pub fn advance<R>(&mut self, receiver: &mut R, byte: u8)
47     where
48         R: Receiver,
49     {
50         let (state, action) = self.state.advance(byte);
51         self.perform_action(receiver, byte, action);
52         self.state = state;
53     }
54 
perform_action<R>(&mut self, receiver: &mut R, byte: u8, action: Action) where R: Receiver,55     fn perform_action<R>(&mut self, receiver: &mut R, byte: u8, action: Action)
56     where
57         R: Receiver,
58     {
59         match action {
60             Action::InvalidSequence => {
61                 self.point = 0;
62                 receiver.invalid_sequence();
63             },
64             Action::EmitByte => {
65                 receiver.codepoint(byte as char);
66             },
67             Action::SetByte1 => {
68                 let point = self.point | ((byte & CONTINUATION_MASK) as u32);
69                 let c = unsafe { char::from_u32_unchecked(point) };
70                 self.point = 0;
71 
72                 receiver.codepoint(c);
73             },
74             Action::SetByte2 => {
75                 self.point |= ((byte & CONTINUATION_MASK) as u32) << 6;
76             },
77             Action::SetByte2Top => {
78                 self.point |= ((byte & 0b0001_1111) as u32) << 6;
79             },
80             Action::SetByte3 => {
81                 self.point |= ((byte & CONTINUATION_MASK) as u32) << 12;
82             },
83             Action::SetByte3Top => {
84                 self.point |= ((byte & 0b0000_1111) as u32) << 12;
85             },
86             Action::SetByte4 => {
87                 self.point |= ((byte & 0b0000_0111) as u32) << 18;
88             },
89         }
90     }
91 }
92 
#[cfg(all(feature = "nightly", test))]
mod benches {
    extern crate std;
    extern crate test;

    use self::test::{black_box, Bencher};
    use super::{Parser, Receiver};

    // Input shared by both benchmarks, embedded at compile time.
    static UTF8_DEMO: &[u8] = include_bytes!("../tests/UTF-8-demo.txt");

    // Receiver that passes each decoded character to `black_box` and ignores
    // invalid sequences.
    impl Receiver for () {
        fn invalid_sequence(&mut self) {}

        fn codepoint(&mut self, c: char) {
            black_box(c);
        }
    }

    // Feeds the demo input to `Parser` one byte at a time. The same parser
    // instance is reused across benchmark iterations.
    #[bench]
    fn parse_bench_utf8_demo(b: &mut Bencher) {
        let mut parser = Parser::new();

        b.iter(|| {
            for &byte in UTF8_DEMO.iter() {
                parser.advance(&mut (), byte);
            }
        })
    }

    // Baseline for comparison: validate the demo input with the standard
    // library and iterate over its characters.
    #[bench]
    fn std_string_parse_utf8(b: &mut Bencher) {
        b.iter(|| {
            let text = std::str::from_utf8(UTF8_DEMO).unwrap();
            for c in text.chars() {
                black_box(c);
            }
        });
    }
}
132