1 //! A table-driven UTF-8 Parser 2 //! 3 //! This module implements a table-driven UTF-8 parser which should 4 //! theoretically contain the minimal number of branches (1). The only branch is 5 //! on the `Action` returned from unpacking a transition. 6 #![cfg_attr(all(feature = "nightly", test), feature(test))] 7 #![no_std] 8 9 use core::char; 10 11 mod types; 12 13 use types::{Action, State}; 14 15 /// Handles codepoint and invalid sequence events from the parser. 16 pub trait Receiver { 17 /// Called whenever a codepoint is parsed successfully codepoint(&mut self, _: char)18 fn codepoint(&mut self, _: char); 19 20 /// Called when an invalid_sequence is detected invalid_sequence(&mut self)21 fn invalid_sequence(&mut self); 22 } 23 24 /// A parser for Utf8 Characters 25 /// 26 /// Repeatedly call `advance` with bytes to emit Utf8 characters 27 #[derive(Default)] 28 pub struct Parser { 29 point: u32, 30 state: State, 31 } 32 33 /// Continuation bytes are masked with this value. 34 const CONTINUATION_MASK: u8 = 0b0011_1111; 35 36 impl Parser { 37 /// Create a new Parser new() -> Parser38 pub fn new() -> Parser { 39 Parser { point: 0, state: State::Ground } 40 } 41 42 /// Advance the parser 43 /// 44 /// The provider receiver will be called whenever a codepoint is completed or an invalid 45 /// sequence is detected. advance<R>(&mut self, receiver: &mut R, byte: u8) where R: Receiver,46 pub fn advance<R>(&mut self, receiver: &mut R, byte: u8) 47 where 48 R: Receiver, 49 { 50 let (state, action) = self.state.advance(byte); 51 self.perform_action(receiver, byte, action); 52 self.state = state; 53 } 54 perform_action<R>(&mut self, receiver: &mut R, byte: u8, action: Action) where R: Receiver,55 fn perform_action<R>(&mut self, receiver: &mut R, byte: u8, action: Action) 56 where 57 R: Receiver, 58 { 59 match action { 60 Action::InvalidSequence => { 61 self.point = 0; 62 receiver.invalid_sequence(); 63 }, 64 Action::EmitByte => { 65 receiver.codepoint(byte as char); 66 }, 67 Action::SetByte1 => { 68 let point = self.point | ((byte & CONTINUATION_MASK) as u32); 69 let c = unsafe { char::from_u32_unchecked(point) }; 70 self.point = 0; 71 72 receiver.codepoint(c); 73 }, 74 Action::SetByte2 => { 75 self.point |= ((byte & CONTINUATION_MASK) as u32) << 6; 76 }, 77 Action::SetByte2Top => { 78 self.point |= ((byte & 0b0001_1111) as u32) << 6; 79 }, 80 Action::SetByte3 => { 81 self.point |= ((byte & CONTINUATION_MASK) as u32) << 12; 82 }, 83 Action::SetByte3Top => { 84 self.point |= ((byte & 0b0000_1111) as u32) << 12; 85 }, 86 Action::SetByte4 => { 87 self.point |= ((byte & 0b0000_0111) as u32) << 18; 88 }, 89 } 90 } 91 } 92 93 #[cfg(all(feature = "nightly", test))] 94 mod benches { 95 extern crate std; 96 extern crate test; 97 98 use super::{Parser, Receiver}; 99 100 use self::test::{black_box, Bencher}; 101 102 static UTF8_DEMO: &[u8] = include_bytes!("../tests/UTF-8-demo.txt"); 103 104 impl Receiver for () { codepoint(&mut self, c: char)105 fn codepoint(&mut self, c: char) { 106 black_box(c); 107 } 108 invalid_sequence(&mut self)109 fn invalid_sequence(&mut self) {} 110 } 111 112 #[bench] parse_bench_utf8_demo(b: &mut Bencher)113 fn parse_bench_utf8_demo(b: &mut Bencher) { 114 let mut parser = Parser::new(); 115 116 b.iter(|| { 117 for byte in UTF8_DEMO { 118 parser.advance(&mut (), *byte); 119 } 120 }) 121 } 122 123 #[bench] std_string_parse_utf8(b: &mut Bencher)124 fn std_string_parse_utf8(b: &mut Bencher) { 125 b.iter(|| { 126 for c in std::str::from_utf8(UTF8_DEMO).unwrap().chars() { 127 black_box(c); 128 } 129 }); 130 } 131 } 132