1 // Copyright 2014-2017 The html5ever Project Developers. See the 2 // COPYRIGHT file at the top-level directory of this distribution. 3 // 4 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or 5 // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license 6 // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your 7 // option. This file may not be copied, modified, or distributed 8 // except according to those terms. 9 10 //! The `BufferQueue` struct and helper types. 11 //! 12 //! This type is designed for the efficient parsing of string data, especially where many 13 //! significant characters are from the ascii range 0-63. This includes, for example, important 14 //! characters in xml/html parsing. 15 //! 16 //! Good and predictable performance is achieved by avoiding allocation where possible (a.k.a. zero 17 //! copy). 18 //! 19 //! [`BufferQueue`]: struct.BufferQueue.html 20 21 use std::collections::VecDeque; 22 23 use tendril::StrTendril; 24 25 pub use self::SetResult::{FromSet, NotFromSet}; 26 use crate::util::smallcharset::SmallCharSet; 27 28 /// Result from [`pop_except_from`] containing either a character from a [`SmallCharSet`], or a 29 /// string buffer of characters not from the set. 30 /// 31 /// [`pop_except_from`]: struct.BufferQueue.html#method.pop_except_from 32 /// [`SmallCharSet`]: ../struct.SmallCharSet.html 33 #[derive(PartialEq, Eq, Debug)] 34 pub enum SetResult { 35 /// A character from the `SmallCharSet`. 36 FromSet(char), 37 /// A string buffer containing no characters from the `SmallCharSet`. 38 NotFromSet(StrTendril), 39 } 40 41 /// A queue of owned string buffers, which supports incrementally consuming characters. 42 /// 43 /// Internally it uses [`VecDeque`] and has the same complexity properties. 44 /// 45 /// [`VecDeque`]: https://doc.rust-lang.org/std/collections/struct.VecDeque.html 46 #[derive(Debug)] 47 pub struct BufferQueue { 48 /// Buffers to process. 49 buffers: VecDeque<StrTendril>, 50 } 51 52 impl BufferQueue { 53 /// Create an empty BufferQueue. 54 #[inline] new() -> BufferQueue55 pub fn new() -> BufferQueue { 56 BufferQueue { 57 buffers: VecDeque::with_capacity(16), 58 } 59 } 60 61 /// Returns whether the queue is empty. 62 #[inline] is_empty(&self) -> bool63 pub fn is_empty(&self) -> bool { 64 self.buffers.is_empty() 65 } 66 67 /// Get the buffer at the beginning of the queue. 68 #[inline] pop_front(&mut self) -> Option<StrTendril>69 pub fn pop_front(&mut self) -> Option<StrTendril> { 70 self.buffers.pop_front() 71 } 72 73 /// Add a buffer to the beginning of the queue. 74 /// 75 /// If the buffer is empty, it will be skipped. push_front(&mut self, buf: StrTendril)76 pub fn push_front(&mut self, buf: StrTendril) { 77 if buf.len32() == 0 { 78 return; 79 } 80 self.buffers.push_front(buf); 81 } 82 83 /// Add a buffer to the end of the queue. 84 /// 85 /// If the buffer is empty, it will be skipped. push_back(&mut self, buf: StrTendril)86 pub fn push_back(&mut self, buf: StrTendril) { 87 if buf.len32() == 0 { 88 return; 89 } 90 self.buffers.push_back(buf); 91 } 92 93 /// Look at the next available character without removing it, if the queue is not empty. peek(&self) -> Option<char>94 pub fn peek(&self) -> Option<char> { 95 debug_assert!( 96 self.buffers 97 .iter() 98 .find(|el| el.len32() == 0) 99 .is_none(), 100 "invariant \"all buffers in the queue are non-empty\" failed" 101 ); 102 self.buffers.front().map(|b| b.chars().next().unwrap()) 103 } 104 105 /// Get the next character if one is available, removing it from the queue. 106 /// 107 /// This function manages the buffers, removing them as they become empty. next(&mut self) -> Option<char>108 pub fn next(&mut self) -> Option<char> { 109 let (result, now_empty) = match self.buffers.front_mut() { 110 None => (None, false), 111 Some(buf) => { 112 let c = buf.pop_front_char().expect("empty buffer in queue"); 113 (Some(c), buf.is_empty()) 114 }, 115 }; 116 117 if now_empty { 118 self.buffers.pop_front(); 119 } 120 121 result 122 } 123 124 /// Pops and returns either a single character from the given set, or 125 /// a buffer of characters none of which are in the set. 126 /// 127 /// # Examples 128 /// 129 /// ``` 130 /// # #[macro_use] extern crate markup5ever; 131 /// # #[macro_use] extern crate tendril; 132 /// # fn main() { 133 /// use markup5ever::buffer_queue::{BufferQueue, SetResult}; 134 /// 135 /// let mut queue = BufferQueue::new(); 136 /// queue.push_back(format_tendril!(r#"<some_tag attr="text">SomeText</some_tag>"#)); 137 /// let set = small_char_set!(b'<' b'>' b' ' b'=' b'"' b'/'); 138 /// let tag = format_tendril!("some_tag"); 139 /// let attr = format_tendril!("attr"); 140 /// let attr_val = format_tendril!("text"); 141 /// assert_eq!(queue.pop_except_from(set), Some(SetResult::FromSet('<'))); 142 /// assert_eq!(queue.pop_except_from(set), Some(SetResult::NotFromSet(tag))); 143 /// assert_eq!(queue.pop_except_from(set), Some(SetResult::FromSet(' '))); 144 /// assert_eq!(queue.pop_except_from(set), Some(SetResult::NotFromSet(attr))); 145 /// assert_eq!(queue.pop_except_from(set), Some(SetResult::FromSet('='))); 146 /// assert_eq!(queue.pop_except_from(set), Some(SetResult::FromSet('"'))); 147 /// assert_eq!(queue.pop_except_from(set), Some(SetResult::NotFromSet(attr_val))); 148 /// // ... 149 /// # } 150 /// ``` pop_except_from(&mut self, set: SmallCharSet) -> Option<SetResult>151 pub fn pop_except_from(&mut self, set: SmallCharSet) -> Option<SetResult> { 152 let (result, now_empty) = match self.buffers.front_mut() { 153 None => (None, false), 154 Some(buf) => { 155 let n = set.nonmember_prefix_len(&buf); 156 if n > 0 { 157 let out; 158 unsafe { 159 out = buf.unsafe_subtendril(0, n); 160 buf.unsafe_pop_front(n); 161 } 162 (Some(NotFromSet(out)), buf.is_empty()) 163 } else { 164 let c = buf.pop_front_char().expect("empty buffer in queue"); 165 (Some(FromSet(c)), buf.is_empty()) 166 } 167 }, 168 }; 169 170 // Unborrow self for this part. 171 if now_empty { 172 self.buffers.pop_front(); 173 } 174 175 result 176 } 177 178 /// Consume bytes matching the pattern, using a custom comparison function `eq`. 179 /// 180 /// Returns `Some(true)` if there is a match, `Some(false)` if there is no match, or `None` if 181 /// it wasn't possible to know (more data is needed). 182 /// 183 /// The custom comparison function is used elsewhere to compare ascii-case-insensitively. 184 /// 185 /// # Examples 186 /// 187 /// ``` 188 /// # extern crate markup5ever; 189 /// # #[macro_use] extern crate tendril; 190 /// # fn main() { 191 /// use markup5ever::buffer_queue::{BufferQueue}; 192 /// 193 /// let mut queue = BufferQueue::new(); 194 /// queue.push_back(format_tendril!("testtext")); 195 /// let test_str = "test"; 196 /// assert_eq!(queue.eat("test", |&a, &b| a == b), Some(true)); 197 /// assert_eq!(queue.eat("text", |&a, &b| a == b), Some(true)); 198 /// assert!(queue.is_empty()); 199 /// # } 200 /// ``` eat<F: Fn(&u8, &u8) -> bool>(&mut self, pat: &str, eq: F) -> Option<bool>201 pub fn eat<F: Fn(&u8, &u8) -> bool>(&mut self, pat: &str, eq: F) -> Option<bool> { 202 let mut buffers_exhausted = 0; 203 let mut consumed_from_last = 0; 204 205 self.buffers.front()?; 206 207 for pattern_byte in pat.bytes() { 208 if buffers_exhausted >= self.buffers.len() { 209 return None; 210 } 211 let buf = &self.buffers[buffers_exhausted]; 212 213 if !eq(&buf.as_bytes()[consumed_from_last], &pattern_byte) { 214 return Some(false); 215 } 216 217 consumed_from_last += 1; 218 if consumed_from_last >= buf.len() { 219 buffers_exhausted += 1; 220 consumed_from_last = 0; 221 } 222 } 223 224 // We have a match. Commit changes to the BufferQueue. 225 for _ in 0..buffers_exhausted { 226 self.buffers.pop_front(); 227 } 228 229 match self.buffers.front_mut() { 230 None => assert_eq!(consumed_from_last, 0), 231 Some(ref mut buf) => buf.pop_front(consumed_from_last as u32), 232 } 233 234 Some(true) 235 } 236 } 237 238 #[cfg(test)] 239 #[allow(non_snake_case)] 240 mod test { 241 use tendril::SliceExt; 242 243 use super::BufferQueue; 244 use super::SetResult::{FromSet, NotFromSet}; 245 246 #[test] smoke_test()247 fn smoke_test() { 248 let mut bq = BufferQueue::new(); 249 assert_eq!(bq.peek(), None); 250 assert_eq!(bq.next(), None); 251 252 bq.push_back("abc".to_tendril()); 253 assert_eq!(bq.peek(), Some('a')); 254 assert_eq!(bq.next(), Some('a')); 255 assert_eq!(bq.peek(), Some('b')); 256 assert_eq!(bq.peek(), Some('b')); 257 assert_eq!(bq.next(), Some('b')); 258 assert_eq!(bq.peek(), Some('c')); 259 assert_eq!(bq.next(), Some('c')); 260 assert_eq!(bq.peek(), None); 261 assert_eq!(bq.next(), None); 262 } 263 264 #[test] can_unconsume()265 fn can_unconsume() { 266 let mut bq = BufferQueue::new(); 267 bq.push_back("abc".to_tendril()); 268 assert_eq!(bq.next(), Some('a')); 269 270 bq.push_front("xy".to_tendril()); 271 assert_eq!(bq.next(), Some('x')); 272 assert_eq!(bq.next(), Some('y')); 273 assert_eq!(bq.next(), Some('b')); 274 assert_eq!(bq.next(), Some('c')); 275 assert_eq!(bq.next(), None); 276 } 277 278 #[test] can_pop_except_set()279 fn can_pop_except_set() { 280 let mut bq = BufferQueue::new(); 281 bq.push_back("abc&def".to_tendril()); 282 let mut pop = || bq.pop_except_from(small_char_set!('&')); 283 assert_eq!(pop(), Some(NotFromSet("abc".to_tendril()))); 284 assert_eq!(pop(), Some(FromSet('&'))); 285 assert_eq!(pop(), Some(NotFromSet("def".to_tendril()))); 286 assert_eq!(pop(), None); 287 } 288 289 #[test] can_eat()290 fn can_eat() { 291 // This is not very comprehensive. We rely on the tokenizer 292 // integration tests for more thorough testing with many 293 // different input buffer splits. 294 let mut bq = BufferQueue::new(); 295 bq.push_back("a".to_tendril()); 296 bq.push_back("bc".to_tendril()); 297 assert_eq!(bq.eat("abcd", u8::eq_ignore_ascii_case), None); 298 assert_eq!(bq.eat("ax", u8::eq_ignore_ascii_case), Some(false)); 299 assert_eq!(bq.eat("ab", u8::eq_ignore_ascii_case), Some(true)); 300 assert_eq!(bq.next(), Some('c')); 301 assert_eq!(bq.next(), None); 302 } 303 } 304