1 // Copyright 2014-2017 The html5ever Project Developers. See the
2 // COPYRIGHT file at the top-level directory of this distribution.
3 //
4 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
5 // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
6 // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
7 // option. This file may not be copied, modified, or distributed
8 // except according to those terms.
9 
10 //! The `BufferQueue` struct and helper types.
11 //!
12 //! This type is designed for the efficient parsing of string data, especially where many
13 //! significant characters are from the ascii range 0-63. This includes, for example, important
14 //! characters in xml/html parsing.
15 //!
16 //! Good and predictable performance is achieved by avoiding allocation where possible (a.k.a. zero
17 //! copy).
18 //!
19 //! [`BufferQueue`]: struct.BufferQueue.html
20 
21 use std::collections::VecDeque;
22 
23 use tendril::StrTendril;
24 
25 pub use self::SetResult::{FromSet, NotFromSet};
26 use crate::util::smallcharset::SmallCharSet;
27 
28 /// Result from [`pop_except_from`] containing either a character from a [`SmallCharSet`], or a
29 /// string buffer of characters not from the set.
30 ///
31 /// [`pop_except_from`]: struct.BufferQueue.html#method.pop_except_from
32 /// [`SmallCharSet`]: ../struct.SmallCharSet.html
33 #[derive(PartialEq, Eq, Debug)]
34 pub enum SetResult {
35     /// A character from the `SmallCharSet`.
36     FromSet(char),
37     /// A string buffer containing no characters from the `SmallCharSet`.
38     NotFromSet(StrTendril),
39 }
40 
41 /// A queue of owned string buffers, which supports incrementally consuming characters.
42 ///
43 /// Internally it uses [`VecDeque`] and has the same complexity properties.
44 ///
45 /// [`VecDeque`]: https://doc.rust-lang.org/std/collections/struct.VecDeque.html
46 #[derive(Debug)]
47 pub struct BufferQueue {
48     /// Buffers to process.
49     buffers: VecDeque<StrTendril>,
50 }
51 
52 impl BufferQueue {
53     /// Create an empty BufferQueue.
54     #[inline]
new() -> BufferQueue55     pub fn new() -> BufferQueue {
56         BufferQueue {
57             buffers: VecDeque::with_capacity(16),
58         }
59     }
60 
61     /// Returns whether the queue is empty.
62     #[inline]
is_empty(&self) -> bool63     pub fn is_empty(&self) -> bool {
64         self.buffers.is_empty()
65     }
66 
67     /// Get the buffer at the beginning of the queue.
68     #[inline]
pop_front(&mut self) -> Option<StrTendril>69     pub fn pop_front(&mut self) -> Option<StrTendril> {
70         self.buffers.pop_front()
71     }
72 
73     /// Add a buffer to the beginning of the queue.
74     ///
75     /// If the buffer is empty, it will be skipped.
push_front(&mut self, buf: StrTendril)76     pub fn push_front(&mut self, buf: StrTendril) {
77         if buf.len32() == 0 {
78             return;
79         }
80         self.buffers.push_front(buf);
81     }
82 
83     /// Add a buffer to the end of the queue.
84     ///
85     /// If the buffer is empty, it will be skipped.
push_back(&mut self, buf: StrTendril)86     pub fn push_back(&mut self, buf: StrTendril) {
87         if buf.len32() == 0 {
88             return;
89         }
90         self.buffers.push_back(buf);
91     }
92 
93     /// Look at the next available character without removing it, if the queue is not empty.
peek(&self) -> Option<char>94     pub fn peek(&self) -> Option<char> {
95         debug_assert!(
96             self.buffers
97                 .iter()
98                 .find(|el| el.len32() == 0)
99                 .is_none(),
100             "invariant \"all buffers in the queue are non-empty\" failed"
101         );
102         self.buffers.front().map(|b| b.chars().next().unwrap())
103     }
104 
105     /// Get the next character if one is available, removing it from the queue.
106     ///
107     /// This function manages the buffers, removing them as they become empty.
next(&mut self) -> Option<char>108     pub fn next(&mut self) -> Option<char> {
109         let (result, now_empty) = match self.buffers.front_mut() {
110             None => (None, false),
111             Some(buf) => {
112                 let c = buf.pop_front_char().expect("empty buffer in queue");
113                 (Some(c), buf.is_empty())
114             },
115         };
116 
117         if now_empty {
118             self.buffers.pop_front();
119         }
120 
121         result
122     }
123 
124     /// Pops and returns either a single character from the given set, or
125     /// a buffer of characters none of which are in the set.
126     ///
127     /// # Examples
128     ///
129     /// ```
130     /// # #[macro_use] extern crate markup5ever;
131     /// # #[macro_use] extern crate tendril;
132     /// # fn main() {
133     /// use markup5ever::buffer_queue::{BufferQueue, SetResult};
134     ///
135     /// let mut queue = BufferQueue::new();
136     /// queue.push_back(format_tendril!(r#"<some_tag attr="text">SomeText</some_tag>"#));
137     /// let set = small_char_set!(b'<' b'>' b' ' b'=' b'"' b'/');
138     /// let tag = format_tendril!("some_tag");
139     /// let attr = format_tendril!("attr");
140     /// let attr_val = format_tendril!("text");
141     /// assert_eq!(queue.pop_except_from(set), Some(SetResult::FromSet('<')));
142     /// assert_eq!(queue.pop_except_from(set), Some(SetResult::NotFromSet(tag)));
143     /// assert_eq!(queue.pop_except_from(set), Some(SetResult::FromSet(' ')));
144     /// assert_eq!(queue.pop_except_from(set), Some(SetResult::NotFromSet(attr)));
145     /// assert_eq!(queue.pop_except_from(set), Some(SetResult::FromSet('=')));
146     /// assert_eq!(queue.pop_except_from(set), Some(SetResult::FromSet('"')));
147     /// assert_eq!(queue.pop_except_from(set), Some(SetResult::NotFromSet(attr_val)));
148     /// // ...
149     /// # }
150     /// ```
pop_except_from(&mut self, set: SmallCharSet) -> Option<SetResult>151     pub fn pop_except_from(&mut self, set: SmallCharSet) -> Option<SetResult> {
152         let (result, now_empty) = match self.buffers.front_mut() {
153             None => (None, false),
154             Some(buf) => {
155                 let n = set.nonmember_prefix_len(&buf);
156                 if n > 0 {
157                     let out;
158                     unsafe {
159                         out = buf.unsafe_subtendril(0, n);
160                         buf.unsafe_pop_front(n);
161                     }
162                     (Some(NotFromSet(out)), buf.is_empty())
163                 } else {
164                     let c = buf.pop_front_char().expect("empty buffer in queue");
165                     (Some(FromSet(c)), buf.is_empty())
166                 }
167             },
168         };
169 
170         // Unborrow self for this part.
171         if now_empty {
172             self.buffers.pop_front();
173         }
174 
175         result
176     }
177 
178     /// Consume bytes matching the pattern, using a custom comparison function `eq`.
179     ///
180     /// Returns `Some(true)` if there is a match, `Some(false)` if there is no match, or `None` if
181     /// it wasn't possible to know (more data is needed).
182     ///
183     /// The custom comparison function is used elsewhere to compare ascii-case-insensitively.
184     ///
185     /// # Examples
186     ///
187     /// ```
188     /// # extern crate markup5ever;
189     /// # #[macro_use] extern crate tendril;
190     /// # fn main() {
191     /// use markup5ever::buffer_queue::{BufferQueue};
192     ///
193     /// let mut queue = BufferQueue::new();
194     /// queue.push_back(format_tendril!("testtext"));
195     /// let test_str = "test";
196     /// assert_eq!(queue.eat("test", |&a, &b| a == b), Some(true));
197     /// assert_eq!(queue.eat("text", |&a, &b| a == b), Some(true));
198     /// assert!(queue.is_empty());
199     /// # }
200     /// ```
eat<F: Fn(&u8, &u8) -> bool>(&mut self, pat: &str, eq: F) -> Option<bool>201     pub fn eat<F: Fn(&u8, &u8) -> bool>(&mut self, pat: &str, eq: F) -> Option<bool> {
202         let mut buffers_exhausted = 0;
203         let mut consumed_from_last = 0;
204 
205         self.buffers.front()?;
206 
207         for pattern_byte in pat.bytes() {
208             if buffers_exhausted >= self.buffers.len() {
209                 return None;
210             }
211             let buf = &self.buffers[buffers_exhausted];
212 
213             if !eq(&buf.as_bytes()[consumed_from_last], &pattern_byte) {
214                 return Some(false);
215             }
216 
217             consumed_from_last += 1;
218             if consumed_from_last >= buf.len() {
219                 buffers_exhausted += 1;
220                 consumed_from_last = 0;
221             }
222         }
223 
224         // We have a match. Commit changes to the BufferQueue.
225         for _ in 0..buffers_exhausted {
226             self.buffers.pop_front();
227         }
228 
229         match self.buffers.front_mut() {
230             None => assert_eq!(consumed_from_last, 0),
231             Some(ref mut buf) => buf.pop_front(consumed_from_last as u32),
232         }
233 
234         Some(true)
235     }
236 }
237 
238 #[cfg(test)]
239 #[allow(non_snake_case)]
240 mod test {
241     use tendril::SliceExt;
242 
243     use super::BufferQueue;
244     use super::SetResult::{FromSet, NotFromSet};
245 
246     #[test]
smoke_test()247     fn smoke_test() {
248         let mut bq = BufferQueue::new();
249         assert_eq!(bq.peek(), None);
250         assert_eq!(bq.next(), None);
251 
252         bq.push_back("abc".to_tendril());
253         assert_eq!(bq.peek(), Some('a'));
254         assert_eq!(bq.next(), Some('a'));
255         assert_eq!(bq.peek(), Some('b'));
256         assert_eq!(bq.peek(), Some('b'));
257         assert_eq!(bq.next(), Some('b'));
258         assert_eq!(bq.peek(), Some('c'));
259         assert_eq!(bq.next(), Some('c'));
260         assert_eq!(bq.peek(), None);
261         assert_eq!(bq.next(), None);
262     }
263 
264     #[test]
can_unconsume()265     fn can_unconsume() {
266         let mut bq = BufferQueue::new();
267         bq.push_back("abc".to_tendril());
268         assert_eq!(bq.next(), Some('a'));
269 
270         bq.push_front("xy".to_tendril());
271         assert_eq!(bq.next(), Some('x'));
272         assert_eq!(bq.next(), Some('y'));
273         assert_eq!(bq.next(), Some('b'));
274         assert_eq!(bq.next(), Some('c'));
275         assert_eq!(bq.next(), None);
276     }
277 
278     #[test]
can_pop_except_set()279     fn can_pop_except_set() {
280         let mut bq = BufferQueue::new();
281         bq.push_back("abc&def".to_tendril());
282         let mut pop = || bq.pop_except_from(small_char_set!('&'));
283         assert_eq!(pop(), Some(NotFromSet("abc".to_tendril())));
284         assert_eq!(pop(), Some(FromSet('&')));
285         assert_eq!(pop(), Some(NotFromSet("def".to_tendril())));
286         assert_eq!(pop(), None);
287     }
288 
289     #[test]
can_eat()290     fn can_eat() {
291         // This is not very comprehensive.  We rely on the tokenizer
292         // integration tests for more thorough testing with many
293         // different input buffer splits.
294         let mut bq = BufferQueue::new();
295         bq.push_back("a".to_tendril());
296         bq.push_back("bc".to_tendril());
297         assert_eq!(bq.eat("abcd", u8::eq_ignore_ascii_case), None);
298         assert_eq!(bq.eat("ax", u8::eq_ignore_ascii_case), Some(false));
299         assert_eq!(bq.eat("ab", u8::eq_ignore_ascii_case), Some(true));
300         assert_eq!(bq.next(), Some('c'));
301         assert_eq!(bq.next(), None);
302     }
303 }
304