1 use std::cmp;
2 use std::io;
3 use std::ptr;
4 
5 /// The default buffer capacity that we use for the stream buffer.
6 const DEFAULT_BUFFER_CAPACITY: usize = 8 * (1 << 10); // 8 KB
7 
8 /// A fairly simple roll buffer for supporting stream searches.
9 ///
10 /// This buffer acts as a temporary place to store a fixed amount of data when
11 /// reading from a stream. Its central purpose is to allow "rolling" some
12 /// suffix of the data to the beginning of the buffer before refilling it with
13 /// more data from the stream. For example, let's say we are trying to match
14 /// "foobar" on a stream. When we report the match, we'd like to not only
15 /// report the correct offsets at which the match occurs, but also the matching
16 /// bytes themselves. So let's say our stream is a file with the following
17 /// contents: `test test foobar test test`. Now assume that we happen to read
18 /// the aforementioned file in two chunks: `test test foo` and `bar test test`.
19 /// Naively, it would not be possible to report a single contiguous `foobar`
20 /// match, but this roll buffer allows us to do that. Namely, after the second
21 /// read, the contents of the buffer should be `st foobar test test`, where the
22 /// search should ultimately resume immediately after `foo`. (The prefix `st `
23 /// is included because the roll buffer saves N bytes at the end of the buffer,
24 /// where N is the maximum possible length of a match.)
25 ///
26 /// A lot of the logic for dealing with this is unfortunately split out between
27 /// this roll buffer and the `StreamChunkIter`.
28 #[derive(Debug)]
29 pub struct Buffer {
30     /// The raw buffer contents. This has a fixed size and never increases.
31     buf: Vec<u8>,
32     /// The minimum size of the buffer, which is equivalent to the maximum
33     /// possible length of a match. This corresponds to the amount that we
34     /// roll
35     min: usize,
36     /// The end of the contents of this buffer.
37     end: usize,
38 }
39 
40 impl Buffer {
41     /// Create a new buffer for stream searching. The minimum buffer length
42     /// given should be the size of the maximum possible match length.
new(min_buffer_len: usize) -> Buffer43     pub fn new(min_buffer_len: usize) -> Buffer {
44         let min = cmp::max(1, min_buffer_len);
45         // The minimum buffer amount is also the amount that we roll our
46         // buffer in order to support incremental searching. To this end,
47         // our actual capacity needs to be at least 1 byte bigger than our
48         // minimum amount, otherwise we won't have any overlap. In actuality,
49         // we want our buffer to be a bit bigger than that for performance
50         // reasons, so we set a lower bound of `8 * min`.
51         //
52         // TODO: It would be good to find a way to test the streaming
53         // implementation with the minimal buffer size. For now, we just
54         // uncomment out the next line and comment out the subsequent line.
55         // let capacity = 1 + min;
56         let capacity = cmp::max(min * 8, DEFAULT_BUFFER_CAPACITY);
57         Buffer { buf: vec![0; capacity], min, end: 0 }
58     }
59 
60     /// Return the contents of this buffer.
61     #[inline]
buffer(&self) -> &[u8]62     pub fn buffer(&self) -> &[u8] {
63         &self.buf[..self.end]
64     }
65 
66     /// Return the minimum size of the buffer. The only way a buffer may be
67     /// smaller than this is if the stream itself contains less than the
68     /// minimum buffer amount.
69     #[inline]
min_buffer_len(&self) -> usize70     pub fn min_buffer_len(&self) -> usize {
71         self.min
72     }
73 
74     /// Return the total length of the contents in the buffer.
75     #[inline]
len(&self) -> usize76     pub fn len(&self) -> usize {
77         self.end
78     }
79 
80     /// Return all free capacity in this buffer.
free_buffer(&mut self) -> &mut [u8]81     fn free_buffer(&mut self) -> &mut [u8] {
82         &mut self.buf[self.end..]
83     }
84 
85     /// Refill the contents of this buffer by reading as much as possible into
86     /// this buffer's free capacity. If no more bytes could be read, then this
87     /// returns false. Otherwise, this reads until it has filled the buffer
88     /// past the minimum amount.
fill<R: io::Read>(&mut self, mut rdr: R) -> io::Result<bool>89     pub fn fill<R: io::Read>(&mut self, mut rdr: R) -> io::Result<bool> {
90         let mut readany = false;
91         loop {
92             let readlen = rdr.read(self.free_buffer())?;
93             if readlen == 0 {
94                 return Ok(readany);
95             }
96             readany = true;
97             self.end += readlen;
98             if self.len() >= self.min {
99                 return Ok(true);
100             }
101         }
102     }
103 
104     /// Roll the contents of the buffer so that the suffix of this buffer is
105     /// moved to the front and all other contents are dropped. The size of the
106     /// suffix corresponds precisely to the minimum buffer length.
107     ///
108     /// This should only be called when the entire contents of this buffer have
109     /// been searched.
roll(&mut self)110     pub fn roll(&mut self) {
111         let roll_start = self
112             .end
113             .checked_sub(self.min)
114             .expect("buffer capacity should be bigger than minimum amount");
115         let roll_len = self.min;
116 
117         assert!(roll_start + roll_len <= self.end);
118         unsafe {
119             // SAFETY: A buffer contains Copy data, so there's no problem
120             // moving it around. Safety also depends on our indices being in
121             // bounds, which they always should be, given the assert above.
122             //
123             // TODO: Switch to [T]::copy_within once our MSRV is high enough.
124             ptr::copy(
125                 self.buf[roll_start..].as_ptr(),
126                 self.buf.as_mut_ptr(),
127                 roll_len,
128             );
129         }
130         self.end = roll_len;
131     }
132 }
133