1 use std::cmp; 2 use std::io; 3 use std::ptr; 4 5 /// The default buffer capacity that we use for the stream buffer. 6 const DEFAULT_BUFFER_CAPACITY: usize = 8 * (1 << 10); // 8 KB 7 8 /// A fairly simple roll buffer for supporting stream searches. 9 /// 10 /// This buffer acts as a temporary place to store a fixed amount of data when 11 /// reading from a stream. Its central purpose is to allow "rolling" some 12 /// suffix of the data to the beginning of the buffer before refilling it with 13 /// more data from the stream. For example, let's say we are trying to match 14 /// "foobar" on a stream. When we report the match, we'd like to not only 15 /// report the correct offsets at which the match occurs, but also the matching 16 /// bytes themselves. So let's say our stream is a file with the following 17 /// contents: `test test foobar test test`. Now assume that we happen to read 18 /// the aforementioned file in two chunks: `test test foo` and `bar test test`. 19 /// Naively, it would not be possible to report a single contiguous `foobar` 20 /// match, but this roll buffer allows us to do that. Namely, after the second 21 /// read, the contents of the buffer should be `st foobar test test`, where the 22 /// search should ultimately resume immediately after `foo`. (The prefix `st ` 23 /// is included because the roll buffer saves N bytes at the end of the buffer, 24 /// where N is the maximum possible length of a match.) 25 /// 26 /// A lot of the logic for dealing with this is unfortunately split out between 27 /// this roll buffer and the `StreamChunkIter`. 28 #[derive(Debug)] 29 pub struct Buffer { 30 /// The raw buffer contents. This has a fixed size and never increases. 31 buf: Vec<u8>, 32 /// The minimum size of the buffer, which is equivalent to the maximum 33 /// possible length of a match. This corresponds to the amount that we 34 /// roll 35 min: usize, 36 /// The end of the contents of this buffer. 37 end: usize, 38 } 39 40 impl Buffer { 41 /// Create a new buffer for stream searching. The minimum buffer length 42 /// given should be the size of the maximum possible match length. new(min_buffer_len: usize) -> Buffer43 pub fn new(min_buffer_len: usize) -> Buffer { 44 let min = cmp::max(1, min_buffer_len); 45 // The minimum buffer amount is also the amount that we roll our 46 // buffer in order to support incremental searching. To this end, 47 // our actual capacity needs to be at least 1 byte bigger than our 48 // minimum amount, otherwise we won't have any overlap. In actuality, 49 // we want our buffer to be a bit bigger than that for performance 50 // reasons, so we set a lower bound of `8 * min`. 51 // 52 // TODO: It would be good to find a way to test the streaming 53 // implementation with the minimal buffer size. For now, we just 54 // uncomment out the next line and comment out the subsequent line. 55 // let capacity = 1 + min; 56 let capacity = cmp::max(min * 8, DEFAULT_BUFFER_CAPACITY); 57 Buffer { buf: vec![0; capacity], min, end: 0 } 58 } 59 60 /// Return the contents of this buffer. 61 #[inline] buffer(&self) -> &[u8]62 pub fn buffer(&self) -> &[u8] { 63 &self.buf[..self.end] 64 } 65 66 /// Return the minimum size of the buffer. The only way a buffer may be 67 /// smaller than this is if the stream itself contains less than the 68 /// minimum buffer amount. 69 #[inline] min_buffer_len(&self) -> usize70 pub fn min_buffer_len(&self) -> usize { 71 self.min 72 } 73 74 /// Return the total length of the contents in the buffer. 75 #[inline] len(&self) -> usize76 pub fn len(&self) -> usize { 77 self.end 78 } 79 80 /// Return all free capacity in this buffer. free_buffer(&mut self) -> &mut [u8]81 fn free_buffer(&mut self) -> &mut [u8] { 82 &mut self.buf[self.end..] 83 } 84 85 /// Refill the contents of this buffer by reading as much as possible into 86 /// this buffer's free capacity. If no more bytes could be read, then this 87 /// returns false. Otherwise, this reads until it has filled the buffer 88 /// past the minimum amount. fill<R: io::Read>(&mut self, mut rdr: R) -> io::Result<bool>89 pub fn fill<R: io::Read>(&mut self, mut rdr: R) -> io::Result<bool> { 90 let mut readany = false; 91 loop { 92 let readlen = rdr.read(self.free_buffer())?; 93 if readlen == 0 { 94 return Ok(readany); 95 } 96 readany = true; 97 self.end += readlen; 98 if self.len() >= self.min { 99 return Ok(true); 100 } 101 } 102 } 103 104 /// Roll the contents of the buffer so that the suffix of this buffer is 105 /// moved to the front and all other contents are dropped. The size of the 106 /// suffix corresponds precisely to the minimum buffer length. 107 /// 108 /// This should only be called when the entire contents of this buffer have 109 /// been searched. roll(&mut self)110 pub fn roll(&mut self) { 111 let roll_start = self 112 .end 113 .checked_sub(self.min) 114 .expect("buffer capacity should be bigger than minimum amount"); 115 let roll_len = self.min; 116 117 assert!(roll_start + roll_len <= self.end); 118 unsafe { 119 // SAFETY: A buffer contains Copy data, so there's no problem 120 // moving it around. Safety also depends on our indices being in 121 // bounds, which they always should be, given the assert above. 122 // 123 // TODO: Switch to [T]::copy_within once our MSRV is high enough. 124 ptr::copy( 125 self.buf[roll_start..].as_ptr(), 126 self.buf.as_mut_ptr(), 127 roll_len, 128 ); 129 } 130 self.end = roll_len; 131 } 132 } 133