1 /*!
2 Utilities for working with I/O using byte strings.
3 
4 This module currently only exports a single trait, `BufReadExt`, which provides
5 facilities for conveniently and efficiently working with lines as byte strings.
6 
7 More APIs may be added in the future.
8 */
9 
10 use std::io;
11 
12 use crate::ext_slice::ByteSlice;
13 use crate::ext_vec::ByteVec;
14 
15 /// An extention trait for
16 /// [`std::io::BufRead`](https://doc.rust-lang.org/std/io/trait.BufRead.html)
17 /// which provides convenience APIs for dealing with byte strings.
18 pub trait BufReadExt: io::BufRead {
19     /// Returns an iterator over the lines of this reader, where each line
20     /// is represented as a byte string.
21     ///
22     /// Each item yielded by this iterator is a `io::Result<Vec<u8>>`, where
23     /// an error is yielded if there was a problem reading from the underlying
24     /// reader.
25     ///
26     /// On success, the next line in the iterator is returned. The line does
27     /// *not* contain a trailing `\n` or `\r\n`.
28     ///
29     /// # Examples
30     ///
31     /// Basic usage:
32     ///
33     /// ```
34     /// use std::io;
35     ///
36     /// use bstr::io::BufReadExt;
37     ///
38     /// # fn example() -> Result<(), io::Error> {
39     /// let cursor = io::Cursor::new(b"lorem\nipsum\r\ndolor");
40     ///
41     /// let mut lines = vec![];
42     /// for result in cursor.byte_lines() {
43     ///     let line = result?;
44     ///     lines.push(line);
45     /// }
46     /// assert_eq!(lines.len(), 3);
47     /// assert_eq!(lines[0], "lorem".as_bytes());
48     /// assert_eq!(lines[1], "ipsum".as_bytes());
49     /// assert_eq!(lines[2], "dolor".as_bytes());
50     /// # Ok(()) }; example().unwrap()
51     /// ```
byte_lines(self) -> ByteLines<Self> where Self: Sized,52     fn byte_lines(self) -> ByteLines<Self>
53     where
54         Self: Sized,
55     {
56         ByteLines { buf: self }
57     }
58 
59     /// Returns an iterator over byte-terminated records of this reader, where
60     /// each record is represented as a byte string.
61     ///
62     /// Each item yielded by this iterator is a `io::Result<Vec<u8>>`, where
63     /// an error is yielded if there was a problem reading from the underlying
64     /// reader.
65     ///
66     /// On success, the next record in the iterator is returned. The record
67     /// does *not* contain its trailing terminator.
68     ///
69     /// Note that calling `byte_records(b'\n')` differs from `byte_lines()` in
70     /// that it has no special handling for `\r`.
71     ///
72     /// # Examples
73     ///
74     /// Basic usage:
75     ///
76     /// ```
77     /// use std::io;
78     ///
79     /// use bstr::io::BufReadExt;
80     ///
81     /// # fn example() -> Result<(), io::Error> {
82     /// let cursor = io::Cursor::new(b"lorem\x00ipsum\x00dolor");
83     ///
84     /// let mut records = vec![];
85     /// for result in cursor.byte_records(b'\x00') {
86     ///     let record = result?;
87     ///     records.push(record);
88     /// }
89     /// assert_eq!(records.len(), 3);
90     /// assert_eq!(records[0], "lorem".as_bytes());
91     /// assert_eq!(records[1], "ipsum".as_bytes());
92     /// assert_eq!(records[2], "dolor".as_bytes());
93     /// # Ok(()) }; example().unwrap()
94     /// ```
byte_records(self, terminator: u8) -> ByteRecords<Self> where Self: Sized,95     fn byte_records(self, terminator: u8) -> ByteRecords<Self>
96     where
97         Self: Sized,
98     {
99         ByteRecords { terminator, buf: self }
100     }
101 
102     /// Executes the given closure on each line in the underlying reader.
103     ///
104     /// If the closure returns an error (or if the underlying reader returns an
105     /// error), then iteration is stopped and the error is returned. If false
106     /// is returned, then iteration is stopped and no error is returned.
107     ///
108     /// The closure given is called on exactly the same values as yielded by
109     /// the [`byte_lines`](trait.BufReadExt.html#method.byte_lines)
110     /// iterator. Namely, lines do _not_ contain trailing `\n` or `\r\n` bytes.
111     ///
112     /// This routine is useful for iterating over lines as quickly as
113     /// possible. Namely, a single allocation is reused for each line.
114     ///
115     /// # Examples
116     ///
117     /// Basic usage:
118     ///
119     /// ```
120     /// use std::io;
121     ///
122     /// use bstr::io::BufReadExt;
123     ///
124     /// # fn example() -> Result<(), io::Error> {
125     /// let cursor = io::Cursor::new(b"lorem\nipsum\r\ndolor");
126     ///
127     /// let mut lines = vec![];
128     /// cursor.for_byte_line(|line| {
129     ///     lines.push(line.to_vec());
130     ///     Ok(true)
131     /// })?;
132     /// assert_eq!(lines.len(), 3);
133     /// assert_eq!(lines[0], "lorem".as_bytes());
134     /// assert_eq!(lines[1], "ipsum".as_bytes());
135     /// assert_eq!(lines[2], "dolor".as_bytes());
136     /// # Ok(()) }; example().unwrap()
137     /// ```
for_byte_line<F>(self, mut for_each_line: F) -> io::Result<()> where Self: Sized, F: FnMut(&[u8]) -> io::Result<bool>,138     fn for_byte_line<F>(self, mut for_each_line: F) -> io::Result<()>
139     where
140         Self: Sized,
141         F: FnMut(&[u8]) -> io::Result<bool>,
142     {
143         self.for_byte_line_with_terminator(|line| {
144             for_each_line(&trim_line_slice(&line))
145         })
146     }
147 
148     /// Executes the given closure on each byte-terminated record in the
149     /// underlying reader.
150     ///
151     /// If the closure returns an error (or if the underlying reader returns an
152     /// error), then iteration is stopped and the error is returned. If false
153     /// is returned, then iteration is stopped and no error is returned.
154     ///
155     /// The closure given is called on exactly the same values as yielded by
156     /// the [`byte_records`](trait.BufReadExt.html#method.byte_records)
157     /// iterator. Namely, records do _not_ contain a trailing terminator byte.
158     ///
159     /// This routine is useful for iterating over records as quickly as
160     /// possible. Namely, a single allocation is reused for each record.
161     ///
162     /// # Examples
163     ///
164     /// Basic usage:
165     ///
166     /// ```
167     /// use std::io;
168     ///
169     /// use bstr::io::BufReadExt;
170     ///
171     /// # fn example() -> Result<(), io::Error> {
172     /// let cursor = io::Cursor::new(b"lorem\x00ipsum\x00dolor");
173     ///
174     /// let mut records = vec![];
175     /// cursor.for_byte_record(b'\x00', |record| {
176     ///     records.push(record.to_vec());
177     ///     Ok(true)
178     /// })?;
179     /// assert_eq!(records.len(), 3);
180     /// assert_eq!(records[0], "lorem".as_bytes());
181     /// assert_eq!(records[1], "ipsum".as_bytes());
182     /// assert_eq!(records[2], "dolor".as_bytes());
183     /// # Ok(()) }; example().unwrap()
184     /// ```
for_byte_record<F>( self, terminator: u8, mut for_each_record: F, ) -> io::Result<()> where Self: Sized, F: FnMut(&[u8]) -> io::Result<bool>,185     fn for_byte_record<F>(
186         self,
187         terminator: u8,
188         mut for_each_record: F,
189     ) -> io::Result<()>
190     where
191         Self: Sized,
192         F: FnMut(&[u8]) -> io::Result<bool>,
193     {
194         self.for_byte_record_with_terminator(terminator, |chunk| {
195             for_each_record(&trim_record_slice(&chunk, terminator))
196         })
197     }
198 
199     /// Executes the given closure on each line in the underlying reader.
200     ///
201     /// If the closure returns an error (or if the underlying reader returns an
202     /// error), then iteration is stopped and the error is returned. If false
203     /// is returned, then iteration is stopped and no error is returned.
204     ///
205     /// Unlike
206     /// [`for_byte_line`](trait.BufReadExt.html#method.for_byte_line),
207     /// the lines given to the closure *do* include the line terminator, if one
208     /// exists.
209     ///
210     /// This routine is useful for iterating over lines as quickly as
211     /// possible. Namely, a single allocation is reused for each line.
212     ///
213     /// This is identical to `for_byte_record_with_terminator` with a
214     /// terminator of `\n`.
215     ///
216     /// # Examples
217     ///
218     /// Basic usage:
219     ///
220     /// ```
221     /// use std::io;
222     ///
223     /// use bstr::io::BufReadExt;
224     ///
225     /// # fn example() -> Result<(), io::Error> {
226     /// let cursor = io::Cursor::new(b"lorem\nipsum\r\ndolor");
227     ///
228     /// let mut lines = vec![];
229     /// cursor.for_byte_line_with_terminator(|line| {
230     ///     lines.push(line.to_vec());
231     ///     Ok(true)
232     /// })?;
233     /// assert_eq!(lines.len(), 3);
234     /// assert_eq!(lines[0], "lorem\n".as_bytes());
235     /// assert_eq!(lines[1], "ipsum\r\n".as_bytes());
236     /// assert_eq!(lines[2], "dolor".as_bytes());
237     /// # Ok(()) }; example().unwrap()
238     /// ```
for_byte_line_with_terminator<F>( self, for_each_line: F, ) -> io::Result<()> where Self: Sized, F: FnMut(&[u8]) -> io::Result<bool>,239     fn for_byte_line_with_terminator<F>(
240         self,
241         for_each_line: F,
242     ) -> io::Result<()>
243     where
244         Self: Sized,
245         F: FnMut(&[u8]) -> io::Result<bool>,
246     {
247         self.for_byte_record_with_terminator(b'\n', for_each_line)
248     }
249 
250     /// Executes the given closure on each byte-terminated record in the
251     /// underlying reader.
252     ///
253     /// If the closure returns an error (or if the underlying reader returns an
254     /// error), then iteration is stopped and the error is returned. If false
255     /// is returned, then iteration is stopped and no error is returned.
256     ///
257     /// Unlike
258     /// [`for_byte_record`](trait.BufReadExt.html#method.for_byte_record),
259     /// the lines given to the closure *do* include the record terminator, if
260     /// one exists.
261     ///
262     /// This routine is useful for iterating over records as quickly as
263     /// possible. Namely, a single allocation is reused for each record.
264     ///
265     /// # Examples
266     ///
267     /// Basic usage:
268     ///
269     /// ```
270     /// use std::io;
271     ///
272     /// use bstr::B;
273     /// use bstr::io::BufReadExt;
274     ///
275     /// # fn example() -> Result<(), io::Error> {
276     /// let cursor = io::Cursor::new(b"lorem\x00ipsum\x00dolor");
277     ///
278     /// let mut records = vec![];
279     /// cursor.for_byte_record_with_terminator(b'\x00', |record| {
280     ///     records.push(record.to_vec());
281     ///     Ok(true)
282     /// })?;
283     /// assert_eq!(records.len(), 3);
284     /// assert_eq!(records[0], B(b"lorem\x00"));
285     /// assert_eq!(records[1], B("ipsum\x00"));
286     /// assert_eq!(records[2], B("dolor"));
287     /// # Ok(()) }; example().unwrap()
288     /// ```
for_byte_record_with_terminator<F>( mut self, terminator: u8, mut for_each_record: F, ) -> io::Result<()> where Self: Sized, F: FnMut(&[u8]) -> io::Result<bool>,289     fn for_byte_record_with_terminator<F>(
290         mut self,
291         terminator: u8,
292         mut for_each_record: F,
293     ) -> io::Result<()>
294     where
295         Self: Sized,
296         F: FnMut(&[u8]) -> io::Result<bool>,
297     {
298         let mut bytes = vec![];
299         let mut res = Ok(());
300         let mut consumed = 0;
301         'outer: loop {
302             // Lend out complete record slices from our buffer
303             {
304                 let mut buf = self.fill_buf()?;
305                 while let Some(index) = buf.find_byte(terminator) {
306                     let (record, rest) = buf.split_at(index + 1);
307                     buf = rest;
308                     consumed += record.len();
309                     match for_each_record(&record) {
310                         Ok(false) => break 'outer,
311                         Err(err) => {
312                             res = Err(err);
313                             break 'outer;
314                         }
315                         _ => (),
316                     }
317                 }
318 
319                 // Copy the final record fragment to our local buffer. This
320                 // saves read_until() from re-scanning a buffer we know
321                 // contains no remaining terminators.
322                 bytes.extend_from_slice(&buf);
323                 consumed += buf.len();
324             }
325 
326             self.consume(consumed);
327             consumed = 0;
328 
329             // N.B. read_until uses a different version of memchr that may
330             // be slower than the memchr crate that bstr uses. However, this
331             // should only run for a fairly small number of records, assuming a
332             // decent buffer size.
333             self.read_until(terminator, &mut bytes)?;
334             if bytes.is_empty() || !for_each_record(&bytes)? {
335                 break;
336             }
337             bytes.clear();
338         }
339         self.consume(consumed);
340         res
341     }
342 }
343 
344 impl<B: io::BufRead> BufReadExt for B {}
345 
/// A line-by-line iterator over a
/// [`std::io::BufRead`](https://doc.rust-lang.org/std/io/trait.BufRead.html)
/// reader.
///
/// Values of this type are normally obtained from the
/// [`byte_lines`](trait.BufReadExt.html#method.byte_lines)
/// method of the
/// [`BufReadExt`](trait.BufReadExt.html)
/// trait rather than constructed directly.
#[derive(Debug)]
pub struct ByteLines<B> {
    /// The wrapped reader that lines are pulled from.
    buf: B,
}
358 
/// A record-by-record iterator over a
/// [`std::io::BufRead`](https://doc.rust-lang.org/std/io/trait.BufRead.html)
/// reader.
///
/// A byte record is a run of bytes that ends with a terminator byte picked
/// by the caller. NUL separated byte strings, for instance, are NUL-terminated
/// byte records.
///
/// Values of this type are normally obtained from the
/// [`byte_records`](trait.BufReadExt.html#method.byte_records)
/// method of the
/// [`BufReadExt`](trait.BufReadExt.html)
/// trait rather than constructed directly.
#[derive(Debug)]
pub struct ByteRecords<B> {
    /// The wrapped reader that records are pulled from.
    buf: B,
    /// The byte that marks the end of each record.
    terminator: u8,
}
376 
377 impl<B: io::BufRead> Iterator for ByteLines<B> {
378     type Item = io::Result<Vec<u8>>;
379 
next(&mut self) -> Option<io::Result<Vec<u8>>>380     fn next(&mut self) -> Option<io::Result<Vec<u8>>> {
381         let mut bytes = vec![];
382         match self.buf.read_until(b'\n', &mut bytes) {
383             Err(e) => Some(Err(e)),
384             Ok(0) => None,
385             Ok(_) => {
386                 trim_line(&mut bytes);
387                 Some(Ok(bytes))
388             }
389         }
390     }
391 }
392 
393 impl<B: io::BufRead> Iterator for ByteRecords<B> {
394     type Item = io::Result<Vec<u8>>;
395 
next(&mut self) -> Option<io::Result<Vec<u8>>>396     fn next(&mut self) -> Option<io::Result<Vec<u8>>> {
397         let mut bytes = vec![];
398         match self.buf.read_until(self.terminator, &mut bytes) {
399             Err(e) => Some(Err(e)),
400             Ok(0) => None,
401             Ok(_) => {
402                 trim_record(&mut bytes, self.terminator);
403                 Some(Ok(bytes))
404             }
405         }
406     }
407 }
408 
/// Strips one trailing line terminator from `line` in place.
///
/// A final `\n` is removed, together with a `\r` that sits directly before
/// it. Input that does not end in `\n` is left untouched, so a lone trailing
/// `\r` is kept.
fn trim_line(line: &mut Vec<u8>) {
    // Check the two-byte terminator first so `\r\n` is dropped as a unit.
    let terminator_len = if line.ends_with(b"\r\n") {
        2
    } else if line.ends_with(b"\n") {
        1
    } else {
        0
    };
    let kept = line.len() - terminator_len;
    line.truncate(kept);
}
417 
/// Returns `line` without one trailing line terminator.
///
/// A final `\n` is dropped, together with a `\r` that sits directly before
/// it. Input that does not end in `\n` is returned unchanged, so a lone
/// trailing `\r` is kept.
fn trim_line_slice(line: &[u8]) -> &[u8] {
    match line.split_last() {
        Some((&b'\n', body)) => match body.split_last() {
            Some((&b'\r', rest)) => rest,
            _ => body,
        },
        _ => line,
    }
}
427 
trim_record(record: &mut Vec<u8>, terminator: u8)428 fn trim_record(record: &mut Vec<u8>, terminator: u8) {
429     if record.last_byte() == Some(terminator) {
430         record.pop_byte();
431     }
432 }
433 
/// Returns `record` without a single trailing `terminator` byte.
///
/// A record that does not end with `terminator` (including an empty one) is
/// returned unchanged, and at most one byte is ever dropped.
fn trim_record_slice(record: &[u8], terminator: u8) -> &[u8] {
    match record.split_last() {
        Some((&last, body)) if last == terminator => body,
        _ => record,
    }
}
440 
#[cfg(test)]
mod tests {
    use super::BufReadExt;
    use crate::bstring::BString;

    /// Feeds `slice` through `for_byte_line` and gathers every line passed
    /// to the closure as an owned `BString`.
    fn collect_lines<B: AsRef<[u8]>>(slice: B) -> Vec<BString> {
        let mut collected = Vec::new();
        let outcome = slice.as_ref().for_byte_line(|line| {
            collected.push(BString::from(line.to_vec()));
            Ok(true)
        });
        outcome.unwrap();
        collected
    }

    /// Feeds `slice` through `for_byte_line_with_terminator` and gathers
    /// every line passed to the closure as an owned `BString`.
    fn collect_lines_term<B: AsRef<[u8]>>(slice: B) -> Vec<BString> {
        let mut collected = Vec::new();
        let outcome = slice.as_ref().for_byte_line_with_terminator(|line| {
            collected.push(BString::from(line.to_vec()));
            Ok(true)
        });
        outcome.unwrap();
        collected
    }

    /// Asserts that `for_byte_line` splits `input` into exactly `expected`.
    fn expect_lines(input: &str, expected: &[&str]) {
        assert_eq!(
            collect_lines(input),
            expected.to_vec(),
            "input: {:?}",
            input
        );
    }

    /// Asserts that `for_byte_line_with_terminator` splits `input` into
    /// exactly `expected`.
    fn expect_lines_term(input: &str, expected: &[&str]) {
        assert_eq!(
            collect_lines_term(input),
            expected.to_vec(),
            "input: {:?}",
            input
        );
    }

    #[test]
    fn lines_without_terminator() {
        // Empty input yields no lines at all.
        expect_lines("", &[]);

        // `\n` terminated input, with and without a final terminator.
        expect_lines("\n", &[""]);
        expect_lines("\n\n", &["", ""]);
        expect_lines("a\nb\n", &["a", "b"]);
        expect_lines("a\nb", &["a", "b"]);
        expect_lines("abc\nxyz\n", &["abc", "xyz"]);
        expect_lines("abc\nxyz", &["abc", "xyz"]);

        // `\r\n` terminated input, with and without a final terminator.
        expect_lines("\r\n", &[""]);
        expect_lines("\r\n\r\n", &["", ""]);
        expect_lines("a\r\nb\r\n", &["a", "b"]);
        expect_lines("a\r\nb", &["a", "b"]);
        expect_lines("abc\r\nxyz\r\n", &["abc", "xyz"]);
        expect_lines("abc\r\nxyz", &["abc", "xyz"]);

        // A bare `\r` is not a line terminator.
        expect_lines("abc\rxyz", &["abc\rxyz"]);
    }

    #[test]
    fn lines_with_terminator() {
        // Empty input yields no lines at all.
        expect_lines_term("", &[]);

        // `\n` terminated input, with and without a final terminator.
        expect_lines_term("\n", &["\n"]);
        expect_lines_term("\n\n", &["\n", "\n"]);
        expect_lines_term("a\nb\n", &["a\n", "b\n"]);
        expect_lines_term("a\nb", &["a\n", "b"]);
        expect_lines_term("abc\nxyz\n", &["abc\n", "xyz\n"]);
        expect_lines_term("abc\nxyz", &["abc\n", "xyz"]);

        // `\r\n` terminated input, with and without a final terminator.
        expect_lines_term("\r\n", &["\r\n"]);
        expect_lines_term("\r\n\r\n", &["\r\n", "\r\n"]);
        expect_lines_term("a\r\nb\r\n", &["a\r\n", "b\r\n"]);
        expect_lines_term("a\r\nb", &["a\r\n", "b"]);
        expect_lines_term("abc\r\nxyz\r\n", &["abc\r\n", "xyz\r\n"]);
        expect_lines_term("abc\r\nxyz", &["abc\r\n", "xyz"]);

        // A bare `\r` is not a line terminator.
        expect_lines_term("abc\rxyz", &["abc\rxyz"]);
    }
}
515