1 use std::io::{self, BufRead};
2 use std::error::Error;
3 use std::fmt;
4 use std::str;
5 use super::*;
6 
7 /// Wraps a `std::io::BufRead` buffered byte stream and decode it as UTF-8.
8 pub struct BufReadDecoder<B: BufRead> {
9     buf_read: B,
10     bytes_consumed: usize,
11     incomplete: Incomplete,
12 }
13 
/// Error yielded by `BufReadDecoder::next_strict`.
#[derive(Debug)]
pub enum BufReadDecoderError<'a> {
    /// A single UTF-8 error in the byte stream; the payload is the offending bytes.
    ///
    /// Lossy decoding replaces each such error with one U+FFFD.
    /// (See `BufReadDecoder::next_lossy` and `BufReadDecoderError::lossy`.)
    InvalidByteSequence(&'a [u8]),

    /// An I/O error reported by the underlying byte stream.
    Io(io::Error),
}
25 
26 impl<'a> BufReadDecoderError<'a> {
27     /// Replace UTF-8 errors with U+FFFD
lossy(self) -> Result<&'static str, io::Error>28     pub fn lossy(self) -> Result<&'static str, io::Error> {
29         match self {
30             BufReadDecoderError::Io(error) => Err(error),
31             BufReadDecoderError::InvalidByteSequence(_) => Ok(REPLACEMENT_CHARACTER),
32         }
33     }
34 }
35 
36 impl<'a> fmt::Display for BufReadDecoderError<'a> {
fmt(&self, f: &mut fmt::Formatter) -> fmt::Result37     fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
38         match *self {
39             BufReadDecoderError::InvalidByteSequence(bytes) => {
40                 write!(f, "invalid byte sequence: {:02x?}", bytes)
41             }
42             BufReadDecoderError::Io(ref err) => write!(f, "underlying bytestream error: {}", err),
43         }
44     }
45 }
46 
47 impl<'a> Error for BufReadDecoderError<'a> {
source(&self) -> Option<&(dyn Error + 'static)>48     fn source(&self) -> Option<&(dyn Error + 'static)> {
49         match *self {
50             BufReadDecoderError::InvalidByteSequence(_) => None,
51             BufReadDecoderError::Io(ref err) => Some(err),
52         }
53     }
54 }
55 
56 impl<B: BufRead> BufReadDecoder<B> {
57     /// This is to `Read::read_to_string` what `String::from_utf8_lossy` is to `String::from_utf8`.
read_to_string_lossy(buf_read: B) -> io::Result<String>58     pub fn read_to_string_lossy(buf_read: B) -> io::Result<String> {
59         let mut decoder = Self::new(buf_read);
60         let mut string = String::new();
61         while let Some(result) = decoder.next_lossy() {
62             string.push_str(result?)
63         }
64         Ok(string)
65     }
66 
new(buf_read: B) -> Self67     pub fn new(buf_read: B) -> Self {
68         Self {
69             buf_read,
70             bytes_consumed: 0,
71             incomplete: Incomplete::empty(),
72         }
73     }
74 
75     /// Same as `BufReadDecoder::next_strict`, but replace UTF-8 errors with U+FFFD.
next_lossy(&mut self) -> Option<io::Result<&str>>76     pub fn next_lossy(&mut self) -> Option<io::Result<&str>> {
77         self.next_strict().map(|result| result.or_else(|e| e.lossy()))
78     }
79 
80     /// Decode and consume the next chunk of UTF-8 input.
81     ///
82     /// This method is intended to be called repeatedly until it returns `None`,
83     /// which represents EOF from the underlying byte stream.
84     /// This is similar to `Iterator::next`,
85     /// except that decoded chunks borrow the decoder (~iterator)
86     /// so they need to be handled or copied before the next chunk can start decoding.
next_strict(&mut self) -> Option<Result<&str, BufReadDecoderError>>87     pub fn next_strict(&mut self) -> Option<Result<&str, BufReadDecoderError>> {
88         enum BytesSource {
89             BufRead(usize),
90             Incomplete,
91         }
92         macro_rules! try_io {
93             ($io_result: expr) => {
94                 match $io_result {
95                     Ok(value) => value,
96                     Err(error) => return Some(Err(BufReadDecoderError::Io(error)))
97                 }
98             }
99         }
100         let (source, result) = loop {
101             if self.bytes_consumed > 0 {
102                 self.buf_read.consume(self.bytes_consumed);
103                 self.bytes_consumed = 0;
104             }
105             let buf = try_io!(self.buf_read.fill_buf());
106 
107             // Force loop iteration to go through an explicit `continue`
108             enum Unreachable {}
109             let _: Unreachable = if self.incomplete.is_empty() {
110                 if buf.is_empty() {
111                     return None  // EOF
112                 }
113                 match str::from_utf8(buf) {
114                     Ok(_) => {
115                         break (BytesSource::BufRead(buf.len()), Ok(()))
116                     }
117                     Err(error) => {
118                         let valid_up_to = error.valid_up_to();
119                         if valid_up_to > 0 {
120                             break (BytesSource::BufRead(valid_up_to), Ok(()))
121                         }
122                         match error.error_len() {
123                             Some(invalid_sequence_length) => {
124                                 break (BytesSource::BufRead(invalid_sequence_length), Err(()))
125                             }
126                             None => {
127                                 self.bytes_consumed = buf.len();
128                                 self.incomplete = Incomplete::new(buf);
129                                 // need more input bytes
130                                 continue
131                             }
132                         }
133                     }
134                 }
135             } else {
136                 if buf.is_empty() {
137                     break (BytesSource::Incomplete, Err(()))  // EOF with incomplete code point
138                 }
139                 let (consumed, opt_result) = self.incomplete.try_complete_offsets(buf);
140                 self.bytes_consumed = consumed;
141                 match opt_result {
142                     None => {
143                         // need more input bytes
144                         continue
145                     }
146                     Some(result) => {
147                         break (BytesSource::Incomplete, result)
148                     }
149                 }
150             };
151         };
152         let bytes = match source {
153             BytesSource::BufRead(byte_count) => {
154                 self.bytes_consumed = byte_count;
155                 let buf = try_io!(self.buf_read.fill_buf());
156                 &buf[..byte_count]
157             }
158             BytesSource::Incomplete => {
159                 self.incomplete.take_buffer()
160             }
161         };
162         match result {
163             Ok(()) => Some(Ok(unsafe { str::from_utf8_unchecked(bytes) })),
164             Err(()) => Some(Err(BufReadDecoderError::InvalidByteSequence(bytes))),
165         }
166     }
167 }
168