1 use std::io::{self, BufRead}; 2 use std::error::Error; 3 use std::fmt; 4 use std::str; 5 use super::*; 6 7 /// Wraps a `std::io::BufRead` buffered byte stream and decode it as UTF-8. 8 pub struct BufReadDecoder<B: BufRead> { 9 buf_read: B, 10 bytes_consumed: usize, 11 incomplete: Incomplete, 12 } 13 14 #[derive(Debug)] 15 pub enum BufReadDecoderError<'a> { 16 /// Represents one UTF-8 error in the byte stream. 17 /// 18 /// In lossy decoding, each such error should be replaced with U+FFFD. 19 /// (See `BufReadDecoder::next_lossy` and `BufReadDecoderError::lossy`.) 20 InvalidByteSequence(&'a [u8]), 21 22 /// An I/O error from the underlying byte stream 23 Io(io::Error), 24 } 25 26 impl<'a> BufReadDecoderError<'a> { 27 /// Replace UTF-8 errors with U+FFFD lossy(self) -> Result<&'static str, io::Error>28 pub fn lossy(self) -> Result<&'static str, io::Error> { 29 match self { 30 BufReadDecoderError::Io(error) => Err(error), 31 BufReadDecoderError::InvalidByteSequence(_) => Ok(REPLACEMENT_CHARACTER), 32 } 33 } 34 } 35 36 impl<'a> fmt::Display for BufReadDecoderError<'a> { fmt(&self, f: &mut fmt::Formatter) -> fmt::Result37 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { 38 match *self { 39 BufReadDecoderError::InvalidByteSequence(bytes) => { 40 write!(f, "invalid byte sequence: {:02x?}", bytes) 41 } 42 BufReadDecoderError::Io(ref err) => write!(f, "underlying bytestream error: {}", err), 43 } 44 } 45 } 46 47 impl<'a> Error for BufReadDecoderError<'a> { source(&self) -> Option<&(dyn Error + 'static)>48 fn source(&self) -> Option<&(dyn Error + 'static)> { 49 match *self { 50 BufReadDecoderError::InvalidByteSequence(_) => None, 51 BufReadDecoderError::Io(ref err) => Some(err), 52 } 53 } 54 } 55 56 impl<B: BufRead> BufReadDecoder<B> { 57 /// This is to `Read::read_to_string` what `String::from_utf8_lossy` is to `String::from_utf8`. read_to_string_lossy(buf_read: B) -> io::Result<String>58 pub fn read_to_string_lossy(buf_read: B) -> io::Result<String> { 59 let mut decoder = Self::new(buf_read); 60 let mut string = String::new(); 61 while let Some(result) = decoder.next_lossy() { 62 string.push_str(result?) 63 } 64 Ok(string) 65 } 66 new(buf_read: B) -> Self67 pub fn new(buf_read: B) -> Self { 68 Self { 69 buf_read, 70 bytes_consumed: 0, 71 incomplete: Incomplete::empty(), 72 } 73 } 74 75 /// Same as `BufReadDecoder::next_strict`, but replace UTF-8 errors with U+FFFD. next_lossy(&mut self) -> Option<io::Result<&str>>76 pub fn next_lossy(&mut self) -> Option<io::Result<&str>> { 77 self.next_strict().map(|result| result.or_else(|e| e.lossy())) 78 } 79 80 /// Decode and consume the next chunk of UTF-8 input. 81 /// 82 /// This method is intended to be called repeatedly until it returns `None`, 83 /// which represents EOF from the underlying byte stream. 84 /// This is similar to `Iterator::next`, 85 /// except that decoded chunks borrow the decoder (~iterator) 86 /// so they need to be handled or copied before the next chunk can start decoding. next_strict(&mut self) -> Option<Result<&str, BufReadDecoderError>>87 pub fn next_strict(&mut self) -> Option<Result<&str, BufReadDecoderError>> { 88 enum BytesSource { 89 BufRead(usize), 90 Incomplete, 91 } 92 macro_rules! try_io { 93 ($io_result: expr) => { 94 match $io_result { 95 Ok(value) => value, 96 Err(error) => return Some(Err(BufReadDecoderError::Io(error))) 97 } 98 } 99 } 100 let (source, result) = loop { 101 if self.bytes_consumed > 0 { 102 self.buf_read.consume(self.bytes_consumed); 103 self.bytes_consumed = 0; 104 } 105 let buf = try_io!(self.buf_read.fill_buf()); 106 107 // Force loop iteration to go through an explicit `continue` 108 enum Unreachable {} 109 let _: Unreachable = if self.incomplete.is_empty() { 110 if buf.is_empty() { 111 return None // EOF 112 } 113 match str::from_utf8(buf) { 114 Ok(_) => { 115 break (BytesSource::BufRead(buf.len()), Ok(())) 116 } 117 Err(error) => { 118 let valid_up_to = error.valid_up_to(); 119 if valid_up_to > 0 { 120 break (BytesSource::BufRead(valid_up_to), Ok(())) 121 } 122 match error.error_len() { 123 Some(invalid_sequence_length) => { 124 break (BytesSource::BufRead(invalid_sequence_length), Err(())) 125 } 126 None => { 127 self.bytes_consumed = buf.len(); 128 self.incomplete = Incomplete::new(buf); 129 // need more input bytes 130 continue 131 } 132 } 133 } 134 } 135 } else { 136 if buf.is_empty() { 137 break (BytesSource::Incomplete, Err(())) // EOF with incomplete code point 138 } 139 let (consumed, opt_result) = self.incomplete.try_complete_offsets(buf); 140 self.bytes_consumed = consumed; 141 match opt_result { 142 None => { 143 // need more input bytes 144 continue 145 } 146 Some(result) => { 147 break (BytesSource::Incomplete, result) 148 } 149 } 150 }; 151 }; 152 let bytes = match source { 153 BytesSource::BufRead(byte_count) => { 154 self.bytes_consumed = byte_count; 155 let buf = try_io!(self.buf_read.fill_buf()); 156 &buf[..byte_count] 157 } 158 BytesSource::Incomplete => { 159 self.incomplete.take_buffer() 160 } 161 }; 162 match result { 163 Ok(()) => Some(Ok(unsafe { str::from_utf8_unchecked(bytes) })), 164 Err(()) => Some(Err(BufReadDecoderError::InvalidByteSequence(bytes))), 165 } 166 } 167 } 168