1 use std::cmp;
2 use std::io;
3 
4 use encoding_rs::{CoderResult, Decoder, Encoding};
5 
/// Size, in bytes, of the scratch buffer held inside a `TinyTranscoder`.
///
/// Per the original author's note, this is the minimum amount of space that
/// a decoder-to-utf8-with-replacement will use for any state and any input,
/// so a buffer of this size can always accept the output of one decode call.
const TINY_BUFFER_SIZE: usize = 7;
9 
/// A tiny transcoder performs transcoding incrementally even when a caller
/// provided buffer is not large enough.
///
/// This use case comes up when implementing streaming transcoding in cases
/// where it is permissible to provide incomplete UTF-8 sequences to the
/// caller (e.g., when decoding into a `&mut [u8]` where the caller must be
/// capable of handling invalid UTF-8). In particular, this type specifically
/// handles cases where a caller provided buffer is too small to store a full
/// UTF-8 sequence. Thus, this type should be used in cases where the caller
/// provided buffer has length 3 or fewer.
///
/// Usage alternates between two steps: `transcode` decodes a little input
/// into the internal buffer, and the `io::Read` implementation then hands
/// those bytes out in pieces as small as the caller likes.
///
/// This could likely be done with better performance by allocating a larger
/// buffer for these cases, but we instead opt to handle this without
/// allocation under the assumption that tiny caller provided buffers are
/// probably a pathological case.
#[derive(Clone, Debug)]
pub struct TinyTranscoder {
    /// This is where we store the results of a transcoding. Since we are
    /// always decoding to UTF-8, `TINY_BUFFER_SIZE` (7) bytes is sufficient
    /// to represent any codepoint.
    partial: [u8; TINY_BUFFER_SIZE],
    /// The number of bytes written in `partial` by the most recent call to
    /// `transcode`.
    len: usize,
    /// The position in `partial` at which the next byte should be read.
    /// The unread bytes are `partial[pos..len]`.
    pos: usize,
}
36 
37 impl TinyTranscoder {
38     /// Create a new tiny transcoder that is ready for use.
new() -> TinyTranscoder39     pub fn new() -> TinyTranscoder {
40         TinyTranscoder { partial: [0; TINY_BUFFER_SIZE], len: 0, pos: 0 }
41     }
42 
43     /// Transcode the contents of `src` into this buffer using the provided
44     /// decoder, and return the number of bytes consumed in `src` and the
45     /// number of bytes written to this transcoder.
46     ///
47     /// The results of transcoding can be read using the TinyTranscoder's
48     /// `io::Read` implementation.
49     ///
50     /// If `last` is true, then this signals to the decoder that we've reached
51     /// EOF and `src` must be empty. Otherwise, if `last` is false, then
52     /// `src` must be non-empty. Violating either of these constraits will
53     /// cause a panic.
54     ///
55     /// Finally, if this transcoder still has unconsumed bytes from a previous
56     /// transcode, then this panics. Callers must consume all bytes from a
57     /// previous transcoding before performing another one.
transcode( &mut self, decoder: &mut Decoder, src: &[u8], last: bool, ) -> (usize, usize)58     pub fn transcode(
59         &mut self,
60         decoder: &mut Decoder,
61         src: &[u8],
62         last: bool,
63     ) -> (usize, usize) {
64         assert!(self.as_slice().is_empty(), "transcoder has unconsumed bytes");
65         if last {
66             assert!(src.is_empty(), "src must be empty when last==true");
67         }
68         let (res, nin, nout, _) =
69             decoder.decode_to_utf8(src, &mut self.partial[..], last);
70         if last {
71             assert_eq!(
72                 res,
73                 CoderResult::InputEmpty,
74                 "input should be exhausted",
75             );
76         }
77         self.pos = 0;
78         self.len = nout;
79         (nin, nout)
80     }
81 
82     /// Return the the bytes remaining to be read as a slice.
as_slice(&self) -> &[u8]83     fn as_slice(&self) -> &[u8] {
84         &self.partial[self.pos..self.len]
85     }
86 }
87 
88 impl io::Read for TinyTranscoder {
read(&mut self, buf: &mut [u8]) -> io::Result<usize>89     fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
90         if self.pos >= self.len {
91             return Ok(0);
92         }
93         let mut count = 0;
94         for (src, dst) in self.as_slice().iter().zip(buf) {
95             *dst = *src;
96             count += 1;
97         }
98         self.pos += count;
99         Ok(count)
100     }
101 }
102 
/// `BomPeeker` wraps `R` and satisfies the `io::Read` interface while also
/// providing a peek at the BOM if one exists. Peeking at the BOM does not
/// advance the reader.
///
/// The leading bytes are pulled from the underlying reader at most once and
/// cached; `read` then serves them (or skips a recognized BOM, when
/// stripping is enabled) before continuing with the underlying reader.
#[derive(Debug)]
pub struct BomPeeker<R> {
    /// The underlying reader.
    rdr: R,
    /// Whether `read` should omit a recognized BOM from its output.
    strip: bool,
    /// The cached leading bytes of `rdr`, or `None` if they have not been
    /// peeked yet.
    bom: Option<PossibleBom>,
    /// The number of bytes that `read` has returned to the caller so far.
    nread: usize,
}
113 
114 impl<R: io::Read> BomPeeker<R> {
115     /// Create a new BomPeeker that includes the BOM in calls to `read`.
116     ///
117     /// The first three bytes can be read using the `peek_bom` method, but
118     /// will not advance the reader.
with_bom(rdr: R) -> BomPeeker<R>119     pub fn with_bom(rdr: R) -> BomPeeker<R> {
120         BomPeeker { rdr: rdr, strip: false, bom: None, nread: 0 }
121     }
122 
123     /// Create a new BomPeeker that never includes the BOM in calls to `read`.
without_bom(rdr: R) -> BomPeeker<R>124     pub fn without_bom(rdr: R) -> BomPeeker<R> {
125         BomPeeker { rdr: rdr, strip: true, bom: None, nread: 0 }
126     }
127 
128     /// Peek at the first three bytes of the underlying reader.
129     ///
130     /// This does not advance the reader provided by `BomPeeker`.
131     ///
132     /// If the underlying reader does not have at least two bytes available,
133     /// then `None` is returned.
peek_bom(&mut self) -> io::Result<PossibleBom>134     pub fn peek_bom(&mut self) -> io::Result<PossibleBom> {
135         if let Some(bom) = self.bom {
136             return Ok(bom);
137         }
138         // If the underlying reader fails or panics, make sure we set at least
139         // an empty BOM so that we don't end up here again..
140         self.bom = Some(PossibleBom::new());
141 
142         // OK, try to read the BOM.
143         let mut buf = [0u8; 3];
144         let bom_len = read_full(&mut self.rdr, &mut buf)?;
145         self.bom = Some(PossibleBom { bytes: buf, len: bom_len });
146         Ok(self.bom.unwrap())
147     }
148 }
149 
150 impl<R: io::Read> io::Read for BomPeeker<R> {
read(&mut self, buf: &mut [u8]) -> io::Result<usize>151     fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
152         if self.nread < 3 {
153             let bom = self.peek_bom()?;
154 
155             // If we don't have a valid BOM (e.g., no encoding for it), then
156             // we always pass through the first 3 bytes. Otherwise, if we have
157             // a valid BOM, we only pass it thru if we don't want to strip it.
158             let bom = bom.as_slice(!self.strip);
159             if self.nread < bom.len() {
160                 let rest = &bom[self.nread..];
161                 let len = cmp::min(buf.len(), rest.len());
162                 buf[..len].copy_from_slice(&rest[..len]);
163                 self.nread += len;
164                 return Ok(len);
165             }
166         }
167         let nread = self.rdr.read(buf)?;
168         self.nread += nread;
169         Ok(nread)
170     }
171 }
172 
/// A PossibleBom is a sequence of bytes at the beginning of a stream that
/// may represent an actual BOM. To detect the BOM, this must contain at
/// least 3 bytes.
///
/// If this is a valid UTF-8 or UTF-16 BOM, then an encoding_rs decoder can
/// be built from the BOM.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub struct PossibleBom {
    /// Storage for the leading bytes of the stream. Only the first `len`
    /// bytes are meaningful.
    bytes: [u8; 3],
    /// The number of bytes in `bytes` that were actually read from the
    /// stream.
    len: usize,
}
184 
185 impl PossibleBom {
186     /// Build a new empty BOM.
new() -> PossibleBom187     fn new() -> PossibleBom {
188         PossibleBom { bytes: [0; 3], len: 0 }
189     }
190 
191     /// Return the BOM as a normal slice.
192     ///
193     /// If `bom` is true, then this includes any leading BOM bytes. Otherwise,
194     /// this only includes non-BOM bytes.
as_slice(&self, bom: bool) -> &[u8]195     fn as_slice(&self, bom: bool) -> &[u8] {
196         let slice = &self.bytes[0..self.len];
197         if bom || slice.len() <= 1 {
198             slice
199         } else if &slice[0..2] == b"\xFF\xFE" || &slice[0..2] == b"\xFE\xFF" {
200             &slice[2..]
201         } else if slice == b"\xEF\xBB\xBF" {
202             &[]
203         } else {
204             slice
205         }
206     }
207 
208     /// If this is a valid UTF-8 or UTF-16 BOM, return its corresponding
209     /// encoding. Otherwise, return `None`.
encoding(&self) -> Option<&'static Encoding>210     pub fn encoding(&self) -> Option<&'static Encoding> {
211         let bom = self.as_slice(true);
212         if bom.len() < 3 {
213             return None;
214         }
215         if let Some((enc, _)) = Encoding::for_bom(bom) {
216             return Some(enc);
217         }
218         None
219     }
220 }
221 
/// Fill `buf` from `rdr`, stopping early only at EOF.
///
/// This behaves like `io::Read::read_exact`, except that hitting EOF before
/// `buf` is full is not an error: the number of bytes actually read is
/// returned instead of `UnexpectedEof`. Reads that fail with
/// `io::ErrorKind::Interrupted` are retried; any other error is returned
/// as is.
pub fn read_full<R: io::Read>(
    mut rdr: R,
    mut buf: &mut [u8],
) -> io::Result<usize> {
    let wanted = buf.len();
    loop {
        if buf.is_empty() {
            break;
        }
        let got = match rdr.read(buf) {
            Ok(got) => got,
            Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue,
            Err(e) => return Err(e),
        };
        if got == 0 {
            // EOF: report what we have rather than failing.
            break;
        }
        // Move the borrow out of `buf` so it can be re-bound to the
        // still-unfilled tail.
        let filled = buf;
        buf = &mut filled[got..];
    }
    // Whatever is no longer part of `buf` has been filled.
    Ok(wanted - buf.len())
}
244 
#[cfg(test)]
mod tests {
    use super::{BomPeeker, PossibleBom, TinyTranscoder};
    use encoding_rs::Encoding;
    use std::io::Read;

    /// Read from `rdr` one byte at a time and check that the bytes in
    /// `expected` come out, in order.
    fn expect_bytewise<R: Read>(rdr: &mut R, expected: &[u8]) {
        let mut byte = [0u8; 1];
        for &want in expected {
            assert_eq!(rdr.read(&mut byte).unwrap(), 1);
            assert_eq!(byte, [want; 1]);
        }
    }

    /// Perform a single `read` into a 100 byte buffer and return exactly
    /// the bytes that call produced.
    fn read_chunk<R: Read>(rdr: &mut R) -> Vec<u8> {
        let mut chunk = [0u8; 100];
        let n = rdr.read(&mut chunk).unwrap();
        chunk[..n].to_vec()
    }

    // Valid UTF-16LE input is decoded in buffer-sized pieces and handed out
    // one byte at a time.
    #[test]
    fn tiny_utf16_normal() {
        let enc = Encoding::for_label(b"utf-16le").unwrap();
        let mut dec = enc.new_decoder_with_bom_removal();
        let mut tiny = TinyTranscoder::new();
        let mut tmp = [0u8; 1];
        let input = &b"f\x00o\x00o\x00b\x00a\x00r\x00b\x00a\x00z\x00"[..];

        // The first pass fills the whole internal buffer.
        let (nin, nout) = tiny.transcode(&mut dec, input, false);
        assert_eq!(nin, 14);
        assert_eq!(nout, 7);
        let input = &input[nin..];
        expect_bytewise(&mut tiny, b"foobarb");

        // The second pass picks up the remaining two code units.
        let (nin, nout) = tiny.transcode(&mut dec, input, false);
        assert_eq!(nin, 4);
        assert_eq!(nout, 2);
        let input = &input[nin..];
        expect_bytewise(&mut tiny, b"az");

        // Signalling EOF produces nothing further.
        let (nin, nout) = tiny.transcode(&mut dec, input, true);
        assert_eq!(nin, 0);
        assert_eq!(nout, 0);

        assert_eq!(tiny.read(&mut tmp).unwrap(), 0);
    }

    // A dangling half code unit yields a replacement character at EOF.
    #[test]
    fn tiny_utf16_invalid() {
        let enc = Encoding::for_label(b"utf-16le").unwrap();
        let mut dec = enc.new_decoder_with_bom_removal();
        let mut tiny = TinyTranscoder::new();
        let mut tmp = [0u8; 1];
        let input = &b"\x00"[..];

        // The lone byte is consumed, but nothing can be emitted yet.
        let (nin, nout) = tiny.transcode(&mut dec, input, false);
        assert_eq!(nin, 1);
        assert_eq!(nout, 0);
        assert_eq!(tiny.read(&mut tmp).unwrap(), 0);
        let input = &input[nin..];

        let (nin, nout) = tiny.transcode(&mut dec, input, true);
        assert_eq!(nin, 0);
        assert_eq!(nout, 3);

        // U+FFFD encoded as UTF-8.
        expect_bytewise(&mut tiny, b"\xEF\xBF\xBD");
        assert_eq!(tiny.read(&mut tmp).unwrap(), 0);
    }

    #[test]
    fn peeker_empty() {
        let buf: [u8; 0] = [];
        let mut peeker = BomPeeker::with_bom(&buf[..]);
        assert_eq!(PossibleBom::new(), peeker.peek_bom().unwrap());

        assert!(read_chunk(&mut peeker).is_empty());
    }

    #[test]
    fn peeker_one() {
        let buf = [1u8];
        let mut peeker = BomPeeker::with_bom(&buf[..]);
        assert_eq!(
            PossibleBom { bytes: [1, 0, 0], len: 1 },
            peeker.peek_bom().unwrap()
        );

        assert_eq!(read_chunk(&mut peeker), vec![1]);
        assert!(read_chunk(&mut peeker).is_empty());
    }

    #[test]
    fn peeker_two() {
        let buf = [1u8, 2];
        let mut peeker = BomPeeker::with_bom(&buf[..]);
        assert_eq!(
            PossibleBom { bytes: [1, 2, 0], len: 2 },
            peeker.peek_bom().unwrap()
        );

        assert_eq!(read_chunk(&mut peeker), vec![1, 2]);
        assert!(read_chunk(&mut peeker).is_empty());
    }

    #[test]
    fn peeker_three() {
        let buf = [1u8, 2, 3];
        let mut peeker = BomPeeker::with_bom(&buf[..]);
        assert_eq!(
            PossibleBom { bytes: [1, 2, 3], len: 3 },
            peeker.peek_bom().unwrap()
        );

        assert_eq!(read_chunk(&mut peeker), vec![1, 2, 3]);
        assert!(read_chunk(&mut peeker).is_empty());
    }

    #[test]
    fn peeker_four() {
        let buf = [1u8, 2, 3, 4];
        let mut peeker = BomPeeker::with_bom(&buf[..]);
        assert_eq!(
            PossibleBom { bytes: [1, 2, 3], len: 3 },
            peeker.peek_bom().unwrap()
        );

        // The peeked prefix comes back first, then the rest of the stream.
        assert_eq!(read_chunk(&mut peeker), vec![1, 2, 3]);
        assert_eq!(read_chunk(&mut peeker), vec![4]);
        assert!(read_chunk(&mut peeker).is_empty());
    }

    #[test]
    fn peeker_one_at_a_time() {
        let buf = [1u8, 2, 3, 4];
        let mut peeker = BomPeeker::with_bom(&buf[..]);

        // A zero-length read must neither consume nor write anything.
        let mut tmp = [0u8; 1];
        assert_eq!(0, peeker.read(&mut tmp[..0]).unwrap());
        assert_eq!(0, tmp[0]);

        expect_bytewise(&mut peeker, &[1, 2, 3, 4]);
    }

    #[test]
    fn peeker_without_bom() {
        let buf = [b'\xEF', b'\xBB', b'\xBF', b'a'];
        let mut peeker = BomPeeker::without_bom(&buf[..]);
        assert_eq!(
            PossibleBom { bytes: [b'\xEF', b'\xBB', b'\xBF'], len: 3 },
            peeker.peek_bom().unwrap()
        );

        // The UTF-8 BOM is visible via peeking but absent from the stream.
        assert_eq!(read_chunk(&mut peeker), vec![b'a']);
        assert!(read_chunk(&mut peeker).is_empty());
    }

    #[test]
    fn peeker_without_bom_nobom() {
        let buf = [1u8, 2, 3, 4];
        let mut peeker = BomPeeker::without_bom(&buf[..]);
        assert_eq!(
            PossibleBom { bytes: [1, 2, 3], len: 3 },
            peeker.peek_bom().unwrap()
        );

        // Without a real BOM there is nothing to strip.
        assert_eq!(read_chunk(&mut peeker), vec![1, 2, 3]);
        assert_eq!(read_chunk(&mut peeker), vec![4]);
        assert!(read_chunk(&mut peeker).is_empty());
    }
}
452