1 use std::cmp;
2 use std::io;
3
4 use encoding_rs::{CoderResult, Decoder, Encoding};
5
/// The capacity, in bytes, of the scratch buffer held by `TinyTranscoder`.
///
/// This is the minimum amount of space that a decoder-to-utf8-with-replacement
/// will use for any state and any input.
const TINY_BUFFER_SIZE: usize = 7;
9
/// A tiny transcoder performs transcoding incrementally even when a caller
/// provided buffer is not large enough.
///
/// This use case comes up when implementing streaming transcoding in cases
/// where it is permissible to provide incomplete UTF-8 sequences to the
/// caller (e.g., when decoding into a `&[u8]` where the caller must be capable
/// of handling invalid UTF-8). In particular, this type specifically handles
/// cases where a caller provided buffer is too small to store a full UTF-8
/// sequence. Thus, this type should be used in cases where the caller provided
/// buffer has length 3 or fewer.
///
/// This could likely be done with better performance by allocating a larger
/// buffer for these cases, but we instead opt to handle this without
/// allocation under the assumption that tiny caller provided buffers are
/// probably a pathological case.
#[derive(Clone, Debug)]
pub struct TinyTranscoder {
    /// This is where we store the results of a transcoding. It holds
    /// `TINY_BUFFER_SIZE` (7) bytes, which is more than any single UTF-8
    /// encoded codepoint requires.
    partial: [u8; TINY_BUFFER_SIZE],
    /// The number of bytes written in `partial`.
    len: usize,
    /// The position in `partial` at which the next byte should be read.
    /// The unread bytes are `partial[pos..len]`.
    pos: usize,
}
36
37 impl TinyTranscoder {
38 /// Create a new tiny transcoder that is ready for use.
new() -> TinyTranscoder39 pub fn new() -> TinyTranscoder {
40 TinyTranscoder { partial: [0; TINY_BUFFER_SIZE], len: 0, pos: 0 }
41 }
42
43 /// Transcode the contents of `src` into this buffer using the provided
44 /// decoder, and return the number of bytes consumed in `src` and the
45 /// number of bytes written to this transcoder.
46 ///
47 /// The results of transcoding can be read using the TinyTranscoder's
48 /// `io::Read` implementation.
49 ///
50 /// If `last` is true, then this signals to the decoder that we've reached
51 /// EOF and `src` must be empty. Otherwise, if `last` is false, then
52 /// `src` must be non-empty. Violating either of these constraits will
53 /// cause a panic.
54 ///
55 /// Finally, if this transcoder still has unconsumed bytes from a previous
56 /// transcode, then this panics. Callers must consume all bytes from a
57 /// previous transcoding before performing another one.
transcode( &mut self, decoder: &mut Decoder, src: &[u8], last: bool, ) -> (usize, usize)58 pub fn transcode(
59 &mut self,
60 decoder: &mut Decoder,
61 src: &[u8],
62 last: bool,
63 ) -> (usize, usize) {
64 assert!(self.as_slice().is_empty(), "transcoder has unconsumed bytes");
65 if last {
66 assert!(src.is_empty(), "src must be empty when last==true");
67 }
68 let (res, nin, nout, _) =
69 decoder.decode_to_utf8(src, &mut self.partial[..], last);
70 if last {
71 assert_eq!(
72 res,
73 CoderResult::InputEmpty,
74 "input should be exhausted",
75 );
76 }
77 self.pos = 0;
78 self.len = nout;
79 (nin, nout)
80 }
81
82 /// Return the the bytes remaining to be read as a slice.
as_slice(&self) -> &[u8]83 fn as_slice(&self) -> &[u8] {
84 &self.partial[self.pos..self.len]
85 }
86 }
87
88 impl io::Read for TinyTranscoder {
read(&mut self, buf: &mut [u8]) -> io::Result<usize>89 fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
90 if self.pos >= self.len {
91 return Ok(0);
92 }
93 let mut count = 0;
94 for (src, dst) in self.as_slice().iter().zip(buf) {
95 *dst = *src;
96 count += 1;
97 }
98 self.pos += count;
99 Ok(count)
100 }
101 }
102
/// `BomPeeker` wraps `R` and satisfies the `io::Read` interface while also
/// providing a peek at the BOM if one exists. Peeking at the BOM does not
/// advance the position seen by callers of `read` on the `BomPeeker`.
#[derive(Debug)]
pub struct BomPeeker<R> {
    /// The wrapped reader.
    rdr: R,
    /// Whether a recognized BOM is withheld from `read`. This is true when
    /// built with `without_bom` and false when built with `with_bom`.
    strip: bool,
    /// The cached result of peeking at the BOM, or `None` if no peek has
    /// been attempted yet.
    bom: Option<PossibleBom>,
    /// The number of bytes handed out so far by `read`.
    nread: usize,
}
113
114 impl<R: io::Read> BomPeeker<R> {
115 /// Create a new BomPeeker that includes the BOM in calls to `read`.
116 ///
117 /// The first three bytes can be read using the `peek_bom` method, but
118 /// will not advance the reader.
with_bom(rdr: R) -> BomPeeker<R>119 pub fn with_bom(rdr: R) -> BomPeeker<R> {
120 BomPeeker { rdr: rdr, strip: false, bom: None, nread: 0 }
121 }
122
123 /// Create a new BomPeeker that never includes the BOM in calls to `read`.
without_bom(rdr: R) -> BomPeeker<R>124 pub fn without_bom(rdr: R) -> BomPeeker<R> {
125 BomPeeker { rdr: rdr, strip: true, bom: None, nread: 0 }
126 }
127
128 /// Peek at the first three bytes of the underlying reader.
129 ///
130 /// This does not advance the reader provided by `BomPeeker`.
131 ///
132 /// If the underlying reader does not have at least two bytes available,
133 /// then `None` is returned.
peek_bom(&mut self) -> io::Result<PossibleBom>134 pub fn peek_bom(&mut self) -> io::Result<PossibleBom> {
135 if let Some(bom) = self.bom {
136 return Ok(bom);
137 }
138 // If the underlying reader fails or panics, make sure we set at least
139 // an empty BOM so that we don't end up here again..
140 self.bom = Some(PossibleBom::new());
141
142 // OK, try to read the BOM.
143 let mut buf = [0u8; 3];
144 let bom_len = read_full(&mut self.rdr, &mut buf)?;
145 self.bom = Some(PossibleBom { bytes: buf, len: bom_len });
146 Ok(self.bom.unwrap())
147 }
148 }
149
150 impl<R: io::Read> io::Read for BomPeeker<R> {
read(&mut self, buf: &mut [u8]) -> io::Result<usize>151 fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
152 if self.nread < 3 {
153 let bom = self.peek_bom()?;
154
155 // If we don't have a valid BOM (e.g., no encoding for it), then
156 // we always pass through the first 3 bytes. Otherwise, if we have
157 // a valid BOM, we only pass it thru if we don't want to strip it.
158 let bom = bom.as_slice(!self.strip);
159 if self.nread < bom.len() {
160 let rest = &bom[self.nread..];
161 let len = cmp::min(buf.len(), rest.len());
162 buf[..len].copy_from_slice(&rest[..len]);
163 self.nread += len;
164 return Ok(len);
165 }
166 }
167 let nread = self.rdr.read(buf)?;
168 self.nread += nread;
169 Ok(nread)
170 }
171 }
172
/// A PossibleBom is a sequence of bytes at the beginning of a stream that
/// may represent an actual BOM. To detect the BOM, this must contain at
/// least 3 bytes.
///
/// If this is a valid UTF-8 or UTF-16 BOM, then an encoding_rs decoder can
/// be built from the BOM.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub struct PossibleBom {
    /// Storage for up to three leading bytes of the stream. Only the first
    /// `len` entries are meaningful.
    bytes: [u8; 3],
    /// The number of bytes in `bytes` that were actually read.
    len: usize,
}
184
185 impl PossibleBom {
186 /// Build a new empty BOM.
new() -> PossibleBom187 fn new() -> PossibleBom {
188 PossibleBom { bytes: [0; 3], len: 0 }
189 }
190
191 /// Return the BOM as a normal slice.
192 ///
193 /// If `bom` is true, then this includes any leading BOM bytes. Otherwise,
194 /// this only includes non-BOM bytes.
as_slice(&self, bom: bool) -> &[u8]195 fn as_slice(&self, bom: bool) -> &[u8] {
196 let slice = &self.bytes[0..self.len];
197 if bom || slice.len() <= 1 {
198 slice
199 } else if &slice[0..2] == b"\xFF\xFE" || &slice[0..2] == b"\xFE\xFF" {
200 &slice[2..]
201 } else if slice == b"\xEF\xBB\xBF" {
202 &[]
203 } else {
204 slice
205 }
206 }
207
208 /// If this is a valid UTF-8 or UTF-16 BOM, return its corresponding
209 /// encoding. Otherwise, return `None`.
encoding(&self) -> Option<&'static Encoding>210 pub fn encoding(&self) -> Option<&'static Encoding> {
211 let bom = self.as_slice(true);
212 if bom.len() < 3 {
213 return None;
214 }
215 if let Some((enc, _)) = Encoding::for_bom(bom) {
216 return Some(enc);
217 }
218 None
219 }
220 }
221
/// Fill `buf` from `rdr`, stopping early only at EOF.
///
/// This is like `io::Read::read_exact`, except that reaching EOF before
/// `buf` is full is not an error. The number of bytes placed at the front of
/// `buf` is returned instead, and it is smaller than `buf.len()` only when
/// EOF was seen.
///
/// Reads that fail with `io::ErrorKind::Interrupted` are retried. Any other
/// error is returned immediately, and the count of bytes read so far is
/// discarded.
pub fn read_full<R: io::Read>(
    mut rdr: R,
    buf: &mut [u8],
) -> io::Result<usize> {
    let mut filled = 0;
    loop {
        let remaining = &mut buf[filled..];
        if remaining.is_empty() {
            return Ok(filled);
        }
        match rdr.read(remaining) {
            Ok(0) => return Ok(filled),
            Ok(n) => filled += n,
            Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue,
            Err(e) => return Err(e),
        }
    }
}
244
#[cfg(test)]
mod tests {
    use super::{BomPeeker, PossibleBom, TinyTranscoder};
    use encoding_rs::Encoding;
    use std::io::Read;

    /// Perform a single `read` into a 100-byte scratch buffer and return
    /// exactly the bytes which that call produced.
    fn read_once<R: Read>(rdr: &mut R) -> Vec<u8> {
        let mut tmp = [0u8; 100];
        let n = rdr.read(&mut tmp).unwrap();
        tmp[..n].to_vec()
    }

    /// Assert that `tiny` hands out `expected`, one byte per `read` call
    /// into a one-byte buffer.
    fn assert_tiny_bytes(tiny: &mut TinyTranscoder, expected: &[u8]) {
        let mut tmp = [0u8; 1];
        for &want in expected {
            assert_eq!(tiny.read(&mut tmp).unwrap(), 1);
            assert_eq!(tmp, [want; 1]);
        }
    }

    /// Assert that `tiny` has nothing left to read.
    fn assert_tiny_drained(tiny: &mut TinyTranscoder) {
        let mut tmp = [0u8; 1];
        assert_eq!(tiny.read(&mut tmp).unwrap(), 0);
    }

    // Valid UTF-16LE is transcoded in chunks of at most 7 output bytes.
    #[test]
    fn tiny_utf16_normal() {
        let enc = Encoding::for_label(b"utf-16le").unwrap();
        let mut dec = enc.new_decoder_with_bom_removal();
        let mut bytes = &b"f\x00o\x00o\x00b\x00a\x00r\x00b\x00a\x00z\x00"[..];
        let mut tiny = TinyTranscoder::new();

        let (nin, nout) = tiny.transcode(&mut dec, bytes, false);
        assert_eq!(nin, 14);
        assert_eq!(nout, 7);
        bytes = &bytes[nin..];
        assert_tiny_bytes(&mut tiny, b"foobarb");

        let (nin, nout) = tiny.transcode(&mut dec, bytes, false);
        assert_eq!(nin, 4);
        assert_eq!(nout, 2);
        bytes = &bytes[nin..];
        assert_tiny_bytes(&mut tiny, b"az");

        let (nin, nout) = tiny.transcode(&mut dec, bytes, true);
        assert_eq!(nin, 0);
        assert_eq!(nout, 0);
        assert_tiny_drained(&mut tiny);
    }

    // A dangling half code unit becomes U+FFFD once EOF is signalled.
    #[test]
    fn tiny_utf16_invalid() {
        let enc = Encoding::for_label(b"utf-16le").unwrap();
        let mut dec = enc.new_decoder_with_bom_removal();
        let mut bytes = &b"\x00"[..];
        let mut tiny = TinyTranscoder::new();

        let (nin, nout) = tiny.transcode(&mut dec, bytes, false);
        assert_eq!(nin, 1);
        assert_eq!(nout, 0);
        assert_tiny_drained(&mut tiny);
        bytes = &bytes[nin..];

        let (nin, nout) = tiny.transcode(&mut dec, bytes, true);
        assert_eq!(nin, 0);
        assert_eq!(nout, 3);
        assert_tiny_bytes(&mut tiny, b"\xEF\xBF\xBD");
        assert_tiny_drained(&mut tiny);
    }

    #[test]
    fn peeker_empty() {
        let buf: [u8; 0] = [];
        let mut peeker = BomPeeker::with_bom(&buf[..]);
        assert_eq!(PossibleBom::new(), peeker.peek_bom().unwrap());

        assert!(read_once(&mut peeker).is_empty());
    }

    #[test]
    fn peeker_one() {
        let buf = [1u8];
        let mut peeker = BomPeeker::with_bom(&buf[..]);
        assert_eq!(
            PossibleBom { bytes: [1, 0, 0], len: 1 },
            peeker.peek_bom().unwrap()
        );

        assert_eq!(read_once(&mut peeker), [1u8]);
        assert!(read_once(&mut peeker).is_empty());
    }

    #[test]
    fn peeker_two() {
        let buf = [1u8, 2];
        let mut peeker = BomPeeker::with_bom(&buf[..]);
        assert_eq!(
            PossibleBom { bytes: [1, 2, 0], len: 2 },
            peeker.peek_bom().unwrap()
        );

        assert_eq!(read_once(&mut peeker), [1u8, 2]);
        assert!(read_once(&mut peeker).is_empty());
    }

    #[test]
    fn peeker_three() {
        let buf = [1u8, 2, 3];
        let mut peeker = BomPeeker::with_bom(&buf[..]);
        assert_eq!(
            PossibleBom { bytes: [1, 2, 3], len: 3 },
            peeker.peek_bom().unwrap()
        );

        assert_eq!(read_once(&mut peeker), [1u8, 2, 3]);
        assert!(read_once(&mut peeker).is_empty());
    }

    #[test]
    fn peeker_four() {
        let buf = [1u8, 2, 3, 4];
        let mut peeker = BomPeeker::with_bom(&buf[..]);
        assert_eq!(
            PossibleBom { bytes: [1, 2, 3], len: 3 },
            peeker.peek_bom().unwrap()
        );

        // The peeked bytes come back first, then the rest of the stream.
        assert_eq!(read_once(&mut peeker), [1u8, 2, 3]);
        assert_eq!(read_once(&mut peeker), [4u8]);
        assert!(read_once(&mut peeker).is_empty());
    }

    #[test]
    fn peeker_one_at_a_time() {
        let buf = [1u8, 2, 3, 4];
        let mut peeker = BomPeeker::with_bom(&buf[..]);

        let mut tmp = [0u8; 1];
        // A zero-length read neither consumes nor writes anything.
        assert_eq!(0, peeker.read(&mut tmp[..0]).unwrap());
        assert_eq!(0, tmp[0]);
        for &want in buf.iter() {
            assert_eq!(1, peeker.read(&mut tmp).unwrap());
            assert_eq!(want, tmp[0]);
        }
    }

    #[test]
    fn peeker_without_bom() {
        let buf = [b'\xEF', b'\xBB', b'\xBF', b'a'];
        let mut peeker = BomPeeker::without_bom(&buf[..]);
        assert_eq!(
            PossibleBom { bytes: [b'\xEF', b'\xBB', b'\xBF'], len: 3 },
            peeker.peek_bom().unwrap()
        );

        // The UTF-8 BOM is stripped, so only the payload byte is read.
        assert_eq!(read_once(&mut peeker), [b'a']);
        assert!(read_once(&mut peeker).is_empty());
    }

    #[test]
    fn peeker_without_bom_nobom() {
        let buf = [1u8, 2, 3, 4];
        let mut peeker = BomPeeker::without_bom(&buf[..]);
        assert_eq!(
            PossibleBom { bytes: [1, 2, 3], len: 3 },
            peeker.peek_bom().unwrap()
        );

        // Leading bytes that are not a BOM are passed through untouched.
        assert_eq!(read_once(&mut peeker), [1u8, 2, 3]);
        assert_eq!(read_once(&mut peeker), [4u8]);
        assert!(read_once(&mut peeker).is_empty());
    }
}
452