1 // Copyright 2015-2016 Mozilla Foundation. See the COPYRIGHT
2 // file at the top-level directory of this distribution.
3 //
4 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
5 // https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
6 // <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
7 // option. This file may not be copied, modified, or distributed
8 // except according to those terms.
9 
10 use super::*;
11 use handles::*;
12 use variant::*;
13 
/// Decoder state for UTF-16 byte input in either byte order.
///
/// The state carries whatever could not be emitted yet: a lone first byte
/// of a code unit, and/or a complete code unit that is waiting either for
/// its low surrogate or for output space.
pub struct Utf16Decoder {
    /// `true` when code units are assembled big-endian, `false` for
    /// little-endian.
    be: bool,
    /// First byte of a code unit whose second byte has not arrived yet.
    lead_byte: Option<u8>,
    /// If non-zero and `pending_bmp` is `false`, a pending lead surrogate.
    lead_surrogate: u16,
    /// If `true`, `lead_surrogate` actually holds a pending BMP code unit.
    pending_bmp: bool,
}
20 
21 impl Utf16Decoder {
new(big_endian: bool) -> VariantDecoder22     pub fn new(big_endian: bool) -> VariantDecoder {
23         VariantDecoder::Utf16(Utf16Decoder {
24             lead_surrogate: 0,
25             lead_byte: None,
26             be: big_endian,
27             pending_bmp: false,
28         })
29     }
30 
additional_from_state(&self) -> usize31     pub fn additional_from_state(&self) -> usize {
32         1 + if self.lead_byte.is_some() { 1 } else { 0 }
33             + if self.lead_surrogate == 0 { 0 } else { 2 }
34     }
35 
max_utf16_buffer_length(&self, byte_length: usize) -> Option<usize>36     pub fn max_utf16_buffer_length(&self, byte_length: usize) -> Option<usize> {
37         checked_add(
38             1,
39             checked_div(byte_length.checked_add(self.additional_from_state()), 2),
40         )
41     }
42 
max_utf8_buffer_length_without_replacement(&self, byte_length: usize) -> Option<usize>43     pub fn max_utf8_buffer_length_without_replacement(&self, byte_length: usize) -> Option<usize> {
44         checked_add(
45             1,
46             checked_mul(
47                 3,
48                 checked_div(byte_length.checked_add(self.additional_from_state()), 2),
49             ),
50         )
51     }
52 
max_utf8_buffer_length(&self, byte_length: usize) -> Option<usize>53     pub fn max_utf8_buffer_length(&self, byte_length: usize) -> Option<usize> {
54         checked_add(
55             1,
56             checked_mul(
57                 3,
58                 checked_div(byte_length.checked_add(self.additional_from_state()), 2),
59             ),
60         )
61     }
62 
63     decoder_functions!(
64         {
65             if self.pending_bmp {
66                 match dest.check_space_bmp() {
67                     Space::Full(_) => {
68                         return (DecoderResult::OutputFull, 0, 0);
69                     }
70                     Space::Available(destination_handle) => {
71                         destination_handle.write_bmp(self.lead_surrogate);
72                         self.pending_bmp = false;
73                         self.lead_surrogate = 0;
74                     }
75                 }
76             }
77         },
78         {
79             // This is the fast path. The rest runs only at the
80             // start and end for partial sequences.
81             if self.lead_byte.is_none() && self.lead_surrogate == 0 {
82                 if let Some((read, written)) = if self.be {
83                     dest.copy_utf16_from::<BigEndian>(&mut source)
84                 } else {
85                     dest.copy_utf16_from::<LittleEndian>(&mut source)
86                 } {
87                     return (DecoderResult::Malformed(2, 0), read, written);
88                 }
89             }
90         },
91         {
92             debug_assert!(!self.pending_bmp);
93             if self.lead_surrogate != 0 || self.lead_byte.is_some() {
94                 // We need to check space without intent to write in order to
95                 // make sure that there is space for the replacement character.
96                 match dest.check_space_bmp() {
97                     Space::Full(_) => {
98                         return (DecoderResult::OutputFull, 0, 0);
99                     }
100                     Space::Available(_) => {
101                         if self.lead_surrogate != 0 {
102                             self.lead_surrogate = 0;
103                             match self.lead_byte {
104                                 None => {
105                                     return (
106                                         DecoderResult::Malformed(2, 0),
107                                         src_consumed,
108                                         dest.written(),
109                                     );
110                                 }
111                                 Some(_) => {
112                                     self.lead_byte = None;
113                                     return (
114                                         DecoderResult::Malformed(3, 0),
115                                         src_consumed,
116                                         dest.written(),
117                                     );
118                                 }
119                             }
120                         }
121                         debug_assert!(self.lead_byte.is_some());
122                         self.lead_byte = None;
123                         return (DecoderResult::Malformed(1, 0), src_consumed, dest.written());
124                     }
125                 }
126             }
127         },
128         {
129             match self.lead_byte {
130                 None => {
131                     self.lead_byte = Some(b);
132                     continue;
133                 }
134                 Some(lead) => {
135                     self.lead_byte = None;
136                     let code_unit = if self.be {
137                         u16::from(lead) << 8 | u16::from(b)
138                     } else {
139                         u16::from(b) << 8 | u16::from(lead)
140                     };
141                     let high_bits = code_unit & 0xFC00u16;
142                     if high_bits == 0xD800u16 {
143                         // high surrogate
144                         if self.lead_surrogate != 0 {
145                             // The previous high surrogate was in
146                             // error and this one becomes the new
147                             // pending one.
148                             self.lead_surrogate = code_unit as u16;
149                             return (
150                                 DecoderResult::Malformed(2, 2),
151                                 unread_handle.consumed(),
152                                 destination_handle.written(),
153                             );
154                         }
155                         self.lead_surrogate = code_unit;
156                         continue;
157                     }
158                     if high_bits == 0xDC00u16 {
159                         // low surrogate
160                         if self.lead_surrogate == 0 {
161                             return (
162                                 DecoderResult::Malformed(2, 0),
163                                 unread_handle.consumed(),
164                                 destination_handle.written(),
165                             );
166                         }
167                         destination_handle.write_surrogate_pair(self.lead_surrogate, code_unit);
168                         self.lead_surrogate = 0;
169                         continue;
170                     }
171                     // bmp
172                     if self.lead_surrogate != 0 {
173                         // The previous high surrogate was in
174                         // error and this code unit becomes a
175                         // pending BMP character.
176                         self.lead_surrogate = code_unit;
177                         self.pending_bmp = true;
178                         return (
179                             DecoderResult::Malformed(2, 2),
180                             unread_handle.consumed(),
181                             destination_handle.written(),
182                         );
183                     }
184                     destination_handle.write_bmp(code_unit);
185                     continue;
186                 }
187             }
188         },
189         self,
190         src_consumed,
191         dest,
192         source,
193         b,
194         destination_handle,
195         unread_handle,
196         check_space_astral
197     );
198 }
199 
200 // Any copyright to the test code below this comment is dedicated to the
201 // Public Domain. http://creativecommons.org/publicdomain/zero/1.0/
202 
#[cfg(test)]
mod tests {
    use super::super::testing::*;
    use super::super::*;

    /// Runs `decode_without_padding` for UTF-16LE on `bytes`, expecting `expect`.
    fn decode_utf_16le(bytes: &[u8], expect: &str) {
        decode_without_padding(UTF_16LE, bytes, expect);
    }

    /// Runs `decode_without_padding` for UTF-16BE on `bytes`, expecting `expect`.
    fn decode_utf_16be(bytes: &[u8], expect: &str) {
        decode_without_padding(UTF_16BE, bytes, expect);
    }

    /// Runs `encode` for UTF-16LE on `string`, expecting the bytes `expect`.
    fn encode_utf_16le(string: &str, expect: &[u8]) {
        encode(UTF_16LE, string, expect);
    }

    /// Runs `encode` for UTF-16BE on `string`, expecting the bytes `expect`.
    fn encode_utf_16be(string: &str, expect: &[u8]) {
        encode(UTF_16BE, string, expect);
    }

    /// Checks one little-endian input and then its big-endian counterpart
    /// against the same expected text.
    fn decode_utf_16_both(le_bytes: &[u8], be_bytes: &[u8], expect: &str) {
        decode_utf_16le(le_bytes, expect);
        decode_utf_16be(be_bytes, expect);
    }

    #[test]
    fn test_utf_16_decode() {
        // Empty input
        decode_utf_16_both(b"", b"", "");

        // Plain BMP text
        decode_utf_16_both(
            b"\x61\x00\x62\x00",
            b"\x00\x61\x00\x62",
            "\u{0061}\u{0062}",
        );

        // Input starting with the BOM of the opposite byte order
        decode_utf_16_both(
            b"\xFE\xFF\x00\x61\x00\x62",
            b"\xFF\xFE\x61\x00\x62\x00",
            "\u{0061}\u{0062}",
        );

        // Odd trailing byte
        decode_utf_16_both(b"\x61\x00\x62", b"\x00\x61\x00", "\u{0061}\u{FFFD}");

        // Lead surrogate followed by an odd trailing byte
        decode_utf_16_both(b"\x3D\xD8\xA9", b"\xD8\x3D\xDC", "\u{FFFD}");

        // Valid surrogate pair followed by a BMP character
        decode_utf_16_both(
            b"\x3D\xD8\xA9\xDC\x03\x26",
            b"\xD8\x3D\xDC\xA9\x26\x03",
            "\u{1F4A9}\u{2603}",
        );

        // Unpaired low surrogate followed by a BMP character
        decode_utf_16_both(
            b"\xA9\xDC\x03\x26",
            b"\xDC\xA9\x26\x03",
            "\u{FFFD}\u{2603}",
        );

        // Unpaired high surrogate followed by a BMP character
        decode_utf_16_both(
            b"\x3D\xD8\x03\x26",
            b"\xD8\x3D\x26\x03",
            "\u{FFFD}\u{2603}",
        );

        // The \xFF makes sure that the parts before and after have different alignment
        let long_le = b"\x00\x00\x00\x00\x00\x00\x00\x00\x3D\xD8\xA9\xDC\x00\x00\x00\x00\x00\x00\x00\x00\x3D\xD8\x00\x00\x00\x00\x00\x00\x00\x00\xA9\xDC\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x3D\xD8\xFF\x00\x00\x00\x00\x00\x00\x00\x00\x3D\xD8\xA9\xDC\x00\x00\x00\x00\x00\x00\x00\x00\x3D\xD8\x00\x00\x00\x00\x00\x00\x00\x00\xA9\xDC\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x3D\xD8";
        let long_be = b"\x00\x00\x00\x00\x00\x00\x00\x00\xD8\x3D\xDC\xA9\x00\x00\x00\x00\x00\x00\x00\x00\xD8\x3D\x00\x00\x00\x00\x00\x00\x00\x00\xDC\xA9\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xD8\x3D\xFF\x00\x00\x00\x00\x00\x00\x00\x00\xD8\x3D\xDC\xA9\x00\x00\x00\x00\x00\x00\x00\x00\xD8\x3D\x00\x00\x00\x00\x00\x00\x00\x00\xDC\xA9\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xD8\x3D";
        let long_expect = "\x00\x00\x00\x00\u{1F4A9}\x00\x00\x00\x00\u{FFFD}\x00\x00\x00\x00\u{FFFD}\x00\x00\x00\x00\x00\x00\x00\x00\u{FFFD}";
        let le_mid = long_le.len() / 2;
        let be_mid = long_be.len() / 2;
        // Everything before the \xFF, then everything after it.
        decode_utf_16_both(&long_le[..le_mid], &long_be[..be_mid], long_expect);
        decode_utf_16_both(&long_le[le_mid + 1..], &long_be[be_mid + 1..], long_expect);
    }

    #[test]
    fn test_utf_16_encode() {
        // Empty
        encode_utf_16be("", b"");
        encode_utf_16le("", b"");

        // Encodes as UTF-8
        assert_eq!(UTF_16LE.new_encoder().encoding(), UTF_8);
        assert_eq!(UTF_16BE.new_encoder().encoding(), UTF_8);
        let text = "\u{1F4A9}\u{2603}";
        encode_utf_16le(text, text.as_bytes());
        encode_utf_16be(text, text.as_bytes());
    }

    #[test]
    fn test_utf_16be_decode_one_by_one() {
        let input = b"\x00\x61\x00\xE4\x26\x03\xD8\x3D\xDC\xA9";
        let mut decoder = UTF_16BE.new_decoder();
        let mut output = [0u16; 20];
        // Feed a single byte per call; every call must consume it cleanly.
        input.chunks(1).for_each(|chunk| {
            assert_eq!(chunk.len(), 1);
            let capacity = decoder.max_utf16_buffer_length(chunk.len()).unwrap();
            let (status, consumed, _, replaced) =
                decoder.decode_to_utf16(chunk, &mut output[..capacity], false);
            assert_eq!(status, CoderResult::InputEmpty);
            assert_eq!(consumed, 1);
            assert!(!replaced);
        });
    }

    #[test]
    fn test_utf_16le_decode_one_by_one() {
        let input = b"\x61\x00\xE4\x00\x03\x26\x3D\xD8\xA9\xDC";
        let mut decoder = UTF_16LE.new_decoder();
        let mut output = [0u16; 20];
        // Feed a single byte per call; every call must consume it cleanly.
        input.chunks(1).for_each(|chunk| {
            assert_eq!(chunk.len(), 1);
            let capacity = decoder.max_utf16_buffer_length(chunk.len()).unwrap();
            let (status, consumed, _, replaced) =
                decoder.decode_to_utf16(chunk, &mut output[..capacity], false);
            assert_eq!(status, CoderResult::InputEmpty);
            assert_eq!(consumed, 1);
            assert!(!replaced);
        });
    }

    #[test]
    fn test_utf_16be_decode_three_at_a_time() {
        let input = b"\x00\xE4\x26\x03\xD8\x3D\xDC\xA9\x00\x61\x00\xE4";
        let mut decoder = UTF_16BE.new_decoder();
        let mut output = [0u16; 20];
        // Three bytes per call, so code units straddle call boundaries.
        input.chunks(3).for_each(|chunk| {
            assert_eq!(chunk.len(), 3);
            let capacity = decoder.max_utf16_buffer_length(chunk.len()).unwrap();
            let (status, consumed, _, replaced) =
                decoder.decode_to_utf16(chunk, &mut output[..capacity], false);
            assert_eq!(status, CoderResult::InputEmpty);
            assert_eq!(consumed, chunk.len());
            assert!(!replaced);
        });
    }

    #[test]
    fn test_utf_16le_decode_three_at_a_time() {
        let input = b"\xE4\x00\x03\x26\x3D\xD8\xA9\xDC\x61\x00\xE4\x00";
        let mut decoder = UTF_16LE.new_decoder();
        let mut output = [0u16; 20];
        // Three bytes per call, so code units straddle call boundaries.
        input.chunks(3).for_each(|chunk| {
            assert_eq!(chunk.len(), 3);
            let capacity = decoder.max_utf16_buffer_length(chunk.len()).unwrap();
            let (status, consumed, _, replaced) =
                decoder.decode_to_utf16(chunk, &mut output[..capacity], false);
            assert_eq!(status, CoderResult::InputEmpty);
            assert_eq!(consumed, chunk.len());
            assert!(!replaced);
        });
    }

    #[test]
    fn test_utf_16le_decode_bom_prefixed_split_byte_pair() {
        let mut decoder = UTF_16LE.new_decoder();
        let mut output = [0u16; 20];

        // First byte alone: consumed, nothing written yet.
        let capacity = decoder.max_utf16_buffer_length(1).unwrap();
        let (status, consumed, produced, replaced) =
            decoder.decode_to_utf16(b"\xFF", &mut output[..capacity], false);
        assert_eq!(status, CoderResult::InputEmpty);
        assert_eq!(consumed, 1);
        assert_eq!(produced, 0);
        assert!(!replaced);

        // Second byte completes the code unit 0xFDFF.
        let capacity = decoder.max_utf16_buffer_length(1).unwrap();
        let (status, consumed, produced, replaced) =
            decoder.decode_to_utf16(b"\xFD", &mut output[..capacity], true);
        assert_eq!(status, CoderResult::InputEmpty);
        assert_eq!(consumed, 1);
        assert_eq!(produced, 1);
        assert!(!replaced);
        assert_eq!(output[0], 0xFDFF);
    }

    #[test]
    fn test_utf_16be_decode_bom_prefixed_split_byte_pair() {
        let mut decoder = UTF_16BE.new_decoder();
        let mut output = [0u16; 20];

        // First byte alone: consumed, nothing written yet.
        let capacity = decoder.max_utf16_buffer_length(1).unwrap();
        let (status, consumed, produced, replaced) =
            decoder.decode_to_utf16(b"\xFE", &mut output[..capacity], false);
        assert_eq!(status, CoderResult::InputEmpty);
        assert_eq!(consumed, 1);
        assert_eq!(produced, 0);
        assert!(!replaced);

        // Second byte completes the code unit 0xFEFD.
        let capacity = decoder.max_utf16_buffer_length(1).unwrap();
        let (status, consumed, produced, replaced) =
            decoder.decode_to_utf16(b"\xFD", &mut output[..capacity], true);
        assert_eq!(status, CoderResult::InputEmpty);
        assert_eq!(consumed, 1);
        assert_eq!(produced, 1);
        assert!(!replaced);
        assert_eq!(output[0], 0xFEFD);
    }

    #[test]
    fn test_utf_16le_decode_bom_prefix() {
        let mut decoder = UTF_16LE.new_decoder();
        let mut output = [0u16; 20];

        // A lone byte at end of input becomes U+FFFD.
        let capacity = decoder.max_utf16_buffer_length(1).unwrap();
        let (status, consumed, produced, replaced) =
            decoder.decode_to_utf16(b"\xFF", &mut output[..capacity], true);
        assert_eq!(status, CoderResult::InputEmpty);
        assert_eq!(consumed, 1);
        assert_eq!(produced, 1);
        assert!(replaced);
        assert_eq!(output[0], 0xFFFD);
    }

    #[test]
    fn test_utf_16be_decode_bom_prefix() {
        let mut decoder = UTF_16BE.new_decoder();
        let mut output = [0u16; 20];

        // A lone byte at end of input becomes U+FFFD.
        let capacity = decoder.max_utf16_buffer_length(1).unwrap();
        let (status, consumed, produced, replaced) =
            decoder.decode_to_utf16(b"\xFE", &mut output[..capacity], true);
        assert_eq!(status, CoderResult::InputEmpty);
        assert_eq!(consumed, 1);
        assert_eq!(produced, 1);
        assert!(replaced);
        assert_eq!(output[0], 0xFFFD);
    }

    #[test]
    fn test_utf_16le_decode_near_end() {
        let mut decoder = UTF_16LE.new_decoder();
        let mut output = [0u8; 4];

        // Half a code unit: consumed, no output.
        let (status, consumed, produced, replaced) =
            decoder.decode_to_utf8(&[0x03], &mut output[..], false);
        assert_eq!(status, CoderResult::InputEmpty);
        assert_eq!(consumed, 1);
        assert_eq!(produced, 0);
        assert!(!replaced);
        assert_eq!(output[0], 0x0);

        // Completing U+2603 fills three of the four bytes; the one byte
        // left is not accepted for the next code unit.
        let (status, consumed, produced, replaced) =
            decoder.decode_to_utf8(&[0x26, 0x03, 0x26], &mut output[..], false);
        assert_eq!(status, CoderResult::OutputFull);
        assert_eq!(consumed, 1);
        assert_eq!(produced, 3);
        assert!(!replaced);
        assert_eq!(output, [0xE2, 0x98, 0x83, 0x00]);
    }

    #[test]
    fn test_utf_16be_decode_near_end() {
        let mut decoder = UTF_16BE.new_decoder();
        let mut output = [0u8; 4];

        // Half a code unit: consumed, no output.
        let (status, consumed, produced, replaced) =
            decoder.decode_to_utf8(&[0x26], &mut output[..], false);
        assert_eq!(status, CoderResult::InputEmpty);
        assert_eq!(consumed, 1);
        assert_eq!(produced, 0);
        assert!(!replaced);
        assert_eq!(output[0], 0x0);

        // Completing U+2603 fills three of the four bytes; the one byte
        // left is not accepted for the next code unit.
        let (status, consumed, produced, replaced) =
            decoder.decode_to_utf8(&[0x03, 0x26, 0x03], &mut output[..], false);
        assert_eq!(status, CoderResult::OutputFull);
        assert_eq!(consumed, 1);
        assert_eq!(produced, 3);
        assert!(!replaced);
        assert_eq!(output, [0xE2, 0x98, 0x83, 0x00]);
    }
}
473