1 // Copyright 2015-2016 Mozilla Foundation. See the COPYRIGHT
2 // file at the top-level directory of this distribution.
3 //
4 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
5 // https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
6 // <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
7 // option. This file may not be copied, modified, or distributed
8 // except according to those terms.
9 
10 use super::*;
11 use ascii::*;
12 use data::position;
13 use handles::*;
14 use variant::*;
15 
/// Decoder for a single-byte encoding described by a 128-entry lookup table.
///
/// The decode methods look up each byte `b >= 0x80` at index `b - 0x80` and
/// treat a zero entry as an unmapped (malformed) byte.
pub struct SingleByteDecoder {
    /// Code units for the bytes 0x80..=0xFF, indexed by `byte - 0x80`.
    /// An entry of `0` marks a byte without a mapping.
    table: &'static [u16; 128],
}
19 
20 impl SingleByteDecoder {
new(data: &'static [u16; 128]) -> VariantDecoder21     pub fn new(data: &'static [u16; 128]) -> VariantDecoder {
22         VariantDecoder::SingleByte(SingleByteDecoder { table: data })
23     }
24 
max_utf16_buffer_length(&self, byte_length: usize) -> Option<usize>25     pub fn max_utf16_buffer_length(&self, byte_length: usize) -> Option<usize> {
26         Some(byte_length)
27     }
28 
max_utf8_buffer_length_without_replacement(&self, byte_length: usize) -> Option<usize>29     pub fn max_utf8_buffer_length_without_replacement(&self, byte_length: usize) -> Option<usize> {
30         byte_length.checked_mul(3)
31     }
32 
max_utf8_buffer_length(&self, byte_length: usize) -> Option<usize>33     pub fn max_utf8_buffer_length(&self, byte_length: usize) -> Option<usize> {
34         byte_length.checked_mul(3)
35     }
36 
    /// Decodes `src` into UTF-8 in `dst`, stopping at the first unmapped byte.
    ///
    /// Returns `(result, read, written)`, where `read` and `written` come
    /// from the source and destination helpers. The result is
    /// `DecoderResult::Malformed(1, 0)` when a non-ASCII byte has a zero
    /// table entry, `DecoderResult::InputEmpty` when the source is
    /// exhausted, and `DecoderResult::OutputFull` when the destination has
    /// no room for another BMP character. `_last` is ignored.
    ///
    /// The loop nest has three levels: `'outermost` hands control to the
    /// bulk ASCII copy, `'middle` handles one non-ASCII byte, and
    /// `'innermost` handles ASCII bytes that directly follow non-ASCII so
    /// that punctuation does not force a return to the bulk ASCII copy.
    pub fn decode_to_utf8_raw(
        &mut self,
        src: &[u8],
        dst: &mut [u8],
        _last: bool,
    ) -> (DecoderResult, usize, usize) {
        let mut source = ByteSource::new(src);
        let mut dest = Utf8Destination::new(dst);
        'outermost: loop {
            match dest.copy_ascii_from_check_space_bmp(&mut source) {
                CopyAsciiResult::Stop(ret) => return ret,
                CopyAsciiResult::GoOn((mut non_ascii, mut handle)) => 'middle: loop {
                    // Start non-boilerplate
                    //
                    // Since the non-ASCIIness of `non_ascii` is hidden from
                    // the optimizer, it can't figure out that it's OK to
                    // statically omit the bound check when accessing
                    // `[u16; 128]` with an index
                    // `non_ascii as usize - 0x80usize`.
                    //
                    // SAFETY: the index is in 0..128 as long as
                    // `non_ascii >= 0x80`. That holds when it comes from the
                    // `b > 127` branch below; for the value handed out by
                    // `copy_ascii_from_check_space_bmp` it is assumed from
                    // that helper's contract — TODO confirm.
                    let mapped =
                        unsafe { *(self.table.get_unchecked(non_ascii as usize - 0x80usize)) };
                    // let mapped = self.table[non_ascii as usize - 0x80usize];
                    if mapped == 0u16 {
                        // A zero table entry means the byte has no mapping.
                        return (
                            DecoderResult::Malformed(1, 0),
                            source.consumed(),
                            handle.written(),
                        );
                    }
                    let dest_again = handle.write_bmp_excl_ascii(mapped);
                    // End non-boilerplate
                    match source.check_available() {
                        Space::Full(src_consumed) => {
                            return (
                                DecoderResult::InputEmpty,
                                src_consumed,
                                dest_again.written(),
                            );
                        }
                        Space::Available(source_handle) => {
                            match dest_again.check_space_bmp() {
                                Space::Full(dst_written) => {
                                    return (
                                        DecoderResult::OutputFull,
                                        source_handle.consumed(),
                                        dst_written,
                                    );
                                }
                                Space::Available(mut destination_handle) => {
                                    let (mut b, unread_handle) = source_handle.read();
                                    let source_again = unread_handle.commit();
                                    'innermost: loop {
                                        if b > 127 {
                                            // Another non-ASCII byte: handle it
                                            // without going back to the bulk
                                            // ASCII copy.
                                            non_ascii = b;
                                            handle = destination_handle;
                                            continue 'middle;
                                        }
                                        // Testing on Haswell says that we should write the
                                        // byte unconditionally instead of trying to unread it
                                        // to make it part of the next SIMD stride.
                                        let dest_again_again = destination_handle.write_ascii(b);
                                        if b < 60 {
                                            // We've got punctuation
                                            match source_again.check_available() {
                                                Space::Full(src_consumed_again) => {
                                                    return (
                                                        DecoderResult::InputEmpty,
                                                        src_consumed_again,
                                                        dest_again_again.written(),
                                                    );
                                                }
                                                Space::Available(source_handle_again) => {
                                                    match dest_again_again.check_space_bmp() {
                                                        Space::Full(dst_written_again) => {
                                                            return (
                                                                DecoderResult::OutputFull,
                                                                source_handle_again.consumed(),
                                                                dst_written_again,
                                                            );
                                                        }
                                                        Space::Available(
                                                            destination_handle_again,
                                                        ) => {
                                                            let (b_again, _unread_handle_again) =
                                                                source_handle_again.read();
                                                            b = b_again;
                                                            destination_handle =
                                                                destination_handle_again;
                                                            continue 'innermost;
                                                        }
                                                    }
                                                }
                                            }
                                        }
                                        // We've got markup or ASCII text
                                        continue 'outermost;
                                    }
                                }
                            }
                        }
                    }
                },
            }
        }
    }
142 
    /// Decodes `src` into UTF-16 in `dst`, stopping at the first unmapped byte.
    ///
    /// Every input byte produces exactly one code unit, written at the same
    /// index it was read from, so a single counter (`converted`) tracks both
    /// the read and the write position.
    ///
    /// Returns `(result, read, written)`. The result is
    /// `DecoderResult::Malformed(1, 0)` when a non-ASCII byte has a zero
    /// table entry (the offending byte is counted as read but nothing is
    /// written for it). Otherwise it is `DecoderResult::OutputFull` if `dst`
    /// is shorter than `src` and `DecoderResult::InputEmpty` if not, with
    /// both counts equal to the shorter of the two lengths. `_last` is
    /// ignored.
    pub fn decode_to_utf16_raw(
        &mut self,
        src: &[u8],
        dst: &mut [u16],
        _last: bool,
    ) -> (DecoderResult, usize, usize) {
        // `length` is the shorter of the two buffers; `pending` is the result
        // to report once `length` units have been converted.
        let (pending, length) = if dst.len() < src.len() {
            (DecoderResult::OutputFull, dst.len())
        } else {
            (DecoderResult::InputEmpty, src.len())
        };
        let mut converted = 0usize;
        'outermost: loop {
            // SAFETY: `converted <= length <= min(src.len(), dst.len())`, so
            // both pointers stay within (or one past the end of) their
            // buffers and `length - converted` units are available in each.
            // The contract of `ascii_to_basic_latin` itself is defined
            // elsewhere — TODO confirm.
            match unsafe {
                ascii_to_basic_latin(
                    src.as_ptr().add(converted),
                    dst.as_mut_ptr().add(converted),
                    length - converted,
                )
            } {
                None => {
                    return (pending, length, length);
                }
                Some((mut non_ascii, consumed)) => {
                    converted += consumed;
                    'middle: loop {
                        // `converted` doesn't count the reading of `non_ascii` yet.
                        // Since the non-ASCIIness of `non_ascii` is hidden from
                        // the optimizer, it can't figure out that it's OK to
                        // statically omit the bound check when accessing
                        // `[u16; 128]` with an index
                        // `non_ascii as usize - 0x80usize`.
                        //
                        // SAFETY: the index is in 0..128 as long as
                        // `non_ascii >= 0x80`. That holds when it comes from
                        // the `b > 127` branch below; for the value returned
                        // by `ascii_to_basic_latin` it is assumed from that
                        // helper's contract — TODO confirm.
                        let mapped =
                            unsafe { *(self.table.get_unchecked(non_ascii as usize - 0x80usize)) };
                        // let mapped = self.table[non_ascii as usize - 0x80usize];
                        if mapped == 0u16 {
                            return (
                                DecoderResult::Malformed(1, 0),
                                converted + 1, // +1 for `non_ascii`
                                converted,
                            );
                        }
                        unsafe {
                            // The bound check has already been performed
                            *(dst.get_unchecked_mut(converted)) = mapped;
                        }
                        converted += 1;
                        // Next, handle ASCII punctuation and non-ASCII without
                        // going back to ASCII acceleration. Non-ASCII scripts
                        // use ASCII punctuation, so this avoids going to
                        // acceleration just for punctuation/space and then
                        // failing. This is a significant boost to non-ASCII
                        // scripts.
                        // TODO: Split out Latin converters without this part;
                        // this stuff makes Latin script-conversion slower.
                        if converted == length {
                            return (pending, length, length);
                        }
                        // SAFETY: `converted < length <= src.len()` after the
                        // check above.
                        let mut b = unsafe { *(src.get_unchecked(converted)) };
                        'innermost: loop {
                            if b > 127 {
                                non_ascii = b;
                                continue 'middle;
                            }
                            // Testing on Haswell says that we should write the
                            // byte unconditionally instead of trying to unread it
                            // to make it part of the next SIMD stride.
                            //
                            // SAFETY: `converted < length <= dst.len()` here.
                            unsafe {
                                *(dst.get_unchecked_mut(converted)) = u16::from(b);
                            }
                            converted += 1;
                            if b < 60 {
                                // We've got punctuation
                                if converted == length {
                                    return (pending, length, length);
                                }
                                // SAFETY: `converted < length <= src.len()`
                                // after the check above.
                                b = unsafe { *(src.get_unchecked(converted)) };
                                continue 'innermost;
                            }
                            // We've got markup or ASCII text
                            continue 'outermost;
                        }
                    }
                }
            }
        }
    }
230 
latin1_byte_compatible_up_to(&self, buffer: &[u8]) -> usize231     pub fn latin1_byte_compatible_up_to(&self, buffer: &[u8]) -> usize {
232         let mut bytes = buffer;
233         let mut total = 0;
234         loop {
235             if let Some((non_ascii, offset)) = validate_ascii(bytes) {
236                 total += offset;
237                 let mapped = unsafe { *(self.table.get_unchecked(non_ascii as usize - 0x80usize)) };
238                 if mapped != u16::from(non_ascii) {
239                     return total;
240                 }
241                 total += 1;
242                 bytes = &bytes[offset + 1..];
243             } else {
244                 return total;
245             }
246         }
247     }
248 }
249 
/// Encoder for a single-byte encoding described by a 128-entry lookup table
/// plus a "run" of consecutive code units that can be mapped by offset.
pub struct SingleByteEncoder {
    /// Code units for the bytes 0x80..=0xFF, indexed by `byte - 0x80`.
    table: &'static [u16; 128],
    /// Code unit at which the offset-mapped run starts.
    run_bmp_offset: usize,
    /// Table index (byte value minus 0x80) at which the run starts.
    run_byte_offset: usize,
    /// Number of consecutive entries in the run.
    run_length: usize,
}
256 
257 impl SingleByteEncoder {
new( encoding: &'static Encoding, data: &'static [u16; 128], run_bmp_offset: u16, run_byte_offset: u8, run_length: u8, ) -> Encoder258     pub fn new(
259         encoding: &'static Encoding,
260         data: &'static [u16; 128],
261         run_bmp_offset: u16,
262         run_byte_offset: u8,
263         run_length: u8,
264     ) -> Encoder {
265         Encoder::new(
266             encoding,
267             VariantEncoder::SingleByte(SingleByteEncoder {
268                 table: data,
269                 run_bmp_offset: run_bmp_offset as usize,
270                 run_byte_offset: run_byte_offset as usize,
271                 run_length: run_length as usize,
272             }),
273         )
274     }
275 
max_buffer_length_from_utf16_without_replacement( &self, u16_length: usize, ) -> Option<usize>276     pub fn max_buffer_length_from_utf16_without_replacement(
277         &self,
278         u16_length: usize,
279     ) -> Option<usize> {
280         Some(u16_length)
281     }
282 
max_buffer_length_from_utf8_without_replacement( &self, byte_length: usize, ) -> Option<usize>283     pub fn max_buffer_length_from_utf8_without_replacement(
284         &self,
285         byte_length: usize,
286     ) -> Option<usize> {
287         Some(byte_length)
288     }
289 
290     #[inline(always)]
encode_u16(&self, code_unit: u16) -> Option<u8>291     fn encode_u16(&self, code_unit: u16) -> Option<u8> {
292         // First, we see if the code unit falls into a run of consecutive
293         // code units that can be mapped by offset. This is very efficient
294         // for most non-Latin encodings as well as Latin1-ish encodings.
295         //
296         // For encodings that don't fit this pattern, the run (which may
297         // have the length of just one) just establishes the starting point
298         // for the next rule.
299         //
300         // Next, we do a forward linear search in the part of the index
301         // after the run. Even in non-Latin1-ish Latin encodings (except
302         // macintosh), the lower case letters are here.
303         //
304         // Next, we search the third quadrant up to the start of the run
305         // (upper case letters in Latin encodings except macintosh, in
306         // Greek and in KOI encodings) and then the second quadrant,
307         // except if the run stared before the third quadrant, we search
308         // the second quadrant up to the run.
309         //
310         // Last, we search the first quadrant, which has unused controls
311         // or punctuation in most encodings. This is bad for macintosh
312         // and IBM866, but those are rare.
313 
314         // Run of consecutive units
315         let unit_as_usize = code_unit as usize;
316         let offset = unit_as_usize.wrapping_sub(self.run_bmp_offset);
317         if offset < self.run_length {
318             return Some((128 + self.run_byte_offset + offset) as u8);
319         }
320 
321         // Search after the run
322         let tail_start = self.run_byte_offset + self.run_length;
323         if let Some(pos) = position(&self.table[tail_start..], code_unit) {
324             return Some((128 + tail_start + pos) as u8);
325         }
326 
327         if self.run_byte_offset >= 64 {
328             // Search third quadrant before the run
329             if let Some(pos) = position(&self.table[64..self.run_byte_offset], code_unit) {
330                 return Some(((128 + 64) + pos) as u8);
331             }
332 
333             // Search second quadrant
334             if let Some(pos) = position(&self.table[32..64], code_unit) {
335                 return Some(((128 + 32) + pos) as u8);
336             }
337         } else if let Some(pos) = position(&self.table[32..self.run_byte_offset], code_unit) {
338             // windows-1252, windows-874, ISO-8859-15 and ISO-8859-5
339             // Search second quadrant before the run
340             return Some(((128 + 32) + pos) as u8);
341         }
342 
343         // Search first quadrant
344         if let Some(pos) = position(&self.table[..32], code_unit) {
345             return Some((128 + pos) as u8);
346         }
347 
348         None
349     }
350 
    // Instantiates `ascii_compatible_bmp_encoder_function!`, a macro defined
    // elsewhere. Judging by the arguments, this presumably generates
    // `encode_from_utf8_raw`, which reads `str` input through `Utf8Source`
    // — TODO confirm against the macro definition.
    //
    // The braced block is the per-character step supplied to the macro: the
    // BMP code unit `bmp` is mapped with `encode_u16`; on a hit the byte is
    // written via `handle.write_one`, and on a miss the function returns an
    // unmappable result together with the current source and destination
    // positions.
    ascii_compatible_bmp_encoder_function!(
        {
            match self.encode_u16(bmp) {
                Some(byte) => handle.write_one(byte),
                None => {
                    return (
                        EncoderResult::unmappable_from_bmp(bmp),
                        source.consumed(),
                        handle.written(),
                    );
                }
            }
        },
        bmp,
        self,
        source,
        handle,
        copy_ascii_to_check_space_one,
        check_space_one,
        encode_from_utf8_raw,
        str,
        Utf8Source,
        true
    );
375 
encode_from_utf16_raw( &mut self, src: &[u16], dst: &mut [u8], _last: bool, ) -> (EncoderResult, usize, usize)376     pub fn encode_from_utf16_raw(
377         &mut self,
378         src: &[u16],
379         dst: &mut [u8],
380         _last: bool,
381     ) -> (EncoderResult, usize, usize) {
382         let (pending, length) = if dst.len() < src.len() {
383             (EncoderResult::OutputFull, dst.len())
384         } else {
385             (EncoderResult::InputEmpty, src.len())
386         };
387         let mut converted = 0usize;
388         'outermost: loop {
389             match unsafe {
390                 basic_latin_to_ascii(
391                     src.as_ptr().add(converted),
392                     dst.as_mut_ptr().add(converted),
393                     length - converted,
394                 )
395             } {
396                 None => {
397                     return (pending, length, length);
398                 }
399                 Some((mut non_ascii, consumed)) => {
400                     converted += consumed;
401                     'middle: loop {
402                         // `converted` doesn't count the reading of `non_ascii` yet.
403                         match self.encode_u16(non_ascii) {
404                             Some(byte) => {
405                                 unsafe {
406                                     *(dst.get_unchecked_mut(converted)) = byte;
407                                 }
408                                 converted += 1;
409                             }
410                             None => {
411                                 // At this point, we need to know if we
412                                 // have a surrogate.
413                                 let high_bits = non_ascii & 0xFC00u16;
414                                 if high_bits == 0xD800u16 {
415                                     // high surrogate
416                                     if converted + 1 == length {
417                                         // End of buffer. This surrogate is unpaired.
418                                         return (
419                                             EncoderResult::Unmappable('\u{FFFD}'),
420                                             converted + 1, // +1 `for non_ascii`
421                                             converted,
422                                         );
423                                     }
424                                     let second =
425                                         u32::from(unsafe { *src.get_unchecked(converted + 1) });
426                                     if second & 0xFC00u32 != 0xDC00u32 {
427                                         return (
428                                             EncoderResult::Unmappable('\u{FFFD}'),
429                                             converted + 1, // +1 `for non_ascii`
430                                             converted,
431                                         );
432                                     }
433                                     // The next code unit is a low surrogate.
434                                     let astral: char = unsafe {
435                                         ::std::char::from_u32_unchecked(
436                                             (u32::from(non_ascii) << 10) + second
437                                                 - (((0xD800u32 << 10) - 0x1_0000u32) + 0xDC00u32),
438                                         )
439                                     };
440                                     return (
441                                         EncoderResult::Unmappable(astral),
442                                         converted + 2, // +2 `for non_ascii` and `second`
443                                         converted,
444                                     );
445                                 }
446                                 if high_bits == 0xDC00u16 {
447                                     // Unpaired low surrogate
448                                     return (
449                                         EncoderResult::Unmappable('\u{FFFD}'),
450                                         converted + 1, // +1 `for non_ascii`
451                                         converted,
452                                     );
453                                 }
454                                 return (
455                                     EncoderResult::unmappable_from_bmp(non_ascii),
456                                     converted + 1, // +1 `for non_ascii`
457                                     converted,
458                                 );
459                             }
460                         }
461                         // Next, handle ASCII punctuation and non-ASCII without
462                         // going back to ASCII acceleration. Non-ASCII scripts
463                         // use ASCII punctuation, so this avoid going to
464                         // acceleration just for punctuation/space and then
465                         // failing. This is a significant boost to non-ASCII
466                         // scripts.
467                         // TODO: Split out Latin converters without this part
468                         // this stuff makes Latin script-conversion slower.
469                         if converted == length {
470                             return (pending, length, length);
471                         }
472                         let mut unit = unsafe { *(src.get_unchecked(converted)) };
473                         'innermost: loop {
474                             if unit > 127 {
475                                 non_ascii = unit;
476                                 continue 'middle;
477                             }
478                             // Testing on Haswell says that we should write the
479                             // byte unconditionally instead of trying to unread it
480                             // to make it part of the next SIMD stride.
481                             unsafe {
482                                 *(dst.get_unchecked_mut(converted)) = unit as u8;
483                             }
484                             converted += 1;
485                             if unit < 60 {
486                                 // We've got punctuation
487                                 if converted == length {
488                                     return (pending, length, length);
489                                 }
490                                 unit = unsafe { *(src.get_unchecked(converted)) };
491                                 continue 'innermost;
492                             }
493                             // We've got markup or ASCII text
494                             continue 'outermost;
495                         }
496                     }
497                 }
498             }
499         }
500     }
501 }
502 
503 // Any copyright to the test code below this comment is dedicated to the
504 // Public Domain. http://creativecommons.org/publicdomain/zero/1.0/
505 
506 #[cfg(test)]
507 mod tests {
508     use super::super::testing::*;
509     use super::super::*;
510 
511     #[test]
test_windows_1255_ca()512     fn test_windows_1255_ca() {
513         decode(WINDOWS_1255, b"\xCA", "\u{05BA}");
514         encode(WINDOWS_1255, "\u{05BA}", b"\xCA");
515     }
516 
517     #[test]
test_ascii_punctuation()518     fn test_ascii_punctuation() {
519         let bytes = b"\xC1\xF5\xF4\xFC \xE5\xDF\xED\xE1\xE9 \xDD\xED\xE1 \xF4\xE5\xF3\xF4. \xC1\xF5\xF4\xFC \xE5\xDF\xED\xE1\xE9 \xDD\xED\xE1 \xF4\xE5\xF3\xF4.";
520         let characters = "\u{0391}\u{03C5}\u{03C4}\u{03CC} \
521                           \u{03B5}\u{03AF}\u{03BD}\u{03B1}\u{03B9} \u{03AD}\u{03BD}\u{03B1} \
522                           \u{03C4}\u{03B5}\u{03C3}\u{03C4}. \u{0391}\u{03C5}\u{03C4}\u{03CC} \
523                           \u{03B5}\u{03AF}\u{03BD}\u{03B1}\u{03B9} \u{03AD}\u{03BD}\u{03B1} \
524                           \u{03C4}\u{03B5}\u{03C3}\u{03C4}.";
525         decode(WINDOWS_1253, bytes, characters);
526         encode(WINDOWS_1253, characters, bytes);
527     }
528 
529     #[test]
test_decode_malformed()530     fn test_decode_malformed() {
531         decode(
532             WINDOWS_1253,
533             b"\xC1\xF5\xD2\xF4\xFC",
534             "\u{0391}\u{03C5}\u{FFFD}\u{03C4}\u{03CC}",
535         );
536     }
537 
538     #[test]
test_encode_unmappables()539     fn test_encode_unmappables() {
540         encode(
541             WINDOWS_1253,
542             "\u{0391}\u{03C5}\u{2603}\u{03C4}\u{03CC}",
543             b"\xC1\xF5&#9731;\xF4\xFC",
544         );
545         encode(
546             WINDOWS_1253,
547             "\u{0391}\u{03C5}\u{1F4A9}\u{03C4}\u{03CC}",
548             b"\xC1\xF5&#128169;\xF4\xFC",
549         );
550     }
551 
552     #[test]
test_encode_unpaired_surrogates()553     fn test_encode_unpaired_surrogates() {
554         encode_from_utf16(
555             WINDOWS_1253,
556             &[0x0391u16, 0x03C5u16, 0xDCA9u16, 0x03C4u16, 0x03CCu16],
557             b"\xC1\xF5&#65533;\xF4\xFC",
558         );
559         encode_from_utf16(
560             WINDOWS_1253,
561             &[0x0391u16, 0x03C5u16, 0xD83Du16, 0x03C4u16, 0x03CCu16],
562             b"\xC1\xF5&#65533;\xF4\xFC",
563         );
564         encode_from_utf16(
565             WINDOWS_1253,
566             &[0x0391u16, 0x03C5u16, 0x03C4u16, 0x03CCu16, 0xD83Du16],
567             b"\xC1\xF5\xF4\xFC&#65533;",
568         );
569     }
570 
    /// The 128 byte values 0x80..=0xFF in ascending order, so that
    /// `HIGH_BYTES[i] == 0x80 + i`. Used as the byte side of the table-driven
    /// round-trip helpers below.
    pub const HIGH_BYTES: &'static [u8; 128] = &[
        0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x8A, 0x8B, 0x8C, 0x8D, 0x8E,
        0x8F, 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0x9A, 0x9B, 0x9C, 0x9D,
        0x9E, 0x9F, 0xA0, 0xA1, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7, 0xA8, 0xA9, 0xAA, 0xAB, 0xAC,
        0xAD, 0xAE, 0xAF, 0xB0, 0xB1, 0xB2, 0xB3, 0xB4, 0xB5, 0xB6, 0xB7, 0xB8, 0xB9, 0xBA, 0xBB,
        0xBC, 0xBD, 0xBE, 0xBF, 0xC0, 0xC1, 0xC2, 0xC3, 0xC4, 0xC5, 0xC6, 0xC7, 0xC8, 0xC9, 0xCA,
        0xCB, 0xCC, 0xCD, 0xCE, 0xCF, 0xD0, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6, 0xD7, 0xD8, 0xD9,
        0xDA, 0xDB, 0xDC, 0xDD, 0xDE, 0xDF, 0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7, 0xE8,
        0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF, 0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7,
        0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0xFF,
    ];
582 
decode_single_byte(encoding: &'static Encoding, data: &'static [u16; 128])583     fn decode_single_byte(encoding: &'static Encoding, data: &'static [u16; 128]) {
584         let mut with_replacement = [0u16; 128];
585         let mut it = data.iter().enumerate();
586         loop {
587             match it.next() {
588                 Some((i, code_point)) => {
589                     if *code_point == 0 {
590                         with_replacement[i] = 0xFFFD;
591                     } else {
592                         with_replacement[i] = *code_point;
593                     }
594                 }
595                 None => {
596                     break;
597                 }
598             }
599         }
600 
601         decode_to_utf16(encoding, HIGH_BYTES, &with_replacement[..]);
602     }
603 
encode_single_byte(encoding: &'static Encoding, data: &'static [u16; 128])604     fn encode_single_byte(encoding: &'static Encoding, data: &'static [u16; 128]) {
605         let mut with_zeros = [0u8; 128];
606         let mut it = data.iter().enumerate();
607         loop {
608             match it.next() {
609                 Some((i, code_point)) => {
610                     if *code_point == 0 {
611                         with_zeros[i] = 0;
612                     } else {
613                         with_zeros[i] = HIGH_BYTES[i];
614                     }
615                 }
616                 None => {
617                     break;
618                 }
619             }
620         }
621 
622         encode_from_utf16(encoding, data, &with_zeros[..]);
623     }
624 
625     #[test]
test_single_byte_from_two_low_surrogates()626     fn test_single_byte_from_two_low_surrogates() {
627         let expectation = b"&#65533;&#65533;";
628         let mut output = [0u8; 40];
629         let mut encoder = WINDOWS_1253.new_encoder();
630         let (result, read, written, had_errors) =
631             encoder.encode_from_utf16(&[0xDC00u16, 0xDEDEu16], &mut output[..], true);
632         assert_eq!(result, CoderResult::InputEmpty);
633         assert_eq!(read, 2);
634         assert_eq!(written, expectation.len());
635         assert!(had_errors);
636         assert_eq!(&output[..written], expectation);
637     }
638 
639     // These tests are so self-referential that they are pretty useless.
640 
641     // BEGIN GENERATED CODE. PLEASE DO NOT EDIT.
642     // Instead, please regenerate using generate-encoding-data.py
643 
    // Runs `decode_single_byte` for each listed single-byte encoding,
    // pairing the encoding with its table in `data::SINGLE_BYTE_DATA`.
    #[test]
    fn test_single_byte_decode() {
        decode_single_byte(IBM866, &data::SINGLE_BYTE_DATA.ibm866);
        decode_single_byte(ISO_8859_10, &data::SINGLE_BYTE_DATA.iso_8859_10);
        decode_single_byte(ISO_8859_13, &data::SINGLE_BYTE_DATA.iso_8859_13);
        decode_single_byte(ISO_8859_14, &data::SINGLE_BYTE_DATA.iso_8859_14);
        decode_single_byte(ISO_8859_15, &data::SINGLE_BYTE_DATA.iso_8859_15);
        decode_single_byte(ISO_8859_16, &data::SINGLE_BYTE_DATA.iso_8859_16);
        decode_single_byte(ISO_8859_2, &data::SINGLE_BYTE_DATA.iso_8859_2);
        decode_single_byte(ISO_8859_3, &data::SINGLE_BYTE_DATA.iso_8859_3);
        decode_single_byte(ISO_8859_4, &data::SINGLE_BYTE_DATA.iso_8859_4);
        decode_single_byte(ISO_8859_5, &data::SINGLE_BYTE_DATA.iso_8859_5);
        decode_single_byte(ISO_8859_6, &data::SINGLE_BYTE_DATA.iso_8859_6);
        decode_single_byte(ISO_8859_7, &data::SINGLE_BYTE_DATA.iso_8859_7);
        decode_single_byte(ISO_8859_8, &data::SINGLE_BYTE_DATA.iso_8859_8);
        decode_single_byte(KOI8_R, &data::SINGLE_BYTE_DATA.koi8_r);
        decode_single_byte(KOI8_U, &data::SINGLE_BYTE_DATA.koi8_u);
        decode_single_byte(MACINTOSH, &data::SINGLE_BYTE_DATA.macintosh);
        decode_single_byte(WINDOWS_1250, &data::SINGLE_BYTE_DATA.windows_1250);
        decode_single_byte(WINDOWS_1251, &data::SINGLE_BYTE_DATA.windows_1251);
        decode_single_byte(WINDOWS_1252, &data::SINGLE_BYTE_DATA.windows_1252);
        decode_single_byte(WINDOWS_1253, &data::SINGLE_BYTE_DATA.windows_1253);
        decode_single_byte(WINDOWS_1254, &data::SINGLE_BYTE_DATA.windows_1254);
        decode_single_byte(WINDOWS_1255, &data::SINGLE_BYTE_DATA.windows_1255);
        decode_single_byte(WINDOWS_1256, &data::SINGLE_BYTE_DATA.windows_1256);
        decode_single_byte(WINDOWS_1257, &data::SINGLE_BYTE_DATA.windows_1257);
        decode_single_byte(WINDOWS_1258, &data::SINGLE_BYTE_DATA.windows_1258);
        decode_single_byte(WINDOWS_874, &data::SINGLE_BYTE_DATA.windows_874);
        decode_single_byte(X_MAC_CYRILLIC, &data::SINGLE_BYTE_DATA.x_mac_cyrillic);
    }
674 
    // Runs the `encode_single_byte` helper once for each of the 27 single-byte
    // encodings listed below, pairing every encoding constant with its table
    // from `data::SINGLE_BYTE_DATA`.
    //
    // This list lives in the generated section of the file: to add or remove
    // an encoding, regenerate with generate-encoding-data.py instead of
    // editing the calls by hand.
    #[test]
    fn test_single_byte_encode() {
        encode_single_byte(IBM866, &data::SINGLE_BYTE_DATA.ibm866);
        encode_single_byte(ISO_8859_10, &data::SINGLE_BYTE_DATA.iso_8859_10);
        encode_single_byte(ISO_8859_13, &data::SINGLE_BYTE_DATA.iso_8859_13);
        encode_single_byte(ISO_8859_14, &data::SINGLE_BYTE_DATA.iso_8859_14);
        encode_single_byte(ISO_8859_15, &data::SINGLE_BYTE_DATA.iso_8859_15);
        encode_single_byte(ISO_8859_16, &data::SINGLE_BYTE_DATA.iso_8859_16);
        encode_single_byte(ISO_8859_2, &data::SINGLE_BYTE_DATA.iso_8859_2);
        encode_single_byte(ISO_8859_3, &data::SINGLE_BYTE_DATA.iso_8859_3);
        encode_single_byte(ISO_8859_4, &data::SINGLE_BYTE_DATA.iso_8859_4);
        encode_single_byte(ISO_8859_5, &data::SINGLE_BYTE_DATA.iso_8859_5);
        encode_single_byte(ISO_8859_6, &data::SINGLE_BYTE_DATA.iso_8859_6);
        encode_single_byte(ISO_8859_7, &data::SINGLE_BYTE_DATA.iso_8859_7);
        encode_single_byte(ISO_8859_8, &data::SINGLE_BYTE_DATA.iso_8859_8);
        encode_single_byte(KOI8_R, &data::SINGLE_BYTE_DATA.koi8_r);
        encode_single_byte(KOI8_U, &data::SINGLE_BYTE_DATA.koi8_u);
        encode_single_byte(MACINTOSH, &data::SINGLE_BYTE_DATA.macintosh);
        encode_single_byte(WINDOWS_1250, &data::SINGLE_BYTE_DATA.windows_1250);
        encode_single_byte(WINDOWS_1251, &data::SINGLE_BYTE_DATA.windows_1251);
        encode_single_byte(WINDOWS_1252, &data::SINGLE_BYTE_DATA.windows_1252);
        encode_single_byte(WINDOWS_1253, &data::SINGLE_BYTE_DATA.windows_1253);
        encode_single_byte(WINDOWS_1254, &data::SINGLE_BYTE_DATA.windows_1254);
        encode_single_byte(WINDOWS_1255, &data::SINGLE_BYTE_DATA.windows_1255);
        encode_single_byte(WINDOWS_1256, &data::SINGLE_BYTE_DATA.windows_1256);
        encode_single_byte(WINDOWS_1257, &data::SINGLE_BYTE_DATA.windows_1257);
        encode_single_byte(WINDOWS_1258, &data::SINGLE_BYTE_DATA.windows_1258);
        encode_single_byte(WINDOWS_874, &data::SINGLE_BYTE_DATA.windows_874);
        encode_single_byte(X_MAC_CYRILLIC, &data::SINGLE_BYTE_DATA.x_mac_cyrillic);
    }
705     // END GENERATED CODE
706 }
707