1 // Copyright 2015-2016 Mozilla Foundation. See the COPYRIGHT
2 // file at the top-level directory of this distribution.
3 //
4 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
5 // https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
6 // <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
7 // option. This file may not be copied, modified, or distributed
8 // except according to those terms.
9 
10 use super::*;
11 use ascii::ascii_to_basic_latin;
12 use ascii::basic_latin_to_ascii;
13 use ascii::validate_ascii;
14 use handles::*;
15 use mem::convert_utf16_to_utf8_partial;
16 use variant::*;
17 
// Branch-prediction hints used by the hot loops in this file.
//
// With the `simd-accel` feature the `likely`/`unlikely` intrinsics are
// imported from `std::intrinsics`. Otherwise identity functions with the same
// names and the same `unsafe` signature are defined, so every call site is
// written as `unsafe { likely(..) }` regardless of configuration.
cfg_if! {
    if #[cfg(feature = "simd-accel")] {
        use ::std::intrinsics::unlikely;
        use ::std::intrinsics::likely;
    } else {
        // Fallback: returns its argument unchanged (no hint is given).
        #[inline(always)]
        // Unsafe to match the intrinsic, which is needlessly unsafe.
        unsafe fn unlikely(b: bool) -> bool {
            b
        }
        // Fallback: returns its argument unchanged (no hint is given).
        #[inline(always)]
        // Unsafe to match the intrinsic, which is needlessly unsafe.
        unsafe fn likely(b: bool) -> bool {
            b
        }
    }
}
35 
/// Lookup table used by the UTF-8 validation and conversion code in this
/// file to check the byte that follows a multi-byte lead byte.
///
/// The code in this file indexes `table` in two ways:
///
/// * `table[second]`, where `second` is the byte after the lead byte
///   (indices `0..=0xFF`);
/// * `table[lead + 0x80]`, where `lead` is a non-ASCII lead byte
///   (indices `0x100..=0x17F`).
///
/// The two entries are ANDed together and the result is combined with the top
/// bits of the remaining trail bytes in a single comparison, so one branch
/// decides whether a whole three- or four-byte sequence is acceptable.
#[repr(align(64))] // Align to cache lines
pub struct Utf8Data {
    /// 256 entries indexed by second byte followed by 128 entries indexed by
    /// `lead byte - 0x80` (384 entries in total).
    pub table: [u8; 384],
}
40 
41 // BEGIN GENERATED CODE. PLEASE DO NOT EDIT.
42 // Instead, please regenerate using generate-encoding-data.py
43 
/// The table instance consulted by the UTF-8 routines in this file.
///
/// Callers read `table[second_byte]` and `table[lead_byte + 0x80]` and AND
/// the two values together.
///
/// NOTE(review): this comment sits inside the generated region; mirror it in
/// generate-encoding-data.py if it should survive regeneration.
pub static UTF8_DATA: Utf8Data = Utf8Data {
    table: [
        252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252,
        252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252,
        252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252,
        252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252,
        252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252,
        252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252,
        252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252,
        252, 252, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 148, 148, 148,
        148, 148, 148, 148, 148, 148, 148, 148, 148, 148, 148, 148, 148, 164, 164, 164, 164, 164,
        164, 164, 164, 164, 164, 164, 164, 164, 164, 164, 164, 164, 164, 164, 164, 164, 164, 164,
        164, 164, 164, 164, 164, 164, 164, 164, 164, 252, 252, 252, 252, 252, 252, 252, 252, 252,
        252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252,
        252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252,
        252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252,
        252, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
        4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
        4, 4, 4, 4, 4, 4, 4, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
        8, 8, 8, 8, 8, 8, 8, 16, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 32, 8, 8, 64, 8, 8, 8, 128, 4,
        4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
    ],
};
67 
68 // END GENERATED CODE
69 
/// Returns the length, in bytes, of the longest prefix of `src` that is
/// valid UTF-8.
///
/// The result is `src.len()` when the whole slice is valid. Otherwise it is
/// the index of the lead byte of the first sequence that is malformed or cut
/// off by the end of the slice: `read` is only advanced past a sequence after
/// all of its bytes have been checked.
///
/// Structure:
///
/// * `'outer` skips runs of ASCII via `validate_ascii`.
/// * `'inner` is the fast path, taken while at least four bytes remain, so
///   the trail bytes of any sequence can be read without bounds checks.
/// * `'tail` handles the last (at most three) bytes with checked indexing.
pub fn utf8_valid_up_to(src: &[u8]) -> usize {
    let mut read = 0;
    'outer: loop {
        // `validate_ascii` is defined elsewhere. As used here: `None` means
        // the rest of the input needs no further checking, and
        // `Some((non_ascii, consumed))` yields the next byte to examine plus
        // the number of bytes before it that were accepted.
        let mut byte = {
            let src_remaining = &src[read..];
            match validate_ascii(src_remaining) {
                None => {
                    return src.len();
                }
                Some((non_ascii, consumed)) => {
                    read += consumed;
                    non_ascii
                }
            }
        };
        // Check for the longest sequence to avoid checking twice for the
        // multi-byte sequences. This can't overflow with 64-bit address space,
        // because full 64 bits aren't in use. In the 32-bit PAE case, for this
        // to overflow would mean that the source slice would be so large that
        // the address space of the process would not have space for any code.
        // Therefore, the slice cannot be so long that this would overflow.
        if unsafe { likely(read + 4 <= src.len()) } {
            'inner: loop {
                // At this point, `byte` is not included in `read`, because we
                // don't yet know that the UTF-8 sequence is valid.
                // Inspecting the lead byte directly is faster than what the
                // std lib does!
                //
                // SAFETY (whole loop): every path that reaches the top of
                // `'inner` or `'three` has just established
                // `read + 4 <= src.len()`, so `read + 1 ..= read + 3` are in
                // bounds. `byte as usize + 0x80` is at most 0x17F, which is
                // below the table length of 384.
                if unsafe { likely(in_inclusive_range8(byte, 0xC2, 0xDF)) } {
                    // Two-byte
                    let second = unsafe { *(src.get_unchecked(read + 1)) };
                    if !in_inclusive_range8(second, 0x80, 0xBF) {
                        break 'outer;
                    }
                    read += 2;

                    // Next lead (manually inlined)
                    if unsafe { likely(read + 4 <= src.len()) } {
                        byte = unsafe { *(src.get_unchecked(read)) };
                        if byte < 0x80 {
                            // Back to the ASCII run scanner.
                            read += 1;
                            continue 'outer;
                        }
                        continue 'inner;
                    }
                    break 'inner;
                }
                if unsafe { likely(byte < 0xF0) } {
                    // Also reached by bytes 0x80..=0xC1, which are not valid
                    // lead bytes; the table check below is relied on to
                    // reject them.
                    'three: loop {
                        // Three-byte
                        let second = unsafe { *(src.get_unchecked(read + 1)) };
                        let third = unsafe { *(src.get_unchecked(read + 2)) };
                        // Single comparison: the masked table lookup must
                        // contribute nothing and the top two bits of `third`
                        // must be `10`.
                        if ((UTF8_DATA.table[usize::from(second)]
                            & unsafe { *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80)) })
                            | (third >> 6))
                            != 2
                        {
                            break 'outer;
                        }
                        read += 3;

                        // Next lead (manually inlined)
                        if unsafe { likely(read + 4 <= src.len()) } {
                            byte = unsafe { *(src.get_unchecked(read)) };
                            // Stay in the three-byte loop for consecutive
                            // three-byte sequences.
                            if in_inclusive_range8(byte, 0xE0, 0xEF) {
                                continue 'three;
                            }
                            if unsafe { likely(byte < 0x80) } {
                                read += 1;
                                continue 'outer;
                            }
                            continue 'inner;
                        }
                        break 'inner;
                    }
                }
                // Four-byte
                let second = unsafe { *(src.get_unchecked(read + 1)) };
                let third = unsafe { *(src.get_unchecked(read + 2)) };
                let fourth = unsafe { *(src.get_unchecked(read + 3)) };
                // As above, widened to 16 bits so that the top two bits of
                // `fourth` can be checked in the same comparison: 0x202 means
                // both `third` and `fourth` start with `10` and the masked
                // table lookup contributed nothing.
                if (u16::from(
                    UTF8_DATA.table[usize::from(second)]
                        & unsafe { *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80)) },
                ) | u16::from(third >> 6)
                    | (u16::from(fourth & 0xC0) << 2))
                    != 0x202
                {
                    break 'outer;
                }
                read += 4;

                // Next lead
                if unsafe { likely(read + 4 <= src.len()) } {
                    byte = unsafe { *(src.get_unchecked(read)) };
                    if byte < 0x80 {
                        read += 1;
                        continue 'outer;
                    }
                    continue 'inner;
                }
                break 'inner;
            }
        }
        // We can't have a complete 4-byte sequence, but we could still have
        // one to three shorter sequences.
        'tail: loop {
            // >= is better for bound check elision than ==
            if read >= src.len() {
                break 'outer;
            }
            byte = src[read];
            // At this point, `byte` is not included in `read`, because we
            // don't yet know that the UTF-8 sequence is valid.
            // Inspecting the lead byte directly is faster than what the
            // std lib does!
            if byte < 0x80 {
                read += 1;
                continue 'tail;
            }
            if in_inclusive_range8(byte, 0xC2, 0xDF) {
                // Two-byte
                let new_read = read + 2;
                if new_read > src.len() {
                    // Sequence is cut off by the end of the input.
                    break 'outer;
                }
                let second = src[read + 1];
                if !in_inclusive_range8(second, 0x80, 0xBF) {
                    break 'outer;
                }
                read += 2;
                continue 'tail;
            }
            // Lead bytes 0xF0 and above must be excluded from the three-byte
            // check below: fewer than four bytes remain here, so a four-byte
            // sequence cannot be complete and such bytes fall through to the
            // final `break`.
            if byte < 0xF0 {
                // Three-byte
                let new_read = read + 3;
                if new_read > src.len() {
                    // Sequence is cut off by the end of the input.
                    break 'outer;
                }
                let second = src[read + 1];
                let third = src[read + 2];
                // SAFETY: `byte as usize + 0x80` is at most 0x17F, which is
                // below the table length of 384.
                if ((UTF8_DATA.table[usize::from(second)]
                    & unsafe { *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80)) })
                    | (third >> 6))
                    != 2
                {
                    break 'outer;
                }
                read += 3;
                // `'tail` handles sequences shorter than 4, so
                // there can't be another sequence after this one.
                break 'outer;
            }
            break 'outer;
        }
    }
    read
}
230 
/// Converts UTF-8 in `src` to UTF-16 in `dst`, stopping at the first
/// malformed or incomplete sequence.
///
/// Returns `(read, written)`: the number of bytes consumed from `src` and the
/// number of UTF-16 code units stored in `dst`. Conversion stops when
///
/// * `src` is exhausted,
/// * `dst` is full, or has only one free code unit left when a four-byte
///   sequence (which needs a surrogate pair) comes next in the fast path, or
/// * a sequence is malformed or cut off by the end of `src`; `read` then
///   points at the lead byte of that sequence.
///
/// Structure:
///
/// * `'outer` copies runs of ASCII via `ascii_to_basic_latin`.
/// * `'inner` is the fast path, taken while at least four source bytes
///   remain, so trail bytes can be read without bounds checks.
/// * `'tail` handles the last (at most three) bytes with checked indexing.
#[cfg_attr(feature = "cargo-clippy", allow(never_loop, cyclomatic_complexity))]
pub fn convert_utf8_to_utf16_up_to_invalid(src: &[u8], dst: &mut [u16]) -> (usize, usize) {
    let mut read = 0;
    let mut written = 0;
    'outer: loop {
        // `ascii_to_basic_latin` is defined elsewhere. As used here: `None`
        // means all `length` units were handled, and
        // `Some((non_ascii, consumed))` yields the next byte to examine plus
        // the number of units handled before it.
        let mut byte = {
            let src_remaining = &src[read..];
            let dst_remaining = &mut dst[written..];
            // Never hand the helper more units than either buffer can hold.
            let length = ::std::cmp::min(src_remaining.len(), dst_remaining.len());
            match unsafe {
                ascii_to_basic_latin(src_remaining.as_ptr(), dst_remaining.as_mut_ptr(), length)
            } {
                None => {
                    read += length;
                    written += length;
                    break 'outer;
                }
                Some((non_ascii, consumed)) => {
                    read += consumed;
                    written += consumed;
                    non_ascii
                }
            }
        };
        // Check for the longest sequence to avoid checking twice for the
        // multi-byte sequences. This can't overflow with 64-bit address space,
        // because full 64 bits aren't in use. In the 32-bit PAE case, for this
        // to overflow would mean that the source slice would be so large that
        // the address space of the process would not have space for any code.
        // Therefore, the slice cannot be so long that this would overflow.
        if unsafe { likely(read + 4 <= src.len()) } {
            'inner: loop {
                // At this point, `byte` is not included in `read`, because we
                // don't yet know that a) the UTF-8 sequence is valid and b) that there
                // is output space if it is an astral sequence.
                // We know, thanks to `ascii_to_basic_latin` that there is output
                // space for at least one UTF-16 code unit, so no need to check
                // for output space in the BMP cases.
                // Inspecting the lead byte directly is faster than what the
                // std lib does!
                //
                // SAFETY (whole loop): every path that reaches the top of
                // `'inner` or `'three` has just established
                // `read + 4 <= src.len()`, so `read + 1 ..= read + 3` are in
                // bounds of `src`. After every write the code checks
                // `written == dst.len()` before looping, so `written` indexes
                // a free slot in `dst` (on first entry this relies on the
                // `ascii_to_basic_latin` guarantee stated above).
                // `byte as usize + 0x80` is at most 0x17F, which is below the
                // table length of 384.
                if unsafe { likely(in_inclusive_range8(byte, 0xC2, 0xDF)) } {
                    // Two-byte
                    let second = unsafe { *(src.get_unchecked(read + 1)) };
                    if !in_inclusive_range8(second, 0x80, 0xBF) {
                        break 'outer;
                    }
                    // 5 payload bits from the lead, 6 from the trail byte.
                    unsafe {
                        *(dst.get_unchecked_mut(written)) =
                            ((u16::from(byte) & 0x1F) << 6) | (u16::from(second) & 0x3F)
                    };
                    read += 2;
                    written += 1;

                    // Next lead (manually inlined)
                    if written == dst.len() {
                        break 'outer;
                    }
                    if unsafe { likely(read + 4 <= src.len()) } {
                        byte = unsafe { *(src.get_unchecked(read)) };
                        if byte < 0x80 {
                            // Emit the ASCII byte, then return to the ASCII
                            // run copier.
                            unsafe { *(dst.get_unchecked_mut(written)) = u16::from(byte) };
                            read += 1;
                            written += 1;
                            continue 'outer;
                        }
                        continue 'inner;
                    }
                    break 'inner;
                }
                if unsafe { likely(byte < 0xF0) } {
                    // Also reached by bytes 0x80..=0xC1, which are not valid
                    // lead bytes; the table check below is relied on to
                    // reject them.
                    'three: loop {
                        // Three-byte
                        let second = unsafe { *(src.get_unchecked(read + 1)) };
                        let third = unsafe { *(src.get_unchecked(read + 2)) };
                        // Single comparison: the masked table lookup must
                        // contribute nothing and the top two bits of `third`
                        // must be `10`.
                        if ((UTF8_DATA.table[usize::from(second)]
                            & unsafe { *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80)) })
                            | (third >> 6))
                            != 2
                        {
                            break 'outer;
                        }
                        // 4 payload bits from the lead, 6 from each trail byte.
                        let point = ((u16::from(byte) & 0xF) << 12)
                            | ((u16::from(second) & 0x3F) << 6)
                            | (u16::from(third) & 0x3F);
                        unsafe { *(dst.get_unchecked_mut(written)) = point };
                        read += 3;
                        written += 1;

                        // Next lead (manually inlined)
                        if written == dst.len() {
                            break 'outer;
                        }
                        if unsafe { likely(read + 4 <= src.len()) } {
                            byte = unsafe { *(src.get_unchecked(read)) };
                            // Stay in the three-byte loop for consecutive
                            // three-byte sequences.
                            if in_inclusive_range8(byte, 0xE0, 0xEF) {
                                continue 'three;
                            }
                            if unsafe { likely(byte < 0x80) } {
                                unsafe { *(dst.get_unchecked_mut(written)) = u16::from(byte) };
                                read += 1;
                                written += 1;
                                continue 'outer;
                            }
                            continue 'inner;
                        }
                        break 'inner;
                    }
                }
                // Four-byte
                // A surrogate pair needs two code units; stop if only one
                // slot is left.
                if written + 1 == dst.len() {
                    break 'outer;
                }
                let second = unsafe { *(src.get_unchecked(read + 1)) };
                let third = unsafe { *(src.get_unchecked(read + 2)) };
                let fourth = unsafe { *(src.get_unchecked(read + 3)) };
                // Widened to 16 bits so that the top two bits of `fourth` can
                // be checked in the same comparison: 0x202 means both `third`
                // and `fourth` start with `10` and the masked table lookup
                // contributed nothing.
                if (u16::from(
                    UTF8_DATA.table[usize::from(second)]
                        & unsafe { *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80)) },
                ) | u16::from(third >> 6)
                    | (u16::from(fourth & 0xC0) << 2))
                    != 0x202
                {
                    break 'outer;
                }
                // 3 payload bits from the lead, 6 from each trail byte.
                let point = ((u32::from(byte) & 0x7) << 18)
                    | ((u32::from(second) & 0x3F) << 12)
                    | ((u32::from(third) & 0x3F) << 6)
                    | (u32::from(fourth) & 0x3F);
                // High surrogate: 0xD7C0 is 0xD800 - (0x10000 >> 10), which
                // folds the subtraction of 0x10000 into the constant.
                unsafe { *(dst.get_unchecked_mut(written)) = (0xD7C0 + (point >> 10)) as u16 };
                // Low surrogate: low ten bits of the code point.
                unsafe {
                    *(dst.get_unchecked_mut(written + 1)) = (0xDC00 + (point & 0x3FF)) as u16
                };
                read += 4;
                written += 2;

                // Next lead
                if written == dst.len() {
                    break 'outer;
                }
                if unsafe { likely(read + 4 <= src.len()) } {
                    byte = unsafe { *(src.get_unchecked(read)) };
                    if byte < 0x80 {
                        unsafe { *(dst.get_unchecked_mut(written)) = u16::from(byte) };
                        read += 1;
                        written += 1;
                        continue 'outer;
                    }
                    continue 'inner;
                }
                break 'inner;
            }
        }
        // We can't have a complete 4-byte sequence, but we could still have
        // one to three shorter sequences.
        'tail: loop {
            // >= is better for bound check elision than ==
            if read >= src.len() || written >= dst.len() {
                break 'outer;
            }
            byte = src[read];
            // At this point, `byte` is not included in `read`, because we
            // don't yet know that the UTF-8 sequence is valid. (Only BMP
            // sequences are handled here, so one free output unit, checked
            // above, is enough.)
            // Inspecting the lead byte directly is faster than what the
            // std lib does!
            if byte < 0x80 {
                dst[written] = u16::from(byte);
                read += 1;
                written += 1;
                continue 'tail;
            }
            if in_inclusive_range8(byte, 0xC2, 0xDF) {
                // Two-byte
                let new_read = read + 2;
                if new_read > src.len() {
                    // Sequence is cut off by the end of the input.
                    break 'outer;
                }
                let second = src[read + 1];
                if !in_inclusive_range8(second, 0x80, 0xBF) {
                    break 'outer;
                }
                dst[written] = ((u16::from(byte) & 0x1F) << 6) | (u16::from(second) & 0x3F);
                read += 2;
                written += 1;
                continue 'tail;
            }
            // Lead bytes 0xF0 and above must be excluded from the three-byte
            // check below: fewer than four bytes remain here, so a four-byte
            // sequence cannot be complete and such bytes fall through to the
            // final `break`.
            if byte < 0xF0 {
                // Three-byte
                let new_read = read + 3;
                if new_read > src.len() {
                    // Sequence is cut off by the end of the input.
                    break 'outer;
                }
                let second = src[read + 1];
                let third = src[read + 2];
                // SAFETY: `byte as usize + 0x80` is at most 0x17F, which is
                // below the table length of 384.
                if ((UTF8_DATA.table[usize::from(second)]
                    & unsafe { *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80)) })
                    | (third >> 6))
                    != 2
                {
                    break 'outer;
                }
                let point = ((u16::from(byte) & 0xF) << 12)
                    | ((u16::from(second) & 0x3F) << 6)
                    | (u16::from(third) & 0x3F);
                dst[written] = point;
                read += 3;
                written += 1;
                // `'tail` handles sequences shorter than 4, so
                // there can't be another sequence after this one.
                break 'outer;
            }
            break 'outer;
        }
    }
    (read, written)
}
449 
/// State of the streaming UTF-8 decoder: what has been seen so far of a
/// multi-byte sequence that may be split across input buffers.
///
/// The decoder is in its neutral state (no pending sequence) when
/// `bytes_needed` is zero.
pub struct Utf8Decoder {
    /// Payload bits accumulated so far from the lead byte and the
    /// continuation bytes of the pending sequence.
    code_point: u32,
    /// Continuation bytes accepted so far for the pending sequence
    /// (0 in the neutral state or right after a lead byte).
    bytes_seen: usize,   // 0, 1, 2 or 3: counts continuations only
    /// Continuation bytes the pending lead byte requires
    /// (0 in the neutral state).
    bytes_needed: usize, // 0, 1, 2 or 3: counts continuations only
    /// Inclusive lower bound for the next continuation byte. Normally 0x80;
    /// raised after the lead bytes 0xE0 and 0xF0.
    lower_boundary: u8,
    /// Inclusive upper bound for the next continuation byte. Normally 0xBF;
    /// lowered after the lead bytes 0xED and 0xF4.
    upper_boundary: u8,
}
457 
impl Utf8Decoder {
    /// Creates a decoder in the neutral state: no pending sequence and the
    /// default continuation-byte range `0x80..=0xBF`.
    pub fn new_inner() -> Utf8Decoder {
        Utf8Decoder {
            code_point: 0,
            bytes_seen: 0,
            bytes_needed: 0,
            lower_boundary: 0x80u8,
            upper_boundary: 0xBFu8,
        }
    }

    /// Creates a neutral-state decoder wrapped in `VariantDecoder::Utf8`.
    pub fn new() -> VariantDecoder {
        VariantDecoder::Utf8(Utf8Decoder::new_inner())
    }

    /// Returns `true` when no multi-byte sequence is pending.
    pub fn in_neutral_state(&self) -> bool {
        self.bytes_needed == 0
    }

    /// Number of bytes of a pending sequence already held in the decoder
    /// state: the lead byte plus the continuation bytes seen so far, or 0 in
    /// the neutral state. Used to pad the buffer-length estimates below.
    fn extra_from_state(&self) -> usize {
        if self.bytes_needed == 0 {
            0
        } else {
            self.bytes_seen + 1
        }
    }

    /// Returns `byte_length + 1 + extra_from_state()`, or `None` if the
    /// addition overflows `usize`.
    pub fn max_utf16_buffer_length(&self, byte_length: usize) -> Option<usize> {
        byte_length.checked_add(1 + self.extra_from_state())
    }

    /// Returns `byte_length + 3 + extra_from_state()`, or `None` if the
    /// addition overflows `usize`.
    pub fn max_utf8_buffer_length_without_replacement(&self, byte_length: usize) -> Option<usize> {
        byte_length.checked_add(3 + self.extra_from_state())
    }

    /// Computes `3 + 3 * (byte_length + extra_from_state())` using the
    /// `checked_add`/`checked_mul` helpers defined elsewhere (presumably they
    /// propagate `None` on overflow; confirm against their definitions).
    pub fn max_utf8_buffer_length(&self, byte_length: usize) -> Option<usize> {
        checked_add(
            3,
            checked_mul(3, byte_length.checked_add(self.extra_from_state())),
        )
    }

    // The decode entry points are generated by `decoder_functions!`, which
    // is defined elsewhere. The four code blocks below are spliced into the
    // generated functions; the notes on when each block runs are inferred
    // from the identifiers they use and should be confirmed against the
    // macro definition.
    decoder_functions!(
        // Block 1: empty; this decoder needs no extra setup here.
        {},
        // Block 2: bulk fast path.
        {
            // This is the fast path. The rest runs only at the
            // start and end for partial sequences.
            if self.bytes_needed == 0 {
                dest.copy_utf8_up_to_invalid_from(&mut source);
            }
        },
        // Block 3: uses `src_consumed` and `dest`; presumably the
        // end-of-input handler. A sequence still pending at this point is
        // reported as malformed, covering the lead byte plus the
        // continuations seen so far.
        // NOTE(review): unlike the malformed-continuation path below, this
        // path does not reset `lower_boundary`/`upper_boundary`; confirm the
        // decoder is never fed more input after this result.
        {
            if self.bytes_needed != 0 {
                let bad_bytes = (self.bytes_seen + 1) as u8;
                self.code_point = 0;
                self.bytes_needed = 0;
                self.bytes_seen = 0;
                return (
                    DecoderResult::Malformed(bad_bytes, 0),
                    src_consumed,
                    dest.written(),
                );
            }
        },
        // Block 4: per-byte state machine; `b` is the current input byte.
        {
            if self.bytes_needed == 0 {
                // Neutral state: `b` must be ASCII or a lead byte.
                if b < 0x80u8 {
                    destination_handle.write_ascii(b);
                    continue;
                }
                // 0x80..=0xC1: a stray continuation byte, or a lead byte
                // (0xC0, 0xC1) that could only start an overlong encoding.
                if b < 0xC2u8 {
                    return (
                        DecoderResult::Malformed(1, 0),
                        unread_handle.consumed(),
                        destination_handle.written(),
                    );
                }
                // 0xC2..=0xDF: two-byte lead with 5 payload bits.
                if b < 0xE0u8 {
                    self.bytes_needed = 1;
                    self.code_point = u32::from(b) & 0x1F;
                    continue;
                }
                // 0xE0..=0xEF: three-byte lead with 4 payload bits.
                if b < 0xF0u8 {
                    if b == 0xE0u8 {
                        // Next byte must be >= 0xA0 (rejects overlong forms).
                        self.lower_boundary = 0xA0u8;
                    } else if b == 0xEDu8 {
                        // Next byte must be <= 0x9F (rejects surrogates).
                        self.upper_boundary = 0x9Fu8;
                    }
                    self.bytes_needed = 2;
                    self.code_point = u32::from(b) & 0xF;
                    continue;
                }
                // 0xF0..=0xF4: four-byte lead with 3 payload bits.
                if b < 0xF5u8 {
                    if b == 0xF0u8 {
                        // Next byte must be >= 0x90 (rejects overlong forms).
                        self.lower_boundary = 0x90u8;
                    } else if b == 0xF4u8 {
                        // Next byte must be <= 0x8F (rejects values above
                        // U+10FFFF).
                        self.upper_boundary = 0x8Fu8;
                    }
                    self.bytes_needed = 3;
                    self.code_point = u32::from(b) & 0x7;
                    continue;
                }
                // 0xF5..=0xFF can never appear in UTF-8.
                return (
                    DecoderResult::Malformed(1, 0),
                    unread_handle.consumed(),
                    destination_handle.written(),
                );
            }
            // self.bytes_needed != 0
            if !(b >= self.lower_boundary && b <= self.upper_boundary) {
                // `b` is not an acceptable continuation. Report the bytes of
                // the pending sequence as malformed and return to the neutral
                // state. `unread()` is used here instead of `consumed()`,
                // presumably so that `b` itself is processed again as the
                // start of a new sequence.
                let bad_bytes = (self.bytes_seen + 1) as u8;
                self.code_point = 0;
                self.bytes_needed = 0;
                self.bytes_seen = 0;
                self.lower_boundary = 0x80u8;
                self.upper_boundary = 0xBFu8;
                return (
                    DecoderResult::Malformed(bad_bytes, 0),
                    unread_handle.unread(),
                    destination_handle.written(),
                );
            }
            // The narrowed range applies only to the first continuation byte.
            self.lower_boundary = 0x80u8;
            self.upper_boundary = 0xBFu8;
            // Append the 6 payload bits of the continuation byte.
            self.code_point = (self.code_point << 6) | (u32::from(b) & 0x3F);
            self.bytes_seen += 1;
            if self.bytes_seen != self.bytes_needed {
                continue;
            }
            // Sequence complete: three continuations mean a four-byte
            // (astral) sequence; anything shorter is a non-ASCII BMP
            // character.
            if self.bytes_needed == 3 {
                destination_handle.write_astral(self.code_point);
            } else {
                destination_handle.write_bmp_excl_ascii(self.code_point as u16);
            }
            self.code_point = 0;
            self.bytes_needed = 0;
            self.bytes_seen = 0;
            continue;
        },
        // Identifiers under which the blocks above refer to the values
        // supplied by the macro.
        self,
        src_consumed,
        dest,
        source,
        b,
        destination_handle,
        unread_handle,
        check_space_astral
    );
}
607 
608 #[cfg_attr(feature = "cargo-clippy", allow(never_loop))]
609 #[inline(never)]
convert_utf16_to_utf8_partial_inner(src: &[u16], dst: &mut [u8]) -> (usize, usize)610 pub fn convert_utf16_to_utf8_partial_inner(src: &[u16], dst: &mut [u8]) -> (usize, usize) {
611     let mut read = 0;
612     let mut written = 0;
613     'outer: loop {
614         let mut unit = {
615             let src_remaining = &src[read..];
616             let dst_remaining = &mut dst[written..];
617             let length = if dst_remaining.len() < src_remaining.len() {
618                 dst_remaining.len()
619             } else {
620                 src_remaining.len()
621             };
622             match unsafe {
623                 basic_latin_to_ascii(src_remaining.as_ptr(), dst_remaining.as_mut_ptr(), length)
624             } {
625                 None => {
626                     read += length;
627                     written += length;
628                     return (read, written);
629                 }
630                 Some((non_ascii, consumed)) => {
631                     read += consumed;
632                     written += consumed;
633                     non_ascii
634                 }
635             }
636         };
637         'inner: loop {
638             // The following loop is only broken out of as a goto forward.
639             loop {
640                 // Unfortunately, this check isn't enough for the compiler to elide
641                 // the bound checks on writes to dst, which is why they are manually
642                 // elided, which makes a measurable difference.
643                 if written.checked_add(4).unwrap() > dst.len() {
644                     return (read, written);
645                 }
646                 read += 1;
647                 if unit < 0x800 {
648                     unsafe {
649                         *(dst.get_unchecked_mut(written)) = (unit >> 6) as u8 | 0xC0u8;
650                         written += 1;
651                         *(dst.get_unchecked_mut(written)) = (unit & 0x3F) as u8 | 0x80u8;
652                         written += 1;
653                     }
654                     break;
655                 }
656                 let unit_minus_surrogate_start = unit.wrapping_sub(0xD800);
657                 if unsafe { likely(unit_minus_surrogate_start > (0xDFFF - 0xD800)) } {
658                     unsafe {
659                         *(dst.get_unchecked_mut(written)) = (unit >> 12) as u8 | 0xE0u8;
660                         written += 1;
661                         *(dst.get_unchecked_mut(written)) = ((unit & 0xFC0) >> 6) as u8 | 0x80u8;
662                         written += 1;
663                         *(dst.get_unchecked_mut(written)) = (unit & 0x3F) as u8 | 0x80u8;
664                         written += 1;
665                     }
666                     break;
667                 }
668                 if unsafe { likely(unit_minus_surrogate_start <= (0xDBFF - 0xD800)) } {
669                     // high surrogate
670                     // read > src.len() is impossible, but using
671                     // >= instead of == allows the compiler to elide a bound check.
672                     if read >= src.len() {
673                         debug_assert_eq!(read, src.len());
674                         // Unpaired surrogate at the end of the buffer.
675                         unsafe {
676                             *(dst.get_unchecked_mut(written)) = 0xEFu8;
677                             written += 1;
678                             *(dst.get_unchecked_mut(written)) = 0xBFu8;
679                             written += 1;
680                             *(dst.get_unchecked_mut(written)) = 0xBDu8;
681                             written += 1;
682                         }
683                         return (read, written);
684                     }
685                     let second = src[read];
686                     let second_minus_low_surrogate_start = second.wrapping_sub(0xDC00);
687                     if unsafe { likely(second_minus_low_surrogate_start <= (0xDFFF - 0xDC00)) } {
688                         // The next code unit is a low surrogate. Advance position.
689                         read += 1;
690                         let astral = (u32::from(unit) << 10) + u32::from(second)
691                             - (((0xD800u32 << 10) - 0x10000u32) + 0xDC00u32);
692                         unsafe {
693                             *(dst.get_unchecked_mut(written)) = (astral >> 18) as u8 | 0xF0u8;
694                             written += 1;
695                             *(dst.get_unchecked_mut(written)) =
696                                 ((astral & 0x3F000u32) >> 12) as u8 | 0x80u8;
697                             written += 1;
698                             *(dst.get_unchecked_mut(written)) =
699                                 ((astral & 0xFC0u32) >> 6) as u8 | 0x80u8;
700                             written += 1;
701                             *(dst.get_unchecked_mut(written)) = (astral & 0x3F) as u8 | 0x80u8;
702                             written += 1;
703                         }
704                         break;
705                     }
706                     // The next code unit is not a low surrogate. Don't advance
707                     // position and treat the high surrogate as unpaired.
708                     // Fall through
709                 }
710                 // Unpaired low surrogate
711                 unsafe {
712                     *(dst.get_unchecked_mut(written)) = 0xEFu8;
713                     written += 1;
714                     *(dst.get_unchecked_mut(written)) = 0xBFu8;
715                     written += 1;
716                     *(dst.get_unchecked_mut(written)) = 0xBDu8;
717                     written += 1;
718                 }
719                 break;
720             }
721             // Now see if the next unit is Basic Latin
722             // read > src.len() is impossible, but using
723             // >= instead of == allows the compiler to elide a bound check.
724             if read >= src.len() {
725                 debug_assert_eq!(read, src.len());
726                 return (read, written);
727             }
728             unit = src[read];
729             if unsafe { unlikely(unit < 0x80) } {
730                 // written > dst.len() is impossible, but using
731                 // >= instead of == allows the compiler to elide a bound check.
732                 if written >= dst.len() {
733                     debug_assert_eq!(written, dst.len());
734                     return (read, written);
735                 }
736                 dst[written] = unit as u8;
737                 read += 1;
738                 written += 1;
739                 // Mysteriously, adding a punctuation check here makes
                // the expected beneficiary cases *slower*!
741                 continue 'outer;
742             }
743             continue 'inner;
744         }
745     }
746 }
747 
748 #[inline(never)]
convert_utf16_to_utf8_partial_tail(src: &[u16], dst: &mut [u8]) -> (usize, usize)749 pub fn convert_utf16_to_utf8_partial_tail(src: &[u16], dst: &mut [u8]) -> (usize, usize) {
750     // Everything below is cold code!
751     let mut read = 0;
752     let mut written = 0;
753     let mut unit = src[read];
754     // We now have up to 3 output slots, so an astral character
755     // will not fit.
756     if unit < 0x800 {
757         loop {
758             if unit < 0x80 {
759                 if written >= dst.len() {
760                     return (read, written);
761                 }
762                 read += 1;
763                 dst[written] = unit as u8;
764                 written += 1;
765             } else if unit < 0x800 {
766                 if written + 2 > dst.len() {
767                     return (read, written);
768                 }
769                 read += 1;
770                 dst[written] = (unit >> 6) as u8 | 0xC0u8;
771                 written += 1;
772                 dst[written] = (unit & 0x3F) as u8 | 0x80u8;
773                 written += 1;
774             } else {
775                 return (read, written);
776             }
777             // read > src.len() is impossible, but using
778             // >= instead of == allows the compiler to elide a bound check.
779             if read >= src.len() {
780                 debug_assert_eq!(read, src.len());
781                 return (read, written);
782             }
783             unit = src[read];
784         }
785     }
786     // Could be an unpaired surrogate, but we'll need 3 output
787     // slots in any case.
788     if written + 3 > dst.len() {
789         return (read, written);
790     }
791     read += 1;
792     let unit_minus_surrogate_start = unit.wrapping_sub(0xD800);
793     if unit_minus_surrogate_start <= (0xDFFF - 0xD800) {
794         // Got surrogate
795         if unit_minus_surrogate_start <= (0xDBFF - 0xD800) {
796             // Got high surrogate
797             if read >= src.len() {
798                 // Unpaired high surrogate
799                 unit = 0xFFFD;
800             } else {
801                 let second = src[read];
802                 if in_inclusive_range16(second, 0xDC00, 0xDFFF) {
803                     // Valid surrogate pair, but we know it won't fit.
804                     read -= 1;
805                     return (read, written);
806                 }
807                 // Unpaired high
808                 unit = 0xFFFD;
809             }
810         } else {
811             // Unpaired low
812             unit = 0xFFFD;
813         }
814     }
815     dst[written] = (unit >> 12) as u8 | 0xE0u8;
816     written += 1;
817     dst[written] = ((unit & 0xFC0) >> 6) as u8 | 0x80u8;
818     written += 1;
819     dst[written] = (unit & 0x3F) as u8 | 0x80u8;
820     written += 1;
821     debug_assert_eq!(written, dst.len());
822     (read, written)
823 }
824 
825 pub struct Utf8Encoder;
826 
827 impl Utf8Encoder {
new(encoding: &'static Encoding) -> Encoder828     pub fn new(encoding: &'static Encoding) -> Encoder {
829         Encoder::new(encoding, VariantEncoder::Utf8(Utf8Encoder))
830     }
831 
max_buffer_length_from_utf16_without_replacement( &self, u16_length: usize, ) -> Option<usize>832     pub fn max_buffer_length_from_utf16_without_replacement(
833         &self,
834         u16_length: usize,
835     ) -> Option<usize> {
836         u16_length.checked_mul(3)
837     }
838 
max_buffer_length_from_utf8_without_replacement( &self, byte_length: usize, ) -> Option<usize>839     pub fn max_buffer_length_from_utf8_without_replacement(
840         &self,
841         byte_length: usize,
842     ) -> Option<usize> {
843         Some(byte_length)
844     }
845 
encode_from_utf16_raw( &mut self, src: &[u16], dst: &mut [u8], _last: bool, ) -> (EncoderResult, usize, usize)846     pub fn encode_from_utf16_raw(
847         &mut self,
848         src: &[u16],
849         dst: &mut [u8],
850         _last: bool,
851     ) -> (EncoderResult, usize, usize) {
852         let (read, written) = convert_utf16_to_utf8_partial(src, dst);
853         (
854             if read == src.len() {
855                 EncoderResult::InputEmpty
856             } else {
857                 EncoderResult::OutputFull
858             },
859             read,
860             written,
861         )
862     }
863 
encode_from_utf8_raw( &mut self, src: &str, dst: &mut [u8], _last: bool, ) -> (EncoderResult, usize, usize)864     pub fn encode_from_utf8_raw(
865         &mut self,
866         src: &str,
867         dst: &mut [u8],
868         _last: bool,
869     ) -> (EncoderResult, usize, usize) {
870         let bytes = src.as_bytes();
871         let mut to_write = bytes.len();
872         if to_write <= dst.len() {
873             (&mut dst[..to_write]).copy_from_slice(bytes);
874             return (EncoderResult::InputEmpty, to_write, to_write);
875         }
876         to_write = dst.len();
877         // Move back until we find a UTF-8 sequence boundary.
878         while (bytes[to_write] & 0xC0) == 0x80 {
879             to_write -= 1;
880         }
881         (&mut dst[..to_write]).copy_from_slice(&bytes[..to_write]);
882         (EncoderResult::OutputFull, to_write, to_write)
883     }
884 }
885 
886 // Any copyright to the test code below this comment is dedicated to the
887 // Public Domain. http://creativecommons.org/publicdomain/zero/1.0/
888 
889 #[cfg(test)]
890 mod tests {
891     use super::super::testing::*;
892     use super::super::*;
893 
894     //    fn decode_utf8_to_utf16(bytes: &[u8], expect: &[u16]) {
895     //        decode_to_utf16_without_replacement(UTF_8, bytes, expect);
896     //    }
897 
decode_utf8_to_utf8(bytes: &[u8], expect: &str)898     fn decode_utf8_to_utf8(bytes: &[u8], expect: &str) {
899         decode_to_utf8(UTF_8, bytes, expect);
900     }
901 
decode_valid_utf8(string: &str)902     fn decode_valid_utf8(string: &str) {
903         decode_utf8_to_utf8(string.as_bytes(), string);
904     }
905 
encode_utf8_from_utf16(string: &[u16], expect: &[u8])906     fn encode_utf8_from_utf16(string: &[u16], expect: &[u8]) {
907         encode_from_utf16(UTF_8, string, expect);
908     }
909 
encode_utf8_from_utf8(string: &str, expect: &[u8])910     fn encode_utf8_from_utf8(string: &str, expect: &[u8]) {
911         encode_from_utf8(UTF_8, string, expect);
912     }
913 
encode_utf8_from_utf16_with_output_limit( string: &[u16], expect: &str, limit: usize, expect_result: EncoderResult, )914     fn encode_utf8_from_utf16_with_output_limit(
915         string: &[u16],
916         expect: &str,
917         limit: usize,
918         expect_result: EncoderResult,
919     ) {
920         let mut dst = Vec::new();
921         {
922             dst.resize(limit, 0u8);
923             let mut encoder = UTF_8.new_encoder();
924             let (result, read, written) =
925                 encoder.encode_from_utf16_without_replacement(string, &mut dst, false);
926             assert_eq!(result, expect_result);
927             if expect_result == EncoderResult::InputEmpty {
928                 assert_eq!(read, string.len());
929             }
930             assert_eq!(&dst[..written], expect.as_bytes());
931         }
932         {
933             dst.resize(64, 0u8);
934             for (i, elem) in dst.iter_mut().enumerate() {
935                 *elem = i as u8;
936             }
937             let mut encoder = UTF_8.new_encoder();
938             let (_, _, mut j) =
939                 encoder.encode_from_utf16_without_replacement(string, &mut dst, false);
940             while j < dst.len() {
941                 assert_eq!(usize::from(dst[j]), j);
942                 j += 1;
943             }
944         }
945     }
946 
    // Decoder conformance cases for UTF-8.
    //
    // Valid inputs are expected to round-trip unchanged. For malformed
    // inputs the expected string spells out exactly how many U+FFFD
    // replacement characters the decoder should produce. Most malformed
    // cases come in pairs: once with the sequence at the very end of the
    // input and once followed by an ASCII `Z`, to check that decoding
    // resumes correctly after the error.
    #[test]
    fn test_utf8_decode() {
        // Empty
        decode_valid_utf8("");
        // ASCII
        decode_valid_utf8("ab");
        // Low BMP
        decode_valid_utf8("a\u{E4}Z");
        // High BMP
        decode_valid_utf8("a\u{2603}Z");
        // Astral
        decode_valid_utf8("a\u{1F4A9}Z");
        // Low BMP with last byte missing
        decode_utf8_to_utf8(b"a\xC3Z", "a\u{FFFD}Z");
        decode_utf8_to_utf8(b"a\xC3", "a\u{FFFD}");
        // High BMP with last byte missing
        // (the truncated prefix as a whole yields a single U+FFFD)
        decode_utf8_to_utf8(b"a\xE2\x98Z", "a\u{FFFD}Z");
        decode_utf8_to_utf8(b"a\xE2\x98", "a\u{FFFD}");
        // Astral with last byte missing
        decode_utf8_to_utf8(b"a\xF0\x9F\x92Z", "a\u{FFFD}Z");
        decode_utf8_to_utf8(b"a\xF0\x9F\x92", "a\u{FFFD}");
        // Lone highest continuation
        decode_utf8_to_utf8(b"a\xBFZ", "a\u{FFFD}Z");
        decode_utf8_to_utf8(b"a\xBF", "a\u{FFFD}");
        // Two lone highest continuations
        decode_utf8_to_utf8(b"a\xBF\xBFZ", "a\u{FFFD}\u{FFFD}Z");
        decode_utf8_to_utf8(b"a\xBF\xBF", "a\u{FFFD}\u{FFFD}");
        // Low BMP followed by lowest lone continuation
        decode_utf8_to_utf8(b"a\xC3\xA4\x80Z", "a\u{E4}\u{FFFD}Z");
        decode_utf8_to_utf8(b"a\xC3\xA4\x80", "a\u{E4}\u{FFFD}");
        // Low BMP followed by highest lone continuation
        decode_utf8_to_utf8(b"a\xC3\xA4\xBFZ", "a\u{E4}\u{FFFD}Z");
        decode_utf8_to_utf8(b"a\xC3\xA4\xBF", "a\u{E4}\u{FFFD}");
        // High BMP followed by lowest lone continuation
        decode_utf8_to_utf8(b"a\xE2\x98\x83\x80Z", "a\u{2603}\u{FFFD}Z");
        decode_utf8_to_utf8(b"a\xE2\x98\x83\x80", "a\u{2603}\u{FFFD}");
        // High BMP followed by highest lone continuation
        decode_utf8_to_utf8(b"a\xE2\x98\x83\xBFZ", "a\u{2603}\u{FFFD}Z");
        decode_utf8_to_utf8(b"a\xE2\x98\x83\xBF", "a\u{2603}\u{FFFD}");
        // Astral followed by lowest lone continuation
        decode_utf8_to_utf8(b"a\xF0\x9F\x92\xA9\x80Z", "a\u{1F4A9}\u{FFFD}Z");
        decode_utf8_to_utf8(b"a\xF0\x9F\x92\xA9\x80", "a\u{1F4A9}\u{FFFD}");
        // Astral followed by highest lone continuation
        decode_utf8_to_utf8(b"a\xF0\x9F\x92\xA9\xBFZ", "a\u{1F4A9}\u{FFFD}Z");
        decode_utf8_to_utf8(b"a\xF0\x9F\x92\xA9\xBF", "a\u{1F4A9}\u{FFFD}");

        // Boundary conditions
        // (in the overlong cases below, every byte of the overlong
        // sequence is expected to yield its own U+FFFD)
        // Lowest single-byte
        decode_valid_utf8("Z\x00");
        decode_valid_utf8("Z\x00Z");
        // Lowest single-byte as two-byte overlong sequence
        decode_utf8_to_utf8(b"a\xC0\x80", "a\u{FFFD}\u{FFFD}");
        decode_utf8_to_utf8(b"a\xC0\x80Z", "a\u{FFFD}\u{FFFD}Z");
        // Lowest single-byte as three-byte overlong sequence
        decode_utf8_to_utf8(b"a\xE0\x80\x80", "a\u{FFFD}\u{FFFD}\u{FFFD}");
        decode_utf8_to_utf8(b"a\xE0\x80\x80Z", "a\u{FFFD}\u{FFFD}\u{FFFD}Z");
        // Lowest single-byte as four-byte overlong sequence
        decode_utf8_to_utf8(b"a\xF0\x80\x80\x80", "a\u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}");
        decode_utf8_to_utf8(b"a\xF0\x80\x80\x80Z", "a\u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}Z");
        // One below lowest single-byte (0x00 - 1 wraps to 0xFF, which is
        // never a valid UTF-8 byte)
        decode_utf8_to_utf8(b"a\xFF", "a\u{FFFD}");
        decode_utf8_to_utf8(b"a\xFFZ", "a\u{FFFD}Z");
        // Highest single-byte
        decode_valid_utf8("a\x7F");
        decode_valid_utf8("a\x7FZ");
        // Highest single-byte as two-byte overlong sequence
        decode_utf8_to_utf8(b"a\xC1\xBF", "a\u{FFFD}\u{FFFD}");
        decode_utf8_to_utf8(b"a\xC1\xBFZ", "a\u{FFFD}\u{FFFD}Z");
        // Highest single-byte as three-byte overlong sequence
        decode_utf8_to_utf8(b"a\xE0\x81\xBF", "a\u{FFFD}\u{FFFD}\u{FFFD}");
        decode_utf8_to_utf8(b"a\xE0\x81\xBFZ", "a\u{FFFD}\u{FFFD}\u{FFFD}Z");
        // Highest single-byte as four-byte overlong sequence
        decode_utf8_to_utf8(b"a\xF0\x80\x81\xBF", "a\u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}");
        decode_utf8_to_utf8(b"a\xF0\x80\x81\xBFZ", "a\u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}Z");
        // One past highest single byte (also lone continuation)
        decode_utf8_to_utf8(b"a\x80Z", "a\u{FFFD}Z");
        decode_utf8_to_utf8(b"a\x80", "a\u{FFFD}");
        // Two lone continuations
        decode_utf8_to_utf8(b"a\x80\x80Z", "a\u{FFFD}\u{FFFD}Z");
        decode_utf8_to_utf8(b"a\x80\x80", "a\u{FFFD}\u{FFFD}");
        // Three lone continuations
        decode_utf8_to_utf8(b"a\x80\x80\x80Z", "a\u{FFFD}\u{FFFD}\u{FFFD}Z");
        decode_utf8_to_utf8(b"a\x80\x80\x80", "a\u{FFFD}\u{FFFD}\u{FFFD}");
        // Four lone continuations
        decode_utf8_to_utf8(b"a\x80\x80\x80\x80Z", "a\u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}Z");
        decode_utf8_to_utf8(b"a\x80\x80\x80\x80", "a\u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}");
        // Lowest two-byte
        decode_utf8_to_utf8(b"a\xC2\x80", "a\u{0080}");
        decode_utf8_to_utf8(b"a\xC2\x80Z", "a\u{0080}Z");
        // Lowest two-byte as three-byte overlong sequence
        decode_utf8_to_utf8(b"a\xE0\x82\x80", "a\u{FFFD}\u{FFFD}\u{FFFD}");
        decode_utf8_to_utf8(b"a\xE0\x82\x80Z", "a\u{FFFD}\u{FFFD}\u{FFFD}Z");
        // Lowest two-byte as four-byte overlong sequence
        decode_utf8_to_utf8(b"a\xF0\x80\x82\x80", "a\u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}");
        decode_utf8_to_utf8(b"a\xF0\x80\x82\x80Z", "a\u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}Z");
        // Lead one below lowest two-byte
        decode_utf8_to_utf8(b"a\xC1\x80", "a\u{FFFD}\u{FFFD}");
        decode_utf8_to_utf8(b"a\xC1\x80Z", "a\u{FFFD}\u{FFFD}Z");
        // Trail one below lowest two-byte
        // (0x7F is ASCII: only the lead is replaced, the trail decodes as itself)
        decode_utf8_to_utf8(b"a\xC2\x7F", "a\u{FFFD}\u{007F}");
        decode_utf8_to_utf8(b"a\xC2\x7FZ", "a\u{FFFD}\u{007F}Z");
        // Highest two-byte
        decode_utf8_to_utf8(b"a\xDF\xBF", "a\u{07FF}");
        decode_utf8_to_utf8(b"a\xDF\xBFZ", "a\u{07FF}Z");
        // Highest two-byte as three-byte overlong sequence
        decode_utf8_to_utf8(b"a\xE0\x9F\xBF", "a\u{FFFD}\u{FFFD}\u{FFFD}");
        decode_utf8_to_utf8(b"a\xE0\x9F\xBFZ", "a\u{FFFD}\u{FFFD}\u{FFFD}Z");
        // Highest two-byte as four-byte overlong sequence
        decode_utf8_to_utf8(b"a\xF0\x80\x9F\xBF", "a\u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}");
        decode_utf8_to_utf8(b"a\xF0\x80\x9F\xBFZ", "a\u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}Z");
        // Lowest three-byte
        decode_utf8_to_utf8(b"a\xE0\xA0\x80", "a\u{0800}");
        decode_utf8_to_utf8(b"a\xE0\xA0\x80Z", "a\u{0800}Z");
        // Lowest three-byte as four-byte overlong sequence
        decode_utf8_to_utf8(b"a\xF0\x80\xA0\x80", "a\u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}");
        decode_utf8_to_utf8(b"a\xF0\x80\xA0\x80Z", "a\u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}Z");
        // Highest below surrogates
        decode_utf8_to_utf8(b"a\xED\x9F\xBF", "a\u{D7FF}");
        decode_utf8_to_utf8(b"a\xED\x9F\xBFZ", "a\u{D7FF}Z");
        // Highest below surrogates as four-byte overlong sequence
        decode_utf8_to_utf8(b"a\xF0\x8D\x9F\xBF", "a\u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}");
        decode_utf8_to_utf8(b"a\xF0\x8D\x9F\xBFZ", "a\u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}Z");
        // First surrogate
        // (surrogates encoded as three bytes are rejected byte by byte)
        decode_utf8_to_utf8(b"a\xED\xA0\x80", "a\u{FFFD}\u{FFFD}\u{FFFD}");
        decode_utf8_to_utf8(b"a\xED\xA0\x80Z", "a\u{FFFD}\u{FFFD}\u{FFFD}Z");
        // First surrogate as four-byte overlong sequence
        decode_utf8_to_utf8(b"a\xF0\x8D\xA0\x80", "a\u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}");
        decode_utf8_to_utf8(b"a\xF0\x8D\xA0\x80Z", "a\u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}Z");
        // Last surrogate
        decode_utf8_to_utf8(b"a\xED\xBF\xBF", "a\u{FFFD}\u{FFFD}\u{FFFD}");
        decode_utf8_to_utf8(b"a\xED\xBF\xBFZ", "a\u{FFFD}\u{FFFD}\u{FFFD}Z");
        // Last surrogate as four-byte overlong sequence
        decode_utf8_to_utf8(b"a\xF0\x8D\xBF\xBF", "a\u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}");
        decode_utf8_to_utf8(b"a\xF0\x8D\xBF\xBFZ", "a\u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}Z");
        // Lowest above surrogates
        decode_utf8_to_utf8(b"a\xEE\x80\x80", "a\u{E000}");
        decode_utf8_to_utf8(b"a\xEE\x80\x80Z", "a\u{E000}Z");
        // Lowest above surrogates as four-byte overlong sequence
        decode_utf8_to_utf8(b"a\xF0\x8E\x80\x80", "a\u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}");
        decode_utf8_to_utf8(b"a\xF0\x8E\x80\x80Z", "a\u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}Z");
        // Highest three-byte
        decode_utf8_to_utf8(b"a\xEF\xBF\xBF", "a\u{FFFF}");
        decode_utf8_to_utf8(b"a\xEF\xBF\xBFZ", "a\u{FFFF}Z");
        // Highest three-byte as four-byte overlong sequence
        decode_utf8_to_utf8(b"a\xF0\x8F\xBF\xBF", "a\u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}");
        decode_utf8_to_utf8(b"a\xF0\x8F\xBF\xBFZ", "a\u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}Z");
        // Lowest four-byte
        decode_utf8_to_utf8(b"a\xF0\x90\x80\x80", "a\u{10000}");
        decode_utf8_to_utf8(b"a\xF0\x90\x80\x80Z", "a\u{10000}Z");
        // Highest four-byte
        decode_utf8_to_utf8(b"a\xF4\x8F\xBF\xBF", "a\u{10FFFF}");
        decode_utf8_to_utf8(b"a\xF4\x8F\xBF\xBFZ", "a\u{10FFFF}Z");
        // One past highest four-byte
        decode_utf8_to_utf8(b"a\xF4\x90\x80\x80", "a\u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}");
        decode_utf8_to_utf8(b"a\xF4\x90\x80\x80Z", "a\u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}Z");

        // Highest four-byte with last byte replaced with 0xFF
        // (one U+FFFD for the truncated three-byte prefix, one for the 0xFF)
        decode_utf8_to_utf8(b"a\xF4\x8F\xBF\xFF", "a\u{FFFD}\u{FFFD}");
        decode_utf8_to_utf8(b"a\xF4\x8F\xBF\xFFZ", "a\u{FFFD}\u{FFFD}Z");
    }
1107 
    // Encoder cases for UTF-8: empty input from both UTF-16 and UTF-8
    // sources, then UTF-16 inputs at the edges of each UTF-8 sequence
    // length and around the surrogate range.
    #[test]
    fn test_utf8_encode() {
        // Empty
        encode_utf8_from_utf16(&[], b"");
        encode_utf8_from_utf8("", b"");

        // Lowest and highest one-byte
        encode_utf8_from_utf16(&[0x0000], "\u{0000}".as_bytes());
        encode_utf8_from_utf16(&[0x007F], "\u{007F}".as_bytes());
        // Lowest and highest two-byte
        encode_utf8_from_utf16(&[0x0080], "\u{0080}".as_bytes());
        encode_utf8_from_utf16(&[0x07FF], "\u{07FF}".as_bytes());
        // Lowest three-byte and highest below surrogates
        encode_utf8_from_utf16(&[0x0800], "\u{0800}".as_bytes());
        encode_utf8_from_utf16(&[0xD7FF], "\u{D7FF}".as_bytes());
        // Unpaired high surrogate, at end of input and followed by ASCII
        encode_utf8_from_utf16(&[0xD800], "\u{FFFD}".as_bytes());
        encode_utf8_from_utf16(&[0xD800, 0x0062], "\u{FFFD}\u{0062}".as_bytes());
        // Unpaired low surrogate, at end of input and followed by ASCII
        encode_utf8_from_utf16(&[0xDFFF], "\u{FFFD}".as_bytes());
        encode_utf8_from_utf16(&[0xDFFF, 0x0062], "\u{FFFD}\u{0062}".as_bytes());
        // Lowest above surrogates and highest BMP
        encode_utf8_from_utf16(&[0xE000], "\u{E000}".as_bytes());
        encode_utf8_from_utf16(&[0xFFFF], "\u{FFFF}".as_bytes());
        // Lowest and highest surrogate pairs
        encode_utf8_from_utf16(&[0xD800, 0xDC00], "\u{10000}".as_bytes());
        encode_utf8_from_utf16(&[0xDBFF, 0xDFFF], "\u{10FFFF}".as_bytes());
        // Two low surrogates in a row: each is replaced separately
        encode_utf8_from_utf16(&[0xDC00, 0xDEDE], "\u{FFFD}\u{FFFD}".as_bytes());
    }
1130 
1131     #[test]
test_encode_utf8_from_utf16_with_output_limit()1132     fn test_encode_utf8_from_utf16_with_output_limit() {
1133         encode_utf8_from_utf16_with_output_limit(&[0x0062], "\u{62}", 1, EncoderResult::InputEmpty);
1134         encode_utf8_from_utf16_with_output_limit(&[0x00A7], "\u{A7}", 2, EncoderResult::InputEmpty);
1135         encode_utf8_from_utf16_with_output_limit(
1136             &[0x2603],
1137             "\u{2603}",
1138             3,
1139             EncoderResult::InputEmpty,
1140         );
1141         encode_utf8_from_utf16_with_output_limit(
1142             &[0xD83D, 0xDCA9],
1143             "\u{1F4A9}",
1144             4,
1145             EncoderResult::InputEmpty,
1146         );
1147 
1148         encode_utf8_from_utf16_with_output_limit(&[0x00A7], "", 1, EncoderResult::OutputFull);
1149         encode_utf8_from_utf16_with_output_limit(&[0x2603], "", 2, EncoderResult::OutputFull);
1150         encode_utf8_from_utf16_with_output_limit(
1151             &[0xD83D, 0xDCA9],
1152             "",
1153             3,
1154             EncoderResult::OutputFull,
1155         );
1156 
1157         encode_utf8_from_utf16_with_output_limit(
1158             &[0x0063, 0x0062],
1159             "\u{63}\u{62}",
1160             2,
1161             EncoderResult::InputEmpty,
1162         );
1163         encode_utf8_from_utf16_with_output_limit(
1164             &[0x0063, 0x00A7],
1165             "\u{63}\u{A7}",
1166             3,
1167             EncoderResult::InputEmpty,
1168         );
1169         encode_utf8_from_utf16_with_output_limit(
1170             &[0x0063, 0x2603],
1171             "\u{63}\u{2603}",
1172             4,
1173             EncoderResult::InputEmpty,
1174         );
1175         encode_utf8_from_utf16_with_output_limit(
1176             &[0x0063, 0xD83D, 0xDCA9],
1177             "\u{63}\u{1F4A9}",
1178             5,
1179             EncoderResult::InputEmpty,
1180         );
1181 
1182         encode_utf8_from_utf16_with_output_limit(
1183             &[0x0063, 0x00A7],
1184             "\u{63}",
1185             2,
1186             EncoderResult::OutputFull,
1187         );
1188         encode_utf8_from_utf16_with_output_limit(
1189             &[0x0063, 0x2603],
1190             "\u{63}",
1191             3,
1192             EncoderResult::OutputFull,
1193         );
1194         encode_utf8_from_utf16_with_output_limit(
1195             &[0x0063, 0xD83D, 0xDCA9],
1196             "\u{63}",
1197             4,
1198             EncoderResult::OutputFull,
1199         );
1200 
1201         encode_utf8_from_utf16_with_output_limit(
1202             &[0x00B6, 0x0062],
1203             "\u{B6}\u{62}",
1204             3,
1205             EncoderResult::InputEmpty,
1206         );
1207         encode_utf8_from_utf16_with_output_limit(
1208             &[0x00B6, 0x00A7],
1209             "\u{B6}\u{A7}",
1210             4,
1211             EncoderResult::InputEmpty,
1212         );
1213         encode_utf8_from_utf16_with_output_limit(
1214             &[0x00B6, 0x2603],
1215             "\u{B6}\u{2603}",
1216             5,
1217             EncoderResult::InputEmpty,
1218         );
1219         encode_utf8_from_utf16_with_output_limit(
1220             &[0x00B6, 0xD83D, 0xDCA9],
1221             "\u{B6}\u{1F4A9}",
1222             6,
1223             EncoderResult::InputEmpty,
1224         );
1225 
1226         encode_utf8_from_utf16_with_output_limit(
1227             &[0x00B6, 0x00A7],
1228             "\u{B6}",
1229             3,
1230             EncoderResult::OutputFull,
1231         );
1232         encode_utf8_from_utf16_with_output_limit(
1233             &[0x00B6, 0x2603],
1234             "\u{B6}",
1235             4,
1236             EncoderResult::OutputFull,
1237         );
1238         encode_utf8_from_utf16_with_output_limit(
1239             &[0x00B6, 0xD83D, 0xDCA9],
1240             "\u{B6}",
1241             5,
1242             EncoderResult::OutputFull,
1243         );
1244 
1245         encode_utf8_from_utf16_with_output_limit(
1246             &[0x263A, 0x0062],
1247             "\u{263A}\u{62}",
1248             4,
1249             EncoderResult::InputEmpty,
1250         );
1251         encode_utf8_from_utf16_with_output_limit(
1252             &[0x263A, 0x00A7],
1253             "\u{263A}\u{A7}",
1254             5,
1255             EncoderResult::InputEmpty,
1256         );
1257         encode_utf8_from_utf16_with_output_limit(
1258             &[0x263A, 0x2603],
1259             "\u{263A}\u{2603}",
1260             6,
1261             EncoderResult::InputEmpty,
1262         );
1263         encode_utf8_from_utf16_with_output_limit(
1264             &[0x263A, 0xD83D, 0xDCA9],
1265             "\u{263A}\u{1F4A9}",
1266             7,
1267             EncoderResult::InputEmpty,
1268         );
1269 
1270         encode_utf8_from_utf16_with_output_limit(
1271             &[0x263A, 0x00A7],
1272             "\u{263A}",
1273             4,
1274             EncoderResult::OutputFull,
1275         );
1276         encode_utf8_from_utf16_with_output_limit(
1277             &[0x263A, 0x2603],
1278             "\u{263A}",
1279             5,
1280             EncoderResult::OutputFull,
1281         );
1282         encode_utf8_from_utf16_with_output_limit(
1283             &[0x263A, 0xD83D, 0xDCA9],
1284             "\u{263A}",
1285             6,
1286             EncoderResult::OutputFull,
1287         );
1288 
1289         encode_utf8_from_utf16_with_output_limit(
1290             &[0xD83D, 0xDE0E, 0x0062],
1291             "\u{1F60E}\u{62}",
1292             5,
1293             EncoderResult::InputEmpty,
1294         );
1295         encode_utf8_from_utf16_with_output_limit(
1296             &[0xD83D, 0xDE0E, 0x00A7],
1297             "\u{1F60E}\u{A7}",
1298             6,
1299             EncoderResult::InputEmpty,
1300         );
1301         encode_utf8_from_utf16_with_output_limit(
1302             &[0xD83D, 0xDE0E, 0x2603],
1303             "\u{1F60E}\u{2603}",
1304             7,
1305             EncoderResult::InputEmpty,
1306         );
1307         encode_utf8_from_utf16_with_output_limit(
1308             &[0xD83D, 0xDE0E, 0xD83D, 0xDCA9],
1309             "\u{1F60E}\u{1F4A9}",
1310             8,
1311             EncoderResult::InputEmpty,
1312         );
1313 
1314         encode_utf8_from_utf16_with_output_limit(
1315             &[0xD83D, 0xDE0E, 0x00A7],
1316             "\u{1F60E}",
1317             5,
1318             EncoderResult::OutputFull,
1319         );
1320         encode_utf8_from_utf16_with_output_limit(
1321             &[0xD83D, 0xDE0E, 0x2603],
1322             "\u{1F60E}",
1323             6,
1324             EncoderResult::OutputFull,
1325         );
1326         encode_utf8_from_utf16_with_output_limit(
1327             &[0xD83D, 0xDE0E, 0xD83D, 0xDCA9],
1328             "\u{1F60E}",
1329             7,
1330             EncoderResult::OutputFull,
1331         );
1332 
1333         encode_utf8_from_utf16_with_output_limit(
1334             &[0x0063, 0x00B6, 0x0062, 0x0062],
1335             "\u{63}\u{B6}\u{62}\u{62}",
1336             5,
1337             EncoderResult::InputEmpty,
1338         );
1339         encode_utf8_from_utf16_with_output_limit(
1340             &[0x0063, 0x00B6, 0x0062, 0x0062],
1341             "\u{63}\u{B6}\u{62}",
1342             4,
1343             EncoderResult::OutputFull,
1344         );
1345 
1346         encode_utf8_from_utf16_with_output_limit(
1347             &[0x0063, 0x00B6, 0x0062, 0x0062, 0x0062],
1348             "\u{63}\u{B6}\u{62}\u{62}\u{62}",
1349             6,
1350             EncoderResult::InputEmpty,
1351         );
1352         encode_utf8_from_utf16_with_output_limit(
1353             &[0x0063, 0x00B6, 0x0062, 0x0062, 0x0062],
1354             "\u{63}\u{B6}\u{62}\u{62}",
1355             5,
1356             EncoderResult::OutputFull,
1357         );
1358 
1359         encode_utf8_from_utf16_with_output_limit(
1360             &[0x263A, 0x0062, 0x0062],
1361             "\u{263A}\u{62}\u{62}",
1362             5,
1363             EncoderResult::InputEmpty,
1364         );
1365         encode_utf8_from_utf16_with_output_limit(
1366             &[0x263A, 0x0062, 0x0062],
1367             "\u{263A}\u{62}",
1368             4,
1369             EncoderResult::OutputFull,
1370         );
1371 
1372         encode_utf8_from_utf16_with_output_limit(
1373             &[0x263A, 0x0062, 0x0062, 0x0062],
1374             "\u{263A}\u{62}\u{62}\u{62}",
1375             6,
1376             EncoderResult::InputEmpty,
1377         );
1378         encode_utf8_from_utf16_with_output_limit(
1379             &[0x263A, 0x0062, 0x0062, 0x0062],
1380             "\u{263A}\u{62}\u{62}",
1381             5,
1382             EncoderResult::OutputFull,
1383         );
1384 
1385         encode_utf8_from_utf16_with_output_limit(
1386             &[0x0063, 0x00B6, 0x00A7],
1387             "\u{63}\u{B6}\u{A7}",
1388             5,
1389             EncoderResult::InputEmpty,
1390         );
1391         encode_utf8_from_utf16_with_output_limit(
1392             &[0x0063, 0x00B6, 0x00A7],
1393             "\u{63}\u{B6}",
1394             4,
1395             EncoderResult::OutputFull,
1396         );
1397 
1398         encode_utf8_from_utf16_with_output_limit(
1399             &[0x0063, 0x00B6, 0x00A7, 0x0062],
1400             "\u{63}\u{B6}\u{A7}\u{62}",
1401             6,
1402             EncoderResult::InputEmpty,
1403         );
1404         encode_utf8_from_utf16_with_output_limit(
1405             &[0x0063, 0x00B6, 0x00A7, 0x0062],
1406             "\u{63}\u{B6}\u{A7}",
1407             5,
1408             EncoderResult::OutputFull,
1409         );
1410 
1411         encode_utf8_from_utf16_with_output_limit(
1412             &[0x263A, 0x00A7, 0x0062],
1413             "\u{263A}\u{A7}\u{62}",
1414             6,
1415             EncoderResult::InputEmpty,
1416         );
1417         encode_utf8_from_utf16_with_output_limit(
1418             &[0x263A, 0x00A7, 0x0062],
1419             "\u{263A}\u{A7}",
1420             5,
1421             EncoderResult::OutputFull,
1422         );
1423 
1424         encode_utf8_from_utf16_with_output_limit(
1425             &[0x0063, 0x00B6, 0x0062, 0x00A7],
1426             "\u{63}\u{B6}\u{62}\u{A7}",
1427             6,
1428             EncoderResult::InputEmpty,
1429         );
1430         encode_utf8_from_utf16_with_output_limit(
1431             &[0x0063, 0x00B6, 0x0062, 0x00A7],
1432             "\u{63}\u{B6}\u{62}",
1433             5,
1434             EncoderResult::OutputFull,
1435         );
1436 
1437         encode_utf8_from_utf16_with_output_limit(
1438             &[0x263A, 0x0062, 0x00A7],
1439             "\u{263A}\u{62}\u{A7}",
1440             6,
1441             EncoderResult::InputEmpty,
1442         );
1443         encode_utf8_from_utf16_with_output_limit(
1444             &[0x263A, 0x0062, 0x00A7],
1445             "\u{263A}\u{62}",
1446             5,
1447             EncoderResult::OutputFull,
1448         );
1449 
1450         encode_utf8_from_utf16_with_output_limit(
1451             &[0x0063, 0x00B6, 0x2603],
1452             "\u{63}\u{B6}\u{2603}",
1453             6,
1454             EncoderResult::InputEmpty,
1455         );
1456         encode_utf8_from_utf16_with_output_limit(
1457             &[0x0063, 0x00B6, 0x2603],
1458             "\u{63}\u{B6}",
1459             5,
1460             EncoderResult::OutputFull,
1461         );
1462 
1463         encode_utf8_from_utf16_with_output_limit(
1464             &[0x263A, 0x2603],
1465             "\u{263A}\u{2603}",
1466             6,
1467             EncoderResult::InputEmpty,
1468         );
1469         encode_utf8_from_utf16_with_output_limit(
1470             &[0x263A, 0x2603],
1471             "\u{263A}",
1472             5,
1473             EncoderResult::OutputFull,
1474         );
1475 
1476         encode_utf8_from_utf16_with_output_limit(
1477             &[0x0063, 0x00B6, 0xD83D],
1478             "\u{63}\u{B6}\u{FFFD}",
1479             6,
1480             EncoderResult::InputEmpty,
1481         );
1482         encode_utf8_from_utf16_with_output_limit(
1483             &[0x0063, 0x00B6, 0xD83D],
1484             "\u{63}\u{B6}",
1485             5,
1486             EncoderResult::OutputFull,
1487         );
1488 
1489         encode_utf8_from_utf16_with_output_limit(
1490             &[0x263A, 0xD83D],
1491             "\u{263A}\u{FFFD}",
1492             6,
1493             EncoderResult::InputEmpty,
1494         );
1495         encode_utf8_from_utf16_with_output_limit(
1496             &[0x263A, 0xD83D],
1497             "\u{263A}",
1498             5,
1499             EncoderResult::OutputFull,
1500         );
1501 
1502         encode_utf8_from_utf16_with_output_limit(
1503             &[0x0063, 0x00B6, 0xDCA9],
1504             "\u{63}\u{B6}\u{FFFD}",
1505             6,
1506             EncoderResult::InputEmpty,
1507         );
1508         encode_utf8_from_utf16_with_output_limit(
1509             &[0x0063, 0x00B6, 0xDCA9],
1510             "\u{63}\u{B6}",
1511             5,
1512             EncoderResult::OutputFull,
1513         );
1514 
1515         encode_utf8_from_utf16_with_output_limit(
1516             &[0x263A, 0xDCA9],
1517             "\u{263A}\u{FFFD}",
1518             6,
1519             EncoderResult::InputEmpty,
1520         );
1521         encode_utf8_from_utf16_with_output_limit(
1522             &[0x263A, 0xDCA9],
1523             "\u{263A}",
1524             5,
1525             EncoderResult::OutputFull,
1526         );
1527     }
1528 
1529     #[test]
test_utf8_max_length_from_utf16()1530     fn test_utf8_max_length_from_utf16() {
1531         let mut encoder = UTF_8.new_encoder();
1532         let mut output = [0u8; 13];
1533         let input = &[0x2C9Fu16, 0x2CA9u16, 0x2CA3u16, 0x2C9Fu16];
1534         let needed = encoder
1535             .max_buffer_length_from_utf16_without_replacement(input.len())
1536             .unwrap();
1537         let (result, _, _) =
1538             encoder.encode_from_utf16_without_replacement(input, &mut output[..needed], true);
1539         assert_eq!(result, EncoderResult::InputEmpty);
1540     }
1541 
1542     #[test]
test_decode_bom_prefixed_split_byte_triple()1543     fn test_decode_bom_prefixed_split_byte_triple() {
1544         let mut output = [0u16; 20];
1545         let mut decoder = UTF_8.new_decoder();
1546         {
1547             let needed = decoder.max_utf16_buffer_length(1).unwrap();
1548             let (result, read, written, had_errors) =
1549                 decoder.decode_to_utf16(b"\xEF", &mut output[..needed], false);
1550             assert_eq!(result, CoderResult::InputEmpty);
1551             assert_eq!(read, 1);
1552             assert_eq!(written, 0);
1553             assert!(!had_errors);
1554         }
1555         {
1556             let needed = decoder.max_utf16_buffer_length(1).unwrap();
1557             let (result, read, written, had_errors) =
1558                 decoder.decode_to_utf16(b"\xBF", &mut output[..needed], false);
1559             assert_eq!(result, CoderResult::InputEmpty);
1560             assert_eq!(read, 1);
1561             assert_eq!(written, 0);
1562             assert!(!had_errors);
1563         }
1564         {
1565             let needed = decoder.max_utf16_buffer_length(1).unwrap();
1566             let (result, read, written, had_errors) =
1567                 decoder.decode_to_utf16(b"\xBE", &mut output[..needed], true);
1568             assert_eq!(result, CoderResult::InputEmpty);
1569             assert_eq!(read, 1);
1570             assert_eq!(written, 1);
1571             assert!(!had_errors);
1572             assert_eq!(output[0], 0xFFFE);
1573         }
1574     }
1575 
1576     #[test]
test_decode_bom_prefixed_split_byte_pair()1577     fn test_decode_bom_prefixed_split_byte_pair() {
1578         let mut output = [0u16; 20];
1579         let mut decoder = UTF_8.new_decoder();
1580         {
1581             let needed = decoder.max_utf16_buffer_length(1).unwrap();
1582             let (result, read, written, had_errors) =
1583                 decoder.decode_to_utf16(b"\xEF", &mut output[..needed], false);
1584             assert_eq!(result, CoderResult::InputEmpty);
1585             assert_eq!(read, 1);
1586             assert_eq!(written, 0);
1587             assert!(!had_errors);
1588         }
1589         {
1590             let needed = decoder.max_utf16_buffer_length(1).unwrap();
1591             let (result, read, written, had_errors) =
1592                 decoder.decode_to_utf16(b"\xBC", &mut output[..needed], true);
1593             assert_eq!(result, CoderResult::InputEmpty);
1594             assert_eq!(read, 1);
1595             assert_eq!(written, 1);
1596             assert!(had_errors);
1597             assert_eq!(output[0], 0xFFFD);
1598         }
1599     }
1600 
1601     #[test]
test_decode_bom_prefix()1602     fn test_decode_bom_prefix() {
1603         let mut output = [0u16; 20];
1604         let mut decoder = UTF_8.new_decoder();
1605         {
1606             let needed = decoder.max_utf16_buffer_length(1).unwrap();
1607             let (result, read, written, had_errors) =
1608                 decoder.decode_to_utf16(b"\xEF", &mut output[..needed], true);
1609             assert_eq!(result, CoderResult::InputEmpty);
1610             assert_eq!(read, 1);
1611             assert_eq!(written, 1);
1612             assert!(had_errors);
1613             assert_eq!(output[0], 0xFFFD);
1614         }
1615     }
1616 
1617     #[test]
test_tail()1618     fn test_tail() {
1619         let mut output = [0u16; 1];
1620         let mut decoder = UTF_8.new_decoder_without_bom_handling();
1621         {
1622             let (result, read, written, had_errors) =
1623                 decoder.decode_to_utf16("\u{E4}a".as_bytes(), &mut output[..], false);
1624             assert_eq!(result, CoderResult::OutputFull);
1625             assert_eq!(read, 2);
1626             assert_eq!(written, 1);
1627             assert!(!had_errors);
1628             assert_eq!(output[0], 0x00E4);
1629         }
1630     }
1631 }
1632