1 // Copyright 2012-2014 The Rust Project Developers and Eric Kidd.  See the
2 // COPYRIGHT-RUST.txt file at the top-level directory of this distribution.
3 //
4 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
5 // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
6 // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
7 // option. This file may not be copied, modified, or distributed except
8 // according to those terms.
9 
10 
11 //! A simple library implementing the [CESU-8 compatibility encoding
12 //! scheme](http://www.unicode.org/reports/tr26/tr26-2.html).  This is a
13 //! non-standard variant of UTF-8 that is used internally by some systems
14 //! that need to represent UTF-16 data as 8-bit characters.  Yes, this is
15 //! ugly.
16 //!
17 //! Use of this encoding is discouraged by the Unicode Consortium.  It's OK
18 //! for working with existing internal APIs, but it should not be used for
19 //! transmitting or storing data.
20 //!
21 //! ```
22 //! use std::borrow::Cow;
23 //! use cesu8::{from_cesu8, to_cesu8};
24 //!
25 //! // 16-bit Unicode characters are the same in UTF-8 and CESU-8.
26 //! assert_eq!(Cow::Borrowed("aé日".as_bytes()),
27 //!            to_cesu8("aé日"));
28 //! assert_eq!(Cow::Borrowed("aé日"),
29 //!            from_cesu8("aé日".as_bytes()).unwrap());
30 //!
31 //! // This string is CESU-8 data containing a 6-byte surrogate pair,
32 //! // which decodes to a 4-byte UTF-8 string.
33 //! let data = &[0xED, 0xA0, 0x81, 0xED, 0xB0, 0x81];
34 //! assert_eq!(Cow::Borrowed("\u{10401}"),
35 //!            from_cesu8(data).unwrap());
36 //! ```
37 //!
38 //! ### A note about security
39 //!
40 //! As a general rule, this library is intended to fail on malformed or
41 //! unexpected input.  CESU-8 is supposed to be an internal-only format,
42 //! and if we're seeing malformed data, we assume that it's either a bug in
43 //! somebody's code, or an attacker is trying to improperly encode data to
44 //! evade security checks.
45 //!
46 //! If you have a use case for lossy conversion to UTF-8, or conversion
47 //! from mixed UTF-8/CESU-8 data, please feel free to submit a pull request
48 //! for `from_cesu8_lossy_permissive` with appropriate behavior.
49 //!
50 //! ### Java and U+0000, and other variants
51 //!
52 //! Java uses the CESU-8 encoding as described above, but with one
53 //! difference: The null character U+0000 is represented as an overlong
54 //! UTF-8 sequence `C0 80`. This is supported by the `from_java_cesu8` and
55 //! `to_java_cesu8` methods.
56 //!
57 //! ### Surrogate pairs and UTF-8
58 //!
59 //! The UTF-16 encoding uses "surrogate pairs" to represent Unicode code
60 //! points in the range from U+10000 to U+10FFFF.  These are 16-bit numbers
61 //! in the range 0xD800 to 0xDFFF.
62 //!
63 //! * 0xD800 to 0xDBFF: First half of surrogate pair.  When encoded as
64 //!   CESU-8, these become **1110**1101 **10**100000 **10**000000 to
65 //!   **1110**1101 **10**101111 **10**111111.
66 //!
67 //! * 0xDC00 to 0xDFFF: Second half of surrogate pair.  These become
68 //!   **1110**1101 **10**110000 **10**000000 to
69 //!   **1110**1101 **10**111111 **10**111111.
70 //!
71 //! Wikipedia [explains](http://en.wikipedia.org/wiki/UTF-16) the
72 //! code point to UTF-16 conversion process:
73 //!
//! > Consider the encoding of U+10437 (𐐷):
75 //! >
76 //! > * Subtract 0x10000 from 0x10437. The result is 0x00437, 0000 0000 0100
77 //! >   0011 0111.
78 //! > * Split this into the high 10-bit value and the low 10-bit value:
79 //! >   0000000001 and 0000110111.
80 //! > * Add 0xD800 to the high value to form the high surrogate: 0xD800 +
81 //! >   0x0001 = 0xD801.
82 //! > * Add 0xDC00 to the low value to form the low surrogate: 0xDC00 +
83 //! >   0x0037 = 0xDC37.
84 
85 #![warn(missing_docs)]
86 
87 
88 use std::borrow::Cow;
89 use std::error::Error;
90 use std::fmt;
91 use std::result::Result;
92 use std::slice;
93 use std::str::{from_utf8, from_utf8_unchecked};
94 use unicode::utf8_char_width;
95 
96 mod unicode;
97 
/// Bit mask selecting the six payload bits of a continuation byte
/// (`0b0011_1111`).
const CONT_MASK: u8 = 0x3F;
/// Tag bits (`0b1000_0000`) found in the top two bits of every continuation
/// byte; the matching tag mask is `!CONT_MASK`.
const TAG_CONT_U8: u8 = 0x80;
102 
103 /// The CESU-8 data could not be decoded as valid UTF-8 data.
104 #[derive(Clone, Copy, Debug)]
105 pub struct Cesu8DecodingError;
106 
107 impl Error for Cesu8DecodingError {
description(&self) -> &str108     fn description(&self) -> &str { "decoding error" }
cause(&self) -> Option<&Error>109     fn cause(&self) -> Option<&Error> { None }
110 }
111 
112 impl fmt::Display for Cesu8DecodingError {
fmt(&self, f: &mut fmt::Formatter) -> fmt::Result113     fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
114         write!(f, "could not convert CESU-8 data to UTF-8")
115     }
116 }
117 
/// Selects which flavour of the encoding is being read or written.
///
/// The value is passed by value into the internal encoder and decoder and
/// compared repeatedly, so it is `Copy`; `Debug` lets it appear in
/// assertion and diagnostic output.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
enum Variant {
    /// Regular CESU-8, with '\0' represented by itself.
    Standard,
    /// Java's "Modified UTF-8": like CESU-8, except that '\0' is written
    /// as the overlong two-byte sequence `C0 80` and never as a literal
    /// zero byte.
    Java,
}
128 
129 /// Convert CESU-8 data to a Rust string, re-encoding only if necessary.
130 /// Returns an error if the data cannot be represented as valid UTF-8.
131 ///
132 /// ```
133 /// use std::borrow::Cow;
134 /// use cesu8::from_cesu8;
135 ///
136 /// // This string is valid as UTF-8 or CESU-8, so it doesn't change,
137 /// // and we can convert it without allocating memory.
138 /// assert_eq!(Cow::Borrowed("aé日"),
139 ///            from_cesu8("aé日".as_bytes()).unwrap());
140 ///
141 /// // This string is CESU-8 data containing a 6-byte surrogate pair,
142 /// // which becomes a 4-byte UTF-8 string.
143 /// let data = &[0xED, 0xA0, 0x81, 0xED, 0xB0, 0x81];
144 /// assert_eq!(Cow::Borrowed("\u{10401}"),
145 ///            from_cesu8(data).unwrap());
146 /// ```
from_cesu8(bytes: &[u8]) -> Result<Cow<str>, Cesu8DecodingError>147 pub fn from_cesu8(bytes: &[u8]) -> Result<Cow<str>, Cesu8DecodingError> {
148     from_cesu8_internal(bytes, Variant::Standard)
149 }
150 
151 /// Convert Java's modified UTF-8 data to a Rust string, re-encoding only if
152 /// necessary. Returns an error if the data cannot be represented as valid
153 /// UTF-8.
154 ///
155 /// ```
156 /// use std::borrow::Cow;
157 /// use cesu8::from_java_cesu8;
158 ///
159 /// // This string is valid as UTF-8 or modified UTF-8, so it doesn't change,
160 /// // and we can convert it without allocating memory.
161 /// assert_eq!(Cow::Borrowed("aé日"),
162 ///            from_java_cesu8("aé日".as_bytes()).unwrap());
163 ///
164 /// // This string is modified UTF-8 data containing a 6-byte surrogate pair,
165 /// // which becomes a 4-byte UTF-8 string.
166 /// let data = &[0xED, 0xA0, 0x81, 0xED, 0xB0, 0x81];
167 /// assert_eq!(Cow::Borrowed("\u{10401}"),
168 ///            from_java_cesu8(data).unwrap());
169 ///
170 /// // This string is modified UTF-8 data containing null code-points.
171 /// let data = &[0xC0, 0x80, 0xC0, 0x80];
172 /// assert_eq!(Cow::Borrowed("\0\0"),
173 ///            from_java_cesu8(data).unwrap());
174 /// ```
from_java_cesu8(bytes: &[u8]) -> Result<Cow<str>, Cesu8DecodingError>175 pub fn from_java_cesu8(bytes: &[u8]) -> Result<Cow<str>, Cesu8DecodingError> {
176     from_cesu8_internal(bytes, Variant::Java)
177 }
178 
179 /// Do the actual work of decoding.
from_cesu8_internal(bytes: &[u8], variant: Variant) -> Result<Cow<str>, Cesu8DecodingError>180 fn from_cesu8_internal(bytes: &[u8], variant: Variant) ->
181     Result<Cow<str>, Cesu8DecodingError>
182 {
183     match from_utf8(bytes) {
184         Ok(str) => Ok(Cow::Borrowed(str)),
185         _ => {
186             let mut decoded = Vec::with_capacity(bytes.len());
187             if decode_from_iter(&mut decoded, &mut bytes.iter(), variant) {
188                 // Keep this assertion in debug mode only.  It's important
189                 // that this assertion is true, because Rust assumes that
190                 // all UTF-8 strings are valid.
191                 debug_assert!(from_utf8(&decoded[..]).is_ok());
192                 Ok(Cow::Owned(unsafe { String::from_utf8_unchecked(decoded) }))
193             } else {
194                 Err(Cesu8DecodingError)
195             }
196         }
197     }
198 }
199 
#[test]
fn test_from_cesu8() {
    // The surrogate-encoded character in the middle of this input comes
    // from the ICU library's icu/source/test/testdata/conversion.txt test
    // case.
    let encoded = &[0x4D, 0xE6, 0x97, 0xA5, 0xED, 0xA0, 0x81, 0xED, 0xB0, 0x81, 0x7F];
    let decoded = from_cesu8(encoded).unwrap();
    assert_eq!(Cow::Borrowed("M日\u{10401}\u{7F}"), decoded);

    // This test once used sample data from the CESU-8 specification, but
    // working that sample through by hand gave a different answer:
    //
    // Input: [0xED, 0xAE, 0x80, 0xED, 0xB0, 0x80]
    // Binary: 11101101 10101110 10000000 11101101 10110000 10000000
    //
    // 0b1101_101110_000000 -> 0xDB80
    // 0b1101_110000_000000 -> 0xDC00
    //
    // ((0xDB80 - 0xD800) << 10) | (0xDC00 - 0xDC00) -> 0xE0000
    // 0x10000 + 0xE0000 -> 0xF0000
    //
    // The spec claims the result should be 0x10000 rather than 0xF0000.
    // Because the example could not be reconciled with the text of the
    // specification, a test character from ICU is used instead.
}
224 
225 // Our internal decoder, based on Rust's is_utf8 implementation.
decode_from_iter( decoded: &mut Vec<u8>, iter: &mut slice::Iter<u8>, variant: Variant) -> bool226 fn decode_from_iter(
227     decoded: &mut Vec<u8>, iter: &mut slice::Iter<u8>, variant: Variant)
228     -> bool
229 {
230     macro_rules! err {
231         () => { return false }
232     }
233     macro_rules! next {
234         () => {
235             match iter.next() {
236                 Some(a) => *a,
237                 // We needed data, but there was none: error!
238                 None => err!()
239             }
240         }
241     }
242     macro_rules! next_cont {
243         () => {
244             {
245                 let byte = next!();
246                 if (byte) & !CONT_MASK == TAG_CONT_U8 { byte } else { err!() }
247             }
248         }
249     }
250 
251     loop {
252         let first = match iter.next() {
253             Some(&b) => b,
254             // We're at the end of the iterator and a codepoint boundary at
255             // the same time, so this string is valid.
256             None => return true
257         };
258 
259         if variant == Variant::Java && first == 0 {
260             // Java's modified UTF-8 should never contain \0 directly.
261             err!();
262         } else if first < 128 {
263             // Pass ASCII through directly.
264             decoded.push(first);
265         } else if first == 0xc0 && variant == Variant::Java {
266             match next!() {
267                 0x80 => decoded.push(0),
268                 _ => err!(),
269             }
270         } else {
271             let w = utf8_char_width(first);
272             let second = next_cont!();
273             match w {
274                 // Two-byte sequences can be used directly.
275                 2 => { decoded.extend([first, second].iter().cloned()); }
276                 3 => {
277                     let third = next_cont!();
278                     match (first, second) {
279                         // These are valid UTF-8, so pass them through.
280                         (0xE0         , 0xA0 ... 0xBF) |
281                         (0xE1 ... 0xEC, 0x80 ... 0xBF) |
282                         (0xED         , 0x80 ... 0x9F) |
283                         (0xEE ... 0xEF, 0x80 ... 0xBF) => {
284                             decoded.extend([first, second, third].iter()
285                                                .cloned())
286                         }
287                         // First half a surrogate pair, so decode.
288                         (0xED         , 0xA0 ... 0xAF) => {
289                             if next!() != 0xED { err!() }
290                             let fifth = next_cont!();
291                             if fifth < 0xB0 || 0xBF < fifth { err!() }
292                             let sixth = next_cont!();
293                             let s = dec_surrogates(second, third, fifth, sixth);
294                             decoded.extend(s.iter().cloned());
295                         }
296                         _ => err!()
297                     }
298                 }
299                 _ => err!()
300             }
301         }
302     }
303 }
304 
305 /// Convert the two trailing bytes from a CESU-8 surrogate to a regular
306 /// surrogate value.
dec_surrogate(second: u8, third: u8) -> u32307 fn dec_surrogate(second: u8, third: u8) -> u32 {
308     0xD000u32 | ((second & CONT_MASK) as u32) << 6 | (third & CONT_MASK) as u32
309 }
310 
311 /// Convert the bytes from a CESU-8 surrogate pair into a valid UTF-8
312 /// sequence.  Assumes input is valid.
dec_surrogates(second: u8, third: u8, fifth: u8, sixth: u8) -> [u8; 4]313 fn dec_surrogates(second: u8, third: u8, fifth: u8, sixth: u8) -> [u8; 4] {
314     // Convert to a 32-bit code point.
315     let s1 = dec_surrogate(second, third);
316     let s2 = dec_surrogate(fifth, sixth);
317     let c = 0x10000 + (((s1 - 0xD800) << 10) | (s2 - 0xDC00));
318     //println!("{:0>8b} {:0>8b} {:0>8b} -> {:0>16b}", 0xEDu8, second, third, s1);
319     //println!("{:0>8b} {:0>8b} {:0>8b} -> {:0>16b}", 0xEDu8, fifth, sixth, s2);
320     //println!("-> {:0>32b}", c);
321     assert!(0x010000 <= c && c <= 0x10FFFF);
322 
323     // Convert to UTF-8.
324     // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
325     [0b1111_0000u8 | ((c & 0b1_1100_0000_0000_0000_0000) >> 18) as u8,
326      TAG_CONT_U8   | ((c & 0b0_0011_1111_0000_0000_0000) >> 12) as u8,
327      TAG_CONT_U8   | ((c & 0b0_0000_0000_1111_1100_0000) >>  6) as u8,
328      TAG_CONT_U8   | ((c & 0b0_0000_0000_0000_0011_1111)      ) as u8]
329 }
330 
331 /// Convert a Rust `&str` to CESU-8 bytes.
332 ///
333 /// ```
334 /// use std::borrow::Cow;
335 /// use cesu8::to_cesu8;
336 ///
337 /// // This string is valid as UTF-8 or CESU-8, so it doesn't change,
338 /// // and we can convert it without allocating memory.
339 /// assert_eq!(Cow::Borrowed("aé日".as_bytes()), to_cesu8("aé日"));
340 ///
341 /// // This string is a 4-byte UTF-8 string, which becomes a 6-byte CESU-8
342 /// // vector.
343 /// assert_eq!(Cow::Borrowed(&[0xED, 0xA0, 0x81, 0xED, 0xB0, 0x81]),
344 ///            to_cesu8("\u{10401}"));
345 /// ```
to_cesu8(text: &str) -> Cow<[u8]>346 pub fn to_cesu8(text: &str) -> Cow<[u8]> {
347     if is_valid_cesu8(text) {
348         Cow::Borrowed(text.as_bytes())
349     } else {
350         Cow::Owned(to_cesu8_internal(text, Variant::Standard))
351     }
352 }
353 
354 /// Convert a Rust `&str` to Java's modified UTF-8 bytes.
355 ///
356 /// ```
357 /// use std::borrow::Cow;
358 /// use cesu8::to_java_cesu8;
359 ///
360 /// // This string is valid as UTF-8 or CESU-8, so it doesn't change,
361 /// // and we can convert it without allocating memory.
362 /// assert_eq!(Cow::Borrowed("aé日".as_bytes()), to_java_cesu8("aé日"));
363 ///
364 /// // This string is a 4-byte UTF-8 string, which becomes a 6-byte modified
365 /// // UTF-8 vector.
366 /// assert_eq!(Cow::Borrowed(&[0xED, 0xA0, 0x81, 0xED, 0xB0, 0x81]),
367 ///            to_java_cesu8("\u{10401}"));
368 ///
369 /// // This string contains null, which becomes 2-byte modified UTF-8 encoding
370 /// assert_eq!(Cow::Borrowed(&[0xC0, 0x80, 0xC0, 0x80]),
371 ///            to_java_cesu8("\0\0"));
372 /// ```
to_java_cesu8(text: &str) -> Cow<[u8]>373 pub fn to_java_cesu8(text: &str) -> Cow<[u8]> {
374     if is_valid_java_cesu8(text) {
375         Cow::Borrowed(text.as_bytes())
376     } else {
377         Cow::Owned(to_cesu8_internal(text, Variant::Java))
378     }
379 }
380 
to_cesu8_internal(text: &str, variant: Variant) -> Vec<u8>381 fn to_cesu8_internal(text: &str, variant: Variant) -> Vec<u8> {
382     let bytes = text.as_bytes();
383     let mut encoded = Vec::with_capacity(bytes.len() + bytes.len() >> 2);
384     let mut i = 0;
385     while i < bytes.len() {
386         let b = bytes[i];
387         if variant == Variant::Java && b == 0 {
388             encoded.push(0xc0);
389             encoded.push(0x80);
390             i += 1;
391         } else if b < 128 {
392             // Pass ASCII through quickly.
393             encoded.push(b);
394             i += 1;
395         } else {
396             // Figure out how many bytes we need for this character.
397             let w = utf8_char_width(b);
398             assert!(w <= 4);
399             assert!(i + w <= bytes.len());
400             if w != 4 {
401                 // Pass through short UTF-8 sequences unmodified.
402                 encoded.extend(bytes[i..i+w].iter().cloned());
403             } else {
404                 // Encode 4-byte sequences as 6 bytes.
405                 let s = unsafe { from_utf8_unchecked(&bytes[i..i+w]) };
406                 let c = s.chars().next().unwrap() as u32 - 0x10000;
407                 let mut s: [u16; 2] = [0; 2];
408                 s[0] = ((c >> 10) as u16)   | 0xD800;
409                 s[1] = ((c & 0x3FF) as u16) | 0xDC00;
410                 encoded.extend(enc_surrogate(s[0]).iter().cloned());
411                 encoded.extend(enc_surrogate(s[1]).iter().cloned());
412             }
413             i += w;
414         }
415     }
416     encoded
417 }
418 
419 /// Check whether a Rust string contains valid CESU-8 data.
is_valid_cesu8(text: &str) -> bool420 pub fn is_valid_cesu8(text: &str) -> bool {
421     // We rely on the fact that Rust strings are guaranteed to be valid
422     // UTF-8.
423     for b in text.bytes() {
424         if (b & !CONT_MASK) == TAG_CONT_U8 { continue; }
425         if utf8_char_width(b) > 3 { return false; }
426     }
427     true
428 }
429 
430 /// Check whether a Rust string contains valid Java's modified UTF-8 data.
is_valid_java_cesu8(text: &str) -> bool431 pub fn is_valid_java_cesu8(text: &str) -> bool {
432     !text.contains('\0') && is_valid_cesu8(text)
433 }
434 
#[test]
fn test_valid_cesu8() {
    // Characters of at most three UTF-8 bytes are valid in both variants.
    let short_sequences = "aé日";
    assert!(is_valid_cesu8(short_sequences));
    assert!(is_valid_java_cesu8(short_sequences));

    // A four-byte character is valid in neither variant.
    let four_byte_char = "\u{10401}";
    assert!(!is_valid_cesu8(four_byte_char));
    assert!(!is_valid_java_cesu8(four_byte_char));

    // Raw nulls are fine for standard CESU-8 but not for the Java variant.
    let nulls = "\0\0";
    assert!(is_valid_cesu8(nulls));
    assert!(!is_valid_java_cesu8(nulls));
}
444 
445 
446 /// Encode a single surrogate as CESU-8.
enc_surrogate(surrogate: u16) -> [u8; 3]447 fn enc_surrogate(surrogate: u16) -> [u8; 3] {
448     assert!(0xD800 <= surrogate && surrogate <= 0xDFFF);
449     // 1110xxxx 10xxxxxx 10xxxxxx
450     [0b11100000  | ((surrogate & 0b11110000_00000000) >> 12) as u8,
451      TAG_CONT_U8 | ((surrogate & 0b00001111_11000000) >>  6) as u8,
452      TAG_CONT_U8 | ((surrogate & 0b00000000_00111111)      ) as u8]
453 }
454