1 /* Copyright 2016 The encode_unicode Developers
2  *
3  * Licensed under the Apache License, Version 2.0, <LICENSE-APACHE or
4  * http://apache.org/licenses/LICENSE-2.0> or the MIT license <LICENSE-MIT or
5  * http://opensource.org/licenses/MIT>, at your option. This file may not be
6  * copied, modified, or distributed except according to those terms.
7  */
8 
9 use errors::{FromStrError, EmptyStrError, NonAsciiError, InvalidUtf8Slice, InvalidUtf8Array};
10 use utf8_iterators::Utf8Iterator;
11 use traits::{CharExt, U8UtfExt};
12 use utf16_char::Utf16Char;
13 extern crate core;
14 use self::core::{hash, fmt, str, ptr};
15 use self::core::cmp::Ordering;
16 use self::core::borrow::Borrow;
17 use self::core::ops::Deref;
18 use self::core::mem::transmute;
19 #[cfg(feature="std")]
20 use self::core::iter::FromIterator;
21 #[cfg(feature="std")]
22 #[allow(deprecated)]
23 use std::ascii::AsciiExt;
24 #[cfg(feature="ascii")]
25 extern crate ascii;
26 #[cfg(feature="ascii")]
27 use self::ascii::{AsciiChar,ToAsciiChar,ToAsciiCharError};
28 
29 
// `Default` gives the all-zero array, i.e. the NUL codepoint. That is not an
// obviously good default for a character, but `char` has the same one.
//
// `char` doesn't do anything more advanced than `u32` for Eq/Ord, so neither
// does this type: the derived array comparison works because longer
// codepoints start with more ones in their first byte, so two arrays that
// compare equal always have the same length.
// This breaks down for values above 0x1f_ff_ff, but those can only be created
// by unsafe code.
/// A Unicode codepoint stored as UTF-8.
///
/// It can be borrowed as a `str`, and has the same size as `char`.
#[derive(Default, PartialEq, Eq, PartialOrd, Ord, Clone, Copy)]
pub struct Utf8Char {
    bytes: [u8; 4],
}
47 
48 
49   /////////////////////
50  //conversion traits//
51 /////////////////////
52 impl str::FromStr for Utf8Char {
53     type Err = FromStrError;
54     /// Create an `Utf8Char` from a string slice.
55     /// The string must contain exactly one codepoint.
56     ///
57     /// # Examples
58     ///
59     /// ```
60     /// use encode_unicode::error::FromStrError::*;
61     /// use encode_unicode::Utf8Char;
62     /// use std::str::FromStr;
63     ///
64     /// assert_eq!(Utf8Char::from_str("a"), Ok(Utf8Char::from('a')));
65     /// assert_eq!(Utf8Char::from_str("��"), Ok(Utf8Char::from('��')));
66     /// assert_eq!(Utf8Char::from_str(""), Err(Empty));
67     /// assert_eq!(Utf8Char::from_str("ab"), Err(MultipleCodepoints));
68     /// assert_eq!(Utf8Char::from_str("é"), Err(MultipleCodepoints));// 'e'+u301 combining mark
69     /// ```
from_str(s: &str) -> Result<Self, FromStrError>70     fn from_str(s: &str) -> Result<Self, FromStrError> {
71         if s.is_empty() {
72             Err(FromStrError::Empty)
73         } else if s.len() != 1+s.as_bytes()[0].extra_utf8_bytes_unchecked() {
74             Err(FromStrError::MultipleCodepoints)
75         } else {
76             let mut bytes = [0; 4];
77             bytes[..s.len()].copy_from_slice(s.as_bytes());
78             Ok(Utf8Char{bytes: bytes})
79         }
80     }
81 }
82 impl From<Utf16Char> for Utf8Char {
from(utf16: Utf16Char) -> Utf8Char83     fn from(utf16: Utf16Char) -> Utf8Char {
84         match utf16.to_tuple() {
85             (a @ 0...0x00_7f, _) => {
86                 Utf8Char{ bytes: [a as u8, 0, 0, 0] }
87             },
88             (u @ 0...0x07_ff, _) => {
89                 let b = 0x80 |  (u & 0x00_3f) as u8;
90                 let a = 0xc0 | ((u & 0x07_c0) >> 6) as u8;
91                 Utf8Char{ bytes: [a, b, 0, 0] }
92             },
93             (u, None) => {
94                 let c = 0x80 |  (u & 0x00_3f) as u8;
95                 let b = 0x80 | ((u & 0x0f_c0) >> 6) as u8;
96                 let a = 0xe0 | ((u & 0xf0_00) >> 12) as u8;
97                 Utf8Char{ bytes: [a, b, c, 0] }
98             },
99             (f, Some(s)) => {
100                 let f = f + (0x01_00_00u32 >> 10) as u16;
101                 let d = 0x80 |  (s & 0x00_3f) as u8;
102                 let c = 0x80 | ((s & 0x03_c0) >> 6) as u8
103                              | ((f & 0x00_03) << 4) as u8;
104                 let b = 0x80 | ((f & 0x00_fc) >> 2) as u8;
105                 let a = 0xf0 | ((f & 0x07_00) >> 8) as u8;
106                 Utf8Char{ bytes: [a, b, c, d] }
107             }
108         }
109     }
110 }
111 impl From<char> for Utf8Char {
from(c: char) -> Self112     fn from(c: char) -> Self {
113         Utf8Char{ bytes: c.to_utf8_array().0 }
114     }
115 }
116 impl From<Utf8Char> for char {
from(uc: Utf8Char) -> char117     fn from(uc: Utf8Char) -> char {
118         unsafe{ char::from_utf8_exact_slice_unchecked(&uc.bytes[..uc.len()]) }
119     }
120 }
121 impl IntoIterator for Utf8Char {
122     type Item=u8;
123     type IntoIter=Utf8Iterator;
124     /// Iterate over the byte values.
into_iter(self) -> Utf8Iterator125     fn into_iter(self) -> Utf8Iterator {
126         Utf8Iterator::from(self)
127     }
128 }
129 
#[cfg(feature="std")]
impl Extend<Utf8Char> for Vec<u8> {
    /// Append the UTF-8 bytes of every character yielded by `iter`.
    fn extend<I:IntoIterator<Item=Utf8Char>>(&mut self,  iter: I) {
        let chars = iter.into_iter();
        // Reserve the lower bound of the size hint up front.
        let (at_least, _) = chars.size_hint();
        self.reserve(at_least);
        for u8c in chars {
            // Original author's note: pushing the bytes one by one is twice
            // as fast as self.extend_from_slice(u8c.as_bytes());
            self.push(u8c.bytes[0]);
            // Zero bytes after the first are skipped.
            for index in 1..u8c.bytes.len() {
                let extra = u8c.bytes[index];
                if extra == 0 {
                    continue;
                }
                self.push(extra);
            }
        }
    }
}
#[cfg(feature="std")]
impl<'a> Extend<&'a Utf8Char> for Vec<u8> {
    /// Append the UTF-8 bytes of every referenced character.
    fn extend<I:IntoIterator<Item=&'a Utf8Char>>(&mut self,  iter: I) {
        // Copy the characters out of their references and reuse the
        // by-value implementation.
        let owned = iter.into_iter().cloned();
        self.extend(owned)
    }
}
#[cfg(feature="std")]
impl Extend<Utf8Char> for String {
    /// Append every character yielded by `iter` to the string.
    fn extend<I:IntoIterator<Item=Utf8Char>>(&mut self,  iter: I) {
        // The bytes are pushed straight into the string's buffer through
        // the `Vec<u8>` implementation.
        let buffer = unsafe { self.as_mut_vec() };
        buffer.extend(iter)
    }
}
#[cfg(feature="std")]
impl<'a> Extend<&'a Utf8Char> for String {
    /// Append every referenced character to the string.
    fn extend<I:IntoIterator<Item=&'a Utf8Char>>(&mut self,  iter: I) {
        // Copy the characters out of their references and reuse the
        // by-value implementation.
        let owned = iter.into_iter().cloned();
        self.extend(owned)
    }
}
#[cfg(feature="std")]
impl FromIterator<Utf8Char> for String {
    /// Collect the characters into a new `String`.
    fn from_iter<I:IntoIterator<Item=Utf8Char>>(iter: I) -> String {
        let mut collected = String::new();
        collected.extend(iter);
        collected
    }
}
#[cfg(feature="std")]
impl<'a> FromIterator<&'a Utf8Char> for String {
    /// Collect the referenced characters into a new `String`.
    fn from_iter<I:IntoIterator<Item=&'a Utf8Char>>(iter: I) -> String {
        // Copy the characters and collect them by value.
        let owned = iter.into_iter().cloned();
        owned.collect()
    }
}
#[cfg(feature="std")]
impl FromIterator<Utf8Char> for Vec<u8> {
    /// Collect the UTF-8 bytes of the characters into a new vector.
    fn from_iter<I:IntoIterator<Item=Utf8Char>>(iter: I) -> Self {
        // Build a `String` first, then take over its buffer.
        let text: String = iter.into_iter().collect();
        text.into_bytes()
    }
}
#[cfg(feature="std")]
impl<'a> FromIterator<&'a Utf8Char> for Vec<u8> {
    /// Collect the UTF-8 bytes of the referenced characters into a new vector.
    fn from_iter<I:IntoIterator<Item=&'a Utf8Char>>(iter: I) -> Self {
        // Build a `String` from copies of the characters, then take over
        // its buffer.
        let text: String = iter.into_iter().cloned().collect();
        text.into_bytes()
    }
}
190 
191 
192   /////////////////
193  //getter traits//
194 /////////////////
195 impl AsRef<[u8]> for Utf8Char {
as_ref(&self) -> &[u8]196     fn as_ref(&self) -> &[u8] {
197         &self.bytes[..self.len()]
198     }
199 }
200 impl AsRef<str> for Utf8Char {
as_ref(&self) -> &str201     fn as_ref(&self) -> &str {
202         unsafe{ str::from_utf8_unchecked( self.as_ref() ) }
203     }
204 }
205 impl Borrow<[u8]> for Utf8Char {
borrow(&self) -> &[u8]206     fn borrow(&self) -> &[u8] {
207         self.as_ref()
208     }
209 }
210 impl Borrow<str> for Utf8Char {
borrow(&self) -> &str211     fn borrow(&self) -> &str {
212         self.as_ref()
213     }
214 }
215 impl Deref for Utf8Char {
216     type Target = str;
deref(&self) -> &Self::Target217     fn deref(&self) -> &Self::Target {
218         self.as_ref()
219     }
220 }
221 
222 
223   ////////////////
224  //ascii traits//
225 ////////////////
#[cfg(feature="std")]
#[allow(deprecated)]
impl AsciiExt for Utf8Char {
    type Owned = Utf8Char;
    /// Check whether the first byte is an ASCII byte.
    fn is_ascii(&self) -> bool {
        self.bytes[0].is_ascii()
    }
    /// Compare two characters, ignoring ASCII case.
    fn eq_ignore_ascii_case(&self,  other: &Self) -> bool {
        if !self.is_ascii() {
            // Non-ASCII characters have no ASCII case to ignore.
            return self == other;
        }
        self.bytes[0].eq_ignore_ascii_case(&other.bytes[0])
    }
    /// Return a copy with the ASCII upper case conversion applied.
    fn to_ascii_uppercase(&self) -> Self::Owned {
        let mut upper = *self;
        upper.make_ascii_uppercase();
        upper
    }
    /// Return a copy with the ASCII lower case conversion applied.
    fn to_ascii_lowercase(&self) -> Self::Owned {
        let mut lower = *self;
        lower.make_ascii_lowercase();
        lower
    }
    /// Apply the ASCII upper case conversion to the first byte in place.
    fn make_ascii_uppercase(&mut self) {
        let first = &mut self.bytes[0];
        first.make_ascii_uppercase()
    }
    /// Apply the ASCII lower case conversion to the first byte in place.
    fn make_ascii_lowercase(&mut self) {
        let first = &mut self.bytes[0];
        first.make_ascii_lowercase();
    }
}
254 
#[cfg(feature="ascii")]
/// Requires the feature "ascii".
impl From<AsciiChar> for Utf8Char {
    /// Store the ASCII byte as the first byte; the other three stay zero.
    fn from(ac: AsciiChar) -> Self {
        let byte = ac.as_byte();
        Utf8Char{ bytes: [byte, 0, 0, 0] }
    }
}
#[cfg(feature="ascii")]
/// Requires the feature "ascii".
impl ToAsciiChar for Utf8Char {
    /// Convert by delegating to the first byte's conversion.
    fn to_ascii_char(self) -> Result<AsciiChar, ToAsciiCharError> {
        let first = self.bytes[0];
        first.to_ascii_char()
    }
    /// Unchecked conversion, delegating to the first byte's unchecked
    /// conversion.
    unsafe fn to_ascii_char_unchecked(self) -> AsciiChar {
        let first = self.bytes[0];
        first.to_ascii_char_unchecked()
    }
}
272 
273 
274   /////////////////////////////////////////////////////////
 //General traits that cannot be derived to emulate char//
276 /////////////////////////////////////////////////////////
277 impl hash::Hash for Utf8Char {
hash<H : hash::Hasher>(&self, state: &mut H)278     fn hash<H : hash::Hasher>(&self,  state: &mut H) {
279         self.to_char().hash(state);
280     }
281 }
282 impl fmt::Debug for Utf8Char {
fmt(&self, fmtr: &mut fmt::Formatter) -> fmt::Result283     fn fmt(&self,  fmtr: &mut fmt::Formatter) -> fmt::Result {
284         fmt::Debug::fmt(&self.to_char(), fmtr)
285     }
286 }
287 impl fmt::Display for Utf8Char {
fmt(&self, fmtr: &mut fmt::Formatter) -> fmt::Result288     fn fmt(&self,  fmtr: &mut fmt::Formatter) -> fmt::Result {
289         fmtr.write_str(self.as_str())
290     }
291 }
292 
293 
294   ////////////////////////////////
295  //Comparisons with other types//
296 ////////////////////////////////
297 impl PartialEq<char> for Utf8Char {
eq(&self, u32c: &char) -> bool298     fn eq(&self,  u32c: &char) -> bool {
299         *self == Utf8Char::from(*u32c)
300     }
301 }
302 impl PartialEq<Utf8Char> for char {
eq(&self, u8c: &Utf8Char) -> bool303     fn eq(&self,  u8c: &Utf8Char) -> bool {
304         Utf8Char::from(*self) == *u8c
305     }
306 }
307 impl PartialOrd<char> for Utf8Char {
partial_cmp(&self, u32c: &char) -> Option<Ordering>308     fn partial_cmp(&self,  u32c: &char) -> Option<Ordering> {
309         self.partial_cmp(&Self::from(*u32c))
310     }
311 }
312 impl PartialOrd<Utf8Char> for char {
partial_cmp(&self, u8c: &Utf8Char) -> Option<Ordering>313     fn partial_cmp(&self,  u8c: &Utf8Char) -> Option<Ordering> {
314         Utf8Char::from(*self).partial_cmp(u8c)
315     }
316 }
317 
318 impl PartialEq<Utf16Char> for Utf8Char {
eq(&self, u16c: &Utf16Char) -> bool319     fn eq(&self,  u16c: &Utf16Char) -> bool {
320         *self == Self::from(*u16c)
321     }
322 }
323 impl PartialOrd<Utf16Char> for Utf8Char {
partial_cmp(&self, u16c: &Utf16Char) -> Option<Ordering>324     fn partial_cmp(&self,  u16c: &Utf16Char) -> Option<Ordering> {
325         self.partial_cmp(&Self::from(*u16c))
326     }
327 }
328 // The other direction is implemented in utf16_char.rs
329 
330 /// Only considers the byte equal if both it and the `Utf8Char` represents ASCII characters.
331 ///
332 /// There is no impl in the opposite direction, as this should only be used to
333 /// compare `Utf8Char`s against constants.
334 ///
335 /// # Examples
336 ///
337 /// ```
338 /// # use encode_unicode::Utf8Char;
339 /// assert!(Utf8Char::from('8') == b'8');
340 /// assert!(Utf8Char::from_array([0xf1,0x80,0x80,0x80]).unwrap() != 0xf1);
341 /// assert!(Utf8Char::from('\u{ff}') != 0xff);
342 /// assert!(Utf8Char::from('\u{80}') != 0x80);
343 /// ```
344 impl PartialEq<u8> for Utf8Char {
eq(&self, byte: &u8) -> bool345     fn eq(&self,  byte: &u8) -> bool {
346         self.bytes[0] == *byte  &&  self.bytes[1] == 0
347     }
348 }
#[cfg(feature = "ascii")]
/// `Utf8Char`s that are not ASCII never compare equal.
impl PartialEq<AsciiChar> for Utf8Char {
    #[inline]
    fn eq(&self,  ascii: &AsciiChar) -> bool {
        // Only the first byte needs to be compared.
        let code = *ascii as u8;
        self.bytes[0] == code
    }
}
#[cfg(feature = "ascii")]
/// `Utf8Char`s that are not ASCII never compare equal.
impl PartialEq<Utf8Char> for AsciiChar {
    #[inline]
    fn eq(&self,  u8c: &Utf8Char) -> bool {
        // Delegate to the comparison in the other direction.
        *u8c == *self
    }
}
#[cfg(feature = "ascii")]
/// `Utf8Char`s that are not ASCII always compare greater.
impl PartialOrd<AsciiChar> for Utf8Char {
    #[inline]
    fn partial_cmp(&self,  ascii: &AsciiChar) -> Option<Ordering> {
        // Only the first byte takes part in the comparison.
        let first = self.bytes[0];
        first.partial_cmp(ascii)
    }
}
#[cfg(feature = "ascii")]
/// `Utf8Char`s that are not ASCII always compare greater.
impl PartialOrd<Utf8Char> for AsciiChar {
    #[inline]
    fn partial_cmp(&self,  u8c: &Utf8Char) -> Option<Ordering> {
        // Only the first byte takes part in the comparison.
        let first = &u8c.bytes[0];
        self.partial_cmp(first)
    }
}
381 
382 
383   ///////////////////////////////////////////////////////
384  //pub impls that should be together for nicer rustdoc//
385 ///////////////////////////////////////////////////////
386 impl Utf8Char {
387     /// Create an `Utf8Char` from the first codepoint in a `str`.
388     ///
389     /// Returns an error if the `str` is empty.
390     ///
391     /// # Examples
392     ///
393     /// ```
394     /// use encode_unicode::Utf8Char;
395     ///
396     /// assert_eq!(Utf8Char::from_str_start("a"), Ok((Utf8Char::from('a'),1)));
397     /// assert_eq!(Utf8Char::from_str_start("ab"), Ok((Utf8Char::from('a'),1)));
398     /// assert_eq!(Utf8Char::from_str_start("�� "), Ok((Utf8Char::from('��'),4)));
399     /// assert_eq!(Utf8Char::from_str_start("é"), Ok((Utf8Char::from('e'),1)));// 'e'+u301 combining mark
400     /// assert!(Utf8Char::from_str_start("").is_err());
401     /// ```
from_str_start(src: &str) -> Result<(Self,usize),EmptyStrError>402     pub fn from_str_start(src: &str) -> Result<(Self,usize),EmptyStrError> {
403         unsafe {
404             if src.is_empty() {
405                 Err(EmptyStrError)
406             } else {
407                 Ok(Utf8Char::from_slice_start_unchecked(src.as_bytes()))
408             }
409         }
410     }
411     /// Create an `Utf8Char` of the first codepoint in an UTF-8 slice.
412     /// Also returns the length of the UTF-8 sequence for the codepoint.
413     ///
414     /// If the slice is from a `str`, use `::from_str_start()` to skip UTF-8 validation.
415     ///
416     /// # Errors
417     ///
418     /// Returns an `Err` if the slice is empty, doesn't start with a valid
419     /// UTF-8 sequence or is too short for the sequence.
420     ///
421     /// # Examples
422     ///
423     /// ```
424     /// use encode_unicode::Utf8Char;
425     /// use encode_unicode::error::InvalidUtf8Slice::*;
426     /// use encode_unicode::error::InvalidUtf8::*;
427     ///
428     /// assert_eq!(Utf8Char::from_slice_start(&[b'A', b'B', b'C']), Ok((Utf8Char::from('A'),1)));
429     /// assert_eq!(Utf8Char::from_slice_start(&[0xdd, 0xbb]), Ok((Utf8Char::from('\u{77b}'),2)));
430     ///
431     /// assert_eq!(Utf8Char::from_slice_start(&[]), Err(TooShort(1)));
432     /// assert_eq!(Utf8Char::from_slice_start(&[0xf0, 0x99]), Err(TooShort(4)));
433     /// assert_eq!(Utf8Char::from_slice_start(&[0xee, b'F', 0x80]), Err(Utf8(NotAContinuationByte(1))));
434     /// assert_eq!(Utf8Char::from_slice_start(&[0xee, 0x99, 0x0f]), Err(Utf8(NotAContinuationByte(2))));
435     /// ```
from_slice_start(src: &[u8]) -> Result<(Self,usize),InvalidUtf8Slice>436     pub fn from_slice_start(src: &[u8]) -> Result<(Self,usize),InvalidUtf8Slice> {
437         char::from_utf8_slice_start(src).map(|(_,len)| {
438             let mut bytes = [0; 4];
439             bytes[..len].copy_from_slice(&src[..len]);
440             (Utf8Char{ bytes: bytes }, len)
441         })
442     }
443     /// A `from_slice_start()` that doesn't validate the codepoint.
444     ///
445     /// # Safety
446     ///
447     /// The slice must be non-empty and start with a valid UTF-8 codepoint.
448     /// Invalid or incomplete values might cause reads of uninitalized memory.
from_slice_start_unchecked(src: &[u8]) -> (Self,usize)449     pub unsafe fn from_slice_start_unchecked(src: &[u8]) -> (Self,usize) {
450         let len = 1+src.get_unchecked(0).extra_utf8_bytes_unchecked();
451         let mut bytes = [0; 4];
452         ptr::copy_nonoverlapping(src.as_ptr(), &mut bytes[0] as *mut u8, len);
453         (Utf8Char{ bytes: bytes }, len)
454     }
455     /// Create an `Utf8Char` from a byte array after validating it.
456     ///
457     /// The codepoint must start at the first byte.
458     /// Unused bytes are set to zero by this function and so can be anything.
459     ///
460     /// # Errors
461     ///
462     /// Returns an `Err` if the array doesn't start with a valid UTF-8 sequence.
463     ///
464     /// # Examples
465     ///
466     /// ```
467     /// use encode_unicode::Utf8Char;
468     /// use encode_unicode::error::InvalidUtf8Array::*;
469     /// use encode_unicode::error::InvalidUtf8::*;
470     /// use encode_unicode::error::InvalidCodepoint::*;
471     ///
472     /// assert_eq!(Utf8Char::from_array([b'A', 0, 0, 0]), Ok(Utf8Char::from('A')));
473     /// assert_eq!(Utf8Char::from_array([0xf4, 0x8b, 0xbb, 0xbb]), Ok(Utf8Char::from('\u{10befb}')));
474     /// assert_eq!(Utf8Char::from_array([b'A', b'B', b'C', b'D']), Ok(Utf8Char::from('A')));
475     /// assert_eq!(Utf8Char::from_array([0, 0, 0xcc, 0xbb]), Ok(Utf8Char::from('\0')));
476     ///
477     /// assert_eq!(Utf8Char::from_array([0xef, b'F', 0x80, 0x80]), Err(Utf8(NotAContinuationByte(1))));
478     /// assert_eq!(Utf8Char::from_array([0xc1, 0x80, 0, 0]), Err(Utf8(OverLong)));
479     /// assert_eq!(Utf8Char::from_array([0xf7, 0xaa, 0x99, 0x88]), Err(Codepoint(TooHigh)));
480     /// ```
from_array(utf8: [u8;4]) -> Result<Self,InvalidUtf8Array>481     pub fn from_array(utf8: [u8;4]) -> Result<Self,InvalidUtf8Array> {
482         unsafe {
483             // perform all validation
484             try!(char::from_utf8_array(utf8));
485             let extra = utf8[0].extra_utf8_bytes_unchecked() as u32;
486             // zero unused bytes in one operation by transmuting the arrary to
487             // u32, apply an endian-corrected mask and transmute back
488             let mask = u32::from_le(0xff_ff_ff_ff >> 8*(3-extra));
489             let unused_zeroed = mask  &  transmute::<_,u32>(utf8);
490             Ok(Utf8Char{ bytes: transmute(unused_zeroed) })
491         }
492     }
493     /// Zero-cost constructor.
494     ///
495     /// # Safety
496     ///
497     /// Must contain a valid codepoint starting at the first byte, with the
498     /// unused bytes zeroed.
499     /// Bad values can easily lead to undefined behavior.
500     #[inline]
from_array_unchecked(utf8: [u8;4]) -> Self501     pub unsafe fn from_array_unchecked(utf8: [u8;4]) -> Self {
502         Utf8Char{ bytes: utf8 }
503     }
504     /// Create an `Utf8Char` from a single byte.
505     ///
506     /// The byte must be an ASCII character.
507     ///
508     /// # Errors
509     ///
510     /// Returns `NonAsciiError` if the byte greater than 127.
511     ///
512     /// # Examples
513     ///
514     /// ```
515     /// # use encode_unicode::Utf8Char;
516     /// assert_eq!(Utf8Char::from_ascii(b'a').unwrap(), 'a');
517     /// assert!(Utf8Char::from_ascii(128).is_err());
518     /// ```
from_ascii(ascii: u8) -> Result<Self,NonAsciiError>519     pub fn from_ascii(ascii: u8) -> Result<Self,NonAsciiError> {
520         if ascii as i8 >= 0 {
521             Ok(Utf8Char{ bytes: [ascii, 0, 0, 0] })
522         } else {
523             Err(NonAsciiError)
524         }
525     }
526     /// Create an `Utf8Char` from a single byte without checking that it's a
527     /// valid codepoint on its own, which is only true for ASCII characters.
528     ///
529     /// # Safety
530     ///
531     /// The byte must be less than 128.
532     #[inline]
from_ascii_unchecked(ascii: u8) -> Self533     pub unsafe fn from_ascii_unchecked(ascii: u8) -> Self {
534         Utf8Char{ bytes: [ascii, 0, 0, 0] }
535     }
536 
537     /// The number of bytes this character needs.
538     ///
539     /// Is between 1 and 4 (inclusive) and identical to `.as_ref().len()` or
540     /// `.as_char().len_utf8()`.
541     #[inline]
len(self) -> usize542     pub fn len(self) -> usize {
543         // Invariants of the extra bytes enambles algorithms that
544         // `u8.extra_utf8_bytes_unchecked()` cannot use.
545         // Some of them turned out to require fewer x86 instructions:
546 
547         // Exploits that unused bytes are zero and calculates the number of
548         // trailing zero bytes.
549         // Setting a bit in the first byte prevents the function from returning
550         // 0 for '\0' (which has 32 leading zeros).
551         // trailing and leading is swapped below to optimize for little-endian
552         // architectures.
553         (4 - (u32::to_le(unsafe{transmute(self.bytes)})|1).leading_zeros()/8) as usize
554 
555         // Exploits that the extra bytes have their most significant bit set if
556         // in use.
557         // Takes fewer instructions than the one above if popcnt can be used,
558         // (which it cannot by default,
559         //  set RUSTFLAGS='-C target-cpu=native' to enable)
560         //let all: u32 = unsafe{transmute(self.bytes)};
561         //let msb_mask = u32::from_be(0x00808080);
562         //let add_one = u32::from_be(0x80000000);
563         //((all & msb_mask) | add_one).count_ones() as usize
564     }
    // There is no .is_empty() because this type is never empty.
566 
567     /// Checks that the codepoint is an ASCII character.
is_ascii(&self) -> bool568     pub fn is_ascii(&self) -> bool {
569         self.bytes[0] <= 127
570     }
571     /// Checks that two characters are an ASCII case-insensitive match.
572     ///
573     /// Is equivalent to `a.to_ascii_lowercase() == b.to_ascii_lowercase()`.
574     #[cfg(feature="std")]
eq_ignore_ascii_case(&self, other: &Self) -> bool575     pub fn eq_ignore_ascii_case(&self,  other: &Self) -> bool {
576         if self.is_ascii() {self.bytes[0].eq_ignore_ascii_case(&other.bytes[0])}
577         else               {self == other}
578     }
579     /// Converts the character to its ASCII upper case equivalent.
580     ///
581     /// ASCII letters 'a' to 'z' are mapped to 'A' to 'Z',
582     /// but non-ASCII letters are unchanged.
583     #[cfg(feature="std")]
to_ascii_uppercase(&self) -> Self584     pub fn to_ascii_uppercase(&self) -> Self {
585         let mut uc = *self;
586         uc.make_ascii_uppercase();
587         uc
588     }
589     /// Converts the character to its ASCII lower case equivalent.
590     ///
591     /// ASCII letters 'A' to 'Z' are mapped to 'a' to 'z',
592     /// but non-ASCII letters are unchanged.
593     #[cfg(feature="std")]
to_ascii_lowercase(&self) -> Self594     pub fn to_ascii_lowercase(&self) -> Self {
595         let mut uc = *self;
596         uc.make_ascii_lowercase();
597         uc
598     }
599     /// Converts the character to its ASCII upper case equivalent in-place.
600     ///
601     /// ASCII letters 'a' to 'z' are mapped to 'A' to 'Z',
602     /// but non-ASCII letters are unchanged.
603     #[inline]
604     #[cfg(feature="std")]
make_ascii_uppercase(&mut self)605     pub fn make_ascii_uppercase(&mut self) {
606         self.bytes[0].make_ascii_uppercase()
607     }
608     /// Converts the character to its ASCII lower case equivalent in-place.
609     ///
610     /// ASCII letters 'A' to 'Z' are mapped to 'a' to 'z',
611     /// but non-ASCII letters are unchanged.
612     #[inline]
613     #[cfg(feature="std")]
make_ascii_lowercase(&mut self)614     pub fn make_ascii_lowercase(&mut self) {
615         self.bytes[0].make_ascii_lowercase();
616     }
617 
618     /// Convert from UTF-8 to UTF-32
to_char(self) -> char619     pub fn to_char(self) -> char {
620         self.into()
621     }
622     /// Write the internal representation to a slice,
623     /// and then returns the number of bytes written.
624     ///
625     /// # Panics
626     ///
627     /// Will panic the buffer is too small;
628     /// You can get the required length from `.len()`,
629     /// but a buffer of length four is always large enough.
to_slice(self, dst: &mut[u8]) -> usize630     pub fn to_slice(self,  dst: &mut[u8]) -> usize {
631         if self.len() > dst.len() {
632             panic!("The provided buffer is too small.");
633         }
634         dst[..self.len()].copy_from_slice(&self.bytes[..self.len()]);
635         self.len()
636     }
637     /// Expose the internal array and the number of used bytes.
to_array(self) -> ([u8;4],usize)638     pub fn to_array(self) -> ([u8;4],usize) {
639         (self.bytes, self.len())
640     }
641     /// Return a `str` view of the array the codepoint is stored as.
642     ///
643     /// Is an unambiguous version of `.as_ref()`.
as_str(&self) -> &str644     pub fn as_str(&self) -> &str {
645         self.deref()
646     }
647 }
648