1 /* Copyright 2016 The encode_unicode Developers 2 * 3 * Licensed under the Apache License, Version 2.0, <LICENSE-APACHE or 4 * http://apache.org/licenses/LICENSE-2.0> or the MIT license <LICENSE-MIT or 5 * http://opensource.org/licenses/MIT>, at your option. This file may not be 6 * copied, modified, or distributed except according to those terms. 7 */ 8 9 use errors::{FromStrError, EmptyStrError, NonAsciiError, InvalidUtf8Slice, InvalidUtf8Array}; 10 use utf8_iterators::Utf8Iterator; 11 use traits::{CharExt, U8UtfExt}; 12 use utf16_char::Utf16Char; 13 extern crate core; 14 use self::core::{hash, fmt, str, ptr}; 15 use self::core::cmp::Ordering; 16 use self::core::borrow::Borrow; 17 use self::core::ops::Deref; 18 use self::core::mem::transmute; 19 #[cfg(feature="std")] 20 use self::core::iter::FromIterator; 21 #[cfg(feature="std")] 22 #[allow(deprecated)] 23 use std::ascii::AsciiExt; 24 #[cfg(feature="ascii")] 25 extern crate ascii; 26 #[cfg(feature="ascii")] 27 use self::ascii::{AsciiChar,ToAsciiChar,ToAsciiCharError}; 28 29 30 // I don't think there is any good default value for char, but char does. 31 #[derive(Default)] 32 // char doesn't do anything more advanced than u32 for Eq/Ord, so we shouldn't either. 33 // The default impl of Ord for arrays works out because longer codepoints 34 // start with more ones, so if they're equal, the length is the same, 35 // breaks down for values above 0x1f_ff_ff but those can only be created by unsafe code. 36 #[derive(PartialEq,Eq, PartialOrd,Ord)] 37 38 #[derive(Clone,Copy)] 39 40 41 /// An unicode codepoint stored as UTF-8. 42 /// 43 /// It can be borrowed as a `str`, and has the same size as `char`. 44 pub struct Utf8Char { 45 bytes: [u8; 4], 46 } 47 48 49 ///////////////////// 50 //conversion traits// 51 ///////////////////// 52 impl str::FromStr for Utf8Char { 53 type Err = FromStrError; 54 /// Create an `Utf8Char` from a string slice. 55 /// The string must contain exactly one codepoint. 56 /// 57 /// # Examples 58 /// 59 /// ``` 60 /// use encode_unicode::error::FromStrError::*; 61 /// use encode_unicode::Utf8Char; 62 /// use std::str::FromStr; 63 /// 64 /// assert_eq!(Utf8Char::from_str("a"), Ok(Utf8Char::from('a'))); 65 /// assert_eq!(Utf8Char::from_str(""), Ok(Utf8Char::from(''))); 66 /// assert_eq!(Utf8Char::from_str(""), Err(Empty)); 67 /// assert_eq!(Utf8Char::from_str("ab"), Err(MultipleCodepoints)); 68 /// assert_eq!(Utf8Char::from_str("é"), Err(MultipleCodepoints));// 'e'+u301 combining mark 69 /// ``` from_str(s: &str) -> Result<Self, FromStrError>70 fn from_str(s: &str) -> Result<Self, FromStrError> { 71 if s.is_empty() { 72 Err(FromStrError::Empty) 73 } else if s.len() != 1+s.as_bytes()[0].extra_utf8_bytes_unchecked() { 74 Err(FromStrError::MultipleCodepoints) 75 } else { 76 let mut bytes = [0; 4]; 77 bytes[..s.len()].copy_from_slice(s.as_bytes()); 78 Ok(Utf8Char{bytes: bytes}) 79 } 80 } 81 } 82 impl From<Utf16Char> for Utf8Char { from(utf16: Utf16Char) -> Utf8Char83 fn from(utf16: Utf16Char) -> Utf8Char { 84 match utf16.to_tuple() { 85 (a @ 0...0x00_7f, _) => { 86 Utf8Char{ bytes: [a as u8, 0, 0, 0] } 87 }, 88 (u @ 0...0x07_ff, _) => { 89 let b = 0x80 | (u & 0x00_3f) as u8; 90 let a = 0xc0 | ((u & 0x07_c0) >> 6) as u8; 91 Utf8Char{ bytes: [a, b, 0, 0] } 92 }, 93 (u, None) => { 94 let c = 0x80 | (u & 0x00_3f) as u8; 95 let b = 0x80 | ((u & 0x0f_c0) >> 6) as u8; 96 let a = 0xe0 | ((u & 0xf0_00) >> 12) as u8; 97 Utf8Char{ bytes: [a, b, c, 0] } 98 }, 99 (f, Some(s)) => { 100 let f = f + (0x01_00_00u32 >> 10) as u16; 101 let d = 0x80 | (s & 0x00_3f) as u8; 102 let c = 0x80 | ((s & 0x03_c0) >> 6) as u8 103 | ((f & 0x00_03) << 4) as u8; 104 let b = 0x80 | ((f & 0x00_fc) >> 2) as u8; 105 let a = 0xf0 | ((f & 0x07_00) >> 8) as u8; 106 Utf8Char{ bytes: [a, b, c, d] } 107 } 108 } 109 } 110 } 111 impl From<char> for Utf8Char { from(c: char) -> Self112 fn from(c: char) -> Self { 113 Utf8Char{ bytes: c.to_utf8_array().0 } 114 } 115 } 116 impl From<Utf8Char> for char { from(uc: Utf8Char) -> char117 fn from(uc: Utf8Char) -> char { 118 unsafe{ char::from_utf8_exact_slice_unchecked(&uc.bytes[..uc.len()]) } 119 } 120 } 121 impl IntoIterator for Utf8Char { 122 type Item=u8; 123 type IntoIter=Utf8Iterator; 124 /// Iterate over the byte values. into_iter(self) -> Utf8Iterator125 fn into_iter(self) -> Utf8Iterator { 126 Utf8Iterator::from(self) 127 } 128 } 129 130 #[cfg(feature="std")] 131 impl Extend<Utf8Char> for Vec<u8> { extend<I:IntoIterator<Item=Utf8Char>>(&mut self, iter: I)132 fn extend<I:IntoIterator<Item=Utf8Char>>(&mut self, iter: I) { 133 let iter = iter.into_iter(); 134 self.reserve(iter.size_hint().0); 135 for u8c in iter { 136 // twice as fast as self.extend_from_slice(u8c.as_bytes()); 137 self.push(u8c.bytes[0]); 138 for &extra in &u8c.bytes[1..] { 139 if extra != 0 { 140 self.push(extra); 141 } 142 } 143 } 144 } 145 } 146 #[cfg(feature="std")] 147 impl<'a> Extend<&'a Utf8Char> for Vec<u8> { extend<I:IntoIterator<Item=&'a Utf8Char>>(&mut self, iter: I)148 fn extend<I:IntoIterator<Item=&'a Utf8Char>>(&mut self, iter: I) { 149 self.extend(iter.into_iter().cloned()) 150 } 151 } 152 #[cfg(feature="std")] 153 impl Extend<Utf8Char> for String { extend<I:IntoIterator<Item=Utf8Char>>(&mut self, iter: I)154 fn extend<I:IntoIterator<Item=Utf8Char>>(&mut self, iter: I) { 155 unsafe { self.as_mut_vec().extend(iter) } 156 } 157 } 158 #[cfg(feature="std")] 159 impl<'a> Extend<&'a Utf8Char> for String { extend<I:IntoIterator<Item=&'a Utf8Char>>(&mut self, iter: I)160 fn extend<I:IntoIterator<Item=&'a Utf8Char>>(&mut self, iter: I) { 161 self.extend(iter.into_iter().cloned()) 162 } 163 } 164 #[cfg(feature="std")] 165 impl FromIterator<Utf8Char> for String { from_iter<I:IntoIterator<Item=Utf8Char>>(iter: I) -> String166 fn from_iter<I:IntoIterator<Item=Utf8Char>>(iter: I) -> String { 167 let mut string = String::new(); 168 string.extend(iter); 169 return string; 170 } 171 } 172 #[cfg(feature="std")] 173 impl<'a> FromIterator<&'a Utf8Char> for String { from_iter<I:IntoIterator<Item=&'a Utf8Char>>(iter: I) -> String174 fn from_iter<I:IntoIterator<Item=&'a Utf8Char>>(iter: I) -> String { 175 iter.into_iter().cloned().collect() 176 } 177 } 178 #[cfg(feature="std")] 179 impl FromIterator<Utf8Char> for Vec<u8> { from_iter<I:IntoIterator<Item=Utf8Char>>(iter: I) -> Self180 fn from_iter<I:IntoIterator<Item=Utf8Char>>(iter: I) -> Self { 181 iter.into_iter().collect::<String>().into_bytes() 182 } 183 } 184 #[cfg(feature="std")] 185 impl<'a> FromIterator<&'a Utf8Char> for Vec<u8> { from_iter<I:IntoIterator<Item=&'a Utf8Char>>(iter: I) -> Self186 fn from_iter<I:IntoIterator<Item=&'a Utf8Char>>(iter: I) -> Self { 187 iter.into_iter().cloned().collect::<String>().into_bytes() 188 } 189 } 190 191 192 ///////////////// 193 //getter traits// 194 ///////////////// 195 impl AsRef<[u8]> for Utf8Char { as_ref(&self) -> &[u8]196 fn as_ref(&self) -> &[u8] { 197 &self.bytes[..self.len()] 198 } 199 } 200 impl AsRef<str> for Utf8Char { as_ref(&self) -> &str201 fn as_ref(&self) -> &str { 202 unsafe{ str::from_utf8_unchecked( self.as_ref() ) } 203 } 204 } 205 impl Borrow<[u8]> for Utf8Char { borrow(&self) -> &[u8]206 fn borrow(&self) -> &[u8] { 207 self.as_ref() 208 } 209 } 210 impl Borrow<str> for Utf8Char { borrow(&self) -> &str211 fn borrow(&self) -> &str { 212 self.as_ref() 213 } 214 } 215 impl Deref for Utf8Char { 216 type Target = str; deref(&self) -> &Self::Target217 fn deref(&self) -> &Self::Target { 218 self.as_ref() 219 } 220 } 221 222 223 //////////////// 224 //ascii traits// 225 //////////////// 226 #[cfg(feature="std")] 227 #[allow(deprecated)] 228 impl AsciiExt for Utf8Char { 229 type Owned = Utf8Char; is_ascii(&self) -> bool230 fn is_ascii(&self) -> bool { 231 self.bytes[0].is_ascii() 232 } eq_ignore_ascii_case(&self, other: &Self) -> bool233 fn eq_ignore_ascii_case(&self, other: &Self) -> bool { 234 if self.is_ascii() {self.bytes[0].eq_ignore_ascii_case(&other.bytes[0])} 235 else {self == other} 236 } to_ascii_uppercase(&self) -> Self::Owned237 fn to_ascii_uppercase(&self) -> Self::Owned { 238 let mut uc = *self; 239 uc.make_ascii_uppercase(); 240 uc 241 } to_ascii_lowercase(&self) -> Self::Owned242 fn to_ascii_lowercase(&self) -> Self::Owned { 243 let mut uc = *self; 244 uc.make_ascii_lowercase(); 245 uc 246 } make_ascii_uppercase(&mut self)247 fn make_ascii_uppercase(&mut self) { 248 self.bytes[0].make_ascii_uppercase() 249 } make_ascii_lowercase(&mut self)250 fn make_ascii_lowercase(&mut self) { 251 self.bytes[0].make_ascii_lowercase(); 252 } 253 } 254 255 #[cfg(feature="ascii")] 256 /// Requires the feature "ascii". 257 impl From<AsciiChar> for Utf8Char { from(ac: AsciiChar) -> Self258 fn from(ac: AsciiChar) -> Self { 259 Utf8Char{ bytes: [ac.as_byte(),0,0,0] } 260 } 261 } 262 #[cfg(feature="ascii")] 263 /// Requires the feature "ascii". 264 impl ToAsciiChar for Utf8Char { to_ascii_char(self) -> Result<AsciiChar, ToAsciiCharError>265 fn to_ascii_char(self) -> Result<AsciiChar, ToAsciiCharError> { 266 self.bytes[0].to_ascii_char() 267 } to_ascii_char_unchecked(self) -> AsciiChar268 unsafe fn to_ascii_char_unchecked(self) -> AsciiChar { 269 self.bytes[0].to_ascii_char_unchecked() 270 } 271 } 272 273 274 ///////////////////////////////////////////////////////// 275 //Genaral traits that cannot be derived to emulate char// 276 ///////////////////////////////////////////////////////// 277 impl hash::Hash for Utf8Char { hash<H : hash::Hasher>(&self, state: &mut H)278 fn hash<H : hash::Hasher>(&self, state: &mut H) { 279 self.to_char().hash(state); 280 } 281 } 282 impl fmt::Debug for Utf8Char { fmt(&self, fmtr: &mut fmt::Formatter) -> fmt::Result283 fn fmt(&self, fmtr: &mut fmt::Formatter) -> fmt::Result { 284 fmt::Debug::fmt(&self.to_char(), fmtr) 285 } 286 } 287 impl fmt::Display for Utf8Char { fmt(&self, fmtr: &mut fmt::Formatter) -> fmt::Result288 fn fmt(&self, fmtr: &mut fmt::Formatter) -> fmt::Result { 289 fmtr.write_str(self.as_str()) 290 } 291 } 292 293 294 //////////////////////////////// 295 //Comparisons with other types// 296 //////////////////////////////// 297 impl PartialEq<char> for Utf8Char { eq(&self, u32c: &char) -> bool298 fn eq(&self, u32c: &char) -> bool { 299 *self == Utf8Char::from(*u32c) 300 } 301 } 302 impl PartialEq<Utf8Char> for char { eq(&self, u8c: &Utf8Char) -> bool303 fn eq(&self, u8c: &Utf8Char) -> bool { 304 Utf8Char::from(*self) == *u8c 305 } 306 } 307 impl PartialOrd<char> for Utf8Char { partial_cmp(&self, u32c: &char) -> Option<Ordering>308 fn partial_cmp(&self, u32c: &char) -> Option<Ordering> { 309 self.partial_cmp(&Self::from(*u32c)) 310 } 311 } 312 impl PartialOrd<Utf8Char> for char { partial_cmp(&self, u8c: &Utf8Char) -> Option<Ordering>313 fn partial_cmp(&self, u8c: &Utf8Char) -> Option<Ordering> { 314 Utf8Char::from(*self).partial_cmp(u8c) 315 } 316 } 317 318 impl PartialEq<Utf16Char> for Utf8Char { eq(&self, u16c: &Utf16Char) -> bool319 fn eq(&self, u16c: &Utf16Char) -> bool { 320 *self == Self::from(*u16c) 321 } 322 } 323 impl PartialOrd<Utf16Char> for Utf8Char { partial_cmp(&self, u16c: &Utf16Char) -> Option<Ordering>324 fn partial_cmp(&self, u16c: &Utf16Char) -> Option<Ordering> { 325 self.partial_cmp(&Self::from(*u16c)) 326 } 327 } 328 // The other direction is implemented in utf16_char.rs 329 330 /// Only considers the byte equal if both it and the `Utf8Char` represents ASCII characters. 331 /// 332 /// There is no impl in the opposite direction, as this should only be used to 333 /// compare `Utf8Char`s against constants. 334 /// 335 /// # Examples 336 /// 337 /// ``` 338 /// # use encode_unicode::Utf8Char; 339 /// assert!(Utf8Char::from('8') == b'8'); 340 /// assert!(Utf8Char::from_array([0xf1,0x80,0x80,0x80]).unwrap() != 0xf1); 341 /// assert!(Utf8Char::from('\u{ff}') != 0xff); 342 /// assert!(Utf8Char::from('\u{80}') != 0x80); 343 /// ``` 344 impl PartialEq<u8> for Utf8Char { eq(&self, byte: &u8) -> bool345 fn eq(&self, byte: &u8) -> bool { 346 self.bytes[0] == *byte && self.bytes[1] == 0 347 } 348 } 349 #[cfg(feature = "ascii")] 350 /// `Utf8Char`s that are not ASCII never compare equal. 351 impl PartialEq<AsciiChar> for Utf8Char { 352 #[inline] eq(&self, ascii: &AsciiChar) -> bool353 fn eq(&self, ascii: &AsciiChar) -> bool { 354 self.bytes[0] == *ascii as u8 355 } 356 } 357 #[cfg(feature = "ascii")] 358 /// `Utf8Char`s that are not ASCII never compare equal. 359 impl PartialEq<Utf8Char> for AsciiChar { 360 #[inline] eq(&self, u8c: &Utf8Char) -> bool361 fn eq(&self, u8c: &Utf8Char) -> bool { 362 u8c == self 363 } 364 } 365 #[cfg(feature = "ascii")] 366 /// `Utf8Char`s that are not ASCII always compare greater. 367 impl PartialOrd<AsciiChar> for Utf8Char { 368 #[inline] partial_cmp(&self, ascii: &AsciiChar) -> Option<Ordering>369 fn partial_cmp(&self, ascii: &AsciiChar) -> Option<Ordering> { 370 self.bytes[0].partial_cmp(ascii) 371 } 372 } 373 #[cfg(feature = "ascii")] 374 /// `Utf8Char`s that are not ASCII always compare greater. 375 impl PartialOrd<Utf8Char> for AsciiChar { 376 #[inline] partial_cmp(&self, u8c: &Utf8Char) -> Option<Ordering>377 fn partial_cmp(&self, u8c: &Utf8Char) -> Option<Ordering> { 378 self.partial_cmp(&u8c.bytes[0]) 379 } 380 } 381 382 383 /////////////////////////////////////////////////////// 384 //pub impls that should be together for nicer rustdoc// 385 /////////////////////////////////////////////////////// 386 impl Utf8Char { 387 /// Create an `Utf8Char` from the first codepoint in a `str`. 388 /// 389 /// Returns an error if the `str` is empty. 390 /// 391 /// # Examples 392 /// 393 /// ``` 394 /// use encode_unicode::Utf8Char; 395 /// 396 /// assert_eq!(Utf8Char::from_str_start("a"), Ok((Utf8Char::from('a'),1))); 397 /// assert_eq!(Utf8Char::from_str_start("ab"), Ok((Utf8Char::from('a'),1))); 398 /// assert_eq!(Utf8Char::from_str_start(" "), Ok((Utf8Char::from(''),4))); 399 /// assert_eq!(Utf8Char::from_str_start("é"), Ok((Utf8Char::from('e'),1)));// 'e'+u301 combining mark 400 /// assert!(Utf8Char::from_str_start("").is_err()); 401 /// ``` from_str_start(src: &str) -> Result<(Self,usize),EmptyStrError>402 pub fn from_str_start(src: &str) -> Result<(Self,usize),EmptyStrError> { 403 unsafe { 404 if src.is_empty() { 405 Err(EmptyStrError) 406 } else { 407 Ok(Utf8Char::from_slice_start_unchecked(src.as_bytes())) 408 } 409 } 410 } 411 /// Create an `Utf8Char` of the first codepoint in an UTF-8 slice. 412 /// Also returns the length of the UTF-8 sequence for the codepoint. 413 /// 414 /// If the slice is from a `str`, use `::from_str_start()` to skip UTF-8 validation. 415 /// 416 /// # Errors 417 /// 418 /// Returns an `Err` if the slice is empty, doesn't start with a valid 419 /// UTF-8 sequence or is too short for the sequence. 420 /// 421 /// # Examples 422 /// 423 /// ``` 424 /// use encode_unicode::Utf8Char; 425 /// use encode_unicode::error::InvalidUtf8Slice::*; 426 /// use encode_unicode::error::InvalidUtf8::*; 427 /// 428 /// assert_eq!(Utf8Char::from_slice_start(&[b'A', b'B', b'C']), Ok((Utf8Char::from('A'),1))); 429 /// assert_eq!(Utf8Char::from_slice_start(&[0xdd, 0xbb]), Ok((Utf8Char::from('\u{77b}'),2))); 430 /// 431 /// assert_eq!(Utf8Char::from_slice_start(&[]), Err(TooShort(1))); 432 /// assert_eq!(Utf8Char::from_slice_start(&[0xf0, 0x99]), Err(TooShort(4))); 433 /// assert_eq!(Utf8Char::from_slice_start(&[0xee, b'F', 0x80]), Err(Utf8(NotAContinuationByte(1)))); 434 /// assert_eq!(Utf8Char::from_slice_start(&[0xee, 0x99, 0x0f]), Err(Utf8(NotAContinuationByte(2)))); 435 /// ``` from_slice_start(src: &[u8]) -> Result<(Self,usize),InvalidUtf8Slice>436 pub fn from_slice_start(src: &[u8]) -> Result<(Self,usize),InvalidUtf8Slice> { 437 char::from_utf8_slice_start(src).map(|(_,len)| { 438 let mut bytes = [0; 4]; 439 bytes[..len].copy_from_slice(&src[..len]); 440 (Utf8Char{ bytes: bytes }, len) 441 }) 442 } 443 /// A `from_slice_start()` that doesn't validate the codepoint. 444 /// 445 /// # Safety 446 /// 447 /// The slice must be non-empty and start with a valid UTF-8 codepoint. 448 /// Invalid or incomplete values might cause reads of uninitalized memory. from_slice_start_unchecked(src: &[u8]) -> (Self,usize)449 pub unsafe fn from_slice_start_unchecked(src: &[u8]) -> (Self,usize) { 450 let len = 1+src.get_unchecked(0).extra_utf8_bytes_unchecked(); 451 let mut bytes = [0; 4]; 452 ptr::copy_nonoverlapping(src.as_ptr(), &mut bytes[0] as *mut u8, len); 453 (Utf8Char{ bytes: bytes }, len) 454 } 455 /// Create an `Utf8Char` from a byte array after validating it. 456 /// 457 /// The codepoint must start at the first byte. 458 /// Unused bytes are set to zero by this function and so can be anything. 459 /// 460 /// # Errors 461 /// 462 /// Returns an `Err` if the array doesn't start with a valid UTF-8 sequence. 463 /// 464 /// # Examples 465 /// 466 /// ``` 467 /// use encode_unicode::Utf8Char; 468 /// use encode_unicode::error::InvalidUtf8Array::*; 469 /// use encode_unicode::error::InvalidUtf8::*; 470 /// use encode_unicode::error::InvalidCodepoint::*; 471 /// 472 /// assert_eq!(Utf8Char::from_array([b'A', 0, 0, 0]), Ok(Utf8Char::from('A'))); 473 /// assert_eq!(Utf8Char::from_array([0xf4, 0x8b, 0xbb, 0xbb]), Ok(Utf8Char::from('\u{10befb}'))); 474 /// assert_eq!(Utf8Char::from_array([b'A', b'B', b'C', b'D']), Ok(Utf8Char::from('A'))); 475 /// assert_eq!(Utf8Char::from_array([0, 0, 0xcc, 0xbb]), Ok(Utf8Char::from('\0'))); 476 /// 477 /// assert_eq!(Utf8Char::from_array([0xef, b'F', 0x80, 0x80]), Err(Utf8(NotAContinuationByte(1)))); 478 /// assert_eq!(Utf8Char::from_array([0xc1, 0x80, 0, 0]), Err(Utf8(OverLong))); 479 /// assert_eq!(Utf8Char::from_array([0xf7, 0xaa, 0x99, 0x88]), Err(Codepoint(TooHigh))); 480 /// ``` from_array(utf8: [u8;4]) -> Result<Self,InvalidUtf8Array>481 pub fn from_array(utf8: [u8;4]) -> Result<Self,InvalidUtf8Array> { 482 unsafe { 483 // perform all validation 484 try!(char::from_utf8_array(utf8)); 485 let extra = utf8[0].extra_utf8_bytes_unchecked() as u32; 486 // zero unused bytes in one operation by transmuting the arrary to 487 // u32, apply an endian-corrected mask and transmute back 488 let mask = u32::from_le(0xff_ff_ff_ff >> 8*(3-extra)); 489 let unused_zeroed = mask & transmute::<_,u32>(utf8); 490 Ok(Utf8Char{ bytes: transmute(unused_zeroed) }) 491 } 492 } 493 /// Zero-cost constructor. 494 /// 495 /// # Safety 496 /// 497 /// Must contain a valid codepoint starting at the first byte, with the 498 /// unused bytes zeroed. 499 /// Bad values can easily lead to undefined behavior. 500 #[inline] from_array_unchecked(utf8: [u8;4]) -> Self501 pub unsafe fn from_array_unchecked(utf8: [u8;4]) -> Self { 502 Utf8Char{ bytes: utf8 } 503 } 504 /// Create an `Utf8Char` from a single byte. 505 /// 506 /// The byte must be an ASCII character. 507 /// 508 /// # Errors 509 /// 510 /// Returns `NonAsciiError` if the byte greater than 127. 511 /// 512 /// # Examples 513 /// 514 /// ``` 515 /// # use encode_unicode::Utf8Char; 516 /// assert_eq!(Utf8Char::from_ascii(b'a').unwrap(), 'a'); 517 /// assert!(Utf8Char::from_ascii(128).is_err()); 518 /// ``` from_ascii(ascii: u8) -> Result<Self,NonAsciiError>519 pub fn from_ascii(ascii: u8) -> Result<Self,NonAsciiError> { 520 if ascii as i8 >= 0 { 521 Ok(Utf8Char{ bytes: [ascii, 0, 0, 0] }) 522 } else { 523 Err(NonAsciiError) 524 } 525 } 526 /// Create an `Utf8Char` from a single byte without checking that it's a 527 /// valid codepoint on its own, which is only true for ASCII characters. 528 /// 529 /// # Safety 530 /// 531 /// The byte must be less than 128. 532 #[inline] from_ascii_unchecked(ascii: u8) -> Self533 pub unsafe fn from_ascii_unchecked(ascii: u8) -> Self { 534 Utf8Char{ bytes: [ascii, 0, 0, 0] } 535 } 536 537 /// The number of bytes this character needs. 538 /// 539 /// Is between 1 and 4 (inclusive) and identical to `.as_ref().len()` or 540 /// `.as_char().len_utf8()`. 541 #[inline] len(self) -> usize542 pub fn len(self) -> usize { 543 // Invariants of the extra bytes enambles algorithms that 544 // `u8.extra_utf8_bytes_unchecked()` cannot use. 545 // Some of them turned out to require fewer x86 instructions: 546 547 // Exploits that unused bytes are zero and calculates the number of 548 // trailing zero bytes. 549 // Setting a bit in the first byte prevents the function from returning 550 // 0 for '\0' (which has 32 leading zeros). 551 // trailing and leading is swapped below to optimize for little-endian 552 // architectures. 553 (4 - (u32::to_le(unsafe{transmute(self.bytes)})|1).leading_zeros()/8) as usize 554 555 // Exploits that the extra bytes have their most significant bit set if 556 // in use. 557 // Takes fewer instructions than the one above if popcnt can be used, 558 // (which it cannot by default, 559 // set RUSTFLAGS='-C target-cpu=native' to enable) 560 //let all: u32 = unsafe{transmute(self.bytes)}; 561 //let msb_mask = u32::from_be(0x00808080); 562 //let add_one = u32::from_be(0x80000000); 563 //((all & msb_mask) | add_one).count_ones() as usize 564 } 565 // There is no .is_emty() because this type is never empty. 566 567 /// Checks that the codepoint is an ASCII character. is_ascii(&self) -> bool568 pub fn is_ascii(&self) -> bool { 569 self.bytes[0] <= 127 570 } 571 /// Checks that two characters are an ASCII case-insensitive match. 572 /// 573 /// Is equivalent to `a.to_ascii_lowercase() == b.to_ascii_lowercase()`. 574 #[cfg(feature="std")] eq_ignore_ascii_case(&self, other: &Self) -> bool575 pub fn eq_ignore_ascii_case(&self, other: &Self) -> bool { 576 if self.is_ascii() {self.bytes[0].eq_ignore_ascii_case(&other.bytes[0])} 577 else {self == other} 578 } 579 /// Converts the character to its ASCII upper case equivalent. 580 /// 581 /// ASCII letters 'a' to 'z' are mapped to 'A' to 'Z', 582 /// but non-ASCII letters are unchanged. 583 #[cfg(feature="std")] to_ascii_uppercase(&self) -> Self584 pub fn to_ascii_uppercase(&self) -> Self { 585 let mut uc = *self; 586 uc.make_ascii_uppercase(); 587 uc 588 } 589 /// Converts the character to its ASCII lower case equivalent. 590 /// 591 /// ASCII letters 'A' to 'Z' are mapped to 'a' to 'z', 592 /// but non-ASCII letters are unchanged. 593 #[cfg(feature="std")] to_ascii_lowercase(&self) -> Self594 pub fn to_ascii_lowercase(&self) -> Self { 595 let mut uc = *self; 596 uc.make_ascii_lowercase(); 597 uc 598 } 599 /// Converts the character to its ASCII upper case equivalent in-place. 600 /// 601 /// ASCII letters 'a' to 'z' are mapped to 'A' to 'Z', 602 /// but non-ASCII letters are unchanged. 603 #[inline] 604 #[cfg(feature="std")] make_ascii_uppercase(&mut self)605 pub fn make_ascii_uppercase(&mut self) { 606 self.bytes[0].make_ascii_uppercase() 607 } 608 /// Converts the character to its ASCII lower case equivalent in-place. 609 /// 610 /// ASCII letters 'A' to 'Z' are mapped to 'a' to 'z', 611 /// but non-ASCII letters are unchanged. 612 #[inline] 613 #[cfg(feature="std")] make_ascii_lowercase(&mut self)614 pub fn make_ascii_lowercase(&mut self) { 615 self.bytes[0].make_ascii_lowercase(); 616 } 617 618 /// Convert from UTF-8 to UTF-32 to_char(self) -> char619 pub fn to_char(self) -> char { 620 self.into() 621 } 622 /// Write the internal representation to a slice, 623 /// and then returns the number of bytes written. 624 /// 625 /// # Panics 626 /// 627 /// Will panic the buffer is too small; 628 /// You can get the required length from `.len()`, 629 /// but a buffer of length four is always large enough. to_slice(self, dst: &mut[u8]) -> usize630 pub fn to_slice(self, dst: &mut[u8]) -> usize { 631 if self.len() > dst.len() { 632 panic!("The provided buffer is too small."); 633 } 634 dst[..self.len()].copy_from_slice(&self.bytes[..self.len()]); 635 self.len() 636 } 637 /// Expose the internal array and the number of used bytes. to_array(self) -> ([u8;4],usize)638 pub fn to_array(self) -> ([u8;4],usize) { 639 (self.bytes, self.len()) 640 } 641 /// Return a `str` view of the array the codepoint is stored as. 642 /// 643 /// Is an unambiguous version of `.as_ref()`. as_str(&self) -> &str644 pub fn as_str(&self) -> &str { 645 self.deref() 646 } 647 } 648