1 //! This crate exposes the Unicode `Script` and `Script_Extension` 2 //! properties from [UAX #24](http://www.unicode.org/reports/tr24/) 3 4 #![cfg_attr(not(test), no_std)] 5 #![cfg_attr(feature = "bench", feature(test))] 6 7 #[rustfmt::skip] 8 mod tables; 9 10 use core::convert::TryFrom; 11 use core::fmt; 12 use core::u64; 13 pub use tables::script_extensions; 14 use tables::{get_script, get_script_extension, NEXT_SCRIPT}; 15 pub use tables::{Script, UNICODE_VERSION}; 16 17 impl Script { 18 /// Get the full name of a script full_name(self) -> &'static str19 pub fn full_name(self) -> &'static str { 20 self.inner_full_name() 21 } 22 23 /// Get the four-character short name of a script short_name(self) -> &'static str24 pub fn short_name(self) -> &'static str { 25 self.inner_short_name() 26 } 27 28 /// Is this script "Recommended" according to 29 /// [UAX #31](www.unicode.org/reports/tr31/#Table_Recommended_Scripts)? is_recommended(self) -> bool30 pub fn is_recommended(self) -> bool { 31 use Script::*; 32 match self { 33 Common | Inherited | Arabic | Armenian | Bengali | Bopomofo | Cyrillic | Devanagari 34 | Ethiopic | Georgian | Greek | Gujarati | Gurmukhi | Han | Hangul | Hebrew 35 | Hiragana | Kannada | Katakana | Khmer | Lao | Latin | Malayalam | Myanmar | Oriya 36 | Sinhala | Tamil | Telugu | Thaana | Thai | Tibetan => true, 37 _ => false, 38 } 39 } 40 } 41 42 impl From<Script> for ScriptExtension { from(script: Script) -> Self43 fn from(script: Script) -> Self { 44 if script == Script::Common { 45 ScriptExtension::new_common() 46 } else if script == Script::Inherited { 47 ScriptExtension::new_inherited() 48 } else if script == Script::Unknown { 49 ScriptExtension::new_unknown() 50 } else { 51 let mut first = 0; 52 let mut second = 0; 53 let mut third = 0; 54 let bit = script as u8; 55 // Find out which field it's in, and set the appropriate bit there 56 if bit < 64 { 57 first = 1 << bit as u64; 58 } else if bit < 128 { 59 // offset by 64 since `bit` is an absolute number, 60 // not relative to the chunk 61 second = 1 << (bit - 64) as u64; 62 } else { 63 third = 1 << (bit - 128) as u32; 64 } 65 ScriptExtension::new(first, second, third) 66 } 67 } 68 } 69 70 impl TryFrom<ScriptExtension> for Script { 71 type Error = (); try_from(ext: ScriptExtension) -> Result<Self, ()>72 fn try_from(ext: ScriptExtension) -> Result<Self, ()> { 73 if ext.is_common_or_inherited() { 74 if ext.common { 75 Ok(Script::Common) 76 } else { 77 Ok(Script::Inherited) 78 } 79 } else if ext.is_empty() { 80 Ok(Script::Unknown) 81 } else { 82 // filled elements will have set ones 83 let fo = ext.first.count_ones(); 84 let so = ext.second.count_ones(); 85 let to = ext.third.count_ones(); 86 // only one bit set, in the first chunk 87 if fo == 1 && so == 0 && to == 0 { 88 // use trailing_zeroes() to figure out which bit it is 89 Ok(Script::for_integer(ext.first.trailing_zeros() as u8)) 90 // only one bit set, in the second chunk 91 } else if fo == 0 && so == 1 && to == 0 { 92 Ok(Script::for_integer(64 + ext.second.trailing_zeros() as u8)) 93 // only one bit set, in the third chunk 94 } else if fo == 0 && so == 0 && to == 1 { 95 Ok(Script::for_integer(128 + ext.third.trailing_zeros() as u8)) 96 } else { 97 Err(()) 98 } 99 } 100 } 101 } 102 103 impl Default for Script { default() -> Self104 fn default() -> Self { 105 Script::Common 106 } 107 } 108 109 impl From<char> for Script { from(o: char) -> Self110 fn from(o: char) -> Self { 111 o.script() 112 } 113 } 114 115 #[derive(Clone, Copy, PartialEq, Eq, Hash)] 116 #[non_exhaustive] 117 /// A value for the `Script_Extension` property 118 /// 119 /// [`ScriptExtension`] is one or more [`Script`] 120 /// 121 /// This is essentially an optimized version of `Vec<Script>` that uses bitfields 122 pub struct ScriptExtension { 123 // A bitset for the first 64 scripts 124 first: u64, 125 // A bitset for the scripts 65-128 126 second: u64, 127 // A bitset for scripts after 128 128 third: u32, 129 // Both Common and Inherited are represented by all used bits being set, 130 // this flag lets us distinguish the two. 131 common: bool, 132 } 133 134 impl ScriptExtension { 135 // We don't use the complete u32 of `third`, so the "all" value is not just u32::MAX 136 // Instead, we take the number of the next (unused) script bit, subtract 128 to bring 137 // it in the range of `third`, create a u32 with just that bit set, and subtract 1 138 // to create one with all the lower bits set. 139 const THIRD_MAX: u32 = ((1 << (NEXT_SCRIPT - 128)) - 1); 140 new(first: u64, second: u64, third: u32) -> Self141 pub(crate) const fn new(first: u64, second: u64, third: u32) -> Self { 142 ScriptExtension { 143 first, 144 second, 145 third, 146 common: false, 147 } 148 } 149 new_common() -> Self150 pub(crate) const fn new_common() -> Self { 151 ScriptExtension { 152 first: u64::MAX, 153 second: u64::MAX, 154 third: Self::THIRD_MAX, 155 common: true, 156 } 157 } 158 new_inherited() -> Self159 pub(crate) const fn new_inherited() -> Self { 160 ScriptExtension { 161 first: u64::MAX, 162 second: u64::MAX, 163 third: Self::THIRD_MAX, 164 common: false, 165 } 166 } 167 new_unknown() -> Self168 pub(crate) const fn new_unknown() -> Self { 169 ScriptExtension { 170 first: 0, 171 second: 0, 172 third: 0, 173 common: false, 174 } 175 } 176 is_common_or_inherited(self) -> bool177 const fn is_common_or_inherited(self) -> bool { 178 (self.first == u64::MAX) & (self.second == u64::MAX) & (self.third == Self::THIRD_MAX) 179 } 180 181 /// Checks if the script extension is Common is_common(self) -> bool182 pub const fn is_common(self) -> bool { 183 self.is_common_or_inherited() & self.common 184 } 185 186 /// Checks if the script extension is Inherited is_inherited(self) -> bool187 pub const fn is_inherited(self) -> bool { 188 self.is_common_or_inherited() & !self.common 189 } 190 191 /// Checks if the script extension is empty (unknown) is_empty(self) -> bool192 pub const fn is_empty(self) -> bool { 193 (self.first == 0) & (self.second == 0) & (self.third == 0) 194 } 195 196 /// Intersect this `ScriptExtension` with another `ScriptExtension`. Produces `Unknown` if things 197 /// do not intersect. This is equivalent to [`ScriptExtension::intersection`] but it stores the result 198 /// in `self` 199 /// 200 /// "Common" (`Zyyy`) and "Inherited" (`Zinh`) are considered as intersecting 201 /// everything, the intersection of `Common` and `Inherited` is `Inherited` intersect_with(&mut self, other: Self)202 pub fn intersect_with(&mut self, other: Self) { 203 *self = self.intersection(other) 204 } 205 206 /// Find the intersection between two ScriptExtensions. Returns Unknown if things 207 /// do not intersect. 208 /// 209 /// "Common" (`Zyyy`) and "Inherited" (`Zinh`) are considered as intersecting 210 /// everything, the intersection of `Common` and `Inherited` is `Inherited` intersection(self, other: Self) -> Self211 pub const fn intersection(self, other: Self) -> Self { 212 let first = self.first & other.first; 213 let second = self.second & other.second; 214 let third = self.third & other.third; 215 let common = self.common & other.common; 216 ScriptExtension { 217 first, 218 second, 219 third, 220 common, 221 } 222 } 223 224 /// Find the union between two ScriptExtensions. 225 /// 226 /// "Common" (`Zyyy`) and "Inherited" (`Zinh`) are considered as intersecting 227 /// everything, the union of `Common` and `Inherited` is `Common` union(self, other: Self) -> Self228 pub const fn union(self, other: Self) -> Self { 229 let first = self.first | other.first; 230 let second = self.second | other.second; 231 let third = self.third | other.third; 232 let common = self.common | other.common; 233 ScriptExtension { 234 first, 235 second, 236 third, 237 common, 238 } 239 } 240 241 /// Check if this ScriptExtension contains the given script 242 /// 243 /// Should be used with specific scripts only, this will 244 /// return `true` if `self` is not `Unknown` and `script` is 245 /// `Common` or `Inherited` contains_script(self, script: Script) -> bool246 pub fn contains_script(self, script: Script) -> bool { 247 !self.intersection(script.into()).is_empty() 248 } 249 250 /// Get the intersection of script extensions of all characters 251 /// in a string. for_str(x: &str) -> Self252 pub fn for_str(x: &str) -> Self { 253 let mut ext = ScriptExtension::default(); 254 for ch in x.chars() { 255 ext.intersect_with(ch.into()); 256 } 257 ext 258 } 259 260 /// Iterate over the scripts in this string 261 /// 262 /// Will never yeild Script::Unknown iter(self) -> ScriptIterator263 pub fn iter(self) -> ScriptIterator { 264 ScriptIterator { ext: self } 265 } 266 } 267 268 impl Default for ScriptExtension { default() -> Self269 fn default() -> Self { 270 ScriptExtension::new_common() 271 } 272 } 273 274 impl From<char> for ScriptExtension { from(o: char) -> Self275 fn from(o: char) -> Self { 276 o.script_extension() 277 } 278 } 279 280 impl From<&'_ str> for ScriptExtension { from(o: &'_ str) -> Self281 fn from(o: &'_ str) -> Self { 282 Self::for_str(o) 283 } 284 } 285 286 impl fmt::Debug for ScriptExtension { fmt(&self, f: &mut fmt::Formatter) -> fmt::Result287 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { 288 write!(f, "ScriptExtension(")?; 289 if self.is_common() { 290 write!(f, "Common")?; 291 } else if self.is_inherited() { 292 write!(f, "Inherited")?; 293 } else if self.is_empty() { 294 write!(f, "Unknown")?; 295 } else { 296 let mut first = true; 297 for script in self.iter() { 298 if !first { 299 write!(f, " + ")?; 300 first = false; 301 } 302 script.full_name().fmt(f)?; 303 } 304 } 305 write!(f, ")") 306 } 307 } 308 309 /// Extension trait on `char` for calculating script properties 310 pub trait UnicodeScript { 311 /// Get the script for a given character script(&self) -> Script312 fn script(&self) -> Script; 313 /// Get the Script_Extension for a given character script_extension(&self) -> ScriptExtension314 fn script_extension(&self) -> ScriptExtension; 315 } 316 317 impl UnicodeScript for char { script(&self) -> Script318 fn script(&self) -> Script { 319 get_script(*self).unwrap_or(Script::Unknown) 320 } 321 script_extension(&self) -> ScriptExtension322 fn script_extension(&self) -> ScriptExtension { 323 get_script_extension(*self).unwrap_or_else(|| self.script().into()) 324 } 325 } 326 327 /// Iterator over scripts in a [ScriptExtension]. 328 /// 329 /// Can be obtained ia [ScriptExtension::iter()] 330 pub struct ScriptIterator { 331 ext: ScriptExtension, 332 } 333 334 impl Iterator for ScriptIterator { 335 type Item = Script; 336 next(&mut self) -> Option<Script>337 fn next(&mut self) -> Option<Script> { 338 if self.ext.is_common_or_inherited() { 339 let common = self.ext.common; 340 self.ext = ScriptExtension::new_unknown(); 341 if common { 342 Some(Script::Common) 343 } else { 344 Some(Script::Inherited) 345 } 346 // Are there bits left in the first chunk? 347 } else if self.ext.first != 0 { 348 // Find the next bit 349 let bit = self.ext.first.trailing_zeros(); 350 // unset just that bit 351 self.ext.first &= !(1 << bit); 352 Some(Script::for_integer(bit as u8)) 353 // Are there bits left in the second chunk? 354 } else if self.ext.second != 0 { 355 let bit = self.ext.second.trailing_zeros(); 356 self.ext.second &= !(1 << bit); 357 Some(Script::for_integer(64 + bit as u8)) 358 // Are there bits left in the third chunk? 359 } else if self.ext.third != 0 { 360 let bit = self.ext.third.trailing_zeros(); 361 self.ext.third &= !(1 << bit); 362 Some(Script::for_integer(128 + bit as u8)) 363 } else { 364 // Script::Unknown 365 None 366 } 367 } 368 } 369 370 #[cfg(test)] 371 mod tests { 372 use crate::*; 373 use std::collections::HashSet; 374 use std::convert::TryInto; 375 376 #[cfg(feature = "bench")] 377 use test::bench::Bencher; 378 #[cfg(feature = "bench")] 379 extern crate test; 380 381 #[test] test_conversion()382 fn test_conversion() { 383 let mut seen_scripts = HashSet::new(); 384 let mut seen_exts = HashSet::new(); 385 for bit in 0..NEXT_SCRIPT { 386 let script = Script::for_integer(bit); 387 let ext = script.into(); 388 if seen_scripts.contains(&script) { 389 panic!("Found script {:?} twice!", script) 390 } 391 if seen_exts.contains(&ext) { 392 panic!("Found extension {:?} twice!", ext) 393 } 394 seen_scripts.insert(script); 395 seen_exts.insert(ext); 396 assert_eq!(script as u8, bit); 397 assert!(!ScriptExtension::new_common().intersection(ext).is_empty()); 398 assert!(!ScriptExtension::new_inherited() 399 .intersection(ext) 400 .is_empty()); 401 assert!(ScriptExtension::new_unknown().intersection(ext).is_empty()); 402 assert_eq!(ext.iter().collect::<Vec<_>>(), vec![script]); 403 assert_eq!(Ok(script), ext.try_into()); 404 } 405 } 406 407 #[test] test_specific()408 fn test_specific() { 409 let s = "सवव मानवी व्यद्क् जन्मतःच स्वतींत्र आहेत व त्ाींना समान प्रवतष्ठा व समान अविकार आहेत. त्ाींना ववचारशद्क् व सवविे कबुद्द्धलाभलेली आहे. व त्ाींनी एकमेकाींशी बींिुत्वाचाभावनेने आचरण करावे."; 410 let ext = ScriptExtension::for_str(s); 411 assert_eq!(ext, script_extensions::DEVA); 412 println!( 413 "{:?}", 414 script_extensions::DEVA_DOGR_GUJR_GURU_KHOJ_KTHI_MAHJ_MODI_SIND_TAKR_TIRH 415 ); 416 println!( 417 "{:?}", 418 ext.intersection( 419 script_extensions::DEVA_DOGR_GUJR_GURU_KHOJ_KTHI_MAHJ_MODI_SIND_TAKR_TIRH 420 ) 421 ); 422 assert!(!ext 423 .intersection(script_extensions::DEVA_DOGR_GUJR_GURU_KHOJ_KTHI_MAHJ_MODI_SIND_TAKR_TIRH) 424 .is_empty()); 425 426 let u = ext.union(Script::Dogra.into()); 427 assert_eq!( 428 u.intersection( 429 script_extensions::DEVA_DOGR_GUJR_GURU_KHOJ_KTHI_MAHJ_MODI_SIND_TAKR_TIRH 430 ), 431 u 432 ); 433 } 434 435 #[test] test_specific_ext()436 fn test_specific_ext() { 437 let ext = script_extensions::DEVA_DOGR_GUJR_GURU_KHOJ_KTHI_MAHJ_MODI_SIND_TAKR_TIRH; 438 439 let all: HashSet<_> = ext.iter().collect(); 440 441 for bit in 0..NEXT_SCRIPT { 442 let script = Script::for_integer(bit); 443 444 if all.contains(&script) { 445 assert!(ext.contains_script(script)) 446 } else { 447 assert!(!ext.contains_script(script)) 448 } 449 } 450 451 assert!(ext.contains_script(Script::Devanagari)); 452 assert!(ext.contains_script(Script::Dogra)); 453 assert!(ext.contains_script(Script::Gujarati)); 454 assert!(ext.contains_script(Script::Gurmukhi)); 455 assert!(ext.contains_script(Script::Khojki)); 456 assert!(ext.contains_script(Script::Kaithi)); 457 assert!(ext.contains_script(Script::Mahajani)); 458 assert!(ext.contains_script(Script::Modi)); 459 assert!(ext.contains_script(Script::Khudawadi)); 460 assert!(ext.contains_script(Script::Takri)); 461 assert!(ext.contains_script(Script::Tirhuta)); 462 463 let scr: Result<Script, _> = ext.try_into(); 464 assert!(scr.is_err()); 465 } 466 467 #[cfg(feature = "bench")] 468 #[bench] bench_script_intersection(b: &mut Bencher)469 fn bench_script_intersection(b: &mut Bencher) { 470 b.iter(|| { 471 let script = test::black_box(Script::Devanagari); 472 let ext = test::black_box(script_extensions::BENG_DEVA_DOGR_GONG_GONM_GRAN_GUJR_GURU_KNDA_MAHJ_MLYM_NAND_ORYA_SIND_SINH_SYLO_TAKR_TAML_TELU_TIRH); 473 test::black_box(ext.intersection(script.into())); 474 }) 475 } 476 477 #[cfg(feature = "bench")] 478 #[bench] bench_ext_to_script(b: &mut Bencher)479 fn bench_ext_to_script(b: &mut Bencher) { 480 let ext: ScriptExtension = Script::Devanagari.into(); 481 b.iter(|| { 482 let ext = test::black_box(ext); 483 let script: Result<Script, _> = ext.try_into(); 484 let _ = test::black_box(script); 485 }) 486 } 487 488 #[cfg(feature = "bench")] 489 #[bench] bench_script_to_ext(b: &mut Bencher)490 fn bench_script_to_ext(b: &mut Bencher) { 491 b.iter(|| { 492 let script = test::black_box(Script::Devanagari); 493 let ext: ScriptExtension = script.into(); 494 test::black_box(ext); 495 }) 496 } 497 498 #[cfg(feature = "bench")] 499 #[bench] bench_ext_intersection(b: &mut Bencher)500 fn bench_ext_intersection(b: &mut Bencher) { 501 b.iter(|| { 502 let e1 = test::black_box(script_extensions::ARAB_ROHG_SYRC_THAA); 503 let e2 = test::black_box(script_extensions::BENG_DEVA_DOGR_GONG_GONM_GRAN_GUJR_GURU_KNDA_MAHJ_MLYM_NAND_ORYA_SIND_SINH_SYLO_TAKR_TAML_TELU_TIRH); 504 test::black_box(e2.intersection(e1)); 505 }) 506 } 507 508 #[cfg(feature = "bench")] 509 #[bench] bench_to_vec(b: &mut Bencher)510 fn bench_to_vec(b: &mut Bencher) { 511 b.iter(|| { 512 let ext = test::black_box(script_extensions::BENG_DEVA_DOGR_GONG_GONM_GRAN_GUJR_GURU_KNDA_MAHJ_MLYM_NAND_ORYA_SIND_SINH_SYLO_TAKR_TAML_TELU_TIRH); 513 test::black_box(ext.iter().collect::<Vec<_>>()); 514 }) 515 } 516 517 #[cfg(feature = "bench")] 518 #[bench] bench_string_ext(b: &mut Bencher)519 fn bench_string_ext(b: &mut Bencher) { 520 b.iter(|| { 521 let s = test::black_box("सवव मानवी व्यद्क् जन्मतःच स्वतींत्र आहेत व त्ाींना समान प्रवतष्ठा व समान अविकार आहेत. त्ाींना ववचारशद्क् व सवविे कबुद्द्धलाभलेली आहे. व त्ाींनी एकमेकाींशी बींिुत्वाचाभावनेने आचरण करावे."); 522 test::black_box(ScriptExtension::for_str(s)); 523 }) 524 } 525 } 526