1 //! This crate exposes the Unicode `Script` and `Script_Extension`
2 //! properties from [UAX #24](http://www.unicode.org/reports/tr24/)
3 
4 #![cfg_attr(not(test), no_std)]
5 #![cfg_attr(feature = "bench", feature(test))]
6 
7 #[rustfmt::skip]
8 mod tables;
9 
10 use core::convert::TryFrom;
11 use core::fmt;
12 use core::u64;
13 pub use tables::script_extensions;
14 use tables::{get_script, get_script_extension, NEXT_SCRIPT};
15 pub use tables::{Script, UNICODE_VERSION};
16 
17 impl Script {
18     /// Get the full name of a script
full_name(self) -> &'static str19     pub fn full_name(self) -> &'static str {
20         self.inner_full_name()
21     }
22 
23     /// Get the four-character short name of a script
short_name(self) -> &'static str24     pub fn short_name(self) -> &'static str {
25         self.inner_short_name()
26     }
27 
28     /// Is this script "Recommended" according to
29     /// [UAX #31](www.unicode.org/reports/tr31/#Table_Recommended_Scripts)?
is_recommended(self) -> bool30     pub fn is_recommended(self) -> bool {
31         use Script::*;
32         match self {
33             Common | Inherited | Arabic | Armenian | Bengali | Bopomofo | Cyrillic | Devanagari
34             | Ethiopic | Georgian | Greek | Gujarati | Gurmukhi | Han | Hangul | Hebrew
35             | Hiragana | Kannada | Katakana | Khmer | Lao | Latin | Malayalam | Myanmar | Oriya
36             | Sinhala | Tamil | Telugu | Thaana | Thai | Tibetan => true,
37             _ => false,
38         }
39     }
40 }
41 
42 impl From<Script> for ScriptExtension {
from(script: Script) -> Self43     fn from(script: Script) -> Self {
44         if script == Script::Common {
45             ScriptExtension::new_common()
46         } else if script == Script::Inherited {
47             ScriptExtension::new_inherited()
48         } else if script == Script::Unknown {
49             ScriptExtension::new_unknown()
50         } else {
51             let mut first = 0;
52             let mut second = 0;
53             let mut third = 0;
54             let bit = script as u8;
55             // Find out which field it's in, and set the appropriate bit there
56             if bit < 64 {
57                 first = 1 << bit as u64;
58             } else if bit < 128 {
59                 // offset by 64 since `bit` is an absolute number,
60                 // not relative to the chunk
61                 second = 1 << (bit - 64) as u64;
62             } else {
63                 third = 1 << (bit - 128) as u32;
64             }
65             ScriptExtension::new(first, second, third)
66         }
67     }
68 }
69 
70 impl TryFrom<ScriptExtension> for Script {
71     type Error = ();
try_from(ext: ScriptExtension) -> Result<Self, ()>72     fn try_from(ext: ScriptExtension) -> Result<Self, ()> {
73         if ext.is_common_or_inherited() {
74             if ext.common {
75                 Ok(Script::Common)
76             } else {
77                 Ok(Script::Inherited)
78             }
79         } else if ext.is_empty() {
80             Ok(Script::Unknown)
81         } else {
82             // filled elements will have set ones
83             let fo = ext.first.count_ones();
84             let so = ext.second.count_ones();
85             let to = ext.third.count_ones();
86             // only one bit set, in the first chunk
87             if fo == 1 && so == 0 && to == 0 {
88                 // use trailing_zeroes() to figure out which bit it is
89                 Ok(Script::for_integer(ext.first.trailing_zeros() as u8))
90             // only one bit set, in the second chunk
91             } else if fo == 0 && so == 1 && to == 0 {
92                 Ok(Script::for_integer(64 + ext.second.trailing_zeros() as u8))
93             // only one bit set, in the third chunk
94             } else if fo == 0 && so == 0 && to == 1 {
95                 Ok(Script::for_integer(128 + ext.third.trailing_zeros() as u8))
96             } else {
97                 Err(())
98             }
99         }
100     }
101 }
102 
103 impl Default for Script {
default() -> Self104     fn default() -> Self {
105         Script::Common
106     }
107 }
108 
109 impl From<char> for Script {
from(o: char) -> Self110     fn from(o: char) -> Self {
111         o.script()
112     }
113 }
114 
#[derive(Clone, Copy, PartialEq, Eq, Hash)]
#[non_exhaustive]
/// A value for the `Script_Extension` property
///
/// [`ScriptExtension`] is one or more [`Script`]
///
/// This is essentially an optimized version of `Vec<Script>` that uses bitfields
///
/// Each script is identified by the bit at the position of its integer value,
/// spread across three chunks. A value with no bits set stands for `Unknown`;
/// a value with every used bit set stands for `Common` or `Inherited`.
pub struct ScriptExtension {
    // A bitset for the scripts with index 0..=63 (bit `n` is script `n`)
    first: u64,
    // A bitset for the scripts with index 64..=127 (bit `n` is script `64 + n`)
    second: u64,
    // A bitset for the scripts with index 128 and above (bit `n` is script `128 + n`)
    third: u32,
    // Both Common and Inherited are represented by all used bits being set,
    // this flag lets us distinguish the two.
    common: bool,
}
133 
134 impl ScriptExtension {
135     // We don't use the complete u32 of `third`, so the "all" value is not just u32::MAX
136     // Instead, we take the number of the next (unused) script bit, subtract 128 to bring
137     // it in the range of `third`, create a u32 with just that bit set, and subtract 1
138     // to create one with all the lower bits set.
139     const THIRD_MAX: u32 = ((1 << (NEXT_SCRIPT - 128)) - 1);
140 
new(first: u64, second: u64, third: u32) -> Self141     pub(crate) const fn new(first: u64, second: u64, third: u32) -> Self {
142         ScriptExtension {
143             first,
144             second,
145             third,
146             common: false,
147         }
148     }
149 
new_common() -> Self150     pub(crate) const fn new_common() -> Self {
151         ScriptExtension {
152             first: u64::MAX,
153             second: u64::MAX,
154             third: Self::THIRD_MAX,
155             common: true,
156         }
157     }
158 
new_inherited() -> Self159     pub(crate) const fn new_inherited() -> Self {
160         ScriptExtension {
161             first: u64::MAX,
162             second: u64::MAX,
163             third: Self::THIRD_MAX,
164             common: false,
165         }
166     }
167 
new_unknown() -> Self168     pub(crate) const fn new_unknown() -> Self {
169         ScriptExtension {
170             first: 0,
171             second: 0,
172             third: 0,
173             common: false,
174         }
175     }
176 
is_common_or_inherited(self) -> bool177     const fn is_common_or_inherited(self) -> bool {
178         (self.first == u64::MAX) & (self.second == u64::MAX) & (self.third == Self::THIRD_MAX)
179     }
180 
181     /// Checks if the script extension is Common
is_common(self) -> bool182     pub const fn is_common(self) -> bool {
183         self.is_common_or_inherited() & self.common
184     }
185 
186     /// Checks if the script extension is Inherited
is_inherited(self) -> bool187     pub const fn is_inherited(self) -> bool {
188         self.is_common_or_inherited() & !self.common
189     }
190 
191     /// Checks if the script extension is empty (unknown)
is_empty(self) -> bool192     pub const fn is_empty(self) -> bool {
193         (self.first == 0) & (self.second == 0) & (self.third == 0)
194     }
195 
196     /// Intersect this `ScriptExtension` with another `ScriptExtension`. Produces `Unknown` if things
197     /// do not intersect. This is equivalent to [`ScriptExtension::intersection`] but it stores the result
198     /// in `self`
199     ///
200     /// "Common" (`Zyyy`) and "Inherited" (`Zinh`) are considered as intersecting
201     /// everything, the intersection of `Common` and `Inherited` is `Inherited`
intersect_with(&mut self, other: Self)202     pub fn intersect_with(&mut self, other: Self) {
203         *self = self.intersection(other)
204     }
205 
206     /// Find the intersection between two ScriptExtensions. Returns Unknown if things
207     /// do not intersect.
208     ///
209     /// "Common" (`Zyyy`) and "Inherited" (`Zinh`) are considered as intersecting
210     /// everything, the intersection of `Common` and `Inherited` is `Inherited`
intersection(self, other: Self) -> Self211     pub const fn intersection(self, other: Self) -> Self {
212         let first = self.first & other.first;
213         let second = self.second & other.second;
214         let third = self.third & other.third;
215         let common = self.common & other.common;
216         ScriptExtension {
217             first,
218             second,
219             third,
220             common,
221         }
222     }
223 
224     /// Find the union between two ScriptExtensions.
225     ///
226     /// "Common" (`Zyyy`) and "Inherited" (`Zinh`) are considered as intersecting
227     /// everything, the union of `Common` and `Inherited` is `Common`
union(self, other: Self) -> Self228     pub const fn union(self, other: Self) -> Self {
229         let first = self.first | other.first;
230         let second = self.second | other.second;
231         let third = self.third | other.third;
232         let common = self.common | other.common;
233         ScriptExtension {
234             first,
235             second,
236             third,
237             common,
238         }
239     }
240 
241     /// Check if this ScriptExtension contains the given script
242     ///
243     /// Should be used with specific scripts only, this will
244     /// return `true` if `self` is not `Unknown` and `script` is
245     /// `Common` or `Inherited`
contains_script(self, script: Script) -> bool246     pub fn contains_script(self, script: Script) -> bool {
247         !self.intersection(script.into()).is_empty()
248     }
249 
250     /// Get the intersection of script extensions of all characters
251     /// in a string.
for_str(x: &str) -> Self252     pub fn for_str(x: &str) -> Self {
253         let mut ext = ScriptExtension::default();
254         for ch in x.chars() {
255             ext.intersect_with(ch.into());
256         }
257         ext
258     }
259 
260     /// Iterate over the scripts in this string
261     ///
262     /// Will never yeild Script::Unknown
iter(self) -> ScriptIterator263     pub fn iter(self) -> ScriptIterator {
264         ScriptIterator { ext: self }
265     }
266 }
267 
268 impl Default for ScriptExtension {
default() -> Self269     fn default() -> Self {
270         ScriptExtension::new_common()
271     }
272 }
273 
274 impl From<char> for ScriptExtension {
from(o: char) -> Self275     fn from(o: char) -> Self {
276         o.script_extension()
277     }
278 }
279 
280 impl From<&'_ str> for ScriptExtension {
from(o: &'_ str) -> Self281     fn from(o: &'_ str) -> Self {
282         Self::for_str(o)
283     }
284 }
285 
286 impl fmt::Debug for ScriptExtension {
fmt(&self, f: &mut fmt::Formatter) -> fmt::Result287     fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
288         write!(f, "ScriptExtension(")?;
289         if self.is_common() {
290             write!(f, "Common")?;
291         } else if self.is_inherited() {
292             write!(f, "Inherited")?;
293         } else if self.is_empty() {
294             write!(f, "Unknown")?;
295         } else {
296             let mut first = true;
297             for script in self.iter() {
298                 if !first {
299                     write!(f, " + ")?;
300                     first = false;
301                 }
302                 script.full_name().fmt(f)?;
303             }
304         }
305         write!(f, ")")
306     }
307 }
308 
/// Extension trait on `char` for calculating script properties
///
/// This file implements it for `char`; the trait must be in scope for the
/// methods to be callable with method syntax.
pub trait UnicodeScript {
    /// Get the script for a given character
    ///
    /// The `char` implementation returns [`Script::Unknown`] when the
    /// table lookup yields no script for the character.
    fn script(&self) -> Script;
    /// Get the Script_Extension for a given character
    ///
    /// The `char` implementation falls back to the character's
    /// [`script`](UnicodeScript::script), converted into a
    /// [`ScriptExtension`], when the table lookup yields no explicit
    /// extension entry.
    fn script_extension(&self) -> ScriptExtension;
}
316 
317 impl UnicodeScript for char {
script(&self) -> Script318     fn script(&self) -> Script {
319         get_script(*self).unwrap_or(Script::Unknown)
320     }
321 
script_extension(&self) -> ScriptExtension322     fn script_extension(&self) -> ScriptExtension {
323         get_script_extension(*self).unwrap_or_else(|| self.script().into())
324     }
325 }
326 
/// Iterator over scripts in a [ScriptExtension].
///
/// Can be obtained via [ScriptExtension::iter()]
pub struct ScriptIterator {
    // The scripts that have not been yielded yet; bits are cleared (or the
    // whole value is reset to the empty set) as the iterator advances.
    ext: ScriptExtension,
}
333 
334 impl Iterator for ScriptIterator {
335     type Item = Script;
336 
next(&mut self) -> Option<Script>337     fn next(&mut self) -> Option<Script> {
338         if self.ext.is_common_or_inherited() {
339             let common = self.ext.common;
340             self.ext = ScriptExtension::new_unknown();
341             if common {
342                 Some(Script::Common)
343             } else {
344                 Some(Script::Inherited)
345             }
346         // Are there bits left in the first chunk?
347         } else if self.ext.first != 0 {
348             // Find the next bit
349             let bit = self.ext.first.trailing_zeros();
350             // unset just that bit
351             self.ext.first &= !(1 << bit);
352             Some(Script::for_integer(bit as u8))
353         // Are there bits left in the second chunk?
354         } else if self.ext.second != 0 {
355             let bit = self.ext.second.trailing_zeros();
356             self.ext.second &= !(1 << bit);
357             Some(Script::for_integer(64 + bit as u8))
358         // Are there bits left in the third chunk?
359         } else if self.ext.third != 0 {
360             let bit = self.ext.third.trailing_zeros();
361             self.ext.third &= !(1 << bit);
362             Some(Script::for_integer(128 + bit as u8))
363         } else {
364             // Script::Unknown
365             None
366         }
367     }
368 }
369 
// Unit tests, plus benchmarks that are only compiled when the `bench`
// feature is enabled.
#[cfg(test)]
mod tests {
    use crate::*;
    use std::collections::HashSet;
    use std::convert::TryInto;

    #[cfg(feature = "bench")]
    use test::bench::Bencher;
    #[cfg(feature = "bench")]
    extern crate test;

    // Walks every script index below `NEXT_SCRIPT` and checks that the
    // Script <-> ScriptExtension conversions are unique and round-trip.
    #[test]
    fn test_conversion() {
        let mut seen_scripts = HashSet::new();
        let mut seen_exts = HashSet::new();
        for bit in 0..NEXT_SCRIPT {
            let script = Script::for_integer(bit);
            let ext = script.into();
            // Each index must map to a distinct script and a distinct extension.
            if seen_scripts.contains(&script) {
                panic!("Found script {:?} twice!", script)
            }
            if seen_exts.contains(&ext) {
                panic!("Found extension {:?} twice!", ext)
            }
            seen_scripts.insert(script);
            seen_exts.insert(ext);
            assert_eq!(script as u8, bit);
            // Common and Inherited intersect every script; Unknown intersects none.
            assert!(!ScriptExtension::new_common().intersection(ext).is_empty());
            assert!(!ScriptExtension::new_inherited()
                .intersection(ext)
                .is_empty());
            assert!(ScriptExtension::new_unknown().intersection(ext).is_empty());
            // A single-script extension iterates to, and converts back into, that script.
            assert_eq!(ext.iter().collect::<Vec<_>>(), vec![script]);
            assert_eq!(Ok(script), ext.try_into());
        }
    }

    // Checks `for_str`, `intersection` and `union` against a sample text
    // whose extension is expected to be `script_extensions::DEVA`.
    #[test]
    fn test_specific() {
        let s = "सवव मानवी व्यद्क् जन्मतःच स्वतींत्र आहेत व त्ाींना समान प्रवतष्ठा व समान अविकार आहेत. त्ाींना ववचारशद्क् व सवविे कबुद्द्धलाभलेली आहे. व त्ाींनी एकमेकाींशी बींिुत्वाचाभावनेने आचरण करावे.";
        let ext = ScriptExtension::for_str(s);
        assert_eq!(ext, script_extensions::DEVA);
        println!(
            "{:?}",
            script_extensions::DEVA_DOGR_GUJR_GURU_KHOJ_KTHI_MAHJ_MODI_SIND_TAKR_TIRH
        );
        println!(
            "{:?}",
            ext.intersection(
                script_extensions::DEVA_DOGR_GUJR_GURU_KHOJ_KTHI_MAHJ_MODI_SIND_TAKR_TIRH
            )
        );
        assert!(!ext
            .intersection(script_extensions::DEVA_DOGR_GUJR_GURU_KHOJ_KTHI_MAHJ_MODI_SIND_TAKR_TIRH)
            .is_empty());

        // Adding Dogra must keep the set inside the larger multi-script extension.
        let u = ext.union(Script::Dogra.into());
        assert_eq!(
            u.intersection(
                script_extensions::DEVA_DOGR_GUJR_GURU_KHOJ_KTHI_MAHJ_MODI_SIND_TAKR_TIRH
            ),
            u
        );
    }

    // Checks that `contains_script` agrees with `iter` for a multi-script
    // extension, and that such an extension does not convert to one script.
    #[test]
    fn test_specific_ext() {
        let ext = script_extensions::DEVA_DOGR_GUJR_GURU_KHOJ_KTHI_MAHJ_MODI_SIND_TAKR_TIRH;

        let all: HashSet<_> = ext.iter().collect();

        for bit in 0..NEXT_SCRIPT {
            let script = Script::for_integer(bit);

            if all.contains(&script) {
                assert!(ext.contains_script(script))
            } else {
                assert!(!ext.contains_script(script))
            }
        }

        assert!(ext.contains_script(Script::Devanagari));
        assert!(ext.contains_script(Script::Dogra));
        assert!(ext.contains_script(Script::Gujarati));
        assert!(ext.contains_script(Script::Gurmukhi));
        assert!(ext.contains_script(Script::Khojki));
        assert!(ext.contains_script(Script::Kaithi));
        assert!(ext.contains_script(Script::Mahajani));
        assert!(ext.contains_script(Script::Modi));
        assert!(ext.contains_script(Script::Khudawadi));
        assert!(ext.contains_script(Script::Takri));
        assert!(ext.contains_script(Script::Tirhuta));

        // More than one script is present, so the conversion must fail.
        let scr: Result<Script, _> = ext.try_into();
        assert!(scr.is_err());
    }

    // Benchmark: intersect a multi-script extension with a single script.
    #[cfg(feature = "bench")]
    #[bench]
    fn bench_script_intersection(b: &mut Bencher) {
        b.iter(|| {
            let script = test::black_box(Script::Devanagari);
            let ext = test::black_box(script_extensions::BENG_DEVA_DOGR_GONG_GONM_GRAN_GUJR_GURU_KNDA_MAHJ_MLYM_NAND_ORYA_SIND_SINH_SYLO_TAKR_TAML_TELU_TIRH);
            test::black_box(ext.intersection(script.into()));
        })
    }

    // Benchmark: convert a single-script extension back into a `Script`.
    #[cfg(feature = "bench")]
    #[bench]
    fn bench_ext_to_script(b: &mut Bencher) {
        let ext: ScriptExtension = Script::Devanagari.into();
        b.iter(|| {
            let ext = test::black_box(ext);
            let script: Result<Script, _> = ext.try_into();
            let _ = test::black_box(script);
        })
    }

    // Benchmark: convert a `Script` into a `ScriptExtension`.
    #[cfg(feature = "bench")]
    #[bench]
    fn bench_script_to_ext(b: &mut Bencher) {
        b.iter(|| {
            let script = test::black_box(Script::Devanagari);
            let ext: ScriptExtension = script.into();
            test::black_box(ext);
        })
    }

    // Benchmark: intersect two multi-script extensions.
    #[cfg(feature = "bench")]
    #[bench]
    fn bench_ext_intersection(b: &mut Bencher) {
        b.iter(|| {
            let e1 = test::black_box(script_extensions::ARAB_ROHG_SYRC_THAA);
            let e2 = test::black_box(script_extensions::BENG_DEVA_DOGR_GONG_GONM_GRAN_GUJR_GURU_KNDA_MAHJ_MLYM_NAND_ORYA_SIND_SINH_SYLO_TAKR_TAML_TELU_TIRH);
            test::black_box(e2.intersection(e1));
        })
    }

    // Benchmark: collect all scripts of a multi-script extension into a `Vec`.
    #[cfg(feature = "bench")]
    #[bench]
    fn bench_to_vec(b: &mut Bencher) {
        b.iter(|| {
            let ext = test::black_box(script_extensions::BENG_DEVA_DOGR_GONG_GONM_GRAN_GUJR_GURU_KNDA_MAHJ_MLYM_NAND_ORYA_SIND_SINH_SYLO_TAKR_TAML_TELU_TIRH);
            test::black_box(ext.iter().collect::<Vec<_>>());
        })
    }

    // Benchmark: compute the extension of a whole string with `for_str`.
    #[cfg(feature = "bench")]
    #[bench]
    fn bench_string_ext(b: &mut Bencher) {
        b.iter(|| {
            let s = test::black_box("सवव मानवी व्यद्क् जन्मतःच स्वतींत्र आहेत व त्ाींना समान प्रवतष्ठा व समान अविकार आहेत. त्ाींना ववचारशद्क् व सवविे कबुद्द्धलाभलेली आहे. व त्ाींनी एकमेकाींशी बींिुत्वाचाभावनेने आचरण करावे.");
            test::black_box(ScriptExtension::for_str(s));
        })
    }
}
526