1 use normalize::{
2     hangul_decomposition_length,
3     is_hangul_syllable,
4 };
5 use lookups::{
6     canonical_combining_class, canonical_fully_decomposed, compatibility_fully_decomposed,
7     stream_safe_trailing_nonstarters,
8 };
9 use tables::stream_safe_leading_nonstarters;
10 
/// Largest running non-starter count `StreamSafe` tolerates: when adding the
/// next character's leading non-starters would push the count past this
/// value, a joiner is emitted first.
pub(crate) const MAX_NONSTARTERS: usize = 30;
/// U+034F, the character `StreamSafe` emits to break up an over-long run of
/// non-starters.
const COMBINING_GRAPHEME_JOINER: char = '\u{034F}';
13 
/// UAX15-D4: This iterator keeps track of how many non-starters there have been
/// since the last starter in *NFKD* and will emit a Combining Grapheme Joiner
/// (U+034F) if the count exceeds 30.
pub struct StreamSafe<I> {
    /// The wrapped source of characters.
    iter: I,
    /// Running count of non-starters seen since the last starter, measured on
    /// each character's decomposition as reported by `classify_nonstarters`.
    nonstarter_count: usize,
    /// A character pulled from `iter` but held back because a Combining
    /// Grapheme Joiner had to be emitted in its place; it is yielded by a
    /// later call to `next`.
    buffer: Option<char>,
}
22 
23 impl<I> StreamSafe<I> {
new(iter: I) -> Self24     pub(crate) fn new(iter: I) -> Self {
25         Self { iter, nonstarter_count: 0, buffer: None }
26     }
27 }
28 
29 impl<I: Iterator<Item=char>> Iterator for StreamSafe<I> {
30     type Item = char;
31 
32     #[inline]
next(&mut self) -> Option<char>33     fn next(&mut self) -> Option<char> {
34         if let Some(ch) = self.buffer.take() {
35             return Some(ch);
36         }
37         let next_ch = match self.iter.next() {
38             None => return None,
39             Some(c) => c,
40         };
41         let d = classify_nonstarters(next_ch);
42         if self.nonstarter_count + d.leading_nonstarters > MAX_NONSTARTERS {
43             self.buffer = Some(next_ch);
44             self.nonstarter_count = 0;
45             return Some(COMBINING_GRAPHEME_JOINER);
46         }
47 
48         // No starters in the decomposition, so keep accumulating
49         if d.leading_nonstarters == d.decomposition_len {
50             self.nonstarter_count += d.decomposition_len;
51         }
52         // Otherwise, restart the nonstarter counter.
53         else {
54             self.nonstarter_count = d.trailing_nonstarters;
55         }
56         Some(next_ch)
57     }
58 }
59 
/// Summary of one character's decomposition, as produced by
/// `classify_nonstarters`.
#[derive(Debug)]
pub(crate) struct Decomposition {
    /// Number of non-starters (non-zero canonical combining class) at the
    /// front of the decomposition.
    pub(crate) leading_nonstarters: usize,
    /// Number of non-starters at the end of the decomposition.
    pub(crate) trailing_nonstarters: usize,
    /// Total number of characters in the decomposition (1 when the character
    /// has no decomposition entry).
    pub(crate) decomposition_len: usize,
}
66 
67 #[inline]
classify_nonstarters(c: char) -> Decomposition68 pub(crate) fn classify_nonstarters(c: char) -> Decomposition {
69     // As usual, fast path for ASCII (which is always a starter)
70     if c <= '\x7f' {
71         return Decomposition {
72             leading_nonstarters: 0,
73             trailing_nonstarters: 0,
74             decomposition_len: 1,
75         }
76     }
77     // Next, special case Hangul, since it's not handled by our tables.
78     if is_hangul_syllable(c) {
79         return Decomposition {
80             leading_nonstarters: 0,
81             trailing_nonstarters: 0,
82             decomposition_len: hangul_decomposition_length(c),
83         };
84     }
85     let decomp = compatibility_fully_decomposed(c)
86         .or_else(|| canonical_fully_decomposed(c));
87     match decomp {
88         Some(decomp) => {
89             Decomposition {
90                 leading_nonstarters: stream_safe_leading_nonstarters(c),
91                 trailing_nonstarters: stream_safe_trailing_nonstarters(c),
92                 decomposition_len: decomp.len(),
93             }
94         },
95         None => {
96             let is_nonstarter = canonical_combining_class(c) != 0;
97             let nonstarter = if is_nonstarter { 1 } else { 0 };
98             Decomposition {
99                 leading_nonstarters: nonstarter,
100                 trailing_nonstarters: nonstarter,
101                 decomposition_len: 1,
102             }
103         }
104     }
105 }
106 
#[cfg(test)]
mod tests {
    use super::{
        StreamSafe,
        classify_nonstarters,
    };
    use std::char;
    use normalize::decompose_compatible;
    use lookups::canonical_combining_class;

    /// Runs `s` through `StreamSafe` and collects the result into a `String`.
    fn stream_safe(s: &str) -> String {
        StreamSafe::new(s.chars()).collect::<String>()
    }

    /// True when `c` has a non-zero canonical combining class.
    fn is_nonstarter(c: char) -> bool {
        canonical_combining_class(c) != 0
    }

    #[test]
    fn test_simple() {
        // 30 combining marks after the `a`: must pass through untouched.
        let technically_okay = "Da\u{0300}\u{0301}\u{0302}\u{0303}\u{0304}\u{0305}\u{0306}\u{0307}\u{0308}\u{0309}\u{030a}\u{030b}\u{030c}\u{030d}\u{030e}\u{030f}\u{0310}\u{0311}\u{0312}\u{0313}\u{0314}\u{0315}\u{0316}\u{0317}\u{0318}\u{0319}\u{031a}\u{031b}\u{031c}\u{031d}ngerzone";
        // 31 combining marks after the `a`: the output must differ.
        let too_much = "Da\u{0300}\u{0301}\u{0302}\u{0303}\u{0304}\u{0305}\u{0306}\u{0307}\u{0308}\u{0309}\u{030a}\u{030b}\u{030c}\u{030d}\u{030e}\u{030f}\u{0310}\u{0311}\u{0312}\u{0313}\u{0314}\u{0315}\u{0316}\u{0317}\u{0318}\u{0319}\u{031a}\u{031b}\u{031c}\u{031d}\u{032e}ngerzone";

        assert_eq!(stream_safe(technically_okay), technically_okay);
        assert_ne!(stream_safe(too_much), too_much);
    }

    #[test]
    fn test_classify_nonstarters() {
        // Highest character in the `compat_fully_decomp` table is 2FA1D.
        // Values that are not valid `char`s are skipped.
        for ch in (0u32..0x2FA1E).filter_map(char::from_u32) {
            let classified = classify_nonstarters(ch);

            // Reference decomposition to compare the classification against.
            let mut expanded = Vec::new();
            decompose_compatible(ch, |piece| expanded.push(piece));

            assert_eq!(expanded.len(), classified.decomposition_len);

            let num_leading = expanded
                .iter()
                .take_while(|piece| is_nonstarter(**piece))
                .count();
            let num_trailing = expanded
                .iter()
                .rev()
                .take_while(|piece| is_nonstarter(**piece))
                .count();

            assert_eq!(num_leading, classified.leading_nonstarters);
            assert_eq!(num_trailing, classified.trailing_nonstarters);
        }
    }
}
159