1 // Copyright 2012-2014 The Rust Project Developers. See the COPYRIGHT
2 // file at the top-level directory of this distribution and at
3 // http://rust-lang.org/COPYRIGHT.
4 //
5 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
6 // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
7 // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
8 // option. This file may not be copied, modified, or distributed
9 // except according to those terms.
10 
11 use core::cmp;
12 use core::iter::Filter;
13 
14 // All of the logic for forward iteration over sentences
15 mod fwd {
16     use tables::sentence::SentenceCat;
17     use core::cmp;
18 
19     // Describe a parsed part of source string as described in this table:
20     // https://unicode.org/reports/tr29/#Default_Sentence_Boundaries
21     #[derive(Clone, Copy, PartialEq, Eq)]
22     enum StatePart {
23         Sot,
24         Eot,
25         Other,
26         CR,
27         LF,
28         Sep,
29         ATerm,
30         UpperLower,
31         ClosePlus,
32         SpPlus,
33         STerm
34     }
35 
36     #[derive(Clone, PartialEq, Eq)]
37     struct SentenceBreaksState(pub [StatePart; 4]);
38 
39     const INITIAL_STATE: SentenceBreaksState = SentenceBreaksState([
40         StatePart::Sot,
41         StatePart::Sot,
42         StatePart::Sot,
43         StatePart::Sot
44     ]);
45 
46     #[derive(Clone)]
47     pub struct SentenceBreaks<'a> {
48         pub string: &'a str,
49         pos: usize,
50         state: SentenceBreaksState
51     }
52 
53     impl SentenceBreaksState {
54         // Attempt to advance the internal state by one part
55         // Whitespace and some punctutation will be collapsed
next(&self, cat: SentenceCat) -> SentenceBreaksState56         fn next(&self, cat: SentenceCat) -> SentenceBreaksState {
57             let &SentenceBreaksState(parts) = self;
58             let parts = match (parts[3], cat) {
59                 (StatePart::ClosePlus, SentenceCat::SC_Close) => parts,
60                 (StatePart::SpPlus, SentenceCat::SC_Sp) => parts,
61                 _ => [
62                     parts[1],
63                     parts[2],
64                     parts[3],
65                     match cat {
66                         SentenceCat::SC_CR => StatePart::CR,
67                         SentenceCat::SC_LF => StatePart::LF,
68                         SentenceCat::SC_Sep => StatePart::Sep,
69                         SentenceCat::SC_ATerm => StatePart::ATerm,
70                         SentenceCat::SC_Upper |
71                         SentenceCat::SC_Lower => StatePart::UpperLower,
72                         SentenceCat::SC_Close => StatePart::ClosePlus,
73                         SentenceCat::SC_Sp => StatePart::SpPlus,
74                         SentenceCat::SC_STerm => StatePart::STerm,
75                         _ => StatePart::Other
76                     }
77                 ]
78             };
79             SentenceBreaksState(parts)
80         }
81 
end(&self) -> SentenceBreaksState82         fn end(&self) -> SentenceBreaksState {
83             let &SentenceBreaksState(parts) = self;
84             SentenceBreaksState([
85                 parts[1],
86                 parts[2],
87                 parts[3],
88                 StatePart::Eot
89             ])
90         }
91 
92         // Helper function to check if state head matches a single `StatePart`
match1(&self, part: StatePart) -> bool93         fn match1(&self, part: StatePart) -> bool {
94             let &SentenceBreaksState(parts) = self;
95             part == parts[3]
96         }
97 
98         // Helper function to check if first two `StateParts` in state match
99         // the given two
match2(&self, part1: StatePart, part2: StatePart) -> bool100         fn match2(&self, part1: StatePart, part2: StatePart) -> bool {
101             let &SentenceBreaksState(parts) = self;
102             part1 == parts[2] && part2 == parts[3]
103         }
104     }
105 
106     // https://unicode.org/reports/tr29/#SB8
107     // TODO cache this, it is currently quadratic
match_sb8(state: &SentenceBreaksState, ahead: &str) -> bool108     fn match_sb8(state: &SentenceBreaksState, ahead: &str) -> bool {
109         let &SentenceBreaksState(parts) = state;
110         let mut idx = if parts[3] == StatePart::SpPlus { 2 } else { 3 };
111         if parts[idx] == StatePart::ClosePlus { idx -= 1 }
112 
113         if parts[idx] == StatePart::ATerm {
114             use tables::sentence as se;
115 
116             for next_char in ahead.chars() {
117                 //( ¬(OLetter | Upper | Lower | ParaSep | SATerm) )* Lower
118                 match se::sentence_category(next_char) {
119                     se::SC_Lower => return true,
120                     se::SC_OLetter |
121                     se::SC_Upper |
122                     se::SC_Sep | se::SC_CR | se::SC_LF |
123                     se::SC_STerm | se::SC_ATerm => return false,
124                     _ => continue
125                 }
126             }
127         }
128 
129         false
130     }
131 
132     // https://unicode.org/reports/tr29/#SB8a
match_sb8a(state: &SentenceBreaksState) -> bool133     fn match_sb8a(state: &SentenceBreaksState) -> bool {
134         // SATerm Close* Sp*
135         let &SentenceBreaksState(parts) = state;
136         let mut idx = if parts[3] == StatePart::SpPlus { 2 } else { 3 };
137         if parts[idx] == StatePart::ClosePlus { idx -= 1 }
138         parts[idx] == StatePart::STerm || parts[idx] == StatePart::ATerm
139     }
140 
141     // https://unicode.org/reports/tr29/#SB9
match_sb9(state: &SentenceBreaksState) -> bool142     fn match_sb9(state: &SentenceBreaksState) -> bool {
143         // SATerm Close*
144         let &SentenceBreaksState(parts) = state;
145         let idx = if parts[3] == StatePart::ClosePlus { 2 } else { 3 };
146         parts[idx] == StatePart::STerm || parts[idx] == StatePart::ATerm
147     }
148 
149     // https://unicode.org/reports/tr29/#SB11
match_sb11(state: &SentenceBreaksState) -> bool150     fn match_sb11(state: &SentenceBreaksState) -> bool {
151         // SATerm Close* Sp* ParaSep?
152         let &SentenceBreaksState(parts) = state;
153         let mut idx = match parts[3] {
154             StatePart::Sep |
155             StatePart::CR |
156             StatePart::LF => 2,
157             _ => 3
158         };
159 
160         if parts[idx] == StatePart::SpPlus { idx -= 1 }
161         if parts[idx] == StatePart::ClosePlus { idx -= 1}
162 
163         parts[idx] == StatePart::STerm || parts[idx] == StatePart::ATerm
164     }
165 
166     impl<'a> Iterator for SentenceBreaks<'a> {
167         // Returns the index of the character which follows a break
168         type Item = usize;
169 
170         #[inline]
size_hint(&self) -> (usize, Option<usize>)171         fn size_hint(&self) -> (usize, Option<usize>) {
172             let slen = self.string.len();
173             // A sentence could be one character
174             (cmp::min(slen, 2), Some(slen + 1))
175         }
176 
177         #[inline]
next(&mut self) -> Option<usize>178         fn next(&mut self) -> Option<usize> {
179             use tables::sentence as se;
180 
181             for next_char in self.string[self.pos..].chars() {
182                 let position_before = self.pos;
183                 let state_before = self.state.clone();
184 
185                 let next_cat = se::sentence_category(next_char);
186 
187                 self.pos += next_char.len_utf8();
188                 self.state = self.state.next(next_cat);
189 
190                 match next_cat {
191                     // SB1 https://unicode.org/reports/tr29/#SB1
192                     _ if state_before.match1(StatePart::Sot) =>
193                         return Some(position_before),
194 
195                     // SB2 is handled when inner iterator (chars) is finished
196 
197                     // SB3 https://unicode.org/reports/tr29/#SB3
198                     SentenceCat::SC_LF if state_before.match1(StatePart::CR) =>
199                         continue,
200 
201                     // SB4 https://unicode.org/reports/tr29/#SB4
202                     _ if state_before.match1(StatePart::Sep)
203                         || state_before.match1(StatePart::CR)
204                         || state_before.match1(StatePart::LF)
205                     => return Some(position_before),
206 
207                     // SB5 https://unicode.org/reports/tr29/#SB5
208                     SentenceCat::SC_Extend |
209                     SentenceCat::SC_Format => self.state = state_before,
210 
211                     // SB6 https://unicode.org/reports/tr29/#SB6
212                     SentenceCat::SC_Numeric if state_before.match1(StatePart::ATerm) =>
213                         continue,
214 
215                     // SB7 https://unicode.org/reports/tr29/#SB7
216                     SentenceCat::SC_Upper if state_before.match2(StatePart::UpperLower, StatePart::ATerm) =>
217                         continue,
218 
219                     // SB8 https://unicode.org/reports/tr29/#SB8
220                     _ if match_sb8(&state_before, &self.string[position_before..]) =>
221                         continue,
222 
223                     // SB8a https://unicode.org/reports/tr29/#SB8a
224                     SentenceCat::SC_SContinue |
225                     SentenceCat::SC_STerm |
226                     SentenceCat::SC_ATerm if match_sb8a(&state_before) =>
227                         continue,
228 
229                     // SB9 https://unicode.org/reports/tr29/#SB9
230                     SentenceCat::SC_Close |
231                     SentenceCat::SC_Sp |
232                     SentenceCat::SC_Sep |
233                     SentenceCat::SC_CR |
234                     SentenceCat::SC_LF if match_sb9(&state_before) =>
235                         continue,
236 
237                     // SB10 https://unicode.org/reports/tr29/#SB10
238                     SentenceCat::SC_Sp |
239                     SentenceCat::SC_Sep |
240                     SentenceCat::SC_CR |
241                     SentenceCat::SC_LF if match_sb8a(&state_before) =>
242                         continue,
243 
244                     // SB11 https://unicode.org/reports/tr29/#SB11
245                     _ if match_sb11(&state_before) =>
246                         return Some(position_before),
247 
248                     // SB998 https://unicode.org/reports/tr29/#SB998
249                     _ => continue
250                 }
251             }
252 
253             // SB2 https://unicode.org/reports/tr29/#SB2
254             if self.state.match1(StatePart::Sot) {
255                 None
256             } else if self.state.match1(StatePart::Eot) {
257                 None
258             } else {
259                 self.state = self.state.end();
260                 Some(self.pos)
261             }
262         }
263     }
264 
new_sentence_breaks<'a>(source: &'a str) -> SentenceBreaks<'a>265     pub fn new_sentence_breaks<'a>(source: &'a str) -> SentenceBreaks<'a> {
266         SentenceBreaks { string: source, pos: 0, state: INITIAL_STATE }
267     }
268 
269 }
270 
271 /// An iterator over the substrings of a string which, after splitting the string on
272 /// [sentence boundaries](http://www.unicode.org/reports/tr29/#Sentence_Boundaries),
273 /// contain any characters with the
274 /// [Alphabetic](http://unicode.org/reports/tr44/#Alphabetic)
275 /// property, or with
276 /// [General_Category=Number](http://unicode.org/reports/tr44/#General_Category_Values).
277 #[derive(Clone)]
278 pub struct UnicodeSentences<'a> {
279     inner: Filter<USentenceBounds<'a>, fn(&&str) -> bool>,
280 }
281 
282 /// External iterator for a string's
283 /// [sentence boundaries](http://www.unicode.org/reports/tr29/#Sentence_Boundaries).
284 #[derive(Clone)]
285 pub struct USentenceBounds<'a> {
286     iter: fwd::SentenceBreaks<'a>,
287     sentence_start: Option<usize>
288 }
289 
290 /// External iterator for sentence boundaries and byte offsets.
291 #[derive(Clone)]
292 pub struct USentenceBoundIndices<'a> {
293     start_offset: usize,
294     iter: USentenceBounds<'a>,
295 }
296 
297 #[inline]
new_sentence_bounds<'a>(source: &'a str) -> USentenceBounds<'a>298 pub fn new_sentence_bounds<'a>(source: &'a str) -> USentenceBounds<'a> {
299     USentenceBounds {
300         iter: fwd::new_sentence_breaks(source),
301         sentence_start: None
302     }
303 }
304 
305 #[inline]
new_sentence_bound_indices<'a>(source: &'a str) -> USentenceBoundIndices<'a>306 pub fn new_sentence_bound_indices<'a>(source: &'a str) -> USentenceBoundIndices<'a> {
307     USentenceBoundIndices {
308         start_offset: source.as_ptr() as usize,
309         iter: new_sentence_bounds(source)
310     }
311 }
312 
313 #[inline]
new_unicode_sentences<'b>(s: &'b str) -> UnicodeSentences<'b>314 pub fn new_unicode_sentences<'b>(s: &'b str) -> UnicodeSentences<'b> {
315     use super::UnicodeSegmentation;
316     use tables::util::is_alphanumeric;
317 
318     fn has_alphanumeric(s: &&str) -> bool { s.chars().any(|c| is_alphanumeric(c)) }
319     let has_alphanumeric: fn(&&str) -> bool = has_alphanumeric; // coerce to fn pointer
320 
321     UnicodeSentences { inner: s.split_sentence_bounds().filter(has_alphanumeric) }
322 }
323 
324 impl<'a> Iterator for UnicodeSentences<'a> {
325     type Item = &'a str;
326 
327     #[inline]
next(&mut self) -> Option<&'a str>328     fn next(&mut self) -> Option<&'a str> { self.inner.next() }
329 }
330 
331 impl<'a> Iterator for USentenceBounds<'a> {
332     type Item = &'a str;
333 
334     #[inline]
size_hint(&self) -> (usize, Option<usize>)335     fn size_hint(&self) -> (usize, Option<usize>) {
336         let (lower, upper) = self.iter.size_hint();
337         (cmp::max(0, lower - 1), upper.map(|u| cmp::max(0, u - 1)))
338     }
339 
340     #[inline]
next(&mut self) -> Option<&'a str>341     fn next(&mut self) -> Option<&'a str> {
342         if self.sentence_start == None {
343             if let Some(start_pos) = self.iter.next() {
344                 self.sentence_start = Some(start_pos)
345             } else {
346                 return None
347             }
348         }
349 
350         if let Some(break_pos) = self.iter.next() {
351             let start_pos = self.sentence_start.unwrap();
352             let sentence = &self.iter.string[start_pos..break_pos];
353             self.sentence_start = Some(break_pos);
354             Some(sentence)
355         } else {
356             None
357         }
358     }
359 }
360 
361 impl<'a> Iterator for USentenceBoundIndices<'a> {
362     type Item = (usize, &'a str);
363 
364     #[inline]
next(&mut self) -> Option<(usize, &'a str)>365     fn next(&mut self) -> Option<(usize, &'a str)> {
366         self.iter.next().map(|s| (s.as_ptr() as usize - self.start_offset, s))
367     }
368 
369     #[inline]
size_hint(&self) -> (usize, Option<usize>)370     fn size_hint(&self) -> (usize, Option<usize>) {
371         self.iter.size_hint()
372     }
373 }
374