1 // Copyright 2012-2014 The Rust Project Developers. See the COPYRIGHT
2 // file at the top-level directory of this distribution and at
3 // http://rust-lang.org/COPYRIGHT.
4 //
5 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
6 // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
7 // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
8 // option. This file may not be copied, modified, or distributed
9 // except according to those terms.
10
11 use core::cmp;
12 use core::iter::Filter;
13
14 use tables::word::WordCat;
15
16 /// An iterator over the substrings of a string which, after splitting the string on
17 /// [word boundaries](http://www.unicode.org/reports/tr29/#Word_Boundaries),
18 /// contain any characters with the
19 /// [Alphabetic](http://unicode.org/reports/tr44/#Alphabetic)
20 /// property, or with
21 /// [General_Category=Number](http://unicode.org/reports/tr44/#General_Category_Values).
22 pub struct UnicodeWords<'a> {
23 inner: Filter<UWordBounds<'a>, fn(&&str) -> bool>,
24 }
25
26 impl<'a> Iterator for UnicodeWords<'a> {
27 type Item = &'a str;
28
29 #[inline]
next(&mut self) -> Option<&'a str>30 fn next(&mut self) -> Option<&'a str> { self.inner.next() }
31 }
32 impl<'a> DoubleEndedIterator for UnicodeWords<'a> {
33 #[inline]
next_back(&mut self) -> Option<&'a str>34 fn next_back(&mut self) -> Option<&'a str> { self.inner.next_back() }
35 }
36
37 /// External iterator for a string's
38 /// [word boundaries](http://www.unicode.org/reports/tr29/#Word_Boundaries).
39 #[derive(Clone)]
40 pub struct UWordBounds<'a> {
41 string: &'a str,
42 cat: Option<WordCat>,
43 catb: Option<WordCat>,
44 }
45
46 /// External iterator for word boundaries and byte offsets.
47 #[derive(Clone)]
48 pub struct UWordBoundIndices<'a> {
49 start_offset: usize,
50 iter: UWordBounds<'a>,
51 }
52
53 impl<'a> UWordBoundIndices<'a> {
54 #[inline]
55 /// View the underlying data (the part yet to be iterated) as a slice of the original string.
56 ///
57 /// ```rust
58 /// # use unicode_segmentation::UnicodeSegmentation;
59 /// let mut iter = "Hello world".split_word_bound_indices();
60 /// assert_eq!(iter.as_str(), "Hello world");
61 /// iter.next();
62 /// assert_eq!(iter.as_str(), " world");
63 /// iter.next();
64 /// assert_eq!(iter.as_str(), "world");
65 /// ```
as_str(&self) -> &'a str66 pub fn as_str(&self) -> &'a str {
67 self.iter.as_str()
68 }
69 }
70
71 impl<'a> Iterator for UWordBoundIndices<'a> {
72 type Item = (usize, &'a str);
73
74 #[inline]
next(&mut self) -> Option<(usize, &'a str)>75 fn next(&mut self) -> Option<(usize, &'a str)> {
76 self.iter.next().map(|s| (s.as_ptr() as usize - self.start_offset, s))
77 }
78
79 #[inline]
size_hint(&self) -> (usize, Option<usize>)80 fn size_hint(&self) -> (usize, Option<usize>) {
81 self.iter.size_hint()
82 }
83 }
84
85 impl<'a> DoubleEndedIterator for UWordBoundIndices<'a> {
86 #[inline]
next_back(&mut self) -> Option<(usize, &'a str)>87 fn next_back(&mut self) -> Option<(usize, &'a str)> {
88 self.iter.next_back().map(|s| (s.as_ptr() as usize - self.start_offset, s))
89 }
90 }
91
92 // state machine for word boundary rules
93 #[derive(Clone,Copy,PartialEq,Eq,Debug)]
94 enum UWordBoundsState {
95 Start,
96 Letter,
97 HLetter,
98 Numeric,
99 Katakana,
100 ExtendNumLet,
101 Regional(RegionalState),
102 FormatExtend(FormatExtendType),
103 Zwj,
104 Emoji,
105 WSegSpace,
106 }
107
// Payload of `UWordBoundsState::FormatExtend`: what the state machine is
// willing (or required) to see next in order to keep the segment going.
#[derive(Clone, Copy, PartialEq, Eq, Debug)]
enum FormatExtendType {
    AcceptAny,
    AcceptNone,
    RequireLetter,
    RequireHLetter,
    AcceptQLetter,
    RequireNumeric,
}
118
// Payload of `UWordBoundsState::Regional`: pairing status of the regional
// indicators seen so far.
#[derive(Clone, Copy, PartialEq, Eq, Debug)]
enum RegionalState {
    // One indicator of a pair has been consumed.
    Half,
    // A complete pair has been consumed.
    Full,
    // Pairing not determined yet (backward iteration has to count first).
    Unknown,
}
125
is_emoji(ch: char) -> bool126 fn is_emoji(ch: char) -> bool {
127 use tables::emoji;
128 emoji::emoji_category(ch) == emoji::EmojiCat::EC_Extended_Pictographic
129 }
130
131 impl<'a> Iterator for UWordBounds<'a> {
132 type Item = &'a str;
133
134 #[inline]
size_hint(&self) -> (usize, Option<usize>)135 fn size_hint(&self) -> (usize, Option<usize>) {
136 let slen = self.string.len();
137 (cmp::min(slen, 1), Some(slen))
138 }
139
140 #[inline]
next(&mut self) -> Option<&'a str>141 fn next(&mut self) -> Option<&'a str> {
142 use self::UWordBoundsState::*;
143 use self::FormatExtendType::*;
144 use tables::word as wd;
145 if self.string.len() == 0 {
146 return None;
147 }
148
149 let mut take_curr = true;
150 let mut take_cat = true;
151 let mut idx = 0;
152 let mut saveidx = 0;
153 let mut state = Start;
154 let mut cat = wd::WC_Any;
155 let mut savecat = wd::WC_Any;
156
157 // Whether or not the previous category was ZWJ
158 // ZWJs get collapsed, so this handles precedence of WB3c over WB4
159 let mut prev_zwj;
160 // If extend/format/zwj were skipped. Handles precedence of WB3d over WB4
161 let mut skipped_format_extend = false;
162 for (curr, ch) in self.string.char_indices() {
163 idx = curr;
164 prev_zwj = cat == wd::WC_ZWJ;
165 // if there's a category cached, grab it
166 cat = match self.cat {
167 None => wd::word_category(ch),
168 _ => self.cat.take().unwrap()
169 };
170 take_cat = true;
171
172 // handle rule WB4
173 // just skip all format, extend, and zwj chars
174 // note that Start is a special case: if there's a bunch of Format | Extend
175 // characters at the beginning of a block of text, dump them out as one unit.
176 //
177 // (This is not obvious from the wording of UAX#29, but if you look at the
178 // test cases http://www.unicode.org/Public/UNIDATA/auxiliary/WordBreakTest.txt
179 // then the "correct" interpretation of WB4 becomes apparent.)
180 if state != Start {
181 match cat {
182 wd::WC_Extend | wd::WC_Format | wd::WC_ZWJ => {
183 skipped_format_extend = true;
184 continue
185 }
186 _ => {}
187 }
188 }
189
190 // rule WB3c
191 // WB4 makes all ZWJs collapse into the previous state
192 // but you can still be in a Zwj state if you started with Zwj
193 //
194 // This means that an EP + Zwj will collapse into EP, which is wrong,
195 // since EP+EP is not a boundary but EP+ZWJ+EP is
196 //
197 // Thus, we separately keep track of whether or not the last character
198 // was a ZWJ. This is an additional bit of state tracked outside of the
199 // state enum; the state enum represents the last non-zwj state encountered.
200 // When prev_zwj is true, for the purposes of WB3c, we are in the Zwj state,
201 // however we are in the previous state for the purposes of all other rules.
202 if prev_zwj {
203 if is_emoji(ch) {
204 state = Emoji;
205 continue;
206 }
207 }
208 // Don't use `continue` in this match without updating `cat`
209 state = match state {
210 Start if cat == wd::WC_CR => {
211 idx += match self.get_next_cat(idx) {
212 Some(ncat) if ncat == wd::WC_LF => 1, // rule WB3
213 _ => 0
214 };
215 break; // rule WB3a
216 },
217 Start => match cat {
218 wd::WC_ALetter => Letter, // rule WB5, WB6, WB9, WB13a
219 wd::WC_Hebrew_Letter => HLetter, // rule WB5, WB6, WB7a, WB7b, WB9, WB13a
220 wd::WC_Numeric => Numeric, // rule WB8, WB10, WB12, WB13a
221 wd::WC_Katakana => Katakana, // rule WB13, WB13a
222 wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13a, WB13b
223 wd::WC_Regional_Indicator => Regional(RegionalState::Half), // rule WB13c
224 wd::WC_LF | wd::WC_Newline => break, // rule WB3a
225 wd::WC_ZWJ => Zwj, // rule WB3c
226 wd::WC_WSegSpace => WSegSpace, // rule WB3d
227 _ => {
228 if let Some(ncat) = self.get_next_cat(idx) { // rule WB4
229 if ncat == wd::WC_Format || ncat == wd::WC_Extend || ncat == wd::WC_ZWJ {
230 state = FormatExtend(AcceptNone);
231 self.cat = Some(ncat);
232 continue;
233 }
234 }
235 break; // rule WB999
236 }
237 },
238 WSegSpace => match cat {
239 wd::WC_WSegSpace if !skipped_format_extend => WSegSpace,
240 _ => {
241 take_curr = false;
242 break;
243 }
244 },
245 Zwj => {
246 // We already handle WB3c above.
247 take_curr = false;
248 break;
249 }
250 Letter | HLetter => match cat {
251 wd::WC_ALetter => Letter, // rule WB5
252 wd::WC_Hebrew_Letter => HLetter, // rule WB5
253 wd::WC_Numeric => Numeric, // rule WB9
254 wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13a
255 wd::WC_Double_Quote if state == HLetter => {
256 savecat = cat;
257 saveidx = idx;
258 FormatExtend(RequireHLetter) // rule WB7b
259 },
260 wd::WC_Single_Quote if state == HLetter => {
261 FormatExtend(AcceptQLetter) // rule WB7a
262 },
263 wd::WC_MidLetter | wd::WC_MidNumLet | wd::WC_Single_Quote => {
264 savecat = cat;
265 saveidx = idx;
266 FormatExtend(RequireLetter) // rule WB6
267 },
268 _ => {
269 take_curr = false;
270 break;
271 }
272 },
273 Numeric => match cat {
274 wd::WC_Numeric => Numeric, // rule WB8
275 wd::WC_ALetter => Letter, // rule WB10
276 wd::WC_Hebrew_Letter => HLetter, // rule WB10
277 wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13a
278 wd::WC_MidNum | wd::WC_MidNumLet | wd::WC_Single_Quote => {
279 savecat = cat;
280 saveidx = idx;
281 FormatExtend(RequireNumeric) // rule WB12
282 },
283 _ => {
284 take_curr = false;
285 break;
286 }
287 },
288 Katakana => match cat {
289 wd::WC_Katakana => Katakana, // rule WB13
290 wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13a
291 _ => {
292 take_curr = false;
293 break;
294 }
295 },
296 ExtendNumLet => match cat {
297 wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13a
298 wd::WC_ALetter => Letter, // rule WB13b
299 wd::WC_Hebrew_Letter => HLetter, // rule WB13b
300 wd::WC_Numeric => Numeric, // rule WB13b
301 wd::WC_Katakana => Katakana, // rule WB13b
302 _ => {
303 take_curr = false;
304 break;
305 }
306 },
307 Regional(RegionalState::Full) => {
308 // if it reaches here we've gone too far,
309 // a full flag can only compose with ZWJ/Extend/Format
310 // proceeding it.
311 take_curr = false;
312 break;
313 }
314 Regional(RegionalState::Half) => match cat {
315 wd::WC_Regional_Indicator => Regional(RegionalState::Full), // rule WB13c
316 _ => {
317 take_curr = false;
318 break;
319 }
320 },
321 Regional(_) => unreachable!("RegionalState::Unknown should not occur on forward iteration"),
322 Emoji => {
323 // We already handle WB3c above. If you've reached this point, the emoji sequence is over.
324 take_curr = false;
325 break;
326 },
327 FormatExtend(t) => match t { // handle FormatExtends depending on what type
328 RequireNumeric if cat == wd::WC_Numeric => Numeric, // rule WB11
329 RequireLetter | AcceptQLetter if cat == wd::WC_ALetter => Letter, // rule WB7
330 RequireLetter | AcceptQLetter if cat == wd::WC_Hebrew_Letter => HLetter, // WB7a
331 RequireHLetter if cat == wd::WC_Hebrew_Letter => HLetter, // rule WB7b
332 AcceptNone | AcceptQLetter => {
333 take_curr = false; // emit all the Format|Extend characters
334 take_cat = false;
335 break;
336 },
337 _ => break // rewind (in if statement below)
338 }
339 }
340 }
341
342 if let FormatExtend(t) = state {
343 // we were looking for something and didn't find it; we have to back up
344 if t == RequireLetter || t == RequireHLetter || t == RequireNumeric {
345 idx = saveidx;
346 cat = savecat;
347 take_curr = false;
348 }
349 }
350
351 self.cat = if take_curr {
352 idx = idx + self.string[idx..].chars().next().unwrap().len_utf8();
353 None
354 } else if take_cat {
355 Some(cat)
356 } else {
357 None
358 };
359
360 let retstr = &self.string[..idx];
361 self.string = &self.string[idx..];
362 Some(retstr)
363 }
364 }
365
366 impl<'a> DoubleEndedIterator for UWordBounds<'a> {
367 #[inline]
next_back(&mut self) -> Option<&'a str>368 fn next_back(&mut self) -> Option<&'a str> {
369 use self::UWordBoundsState::*;
370 use self::FormatExtendType::*;
371 use tables::word as wd;
372 if self.string.len() == 0 {
373 return None;
374 }
375
376 let mut take_curr = true;
377 let mut take_cat = true;
378 let mut idx = self.string.len();
379 idx -= self.string.chars().next_back().unwrap().len_utf8();
380 let mut previdx = idx;
381 let mut saveidx = idx;
382 let mut state = Start;
383 let mut savestate = Start;
384 let mut cat = wd::WC_Any;
385
386 let mut skipped_format_extend = false;
387
388 for (curr, ch) in self.string.char_indices().rev() {
389 previdx = idx;
390 idx = curr;
391
392 // if there's a category cached, grab it
393 cat = match self.catb {
394 None => wd::word_category(ch),
395 _ => self.catb.take().unwrap()
396 };
397 take_cat = true;
398
399 // backward iterator over word boundaries. Mostly the same as the forward
400 // iterator, with two weirdnesses:
401 // (1) If we encounter a single quote in the Start state, we have to check for a
402 // Hebrew Letter immediately before it.
403 // (2) Format and Extend char handling takes some gymnastics.
404
405 if cat == wd::WC_Extend
406 || cat == wd::WC_Format
407 || (cat == wd::WC_ZWJ && state != Zwj) { // WB3c has more priority so we should not
408 // fold in that case
409 if match state {
410 FormatExtend(_) | Start => false,
411 _ => true
412 } {
413 saveidx = previdx;
414 savestate = state;
415 state = FormatExtend(AcceptNone);
416 }
417
418 if state != Start {
419 continue;
420 }
421 } else if state == FormatExtend(AcceptNone) {
422 // finished a scan of some Format|Extend chars, restore previous state
423 state = savestate;
424 previdx = saveidx;
425 take_cat = false;
426 skipped_format_extend = true;
427 }
428
429 // Don't use `continue` in this match without updating `catb`
430 state = match state {
431 Start | FormatExtend(AcceptAny) => match cat {
432 _ if is_emoji(ch) => Zwj,
433 wd::WC_ALetter => Letter, // rule WB5, WB7, WB10, WB13b
434 wd::WC_Hebrew_Letter => HLetter, // rule WB5, WB7, WB7c, WB10, WB13b
435 wd::WC_Numeric => Numeric, // rule WB8, WB9, WB11, WB13b
436 wd::WC_Katakana => Katakana, // rule WB13, WB13b
437 wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13a
438 wd::WC_Regional_Indicator => Regional(RegionalState::Unknown), // rule WB13c
439 // rule WB4:
440 wd::WC_Extend | wd::WC_Format | wd::WC_ZWJ => FormatExtend(AcceptAny),
441 wd::WC_Single_Quote => {
442 saveidx = idx;
443 FormatExtend(AcceptQLetter) // rule WB7a
444 },
445 wd::WC_WSegSpace => WSegSpace,
446 wd::WC_CR | wd::WC_LF | wd::WC_Newline => {
447 if state == Start {
448 if cat == wd::WC_LF {
449 idx -= match self.get_prev_cat(idx) {
450 Some(pcat) if pcat == wd::WC_CR => 1, // rule WB3
451 _ => 0
452 };
453 }
454 } else {
455 take_curr = false;
456 }
457 break; // rule WB3a
458 },
459 _ => break // rule WB999
460 },
461 Zwj => match cat { // rule WB3c
462 wd::WC_ZWJ => {
463 FormatExtend(AcceptAny)
464 }
465 _ => {
466 take_curr = false;
467 break;
468 }
469 },
470 WSegSpace => match cat { // rule WB3d
471 wd::WC_WSegSpace if !skipped_format_extend => {
472 WSegSpace
473 }
474 _ => {
475 take_curr = false;
476 break;
477 }
478 },
479 Letter | HLetter => match cat {
480 wd::WC_ALetter => Letter, // rule WB5
481 wd::WC_Hebrew_Letter => HLetter, // rule WB5
482 wd::WC_Numeric => Numeric, // rule WB10
483 wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13b
484 wd::WC_Double_Quote if state == HLetter => {
485 saveidx = previdx;
486 FormatExtend(RequireHLetter) // rule WB7c
487 },
488 wd::WC_MidLetter | wd::WC_MidNumLet | wd::WC_Single_Quote => {
489 saveidx = previdx;
490 FormatExtend(RequireLetter) // rule WB7
491 },
492 _ => {
493 take_curr = false;
494 break;
495 }
496 },
497 Numeric => match cat {
498 wd::WC_Numeric => Numeric, // rule WB8
499 wd::WC_ALetter => Letter, // rule WB9
500 wd::WC_Hebrew_Letter => HLetter, // rule WB9
501 wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13b
502 wd::WC_MidNum | wd::WC_MidNumLet | wd::WC_Single_Quote => {
503 saveidx = previdx;
504 FormatExtend(RequireNumeric) // rule WB11
505 },
506 _ => {
507 take_curr = false;
508 break;
509 }
510 },
511 Katakana => match cat {
512 wd::WC_Katakana => Katakana, // rule WB13
513 wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13b
514 _ => {
515 take_curr = false;
516 break;
517 }
518 },
519 ExtendNumLet => match cat {
520 wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13a
521 wd::WC_ALetter => Letter, // rule WB13a
522 wd::WC_Hebrew_Letter => HLetter, // rule WB13a
523 wd::WC_Numeric => Numeric, // rule WB13a
524 wd::WC_Katakana => Katakana, // rule WB13a
525 _ => {
526 take_curr = false;
527 break;
528 }
529 },
530 Regional(mut regional_state) => match cat {
531 // rule WB13c
532 wd::WC_Regional_Indicator => {
533 if regional_state == RegionalState::Unknown {
534 let count = self.string[..previdx]
535 .chars().rev()
536 .map(|c| wd::word_category(c))
537 .filter(|&c| ! (c == wd::WC_ZWJ || c == wd::WC_Extend || c == wd::WC_Format))
538 .take_while(|&c| c == wd::WC_Regional_Indicator)
539 .count();
540 regional_state = if count % 2 == 0 {
541 RegionalState::Full
542 } else {
543 RegionalState::Half
544 };
545 }
546 if regional_state == RegionalState::Full {
547 take_curr = false;
548 break;
549 } else {
550 Regional(RegionalState::Full)
551 }
552 }
553 _ => {
554 take_curr = false;
555 break;
556 }
557 },
558 Emoji => {
559 if is_emoji(ch) { // rule WB3c
560 Zwj
561 } else {
562 take_curr = false;
563 break;
564 }
565 },
566 FormatExtend(t) => match t {
567 RequireNumeric if cat == wd::WC_Numeric => Numeric, // rule WB12
568 RequireLetter if cat == wd::WC_ALetter => Letter, // rule WB6
569 RequireLetter if cat == wd::WC_Hebrew_Letter => HLetter, // rule WB6
570 AcceptQLetter if cat == wd::WC_Hebrew_Letter => HLetter, // rule WB7a
571 RequireHLetter if cat == wd::WC_Hebrew_Letter => HLetter, // rule WB7b
572 _ => break // backtrack will happens
573 }
574 }
575 }
576
577 if let FormatExtend(t) = state {
578 // if we required something but didn't find it, backtrack
579 if t == RequireLetter || t == RequireHLetter ||
580 t == RequireNumeric || t == AcceptNone || t == AcceptQLetter {
581 previdx = saveidx;
582 take_cat = false;
583 take_curr = false;
584 }
585 }
586
587 self.catb = if take_curr {
588 None
589 } else {
590 idx = previdx;
591 if take_cat {
592 Some(cat)
593 } else {
594 None
595 }
596 };
597
598 let retstr = &self.string[idx..];
599 self.string = &self.string[..idx];
600 Some(retstr)
601 }
602 }
603
604 impl<'a> UWordBounds<'a> {
605 #[inline]
606 /// View the underlying data (the part yet to be iterated) as a slice of the original string.
607 ///
608 /// ```rust
609 /// # use unicode_segmentation::UnicodeSegmentation;
610 /// let mut iter = "Hello world".split_word_bounds();
611 /// assert_eq!(iter.as_str(), "Hello world");
612 /// iter.next();
613 /// assert_eq!(iter.as_str(), " world");
614 /// iter.next();
615 /// assert_eq!(iter.as_str(), "world");
616 /// ```
as_str(&self) -> &'a str617 pub fn as_str(&self) -> &'a str {
618 self.string
619 }
620
621 #[inline]
get_next_cat(&self, idx: usize) -> Option<WordCat>622 fn get_next_cat(&self, idx: usize) -> Option<WordCat> {
623 use tables::word as wd;
624 let nidx = idx + self.string[idx..].chars().next().unwrap().len_utf8();
625 if nidx < self.string.len() {
626 let nch = self.string[nidx..].chars().next().unwrap();
627 Some(wd::word_category(nch))
628 } else {
629 None
630 }
631 }
632
633 #[inline]
get_prev_cat(&self, idx: usize) -> Option<WordCat>634 fn get_prev_cat(&self, idx: usize) -> Option<WordCat> {
635 use tables::word as wd;
636 if idx > 0 {
637 let nch = self.string[..idx].chars().next_back().unwrap();
638 Some(wd::word_category(nch))
639 } else {
640 None
641 }
642 }
643 }
644
645 #[inline]
new_word_bounds<'b>(s: &'b str) -> UWordBounds<'b>646 pub fn new_word_bounds<'b>(s: &'b str) -> UWordBounds<'b> {
647 UWordBounds { string: s, cat: None, catb: None }
648 }
649
650 #[inline]
new_word_bound_indices<'b>(s: &'b str) -> UWordBoundIndices<'b>651 pub fn new_word_bound_indices<'b>(s: &'b str) -> UWordBoundIndices<'b> {
652 UWordBoundIndices { start_offset: s.as_ptr() as usize, iter: new_word_bounds(s) }
653 }
654
655 #[inline]
new_unicode_words<'b>(s: &'b str) -> UnicodeWords<'b>656 pub fn new_unicode_words<'b>(s: &'b str) -> UnicodeWords<'b> {
657 use super::UnicodeSegmentation;
658 use tables::util::is_alphanumeric;
659
660 fn has_alphanumeric(s: &&str) -> bool { s.chars().any(|c| is_alphanumeric(c)) }
661 let has_alphanumeric: fn(&&str) -> bool = has_alphanumeric; // coerce to fn pointer
662
663 UnicodeWords { inner: s.split_word_bounds().filter(has_alphanumeric) }
664 }
665