// Copyright 2012-2014 The Rust Project Developers. See the COPYRIGHT
// file at the top-level directory of this distribution and at
// http://rust-lang.org/COPYRIGHT.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.
10 
11 use core::cmp;
12 use core::iter::Filter;
13 
14 use tables::word::WordCat;
15 
16 /// An iterator over the substrings of a string which, after splitting the string on
17 /// [word boundaries](http://www.unicode.org/reports/tr29/#Word_Boundaries),
18 /// contain any characters with the
19 /// [Alphabetic](http://unicode.org/reports/tr44/#Alphabetic)
20 /// property, or with
21 /// [General_Category=Number](http://unicode.org/reports/tr44/#General_Category_Values).
22 ///
23 /// This struct is created by the [`unicode_words`] method on the [`UnicodeSegmentation`] trait. See
24 /// its documentation for more.
25 ///
26 /// [`unicode_words`]: trait.UnicodeSegmentation.html#tymethod.unicode_words
27 /// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html
28 pub struct UnicodeWords<'a> {
29     inner: Filter<UWordBounds<'a>, fn(&&str) -> bool>,
30 }
31 
32 impl<'a> Iterator for UnicodeWords<'a> {
33     type Item = &'a str;
34 
35     #[inline]
next(&mut self) -> Option<&'a str>36     fn next(&mut self) -> Option<&'a str> { self.inner.next() }
37 }
38 impl<'a> DoubleEndedIterator for UnicodeWords<'a> {
39     #[inline]
next_back(&mut self) -> Option<&'a str>40     fn next_back(&mut self) -> Option<&'a str> { self.inner.next_back() }
41 }
42 
43 /// External iterator for a string's
44 /// [word boundaries](http://www.unicode.org/reports/tr29/#Word_Boundaries).
45 ///
46 /// This struct is created by the [`split_word_bounds`] method on the [`UnicodeSegmentation`]
47 /// trait. See its documentation for more.
48 ///
49 /// [`split_word_bounds`]: trait.UnicodeSegmentation.html#tymethod.split_word_bounds
50 /// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html
51 #[derive(Clone)]
52 pub struct UWordBounds<'a> {
53     string: &'a str,
54     cat: Option<WordCat>,
55     catb: Option<WordCat>,
56 }
57 
58 /// External iterator for word boundaries and byte offsets.
59 ///
60 /// This struct is created by the [`split_word_bound_indices`] method on the
61 /// [`UnicodeSegmentation`] trait. See its documentation for more.
62 ///
63 /// [`split_word_bound_indices`]: trait.UnicodeSegmentation.html#tymethod.split_word_bound_indices
64 /// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html
65 #[derive(Clone)]
66 pub struct UWordBoundIndices<'a> {
67     start_offset: usize,
68     iter: UWordBounds<'a>,
69 }
70 
71 impl<'a> UWordBoundIndices<'a> {
72     #[inline]
73     /// View the underlying data (the part yet to be iterated) as a slice of the original string.
74     ///
75     /// ```rust
76     /// # use unicode_segmentation::UnicodeSegmentation;
77     /// let mut iter = "Hello world".split_word_bound_indices();
78     /// assert_eq!(iter.as_str(), "Hello world");
79     /// iter.next();
80     /// assert_eq!(iter.as_str(), " world");
81     /// iter.next();
82     /// assert_eq!(iter.as_str(), "world");
83     /// ```
as_str(&self) -> &'a str84     pub fn as_str(&self) -> &'a str {
85         self.iter.as_str()
86     }
87 }
88 
89 impl<'a> Iterator for UWordBoundIndices<'a> {
90     type Item = (usize, &'a str);
91 
92     #[inline]
next(&mut self) -> Option<(usize, &'a str)>93     fn next(&mut self) -> Option<(usize, &'a str)> {
94         self.iter.next().map(|s| (s.as_ptr() as usize - self.start_offset, s))
95     }
96 
97     #[inline]
size_hint(&self) -> (usize, Option<usize>)98     fn size_hint(&self) -> (usize, Option<usize>) {
99         self.iter.size_hint()
100     }
101 }
102 
103 impl<'a> DoubleEndedIterator for UWordBoundIndices<'a> {
104     #[inline]
next_back(&mut self) -> Option<(usize, &'a str)>105     fn next_back(&mut self) -> Option<(usize, &'a str)> {
106         self.iter.next_back().map(|s| (s.as_ptr() as usize - self.start_offset, s))
107     }
108 }
109 
110 // state machine for word boundary rules
111 #[derive(Clone,Copy,PartialEq,Eq,Debug)]
112 enum UWordBoundsState {
113     Start,
114     Letter,
115     HLetter,
116     Numeric,
117     Katakana,
118     ExtendNumLet,
119     Regional(RegionalState),
120     FormatExtend(FormatExtendType),
121     Zwj,
122     Emoji,
123     WSegSpace,
124 }
125 
// Subtypes for the FormatExtend state in UWordBoundsState: what the scanner
// is willing (or required) to see next.
#[derive(Clone,Copy,PartialEq,Eq,Debug)]
enum FormatExtendType {
    AcceptAny,
    AcceptNone,
    // The `Require*` variants trigger a rewind to the saved position when the
    // required category does not follow.
    RequireLetter,
    RequireHLetter,
    AcceptQLetter,
    RequireNumeric,
}
136 
// Pairing status of a regional-indicator run.
#[derive(Clone,Copy,PartialEq,Eq,Debug)]
enum RegionalState {
    // One indicator seen; a second one may still join it.
    Half,
    // Two indicators seen; a further indicator starts a new segment.
    Full,
    // Only produced by backward iteration, before the preceding indicators
    // have been counted.
    Unknown,
}
143 
is_emoji(ch: char) -> bool144 fn is_emoji(ch: char) -> bool {
145     use tables::emoji;
146     emoji::emoji_category(ch).2 == emoji::EmojiCat::EC_Extended_Pictographic
147 }
148 
149 impl<'a> Iterator for UWordBounds<'a> {
150     type Item = &'a str;
151 
152     #[inline]
size_hint(&self) -> (usize, Option<usize>)153     fn size_hint(&self) -> (usize, Option<usize>) {
154         let slen = self.string.len();
155         (cmp::min(slen, 1), Some(slen))
156     }
157 
158     #[inline]
next(&mut self) -> Option<&'a str>159     fn next(&mut self) -> Option<&'a str> {
160         use self::UWordBoundsState::*;
161         use self::FormatExtendType::*;
162         use tables::word as wd;
163         if self.string.len() == 0 {
164             return None;
165         }
166 
167         let mut take_curr = true;
168         let mut take_cat = true;
169         let mut idx = 0;
170         let mut saveidx = 0;
171         let mut state = Start;
172         let mut cat = wd::WC_Any;
173         let mut savecat = wd::WC_Any;
174 
175         // Whether or not the previous category was ZWJ
176         // ZWJs get collapsed, so this handles precedence of WB3c over WB4
177         let mut prev_zwj;
178         // If extend/format/zwj were skipped. Handles precedence of WB3d over WB4
179         let mut skipped_format_extend = false;
180         for (curr, ch) in self.string.char_indices() {
181             idx = curr;
182             prev_zwj = cat == wd::WC_ZWJ;
183             // if there's a category cached, grab it
184             cat = match self.cat {
185                 None => wd::word_category(ch).2,
186                 _ => self.cat.take().unwrap()
187             };
188             take_cat = true;
189 
190             // handle rule WB4
191             // just skip all format, extend, and zwj chars
192             // note that Start is a special case: if there's a bunch of Format | Extend
193             // characters at the beginning of a block of text, dump them out as one unit.
194             //
195             // (This is not obvious from the wording of UAX#29, but if you look at the
196             // test cases http://www.unicode.org/Public/UNIDATA/auxiliary/WordBreakTest.txt
197             // then the "correct" interpretation of WB4 becomes apparent.)
198             if state != Start {
199                 match cat {
200                     wd::WC_Extend | wd::WC_Format | wd::WC_ZWJ => {
201                         skipped_format_extend = true;
202                         continue
203                     }
204                     _ => {}
205                 }
206             }
207 
208             // rule WB3c
209             // WB4 makes all ZWJs collapse into the previous state
210             // but you can still be in a Zwj state if you started with Zwj
211             //
212             // This means that an EP + Zwj will collapse into EP, which is wrong,
213             // since EP+EP is not a boundary but EP+ZWJ+EP is
214             //
215             // Thus, we separately keep track of whether or not the last character
216             // was a ZWJ. This is an additional bit of state tracked outside of the
217             // state enum; the state enum represents the last non-zwj state encountered.
218             // When prev_zwj is true, for the purposes of WB3c, we are in the Zwj state,
219             // however we are in the previous state for the purposes of all other rules.
220             if prev_zwj {
221                 if is_emoji(ch) {
222                     state = Emoji;
223                     continue;
224                 }
225             }
226             // Don't use `continue` in this match without updating `cat`
227             state = match state {
228                 Start if cat == wd::WC_CR => {
229                     idx += match self.get_next_cat(idx) {
230                         Some(ncat) if ncat == wd::WC_LF => 1,       // rule WB3
231                         _ => 0
232                     };
233                     break;                                          // rule WB3a
234                 },
235                 Start => match cat {
236                     wd::WC_ALetter => Letter,           // rule WB5, WB6, WB9, WB13a
237                     wd::WC_Hebrew_Letter => HLetter,    // rule WB5, WB6, WB7a, WB7b, WB9, WB13a
238                     wd::WC_Numeric => Numeric,          // rule WB8, WB10, WB12, WB13a
239                     wd::WC_Katakana => Katakana,        // rule WB13, WB13a
240                     wd::WC_ExtendNumLet => ExtendNumLet,    // rule WB13a, WB13b
241                     wd::WC_Regional_Indicator => Regional(RegionalState::Half),  // rule WB13c
242                     wd::WC_LF | wd::WC_Newline => break,    // rule WB3a
243                     wd::WC_ZWJ => Zwj,                      // rule WB3c
244                     wd::WC_WSegSpace => WSegSpace,          // rule WB3d
245                     _ => {
246                         if let Some(ncat) = self.get_next_cat(idx) {                // rule WB4
247                             if ncat == wd::WC_Format || ncat == wd::WC_Extend || ncat == wd::WC_ZWJ {
248                                 state = FormatExtend(AcceptNone);
249                                 self.cat = Some(ncat);
250                                 continue;
251                             }
252                         }
253                         break;                                                      // rule WB999
254                     }
255                 },
256                 WSegSpace => match cat {
257                     wd::WC_WSegSpace if !skipped_format_extend => WSegSpace,
258                     _ => {
259                         take_curr = false;
260                         break;
261                     }
262                 },
263                 Zwj => {
264                     // We already handle WB3c above.
265                     take_curr = false;
266                     break;
267                 }
268                 Letter | HLetter => match cat {
269                     wd::WC_ALetter => Letter,                   // rule WB5
270                     wd::WC_Hebrew_Letter => HLetter,            // rule WB5
271                     wd::WC_Numeric => Numeric,                  // rule WB9
272                     wd::WC_ExtendNumLet => ExtendNumLet,        // rule WB13a
273                     wd::WC_Double_Quote if state == HLetter => {
274                         savecat = cat;
275                         saveidx = idx;
276                         FormatExtend(RequireHLetter)                        // rule WB7b
277                     },
278                     wd::WC_Single_Quote if state == HLetter => {
279                         FormatExtend(AcceptQLetter)                         // rule WB7a
280                     },
281                     wd::WC_MidLetter | wd::WC_MidNumLet | wd::WC_Single_Quote => {
282                         savecat = cat;
283                         saveidx = idx;
284                         FormatExtend(RequireLetter)                         // rule WB6
285                     },
286                     _ => {
287                         take_curr = false;
288                         break;
289                     }
290                 },
291                 Numeric => match cat {
292                     wd::WC_Numeric => Numeric,                  // rule WB8
293                     wd::WC_ALetter => Letter,                   // rule WB10
294                     wd::WC_Hebrew_Letter => HLetter,            // rule WB10
295                     wd::WC_ExtendNumLet => ExtendNumLet,        // rule WB13a
296                     wd::WC_MidNum | wd::WC_MidNumLet | wd::WC_Single_Quote => {
297                         savecat = cat;
298                         saveidx = idx;
299                         FormatExtend(RequireNumeric)            // rule WB12
300                     },
301                     _ => {
302                         take_curr = false;
303                         break;
304                     }
305                 },
306                 Katakana => match cat {
307                     wd::WC_Katakana => Katakana,                // rule WB13
308                     wd::WC_ExtendNumLet => ExtendNumLet,        // rule WB13a
309                     _ => {
310                         take_curr = false;
311                         break;
312                     }
313                 },
314                 ExtendNumLet => match cat {
315                     wd::WC_ExtendNumLet => ExtendNumLet,        // rule WB13a
316                     wd::WC_ALetter => Letter,                   // rule WB13b
317                     wd::WC_Hebrew_Letter => HLetter,            // rule WB13b
318                     wd::WC_Numeric => Numeric,                  // rule WB13b
319                     wd::WC_Katakana => Katakana,                // rule WB13b
320                     _ => {
321                         take_curr = false;
322                         break;
323                     }
324                 },
325                 Regional(RegionalState::Full) => {
326                     // if it reaches here we've gone too far,
327                     // a full flag can only compose with ZWJ/Extend/Format
328                     // proceeding it.
329                     take_curr = false;
330                     break;
331                 }
332                 Regional(RegionalState::Half) => match cat {
333                     wd::WC_Regional_Indicator => Regional(RegionalState::Full),      // rule WB13c
334                     _ => {
335                         take_curr = false;
336                         break;
337                     }
338                 },
339                 Regional(_) => unreachable!("RegionalState::Unknown should not occur on forward iteration"),
340                 Emoji => {
341                     // We already handle WB3c above. If you've reached this point, the emoji sequence is over.
342                     take_curr = false;
343                     break;
344                 },
345                 FormatExtend(t) => match t {    // handle FormatExtends depending on what type
346                     RequireNumeric if cat == wd::WC_Numeric => Numeric,     // rule WB11
347                     RequireLetter | AcceptQLetter if cat == wd::WC_ALetter => Letter,   // rule WB7
348                     RequireLetter | AcceptQLetter if cat == wd::WC_Hebrew_Letter => HLetter, // WB7a
349                     RequireHLetter if cat == wd::WC_Hebrew_Letter => HLetter,   // rule WB7b
350                     AcceptNone | AcceptQLetter => {
351                         take_curr = false;  // emit all the Format|Extend characters
352                         take_cat = false;
353                         break;
354                     },
355                     _ => break      // rewind (in if statement below)
356                 }
357             }
358         }
359 
360         if let FormatExtend(t) = state {
361             // we were looking for something and didn't find it; we have to back up
362             if t == RequireLetter || t == RequireHLetter || t == RequireNumeric {
363                 idx = saveidx;
364                 cat = savecat;
365                 take_curr = false;
366             }
367         }
368 
369         self.cat = if take_curr {
370             idx = idx + self.string[idx..].chars().next().unwrap().len_utf8();
371             None
372         } else if take_cat {
373             Some(cat)
374         } else {
375             None
376         };
377 
378         let retstr = &self.string[..idx];
379         self.string = &self.string[idx..];
380         Some(retstr)
381     }
382 }
383 
384 impl<'a> DoubleEndedIterator for UWordBounds<'a> {
385     #[inline]
next_back(&mut self) -> Option<&'a str>386     fn next_back(&mut self) -> Option<&'a str> {
387         use self::UWordBoundsState::*;
388         use self::FormatExtendType::*;
389         use tables::word as wd;
390         if self.string.len() == 0 {
391             return None;
392         }
393 
394         let mut take_curr = true;
395         let mut take_cat = true;
396         let mut idx = self.string.len();
397         idx -= self.string.chars().next_back().unwrap().len_utf8();
398         let mut previdx = idx;
399         let mut saveidx = idx;
400         let mut state = Start;
401         let mut savestate = Start;
402         let mut cat = wd::WC_Any;
403 
404         let mut skipped_format_extend = false;
405 
406         for (curr, ch) in self.string.char_indices().rev() {
407             previdx = idx;
408             idx = curr;
409 
410             // if there's a category cached, grab it
411             cat = match self.catb {
412                 None => wd::word_category(ch).2,
413                 _ => self.catb.take().unwrap()
414             };
415             take_cat = true;
416 
417             // backward iterator over word boundaries. Mostly the same as the forward
418             // iterator, with two weirdnesses:
419             // (1) If we encounter a single quote in the Start state, we have to check for a
420             //     Hebrew Letter immediately before it.
421             // (2) Format and Extend char handling takes some gymnastics.
422 
423             if cat == wd::WC_Extend
424                 || cat == wd::WC_Format
425                 || (cat == wd::WC_ZWJ && state != Zwj) { // WB3c has more priority so we should not
426                                                          // fold in that case
427                 if match state {
428                     FormatExtend(_) | Start => false,
429                     _ => true
430                 } {
431                     saveidx = previdx;
432                     savestate = state;
433                     state = FormatExtend(AcceptNone);
434                 }
435 
436                 if state != Start {
437                     continue;
438                 }
439             } else if state == FormatExtend(AcceptNone) {
440                 // finished a scan of some Format|Extend chars, restore previous state
441                 state = savestate;
442                 previdx = saveidx;
443                 take_cat = false;
444                 skipped_format_extend = true;
445             }
446 
447             // Don't use `continue` in this match without updating `catb`
448             state = match state {
449                 Start | FormatExtend(AcceptAny) => match cat {
450                     _ if is_emoji(ch) => Zwj,
451                     wd::WC_ALetter => Letter,           // rule WB5, WB7, WB10, WB13b
452                     wd::WC_Hebrew_Letter => HLetter,    // rule WB5, WB7, WB7c, WB10, WB13b
453                     wd::WC_Numeric => Numeric,          // rule WB8, WB9, WB11, WB13b
454                     wd::WC_Katakana => Katakana,                    // rule WB13, WB13b
455                     wd::WC_ExtendNumLet => ExtendNumLet,                    // rule WB13a
456                     wd::WC_Regional_Indicator => Regional(RegionalState::Unknown), // rule WB13c
457                     // rule WB4:
458                     wd::WC_Extend | wd::WC_Format | wd::WC_ZWJ => FormatExtend(AcceptAny),
459                     wd::WC_Single_Quote => {
460                         saveidx = idx;
461                         FormatExtend(AcceptQLetter)                         // rule WB7a
462                     },
463                     wd::WC_WSegSpace => WSegSpace,
464                     wd::WC_CR | wd::WC_LF | wd::WC_Newline => {
465                         if state == Start {
466                             if cat == wd::WC_LF {
467                                 idx -= match self.get_prev_cat(idx) {
468                                     Some(pcat) if pcat == wd::WC_CR => 1,   // rule WB3
469                                     _ => 0
470                                 };
471                             }
472                         } else {
473                             take_curr = false;
474                         }
475                         break;                                              // rule WB3a
476                     },
477                     _ => break                              // rule WB999
478                 },
479                 Zwj => match cat {                          // rule WB3c
480                     wd::WC_ZWJ => {
481                         FormatExtend(AcceptAny)
482                     }
483                     _ => {
484                         take_curr = false;
485                         break;
486                     }
487                 },
488                 WSegSpace => match cat {                          // rule WB3d
489                     wd::WC_WSegSpace if !skipped_format_extend => {
490                         WSegSpace
491                     }
492                     _ => {
493                         take_curr = false;
494                         break;
495                     }
496                 },
497                 Letter | HLetter => match cat {
498                     wd::WC_ALetter => Letter,               // rule WB5
499                     wd::WC_Hebrew_Letter => HLetter,        // rule WB5
500                     wd::WC_Numeric => Numeric,              // rule WB10
501                     wd::WC_ExtendNumLet => ExtendNumLet,    // rule WB13b
502                     wd::WC_Double_Quote if state == HLetter => {
503                         saveidx = previdx;
504                         FormatExtend(RequireHLetter)         // rule WB7c
505                     },
506                     wd::WC_MidLetter | wd::WC_MidNumLet | wd::WC_Single_Quote => {
507                         saveidx = previdx;
508                         FormatExtend(RequireLetter)          // rule WB7
509                     },
510                     _ => {
511                         take_curr = false;
512                         break;
513                     }
514                 },
515                 Numeric => match cat {
516                     wd::WC_Numeric => Numeric,              // rule WB8
517                     wd::WC_ALetter => Letter,               // rule WB9
518                     wd::WC_Hebrew_Letter => HLetter,        // rule WB9
519                     wd::WC_ExtendNumLet => ExtendNumLet,    // rule WB13b
520                     wd::WC_MidNum | wd::WC_MidNumLet | wd::WC_Single_Quote => {
521                         saveidx = previdx;
522                         FormatExtend(RequireNumeric)         // rule WB11
523                     },
524                     _ => {
525                         take_curr = false;
526                         break;
527                     }
528                 },
529                 Katakana => match cat {
530                     wd::WC_Katakana => Katakana,            // rule WB13
531                     wd::WC_ExtendNumLet => ExtendNumLet,    // rule WB13b
532                     _ => {
533                         take_curr = false;
534                         break;
535                     }
536                 },
537                 ExtendNumLet => match cat {
538                     wd::WC_ExtendNumLet => ExtendNumLet,    // rule WB13a
539                     wd::WC_ALetter => Letter,               // rule WB13a
540                     wd::WC_Hebrew_Letter => HLetter,        // rule WB13a
541                     wd::WC_Numeric => Numeric,              // rule WB13a
542                     wd::WC_Katakana => Katakana,            // rule WB13a
543                     _ => {
544                         take_curr = false;
545                         break;
546                     }
547                 },
548                 Regional(mut regional_state) => match cat {
549                     // rule WB13c
550                     wd::WC_Regional_Indicator => {
551                         if regional_state == RegionalState::Unknown {
552                             let count = self.string[..previdx]
553                                             .chars().rev()
554                                             .map(|c| wd::word_category(c).2)
555                                             .filter(|&c| ! (c == wd::WC_ZWJ || c == wd::WC_Extend || c == wd::WC_Format))
556                                             .take_while(|&c| c == wd::WC_Regional_Indicator)
557                                             .count();
558                             regional_state = if count % 2 == 0 {
559                                 RegionalState::Full
560                             } else {
561                                 RegionalState::Half
562                             };
563                         }
564                         if regional_state == RegionalState::Full {
565                             take_curr = false;
566                             break;
567                         } else {
568                             Regional(RegionalState::Full)
569                         }
570                     }
571                     _ => {
572                         take_curr = false;
573                         break;
574                     }
575                 },
576                 Emoji => {
577                     if is_emoji(ch) {           // rule WB3c
578                         Zwj
579                     } else {
580                         take_curr = false;
581                         break;
582                     }
583                 },
584                 FormatExtend(t) => match t {
585                     RequireNumeric if cat == wd::WC_Numeric => Numeric,          // rule WB12
586                     RequireLetter if cat == wd::WC_ALetter => Letter,            // rule WB6
587                     RequireLetter if cat == wd::WC_Hebrew_Letter => HLetter,     // rule WB6
588                     AcceptQLetter if cat == wd::WC_Hebrew_Letter => HLetter,     // rule WB7a
589                     RequireHLetter if cat == wd::WC_Hebrew_Letter => HLetter,    // rule WB7b
590                     _ => break  // backtrack will happens
591                 }
592             }
593         }
594 
595         if let FormatExtend(t) = state {
596             // if we required something but didn't find it, backtrack
597             if t == RequireLetter || t == RequireHLetter ||
598                 t == RequireNumeric || t == AcceptNone || t == AcceptQLetter {
599                 previdx = saveidx;
600                 take_cat = false;
601                 take_curr = false;
602             }
603         }
604 
605         self.catb = if take_curr {
606             None
607         } else {
608             idx = previdx;
609             if take_cat {
610                 Some(cat)
611             } else {
612                 None
613             }
614         };
615 
616         let retstr = &self.string[idx..];
617         self.string = &self.string[..idx];
618         Some(retstr)
619     }
620 }
621 
622 impl<'a> UWordBounds<'a> {
623     #[inline]
624     /// View the underlying data (the part yet to be iterated) as a slice of the original string.
625     ///
626     /// ```rust
627     /// # use unicode_segmentation::UnicodeSegmentation;
628     /// let mut iter = "Hello world".split_word_bounds();
629     /// assert_eq!(iter.as_str(), "Hello world");
630     /// iter.next();
631     /// assert_eq!(iter.as_str(), " world");
632     /// iter.next();
633     /// assert_eq!(iter.as_str(), "world");
634     /// ```
as_str(&self) -> &'a str635     pub fn as_str(&self) -> &'a str {
636         self.string
637     }
638 
639     #[inline]
get_next_cat(&self, idx: usize) -> Option<WordCat>640     fn get_next_cat(&self, idx: usize) -> Option<WordCat> {
641         use tables::word as wd;
642         let nidx = idx + self.string[idx..].chars().next().unwrap().len_utf8();
643         if nidx < self.string.len() {
644             let nch = self.string[nidx..].chars().next().unwrap();
645             Some(wd::word_category(nch).2)
646         } else {
647             None
648         }
649     }
650 
651     #[inline]
get_prev_cat(&self, idx: usize) -> Option<WordCat>652     fn get_prev_cat(&self, idx: usize) -> Option<WordCat> {
653         use tables::word as wd;
654         if idx > 0 {
655             let nch = self.string[..idx].chars().next_back().unwrap();
656             Some(wd::word_category(nch).2)
657         } else {
658             None
659         }
660     }
661 }
662 
/// Creates a word-boundary iterator over `s` with no cached categories.
#[inline]
pub fn new_word_bounds<'b>(s: &'b str) -> UWordBounds<'b> {
    UWordBounds { string: s, cat: None, catb: None }
}
667 
668 #[inline]
new_word_bound_indices<'b>(s: &'b str) -> UWordBoundIndices<'b>669 pub fn new_word_bound_indices<'b>(s: &'b str) -> UWordBoundIndices<'b> {
670     UWordBoundIndices { start_offset: s.as_ptr() as usize, iter: new_word_bounds(s) }
671 }
672 
673 #[inline]
new_unicode_words<'b>(s: &'b str) -> UnicodeWords<'b>674 pub fn new_unicode_words<'b>(s: &'b str) -> UnicodeWords<'b> {
675     use super::UnicodeSegmentation;
676     use tables::util::is_alphanumeric;
677 
678     fn has_alphanumeric(s: &&str) -> bool { s.chars().any(|c| is_alphanumeric(c)) }
679     let has_alphanumeric: fn(&&str) -> bool = has_alphanumeric; // coerce to fn pointer
680 
681     UnicodeWords { inner: s.split_word_bounds().filter(has_alphanumeric) }
682 }
683