// Copyright 2012-2014 The Rust Project Developers. See the COPYRIGHT
// file at the top-level directory of this distribution and at
// http://rust-lang.org/COPYRIGHT.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.

use core::cmp;
use core::iter::Filter;

use tables::word::WordCat;

16 /// An iterator over the substrings of a string which, after splitting the string on
17 /// [word boundaries](http://www.unicode.org/reports/tr29/#Word_Boundaries),
18 /// contain any characters with the
19 /// [Alphabetic](http://unicode.org/reports/tr44/#Alphabetic)
20 /// property, or with
21 /// [General_Category=Number](http://unicode.org/reports/tr44/#General_Category_Values).
22 pub struct UnicodeWords<'a> {
23     inner: Filter<UWordBounds<'a>, fn(&&str) -> bool>,
24 }
25 
26 impl<'a> Iterator for UnicodeWords<'a> {
27     type Item = &'a str;
28 
29     #[inline]
next(&mut self) -> Option<&'a str>30     fn next(&mut self) -> Option<&'a str> { self.inner.next() }
31 }
32 impl<'a> DoubleEndedIterator for UnicodeWords<'a> {
33     #[inline]
next_back(&mut self) -> Option<&'a str>34     fn next_back(&mut self) -> Option<&'a str> { self.inner.next_back() }
35 }
36 
37 /// External iterator for a string's
38 /// [word boundaries](http://www.unicode.org/reports/tr29/#Word_Boundaries).
39 #[derive(Clone)]
40 pub struct UWordBounds<'a> {
41     string: &'a str,
42     cat: Option<WordCat>,
43     catb: Option<WordCat>,
44 }
45 
46 /// External iterator for word boundaries and byte offsets.
47 #[derive(Clone)]
48 pub struct UWordBoundIndices<'a> {
49     start_offset: usize,
50     iter: UWordBounds<'a>,
51 }
52 
53 impl<'a> UWordBoundIndices<'a> {
54     #[inline]
55     /// View the underlying data (the part yet to be iterated) as a slice of the original string.
56     ///
57     /// ```rust
58     /// # use unicode_segmentation::UnicodeSegmentation;
59     /// let mut iter = "Hello world".split_word_bound_indices();
60     /// assert_eq!(iter.as_str(), "Hello world");
61     /// iter.next();
62     /// assert_eq!(iter.as_str(), " world");
63     /// iter.next();
64     /// assert_eq!(iter.as_str(), "world");
65     /// ```
as_str(&self) -> &'a str66     pub fn as_str(&self) -> &'a str {
67         self.iter.as_str()
68     }
69 }
70 
71 impl<'a> Iterator for UWordBoundIndices<'a> {
72     type Item = (usize, &'a str);
73 
74     #[inline]
next(&mut self) -> Option<(usize, &'a str)>75     fn next(&mut self) -> Option<(usize, &'a str)> {
76         self.iter.next().map(|s| (s.as_ptr() as usize - self.start_offset, s))
77     }
78 
79     #[inline]
size_hint(&self) -> (usize, Option<usize>)80     fn size_hint(&self) -> (usize, Option<usize>) {
81         self.iter.size_hint()
82     }
83 }
84 
85 impl<'a> DoubleEndedIterator for UWordBoundIndices<'a> {
86     #[inline]
next_back(&mut self) -> Option<(usize, &'a str)>87     fn next_back(&mut self) -> Option<(usize, &'a str)> {
88         self.iter.next_back().map(|s| (s.as_ptr() as usize - self.start_offset, s))
89     }
90 }
91 
92 // state machine for word boundary rules
93 #[derive(Clone,Copy,PartialEq,Eq,Debug)]
94 enum UWordBoundsState {
95     Start,
96     Letter,
97     HLetter,
98     Numeric,
99     Katakana,
100     ExtendNumLet,
101     Regional(RegionalState),
102     FormatExtend(FormatExtendType),
103     Zwj,
104     Emoji,
105     WSegSpace,
106 }
107 
// Subtypes for the FormatExtend state in UWordBoundsState: what the state
// machine will accept after the character(s) that put it into FormatExtend.
#[derive(Clone, Copy, PartialEq, Eq, Debug)]
enum FormatExtendType {
    AcceptAny,
    AcceptNone,
    // A letter must follow, otherwise the iterator backs up to the saved index.
    RequireLetter,
    // A Hebrew letter must follow, otherwise the iterator backs up.
    RequireHLetter,
    AcceptQLetter,
    // A numeric character must follow, otherwise the iterator backs up.
    RequireNumeric,
}

// Pairing status of a run of regional-indicator characters.
#[derive(Clone, Copy, PartialEq, Eq, Debug)]
enum RegionalState {
    // One indicator seen; one more can still join it.
    Half,
    // A complete pair has been seen; a further indicator starts a new segment.
    Full,
    // Used only by backward iteration, before the preceding indicators have
    // been counted to decide between Half and Full.
    Unknown,
}

is_emoji(ch: char) -> bool126 fn is_emoji(ch: char) -> bool {
127     use tables::emoji;
128     emoji::emoji_category(ch) == emoji::EmojiCat::EC_Extended_Pictographic
129 }
130 
131 impl<'a> Iterator for UWordBounds<'a> {
132     type Item = &'a str;
133 
134     #[inline]
size_hint(&self) -> (usize, Option<usize>)135     fn size_hint(&self) -> (usize, Option<usize>) {
136         let slen = self.string.len();
137         (cmp::min(slen, 1), Some(slen))
138     }
139 
140     #[inline]
next(&mut self) -> Option<&'a str>141     fn next(&mut self) -> Option<&'a str> {
142         use self::UWordBoundsState::*;
143         use self::FormatExtendType::*;
144         use tables::word as wd;
145         if self.string.len() == 0 {
146             return None;
147         }
148 
149         let mut take_curr = true;
150         let mut take_cat = true;
151         let mut idx = 0;
152         let mut saveidx = 0;
153         let mut state = Start;
154         let mut cat = wd::WC_Any;
155         let mut savecat = wd::WC_Any;
156 
157         // Whether or not the previous category was ZWJ
158         // ZWJs get collapsed, so this handles precedence of WB3c over WB4
159         let mut prev_zwj;
160         // If extend/format/zwj were skipped. Handles precedence of WB3d over WB4
161         let mut skipped_format_extend = false;
162         for (curr, ch) in self.string.char_indices() {
163             idx = curr;
164             prev_zwj = cat == wd::WC_ZWJ;
165             // if there's a category cached, grab it
166             cat = match self.cat {
167                 None => wd::word_category(ch),
168                 _ => self.cat.take().unwrap()
169             };
170             take_cat = true;
171 
172             // handle rule WB4
173             // just skip all format, extend, and zwj chars
174             // note that Start is a special case: if there's a bunch of Format | Extend
175             // characters at the beginning of a block of text, dump them out as one unit.
176             //
177             // (This is not obvious from the wording of UAX#29, but if you look at the
178             // test cases http://www.unicode.org/Public/UNIDATA/auxiliary/WordBreakTest.txt
179             // then the "correct" interpretation of WB4 becomes apparent.)
180             if state != Start {
181                 match cat {
182                     wd::WC_Extend | wd::WC_Format | wd::WC_ZWJ => {
183                         skipped_format_extend = true;
184                         continue
185                     }
186                     _ => {}
187                 }
188             }
189 
190             // rule WB3c
191             // WB4 makes all ZWJs collapse into the previous state
192             // but you can still be in a Zwj state if you started with Zwj
193             //
194             // This means that an EP + Zwj will collapse into EP, which is wrong,
195             // since EP+EP is not a boundary but EP+ZWJ+EP is
196             //
197             // Thus, we separately keep track of whether or not the last character
198             // was a ZWJ. This is an additional bit of state tracked outside of the
199             // state enum; the state enum represents the last non-zwj state encountered.
200             // When prev_zwj is true, for the purposes of WB3c, we are in the Zwj state,
201             // however we are in the previous state for the purposes of all other rules.
202             if prev_zwj {
203                 if is_emoji(ch) {
204                     state = Emoji;
205                     continue;
206                 }
207             }
208             // Don't use `continue` in this match without updating `cat`
209             state = match state {
210                 Start if cat == wd::WC_CR => {
211                     idx += match self.get_next_cat(idx) {
212                         Some(ncat) if ncat == wd::WC_LF => 1,       // rule WB3
213                         _ => 0
214                     };
215                     break;                                          // rule WB3a
216                 },
217                 Start => match cat {
218                     wd::WC_ALetter => Letter,           // rule WB5, WB6, WB9, WB13a
219                     wd::WC_Hebrew_Letter => HLetter,    // rule WB5, WB6, WB7a, WB7b, WB9, WB13a
220                     wd::WC_Numeric => Numeric,          // rule WB8, WB10, WB12, WB13a
221                     wd::WC_Katakana => Katakana,        // rule WB13, WB13a
222                     wd::WC_ExtendNumLet => ExtendNumLet,    // rule WB13a, WB13b
223                     wd::WC_Regional_Indicator => Regional(RegionalState::Half),  // rule WB13c
224                     wd::WC_LF | wd::WC_Newline => break,    // rule WB3a
225                     wd::WC_ZWJ => Zwj,                      // rule WB3c
226                     wd::WC_WSegSpace => WSegSpace,          // rule WB3d
227                     _ => {
228                         if let Some(ncat) = self.get_next_cat(idx) {                // rule WB4
229                             if ncat == wd::WC_Format || ncat == wd::WC_Extend || ncat == wd::WC_ZWJ {
230                                 state = FormatExtend(AcceptNone);
231                                 self.cat = Some(ncat);
232                                 continue;
233                             }
234                         }
235                         break;                                                      // rule WB999
236                     }
237                 },
238                 WSegSpace => match cat {
239                     wd::WC_WSegSpace if !skipped_format_extend => WSegSpace,
240                     _ => {
241                         take_curr = false;
242                         break;
243                     }
244                 },
245                 Zwj => {
246                     // We already handle WB3c above.
247                     take_curr = false;
248                     break;
249                 }
250                 Letter | HLetter => match cat {
251                     wd::WC_ALetter => Letter,                   // rule WB5
252                     wd::WC_Hebrew_Letter => HLetter,            // rule WB5
253                     wd::WC_Numeric => Numeric,                  // rule WB9
254                     wd::WC_ExtendNumLet => ExtendNumLet,        // rule WB13a
255                     wd::WC_Double_Quote if state == HLetter => {
256                         savecat = cat;
257                         saveidx = idx;
258                         FormatExtend(RequireHLetter)                        // rule WB7b
259                     },
260                     wd::WC_Single_Quote if state == HLetter => {
261                         FormatExtend(AcceptQLetter)                         // rule WB7a
262                     },
263                     wd::WC_MidLetter | wd::WC_MidNumLet | wd::WC_Single_Quote => {
264                         savecat = cat;
265                         saveidx = idx;
266                         FormatExtend(RequireLetter)                         // rule WB6
267                     },
268                     _ => {
269                         take_curr = false;
270                         break;
271                     }
272                 },
273                 Numeric => match cat {
274                     wd::WC_Numeric => Numeric,                  // rule WB8
275                     wd::WC_ALetter => Letter,                   // rule WB10
276                     wd::WC_Hebrew_Letter => HLetter,            // rule WB10
277                     wd::WC_ExtendNumLet => ExtendNumLet,        // rule WB13a
278                     wd::WC_MidNum | wd::WC_MidNumLet | wd::WC_Single_Quote => {
279                         savecat = cat;
280                         saveidx = idx;
281                         FormatExtend(RequireNumeric)            // rule WB12
282                     },
283                     _ => {
284                         take_curr = false;
285                         break;
286                     }
287                 },
288                 Katakana => match cat {
289                     wd::WC_Katakana => Katakana,                // rule WB13
290                     wd::WC_ExtendNumLet => ExtendNumLet,        // rule WB13a
291                     _ => {
292                         take_curr = false;
293                         break;
294                     }
295                 },
296                 ExtendNumLet => match cat {
297                     wd::WC_ExtendNumLet => ExtendNumLet,        // rule WB13a
298                     wd::WC_ALetter => Letter,                   // rule WB13b
299                     wd::WC_Hebrew_Letter => HLetter,            // rule WB13b
300                     wd::WC_Numeric => Numeric,                  // rule WB13b
301                     wd::WC_Katakana => Katakana,                // rule WB13b
302                     _ => {
303                         take_curr = false;
304                         break;
305                     }
306                 },
307                 Regional(RegionalState::Full) => {
308                     // if it reaches here we've gone too far,
309                     // a full flag can only compose with ZWJ/Extend/Format
310                     // proceeding it.
311                     take_curr = false;
312                     break;
313                 }
314                 Regional(RegionalState::Half) => match cat {
315                     wd::WC_Regional_Indicator => Regional(RegionalState::Full),      // rule WB13c
316                     _ => {
317                         take_curr = false;
318                         break;
319                     }
320                 },
321                 Regional(_) => unreachable!("RegionalState::Unknown should not occur on forward iteration"),
322                 Emoji => {
323                     // We already handle WB3c above. If you've reached this point, the emoji sequence is over.
324                     take_curr = false;
325                     break;
326                 },
327                 FormatExtend(t) => match t {    // handle FormatExtends depending on what type
328                     RequireNumeric if cat == wd::WC_Numeric => Numeric,     // rule WB11
329                     RequireLetter | AcceptQLetter if cat == wd::WC_ALetter => Letter,   // rule WB7
330                     RequireLetter | AcceptQLetter if cat == wd::WC_Hebrew_Letter => HLetter, // WB7a
331                     RequireHLetter if cat == wd::WC_Hebrew_Letter => HLetter,   // rule WB7b
332                     AcceptNone | AcceptQLetter => {
333                         take_curr = false;  // emit all the Format|Extend characters
334                         take_cat = false;
335                         break;
336                     },
337                     _ => break      // rewind (in if statement below)
338                 }
339             }
340         }
341 
342         if let FormatExtend(t) = state {
343             // we were looking for something and didn't find it; we have to back up
344             if t == RequireLetter || t == RequireHLetter || t == RequireNumeric {
345                 idx = saveidx;
346                 cat = savecat;
347                 take_curr = false;
348             }
349         }
350 
351         self.cat = if take_curr {
352             idx = idx + self.string[idx..].chars().next().unwrap().len_utf8();
353             None
354         } else if take_cat {
355             Some(cat)
356         } else {
357             None
358         };
359 
360         let retstr = &self.string[..idx];
361         self.string = &self.string[idx..];
362         Some(retstr)
363     }
364 }
365 
366 impl<'a> DoubleEndedIterator for UWordBounds<'a> {
367     #[inline]
next_back(&mut self) -> Option<&'a str>368     fn next_back(&mut self) -> Option<&'a str> {
369         use self::UWordBoundsState::*;
370         use self::FormatExtendType::*;
371         use tables::word as wd;
372         if self.string.len() == 0 {
373             return None;
374         }
375 
376         let mut take_curr = true;
377         let mut take_cat = true;
378         let mut idx = self.string.len();
379         idx -= self.string.chars().next_back().unwrap().len_utf8();
380         let mut previdx = idx;
381         let mut saveidx = idx;
382         let mut state = Start;
383         let mut savestate = Start;
384         let mut cat = wd::WC_Any;
385 
386         let mut skipped_format_extend = false;
387 
388         for (curr, ch) in self.string.char_indices().rev() {
389             previdx = idx;
390             idx = curr;
391 
392             // if there's a category cached, grab it
393             cat = match self.catb {
394                 None => wd::word_category(ch),
395                 _ => self.catb.take().unwrap()
396             };
397             take_cat = true;
398 
399             // backward iterator over word boundaries. Mostly the same as the forward
400             // iterator, with two weirdnesses:
401             // (1) If we encounter a single quote in the Start state, we have to check for a
402             //     Hebrew Letter immediately before it.
403             // (2) Format and Extend char handling takes some gymnastics.
404 
405             if cat == wd::WC_Extend
406                 || cat == wd::WC_Format
407                 || (cat == wd::WC_ZWJ && state != Zwj) { // WB3c has more priority so we should not
408                                                          // fold in that case
409                 if match state {
410                     FormatExtend(_) | Start => false,
411                     _ => true
412                 } {
413                     saveidx = previdx;
414                     savestate = state;
415                     state = FormatExtend(AcceptNone);
416                 }
417 
418                 if state != Start {
419                     continue;
420                 }
421             } else if state == FormatExtend(AcceptNone) {
422                 // finished a scan of some Format|Extend chars, restore previous state
423                 state = savestate;
424                 previdx = saveidx;
425                 take_cat = false;
426                 skipped_format_extend = true;
427             }
428 
429             // Don't use `continue` in this match without updating `catb`
430             state = match state {
431                 Start | FormatExtend(AcceptAny) => match cat {
432                     _ if is_emoji(ch) => Zwj,
433                     wd::WC_ALetter => Letter,           // rule WB5, WB7, WB10, WB13b
434                     wd::WC_Hebrew_Letter => HLetter,    // rule WB5, WB7, WB7c, WB10, WB13b
435                     wd::WC_Numeric => Numeric,          // rule WB8, WB9, WB11, WB13b
436                     wd::WC_Katakana => Katakana,                    // rule WB13, WB13b
437                     wd::WC_ExtendNumLet => ExtendNumLet,                    // rule WB13a
438                     wd::WC_Regional_Indicator => Regional(RegionalState::Unknown), // rule WB13c
439                     // rule WB4:
440                     wd::WC_Extend | wd::WC_Format | wd::WC_ZWJ => FormatExtend(AcceptAny),
441                     wd::WC_Single_Quote => {
442                         saveidx = idx;
443                         FormatExtend(AcceptQLetter)                         // rule WB7a
444                     },
445                     wd::WC_WSegSpace => WSegSpace,
446                     wd::WC_CR | wd::WC_LF | wd::WC_Newline => {
447                         if state == Start {
448                             if cat == wd::WC_LF {
449                                 idx -= match self.get_prev_cat(idx) {
450                                     Some(pcat) if pcat == wd::WC_CR => 1,   // rule WB3
451                                     _ => 0
452                                 };
453                             }
454                         } else {
455                             take_curr = false;
456                         }
457                         break;                                              // rule WB3a
458                     },
459                     _ => break                              // rule WB999
460                 },
461                 Zwj => match cat {                          // rule WB3c
462                     wd::WC_ZWJ => {
463                         FormatExtend(AcceptAny)
464                     }
465                     _ => {
466                         take_curr = false;
467                         break;
468                     }
469                 },
470                 WSegSpace => match cat {                          // rule WB3d
471                     wd::WC_WSegSpace if !skipped_format_extend => {
472                         WSegSpace
473                     }
474                     _ => {
475                         take_curr = false;
476                         break;
477                     }
478                 },
479                 Letter | HLetter => match cat {
480                     wd::WC_ALetter => Letter,               // rule WB5
481                     wd::WC_Hebrew_Letter => HLetter,        // rule WB5
482                     wd::WC_Numeric => Numeric,              // rule WB10
483                     wd::WC_ExtendNumLet => ExtendNumLet,    // rule WB13b
484                     wd::WC_Double_Quote if state == HLetter => {
485                         saveidx = previdx;
486                         FormatExtend(RequireHLetter)         // rule WB7c
487                     },
488                     wd::WC_MidLetter | wd::WC_MidNumLet | wd::WC_Single_Quote => {
489                         saveidx = previdx;
490                         FormatExtend(RequireLetter)          // rule WB7
491                     },
492                     _ => {
493                         take_curr = false;
494                         break;
495                     }
496                 },
497                 Numeric => match cat {
498                     wd::WC_Numeric => Numeric,              // rule WB8
499                     wd::WC_ALetter => Letter,               // rule WB9
500                     wd::WC_Hebrew_Letter => HLetter,        // rule WB9
501                     wd::WC_ExtendNumLet => ExtendNumLet,    // rule WB13b
502                     wd::WC_MidNum | wd::WC_MidNumLet | wd::WC_Single_Quote => {
503                         saveidx = previdx;
504                         FormatExtend(RequireNumeric)         // rule WB11
505                     },
506                     _ => {
507                         take_curr = false;
508                         break;
509                     }
510                 },
511                 Katakana => match cat {
512                     wd::WC_Katakana => Katakana,            // rule WB13
513                     wd::WC_ExtendNumLet => ExtendNumLet,    // rule WB13b
514                     _ => {
515                         take_curr = false;
516                         break;
517                     }
518                 },
519                 ExtendNumLet => match cat {
520                     wd::WC_ExtendNumLet => ExtendNumLet,    // rule WB13a
521                     wd::WC_ALetter => Letter,               // rule WB13a
522                     wd::WC_Hebrew_Letter => HLetter,        // rule WB13a
523                     wd::WC_Numeric => Numeric,              // rule WB13a
524                     wd::WC_Katakana => Katakana,            // rule WB13a
525                     _ => {
526                         take_curr = false;
527                         break;
528                     }
529                 },
530                 Regional(mut regional_state) => match cat {
531                     // rule WB13c
532                     wd::WC_Regional_Indicator => {
533                         if regional_state == RegionalState::Unknown {
534                             let count = self.string[..previdx]
535                                             .chars().rev()
536                                             .map(|c| wd::word_category(c))
537                                             .filter(|&c| ! (c == wd::WC_ZWJ || c == wd::WC_Extend || c == wd::WC_Format))
538                                             .take_while(|&c| c == wd::WC_Regional_Indicator)
539                                             .count();
540                             regional_state = if count % 2 == 0 {
541                                 RegionalState::Full
542                             } else {
543                                 RegionalState::Half
544                             };
545                         }
546                         if regional_state == RegionalState::Full {
547                             take_curr = false;
548                             break;
549                         } else {
550                             Regional(RegionalState::Full)
551                         }
552                     }
553                     _ => {
554                         take_curr = false;
555                         break;
556                     }
557                 },
558                 Emoji => {
559                     if is_emoji(ch) {           // rule WB3c
560                         Zwj
561                     } else {
562                         take_curr = false;
563                         break;
564                     }
565                 },
566                 FormatExtend(t) => match t {
567                     RequireNumeric if cat == wd::WC_Numeric => Numeric,          // rule WB12
568                     RequireLetter if cat == wd::WC_ALetter => Letter,            // rule WB6
569                     RequireLetter if cat == wd::WC_Hebrew_Letter => HLetter,     // rule WB6
570                     AcceptQLetter if cat == wd::WC_Hebrew_Letter => HLetter,     // rule WB7a
571                     RequireHLetter if cat == wd::WC_Hebrew_Letter => HLetter,    // rule WB7b
572                     _ => break  // backtrack will happens
573                 }
574             }
575         }
576 
577         if let FormatExtend(t) = state {
578             // if we required something but didn't find it, backtrack
579             if t == RequireLetter || t == RequireHLetter ||
580                 t == RequireNumeric || t == AcceptNone || t == AcceptQLetter {
581                 previdx = saveidx;
582                 take_cat = false;
583                 take_curr = false;
584             }
585         }
586 
587         self.catb = if take_curr {
588             None
589         } else {
590             idx = previdx;
591             if take_cat {
592                 Some(cat)
593             } else {
594                 None
595             }
596         };
597 
598         let retstr = &self.string[idx..];
599         self.string = &self.string[..idx];
600         Some(retstr)
601     }
602 }
603 
604 impl<'a> UWordBounds<'a> {
605     #[inline]
606     /// View the underlying data (the part yet to be iterated) as a slice of the original string.
607     ///
608     /// ```rust
609     /// # use unicode_segmentation::UnicodeSegmentation;
610     /// let mut iter = "Hello world".split_word_bounds();
611     /// assert_eq!(iter.as_str(), "Hello world");
612     /// iter.next();
613     /// assert_eq!(iter.as_str(), " world");
614     /// iter.next();
615     /// assert_eq!(iter.as_str(), "world");
616     /// ```
as_str(&self) -> &'a str617     pub fn as_str(&self) -> &'a str {
618         self.string
619     }
620 
621     #[inline]
get_next_cat(&self, idx: usize) -> Option<WordCat>622     fn get_next_cat(&self, idx: usize) -> Option<WordCat> {
623         use tables::word as wd;
624         let nidx = idx + self.string[idx..].chars().next().unwrap().len_utf8();
625         if nidx < self.string.len() {
626             let nch = self.string[nidx..].chars().next().unwrap();
627             Some(wd::word_category(nch))
628         } else {
629             None
630         }
631     }
632 
633     #[inline]
get_prev_cat(&self, idx: usize) -> Option<WordCat>634     fn get_prev_cat(&self, idx: usize) -> Option<WordCat> {
635         use tables::word as wd;
636         if idx > 0 {
637             let nch = self.string[..idx].chars().next_back().unwrap();
638             Some(wd::word_category(nch))
639         } else {
640             None
641         }
642     }
643 }
644 
645 #[inline]
new_word_bounds<'b>(s: &'b str) -> UWordBounds<'b>646 pub fn new_word_bounds<'b>(s: &'b str) -> UWordBounds<'b> {
647     UWordBounds { string: s, cat: None, catb: None }
648 }
649 
650 #[inline]
new_word_bound_indices<'b>(s: &'b str) -> UWordBoundIndices<'b>651 pub fn new_word_bound_indices<'b>(s: &'b str) -> UWordBoundIndices<'b> {
652     UWordBoundIndices { start_offset: s.as_ptr() as usize, iter: new_word_bounds(s) }
653 }
654 
655 #[inline]
new_unicode_words<'b>(s: &'b str) -> UnicodeWords<'b>656 pub fn new_unicode_words<'b>(s: &'b str) -> UnicodeWords<'b> {
657     use super::UnicodeSegmentation;
658     use tables::util::is_alphanumeric;
659 
660     fn has_alphanumeric(s: &&str) -> bool { s.chars().any(|c| is_alphanumeric(c)) }
661     let has_alphanumeric: fn(&&str) -> bool = has_alphanumeric; // coerce to fn pointer
662 
663     UnicodeWords { inner: s.split_word_bounds().filter(has_alphanumeric) }
664 }
665