1 // Copyright 2012-2014 The Rust Project Developers. See the COPYRIGHT
2 // file at the top-level directory of this distribution and at
3 // http://rust-lang.org/COPYRIGHT.
4 //
5 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
6 // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
7 // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
8 // option. This file may not be copied, modified, or distributed
9 // except according to those terms.
10
11 use core::cmp;
12 use core::iter::Filter;
13
14 use tables::word::WordCat;
15
/// An iterator over the substrings of a string which, after splitting the string on
/// [word boundaries](http://www.unicode.org/reports/tr29/#Word_Boundaries),
/// contain any characters with the
/// [Alphabetic](http://unicode.org/reports/tr44/#Alphabetic)
/// property, or with
/// [General_Category=Number](http://unicode.org/reports/tr44/#General_Category_Values).
///
/// This struct is created by the [`unicode_words`] method on the [`UnicodeSegmentation`] trait. See
/// its documentation for more.
///
/// [`unicode_words`]: trait.UnicodeSegmentation.html#tymethod.unicode_words
/// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html
pub struct UnicodeWords<'a> {
    // Word-boundary segments filtered through a plain function pointer, which
    // keeps the `Filter` type nameable here. `new_unicode_words` supplies the
    // predicate.
    inner: Filter<UWordBounds<'a>, fn(&&str) -> bool>,
}
31
32 impl<'a> Iterator for UnicodeWords<'a> {
33 type Item = &'a str;
34
35 #[inline]
next(&mut self) -> Option<&'a str>36 fn next(&mut self) -> Option<&'a str> { self.inner.next() }
37 }
38 impl<'a> DoubleEndedIterator for UnicodeWords<'a> {
39 #[inline]
next_back(&mut self) -> Option<&'a str>40 fn next_back(&mut self) -> Option<&'a str> { self.inner.next_back() }
41 }
42
/// External iterator for a string's
/// [word boundaries](http://www.unicode.org/reports/tr29/#Word_Boundaries).
///
/// This struct is created by the [`split_word_bounds`] method on the [`UnicodeSegmentation`]
/// trait. See its documentation for more.
///
/// [`split_word_bounds`]: trait.UnicodeSegmentation.html#tymethod.split_word_bounds
/// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html
#[derive(Clone)]
pub struct UWordBounds<'a> {
    // The part of the input not yet yielded; `next` trims it from the front
    // and `next_back` from the back.
    string: &'a str,
    // Category cached by `next` for the first character of `string`, so the
    // following call does not have to look it up again.
    cat: Option<WordCat>,
    // Category cached by `next_back` for the last character of `string`.
    catb: Option<WordCat>,
}
57
/// External iterator for word boundaries and byte offsets.
///
/// This struct is created by the [`split_word_bound_indices`] method on the
/// [`UnicodeSegmentation`] trait. See its documentation for more.
///
/// [`split_word_bound_indices`]: trait.UnicodeSegmentation.html#tymethod.split_word_bound_indices
/// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html
#[derive(Clone)]
pub struct UWordBoundIndices<'a> {
    // Address of the first byte of the original string, stored as an integer.
    // Subtracting it from a segment's pointer yields the segment's byte offset.
    start_offset: usize,
    // The underlying segment iterator.
    iter: UWordBounds<'a>,
}
70
71 impl<'a> UWordBoundIndices<'a> {
72 #[inline]
73 /// View the underlying data (the part yet to be iterated) as a slice of the original string.
74 ///
75 /// ```rust
76 /// # use unicode_segmentation::UnicodeSegmentation;
77 /// let mut iter = "Hello world".split_word_bound_indices();
78 /// assert_eq!(iter.as_str(), "Hello world");
79 /// iter.next();
80 /// assert_eq!(iter.as_str(), " world");
81 /// iter.next();
82 /// assert_eq!(iter.as_str(), "world");
83 /// ```
as_str(&self) -> &'a str84 pub fn as_str(&self) -> &'a str {
85 self.iter.as_str()
86 }
87 }
88
89 impl<'a> Iterator for UWordBoundIndices<'a> {
90 type Item = (usize, &'a str);
91
92 #[inline]
next(&mut self) -> Option<(usize, &'a str)>93 fn next(&mut self) -> Option<(usize, &'a str)> {
94 self.iter.next().map(|s| (s.as_ptr() as usize - self.start_offset, s))
95 }
96
97 #[inline]
size_hint(&self) -> (usize, Option<usize>)98 fn size_hint(&self) -> (usize, Option<usize>) {
99 self.iter.size_hint()
100 }
101 }
102
103 impl<'a> DoubleEndedIterator for UWordBoundIndices<'a> {
104 #[inline]
next_back(&mut self) -> Option<(usize, &'a str)>105 fn next_back(&mut self) -> Option<(usize, &'a str)> {
106 self.iter.next_back().map(|s| (s.as_ptr() as usize - self.start_offset, s))
107 }
108 }
109
// State machine for the word boundary rules. Both the forward (`next`) and
// backward (`next_back`) scans of `UWordBounds` start in `Start` and move
// between these states as they classify characters.
#[derive(Clone,Copy,PartialEq,Eq,Debug)]
enum UWordBoundsState {
    // Initial state of every scan; nothing has been classified yet.
    Start,
    // Entered on an ALetter character.
    Letter,
    // Entered on a Hebrew_Letter character.
    HLetter,
    // Entered on a Numeric character.
    Numeric,
    // Entered on a Katakana character.
    Katakana,
    // Entered on an ExtendNumLet character.
    ExtendNumLet,
    // Entered on a Regional_Indicator character; the payload tracks pairing.
    Regional(RegionalState),
    // Handling Format/Extend/ZWJ characters, or waiting for a character that
    // decides whether a mid-word character stays in the segment; the payload
    // says what is acceptable next.
    FormatExtend(FormatExtendType),
    // Forward scan: the segment started with a ZWJ. Backward scan: an emoji
    // was seen and a preceding ZWJ would join it (rule WB3c).
    Zwj,
    // Forward scan: an emoji directly followed a ZWJ (rule WB3c).
    Emoji,
    // Entered on a WSegSpace character (rule WB3d).
    WSegSpace,
}
125
// Subtypes for the FormatExtend state in UWordBoundsState.
#[derive(Clone,Copy,PartialEq,Eq,Debug)]
enum FormatExtendType {
    // Backward scan only: the scan began with Extend/Format/ZWJ characters (or
    // consumed a ZWJ in front of an emoji); handled by the same arm as `Start`.
    AcceptAny,
    // Forward scan: a character with no special category is followed by
    // Format/Extend/ZWJ characters, which are emitted with it.
    // Backward scan: a run of Format/Extend/ZWJ characters is being skipped
    // while the previous state is held in `savestate`.
    AcceptNone,
    // A MidLetter/MidNumLet/Single_Quote was seen next to a letter; an ALetter
    // or Hebrew_Letter must come next, otherwise the scan rewinds.
    RequireLetter,
    // A Double_Quote was seen next to a Hebrew_Letter; another Hebrew_Letter
    // must come next, otherwise the scan rewinds.
    RequireHLetter,
    // A Single_Quote next to a Hebrew_Letter (rule WB7a). The forward scan
    // keeps the quote even if no letter follows; the backward scan backtracks
    // unless a Hebrew_Letter precedes it.
    AcceptQLetter,
    // A MidNum/MidNumLet/Single_Quote was seen next to a Numeric; another
    // Numeric must come next, otherwise the scan rewinds.
    RequireNumeric,
}
136
// Pairing progress for Regional_Indicator characters (rule WB13c).
#[derive(Clone,Copy,PartialEq,Eq,Debug)]
enum RegionalState {
    // One indicator seen; it may still pair with one more.
    Half,
    // The pair is complete; no further indicator may join the segment.
    Full,
    // Backward scan only: pairing is not known yet and is worked out by
    // counting the indicators that precede the current position.
    Unknown,
}
143
is_emoji(ch: char) -> bool144 fn is_emoji(ch: char) -> bool {
145 use tables::emoji;
146 emoji::emoji_category(ch).2 == emoji::EmojiCat::EC_Extended_Pictographic
147 }
148
// Forward iteration: each call to `next` splits one segment off the front of
// `self.string`, driven by the `UWordBoundsState` state machine.
impl<'a> Iterator for UWordBounds<'a> {
    type Item = &'a str;

    /// Lower bound is 1 when the remaining string is non-empty (0 otherwise);
    /// upper bound is the remaining length in bytes.
    #[inline]
    fn size_hint(&self) -> (usize, Option<usize>) {
        let slen = self.string.len();
        (cmp::min(slen, 1), Some(slen))
    }

    /// Returns the next segment from the front of the remaining string, or
    /// `None` when nothing is left.
    ///
    /// The returned slice is removed from the front of `self.string`. When the
    /// scan stops on a character that belongs to the following segment, that
    /// character's category may be cached in `self.cat` for the next call.
    #[inline]
    fn next(&mut self) -> Option<&'a str> {
        use self::UWordBoundsState::*;
        use self::FormatExtendType::*;
        use tables::word as wd;
        if self.string.len() == 0 {
            return None;
        }

        // Whether the character at `idx` is included in the returned segment.
        let mut take_curr = true;
        // Whether `cat` may be cached in `self.cat` when the current character
        // is not taken.
        let mut take_cat = true;
        // Byte offset of the character currently being examined.
        let mut idx = 0;
        // Offset and category of a mid-word character (e.g. MidLetter), saved
        // so the scan can rewind to it if the required follow-up is missing.
        let mut saveidx = 0;
        let mut state = Start;
        let mut cat = wd::WC_Any;
        let mut savecat = wd::WC_Any;

        // Whether or not the previous category was ZWJ
        // ZWJs get collapsed, so this handles precedence of WB3c over WB4
        let mut prev_zwj;
        // If extend/format/zwj were skipped. Handles precedence of WB3d over WB4
        let mut skipped_format_extend = false;
        for (curr, ch) in self.string.char_indices() {
            idx = curr;
            // `cat` still holds the category from the previous iteration here.
            prev_zwj = cat == wd::WC_ZWJ;
            // if there's a category cached, grab it
            cat = match self.cat {
                None => wd::word_category(ch).2,
                _ => self.cat.take().unwrap()
            };
            take_cat = true;

            // handle rule WB4
            // just skip all format, extend, and zwj chars
            // note that Start is a special case: if there's a bunch of Format | Extend
            // characters at the beginning of a block of text, dump them out as one unit.
            //
            // (This is not obvious from the wording of UAX#29, but if you look at the
            // test cases http://www.unicode.org/Public/UNIDATA/auxiliary/WordBreakTest.txt
            // then the "correct" interpretation of WB4 becomes apparent.)
            if state != Start {
                match cat {
                    wd::WC_Extend | wd::WC_Format | wd::WC_ZWJ => {
                        skipped_format_extend = true;
                        continue
                    }
                    _ => {}
                }
            }

            // rule WB3c
            // WB4 makes all ZWJs collapse into the previous state
            // but you can still be in a Zwj state if you started with Zwj
            //
            // This means that an EP + Zwj will collapse into EP, which is wrong,
            // since EP+EP is not a boundary but EP+ZWJ+EP is
            //
            // Thus, we separately keep track of whether or not the last character
            // was a ZWJ. This is an additional bit of state tracked outside of the
            // state enum; the state enum represents the last non-zwj state encountered.
            // When prev_zwj is true, for the purposes of WB3c, we are in the Zwj state,
            // however we are in the previous state for the purposes of all other rules.
            if prev_zwj {
                if is_emoji(ch) {
                    state = Emoji;
                    continue;
                }
            }
            // Don't use `continue` in this match without updating `cat`
            state = match state {
                Start if cat == wd::WC_CR => {
                    // Step onto a directly following LF so that CR LF is
                    // emitted as a single segment.
                    idx += match self.get_next_cat(idx) {
                        Some(ncat) if ncat == wd::WC_LF => 1, // rule WB3
                        _ => 0
                    };
                    break; // rule WB3a
                },
                Start => match cat {
                    wd::WC_ALetter => Letter, // rule WB5, WB6, WB9, WB13a
                    wd::WC_Hebrew_Letter => HLetter, // rule WB5, WB6, WB7a, WB7b, WB9, WB13a
                    wd::WC_Numeric => Numeric, // rule WB8, WB10, WB12, WB13a
                    wd::WC_Katakana => Katakana, // rule WB13, WB13a
                    wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13a, WB13b
                    wd::WC_Regional_Indicator => Regional(RegionalState::Half), // rule WB13c
                    wd::WC_LF | wd::WC_Newline => break, // rule WB3a
                    wd::WC_ZWJ => Zwj, // rule WB3c
                    wd::WC_WSegSpace => WSegSpace, // rule WB3d
                    _ => {
                        // Any other category: it forms a segment of its own
                        // unless Format/Extend/ZWJ characters follow, which
                        // are then attached to it.
                        if let Some(ncat) = self.get_next_cat(idx) { // rule WB4
                            if ncat == wd::WC_Format || ncat == wd::WC_Extend || ncat == wd::WC_ZWJ {
                                state = FormatExtend(AcceptNone);
                                // The next iteration reuses the category just looked up.
                                self.cat = Some(ncat);
                                continue;
                            }
                        }
                        break; // rule WB999
                    }
                },
                WSegSpace => match cat {
                    wd::WC_WSegSpace if !skipped_format_extend => WSegSpace,
                    _ => {
                        take_curr = false;
                        break;
                    }
                },
                Zwj => {
                    // We already handle WB3c above.
                    take_curr = false;
                    break;
                }
                Letter | HLetter => match cat {
                    wd::WC_ALetter => Letter, // rule WB5
                    wd::WC_Hebrew_Letter => HLetter, // rule WB5
                    wd::WC_Numeric => Numeric, // rule WB9
                    wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13a
                    wd::WC_Double_Quote if state == HLetter => {
                        savecat = cat;
                        saveidx = idx;
                        FormatExtend(RequireHLetter) // rule WB7b
                    },
                    wd::WC_Single_Quote if state == HLetter => {
                        FormatExtend(AcceptQLetter) // rule WB7a
                    },
                    wd::WC_MidLetter | wd::WC_MidNumLet | wd::WC_Single_Quote => {
                        savecat = cat;
                        saveidx = idx;
                        FormatExtend(RequireLetter) // rule WB6
                    },
                    _ => {
                        take_curr = false;
                        break;
                    }
                },
                Numeric => match cat {
                    wd::WC_Numeric => Numeric, // rule WB8
                    wd::WC_ALetter => Letter, // rule WB10
                    wd::WC_Hebrew_Letter => HLetter, // rule WB10
                    wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13a
                    wd::WC_MidNum | wd::WC_MidNumLet | wd::WC_Single_Quote => {
                        savecat = cat;
                        saveidx = idx;
                        FormatExtend(RequireNumeric) // rule WB12
                    },
                    _ => {
                        take_curr = false;
                        break;
                    }
                },
                Katakana => match cat {
                    wd::WC_Katakana => Katakana, // rule WB13
                    wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13a
                    _ => {
                        take_curr = false;
                        break;
                    }
                },
                ExtendNumLet => match cat {
                    wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13a
                    wd::WC_ALetter => Letter, // rule WB13b
                    wd::WC_Hebrew_Letter => HLetter, // rule WB13b
                    wd::WC_Numeric => Numeric, // rule WB13b
                    wd::WC_Katakana => Katakana, // rule WB13b
                    _ => {
                        take_curr = false;
                        break;
                    }
                },
                Regional(RegionalState::Full) => {
                    // if it reaches here we've gone too far,
                    // a full flag can only compose with ZWJ/Extend/Format
                    // following it.
                    take_curr = false;
                    break;
                }
                Regional(RegionalState::Half) => match cat {
                    wd::WC_Regional_Indicator => Regional(RegionalState::Full), // rule WB13c
                    _ => {
                        take_curr = false;
                        break;
                    }
                },
                Regional(_) => unreachable!("RegionalState::Unknown should not occur on forward iteration"),
                Emoji => {
                    // We already handle WB3c above. If you've reached this point, the emoji sequence is over.
                    take_curr = false;
                    break;
                },
                FormatExtend(t) => match t { // handle FormatExtends depending on what type
                    RequireNumeric if cat == wd::WC_Numeric => Numeric, // rule WB11
                    RequireLetter | AcceptQLetter if cat == wd::WC_ALetter => Letter, // rule WB7
                    RequireLetter | AcceptQLetter if cat == wd::WC_Hebrew_Letter => HLetter, // WB7a
                    RequireHLetter if cat == wd::WC_Hebrew_Letter => HLetter, // rule WB7b
                    AcceptNone | AcceptQLetter => {
                        take_curr = false; // emit all the Format|Extend characters
                        take_cat = false;
                        break;
                    },
                    _ => break // rewind (in if statement below)
                }
            }
        }

        if let FormatExtend(t) = state {
            // we were looking for something and didn't find it; we have to back up
            // to the saved mid-word character, which then starts the next segment
            if t == RequireLetter || t == RequireHLetter || t == RequireNumeric {
                idx = saveidx;
                cat = savecat;
                take_curr = false;
            }
        }

        self.cat = if take_curr {
            // The current character belongs to this segment: move `idx` past
            // it; there is nothing to cache.
            idx = idx + self.string[idx..].chars().next().unwrap().len_utf8();
            None
        } else if take_cat {
            // The current character starts the next segment; remember its category.
            Some(cat)
        } else {
            None
        };

        // Split at `idx`: everything before it is returned, the rest remains.
        let retstr = &self.string[..idx];
        self.string = &self.string[idx..];
        Some(retstr)
    }
}
383
// Backward iteration: each call to `next_back` splits one segment off the end
// of `self.string`, walking the characters in reverse.
impl<'a> DoubleEndedIterator for UWordBounds<'a> {
    /// Returns the last segment of the remaining string, or `None` when
    /// nothing is left.
    ///
    /// The returned slice is removed from the end of `self.string`. When the
    /// scan stops on a character that belongs to the preceding segment, that
    /// character's category may be cached in `self.catb` for the next call.
    #[inline]
    fn next_back(&mut self) -> Option<&'a str> {
        use self::UWordBoundsState::*;
        use self::FormatExtendType::*;
        use tables::word as wd;
        if self.string.len() == 0 {
            return None;
        }

        // Whether the character at `idx` is included in the returned segment.
        let mut take_curr = true;
        // Whether `cat` may be cached in `self.catb` when the current
        // character is not taken.
        let mut take_cat = true;
        // Start at the byte offset of the last character.
        let mut idx = self.string.len();
        idx -= self.string.chars().next_back().unwrap().len_utf8();
        // Offset examined in the previous loop iteration (the character that
        // follows the current one in the string).
        let mut previdx = idx;
        // Offset to fall back to when a tentative extension has to be undone.
        let mut saveidx = idx;
        let mut state = Start;
        // State held aside while a run of Format/Extend/ZWJ characters is skipped.
        let mut savestate = Start;
        let mut cat = wd::WC_Any;

        // Set once a run of Format/Extend/ZWJ characters has been skipped;
        // consulted by the WSegSpace arm (rule WB3d).
        let mut skipped_format_extend = false;

        for (curr, ch) in self.string.char_indices().rev() {
            previdx = idx;
            idx = curr;

            // if there's a category cached, grab it
            cat = match self.catb {
                None => wd::word_category(ch).2,
                _ => self.catb.take().unwrap()
            };
            take_cat = true;

            // backward iterator over word boundaries. Mostly the same as the forward
            // iterator, with two weirdnesses:
            // (1) If we encounter a single quote in the Start state, we have to check for a
            //     Hebrew Letter immediately before it.
            // (2) Format and Extend char handling takes some gymnastics.

            if cat == wd::WC_Extend
                || cat == wd::WC_Format
                || (cat == wd::WC_ZWJ && state != Zwj) { // WB3c has more priority so we should not
                                                         // fold in that case
                // Entering a Format/Extend/ZWJ run from a regular state:
                // remember where we were so it can be restored afterwards.
                if match state {
                    FormatExtend(_) | Start => false,
                    _ => true
                } {
                    saveidx = previdx;
                    savestate = state;
                    state = FormatExtend(AcceptNone);
                }

                // In the Start state, fall through to the match below.
                if state != Start {
                    continue;
                }
            } else if state == FormatExtend(AcceptNone) {
                // finished a scan of some Format|Extend chars, restore previous state
                state = savestate;
                previdx = saveidx;
                take_cat = false;
                skipped_format_extend = true;
            }

            // Don't use `continue` in this match without updating `catb`
            state = match state {
                Start | FormatExtend(AcceptAny) => match cat {
                    // An emoji may be joined to a preceding ZWJ; the Zwj arm
                    // below checks for it.
                    _ if is_emoji(ch) => Zwj,
                    wd::WC_ALetter => Letter, // rule WB5, WB7, WB10, WB13b
                    wd::WC_Hebrew_Letter => HLetter, // rule WB5, WB7, WB7c, WB10, WB13b
                    wd::WC_Numeric => Numeric, // rule WB8, WB9, WB11, WB13b
                    wd::WC_Katakana => Katakana, // rule WB13, WB13b
                    wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13a
                    wd::WC_Regional_Indicator => Regional(RegionalState::Unknown), // rule WB13c
                    // rule WB4:
                    wd::WC_Extend | wd::WC_Format | wd::WC_ZWJ => FormatExtend(AcceptAny),
                    wd::WC_Single_Quote => {
                        saveidx = idx;
                        FormatExtend(AcceptQLetter) // rule WB7a
                    },
                    wd::WC_WSegSpace => WSegSpace,
                    wd::WC_CR | wd::WC_LF | wd::WC_Newline => {
                        if state == Start {
                            if cat == wd::WC_LF {
                                // Step back onto a directly preceding CR so
                                // that CR LF is emitted as a single segment.
                                idx -= match self.get_prev_cat(idx) {
                                    Some(pcat) if pcat == wd::WC_CR => 1, // rule WB3
                                    _ => 0
                                };
                            }
                        } else {
                            take_curr = false;
                        }
                        break; // rule WB3a
                    },
                    _ => break // rule WB999
                },
                Zwj => match cat { // rule WB3c
                    wd::WC_ZWJ => {
                        FormatExtend(AcceptAny)
                    }
                    _ => {
                        take_curr = false;
                        break;
                    }
                },
                WSegSpace => match cat { // rule WB3d
                    wd::WC_WSegSpace if !skipped_format_extend => {
                        WSegSpace
                    }
                    _ => {
                        take_curr = false;
                        break;
                    }
                },
                Letter | HLetter => match cat {
                    wd::WC_ALetter => Letter, // rule WB5
                    wd::WC_Hebrew_Letter => HLetter, // rule WB5
                    wd::WC_Numeric => Numeric, // rule WB10
                    wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13b
                    wd::WC_Double_Quote if state == HLetter => {
                        saveidx = previdx;
                        FormatExtend(RequireHLetter) // rule WB7c
                    },
                    wd::WC_MidLetter | wd::WC_MidNumLet | wd::WC_Single_Quote => {
                        saveidx = previdx;
                        FormatExtend(RequireLetter) // rule WB7
                    },
                    _ => {
                        take_curr = false;
                        break;
                    }
                },
                Numeric => match cat {
                    wd::WC_Numeric => Numeric, // rule WB8
                    wd::WC_ALetter => Letter, // rule WB9
                    wd::WC_Hebrew_Letter => HLetter, // rule WB9
                    wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13b
                    wd::WC_MidNum | wd::WC_MidNumLet | wd::WC_Single_Quote => {
                        saveidx = previdx;
                        FormatExtend(RequireNumeric) // rule WB11
                    },
                    _ => {
                        take_curr = false;
                        break;
                    }
                },
                Katakana => match cat {
                    wd::WC_Katakana => Katakana, // rule WB13
                    wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13b
                    _ => {
                        take_curr = false;
                        break;
                    }
                },
                ExtendNumLet => match cat {
                    wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13a
                    wd::WC_ALetter => Letter, // rule WB13a
                    wd::WC_Hebrew_Letter => HLetter, // rule WB13a
                    wd::WC_Numeric => Numeric, // rule WB13a
                    wd::WC_Katakana => Katakana, // rule WB13a
                    _ => {
                        take_curr = false;
                        break;
                    }
                },
                Regional(mut regional_state) => match cat {
                    // rule WB13c
                    wd::WC_Regional_Indicator => {
                        if regional_state == RegionalState::Unknown {
                            // Count the consecutive Regional_Indicator
                            // characters at the end of `self.string[..previdx]`,
                            // ignoring ZWJ/Extend/Format. The parity decides
                            // whether the indicator already consumed pairs
                            // with the current one.
                            let count = self.string[..previdx]
                                            .chars().rev()
                                            .map(|c| wd::word_category(c).2)
                                            .filter(|&c| ! (c == wd::WC_ZWJ || c == wd::WC_Extend || c == wd::WC_Format))
                                            .take_while(|&c| c == wd::WC_Regional_Indicator)
                                            .count();
                            regional_state = if count % 2 == 0 {
                                RegionalState::Full
                            } else {
                                RegionalState::Half
                            };
                        }
                        if regional_state == RegionalState::Full {
                            take_curr = false;
                            break;
                        } else {
                            Regional(RegionalState::Full)
                        }
                    }
                    _ => {
                        take_curr = false;
                        break;
                    }
                },
                Emoji => {
                    if is_emoji(ch) { // rule WB3c
                        Zwj
                    } else {
                        take_curr = false;
                        break;
                    }
                },
                FormatExtend(t) => match t {
                    RequireNumeric if cat == wd::WC_Numeric => Numeric, // rule WB12
                    RequireLetter if cat == wd::WC_ALetter => Letter, // rule WB6
                    RequireLetter if cat == wd::WC_Hebrew_Letter => HLetter, // rule WB6
                    AcceptQLetter if cat == wd::WC_Hebrew_Letter => HLetter, // rule WB7a
                    RequireHLetter if cat == wd::WC_Hebrew_Letter => HLetter, // rule WB7b
                    _ => break // backtracking happens in the `if let` below
                }
            }
        }

        if let FormatExtend(t) = state {
            // if we required something but didn't find it, backtrack
            if t == RequireLetter || t == RequireHLetter ||
                t == RequireNumeric || t == AcceptNone || t == AcceptQLetter {
                previdx = saveidx;
                take_cat = false;
                take_curr = false;
            }
        }

        self.catb = if take_curr {
            // The current character belongs to this segment; nothing to cache.
            None
        } else {
            // The segment starts at the previously examined (or saved) offset.
            idx = previdx;
            if take_cat {
                Some(cat)
            } else {
                None
            }
        };

        // Split at `idx`: everything from it onwards is returned, the front remains.
        let retstr = &self.string[idx..];
        self.string = &self.string[..idx];
        Some(retstr)
    }
}
621
622 impl<'a> UWordBounds<'a> {
623 #[inline]
624 /// View the underlying data (the part yet to be iterated) as a slice of the original string.
625 ///
626 /// ```rust
627 /// # use unicode_segmentation::UnicodeSegmentation;
628 /// let mut iter = "Hello world".split_word_bounds();
629 /// assert_eq!(iter.as_str(), "Hello world");
630 /// iter.next();
631 /// assert_eq!(iter.as_str(), " world");
632 /// iter.next();
633 /// assert_eq!(iter.as_str(), "world");
634 /// ```
as_str(&self) -> &'a str635 pub fn as_str(&self) -> &'a str {
636 self.string
637 }
638
639 #[inline]
get_next_cat(&self, idx: usize) -> Option<WordCat>640 fn get_next_cat(&self, idx: usize) -> Option<WordCat> {
641 use tables::word as wd;
642 let nidx = idx + self.string[idx..].chars().next().unwrap().len_utf8();
643 if nidx < self.string.len() {
644 let nch = self.string[nidx..].chars().next().unwrap();
645 Some(wd::word_category(nch).2)
646 } else {
647 None
648 }
649 }
650
651 #[inline]
get_prev_cat(&self, idx: usize) -> Option<WordCat>652 fn get_prev_cat(&self, idx: usize) -> Option<WordCat> {
653 use tables::word as wd;
654 if idx > 0 {
655 let nch = self.string[..idx].chars().next_back().unwrap();
656 Some(wd::word_category(nch).2)
657 } else {
658 None
659 }
660 }
661 }
662
663 #[inline]
new_word_bounds<'b>(s: &'b str) -> UWordBounds<'b>664 pub fn new_word_bounds<'b>(s: &'b str) -> UWordBounds<'b> {
665 UWordBounds { string: s, cat: None, catb: None }
666 }
667
668 #[inline]
new_word_bound_indices<'b>(s: &'b str) -> UWordBoundIndices<'b>669 pub fn new_word_bound_indices<'b>(s: &'b str) -> UWordBoundIndices<'b> {
670 UWordBoundIndices { start_offset: s.as_ptr() as usize, iter: new_word_bounds(s) }
671 }
672
673 #[inline]
new_unicode_words<'b>(s: &'b str) -> UnicodeWords<'b>674 pub fn new_unicode_words<'b>(s: &'b str) -> UnicodeWords<'b> {
675 use super::UnicodeSegmentation;
676 use tables::util::is_alphanumeric;
677
678 fn has_alphanumeric(s: &&str) -> bool { s.chars().any(|c| is_alphanumeric(c)) }
679 let has_alphanumeric: fn(&&str) -> bool = has_alphanumeric; // coerce to fn pointer
680
681 UnicodeWords { inner: s.split_word_bounds().filter(has_alphanumeric) }
682 }
683