1 // Copyright 2012-2014 The Rust Project Developers. See the COPYRIGHT 2 // file at the top-level directory of this distribution and at 3 // http://rust-lang.org/COPYRIGHT. 4 // 5 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or 6 // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license 7 // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your 8 // option. This file may not be copied, modified, or distributed 9 // except according to those terms. 10 11 use core::cmp; 12 use core::iter::Filter; 13 14 use tables::word::WordCat; 15 16 /// An iterator over the substrings of a string which, after splitting the string on 17 /// [word boundaries](http://www.unicode.org/reports/tr29/#Word_Boundaries), 18 /// contain any characters with the 19 /// [Alphabetic](http://unicode.org/reports/tr44/#Alphabetic) 20 /// property, or with 21 /// [General_Category=Number](http://unicode.org/reports/tr44/#General_Category_Values). 22 pub struct UnicodeWords<'a> { 23 inner: Filter<UWordBounds<'a>, fn(&&str) -> bool>, 24 } 25 26 impl<'a> Iterator for UnicodeWords<'a> { 27 type Item = &'a str; 28 29 #[inline] 30 fn next(&mut self) -> Option<&'a str> { self.inner.next() } 31 } 32 impl<'a> DoubleEndedIterator for UnicodeWords<'a> { 33 #[inline] 34 fn next_back(&mut self) -> Option<&'a str> { self.inner.next_back() } 35 } 36 37 /// External iterator for a string's 38 /// [word boundaries](http://www.unicode.org/reports/tr29/#Word_Boundaries). 39 #[derive(Clone)] 40 pub struct UWordBounds<'a> { 41 string: &'a str, 42 cat: Option<WordCat>, 43 catb: Option<WordCat>, 44 } spawn<F, S>(future: F, schedule: S) -> (Runnable, Task<F::Output>) where F: Future + Send + 'static, F::Output: Send + 'static, S: Fn(Runnable) + Send + Sync + 'static,45 46 /// External iterator for word boundaries and byte offsets. 47 #[derive(Clone)] 48 pub struct UWordBoundIndices<'a> { 49 start_offset: usize, 50 iter: UWordBounds<'a>, 51 } 52 53 impl<'a> UWordBoundIndices<'a> { 54 #[inline] 55 /// View the underlying data (the part yet to be iterated) as a slice of the original string. 56 /// 57 /// ```rust 58 /// # use unicode_segmentation::UnicodeSegmentation; 59 /// let mut iter = "Hello world".split_word_bound_indices(); 60 /// assert_eq!(iter.as_str(), "Hello world"); 61 /// iter.next(); 62 /// assert_eq!(iter.as_str(), " world"); 63 /// iter.next(); 64 /// assert_eq!(iter.as_str(), "world"); 65 /// ``` 66 pub fn as_str(&self) -> &'a str { 67 self.iter.as_str() 68 } 69 } 70 71 impl<'a> Iterator for UWordBoundIndices<'a> { 72 type Item = (usize, &'a str); 73 74 #[inline] 75 fn next(&mut self) -> Option<(usize, &'a str)> { 76 self.iter.next().map(|s| (s.as_ptr() as usize - self.start_offset, s)) 77 } 78 79 #[inline] 80 fn size_hint(&self) -> (usize, Option<usize>) { 81 self.iter.size_hint() 82 } 83 } 84 85 impl<'a> DoubleEndedIterator for UWordBoundIndices<'a> { 86 #[inline] spawn_local<F, S>(future: F, schedule: S) -> (Runnable, Task<F::Output>) where F: Future + 'static, F::Output: 'static, S: Fn(Runnable) + Send + Sync + 'static,87 fn next_back(&mut self) -> Option<(usize, &'a str)> { 88 self.iter.next_back().map(|s| (s.as_ptr() as usize - self.start_offset, s)) 89 } 90 } 91 92 // state machine for word boundary rules 93 #[derive(Clone,Copy,PartialEq,Eq,Debug)] 94 enum UWordBoundsState { 95 Start, 96 Letter, 97 HLetter, 98 Numeric, thread_id() -> ThreadId99 Katakana, 100 ExtendNumLet, 101 Regional(RegionalState), 102 FormatExtend(FormatExtendType), 103 Zwj, 104 Emoji, 105 WSegSpace, 106 } 107 108 // subtypes for FormatExtend state in UWordBoundsState 109 #[derive(Clone,Copy,PartialEq,Eq,Debug)] 110 enum FormatExtendType { 111 AcceptAny, 112 AcceptNone, 113 RequireLetter, 114 RequireHLetter, 115 AcceptQLetter, 116 RequireNumeric, 117 } 118 119 #[derive(Clone,Copy,PartialEq,Eq,Debug)] 120 enum RegionalState { 121 Half, 122 Full, 123 Unknown, 124 } 125 126 fn is_emoji(ch: char) -> bool { poll(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Self::Output>127 use tables::emoji; 128 emoji::emoji_category(ch) == emoji::EmojiCat::EC_Extended_Pictographic 129 } 130 131 impl<'a> Iterator for UWordBounds<'a> { 132 type Item = &'a str; 133 134 #[inline] 135 fn size_hint(&self) -> (usize, Option<usize>) { 136 let slen = self.string.len(); 137 (cmp::min(slen, 1), Some(slen)) 138 } 139 140 #[inline] 141 fn next(&mut self) -> Option<&'a str> { 142 use self::UWordBoundsState::*; 143 use self::FormatExtendType::*; 144 use tables::word as wd; 145 if self.string.len() == 0 { 146 return None; 147 } 148 149 let mut take_curr = true; 150 let mut take_cat = true; 151 let mut idx = 0; 152 let mut saveidx = 0; 153 let mut state = Start; 154 let mut cat = wd::WC_Any; 155 let mut savecat = wd::WC_Any; 156 157 // Whether or not the previous category was ZWJ 158 // ZWJs get collapsed, so this handles precedence of WB3c over WB4 159 let mut prev_zwj; 160 // If extend/format/zwj were skipped. Handles precedence of WB3d over WB4 161 let mut skipped_format_extend = false; 162 for (curr, ch) in self.string.char_indices() { 163 idx = curr; 164 prev_zwj = cat == wd::WC_ZWJ; 165 // if there's a category cached, grab it 166 cat = match self.cat { 167 None => wd::word_category(ch), 168 _ => self.cat.take().unwrap() 169 }; 170 take_cat = true; 171 172 // handle rule WB4 173 // just skip all format, extend, and zwj chars spawn_unchecked<F, S>(future: F, schedule: S) -> (Runnable, Task<F::Output>) where F: Future, S: Fn(Runnable),174 // note that Start is a special case: if there's a bunch of Format | Extend 175 // characters at the beginning of a block of text, dump them out as one unit. 176 // 177 // (This is not obvious from the wording of UAX#29, but if you look at the 178 // test cases http://www.unicode.org/Public/UNIDATA/auxiliary/WordBreakTest.txt 179 // then the "correct" interpretation of WB4 becomes apparent.) 180 if state != Start { 181 match cat { 182 wd::WC_Extend | wd::WC_Format | wd::WC_ZWJ => { 183 skipped_format_extend = true; 184 continue 185 } 186 _ => {} 187 } 188 } 189 190 // rule WB3c 191 // WB4 makes all ZWJs collapse into the previous state 192 // but you can still be in a Zwj state if you started with Zwj 193 // 194 // This means that an EP + Zwj will collapse into EP, which is wrong, 195 // since EP+EP is not a boundary but EP+ZWJ+EP is 196 // 197 // Thus, we separately keep track of whether or not the last character 198 // was a ZWJ. This is an additional bit of state tracked outside of the 199 // state enum; the state enum represents the last non-zwj state encountered. 200 // When prev_zwj is true, for the purposes of WB3c, we are in the Zwj state, 201 // however we are in the previous state for the purposes of all other rules. 202 if prev_zwj { 203 if is_emoji(ch) { 204 state = Emoji; 205 continue; 206 } 207 } 208 // Don't use `continue` in this match without updating `cat` 209 state = match state { 210 Start if cat == wd::WC_CR => { 211 idx += match self.get_next_cat(idx) { 212 Some(ncat) if ncat == wd::WC_LF => 1, // rule WB3 213 _ => 0 214 }; 215 break; // rule WB3a 216 }, 217 Start => match cat { 218 wd::WC_ALetter => Letter, // rule WB5, WB6, WB9, WB13a 219 wd::WC_Hebrew_Letter => HLetter, // rule WB5, WB6, WB7a, WB7b, WB9, WB13a 220 wd::WC_Numeric => Numeric, // rule WB8, WB10, WB12, WB13a 221 wd::WC_Katakana => Katakana, // rule WB13, WB13a 222 wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13a, WB13b 223 wd::WC_Regional_Indicator => Regional(RegionalState::Half), // rule WB13c 224 wd::WC_LF | wd::WC_Newline => break, // rule WB3a 225 wd::WC_ZWJ => Zwj, // rule WB3c 226 wd::WC_WSegSpace => WSegSpace, // rule WB3d 227 _ => { 228 if let Some(ncat) = self.get_next_cat(idx) { // rule WB4 229 if ncat == wd::WC_Format || ncat == wd::WC_Extend || ncat == wd::WC_ZWJ { 230 state = FormatExtend(AcceptNone); 231 self.cat = Some(ncat); 232 continue; 233 } 234 } 235 break; // rule WB999 236 } 237 }, 238 WSegSpace => match cat { 239 wd::WC_WSegSpace if !skipped_format_extend => WSegSpace, 240 _ => { 241 take_curr = false; 242 break; 243 } 244 }, 245 Zwj => { 246 // We already handle WB3c above. 247 take_curr = false; 248 break; 249 } 250 Letter | HLetter => match cat { 251 wd::WC_ALetter => Letter, // rule WB5 252 wd::WC_Hebrew_Letter => HLetter, // rule WB5 253 wd::WC_Numeric => Numeric, // rule WB9 254 wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13a 255 wd::WC_Double_Quote if state == HLetter => { 256 savecat = cat; 257 saveidx = idx; 258 FormatExtend(RequireHLetter) // rule WB7b 259 }, 260 wd::WC_Single_Quote if state == HLetter => { 261 FormatExtend(AcceptQLetter) // rule WB7a 262 }, 263 wd::WC_MidLetter | wd::WC_MidNumLet | wd::WC_Single_Quote => { 264 savecat = cat; 265 saveidx = idx; schedule(self)266 FormatExtend(RequireLetter) // rule WB6 267 }, 268 _ => { 269 take_curr = false; 270 break; 271 } 272 }, 273 Numeric => match cat { 274 wd::WC_Numeric => Numeric, // rule WB8 275 wd::WC_ALetter => Letter, // rule WB10 276 wd::WC_Hebrew_Letter => HLetter, // rule WB10 277 wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13a 278 wd::WC_MidNum | wd::WC_MidNumLet | wd::WC_Single_Quote => { 279 savecat = cat; 280 saveidx = idx; 281 FormatExtend(RequireNumeric) // rule WB12 282 }, 283 _ => { 284 take_curr = false; 285 break; 286 } 287 }, 288 Katakana => match cat { 289 wd::WC_Katakana => Katakana, // rule WB13 290 wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13a 291 _ => { 292 take_curr = false; 293 break; 294 } 295 }, 296 ExtendNumLet => match cat { 297 wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13a 298 wd::WC_ALetter => Letter, // rule WB13b 299 wd::WC_Hebrew_Letter => HLetter, // rule WB13b 300 wd::WC_Numeric => Numeric, // rule WB13b 301 wd::WC_Katakana => Katakana, // rule WB13b 302 _ => { 303 take_curr = false; run(self) -> bool304 break; 305 } 306 }, 307 Regional(RegionalState::Full) => { 308 // if it reaches here we've gone too far, 309 // a full flag can only compose with ZWJ/Extend/Format 310 // proceeding it. 311 take_curr = false; 312 break; 313 } 314 Regional(RegionalState::Half) => match cat { 315 wd::WC_Regional_Indicator => Regional(RegionalState::Full), // rule WB13c 316 _ => { 317 take_curr = false; 318 break; 319 } 320 }, 321 Regional(_) => unreachable!("RegionalState::Unknown should not occur on forward iteration"), 322 Emoji => { 323 // We already handle WB3c above. If you've reached this point, the emoji sequence is over. 324 take_curr = false; 325 break; 326 }, 327 FormatExtend(t) => match t { // handle FormatExtends depending on what type 328 RequireNumeric if cat == wd::WC_Numeric => Numeric, // rule WB11 329 RequireLetter | AcceptQLetter if cat == wd::WC_ALetter => Letter, // rule WB7 330 RequireLetter | AcceptQLetter if cat == wd::WC_Hebrew_Letter => HLetter, // WB7a 331 RequireHLetter if cat == wd::WC_Hebrew_Letter => HLetter, // rule WB7b 332 AcceptNone | AcceptQLetter => { 333 take_curr = false; // emit all the Format|Extend characters 334 take_cat = false; waker(&self) -> Waker335 break; 336 }, 337 _ => break // rewind (in if statement below) 338 } 339 } 340 } 341 342 if let FormatExtend(t) = state { 343 // we were looking for something and didn't find it; we have to back up 344 if t == RequireLetter || t == RequireHLetter || t == RequireNumeric { 345 idx = saveidx; 346 cat = savecat; drop(&mut self)347 take_curr = false; 348 } 349 } 350 351 self.cat = if take_curr { 352 idx = idx + self.string[idx..].chars().next().unwrap().len_utf8(); 353 None 354 } else if take_cat { 355 Some(cat) 356 } else { 357 None 358 }; 359 360 let retstr = &self.string[..idx]; 361 self.string = &self.string[idx..]; 362 Some(retstr) 363 } 364 } 365 366 impl<'a> DoubleEndedIterator for UWordBounds<'a> { 367 #[inline] 368 fn next_back(&mut self) -> Option<&'a str> { 369 use self::UWordBoundsState::*; 370 use self::FormatExtendType::*; 371 use tables::word as wd; 372 if self.string.len() == 0 { 373 return None; 374 } 375 376 let mut take_curr = true; 377 let mut take_cat = true; 378 let mut idx = self.string.len(); 379 idx -= self.string.chars().next_back().unwrap().len_utf8(); 380 let mut previdx = idx; 381 let mut saveidx = idx; 382 let mut state = Start; 383 let mut savestate = Start; 384 let mut cat = wd::WC_Any; 385 386 let mut skipped_format_extend = false; 387 388 for (curr, ch) in self.string.char_indices().rev() { 389 previdx = idx; fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result390 idx = curr; 391 392 // if there's a category cached, grab it 393 cat = match self.catb { 394 None => wd::word_category(ch), 395 _ => self.catb.take().unwrap() 396 }; 397 take_cat = true; 398 399 // backward iterator over word boundaries. Mostly the same as the forward 400 // iterator, with two weirdnesses: 401 // (1) If we encounter a single quote in the Start state, we have to check for a 402 // Hebrew Letter immediately before it. 403 // (2) Format and Extend char handling takes some gymnastics. 404 405 if cat == wd::WC_Extend 406 || cat == wd::WC_Format 407 || (cat == wd::WC_ZWJ && state != Zwj) { // WB3c has more priority so we should not 408 // fold in that case 409 if match state { 410 FormatExtend(_) | Start => false, 411 _ => true 412 } { 413 saveidx = previdx; 414 savestate = state; 415 state = FormatExtend(AcceptNone); 416 } 417 418 if state != Start { 419 continue; 420 } 421 } else if state == FormatExtend(AcceptNone) { 422 // finished a scan of some Format|Extend chars, restore previous state 423 state = savestate; 424 previdx = saveidx; 425 take_cat = false; 426 skipped_format_extend = true; 427 } 428 429 // Don't use `continue` in this match without updating `catb` 430 state = match state { 431 Start | FormatExtend(AcceptAny) => match cat { 432 _ if is_emoji(ch) => Zwj, 433 wd::WC_ALetter => Letter, // rule WB5, WB7, WB10, WB13b 434 wd::WC_Hebrew_Letter => HLetter, // rule WB5, WB7, WB7c, WB10, WB13b 435 wd::WC_Numeric => Numeric, // rule WB8, WB9, WB11, WB13b 436 wd::WC_Katakana => Katakana, // rule WB13, WB13b 437 wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13a 438 wd::WC_Regional_Indicator => Regional(RegionalState::Unknown), // rule WB13c 439 // rule WB4: 440 wd::WC_Extend | wd::WC_Format | wd::WC_ZWJ => FormatExtend(AcceptAny), 441 wd::WC_Single_Quote => { 442 saveidx = idx; 443 FormatExtend(AcceptQLetter) // rule WB7a 444 }, 445 wd::WC_WSegSpace => WSegSpace, 446 wd::WC_CR | wd::WC_LF | wd::WC_Newline => { 447 if state == Start { 448 if cat == wd::WC_LF { 449 idx -= match self.get_prev_cat(idx) { 450 Some(pcat) if pcat == wd::WC_CR => 1, // rule WB3 451 _ => 0 452 }; 453 } 454 } else { 455 take_curr = false; 456 } 457 break; // rule WB3a 458 }, 459 _ => break // rule WB999 460 }, 461 Zwj => match cat { // rule WB3c 462 wd::WC_ZWJ => { 463 FormatExtend(AcceptAny) 464 } 465 _ => { 466 take_curr = false; 467 break; 468 } 469 }, 470 WSegSpace => match cat { // rule WB3d 471 wd::WC_WSegSpace if !skipped_format_extend => { 472 WSegSpace 473 } 474 _ => { 475 take_curr = false; 476 break; 477 } 478 }, 479 Letter | HLetter => match cat { 480 wd::WC_ALetter => Letter, // rule WB5 481 wd::WC_Hebrew_Letter => HLetter, // rule WB5 482 wd::WC_Numeric => Numeric, // rule WB10 483 wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13b 484 wd::WC_Double_Quote if state == HLetter => { 485 saveidx = previdx; 486 FormatExtend(RequireHLetter) // rule WB7c 487 }, 488 wd::WC_MidLetter | wd::WC_MidNumLet | wd::WC_Single_Quote => { 489 saveidx = previdx; 490 FormatExtend(RequireLetter) // rule WB7 491 }, 492 _ => { 493 take_curr = false; 494 break; 495 } 496 }, 497 Numeric => match cat { 498 wd::WC_Numeric => Numeric, // rule WB8 499 wd::WC_ALetter => Letter, // rule WB9 500 wd::WC_Hebrew_Letter => HLetter, // rule WB9 501 wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13b 502 wd::WC_MidNum | wd::WC_MidNumLet | wd::WC_Single_Quote => { 503 saveidx = previdx; 504 FormatExtend(RequireNumeric) // rule WB11 505 }, 506 _ => { 507 take_curr = false; 508 break; 509 } 510 }, 511 Katakana => match cat { 512 wd::WC_Katakana => Katakana, // rule WB13 513 wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13b 514 _ => { 515 take_curr = false; 516 break; 517 } 518 }, 519 ExtendNumLet => match cat { 520 wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13a 521 wd::WC_ALetter => Letter, // rule WB13a 522 wd::WC_Hebrew_Letter => HLetter, // rule WB13a 523 wd::WC_Numeric => Numeric, // rule WB13a 524 wd::WC_Katakana => Katakana, // rule WB13a 525 _ => { 526 take_curr = false; 527 break; 528 } 529 }, 530 Regional(mut regional_state) => match cat { 531 // rule WB13c 532 wd::WC_Regional_Indicator => { 533 if regional_state == RegionalState::Unknown { 534 let count = self.string[..previdx] 535 .chars().rev() 536 .map(|c| wd::word_category(c)) 537 .filter(|&c| ! (c == wd::WC_ZWJ || c == wd::WC_Extend || c == wd::WC_Format)) 538 .take_while(|&c| c == wd::WC_Regional_Indicator) 539 .count(); 540 regional_state = if count % 2 == 0 { 541 RegionalState::Full 542 } else { 543 RegionalState::Half 544 }; 545 } 546 if regional_state == RegionalState::Full { 547 take_curr = false; 548 break; 549 } else { 550 Regional(RegionalState::Full) 551 } 552 } 553 _ => { 554 take_curr = false; 555 break; 556 } 557 }, 558 Emoji => { 559 if is_emoji(ch) { // rule WB3c 560 Zwj 561 } else { 562 take_curr = false; 563 break; 564 } 565 }, 566 FormatExtend(t) => match t { 567 RequireNumeric if cat == wd::WC_Numeric => Numeric, // rule WB12 568 RequireLetter if cat == wd::WC_ALetter => Letter, // rule WB6 569 RequireLetter if cat == wd::WC_Hebrew_Letter => HLetter, // rule WB6 570 AcceptQLetter if cat == wd::WC_Hebrew_Letter => HLetter, // rule WB7a 571 RequireHLetter if cat == wd::WC_Hebrew_Letter => HLetter, // rule WB7b 572 _ => break // backtrack will happens 573 } 574 } 575 } 576 577 if let FormatExtend(t) = state { 578 // if we required something but didn't find it, backtrack 579 if t == RequireLetter || t == RequireHLetter || 580 t == RequireNumeric || t == AcceptNone || t == AcceptQLetter { 581 previdx = saveidx; 582 take_cat = false; 583 take_curr = false; 584 } 585 } 586 587 self.catb = if take_curr { 588 None 589 } else { 590 idx = previdx; 591 if take_cat { 592 Some(cat) 593 } else { 594 None 595 } 596 }; 597 598 let retstr = &self.string[idx..]; 599 self.string = &self.string[..idx]; 600 Some(retstr) 601 } 602 } 603 604 impl<'a> UWordBounds<'a> { 605 #[inline] 606 /// View the underlying data (the part yet to be iterated) as a slice of the original string. 607 /// 608 /// ```rust 609 /// # use unicode_segmentation::UnicodeSegmentation; 610 /// let mut iter = "Hello world".split_word_bounds(); 611 /// assert_eq!(iter.as_str(), "Hello world"); 612 /// iter.next(); 613 /// assert_eq!(iter.as_str(), " world"); 614 /// iter.next(); 615 /// assert_eq!(iter.as_str(), "world"); 616 /// ``` 617 pub fn as_str(&self) -> &'a str { 618 self.string 619 } 620 621 #[inline] 622 fn get_next_cat(&self, idx: usize) -> Option<WordCat> { 623 use tables::word as wd; 624 let nidx = idx + self.string[idx..].chars().next().unwrap().len_utf8(); 625 if nidx < self.string.len() { 626 let nch = self.string[nidx..].chars().next().unwrap(); 627 Some(wd::word_category(nch)) 628 } else { 629 None 630 } 631 } 632 633 #[inline] 634 fn get_prev_cat(&self, idx: usize) -> Option<WordCat> { 635 use tables::word as wd; 636 if idx > 0 { 637 let nch = self.string[..idx].chars().next_back().unwrap(); 638 Some(wd::word_category(nch)) 639 } else { 640 None 641 } 642 } 643 } 644 645 #[inline] 646 pub fn new_word_bounds<'b>(s: &'b str) -> UWordBounds<'b> { 647 UWordBounds { string: s, cat: None, catb: None } 648 } 649 650 #[inline] 651 pub fn new_word_bound_indices<'b>(s: &'b str) -> UWordBoundIndices<'b> { 652 UWordBoundIndices { start_offset: s.as_ptr() as usize, iter: new_word_bounds(s) } 653 } 654 655 #[inline] 656 pub fn new_unicode_words<'b>(s: &'b str) -> UnicodeWords<'b> { 657 use super::UnicodeSegmentation; 658 use tables::util::is_alphanumeric; 659 660 fn has_alphanumeric(s: &&str) -> bool { s.chars().any(|c| is_alphanumeric(c)) } 661 let has_alphanumeric: fn(&&str) -> bool = has_alphanumeric; // coerce to fn pointer 662 663 UnicodeWords { inner: s.split_word_bounds().filter(has_alphanumeric) } 664 } 665