1 // Copyright 2012-2014 The Rust Project Developers. See the COPYRIGHT
2 // file at the top-level directory of this distribution and at
3 // http://rust-lang.org/COPYRIGHT.
4 //
5 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
6 // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
7 // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
8 // option. This file may not be copied, modified, or distributed
9 // except according to those terms.
10
11 use core::cmp;
12
13 use crate::tables::grapheme::GraphemeCat;
14
15 /// External iterator for grapheme clusters and byte offsets.
16 ///
17 /// This struct is created by the [`grapheme_indices`] method on the [`UnicodeSegmentation`]
18 /// trait. See its documentation for more.
19 ///
20 /// [`grapheme_indices`]: trait.UnicodeSegmentation.html#tymethod.grapheme_indices
21 /// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html
22 #[derive(Clone)]
23 pub struct GraphemeIndices<'a> {
24 start_offset: usize,
25 iter: Graphemes<'a>,
26 }
27
28 impl<'a> GraphemeIndices<'a> {
29 #[inline]
30 /// View the underlying data (the part yet to be iterated) as a slice of the original string.
31 ///
32 /// ```rust
33 /// # use unicode_segmentation::UnicodeSegmentation;
34 /// let mut iter = "abc".grapheme_indices(true);
35 /// assert_eq!(iter.as_str(), "abc");
36 /// iter.next();
37 /// assert_eq!(iter.as_str(), "bc");
38 /// iter.next();
39 /// iter.next();
40 /// assert_eq!(iter.as_str(), "");
41 /// ```
as_str(&self) -> &'a str42 pub fn as_str(&self) -> &'a str {
43 self.iter.as_str()
44 }
45 }
46
47 impl<'a> Iterator for GraphemeIndices<'a> {
48 type Item = (usize, &'a str);
49
50 #[inline]
next(&mut self) -> Option<(usize, &'a str)>51 fn next(&mut self) -> Option<(usize, &'a str)> {
52 self.iter.next().map(|s| (s.as_ptr() as usize - self.start_offset, s))
53 }
54
55 #[inline]
size_hint(&self) -> (usize, Option<usize>)56 fn size_hint(&self) -> (usize, Option<usize>) {
57 self.iter.size_hint()
58 }
59 }
60
61 impl<'a> DoubleEndedIterator for GraphemeIndices<'a> {
62 #[inline]
next_back(&mut self) -> Option<(usize, &'a str)>63 fn next_back(&mut self) -> Option<(usize, &'a str)> {
64 self.iter.next_back().map(|s| (s.as_ptr() as usize - self.start_offset, s))
65 }
66 }
67
68 /// External iterator for a string's
69 /// [grapheme clusters](http://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundaries).
70 ///
71 /// This struct is created by the [`graphemes`] method on the [`UnicodeSegmentation`] trait. See its
72 /// documentation for more.
73 ///
74 /// [`graphemes`]: trait.UnicodeSegmentation.html#tymethod.graphemes
75 /// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html
76 #[derive(Clone, Debug)]
77 pub struct Graphemes<'a> {
78 string: &'a str,
79 cursor: GraphemeCursor,
80 cursor_back: GraphemeCursor,
81 }
82
83 impl<'a> Graphemes<'a> {
84 #[inline]
85 /// View the underlying data (the part yet to be iterated) as a slice of the original string.
86 ///
87 /// ```rust
88 /// # use unicode_segmentation::UnicodeSegmentation;
89 /// let mut iter = "abc".graphemes(true);
90 /// assert_eq!(iter.as_str(), "abc");
91 /// iter.next();
92 /// assert_eq!(iter.as_str(), "bc");
93 /// iter.next();
94 /// iter.next();
95 /// assert_eq!(iter.as_str(), "");
96 /// ```
as_str(&self) -> &'a str97 pub fn as_str(&self) -> &'a str {
98 &self.string[self.cursor.cur_cursor()..self.cursor_back.cur_cursor()]
99 }
100 }
101
102 impl<'a> Iterator for Graphemes<'a> {
103 type Item = &'a str;
104
105 #[inline]
size_hint(&self) -> (usize, Option<usize>)106 fn size_hint(&self) -> (usize, Option<usize>) {
107 let slen = self.cursor_back.cur_cursor() - self.cursor.cur_cursor();
108 (cmp::min(slen, 1), Some(slen))
109 }
110
111 #[inline]
next(&mut self) -> Option<&'a str>112 fn next(&mut self) -> Option<&'a str> {
113 let start = self.cursor.cur_cursor();
114 if start == self.cursor_back.cur_cursor() {
115 return None;
116 }
117 let next = self.cursor.next_boundary(self.string, 0).unwrap().unwrap();
118 Some(&self.string[start..next])
119 }
120 }
121
122 impl<'a> DoubleEndedIterator for Graphemes<'a> {
123 #[inline]
next_back(&mut self) -> Option<&'a str>124 fn next_back(&mut self) -> Option<&'a str> {
125 let end = self.cursor_back.cur_cursor();
126 if end == self.cursor.cur_cursor() {
127 return None;
128 }
129 let prev = self.cursor_back.prev_boundary(self.string, 0).unwrap().unwrap();
130 Some(&self.string[prev..end])
131 }
132 }
133
134 #[inline]
new_graphemes<'b>(s: &'b str, is_extended: bool) -> Graphemes<'b>135 pub fn new_graphemes<'b>(s: &'b str, is_extended: bool) -> Graphemes<'b> {
136 let len = s.len();
137 Graphemes {
138 string: s,
139 cursor: GraphemeCursor::new(0, len, is_extended),
140 cursor_back: GraphemeCursor::new(len, len, is_extended),
141 }
142 }
143
144 #[inline]
new_grapheme_indices<'b>(s: &'b str, is_extended: bool) -> GraphemeIndices<'b>145 pub fn new_grapheme_indices<'b>(s: &'b str, is_extended: bool) -> GraphemeIndices<'b> {
146 GraphemeIndices { start_offset: s.as_ptr() as usize, iter: new_graphemes(s, is_extended) }
147 }
148
// TODO: maybe unify with PairResult?
// What is currently known about the potential boundary at the cursor.
#[derive(PartialEq, Eq, Clone, Debug)]
enum GraphemeState {
    // Nothing has been determined yet.
    Unknown,
    // Determined: the cursor is not at a boundary.
    NotBreak,
    // Determined: the cursor is at a boundary.
    Break,
    // The codepoint after the cursor is a Regional Indicator Symbol, so this
    // is a boundary iff an even number of RIS codepoints precede it
    // (GB12, GB13).
    Regional,
    // The codepoint after the cursor is Extended_Pictographic, so the answer
    // depends on the preceding context as described by GB11.
    Emoji,
}
166
167 /// Cursor-based segmenter for grapheme clusters.
168 #[derive(Clone, Debug)]
169 pub struct GraphemeCursor {
170 // Current cursor position.
171 offset: usize,
172 // Total length of the string.
173 len: usize,
174 // A config flag indicating whether this cursor computes legacy or extended
175 // grapheme cluster boundaries (enables GB9a and GB9b if set).
176 is_extended: bool,
177 // Information about the potential boundary at `offset`
178 state: GraphemeState,
179 // Category of codepoint immediately preceding cursor, if known.
180 cat_before: Option<GraphemeCat>,
181 // Category of codepoint immediately after cursor, if known.
182 cat_after: Option<GraphemeCat>,
183 // If set, at least one more codepoint immediately preceding this offset
184 // is needed to resolve whether there's a boundary at `offset`.
185 pre_context_offset: Option<usize>,
186 // The number of RIS codepoints preceding `offset`. If `pre_context_offset`
187 // is set, then counts the number of RIS between that and `offset`, otherwise
188 // is an accurate count relative to the string.
189 ris_count: Option<usize>,
190 // Set if a call to `prev_boundary` or `next_boundary` was suspended due
191 // to needing more input.
192 resuming: bool,
193 // Cached grapheme category and associated scalar value range.
194 grapheme_cat_cache: (u32, u32, GraphemeCat),
195 }
196
/// An error return indicating that not enough content was available in the
/// provided chunk to satisfy the query, and that more content must be provided.
///
/// The type is `Clone` so that callers can keep or duplicate a request while
/// they fetch the missing text.
#[derive(PartialEq, Eq, Debug, Clone)]
pub enum GraphemeIncomplete {
    /// More pre-context is needed. The caller should call `provide_context`
    /// with a chunk ending at the offset given, then retry the query. This
    /// will only be returned if the `chunk_start` parameter is nonzero.
    PreContext(usize),

    /// When requesting `prev_boundary`, the cursor is moving past the beginning
    /// of the current chunk, so the chunk before that is requested. This will
    /// only be returned if the `chunk_start` parameter is nonzero.
    PrevChunk,

    /// When requesting `next_boundary`, the cursor is moving past the end of the
    /// current chunk, so the chunk after that is requested. This will only be
    /// returned if the chunk ends before the `len` parameter provided on
    /// creation of the cursor.
    NextChunk,

    /// An error returned when the chunk given does not contain the cursor position.
    InvalidOffset,
}
220
// Outcome of looking up a (before, after) pair of grapheme categories.
#[derive(PartialEq, Eq)]
enum PairResult {
    // Definitely not a break.
    NotBreak,
    // Definitely a break.
    Break,
    // A break iff the cursor is not in extended mode.
    Extended,
    // A break if preceded by an even number of RIS.
    Regional,
    // A break if preceded by emoji base and (Extend)*.
    Emoji,
}
230
231 #[inline]
check_pair(before: GraphemeCat, after: GraphemeCat) -> PairResult232 fn check_pair(before: GraphemeCat, after: GraphemeCat) -> PairResult {
233 use crate::tables::grapheme::GraphemeCat::*;
234 use self::PairResult::*;
235 match (before, after) {
236 (GC_CR, GC_LF) => NotBreak, // GB3
237 (GC_Control, _) => Break, // GB4
238 (GC_CR, _) => Break, // GB4
239 (GC_LF, _) => Break, // GB4
240 (_, GC_Control) => Break, // GB5
241 (_, GC_CR) => Break, // GB5
242 (_, GC_LF) => Break, // GB5
243 (GC_L, GC_L) => NotBreak, // GB6
244 (GC_L, GC_V) => NotBreak, // GB6
245 (GC_L, GC_LV) => NotBreak, // GB6
246 (GC_L, GC_LVT) => NotBreak, // GB6
247 (GC_LV, GC_V) => NotBreak, // GB7
248 (GC_LV, GC_T) => NotBreak, // GB7
249 (GC_V, GC_V) => NotBreak, // GB7
250 (GC_V, GC_T) => NotBreak, // GB7
251 (GC_LVT, GC_T) => NotBreak, // GB8
252 (GC_T, GC_T) => NotBreak, // GB8
253 (_, GC_Extend) => NotBreak, // GB9
254 (_, GC_ZWJ) => NotBreak, // GB9
255 (_, GC_SpacingMark) => Extended, // GB9a
256 (GC_Prepend, _) => Extended, // GB9b
257 (GC_ZWJ, GC_Extended_Pictographic) => Emoji, // GB11
258 (GC_Regional_Indicator, GC_Regional_Indicator) => Regional, // GB12, GB13
259 (_, _) => Break, // GB999
260 }
261 }
262
263 impl GraphemeCursor {
264 /// Create a new cursor. The string and initial offset are given at creation
265 /// time, but the contents of the string are not. The `is_extended` parameter
266 /// controls whether extended grapheme clusters are selected.
267 ///
268 /// The `offset` parameter must be on a codepoint boundary.
269 ///
270 /// ```rust
271 /// # use unicode_segmentation::GraphemeCursor;
272 /// let s = "हिन्दी";
273 /// let mut legacy = GraphemeCursor::new(0, s.len(), false);
274 /// assert_eq!(legacy.next_boundary(s, 0), Ok(Some("ह".len())));
275 /// let mut extended = GraphemeCursor::new(0, s.len(), true);
276 /// assert_eq!(extended.next_boundary(s, 0), Ok(Some("हि".len())));
277 /// ```
new(offset: usize, len: usize, is_extended: bool) -> GraphemeCursor278 pub fn new(offset: usize, len: usize, is_extended: bool) -> GraphemeCursor {
279 let state = if offset == 0 || offset == len {
280 GraphemeState::Break
281 } else {
282 GraphemeState::Unknown
283 };
284 GraphemeCursor {
285 offset: offset,
286 len: len,
287 state: state,
288 is_extended: is_extended,
289 cat_before: None,
290 cat_after: None,
291 pre_context_offset: None,
292 ris_count: None,
293 resuming: false,
294 grapheme_cat_cache: (0, 0, GraphemeCat::GC_Control),
295 }
296 }
297
grapheme_category(&mut self, ch: char) -> GraphemeCat298 fn grapheme_category(&mut self, ch: char) -> GraphemeCat {
299 use crate::tables::grapheme as gr;
300 use crate::tables::grapheme::GraphemeCat::*;
301
302 if ch <= '\u{7e}' {
303 // Special-case optimization for ascii, except U+007F. This
304 // improves performance even for many primarily non-ascii texts,
305 // due to use of punctuation and white space characters from the
306 // ascii range.
307 if ch >= '\u{20}' {
308 GC_Any
309 } else if ch == '\n' {
310 GC_LF
311 } else if ch == '\r' {
312 GC_CR
313 } else {
314 GC_Control
315 }
316 } else {
317 // If this char isn't within the cached range, update the cache to the
318 // range that includes it.
319 if (ch as u32) < self.grapheme_cat_cache.0 || (ch as u32) > self.grapheme_cat_cache.1 {
320 self.grapheme_cat_cache = gr::grapheme_category(ch);
321 }
322 self.grapheme_cat_cache.2
323 }
324 }
325
326 // Not sure I'm gonna keep this, the advantage over new() seems thin.
327
328 /// Set the cursor to a new location in the same string.
329 ///
330 /// ```rust
331 /// # use unicode_segmentation::GraphemeCursor;
332 /// let s = "abcd";
333 /// let mut cursor = GraphemeCursor::new(0, s.len(), false);
334 /// assert_eq!(cursor.cur_cursor(), 0);
335 /// cursor.set_cursor(2);
336 /// assert_eq!(cursor.cur_cursor(), 2);
337 /// ```
set_cursor(&mut self, offset: usize)338 pub fn set_cursor(&mut self, offset: usize) {
339 if offset != self.offset {
340 self.offset = offset;
341 self.state = if offset == 0 || offset == self.len {
342 GraphemeState::Break
343 } else {
344 GraphemeState::Unknown
345 };
346 // reset state derived from text around cursor
347 self.cat_before = None;
348 self.cat_after = None;
349 self.ris_count = None;
350 }
351 }
352
353 #[inline]
354 /// The current offset of the cursor. Equal to the last value provided to
355 /// `new()` or `set_cursor()`, or returned from `next_boundary()` or
356 /// `prev_boundary()`.
357 ///
358 /// ```rust
359 /// # use unicode_segmentation::GraphemeCursor;
360 /// // Two flags (), each flag is two RIS codepoints, each RIS is 4 bytes.
361 /// let flags = "\u{1F1F7}\u{1F1F8}\u{1F1EE}\u{1F1F4}";
362 /// let mut cursor = GraphemeCursor::new(4, flags.len(), false);
363 /// assert_eq!(cursor.cur_cursor(), 4);
364 /// assert_eq!(cursor.next_boundary(flags, 0), Ok(Some(8)));
365 /// assert_eq!(cursor.cur_cursor(), 8);
366 /// ```
cur_cursor(&self) -> usize367 pub fn cur_cursor(&self) -> usize {
368 self.offset
369 }
370
371 /// Provide additional pre-context when it is needed to decide a boundary.
372 /// The end of the chunk must coincide with the value given in the
373 /// `GraphemeIncomplete::PreContext` request.
374 ///
375 /// ```rust
376 /// # use unicode_segmentation::{GraphemeCursor, GraphemeIncomplete};
377 /// let flags = "\u{1F1F7}\u{1F1F8}\u{1F1EE}\u{1F1F4}";
378 /// let mut cursor = GraphemeCursor::new(8, flags.len(), false);
379 /// // Not enough pre-context to decide if there's a boundary between the two flags.
380 /// assert_eq!(cursor.is_boundary(&flags[8..], 8), Err(GraphemeIncomplete::PreContext(8)));
381 /// // Provide one more Regional Indicator Symbol of pre-context
382 /// cursor.provide_context(&flags[4..8], 4);
383 /// // Still not enough context to decide.
384 /// assert_eq!(cursor.is_boundary(&flags[8..], 8), Err(GraphemeIncomplete::PreContext(4)));
385 /// // Provide additional requested context.
386 /// cursor.provide_context(&flags[0..4], 0);
387 /// // That's enough to decide (it always is when context goes to the start of the string)
388 /// assert_eq!(cursor.is_boundary(&flags[8..], 8), Ok(true));
389 /// ```
provide_context(&mut self, chunk: &str, chunk_start: usize)390 pub fn provide_context(&mut self, chunk: &str, chunk_start: usize) {
391 use crate::tables::grapheme as gr;
392 assert!(chunk_start + chunk.len() == self.pre_context_offset.unwrap());
393 self.pre_context_offset = None;
394 if self.is_extended && chunk_start + chunk.len() == self.offset {
395 let ch = chunk.chars().rev().next().unwrap();
396 if self.grapheme_category(ch) == gr::GC_Prepend {
397 self.decide(false); // GB9b
398 return;
399 }
400 }
401 match self.state {
402 GraphemeState::Regional => self.handle_regional(chunk, chunk_start),
403 GraphemeState::Emoji => self.handle_emoji(chunk, chunk_start),
404 _ => if self.cat_before.is_none() && self.offset == chunk.len() + chunk_start {
405 let ch = chunk.chars().rev().next().unwrap();
406 self.cat_before = Some(self.grapheme_category(ch));
407 },
408 }
409 }
410
411 #[inline]
decide(&mut self, is_break: bool)412 fn decide(&mut self, is_break: bool) {
413 self.state = if is_break {
414 GraphemeState::Break
415 } else {
416 GraphemeState::NotBreak
417 };
418 }
419
420 #[inline]
decision(&mut self, is_break: bool) -> Result<bool, GraphemeIncomplete>421 fn decision(&mut self, is_break: bool) -> Result<bool, GraphemeIncomplete> {
422 self.decide(is_break);
423 Ok(is_break)
424 }
425
426 #[inline]
is_boundary_result(&self) -> Result<bool, GraphemeIncomplete>427 fn is_boundary_result(&self) -> Result<bool, GraphemeIncomplete> {
428 if self.state == GraphemeState::Break {
429 Ok(true)
430 } else if self.state == GraphemeState::NotBreak {
431 Ok(false)
432 } else if let Some(pre_context_offset) = self.pre_context_offset {
433 Err(GraphemeIncomplete::PreContext(pre_context_offset))
434 } else {
435 unreachable!("inconsistent state");
436 }
437 }
438
439 #[inline]
handle_regional(&mut self, chunk: &str, chunk_start: usize)440 fn handle_regional(&mut self, chunk: &str, chunk_start: usize) {
441 use crate::tables::grapheme as gr;
442 let mut ris_count = self.ris_count.unwrap_or(0);
443 for ch in chunk.chars().rev() {
444 if self.grapheme_category(ch) != gr::GC_Regional_Indicator {
445 self.ris_count = Some(ris_count);
446 self.decide((ris_count % 2) == 0);
447 return;
448 }
449 ris_count += 1;
450 }
451 self.ris_count = Some(ris_count);
452 if chunk_start == 0 {
453 self.decide((ris_count % 2) == 0);
454 return;
455 }
456 self.pre_context_offset = Some(chunk_start);
457 self.state = GraphemeState::Regional;
458 }
459
460 #[inline]
handle_emoji(&mut self, chunk: &str, chunk_start: usize)461 fn handle_emoji(&mut self, chunk: &str, chunk_start: usize) {
462 use crate::tables::grapheme as gr;
463 let mut iter = chunk.chars().rev();
464 if let Some(ch) = iter.next() {
465 if self.grapheme_category(ch) != gr::GC_ZWJ {
466 self.decide(true);
467 return;
468 }
469 }
470 for ch in iter {
471 match self.grapheme_category(ch) {
472 gr::GC_Extend => (),
473 gr::GC_Extended_Pictographic => {
474 self.decide(false);
475 return;
476 }
477 _ => {
478 self.decide(true);
479 return;
480 }
481 }
482 }
483 if chunk_start == 0 {
484 self.decide(true);
485 return;
486 }
487 self.pre_context_offset = Some(chunk_start);
488 self.state = GraphemeState::Emoji;
489 }
490
491 #[inline]
492 /// Determine whether the current cursor location is a grapheme cluster boundary.
493 /// Only a part of the string need be supplied. If `chunk_start` is nonzero or
494 /// the length of `chunk` is not equal to `len` on creation, then this method
495 /// may return `GraphemeIncomplete::PreContext`. The caller should then
496 /// call `provide_context` with the requested chunk, then retry calling this
497 /// method.
498 ///
499 /// For partial chunks, if the cursor is not at the beginning or end of the
500 /// string, the chunk should contain at least the codepoint following the cursor.
501 /// If the string is nonempty, the chunk must be nonempty.
502 ///
503 /// All calls should have consistent chunk contents (ie, if a chunk provides
504 /// content for a given slice, all further chunks covering that slice must have
505 /// the same content for it).
506 ///
507 /// ```rust
508 /// # use unicode_segmentation::GraphemeCursor;
509 /// let flags = "\u{1F1F7}\u{1F1F8}\u{1F1EE}\u{1F1F4}";
510 /// let mut cursor = GraphemeCursor::new(8, flags.len(), false);
511 /// assert_eq!(cursor.is_boundary(flags, 0), Ok(true));
512 /// cursor.set_cursor(12);
513 /// assert_eq!(cursor.is_boundary(flags, 0), Ok(false));
514 /// ```
is_boundary(&mut self, chunk: &str, chunk_start: usize) -> Result<bool, GraphemeIncomplete>515 pub fn is_boundary(&mut self, chunk: &str, chunk_start: usize) -> Result<bool, GraphemeIncomplete> {
516 use crate::tables::grapheme as gr;
517 if self.state == GraphemeState::Break {
518 return Ok(true)
519 }
520 if self.state == GraphemeState::NotBreak {
521 return Ok(false)
522 }
523 if self.offset < chunk_start || self.offset >= chunk_start + chunk.len() {
524 if self.offset > chunk_start + chunk.len() || self.cat_after.is_none() {
525 return Err(GraphemeIncomplete::InvalidOffset)
526 }
527 }
528 if let Some(pre_context_offset) = self.pre_context_offset {
529 return Err(GraphemeIncomplete::PreContext(pre_context_offset));
530 }
531 let offset_in_chunk = self.offset - chunk_start;
532 if self.cat_after.is_none() {
533 let ch = chunk[offset_in_chunk..].chars().next().unwrap();
534 self.cat_after = Some(self.grapheme_category(ch));
535 }
536 if self.offset == chunk_start {
537 let mut need_pre_context = true;
538 match self.cat_after.unwrap() {
539 gr::GC_Regional_Indicator => self.state = GraphemeState::Regional,
540 gr::GC_Extended_Pictographic => self.state = GraphemeState::Emoji,
541 _ => need_pre_context = self.cat_before.is_none(),
542 }
543 if need_pre_context {
544 self.pre_context_offset = Some(chunk_start);
545 return Err(GraphemeIncomplete::PreContext(chunk_start));
546 }
547 }
548 if self.cat_before.is_none() {
549 let ch = chunk[..offset_in_chunk].chars().rev().next().unwrap();
550 self.cat_before = Some(self.grapheme_category(ch));
551 }
552 match check_pair(self.cat_before.unwrap(), self.cat_after.unwrap()) {
553 PairResult::NotBreak => return self.decision(false),
554 PairResult::Break => return self.decision(true),
555 PairResult::Extended => {
556 let is_extended = self.is_extended;
557 return self.decision(!is_extended);
558 }
559 PairResult::Regional => {
560 if let Some(ris_count) = self.ris_count {
561 return self.decision((ris_count % 2) == 0);
562 }
563 self.handle_regional(&chunk[..offset_in_chunk], chunk_start);
564 self.is_boundary_result()
565 }
566 PairResult::Emoji => {
567 self.handle_emoji(&chunk[..offset_in_chunk], chunk_start);
568 self.is_boundary_result()
569 }
570 }
571 }
572
573 #[inline]
574 /// Find the next boundary after the current cursor position. Only a part of
575 /// the string need be supplied. If the chunk is incomplete, then this
576 /// method might return `GraphemeIncomplete::PreContext` or
577 /// `GraphemeIncomplete::NextChunk`. In the former case, the caller should
578 /// call `provide_context` with the requested chunk, then retry. In the
579 /// latter case, the caller should provide the chunk following the one
580 /// given, then retry.
581 ///
582 /// See `is_boundary` for expectations on the provided chunk.
583 ///
584 /// ```rust
585 /// # use unicode_segmentation::GraphemeCursor;
586 /// let flags = "\u{1F1F7}\u{1F1F8}\u{1F1EE}\u{1F1F4}";
587 /// let mut cursor = GraphemeCursor::new(4, flags.len(), false);
588 /// assert_eq!(cursor.next_boundary(flags, 0), Ok(Some(8)));
589 /// assert_eq!(cursor.next_boundary(flags, 0), Ok(Some(16)));
590 /// assert_eq!(cursor.next_boundary(flags, 0), Ok(None));
591 /// ```
592 ///
593 /// And an example that uses partial strings:
594 ///
595 /// ```rust
596 /// # use unicode_segmentation::{GraphemeCursor, GraphemeIncomplete};
597 /// let s = "abcd";
598 /// let mut cursor = GraphemeCursor::new(0, s.len(), false);
599 /// assert_eq!(cursor.next_boundary(&s[..2], 0), Ok(Some(1)));
600 /// assert_eq!(cursor.next_boundary(&s[..2], 0), Err(GraphemeIncomplete::NextChunk));
601 /// assert_eq!(cursor.next_boundary(&s[2..4], 2), Ok(Some(2)));
602 /// assert_eq!(cursor.next_boundary(&s[2..4], 2), Ok(Some(3)));
603 /// assert_eq!(cursor.next_boundary(&s[2..4], 2), Ok(Some(4)));
604 /// assert_eq!(cursor.next_boundary(&s[2..4], 2), Ok(None));
605 /// ```
next_boundary(&mut self, chunk: &str, chunk_start: usize) -> Result<Option<usize>, GraphemeIncomplete>606 pub fn next_boundary(&mut self, chunk: &str, chunk_start: usize) -> Result<Option<usize>, GraphemeIncomplete> {
607 if self.offset == self.len {
608 return Ok(None);
609 }
610 let mut iter = chunk[self.offset - chunk_start..].chars();
611 let mut ch = iter.next().unwrap();
612 loop {
613 if self.resuming {
614 if self.cat_after.is_none() {
615 self.cat_after = Some(self.grapheme_category(ch));
616 }
617 } else {
618 self.offset += ch.len_utf8();
619 self.state = GraphemeState::Unknown;
620 self.cat_before = self.cat_after.take();
621 if self.cat_before.is_none() {
622 self.cat_before = Some(self.grapheme_category(ch));
623 }
624 if self.cat_before.unwrap() == GraphemeCat::GC_Regional_Indicator {
625 self.ris_count = self.ris_count.map(|c| c + 1);
626 } else {
627 self.ris_count = Some(0);
628 }
629 if let Some(next_ch) = iter.next() {
630 ch = next_ch;
631 self.cat_after = Some(self.grapheme_category(ch));
632 } else if self.offset == self.len {
633 self.decide(true);
634 } else {
635 self.resuming = true;
636 return Err(GraphemeIncomplete::NextChunk);
637 }
638 }
639 self.resuming = true;
640 if self.is_boundary(chunk, chunk_start)? {
641 self.resuming = false;
642 return Ok(Some(self.offset));
643 }
644 self.resuming = false;
645 }
646 }
647
648 /// Find the previous boundary after the current cursor position. Only a part
649 /// of the string need be supplied. If the chunk is incomplete, then this
650 /// method might return `GraphemeIncomplete::PreContext` or
651 /// `GraphemeIncomplete::PrevChunk`. In the former case, the caller should
652 /// call `provide_context` with the requested chunk, then retry. In the
653 /// latter case, the caller should provide the chunk preceding the one
654 /// given, then retry.
655 ///
656 /// See `is_boundary` for expectations on the provided chunk.
657 ///
658 /// ```rust
659 /// # use unicode_segmentation::GraphemeCursor;
660 /// let flags = "\u{1F1F7}\u{1F1F8}\u{1F1EE}\u{1F1F4}";
661 /// let mut cursor = GraphemeCursor::new(12, flags.len(), false);
662 /// assert_eq!(cursor.prev_boundary(flags, 0), Ok(Some(8)));
663 /// assert_eq!(cursor.prev_boundary(flags, 0), Ok(Some(0)));
664 /// assert_eq!(cursor.prev_boundary(flags, 0), Ok(None));
665 /// ```
666 ///
667 /// And an example that uses partial strings (note the exact return is not
668 /// guaranteed, and may be `PrevChunk` or `PreContext` arbitrarily):
669 ///
670 /// ```rust
671 /// # use unicode_segmentation::{GraphemeCursor, GraphemeIncomplete};
672 /// let s = "abcd";
673 /// let mut cursor = GraphemeCursor::new(4, s.len(), false);
674 /// assert_eq!(cursor.prev_boundary(&s[2..4], 2), Ok(Some(3)));
675 /// assert_eq!(cursor.prev_boundary(&s[2..4], 2), Err(GraphemeIncomplete::PrevChunk));
676 /// assert_eq!(cursor.prev_boundary(&s[0..2], 0), Ok(Some(2)));
677 /// assert_eq!(cursor.prev_boundary(&s[0..2], 0), Ok(Some(1)));
678 /// assert_eq!(cursor.prev_boundary(&s[0..2], 0), Ok(Some(0)));
679 /// assert_eq!(cursor.prev_boundary(&s[0..2], 0), Ok(None));
680 /// ```
prev_boundary(&mut self, chunk: &str, chunk_start: usize) -> Result<Option<usize>, GraphemeIncomplete>681 pub fn prev_boundary(&mut self, chunk: &str, chunk_start: usize) -> Result<Option<usize>, GraphemeIncomplete> {
682 if self.offset == 0 {
683 return Ok(None);
684 }
685 if self.offset == chunk_start {
686 return Err(GraphemeIncomplete::PrevChunk);
687 }
688 let mut iter = chunk[..self.offset - chunk_start].chars().rev();
689 let mut ch = iter.next().unwrap();
690 loop {
691 if self.offset == chunk_start {
692 self.resuming = true;
693 return Err(GraphemeIncomplete::PrevChunk);
694 }
695 if self.resuming {
696 self.cat_before = Some(self.grapheme_category(ch));
697 } else {
698 self.offset -= ch.len_utf8();
699 self.cat_after = self.cat_before.take();
700 self.state = GraphemeState::Unknown;
701 if let Some(ris_count) = self.ris_count {
702 self.ris_count = if ris_count > 0 { Some(ris_count - 1) } else { None };
703 }
704 if let Some(prev_ch) = iter.next() {
705 ch = prev_ch;
706 self.cat_before = Some(self.grapheme_category(ch));
707 } else if self.offset == 0 {
708 self.decide(true);
709 } else {
710 self.resuming = true;
711 self.cat_after = Some(self.grapheme_category(ch));
712 return Err(GraphemeIncomplete::PrevChunk);
713 }
714 }
715 self.resuming = true;
716 if self.is_boundary(chunk, chunk_start)? {
717 self.resuming = false;
718 return Ok(Some(self.offset));
719 }
720 self.resuming = false;
721 }
722 }
723 }
724
// A boundary between two flags cannot be decided from a chunk that starts
// mid-flag; supplying the preceding RIS as pre-context resolves it.
#[test]
fn test_grapheme_cursor_ris_precontext() {
    let flags = "\u{1f1fa}\u{1f1f8}\u{1f1fa}\u{1f1f8}\u{1f1fa}\u{1f1f8}";
    let mut cursor = GraphemeCursor::new(8, flags.len(), true);
    let first_try = cursor.is_boundary(&flags[4..], 4);
    assert_eq!(first_try, Err(GraphemeIncomplete::PreContext(4)));
    cursor.provide_context(&flags[..4], 0);
    let second_try = cursor.is_boundary(&flags[4..], 4);
    assert_eq!(second_try, Ok(true));
}
733
// With the cursor at the very start of the chunk, the preceding codepoint
// must be requested; once CR is supplied, CR LF is not a boundary.
#[test]
fn test_grapheme_cursor_chunk_start_require_precontext() {
    let crlf = "\r\n";
    let mut cursor = GraphemeCursor::new(1, crlf.len(), true);
    let first_try = cursor.is_boundary(&crlf[1..], 1);
    assert_eq!(first_try, Err(GraphemeIncomplete::PreContext(1)));
    cursor.provide_context(&crlf[..1], 0);
    let second_try = cursor.is_boundary(&crlf[1..], 1);
    assert_eq!(second_try, Ok(false));
}
742
// Stepping back to the start of the chunk asks for the previous chunk, and
// the retry with that chunk reports the boundary at the chunk seam.
#[test]
fn test_grapheme_cursor_prev_boundary() {
    let text = "abcd";
    let mut cursor = GraphemeCursor::new(3, text.len(), true);
    let first_try = cursor.prev_boundary(&text[2..], 2);
    assert_eq!(first_try, Err(GraphemeIncomplete::PrevChunk));
    let second_try = cursor.prev_boundary(&text[..2], 0);
    assert_eq!(second_try, Ok(Some(2)));
}
750
// A cursor already at the start of the chunk asks for the previous chunk
// straight away; the retry then steps back one cluster.
#[test]
fn test_grapheme_cursor_prev_boundary_chunk_start() {
    let text = "abcd";
    let mut cursor = GraphemeCursor::new(2, text.len(), true);
    let first_try = cursor.prev_boundary(&text[2..], 2);
    assert_eq!(first_try, Err(GraphemeIncomplete::PrevChunk));
    let second_try = cursor.prev_boundary(&text[..2], 0);
    assert_eq!(second_try, Ok(Some(1)));
}
758