1 // Copyright 2012-2014 The Rust Project Developers. See the COPYRIGHT
2 // file at the top-level directory of this distribution and at
3 // http://rust-lang.org/COPYRIGHT.
4 //
5 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
6 // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
7 // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
8 // option. This file may not be copied, modified, or distributed
9 // except according to those terms.
10 
11 use core::cmp;
12 
13 use crate::tables::grapheme::GraphemeCat;
14 
15 /// External iterator for grapheme clusters and byte offsets.
16 ///
17 /// This struct is created by the [`grapheme_indices`] method on the [`UnicodeSegmentation`]
18 /// trait. See its documentation for more.
19 ///
20 /// [`grapheme_indices`]: trait.UnicodeSegmentation.html#tymethod.grapheme_indices
21 /// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html
22 #[derive(Clone)]
23 pub struct GraphemeIndices<'a> {
24     start_offset: usize,
25     iter: Graphemes<'a>,
26 }
27 
28 impl<'a> GraphemeIndices<'a> {
29     #[inline]
30     /// View the underlying data (the part yet to be iterated) as a slice of the original string.
31     ///
32     /// ```rust
33     /// # use unicode_segmentation::UnicodeSegmentation;
34     /// let mut iter = "abc".grapheme_indices(true);
35     /// assert_eq!(iter.as_str(), "abc");
36     /// iter.next();
37     /// assert_eq!(iter.as_str(), "bc");
38     /// iter.next();
39     /// iter.next();
40     /// assert_eq!(iter.as_str(), "");
41     /// ```
as_str(&self) -> &'a str42     pub fn as_str(&self) -> &'a str {
43         self.iter.as_str()
44     }
45 }
46 
47 impl<'a> Iterator for GraphemeIndices<'a> {
48     type Item = (usize, &'a str);
49 
50     #[inline]
next(&mut self) -> Option<(usize, &'a str)>51     fn next(&mut self) -> Option<(usize, &'a str)> {
52         self.iter.next().map(|s| (s.as_ptr() as usize - self.start_offset, s))
53     }
54 
55     #[inline]
size_hint(&self) -> (usize, Option<usize>)56     fn size_hint(&self) -> (usize, Option<usize>) {
57         self.iter.size_hint()
58     }
59 }
60 
61 impl<'a> DoubleEndedIterator for GraphemeIndices<'a> {
62     #[inline]
next_back(&mut self) -> Option<(usize, &'a str)>63     fn next_back(&mut self) -> Option<(usize, &'a str)> {
64         self.iter.next_back().map(|s| (s.as_ptr() as usize - self.start_offset, s))
65     }
66 }
67 
/// External iterator for a string's
/// [grapheme clusters](http://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundaries).
///
/// This struct is created by the [`graphemes`] method on the [`UnicodeSegmentation`] trait. See its
/// documentation for more.
///
/// [`graphemes`]: trait.UnicodeSegmentation.html#tymethod.graphemes
/// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html
#[derive(Clone, Debug)]
pub struct Graphemes<'a> {
    // The complete string being segmented; yielded clusters are slices of it.
    string: &'a str,
    // Front cursor: starts at offset 0 and is advanced by `next`.
    cursor: GraphemeCursor,
    // Back cursor: starts at the end of the string and is moved backwards by
    // `next_back`. Iteration is finished when both cursors meet.
    cursor_back: GraphemeCursor,
}
82 
83 impl<'a> Graphemes<'a> {
84     #[inline]
85     /// View the underlying data (the part yet to be iterated) as a slice of the original string.
86     ///
87     /// ```rust
88     /// # use unicode_segmentation::UnicodeSegmentation;
89     /// let mut iter = "abc".graphemes(true);
90     /// assert_eq!(iter.as_str(), "abc");
91     /// iter.next();
92     /// assert_eq!(iter.as_str(), "bc");
93     /// iter.next();
94     /// iter.next();
95     /// assert_eq!(iter.as_str(), "");
96     /// ```
as_str(&self) -> &'a str97     pub fn as_str(&self) -> &'a str {
98         &self.string[self.cursor.cur_cursor()..self.cursor_back.cur_cursor()]
99     }
100 }
101 
102 impl<'a> Iterator for Graphemes<'a> {
103     type Item = &'a str;
104 
105     #[inline]
size_hint(&self) -> (usize, Option<usize>)106     fn size_hint(&self) -> (usize, Option<usize>) {
107         let slen = self.cursor_back.cur_cursor() - self.cursor.cur_cursor();
108         (cmp::min(slen, 1), Some(slen))
109     }
110 
111     #[inline]
next(&mut self) -> Option<&'a str>112     fn next(&mut self) -> Option<&'a str> {
113         let start = self.cursor.cur_cursor();
114         if start == self.cursor_back.cur_cursor() {
115             return None;
116         }
117         let next = self.cursor.next_boundary(self.string, 0).unwrap().unwrap();
118         Some(&self.string[start..next])
119     }
120 }
121 
122 impl<'a> DoubleEndedIterator for Graphemes<'a> {
123     #[inline]
next_back(&mut self) -> Option<&'a str>124     fn next_back(&mut self) -> Option<&'a str> {
125         let end = self.cursor_back.cur_cursor();
126         if end == self.cursor.cur_cursor() {
127             return None;
128         }
129         let prev = self.cursor_back.prev_boundary(self.string, 0).unwrap().unwrap();
130         Some(&self.string[prev..end])
131     }
132 }
133 
134 #[inline]
new_graphemes<'b>(s: &'b str, is_extended: bool) -> Graphemes<'b>135 pub fn new_graphemes<'b>(s: &'b str, is_extended: bool) -> Graphemes<'b> {
136     let len = s.len();
137     Graphemes {
138         string: s,
139         cursor: GraphemeCursor::new(0, len, is_extended),
140         cursor_back: GraphemeCursor::new(len, len, is_extended),
141     }
142 }
143 
144 #[inline]
new_grapheme_indices<'b>(s: &'b str, is_extended: bool) -> GraphemeIndices<'b>145 pub fn new_grapheme_indices<'b>(s: &'b str, is_extended: bool) -> GraphemeIndices<'b> {
146     GraphemeIndices { start_offset: s.as_ptr() as usize, iter: new_graphemes(s, is_extended) }
147 }
148 
// What is currently known about the potential boundary at the cursor offset.
// TODO: maybe unify with PairResult?
#[derive(PartialEq, Eq, Clone, Debug)]
enum GraphemeState {
    // No information is known.
    Unknown,
    // It is known to not be a boundary.
    NotBreak,
    // It is known to be a boundary.
    Break,
    // The codepoint after is a Regional Indicator Symbol, so a boundary iff
    // it is preceded by an even number of RIS codepoints. (GB12, GB13)
    Regional,
    // The codepoint after is Extended_Pictographic,
    // so whether it's a boundary depends on pre-context according to GB11.
    Emoji,
}
166 
/// Cursor-based segmenter for grapheme clusters.
///
/// The cursor stores only a position and cached classification state; the
/// text itself is passed in chunk by chunk to the query methods.
#[derive(Clone, Debug)]
pub struct GraphemeCursor {
    // Current cursor position (byte offset into the whole string).
    offset: usize,
    // Total length of the string, in bytes.
    len: usize,
    // A config flag indicating whether this cursor computes legacy or extended
    // grapheme cluster boundaries (enables GB9a and GB9b if set).
    is_extended: bool,
    // Information about the potential boundary at `offset`.
    state: GraphemeState,
    // Category of codepoint immediately preceding cursor, if known.
    cat_before: Option<GraphemeCat>,
    // Category of codepoint immediately after cursor, if known.
    cat_after: Option<GraphemeCat>,
    // If set, at least one more codepoint immediately preceding this offset
    // is needed to resolve whether there's a boundary at `offset`.
    pre_context_offset: Option<usize>,
    // The number of RIS codepoints preceding `offset`. If `pre_context_offset`
    // is set, then counts the number of RIS between that and `offset`, otherwise
    // is an accurate count relative to the string.
    ris_count: Option<usize>,
    // Set if a call to `prev_boundary` or `next_boundary` was suspended due
    // to needing more input.
    resuming: bool,
    // Cached grapheme category lookup: (first scalar value, last scalar value,
    // category). A codepoint inside the inclusive range reuses the cached
    // category instead of consulting the tables again.
    grapheme_cat_cache: (u32, u32, GraphemeCat),
}
196 
/// An error return indicating that not enough content was available in the
/// provided chunk to satisfy the query, and that more content must be provided.
#[derive(PartialEq, Eq, Debug)]
pub enum GraphemeIncomplete {
    /// More pre-context is needed. The caller should call `provide_context`
    /// with a chunk ending at the offset given, then retry the query. This
    /// will only be returned if the `chunk_start` parameter is nonzero.
    PreContext(usize),

    /// When requesting `prev_boundary`, the cursor is moving past the beginning
    /// of the current chunk, so the chunk before that is requested. This will
    /// only be returned if the `chunk_start` parameter is nonzero.
    PrevChunk,

    /// When requesting `next_boundary`, the cursor is moving past the end of the
    /// current chunk, so the chunk after that is requested. This will only be
    /// returned if the chunk ends before the `len` parameter provided on
    /// creation of the cursor.
    NextChunk,

    /// An error returned when the chunk given does not contain the cursor position.
    InvalidOffset,
}
220 
// The outcome of looking up a (category before, category after) pair in
// `check_pair`. Some pairs cannot be resolved from the two categories alone.
#[derive(PartialEq, Eq)]
enum PairResult {
    // Definitely not a break.
    NotBreak,
    // Definitely a break.
    Break,
    // A break iff the cursor is not in extended mode.
    Extended,
    // A break iff preceded by an even number of Regional Indicator Symbols.
    Regional,
    // ZWJ followed by Extended_Pictographic: not a break if the ZWJ is itself
    // preceded by Extended_Pictographic (Extend)*, otherwise a break (GB11).
    Emoji,
}
230 
231 #[inline]
check_pair(before: GraphemeCat, after: GraphemeCat) -> PairResult232 fn check_pair(before: GraphemeCat, after: GraphemeCat) -> PairResult {
233     use crate::tables::grapheme::GraphemeCat::*;
234     use self::PairResult::*;
235     match (before, after) {
236         (GC_CR, GC_LF) => NotBreak,  // GB3
237         (GC_Control, _) => Break,  // GB4
238         (GC_CR, _) => Break,  // GB4
239         (GC_LF, _) => Break,  // GB4
240         (_, GC_Control) => Break,  // GB5
241         (_, GC_CR) => Break,  // GB5
242         (_, GC_LF) => Break,  // GB5
243         (GC_L, GC_L) => NotBreak,  // GB6
244         (GC_L, GC_V) => NotBreak,  // GB6
245         (GC_L, GC_LV) => NotBreak,  // GB6
246         (GC_L, GC_LVT) => NotBreak,  // GB6
247         (GC_LV, GC_V) => NotBreak,  // GB7
248         (GC_LV, GC_T) => NotBreak,  // GB7
249         (GC_V, GC_V) => NotBreak,  // GB7
250         (GC_V, GC_T) => NotBreak,  // GB7
251         (GC_LVT, GC_T) => NotBreak,  // GB8
252         (GC_T, GC_T) => NotBreak,  // GB8
253         (_, GC_Extend) => NotBreak, // GB9
254         (_, GC_ZWJ) => NotBreak,  // GB9
255         (_, GC_SpacingMark) => Extended,  // GB9a
256         (GC_Prepend, _) => Extended,  // GB9b
257         (GC_ZWJ, GC_Extended_Pictographic) => Emoji,  // GB11
258         (GC_Regional_Indicator, GC_Regional_Indicator) => Regional,  // GB12, GB13
259         (_, _) => Break,  // GB999
260     }
261 }
262 
263 impl GraphemeCursor {
264     /// Create a new cursor. The string and initial offset are given at creation
265     /// time, but the contents of the string are not. The `is_extended` parameter
266     /// controls whether extended grapheme clusters are selected.
267     ///
268     /// The `offset` parameter must be on a codepoint boundary.
269     ///
270     /// ```rust
271     /// # use unicode_segmentation::GraphemeCursor;
272     /// let s = "हिन्दी";
273     /// let mut legacy = GraphemeCursor::new(0, s.len(), false);
274     /// assert_eq!(legacy.next_boundary(s, 0), Ok(Some("ह".len())));
275     /// let mut extended = GraphemeCursor::new(0, s.len(), true);
276     /// assert_eq!(extended.next_boundary(s, 0), Ok(Some("हि".len())));
277     /// ```
new(offset: usize, len: usize, is_extended: bool) -> GraphemeCursor278     pub fn new(offset: usize, len: usize, is_extended: bool) -> GraphemeCursor {
279         let state = if offset == 0 || offset == len {
280             GraphemeState::Break
281         } else {
282             GraphemeState::Unknown
283         };
284         GraphemeCursor {
285             offset: offset,
286             len: len,
287             state: state,
288             is_extended: is_extended,
289             cat_before: None,
290             cat_after: None,
291             pre_context_offset: None,
292             ris_count: None,
293             resuming: false,
294             grapheme_cat_cache: (0, 0, GraphemeCat::GC_Control),
295         }
296     }
297 
grapheme_category(&mut self, ch: char) -> GraphemeCat298     fn grapheme_category(&mut self, ch: char) -> GraphemeCat {
299         use crate::tables::grapheme as gr;
300         use crate::tables::grapheme::GraphemeCat::*;
301 
302         if ch <= '\u{7e}' {
303             // Special-case optimization for ascii, except U+007F.  This
304             // improves performance even for many primarily non-ascii texts,
305             // due to use of punctuation and white space characters from the
306             // ascii range.
307             if ch >= '\u{20}' {
308                 GC_Any
309             } else if ch == '\n' {
310                 GC_LF
311             } else if ch == '\r' {
312                 GC_CR
313             } else {
314                 GC_Control
315             }
316         } else {
317             // If this char isn't within the cached range, update the cache to the
318             // range that includes it.
319             if (ch as u32) < self.grapheme_cat_cache.0 || (ch as u32) > self.grapheme_cat_cache.1 {
320                 self.grapheme_cat_cache = gr::grapheme_category(ch);
321             }
322             self.grapheme_cat_cache.2
323         }
324     }
325 
326     // Not sure I'm gonna keep this, the advantage over new() seems thin.
327 
328     /// Set the cursor to a new location in the same string.
329     ///
330     /// ```rust
331     /// # use unicode_segmentation::GraphemeCursor;
332     /// let s = "abcd";
333     /// let mut cursor = GraphemeCursor::new(0, s.len(), false);
334     /// assert_eq!(cursor.cur_cursor(), 0);
335     /// cursor.set_cursor(2);
336     /// assert_eq!(cursor.cur_cursor(), 2);
337     /// ```
set_cursor(&mut self, offset: usize)338     pub fn set_cursor(&mut self, offset: usize) {
339         if offset != self.offset {
340             self.offset = offset;
341             self.state = if offset == 0 || offset == self.len {
342                 GraphemeState::Break
343             } else {
344                 GraphemeState::Unknown
345             };
346             // reset state derived from text around cursor
347             self.cat_before = None;
348             self.cat_after = None;
349             self.ris_count = None;
350         }
351     }
352 
    #[inline]
    /// The current offset of the cursor. Equal to the last value provided to
    /// `new()` or `set_cursor()`, or returned from `next_boundary()` or
    /// `prev_boundary()`.
    ///
    /// ```rust
    /// # use unicode_segmentation::GraphemeCursor;
    /// // Two flags (🇷🇸🇮🇴), each flag is two RIS codepoints, each RIS is 4 bytes.
    /// let flags = "\u{1F1F7}\u{1F1F8}\u{1F1EE}\u{1F1F4}";
    /// let mut cursor = GraphemeCursor::new(4, flags.len(), false);
    /// assert_eq!(cursor.cur_cursor(), 4);
    /// assert_eq!(cursor.next_boundary(flags, 0), Ok(Some(8)));
    /// assert_eq!(cursor.cur_cursor(), 8);
    /// ```
    pub fn cur_cursor(&self) -> usize {
        self.offset
    }
370 
371     /// Provide additional pre-context when it is needed to decide a boundary.
372     /// The end of the chunk must coincide with the value given in the
373     /// `GraphemeIncomplete::PreContext` request.
374     ///
375     /// ```rust
376     /// # use unicode_segmentation::{GraphemeCursor, GraphemeIncomplete};
377     /// let flags = "\u{1F1F7}\u{1F1F8}\u{1F1EE}\u{1F1F4}";
378     /// let mut cursor = GraphemeCursor::new(8, flags.len(), false);
379     /// // Not enough pre-context to decide if there's a boundary between the two flags.
380     /// assert_eq!(cursor.is_boundary(&flags[8..], 8), Err(GraphemeIncomplete::PreContext(8)));
381     /// // Provide one more Regional Indicator Symbol of pre-context
382     /// cursor.provide_context(&flags[4..8], 4);
383     /// // Still not enough context to decide.
384     /// assert_eq!(cursor.is_boundary(&flags[8..], 8), Err(GraphemeIncomplete::PreContext(4)));
385     /// // Provide additional requested context.
386     /// cursor.provide_context(&flags[0..4], 0);
387     /// // That's enough to decide (it always is when context goes to the start of the string)
388     /// assert_eq!(cursor.is_boundary(&flags[8..], 8), Ok(true));
389     /// ```
provide_context(&mut self, chunk: &str, chunk_start: usize)390     pub fn provide_context(&mut self, chunk: &str, chunk_start: usize) {
391         use crate::tables::grapheme as gr;
392         assert!(chunk_start + chunk.len() == self.pre_context_offset.unwrap());
393         self.pre_context_offset = None;
394         if self.is_extended && chunk_start + chunk.len() == self.offset {
395             let ch = chunk.chars().rev().next().unwrap();
396             if self.grapheme_category(ch) == gr::GC_Prepend {
397                 self.decide(false);  // GB9b
398                 return;
399             }
400         }
401         match self.state {
402             GraphemeState::Regional => self.handle_regional(chunk, chunk_start),
403             GraphemeState::Emoji => self.handle_emoji(chunk, chunk_start),
404             _ => if self.cat_before.is_none() && self.offset == chunk.len() + chunk_start {
405                 let ch = chunk.chars().rev().next().unwrap();
406                 self.cat_before = Some(self.grapheme_category(ch));
407             },
408         }
409     }
410 
411     #[inline]
decide(&mut self, is_break: bool)412     fn decide(&mut self, is_break: bool) {
413         self.state = if is_break {
414             GraphemeState::Break
415         } else {
416             GraphemeState::NotBreak
417         };
418     }
419 
420     #[inline]
decision(&mut self, is_break: bool) -> Result<bool, GraphemeIncomplete>421     fn decision(&mut self, is_break: bool) -> Result<bool, GraphemeIncomplete> {
422         self.decide(is_break);
423         Ok(is_break)
424     }
425 
426     #[inline]
is_boundary_result(&self) -> Result<bool, GraphemeIncomplete>427     fn is_boundary_result(&self) -> Result<bool, GraphemeIncomplete> {
428         if self.state == GraphemeState::Break {
429             Ok(true)
430         } else if self.state == GraphemeState::NotBreak {
431             Ok(false)
432         } else if let Some(pre_context_offset) = self.pre_context_offset {
433             Err(GraphemeIncomplete::PreContext(pre_context_offset))
434         } else {
435             unreachable!("inconsistent state");
436         }
437     }
438 
439     #[inline]
handle_regional(&mut self, chunk: &str, chunk_start: usize)440     fn handle_regional(&mut self, chunk: &str, chunk_start: usize) {
441         use crate::tables::grapheme as gr;
442         let mut ris_count = self.ris_count.unwrap_or(0);
443         for ch in chunk.chars().rev() {
444             if self.grapheme_category(ch) != gr::GC_Regional_Indicator {
445                 self.ris_count = Some(ris_count);
446                 self.decide((ris_count % 2) == 0);
447                 return;
448             }
449             ris_count += 1;
450         }
451         self.ris_count = Some(ris_count);
452         if chunk_start == 0 {
453             self.decide((ris_count % 2) == 0);
454             return;
455         }
456         self.pre_context_offset = Some(chunk_start);
457         self.state = GraphemeState::Regional;
458     }
459 
460     #[inline]
handle_emoji(&mut self, chunk: &str, chunk_start: usize)461     fn handle_emoji(&mut self, chunk: &str, chunk_start: usize) {
462         use crate::tables::grapheme as gr;
463         let mut iter = chunk.chars().rev();
464         if let Some(ch) = iter.next() {
465             if self.grapheme_category(ch) != gr::GC_ZWJ {
466                 self.decide(true);
467                 return;
468             }
469         }
470         for ch in iter {
471             match self.grapheme_category(ch) {
472                 gr::GC_Extend => (),
473                 gr::GC_Extended_Pictographic => {
474                     self.decide(false);
475                     return;
476                 }
477                 _ => {
478                     self.decide(true);
479                     return;
480                 }
481             }
482         }
483         if chunk_start == 0 {
484             self.decide(true);
485             return;
486         }
487         self.pre_context_offset = Some(chunk_start);
488         self.state = GraphemeState::Emoji;
489     }
490 
491     #[inline]
492     /// Determine whether the current cursor location is a grapheme cluster boundary.
493     /// Only a part of the string need be supplied. If `chunk_start` is nonzero or
494     /// the length of `chunk` is not equal to `len` on creation, then this method
495     /// may return `GraphemeIncomplete::PreContext`. The caller should then
496     /// call `provide_context` with the requested chunk, then retry calling this
497     /// method.
498     ///
499     /// For partial chunks, if the cursor is not at the beginning or end of the
500     /// string, the chunk should contain at least the codepoint following the cursor.
501     /// If the string is nonempty, the chunk must be nonempty.
502     ///
503     /// All calls should have consistent chunk contents (ie, if a chunk provides
504     /// content for a given slice, all further chunks covering that slice must have
505     /// the same content for it).
506     ///
507     /// ```rust
508     /// # use unicode_segmentation::GraphemeCursor;
509     /// let flags = "\u{1F1F7}\u{1F1F8}\u{1F1EE}\u{1F1F4}";
510     /// let mut cursor = GraphemeCursor::new(8, flags.len(), false);
511     /// assert_eq!(cursor.is_boundary(flags, 0), Ok(true));
512     /// cursor.set_cursor(12);
513     /// assert_eq!(cursor.is_boundary(flags, 0), Ok(false));
514     /// ```
is_boundary(&mut self, chunk: &str, chunk_start: usize) -> Result<bool, GraphemeIncomplete>515     pub fn is_boundary(&mut self, chunk: &str, chunk_start: usize) -> Result<bool, GraphemeIncomplete> {
516         use crate::tables::grapheme as gr;
517         if self.state == GraphemeState::Break {
518             return Ok(true)
519         }
520         if self.state == GraphemeState::NotBreak {
521             return Ok(false)
522         }
523         if self.offset < chunk_start || self.offset >= chunk_start + chunk.len() {
524             if self.offset > chunk_start + chunk.len() || self.cat_after.is_none() {
525                 return Err(GraphemeIncomplete::InvalidOffset)
526             }
527         }
528         if let Some(pre_context_offset) = self.pre_context_offset {
529             return Err(GraphemeIncomplete::PreContext(pre_context_offset));
530         }
531         let offset_in_chunk = self.offset - chunk_start;
532         if self.cat_after.is_none() {
533             let ch = chunk[offset_in_chunk..].chars().next().unwrap();
534             self.cat_after = Some(self.grapheme_category(ch));
535         }
536         if self.offset == chunk_start {
537             let mut need_pre_context = true;
538             match self.cat_after.unwrap() {
539                 gr::GC_Regional_Indicator => self.state = GraphemeState::Regional,
540                 gr::GC_Extended_Pictographic => self.state = GraphemeState::Emoji,
541                 _ => need_pre_context = self.cat_before.is_none(),
542             }
543             if need_pre_context {
544                 self.pre_context_offset = Some(chunk_start);
545                 return Err(GraphemeIncomplete::PreContext(chunk_start));
546             }
547         }
548         if self.cat_before.is_none() {
549             let ch = chunk[..offset_in_chunk].chars().rev().next().unwrap();
550             self.cat_before = Some(self.grapheme_category(ch));
551         }
552         match check_pair(self.cat_before.unwrap(), self.cat_after.unwrap()) {
553             PairResult::NotBreak => return self.decision(false),
554             PairResult::Break => return self.decision(true),
555             PairResult::Extended => {
556                 let is_extended = self.is_extended;
557                 return self.decision(!is_extended);
558             }
559             PairResult::Regional => {
560                 if let Some(ris_count) = self.ris_count {
561                     return self.decision((ris_count % 2) == 0);
562                 }
563                 self.handle_regional(&chunk[..offset_in_chunk], chunk_start);
564                 self.is_boundary_result()
565             }
566             PairResult::Emoji => {
567                 self.handle_emoji(&chunk[..offset_in_chunk], chunk_start);
568                 self.is_boundary_result()
569             }
570         }
571     }
572 
573     #[inline]
574     /// Find the next boundary after the current cursor position. Only a part of
575     /// the string need be supplied. If the chunk is incomplete, then this
576     /// method might return `GraphemeIncomplete::PreContext` or
577     /// `GraphemeIncomplete::NextChunk`. In the former case, the caller should
578     /// call `provide_context` with the requested chunk, then retry. In the
579     /// latter case, the caller should provide the chunk following the one
580     /// given, then retry.
581     ///
582     /// See `is_boundary` for expectations on the provided chunk.
583     ///
584     /// ```rust
585     /// # use unicode_segmentation::GraphemeCursor;
586     /// let flags = "\u{1F1F7}\u{1F1F8}\u{1F1EE}\u{1F1F4}";
587     /// let mut cursor = GraphemeCursor::new(4, flags.len(), false);
588     /// assert_eq!(cursor.next_boundary(flags, 0), Ok(Some(8)));
589     /// assert_eq!(cursor.next_boundary(flags, 0), Ok(Some(16)));
590     /// assert_eq!(cursor.next_boundary(flags, 0), Ok(None));
591     /// ```
592     ///
593     /// And an example that uses partial strings:
594     ///
595     /// ```rust
596     /// # use unicode_segmentation::{GraphemeCursor, GraphemeIncomplete};
597     /// let s = "abcd";
598     /// let mut cursor = GraphemeCursor::new(0, s.len(), false);
599     /// assert_eq!(cursor.next_boundary(&s[..2], 0), Ok(Some(1)));
600     /// assert_eq!(cursor.next_boundary(&s[..2], 0), Err(GraphemeIncomplete::NextChunk));
601     /// assert_eq!(cursor.next_boundary(&s[2..4], 2), Ok(Some(2)));
602     /// assert_eq!(cursor.next_boundary(&s[2..4], 2), Ok(Some(3)));
603     /// assert_eq!(cursor.next_boundary(&s[2..4], 2), Ok(Some(4)));
604     /// assert_eq!(cursor.next_boundary(&s[2..4], 2), Ok(None));
605     /// ```
next_boundary(&mut self, chunk: &str, chunk_start: usize) -> Result<Option<usize>, GraphemeIncomplete>606     pub fn next_boundary(&mut self, chunk: &str, chunk_start: usize) -> Result<Option<usize>, GraphemeIncomplete> {
607         if self.offset == self.len {
608             return Ok(None);
609         }
610         let mut iter = chunk[self.offset - chunk_start..].chars();
611         let mut ch = iter.next().unwrap();
612         loop {
613             if self.resuming {
614                 if self.cat_after.is_none() {
615                     self.cat_after = Some(self.grapheme_category(ch));
616                 }
617             } else {
618                 self.offset += ch.len_utf8();
619                 self.state = GraphemeState::Unknown;
620                 self.cat_before = self.cat_after.take();
621                 if self.cat_before.is_none() {
622                     self.cat_before = Some(self.grapheme_category(ch));
623                 }
624                 if self.cat_before.unwrap() == GraphemeCat::GC_Regional_Indicator {
625                     self.ris_count = self.ris_count.map(|c| c + 1);
626                 } else {
627                     self.ris_count = Some(0);
628                 }
629                 if let Some(next_ch) = iter.next() {
630                     ch = next_ch;
631                     self.cat_after = Some(self.grapheme_category(ch));
632                 } else if self.offset == self.len {
633                     self.decide(true);
634                 } else {
635                     self.resuming = true;
636                     return Err(GraphemeIncomplete::NextChunk);
637                 }
638             }
639             self.resuming = true;
640             if self.is_boundary(chunk, chunk_start)? {
641                 self.resuming = false;
642                 return Ok(Some(self.offset));
643             }
644             self.resuming = false;
645         }
646     }
647 
648     /// Find the previous boundary after the current cursor position. Only a part
649     /// of the string need be supplied. If the chunk is incomplete, then this
650     /// method might return `GraphemeIncomplete::PreContext` or
651     /// `GraphemeIncomplete::PrevChunk`. In the former case, the caller should
652     /// call `provide_context` with the requested chunk, then retry. In the
653     /// latter case, the caller should provide the chunk preceding the one
654     /// given, then retry.
655     ///
656     /// See `is_boundary` for expectations on the provided chunk.
657     ///
658     /// ```rust
659     /// # use unicode_segmentation::GraphemeCursor;
660     /// let flags = "\u{1F1F7}\u{1F1F8}\u{1F1EE}\u{1F1F4}";
661     /// let mut cursor = GraphemeCursor::new(12, flags.len(), false);
662     /// assert_eq!(cursor.prev_boundary(flags, 0), Ok(Some(8)));
663     /// assert_eq!(cursor.prev_boundary(flags, 0), Ok(Some(0)));
664     /// assert_eq!(cursor.prev_boundary(flags, 0), Ok(None));
665     /// ```
666     ///
667     /// And an example that uses partial strings (note the exact return is not
668     /// guaranteed, and may be `PrevChunk` or `PreContext` arbitrarily):
669     ///
670     /// ```rust
671     /// # use unicode_segmentation::{GraphemeCursor, GraphemeIncomplete};
672     /// let s = "abcd";
673     /// let mut cursor = GraphemeCursor::new(4, s.len(), false);
674     /// assert_eq!(cursor.prev_boundary(&s[2..4], 2), Ok(Some(3)));
675     /// assert_eq!(cursor.prev_boundary(&s[2..4], 2), Err(GraphemeIncomplete::PrevChunk));
676     /// assert_eq!(cursor.prev_boundary(&s[0..2], 0), Ok(Some(2)));
677     /// assert_eq!(cursor.prev_boundary(&s[0..2], 0), Ok(Some(1)));
678     /// assert_eq!(cursor.prev_boundary(&s[0..2], 0), Ok(Some(0)));
679     /// assert_eq!(cursor.prev_boundary(&s[0..2], 0), Ok(None));
680     /// ```
prev_boundary(&mut self, chunk: &str, chunk_start: usize) -> Result<Option<usize>, GraphemeIncomplete>681     pub fn prev_boundary(&mut self, chunk: &str, chunk_start: usize) -> Result<Option<usize>, GraphemeIncomplete> {
682         if self.offset == 0 {
683             return Ok(None);
684         }
685         if self.offset == chunk_start {
686             return Err(GraphemeIncomplete::PrevChunk);
687         }
688         let mut iter = chunk[..self.offset - chunk_start].chars().rev();
689         let mut ch = iter.next().unwrap();
690         loop {
691             if self.offset == chunk_start {
692                 self.resuming = true;
693                 return Err(GraphemeIncomplete::PrevChunk);
694             }
695             if self.resuming {
696                 self.cat_before = Some(self.grapheme_category(ch));
697             } else {
698                 self.offset -= ch.len_utf8();
699                 self.cat_after = self.cat_before.take();
700                 self.state = GraphemeState::Unknown;
701                 if let Some(ris_count) = self.ris_count {
702                     self.ris_count = if ris_count > 0 { Some(ris_count - 1) } else { None };
703                 }
704                 if let Some(prev_ch) = iter.next() {
705                     ch = prev_ch;
706                     self.cat_before = Some(self.grapheme_category(ch));
707                 } else if self.offset == 0 {
708                     self.decide(true);
709                 } else {
710                     self.resuming = true;
711                     self.cat_after = Some(self.grapheme_category(ch));
712                     return Err(GraphemeIncomplete::PrevChunk);
713                 }
714             }
715             self.resuming = true;
716             if self.is_boundary(chunk, chunk_start)? {
717                 self.resuming = false;
718                 return Ok(Some(self.offset));
719             }
720             self.resuming = false;
721         }
722     }
723 }
724 
// A regional-indicator position whose chunk starts mid-run needs pre-context
// before it can be resolved.
#[test]
fn test_grapheme_cursor_ris_precontext() {
    let flags = "\u{1f1fa}\u{1f1f8}\u{1f1fa}\u{1f1f8}\u{1f1fa}\u{1f1f8}";
    let (head, tail) = flags.split_at(4);
    let mut cursor = GraphemeCursor::new(8, flags.len(), true);
    assert_eq!(cursor.is_boundary(tail, 4), Err(GraphemeIncomplete::PreContext(4)));
    cursor.provide_context(head, 0);
    assert_eq!(cursor.is_boundary(tail, 4), Ok(true));
}
733 
// A cursor at the very start of a chunk must ask for pre-context; once the
// CR is supplied, the position between CR and LF is not a boundary.
#[test]
fn test_grapheme_cursor_chunk_start_require_precontext() {
    let crlf = "\r\n";
    let (head, tail) = crlf.split_at(1);
    let mut cursor = GraphemeCursor::new(1, crlf.len(), true);
    assert_eq!(cursor.is_boundary(tail, 1), Err(GraphemeIncomplete::PreContext(1)));
    cursor.provide_context(head, 0);
    assert_eq!(cursor.is_boundary(tail, 1), Ok(false));
}
742 
// Moving backwards off the front of a chunk requests the previous chunk, and
// the retry with that chunk reports the boundary that was reached.
#[test]
fn test_grapheme_cursor_prev_boundary() {
    let text = "abcd";
    let (first_half, second_half) = text.split_at(2);
    let mut cursor = GraphemeCursor::new(3, text.len(), true);
    assert_eq!(cursor.prev_boundary(second_half, 2), Err(GraphemeIncomplete::PrevChunk));
    assert_eq!(cursor.prev_boundary(first_half, 0), Ok(Some(2)));
}
750 
// A cursor that already sits at the start of the chunk requests the previous
// chunk without moving, and the retry then steps back by one cluster.
#[test]
fn test_grapheme_cursor_prev_boundary_chunk_start() {
    let text = "abcd";
    let (first_half, second_half) = text.split_at(2);
    let mut cursor = GraphemeCursor::new(2, text.len(), true);
    assert_eq!(cursor.prev_boundary(second_half, 2), Err(GraphemeIncomplete::PrevChunk));
    assert_eq!(cursor.prev_boundary(first_half, 0), Ok(Some(1)));
}
758