1 // Copyright 2012-2014 The Rust Project Developers. See the COPYRIGHT
2 // file at the top-level directory of this distribution and at
3 // http://rust-lang.org/COPYRIGHT.
4 //
5 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
6 // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
7 // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
8 // option. This file may not be copied, modified, or distributed
9 // except according to those terms.
10 
11 use core::cmp;
12 
13 use tables::grapheme::GraphemeCat;
14 
15 /// External iterator for grapheme clusters and byte offsets.
16 #[derive(Clone)]
17 pub struct GraphemeIndices<'a> {
18     start_offset: usize,
19     iter: Graphemes<'a>,
20 }
21 
22 impl<'a> GraphemeIndices<'a> {
23     #[inline]
24     /// View the underlying data (the part yet to be iterated) as a slice of the original string.
25     ///
26     /// ```rust
27     /// # use unicode_segmentation::UnicodeSegmentation;
28     /// let mut iter = "abc".grapheme_indices(true);
29     /// assert_eq!(iter.as_str(), "abc");
30     /// iter.next();
31     /// assert_eq!(iter.as_str(), "bc");
32     /// iter.next();
33     /// iter.next();
34     /// assert_eq!(iter.as_str(), "");
35     /// ```
as_str(&self) -> &'a str36     pub fn as_str(&self) -> &'a str {
37         self.iter.as_str()
38     }
39 }
40 
41 impl<'a> Iterator for GraphemeIndices<'a> {
42     type Item = (usize, &'a str);
43 
44     #[inline]
next(&mut self) -> Option<(usize, &'a str)>45     fn next(&mut self) -> Option<(usize, &'a str)> {
46         self.iter.next().map(|s| (s.as_ptr() as usize - self.start_offset, s))
47     }
48 
49     #[inline]
size_hint(&self) -> (usize, Option<usize>)50     fn size_hint(&self) -> (usize, Option<usize>) {
51         self.iter.size_hint()
52     }
53 }
54 
55 impl<'a> DoubleEndedIterator for GraphemeIndices<'a> {
56     #[inline]
next_back(&mut self) -> Option<(usize, &'a str)>57     fn next_back(&mut self) -> Option<(usize, &'a str)> {
58         self.iter.next_back().map(|s| (s.as_ptr() as usize - self.start_offset, s))
59     }
60 }
61 
62 /// External iterator for a string's
63 /// [grapheme clusters](http://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundaries).
64 #[derive(Clone)]
65 pub struct Graphemes<'a> {
66     string: &'a str,
67     cursor: GraphemeCursor,
68     cursor_back: GraphemeCursor,
69 }
70 
71 impl<'a> Graphemes<'a> {
72     #[inline]
73     /// View the underlying data (the part yet to be iterated) as a slice of the original string.
74     ///
75     /// ```rust
76     /// # use unicode_segmentation::UnicodeSegmentation;
77     /// let mut iter = "abc".graphemes(true);
78     /// assert_eq!(iter.as_str(), "abc");
79     /// iter.next();
80     /// assert_eq!(iter.as_str(), "bc");
81     /// iter.next();
82     /// iter.next();
83     /// assert_eq!(iter.as_str(), "");
84     /// ```
as_str(&self) -> &'a str85     pub fn as_str(&self) -> &'a str {
86         &self.string[self.cursor.cur_cursor()..self.cursor_back.cur_cursor()]
87     }
88 }
89 
90 impl<'a> Iterator for Graphemes<'a> {
91     type Item = &'a str;
92 
93     #[inline]
size_hint(&self) -> (usize, Option<usize>)94     fn size_hint(&self) -> (usize, Option<usize>) {
95         let slen = self.cursor_back.cur_cursor() - self.cursor.cur_cursor();
96         (cmp::min(slen, 1), Some(slen))
97     }
98 
99     #[inline]
next(&mut self) -> Option<&'a str>100     fn next(&mut self) -> Option<&'a str> {
101         let start = self.cursor.cur_cursor();
102         if start == self.cursor_back.cur_cursor() {
103             return None;
104         }
105         let next = self.cursor.next_boundary(self.string, 0).unwrap().unwrap();
106         Some(&self.string[start..next])
107     }
108 }
109 
110 impl<'a> DoubleEndedIterator for Graphemes<'a> {
111     #[inline]
next_back(&mut self) -> Option<&'a str>112     fn next_back(&mut self) -> Option<&'a str> {
113         let end = self.cursor_back.cur_cursor();
114         if end == self.cursor.cur_cursor() {
115             return None;
116         }
117         let prev = self.cursor_back.prev_boundary(self.string, 0).unwrap().unwrap();
118         Some(&self.string[prev..end])
119     }
120 }
121 
122 #[inline]
new_graphemes<'b>(s: &'b str, is_extended: bool) -> Graphemes<'b>123 pub fn new_graphemes<'b>(s: &'b str, is_extended: bool) -> Graphemes<'b> {
124     let len = s.len();
125     Graphemes {
126         string: s,
127         cursor: GraphemeCursor::new(0, len, is_extended),
128         cursor_back: GraphemeCursor::new(len, len, is_extended),
129     }
130 }
131 
132 #[inline]
new_grapheme_indices<'b>(s: &'b str, is_extended: bool) -> GraphemeIndices<'b>133 pub fn new_grapheme_indices<'b>(s: &'b str, is_extended: bool) -> GraphemeIndices<'b> {
134     GraphemeIndices { start_offset: s.as_ptr() as usize, iter: new_graphemes(s, is_extended) }
135 }
136 
// What is currently known about the potential boundary at the cursor.
// (Open question from the original author: maybe unify with PairResult?)
#[derive(Clone, PartialEq, Eq)]
enum GraphemeState {
    // Nothing has been determined yet.
    Unknown,
    // The position is known not to be a boundary.
    NotBreak,
    // The position is known to be a boundary.
    Break,
    // The codepoint after the cursor is a Regional Indicator Symbol, so the
    // position is a boundary iff it is preceded by an even number of RIS
    // codepoints. (GB12, GB13)
    Regional,
    // The codepoint after the cursor is in the E_Modifier category, so the
    // answer depends on the pre-context according to GB10.
    Emoji,
}
154 
155 /// Cursor-based segmenter for grapheme clusters.
156 #[derive(Clone)]
157 pub struct GraphemeCursor {
158     // Current cursor position.
159     offset: usize,
160     // Total length of the string.
161     len: usize,
162     // A config flag indicating whether this cursor computes legacy or extended
163     // grapheme cluster boundaries (enables GB9a and GB9b if set).
164     is_extended: bool,
165     // Information about the potential boundary at `offset`
166     state: GraphemeState,
167     // Category of codepoint immediately preceding cursor, if known.
168     cat_before: Option<GraphemeCat>,
169     // Category of codepoint immediately after cursor, if known.
170     cat_after: Option<GraphemeCat>,
171     // If set, at least one more codepoint immediately preceding this offset
172     // is needed to resolve whether there's a boundary at `offset`.
173     pre_context_offset: Option<usize>,
174     // The number of RIS codepoints preceding `offset`. If `pre_context_offset`
175     // is set, then counts the number of RIS between that and `offset`, otherwise
176     // is an accurate count relative to the string.
177     ris_count: Option<usize>,
178     // Set if a call to `prev_boundary` or `next_boundary` was suspended due
179     // to needing more input.
180     resuming: bool,
181 }
182 
/// Error value reporting that the chunk handed to a cursor query did not
/// contain enough of the string to answer it, and that more content must be
/// supplied.
#[derive(Debug, PartialEq, Eq)]
pub enum GraphemeIncomplete {
    /// More pre-context is required. The caller should call
    /// `provide_context` with a chunk that ends at the given offset and then
    /// retry the query. Only returned when the `chunk_start` parameter is
    /// nonzero.
    PreContext(usize),

    /// A `prev_boundary` request is moving past the beginning of the current
    /// chunk, so the chunk before it is needed. Only returned when the
    /// `chunk_start` parameter is nonzero.
    PrevChunk,

    /// A `next_boundary` request is moving past the end of the current
    /// chunk, so the chunk following the one given is needed. Only returned
    /// when the chunk ends before the `len` given when the cursor was
    /// created.
    NextChunk,

    /// The chunk that was given does not contain the cursor position.
    InvalidOffset,
}
206 
// Outcome of looking up a (before, after) pair of grapheme categories.
#[derive(Eq, PartialEq)]
enum PairResult {
    // Definitely not a break.
    NotBreak,
    // Definitely a break.
    Break,
    // A break iff the cursor is not in extended mode.
    Extended,
    // A break if preceded by an even number of RIS.
    Regional,
    // A break if preceded by an emoji base and (Extend)*.
    Emoji,
}
216 
check_pair(before: GraphemeCat, after: GraphemeCat) -> PairResult217 fn check_pair(before: GraphemeCat, after: GraphemeCat) -> PairResult {
218     use tables::grapheme::GraphemeCat::*;
219     use self::PairResult::*;
220     match (before, after) {
221         (GC_CR, GC_LF) => NotBreak,  // GB3
222         (GC_Control, _) => Break,  // GB4
223         (GC_CR, _) => Break,  // GB4
224         (GC_LF, _) => Break,  // GB4
225         (_, GC_Control) => Break,  // GB5
226         (_, GC_CR) => Break,  // GB5
227         (_, GC_LF) => Break,  // GB5
228         (GC_L, GC_L) => NotBreak,  // GB6
229         (GC_L, GC_V) => NotBreak,  // GB6
230         (GC_L, GC_LV) => NotBreak,  // GB6
231         (GC_L, GC_LVT) => NotBreak,  // GB6
232         (GC_LV, GC_V) => NotBreak,  // GB7
233         (GC_LV, GC_T) => NotBreak,  // GB7
234         (GC_V, GC_V) => NotBreak,  // GB7
235         (GC_V, GC_T) => NotBreak,  // GB7
236         (GC_LVT, GC_T) => NotBreak,  // GB8
237         (GC_T, GC_T) => NotBreak,  // GB8
238         (_, GC_Extend) => NotBreak, // GB9
239         (_, GC_ZWJ) => NotBreak,  // GB9
240         (_, GC_SpacingMark) => Extended,  // GB9a
241         (GC_Prepend, _) => Extended,  // GB9b
242         (GC_E_Base, GC_E_Modifier) => NotBreak,  // GB10
243         (GC_E_Base_GAZ, GC_E_Modifier) => NotBreak,  // GB10
244         (GC_Extend, GC_E_Modifier) => Emoji,  // GB10
245         (GC_ZWJ, GC_Glue_After_Zwj) => NotBreak,  // GB11
246         (GC_ZWJ, GC_E_Base_GAZ) => NotBreak,  // GB11
247         (GC_Regional_Indicator, GC_Regional_Indicator) => Regional,  // GB12, GB13
248         (_, _) => Break,  // GB999
249     }
250 }
251 
252 impl GraphemeCursor {
253     /// Create a new cursor. The string and initial offset are given at creation
254     /// time, but the contents of the string are not. The `is_extended` parameter
255     /// controls whether extended grapheme clusters are selected.
256     ///
257     /// The `offset` parameter must be on a codepoint boundary.
258     ///
259     /// ```rust
260     /// # use unicode_segmentation::GraphemeCursor;
261     /// let s = "हिन्दी";
262     /// let mut legacy = GraphemeCursor::new(0, s.len(), false);
263     /// assert_eq!(legacy.next_boundary(s, 0), Ok(Some("ह".len())));
264     /// let mut extended = GraphemeCursor::new(0, s.len(), true);
265     /// assert_eq!(extended.next_boundary(s, 0), Ok(Some("हि".len())));
266     /// ```
new(offset: usize, len: usize, is_extended: bool) -> GraphemeCursor267     pub fn new(offset: usize, len: usize, is_extended: bool) -> GraphemeCursor {
268         let state = if offset == 0 || offset == len {
269             GraphemeState::Break
270         } else {
271             GraphemeState::Unknown
272         };
273         GraphemeCursor {
274             offset: offset,
275             len: len,
276             state: state,
277             is_extended: is_extended,
278             cat_before: None,
279             cat_after: None,
280             pre_context_offset: None,
281             ris_count: None,
282             resuming: false,
283         }
284     }
285 
286     // Not sure I'm gonna keep this, the advantage over new() seems thin.
287 
288     /// Set the cursor to a new location in the same string.
289     ///
290     /// ```rust
291     /// # use unicode_segmentation::GraphemeCursor;
292     /// let s = "abcd";
293     /// let mut cursor = GraphemeCursor::new(0, s.len(), false);
294     /// assert_eq!(cursor.cur_cursor(), 0);
295     /// cursor.set_cursor(2);
296     /// assert_eq!(cursor.cur_cursor(), 2);
297     /// ```
set_cursor(&mut self, offset: usize)298     pub fn set_cursor(&mut self, offset: usize) {
299         if offset != self.offset {
300             self.offset = offset;
301             self.state = if offset == 0 || offset == self.len {
302                 GraphemeState::Break
303             } else {
304                 GraphemeState::Unknown
305             };
306             // reset state derived from text around cursor
307             self.cat_before = None;
308             self.cat_after = None;
309             self.ris_count = None;
310         }
311     }
312 
313     /// The current offset of the cursor. Equal to the last value provided to
314     /// `new()` or `set_cursor()`, or returned from `next_boundary()` or
315     /// `prev_boundary()`.
316     ///
317     /// ```rust
318     /// # use unicode_segmentation::GraphemeCursor;
319     /// // Two flags (��������), each flag is two RIS codepoints, each RIS is 4 bytes.
320     /// let flags = "\u{1F1F7}\u{1F1F8}\u{1F1EE}\u{1F1F4}";
321     /// let mut cursor = GraphemeCursor::new(4, flags.len(), false);
322     /// assert_eq!(cursor.cur_cursor(), 4);
323     /// assert_eq!(cursor.next_boundary(flags, 0), Ok(Some(8)));
324     /// assert_eq!(cursor.cur_cursor(), 8);
325     /// ```
cur_cursor(&self) -> usize326     pub fn cur_cursor(&self) -> usize {
327         self.offset
328     }
329 
330     /// Provide additional pre-context when it is needed to decide a boundary.
331     /// The end of the chunk must coincide with the value given in the
332     /// `GraphemeIncomplete::PreContext` request.
333     ///
334     /// ```rust
335     /// # use unicode_segmentation::{GraphemeCursor, GraphemeIncomplete};
336     /// let flags = "\u{1F1F7}\u{1F1F8}\u{1F1EE}\u{1F1F4}";
337     /// let mut cursor = GraphemeCursor::new(8, flags.len(), false);
338     /// // Not enough pre-context to decide if there's a boundary between the two flags.
339     /// assert_eq!(cursor.is_boundary(&flags[8..], 8), Err(GraphemeIncomplete::PreContext(8)));
340     /// // Provide one more Regional Indicator Symbol of pre-context
341     /// cursor.provide_context(&flags[4..8], 4);
342     /// // Still not enough context to decide.
343     /// assert_eq!(cursor.is_boundary(&flags[8..], 8), Err(GraphemeIncomplete::PreContext(4)));
344     /// // Provide additional requested context.
345     /// cursor.provide_context(&flags[0..4], 0);
346     /// // That's enough to decide (it always is when context goes to the start of the string)
347     /// assert_eq!(cursor.is_boundary(&flags[8..], 8), Ok(true));
348     /// ```
provide_context(&mut self, chunk: &str, chunk_start: usize)349     pub fn provide_context(&mut self, chunk: &str, chunk_start: usize) {
350         use tables::grapheme as gr;
351         assert!(chunk_start + chunk.len() == self.pre_context_offset.unwrap());
352         self.pre_context_offset = None;
353         if self.is_extended && chunk_start + chunk.len() == self.offset {
354             let ch = chunk.chars().rev().next().unwrap();
355             if gr::grapheme_category(ch) == gr::GC_Prepend {
356                 self.decide(false);  // GB9b
357                 return;
358             }
359         }
360         match self.state {
361             GraphemeState::Regional => self.handle_regional(chunk, chunk_start),
362             GraphemeState::Emoji => self.handle_emoji(chunk, chunk_start),
363             _ => if self.cat_before.is_none() && self.offset == chunk.len() + chunk_start {
364                 let ch = chunk.chars().rev().next().unwrap();
365                 self.cat_before = Some(gr::grapheme_category(ch));
366             },
367         }
368     }
369 
decide(&mut self, is_break: bool)370     fn decide(&mut self, is_break: bool) {
371         self.state = if is_break {
372             GraphemeState::Break
373         } else {
374             GraphemeState::NotBreak
375         };
376     }
377 
decision(&mut self, is_break: bool) -> Result<bool, GraphemeIncomplete>378     fn decision(&mut self, is_break: bool) -> Result<bool, GraphemeIncomplete> {
379         self.decide(is_break);
380         Ok(is_break)
381     }
382 
is_boundary_result(&self) -> Result<bool, GraphemeIncomplete>383     fn is_boundary_result(&self) -> Result<bool, GraphemeIncomplete> {
384         if self.state == GraphemeState::Break {
385             Ok(true)
386         } else if self.state == GraphemeState::NotBreak {
387             Ok(false)
388         } else if let Some(pre_context_offset) = self.pre_context_offset {
389             Err(GraphemeIncomplete::PreContext(pre_context_offset))
390         } else {
391             unreachable!("inconsistent state");
392         }
393     }
394 
handle_regional(&mut self, chunk: &str, chunk_start: usize)395     fn handle_regional(&mut self, chunk: &str, chunk_start: usize) {
396         use tables::grapheme as gr;
397         let mut ris_count = self.ris_count.unwrap_or(0);
398         for ch in chunk.chars().rev() {
399             if gr::grapheme_category(ch) != gr::GC_Regional_Indicator {
400                 self.ris_count = Some(ris_count);
401                 self.decide((ris_count % 2) == 0);
402                 return;
403             }
404             ris_count += 1;
405         }
406         self.ris_count = Some(ris_count);
407         if chunk_start == 0 {
408             self.decide((ris_count % 2) == 0);
409             return;
410         }
411         self.pre_context_offset = Some(chunk_start);
412         self.state = GraphemeState::Regional;
413     }
414 
handle_emoji(&mut self, chunk: &str, chunk_start: usize)415     fn handle_emoji(&mut self, chunk: &str, chunk_start: usize) {
416         use tables::grapheme as gr;
417         for ch in chunk.chars().rev() {
418             match gr::grapheme_category(ch) {
419                 gr::GC_Extend => (),
420                 gr::GC_E_Base | gr::GC_E_Base_GAZ => {
421                     self.decide(false);
422                     return;
423                 }
424                 _ => {
425                     self.decide(true);
426                     return;
427                 }
428             }
429         }
430         if chunk_start == 0 {
431             self.decide(true);
432             return;
433         }
434         self.pre_context_offset = Some(chunk_start);
435         self.state = GraphemeState::Emoji;
436     }
437 
438     /// Determine whether the current cursor location is a grapheme cluster boundary.
439     /// Only a part of the string need be supplied. If `chunk_start` is nonzero or
440     /// the length of `chunk` is not equal to `len` on creation, then this method
441     /// may return `GraphemeIncomplete::PreContext`. The caller should then
442     /// call `provide_context` with the requested chunk, then retry calling this
443     /// method.
444     ///
445     /// For partial chunks, if the cursor is not at the beginning or end of the
446     /// string, the chunk should contain at least the codepoint following the cursor.
447     /// If the string is nonempty, the chunk must be nonempty.
448     ///
449     /// All calls should have consistent chunk contents (ie, if a chunk provides
450     /// content for a given slice, all further chunks covering that slice must have
451     /// the same content for it).
452     ///
453     /// ```rust
454     /// # use unicode_segmentation::GraphemeCursor;
455     /// let flags = "\u{1F1F7}\u{1F1F8}\u{1F1EE}\u{1F1F4}";
456     /// let mut cursor = GraphemeCursor::new(8, flags.len(), false);
457     /// assert_eq!(cursor.is_boundary(flags, 0), Ok(true));
458     /// cursor.set_cursor(12);
459     /// assert_eq!(cursor.is_boundary(flags, 0), Ok(false));
460     /// ```
is_boundary(&mut self, chunk: &str, chunk_start: usize) -> Result<bool, GraphemeIncomplete>461     pub fn is_boundary(&mut self, chunk: &str, chunk_start: usize) -> Result<bool, GraphemeIncomplete> {
462         use tables::grapheme as gr;
463         if self.state == GraphemeState::Break {
464             return Ok(true)
465         }
466         if self.state == GraphemeState::NotBreak {
467             return Ok(false)
468         }
469         if self.offset < chunk_start || self.offset >= chunk_start + chunk.len() {
470             if self.offset > chunk_start + chunk.len() || self.cat_after.is_none() {
471                 return Err(GraphemeIncomplete::InvalidOffset)
472             }
473         }
474         if let Some(pre_context_offset) = self.pre_context_offset {
475             return Err(GraphemeIncomplete::PreContext(pre_context_offset));
476         }
477         let offset_in_chunk = self.offset - chunk_start;
478         if self.cat_after.is_none() {
479             let ch = chunk[offset_in_chunk..].chars().next().unwrap();
480             self.cat_after = Some(gr::grapheme_category(ch));
481         }
482         if self.offset == chunk_start {
483             let mut need_pre_context = true;
484             match self.cat_after.unwrap() {
485                 gr::GC_Regional_Indicator => self.state = GraphemeState::Regional,
486                 gr::GC_E_Modifier => self.state = GraphemeState::Emoji,
487                 _ => need_pre_context = self.cat_before.is_none(),
488             }
489             if need_pre_context {
490                 self.pre_context_offset = Some(chunk_start);
491                 return Err(GraphemeIncomplete::PreContext(chunk_start));
492             }
493         }
494         if self.cat_before.is_none() {
495             let ch = chunk[..offset_in_chunk].chars().rev().next().unwrap();
496             self.cat_before = Some(gr::grapheme_category(ch));
497         }
498         match check_pair(self.cat_before.unwrap(), self.cat_after.unwrap()) {
499             PairResult::NotBreak => return self.decision(false),
500             PairResult::Break => return self.decision(true),
501             PairResult::Extended => {
502                 let is_extended = self.is_extended;
503                 return self.decision(!is_extended);
504             }
505             PairResult::Regional => {
506                 if let Some(ris_count) = self.ris_count {
507                     return self.decision((ris_count % 2) == 0);
508                 }
509                 self.handle_regional(&chunk[..offset_in_chunk], chunk_start);
510                 self.is_boundary_result()
511             }
512             PairResult::Emoji => {
513                 self.handle_emoji(&chunk[..offset_in_chunk], chunk_start);
514                 self.is_boundary_result()
515             }
516         }
517     }
518 
519     /// Find the next boundary after the current cursor position. Only a part of
520     /// the string need be supplied. If the chunk is incomplete, then this
521     /// method might return `GraphemeIncomplete::PreContext` or
522     /// `GraphemeIncomplete::NextChunk`. In the former case, the caller should
523     /// call `provide_context` with the requested chunk, then retry. In the
524     /// latter case, the caller should provide the chunk following the one
525     /// given, then retry.
526     ///
527     /// See `is_boundary` for expectations on the provided chunk.
528     ///
529     /// ```rust
530     /// # use unicode_segmentation::GraphemeCursor;
531     /// let flags = "\u{1F1F7}\u{1F1F8}\u{1F1EE}\u{1F1F4}";
532     /// let mut cursor = GraphemeCursor::new(4, flags.len(), false);
533     /// assert_eq!(cursor.next_boundary(flags, 0), Ok(Some(8)));
534     /// assert_eq!(cursor.next_boundary(flags, 0), Ok(Some(16)));
535     /// assert_eq!(cursor.next_boundary(flags, 0), Ok(None));
536     /// ```
537     ///
538     /// And an example that uses partial strings:
539     ///
540     /// ```rust
541     /// # use unicode_segmentation::{GraphemeCursor, GraphemeIncomplete};
542     /// let s = "abcd";
543     /// let mut cursor = GraphemeCursor::new(0, s.len(), false);
544     /// assert_eq!(cursor.next_boundary(&s[..2], 0), Ok(Some(1)));
545     /// assert_eq!(cursor.next_boundary(&s[..2], 0), Err(GraphemeIncomplete::NextChunk));
546     /// assert_eq!(cursor.next_boundary(&s[2..4], 2), Ok(Some(2)));
547     /// assert_eq!(cursor.next_boundary(&s[2..4], 2), Ok(Some(3)));
548     /// assert_eq!(cursor.next_boundary(&s[2..4], 2), Ok(Some(4)));
549     /// assert_eq!(cursor.next_boundary(&s[2..4], 2), Ok(None));
550     /// ```
next_boundary(&mut self, chunk: &str, chunk_start: usize) -> Result<Option<usize>, GraphemeIncomplete>551     pub fn next_boundary(&mut self, chunk: &str, chunk_start: usize) -> Result<Option<usize>, GraphemeIncomplete> {
552         use tables::grapheme as gr;
553         if self.offset == self.len {
554             return Ok(None);
555         }
556         let mut iter = chunk[self.offset - chunk_start..].chars();
557         let mut ch = iter.next().unwrap();
558         loop {
559             if self.resuming {
560                 if self.cat_after.is_none() {
561                     self.cat_after = Some(gr::grapheme_category(ch));
562                 }
563             } else {
564                 self.offset += ch.len_utf8();
565                 self.state = GraphemeState::Unknown;
566                 self.cat_before = self.cat_after.take();
567                 if self.cat_before.is_none() {
568                     self.cat_before = Some(gr::grapheme_category(ch));
569                 }
570                 if self.cat_before.unwrap() == GraphemeCat::GC_Regional_Indicator {
571                     self.ris_count = self.ris_count.map(|c| c + 1);
572                 } else {
573                     self.ris_count = Some(0);
574                 }
575                 if let Some(next_ch) = iter.next() {
576                     ch = next_ch;
577                     self.cat_after = Some(gr::grapheme_category(ch));
578                 } else if self.offset == self.len {
579                     self.decide(true);
580                 } else {
581                     self.resuming = true;
582                     return Err(GraphemeIncomplete::NextChunk);
583                 }
584             }
585             self.resuming = true;
586             if self.is_boundary(chunk, chunk_start)? {
587                 self.resuming = false;
588                 return Ok(Some(self.offset));
589             }
590             self.resuming = false;
591         }
592     }
593 
594     /// Find the previous boundary after the current cursor position. Only a part
595     /// of the string need be supplied. If the chunk is incomplete, then this
596     /// method might return `GraphemeIncomplete::PreContext` or
597     /// `GraphemeIncomplete::PrevChunk`. In the former case, the caller should
598     /// call `provide_context` with the requested chunk, then retry. In the
599     /// latter case, the caller should provide the chunk preceding the one
600     /// given, then retry.
601     ///
602     /// See `is_boundary` for expectations on the provided chunk.
603     ///
604     /// ```rust
605     /// # use unicode_segmentation::GraphemeCursor;
606     /// let flags = "\u{1F1F7}\u{1F1F8}\u{1F1EE}\u{1F1F4}";
607     /// let mut cursor = GraphemeCursor::new(12, flags.len(), false);
608     /// assert_eq!(cursor.prev_boundary(flags, 0), Ok(Some(8)));
609     /// assert_eq!(cursor.prev_boundary(flags, 0), Ok(Some(0)));
610     /// assert_eq!(cursor.prev_boundary(flags, 0), Ok(None));
611     /// ```
612     ///
613     /// And an example that uses partial strings (note the exact return is not
614     /// guaranteed, and may be `PrevChunk` or `PreContext` arbitrarily):
615     ///
616     /// ```rust
617     /// # use unicode_segmentation::{GraphemeCursor, GraphemeIncomplete};
618     /// let s = "abcd";
619     /// let mut cursor = GraphemeCursor::new(4, s.len(), false);
620     /// assert_eq!(cursor.prev_boundary(&s[2..4], 2), Ok(Some(3)));
621     /// assert_eq!(cursor.prev_boundary(&s[2..4], 2), Err(GraphemeIncomplete::PrevChunk));
622     /// assert_eq!(cursor.prev_boundary(&s[0..2], 0), Ok(Some(2)));
623     /// assert_eq!(cursor.prev_boundary(&s[0..2], 0), Ok(Some(1)));
624     /// assert_eq!(cursor.prev_boundary(&s[0..2], 0), Ok(Some(0)));
625     /// assert_eq!(cursor.prev_boundary(&s[0..2], 0), Ok(None));
626     /// ```
prev_boundary(&mut self, chunk: &str, chunk_start: usize) -> Result<Option<usize>, GraphemeIncomplete>627     pub fn prev_boundary(&mut self, chunk: &str, chunk_start: usize) -> Result<Option<usize>, GraphemeIncomplete> {
628         use tables::grapheme as gr;
629         if self.offset == 0 {
630             return Ok(None);
631         }
632         if self.offset == chunk_start {
633             return Err(GraphemeIncomplete::PrevChunk);
634         }
635         let mut iter = chunk[..self.offset - chunk_start].chars().rev();
636         let mut ch = iter.next().unwrap();
637         loop {
638             if self.offset == chunk_start {
639                 self.resuming = true;
640                 return Err(GraphemeIncomplete::PrevChunk);
641             }
642             if self.resuming {
643                 self.cat_before = Some(gr::grapheme_category(ch));
644             } else {
645                 self.offset -= ch.len_utf8();
646                 self.cat_after = self.cat_before.take();
647                 self.state = GraphemeState::Unknown;
648                 if let Some(ris_count) = self.ris_count {
649                     self.ris_count = if ris_count > 0 { Some(ris_count - 1) } else { None };
650                 }
651                 if let Some(prev_ch) = iter.next() {
652                     ch = prev_ch;
653                     self.cat_before = Some(gr::grapheme_category(ch));
654                 } else if self.offset == 0 {
655                     self.decide(true);
656                 } else {
657                     self.resuming = true;
658                     self.cat_after = Some(gr::grapheme_category(ch));
659                     return Err(GraphemeIncomplete::PrevChunk);
660                 }
661             }
662             self.resuming = true;
663             if self.is_boundary(chunk, chunk_start)? {
664                 self.resuming = false;
665                 return Ok(Some(self.offset));
666             }
667             self.resuming = false;
668         }
669     }
670 }
671 
// A chunk that begins in the middle of a run of regional indicators must
// trigger a pre-context request before the boundary can be decided.
#[test]
fn test_grapheme_cursor_ris_precontext() {
    // Six regional indicator codepoints, 4 bytes each.
    let flags = "\u{1f1fa}\u{1f1f8}\u{1f1fa}\u{1f1f8}\u{1f1fa}\u{1f1f8}";
    let mut cursor = GraphemeCursor::new(8, flags.len(), true);
    // Only one indicator before the cursor is visible in this chunk.
    assert_eq!(cursor.is_boundary(&flags[4..], 4), Err(GraphemeIncomplete::PreContext(4)));
    cursor.provide_context(&flags[..4], 0);
    // Two indicators precede offset 8, an even count, so it is a boundary.
    assert_eq!(cursor.is_boundary(&flags[4..], 4), Ok(true));
}
680 
// A cursor sitting exactly at the start of the supplied chunk needs
// pre-context before it can tell that CR LF must not be split.
#[test]
fn test_grapheme_cursor_chunk_start_require_precontext() {
    let crlf = "\r\n";
    let mut cursor = GraphemeCursor::new(1, crlf.len(), true);
    assert_eq!(cursor.is_boundary(&crlf[1..], 1), Err(GraphemeIncomplete::PreContext(1)));
    cursor.provide_context(&crlf[..1], 0);
    assert_eq!(cursor.is_boundary(&crlf[1..], 1), Ok(false));
}
689 
// Stepping backwards out of a chunk asks for the previous chunk; the retry
// with that chunk then reports the boundary at the seam between the two.
#[test]
fn test_grapheme_cursor_prev_boundary() {
    let text = "abcd";
    let mut cursor = GraphemeCursor::new(3, text.len(), true);
    assert_eq!(cursor.prev_boundary(&text[2..], 2), Err(GraphemeIncomplete::PrevChunk));
    assert_eq!(cursor.prev_boundary(&text[..2], 0), Ok(Some(2)));
}
697 
// A cursor that already sits at the start of the chunk asks for the previous
// chunk right away; the retry then finds the boundary inside that chunk.
#[test]
fn test_grapheme_cursor_prev_boundary_chunk_start() {
    let text = "abcd";
    let mut cursor = GraphemeCursor::new(2, text.len(), true);
    assert_eq!(cursor.prev_boundary(&text[2..], 2), Err(GraphemeIncomplete::PrevChunk));
    assert_eq!(cursor.prev_boundary(&text[..2], 0), Ok(Some(1)));
}
705