1 // Copyright 2012-2014 The Rust Project Developers. See the COPYRIGHT
2 // file at the top-level directory of this distribution and at
3 // http://rust-lang.org/COPYRIGHT.
4 //
5 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
6 // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
7 // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
8 // option. This file may not be copied, modified, or distributed
9 // except according to those terms.
10
11 use core::cmp;
12
13 use tables::grapheme::GraphemeCat;
14
/// External iterator for grapheme clusters and byte offsets.
///
/// Each item is a `(usize, &str)` pair: the byte offset of the cluster,
/// measured from the start of the string the iterator was created from,
/// and the cluster itself.
#[derive(Clone)]
pub struct GraphemeIndices<'a> {
    // Address of the first byte of the source string, stored as an integer.
    // It is subtracted from the address of every yielded slice to recover
    // that slice's byte offset.
    start_offset: usize,
    // The underlying cluster iterator that produces the `&str` slices.
    iter: Graphemes<'a>,
}
21
22 impl<'a> GraphemeIndices<'a> {
23 #[inline]
24 /// View the underlying data (the part yet to be iterated) as a slice of the original string.
25 ///
26 /// ```rust
27 /// # use unicode_segmentation::UnicodeSegmentation;
28 /// let mut iter = "abc".grapheme_indices(true);
29 /// assert_eq!(iter.as_str(), "abc");
30 /// iter.next();
31 /// assert_eq!(iter.as_str(), "bc");
32 /// iter.next();
33 /// iter.next();
34 /// assert_eq!(iter.as_str(), "");
35 /// ```
as_str(&self) -> &'a str36 pub fn as_str(&self) -> &'a str {
37 self.iter.as_str()
38 }
39 }
40
41 impl<'a> Iterator for GraphemeIndices<'a> {
42 type Item = (usize, &'a str);
43
44 #[inline]
next(&mut self) -> Option<(usize, &'a str)>45 fn next(&mut self) -> Option<(usize, &'a str)> {
46 self.iter.next().map(|s| (s.as_ptr() as usize - self.start_offset, s))
47 }
48
49 #[inline]
size_hint(&self) -> (usize, Option<usize>)50 fn size_hint(&self) -> (usize, Option<usize>) {
51 self.iter.size_hint()
52 }
53 }
54
55 impl<'a> DoubleEndedIterator for GraphemeIndices<'a> {
56 #[inline]
next_back(&mut self) -> Option<(usize, &'a str)>57 fn next_back(&mut self) -> Option<(usize, &'a str)> {
58 self.iter.next_back().map(|s| (s.as_ptr() as usize - self.start_offset, s))
59 }
60 }
61
/// External iterator for a string's
/// [grapheme clusters](http://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundaries).
///
/// Iteration can proceed from both ends; the two cursors below delimit the
/// range of the string that has not been yielded yet.
#[derive(Clone)]
pub struct Graphemes<'a> {
    // The complete string being segmented.
    string: &'a str,
    // Front of the not-yet-yielded range; advanced by `next`.
    cursor: GraphemeCursor,
    // Back of the not-yet-yielded range; moved towards the front by `next_back`.
    cursor_back: GraphemeCursor,
}
70
71 impl<'a> Graphemes<'a> {
72 #[inline]
73 /// View the underlying data (the part yet to be iterated) as a slice of the original string.
74 ///
75 /// ```rust
76 /// # use unicode_segmentation::UnicodeSegmentation;
77 /// let mut iter = "abc".graphemes(true);
78 /// assert_eq!(iter.as_str(), "abc");
79 /// iter.next();
80 /// assert_eq!(iter.as_str(), "bc");
81 /// iter.next();
82 /// iter.next();
83 /// assert_eq!(iter.as_str(), "");
84 /// ```
as_str(&self) -> &'a str85 pub fn as_str(&self) -> &'a str {
86 &self.string[self.cursor.cur_cursor()..self.cursor_back.cur_cursor()]
87 }
88 }
89
90 impl<'a> Iterator for Graphemes<'a> {
91 type Item = &'a str;
92
93 #[inline]
size_hint(&self) -> (usize, Option<usize>)94 fn size_hint(&self) -> (usize, Option<usize>) {
95 let slen = self.cursor_back.cur_cursor() - self.cursor.cur_cursor();
96 (cmp::min(slen, 1), Some(slen))
97 }
98
99 #[inline]
next(&mut self) -> Option<&'a str>100 fn next(&mut self) -> Option<&'a str> {
101 let start = self.cursor.cur_cursor();
102 if start == self.cursor_back.cur_cursor() {
103 return None;
104 }
105 let next = self.cursor.next_boundary(self.string, 0).unwrap().unwrap();
106 Some(&self.string[start..next])
107 }
108 }
109
110 impl<'a> DoubleEndedIterator for Graphemes<'a> {
111 #[inline]
next_back(&mut self) -> Option<&'a str>112 fn next_back(&mut self) -> Option<&'a str> {
113 let end = self.cursor_back.cur_cursor();
114 if end == self.cursor.cur_cursor() {
115 return None;
116 }
117 let prev = self.cursor_back.prev_boundary(self.string, 0).unwrap().unwrap();
118 Some(&self.string[prev..end])
119 }
120 }
121
122 #[inline]
new_graphemes<'b>(s: &'b str, is_extended: bool) -> Graphemes<'b>123 pub fn new_graphemes<'b>(s: &'b str, is_extended: bool) -> Graphemes<'b> {
124 let len = s.len();
125 Graphemes {
126 string: s,
127 cursor: GraphemeCursor::new(0, len, is_extended),
128 cursor_back: GraphemeCursor::new(len, len, is_extended),
129 }
130 }
131
132 #[inline]
new_grapheme_indices<'b>(s: &'b str, is_extended: bool) -> GraphemeIndices<'b>133 pub fn new_grapheme_indices<'b>(s: &'b str, is_extended: bool) -> GraphemeIndices<'b> {
134 GraphemeIndices { start_offset: s.as_ptr() as usize, iter: new_graphemes(s, is_extended) }
135 }
136
// maybe unify with PairResult?
// What is currently known about whether the cursor's `offset` is a grapheme
// cluster boundary. `Break` and `NotBreak` are final answers; `Regional` and
// `Emoji` mean the answer depends on text preceding the cursor.
#[derive(PartialEq, Eq, Clone)]
enum GraphemeState {
    // No information is known.
    Unknown,
    // It is known to not be a boundary.
    NotBreak,
    // It is known to be a boundary.
    Break,
    // The codepoint after is a Regional Indicator Symbol, so a boundary iff
    // it is preceded by an even number of RIS codepoints. (GB12, GB13)
    // Resolved by `handle_regional`.
    Regional,
    // The codepoint after is in the E_Modifier category, so whether it's a boundary
    // depends on pre-context according to GB10. Resolved by `handle_emoji`.
    Emoji,
}
154
/// Cursor-based segmenter for grapheme clusters.
///
/// The cursor stores a position and the total string length, but not the
/// string itself; text is passed to each query as a chunk, so the string
/// does not have to be contiguous in memory.
#[derive(Clone)]
pub struct GraphemeCursor {
    // Current cursor position, as a byte offset into the whole string.
    offset: usize,
    // Total length of the string, in bytes.
    len: usize,
    // A config flag indicating whether this cursor computes legacy or extended
    // grapheme cluster boundaries (enables GB9a and GB9b if set).
    is_extended: bool,
    // Information about the potential boundary at `offset`
    state: GraphemeState,
    // Category of codepoint immediately preceding cursor, if known.
    cat_before: Option<GraphemeCat>,
    // Category of codepoint immediately after cursor, if known.
    cat_after: Option<GraphemeCat>,
    // If set, at least one more codepoint immediately preceding this offset
    // is needed to resolve whether there's a boundary at `offset`.
    // Cleared again by `provide_context`.
    pre_context_offset: Option<usize>,
    // The number of RIS codepoints preceding `offset`. If `pre_context_offset`
    // is set, then counts the number of RIS between that and `offset`, otherwise
    // is an accurate count relative to the string.
    ris_count: Option<usize>,
    // Set if a call to `prev_boundary` or `next_boundary` was suspended due
    // to needing more input. While set, the next such call re-examines the
    // current `offset` instead of first moving the cursor.
    resuming: bool,
}
182
/// An error return indicating that not enough content was available in the
/// provided chunk to satisfy the query, and that more content must be provided.
#[derive(PartialEq, Eq, Debug)]
pub enum GraphemeIncomplete {
    /// More pre-context is needed. The caller should call `provide_context`
    /// with a chunk ending at the offset given, then retry the query. This
    /// will only be returned if the `chunk_start` parameter is nonzero.
    PreContext(usize),

    /// When requesting `prev_boundary`, the cursor is moving past the beginning
    /// of the current chunk, so the chunk before that is requested. This will
    /// only be returned if the `chunk_start` parameter is nonzero.
    PrevChunk,

    /// When requesting `next_boundary`, the cursor is moving past the end of the
    /// current chunk, so the chunk after that is requested. This will only be
    /// returned if the chunk ends before the `len` parameter provided on
    /// creation of the cursor.
    NextChunk, // the caller should supply the chunk following the one given

    /// An error returned when the chunk given does not contain the cursor position.
    InvalidOffset,
}
206
// An enum describing the result from lookup of a pair of categories
// (the codepoint before and the codepoint after a potential boundary).
// Produced by `check_pair`; the last three variants need further information
// before the boundary question is settled.
#[derive(PartialEq, Eq)]
enum PairResult {
    NotBreak, // definitely not a break
    Break, // definitely a break
    Extended, // a break iff not in extended mode
    Regional, // a break if preceded by an even number of RIS
    Emoji, // a break if preceded by emoji base and (Extend)*
}
216
check_pair(before: GraphemeCat, after: GraphemeCat) -> PairResult217 fn check_pair(before: GraphemeCat, after: GraphemeCat) -> PairResult {
218 use tables::grapheme::GraphemeCat::*;
219 use self::PairResult::*;
220 match (before, after) {
221 (GC_CR, GC_LF) => NotBreak, // GB3
222 (GC_Control, _) => Break, // GB4
223 (GC_CR, _) => Break, // GB4
224 (GC_LF, _) => Break, // GB4
225 (_, GC_Control) => Break, // GB5
226 (_, GC_CR) => Break, // GB5
227 (_, GC_LF) => Break, // GB5
228 (GC_L, GC_L) => NotBreak, // GB6
229 (GC_L, GC_V) => NotBreak, // GB6
230 (GC_L, GC_LV) => NotBreak, // GB6
231 (GC_L, GC_LVT) => NotBreak, // GB6
232 (GC_LV, GC_V) => NotBreak, // GB7
233 (GC_LV, GC_T) => NotBreak, // GB7
234 (GC_V, GC_V) => NotBreak, // GB7
235 (GC_V, GC_T) => NotBreak, // GB7
236 (GC_LVT, GC_T) => NotBreak, // GB8
237 (GC_T, GC_T) => NotBreak, // GB8
238 (_, GC_Extend) => NotBreak, // GB9
239 (_, GC_ZWJ) => NotBreak, // GB9
240 (_, GC_SpacingMark) => Extended, // GB9a
241 (GC_Prepend, _) => Extended, // GB9b
242 (GC_E_Base, GC_E_Modifier) => NotBreak, // GB10
243 (GC_E_Base_GAZ, GC_E_Modifier) => NotBreak, // GB10
244 (GC_Extend, GC_E_Modifier) => Emoji, // GB10
245 (GC_ZWJ, GC_Glue_After_Zwj) => NotBreak, // GB11
246 (GC_ZWJ, GC_E_Base_GAZ) => NotBreak, // GB11
247 (GC_Regional_Indicator, GC_Regional_Indicator) => Regional, // GB12, GB13
248 (_, _) => Break, // GB999
249 }
250 }
251
252 impl GraphemeCursor {
253 /// Create a new cursor. The string and initial offset are given at creation
254 /// time, but the contents of the string are not. The `is_extended` parameter
255 /// controls whether extended grapheme clusters are selected.
256 ///
257 /// The `offset` parameter must be on a codepoint boundary.
258 ///
259 /// ```rust
260 /// # use unicode_segmentation::GraphemeCursor;
261 /// let s = "हिन्दी";
262 /// let mut legacy = GraphemeCursor::new(0, s.len(), false);
263 /// assert_eq!(legacy.next_boundary(s, 0), Ok(Some("ह".len())));
264 /// let mut extended = GraphemeCursor::new(0, s.len(), true);
265 /// assert_eq!(extended.next_boundary(s, 0), Ok(Some("हि".len())));
266 /// ```
new(offset: usize, len: usize, is_extended: bool) -> GraphemeCursor267 pub fn new(offset: usize, len: usize, is_extended: bool) -> GraphemeCursor {
268 let state = if offset == 0 || offset == len {
269 GraphemeState::Break
270 } else {
271 GraphemeState::Unknown
272 };
273 GraphemeCursor {
274 offset: offset,
275 len: len,
276 state: state,
277 is_extended: is_extended,
278 cat_before: None,
279 cat_after: None,
280 pre_context_offset: None,
281 ris_count: None,
282 resuming: false,
283 }
284 }
285
286 // Not sure I'm gonna keep this, the advantage over new() seems thin.
287
288 /// Set the cursor to a new location in the same string.
289 ///
290 /// ```rust
291 /// # use unicode_segmentation::GraphemeCursor;
292 /// let s = "abcd";
293 /// let mut cursor = GraphemeCursor::new(0, s.len(), false);
294 /// assert_eq!(cursor.cur_cursor(), 0);
295 /// cursor.set_cursor(2);
296 /// assert_eq!(cursor.cur_cursor(), 2);
297 /// ```
set_cursor(&mut self, offset: usize)298 pub fn set_cursor(&mut self, offset: usize) {
299 if offset != self.offset {
300 self.offset = offset;
301 self.state = if offset == 0 || offset == self.len {
302 GraphemeState::Break
303 } else {
304 GraphemeState::Unknown
305 };
306 // reset state derived from text around cursor
307 self.cat_before = None;
308 self.cat_after = None;
309 self.ris_count = None;
310 }
311 }
312
/// The current offset of the cursor. Equal to the last value provided to
/// `new()` or `set_cursor()`, or returned from `next_boundary()` or
/// `prev_boundary()`.
///
/// ```rust
/// # use unicode_segmentation::GraphemeCursor;
/// // Two flags (🇷🇸🇮🇴), each flag is two RIS codepoints, each RIS is 4 bytes.
/// let flags = "\u{1F1F7}\u{1F1F8}\u{1F1EE}\u{1F1F4}";
/// let mut cursor = GraphemeCursor::new(4, flags.len(), false);
/// assert_eq!(cursor.cur_cursor(), 4);
/// assert_eq!(cursor.next_boundary(flags, 0), Ok(Some(8)));
/// assert_eq!(cursor.cur_cursor(), 8);
/// ```
pub fn cur_cursor(&self) -> usize {
    self.offset
}
329
330 /// Provide additional pre-context when it is needed to decide a boundary.
331 /// The end of the chunk must coincide with the value given in the
332 /// `GraphemeIncomplete::PreContext` request.
333 ///
334 /// ```rust
335 /// # use unicode_segmentation::{GraphemeCursor, GraphemeIncomplete};
336 /// let flags = "\u{1F1F7}\u{1F1F8}\u{1F1EE}\u{1F1F4}";
337 /// let mut cursor = GraphemeCursor::new(8, flags.len(), false);
338 /// // Not enough pre-context to decide if there's a boundary between the two flags.
339 /// assert_eq!(cursor.is_boundary(&flags[8..], 8), Err(GraphemeIncomplete::PreContext(8)));
340 /// // Provide one more Regional Indicator Symbol of pre-context
341 /// cursor.provide_context(&flags[4..8], 4);
342 /// // Still not enough context to decide.
343 /// assert_eq!(cursor.is_boundary(&flags[8..], 8), Err(GraphemeIncomplete::PreContext(4)));
344 /// // Provide additional requested context.
345 /// cursor.provide_context(&flags[0..4], 0);
346 /// // That's enough to decide (it always is when context goes to the start of the string)
347 /// assert_eq!(cursor.is_boundary(&flags[8..], 8), Ok(true));
348 /// ```
provide_context(&mut self, chunk: &str, chunk_start: usize)349 pub fn provide_context(&mut self, chunk: &str, chunk_start: usize) {
350 use tables::grapheme as gr;
351 assert!(chunk_start + chunk.len() == self.pre_context_offset.unwrap());
352 self.pre_context_offset = None;
353 if self.is_extended && chunk_start + chunk.len() == self.offset {
354 let ch = chunk.chars().rev().next().unwrap();
355 if gr::grapheme_category(ch) == gr::GC_Prepend {
356 self.decide(false); // GB9b
357 return;
358 }
359 }
360 match self.state {
361 GraphemeState::Regional => self.handle_regional(chunk, chunk_start),
362 GraphemeState::Emoji => self.handle_emoji(chunk, chunk_start),
363 _ => if self.cat_before.is_none() && self.offset == chunk.len() + chunk_start {
364 let ch = chunk.chars().rev().next().unwrap();
365 self.cat_before = Some(gr::grapheme_category(ch));
366 },
367 }
368 }
369
decide(&mut self, is_break: bool)370 fn decide(&mut self, is_break: bool) {
371 self.state = if is_break {
372 GraphemeState::Break
373 } else {
374 GraphemeState::NotBreak
375 };
376 }
377
decision(&mut self, is_break: bool) -> Result<bool, GraphemeIncomplete>378 fn decision(&mut self, is_break: bool) -> Result<bool, GraphemeIncomplete> {
379 self.decide(is_break);
380 Ok(is_break)
381 }
382
is_boundary_result(&self) -> Result<bool, GraphemeIncomplete>383 fn is_boundary_result(&self) -> Result<bool, GraphemeIncomplete> {
384 if self.state == GraphemeState::Break {
385 Ok(true)
386 } else if self.state == GraphemeState::NotBreak {
387 Ok(false)
388 } else if let Some(pre_context_offset) = self.pre_context_offset {
389 Err(GraphemeIncomplete::PreContext(pre_context_offset))
390 } else {
391 unreachable!("inconsistent state");
392 }
393 }
394
handle_regional(&mut self, chunk: &str, chunk_start: usize)395 fn handle_regional(&mut self, chunk: &str, chunk_start: usize) {
396 use tables::grapheme as gr;
397 let mut ris_count = self.ris_count.unwrap_or(0);
398 for ch in chunk.chars().rev() {
399 if gr::grapheme_category(ch) != gr::GC_Regional_Indicator {
400 self.ris_count = Some(ris_count);
401 self.decide((ris_count % 2) == 0);
402 return;
403 }
404 ris_count += 1;
405 }
406 self.ris_count = Some(ris_count);
407 if chunk_start == 0 {
408 self.decide((ris_count % 2) == 0);
409 return;
410 }
411 self.pre_context_offset = Some(chunk_start);
412 self.state = GraphemeState::Regional;
413 }
414
handle_emoji(&mut self, chunk: &str, chunk_start: usize)415 fn handle_emoji(&mut self, chunk: &str, chunk_start: usize) {
416 use tables::grapheme as gr;
417 for ch in chunk.chars().rev() {
418 match gr::grapheme_category(ch) {
419 gr::GC_Extend => (),
420 gr::GC_E_Base | gr::GC_E_Base_GAZ => {
421 self.decide(false);
422 return;
423 }
424 _ => {
425 self.decide(true);
426 return;
427 }
428 }
429 }
430 if chunk_start == 0 {
431 self.decide(true);
432 return;
433 }
434 self.pre_context_offset = Some(chunk_start);
435 self.state = GraphemeState::Emoji;
436 }
437
/// Determine whether the current cursor location is a grapheme cluster boundary.
/// Only a part of the string need be supplied. If `chunk_start` is nonzero or
/// the length of `chunk` is not equal to `len` on creation, then this method
/// may return `GraphemeIncomplete::PreContext`. The caller should then
/// call `provide_context` with the requested chunk, then retry calling this
/// method.
///
/// For partial chunks, if the cursor is not at the beginning or end of the
/// string, the chunk should contain at least the codepoint following the cursor.
/// If the string is nonempty, the chunk must be nonempty.
///
/// All calls should have consistent chunk contents (ie, if a chunk provides
/// content for a given slice, all further chunks covering that slice must have
/// the same content for it).
///
/// Returns `GraphemeIncomplete::InvalidOffset` when the cursor lies past the
/// end of the chunk, or lies outside the chunk while the category of the
/// codepoint after the cursor has not been cached yet.
///
/// ```rust
/// # use unicode_segmentation::GraphemeCursor;
/// let flags = "\u{1F1F7}\u{1F1F8}\u{1F1EE}\u{1F1F4}";
/// let mut cursor = GraphemeCursor::new(8, flags.len(), false);
/// assert_eq!(cursor.is_boundary(flags, 0), Ok(true));
/// cursor.set_cursor(12);
/// assert_eq!(cursor.is_boundary(flags, 0), Ok(false));
/// ```
pub fn is_boundary(&mut self, chunk: &str, chunk_start: usize) -> Result<bool, GraphemeIncomplete> {
    use tables::grapheme as gr;
    // A final answer was already recorded for this offset; reuse it.
    if self.state == GraphemeState::Break {
        return Ok(true)
    }
    if self.state == GraphemeState::NotBreak {
        return Ok(false)
    }
    // The cursor is not strictly inside the chunk. This is rejected when the
    // cursor is beyond the chunk's end, or when the codepoint after the cursor
    // would have to be read from the chunk (`cat_after` not cached).
    if self.offset < chunk_start || self.offset >= chunk_start + chunk.len() {
        if self.offset > chunk_start + chunk.len() || self.cat_after.is_none() {
            return Err(GraphemeIncomplete::InvalidOffset)
        }
    }
    // An earlier request for pre-context has not been satisfied yet; repeat it.
    if let Some(pre_context_offset) = self.pre_context_offset {
        return Err(GraphemeIncomplete::PreContext(pre_context_offset));
    }
    let offset_in_chunk = self.offset - chunk_start;
    if self.cat_after.is_none() {
        let ch = chunk[offset_in_chunk..].chars().next().unwrap();
        self.cat_after = Some(gr::grapheme_category(ch));
    }
    // The cursor sits at the very start of the chunk, so the chunk holds no
    // text before it. Regional Indicators and emoji modifiers always need
    // preceding text; any other category needs it only if the category before
    // the cursor is not already cached.
    if self.offset == chunk_start {
        let mut need_pre_context = true;
        match self.cat_after.unwrap() {
            gr::GC_Regional_Indicator => self.state = GraphemeState::Regional,
            gr::GC_E_Modifier => self.state = GraphemeState::Emoji,
            _ => need_pre_context = self.cat_before.is_none(),
        }
        if need_pre_context {
            self.pre_context_offset = Some(chunk_start);
            return Err(GraphemeIncomplete::PreContext(chunk_start));
        }
    }
    if self.cat_before.is_none() {
        let ch = chunk[..offset_in_chunk].chars().rev().next().unwrap();
        self.cat_before = Some(gr::grapheme_category(ch));
    }
    // Both neighbouring categories are known; look the pair up.
    match check_pair(self.cat_before.unwrap(), self.cat_after.unwrap()) {
        PairResult::NotBreak => return self.decision(false),
        PairResult::Break => return self.decision(true),
        PairResult::Extended => {
            // GB9a / GB9b only suppress the break in extended mode.
            let is_extended = self.is_extended;
            return self.decision(!is_extended);
        }
        PairResult::Regional => {
            // Use the cached count of preceding Regional Indicators when there
            // is one; otherwise count them in the text before the cursor.
            if let Some(ris_count) = self.ris_count {
                return self.decision((ris_count % 2) == 0);
            }
            self.handle_regional(&chunk[..offset_in_chunk], chunk_start);
            self.is_boundary_result()
        }
        PairResult::Emoji => {
            self.handle_emoji(&chunk[..offset_in_chunk], chunk_start);
            self.is_boundary_result()
        }
    }
}
518
519 /// Find the next boundary after the current cursor position. Only a part of
520 /// the string need be supplied. If the chunk is incomplete, then this
521 /// method might return `GraphemeIncomplete::PreContext` or
522 /// `GraphemeIncomplete::NextChunk`. In the former case, the caller should
523 /// call `provide_context` with the requested chunk, then retry. In the
524 /// latter case, the caller should provide the chunk following the one
525 /// given, then retry.
526 ///
527 /// See `is_boundary` for expectations on the provided chunk.
528 ///
529 /// ```rust
530 /// # use unicode_segmentation::GraphemeCursor;
531 /// let flags = "\u{1F1F7}\u{1F1F8}\u{1F1EE}\u{1F1F4}";
532 /// let mut cursor = GraphemeCursor::new(4, flags.len(), false);
533 /// assert_eq!(cursor.next_boundary(flags, 0), Ok(Some(8)));
534 /// assert_eq!(cursor.next_boundary(flags, 0), Ok(Some(16)));
535 /// assert_eq!(cursor.next_boundary(flags, 0), Ok(None));
536 /// ```
537 ///
538 /// And an example that uses partial strings:
539 ///
540 /// ```rust
541 /// # use unicode_segmentation::{GraphemeCursor, GraphemeIncomplete};
542 /// let s = "abcd";
543 /// let mut cursor = GraphemeCursor::new(0, s.len(), false);
544 /// assert_eq!(cursor.next_boundary(&s[..2], 0), Ok(Some(1)));
545 /// assert_eq!(cursor.next_boundary(&s[..2], 0), Err(GraphemeIncomplete::NextChunk));
546 /// assert_eq!(cursor.next_boundary(&s[2..4], 2), Ok(Some(2)));
547 /// assert_eq!(cursor.next_boundary(&s[2..4], 2), Ok(Some(3)));
548 /// assert_eq!(cursor.next_boundary(&s[2..4], 2), Ok(Some(4)));
549 /// assert_eq!(cursor.next_boundary(&s[2..4], 2), Ok(None));
550 /// ```
next_boundary(&mut self, chunk: &str, chunk_start: usize) -> Result<Option<usize>, GraphemeIncomplete>551 pub fn next_boundary(&mut self, chunk: &str, chunk_start: usize) -> Result<Option<usize>, GraphemeIncomplete> {
552 use tables::grapheme as gr;
553 if self.offset == self.len {
554 return Ok(None);
555 }
556 let mut iter = chunk[self.offset - chunk_start..].chars();
557 let mut ch = iter.next().unwrap();
558 loop {
559 if self.resuming {
560 if self.cat_after.is_none() {
561 self.cat_after = Some(gr::grapheme_category(ch));
562 }
563 } else {
564 self.offset += ch.len_utf8();
565 self.state = GraphemeState::Unknown;
566 self.cat_before = self.cat_after.take();
567 if self.cat_before.is_none() {
568 self.cat_before = Some(gr::grapheme_category(ch));
569 }
570 if self.cat_before.unwrap() == GraphemeCat::GC_Regional_Indicator {
571 self.ris_count = self.ris_count.map(|c| c + 1);
572 } else {
573 self.ris_count = Some(0);
574 }
575 if let Some(next_ch) = iter.next() {
576 ch = next_ch;
577 self.cat_after = Some(gr::grapheme_category(ch));
578 } else if self.offset == self.len {
579 self.decide(true);
580 } else {
581 self.resuming = true;
582 return Err(GraphemeIncomplete::NextChunk);
583 }
584 }
585 self.resuming = true;
586 if self.is_boundary(chunk, chunk_start)? {
587 self.resuming = false;
588 return Ok(Some(self.offset));
589 }
590 self.resuming = false;
591 }
592 }
593
/// Find the previous boundary before the current cursor position. Only a part
/// of the string need be supplied. If the chunk is incomplete, then this
/// method might return `GraphemeIncomplete::PreContext` or
/// `GraphemeIncomplete::PrevChunk`. In the former case, the caller should
/// call `provide_context` with the requested chunk, then retry. In the
/// latter case, the caller should provide the chunk preceding the one
/// given, then retry.
///
/// Returns `Ok(None)` when the cursor is already at the start of the string.
///
/// See `is_boundary` for expectations on the provided chunk.
///
/// ```rust
/// # use unicode_segmentation::GraphemeCursor;
/// let flags = "\u{1F1F7}\u{1F1F8}\u{1F1EE}\u{1F1F4}";
/// let mut cursor = GraphemeCursor::new(12, flags.len(), false);
/// assert_eq!(cursor.prev_boundary(flags, 0), Ok(Some(8)));
/// assert_eq!(cursor.prev_boundary(flags, 0), Ok(Some(0)));
/// assert_eq!(cursor.prev_boundary(flags, 0), Ok(None));
/// ```
///
/// And an example that uses partial strings (note the exact return is not
/// guaranteed, and may be `PrevChunk` or `PreContext` arbitrarily):
///
/// ```rust
/// # use unicode_segmentation::{GraphemeCursor, GraphemeIncomplete};
/// let s = "abcd";
/// let mut cursor = GraphemeCursor::new(4, s.len(), false);
/// assert_eq!(cursor.prev_boundary(&s[2..4], 2), Ok(Some(3)));
/// assert_eq!(cursor.prev_boundary(&s[2..4], 2), Err(GraphemeIncomplete::PrevChunk));
/// assert_eq!(cursor.prev_boundary(&s[0..2], 0), Ok(Some(2)));
/// assert_eq!(cursor.prev_boundary(&s[0..2], 0), Ok(Some(1)));
/// assert_eq!(cursor.prev_boundary(&s[0..2], 0), Ok(Some(0)));
/// assert_eq!(cursor.prev_boundary(&s[0..2], 0), Ok(None));
/// ```
pub fn prev_boundary(&mut self, chunk: &str, chunk_start: usize) -> Result<Option<usize>, GraphemeIncomplete> {
    use tables::grapheme as gr;
    if self.offset == 0 {
        return Ok(None);
    }
    // The chunk holds no text before the cursor; the preceding chunk is needed.
    if self.offset == chunk_start {
        return Err(GraphemeIncomplete::PrevChunk);
    }
    // Walk backwards over the part of the chunk that precedes the cursor.
    let mut iter = chunk[..self.offset - chunk_start].chars().rev();
    let mut ch = iter.next().unwrap();
    loop {
        // The cursor is at the start of the chunk: suspend and ask for the
        // preceding chunk.
        if self.offset == chunk_start {
            self.resuming = true;
            return Err(GraphemeIncomplete::PrevChunk);
        }
        if self.resuming {
            // A previous call was suspended at this offset: do not step back
            // again, just record the category of the codepoint before it.
            self.cat_before = Some(gr::grapheme_category(ch));
        } else {
            // Step back over `ch`; what used to precede the cursor now follows it.
            self.offset -= ch.len_utf8();
            self.cat_after = self.cat_before.take();
            self.state = GraphemeState::Unknown;
            // One fewer codepoint precedes the cursor. A count of zero cannot
            // be decremented, so the count is forgotten instead.
            if let Some(ris_count) = self.ris_count {
                self.ris_count = if ris_count > 0 { Some(ris_count - 1) } else { None };
            }
            if let Some(prev_ch) = iter.next() {
                ch = prev_ch;
                self.cat_before = Some(gr::grapheme_category(ch));
            } else if self.offset == 0 {
                // The start of the string is always a boundary.
                self.decide(true);
            } else {
                // Ran out of chunk before the start of the string. `ch` is the
                // codepoint just stepped over, i.e. the one after the cursor.
                self.resuming = true;
                self.cat_after = Some(gr::grapheme_category(ch));
                return Err(GraphemeIncomplete::PrevChunk);
            }
        }
        // Stay in the resuming state across the query so that an early error
        // return via `?` makes the retry re-examine this same offset.
        self.resuming = true;
        if self.is_boundary(chunk, chunk_start)? {
            self.resuming = false;
            return Ok(Some(self.offset));
        }
        self.resuming = false;
    }
}
670 }
671
// A chunk holding only one Regional Indicator before the cursor is not enough
// to decide; supplying the start of the string resolves the boundary.
#[test]
fn test_grapheme_cursor_ris_precontext() {
    let flags = "\u{1f1fa}\u{1f1f8}\u{1f1fa}\u{1f1f8}\u{1f1fa}\u{1f1f8}";
    let mut cursor = GraphemeCursor::new(8, flags.len(), true);
    let tail = &flags[4..];
    assert_eq!(cursor.is_boundary(tail, 4), Err(GraphemeIncomplete::PreContext(4)));
    cursor.provide_context(&flags[..4], 0);
    assert_eq!(cursor.is_boundary(tail, 4), Ok(true));
}
680
// With the cursor at the very start of the chunk, pre-context is requested;
// once the CR is supplied, CR LF is reported as unbroken.
#[test]
fn test_grapheme_cursor_chunk_start_require_precontext() {
    let crlf = "\r\n";
    let mut cursor = GraphemeCursor::new(1, crlf.len(), true);
    let tail = &crlf[1..];
    assert_eq!(cursor.is_boundary(tail, 1), Err(GraphemeIncomplete::PreContext(1)));
    cursor.provide_context(&crlf[..1], 0);
    assert_eq!(cursor.is_boundary(tail, 1), Ok(false));
}
689
// Stepping back to the start of a chunk suspends with `PrevChunk`; the retry
// with the preceding chunk reports the boundary at the chunk seam.
#[test]
fn test_grapheme_cursor_prev_boundary() {
    let text = "abcd";
    let mut cursor = GraphemeCursor::new(3, text.len(), true);
    assert_eq!(cursor.prev_boundary(&text[2..], 2), Err(GraphemeIncomplete::PrevChunk));
    assert_eq!(cursor.prev_boundary(&text[..2], 0), Ok(Some(2)));
}
697
// A cursor that already sits at the start of the chunk asks for the
// preceding chunk, and then finds the boundary one codepoint earlier.
#[test]
fn test_grapheme_cursor_prev_boundary_chunk_start() {
    let text = "abcd";
    let mut cursor = GraphemeCursor::new(2, text.len(), true);
    assert_eq!(cursor.prev_boundary(&text[2..], 2), Err(GraphemeIncomplete::PrevChunk));
    assert_eq!(cursor.prev_boundary(&text[..2], 0), Ok(Some(1)));
}
705