1 //! Utility functions to traverse the unicode graphemes of a `Rope`'s text contents.
2 //!
3 //! Based on https://github.com/cessen/led/blob/c4fa72405f510b7fd16052f90a598c429b3104a6/src/graphemes.rs
4 use ropey::{iter::Chunks, str_utils::byte_to_char_idx, RopeSlice};
5 use unicode_segmentation::{GraphemeCursor, GraphemeIncomplete};
6 use unicode_width::UnicodeWidthStr;
7 
8 use std::fmt;
9 
10 #[must_use]
grapheme_width(g: &str) -> usize11 pub fn grapheme_width(g: &str) -> usize {
12     if g.as_bytes()[0] <= 127 {
13         // Fast-path ascii.
14         // Point 1: theoretically, ascii control characters should have zero
15         // width, but in our case we actually want them to have width: if they
16         // show up in text, we want to treat them as textual elements that can
17         // be editied.  So we can get away with making all ascii single width
18         // here.
19         // Point 2: we're only examining the first codepoint here, which means
20         // we're ignoring graphemes formed with combining characters.  However,
21         // if it starts with ascii, it's going to be a single-width grapeheme
22         // regardless, so, again, we can get away with that here.
23         // Point 3: we're only examining the first _byte_.  But for utf8, when
24         // checking for ascii range values only, that works.
25         1
26     } else {
27         // We use max(1) here because all grapeheme clusters--even illformed
28         // ones--should have at least some width so they can be edited
29         // properly.
30         UnicodeWidthStr::width(g).max(1)
31     }
32 }
33 
34 #[must_use]
nth_prev_grapheme_boundary(slice: RopeSlice, char_idx: usize, n: usize) -> usize35 pub fn nth_prev_grapheme_boundary(slice: RopeSlice, char_idx: usize, n: usize) -> usize {
36     // Bounds check
37     debug_assert!(char_idx <= slice.len_chars());
38 
39     // We work with bytes for this, so convert.
40     let mut byte_idx = slice.char_to_byte(char_idx);
41 
42     // Get the chunk with our byte index in it.
43     let (mut chunk, mut chunk_byte_idx, mut chunk_char_idx, _) = slice.chunk_at_byte(byte_idx);
44 
45     // Set up the grapheme cursor.
46     let mut gc = GraphemeCursor::new(byte_idx, slice.len_bytes(), true);
47 
48     // Find the previous grapheme cluster boundary.
49     for _ in 0..n {
50         loop {
51             match gc.prev_boundary(chunk, chunk_byte_idx) {
52                 Ok(None) => return 0,
53                 Ok(Some(n)) => {
54                     byte_idx = n;
55                     break;
56                 }
57                 Err(GraphemeIncomplete::PrevChunk) => {
58                     let (a, b, c, _) = slice.chunk_at_byte(chunk_byte_idx - 1);
59                     chunk = a;
60                     chunk_byte_idx = b;
61                     chunk_char_idx = c;
62                 }
63                 Err(GraphemeIncomplete::PreContext(n)) => {
64                     let ctx_chunk = slice.chunk_at_byte(n - 1).0;
65                     gc.provide_context(ctx_chunk, n - ctx_chunk.len());
66                 }
67                 _ => unreachable!(),
68             }
69         }
70     }
71     let tmp = byte_to_char_idx(chunk, byte_idx - chunk_byte_idx);
72     chunk_char_idx + tmp
73 }
74 
75 /// Finds the previous grapheme boundary before the given char position.
76 #[must_use]
77 #[inline(always)]
prev_grapheme_boundary(slice: RopeSlice, char_idx: usize) -> usize78 pub fn prev_grapheme_boundary(slice: RopeSlice, char_idx: usize) -> usize {
79     nth_prev_grapheme_boundary(slice, char_idx, 1)
80 }
81 
82 #[must_use]
nth_next_grapheme_boundary(slice: RopeSlice, char_idx: usize, n: usize) -> usize83 pub fn nth_next_grapheme_boundary(slice: RopeSlice, char_idx: usize, n: usize) -> usize {
84     // Bounds check
85     debug_assert!(char_idx <= slice.len_chars());
86 
87     // We work with bytes for this, so convert.
88     let mut byte_idx = slice.char_to_byte(char_idx);
89 
90     // Get the chunk with our byte index in it.
91     let (mut chunk, mut chunk_byte_idx, mut chunk_char_idx, _) = slice.chunk_at_byte(byte_idx);
92 
93     // Set up the grapheme cursor.
94     let mut gc = GraphemeCursor::new(byte_idx, slice.len_bytes(), true);
95 
96     // Find the nth next grapheme cluster boundary.
97     for _ in 0..n {
98         loop {
99             match gc.next_boundary(chunk, chunk_byte_idx) {
100                 Ok(None) => return slice.len_chars(),
101                 Ok(Some(n)) => {
102                     byte_idx = n;
103                     break;
104                 }
105                 Err(GraphemeIncomplete::NextChunk) => {
106                     chunk_byte_idx += chunk.len();
107                     let (a, _, c, _) = slice.chunk_at_byte(chunk_byte_idx);
108                     chunk = a;
109                     chunk_char_idx = c;
110                 }
111                 Err(GraphemeIncomplete::PreContext(n)) => {
112                     let ctx_chunk = slice.chunk_at_byte(n - 1).0;
113                     gc.provide_context(ctx_chunk, n - ctx_chunk.len());
114                 }
115                 _ => unreachable!(),
116             }
117         }
118     }
119     let tmp = byte_to_char_idx(chunk, byte_idx - chunk_byte_idx);
120     chunk_char_idx + tmp
121 }
122 
123 /// Finds the next grapheme boundary after the given char position.
124 #[must_use]
125 #[inline(always)]
next_grapheme_boundary(slice: RopeSlice, char_idx: usize) -> usize126 pub fn next_grapheme_boundary(slice: RopeSlice, char_idx: usize) -> usize {
127     nth_next_grapheme_boundary(slice, char_idx, 1)
128 }
129 
130 /// Returns the passed char index if it's already a grapheme boundary,
131 /// or the next grapheme boundary char index if not.
132 #[must_use]
133 #[inline]
ensure_grapheme_boundary_next(slice: RopeSlice, char_idx: usize) -> usize134 pub fn ensure_grapheme_boundary_next(slice: RopeSlice, char_idx: usize) -> usize {
135     if char_idx == 0 {
136         char_idx
137     } else {
138         next_grapheme_boundary(slice, char_idx - 1)
139     }
140 }
141 
142 /// Returns the passed char index if it's already a grapheme boundary,
143 /// or the prev grapheme boundary char index if not.
144 #[must_use]
145 #[inline]
ensure_grapheme_boundary_prev(slice: RopeSlice, char_idx: usize) -> usize146 pub fn ensure_grapheme_boundary_prev(slice: RopeSlice, char_idx: usize) -> usize {
147     if char_idx == slice.len_chars() {
148         char_idx
149     } else {
150         prev_grapheme_boundary(slice, char_idx + 1)
151     }
152 }
153 
154 /// Returns whether the given char position is a grapheme boundary.
155 #[must_use]
is_grapheme_boundary(slice: RopeSlice, char_idx: usize) -> bool156 pub fn is_grapheme_boundary(slice: RopeSlice, char_idx: usize) -> bool {
157     // Bounds check
158     debug_assert!(char_idx <= slice.len_chars());
159 
160     // We work with bytes for this, so convert.
161     let byte_idx = slice.char_to_byte(char_idx);
162 
163     // Get the chunk with our byte index in it.
164     let (chunk, chunk_byte_idx, _, _) = slice.chunk_at_byte(byte_idx);
165 
166     // Set up the grapheme cursor.
167     let mut gc = GraphemeCursor::new(byte_idx, slice.len_bytes(), true);
168 
169     // Determine if the given position is a grapheme cluster boundary.
170     loop {
171         match gc.is_boundary(chunk, chunk_byte_idx) {
172             Ok(n) => return n,
173             Err(GraphemeIncomplete::PreContext(n)) => {
174                 let (ctx_chunk, ctx_byte_start, _, _) = slice.chunk_at_byte(n - 1);
175                 gc.provide_context(ctx_chunk, ctx_byte_start);
176             }
177             Err(_) => unreachable!(),
178         }
179     }
180 }
181 
182 /// An iterator over the graphemes of a `RopeSlice`.
183 #[derive(Clone)]
184 pub struct RopeGraphemes<'a> {
185     text: RopeSlice<'a>,
186     chunks: Chunks<'a>,
187     cur_chunk: &'a str,
188     cur_chunk_start: usize,
189     cursor: GraphemeCursor,
190 }
191 
192 impl<'a> fmt::Debug for RopeGraphemes<'a> {
fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result193     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
194         f.debug_struct("RopeGraphemes")
195             .field("text", &self.text)
196             .field("chunks", &self.chunks)
197             .field("cur_chunk", &self.cur_chunk)
198             .field("cur_chunk_start", &self.cur_chunk_start)
199             // .field("cursor", &self.cursor)
200             .finish()
201     }
202 }
203 
204 impl<'a> RopeGraphemes<'a> {
205     #[must_use]
new(slice: RopeSlice) -> RopeGraphemes206     pub fn new(slice: RopeSlice) -> RopeGraphemes {
207         let mut chunks = slice.chunks();
208         let first_chunk = chunks.next().unwrap_or("");
209         RopeGraphemes {
210             text: slice,
211             chunks,
212             cur_chunk: first_chunk,
213             cur_chunk_start: 0,
214             cursor: GraphemeCursor::new(0, slice.len_bytes(), true),
215         }
216     }
217 }
218 
219 impl<'a> Iterator for RopeGraphemes<'a> {
220     type Item = RopeSlice<'a>;
221 
next(&mut self) -> Option<RopeSlice<'a>>222     fn next(&mut self) -> Option<RopeSlice<'a>> {
223         let a = self.cursor.cur_cursor();
224         let b;
225         loop {
226             match self
227                 .cursor
228                 .next_boundary(self.cur_chunk, self.cur_chunk_start)
229             {
230                 Ok(None) => {
231                     return None;
232                 }
233                 Ok(Some(n)) => {
234                     b = n;
235                     break;
236                 }
237                 Err(GraphemeIncomplete::NextChunk) => {
238                     self.cur_chunk_start += self.cur_chunk.len();
239                     self.cur_chunk = self.chunks.next().unwrap_or("");
240                 }
241                 Err(GraphemeIncomplete::PreContext(idx)) => {
242                     let (chunk, byte_idx, _, _) = self.text.chunk_at_byte(idx.saturating_sub(1));
243                     self.cursor.provide_context(chunk, byte_idx);
244                 }
245                 _ => unreachable!(),
246             }
247         }
248 
249         if a < self.cur_chunk_start {
250             let a_char = self.text.byte_to_char(a);
251             let b_char = self.text.byte_to_char(b);
252 
253             Some(self.text.slice(a_char..b_char))
254         } else {
255             let a2 = a - self.cur_chunk_start;
256             let b2 = b - self.cur_chunk_start;
257             Some((&self.cur_chunk[a2..b2]).into())
258         }
259     }
260 }
261