1 //! Functionality for finding words.
2 //!
3 //! In order to wrap text, we need to know where the legal break
4 //! points are, i.e., where the words of the text are. This means that
5 //! we need to define what a "word" is.
6 //!
7 //! A simple approach is to simply split the text on whitespace, but
8 //! this does not work for East-Asian languages such as Chinese or
9 //! Japanese where there are no spaces between words. Breaking a long
10 //! sequence of emojis is another example where line breaks might be
//! wanted even if there is no whitespace to be found.
12 //!
13 //! The [`WordSeparator`] trait is responsible for determining where
//! the words are in a line of text. Please refer to the trait and
15 //! the structs which implement it for more information.
16
17 #[cfg(feature = "unicode-linebreak")]
18 use crate::core::skip_ansi_escape_sequence;
19 use crate::core::Word;
20
21 /// Describes where words occur in a line of text.
22 ///
23 /// The simplest approach is say that words are separated by one or
24 /// more ASCII spaces (`' '`). This works for Western languages
25 /// without emojis. A more complex approach is to use the Unicode line
26 /// breaking algorithm, which finds break points in non-ASCII text.
27 ///
28 /// The line breaks occur between words, please see the
29 /// [`WordSplitter`](crate::word_splitters::WordSplitter) trait for
30 /// options of how to handle hyphenation of individual words.
31 ///
32 /// # Examples
33 ///
34 /// ```
35 /// use textwrap::core::Word;
36 /// use textwrap::word_separators::{WordSeparator, AsciiSpace};
37 ///
38 /// let words = AsciiSpace.find_words("Hello World!").collect::<Vec<_>>();
39 /// assert_eq!(words, vec![Word::from("Hello "), Word::from("World!")]);
40 /// ```
41 pub trait WordSeparator: WordSeparatorClone + std::fmt::Debug {
42 // This trait should really return impl Iterator<Item = Word>, but
43 // this isn't possible until Rust supports higher-kinded types:
44 // https://github.com/rust-lang/rfcs/blob/master/text/1522-conservative-impl-trait.md
45 /// Find all words in `line`.
find_words<'a>(&self, line: &'a str) -> Box<dyn Iterator<Item = Word<'a>> + 'a>46 fn find_words<'a>(&self, line: &'a str) -> Box<dyn Iterator<Item = Word<'a>> + 'a>;
47 }
48
49 // The internal `WordSeparatorClone` trait is allows us to implement
50 // `Clone` for `Box<dyn WordSeparator>`. This in used in the
51 // `From<&Options<'_, WrapAlgo, WordSep, WordSplit>> for Options<'a,
52 // WrapAlgo, WordSep, WordSplit>` implementation.
53 #[doc(hidden)]
54 pub trait WordSeparatorClone {
clone_box(&self) -> Box<dyn WordSeparator>55 fn clone_box(&self) -> Box<dyn WordSeparator>;
56 }
57
58 impl<T: WordSeparator + Clone + 'static> WordSeparatorClone for T {
clone_box(&self) -> Box<dyn WordSeparator>59 fn clone_box(&self) -> Box<dyn WordSeparator> {
60 Box::new(self.clone())
61 }
62 }
63
64 impl Clone for Box<dyn WordSeparator> {
clone(&self) -> Box<dyn WordSeparator>65 fn clone(&self) -> Box<dyn WordSeparator> {
66 use std::ops::Deref;
67 self.deref().clone_box()
68 }
69 }
70
71 impl WordSeparator for Box<dyn WordSeparator> {
find_words<'a>(&self, line: &'a str) -> Box<dyn Iterator<Item = Word<'a>> + 'a>72 fn find_words<'a>(&self, line: &'a str) -> Box<dyn Iterator<Item = Word<'a>> + 'a> {
73 use std::ops::Deref;
74 self.deref().find_words(line)
75 }
76 }
77
78 /// Find words by splitting on regions of `' '` characters.
79 #[derive(Clone, Copy, Debug, Default)]
80 pub struct AsciiSpace;
81
82 /// Split `line` into words separated by regions of `' '` characters.
83 ///
84 /// # Examples
85 ///
86 /// ```
87 /// use textwrap::core::Word;
88 /// use textwrap::word_separators::{AsciiSpace, WordSeparator};
89 ///
90 /// let words = AsciiSpace.find_words("Hello World!").collect::<Vec<_>>();
91 /// assert_eq!(words, vec![Word::from("Hello "),
92 /// Word::from("World!")]);
93 /// ```
94 impl WordSeparator for AsciiSpace {
find_words<'a>(&self, line: &'a str) -> Box<dyn Iterator<Item = Word<'a>> + 'a>95 fn find_words<'a>(&self, line: &'a str) -> Box<dyn Iterator<Item = Word<'a>> + 'a> {
96 let mut start = 0;
97 let mut in_whitespace = false;
98 let mut char_indices = line.char_indices();
99
100 Box::new(std::iter::from_fn(move || {
101 // for (idx, ch) in char_indices does not work, gives this
102 // error:
103 //
104 // > cannot move out of `char_indices`, a captured variable in
105 // > an `FnMut` closure
106 #[allow(clippy::while_let_on_iterator)]
107 while let Some((idx, ch)) = char_indices.next() {
108 if in_whitespace && ch != ' ' {
109 let word = Word::from(&line[start..idx]);
110 start = idx;
111 in_whitespace = ch == ' ';
112 return Some(word);
113 }
114
115 in_whitespace = ch == ' ';
116 }
117
118 if start < line.len() {
119 let word = Word::from(&line[start..]);
120 start = line.len();
121 return Some(word);
122 }
123
124 None
125 }))
126 }
127 }
128
/// Find words using the Unicode line breaking algorithm.
#[cfg(feature = "unicode-linebreak")]
#[derive(Clone, Copy, Debug, Default)]
pub struct UnicodeBreakProperties;

/// Split `line` into words using Unicode break properties.
///
/// This word separator uses the Unicode line breaking algorithm
/// described in [Unicode Standard Annex
/// #14](https://www.unicode.org/reports/tr14/) to find legal places
/// to break lines. There is a small difference in that the U+002D
/// (Hyphen-Minus) and U+00AD (Soft Hyphen) don’t create a line break:
/// to allow a line break at a hyphen, use the
/// [`HyphenSplitter`](crate::word_splitters::HyphenSplitter). Soft
/// hyphens are not currently supported.
///
/// # Examples
///
/// Unlike [`AsciiSpace`], the Unicode line breaking algorithm will
/// find line break opportunities between some characters with no
/// intervening whitespace:
///
/// ```
/// #[cfg(feature = "unicode-linebreak")] {
/// use textwrap::word_separators::{WordSeparator, UnicodeBreakProperties};
/// use textwrap::core::Word;
///
/// // U+1F602 and U+1F60D are two emojis.
/// assert_eq!(UnicodeBreakProperties.find_words("Emojis: \u{1f602}\u{1f60d}").collect::<Vec<_>>(),
///            vec![Word::from("Emojis: "),
///                 Word::from("\u{1f602}"),
///                 Word::from("\u{1f60d}")]);
///
/// assert_eq!(UnicodeBreakProperties.find_words("CJK: 你好").collect::<Vec<_>>(),
///            vec![Word::from("CJK: "),
///                 Word::from("你"),
///                 Word::from("好")]);
/// }
/// ```
///
/// A U+2060 (Word Joiner) character can be inserted if you want to
/// manually override the defaults and keep the characters together:
///
/// ```
/// #[cfg(feature = "unicode-linebreak")] {
/// use textwrap::word_separators::{UnicodeBreakProperties, WordSeparator};
/// use textwrap::core::Word;
///
/// assert_eq!(UnicodeBreakProperties.find_words("Emojis: \u{1f602}\u{2060}\u{1f60d}").collect::<Vec<_>>(),
///            vec![Word::from("Emojis: "),
///                 Word::from("\u{1f602}\u{2060}\u{1f60d}")]);
/// }
/// ```
///
/// The Unicode line breaking algorithm will also automatically
/// suppress line breaks around certain punctuation characters:
///
/// ```
/// #[cfg(feature = "unicode-linebreak")] {
/// use textwrap::word_separators::{UnicodeBreakProperties, WordSeparator};
/// use textwrap::core::Word;
///
/// assert_eq!(UnicodeBreakProperties.find_words("[ foo ] bar !").collect::<Vec<_>>(),
///            vec![Word::from("[ foo ] "),
///                 Word::from("bar !")]);
/// }
/// ```
#[cfg(feature = "unicode-linebreak")]
impl WordSeparator for UnicodeBreakProperties {
    fn find_words<'a>(&self, line: &'a str) -> Box<dyn Iterator<Item = Word<'a>> + 'a> {
        // Construct an iterator over (original index, stripped index)
        // tuples. We find the Unicode linebreaks on a stripped string,
        // but we need the original indices so we can form words based on
        // the original string.
        let mut last_stripped_idx = 0;
        let mut char_indices = line.char_indices();
        let mut idx_map = std::iter::from_fn(move || match char_indices.next() {
            Some((orig_idx, ch)) => {
                let stripped_idx = last_stripped_idx;
                // The helper reads from the same underlying iterator, so
                // any characters it consumes are not yielded by this map.
                // Presumably it returns `true` when `ch` started an ANSI
                // escape sequence; only otherwise does `ch` count towards
                // the stripped index.
                if !skip_ansi_escape_sequence(ch, &mut char_indices.by_ref().map(|(_, ch)| ch)) {
                    last_stripped_idx += ch.len_utf8();
                }
                Some((orig_idx, stripped_idx))
            }
            None => None,
        });

        let stripped = strip_ansi_escape_sequences(line);
        // The opportunities are collected into a `Vec` so that the last
        // one can be removed with `next_back` below.
        let mut opportunities = unicode_linebreak::linebreaks(&stripped)
            .filter(|(idx, _)| {
                // NOTE(review): the lint is allowed rather than switching
                // to `matches!`, presumably for compatibility with older
                // compilers; confirm before changing.
                #[allow(clippy::match_like_matches_macro)]
                match stripped[..*idx].chars().next_back() {
                    // We suppress breaks at ‘-’ since we want to control
                    // this via the WordSplitter.
                    Some('-') => false,
                    // Soft hyphens are currently not supported since we
                    // require all `Word` fragments to be continuous in
                    // the input string.
                    Some(SHY) => false,
                    // Other breaks should be fine!
                    _ => true,
                }
            })
            .collect::<Vec<_>>()
            .into_iter();

        // Remove final break opportunity, we will add it below using
        // &line[start..]; This ensures that we correctly include a
        // trailing ANSI escape sequence.
        opportunities.next_back();

        let mut start = 0;
        Box::new(std::iter::from_fn(move || {
            #[allow(clippy::while_let_on_iterator)]
            while let Some((idx, _)) = opportunities.next() {
                // Advance the index map until it reaches the break
                // opportunity. The map is stateful, so each search
                // resumes where the previous one ended.
                if let Some((orig_idx, _)) = idx_map.find(|&(_, stripped_idx)| stripped_idx == idx)
                {
                    let word = Word::from(&line[start..orig_idx]);
                    start = orig_idx;
                    return Some(word);
                }
            }

            // Emit the remainder of the line (if any) as the last word.
            if start < line.len() {
                let word = Word::from(&line[start..]);
                start = line.len();
                return Some(word);
            }

            None
        }))
    }
}
261
/// Soft hyphen (U+00AD), also known as a “shy hyphen”. Should show up
/// as ‘-’ if a line is broken at this point, and otherwise be
/// invisible. Textwrap does not currently support breaking words at
/// soft hyphens.
#[cfg(feature = "unicode-linebreak")]
const SHY: char = '\u{00AD}';
268
/// Return a copy of `text` with all ANSI escape sequences removed.
///
/// Recognizing (and consuming) an escape sequence is delegated to
/// `skip_ansi_escape_sequence`; every character for which it returns
/// `false` is copied to the output.
#[cfg(feature = "unicode-linebreak")]
fn strip_ansi_escape_sequences(text: &str) -> String {
    let mut remaining = text.chars();
    // The stripped text can never be longer than the input.
    let mut visible = String::with_capacity(text.len());

    loop {
        match remaining.next() {
            None => return visible,
            Some(ch) => {
                // The helper is given the iterator itself, so whatever
                // it consumes is skipped here as well.
                if !skip_ansi_escape_sequence(ch, &mut remaining) {
                    visible.push(ch);
                }
            }
        }
    }
}
284
#[cfg(test)]
mod tests {
    use super::*;

    // Like assert_eq!, but the left expression is an iterator.
    //
    // The expansion is a statement ending in `;`, so every invocation
    // must itself be terminated with `;`. Using the macro as the tail
    // expression of a block triggers the
    // `semicolon_in_expressions_from_macros` lint.
    macro_rules! assert_iter_eq {
        ($left:expr, $right:expr) => {
            assert_eq!($left.collect::<Vec<_>>(), $right);
        };
    }

    #[test]
    fn ascii_space_empty() {
        assert_iter_eq!(AsciiSpace.find_words(""), vec![]);
    }

    #[test]
    fn ascii_space_single_word() {
        assert_iter_eq!(AsciiSpace.find_words("foo"), vec![Word::from("foo")]);
    }

    #[test]
    fn ascii_space_two_words() {
        assert_iter_eq!(
            AsciiSpace.find_words("foo bar"),
            vec![Word::from("foo "), Word::from("bar")]
        );
    }

    #[test]
    fn ascii_space_multiple_words() {
        assert_iter_eq!(
            AsciiSpace.find_words("foo bar baz"),
            vec![Word::from("foo "), Word::from("bar "), Word::from("baz")]
        );
    }

    #[test]
    fn ascii_space_only_whitespace() {
        assert_iter_eq!(AsciiSpace.find_words(" "), vec![Word::from(" ")]);
    }

    #[test]
    fn ascii_space_inter_word_whitespace() {
        assert_iter_eq!(
            AsciiSpace.find_words("foo bar"),
            vec![Word::from("foo "), Word::from("bar")]
        );
    }

    #[test]
    fn ascii_space_trailing_whitespace() {
        assert_iter_eq!(AsciiSpace.find_words("foo "), vec![Word::from("foo ")]);
    }

    #[test]
    fn ascii_space_leading_whitespace() {
        assert_iter_eq!(
            AsciiSpace.find_words(" foo"),
            vec![Word::from(" "), Word::from("foo")]
        );
    }

    #[test]
    fn ascii_space_multi_column_char() {
        assert_iter_eq!(
            AsciiSpace.find_words("\u{1f920}"), // cowboy emoji
            vec![Word::from("\u{1f920}")]
        );
    }

    #[test]
    fn ascii_space_hyphens() {
        // `AsciiSpace` only looks at spaces: a hyphen never separates
        // words on its own.
        assert_iter_eq!(
            AsciiSpace.find_words("foo-bar"),
            vec![Word::from("foo-bar")]
        );
        assert_iter_eq!(
            AsciiSpace.find_words("foo- bar"),
            vec![Word::from("foo- "), Word::from("bar")]
        );
        assert_iter_eq!(
            AsciiSpace.find_words("foo - bar"),
            vec![Word::from("foo "), Word::from("- "), Word::from("bar")]
        );
        assert_iter_eq!(
            AsciiSpace.find_words("foo -bar"),
            vec![Word::from("foo "), Word::from("-bar")]
        );
    }

    #[test]
    #[cfg(unix)]
    fn ascii_space_colored_text() {
        use termion::color::{Blue, Fg, Green, Reset};

        // The color codes must end up inside the words they surround.
        let green_hello = format!("{}Hello{} ", Fg(Green), Fg(Reset));
        let blue_world = format!("{}World!{}", Fg(Blue), Fg(Reset));
        assert_iter_eq!(
            AsciiSpace.find_words(&format!("{}{}", green_hello, blue_world)),
            vec![Word::from(&green_hello), Word::from(&blue_world)]
        );

        #[cfg(feature = "unicode-linebreak")]
        assert_iter_eq!(
            UnicodeBreakProperties.find_words(&format!("{}{}", green_hello, blue_world)),
            vec![Word::from(&green_hello), Word::from(&blue_world)]
        );
    }

    #[test]
    fn ascii_space_color_inside_word() {
        // Escape sequences in the middle of a word must not split it.
        let text = "foo\u{1b}[0m\u{1b}[32mbar\u{1b}[0mbaz";
        assert_iter_eq!(AsciiSpace.find_words(text), vec![Word::from(text)]);

        #[cfg(feature = "unicode-linebreak")]
        assert_iter_eq!(
            UnicodeBreakProperties.find_words(text),
            vec![Word::from(text)]
        );
    }
}
407