1 //! Functionality for finding words.
2 //!
3 //! In order to wrap text, we need to know where the legal break
4 //! points are, i.e., where the words of the text are. This means that
5 //! we need to define what a "word" is.
6 //!
7 //! A simple approach is to simply split the text on whitespace, but
8 //! this does not work for East-Asian languages such as Chinese or
9 //! Japanese where there are no spaces between words. Breaking a long
//! sequence of emojis is another example where line breaks might be
//! wanted even if there is no whitespace to be found.
12 //!
//! The [`WordSeparator`] trait is responsible for determining where
//! the words are in a line of text. Please refer to the trait and
//! the structs which implement it for more information.
16 
17 #[cfg(feature = "unicode-linebreak")]
18 use crate::core::skip_ansi_escape_sequence;
19 use crate::core::Word;
20 
/// Describes where words occur in a line of text.
///
/// The simplest approach is to say that words are separated by one or
/// more ASCII spaces (`' '`). This works for Western languages
/// without emojis. A more complex approach is to use the Unicode line
/// breaking algorithm, which finds break points in non-ASCII text.
///
/// The line breaks occur between words, please see the
/// [`WordSplitter`](crate::word_splitters::WordSplitter) trait for
/// options of how to handle hyphenation of individual words.
///
/// Implementors must also implement [`std::fmt::Debug`] and
/// `WordSeparatorClone`; the latter is what makes it possible to
/// clone a `Box<dyn WordSeparator>`.
///
/// # Examples
///
/// ```
/// use textwrap::core::Word;
/// use textwrap::word_separators::{WordSeparator, AsciiSpace};
///
/// let words = AsciiSpace.find_words("Hello World!").collect::<Vec<_>>();
/// assert_eq!(words, vec![Word::from("Hello "), Word::from("World!")]);
/// ```
pub trait WordSeparator: WordSeparatorClone + std::fmt::Debug {
    // This trait should really return impl Iterator<Item = Word>, but
    // this isn't possible until Rust supports higher-kinded types:
    // https://github.com/rust-lang/rfcs/blob/master/text/1522-conservative-impl-trait.md
    /// Find all words in `line`.
    ///
    /// The returned iterator yields [`Word`]s which borrow from
    /// `line`, so it cannot outlive the input string.
    fn find_words<'a>(&self, line: &'a str) -> Box<dyn Iterator<Item = Word<'a>> + 'a>;
}
48 
// The internal `WordSeparatorClone` trait allows us to implement
// `Clone` for `Box<dyn WordSeparator>`. This is used in the
// `From<&Options<'_, WrapAlgo, WordSep, WordSplit>> for Options<'a,
// WrapAlgo, WordSep, WordSplit>` implementation.
#[doc(hidden)]
pub trait WordSeparatorClone {
    /// Clone `self` into a new boxed `WordSeparator` trait object.
    fn clone_box(&self) -> Box<dyn WordSeparator>;
}
57 
58 impl<T: WordSeparator + Clone + 'static> WordSeparatorClone for T {
clone_box(&self) -> Box<dyn WordSeparator>59     fn clone_box(&self) -> Box<dyn WordSeparator> {
60         Box::new(self.clone())
61     }
62 }
63 
64 impl Clone for Box<dyn WordSeparator> {
clone(&self) -> Box<dyn WordSeparator>65     fn clone(&self) -> Box<dyn WordSeparator> {
66         use std::ops::Deref;
67         self.deref().clone_box()
68     }
69 }
70 
71 impl WordSeparator for Box<dyn WordSeparator> {
find_words<'a>(&self, line: &'a str) -> Box<dyn Iterator<Item = Word<'a>> + 'a>72     fn find_words<'a>(&self, line: &'a str) -> Box<dyn Iterator<Item = Word<'a>> + 'a> {
73         use std::ops::Deref;
74         self.deref().find_words(line)
75     }
76 }
77 
/// Find words by splitting on regions of `' '` characters.
///
/// Only the ASCII space character (`' '`) is treated as a separator.
/// A run of spaces following a word is kept as part of that word.
#[derive(Clone, Copy, Debug, Default)]
pub struct AsciiSpace;
81 
82 /// Split `line` into words separated by regions of `' '` characters.
83 ///
84 /// # Examples
85 ///
86 /// ```
87 /// use textwrap::core::Word;
88 /// use textwrap::word_separators::{AsciiSpace, WordSeparator};
89 ///
90 /// let words = AsciiSpace.find_words("Hello   World!").collect::<Vec<_>>();
91 /// assert_eq!(words, vec![Word::from("Hello   "),
92 ///                        Word::from("World!")]);
93 /// ```
94 impl WordSeparator for AsciiSpace {
find_words<'a>(&self, line: &'a str) -> Box<dyn Iterator<Item = Word<'a>> + 'a>95     fn find_words<'a>(&self, line: &'a str) -> Box<dyn Iterator<Item = Word<'a>> + 'a> {
96         let mut start = 0;
97         let mut in_whitespace = false;
98         let mut char_indices = line.char_indices();
99 
100         Box::new(std::iter::from_fn(move || {
101             // for (idx, ch) in char_indices does not work, gives this
102             // error:
103             //
104             // > cannot move out of `char_indices`, a captured variable in
105             // > an `FnMut` closure
106             #[allow(clippy::while_let_on_iterator)]
107             while let Some((idx, ch)) = char_indices.next() {
108                 if in_whitespace && ch != ' ' {
109                     let word = Word::from(&line[start..idx]);
110                     start = idx;
111                     in_whitespace = ch == ' ';
112                     return Some(word);
113                 }
114 
115                 in_whitespace = ch == ' ';
116             }
117 
118             if start < line.len() {
119                 let word = Word::from(&line[start..]);
120                 start = line.len();
121                 return Some(word);
122             }
123 
124             None
125         }))
126     }
127 }
128 
/// Find words using the Unicode line breaking algorithm.
///
/// This type is only available when the `unicode-linebreak` Cargo
/// feature is enabled.
#[cfg(feature = "unicode-linebreak")]
#[derive(Clone, Copy, Debug, Default)]
pub struct UnicodeBreakProperties;
133 
134 /// Split `line` into words using Unicode break properties.
135 ///
136 /// This word separator uses the Unicode line breaking algorithm
137 /// described in [Unicode Standard Annex
138 /// #14](https://www.unicode.org/reports/tr14/) to find legal places
139 /// to break lines. There is a small difference in that the U+002D
140 /// (Hyphen-Minus) and U+00AD (Soft Hyphen) don’t create a line break:
141 /// to allow a line break at a hyphen, use the
142 /// [`HyphenSplitter`](crate::word_splitters::HyphenSplitter). Soft
143 /// hyphens are not currently supported.
144 ///
145 /// # Examples
146 ///
147 /// Unlike [`AsciiSpace`], the Unicode line breaking algorithm will
148 /// find line break opportunities between some characters with no
149 /// intervening whitespace:
150 ///
151 /// ```
152 /// #[cfg(feature = "unicode-linebreak")] {
153 /// use textwrap::word_separators::{WordSeparator, UnicodeBreakProperties};
154 /// use textwrap::core::Word;
155 ///
/// assert_eq!(UnicodeBreakProperties.find_words("Emojis: 😂😍").collect::<Vec<_>>(),
///            vec![Word::from("Emojis: "),
///                 Word::from("😂"),
///                 Word::from("😍")]);
160 ///
161 /// assert_eq!(UnicodeBreakProperties.find_words("CJK: 你好").collect::<Vec<_>>(),
162 ///            vec![Word::from("CJK: "),
163 ///                 Word::from("你"),
164 ///                 Word::from("好")]);
165 /// }
166 /// ```
167 ///
168 /// A U+2060 (Word Joiner) character can be inserted if you want to
169 /// manually override the defaults and keep the characters together:
170 ///
171 /// ```
172 /// #[cfg(feature = "unicode-linebreak")] {
173 /// use textwrap::word_separators::{UnicodeBreakProperties, WordSeparator};
174 /// use textwrap::core::Word;
175 ///
/// assert_eq!(UnicodeBreakProperties.find_words("Emojis: 😂\u{2060}😍").collect::<Vec<_>>(),
///            vec![Word::from("Emojis: "),
///                 Word::from("😂\u{2060}😍")]);
179 /// }
180 /// ```
181 ///
/// The Unicode line breaking algorithm will also automatically
/// suppress line breaks around certain punctuation characters:
184 ///
185 /// ```
186 /// #[cfg(feature = "unicode-linebreak")] {
187 /// use textwrap::word_separators::{UnicodeBreakProperties, WordSeparator};
188 /// use textwrap::core::Word;
189 ///
190 /// assert_eq!(UnicodeBreakProperties.find_words("[ foo ] bar !").collect::<Vec<_>>(),
191 ///            vec![Word::from("[ foo ] "),
192 ///                 Word::from("bar !")]);
193 /// }
194 /// ```
#[cfg(feature = "unicode-linebreak")]
impl WordSeparator for UnicodeBreakProperties {
    /// Find all words in `line` using Unicode line break opportunities.
    ///
    /// Break opportunities are computed on a copy of `line` with ANSI
    /// escape sequences removed and are then mapped back to byte
    /// offsets in `line`, so the returned words are slices of the
    /// original text. Break opportunities directly after `'-'` or a
    /// soft hyphen are suppressed.
    fn find_words<'a>(&self, line: &'a str) -> Box<dyn Iterator<Item = Word<'a>> + 'a> {
        // Construct an iterator over (original index, stripped index)
        // tuples. We find the Unicode linebreaks on a stripped string,
        // but we need the original indices so we can form words based on
        // the original string.
        let mut last_stripped_idx = 0;
        let mut char_indices = line.char_indices();
        let mut idx_map = std::iter::from_fn(move || {
            let (orig_idx, ch) = char_indices.next()?;
            let stripped_idx = last_stripped_idx;
            // Characters reported as part of an ANSI escape sequence
            // do not advance the index into the stripped string.
            if !skip_ansi_escape_sequence(ch, &mut char_indices.by_ref().map(|(_, ch)| ch)) {
                last_stripped_idx += ch.len_utf8();
            }
            Some((orig_idx, stripped_idx))
        });

        let stripped = strip_ansi_escape_sequences(line);
        // The opportunities are collected eagerly since the filter
        // borrows the local `stripped` string, which cannot be moved
        // into the returned iterator while it is borrowed.
        let mut opportunities = unicode_linebreak::linebreaks(&stripped)
            .filter(|&(idx, _)| {
                // Drop the break at the end of the text: the final
                // word is added below using &line[start..], which
                // ensures that we correctly include a trailing ANSI
                // escape sequence. This must be decided by index and
                // not by position in the filtered list: if the text
                // ends with a hyphen, the hyphen rule below already
                // removes the final break, and dropping "the last
                // remaining break" would then remove a real break
                // between two words.
                if idx >= stripped.len() {
                    return false;
                }

                #[allow(clippy::match_like_matches_macro)]
                match stripped[..idx].chars().next_back() {
                    // We suppress breaks at ‘-’ since we want to control
                    // this via the WordSplitter.
                    Some('-') => false,
                    // Soft hyphens are currently not supported since we
                    // require all `Word` fragments to be continuous in
                    // the input string.
                    Some(SHY) => false,
                    // Other breaks should be fine!
                    _ => true,
                }
            })
            .collect::<Vec<_>>()
            .into_iter();

        let mut start = 0;
        Box::new(std::iter::from_fn(move || {
            // Iterate through `by_ref()` so that the closure keeps
            // ownership of the iterator between calls.
            for (idx, _) in opportunities.by_ref() {
                // Translate the break position in the stripped string
                // back to the matching position in the original line.
                if let Some((orig_idx, _)) = idx_map.find(|&(_, stripped_idx)| stripped_idx == idx)
                {
                    let word = Word::from(&line[start..orig_idx]);
                    start = orig_idx;
                    return Some(word);
                }
            }

            // Emit the remainder of the line (including any trailing
            // ANSI escape sequence) exactly once.
            if start < line.len() {
                let word = Word::from(&line[start..]);
                start = line.len();
                return Some(word);
            }

            None
        }))
    }
}
261 
/// Soft hyphen, also known as a “shy hyphen”. Should show up as ‘-’
/// if a line is broken at this point, and otherwise be invisible.
/// Textwrap does not currently support breaking words at soft
/// hyphens.
#[cfg(feature = "unicode-linebreak")]
const SHY: char = '\u{00ad}';
268 
/// Return a copy of `text` with all ANSI escape sequences removed.
#[cfg(feature = "unicode-linebreak")]
fn strip_ansi_escape_sequences(text: &str) -> String {
    // The stripped text can never be longer than the input.
    let mut stripped = String::with_capacity(text.len());

    let mut remaining = text.chars();
    while let Some(ch) = remaining.next() {
        // Keep only the characters which are not reported as part of
        // an escape sequence.
        if !skip_ansi_escape_sequence(ch, &mut remaining) {
            stripped.push(ch);
        }
    }

    stripped
}
284 
// Unit tests for the word separators defined in this module.
#[cfg(test)]
mod tests {
    use super::*;

    // Like assert_eq!, but the left expression is an iterator.
    macro_rules! assert_iter_eq {
        ($left:expr, $right:expr) => {
            assert_eq!($left.collect::<Vec<_>>(), $right);
        };
    }

    // An empty line contains no words at all.
    #[test]
    fn ascii_space_empty() {
        assert_iter_eq!(AsciiSpace.find_words(""), vec![]);
    }

    #[test]
    fn ascii_space_single_word() {
        assert_iter_eq!(AsciiSpace.find_words("foo"), vec![Word::from("foo")]);
    }

    // The separating space stays attached to the preceding word.
    #[test]
    fn ascii_space_two_words() {
        assert_iter_eq!(
            AsciiSpace.find_words("foo bar"),
            vec![Word::from("foo "), Word::from("bar")]
        );
    }

    #[test]
    fn ascii_space_multiple_words() {
        assert_iter_eq!(
            AsciiSpace.find_words("foo bar baz"),
            vec![Word::from("foo "), Word::from("bar "), Word::from("baz")]
        );
    }

    // A line consisting only of spaces is a single word.
    #[test]
    fn ascii_space_only_whitespace() {
        assert_iter_eq!(AsciiSpace.find_words("    "), vec![Word::from("    ")]);
    }

    // A run of several spaces is kept together with the preceding word.
    #[test]
    fn ascii_space_inter_word_whitespace() {
        assert_iter_eq!(
            AsciiSpace.find_words("foo   bar"),
            vec![Word::from("foo   "), Word::from("bar")]
        )
    }

    #[test]
    fn ascii_space_trailing_whitespace() {
        assert_iter_eq!(AsciiSpace.find_words("foo   "), vec![Word::from("foo   ")]);
    }

    // Leading spaces form a word of their own.
    #[test]
    fn ascii_space_leading_whitespace() {
        assert_iter_eq!(
            AsciiSpace.find_words("   foo"),
            vec![Word::from("   "), Word::from("foo")]
        );
    }

    #[test]
    fn ascii_space_multi_column_char() {
        assert_iter_eq!(
            AsciiSpace.find_words("\u{1f920}"), // cowboy emoji 🤠
            vec![Word::from("\u{1f920}")]
        );
    }

    // Hyphens are not break points for `AsciiSpace`, only spaces are.
    #[test]
    fn ascii_space_hyphens() {
        assert_iter_eq!(
            AsciiSpace.find_words("foo-bar"),
            vec![Word::from("foo-bar")]
        );
        assert_iter_eq!(
            AsciiSpace.find_words("foo- bar"),
            vec![Word::from("foo- "), Word::from("bar")]
        );
        assert_iter_eq!(
            AsciiSpace.find_words("foo - bar"),
            vec![Word::from("foo "), Word::from("- "), Word::from("bar")]
        );
        assert_iter_eq!(
            AsciiSpace.find_words("foo -bar"),
            vec![Word::from("foo "), Word::from("-bar")]
        );
    }

    // Color codes must stay attached to the words they surround,
    // both for `AsciiSpace` and for `UnicodeBreakProperties`.
    #[test]
    #[cfg(unix)]
    fn ascii_space_colored_text() {
        use termion::color::{Blue, Fg, Green, Reset};

        let green_hello = format!("{}Hello{} ", Fg(Green), Fg(Reset));
        let blue_world = format!("{}World!{}", Fg(Blue), Fg(Reset));
        assert_iter_eq!(
            AsciiSpace.find_words(&format!("{}{}", green_hello, blue_world)),
            vec![Word::from(&green_hello), Word::from(&blue_world)]
        );

        #[cfg(feature = "unicode-linebreak")]
        assert_iter_eq!(
            UnicodeBreakProperties.find_words(&format!("{}{}", green_hello, blue_world)),
            vec![Word::from(&green_hello), Word::from(&blue_world)]
        );
    }

    // Escape sequences in the middle of a word must not split it.
    #[test]
    fn ascii_space_color_inside_word() {
        let text = "foo\u{1b}[0m\u{1b}[32mbar\u{1b}[0mbaz";
        assert_iter_eq!(AsciiSpace.find_words(&text), vec![Word::from(text)]);

        #[cfg(feature = "unicode-linebreak")]
        assert_iter_eq!(
            UnicodeBreakProperties.find_words(&text),
            vec![Word::from(text)]
        );
    }
}
407