1 //! Building blocks for advanced wrapping functionality.
2 //!
3 //! The functions and structs in this module can be used to implement
//! advanced wrapping functionality when the [`wrap`](super::wrap) and
//! [`fill`](super::fill) functions don't do what you want.
6 //!
7 //! In general, you want to follow these steps when wrapping
8 //! something:
9 //!
10 //! 1. Split your input into [`Fragment`]s. These are abstract blocks
11 //!    of text or content which can be wrapped into lines. See
12 //!    [`WordSeparator`](crate::word_separators::WordSeparator) for
13 //!    how to do this for text.
14 //!
15 //! 2. Potentially split your fragments into smaller pieces. This
16 //!    allows you to implement things like hyphenation. If you are
//!    wrapping text represented as a sequence of [`Word`]s, then
//!    [`split_words`](crate::word_splitters::split_words) can help
//!    you do this.
20 //!
21 //! 3. Potentially break apart fragments that are still too large to
22 //!    fit on a single line. This is implemented in [`break_words`].
23 //!
24 //! 4. Finally take your fragments and put them into lines. There are
25 //!    two algorithms for this in the
26 //!    [`wrap_algorithms`](crate::wrap_algorithms) module:
27 //!    [`wrap_optimal_fit`](crate::wrap_algorithms::wrap_optimal_fit)
28 //!    and [`wrap_first_fit`](crate::wrap_algorithms::wrap_first_fit).
29 //!    The former produces better line breaks, the latter is faster.
30 //!
31 //! 5. Iterate through the slices returned by the wrapping functions
32 //!    and construct your lines of output.
33 //!
34 //! Please [open an issue](https://github.com/mgeisler/textwrap/) if
35 //! the functionality here is not sufficient or if you have ideas for
36 //! improving it. We would love to hear from you!
37 
/// The CSI or “Control Sequence Introducer” introduces an ANSI escape
/// sequence. This is typically used for colored text and will be
/// ignored when computing the text width.
const CSI: (char, char) = ('\x1b', '[');
/// The final bytes of an ANSI escape sequence must be in this range.
const ANSI_FINAL_BYTE: std::ops::RangeInclusive<char> = '\x40'..='\x7e';

/// Skip over an ANSI escape sequence if `ch` starts one.
///
/// `ch` is the character that was just read and `chars` yields the
/// characters following it. Returns `true` when a full sequence (the
/// CSI introducer up to and including a final byte) was consumed
/// from `chars`, and `false` otherwise.
///
/// Note that `chars` is advanced whenever `ch` is the escape
/// character: one character is consumed to look for `'['`, and when
/// the introducer matches, characters are consumed until a final
/// byte is found or `chars` is exhausted.
#[inline]
pub(crate) fn skip_ansi_escape_sequence<I: Iterator<Item = char>>(ch: char, chars: &mut I) -> bool {
    let (escape, bracket) = CSI;
    if ch != escape || chars.next() != Some(bracket) {
        // Not the start of an ANSI escape code.
        return false;
    }

    // We are inside an escape code, typically used for colored
    // terminal text. Consume characters up to and including the
    // "final byte" in the range 0x40–0x7E; `any` stops right after it.
    chars.any(|following| ANSI_FINAL_BYTE.contains(&following))
}
62 
/// Width of `ch` in columns as reported by the `unicode-width` crate.
/// Characters for which the crate reports no width count as zero
/// columns.
#[cfg(feature = "unicode-width")]
#[inline]
fn ch_width(ch: char) -> usize {
    use unicode_width::UnicodeWidthChar;

    ch.width().unwrap_or_default()
}

/// First character which [`ch_width`] will classify as double-width.
/// Please see [`display_width`].
#[cfg(not(feature = "unicode-width"))]
const DOUBLE_WIDTH_CUTOFF: char = '\u{1100}';

/// Approximate width of `ch` in columns when the `unicode-width`
/// feature is disabled: characters at or above
/// [`DOUBLE_WIDTH_CUTOFF`] count as two columns, everything below it
/// as one.
#[cfg(not(feature = "unicode-width"))]
#[inline]
fn ch_width(ch: char) -> usize {
    if ch >= DOUBLE_WIDTH_CUTOFF {
        2
    } else {
        1
    }
}
83 
84 /// Compute the display width of `text` while skipping over ANSI
85 /// escape sequences.
86 ///
87 /// # Examples
88 ///
89 /// ```
90 /// use textwrap::core::display_width;
91 ///
92 /// assert_eq!(display_width("Café Plain"), 10);
93 /// assert_eq!(display_width("\u{1b}[31mCafé Rouge\u{1b}[0m"), 10);
94 /// ```
95 ///
96 /// **Note:** When the `unicode-width` Cargo feature is disabled, the
97 /// width of a `char` is determined by a crude approximation which
98 /// simply counts chars below U+1100 as 1 column wide, and all other
99 /// characters as 2 columns wide. With the feature enabled, function
100 /// will correctly deal with [combining characters] in their
101 /// decomposed form (see [Unicode equivalence]).
102 ///
103 /// An example of a decomposed character is “é”, which can be
104 /// decomposed into: “e” followed by a combining acute accent: “◌́”.
105 /// Without the `unicode-width` Cargo feature, every `char` below
106 /// U+1100 has a width of 1. This includes the combining accent:
107 ///
108 /// ```
109 /// use textwrap::core::display_width;
110 ///
111 /// assert_eq!(display_width("Cafe Plain"), 10);
112 /// #[cfg(feature = "unicode-width")]
113 /// assert_eq!(display_width("Cafe\u{301} Plain"), 10);
114 /// #[cfg(not(feature = "unicode-width"))]
115 /// assert_eq!(display_width("Cafe\u{301} Plain"), 11);
116 /// ```
117 ///
118 /// ## Emojis and CJK Characters
119 ///
120 /// Characters such as emojis and [CJK characters] used in the
121 /// Chinese, Japanese, and Korean langauges are seen as double-width,
122 /// even if the `unicode-width` feature is disabled:
123 ///
124 /// ```
125 /// use textwrap::core::display_width;
126 ///
127 /// assert_eq!(display_width("��������✨����������"), 20);
128 /// assert_eq!(display_width("你好"), 4);  // “Nǐ hǎo” or “Hello” in Chinese
129 /// ```
130 ///
131 /// # Limitations
132 ///
133 /// The displayed width of a string cannot always be computed from the
134 /// string alone. This is because the width depends on the rendering
135 /// engine used. This is particularly visible with [emoji modifier
136 /// sequences] where a base emoji is modified with, e.g., skin tone or
137 /// hair color modifiers. It is up to the rendering engine to detect
138 /// this and to produce a suitable emoji.
139 ///
140 /// A simple example is “❤️”, which consists of “❤” (U+2764: Black
141 /// Heart Symbol) followed by U+FE0F (Variation Selector-16). By
142 /// itself, “❤” is a black heart, but if you follow it with the
143 /// variant selector, you may get a wider red heart.
144 ///
145 /// A more complex example would be “��‍��” which should depict a man
146 /// with red hair. Here the computed width is too large — and the
147 /// width differs depending on the use of the `unicode-width` feature:
148 ///
149 /// ```
150 /// use textwrap::core::display_width;
151 ///
152 /// assert_eq!("��‍��".chars().collect::<Vec<char>>(), ['\u{1f468}', '\u{200d}', '\u{1f9b0}']);
153 /// #[cfg(feature = "unicode-width")]
154 /// assert_eq!(display_width("��‍��"), 4);
155 /// #[cfg(not(feature = "unicode-width"))]
156 /// assert_eq!(display_width("��‍��"), 6);
157 /// ```
158 ///
159 /// This happens because the grapheme consists of three code points:
160 /// “��” (U+1F468: Man), Zero Width Joiner (U+200D), and “��”
161 /// (U+1F9B0: Red Hair). You can see them above in the test. With
162 /// `unicode-width` enabled, the ZWJ is correctly seen as having zero
163 /// width, without it is counted as a double-width character.
164 ///
165 /// ## Terminal Support
166 ///
167 /// Modern browsers typically do a great job at combining characters
168 /// as shown above, but terminals often struggle more. As an example,
169 /// Gnome Terminal version 3.38.1, shows “❤️” as a big red heart, but
170 /// shows "��‍��" as “����”.
171 ///
172 /// [combining characters]: https://en.wikipedia.org/wiki/Combining_character
173 /// [Unicode equivalence]: https://en.wikipedia.org/wiki/Unicode_equivalence
174 /// [CJK characters]: https://en.wikipedia.org/wiki/CJK_characters
175 /// [emoji modifier sequences]: https://unicode.org/emoji/charts/full-emoji-modifiers.html
display_width(text: &str) -> usize176 pub fn display_width(text: &str) -> usize {
177     let mut chars = text.chars();
178     let mut width = 0;
179     while let Some(ch) = chars.next() {
180         if skip_ansi_escape_sequence(ch, &mut chars) {
181             continue;
182         }
183         width += ch_width(ch);
184     }
185     width
186 }
187 
/// A (text) fragment denotes the unit which we wrap into lines.
///
/// Fragments represent an abstract _word_ plus the _whitespace_
/// following the word. In case the word falls at the end of the line,
/// the whitespace is dropped and a so-called _penalty_ is inserted
/// instead (typically `"-"` if the word was hyphenated).
///
/// For wrapping purposes, the precise content of the word, the
/// whitespace, and the penalty is irrelevant. All we need to know is
/// the displayed width of each part, which this trait provides.
///
/// Implementors must also implement [`std::fmt::Debug`].
pub trait Fragment: std::fmt::Debug {
    /// Displayed width of the word represented by this fragment.
    fn width(&self) -> usize;

    /// Displayed width of the whitespace that must follow the word
    /// when the word is not at the end of a line.
    fn whitespace_width(&self) -> usize;

    /// Displayed width of the penalty that must be inserted if the
    /// word falls at the end of a line.
    fn penalty_width(&self) -> usize;
}
210 
/// A piece of wrappable text, including any trailing whitespace.
///
/// A `Word` is an example of a [`Fragment`], so it has a width,
/// trailing whitespace, and potentially a penalty item.
///
/// All string parts borrow from the original input with lifetime
/// `'a`, which makes a `Word` cheap to copy.
#[derive(Debug, Copy, Clone, PartialEq, Eq)]
pub struct Word<'a> {
    /// Word content.
    pub word: &'a str,
    /// Whitespace to insert if the word does not fall at the end of a line.
    pub whitespace: &'a str,
    /// Penalty string to insert if the word falls at the end of a line.
    pub penalty: &'a str,
    // Cached display width of `word` in columns. It is computed when
    // the `Word` is constructed, so that `Fragment::width` is cheap.
    pub(crate) width: usize,
}
226 
/// A `Word` dereferences to its word content only (the whitespace
/// and penalty parts are not included), so `str` methods can be
/// called directly on a `Word`.
impl std::ops::Deref for Word<'_> {
    type Target = str;

    fn deref(&self) -> &Self::Target {
        self.word
    }
}
234 
235 impl<'a> Word<'a> {
236     /// Construct a `Word` from a string.
237     ///
238     /// A trailing stretch of `' '` is automatically taken to be the
239     /// whitespace part of the word.
from(word: &str) -> Word<'_>240     pub fn from(word: &str) -> Word<'_> {
241         let trimmed = word.trim_end_matches(' ');
242         Word {
243             word: trimmed,
244             width: display_width(&trimmed),
245             whitespace: &word[trimmed.len()..],
246             penalty: "",
247         }
248     }
249 
250     /// Break this word into smaller words with a width of at most
251     /// `line_width`. The whitespace and penalty from this `Word` is
252     /// added to the last piece.
253     ///
254     /// # Examples
255     ///
256     /// ```
257     /// use textwrap::core::Word;
258     /// assert_eq!(
259     ///     Word::from("Hello!  ").break_apart(3).collect::<Vec<_>>(),
260     ///     vec![Word::from("Hel"), Word::from("lo!  ")]
261     /// );
262     /// ```
break_apart<'b>(&'b self, line_width: usize) -> impl Iterator<Item = Word<'a>> + 'b263     pub fn break_apart<'b>(&'b self, line_width: usize) -> impl Iterator<Item = Word<'a>> + 'b {
264         let mut char_indices = self.word.char_indices();
265         let mut offset = 0;
266         let mut width = 0;
267 
268         std::iter::from_fn(move || {
269             while let Some((idx, ch)) = char_indices.next() {
270                 if skip_ansi_escape_sequence(ch, &mut char_indices.by_ref().map(|(_, ch)| ch)) {
271                     continue;
272                 }
273 
274                 if width > 0 && width + ch_width(ch) > line_width {
275                     let word = Word {
276                         word: &self.word[offset..idx],
277                         width: width,
278                         whitespace: "",
279                         penalty: "",
280                     };
281                     offset = idx;
282                     width = ch_width(ch);
283                     return Some(word);
284                 }
285 
286                 width += ch_width(ch);
287             }
288 
289             if offset < self.word.len() {
290                 let word = Word {
291                     word: &self.word[offset..],
292                     width: width,
293                     whitespace: self.whitespace,
294                     penalty: self.penalty,
295                 };
296                 offset = self.word.len();
297                 return Some(word);
298             }
299 
300             None
301         })
302     }
303 }
304 
/// Width information for a [`Word`]. All three methods run in
/// constant time: the word width is cached and the other two rely on
/// the assumptions stated below.
impl Fragment for Word<'_> {
    // Returns the width cached in the `width` field.
    #[inline]
    fn width(&self) -> usize {
        self.width
    }

    // We assume the whitespace consists of ' ' only, so that the
    // length in bytes equals the display width. This allows us to
    // compute the display width in constant time.
    #[inline]
    fn whitespace_width(&self) -> usize {
        self.whitespace.len()
    }

    // We assume the penalty is `""` or `"-"`, so that the length in
    // bytes equals the display width. This allows us to compute the
    // display width in constant time.
    #[inline]
    fn penalty_width(&self) -> usize {
        self.penalty.len()
    }
}
325 
326 /// Forcibly break words wider than `line_width` into smaller words.
327 ///
328 /// This simply calls [`Word::break_apart`] on words that are too
329 /// wide. This means that no extra `'-'` is inserted, the word is
330 /// simply broken into smaller pieces.
break_words<'a, I>(words: I, line_width: usize) -> Vec<Word<'a>> where I: IntoIterator<Item = Word<'a>>,331 pub fn break_words<'a, I>(words: I, line_width: usize) -> Vec<Word<'a>>
332 where
333     I: IntoIterator<Item = Word<'a>>,
334 {
335     let mut shortened_words = Vec::new();
336     for word in words {
337         if word.width() > line_width {
338             shortened_words.extend(word.break_apart(line_width));
339         } else {
340             shortened_words.push(word);
341         }
342     }
343     shortened_words
344 }
345 
#[cfg(test)]
mod tests {
    use super::*;

    #[cfg(feature = "unicode-width")]
    use unicode_width::UnicodeWidthChar;

    // A CSI colour sequence is consumed and iteration resumes at the
    // first character after it.
    #[test]
    fn skip_ansi_escape_sequence_works() {
        let blue_text = "\u{1b}[34mHello\u{1b}[0m";
        let mut chars = blue_text.chars();
        let ch = chars.next().unwrap();
        assert!(skip_ansi_escape_sequence(ch, &mut chars));
        assert_eq!(chars.next(), Some('H'));
    }

    #[test]
    fn emojis_have_correct_width() {
        use unic_emoji_char::is_emoji;

        // Emojis in the Basic Latin (ASCII) and Latin-1 Supplement
        // blocks all have a width of 1 column. This includes
        // characters such as '#' and '©'.
        for ch in '\u{1}'..'\u{FF}' {
            if is_emoji(ch) {
                let desc = format!("{:?} U+{:04X}", ch, ch as u32);

                #[cfg(feature = "unicode-width")]
                assert_eq!(ch.width().unwrap(), 1, "char: {}", desc);

                #[cfg(not(feature = "unicode-width"))]
                assert_eq!(ch_width(ch), 1, "char: {}", desc);
            }
        }

        // Emojis in the remaining blocks of the Basic Multilingual
        // Plane (BMP), in the Supplementary Multilingual Plane (SMP),
        // and in the Supplementary Ideographic Plane (SIP), are all 1
        // or 2 columns wide when unicode-width is used, and always 2
        // columns wide otherwise. This includes all of our favorite
        // emojis such as 😊.
        for ch in '\u{FF}'..'\u{2FFFF}' {
            if is_emoji(ch) {
                let desc = format!("{:?} U+{:04X}", ch, ch as u32);

                #[cfg(feature = "unicode-width")]
                assert!(ch.width().unwrap() <= 2, "char: {}", desc);

                #[cfg(not(feature = "unicode-width"))]
                assert_eq!(ch_width(ch), 2, "char: {}", desc);
            }
        }

        // The remaining planes contain almost no assigned code points
        // and thus also no emojis.
    }

    #[test]
    fn display_width_works() {
        assert_eq!("Café Plain".len(), 11); // “é” is two bytes
        assert_eq!(display_width("Café Plain"), 10);
        assert_eq!(display_width("\u{1b}[31mCafé Rouge\u{1b}[0m"), 10);
    }

    #[test]
    fn display_width_narrow_emojis() {
        #[cfg(feature = "unicode-width")]
        assert_eq!(display_width("⁉"), 1);

        // The ⁉ character is above DOUBLE_WIDTH_CUTOFF.
        #[cfg(not(feature = "unicode-width"))]
        assert_eq!(display_width("⁉"), 2);
    }

    #[test]
    fn display_width_narrow_emojis_variant_selector() {
        #[cfg(feature = "unicode-width")]
        assert_eq!(display_width("⁉\u{fe0f}"), 1);

        // The variant selector-16 is also counted.
        #[cfg(not(feature = "unicode-width"))]
        assert_eq!(display_width("⁉\u{fe0f}"), 4);
    }

    // Ten double-width emojis: 20 columns with and without the
    // `unicode-width` feature.
    #[test]
    fn display_width_emojis() {
        assert_eq!(display_width("😂😭🥺🤣✨😍🙏🥰😊🔥"), 20);
    }
}
435