1 /*
2    Internal file viewer for the Midnight Commander
3    Function for plain view
4 
5    Copyright (C) 1994-2021
6    Free Software Foundation, Inc.
7 
8    Written by:
9    Miguel de Icaza, 1994, 1995, 1998
10    Janne Kukonlehto, 1994, 1995
11    Jakub Jelinek, 1995
12    Joseph M. Hinkle, 1996
13    Norbert Warmuth, 1997
14    Pavel Machek, 1998
15    Roland Illig <roland.illig@gmx.de>, 2004, 2005
16    Slava Zanko <slavazanko@google.com>, 2009
17    Andrew Borodin <aborodin@vmail.ru>, 2009-2014
18    Ilia Maslakov <il.smind@gmail.com>, 2009
19    Rewritten almost from scratch by:
20    Egmont Koblinger <egmont@gmail.com>, 2014
21 
22    This file is part of the Midnight Commander.
23 
24    The Midnight Commander is free software: you can redistribute it
25    and/or modify it under the terms of the GNU General Public License as
26    published by the Free Software Foundation, either version 3 of the License,
27    or (at your option) any later version.
28 
29    The Midnight Commander is distributed in the hope that it will be useful,
30    but WITHOUT ANY WARRANTY; without even the implied warranty of
31    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
32    GNU General Public License for more details.
33 
34    You should have received a copy of the GNU General Public License
35    along with this program.  If not, see <http://www.gnu.org/licenses/>.
36 
37    ------------------------------------------------------------------------------------------------
38 
39    The viewer is implemented along the following design principles:
40 
41    Goals: Always display simple scripts, double wide (CJK), combining accents and spacing marks
42    (often used e.g. in Devanagari) perfectly. Make the arrow keys always work correctly.
43 
44    Absolutely non-goal: RTL.
45 
46    Terminology:
47 
48    - A "paragraph" is the text between two adjacent newline characters. A "line" or "row" is a
49    visual row on the screen. In wrap mode, the viewer formats a paragraph into one or more lines.
50 
51    - The Unicode glossary <http://www.unicode.org/glossary/> doesn't seem to have a notion of "base
52    character followed by zero or more combining characters". The closest matches are "Combining
53    Character Sequence" meaning a base character followed by one or more combining characters, or
54    "Grapheme" which seems to exclude non-printable characters such as newline. In this file,
55    "combining character sequence" (or any obvious abbreviation thereof) means a base character
56    followed by zero or more (up to a current limit of 4) combining characters.
57 
58    ------------------------------------------------------------------------------------------------
59 
60    The parser-formatter is designed to be stateless across paragraphs. This is so that we can walk
61    backwards without having to reparse the whole file (although we still need to reparse and
62    reformat the whole paragraph, but it's a lot better). This principle needs to be changed if we
63    ever get to address tickets 1849/2977, but then we can still store (for efficiency) the parser
64    state at the beginning of the paragraph, and safely walk backwards if we don't cross an escape
65    character.
66 
67    The parser-formatter, however, definitely needs to carry a state across lines. Currently this
68    state contains:
69 
70    - The logical column (as if we didn't wrap). This is used for handling TAB characters after a
71    wordwrap consistently with less.
72 
73    - Whether the last nroff character was bold or underlined. This is used for displaying the
74    ambiguous _\b_ sequence consistently with less.
75 
76    - Whether the desired way of displaying a lonely combining accent or spacing mark is to place it
   over a dotted circle (we do this at the beginning of the paragraph or after a TAB), or to ignore
78    the combining char and show replacement char for the spacing mark (we do this if e.g. too many
79    of these were encountered and hence we don't glue them with their base character).
80 
81    - (This state needs to be expanded if e.g. we decide to print verbose replacement characters
82    (e.g. "<U+0080>") and allow these to wrap around lines.)
83 
84    The state also contains the file offset, as it doesn't make sense to ever know the state without
85    knowing the corresponding offset.
86 
87    The state depends on various settings (viewer width, encoding, nroff mode, charwrap or wordwrap
88    mode (if we'll have that one day) etc.), needs to be recomputed if any of these changes.
89 
90    Walking forwards is usually relatively easy both in the file and on the screen. Walking
91    backwards within a paragraph would only be possible in some special cases and even then it would
92    be painful, so we always walk back to the beginning of the paragraph and reparse-reformat from
93    there.
94 
95    (Walking back within a line in the file would have at least the following difficulties: handling
96    the parser state; processing invalid UTF-8; processing invalid nroff (e.g. what is "_\bA\bA"?).
97    Walking back on the display: we wouldn't know where to display the last line of a paragraph, or
98    where to display a line if its following line starts with a wide (CJK or Tab) character. Long
99    story short: just forget this approach.)
100 
101    Most important variables:
102 
103    - dpy_start: Both in unwrap and wrap modes this points to the beginning of the topmost displayed
104    paragraph.
105 
106    - dpy_text_column: Only in unwrap mode, an additional horizontal scroll.
107 
108    - dpy_paragraph_skip_lines: Only in wrap mode, an additional vertical scroll (the number of
109    lines that are scrolled off at the top from the topmost paragraph).
110 
111    - dpy_state_top: Only in wrap mode, the offset and parser-formatter state at the line where
112    displaying the file begins is cached here.
113 
114    - dpy_wrap_dirty: If some parameter has changed that makes it necessary to reparse-redisplay the
115    topmost paragraph.
116 
117    In wrap mode, the three variables "dpy_start", "dpy_paragraph_skip_lines" and "dpy_state_top"
118    are kept consistent. Think of the first two as the ones describing the position, and the third
119    as a cached value for better performance so that we don't need to wrap the invisible beginning
120    of the topmost paragraph over and over again. The third value needs to be recomputed each time a
121    parameter that influences parsing or displaying the file (e.g. width of screen, encoding, nroff
122    mode) changes, this is signaled by "dpy_wrap_dirty" to force recomputing "dpy_state_top" (and
123    clamp "dpy_paragraph_skip_lines" if necessary).
124 
125    ------------------------------------------------------------------------------------------------
126 
127    Help integration
128 
129    I'm planning to port the help viewer to this codebase.
130 
131    Splitting at sections would still happen in the help viewer. It would either copy a section, or
132    set force_max and a similar force_min to limit displaying to one section only.
133 
134    Parsing the help format would go next to the nroff parser. The colors, alternate character set,
135    and emitting the version number would go to the "state". (The version number would be
136    implemented by emitting remaining characters of a buffer in the "state" one by one, without
137    advancing in the file position.)
138 
139    The active link would be drawn similarly to the search highlight. Other than that, the viewer
140    wouldn't care about links (except for their color). help.c would keep track of which one is
141    highlighted, how to advance to the next/prev on an arrow, how the scroll offset needs to be
142    adjusted when moving, etc.
143 
144    Add wrapping at word boundaries to where wrapping at char boundaries happens now.
145  */
146 
147 #include <config.h>
148 
149 #include "lib/global.h"
150 #include "lib/tty/tty.h"
151 #include "lib/skin.h"
152 #include "lib/util.h"           /* is_printable() */
153 #ifdef HAVE_CHARSET
154 #include "lib/charsets.h"
155 #endif
156 
157 #include "src/setup.h"          /* option_tab_spacing */
158 
159 #include "internal.h"
160 
161 /*** global variables ****************************************************************************/
162 
163 /*** file scope macro definitions ****************************************************************/
164 
165 /* The Unicode standard recommends that lonely combining characters are printed over a dotted
166  * circle. If the terminal is not UTF-8, this will be replaced by a dot anyway. */
167 #define BASE_CHARACTER_FOR_LONELY_COMBINING 0x25CC      /* dotted circle */
168 #define MAX_COMBINING_CHARS 4   /* both slang and ncurses support exactly 4 */
169 
170 /* I think anything other than space (e.g. arrows) just introduce visual clutter without actually
171  * adding value. */
172 #define PARTIAL_CJK_AT_LEFT_MARGIN  ' '
173 #define PARTIAL_CJK_AT_RIGHT_MARGIN ' '
174 
175 /*
176  * Wrap mode: This is for safety so that jumping to the end of file (which already includes
177  * scrolling back by a page) and then walking backwards is reasonably fast, even if the file is
178  * extremely large and consists of maybe full zeros or something like that. If there's no newline
179  * found within this limit, just start displaying from there and see what happens. We might get
 * some display parameters (most importantly the columns) incorrect, but at least will show the
181  * file without spinning the CPU for ages. When scrolling back to that point, the user might see a
182  * garbled first line (even starting with an invalid partial UTF-8), but then walking back by yet
183  * another line should fix it.
184  *
185  * Unwrap mode: This is not used, we wouldn't be able to do anything reasonable without walking
186  * back a whole paragraph (well, view->data_area.height paragraphs actually).
187  */
188 #define MAX_BACKWARDS_WALK_IN_PARAGRAPH (100 * 1000)
189 
190 /*** file scope type declarations ****************************************************************/
191 
192 /*** file scope variables ************************************************************************/
193 
194 /* --------------------------------------------------------------------------------------------- */
195 /*** file scope functions ************************************************************************/
196 /* --------------------------------------------------------------------------------------------- */
197 
198 /* TODO: These methods shouldn't be necessary, see ticket 3257 */
199 
200 static int
mcview_wcwidth(const WView * view,int c)201 mcview_wcwidth (const WView * view, int c)
202 {
203 #ifdef HAVE_CHARSET
204     if (view->utf8)
205     {
206         if (g_unichar_iswide (c))
207             return 2;
208         if (g_unichar_iszerowidth (c))
209             return 0;
210     }
211 #else
212     (void) view;
213     (void) c;
214 #endif /* HAVE_CHARSET */
215     return 1;
216 }
217 
218 /* --------------------------------------------------------------------------------------------- */
219 
220 static gboolean
mcview_ismark(const WView * view,int c)221 mcview_ismark (const WView * view, int c)
222 {
223 #ifdef HAVE_CHARSET
224     if (view->utf8)
225         return g_unichar_ismark (c);
226 #else
227     (void) view;
228     (void) c;
229 #endif /* HAVE_CHARSET */
230     return FALSE;
231 }
232 
233 /* --------------------------------------------------------------------------------------------- */
234 
235 /* actually is_non_spacing_mark_or_enclosing_mark */
236 static gboolean
mcview_is_non_spacing_mark(const WView * view,int c)237 mcview_is_non_spacing_mark (const WView * view, int c)
238 {
239 #ifdef HAVE_CHARSET
240     if (view->utf8)
241     {
242         GUnicodeType type;
243 
244         type = g_unichar_type (c);
245 
246         return type == G_UNICODE_NON_SPACING_MARK || type == G_UNICODE_ENCLOSING_MARK;
247     }
248 #else
249     (void) view;
250     (void) c;
251 #endif /* HAVE_CHARSET */
252     return FALSE;
253 }
254 
255 /* --------------------------------------------------------------------------------------------- */
256 
#if 0
/* Compiled out: the spacing mark test is currently done inline, directly on g_unichar_type(),
 * in mcview_next_combining_char_sequence(). */
static gboolean
mcview_is_spacing_mark (const WView * view, int c)
{
#ifdef HAVE_CHARSET
    return view->utf8 && g_unichar_type (c) == G_UNICODE_SPACING_MARK;
#else
    (void) view;
    (void) c;
    return FALSE;
#endif /* HAVE_CHARSET */
}
#endif /* 0 */
271 
272 /* --------------------------------------------------------------------------------------------- */
273 
274 static gboolean
mcview_isprint(const WView * view,int c)275 mcview_isprint (const WView * view, int c)
276 {
277 #ifdef HAVE_CHARSET
278     if (!view->utf8)
279         c = convert_from_8bit_to_utf_c ((unsigned char) c, view->converter);
280     return g_unichar_isprint (c);
281 #else
282     (void) view;
283     /* TODO this is very-very buggy by design: ticket 3257 comments 0-1 */
284     return is_printable (c);
285 #endif /* HAVE_CHARSET */
286 }
287 
288 /* --------------------------------------------------------------------------------------------- */
289 
290 static int
mcview_char_display(const WView * view,int c,char * s)291 mcview_char_display (const WView * view, int c, char *s)
292 {
293 #ifdef HAVE_CHARSET
294     if (mc_global.utf8_display)
295     {
296         if (!view->utf8)
297             c = convert_from_8bit_to_utf_c ((unsigned char) c, view->converter);
298         if (!g_unichar_isprint (c))
299             c = '.';
300         return g_unichar_to_utf8 (c, s);
301     }
302     if (view->utf8)
303     {
304         if (g_unichar_iswide (c))
305         {
306             s[0] = s[1] = '.';
307             return 2;
308         }
309         if (g_unichar_iszerowidth (c))
310             return 0;
311         /* TODO the is_printable check below will be broken for this */
312         c = convert_from_utf_to_current_c (c, view->converter);
313     }
314     else
315     {
316         /* TODO the is_printable check below will be broken for this */
317         c = convert_to_display_c (c);
318     }
319 #else
320     (void) view;
321 #endif /* HAVE_CHARSET */
322     /* TODO this is very-very buggy by design: ticket 3257 comments 0-1 */
323     if (!is_printable (c))
324         c = '.';
325     *s = c;
326     return 1;
327 }
328 
329 /* --------------------------------------------------------------------------------------------- */
330 
331 /**
332  * Just for convenience, a common interface in front of mcview_get_utf and mcview_get_byte, so that
333  * the caller doesn't have to care about utf8 vs 8-bit modes.
334  *
335  * Normally: stores c, updates state, returns TRUE.
336  * At EOF: state is unchanged, c is undefined, returns FALSE.
337  *
338  * Just as with mcview_get_utf(), invalid UTF-8 is reported using negative integers.
339  *
340  * Also, temporary hack: handle force_max here.
341  * TODO: move it to lower layers (datasource.c)?
342  */
343 static gboolean
mcview_get_next_char(WView * view,mcview_state_machine_t * state,int * c)344 mcview_get_next_char (WView * view, mcview_state_machine_t * state, int *c)
345 {
346     /* Pretend EOF if we reached force_max */
347     if (view->force_max >= 0 && state->offset >= view->force_max)
348         return FALSE;
349 
350 #ifdef HAVE_CHARSET
351     if (view->utf8)
352     {
353         int char_length = 0;
354 
355         if (!mcview_get_utf (view, state->offset, c, &char_length))
356             return FALSE;
357         /* Pretend EOF if we crossed force_max */
358         if (view->force_max >= 0 && state->offset + char_length > view->force_max)
359             return FALSE;
360 
361         state->offset += char_length;
362         return TRUE;
363     }
364 #endif /* HAVE_CHARSET */
365     if (!mcview_get_byte (view, state->offset, c))
366         return FALSE;
367     state->offset++;
368     return TRUE;
369 }
370 
371 /* --------------------------------------------------------------------------------------------- */
372 /**
373  * This function parses the next nroff character and gives it to you along with its desired color,
374  * so you never have to care about nroff again.
375  *
376  * The nroff mode does the backspace trick for every single character (Unicode codepoint). At least
377  * that's what the GNU groff 1.22 package produces, and that's what less 458 expects. For
378  * double-wide characters (CJK), still only a single backspace is emitted. For combining accents
379  * and such, the print-backspace-print step is repeated for the base character and then for each
380  * accent separately.
381  *
382  * So, the right place for this layer is after the bytes are interpreted in UTF-8, but before
383  * joining a base character with its combining accents.
384  *
385  * Normally: stores c and color, updates state, returns TRUE.
386  * At EOF: state is unchanged, c and color are undefined, returns FALSE.
387  *
388  * color can be null if the caller doesn't care.
389  */
390 static gboolean
mcview_get_next_maybe_nroff_char(WView * view,mcview_state_machine_t * state,int * c,int * color)391 mcview_get_next_maybe_nroff_char (WView * view, mcview_state_machine_t * state, int *c, int *color)
392 {
393     mcview_state_machine_t state_after_nroff;
394     int c2, c3;
395 
396     if (color != NULL)
397         *color = VIEW_NORMAL_COLOR;
398 
399     if (!view->mode_flags.nroff)
400         return mcview_get_next_char (view, state, c);
401 
402     if (!mcview_get_next_char (view, state, c))
403         return FALSE;
404     /* Don't allow nroff formatting around CR, LF, TAB or other special chars */
405     if (!mcview_isprint (view, *c))
406         return TRUE;
407 
408     state_after_nroff = *state;
409 
410     if (!mcview_get_next_char (view, &state_after_nroff, &c2))
411         return TRUE;
412     if (c2 != '\b')
413         return TRUE;
414 
415     if (!mcview_get_next_char (view, &state_after_nroff, &c3))
416         return TRUE;
417     if (!mcview_isprint (view, c3))
418         return TRUE;
419 
420     if (*c == '_' && c3 == '_')
421     {
422         *state = state_after_nroff;
423         if (color != NULL)
424             *color =
425                 state->nroff_underscore_is_underlined ? VIEW_UNDERLINED_COLOR : VIEW_BOLD_COLOR;
426     }
427     else if (*c == c3)
428     {
429         *state = state_after_nroff;
430         state->nroff_underscore_is_underlined = FALSE;
431         if (color != NULL)
432             *color = VIEW_BOLD_COLOR;
433     }
434     else if (*c == '_')
435     {
436         *c = c3;
437         *state = state_after_nroff;
438         state->nroff_underscore_is_underlined = TRUE;
439         if (color != NULL)
440             *color = VIEW_UNDERLINED_COLOR;
441     }
442 
443     return TRUE;
444 }
445 
446 /* --------------------------------------------------------------------------------------------- */
447 /**
448  * Get one base character, along with its combining or spacing mark characters.
449  *
450  * (A spacing mark is a character that extends the base character's width 1 into a combined
451  * character of width 2, yet these two character cells should not be separated. E.g. Devanagari
452  * <U+0939><U+094B>.)
453  *
454  * This method exists mainly for two reasons. One is to be able to tell if we fit on the current
455  * line or need to wrap to the next one. The other is that both slang and ncurses seem to require
456  * that the character and its combining marks are printed in a single call (or is it just a
457  * limitation of mc's wrapper to them?).
458  *
459  * For convenience, this method takes care of converting CR or CR+LF into LF.
460  * TODO this should probably happen later, when displaying the file?
461  *
462  * Normally: stores cs and color, updates state, returns >= 1 (entries in cs).
463  * At EOF: state is unchanged, cs and color are undefined, returns 0.
464  *
465  * @param view ...
466  * @param state the parser-formatter state machine's state, updated
467  * @param cs store the characters here
468  * @param clen the room available in cs (that is, at most clen-1 combining marks are allowed), must
469  *   be at least 2
470  * @param color if non-NULL, store the color here, taken from the first codepoint's color
471  * @return the number of entries placed in cs, or 0 on EOF
472  */
473 static int
mcview_next_combining_char_sequence(WView * view,mcview_state_machine_t * state,int * cs,int clen,int * color)474 mcview_next_combining_char_sequence (WView * view, mcview_state_machine_t * state, int *cs,
475                                      int clen, int *color)
476 {
477     int i = 1;
478 
479     if (!mcview_get_next_maybe_nroff_char (view, state, cs, color))
480         return 0;
481 
482     /* Process \r and \r\n newlines. */
483     if (cs[0] == '\r')
484     {
485         int cnext;
486 
487         mcview_state_machine_t state_after_crlf = *state;
488         if (mcview_get_next_maybe_nroff_char (view, &state_after_crlf, &cnext, NULL)
489             && cnext == '\n')
490             *state = state_after_crlf;
491         cs[0] = '\n';
492         return 1;
493     }
494 
495     /* We don't want combining over non-printable characters. This includes '\n' and '\t' too. */
496     if (!mcview_isprint (view, cs[0]))
497         return 1;
498 
499     if (mcview_ismark (view, cs[0]))
500     {
501         if (!state->print_lonely_combining)
502         {
503             /* First character is combining. Either just return it, ... */
504             return 1;
505         }
506         else
507         {
508             /* or place this (and subsequent combining ones) over a dotted circle. */
509             cs[1] = cs[0];
510             cs[0] = BASE_CHARACTER_FOR_LONELY_COMBINING;
511             i = 2;
512         }
513     }
514 
515     if (mcview_wcwidth (view, cs[0]) == 2)
516     {
517         /* Don't allow combining or spacing mark for wide characters, is this okay? */
518         return 1;
519     }
520 
521     /* Look for more combining chars. Either at most clen-1 zero-width combining chars,
522      * or at most 1 spacing mark. Is this logic correct? */
523     for (; i < clen; i++)
524     {
525         mcview_state_machine_t state_after_combining;
526 
527         state_after_combining = *state;
528         if (!mcview_get_next_maybe_nroff_char (view, &state_after_combining, &cs[i], NULL))
529             return i;
530         if (!mcview_ismark (view, cs[i]) || !mcview_isprint (view, cs[i]))
531             return i;
532         if (g_unichar_type (cs[i]) == G_UNICODE_SPACING_MARK)
533         {
534             /* Only allow as the first combining char. Stop processing in either case. */
535             if (i == 1)
536             {
537                 *state = state_after_combining;
538                 i++;
539             }
540             return i;
541         }
542         *state = state_after_combining;
543     }
544     return i;
545 }
546 
547 /* --------------------------------------------------------------------------------------------- */
548 /**
549  * Parse, format and possibly display one visual line of text.
550  *
551  * Formatting starts at the given "state" (which encodes the file offset and parser and formatter's
552  * internal state). In unwrap mode, this should point to the beginning of the paragraph with the
553  * default state, the additional horizontal scrolling is added here. In wrap mode, this should
554  * point to the beginning of the line, with the proper state at that point.
555  *
556  * In wrap mode, if a line ends in a newline, it is consumed, even if it's exactly at the right
557  * edge. In unwrap mode, the whole remaining line, including the newline is consumed. Displaying
558  * the next line should start at "state"'s new value, or if we displayed the bottom line then
559  * state->offset tells the file offset to be shown in the top bar.
560  *
561  * If "row" is offscreen, don't actually display the line but still update "state" and return the
562  * proper value. This is used by mcview_wrap_move_down to advance in the file.
563  *
564  * @param view ...
565  * @param state the parser-formatter state machine's state, updated
566  * @param row print to this row
567  * @param paragraph_ended store TRUE if paragraph ended by newline or EOF, FALSE if wraps to next
568  *   line
569  * @param linewidth store the width of the line here
570  * @return the number of rows, that is, 0 if we were already at EOF, otherwise 1
571  */
572 static int
mcview_display_line(WView * view,mcview_state_machine_t * state,int row,gboolean * paragraph_ended,off_t * linewidth)573 mcview_display_line (WView * view, mcview_state_machine_t * state, int row,
574                      gboolean * paragraph_ended, off_t * linewidth)
575 {
576     const screen_dimen left = view->data_area.left;
577     const screen_dimen top = view->data_area.top;
578     const screen_dimen width = view->data_area.width;
579     const screen_dimen height = view->data_area.height;
580     off_t dpy_text_column = view->mode_flags.wrap ? 0 : view->dpy_text_column;
581     screen_dimen col = 0;
582     int cs[1 + MAX_COMBINING_CHARS];
583     char str[(1 + MAX_COMBINING_CHARS) * UTF8_CHAR_LEN + 1];
584     int i, j;
585 
586     if (paragraph_ended != NULL)
587         *paragraph_ended = TRUE;
588 
589     if (!view->mode_flags.wrap && (row < 0 || row >= (int) height) && linewidth == NULL)
590     {
591         /* Optimization: Fast forward to the end of the line, rather than carefully
592          * parsing and then not actually displaying it. */
593         off_t eol;
594         int retval;
595 
596         eol = mcview_eol (view, state->offset);
597         retval = (eol > state->offset) ? 1 : 0;
598 
599         mcview_state_machine_init (state, eol);
600         return retval;
601     }
602 
603     while (TRUE)
604     {
605         int charwidth = 0;
606         mcview_state_machine_t state_saved;
607         int n;
608         int color;
609 
610         state_saved = *state;
611         n = mcview_next_combining_char_sequence (view, state, cs, 1 + MAX_COMBINING_CHARS, &color);
612         if (n == 0)
613         {
614             if (linewidth != NULL)
615                 *linewidth = col;
616             return (col > 0) ? 1 : 0;
617         }
618 
619         if (view->search_start <= state->offset && state->offset < view->search_end)
620             color = VIEW_SELECTED_COLOR;
621 
622         if (cs[0] == '\n')
623         {
624             /* New line: reset all formatting state for the next paragraph. */
625             mcview_state_machine_init (state, state->offset);
626             if (linewidth != NULL)
627                 *linewidth = col;
628             return 1;
629         }
630 
631         if (mcview_is_non_spacing_mark (view, cs[0]))
632         {
633             /* Lonely combining character. Probably leftover after too many combining chars. Just ignore. */
634             continue;
635         }
636 
637         /* Nonprintable, or lonely spacing mark */
638         if ((!mcview_isprint (view, cs[0]) || mcview_ismark (view, cs[0])) && cs[0] != '\t')
639             cs[0] = '.';
640 
641         for (i = 0; i < n; i++)
642             charwidth += mcview_wcwidth (view, cs[i]);
643 
644         /* Adjust the width for TAB. It's handled below along with the normal characters,
645          * so that it's wrapped consistently with them, and is painted with the proper
646          * attributes (although currently it can't have a special color). */
647         if (cs[0] == '\t')
648         {
649             charwidth = option_tab_spacing - state->unwrapped_column % option_tab_spacing;
650             state->print_lonely_combining = TRUE;
651         }
652         else
653             state->print_lonely_combining = FALSE;
654 
655         /* In wrap mode only: We're done with this row if the character sequence wouldn't fit.
656          * Except if at the first column, because then it wouldn't fit in the next row either.
657          * In this extreme case let the unwrapped code below do its best to display it. */
658         if (view->mode_flags.wrap && (off_t) col + charwidth > dpy_text_column + (off_t) width
659             && col > 0)
660         {
661             *state = state_saved;
662             if (paragraph_ended != NULL)
663                 *paragraph_ended = FALSE;
664             if (linewidth != NULL)
665                 *linewidth = col;
666             return 1;
667         }
668 
669         /* Display, unless outside of the viewport. */
670         if (row >= 0 && row < (int) height)
671         {
672             if ((off_t) col >= dpy_text_column &&
673                 (off_t) col + charwidth <= dpy_text_column + (off_t) width)
674             {
675                 /* The combining character sequence fits entirely in the viewport. Print it. */
676                 tty_setcolor (color);
677                 widget_gotoyx (view, top + row, left + ((off_t) col - dpy_text_column));
678                 if (cs[0] == '\t')
679                 {
680                     for (i = 0; i < charwidth; i++)
681                         tty_print_char (' ');
682                 }
683                 else
684                 {
685                     j = 0;
686                     for (i = 0; i < n; i++)
687                         j += mcview_char_display (view, cs[i], str + j);
688                     str[j] = '\0';
689                     /* This is probably a bug in our tty layer, but tty_print_string
690                      * normalizes the string, whereas tty_printf doesn't. Don't normalize,
691                      * since we handle combining characters ourselves correctly, it's
692                      * better if they are copy-pasted correctly. Ticket 3255. */
693                     tty_printf ("%s", str);
694                 }
695             }
696             else if ((off_t) col < dpy_text_column && (off_t) col + charwidth > dpy_text_column)
697             {
698                 /* The combining character sequence would cross the left edge of the viewport.
699                  * This cannot happen with wrap mode. Print replacement character(s),
700                  * or spaces with the correct attributes for partial Tabs. */
701                 tty_setcolor (color);
702                 for (i = dpy_text_column;
703                      i < (off_t) col + charwidth && i < dpy_text_column + (off_t) width; i++)
704                 {
705                     widget_gotoyx (view, top + row, left + (i - dpy_text_column));
706                     tty_print_anychar ((cs[0] == '\t') ? ' ' : PARTIAL_CJK_AT_LEFT_MARGIN);
707                 }
708             }
709             else if ((off_t) col < dpy_text_column + (off_t) width &&
710                      (off_t) col + charwidth > dpy_text_column + (off_t) width)
711             {
712                 /* The combining character sequence would cross the right edge of the viewport
713                  * and we're not wrapping. Print replacement character(s),
714                  * or spaces with the correct attributes for partial Tabs. */
715                 tty_setcolor (color);
716                 for (i = col; i < dpy_text_column + (off_t) width; i++)
717                 {
718                     widget_gotoyx (view, top + row, left + (i - dpy_text_column));
719                     tty_print_anychar ((cs[0] == '\t') ? ' ' : PARTIAL_CJK_AT_RIGHT_MARGIN);
720                 }
721             }
722         }
723 
724         col += charwidth;
725         state->unwrapped_column += charwidth;
726 
727         if (!view->mode_flags.wrap && (off_t) col >= dpy_text_column + (off_t) width
728             && linewidth == NULL)
729         {
730             /* Optimization: Fast forward to the end of the line, rather than carefully
731              * parsing and then not actually displaying it. */
732             off_t eol;
733 
734             eol = mcview_eol (view, state->offset);
735             mcview_state_machine_init (state, eol);
736             return 1;
737         }
738     }
739 }
740 
741 /* --------------------------------------------------------------------------------------------- */
742 /**
743  * Parse, format and possibly display one paragraph (perhaps not from the beginning).
744  *
745  * Formatting starts at the given "state" (which encodes the file offset and parser and formatter's
746  * internal state). In unwrap mode, this should point to the beginning of the paragraph with the
747  * default state, the additional horizontal scrolling is added here. In wrap mode, this may point
748  * to the beginning of the line within a paragraph (to display the partial paragraph at the top),
749  * with the proper state at that point.
750  *
751  * Displaying the next paragraph should start at "state"'s new value, or if we displayed the bottom
752  * line then state->offset tells the file offset to be shown in the top bar.
753  *
754  * If "row" is negative, don't display the first abs(row) lines and display the rest from the top.
755  * This was a nice idea but it's now unused :)
756  *
757  * If "row" is too large, don't display the paragraph at all but still return the number of lines.
758  * This is used when moving upwards.
759  *
760  * @param view ...
761  * @param state the parser-formatter state machine's state, updated
762  * @param row print starting at this row
763  * @return the number of rows the paragraphs is wrapped to, that is, 0 if we were already at EOF,
764  *   otherwise 1 in unwrap mode, >= 1 in wrap mode. We stop when reaching the bottom of the
765  *   viewport, it's not counted how many more lines the paragraph would occupy
766  */
767 static int
mcview_display_paragraph(WView * view,mcview_state_machine_t * state,int row)768 mcview_display_paragraph (WView * view, mcview_state_machine_t * state, int row)
769 {
770     const screen_dimen height = view->data_area.height;
771     int lines = 0;
772 
773     while (TRUE)
774     {
775         gboolean paragraph_ended;
776 
777         lines += mcview_display_line (view, state, row, &paragraph_ended, NULL);
778         if (paragraph_ended)
779             return lines;
780 
781         if (row < (int) height)
782         {
783             row++;
784             /* stop if bottom of screen reached */
785             if (row >= (int) height)
786                 return lines;
787         }
788     }
789 }
790 
791 /* --------------------------------------------------------------------------------------------- */
792 /**
793  * Recompute dpy_state_top from dpy_start and dpy_paragraph_skip_lines. Clamp
794  * dpy_paragraph_skip_lines if necessary.
795  *
796  * This method should be called in wrap mode after changing one of the parsing or formatting
797  * properties (e.g. window width, encoding, nroff), or when switching to wrap mode from unwrap or
798  * hex.
799  *
800  * If we stayed within the same paragraph then try to keep the vertical offset within that
801  * paragraph as well. It might happen though that the paragraph became shorter than our desired
802  * vertical position, in that case move to its last row.
803  */
804 static void
mcview_wrap_fixup(WView * view)805 mcview_wrap_fixup (WView * view)
806 {
807     int lines = view->dpy_paragraph_skip_lines;
808 
809     if (!view->dpy_wrap_dirty)
810         return;
811     view->dpy_wrap_dirty = FALSE;
812 
813     view->dpy_paragraph_skip_lines = 0;
814     mcview_state_machine_init (&view->dpy_state_top, view->dpy_start);
815 
816     while (lines-- != 0)
817     {
818         mcview_state_machine_t state_prev;
819         gboolean paragraph_ended;
820 
821         state_prev = view->dpy_state_top;
822         if (mcview_display_line (view, &view->dpy_state_top, -1, &paragraph_ended, NULL) == 0)
823             break;
824         if (paragraph_ended)
825         {
826             view->dpy_state_top = state_prev;
827             break;
828         }
829         view->dpy_paragraph_skip_lines++;
830     }
831 }
832 
833 /* --------------------------------------------------------------------------------------------- */
834 /*** public functions ****************************************************************************/
835 /* --------------------------------------------------------------------------------------------- */
836 
837 /**
838  * In both wrap and unwrap modes, dpy_start points to the beginning of the paragraph.
839  *
840  * In unwrap mode, start displaying from this position, probably applying an additional horizontal
841  * scroll.
842  *
843  * In wrap mode, an additional dpy_paragraph_skip_lines lines are skipped from the top of this
844  * paragraph. dpy_state_top contains the position and parser-formatter state corresponding to the
845  * top left corner so we can just start rendering from here. Unless dpy_wrap_dirty is set in which
846  * case dpy_state_top is invalid and we need to recompute first.
847  */
848 void
mcview_display_text(WView * view)849 mcview_display_text (WView * view)
850 {
851     const screen_dimen left = view->data_area.left;
852     const screen_dimen top = view->data_area.top;
853     const screen_dimen height = view->data_area.height;
854     int row;
855     mcview_state_machine_t state;
856     gboolean again;
857 
858     do
859     {
860         int n;
861 
862         again = FALSE;
863 
864         mcview_display_clean (view);
865         mcview_display_ruler (view);
866 
867         if (!view->mode_flags.wrap)
868             mcview_state_machine_init (&state, view->dpy_start);
869         else
870         {
871             mcview_wrap_fixup (view);
872             state = view->dpy_state_top;
873         }
874 
875         for (row = 0; row < (int) height; row += n)
876         {
877             n = mcview_display_paragraph (view, &state, row);
878             if (n == 0)
879             {
880                 /* In the rare case that displaying didn't start at the beginning
881                  * of the file, yet there are some empty lines at the bottom,
882                  * scroll the file and display again. This happens when e.g. the
883                  * window is made bigger, or the file becomes shorter due to
884                  * charset change or enabling nroff. */
885                 if ((view->mode_flags.wrap ? view->dpy_state_top.offset : view->dpy_start) > 0)
886                 {
887                     mcview_ascii_move_up (view, height - row);
888                     again = TRUE;
889                 }
890                 break;
891             }
892         }
893     }
894     while (again);
895 
896     view->dpy_end = state.offset;
897     view->dpy_state_bottom = state;
898 
899     tty_setcolor (VIEW_NORMAL_COLOR);
900     if (mcview_show_eof != NULL && mcview_show_eof[0] != '\0')
901         while (row < (int) height)
902         {
903             widget_gotoyx (view, top + row, left);
904             /* TODO: should make it no wider than the viewport */
905             tty_print_string (mcview_show_eof);
906             row++;
907         }
908 }
909 
910 /* --------------------------------------------------------------------------------------------- */
911 /**
912  * Move down.
913  *
914  * It's very simple. Just invisibly format the next "lines" lines, carefully carrying the formatter
915  * state in wrap mode. But before each step we need to check if we've already hit the end of the
916  * file, in that case we can no longer move. This is done by walking from dpy_state_bottom.
917  *
918  * Note that this relies on mcview_display_text() setting dpy_state_bottom to its correct value
919  * upon rendering the screen contents. So don't call this function from other functions (e.g. at
920  * the bottom of mcview_ascii_move_up()) which invalidate this value.
921  */
922 void
mcview_ascii_move_down(WView * view,off_t lines)923 mcview_ascii_move_down (WView * view, off_t lines)
924 {
925     while (lines-- != 0)
926     {
927         gboolean paragraph_ended;
928 
929         /* See if there's still data below the bottom line, by imaginarily displaying one
930          * more line. This takes care of reading more data into growbuf, if required.
931          * If the end position didn't advance, we're at EOF and hence bail out. */
932         if (mcview_display_line (view, &view->dpy_state_bottom, -1, &paragraph_ended, NULL) == 0)
933             break;
934 
935         /* Okay, there's enough data. Move by 1 row at the top, too. No need to check for
936          * EOF, that can't happen. */
937         if (!view->mode_flags.wrap)
938         {
939             view->dpy_start = mcview_eol (view, view->dpy_start);
940             view->dpy_paragraph_skip_lines = 0;
941             view->dpy_wrap_dirty = TRUE;
942         }
943         else
944         {
945             mcview_display_line (view, &view->dpy_state_top, -1, &paragraph_ended, NULL);
946             if (!paragraph_ended)
947                 view->dpy_paragraph_skip_lines++;
948             else
949             {
950                 view->dpy_start = view->dpy_state_top.offset;
951                 view->dpy_paragraph_skip_lines = 0;
952             }
953         }
954     }
955 }
956 
957 /* --------------------------------------------------------------------------------------------- */
958 /**
959  * Move up.
960  *
961  * Unwrap mode: Piece of cake. Wrap mode: If we'd walk back more than the current line offset
962  * within the paragraph, we need to jump back to the previous paragraph and compute its height to
963  * see if we start from that paragraph, and repeat this if necessary. Once we're within the desired
964  * paragraph, we still need to format it from its beginning to know the state.
965  *
966  * See the top of this file for comments about MAX_BACKWARDS_WALK_IN_PARAGRAPH.
967  *
968  * force_max is a nice protection against the rare extreme case that the file underneath us
969  * changes, we don't want to endlessly consume a file of maybe full of zeros upon moving upwards.
970  */
971 void
mcview_ascii_move_up(WView * view,off_t lines)972 mcview_ascii_move_up (WView * view, off_t lines)
973 {
974     if (!view->mode_flags.wrap)
975     {
976         while (lines-- != 0)
977             view->dpy_start = mcview_bol (view, view->dpy_start - 1, 0);
978         view->dpy_paragraph_skip_lines = 0;
979         view->dpy_wrap_dirty = TRUE;
980     }
981     else
982     {
983         int i;
984 
985         while (lines > view->dpy_paragraph_skip_lines)
986         {
987             /* We need to go back to the previous paragraph. */
988             if (view->dpy_start == 0)
989             {
990                 /* Oops, we're already in the first paragraph. */
991                 view->dpy_paragraph_skip_lines = 0;
992                 mcview_state_machine_init (&view->dpy_state_top, 0);
993                 return;
994             }
995             lines -= view->dpy_paragraph_skip_lines;
996             view->force_max = view->dpy_start;
997             view->dpy_start =
998                 mcview_bol (view, view->dpy_start - 1,
999                             view->dpy_start - MAX_BACKWARDS_WALK_IN_PARAGRAPH);
1000             mcview_state_machine_init (&view->dpy_state_top, view->dpy_start);
1001             /* This is a tricky way of denoting that we're at the end of the paragraph.
1002              * Normally we'd jump to the next paragraph and reset paragraph_skip_lines. But for
1003              * walking backwards this is exactly what we need. */
1004             view->dpy_paragraph_skip_lines =
1005                 mcview_display_paragraph (view, &view->dpy_state_top, view->data_area.height);
1006             view->force_max = -1;
1007         }
1008 
1009         /* Okay, we have have dpy_start pointing to the desired paragraph, and we still need to
1010          * walk back "lines" lines from the current "dpy_paragraph_skip_lines" offset. We can't do
1011          * that, so walk from the beginning of the paragraph. */
1012         mcview_state_machine_init (&view->dpy_state_top, view->dpy_start);
1013         view->dpy_paragraph_skip_lines -= lines;
1014         for (i = 0; i < view->dpy_paragraph_skip_lines; i++)
1015             mcview_display_line (view, &view->dpy_state_top, -1, NULL, NULL);
1016     }
1017 }
1018 
1019 /* --------------------------------------------------------------------------------------------- */
1020 
1021 void
mcview_ascii_moveto_bol(WView * view)1022 mcview_ascii_moveto_bol (WView * view)
1023 {
1024     if (!view->mode_flags.wrap)
1025         view->dpy_text_column = 0;
1026 }
1027 
1028 /* --------------------------------------------------------------------------------------------- */
1029 
1030 void
mcview_ascii_moveto_eol(WView * view)1031 mcview_ascii_moveto_eol (WView * view)
1032 {
1033     if (!view->mode_flags.wrap)
1034     {
1035         mcview_state_machine_t state;
1036         off_t linewidth;
1037 
1038         /* Get the width of the topmost paragraph. */
1039         mcview_state_machine_init (&state, view->dpy_start);
1040         mcview_display_line (view, &state, -1, NULL, &linewidth);
1041         view->dpy_text_column = DOZ (linewidth, (off_t) view->data_area.width);
1042     }
1043 }
1044 
1045 /* --------------------------------------------------------------------------------------------- */
1046 
1047 void
mcview_state_machine_init(mcview_state_machine_t * state,off_t offset)1048 mcview_state_machine_init (mcview_state_machine_t * state, off_t offset)
1049 {
1050     memset (state, 0, sizeof (*state));
1051     state->offset = offset;
1052     state->print_lonely_combining = TRUE;
1053 }
1054 
1055 /* --------------------------------------------------------------------------------------------- */
1056