1 // This is an open source non-commercial project. Dear PVS-Studio, please check
2 // it. PVS-Studio Static Code Analyzer for C, C++ and C#: http://www.viva64.com
3 
4 /// @file charset.c
5 ///
6 /// Code related to character sets.
7 
8 #include <assert.h>
9 #include <inttypes.h>
10 #include <string.h>
11 #include <wctype.h>
12 
13 #include "nvim/ascii.h"
14 #include "nvim/charset.h"
15 #include "nvim/cursor.h"
16 #include "nvim/func_attr.h"
17 #include "nvim/garray.h"
18 #include "nvim/indent.h"
19 #include "nvim/main.h"
20 #include "nvim/mark.h"
21 #include "nvim/mbyte.h"
22 #include "nvim/memline.h"
23 #include "nvim/memory.h"
24 #include "nvim/misc1.h"
25 #include "nvim/move.h"
26 #include "nvim/option.h"
27 #include "nvim/os_unix.h"
28 #include "nvim/path.h"
29 #include "nvim/plines.h"
30 #include "nvim/state.h"
31 #include "nvim/strings.h"
32 #include "nvim/vim.h"
33 
34 #ifdef INCLUDE_GENERATED_DECLARATIONS
35 # include "charset.c.generated.h"
36 #endif
37 
38 
// Set to true at the end of a successful buf_init_chartab() call.  While it
// is false, transchar_buf() treats ' '..'~' as printable without relying on
// g_chartab[].
static bool chartab_initialized = false;

// b_chartab[] is an array with 256 bits, each bit representing one of the
// characters 0-255.
// The bits are stored in 64-bit words: "c >> 6" selects the word and
// "c & 0x3f" selects the bit within that word.
// Note: each macro expands "c" twice, so the argument must not have side
// effects.
#define SET_CHARTAB(buf, c) \
  (buf)->b_chartab[(unsigned)(c) >> 6] |= (1ull << ((c) & 0x3f))
#define RESET_CHARTAB(buf, c) \
  (buf)->b_chartab[(unsigned)(c) >> 6] &= ~(1ull << ((c) & 0x3f))
#define GET_CHARTAB_TAB(chartab, c) \
  ((chartab)[(unsigned)(c) >> 6] & (1ull << ((c) & 0x3f)))

// Table used below, see init_chartab() for an explanation.
// It has one entry per byte value 0-255 and is filled by buf_init_chartab()
// when that function is called with "global" set.
static char_u g_chartab[256];

// Flags for g_chartab[].
#define CT_CELL_MASK  0x07  ///< mask: nr of display cells (1, 2 or 4)
#define CT_PRINT_CHAR 0x10  ///< flag: set for printable chars
#define CT_ID_CHAR    0x20  ///< flag: set for ID chars
#define CT_FNAME_CHAR 0x40  ///< flag: set for file name chars
58 
59 /// Fill g_chartab[].  Also fills curbuf->b_chartab[] with flags for keyword
60 /// characters for current buffer.
61 ///
62 /// Depends on the option settings 'iskeyword', 'isident', 'isfname',
63 /// 'isprint' and 'encoding'.
64 ///
65 /// The index in g_chartab[] is the character when first byte is up to 0x80,
66 /// if the first byte is 0x80 and above it depends on further bytes.
67 ///
68 /// The contents of g_chartab[]:
69 /// - The lower two bits, masked by CT_CELL_MASK, give the number of display
70 ///   cells the character occupies (1 or 2).  Not valid for UTF-8 above 0x80.
71 /// - CT_PRINT_CHAR bit is set when the character is printable (no need to
72 ///   translate the character before displaying it).  Note that only DBCS
73 ///   characters can have 2 display cells and still be printable.
74 /// - CT_FNAME_CHAR bit is set when the character can be in a file name.
75 /// - CT_ID_CHAR bit is set when the character can be in an identifier.
76 ///
77 /// @return FAIL if 'iskeyword', 'isident', 'isfname' or 'isprint' option has
78 /// an error, OK otherwise.
init_chartab(void)79 int init_chartab(void)
80 {
81   return buf_init_chartab(curbuf, true);
82 }
83 
84 /// Helper for init_chartab
85 ///
86 /// @param global false: only set buf->b_chartab[]
87 ///
88 /// @return FAIL if 'iskeyword', 'isident', 'isfname' or 'isprint' option has
89 /// an error, OK otherwise.
buf_init_chartab(buf_T * buf,int global)90 int buf_init_chartab(buf_T *buf, int global)
91 {
92   int c;
93   int c2;
94   int i;
95   bool tilde;
96   bool do_isalpha;
97 
98   if (global) {
99     // Set the default size for printable characters:
100     // From <Space> to '~' is 1 (printable), others are 2 (not printable).
101     // This also inits all 'isident' and 'isfname' flags to false.
102     c = 0;
103 
104     while (c < ' ') {
105       g_chartab[c++] = (dy_flags & DY_UHEX) ? 4 : 2;
106     }
107 
108     while (c <= '~') {
109       g_chartab[c++] = 1 + CT_PRINT_CHAR;
110     }
111 
112     while (c < 256) {
113       if (c >= 0xa0) {
114         // UTF-8: bytes 0xa0 - 0xff are printable (latin1)
115         g_chartab[c++] = CT_PRINT_CHAR + 1;
116       } else {
117         // the rest is unprintable by default
118         g_chartab[c++] = (dy_flags & DY_UHEX) ? 4 : 2;
119       }
120     }
121 
122     // Assume that every multi-byte char is a filename character.
123     for (c = 1; c < 256; c++) {
124       if (c >= 0xa0) {
125         g_chartab[c] |= CT_FNAME_CHAR;
126       }
127     }
128   }
129 
130   // Init word char flags all to false
131   memset(buf->b_chartab, 0, (size_t)32);
132 
133   // In lisp mode the '-' character is included in keywords.
134   if (buf->b_p_lisp) {
135     SET_CHARTAB(buf, '-');
136   }
137 
138   // Walk through the 'isident', 'iskeyword', 'isfname' and 'isprint'
139   // options Each option is a list of characters, character numbers or
140   // ranges, separated by commas, e.g.: "200-210,x,#-178,-"
141   for (i = global ? 0 : 3; i <= 3; i++) {
142     const char_u *p;
143     if (i == 0) {
144       // first round: 'isident'
145       p = p_isi;
146     } else if (i == 1) {
147       // second round: 'isprint'
148       p = p_isp;
149     } else if (i == 2) {
150       // third round: 'isfname'
151       p = p_isf;
152     } else {  // i == 3
153       // fourth round: 'iskeyword'
154       p = buf->b_p_isk;
155     }
156 
157     while (*p) {
158       tilde = false;
159       do_isalpha = false;
160 
161       if ((*p == '^') && (p[1] != NUL)) {
162         tilde = true;
163         ++p;
164       }
165 
166       if (ascii_isdigit(*p)) {
167         c = getdigits_int((char_u **)&p, true, 0);
168       } else {
169         c = mb_ptr2char_adv(&p);
170       }
171       c2 = -1;
172 
173       if ((*p == '-') && (p[1] != NUL)) {
174         ++p;
175 
176         if (ascii_isdigit(*p)) {
177           c2 = getdigits_int((char_u **)&p, true, 0);
178         } else {
179           c2 = mb_ptr2char_adv(&p);
180         }
181       }
182 
183       if ((c <= 0)
184           || (c >= 256)
185           || ((c2 < c) && (c2 != -1))
186           || (c2 >= 256)
187           || !((*p == NUL) || (*p == ','))) {
188         return FAIL;
189       }
190 
191       if (c2 == -1) {  // not a range
192         // A single '@' (not "@-@"):
193         // Decide on letters being ID/printable/keyword chars with
194         // standard function isalpha(). This takes care of locale for
195         // single-byte characters).
196         if (c == '@') {
197           do_isalpha = true;
198           c = 1;
199           c2 = 255;
200         } else {
201           c2 = c;
202         }
203       }
204 
205       while (c <= c2) {
206         // Use the MB_ functions here, because isalpha() doesn't
207         // work properly when 'encoding' is "latin1" and the locale is
208         // "C".
209         if (!do_isalpha
210             || mb_islower(c)
211             || mb_isupper(c)) {
212           if (i == 0) {
213             // (re)set ID flag
214             if (tilde) {
215               g_chartab[c] &= (uint8_t) ~CT_ID_CHAR;
216             } else {
217               g_chartab[c] |= CT_ID_CHAR;
218             }
219           } else if (i == 1) {
220             // (re)set printable
221             // For double-byte we keep the cell width, so
222             // that we can detect it from the first byte.
223             if (((c < ' ') || (c > '~'))) {
224               if (tilde) {
225                 g_chartab[c] = (uint8_t)((g_chartab[c] & ~CT_CELL_MASK)
226                                          + ((dy_flags & DY_UHEX) ? 4 : 2));
227                 g_chartab[c] &= (uint8_t) ~CT_PRINT_CHAR;
228               } else {
229                 g_chartab[c] = (uint8_t)((g_chartab[c] & ~CT_CELL_MASK) + 1);
230                 g_chartab[c] |= CT_PRINT_CHAR;
231               }
232             }
233           } else if (i == 2) {
234             // (re)set fname flag
235             if (tilde) {
236               g_chartab[c] &= (uint8_t) ~CT_FNAME_CHAR;
237             } else {
238               g_chartab[c] |= CT_FNAME_CHAR;
239             }
240           } else {  // i == 3
241             // (re)set keyword flag
242             if (tilde) {
243               RESET_CHARTAB(buf, c);
244             } else {
245               SET_CHARTAB(buf, c);
246             }
247           }
248         }
249         ++c;
250       }
251 
252       c = *p;
253       p = skip_to_option_part(p);
254 
255       if ((c == ',') && (*p == NUL)) {
256         // Trailing comma is not allowed.
257         return FAIL;
258       }
259     }
260   }
261   chartab_initialized = true;
262   return OK;
263 }
264 
265 /// Translate any special characters in buf[bufsize] in-place.
266 ///
267 /// The result is a string with only printable characters, but if there is not
268 /// enough room, not all characters will be translated.
269 ///
270 /// @param buf
271 /// @param bufsize
trans_characters(char_u * buf,int bufsize)272 void trans_characters(char_u *buf, int bufsize)
273 {
274   int len;          // length of string needing translation
275   int room;         // room in buffer after string
276   char_u *trs;      // translated character
277   int trs_len;      // length of trs[]
278 
279   len = (int)STRLEN(buf);
280   room = bufsize - len;
281 
282   while (*buf != 0) {
283     // Assume a multi-byte character doesn't need translation.
284     if ((trs_len = utfc_ptr2len(buf)) > 1) {
285       len -= trs_len;
286     } else {
287       trs = transchar_byte(*buf);
288       trs_len = (int)STRLEN(trs);
289 
290       if (trs_len > 1) {
291         room -= trs_len - 1;
292         if (room <= 0) {
293           return;
294         }
295         memmove(buf + trs_len, buf + 1, (size_t)len);
296       }
297       memmove(buf, trs, (size_t)trs_len);
298       --len;
299     }
300     buf += trs_len;
301   }
302 }
303 
304 /// Find length of a string capable of holding s with all specials replaced
305 ///
306 /// Assumes replacing special characters with printable ones just like
307 /// strtrans() does.
308 ///
309 /// @param[in]  s  String to check.
310 ///
311 /// @return number of bytes needed to hold a translation of `s`, NUL byte not
312 ///         included.
transstr_len(const char * const s,bool untab)313 size_t transstr_len(const char *const s, bool untab)
314   FUNC_ATTR_WARN_UNUSED_RESULT FUNC_ATTR_PURE
315 {
316   const char *p = s;
317   size_t len = 0;
318 
319   while (*p) {
320     const size_t l = (size_t)utfc_ptr2len((const char_u *)p);
321     if (l > 1) {
322       int pcc[MAX_MCO + 1];
323       pcc[0] = utfc_ptr2char((const char_u *)p, &pcc[1]);
324 
325       if (vim_isprintc(pcc[0])) {
326         len += l;
327       } else {
328         for (size_t i = 0; i < ARRAY_SIZE(pcc) && pcc[i]; i++) {
329           char hexbuf[9];
330           len += transchar_hex(hexbuf, pcc[i]);
331         }
332       }
333       p += l;
334     } else if (*p == TAB && !untab) {
335       len += 1;
336       p++;
337     } else {
338       const int b2c_l = byte2cells((uint8_t)(*p++));
339       // Illegal byte sequence may occupy up to 4 characters.
340       len += (size_t)(b2c_l > 0 ? b2c_l : 4);
341     }
342   }
343   return len;
344 }
345 
346 /// Replace special characters with printable ones
347 ///
348 /// @param[in]  s  String to replace characters from.
349 /// @param[out]  buf  Buffer to which result should be saved.
350 /// @param[in]  len  Buffer length. Resulting string may not occupy more then
351 ///                  len - 1 bytes (one for trailing NUL byte).
352 /// @param[in]  untab  remove tab characters
353 ///
354 /// @return length of the resulting string, without the NUL byte.
transstr_buf(const char * const s,char * const buf,const size_t len,bool untab)355 size_t transstr_buf(const char *const s, char *const buf, const size_t len, bool untab)
356   FUNC_ATTR_NONNULL_ALL
357 {
358   const char *p = s;
359   char *buf_p = buf;
360   char *const buf_e = buf_p + len - 1;
361 
362   while (*p != NUL && buf_p < buf_e) {
363     const size_t l = (size_t)utfc_ptr2len((const char_u *)p);
364     if (l > 1) {
365       if (buf_p + l > buf_e) {
366         break;  // Exceeded `buf` size.
367       }
368       int pcc[MAX_MCO + 1];
369       pcc[0] = utfc_ptr2char((const char_u *)p, &pcc[1]);
370 
371       if (vim_isprintc(pcc[0])) {
372         memmove(buf_p, p, l);
373         buf_p += l;
374       } else {
375         for (size_t i = 0; i < ARRAY_SIZE(pcc) && pcc[i]; i++) {
376           char hexbuf[9];  // <up to 6 bytes>NUL
377           const size_t hexlen = transchar_hex(hexbuf, pcc[i]);
378           if (buf_p + hexlen > buf_e) {
379             break;
380           }
381           memmove(buf_p, hexbuf, hexlen);
382           buf_p += hexlen;
383         }
384       }
385       p += l;
386     } else if (*p == TAB && !untab) {
387       *buf_p++ = *p++;
388     } else {
389       const char *const tb = (const char *)transchar_byte((uint8_t)(*p++));
390       const size_t tb_len = strlen(tb);
391       if (buf_p + tb_len > buf_e) {
392         break;  // Exceeded `buf` size.
393       }
394       memmove(buf_p, tb, tb_len);
395       buf_p += tb_len;
396     }
397   }
398   *buf_p = NUL;
399   assert(buf_p <= buf_e);
400   return (size_t)(buf_p - buf);
401 }
402 
403 /// Copy string and replace special characters with printable characters
404 ///
405 /// Works like `strtrans()` does, used for that and in some other places.
406 ///
407 /// @param[in]  s  String to replace characters from.
408 ///
409 /// @return [allocated] translated string
transstr(const char * const s,bool untab)410 char *transstr(const char *const s, bool untab)
411   FUNC_ATTR_NONNULL_RET
412 {
413   // Compute the length of the result, taking account of unprintable
414   // multi-byte characters.
415   const size_t len = transstr_len(s, untab) + 1;
416   char *const buf = xmalloc(len);
417   transstr_buf(s, buf, len, untab);
418   return buf;
419 }
420 
421 /// Convert the string "str[orglen]" to do ignore-case comparing.
422 /// Use the current locale.
423 ///
424 /// When "buf" is NULL, return an allocated string.
425 /// Otherwise, put the result in buf, limited by buflen, and return buf.
str_foldcase(char_u * str,int orglen,char_u * buf,int buflen)426 char_u *str_foldcase(char_u *str, int orglen, char_u *buf, int buflen)
427   FUNC_ATTR_NONNULL_RET
428 {
429   garray_T ga;
430   int i;
431   int len = orglen;
432 
433 #define GA_CHAR(i) ((char_u *)ga.ga_data)[i]
434 #define GA_PTR(i) ((char_u *)ga.ga_data + i)
435 #define STR_CHAR(i) (buf == NULL ? GA_CHAR(i) : buf[i])
436 #define STR_PTR(i) (buf == NULL ? GA_PTR(i) : buf + i)
437 
438   // Copy "str" into "buf" or allocated memory, unmodified.
439   if (buf == NULL) {
440     ga_init(&ga, 1, 10);
441 
442     ga_grow(&ga, len + 1);
443     memmove(ga.ga_data, str, (size_t)len);
444     ga.ga_len = len;
445   } else {
446     if (len >= buflen) {
447       // Ugly!
448       len = buflen - 1;
449     }
450     memmove(buf, str, (size_t)len);
451   }
452 
453   if (buf == NULL) {
454     GA_CHAR(len) = NUL;
455   } else {
456     buf[len] = NUL;
457   }
458 
459   // Make each character lower case.
460   i = 0;
461   while (STR_CHAR(i) != NUL) {
462     int c = utf_ptr2char(STR_PTR(i));
463     int olen = utf_ptr2len(STR_PTR(i));
464     int lc = mb_tolower(c);
465 
466     // Only replace the character when it is not an invalid
467     // sequence (ASCII character or more than one byte) and
468     // mb_tolower() doesn't return the original character.
469     if (((c < 0x80) || (olen > 1)) && (c != lc)) {
470       int nlen = utf_char2len(lc);
471 
472       // If the byte length changes need to shift the following
473       // characters forward or backward.
474       if (olen != nlen) {
475         if (nlen > olen) {
476           if (buf == NULL) {
477             ga_grow(&ga, nlen - olen + 1);
478           } else {
479             if (len + nlen - olen >= buflen) {
480               // out of memory, keep old char
481               lc = c;
482               nlen = olen;
483             }
484           }
485         }
486 
487         if (olen != nlen) {
488           if (buf == NULL) {
489             STRMOVE(GA_PTR(i) + nlen, GA_PTR(i) + olen);
490             ga.ga_len += nlen - olen;
491           } else {
492             STRMOVE(buf + i + nlen, buf + i + olen);
493             len += nlen - olen;
494           }
495         }
496       }
497       (void)utf_char2bytes(lc, STR_PTR(i));
498     }
499 
500     // skip to next multi-byte char
501     i += utfc_ptr2len(STR_PTR(i));
502   }
503 
504 
505   if (buf == NULL) {
506     return (char_u *)ga.ga_data;
507   }
508   return buf;
509 }
510 
511 // Catch 22: g_chartab[] can't be initialized before the options are
512 // initialized, and initializing options may cause transchar() to be called!
513 // When chartab_initialized == false don't use g_chartab[].
514 // Does NOT work for multi-byte characters, c must be <= 255.
515 // Also doesn't work for the first byte of a multi-byte, "c" must be a
516 // character!
517 static char_u transchar_charbuf[11];
518 
519 /// Translate a character into a printable one, leaving printable ASCII intact
520 ///
521 /// All unicode characters are considered non-printable in this function.
522 ///
523 /// @param[in]  c  Character to translate.
524 ///
525 /// @return translated character into a static buffer.
transchar(int c)526 char_u *transchar(int c)
527 {
528   return transchar_buf(curbuf, c);
529 }
530 
transchar_buf(const buf_T * buf,int c)531 char_u *transchar_buf(const buf_T *buf, int c)
532   FUNC_ATTR_NONNULL_ALL
533 {
534   int i = 0;
535   if (IS_SPECIAL(c)) {
536     // special key code, display as ~@ char
537     transchar_charbuf[0] = '~';
538     transchar_charbuf[1] = '@';
539     i = 2;
540     c = K_SECOND(c);
541   }
542 
543   if ((!chartab_initialized && (((c >= ' ') && (c <= '~'))))
544       || ((c <= 0xFF) && vim_isprintc_strict(c))) {
545     // printable character
546     transchar_charbuf[i] = (char_u)c;
547     transchar_charbuf[i + 1] = NUL;
548   } else if (c <= 0xFF) {
549     transchar_nonprint(buf, transchar_charbuf + i, c);
550   } else {
551     transchar_hex((char *)transchar_charbuf + i, c);
552   }
553   return transchar_charbuf;
554 }
555 
556 /// Like transchar(), but called with a byte instead of a character
557 ///
558 /// Checks for an illegal UTF-8 byte.
559 ///
560 /// @param[in]  c  Byte to translate.
561 ///
562 /// @return pointer to translated character in transchar_charbuf.
transchar_byte(const int c)563 char_u *transchar_byte(const int c)
564   FUNC_ATTR_WARN_UNUSED_RESULT
565 {
566   if (c >= 0x80) {
567     transchar_nonprint(curbuf, transchar_charbuf, c);
568     return transchar_charbuf;
569   }
570   return transchar(c);
571 }
572 
573 /// Convert non-printable characters to 2..4 printable ones
574 ///
575 /// @warning Does not work for multi-byte characters, c must be <= 255.
576 ///
577 /// @param[in]  buf  Required to check the file format
578 /// @param[out]  charbuf  Buffer to store result in, must be able to hold
579 ///                       at least 5 bytes (conversion result + NUL).
580 /// @param[in]  c  Character to convert. NUL is assumed to be NL according to
581 ///                `:h NL-used-for-NUL`.
transchar_nonprint(const buf_T * buf,char_u * charbuf,int c)582 void transchar_nonprint(const buf_T *buf, char_u *charbuf, int c)
583   FUNC_ATTR_NONNULL_ALL
584 {
585   if (c == NL) {
586     // we use newline in place of a NUL
587     c = NUL;
588   } else if ((c == CAR) && (get_fileformat(buf) == EOL_MAC)) {
589     // we use CR in place of  NL in this case
590     c = NL;
591   }
592   assert(c <= 0xff);
593 
594   if (dy_flags & DY_UHEX || c > 0x7f) {
595     // 'display' has "uhex"
596     transchar_hex((char *)charbuf, c);
597   } else {
598     // 0x00 - 0x1f and 0x7f
599     charbuf[0] = '^';
600     // DEL displayed as ^?
601     charbuf[1] = (char_u)(c ^ 0x40);
602 
603     charbuf[2] = NUL;
604   }
605 }
606 
607 /// Convert a non-printable character to hex C string like "<FFFF>"
608 ///
609 /// @param[out]  buf  Buffer to store result in.
610 /// @param[in]  c  Character to convert.
611 ///
612 /// @return Number of bytes stored in buffer, excluding trailing NUL byte.
transchar_hex(char * const buf,const int c)613 size_t transchar_hex(char *const buf, const int c)
614   FUNC_ATTR_NONNULL_ALL
615 {
616   size_t i = 0;
617 
618   buf[i++] = '<';
619   if (c > 255) {
620     if (c > 255 * 256) {
621       buf[i++] = (char)nr2hex((unsigned)c >> 20);
622       buf[i++] = (char)nr2hex((unsigned)c >> 16);
623     }
624     buf[i++] = (char)nr2hex((unsigned)c >> 12);
625     buf[i++] = (char)nr2hex((unsigned)c >> 8);
626   }
627   buf[i++] = (char)(nr2hex((unsigned)c >> 4));
628   buf[i++] = (char)(nr2hex((unsigned)c));
629   buf[i++] = '>';
630   buf[i] = NUL;
631   return i;
632 }
633 
634 /// Convert the lower 4 bits of byte "c" to its hex character
635 ///
636 /// Lower case letters are used to avoid the confusion of <F1> being 0xf1 or
637 /// function key 1.
638 ///
639 /// @param[in]  n  Number to convert.
640 ///
641 /// @return the hex character.
nr2hex(unsigned n)642 static inline unsigned nr2hex(unsigned n)
643   FUNC_ATTR_CONST FUNC_ATTR_WARN_UNUSED_RESULT
644 {
645   if ((n & 0xf) <= 9) {
646     return (n & 0xf) + '0';
647   }
648   return (n & 0xf) - 10 + 'a';
649 }
650 
651 /// Return number of display cells occupied by byte "b".
652 ///
653 /// Caller must make sure 0 <= b <= 255.
654 /// For multi-byte mode "b" must be the first byte of a character.
655 /// A TAB is counted as two cells: "^I".
656 /// This will return 0 for bytes >= 0x80, because the number of
657 /// cells depends on further bytes in UTF-8.
658 ///
659 /// @param b
660 ///
661 /// @reeturn Number of display cells.
byte2cells(int b)662 int byte2cells(int b)
663 {
664   if (b >= 0x80) {
665     return 0;
666   }
667   return g_chartab[b] & CT_CELL_MASK;
668 }
669 
670 /// Return number of display cells occupied by character "c".
671 ///
672 /// "c" can be a special key (negative number) in which case 3 or 4 is returned.
673 /// A TAB is counted as two cells: "^I" or four: "<09>".
674 ///
675 /// @param c
676 ///
677 /// @return Number of display cells.
char2cells(int c)678 int char2cells(int c)
679 {
680   if (IS_SPECIAL(c)) {
681     return char2cells(K_SECOND(c)) + 2;
682   }
683 
684   if (c >= 0x80) {
685     // UTF-8: above 0x80 need to check the value
686     return utf_char2cells(c);
687   }
688   return g_chartab[c & 0xff] & CT_CELL_MASK;
689 }
690 
691 /// Return number of display cells occupied by character at "*p".
692 /// A TAB is counted as two cells: "^I" or four: "<09>".
693 ///
694 /// @param p
695 ///
696 /// @return number of display cells.
ptr2cells(const char_u * p)697 int ptr2cells(const char_u *p)
698 {
699   // For UTF-8 we need to look at more bytes if the first byte is >= 0x80.
700   if (*p >= 0x80) {
701     return utf_ptr2cells(p);
702   }
703 
704   // For DBCS we can tell the cell count from the first byte.
705   return g_chartab[*p] & CT_CELL_MASK;
706 }
707 
708 /// Return the number of character cells string "s" will take on the screen,
709 /// counting TABs as two characters: "^I".
710 ///
711 /// 's' must be non-null.
712 ///
713 /// @param s
714 ///
715 /// @return number of character cells.
vim_strsize(char_u * s)716 int vim_strsize(char_u *s)
717 {
718   return vim_strnsize(s, MAXCOL);
719 }
720 
721 /// Return the number of character cells string "s[len]" will take on the
722 /// screen, counting TABs as two characters: "^I".
723 ///
724 /// 's' must be non-null.
725 ///
726 /// @param s
727 /// @param len
728 ///
729 /// @return Number of character cells.
vim_strnsize(char_u * s,int len)730 int vim_strnsize(char_u *s, int len)
731 {
732   assert(s != NULL);
733   int size = 0;
734   while (*s != NUL && --len >= 0) {
735     int l = utfc_ptr2len(s);
736     size += ptr2cells(s);
737     s += l;
738     len -= l - 1;
739   }
740   return size;
741 }
742 
743 /// Check that "c" is a normal identifier character:
744 /// Letters and characters from the 'isident' option.
745 ///
746 /// @param  c  character to check
vim_isIDc(int c)747 bool vim_isIDc(int c)
748   FUNC_ATTR_PURE FUNC_ATTR_WARN_UNUSED_RESULT
749 {
750   return c > 0 && c < 0x100 && (g_chartab[c] & CT_ID_CHAR);
751 }
752 
753 /// Check that "c" is a keyword character:
754 /// Letters and characters from 'iskeyword' option for the current buffer.
755 /// For multi-byte characters mb_get_class() is used (builtin rules).
756 ///
757 /// @param  c  character to check
vim_iswordc(const int c)758 bool vim_iswordc(const int c)
759   FUNC_ATTR_PURE FUNC_ATTR_WARN_UNUSED_RESULT
760 {
761   return vim_iswordc_buf(c, curbuf);
762 }
763 
764 /// Check that "c" is a keyword character
765 /// Letters and characters from 'iskeyword' option for given buffer.
766 /// For multi-byte characters mb_get_class() is used (builtin rules).
767 ///
768 /// @param[in]  c  Character to check.
769 /// @param[in]  chartab  Buffer chartab.
vim_iswordc_tab(const int c,const uint64_t * const chartab)770 bool vim_iswordc_tab(const int c, const uint64_t *const chartab)
771   FUNC_ATTR_PURE FUNC_ATTR_WARN_UNUSED_RESULT FUNC_ATTR_NONNULL_ALL
772 {
773   return (c >= 0x100
774           ? (utf_class_tab(c, chartab) >= 2)
775           : (c > 0 && GET_CHARTAB_TAB(chartab, c) != 0));
776 }
777 
778 /// Check that "c" is a keyword character:
779 /// Letters and characters from 'iskeyword' option for given buffer.
780 /// For multi-byte characters mb_get_class() is used (builtin rules).
781 ///
782 /// @param  c    character to check
783 /// @param  buf  buffer whose keywords to use
vim_iswordc_buf(const int c,buf_T * const buf)784 bool vim_iswordc_buf(const int c, buf_T *const buf)
785   FUNC_ATTR_PURE FUNC_ATTR_WARN_UNUSED_RESULT FUNC_ATTR_NONNULL_ARG(2)
786 {
787   return vim_iswordc_tab(c, buf->b_chartab);
788 }
789 
790 /// Just like vim_iswordc() but uses a pointer to the (multi-byte) character.
791 ///
792 /// @param  p  pointer to the multi-byte character
793 ///
794 /// @return true if "p" points to a keyword character.
vim_iswordp(const char_u * const p)795 bool vim_iswordp(const char_u *const p)
796   FUNC_ATTR_PURE FUNC_ATTR_WARN_UNUSED_RESULT FUNC_ATTR_NONNULL_ALL
797 {
798   return vim_iswordp_buf(p, curbuf);
799 }
800 
801 /// Just like vim_iswordc_buf() but uses a pointer to the (multi-byte)
802 /// character.
803 ///
804 /// @param  p    pointer to the multi-byte character
805 /// @param  buf  buffer whose keywords to use
806 ///
807 /// @return true if "p" points to a keyword character.
vim_iswordp_buf(const char_u * const p,buf_T * const buf)808 bool vim_iswordp_buf(const char_u *const p, buf_T *const buf)
809   FUNC_ATTR_PURE FUNC_ATTR_WARN_UNUSED_RESULT FUNC_ATTR_NONNULL_ALL
810 {
811   int c = *p;
812 
813   if (MB_BYTE2LEN(c) > 1) {
814     c = utf_ptr2char(p);
815   }
816   return vim_iswordc_buf(c, buf);
817 }
818 
819 /// Check that "c" is a valid file-name character.
820 /// Assume characters above 0x100 are valid (multi-byte).
821 ///
822 /// @param  c  character to check
vim_isfilec(int c)823 bool vim_isfilec(int c)
824   FUNC_ATTR_PURE FUNC_ATTR_WARN_UNUSED_RESULT
825 {
826   return c >= 0x100 || (c > 0 && (g_chartab[c] & CT_FNAME_CHAR));
827 }
828 
829 /// Check that "c" is a valid file-name character or a wildcard character
830 /// Assume characters above 0x100 are valid (multi-byte).
831 /// Explicitly interpret ']' as a wildcard character as path_has_wildcard("]")
832 /// returns false.
833 ///
834 /// @param  c  character to check
vim_isfilec_or_wc(int c)835 bool vim_isfilec_or_wc(int c)
836   FUNC_ATTR_PURE FUNC_ATTR_WARN_UNUSED_RESULT
837 {
838   char_u buf[2];
839   buf[0] = (char_u)c;
840   buf[1] = NUL;
841   return vim_isfilec(c) || c == ']' || path_has_wildcard(buf);
842 }
843 
844 /// Check that "c" is a printable character.
845 /// Assume characters above 0x100 are printable for double-byte encodings.
846 ///
847 /// @param  c  character to check
vim_isprintc(int c)848 bool vim_isprintc(int c)
849   FUNC_ATTR_PURE FUNC_ATTR_WARN_UNUSED_RESULT
850 {
851   if (c >= 0x100) {
852     return utf_printable(c);
853   }
854   return c > 0 && (g_chartab[c] & CT_PRINT_CHAR);
855 }
856 
857 /// Strict version of vim_isprintc(c), don't return true if "c" is the head
858 /// byte of a double-byte character.
859 ///
860 /// @param  c  character to check
861 ///
862 /// @return true if "c" is a printable character.
vim_isprintc_strict(int c)863 bool vim_isprintc_strict(int c)
864   FUNC_ATTR_PURE FUNC_ATTR_WARN_UNUSED_RESULT
865 {
866   if (c >= 0x100) {
867     return utf_printable(c);
868   }
869   return c > 0 && (g_chartab[c] & CT_PRINT_CHAR);
870 }
871 
872 /// Check that virtual column "vcol" is in the rightmost column of window "wp".
873 ///
874 /// @param  wp    window
875 /// @param  vcol  column number
in_win_border(win_T * wp,colnr_T vcol)876 bool in_win_border(win_T *wp, colnr_T vcol)
877   FUNC_ATTR_PURE FUNC_ATTR_WARN_UNUSED_RESULT FUNC_ATTR_NONNULL_ARG(1)
878 {
879   int width1;             // width of first line (after line number)
880   int width2;             // width of further lines
881 
882   if (wp->w_width_inner == 0) {
883     // there is no border
884     return false;
885   }
886   width1 = wp->w_width_inner - win_col_off(wp);
887 
888   if ((int)vcol < width1 - 1) {
889     return false;
890   }
891 
892   if ((int)vcol == width1 - 1) {
893     return true;
894   }
895   width2 = width1 + win_col_off2(wp);
896 
897   if (width2 <= 0) {
898     return false;
899   }
900   return (vcol - width1) % width2 == width2 - 1;
901 }
902 
/// Get virtual column number of pos.
///  start: on the first position of this character (TAB, ctrl)
/// cursor: where the cursor is on this character (first char, except for TAB)
///    end: on the last position of this character (TAB, ctrl)
///
/// This is used very often, keep it fast!
///
/// @param wp      window; its buffer supplies the line and the tabstop
///                settings, its options select the fast or the slow loop
/// @param pos     position to compute the column for; a "col" of MAXCOL
///                means the end of the line.  "pos->col" is reset to 0 when
///                the line is empty.
/// @param start   when not NULL: receives the first column of the character
/// @param cursor  when not NULL: receives the column the cursor is shown in
/// @param end     when not NULL: receives the last column of the character
void getvcol(win_T *wp, pos_T *pos, colnr_T *start, colnr_T *cursor, colnr_T *end)
{
  colnr_T vcol;
  char_u *ptr;    // points to current char
  char_u *posptr;  // points to char at pos->col
  char_u *line;   // start of the line
  int incr;       // width in cells of the character at "ptr"
  int head;       // extra cells in front of the character
  long *vts = wp->w_buffer->b_p_vts_array;
  int ts = (int)wp->w_buffer->b_p_ts;
  int c;

  vcol = 0;
  line = ptr = ml_get_buf(wp->w_buffer, pos->lnum, false);

  if (pos->col == MAXCOL) {
    // continue until the NUL
    posptr = NULL;
  } else {
    // Special check for an empty line, which can happen on exit, when
    // ml_get_buf() always returns an empty string.
    if (*ptr == NUL) {
      pos->col = 0;
    }
    posptr = ptr + pos->col;
    // NOTE(review): presumably this steps back to the first byte of the
    // multi-byte character that contains pos->col -- confirm against
    // utf_head_off().
    posptr -= utf_head_off(line, posptr);
  }

  // This function is used very often, do some speed optimizations.
  // When 'list', 'linebreak', 'showbreak' and 'breakindent' are not set
  // use a simple loop.
  // Also use this when 'list' is set but tabs take their normal size.
  if ((!wp->w_p_list || (wp->w_p_lcs_chars.tab1 != NUL))
      && !wp->w_p_lbr
      && *get_showbreak_value(wp) == NUL
      && !wp->w_p_bri) {
    for (;;) {
      head = 0;
      c = *ptr;

      // make sure we don't go past the end of the line
      if (c == NUL) {
        // NUL at end of line only takes one column
        incr = 1;
        break;
      }

      // A tab gets expanded, depending on the current column
      if (c == TAB) {
        incr = tabstop_padding(vcol, ts, vts);
      } else {
        // For utf-8, if the byte is >= 0x80, need to look at
        // further bytes to find the cell width.
        if (c >= 0x80) {
          incr = utf_ptr2cells(ptr);
        } else {
          incr = g_chartab[c] & CT_CELL_MASK;
        }

        // If a double-cell char doesn't fit at the end of a line
        // it wraps to the next line, it's like this char is three
        // cells wide.
        if ((incr == 2)
            && wp->w_p_wrap
            && (MB_BYTE2LEN(*ptr) > 1)
            && in_win_border(wp, vcol)) {
          incr++;
          head = 1;
        }
      }

      // "incr" and "head" of the character at "ptr" are kept for the
      // results computed after the loop.
      if ((posptr != NULL) && (ptr >= posptr)) {
        // character at pos->col
        break;
      }

      vcol += incr;
      MB_PTR_ADV(ptr);
    }
  } else {
    // Slow path: win_lbr_chartabsize() computes the width of each character.
    for (;;) {
      // A tab gets expanded, depending on the current column
      head = 0;
      incr = win_lbr_chartabsize(wp, line, ptr, vcol, &head);

      // make sure we don't go past the end of the line
      if (*ptr == NUL) {
        // NUL at end of line only takes one column
        incr = 1;
        break;
      }

      if ((posptr != NULL) && (ptr >= posptr)) {
        // character at pos->col
        break;
      }

      vcol += incr;
      MB_PTR_ADV(ptr);
    }
  }

  // "vcol" is now the column where the character at "ptr" begins.
  if (start != NULL) {
    *start = vcol + head;
  }

  if (end != NULL) {
    *end = vcol + incr - 1;
  }

  if (cursor != NULL) {
    // On a TAB the cursor is put on its last cell, but only when all of the
    // conditions below hold; in every other case it goes on the first cell.
    if ((*ptr == TAB)
        && (State & NORMAL)
        && !wp->w_p_list
        && !virtual_active()
        && !(VIsual_active && ((*p_sel == 'e') || ltoreq(*pos, VIsual)))) {
      // cursor at end
      *cursor = vcol + incr - 1;
    } else {
      // cursor at start
      *cursor = vcol + head;
    }
  }
}
1039 
1040 /// Get virtual cursor column in the current window, pretending 'list' is off.
1041 ///
1042 /// @param posp
1043 ///
1044 /// @retujrn The virtual cursor column.
getvcol_nolist(pos_T * posp)1045 colnr_T getvcol_nolist(pos_T *posp)
1046 {
1047   int list_save = curwin->w_p_list;
1048   colnr_T vcol;
1049 
1050   curwin->w_p_list = false;
1051   if (posp->coladd) {
1052     getvvcol(curwin, posp, NULL, &vcol, NULL);
1053   } else {
1054     getvcol(curwin, posp, NULL, &vcol, NULL);
1055   }
1056   curwin->w_p_list = list_save;
1057   return vcol;
1058 }
1059 
1060 /// Get virtual column in virtual mode.
1061 ///
1062 /// @param wp
1063 /// @param pos
1064 /// @param start
1065 /// @param cursor
1066 /// @param end
getvvcol(win_T * wp,pos_T * pos,colnr_T * start,colnr_T * cursor,colnr_T * end)1067 void getvvcol(win_T *wp, pos_T *pos, colnr_T *start, colnr_T *cursor, colnr_T *end)
1068 {
1069   colnr_T col;
1070   colnr_T coladd;
1071   colnr_T endadd;
1072   char_u *ptr;
1073 
1074   if (virtual_active()) {
1075     // For virtual mode, only want one value
1076     getvcol(wp, pos, &col, NULL, NULL);
1077 
1078     coladd = pos->coladd;
1079     endadd = 0;
1080 
1081     // Cannot put the cursor on part of a wide character.
1082     ptr = ml_get_buf(wp->w_buffer, pos->lnum, false);
1083 
1084     if (pos->col < (colnr_T)STRLEN(ptr)) {
1085       int c = utf_ptr2char(ptr + pos->col);
1086       if ((c != TAB) && vim_isprintc(c)) {
1087         endadd = (colnr_T)(char2cells(c) - 1);
1088         if (coladd > endadd) {
1089           // past end of line
1090           endadd = 0;
1091         } else {
1092           coladd = 0;
1093         }
1094       }
1095     }
1096     col += coladd;
1097 
1098     if (start != NULL) {
1099       *start = col;
1100     }
1101 
1102     if (cursor != NULL) {
1103       *cursor = col;
1104     }
1105 
1106     if (end != NULL) {
1107       *end = col + endadd;
1108     }
1109   } else {
1110     getvcol(wp, pos, start, cursor, end);
1111   }
1112 }
1113 
1114 /// Get the leftmost and rightmost virtual column of pos1 and pos2.
1115 /// Used for Visual block mode.
1116 ///
1117 /// @param wp
1118 /// @param pos1
1119 /// @param pos2
1120 /// @param left
1121 /// @param right
getvcols(win_T * wp,pos_T * pos1,pos_T * pos2,colnr_T * left,colnr_T * right)1122 void getvcols(win_T *wp, pos_T *pos1, pos_T *pos2, colnr_T *left, colnr_T *right)
1123 {
1124   colnr_T from1;
1125   colnr_T from2;
1126   colnr_T to1;
1127   colnr_T to2;
1128 
1129   if (lt(*pos1, *pos2)) {
1130     getvvcol(wp, pos1, &from1, NULL, &to1);
1131     getvvcol(wp, pos2, &from2, NULL, &to2);
1132   } else {
1133     getvvcol(wp, pos2, &from1, NULL, &to1);
1134     getvvcol(wp, pos1, &from2, NULL, &to2);
1135   }
1136 
1137   if (from2 < from1) {
1138     *left = from2;
1139   } else {
1140     *left = from1;
1141   }
1142 
1143   if (to2 > to1) {
1144     if ((*p_sel == 'e') && (from2 - 1 >= to1)) {
1145       *right = from2 - 1;
1146     } else {
1147       *right = to2;
1148     }
1149   } else {
1150     *right = to1;
1151   }
1152 }
1153 
1154 /// skipwhite: skip over ' ' and '\t'.
1155 ///
1156 /// @param[in]  p  String to skip in.
1157 ///
1158 /// @return Pointer to character after the skipped whitespace.
skipwhite(const char_u * const p)1159 char_u *skipwhite(const char_u *const p)
1160   FUNC_ATTR_PURE FUNC_ATTR_WARN_UNUSED_RESULT FUNC_ATTR_NONNULL_ALL
1161   FUNC_ATTR_NONNULL_RET
1162 {
1163   return skipwhite_len(p, STRLEN(p));
1164 }
1165 
1166 /// Like `skipwhite`, but skip up to `len` characters.
1167 /// @see skipwhite
1168 ///
1169 /// @param[in]  p    String to skip in.
1170 /// @param[in]  len  Max length to skip.
1171 ///
1172 /// @return Pointer to character after the skipped whitespace, or the `len`-th
1173 ///         character in the string.
skipwhite_len(const char_u * p,size_t len)1174 char_u *skipwhite_len(const char_u *p, size_t len)
1175   FUNC_ATTR_PURE FUNC_ATTR_WARN_UNUSED_RESULT FUNC_ATTR_NONNULL_ALL
1176   FUNC_ATTR_NONNULL_RET
1177 {
1178   for (; len > 0 && ascii_iswhite(*p); len--) {
1179     p++;
1180   }
1181   return (char_u *)p;
1182 }
1183 
1184 // getwhitecols: return the number of whitespace
1185 // columns (bytes) at the start of a given line
getwhitecols_curline(void)1186 intptr_t getwhitecols_curline(void)
1187 {
1188   return getwhitecols(get_cursor_line_ptr());
1189 }
1190 
/// Get the number of whitespace columns (bytes) at the start of a line.
///
/// @param[in]  p  Line to inspect.
///
/// @return Distance in bytes from `p` to the position skipwhite() returns.
intptr_t getwhitecols(const char_u *p)
{
  const char_u *const first_nonwhite = skipwhite(p);
  return first_nonwhite - p;
}
1195 
1196 /// Skip over digits
1197 ///
1198 /// @param[in]  q  String to skip digits in.
1199 ///
1200 /// @return Pointer to the character after the skipped digits.
skipdigits(const char_u * q)1201 char_u *skipdigits(const char_u *q)
1202   FUNC_ATTR_PURE FUNC_ATTR_WARN_UNUSED_RESULT FUNC_ATTR_NONNULL_ALL
1203   FUNC_ATTR_NONNULL_RET
1204 {
1205   const char_u *p = q;
1206   while (ascii_isdigit(*p)) {
1207     // skip to next non-digit
1208     p++;
1209   }
1210   return (char_u *)p;
1211 }
1212 
1213 /// skip over binary digits
1214 ///
1215 /// @param q pointer to string
1216 ///
1217 /// @return Pointer to the character after the skipped digits.
skipbin(const char * q)1218 const char *skipbin(const char *q)
1219   FUNC_ATTR_PURE
1220   FUNC_ATTR_NONNULL_ALL
1221   FUNC_ATTR_NONNULL_RET
1222 {
1223   const char *p = q;
1224   while (ascii_isbdigit(*p)) {
1225     // skip to next non-digit
1226     p++;
1227   }
1228   return p;
1229 }
1230 
1231 /// skip over digits and hex characters
1232 ///
1233 /// @param q
1234 ///
1235 /// @return Pointer to the character after the skipped digits and hex
1236 ///         characters.
skiphex(char_u * q)1237 char_u *skiphex(char_u *q)
1238 {
1239   char_u *p = q;
1240   while (ascii_isxdigit(*p)) {
1241     // skip to next non-digit
1242     p++;
1243   }
1244   return p;
1245 }
1246 
1247 /// skip to digit (or NUL after the string)
1248 ///
1249 /// @param q
1250 ///
1251 /// @return Pointer to the digit or (NUL after the string).
skiptodigit(char_u * q)1252 char_u *skiptodigit(char_u *q)
1253 {
1254   char_u *p = q;
1255   while (*p != NUL && !ascii_isdigit(*p)) {
1256     // skip to next digit
1257     p++;
1258   }
1259   return p;
1260 }
1261 
1262 /// skip to binary character (or NUL after the string)
1263 ///
1264 /// @param q pointer to string
1265 ///
1266 /// @return Pointer to the binary character or (NUL after the string).
skiptobin(const char * q)1267 const char *skiptobin(const char *q)
1268   FUNC_ATTR_PURE
1269   FUNC_ATTR_NONNULL_ALL
1270   FUNC_ATTR_NONNULL_RET
1271 {
1272   const char *p = q;
1273   while (*p != NUL && !ascii_isbdigit(*p)) {
1274     // skip to next digit
1275     p++;
1276   }
1277   return p;
1278 }
1279 
1280 /// skip to hex character (or NUL after the string)
1281 ///
1282 /// @param q
1283 ///
1284 /// @return Pointer to the hex character or (NUL after the string).
skiptohex(char_u * q)1285 char_u *skiptohex(char_u *q)
1286 {
1287   char_u *p = q;
1288   while (*p != NUL && !ascii_isxdigit(*p)) {
1289     // skip to next digit
1290     p++;
1291   }
1292   return p;
1293 }
1294 
1295 /// Skip over text until ' ' or '\t' or NUL
1296 ///
1297 /// @param[in]  p  Text to skip over.
1298 ///
1299 /// @return Pointer to the next whitespace or NUL character.
skiptowhite(const char_u * p)1300 char_u *skiptowhite(const char_u *p)
1301   FUNC_ATTR_NONNULL_ALL
1302 {
1303   while (*p != ' ' && *p != '\t' && *p != NUL) {
1304     p++;
1305   }
1306   return (char_u *)p;
1307 }
1308 
1309 /// skiptowhite_esc: Like skiptowhite(), but also skip escaped chars
1310 ///
1311 /// @param p
1312 ///
1313 /// @return Pointer to the next whitespace character.
skiptowhite_esc(char_u * p)1314 char_u *skiptowhite_esc(char_u *p)
1315 {
1316   while (*p != ' ' && *p != '\t' && *p != NUL) {
1317     if (((*p == '\\') || (*p == Ctrl_V)) && (*(p + 1) != NUL)) {
1318       ++p;
1319     }
1320     ++p;
1321   }
1322   return p;
1323 }
1324 
1325 /// Skip over text until '\n' or NUL.
1326 ///
1327 /// @param[in]  p  Text to skip over.
1328 ///
1329 /// @return Pointer to the next '\n' or NUL character.
skip_to_newline(const char_u * const p)1330 char_u *skip_to_newline(const char_u *const p)
1331   FUNC_ATTR_PURE FUNC_ATTR_WARN_UNUSED_RESULT FUNC_ATTR_NONNULL_ALL
1332   FUNC_ATTR_NONNULL_RET
1333 {
1334   return (char_u *)xstrchrnul((const char *)p, NL);
1335 }
1336 
1337 /// Gets a number from a string and skips over it, signalling overflow.
1338 ///
1339 /// @param[out]  pp  A pointer to a pointer to char_u.
1340 ///                  It will be advanced past the read number.
1341 /// @param[out]  nr  Number read from the string.
1342 ///
1343 /// @return true on success, false on error/overflow
try_getdigits(char_u ** pp,intmax_t * nr)1344 bool try_getdigits(char_u **pp, intmax_t *nr)
1345 {
1346   errno = 0;
1347   *nr = strtoimax((char *)(*pp), (char **)pp, 10);
1348   if (errno == ERANGE && (*nr == INTMAX_MIN || *nr == INTMAX_MAX)) {
1349     return false;
1350   }
1351   return true;
1352 }
1353 
1354 /// Gets a number from a string and skips over it.
1355 ///
1356 /// @param[out]  pp  Pointer to a pointer to char_u.
1357 ///                  It will be advanced past the read number.
1358 /// @param strict    Abort on overflow.
1359 /// @param def       Default value, if parsing fails or overflow occurs.
1360 ///
1361 /// @return Number read from the string, or `def` on parse failure or overflow.
getdigits(char_u ** pp,bool strict,intmax_t def)1362 intmax_t getdigits(char_u **pp, bool strict, intmax_t def)
1363 {
1364   intmax_t number;
1365   int ok = try_getdigits(pp, &number);
1366   if (strict && !ok) {
1367     abort();
1368   }
1369   return ok ? number : def;
1370 }
1371 
1372 /// Gets an int number from a string.
1373 ///
1374 /// @see getdigits
getdigits_int(char_u ** pp,bool strict,int def)1375 int getdigits_int(char_u **pp, bool strict, int def)
1376 {
1377   intmax_t number = getdigits(pp, strict, def);
1378 #if SIZEOF_INTMAX_T > SIZEOF_INT
1379   if (strict) {
1380     assert(number >= INT_MIN && number <= INT_MAX);
1381   } else if (!(number >= INT_MIN && number <= INT_MAX)) {
1382     return def;
1383   }
1384 #endif
1385   return (int)number;
1386 }
1387 
1388 /// Gets a long number from a string.
1389 ///
1390 /// @see getdigits
getdigits_long(char_u ** pp,bool strict,long def)1391 long getdigits_long(char_u **pp, bool strict, long def)
1392 {
1393   intmax_t number = getdigits(pp, strict, def);
1394 #if SIZEOF_INTMAX_T > SIZEOF_LONG
1395   if (strict) {
1396     assert(number >= LONG_MIN && number <= LONG_MAX);
1397   } else if (!(number >= LONG_MIN && number <= LONG_MAX)) {
1398     return def;
1399   }
1400 #endif
1401   return (long)number;
1402 }
1403 
1404 /// Check that "lbuf" is empty or only contains blanks.
1405 ///
1406 /// @param  lbuf  line buffer to check
vim_isblankline(char_u * lbuf)1407 bool vim_isblankline(char_u *lbuf)
1408 {
1409   char_u *p = skipwhite(lbuf);
1410   return *p == NUL || *p == '\r' || *p == '\n';
1411 }
1412 
1413 /// Convert a string into a long and/or unsigned long, taking care of
1414 /// hexadecimal, octal and binary numbers.  Accepts a '-' sign.
1415 /// If "prep" is not NULL, returns a flag to indicate the type of the number:
1416 ///   0      decimal
1417 ///   '0'    octal
1418 ///   'O'    octal
1419 ///   'o'    octal
1420 ///   'B'    bin
1421 ///   'b'    bin
1422 ///   'X'    hex
1423 ///   'x'    hex
1424 /// If "len" is not NULL, the length of the number in characters is returned.
1425 /// If "nptr" is not NULL, the signed result is returned in it.
1426 /// If "unptr" is not NULL, the unsigned result is returned in it.
1427 /// If "what" contains STR2NR_BIN recognize binary numbers.
1428 /// If "what" contains STR2NR_OCT recognize octal numbers.
1429 /// If "what" contains STR2NR_HEX recognize hex numbers.
1430 /// If "what" contains STR2NR_FORCE always assume bin/oct/hex.
1431 /// If "what" contains STR2NR_QUOTE ignore embedded single quotes
1432 /// If maxlen > 0, check at a maximum maxlen chars.
1433 /// If strict is true, check the number strictly. return *len = 0 if fail.
1434 ///
1435 /// @param start
1436 /// @param prep Returns guessed type of number 0 = decimal, 'x' or 'X' is
1437 ///             hexadecimal, '0', 'o' or 'O' is octal, 'b' or 'B' is binary.
1438 ///             When using STR2NR_FORCE is always zero.
1439 /// @param len Returns the detected length of number.
1440 /// @param what Recognizes what number passed, @see ChStr2NrFlags.
1441 /// @param nptr Returns the signed result.
1442 /// @param unptr Returns the unsigned result.
1443 /// @param maxlen Max length of string to check.
1444 /// @param strict If true, fail if the number has unexpected trailing
1445 ///               alpha-numeric chars: *len is set to 0 and nothing else is
1446 ///               returned.
void vim_str2nr(const char_u *const start, int *const prep, int *const len, const int what,
                varnumber_T *const nptr, uvarnumber_T *const unptr, const int maxlen,
                const bool strict)
  FUNC_ATTR_NONNULL_ARG(1)
{
  const char *ptr = (const char *)start;
// True when "ptr" is at or beyond start + maxlen.  Always false when maxlen
// is 0, i.e. when there is no length limit.
#define STRING_ENDED(ptr) \
  (!(maxlen == 0 || (int)((ptr) - (const char *)start) < maxlen))
  int pre = 0;  // default is decimal
  const bool negative = (ptr[0] == '-');
  uvarnumber_T un = 0;  // accumulated magnitude, without the sign

  // Report "nothing parsed" until the number was accepted; the strict check
  // below returns early and leaves this value in place.
  if (len != NULL) {
    *len = 0;
  }

  if (negative) {
    ptr++;
  }

  if (what & STR2NR_FORCE) {
    // When forcing main consideration is skipping the prefix. Decimal numbers
    // have no prefixes to skip. pre is not set.
    switch (what & ~(STR2NR_FORCE | STR2NR_QUOTE)) {
    case STR2NR_HEX:
      if (!STRING_ENDED(ptr + 2)
          && ptr[0] == '0'
          && (ptr[1] == 'x' || ptr[1] == 'X')
          && ascii_isxdigit(ptr[2])) {
        ptr += 2;
      }
      goto vim_str2nr_hex;
    case STR2NR_BIN:
      if (!STRING_ENDED(ptr + 2)
          && ptr[0] == '0'
          && (ptr[1] == 'b' || ptr[1] == 'B')
          && ascii_isbdigit(ptr[2])) {
        ptr += 2;
      }
      goto vim_str2nr_bin;
    // Make STR2NR_OOCT work the same as STR2NR_OCT when forcing.
    case STR2NR_OCT:
    case STR2NR_OOCT:
    case STR2NR_OCT | STR2NR_OOCT:
      if (!STRING_ENDED(ptr + 2)
          && ptr[0] == '0'
          && (ptr[1] == 'o' || ptr[1] == 'O')
          && ascii_isodigit(ptr[2])) {
        ptr += 2;
      }
      goto vim_str2nr_oct;
    case 0:
      goto vim_str2nr_dec;
    default:
      // More than one base flag combined with STR2NR_FORCE is not supported.
      abort();
    }
  } else if ((what & (STR2NR_HEX | STR2NR_OCT | STR2NR_OOCT | STR2NR_BIN))
             && !STRING_ENDED(ptr + 1) && ptr[0] == '0' && ptr[1] != '8'
             && ptr[1] != '9') {
    // Leading "0" not followed by '8' or '9': try to detect the base from the
    // character after the zero.
    pre = ptr[1];
    // Detect hexadecimal: 0x or 0X followed by hex digit.
    if ((what & STR2NR_HEX)
        && !STRING_ENDED(ptr + 2)
        && (pre == 'X' || pre == 'x')
        && ascii_isxdigit(ptr[2])) {
      ptr += 2;
      goto vim_str2nr_hex;
    }
    // Detect binary: 0b or 0B followed by 0 or 1.
    if ((what & STR2NR_BIN)
        && !STRING_ENDED(ptr + 2)
        && (pre == 'B' || pre == 'b')
        && ascii_isbdigit(ptr[2])) {
      ptr += 2;
      goto vim_str2nr_bin;
    }
    // Detect octal: 0o or 0O followed by octal digits (without '8' or '9').
    if ((what & STR2NR_OOCT)
        && !STRING_ENDED(ptr + 2)
        && (pre == 'O' || pre == 'o')
        && ascii_isodigit(ptr[2])) {
      ptr += 2;
      goto vim_str2nr_oct;
    }
    // Detect old octal format: 0 followed by octal digits.
    pre = 0;
    if (!(what & STR2NR_OCT)
        || !ascii_isodigit(ptr[1])) {
      goto vim_str2nr_dec;
    }
    // Any '8' or '9' among the following digits makes the number decimal.
    for (int i = 2; !STRING_ENDED(ptr + i) && ascii_isdigit(ptr[i]); i++) {
      if (ptr[i] > '7') {
        goto vim_str2nr_dec;
      }
    }
    pre = '0';
    goto vim_str2nr_oct;
  } else {
    goto vim_str2nr_dec;
  }

  // Do the conversion manually to avoid sscanf() quirks.
  abort();  // Should’ve used goto earlier.
// PARSE_NUMBER(base, cond, conv): consume characters while "cond" holds and
// the string has not ended, adding "conv" (the value of the digit at "ptr") to
// "un" in the given base.  On overflow "un" is set to UVARNUMBER_MAX instead of
// wrapping.  With STR2NR_QUOTE a single quote is skipped when it is not the
// first character and is directly followed by another digit.
#define PARSE_NUMBER(base, cond, conv) \
  do { \
    const char *const after_prefix = ptr; \
    while (!STRING_ENDED(ptr)) { \
      if ((what & STR2NR_QUOTE) && ptr > after_prefix && *ptr == '\'') { \
        ptr++; \
        if (!STRING_ENDED(ptr) && (cond)) { \
          continue; \
        } \
        ptr--; \
      } \
      if (!(cond)) { \
        break; \
      } \
      const uvarnumber_T digit = (uvarnumber_T)(conv); \
      /* avoid ubsan error for overflow */ \
      if (un < UVARNUMBER_MAX / base \
          || (un == UVARNUMBER_MAX / base \
              && (base != 10 || digit <= UVARNUMBER_MAX % 10))) { \
        un = base * un + digit; \
      } else { \
        un = UVARNUMBER_MAX; \
      } \
      ptr++; \
    } \
  } while (0)
vim_str2nr_bin:
  PARSE_NUMBER(2, (*ptr == '0' || *ptr == '1'), (*ptr - '0'));
  goto vim_str2nr_proceed;
vim_str2nr_oct:
  PARSE_NUMBER(8, (ascii_isodigit(*ptr)), (*ptr - '0'));
  goto vim_str2nr_proceed;
vim_str2nr_dec:
  PARSE_NUMBER(10, (ascii_isdigit(*ptr)), (*ptr - '0'));
  goto vim_str2nr_proceed;
vim_str2nr_hex:
  PARSE_NUMBER(16, (ascii_isxdigit(*ptr)), (hex2nr(*ptr)));
  goto vim_str2nr_proceed;
#undef PARSE_NUMBER

vim_str2nr_proceed:
  // Check for an alpha-numeric character immediately following, that is
  // most likely a typo.
  // Returning here leaves *len at 0 and does not write any other output.
  if (strict && ptr - (const char *)start != maxlen && ASCII_ISALNUM(*ptr)) {
    return;
  }

  if (prep != NULL) {
    *prep = pre;
  }

  if (len != NULL) {
    *len = (int)(ptr - (const char *)start);
  }

  if (nptr != NULL) {
    if (negative) {  // account for leading '-' for decimal numbers
      // avoid ubsan error for overflow
      if (un > VARNUMBER_MAX) {
        *nptr = VARNUMBER_MIN;
      } else {
        *nptr = -(varnumber_T)un;
      }
    } else {
      // NOTE: this clamps "un" itself, so the value stored in *unptr below is
      // limited to VARNUMBER_MAX as well whenever nptr is given.
      if (un > VARNUMBER_MAX) {
        un = VARNUMBER_MAX;
      }
      *nptr = (varnumber_T)un;
    }
  }

  if (unptr != NULL) {
    *unptr = un;
  }
#undef STRING_ENDED
}
1626 
/// Return the value of a single hex character.
/// Only valid when the argument is '0' - '9', 'A' - 'F' or 'a' - 'f'.
///
/// @param c  Character to convert.
///
/// @return The value of the hex character.
int hex2nr(int c)
{
  int base;
  int offset;

  if ('a' <= c && c <= 'f') {
    base = 'a';
    offset = 10;
  } else if ('A' <= c && c <= 'F') {
    base = 'A';
    offset = 10;
  } else {
    // Everything else is treated as a decimal digit.
    base = '0';
    offset = 0;
  }
  return c - base + offset;
}
1644 
1645 /// Convert two hex characters to a byte.
1646 /// Return -1 if one of the characters is not hex.
hexhex2nr(char_u * p)1647 int hexhex2nr(char_u *p)
1648 {
1649   if (!ascii_isxdigit(p[0]) || !ascii_isxdigit(p[1])) {
1650     return -1;
1651   }
1652   return (hex2nr(p[0]) << 4) + hex2nr(p[1]);
1653 }
1654 
1655 /// Check that "str" starts with a backslash that should be removed.
1656 /// For Windows this is only done when the character after the
1657 /// backslash is not a normal file name character.
1658 /// '$' is a valid file name character, we don't remove the backslash before
1659 /// it.  This means it is not possible to use an environment variable after a
1660 /// backslash.  "C:\$VIM\doc" is taken literally, only "$VIM\doc" works.
1661 /// Although "\ name" is valid, the backslash in "Program\ files" must be
1662 /// removed.  Assume a file name doesn't start with a space.
1663 /// For multi-byte names, never remove a backslash before a non-ascii
1664 /// character, assume that all multi-byte characters are valid file name
1665 /// characters.
1666 ///
1667 /// @param  str  file path string to check
rem_backslash(const char_u * str)1668 bool rem_backslash(const char_u *str)
1669   FUNC_ATTR_PURE FUNC_ATTR_WARN_UNUSED_RESULT FUNC_ATTR_NONNULL_ALL
1670 {
1671 #ifdef BACKSLASH_IN_FILENAME
1672   return str[0] == '\\'
1673          && str[1] < 0x80
1674          && (str[1] == ' '
1675              || (str[1] != NUL
1676                  && str[1] != '*'
1677                  && str[1] != '?'
1678                  && !vim_isfilec(str[1])));
1679 
1680 #else  // ifdef BACKSLASH_IN_FILENAME
1681   return str[0] == '\\' && str[1] != NUL;
1682 #endif  // ifdef BACKSLASH_IN_FILENAME
1683 }
1684 
1685 /// Halve the number of backslashes in a file name argument.
1686 ///
1687 /// @param p
backslash_halve(char_u * p)1688 void backslash_halve(char_u *p)
1689 {
1690   for (; *p; ++p) {
1691     if (rem_backslash(p)) {
1692       STRMOVE(p, p + 1);
1693     }
1694   }
1695 }
1696 
1697 /// backslash_halve() plus save the result in allocated memory.
1698 ///
1699 /// @param p
1700 ///
1701 /// @return String with the number of backslashes halved.
backslash_halve_save(const char_u * p)1702 char_u *backslash_halve_save(const char_u *p)
1703   FUNC_ATTR_NONNULL_ALL FUNC_ATTR_NONNULL_RET
1704 {
1705   // TODO(philix): simplify and improve backslash_halve_save algorithm
1706   char_u *res = vim_strsave(p);
1707   backslash_halve(res);
1708   return res;
1709 }
1710