1 // This is an open source non-commercial project. Dear PVS-Studio, please check
2 // it. PVS-Studio Static Code Analyzer for C, C++ and C#: http://www.viva64.com
3
4 /// @file charset.c
5 ///
6 /// Code related to character sets.
7
8 #include <assert.h>
9 #include <inttypes.h>
10 #include <string.h>
11 #include <wctype.h>
12
13 #include "nvim/ascii.h"
14 #include "nvim/charset.h"
15 #include "nvim/cursor.h"
16 #include "nvim/func_attr.h"
17 #include "nvim/garray.h"
18 #include "nvim/indent.h"
19 #include "nvim/main.h"
20 #include "nvim/mark.h"
21 #include "nvim/mbyte.h"
22 #include "nvim/memline.h"
23 #include "nvim/memory.h"
24 #include "nvim/misc1.h"
25 #include "nvim/move.h"
26 #include "nvim/option.h"
27 #include "nvim/os_unix.h"
28 #include "nvim/path.h"
29 #include "nvim/plines.h"
30 #include "nvim/state.h"
31 #include "nvim/strings.h"
32 #include "nvim/vim.h"
33
34 #ifdef INCLUDE_GENERATED_DECLARATIONS
35 # include "charset.c.generated.h"
36 #endif
37
38
/// Set to true at the end of a successful buf_init_chartab(); transchar_buf()
/// avoids relying on g_chartab[] for printable ASCII while this is false.
static bool chartab_initialized = false;

// b_chartab[] is an array with 256 bits, each bit representing one of the
// characters 0-255: the word index is "c >> 6", the bit index is "c & 0x3f".
// Every expansion is fully parenthesized so that the macros can be used as
// operands of a larger expression without precedence surprises.
#define SET_CHARTAB(buf, c) \
  ((buf)->b_chartab[(unsigned)(c) >> 6] |= (1ull << ((c) & 0x3f)))
#define RESET_CHARTAB(buf, c) \
  ((buf)->b_chartab[(unsigned)(c) >> 6] &= ~(1ull << ((c) & 0x3f)))
#define GET_CHARTAB_TAB(chartab, c) \
  ((chartab)[(unsigned)(c) >> 6] & (1ull << ((c) & 0x3f)))
49
50 // Table used below, see init_chartab() for an explanation
51 static char_u g_chartab[256];
52
53 // Flags for g_chartab[].
54 #define CT_CELL_MASK 0x07 ///< mask: nr of display cells (1, 2 or 4)
55 #define CT_PRINT_CHAR 0x10 ///< flag: set for printable chars
56 #define CT_ID_CHAR 0x20 ///< flag: set for ID chars
57 #define CT_FNAME_CHAR 0x40 ///< flag: set for file name chars
58
59 /// Fill g_chartab[]. Also fills curbuf->b_chartab[] with flags for keyword
60 /// characters for current buffer.
61 ///
62 /// Depends on the option settings 'iskeyword', 'isident', 'isfname',
63 /// 'isprint' and 'encoding'.
64 ///
65 /// The index in g_chartab[] is the character when first byte is up to 0x80,
66 /// if the first byte is 0x80 and above it depends on further bytes.
67 ///
68 /// The contents of g_chartab[]:
69 /// - The lower two bits, masked by CT_CELL_MASK, give the number of display
70 /// cells the character occupies (1 or 2). Not valid for UTF-8 above 0x80.
71 /// - CT_PRINT_CHAR bit is set when the character is printable (no need to
72 /// translate the character before displaying it). Note that only DBCS
73 /// characters can have 2 display cells and still be printable.
74 /// - CT_FNAME_CHAR bit is set when the character can be in a file name.
75 /// - CT_ID_CHAR bit is set when the character can be in an identifier.
76 ///
77 /// @return FAIL if 'iskeyword', 'isident', 'isfname' or 'isprint' option has
78 /// an error, OK otherwise.
init_chartab(void)79 int init_chartab(void)
80 {
81 return buf_init_chartab(curbuf, true);
82 }
83
84 /// Helper for init_chartab
85 ///
86 /// @param global false: only set buf->b_chartab[]
87 ///
88 /// @return FAIL if 'iskeyword', 'isident', 'isfname' or 'isprint' option has
89 /// an error, OK otherwise.
buf_init_chartab(buf_T * buf,int global)90 int buf_init_chartab(buf_T *buf, int global)
91 {
92 int c;
93 int c2;
94 int i;
95 bool tilde;
96 bool do_isalpha;
97
98 if (global) {
99 // Set the default size for printable characters:
100 // From <Space> to '~' is 1 (printable), others are 2 (not printable).
101 // This also inits all 'isident' and 'isfname' flags to false.
102 c = 0;
103
104 while (c < ' ') {
105 g_chartab[c++] = (dy_flags & DY_UHEX) ? 4 : 2;
106 }
107
108 while (c <= '~') {
109 g_chartab[c++] = 1 + CT_PRINT_CHAR;
110 }
111
112 while (c < 256) {
113 if (c >= 0xa0) {
114 // UTF-8: bytes 0xa0 - 0xff are printable (latin1)
115 g_chartab[c++] = CT_PRINT_CHAR + 1;
116 } else {
117 // the rest is unprintable by default
118 g_chartab[c++] = (dy_flags & DY_UHEX) ? 4 : 2;
119 }
120 }
121
122 // Assume that every multi-byte char is a filename character.
123 for (c = 1; c < 256; c++) {
124 if (c >= 0xa0) {
125 g_chartab[c] |= CT_FNAME_CHAR;
126 }
127 }
128 }
129
130 // Init word char flags all to false
131 memset(buf->b_chartab, 0, (size_t)32);
132
133 // In lisp mode the '-' character is included in keywords.
134 if (buf->b_p_lisp) {
135 SET_CHARTAB(buf, '-');
136 }
137
138 // Walk through the 'isident', 'iskeyword', 'isfname' and 'isprint'
139 // options Each option is a list of characters, character numbers or
140 // ranges, separated by commas, e.g.: "200-210,x,#-178,-"
141 for (i = global ? 0 : 3; i <= 3; i++) {
142 const char_u *p;
143 if (i == 0) {
144 // first round: 'isident'
145 p = p_isi;
146 } else if (i == 1) {
147 // second round: 'isprint'
148 p = p_isp;
149 } else if (i == 2) {
150 // third round: 'isfname'
151 p = p_isf;
152 } else { // i == 3
153 // fourth round: 'iskeyword'
154 p = buf->b_p_isk;
155 }
156
157 while (*p) {
158 tilde = false;
159 do_isalpha = false;
160
161 if ((*p == '^') && (p[1] != NUL)) {
162 tilde = true;
163 ++p;
164 }
165
166 if (ascii_isdigit(*p)) {
167 c = getdigits_int((char_u **)&p, true, 0);
168 } else {
169 c = mb_ptr2char_adv(&p);
170 }
171 c2 = -1;
172
173 if ((*p == '-') && (p[1] != NUL)) {
174 ++p;
175
176 if (ascii_isdigit(*p)) {
177 c2 = getdigits_int((char_u **)&p, true, 0);
178 } else {
179 c2 = mb_ptr2char_adv(&p);
180 }
181 }
182
183 if ((c <= 0)
184 || (c >= 256)
185 || ((c2 < c) && (c2 != -1))
186 || (c2 >= 256)
187 || !((*p == NUL) || (*p == ','))) {
188 return FAIL;
189 }
190
191 if (c2 == -1) { // not a range
192 // A single '@' (not "@-@"):
193 // Decide on letters being ID/printable/keyword chars with
194 // standard function isalpha(). This takes care of locale for
195 // single-byte characters).
196 if (c == '@') {
197 do_isalpha = true;
198 c = 1;
199 c2 = 255;
200 } else {
201 c2 = c;
202 }
203 }
204
205 while (c <= c2) {
206 // Use the MB_ functions here, because isalpha() doesn't
207 // work properly when 'encoding' is "latin1" and the locale is
208 // "C".
209 if (!do_isalpha
210 || mb_islower(c)
211 || mb_isupper(c)) {
212 if (i == 0) {
213 // (re)set ID flag
214 if (tilde) {
215 g_chartab[c] &= (uint8_t) ~CT_ID_CHAR;
216 } else {
217 g_chartab[c] |= CT_ID_CHAR;
218 }
219 } else if (i == 1) {
220 // (re)set printable
221 // For double-byte we keep the cell width, so
222 // that we can detect it from the first byte.
223 if (((c < ' ') || (c > '~'))) {
224 if (tilde) {
225 g_chartab[c] = (uint8_t)((g_chartab[c] & ~CT_CELL_MASK)
226 + ((dy_flags & DY_UHEX) ? 4 : 2));
227 g_chartab[c] &= (uint8_t) ~CT_PRINT_CHAR;
228 } else {
229 g_chartab[c] = (uint8_t)((g_chartab[c] & ~CT_CELL_MASK) + 1);
230 g_chartab[c] |= CT_PRINT_CHAR;
231 }
232 }
233 } else if (i == 2) {
234 // (re)set fname flag
235 if (tilde) {
236 g_chartab[c] &= (uint8_t) ~CT_FNAME_CHAR;
237 } else {
238 g_chartab[c] |= CT_FNAME_CHAR;
239 }
240 } else { // i == 3
241 // (re)set keyword flag
242 if (tilde) {
243 RESET_CHARTAB(buf, c);
244 } else {
245 SET_CHARTAB(buf, c);
246 }
247 }
248 }
249 ++c;
250 }
251
252 c = *p;
253 p = skip_to_option_part(p);
254
255 if ((c == ',') && (*p == NUL)) {
256 // Trailing comma is not allowed.
257 return FAIL;
258 }
259 }
260 }
261 chartab_initialized = true;
262 return OK;
263 }
264
265 /// Translate any special characters in buf[bufsize] in-place.
266 ///
267 /// The result is a string with only printable characters, but if there is not
268 /// enough room, not all characters will be translated.
269 ///
270 /// @param buf
271 /// @param bufsize
trans_characters(char_u * buf,int bufsize)272 void trans_characters(char_u *buf, int bufsize)
273 {
274 int len; // length of string needing translation
275 int room; // room in buffer after string
276 char_u *trs; // translated character
277 int trs_len; // length of trs[]
278
279 len = (int)STRLEN(buf);
280 room = bufsize - len;
281
282 while (*buf != 0) {
283 // Assume a multi-byte character doesn't need translation.
284 if ((trs_len = utfc_ptr2len(buf)) > 1) {
285 len -= trs_len;
286 } else {
287 trs = transchar_byte(*buf);
288 trs_len = (int)STRLEN(trs);
289
290 if (trs_len > 1) {
291 room -= trs_len - 1;
292 if (room <= 0) {
293 return;
294 }
295 memmove(buf + trs_len, buf + 1, (size_t)len);
296 }
297 memmove(buf, trs, (size_t)trs_len);
298 --len;
299 }
300 buf += trs_len;
301 }
302 }
303
304 /// Find length of a string capable of holding s with all specials replaced
305 ///
306 /// Assumes replacing special characters with printable ones just like
307 /// strtrans() does.
308 ///
309 /// @param[in] s String to check.
310 ///
311 /// @return number of bytes needed to hold a translation of `s`, NUL byte not
312 /// included.
transstr_len(const char * const s,bool untab)313 size_t transstr_len(const char *const s, bool untab)
314 FUNC_ATTR_WARN_UNUSED_RESULT FUNC_ATTR_PURE
315 {
316 const char *p = s;
317 size_t len = 0;
318
319 while (*p) {
320 const size_t l = (size_t)utfc_ptr2len((const char_u *)p);
321 if (l > 1) {
322 int pcc[MAX_MCO + 1];
323 pcc[0] = utfc_ptr2char((const char_u *)p, &pcc[1]);
324
325 if (vim_isprintc(pcc[0])) {
326 len += l;
327 } else {
328 for (size_t i = 0; i < ARRAY_SIZE(pcc) && pcc[i]; i++) {
329 char hexbuf[9];
330 len += transchar_hex(hexbuf, pcc[i]);
331 }
332 }
333 p += l;
334 } else if (*p == TAB && !untab) {
335 len += 1;
336 p++;
337 } else {
338 const int b2c_l = byte2cells((uint8_t)(*p++));
339 // Illegal byte sequence may occupy up to 4 characters.
340 len += (size_t)(b2c_l > 0 ? b2c_l : 4);
341 }
342 }
343 return len;
344 }
345
346 /// Replace special characters with printable ones
347 ///
348 /// @param[in] s String to replace characters from.
349 /// @param[out] buf Buffer to which result should be saved.
350 /// @param[in] len Buffer length. Resulting string may not occupy more then
351 /// len - 1 bytes (one for trailing NUL byte).
352 /// @param[in] untab remove tab characters
353 ///
354 /// @return length of the resulting string, without the NUL byte.
transstr_buf(const char * const s,char * const buf,const size_t len,bool untab)355 size_t transstr_buf(const char *const s, char *const buf, const size_t len, bool untab)
356 FUNC_ATTR_NONNULL_ALL
357 {
358 const char *p = s;
359 char *buf_p = buf;
360 char *const buf_e = buf_p + len - 1;
361
362 while (*p != NUL && buf_p < buf_e) {
363 const size_t l = (size_t)utfc_ptr2len((const char_u *)p);
364 if (l > 1) {
365 if (buf_p + l > buf_e) {
366 break; // Exceeded `buf` size.
367 }
368 int pcc[MAX_MCO + 1];
369 pcc[0] = utfc_ptr2char((const char_u *)p, &pcc[1]);
370
371 if (vim_isprintc(pcc[0])) {
372 memmove(buf_p, p, l);
373 buf_p += l;
374 } else {
375 for (size_t i = 0; i < ARRAY_SIZE(pcc) && pcc[i]; i++) {
376 char hexbuf[9]; // <up to 6 bytes>NUL
377 const size_t hexlen = transchar_hex(hexbuf, pcc[i]);
378 if (buf_p + hexlen > buf_e) {
379 break;
380 }
381 memmove(buf_p, hexbuf, hexlen);
382 buf_p += hexlen;
383 }
384 }
385 p += l;
386 } else if (*p == TAB && !untab) {
387 *buf_p++ = *p++;
388 } else {
389 const char *const tb = (const char *)transchar_byte((uint8_t)(*p++));
390 const size_t tb_len = strlen(tb);
391 if (buf_p + tb_len > buf_e) {
392 break; // Exceeded `buf` size.
393 }
394 memmove(buf_p, tb, tb_len);
395 buf_p += tb_len;
396 }
397 }
398 *buf_p = NUL;
399 assert(buf_p <= buf_e);
400 return (size_t)(buf_p - buf);
401 }
402
403 /// Copy string and replace special characters with printable characters
404 ///
405 /// Works like `strtrans()` does, used for that and in some other places.
406 ///
407 /// @param[in] s String to replace characters from.
408 ///
409 /// @return [allocated] translated string
transstr(const char * const s,bool untab)410 char *transstr(const char *const s, bool untab)
411 FUNC_ATTR_NONNULL_RET
412 {
413 // Compute the length of the result, taking account of unprintable
414 // multi-byte characters.
415 const size_t len = transstr_len(s, untab) + 1;
416 char *const buf = xmalloc(len);
417 transstr_buf(s, buf, len, untab);
418 return buf;
419 }
420
421 /// Convert the string "str[orglen]" to do ignore-case comparing.
422 /// Use the current locale.
423 ///
424 /// When "buf" is NULL, return an allocated string.
425 /// Otherwise, put the result in buf, limited by buflen, and return buf.
str_foldcase(char_u * str,int orglen,char_u * buf,int buflen)426 char_u *str_foldcase(char_u *str, int orglen, char_u *buf, int buflen)
427 FUNC_ATTR_NONNULL_RET
428 {
429 garray_T ga;
430 int i;
431 int len = orglen;
432
433 #define GA_CHAR(i) ((char_u *)ga.ga_data)[i]
434 #define GA_PTR(i) ((char_u *)ga.ga_data + i)
435 #define STR_CHAR(i) (buf == NULL ? GA_CHAR(i) : buf[i])
436 #define STR_PTR(i) (buf == NULL ? GA_PTR(i) : buf + i)
437
438 // Copy "str" into "buf" or allocated memory, unmodified.
439 if (buf == NULL) {
440 ga_init(&ga, 1, 10);
441
442 ga_grow(&ga, len + 1);
443 memmove(ga.ga_data, str, (size_t)len);
444 ga.ga_len = len;
445 } else {
446 if (len >= buflen) {
447 // Ugly!
448 len = buflen - 1;
449 }
450 memmove(buf, str, (size_t)len);
451 }
452
453 if (buf == NULL) {
454 GA_CHAR(len) = NUL;
455 } else {
456 buf[len] = NUL;
457 }
458
459 // Make each character lower case.
460 i = 0;
461 while (STR_CHAR(i) != NUL) {
462 int c = utf_ptr2char(STR_PTR(i));
463 int olen = utf_ptr2len(STR_PTR(i));
464 int lc = mb_tolower(c);
465
466 // Only replace the character when it is not an invalid
467 // sequence (ASCII character or more than one byte) and
468 // mb_tolower() doesn't return the original character.
469 if (((c < 0x80) || (olen > 1)) && (c != lc)) {
470 int nlen = utf_char2len(lc);
471
472 // If the byte length changes need to shift the following
473 // characters forward or backward.
474 if (olen != nlen) {
475 if (nlen > olen) {
476 if (buf == NULL) {
477 ga_grow(&ga, nlen - olen + 1);
478 } else {
479 if (len + nlen - olen >= buflen) {
480 // out of memory, keep old char
481 lc = c;
482 nlen = olen;
483 }
484 }
485 }
486
487 if (olen != nlen) {
488 if (buf == NULL) {
489 STRMOVE(GA_PTR(i) + nlen, GA_PTR(i) + olen);
490 ga.ga_len += nlen - olen;
491 } else {
492 STRMOVE(buf + i + nlen, buf + i + olen);
493 len += nlen - olen;
494 }
495 }
496 }
497 (void)utf_char2bytes(lc, STR_PTR(i));
498 }
499
500 // skip to next multi-byte char
501 i += utfc_ptr2len(STR_PTR(i));
502 }
503
504
505 if (buf == NULL) {
506 return (char_u *)ga.ga_data;
507 }
508 return buf;
509 }
510
511 // Catch 22: g_chartab[] can't be initialized before the options are
512 // initialized, and initializing options may cause transchar() to be called!
513 // When chartab_initialized == false don't use g_chartab[].
514 // Does NOT work for multi-byte characters, c must be <= 255.
515 // Also doesn't work for the first byte of a multi-byte, "c" must be a
516 // character!
517 static char_u transchar_charbuf[11];
518
519 /// Translate a character into a printable one, leaving printable ASCII intact
520 ///
521 /// All unicode characters are considered non-printable in this function.
522 ///
523 /// @param[in] c Character to translate.
524 ///
525 /// @return translated character into a static buffer.
transchar(int c)526 char_u *transchar(int c)
527 {
528 return transchar_buf(curbuf, c);
529 }
530
transchar_buf(const buf_T * buf,int c)531 char_u *transchar_buf(const buf_T *buf, int c)
532 FUNC_ATTR_NONNULL_ALL
533 {
534 int i = 0;
535 if (IS_SPECIAL(c)) {
536 // special key code, display as ~@ char
537 transchar_charbuf[0] = '~';
538 transchar_charbuf[1] = '@';
539 i = 2;
540 c = K_SECOND(c);
541 }
542
543 if ((!chartab_initialized && (((c >= ' ') && (c <= '~'))))
544 || ((c <= 0xFF) && vim_isprintc_strict(c))) {
545 // printable character
546 transchar_charbuf[i] = (char_u)c;
547 transchar_charbuf[i + 1] = NUL;
548 } else if (c <= 0xFF) {
549 transchar_nonprint(buf, transchar_charbuf + i, c);
550 } else {
551 transchar_hex((char *)transchar_charbuf + i, c);
552 }
553 return transchar_charbuf;
554 }
555
556 /// Like transchar(), but called with a byte instead of a character
557 ///
558 /// Checks for an illegal UTF-8 byte.
559 ///
560 /// @param[in] c Byte to translate.
561 ///
562 /// @return pointer to translated character in transchar_charbuf.
transchar_byte(const int c)563 char_u *transchar_byte(const int c)
564 FUNC_ATTR_WARN_UNUSED_RESULT
565 {
566 if (c >= 0x80) {
567 transchar_nonprint(curbuf, transchar_charbuf, c);
568 return transchar_charbuf;
569 }
570 return transchar(c);
571 }
572
573 /// Convert non-printable characters to 2..4 printable ones
574 ///
575 /// @warning Does not work for multi-byte characters, c must be <= 255.
576 ///
577 /// @param[in] buf Required to check the file format
578 /// @param[out] charbuf Buffer to store result in, must be able to hold
579 /// at least 5 bytes (conversion result + NUL).
580 /// @param[in] c Character to convert. NUL is assumed to be NL according to
581 /// `:h NL-used-for-NUL`.
transchar_nonprint(const buf_T * buf,char_u * charbuf,int c)582 void transchar_nonprint(const buf_T *buf, char_u *charbuf, int c)
583 FUNC_ATTR_NONNULL_ALL
584 {
585 if (c == NL) {
586 // we use newline in place of a NUL
587 c = NUL;
588 } else if ((c == CAR) && (get_fileformat(buf) == EOL_MAC)) {
589 // we use CR in place of NL in this case
590 c = NL;
591 }
592 assert(c <= 0xff);
593
594 if (dy_flags & DY_UHEX || c > 0x7f) {
595 // 'display' has "uhex"
596 transchar_hex((char *)charbuf, c);
597 } else {
598 // 0x00 - 0x1f and 0x7f
599 charbuf[0] = '^';
600 // DEL displayed as ^?
601 charbuf[1] = (char_u)(c ^ 0x40);
602
603 charbuf[2] = NUL;
604 }
605 }
606
607 /// Convert a non-printable character to hex C string like "<FFFF>"
608 ///
609 /// @param[out] buf Buffer to store result in.
610 /// @param[in] c Character to convert.
611 ///
612 /// @return Number of bytes stored in buffer, excluding trailing NUL byte.
transchar_hex(char * const buf,const int c)613 size_t transchar_hex(char *const buf, const int c)
614 FUNC_ATTR_NONNULL_ALL
615 {
616 size_t i = 0;
617
618 buf[i++] = '<';
619 if (c > 255) {
620 if (c > 255 * 256) {
621 buf[i++] = (char)nr2hex((unsigned)c >> 20);
622 buf[i++] = (char)nr2hex((unsigned)c >> 16);
623 }
624 buf[i++] = (char)nr2hex((unsigned)c >> 12);
625 buf[i++] = (char)nr2hex((unsigned)c >> 8);
626 }
627 buf[i++] = (char)(nr2hex((unsigned)c >> 4));
628 buf[i++] = (char)(nr2hex((unsigned)c));
629 buf[i++] = '>';
630 buf[i] = NUL;
631 return i;
632 }
633
634 /// Convert the lower 4 bits of byte "c" to its hex character
635 ///
636 /// Lower case letters are used to avoid the confusion of <F1> being 0xf1 or
637 /// function key 1.
638 ///
639 /// @param[in] n Number to convert.
640 ///
641 /// @return the hex character.
nr2hex(unsigned n)642 static inline unsigned nr2hex(unsigned n)
643 FUNC_ATTR_CONST FUNC_ATTR_WARN_UNUSED_RESULT
644 {
645 if ((n & 0xf) <= 9) {
646 return (n & 0xf) + '0';
647 }
648 return (n & 0xf) - 10 + 'a';
649 }
650
651 /// Return number of display cells occupied by byte "b".
652 ///
653 /// Caller must make sure 0 <= b <= 255.
654 /// For multi-byte mode "b" must be the first byte of a character.
655 /// A TAB is counted as two cells: "^I".
656 /// This will return 0 for bytes >= 0x80, because the number of
657 /// cells depends on further bytes in UTF-8.
658 ///
659 /// @param b
660 ///
661 /// @reeturn Number of display cells.
byte2cells(int b)662 int byte2cells(int b)
663 {
664 if (b >= 0x80) {
665 return 0;
666 }
667 return g_chartab[b] & CT_CELL_MASK;
668 }
669
670 /// Return number of display cells occupied by character "c".
671 ///
672 /// "c" can be a special key (negative number) in which case 3 or 4 is returned.
673 /// A TAB is counted as two cells: "^I" or four: "<09>".
674 ///
675 /// @param c
676 ///
677 /// @return Number of display cells.
char2cells(int c)678 int char2cells(int c)
679 {
680 if (IS_SPECIAL(c)) {
681 return char2cells(K_SECOND(c)) + 2;
682 }
683
684 if (c >= 0x80) {
685 // UTF-8: above 0x80 need to check the value
686 return utf_char2cells(c);
687 }
688 return g_chartab[c & 0xff] & CT_CELL_MASK;
689 }
690
691 /// Return number of display cells occupied by character at "*p".
692 /// A TAB is counted as two cells: "^I" or four: "<09>".
693 ///
694 /// @param p
695 ///
696 /// @return number of display cells.
ptr2cells(const char_u * p)697 int ptr2cells(const char_u *p)
698 {
699 // For UTF-8 we need to look at more bytes if the first byte is >= 0x80.
700 if (*p >= 0x80) {
701 return utf_ptr2cells(p);
702 }
703
704 // For DBCS we can tell the cell count from the first byte.
705 return g_chartab[*p] & CT_CELL_MASK;
706 }
707
708 /// Return the number of character cells string "s" will take on the screen,
709 /// counting TABs as two characters: "^I".
710 ///
711 /// 's' must be non-null.
712 ///
713 /// @param s
714 ///
715 /// @return number of character cells.
vim_strsize(char_u * s)716 int vim_strsize(char_u *s)
717 {
718 return vim_strnsize(s, MAXCOL);
719 }
720
721 /// Return the number of character cells string "s[len]" will take on the
722 /// screen, counting TABs as two characters: "^I".
723 ///
724 /// 's' must be non-null.
725 ///
726 /// @param s
727 /// @param len
728 ///
729 /// @return Number of character cells.
vim_strnsize(char_u * s,int len)730 int vim_strnsize(char_u *s, int len)
731 {
732 assert(s != NULL);
733 int size = 0;
734 while (*s != NUL && --len >= 0) {
735 int l = utfc_ptr2len(s);
736 size += ptr2cells(s);
737 s += l;
738 len -= l - 1;
739 }
740 return size;
741 }
742
743 /// Check that "c" is a normal identifier character:
744 /// Letters and characters from the 'isident' option.
745 ///
746 /// @param c character to check
vim_isIDc(int c)747 bool vim_isIDc(int c)
748 FUNC_ATTR_PURE FUNC_ATTR_WARN_UNUSED_RESULT
749 {
750 return c > 0 && c < 0x100 && (g_chartab[c] & CT_ID_CHAR);
751 }
752
753 /// Check that "c" is a keyword character:
754 /// Letters and characters from 'iskeyword' option for the current buffer.
755 /// For multi-byte characters mb_get_class() is used (builtin rules).
756 ///
757 /// @param c character to check
vim_iswordc(const int c)758 bool vim_iswordc(const int c)
759 FUNC_ATTR_PURE FUNC_ATTR_WARN_UNUSED_RESULT
760 {
761 return vim_iswordc_buf(c, curbuf);
762 }
763
764 /// Check that "c" is a keyword character
765 /// Letters and characters from 'iskeyword' option for given buffer.
766 /// For multi-byte characters mb_get_class() is used (builtin rules).
767 ///
768 /// @param[in] c Character to check.
769 /// @param[in] chartab Buffer chartab.
vim_iswordc_tab(const int c,const uint64_t * const chartab)770 bool vim_iswordc_tab(const int c, const uint64_t *const chartab)
771 FUNC_ATTR_PURE FUNC_ATTR_WARN_UNUSED_RESULT FUNC_ATTR_NONNULL_ALL
772 {
773 return (c >= 0x100
774 ? (utf_class_tab(c, chartab) >= 2)
775 : (c > 0 && GET_CHARTAB_TAB(chartab, c) != 0));
776 }
777
778 /// Check that "c" is a keyword character:
779 /// Letters and characters from 'iskeyword' option for given buffer.
780 /// For multi-byte characters mb_get_class() is used (builtin rules).
781 ///
782 /// @param c character to check
783 /// @param buf buffer whose keywords to use
vim_iswordc_buf(const int c,buf_T * const buf)784 bool vim_iswordc_buf(const int c, buf_T *const buf)
785 FUNC_ATTR_PURE FUNC_ATTR_WARN_UNUSED_RESULT FUNC_ATTR_NONNULL_ARG(2)
786 {
787 return vim_iswordc_tab(c, buf->b_chartab);
788 }
789
790 /// Just like vim_iswordc() but uses a pointer to the (multi-byte) character.
791 ///
792 /// @param p pointer to the multi-byte character
793 ///
794 /// @return true if "p" points to a keyword character.
vim_iswordp(const char_u * const p)795 bool vim_iswordp(const char_u *const p)
796 FUNC_ATTR_PURE FUNC_ATTR_WARN_UNUSED_RESULT FUNC_ATTR_NONNULL_ALL
797 {
798 return vim_iswordp_buf(p, curbuf);
799 }
800
801 /// Just like vim_iswordc_buf() but uses a pointer to the (multi-byte)
802 /// character.
803 ///
804 /// @param p pointer to the multi-byte character
805 /// @param buf buffer whose keywords to use
806 ///
807 /// @return true if "p" points to a keyword character.
vim_iswordp_buf(const char_u * const p,buf_T * const buf)808 bool vim_iswordp_buf(const char_u *const p, buf_T *const buf)
809 FUNC_ATTR_PURE FUNC_ATTR_WARN_UNUSED_RESULT FUNC_ATTR_NONNULL_ALL
810 {
811 int c = *p;
812
813 if (MB_BYTE2LEN(c) > 1) {
814 c = utf_ptr2char(p);
815 }
816 return vim_iswordc_buf(c, buf);
817 }
818
819 /// Check that "c" is a valid file-name character.
820 /// Assume characters above 0x100 are valid (multi-byte).
821 ///
822 /// @param c character to check
vim_isfilec(int c)823 bool vim_isfilec(int c)
824 FUNC_ATTR_PURE FUNC_ATTR_WARN_UNUSED_RESULT
825 {
826 return c >= 0x100 || (c > 0 && (g_chartab[c] & CT_FNAME_CHAR));
827 }
828
829 /// Check that "c" is a valid file-name character or a wildcard character
830 /// Assume characters above 0x100 are valid (multi-byte).
831 /// Explicitly interpret ']' as a wildcard character as path_has_wildcard("]")
832 /// returns false.
833 ///
834 /// @param c character to check
vim_isfilec_or_wc(int c)835 bool vim_isfilec_or_wc(int c)
836 FUNC_ATTR_PURE FUNC_ATTR_WARN_UNUSED_RESULT
837 {
838 char_u buf[2];
839 buf[0] = (char_u)c;
840 buf[1] = NUL;
841 return vim_isfilec(c) || c == ']' || path_has_wildcard(buf);
842 }
843
844 /// Check that "c" is a printable character.
845 /// Assume characters above 0x100 are printable for double-byte encodings.
846 ///
847 /// @param c character to check
vim_isprintc(int c)848 bool vim_isprintc(int c)
849 FUNC_ATTR_PURE FUNC_ATTR_WARN_UNUSED_RESULT
850 {
851 if (c >= 0x100) {
852 return utf_printable(c);
853 }
854 return c > 0 && (g_chartab[c] & CT_PRINT_CHAR);
855 }
856
857 /// Strict version of vim_isprintc(c), don't return true if "c" is the head
858 /// byte of a double-byte character.
859 ///
860 /// @param c character to check
861 ///
862 /// @return true if "c" is a printable character.
vim_isprintc_strict(int c)863 bool vim_isprintc_strict(int c)
864 FUNC_ATTR_PURE FUNC_ATTR_WARN_UNUSED_RESULT
865 {
866 if (c >= 0x100) {
867 return utf_printable(c);
868 }
869 return c > 0 && (g_chartab[c] & CT_PRINT_CHAR);
870 }
871
872 /// Check that virtual column "vcol" is in the rightmost column of window "wp".
873 ///
874 /// @param wp window
875 /// @param vcol column number
in_win_border(win_T * wp,colnr_T vcol)876 bool in_win_border(win_T *wp, colnr_T vcol)
877 FUNC_ATTR_PURE FUNC_ATTR_WARN_UNUSED_RESULT FUNC_ATTR_NONNULL_ARG(1)
878 {
879 int width1; // width of first line (after line number)
880 int width2; // width of further lines
881
882 if (wp->w_width_inner == 0) {
883 // there is no border
884 return false;
885 }
886 width1 = wp->w_width_inner - win_col_off(wp);
887
888 if ((int)vcol < width1 - 1) {
889 return false;
890 }
891
892 if ((int)vcol == width1 - 1) {
893 return true;
894 }
895 width2 = width1 + win_col_off2(wp);
896
897 if (width2 <= 0) {
898 return false;
899 }
900 return (vcol - width1) % width2 == width2 - 1;
901 }
902
903 /// Get virtual column number of pos.
904 /// start: on the first position of this character (TAB, ctrl)
905 /// cursor: where the cursor is on this character (first char, except for TAB)
906 /// end: on the last position of this character (TAB, ctrl)
907 ///
908 /// This is used very often, keep it fast!
909 ///
910 /// @param wp
911 /// @param pos
912 /// @param start
913 /// @param cursor
914 /// @param end
getvcol(win_T * wp,pos_T * pos,colnr_T * start,colnr_T * cursor,colnr_T * end)915 void getvcol(win_T *wp, pos_T *pos, colnr_T *start, colnr_T *cursor, colnr_T *end)
916 {
917 colnr_T vcol;
918 char_u *ptr; // points to current char
919 char_u *posptr; // points to char at pos->col
920 char_u *line; // start of the line
921 int incr;
922 int head;
923 long *vts = wp->w_buffer->b_p_vts_array;
924 int ts = (int)wp->w_buffer->b_p_ts;
925 int c;
926
927 vcol = 0;
928 line = ptr = ml_get_buf(wp->w_buffer, pos->lnum, false);
929
930 if (pos->col == MAXCOL) {
931 // continue until the NUL
932 posptr = NULL;
933 } else {
934 // Special check for an empty line, which can happen on exit, when
935 // ml_get_buf() always returns an empty string.
936 if (*ptr == NUL) {
937 pos->col = 0;
938 }
939 posptr = ptr + pos->col;
940 posptr -= utf_head_off(line, posptr);
941 }
942
943 // This function is used very often, do some speed optimizations.
944 // When 'list', 'linebreak', 'showbreak' and 'breakindent' are not set
945 // use a simple loop.
946 // Also use this when 'list' is set but tabs take their normal size.
947 if ((!wp->w_p_list || (wp->w_p_lcs_chars.tab1 != NUL))
948 && !wp->w_p_lbr
949 && *get_showbreak_value(wp) == NUL
950 && !wp->w_p_bri) {
951 for (;;) {
952 head = 0;
953 c = *ptr;
954
955 // make sure we don't go past the end of the line
956 if (c == NUL) {
957 // NUL at end of line only takes one column
958 incr = 1;
959 break;
960 }
961
962 // A tab gets expanded, depending on the current column
963 if (c == TAB) {
964 incr = tabstop_padding(vcol, ts, vts);
965 } else {
966 // For utf-8, if the byte is >= 0x80, need to look at
967 // further bytes to find the cell width.
968 if (c >= 0x80) {
969 incr = utf_ptr2cells(ptr);
970 } else {
971 incr = g_chartab[c] & CT_CELL_MASK;
972 }
973
974 // If a double-cell char doesn't fit at the end of a line
975 // it wraps to the next line, it's like this char is three
976 // cells wide.
977 if ((incr == 2)
978 && wp->w_p_wrap
979 && (MB_BYTE2LEN(*ptr) > 1)
980 && in_win_border(wp, vcol)) {
981 incr++;
982 head = 1;
983 }
984 }
985
986 if ((posptr != NULL) && (ptr >= posptr)) {
987 // character at pos->col
988 break;
989 }
990
991 vcol += incr;
992 MB_PTR_ADV(ptr);
993 }
994 } else {
995 for (;;) {
996 // A tab gets expanded, depending on the current column
997 head = 0;
998 incr = win_lbr_chartabsize(wp, line, ptr, vcol, &head);
999
1000 // make sure we don't go past the end of the line
1001 if (*ptr == NUL) {
1002 // NUL at end of line only takes one column
1003 incr = 1;
1004 break;
1005 }
1006
1007 if ((posptr != NULL) && (ptr >= posptr)) {
1008 // character at pos->col
1009 break;
1010 }
1011
1012 vcol += incr;
1013 MB_PTR_ADV(ptr);
1014 }
1015 }
1016
1017 if (start != NULL) {
1018 *start = vcol + head;
1019 }
1020
1021 if (end != NULL) {
1022 *end = vcol + incr - 1;
1023 }
1024
1025 if (cursor != NULL) {
1026 if ((*ptr == TAB)
1027 && (State & NORMAL)
1028 && !wp->w_p_list
1029 && !virtual_active()
1030 && !(VIsual_active && ((*p_sel == 'e') || ltoreq(*pos, VIsual)))) {
1031 // cursor at end
1032 *cursor = vcol + incr - 1;
1033 } else {
1034 // cursor at start
1035 *cursor = vcol + head;
1036 }
1037 }
1038 }
1039
1040 /// Get virtual cursor column in the current window, pretending 'list' is off.
1041 ///
1042 /// @param posp
1043 ///
1044 /// @retujrn The virtual cursor column.
getvcol_nolist(pos_T * posp)1045 colnr_T getvcol_nolist(pos_T *posp)
1046 {
1047 int list_save = curwin->w_p_list;
1048 colnr_T vcol;
1049
1050 curwin->w_p_list = false;
1051 if (posp->coladd) {
1052 getvvcol(curwin, posp, NULL, &vcol, NULL);
1053 } else {
1054 getvcol(curwin, posp, NULL, &vcol, NULL);
1055 }
1056 curwin->w_p_list = list_save;
1057 return vcol;
1058 }
1059
1060 /// Get virtual column in virtual mode.
1061 ///
1062 /// @param wp
1063 /// @param pos
1064 /// @param start
1065 /// @param cursor
1066 /// @param end
getvvcol(win_T * wp,pos_T * pos,colnr_T * start,colnr_T * cursor,colnr_T * end)1067 void getvvcol(win_T *wp, pos_T *pos, colnr_T *start, colnr_T *cursor, colnr_T *end)
1068 {
1069 colnr_T col;
1070 colnr_T coladd;
1071 colnr_T endadd;
1072 char_u *ptr;
1073
1074 if (virtual_active()) {
1075 // For virtual mode, only want one value
1076 getvcol(wp, pos, &col, NULL, NULL);
1077
1078 coladd = pos->coladd;
1079 endadd = 0;
1080
1081 // Cannot put the cursor on part of a wide character.
1082 ptr = ml_get_buf(wp->w_buffer, pos->lnum, false);
1083
1084 if (pos->col < (colnr_T)STRLEN(ptr)) {
1085 int c = utf_ptr2char(ptr + pos->col);
1086 if ((c != TAB) && vim_isprintc(c)) {
1087 endadd = (colnr_T)(char2cells(c) - 1);
1088 if (coladd > endadd) {
1089 // past end of line
1090 endadd = 0;
1091 } else {
1092 coladd = 0;
1093 }
1094 }
1095 }
1096 col += coladd;
1097
1098 if (start != NULL) {
1099 *start = col;
1100 }
1101
1102 if (cursor != NULL) {
1103 *cursor = col;
1104 }
1105
1106 if (end != NULL) {
1107 *end = col + endadd;
1108 }
1109 } else {
1110 getvcol(wp, pos, start, cursor, end);
1111 }
1112 }
1113
1114 /// Get the leftmost and rightmost virtual column of pos1 and pos2.
1115 /// Used for Visual block mode.
1116 ///
1117 /// @param wp
1118 /// @param pos1
1119 /// @param pos2
1120 /// @param left
1121 /// @param right
getvcols(win_T * wp,pos_T * pos1,pos_T * pos2,colnr_T * left,colnr_T * right)1122 void getvcols(win_T *wp, pos_T *pos1, pos_T *pos2, colnr_T *left, colnr_T *right)
1123 {
1124 colnr_T from1;
1125 colnr_T from2;
1126 colnr_T to1;
1127 colnr_T to2;
1128
1129 if (lt(*pos1, *pos2)) {
1130 getvvcol(wp, pos1, &from1, NULL, &to1);
1131 getvvcol(wp, pos2, &from2, NULL, &to2);
1132 } else {
1133 getvvcol(wp, pos2, &from1, NULL, &to1);
1134 getvvcol(wp, pos1, &from2, NULL, &to2);
1135 }
1136
1137 if (from2 < from1) {
1138 *left = from2;
1139 } else {
1140 *left = from1;
1141 }
1142
1143 if (to2 > to1) {
1144 if ((*p_sel == 'e') && (from2 - 1 >= to1)) {
1145 *right = from2 - 1;
1146 } else {
1147 *right = to2;
1148 }
1149 } else {
1150 *right = to1;
1151 }
1152 }
1153
1154 /// skipwhite: skip over ' ' and '\t'.
1155 ///
1156 /// @param[in] p String to skip in.
1157 ///
1158 /// @return Pointer to character after the skipped whitespace.
skipwhite(const char_u * const p)1159 char_u *skipwhite(const char_u *const p)
1160 FUNC_ATTR_PURE FUNC_ATTR_WARN_UNUSED_RESULT FUNC_ATTR_NONNULL_ALL
1161 FUNC_ATTR_NONNULL_RET
1162 {
1163 return skipwhite_len(p, STRLEN(p));
1164 }
1165
1166 /// Like `skipwhite`, but skip up to `len` characters.
1167 /// @see skipwhite
1168 ///
1169 /// @param[in] p String to skip in.
1170 /// @param[in] len Max length to skip.
1171 ///
1172 /// @return Pointer to character after the skipped whitespace, or the `len`-th
1173 /// character in the string.
skipwhite_len(const char_u * p,size_t len)1174 char_u *skipwhite_len(const char_u *p, size_t len)
1175 FUNC_ATTR_PURE FUNC_ATTR_WARN_UNUSED_RESULT FUNC_ATTR_NONNULL_ALL
1176 FUNC_ATTR_NONNULL_RET
1177 {
1178 for (; len > 0 && ascii_iswhite(*p); len--) {
1179 p++;
1180 }
1181 return (char_u *)p;
1182 }
1183
1184 // getwhitecols: return the number of whitespace
1185 // columns (bytes) at the start of a given line
getwhitecols_curline(void)1186 intptr_t getwhitecols_curline(void)
1187 {
1188 return getwhitecols(get_cursor_line_ptr());
1189 }
1190
/// Return the number of whitespace columns (bytes) at the start of a line.
///
/// @param[in]  p  Line to inspect.
///
/// @return Distance in bytes from `p` to the position skipwhite() returns.
intptr_t getwhitecols(const char_u *p)
{
  const char_u *const first_nonwhite = skipwhite(p);
  return first_nonwhite - p;
}
1195
1196 /// Skip over digits
1197 ///
1198 /// @param[in] q String to skip digits in.
1199 ///
1200 /// @return Pointer to the character after the skipped digits.
skipdigits(const char_u * q)1201 char_u *skipdigits(const char_u *q)
1202 FUNC_ATTR_PURE FUNC_ATTR_WARN_UNUSED_RESULT FUNC_ATTR_NONNULL_ALL
1203 FUNC_ATTR_NONNULL_RET
1204 {
1205 const char_u *p = q;
1206 while (ascii_isdigit(*p)) {
1207 // skip to next non-digit
1208 p++;
1209 }
1210 return (char_u *)p;
1211 }
1212
1213 /// skip over binary digits
1214 ///
1215 /// @param q pointer to string
1216 ///
1217 /// @return Pointer to the character after the skipped digits.
skipbin(const char * q)1218 const char *skipbin(const char *q)
1219 FUNC_ATTR_PURE
1220 FUNC_ATTR_NONNULL_ALL
1221 FUNC_ATTR_NONNULL_RET
1222 {
1223 const char *p = q;
1224 while (ascii_isbdigit(*p)) {
1225 // skip to next non-digit
1226 p++;
1227 }
1228 return p;
1229 }
1230
1231 /// skip over digits and hex characters
1232 ///
1233 /// @param q
1234 ///
1235 /// @return Pointer to the character after the skipped digits and hex
1236 /// characters.
skiphex(char_u * q)1237 char_u *skiphex(char_u *q)
1238 {
1239 char_u *p = q;
1240 while (ascii_isxdigit(*p)) {
1241 // skip to next non-digit
1242 p++;
1243 }
1244 return p;
1245 }
1246
1247 /// skip to digit (or NUL after the string)
1248 ///
1249 /// @param q
1250 ///
1251 /// @return Pointer to the digit or (NUL after the string).
skiptodigit(char_u * q)1252 char_u *skiptodigit(char_u *q)
1253 {
1254 char_u *p = q;
1255 while (*p != NUL && !ascii_isdigit(*p)) {
1256 // skip to next digit
1257 p++;
1258 }
1259 return p;
1260 }
1261
1262 /// skip to binary character (or NUL after the string)
1263 ///
1264 /// @param q pointer to string
1265 ///
1266 /// @return Pointer to the binary character or (NUL after the string).
skiptobin(const char * q)1267 const char *skiptobin(const char *q)
1268 FUNC_ATTR_PURE
1269 FUNC_ATTR_NONNULL_ALL
1270 FUNC_ATTR_NONNULL_RET
1271 {
1272 const char *p = q;
1273 while (*p != NUL && !ascii_isbdigit(*p)) {
1274 // skip to next digit
1275 p++;
1276 }
1277 return p;
1278 }
1279
1280 /// skip to hex character (or NUL after the string)
1281 ///
1282 /// @param q
1283 ///
1284 /// @return Pointer to the hex character or (NUL after the string).
skiptohex(char_u * q)1285 char_u *skiptohex(char_u *q)
1286 {
1287 char_u *p = q;
1288 while (*p != NUL && !ascii_isxdigit(*p)) {
1289 // skip to next digit
1290 p++;
1291 }
1292 return p;
1293 }
1294
1295 /// Skip over text until ' ' or '\t' or NUL
1296 ///
1297 /// @param[in] p Text to skip over.
1298 ///
1299 /// @return Pointer to the next whitespace or NUL character.
skiptowhite(const char_u * p)1300 char_u *skiptowhite(const char_u *p)
1301 FUNC_ATTR_NONNULL_ALL
1302 {
1303 while (*p != ' ' && *p != '\t' && *p != NUL) {
1304 p++;
1305 }
1306 return (char_u *)p;
1307 }
1308
1309 /// skiptowhite_esc: Like skiptowhite(), but also skip escaped chars
1310 ///
1311 /// @param p
1312 ///
1313 /// @return Pointer to the next whitespace character.
skiptowhite_esc(char_u * p)1314 char_u *skiptowhite_esc(char_u *p)
1315 {
1316 while (*p != ' ' && *p != '\t' && *p != NUL) {
1317 if (((*p == '\\') || (*p == Ctrl_V)) && (*(p + 1) != NUL)) {
1318 ++p;
1319 }
1320 ++p;
1321 }
1322 return p;
1323 }
1324
1325 /// Skip over text until '\n' or NUL.
1326 ///
1327 /// @param[in] p Text to skip over.
1328 ///
1329 /// @return Pointer to the next '\n' or NUL character.
skip_to_newline(const char_u * const p)1330 char_u *skip_to_newline(const char_u *const p)
1331 FUNC_ATTR_PURE FUNC_ATTR_WARN_UNUSED_RESULT FUNC_ATTR_NONNULL_ALL
1332 FUNC_ATTR_NONNULL_RET
1333 {
1334 return (char_u *)xstrchrnul((const char *)p, NL);
1335 }
1336
1337 /// Gets a number from a string and skips over it, signalling overflow.
1338 ///
1339 /// @param[out] pp A pointer to a pointer to char_u.
1340 /// It will be advanced past the read number.
1341 /// @param[out] nr Number read from the string.
1342 ///
1343 /// @return true on success, false on error/overflow
try_getdigits(char_u ** pp,intmax_t * nr)1344 bool try_getdigits(char_u **pp, intmax_t *nr)
1345 {
1346 errno = 0;
1347 *nr = strtoimax((char *)(*pp), (char **)pp, 10);
1348 if (errno == ERANGE && (*nr == INTMAX_MIN || *nr == INTMAX_MAX)) {
1349 return false;
1350 }
1351 return true;
1352 }
1353
1354 /// Gets a number from a string and skips over it.
1355 ///
1356 /// @param[out] pp Pointer to a pointer to char_u.
1357 /// It will be advanced past the read number.
1358 /// @param strict Abort on overflow.
1359 /// @param def Default value, if parsing fails or overflow occurs.
1360 ///
1361 /// @return Number read from the string, or `def` on parse failure or overflow.
getdigits(char_u ** pp,bool strict,intmax_t def)1362 intmax_t getdigits(char_u **pp, bool strict, intmax_t def)
1363 {
1364 intmax_t number;
1365 int ok = try_getdigits(pp, &number);
1366 if (strict && !ok) {
1367 abort();
1368 }
1369 return ok ? number : def;
1370 }
1371
1372 /// Gets an int number from a string.
1373 ///
1374 /// @see getdigits
getdigits_int(char_u ** pp,bool strict,int def)1375 int getdigits_int(char_u **pp, bool strict, int def)
1376 {
1377 intmax_t number = getdigits(pp, strict, def);
1378 #if SIZEOF_INTMAX_T > SIZEOF_INT
1379 if (strict) {
1380 assert(number >= INT_MIN && number <= INT_MAX);
1381 } else if (!(number >= INT_MIN && number <= INT_MAX)) {
1382 return def;
1383 }
1384 #endif
1385 return (int)number;
1386 }
1387
1388 /// Gets a long number from a string.
1389 ///
1390 /// @see getdigits
getdigits_long(char_u ** pp,bool strict,long def)1391 long getdigits_long(char_u **pp, bool strict, long def)
1392 {
1393 intmax_t number = getdigits(pp, strict, def);
1394 #if SIZEOF_INTMAX_T > SIZEOF_LONG
1395 if (strict) {
1396 assert(number >= LONG_MIN && number <= LONG_MAX);
1397 } else if (!(number >= LONG_MIN && number <= LONG_MAX)) {
1398 return def;
1399 }
1400 #endif
1401 return (long)number;
1402 }
1403
1404 /// Check that "lbuf" is empty or only contains blanks.
1405 ///
1406 /// @param lbuf line buffer to check
vim_isblankline(char_u * lbuf)1407 bool vim_isblankline(char_u *lbuf)
1408 {
1409 char_u *p = skipwhite(lbuf);
1410 return *p == NUL || *p == '\r' || *p == '\n';
1411 }
1412
/// Convert a string into a signed (varnumber_T) and/or unsigned
/// (uvarnumber_T) number, taking care of hexadecimal, octal and binary
/// numbers.  Accepts a leading '-' sign.
/// If "prep" is not NULL, returns a flag to indicate the type of the number:
///   0      decimal
///   '0'    octal
///   'O'    octal
///   'o'    octal
///   'B'    bin
///   'b'    bin
///   'X'    hex
///   'x'    hex
/// If "len" is not NULL, the length of the number in characters is returned.
/// If "nptr" is not NULL, the signed result is returned in it.
/// If "unptr" is not NULL, the unsigned result is returned in it.
/// If "what" contains STR2NR_BIN recognize binary numbers.
/// If "what" contains STR2NR_OCT recognize octal numbers ("0" prefix).
/// If "what" contains STR2NR_OOCT recognize octal numbers ("0o"/"0O" prefix).
/// If "what" contains STR2NR_HEX recognize hex numbers.
/// If "what" contains STR2NR_FORCE always assume bin/oct/hex.
/// If "what" contains STR2NR_QUOTE ignore embedded single quotes.
/// If maxlen > 0, check at a maximum maxlen chars; 0 means no limit.
/// If strict is true, check the number strictly. return *len = 0 if fail.
///
/// On overflow the unsigned accumulator saturates at UVARNUMBER_MAX. The
/// signed result is clamped to VARNUMBER_MAX, or to VARNUMBER_MIN for a
/// negative number.
///
/// @param  start   Text to parse; must not be NULL.
/// @param  prep    Returns guessed type of number 0 = decimal, 'x' or 'X' is
///                 hexadecimal, '0', 'o' or 'O' is octal, 'b' or 'B' is binary.
///                 When using STR2NR_FORCE is always zero.
/// @param  len     Returns the detected length of number.
/// @param  what    Recognizes what number passed, @see ChStr2NrFlags.
/// @param  nptr    Returns the signed result.
/// @param  unptr   Returns the unsigned result.
/// @param  maxlen  Max length of string to check.
/// @param  strict  If true, fail if the number has unexpected trailing
///                 alpha-numeric chars: *len is set to 0 and nothing else is
///                 returned.
void vim_str2nr(const char_u *const start, int *const prep, int *const len, const int what,
                varnumber_T *const nptr, uvarnumber_T *const unptr, const int maxlen,
                const bool strict)
  FUNC_ATTR_NONNULL_ARG(1)
{
  const char *ptr = (const char *)start;
  // True when "ptr" is at or beyond the "maxlen" limit counted from "start".
  // Never true when maxlen is 0, which means "no limit".
#define STRING_ENDED(ptr) \
  (!(maxlen == 0 || (int)((ptr) - (const char *)start) < maxlen))
  int pre = 0;  // default is decimal
  const bool negative = (ptr[0] == '-');
  uvarnumber_T un = 0;

  // Report failure by default; only overwritten once parsing has succeeded.
  if (len != NULL) {
    *len = 0;
  }

  // The sign is consumed before any prefix detection.
  if (negative) {
    ptr++;
  }

  if (what & STR2NR_FORCE) {
    // When forcing main consideration is skipping the prefix. Decimal numbers
    // have no prefixes to skip. pre is not set.
    switch (what & ~(STR2NR_FORCE | STR2NR_QUOTE)) {
    case STR2NR_HEX:
      // The "0x"/"0X" prefix is optional and only skipped when a hex digit
      // follows it.
      if (!STRING_ENDED(ptr + 2)
          && ptr[0] == '0'
          && (ptr[1] == 'x' || ptr[1] == 'X')
          && ascii_isxdigit(ptr[2])) {
        ptr += 2;
      }
      goto vim_str2nr_hex;
    case STR2NR_BIN:
      // The "0b"/"0B" prefix is optional and only skipped when a binary digit
      // follows it.
      if (!STRING_ENDED(ptr + 2)
          && ptr[0] == '0'
          && (ptr[1] == 'b' || ptr[1] == 'B')
          && ascii_isbdigit(ptr[2])) {
        ptr += 2;
      }
      goto vim_str2nr_bin;
    // Make STR2NR_OOCT work the same as STR2NR_OCT when forcing.
    case STR2NR_OCT:
    case STR2NR_OOCT:
    case STR2NR_OCT | STR2NR_OOCT:
      // The "0o"/"0O" prefix is optional and only skipped when an octal digit
      // follows it.
      if (!STRING_ENDED(ptr + 2)
          && ptr[0] == '0'
          && (ptr[1] == 'o' || ptr[1] == 'O')
          && ascii_isodigit(ptr[2])) {
        ptr += 2;
      }
      goto vim_str2nr_oct;
    case 0:
      goto vim_str2nr_dec;
    default:
      // Any other combination of base flags is not supported when forcing.
      abort();
    }
  } else if ((what & (STR2NR_HEX | STR2NR_OCT | STR2NR_OOCT | STR2NR_BIN))
             && !STRING_ENDED(ptr + 1) && ptr[0] == '0' && ptr[1] != '8'
             && ptr[1] != '9') {
    // Candidate for a non-decimal number: at least one such base is enabled
    // and the text starts with '0' that is not followed by '8' or '9'.
    pre = ptr[1];
    // Detect hexadecimal: 0x or 0X followed by hex digit.
    if ((what & STR2NR_HEX)
        && !STRING_ENDED(ptr + 2)
        && (pre == 'X' || pre == 'x')
        && ascii_isxdigit(ptr[2])) {
      ptr += 2;
      goto vim_str2nr_hex;
    }
    // Detect binary: 0b or 0B followed by 0 or 1.
    if ((what & STR2NR_BIN)
        && !STRING_ENDED(ptr + 2)
        && (pre == 'B' || pre == 'b')
        && ascii_isbdigit(ptr[2])) {
      ptr += 2;
      goto vim_str2nr_bin;
    }
    // Detect octal: 0o or 0O followed by octal digits (without '8' or '9').
    if ((what & STR2NR_OOCT)
        && !STRING_ENDED(ptr + 2)
        && (pre == 'O' || pre == 'o')
        && ascii_isodigit(ptr[2])) {
      ptr += 2;
      goto vim_str2nr_oct;
    }
    // Detect old octal format: 0 followed by octal digits.
    // No two-character prefix matched, so forget the tentative value.
    pre = 0;
    if (!(what & STR2NR_OCT)
        || !ascii_isodigit(ptr[1])) {
      goto vim_str2nr_dec;
    }
    // An '8' or '9' anywhere in the following run of digits makes the whole
    // number decimal.
    for (int i = 2; !STRING_ENDED(ptr + i) && ascii_isdigit(ptr[i]); i++) {
      if (ptr[i] > '7') {
        goto vim_str2nr_dec;
      }
    }
    pre = '0';
    goto vim_str2nr_oct;
  } else {
    goto vim_str2nr_dec;
  }

  // Do the conversion manually to avoid sscanf() quirks.
  abort();  // Should have used goto earlier.
  // PARSE_NUMBER(base, cond, conv): consume characters while "cond" holds,
  // accumulating "conv" into "un" in the given base and saturating at
  // UVARNUMBER_MAX.  With STR2NR_QUOTE a single quote is skipped when it is
  // not the first character after the prefix and is followed by a character
  // that satisfies "cond"; otherwise the quote ends the number.
#define PARSE_NUMBER(base, cond, conv) \
  do { \
    const char *const after_prefix = ptr; \
    while (!STRING_ENDED(ptr)) { \
      if ((what & STR2NR_QUOTE) && ptr > after_prefix && *ptr == '\'') { \
        ptr++; \
        if (!STRING_ENDED(ptr) && (cond)) { \
          continue; \
        } \
        ptr--; \
      } \
      if (!(cond)) { \
        break; \
      } \
      const uvarnumber_T digit = (uvarnumber_T)(conv); \
      /* avoid ubsan error for overflow */ \
      if (un < UVARNUMBER_MAX / base \
          || (un == UVARNUMBER_MAX / base \
              && (base != 10 || digit <= UVARNUMBER_MAX % 10))) { \
        un = base * un + digit; \
      } else { \
        un = UVARNUMBER_MAX; \
      } \
      ptr++; \
    } \
  } while (0)
vim_str2nr_bin:
  PARSE_NUMBER(2, (*ptr == '0' || *ptr == '1'), (*ptr - '0'));
  goto vim_str2nr_proceed;
vim_str2nr_oct:
  PARSE_NUMBER(8, (ascii_isodigit(*ptr)), (*ptr - '0'));
  goto vim_str2nr_proceed;
vim_str2nr_dec:
  PARSE_NUMBER(10, (ascii_isdigit(*ptr)), (*ptr - '0'));
  goto vim_str2nr_proceed;
vim_str2nr_hex:
  PARSE_NUMBER(16, (ascii_isxdigit(*ptr)), (hex2nr(*ptr)));
  goto vim_str2nr_proceed;
#undef PARSE_NUMBER

vim_str2nr_proceed:
  // Check for an alpha-numeric character immediately following, that is
  // most likely a typo.
  // The following character is not examined when exactly "maxlen" characters
  // were consumed.  On failure *len keeps the 0 stored at the top and none of
  // the other outputs are written.
  if (strict && ptr - (const char *)start != maxlen && ASCII_ISALNUM(*ptr)) {
    return;
  }

  if (prep != NULL) {
    *prep = pre;
  }

  // The length includes the sign and any prefix that was skipped.
  if (len != NULL) {
    *len = (int)(ptr - (const char *)start);
  }

  if (nptr != NULL) {
    if (negative) {  // account for leading '-' for decimal numbers
      // avoid ubsan error for overflow
      if (un > VARNUMBER_MAX) {
        *nptr = VARNUMBER_MIN;
      } else {
        *nptr = -(varnumber_T)un;
      }
    } else {
      // NOTE: this clamps "un" itself, so for a non-negative number *unptr
      // below is also limited to VARNUMBER_MAX whenever "nptr" is given.
      if (un > VARNUMBER_MAX) {
        un = VARNUMBER_MAX;
      }
      *nptr = (varnumber_T)un;
    }
  }

  if (unptr != NULL) {
    *unptr = un;
  }
#undef STRING_ENDED
}
1626
/// Return the value of a single hex character.
/// Only valid when the argument is '0' - '9', 'A' - 'F' or 'a' - 'f'; any
/// other argument is treated like a decimal digit (`c - '0'`).
///
/// @param c  Character to convert.
///
/// @return The value of the hex character.
int hex2nr(int c)
{
  int zero_char;   // Character that maps to "offset".
  int offset = 0;  // Value of "zero_char".

  if ('a' <= c && c <= 'f') {
    zero_char = 'a';
    offset = 10;
  } else if ('A' <= c && c <= 'F') {
    zero_char = 'A';
    offset = 10;
  } else {
    zero_char = '0';
  }
  return c - zero_char + offset;
}
1644
1645 /// Convert two hex characters to a byte.
1646 /// Return -1 if one of the characters is not hex.
hexhex2nr(char_u * p)1647 int hexhex2nr(char_u *p)
1648 {
1649 if (!ascii_isxdigit(p[0]) || !ascii_isxdigit(p[1])) {
1650 return -1;
1651 }
1652 return (hex2nr(p[0]) << 4) + hex2nr(p[1]);
1653 }
1654
1655 /// Check that "str" starts with a backslash that should be removed.
1656 /// For Windows this is only done when the character after the
1657 /// backslash is not a normal file name character.
1658 /// '$' is a valid file name character, we don't remove the backslash before
1659 /// it. This means it is not possible to use an environment variable after a
1660 /// backslash. "C:\$VIM\doc" is taken literally, only "$VIM\doc" works.
1661 /// Although "\ name" is valid, the backslash in "Program\ files" must be
1662 /// removed. Assume a file name doesn't start with a space.
1663 /// For multi-byte names, never remove a backslash before a non-ascii
1664 /// character, assume that all multi-byte characters are valid file name
1665 /// characters.
1666 ///
1667 /// @param str file path string to check
rem_backslash(const char_u * str)1668 bool rem_backslash(const char_u *str)
1669 FUNC_ATTR_PURE FUNC_ATTR_WARN_UNUSED_RESULT FUNC_ATTR_NONNULL_ALL
1670 {
1671 #ifdef BACKSLASH_IN_FILENAME
1672 return str[0] == '\\'
1673 && str[1] < 0x80
1674 && (str[1] == ' '
1675 || (str[1] != NUL
1676 && str[1] != '*'
1677 && str[1] != '?'
1678 && !vim_isfilec(str[1])));
1679
1680 #else // ifdef BACKSLASH_IN_FILENAME
1681 return str[0] == '\\' && str[1] != NUL;
1682 #endif // ifdef BACKSLASH_IN_FILENAME
1683 }
1684
1685 /// Halve the number of backslashes in a file name argument.
1686 ///
1687 /// @param p
backslash_halve(char_u * p)1688 void backslash_halve(char_u *p)
1689 {
1690 for (; *p; ++p) {
1691 if (rem_backslash(p)) {
1692 STRMOVE(p, p + 1);
1693 }
1694 }
1695 }
1696
1697 /// backslash_halve() plus save the result in allocated memory.
1698 ///
1699 /// @param p
1700 ///
1701 /// @return String with the number of backslashes halved.
backslash_halve_save(const char_u * p)1702 char_u *backslash_halve_save(const char_u *p)
1703 FUNC_ATTR_NONNULL_ALL FUNC_ATTR_NONNULL_RET
1704 {
1705 // TODO(philix): simplify and improve backslash_halve_save algorithm
1706 char_u *res = vim_strsave(p);
1707 backslash_halve(res);
1708 return res;
1709 }
1710