1 // This is an open source non-commercial project. Dear PVS-Studio, please check
2 // it. PVS-Studio Static Code Analyzer for C, C++ and C#: http://www.viva64.com
3
4 /// mbyte.c: Code specifically for handling multi-byte characters.
5 /// Multibyte extensions partly by Sung-Hoon Baek
6 ///
7 /// Strings internal to Nvim are always encoded as UTF-8 (thus the legacy
8 /// 'encoding' option is always "utf-8").
9 ///
10 /// The cell width on the display needs to be determined from the character
11 /// value. Recognizing UTF-8 bytes is easy: 0xxx.xxxx is a single-byte char,
12 /// 10xx.xxxx is a trailing byte, 11xx.xxxx is a leading byte of a multi-byte
13 /// character. To make things complicated, up to six composing characters
14 /// are allowed. These are drawn on top of the first char. For most editing
15 /// the sequence of bytes with composing characters included is considered to
16 /// be one character.
17 ///
18 /// UTF-8 is used everywhere in the core. This is in registers, text
19 /// manipulation, buffers, etc. Nvim core communicates with external plugins
20 /// and GUIs in this encoding.
21 ///
22 /// The encoding of a file is specified with 'fileencoding'. Conversion
23 /// is to be done when it's different from "utf-8".
24 ///
25 /// Vim scripts may contain an ":scriptencoding" command. This has an effect
26 /// for some commands, like ":menutrans".
27
28 #include <inttypes.h>
29 #include <stdbool.h>
30 #include <string.h>
31 #include <wchar.h>
32 #include <wctype.h>
33
34 #include "nvim/ascii.h"
35 #include "nvim/vim.h"
36 #ifdef HAVE_LOCALE_H
37 # include <locale.h>
38 #endif
39 #include "nvim/arabic.h"
40 #include "nvim/charset.h"
41 #include "nvim/cursor.h"
42 #include "nvim/eval.h"
43 #include "nvim/fileio.h"
44 #include "nvim/func_attr.h"
45 #include "nvim/iconv.h"
46 #include "nvim/mark.h"
47 #include "nvim/mbyte.h"
48 #include "nvim/memline.h"
49 #include "nvim/memory.h"
50 #include "nvim/message.h"
51 #include "nvim/misc1.h"
52 #include "nvim/option.h"
53 #include "nvim/os/os.h"
54 #include "nvim/path.h"
55 #include "nvim/screen.h"
56 #include "nvim/spell.h"
57 #include "nvim/strings.h"
58
/// One range entry of a character conversion table.
///
/// NOTE(review): no user of this type is visible in this part of the file;
/// the field descriptions below are inferred from the field names only and
/// must be confirmed against the code that walks these tables.
typedef struct {
  int rangeStart;  // presumably the first character of the range
  int rangeEnd;    // presumably the last character of the range
  int step;        // presumably the distance between affected characters
  int offset;      // presumably the value added to convert a character
} convertStruct;
65
/// Closed range of character values.  Arrays of these are what intable()
/// searches; a value c matches an entry when first <= c <= last.
struct interval {
  long first;  // lowest value in the range (inclusive)
  long last;   // highest value in the range (inclusive)
};
70
71 #ifdef INCLUDE_GENERATED_DECLARATIONS
72 # include "mbyte.c.generated.h"
73
74 # include "unicode_tables.generated.h"
75 #endif
76
// To speed up BYTELEN(); keep a lookup table to quickly get the length in
// bytes of a UTF-8 character from the first byte of a UTF-8 string.  Bytes
// which are illegal when used as the first byte have a 1.  The NUL byte has
// length 1.
// Rows are indexed by the high nibble of the byte, columns by the low nibble.
const uint8_t utf8len_tab[] = {
  // ?0 ?1 ?2 ?3 ?4 ?5 ?6 ?7 ?8 ?9 ?A ?B ?C ?D ?E ?F
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 0?
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 1?
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 2?
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 3?
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 4?
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 5?
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 6?
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 7?
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 8?
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 9?
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // A?
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // B?
  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  // C?
  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  // D?
  3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,  // E?
  4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 1, 1,  // F?
};
100
// Like utf8len_tab above, but using a zero for illegal lead bytes
// (0x80..0xBF, 0xFE and 0xFF), so callers can tell them apart from
// single-byte characters.
// Rows are indexed by the high nibble of the byte, columns by the low nibble.
const uint8_t utf8len_tab_zero[] = {
  // ?0 ?1 ?2 ?3 ?4 ?5 ?6 ?7 ?8 ?9 ?A ?B ?C ?D ?E ?F
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 0?
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 1?
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 2?
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 3?
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 4?
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 5?
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 6?
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 7?
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 8?
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 9?
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // A?
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // B?
  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  // C?
  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  // D?
  3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,  // E?
  4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0,  // F?
};
121
/*
 * Canonical encoding names and their properties.
 * "iso-8859-n" is handled by enc_canonize() directly.
 *
 * Each entry holds the canonical name, its ENC_* property flags and a third
 * value that is a codepage number for 8-bit entries (0 when none is given)
 * or a DBCS_* constant for ENC_DBCS entries.
 *
 * Every IDX_* macro is the array index of the entry that follows it, and
 * IDX_COUNT is the total number of entries (enc_canon_search() loops up to
 * it), so the macros must be renumbered whenever an entry is added, removed
 * or moved.
 *
 * NOTE(review): the "like iso-8859-N" remarks on the cp850..cp863 rows look
 * shifted by one row relative to the code pages they describe -- verify
 * before relying on them.
 */
static struct
{ const char *name; int prop; int codepage; }
enc_canon_table[] =
{
#define IDX_LATIN_1 0
  { "latin1", ENC_8BIT + ENC_LATIN1, 1252 },
#define IDX_ISO_2 1
  { "iso-8859-2", ENC_8BIT, 0 },
#define IDX_ISO_3 2
  { "iso-8859-3", ENC_8BIT, 0 },
#define IDX_ISO_4 3
  { "iso-8859-4", ENC_8BIT, 0 },
#define IDX_ISO_5 4
  { "iso-8859-5", ENC_8BIT, 0 },
#define IDX_ISO_6 5
  { "iso-8859-6", ENC_8BIT, 0 },
#define IDX_ISO_7 6
  { "iso-8859-7", ENC_8BIT, 0 },
#define IDX_ISO_8 7
  { "iso-8859-8", ENC_8BIT, 0 },
#define IDX_ISO_9 8
  { "iso-8859-9", ENC_8BIT, 0 },
#define IDX_ISO_10 9
  { "iso-8859-10", ENC_8BIT, 0 },
#define IDX_ISO_11 10
  { "iso-8859-11", ENC_8BIT, 0 },
#define IDX_ISO_13 11
  { "iso-8859-13", ENC_8BIT, 0 },
#define IDX_ISO_14 12
  { "iso-8859-14", ENC_8BIT, 0 },
#define IDX_ISO_15 13
  { "iso-8859-15", ENC_8BIT + ENC_LATIN9, 0 },
#define IDX_KOI8_R 14
  { "koi8-r", ENC_8BIT, 0 },
#define IDX_KOI8_U 15
  { "koi8-u", ENC_8BIT, 0 },
#define IDX_UTF8 16
  { "utf-8", ENC_UNICODE, 0 },
#define IDX_UCS2 17
  { "ucs-2", ENC_UNICODE + ENC_ENDIAN_B + ENC_2BYTE, 0 },
#define IDX_UCS2LE 18
  { "ucs-2le", ENC_UNICODE + ENC_ENDIAN_L + ENC_2BYTE, 0 },
#define IDX_UTF16 19
  { "utf-16", ENC_UNICODE + ENC_ENDIAN_B + ENC_2WORD, 0 },
#define IDX_UTF16LE 20
  { "utf-16le", ENC_UNICODE + ENC_ENDIAN_L + ENC_2WORD, 0 },
#define IDX_UCS4 21
  { "ucs-4", ENC_UNICODE + ENC_ENDIAN_B + ENC_4BYTE, 0 },
#define IDX_UCS4LE 22
  { "ucs-4le", ENC_UNICODE + ENC_ENDIAN_L + ENC_4BYTE, 0 },

  // For debugging DBCS encoding on Unix.
#define IDX_DEBUG 23
  { "debug", ENC_DBCS, DBCS_DEBUG },
#define IDX_EUC_JP 24
  { "euc-jp", ENC_DBCS, DBCS_JPNU },
#define IDX_SJIS 25
  { "sjis", ENC_DBCS, DBCS_JPN },
#define IDX_EUC_KR 26
  { "euc-kr", ENC_DBCS, DBCS_KORU },
#define IDX_EUC_CN 27
  { "euc-cn", ENC_DBCS, DBCS_CHSU },
#define IDX_EUC_TW 28
  { "euc-tw", ENC_DBCS, DBCS_CHTU },
#define IDX_BIG5 29
  { "big5", ENC_DBCS, DBCS_CHT },

  // MS-DOS and MS-Windows codepages are included here, so that they can be
  // used on Unix too.  Most of them are similar to ISO-8859 encodings, but
  // not exactly the same.
#define IDX_CP437 30
  { "cp437", ENC_8BIT, 437 },  // like iso-8859-1
#define IDX_CP737 31
  { "cp737", ENC_8BIT, 737 },  // like iso-8859-7
#define IDX_CP775 32
  { "cp775", ENC_8BIT, 775 },  // Baltic
#define IDX_CP850 33
  { "cp850", ENC_8BIT, 850 },  // like iso-8859-4
#define IDX_CP852 34
  { "cp852", ENC_8BIT, 852 },  // like iso-8859-1
#define IDX_CP855 35
  { "cp855", ENC_8BIT, 855 },  // like iso-8859-2
#define IDX_CP857 36
  { "cp857", ENC_8BIT, 857 },  // like iso-8859-5
#define IDX_CP860 37
  { "cp860", ENC_8BIT, 860 },  // like iso-8859-9
#define IDX_CP861 38
  { "cp861", ENC_8BIT, 861 },  // like iso-8859-1
#define IDX_CP862 39
  { "cp862", ENC_8BIT, 862 },  // like iso-8859-1
#define IDX_CP863 40
  { "cp863", ENC_8BIT, 863 },  // like iso-8859-8
#define IDX_CP865 41
  { "cp865", ENC_8BIT, 865 },  // like iso-8859-1
#define IDX_CP866 42
  { "cp866", ENC_8BIT, 866 },  // like iso-8859-5
#define IDX_CP869 43
  { "cp869", ENC_8BIT, 869 },  // like iso-8859-7
#define IDX_CP874 44
  { "cp874", ENC_8BIT, 874 },  // Thai
#define IDX_CP932 45
  { "cp932", ENC_DBCS, DBCS_JPN },
#define IDX_CP936 46
  { "cp936", ENC_DBCS, DBCS_CHS },
#define IDX_CP949 47
  { "cp949", ENC_DBCS, DBCS_KOR },
#define IDX_CP950 48
  { "cp950", ENC_DBCS, DBCS_CHT },
#define IDX_CP1250 49
  { "cp1250", ENC_8BIT, 1250 },  // Czech, Polish, etc.
#define IDX_CP1251 50
  { "cp1251", ENC_8BIT, 1251 },  // Cyrillic
  // cp1252 is considered to be equal to latin1
#define IDX_CP1253 51
  { "cp1253", ENC_8BIT, 1253 },  // Greek
#define IDX_CP1254 52
  { "cp1254", ENC_8BIT, 1254 },  // Turkish
#define IDX_CP1255 53
  { "cp1255", ENC_8BIT, 1255 },  // Hebrew
#define IDX_CP1256 54
  { "cp1256", ENC_8BIT, 1256 },  // Arabic
#define IDX_CP1257 55
  { "cp1257", ENC_8BIT, 1257 },  // Baltic
#define IDX_CP1258 56
  { "cp1258", ENC_8BIT, 1258 },  // Vietnamese

#define IDX_MACROMAN 57
  { "macroman", ENC_8BIT + ENC_MACROMAN, 0 },  // Mac OS
#define IDX_HPROMAN8 58
  { "hp-roman8", ENC_8BIT, 0 },  // HP Roman8
#define IDX_COUNT 59
};
258
/*
 * Aliases for encoding names.
 *
 * Each entry maps an alternative spelling to the IDX_* index of its
 * canonical entry in enc_canon_table[].  The list is terminated by an entry
 * whose name is NULL.
 *
 * NOTE(review): "950" is listed twice (first for IDX_CP950, later for
 * IDX_BIG5) and "cp950" is listed as an alias although it is also a
 * canonical name.  The lookup code is not visible here; if it returns the
 * first match, the two IDX_BIG5 rows near the end can never be selected --
 * confirm and clean up.
 */
static struct
{ const char *name; int canon; }
enc_alias_table[] =
{
  { "ansi", IDX_LATIN_1 },
  { "iso-8859-1", IDX_LATIN_1 },
  { "latin2", IDX_ISO_2 },
  { "latin3", IDX_ISO_3 },
  { "latin4", IDX_ISO_4 },
  { "cyrillic", IDX_ISO_5 },
  { "arabic", IDX_ISO_6 },
  { "greek", IDX_ISO_7 },
  { "hebrew", IDX_ISO_8 },
  { "latin5", IDX_ISO_9 },
  { "turkish", IDX_ISO_9 },  // ?
  { "latin6", IDX_ISO_10 },
  { "nordic", IDX_ISO_10 },  // ?
  { "thai", IDX_ISO_11 },  // ?
  { "latin7", IDX_ISO_13 },
  { "latin8", IDX_ISO_14 },
  { "latin9", IDX_ISO_15 },
  { "utf8", IDX_UTF8 },
  { "unicode", IDX_UCS2 },
  { "ucs2", IDX_UCS2 },
  { "ucs2be", IDX_UCS2 },
  { "ucs-2be", IDX_UCS2 },
  { "ucs2le", IDX_UCS2LE },
  { "utf16", IDX_UTF16 },
  { "utf16be", IDX_UTF16 },
  { "utf-16be", IDX_UTF16 },
  { "utf16le", IDX_UTF16LE },
  { "ucs4", IDX_UCS4 },
  { "ucs4be", IDX_UCS4 },
  { "ucs-4be", IDX_UCS4 },
  { "ucs4le", IDX_UCS4LE },
  { "utf32", IDX_UCS4 },
  { "utf-32", IDX_UCS4 },
  { "utf32be", IDX_UCS4 },
  { "utf-32be", IDX_UCS4 },
  { "utf32le", IDX_UCS4LE },
  { "utf-32le", IDX_UCS4LE },
  { "932", IDX_CP932 },
  { "949", IDX_CP949 },
  { "936", IDX_CP936 },
  { "gbk", IDX_CP936 },
  { "950", IDX_CP950 },
  { "eucjp", IDX_EUC_JP },
  { "unix-jis", IDX_EUC_JP },
  { "ujis", IDX_EUC_JP },
  { "shift-jis", IDX_SJIS },
  { "pck", IDX_SJIS },  // Sun: PCK
  { "euckr", IDX_EUC_KR },
  { "5601", IDX_EUC_KR },  // Sun: KS C 5601
  { "euccn", IDX_EUC_CN },
  { "gb2312", IDX_EUC_CN },
  { "euctw", IDX_EUC_TW },
  { "japan", IDX_EUC_JP },
  { "korea", IDX_EUC_KR },
  { "prc", IDX_EUC_CN },
  { "zh-cn", IDX_EUC_CN },
  { "chinese", IDX_EUC_CN },
  { "zh-tw", IDX_EUC_TW },
  { "taiwan", IDX_EUC_TW },
  { "cp950", IDX_BIG5 },
  { "950", IDX_BIG5 },
  { "mac", IDX_MACROMAN },
  { "mac-roman", IDX_MACROMAN },
  { NULL, 0 }
};
331
332 /*
333 * Find encoding "name" in the list of canonical encoding names.
334 * Returns -1 if not found.
335 */
enc_canon_search(const char_u * name)336 static int enc_canon_search(const char_u *name)
337 {
338 int i;
339
340 for (i = 0; i < IDX_COUNT; ++i) {
341 if (STRCMP(name, enc_canon_table[i].name) == 0) {
342 return i;
343 }
344 }
345 return -1;
346 }
347
348
349 /*
350 * Find canonical encoding "name" in the list and return its properties.
351 * Returns 0 if not found.
352 */
enc_canon_props(const char_u * name)353 int enc_canon_props(const char_u *name)
354 {
355 int i;
356
357 i = enc_canon_search(name);
358 if (i >= 0) {
359 return enc_canon_table[i].prop;
360 } else if (STRNCMP(name, "2byte-", 6) == 0) {
361 return ENC_DBCS;
362 } else if (STRNCMP(name, "8bit-", 5) == 0 || STRNCMP(name, "iso-8859-", 9) == 0) {
363 return ENC_8BIT;
364 }
365 return 0;
366 }
367
368 /*
369 * Return the size of the BOM for the current buffer:
370 * 0 - no BOM
371 * 2 - UCS-2 or UTF-16 BOM
372 * 4 - UCS-4 BOM
373 * 3 - UTF-8 BOM
374 */
bomb_size(void)375 int bomb_size(void)
376 {
377 int n = 0;
378
379 if (curbuf->b_p_bomb && !curbuf->b_p_bin) {
380 if (*curbuf->b_p_fenc == NUL
381 || STRCMP(curbuf->b_p_fenc, "utf-8") == 0) {
382 n = 3;
383 } else if (STRNCMP(curbuf->b_p_fenc, "ucs-2", 5) == 0
384 || STRNCMP(curbuf->b_p_fenc, "utf-16", 6) == 0) {
385 n = 2;
386 } else if (STRNCMP(curbuf->b_p_fenc, "ucs-4", 5) == 0) {
387 n = 4;
388 }
389 }
390 return n;
391 }
392
393 /*
394 * Remove all BOM from "s" by moving remaining text.
395 */
remove_bom(char_u * s)396 void remove_bom(char_u *s)
397 {
398 char *p = (char *)s;
399
400 while ((p = strchr(p, 0xef)) != NULL) {
401 if ((uint8_t)p[1] == 0xbb && (uint8_t)p[2] == 0xbf) {
402 STRMOVE(p, p + 3);
403 } else {
404 p++;
405 }
406 }
407 }
408
409 /*
410 * Get class of pointer:
411 * 0 for blank or NUL
412 * 1 for punctuation
413 * 2 for an (ASCII) word character
414 * >2 for other word characters
415 */
mb_get_class(const char_u * p)416 int mb_get_class(const char_u *p)
417 {
418 return mb_get_class_tab(p, curbuf->b_chartab);
419 }
420
mb_get_class_tab(const char_u * p,const uint64_t * const chartab)421 int mb_get_class_tab(const char_u *p, const uint64_t *const chartab)
422 {
423 if (MB_BYTE2LEN(p[0]) == 1) {
424 if (p[0] == NUL || ascii_iswhite(p[0])) {
425 return 0;
426 }
427 if (vim_iswordc_tab(p[0], chartab)) {
428 return 2;
429 }
430 return 1;
431 }
432 return utf_class_tab(utf_ptr2char(p), chartab);
433 }
434
435 /*
436 * Return true if "c" is in "table".
437 */
intable(const struct interval * table,size_t n_items,int c)438 static bool intable(const struct interval *table, size_t n_items, int c)
439 {
440 int mid, bot, top;
441
442 // first quick check for Latin1 etc. characters
443 if (c < table[0].first) {
444 return false;
445 }
446
447 // binary search in table
448 bot = 0;
449 top = (int)(n_items - 1);
450 while (top >= bot) {
451 mid = (bot + top) / 2;
452 if (table[mid].last < c) {
453 bot = mid + 1;
454 } else if (table[mid].first > c) {
455 top = mid - 1;
456 } else {
457 return true;
458 }
459 }
460 return false;
461 }
462
463 /// For UTF-8 character "c" return 2 for a double-width character, 1 for others.
464 /// Returns 4 or 6 for an unprintable character.
465 /// Is only correct for characters >= 0x80.
466 /// When p_ambw is "double", return 2 for a character with East Asian Width
467 /// class 'A'(mbiguous).
468 ///
469 /// @note Tables `doublewidth` and `ambiguous` are generated by
470 /// gen_unicode_tables.lua, which must be manually invoked as needed.
utf_char2cells(int c)471 int utf_char2cells(int c)
472 {
473 if (c >= 0x100) {
474 #ifdef USE_WCHAR_FUNCTIONS
475 //
476 // Assume the library function wcwidth() works better than our own
477 // stuff. It should return 1 for ambiguous width chars!
478 //
479 int n = wcwidth(c);
480
481 if (n < 0) {
482 return 6; // unprintable, displays <xxxx>
483 }
484 if (n > 1) {
485 return n;
486 }
487 #else
488 if (!utf_printable(c)) {
489 return 6; // unprintable, displays <xxxx>
490 }
491 if (intable(doublewidth, ARRAY_SIZE(doublewidth), c)) {
492 return 2;
493 }
494 #endif
495 if (p_emoji && intable(emoji_width, ARRAY_SIZE(emoji_width), c)) {
496 return 2;
497 }
498 } else if (c >= 0x80 && !vim_isprintc(c)) {
499 // Characters below 0x100 are influenced by 'isprint' option.
500 return 4; // unprintable, displays <xx>
501 }
502
503 if (c >= 0x80 && *p_ambw == 'd'
504 && intable(ambiguous, ARRAY_SIZE(ambiguous), c)) {
505 return 2;
506 }
507
508 return 1;
509 }
510
511 /// Return the number of display cells character at "*p" occupies.
512 /// This doesn't take care of unprintable characters, use ptr2cells() for that.
utf_ptr2cells(const char_u * p)513 int utf_ptr2cells(const char_u *p)
514 {
515 int c;
516
517 // Need to convert to a character number.
518 if (*p >= 0x80) {
519 c = utf_ptr2char(p);
520 // An illegal byte is displayed as <xx>.
521 if (utf_ptr2len(p) == 1 || c == NUL) {
522 return 4;
523 }
524 // If the char is ASCII it must be an overlong sequence.
525 if (c < 0x80) {
526 return char2cells(c);
527 }
528 return utf_char2cells(c);
529 }
530 return 1;
531 }
532
533 /// Like utf_ptr2cells(), but limit string length to "size".
534 /// For an empty string or truncated character returns 1.
utf_ptr2cells_len(const char_u * p,int size)535 int utf_ptr2cells_len(const char_u *p, int size)
536 {
537 int c;
538
539 // Need to convert to a wide character.
540 if (size > 0 && *p >= 0x80) {
541 if (utf_ptr2len_len(p, size) < utf8len_tab[*p]) {
542 return 1; // truncated
543 }
544 c = utf_ptr2char(p);
545 // An illegal byte is displayed as <xx>.
546 if (utf_ptr2len(p) == 1 || c == NUL) {
547 return 4;
548 }
549 // If the char is ASCII it must be an overlong sequence.
550 if (c < 0x80) {
551 return char2cells(c);
552 }
553 return utf_char2cells(c);
554 }
555 return 1;
556 }
557
558 /// Calculate the number of cells occupied by string `str`.
559 ///
560 /// @param str The source string, may not be NULL, must be a NUL-terminated
561 /// string.
562 /// @return The number of cells occupied by string `str`
mb_string2cells(const char_u * str)563 size_t mb_string2cells(const char_u *str)
564 {
565 size_t clen = 0;
566
567 for (const char_u *p = str; *p != NUL; p += utfc_ptr2len(p)) {
568 clen += utf_ptr2cells(p);
569 }
570
571 return clen;
572 }
573
574 /// Get the number of cells occupied by string `str` with maximum length `size`
575 ///
576 /// @param str The source string, may not be NULL, must be a NUL-terminated
577 /// string.
578 /// @param size maximum length of string. It will terminate on earlier NUL.
579 /// @return The number of cells occupied by string `str`
mb_string2cells_len(const char_u * str,size_t size)580 size_t mb_string2cells_len(const char_u *str, size_t size)
581 FUNC_ATTR_NONNULL_ARG(1)
582 {
583 size_t clen = 0;
584
585 for (const char_u *p = str; *p != NUL && p < str+size;
586 p += utfc_ptr2len_len(p, size+(p-str))) {
587 clen += utf_ptr2cells(p);
588 }
589
590 return clen;
591 }
592
593 /// Convert a UTF-8 byte sequence to a character number.
594 ///
595 /// If the sequence is illegal or truncated by a NUL then the first byte is
596 /// returned.
597 /// For an overlong sequence this may return zero.
598 /// Does not include composing characters for obvious reasons.
599 ///
600 /// @param[in] p String to convert.
601 ///
602 /// @return Unicode codepoint or byte value.
utf_ptr2char(const char_u * const p)603 int utf_ptr2char(const char_u *const p)
604 FUNC_ATTR_PURE FUNC_ATTR_WARN_UNUSED_RESULT
605 {
606 if (p[0] < 0x80) { // Be quick for ASCII.
607 return p[0];
608 }
609
610 const uint8_t len = utf8len_tab_zero[p[0]];
611 if (len > 1 && (p[1] & 0xc0) == 0x80) {
612 if (len == 2) {
613 return ((p[0] & 0x1f) << 6) + (p[1] & 0x3f);
614 }
615 if ((p[2] & 0xc0) == 0x80) {
616 if (len == 3) {
617 return (((p[0] & 0x0f) << 12) + ((p[1] & 0x3f) << 6)
618 + (p[2] & 0x3f));
619 }
620 if ((p[3] & 0xc0) == 0x80) {
621 if (len == 4) {
622 return (((p[0] & 0x07) << 18) + ((p[1] & 0x3f) << 12)
623 + ((p[2] & 0x3f) << 6) + (p[3] & 0x3f));
624 }
625 if ((p[4] & 0xc0) == 0x80) {
626 if (len == 5) {
627 return (((p[0] & 0x03) << 24) + ((p[1] & 0x3f) << 18)
628 + ((p[2] & 0x3f) << 12) + ((p[3] & 0x3f) << 6)
629 + (p[4] & 0x3f));
630 }
631 if ((p[5] & 0xc0) == 0x80 && len == 6) {
632 return (((p[0] & 0x01) << 30) + ((p[1] & 0x3f) << 24)
633 + ((p[2] & 0x3f) << 18) + ((p[3] & 0x3f) << 12)
634 + ((p[4] & 0x3f) << 6) + (p[5] & 0x3f));
635 }
636 }
637 }
638 }
639 }
640 // Illegal value: just return the first byte.
641 return p[0];
642 }
643
644 /*
645 * Convert a UTF-8 byte sequence to a wide character.
646 * String is assumed to be terminated by NUL or after "n" bytes, whichever
647 * comes first.
648 * The function is safe in the sense that it never accesses memory beyond the
649 * first "n" bytes of "s".
650 *
651 * On success, returns decoded codepoint, advances "s" to the beginning of
652 * next character and decreases "n" accordingly.
653 *
654 * If end of string was reached, returns 0 and, if "n" > 0, advances "s" past
655 * NUL byte.
656 *
657 * If byte sequence is illegal or incomplete, returns -1 and does not advance
658 * "s".
659 */
utf_safe_read_char_adv(const char_u ** s,size_t * n)660 static int utf_safe_read_char_adv(const char_u **s, size_t *n)
661 {
662 int c;
663
664 if (*n == 0) { // end of buffer
665 return 0;
666 }
667
668 uint8_t k = utf8len_tab_zero[**s];
669
670 if (k == 1) {
671 // ASCII character or NUL
672 (*n)--;
673 return *(*s)++;
674 }
675
676 if (k <= *n) {
677 // We have a multibyte sequence and it isn't truncated by buffer
678 // limits so utf_ptr2char() is safe to use. Or the first byte is
679 // illegal (k=0), and it's also safe to use utf_ptr2char().
680 c = utf_ptr2char(*s);
681
682 // On failure, utf_ptr2char() returns the first byte, so here we
683 // check equality with the first byte. The only non-ASCII character
684 // which equals the first byte of its own UTF-8 representation is
685 // U+00C3 (UTF-8: 0xC3 0x83), so need to check that special case too.
686 // It's safe even if n=1, else we would have k=2 > n.
687 if (c != (int)(**s) || (c == 0xC3 && (*s)[1] == 0x83)) {
688 // byte sequence was successfully decoded
689 *s += k;
690 *n -= k;
691 return c;
692 }
693 }
694
695 // byte sequence is incomplete or illegal
696 return -1;
697 }
698
699 /*
700 * Get character at **pp and advance *pp to the next character.
701 * Note: composing characters are skipped!
702 */
mb_ptr2char_adv(const char_u ** const pp)703 int mb_ptr2char_adv(const char_u **const pp)
704 {
705 int c;
706
707 c = utf_ptr2char(*pp);
708 *pp += utfc_ptr2len(*pp);
709 return c;
710 }
711
712 /*
713 * Get character at **pp and advance *pp to the next character.
714 * Note: composing characters are returned as separate characters.
715 */
mb_cptr2char_adv(const char_u ** pp)716 int mb_cptr2char_adv(const char_u **pp)
717 {
718 int c;
719
720 c = utf_ptr2char(*pp);
721 *pp += utf_ptr2len(*pp);
722 return c;
723 }
724
725 /*
726 * Check if the character pointed to by "p2" is a composing character when it
727 * comes after "p1". For Arabic sometimes "ab" is replaced with "c", which
728 * behaves like a composing character.
729 */
utf_composinglike(const char_u * p1,const char_u * p2)730 bool utf_composinglike(const char_u *p1, const char_u *p2)
731 {
732 int c2;
733
734 c2 = utf_ptr2char(p2);
735 if (utf_iscomposing(c2)) {
736 return true;
737 }
738 if (!arabic_maycombine(c2)) {
739 return false;
740 }
741 return arabic_combine(utf_ptr2char(p1), c2);
742 }
743
744 /// Convert a UTF-8 string to a wide character
745 ///
746 /// Also gets up to #MAX_MCO composing characters.
747 ///
748 /// @param[out] pcc Location where to store composing characters. Must have
749 /// space at least for #MAX_MCO + 1 elements.
750 ///
751 /// @return leading character.
utfc_ptr2char(const char_u * p,int * pcc)752 int utfc_ptr2char(const char_u *p, int *pcc)
753 {
754 int len;
755 int c;
756 int cc;
757 int i = 0;
758
759 c = utf_ptr2char(p);
760 len = utf_ptr2len(p);
761
762 // Only accept a composing char when the first char isn't illegal.
763 if ((len > 1 || *p < 0x80)
764 && p[len] >= 0x80
765 && utf_composinglike(p, p + len)) {
766 cc = utf_ptr2char(p + len);
767 for (;;) {
768 pcc[i++] = cc;
769 if (i == MAX_MCO) {
770 break;
771 }
772 len += utf_ptr2len(p + len);
773 if (p[len] < 0x80 || !utf_iscomposing(cc = utf_ptr2char(p + len))) {
774 break;
775 }
776 }
777 }
778
779 if (i < MAX_MCO) { // last composing char must be 0
780 pcc[i] = 0;
781 }
782
783 return c;
784 }
785
786 /*
787 * Convert a UTF-8 byte string to a wide character. Also get up to MAX_MCO
788 * composing characters. Use no more than p[maxlen].
789 *
790 * @param [out] pcc: composing chars, last one is 0
791 */
utfc_ptr2char_len(const char_u * p,int * pcc,int maxlen)792 int utfc_ptr2char_len(const char_u *p, int *pcc, int maxlen)
793 {
794 assert(maxlen > 0);
795
796 int i = 0;
797
798 int len = utf_ptr2len_len(p, maxlen);
799 // Is it safe to use utf_ptr2char()?
800 bool safe = len > 1 && len <= maxlen;
801 int c = safe ? utf_ptr2char(p) : *p;
802
803 // Only accept a composing char when the first char isn't illegal.
804 if ((safe || c < 0x80) && len < maxlen && p[len] >= 0x80) {
805 for (; i < MAX_MCO; i++) {
806 int len_cc = utf_ptr2len_len(p + len, maxlen - len);
807 safe = len_cc > 1 && len_cc <= maxlen - len;
808 if (!safe || (pcc[i] = utf_ptr2char(p + len)) < 0x80
809 || !(i == 0 ? utf_composinglike(p, p+len) : utf_iscomposing(pcc[i]))) {
810 break;
811 }
812 len += len_cc;
813 }
814 }
815
816 if (i < MAX_MCO) {
817 // last composing char must be 0
818 pcc[i] = 0;
819 }
820
821 return c;
822 #undef ISCOMPOSING
823 }
824
825 /// Get the length of a UTF-8 byte sequence representing a single codepoint
826 ///
827 /// @param[in] p UTF-8 string.
828 ///
829 /// @return Sequence length, 0 for empty string and 1 for non-UTF-8 byte
830 /// sequence.
utf_ptr2len(const char_u * const p)831 int utf_ptr2len(const char_u *const p)
832 FUNC_ATTR_PURE FUNC_ATTR_WARN_UNUSED_RESULT FUNC_ATTR_NONNULL_ALL
833 {
834 if (*p == NUL) {
835 return 0;
836 }
837 const int len = utf8len_tab[*p];
838 for (int i = 1; i < len; i++) {
839 if ((p[i] & 0xc0) != 0x80) {
840 return 1;
841 }
842 }
843 return len;
844 }
845
846 /*
847 * Return length of UTF-8 character, obtained from the first byte.
848 * "b" must be between 0 and 255!
849 * Returns 1 for an invalid first byte value.
850 */
utf_byte2len(int b)851 int utf_byte2len(int b)
852 {
853 return utf8len_tab[b];
854 }
855
856 /*
857 * Get the length of UTF-8 byte sequence "p[size]". Does not include any
858 * following composing characters.
859 * Returns 1 for "".
860 * Returns 1 for an illegal byte sequence (also in incomplete byte seq.).
861 * Returns number > "size" for an incomplete byte sequence.
862 * Never returns zero.
863 */
utf_ptr2len_len(const char_u * p,int size)864 int utf_ptr2len_len(const char_u *p, int size)
865 {
866 int len;
867 int i;
868 int m;
869
870 len = utf8len_tab[*p];
871 if (len == 1) {
872 return 1; // NUL, ascii or illegal lead byte
873 }
874 if (len > size) {
875 m = size; // incomplete byte sequence.
876 } else {
877 m = len;
878 }
879 for (i = 1; i < m; ++i) {
880 if ((p[i] & 0xc0) != 0x80) {
881 return 1;
882 }
883 }
884 return len;
885 }
886
887 /// Return the number of bytes occupied by a UTF-8 character in a string
888 ///
889 /// This includes following composing characters.
utfc_ptr2len(const char_u * const p)890 int utfc_ptr2len(const char_u *const p)
891 FUNC_ATTR_PURE FUNC_ATTR_WARN_UNUSED_RESULT FUNC_ATTR_NONNULL_ALL
892 {
893 uint8_t b0 = (uint8_t)(*p);
894
895 if (b0 == NUL) {
896 return 0;
897 }
898 if (b0 < 0x80 && p[1] < 0x80) { // be quick for ASCII
899 return 1;
900 }
901
902 // Skip over first UTF-8 char, stopping at a NUL byte.
903 int len = utf_ptr2len(p);
904
905 // Check for illegal byte.
906 if (len == 1 && b0 >= 0x80) {
907 return 1;
908 }
909
910 // Check for composing characters. We can handle only the first six, but
911 // skip all of them (otherwise the cursor would get stuck).
912 int prevlen = 0;
913 for (;;) {
914 if (p[len] < 0x80 || !utf_composinglike(p + prevlen, p + len)) {
915 return len;
916 }
917
918 // Skip over composing char.
919 prevlen = len;
920 len += utf_ptr2len(p + len);
921 }
922 }
923
924 /*
925 * Return the number of bytes the UTF-8 encoding of the character at "p[size]"
926 * takes. This includes following composing characters.
927 * Returns 0 for an empty string.
928 * Returns 1 for an illegal char or an incomplete byte sequence.
929 */
utfc_ptr2len_len(const char_u * p,int size)930 int utfc_ptr2len_len(const char_u *p, int size)
931 {
932 int len;
933 int prevlen;
934
935 if (size < 1 || *p == NUL) {
936 return 0;
937 }
938 if (p[0] < 0x80 && (size == 1 || p[1] < 0x80)) { // be quick for ASCII
939 return 1;
940 }
941
942 // Skip over first UTF-8 char, stopping at a NUL byte.
943 len = utf_ptr2len_len(p, size);
944
945 // Check for illegal byte and incomplete byte sequence.
946 if ((len == 1 && p[0] >= 0x80) || len > size) {
947 return 1;
948 }
949
950 /*
951 * Check for composing characters. We can handle only the first six, but
952 * skip all of them (otherwise the cursor would get stuck).
953 */
954 prevlen = 0;
955 while (len < size) {
956 int len_next_char;
957
958 if (p[len] < 0x80) {
959 break;
960 }
961
962 /*
963 * Next character length should not go beyond size to ensure that
964 * utf_composinglike(...) does not read beyond size.
965 */
966 len_next_char = utf_ptr2len_len(p + len, size - len);
967 if (len_next_char > size - len) {
968 break;
969 }
970
971 if (!utf_composinglike(p + prevlen, p + len)) {
972 break;
973 }
974
975 // Skip over composing char
976 prevlen = len;
977 len += len_next_char;
978 }
979 return len;
980 }
981
/// Determine how many bytes certain unicode codepoint will occupy
///
/// @param  c  Character value.
///
/// @return Number of bytes (1-6) utf_char2bytes() writes for "c"; values
///         below 0x80, including negative ones, give 1.
int utf_char2len(const int c)
{
  // limits[i] is the smallest value that needs more than i + 1 bytes.
  static const int limits[] = { 0x80, 0x800, 0x10000, 0x200000, 0x4000000 };
  const int n_limits = (int)(sizeof(limits) / sizeof(limits[0]));
  int len = 1;

  for (int i = 0; i < n_limits && c >= limits[i]; i++) {
    len++;
  }
  return len;
}
999
1000 /// Convert Unicode character to UTF-8 string
1001 ///
1002 /// @param c character to convert to \p buf
1003 /// @param[out] buf UTF-8 string generated from \p c, does not add \0
1004 /// @return Number of bytes (1-6).
utf_char2bytes(const int c,char_u * const buf)1005 int utf_char2bytes(const int c, char_u *const buf)
1006 {
1007 if (c < 0x80) { // 7 bits
1008 buf[0] = c;
1009 return 1;
1010 } else if (c < 0x800) { // 11 bits
1011 buf[0] = 0xc0 + ((unsigned)c >> 6);
1012 buf[1] = 0x80 + (c & 0x3f);
1013 return 2;
1014 } else if (c < 0x10000) { // 16 bits
1015 buf[0] = 0xe0 + ((unsigned)c >> 12);
1016 buf[1] = 0x80 + (((unsigned)c >> 6) & 0x3f);
1017 buf[2] = 0x80 + (c & 0x3f);
1018 return 3;
1019 } else if (c < 0x200000) { // 21 bits
1020 buf[0] = 0xf0 + ((unsigned)c >> 18);
1021 buf[1] = 0x80 + (((unsigned)c >> 12) & 0x3f);
1022 buf[2] = 0x80 + (((unsigned)c >> 6) & 0x3f);
1023 buf[3] = 0x80 + (c & 0x3f);
1024 return 4;
1025 } else if (c < 0x4000000) { // 26 bits
1026 buf[0] = 0xf8 + ((unsigned)c >> 24);
1027 buf[1] = 0x80 + (((unsigned)c >> 18) & 0x3f);
1028 buf[2] = 0x80 + (((unsigned)c >> 12) & 0x3f);
1029 buf[3] = 0x80 + (((unsigned)c >> 6) & 0x3f);
1030 buf[4] = 0x80 + (c & 0x3f);
1031 return 5;
1032 } else { // 31 bits
1033 buf[0] = 0xfc + ((unsigned)c >> 30);
1034 buf[1] = 0x80 + (((unsigned)c >> 24) & 0x3f);
1035 buf[2] = 0x80 + (((unsigned)c >> 18) & 0x3f);
1036 buf[3] = 0x80 + (((unsigned)c >> 12) & 0x3f);
1037 buf[4] = 0x80 + (((unsigned)c >> 6) & 0x3f);
1038 buf[5] = 0x80 + (c & 0x3f);
1039 return 6;
1040 }
1041 }
1042
1043 /*
1044 * Return true if "c" is a composing UTF-8 character. This means it will be
1045 * drawn on top of the preceding character.
1046 * Based on code from Markus Kuhn.
1047 */
utf_iscomposing(int c)1048 bool utf_iscomposing(int c)
1049 {
1050 return intable(combining, ARRAY_SIZE(combining), c);
1051 }
1052
1053 /*
1054 * Return true for characters that can be displayed in a normal way.
1055 * Only for characters of 0x100 and above!
1056 */
utf_printable(int c)1057 bool utf_printable(int c)
1058 {
1059 #ifdef USE_WCHAR_FUNCTIONS
1060 /*
1061 * Assume the iswprint() library function works better than our own stuff.
1062 */
1063 return iswprint(c);
1064 #else
1065 // Sorted list of non-overlapping intervals.
1066 // 0xd800-0xdfff is reserved for UTF-16, actually illegal.
1067 static struct interval nonprint[] =
1068 {
1069 { 0x070f, 0x070f }, { 0x180b, 0x180e }, { 0x200b, 0x200f }, { 0x202a, 0x202e },
1070 { 0x206a, 0x206f }, { 0xd800, 0xdfff }, { 0xfeff, 0xfeff }, { 0xfff9, 0xfffb },
1071 { 0xfffe, 0xffff }
1072 };
1073
1074 return !intable(nonprint, ARRAY_SIZE(nonprint), c);
1075 #endif
1076 }
1077
1078 /*
1079 * Get class of a Unicode character.
1080 * 0: white space
1081 * 1: punctuation
1082 * 2 or bigger: some class of word character.
1083 */
utf_class(const int c)1084 int utf_class(const int c)
1085 {
1086 return utf_class_tab(c, curbuf->b_chartab);
1087 }
1088
/// Get the class of the Unicode character "c".
///
/// Characters below 0x100 are classified with "chartab" ('iskeyword');
/// everything else is looked up in a built-in interval table and, when not
/// found there, in the emoji table.
///
/// @param c        Character value to classify.
/// @param chartab  Table handed to vim_iswordc_tab() for characters < 0x100.
///
/// @return 0 for white space, 1 for punctuation, 2 or bigger for some class
///         of word character (3 is returned for emoji).
int utf_class_tab(const int c, const uint64_t *const chartab)
{
  // sorted list of non-overlapping intervals
  // NOTE(review): the entry { 0x2060, 0x27ff, 1 } below spans the
  // superscript, subscript and "all kinds of symbols" entries that follow
  // it, so the table is not strictly non-overlapping.  Which entry the
  // binary search reports for a character in that range depends on the
  // probe order.  Confirm whether 0x206f was meant as the end of that entry.
  static struct clinterval {
    unsigned int first;
    unsigned int last;
    unsigned int class;
  } classes[] = {
    { 0x037e, 0x037e, 1 },              // Greek question mark
    { 0x0387, 0x0387, 1 },              // Greek ano teleia
    { 0x055a, 0x055f, 1 },              // Armenian punctuation
    { 0x0589, 0x0589, 1 },              // Armenian full stop
    { 0x05be, 0x05be, 1 },
    { 0x05c0, 0x05c0, 1 },
    { 0x05c3, 0x05c3, 1 },
    { 0x05f3, 0x05f4, 1 },
    { 0x060c, 0x060c, 1 },
    { 0x061b, 0x061b, 1 },
    { 0x061f, 0x061f, 1 },
    { 0x066a, 0x066d, 1 },
    { 0x06d4, 0x06d4, 1 },
    { 0x0700, 0x070d, 1 },              // Syriac punctuation
    { 0x0964, 0x0965, 1 },
    { 0x0970, 0x0970, 1 },
    { 0x0df4, 0x0df4, 1 },
    { 0x0e4f, 0x0e4f, 1 },
    { 0x0e5a, 0x0e5b, 1 },
    { 0x0f04, 0x0f12, 1 },
    { 0x0f3a, 0x0f3d, 1 },
    { 0x0f85, 0x0f85, 1 },
    { 0x104a, 0x104f, 1 },              // Myanmar punctuation
    { 0x10fb, 0x10fb, 1 },              // Georgian punctuation
    { 0x1361, 0x1368, 1 },              // Ethiopic punctuation
    { 0x166d, 0x166e, 1 },              // Canadian Syl. punctuation
    { 0x1680, 0x1680, 0 },
    { 0x169b, 0x169c, 1 },
    { 0x16eb, 0x16ed, 1 },
    { 0x1735, 0x1736, 1 },
    { 0x17d4, 0x17dc, 1 },              // Khmer punctuation
    { 0x1800, 0x180a, 1 },              // Mongolian punctuation
    { 0x2000, 0x200b, 0 },              // spaces
    { 0x200c, 0x2027, 1 },              // punctuation and symbols
    { 0x2028, 0x2029, 0 },
    { 0x202a, 0x202e, 1 },              // punctuation and symbols
    { 0x202f, 0x202f, 0 },
    { 0x2030, 0x205e, 1 },              // punctuation and symbols
    { 0x205f, 0x205f, 0 },
    { 0x2060, 0x27ff, 1 },              // punctuation and symbols
    { 0x2070, 0x207f, 0x2070 },         // superscript
    { 0x2080, 0x2094, 0x2080 },         // subscript
    { 0x20a0, 0x27ff, 1 },              // all kinds of symbols
    { 0x2800, 0x28ff, 0x2800 },         // braille
    { 0x2900, 0x2998, 1 },              // arrows, brackets, etc.
    { 0x29d8, 0x29db, 1 },
    { 0x29fc, 0x29fd, 1 },
    { 0x2e00, 0x2e7f, 1 },              // supplemental punctuation
    { 0x3000, 0x3000, 0 },              // ideographic space
    { 0x3001, 0x3020, 1 },              // ideographic punctuation
    { 0x3030, 0x3030, 1 },
    { 0x303d, 0x303d, 1 },
    { 0x3040, 0x309f, 0x3040 },         // Hiragana
    { 0x30a0, 0x30ff, 0x30a0 },         // Katakana
    { 0x3300, 0x9fff, 0x4e00 },         // CJK Ideographs
    { 0xac00, 0xd7a3, 0xac00 },         // Hangul Syllables
    { 0xf900, 0xfaff, 0x4e00 },         // CJK Ideographs
    { 0xfd3e, 0xfd3f, 1 },
    { 0xfe30, 0xfe6b, 1 },              // punctuation forms
    { 0xff00, 0xff0f, 1 },              // half/fullwidth ASCII
    { 0xff1a, 0xff20, 1 },              // half/fullwidth ASCII
    { 0xff3b, 0xff40, 1 },              // half/fullwidth ASCII
    { 0xff5b, 0xff65, 1 },              // half/fullwidth ASCII
    { 0x1d000, 0x1d24f, 1 },            // Musical notation
    { 0x1d400, 0x1d7ff, 1 },            // Mathematical Alphanumeric Symbols
    { 0x1f000, 0x1f2ff, 1 },            // Game pieces; enclosed characters
    { 0x1f300, 0x1f9ff, 1 },            // Many symbol blocks
    { 0x20000, 0x2a6df, 0x4e00 },       // CJK Ideographs
    { 0x2a700, 0x2b73f, 0x4e00 },       // CJK Ideographs
    { 0x2b740, 0x2b81f, 0x4e00 },       // CJK Ideographs
    { 0x2f800, 0x2fa1f, 0x4e00 },       // CJK Ideographs
  };
  // Inclusive bounds of the part of "classes" still being searched.
  int bot = 0;
  int top = ARRAY_SIZE(classes) - 1;
  int mid;

  // First quick check for Latin1 characters, use 'iskeyword'.
  if (c < 0x100) {
    if (c == ' ' || c == '\t' || c == NUL || c == 0xa0) {
      return 0;             // blank
    }
    if (vim_iswordc_tab(c, chartab)) {
      return 2;             // word character
    }
    return 1;               // punctuation
  }

  // binary search in table
  while (top >= bot) {
    mid = (bot + top) / 2;
    if (classes[mid].last < (unsigned int)c) {
      // "c" lies above this interval: continue in the upper half.
      bot = mid + 1;
    } else if (classes[mid].first > (unsigned int)c) {
      // "c" lies below this interval: continue in the lower half.
      top = mid - 1;
    } else {
      // "c" is inside this interval.
      return (int)classes[mid].class;
    }
  }

  // emoji
  if (intable(emoji_all, ARRAY_SIZE(emoji_all), c)) {
    return 3;
  }

  // most other characters are "word" characters
  return 2;
}
1204
utf_ambiguous_width(int c)1205 bool utf_ambiguous_width(int c)
1206 {
1207 return c >= 0x80 && (intable(ambiguous, ARRAY_SIZE(ambiguous), c)
1208 || intable(emoji_all, ARRAY_SIZE(emoji_all), c));
1209 }
1210
1211 /*
1212 * Generic conversion function for case operations.
1213 * Return the converted equivalent of "a", which is a UCS-4 character. Use
1214 * the given conversion "table". Uses binary search on "table".
1215 */
utf_convert(int a,const convertStruct * const table,size_t n_items)1216 static int utf_convert(int a, const convertStruct *const table, size_t n_items)
1217 {
1218 size_t start, mid, end; // indices into table
1219
1220 start = 0;
1221 end = n_items;
1222 while (start < end) {
1223 // need to search further
1224 mid = (end + start) / 2;
1225 if (table[mid].rangeEnd < a) {
1226 start = mid + 1;
1227 } else {
1228 end = mid;
1229 }
1230 }
1231 if (start < n_items
1232 && table[start].rangeStart <= a
1233 && a <= table[start].rangeEnd
1234 && (a - table[start].rangeStart) % table[start].step == 0) {
1235 return a + table[start].offset;
1236 } else {
1237 return a;
1238 }
1239 }
1240
1241 /*
1242 * Return the folded-case equivalent of "a", which is a UCS-4 character. Uses
1243 * simple case folding.
1244 */
utf_fold(int a)1245 int utf_fold(int a)
1246 {
1247 if (a < 0x80) {
1248 // be fast for ASCII
1249 return a >= 0x41 && a <= 0x5a ? a + 32 : a;
1250 }
1251 return utf_convert(a, foldCase, ARRAY_SIZE(foldCase));
1252 }
1253
1254 // Vim's own character class functions. These exist because many library
1255 // islower()/toupper() etc. do not work properly: they crash when used with
1256 // invalid values or can't handle latin1 when the locale is C.
1257 // Speed is most important here.
1258
1259 /// Return the upper-case equivalent of "a", which is a UCS-4 character. Use
1260 /// simple case folding.
mb_toupper(int a)1261 int mb_toupper(int a)
1262 {
1263 // If 'casemap' contains "keepascii" use ASCII style toupper().
1264 if (a < 128 && (cmp_flags & CMP_KEEPASCII)) {
1265 return TOUPPER_ASC(a);
1266 }
1267
1268 #if defined(__STDC_ISO_10646__)
1269 // If towupper() is available and handles Unicode, use it.
1270 if (!(cmp_flags & CMP_INTERNAL)) {
1271 return towupper(a);
1272 }
1273 #endif
1274
1275 // For characters below 128 use locale sensitive toupper().
1276 if (a < 128) {
1277 return TOUPPER_LOC(a);
1278 }
1279
1280 // For any other characters use the above mapping table.
1281 return utf_convert(a, toUpper, ARRAY_SIZE(toUpper));
1282 }
1283
/// Check whether "a" is a lower-case character.
///
/// @param a  Character to test.
///
/// @return true if mb_toupper() changes "a", or if "a" is 0xdf.
bool mb_islower(int a)
{
  // German sharp s is lower case but has no upper case equivalent.
  if (a == 0xdf) {
    return true;
  }

  const int upper = mb_toupper(a);
  return upper != a;
}
1289
1290 /// Return the lower-case equivalent of "a", which is a UCS-4 character. Use
1291 /// simple case folding.
mb_tolower(int a)1292 int mb_tolower(int a)
1293 {
1294 // If 'casemap' contains "keepascii" use ASCII style tolower().
1295 if (a < 128 && (cmp_flags & CMP_KEEPASCII)) {
1296 return TOLOWER_ASC(a);
1297 }
1298
1299 #if defined(__STDC_ISO_10646__)
1300 // If towlower() is available and handles Unicode, use it.
1301 if (!(cmp_flags & CMP_INTERNAL)) {
1302 return towlower(a);
1303 }
1304 #endif
1305
1306 // For characters below 128 use locale sensitive tolower().
1307 if (a < 128) {
1308 return TOLOWER_LOC(a);
1309 }
1310
1311 // For any other characters use the above mapping table.
1312 return utf_convert(a, toLower, ARRAY_SIZE(toLower));
1313 }
1314
/// Check whether "a" is an upper-case character.
///
/// @param a  Character to test.
///
/// @return true if mb_tolower() changes "a".
bool mb_isupper(int a)
{
  const int lower = mb_tolower(a);

  return lower != a;
}
1319
/// Compare two strings case-insensitively, each with its own length limit.
///
/// Characters are read with utf_safe_read_char_adv() and compared after
/// utf_fold().  As soon as a read yields a value <= 0 the character-wise
/// loop stops: 0 is treated as "string ended", any other value as an
/// incomplete/illegal sequence, after which the comparison continues byte
/// by byte.
///
/// @param s1  First string.
/// @param s2  Second string.
/// @param n1  Length limit for "s1" (presumably in bytes: it is decremented
///            once per byte in the bytewise loop below).
/// @param n2  Length limit for "s2", same unit as "n1".
///
/// @return 0 if the strings are equal ignoring case, a negative value if
///         "s1" sorts before "s2", a positive value otherwise.
static int utf_strnicmp(const char_u *s1, const char_u *s2, size_t n1, size_t n2)
{
  int c1, c2, cdiff;
  // Receives one folded character from utf_char2bytes(), which writes at
  // most 6 bytes.
  char_u buffer[6];

  for (;;) {
    // Both reads advance their string pointer and shrink their length.
    c1 = utf_safe_read_char_adv(&s1, &n1);
    c2 = utf_safe_read_char_adv(&s2, &n2);

    if (c1 <= 0 || c2 <= 0) {
      break;
    }

    if (c1 == c2) {
      continue;
    }

    cdiff = utf_fold(c1) - utf_fold(c2);
    if (cdiff != 0) {
      return cdiff;
    }
  }

  // some string ended or has an incomplete/illegal character sequence

  if (c1 == 0 || c2 == 0) {
    // some string ended. shorter string is smaller
    if (c1 == 0 && c2 == 0) {
      return 0;
    }
    return c1 == 0 ? -1 : 1;
  }

  // Continue with bytewise comparison to produce some result that
  // would make comparison operations involving this function transitive.
  //
  // If only one string had an error, comparison should be made with
  // folded version of the other string. In this case it is enough
  // to fold just one character to determine the result of comparison.

  if (c1 != -1 && c2 == -1) {
    // Only "s2" had an error: compare against the folded bytes of "c1".
    n1 = utf_char2bytes(utf_fold(c1), buffer);
    s1 = buffer;
  } else if (c2 != -1 && c1 == -1) {
    // Only "s1" had an error: compare against the folded bytes of "c2".
    n2 = utf_char2bytes(utf_fold(c2), buffer);
    s2 = buffer;
  }

  while (n1 > 0 && n2 > 0 && *s1 != NUL && *s2 != NUL) {
    cdiff = (int)(*s1) - (int)(*s2);
    if (cdiff != 0) {
      return cdiff;
    }

    s1++;
    s2++;
    n1--;
    n2--;
  }

  // A NUL before the length limit counts as the end of that string.
  if (n1 > 0 && *s1 == NUL) {
    n1 = 0;
  }
  if (n2 > 0 && *s2 == NUL) {
    n2 = 0;
  }

  if (n1 == 0 && n2 == 0) {
    return 0;
  }
  // The string that ran out first is the smaller one.
  return n1 == 0 ? -1 : 1;
}
1392
1393 #ifdef WIN32
1394 # ifndef CP_UTF8
1395 # define CP_UTF8 65001 // magic number from winnls.h
1396 # endif
1397
1398 /// Converts string from UTF-8 to UTF-16.
1399 ///
1400 /// @param utf8 UTF-8 string.
1401 /// @param utf8len Length of `utf8`. May be -1 if `utf8` is NUL-terminated.
1402 /// @param utf16[out,allocated] NUL-terminated UTF-16 string, or NULL on error
1403 /// @return 0 on success, or libuv error code
utf8_to_utf16(const char * utf8,int utf8len,wchar_t ** utf16)1404 int utf8_to_utf16(const char *utf8, int utf8len, wchar_t **utf16)
1405 FUNC_ATTR_NONNULL_ALL
1406 {
1407 // Compute the length needed for the converted UTF-16 string.
1408 int bufsize = MultiByteToWideChar(CP_UTF8,
1409 0, // dwFlags: must be 0 for UTF-8
1410 utf8, // -1: process up to NUL
1411 utf8len,
1412 NULL,
1413 0); // 0: get length, don't convert
1414 if (bufsize == 0) {
1415 *utf16 = NULL;
1416 return uv_translate_sys_error(GetLastError());
1417 }
1418
1419 // Allocate the destination buffer adding an extra byte for the terminating
1420 // NULL. If `utf8len` is not -1 MultiByteToWideChar will not add it, so
1421 // we do it ourselves always, just in case.
1422 *utf16 = xmalloc(sizeof(wchar_t) * (bufsize + 1));
1423
1424 // Convert to UTF-16.
1425 bufsize = MultiByteToWideChar(CP_UTF8, 0, utf8, utf8len, *utf16, bufsize);
1426 if (bufsize == 0) {
1427 XFREE_CLEAR(*utf16);
1428 return uv_translate_sys_error(GetLastError());
1429 }
1430
1431 (*utf16)[bufsize] = L'\0';
1432 return 0;
1433 }
1434
1435 /// Converts string from UTF-16 to UTF-8.
1436 ///
1437 /// @param utf16 UTF-16 string.
1438 /// @param utf16len Length of `utf16`. May be -1 if `utf16` is NUL-terminated.
1439 /// @param utf8[out,allocated] NUL-terminated UTF-8 string, or NULL on error
1440 /// @return 0 on success, or libuv error code
utf16_to_utf8(const wchar_t * utf16,int utf16len,char ** utf8)1441 int utf16_to_utf8(const wchar_t *utf16, int utf16len, char **utf8)
1442 FUNC_ATTR_NONNULL_ALL
1443 {
1444 // Compute the space needed for the converted UTF-8 string.
1445 DWORD bufsize = WideCharToMultiByte(CP_UTF8,
1446 0,
1447 utf16,
1448 utf16len,
1449 NULL,
1450 0,
1451 NULL,
1452 NULL);
1453 if (bufsize == 0) {
1454 *utf8 = NULL;
1455 return uv_translate_sys_error(GetLastError());
1456 }
1457
1458 // Allocate the destination buffer adding an extra byte for the terminating
1459 // NULL. If `utf16len` is not -1 WideCharToMultiByte will not add it, so
1460 // we do it ourselves always, just in case.
1461 *utf8 = xmalloc(bufsize + 1);
1462
1463 // Convert to UTF-8.
1464 bufsize = WideCharToMultiByte(CP_UTF8,
1465 0,
1466 utf16,
1467 utf16len,
1468 *utf8,
1469 bufsize,
1470 NULL,
1471 NULL);
1472 if (bufsize == 0) {
1473 XFREE_CLEAR(*utf8);
1474 return uv_translate_sys_error(GetLastError());
1475 }
1476
1477 (*utf8)[bufsize] = '\0';
1478 return 0;
1479 }
1480
1481 #endif
1482
1483 /// Measure the length of a string in corresponding UTF-32 and UTF-16 units.
1484 ///
1485 /// Invalid UTF-8 bytes, or embedded surrogates, count as one code point/unit
1486 /// each.
1487 ///
1488 /// The out parameters are incremented. This is used to measure the size of
1489 /// a buffer region consisting of multiple line segments.
1490 ///
1491 /// @param s the string
1492 /// @param len maximum length (an earlier NUL terminates)
1493 /// @param[out] codepoints incremented with UTF-32 code point size
1494 /// @param[out] codeunits incremented with UTF-16 code unit size
mb_utflen(const char_u * s,size_t len,size_t * codepoints,size_t * codeunits)1495 void mb_utflen(const char_u *s, size_t len, size_t *codepoints, size_t *codeunits)
1496 FUNC_ATTR_NONNULL_ALL
1497 {
1498 size_t count = 0, extra = 0;
1499 size_t clen;
1500 for (size_t i = 0; i < len && s[i] != NUL; i += clen) {
1501 clen = utf_ptr2len_len(s+i, len-i);
1502 // NB: gets the byte value of invalid sequence bytes.
1503 // we only care whether the char fits in the BMP or not
1504 int c = (clen > 1) ? utf_ptr2char(s+i) : s[i];
1505 count++;
1506 if (c > 0xFFFF) {
1507 extra++;
1508 }
1509 }
1510 *codepoints += count;
1511 *codeunits += count + extra;
1512 }
1513
mb_utf_index_to_bytes(const char_u * s,size_t len,size_t index,bool use_utf16_units)1514 ssize_t mb_utf_index_to_bytes(const char_u *s, size_t len, size_t index, bool use_utf16_units)
1515 FUNC_ATTR_NONNULL_ALL
1516 {
1517 size_t count = 0;
1518 size_t clen, i;
1519 if (index == 0) {
1520 return 0;
1521 }
1522 for (i = 0; i < len && s[i] != NUL; i += clen) {
1523 clen = utf_ptr2len_len(s+i, len-i);
1524 // NB: gets the byte value of invalid sequence bytes.
1525 // we only care whether the char fits in the BMP or not
1526 int c = (clen > 1) ? utf_ptr2char(s+i) : s[i];
1527 count++;
1528 if (use_utf16_units && c > 0xFFFF) {
1529 count++;
1530 }
1531 if (count >= index) {
1532 return i+clen;
1533 }
1534 }
1535 return -1;
1536 }
1537
1538
1539 /*
1540 * Version of strnicmp() that handles multi-byte characters.
1541 * Needed for Big5, Shift-JIS and UTF-8 encoding. Other DBCS encodings can
1542 * probably use strnicmp(), because there are no ASCII characters in the
1543 * second byte.
1544 * Returns zero if s1 and s2 are equal (ignoring case), the difference between
1545 * two characters otherwise.
1546 */
mb_strnicmp(const char_u * s1,const char_u * s2,const size_t nn)1547 int mb_strnicmp(const char_u *s1, const char_u *s2, const size_t nn)
1548 {
1549 return utf_strnicmp(s1, s2, nn, nn);
1550 }
1551
1552 /// Compare strings case-insensitively
1553 ///
1554 /// @note We need to call mb_stricmp() even when we aren't dealing with
1555 /// a multi-byte encoding because mb_stricmp() takes care of all ASCII and
1556 /// non-ascii encodings, including characters with umlauts in latin1,
1557 /// etc., while STRICMP() only handles the system locale version, which
1558 /// often does not handle non-ascii properly.
1559 ///
1560 /// @param[in] s1 First string to compare, not more then #MAXCOL characters.
1561 /// @param[in] s2 Second string to compare, not more then #MAXCOL characters.
1562 ///
1563 /// @return 0 if strings are equal, <0 if s1 < s2, >0 if s1 > s2.
mb_stricmp(const char * s1,const char * s2)1564 int mb_stricmp(const char *s1, const char *s2)
1565 {
1566 return mb_strnicmp((const char_u *)s1, (const char_u *)s2, MAXCOL);
1567 }
1568
1569 /*
1570 * "g8": show bytes of the UTF-8 char under the cursor. Doesn't matter what
1571 * 'encoding' has been set to.
1572 */
show_utf8(void)1573 void show_utf8(void)
1574 {
1575 int len;
1576 int rlen = 0;
1577 char_u *line;
1578 int clen;
1579 int i;
1580
1581 // Get the byte length of the char under the cursor, including composing
1582 // characters.
1583 line = get_cursor_pos_ptr();
1584 len = utfc_ptr2len(line);
1585 if (len == 0) {
1586 msg("NUL");
1587 return;
1588 }
1589
1590 clen = 0;
1591 for (i = 0; i < len; ++i) {
1592 if (clen == 0) {
1593 // start of (composing) character, get its length
1594 if (i > 0) {
1595 STRCPY(IObuff + rlen, "+ ");
1596 rlen += 2;
1597 }
1598 clen = utf_ptr2len(line + i);
1599 }
1600 sprintf((char *)IObuff + rlen, "%02x ",
1601 (line[i] == NL) ? NUL : line[i]); // NUL is stored as NL
1602 --clen;
1603 rlen += (int)STRLEN(IObuff + rlen);
1604 if (rlen > IOSIZE - 20) {
1605 break;
1606 }
1607 }
1608
1609 msg((char *)IObuff);
1610 }
1611
/// Return offset from "p" to the first byte of the character it points into.
/// If "p" points to the NUL at the end of the string return 0.
/// Returns 0 when already at the first byte of a character.
///
/// Composing characters, and Arabic characters that combine with the
/// character before them, are treated as part of the preceding character, so
/// the offset reaches back to the first byte of that character.
///
/// @param base  Start of the string; the backwards scan stops there.
/// @param p     Position inside the string.
///
/// @return Number of bytes between the head byte and "p"; 0 is also returned
///         when the sequence is found to be illegal.
int utf_head_off(const char_u *base, const char_u *p)
{
  int c;
  int len;

  if (*p < 0x80) {              // be quick for ASCII
    return 0;
  }

  // Skip backwards over trailing bytes: 10xx.xxxx
  // Skip backwards again if on a composing char.
  const char_u *q;
  for (q = p;; --q) {
    // Move s to the last byte of this char.
    const char_u *s;
    for (s = q; (s[1] & 0xc0) == 0x80; ++s) {}

    // Move q to the first byte of this char.
    while (q > base && (*q & 0xc0) == 0x80) {
      --q;
    }
    // Check for illegal sequence. Do allow an illegal byte after where we
    // started.
    // The length looked up for the lead byte must match either the distance
    // to the last trail byte or the distance to "p".
    len = utf8len_tab[*q];
    if (len != (int)(s - q + 1) && len != (int)(p - q + 1)) {
      return 0;
    }

    // Reached the start of the string: nothing before it to combine with.
    if (q <= base) {
      break;
    }

    c = utf_ptr2char(q);
    if (utf_iscomposing(c)) {
      // Composing char: go on with the character before it.
      continue;
    }

    if (arabic_maycombine(c)) {
      // Advance to get a sneak-peak at the next char
      // (here that is the character preceding "q" in the string).
      const char_u *j = q;
      --j;
      // Move j to the first byte of this char.
      while (j > base && (*j & 0xc0) == 0x80) {
        --j;
      }
      if (arabic_combine(utf_ptr2char(j), c)) {
        continue;
      }
    }
    break;
  }

  return (int)(p - q);
}
1669
1670 // Whether space is NOT allowed before/after 'c'.
utf_eat_space(int cc)1671 bool utf_eat_space(int cc)
1672 FUNC_ATTR_CONST FUNC_ATTR_WARN_UNUSED_RESULT
1673 {
1674 return (cc >= 0x2000 && cc <= 0x206F) // General punctuations
1675 || (cc >= 0x2e00 && cc <= 0x2e7f) // Supplemental punctuations
1676 || (cc >= 0x3000 && cc <= 0x303f) // CJK symbols and punctuations
1677 || (cc >= 0xff01 && cc <= 0xff0f) // Full width ASCII punctuations
1678 || (cc >= 0xff1a && cc <= 0xff20) // ..
1679 || (cc >= 0xff3b && cc <= 0xff40) // ..
1680 || (cc >= 0xff5b && cc <= 0xff65); // ..
1681 }
1682
1683 // Whether line break is allowed before "cc".
utf_allow_break_before(int cc)1684 bool utf_allow_break_before(int cc)
1685 FUNC_ATTR_CONST FUNC_ATTR_WARN_UNUSED_RESULT
1686 {
1687 static const int BOL_prohibition_punct[] = {
1688 '!',
1689 '%',
1690 ')',
1691 ',',
1692 ':',
1693 ';',
1694 '>',
1695 '?',
1696 ']',
1697 '}',
1698 0x2019, // ’ right single quotation mark
1699 0x201d, // ” right double quotation mark
1700 0x2020, // † dagger
1701 0x2021, // ‡ double dagger
1702 0x2026, // … horizontal ellipsis
1703 0x2030, // ‰ per mille sign
1704 0x2031, // ‱ per then thousand sign
1705 0x203c, // ‼ double exclamation mark
1706 0x2047, // ⁇ double question mark
1707 0x2048, // ⁈ question exclamation mark
1708 0x2049, // ⁉ exclamation question mark
1709 0x2103, // ℃ degree celsius
1710 0x2109, // ℉ degree fahrenheit
1711 0x3001, // 、 ideographic comma
1712 0x3002, // 。 ideographic full stop
1713 0x3009, // 〉 right angle bracket
1714 0x300b, // 》 right double angle bracket
1715 0x300d, // 」 right corner bracket
1716 0x300f, // 』 right white corner bracket
1717 0x3011, // 】 right black lenticular bracket
1718 0x3015, // 〕 right tortoise shell bracket
1719 0x3017, // 〗 right white lenticular bracket
1720 0x3019, // 〙 right white tortoise shell bracket
1721 0x301b, // 〛 right white square bracket
1722 0xff01, // ! fullwidth exclamation mark
1723 0xff09, // ) fullwidth right parenthesis
1724 0xff0c, // , fullwidth comma
1725 0xff0e, // . fullwidth full stop
1726 0xff1a, // : fullwidth colon
1727 0xff1b, // ; fullwidth semicolon
1728 0xff1f, // ? fullwidth question mark
1729 0xff3d, // ] fullwidth right square bracket
1730 0xff5d, // } fullwidth right curly bracket
1731 };
1732
1733 int first = 0;
1734 int last = ARRAY_SIZE(BOL_prohibition_punct) - 1;
1735
1736 while (first < last) {
1737 const int mid = (first + last) / 2;
1738
1739 if (cc == BOL_prohibition_punct[mid]) {
1740 return false;
1741 } else if (cc > BOL_prohibition_punct[mid]) {
1742 first = mid + 1;
1743 } else {
1744 last = mid - 1;
1745 }
1746 }
1747
1748 return cc != BOL_prohibition_punct[first];
1749 }
1750
1751 // Whether line break is allowed after "cc".
utf_allow_break_after(int cc)1752 bool utf_allow_break_after(int cc)
1753 FUNC_ATTR_CONST FUNC_ATTR_WARN_UNUSED_RESULT
1754 {
1755 static const int EOL_prohibition_punct[] = {
1756 '(',
1757 '<',
1758 '[',
1759 '`',
1760 '{',
1761 // 0x2014, // — em dash
1762 0x2018, // ‘ left single quotation mark
1763 0x201c, // “ left double quotation mark
1764 // 0x2053, // ~ swung dash
1765 0x3008, // 〈 left angle bracket
1766 0x300a, // 《 left double angle bracket
1767 0x300c, // 「 left corner bracket
1768 0x300e, // 『 left white corner bracket
1769 0x3010, // 【 left black lenticular bracket
1770 0x3014, // 〔 left tortoise shell bracket
1771 0x3016, // 〖 left white lenticular bracket
1772 0x3018, // 〘 left white tortoise shell bracket
1773 0x301a, // 〚 left white square bracket
1774 0xff08, // ( fullwidth left parenthesis
1775 0xff3b, // [ fullwidth left square bracket
1776 0xff5b, // { fullwidth left curly bracket
1777 };
1778
1779 int first = 0;
1780 int last = ARRAY_SIZE(EOL_prohibition_punct) - 1;
1781
1782 while (first < last) {
1783 const int mid = (first + last)/2;
1784
1785 if (cc == EOL_prohibition_punct[mid]) {
1786 return false;
1787 } else if (cc > EOL_prohibition_punct[mid]) {
1788 first = mid + 1;
1789 } else {
1790 last = mid - 1;
1791 }
1792 }
1793
1794 return cc != EOL_prohibition_punct[first];
1795 }
1796
1797 // Whether line break is allowed between "cc" and "ncc".
utf_allow_break(int cc,int ncc)1798 bool utf_allow_break(int cc, int ncc)
1799 FUNC_ATTR_CONST FUNC_ATTR_WARN_UNUSED_RESULT
1800 {
1801 // don't break between two-letter punctuations
1802 if (cc == ncc
1803 && (cc == 0x2014 // em dash
1804 || cc == 0x2026)) { // horizontal ellipsis
1805 return false;
1806 }
1807 return utf_allow_break_after(cc) && utf_allow_break_before(ncc);
1808 }
1809
1810 /// Copy a character, advancing the pointers
1811 ///
1812 /// @param[in,out] fp Source of the character to copy.
1813 /// @param[in,out] tp Destination to copy to.
mb_copy_char(const char_u ** const fp,char_u ** const tp)1814 void mb_copy_char(const char_u **const fp, char_u **const tp)
1815 {
1816 const size_t l = (size_t)utfc_ptr2len(*fp);
1817
1818 memmove(*tp, *fp, l);
1819 *tp += l;
1820 *fp += l;
1821 }
1822
1823 /*
1824 * Return the offset from "p" to the first byte of a character. When "p" is
1825 * at the start of a character 0 is returned, otherwise the offset to the next
1826 * character. Can start anywhere in a stream of bytes.
1827 */
mb_off_next(char_u * base,char_u * p)1828 int mb_off_next(char_u *base, char_u *p)
1829 {
1830 int i;
1831 int j;
1832
1833 if (*p < 0x80) { // be quick for ASCII
1834 return 0;
1835 }
1836
1837 // Find the next character that isn't 10xx.xxxx
1838 for (i = 0; (p[i] & 0xc0) == 0x80; i++) {}
1839 if (i > 0) {
1840 // Check for illegal sequence.
1841 for (j = 0; p - j > base; j++) {
1842 if ((p[-j] & 0xc0) != 0x80) {
1843 break;
1844 }
1845 }
1846 if (utf8len_tab[p[-j]] != i + j) {
1847 return 0;
1848 }
1849 }
1850 return i;
1851 }
1852
1853 /*
1854 * Return the offset from "p" to the last byte of the character it points
1855 * into. Can start anywhere in a stream of bytes.
1856 */
mb_tail_off(char_u * base,char_u * p)1857 int mb_tail_off(char_u *base, char_u *p)
1858 {
1859 int i;
1860 int j;
1861
1862 if (*p == NUL) {
1863 return 0;
1864 }
1865
1866 // Find the last character that is 10xx.xxxx
1867 for (i = 0; (p[i + 1] & 0xc0) == 0x80; i++) {}
1868
1869 // Check for illegal sequence.
1870 for (j = 0; p - j > base; j++) {
1871 if ((p[-j] & 0xc0) != 0x80) {
1872 break;
1873 }
1874 }
1875
1876 if (utf8len_tab[p[-j]] != i + j + 1) {
1877 return 0;
1878 }
1879 return i;
1880 }
1881
1882
1883 /// Return the offset from "p" to the first byte of the character it points
1884 /// into. Can start anywhere in a stream of bytes.
1885 ///
1886 /// @param[in] base Pointer to start of string
1887 /// @param[in] p Pointer to byte for which to return the offset to the previous codepoint
1888 //
1889 /// @return 0 if invalid sequence, else offset to previous codepoint
mb_head_off(char_u * base,char_u * p)1890 int mb_head_off(char_u *base, char_u *p)
1891 {
1892 int i;
1893 int j;
1894
1895 if (*p == NUL) {
1896 return 0;
1897 }
1898
1899 // Find the first character that is not 10xx.xxxx
1900 for (i = 0; p - i > base; i--) {
1901 if ((p[i] & 0xc0) != 0x80) {
1902 break;
1903 }
1904 }
1905
1906 // Find the last character that is 10xx.xxxx
1907 for (j = 0; (p[j + 1] & 0xc0) == 0x80; j++) {}
1908
1909 // Check for illegal sequence.
1910 if (utf8len_tab[p[i]] == 1) {
1911 return 0;
1912 }
1913 return i;
1914 }
1915
/// Find the next illegal byte sequence.
///
/// Searches from the cursor position to the end of the buffer.  When an
/// illegal sequence is found the cursor is moved onto it; otherwise the
/// cursor is put back where it was and beep_flush() is called.
void utf_find_illegal(void)
{
  // Remember the starting position so it can be restored on failure.
  pos_T pos = curwin->w_cursor;
  char_u *p;
  int len;
  vimconv_T vimconv;
  // Converted copy of the current line; only used when converting.
  char_u *tofree = NULL;

  vimconv.vc_type = CONV_NONE;
  if (enc_canon_props(curbuf->b_p_fenc) & ENC_8BIT) {
    // 'encoding' is "utf-8" but we are editing a 8-bit encoded file,
    // possibly a utf-8 file with illegal bytes.  Setup for conversion
    // from utf-8 to 'fileencoding'.
    convert_setup(&vimconv, p_enc, curbuf->b_p_fenc);
  }

  curwin->w_cursor.coladd = 0;
  for (;;) {
    // Text from the cursor to the end of the current line.
    p = get_cursor_pos_ptr();
    if (vimconv.vc_type != CONV_NONE) {
      xfree(tofree);
      tofree = string_convert(&vimconv, p, NULL);
      if (tofree == NULL) {
        // Conversion failed: give up (handled as "not found" below).
        break;
      }
      p = tofree;
    }

    while (*p != NUL) {
      // Illegal means that there are not enough trail bytes (checked by
      // utf_ptr2len()) or too many of them (overlong sequence).
      len = utf_ptr2len(p);
      if (*p >= 0x80 && (len == 1
                         || utf_char2len(utf_ptr2char(p)) != len)) {
        if (vimconv.vc_type == CONV_NONE) {
          // "p" points into the buffer line: advance by the byte distance.
          curwin->w_cursor.col += (colnr_T)(p - get_cursor_pos_ptr());
        } else {
          int l;

          // "p" points into the converted copy.  Advance the cursor over
          // one buffer character for each byte that precedes the illegal
          // one in the converted text.
          len = (int)(p - tofree);
          for (p = get_cursor_pos_ptr(); *p != NUL && len-- > 0; p += l) {
            l = utf_ptr2len(p);
            curwin->w_cursor.col += l;
          }
        }
        goto theend;
      }
      p += len;
    }
    if (curwin->w_cursor.lnum == curbuf->b_ml.ml_line_count) {
      // Last line of the buffer reached without finding anything.
      break;
    }
    // Continue at the start of the next line.
    ++curwin->w_cursor.lnum;
    curwin->w_cursor.col = 0;
  }

  // didn't find it: don't move and beep
  curwin->w_cursor = pos;
  beep_flush();

theend:
  xfree(tofree);
  // NOTE(review): presumably resets/releases the conversion set up above --
  // confirm against convert_setup().
  convert_setup(&vimconv, NULL, NULL);
}
1983
1984 /*
1985 * If the cursor moves on an trail byte, set the cursor on the lead byte.
1986 * Thus it moves left if necessary.
1987 */
mb_adjust_cursor(void)1988 void mb_adjust_cursor(void)
1989 {
1990 mark_mb_adjustpos(curbuf, &curwin->w_cursor);
1991 }
1992
1993 /// Checks and adjusts cursor column. Not mode-dependent.
1994 /// @see check_cursor_col_win
1995 ///
1996 /// @param win_ Places cursor on a valid column for this window.
mb_check_adjust_col(void * win_)1997 void mb_check_adjust_col(void *win_)
1998 {
1999 win_T *win = (win_T *)win_;
2000 colnr_T oldcol = win->w_cursor.col;
2001
2002 // Column 0 is always valid.
2003 if (oldcol != 0) {
2004 char_u *p = ml_get_buf(win->w_buffer, win->w_cursor.lnum, false);
2005 colnr_T len = (colnr_T)STRLEN(p);
2006
2007 // Empty line or invalid column?
2008 if (len == 0 || oldcol < 0) {
2009 win->w_cursor.col = 0;
2010 } else {
2011 // Cursor column too big for line?
2012 if (oldcol > len) {
2013 win->w_cursor.col = len - 1;
2014 }
2015 // Move the cursor to the head byte.
2016 win->w_cursor.col -= utf_head_off(p, p + win->w_cursor.col);
2017 }
2018
2019 // Reset `coladd` when the cursor would be on the right half of a
2020 // double-wide character.
2021 if (win->w_cursor.coladd == 1 && p[win->w_cursor.col] != TAB
2022 && vim_isprintc(utf_ptr2char(p + win->w_cursor.col))
2023 && ptr2cells(p + win->w_cursor.col) > 1) {
2024 win->w_cursor.coladd = 0;
2025 }
2026 }
2027 }
2028
2029 /// @param line start of the string
2030 ///
2031 /// @return a pointer to the character before "*p", if there is one.
mb_prevptr(char_u * line,char_u * p)2032 char_u *mb_prevptr(char_u *line, char_u *p)
2033 {
2034 if (p > line) {
2035 MB_PTR_BACK(line, p);
2036 }
2037 return p;
2038 }
2039
2040 /*
2041 * Return the character length of "str". Each multi-byte character (with
2042 * following composing characters) counts as one.
2043 */
mb_charlen(char_u * str)2044 int mb_charlen(char_u *str)
2045 {
2046 char_u *p = str;
2047 int count;
2048
2049 if (p == NULL) {
2050 return 0;
2051 }
2052
2053 for (count = 0; *p != NUL; count++) {
2054 p += utfc_ptr2len(p);
2055 }
2056
2057 return count;
2058 }
2059
2060 /*
2061 * Like mb_charlen() but for a string with specified length.
2062 */
mb_charlen_len(char_u * str,int len)2063 int mb_charlen_len(char_u *str, int len)
2064 {
2065 char_u *p = str;
2066 int count;
2067
2068 for (count = 0; *p != NUL && p < str + len; count++) {
2069 p += utfc_ptr2len(p);
2070 }
2071
2072 return count;
2073 }
2074
2075 /// Try to unescape a multibyte character
2076 ///
2077 /// Used for the rhs and lhs of the mappings.
2078 ///
2079 /// @param[in,out] pp String to unescape. Is advanced to just after the bytes
2080 /// that form a multibyte character.
2081 ///
2082 /// @return Unescaped string if it is a multibyte character, NULL if no
2083 /// multibyte character was found. Returns a static buffer, always one
2084 /// and the same.
mb_unescape(const char ** const pp)2085 const char *mb_unescape(const char **const pp)
2086 FUNC_ATTR_WARN_UNUSED_RESULT FUNC_ATTR_NONNULL_ALL
2087 {
2088 static char buf[6];
2089 size_t buf_idx = 0;
2090 uint8_t *str = (uint8_t *)(*pp);
2091
2092 // Must translate K_SPECIAL KS_SPECIAL KE_FILLER to K_SPECIAL and CSI
2093 // KS_EXTRA KE_CSI to CSI.
2094 // Maximum length of a utf-8 character is 4 bytes.
2095 for (size_t str_idx = 0; str[str_idx] != NUL && buf_idx < 4; str_idx++) {
2096 if (str[str_idx] == K_SPECIAL
2097 && str[str_idx + 1] == KS_SPECIAL
2098 && str[str_idx + 2] == KE_FILLER) {
2099 buf[buf_idx++] = (char)K_SPECIAL;
2100 str_idx += 2;
2101 } else if ((str[str_idx] == K_SPECIAL)
2102 && str[str_idx + 1] == KS_EXTRA
2103 && str[str_idx + 2] == KE_CSI) {
2104 buf[buf_idx++] = (char)CSI;
2105 str_idx += 2;
2106 } else if (str[str_idx] == K_SPECIAL) {
2107 break; // A special key can't be a multibyte char.
2108 } else {
2109 buf[buf_idx++] = (char)str[str_idx];
2110 }
2111 buf[buf_idx] = NUL;
2112
2113 // Return a multi-byte character if it's found. An illegal sequence
2114 // will result in a 1 here.
2115 if (utf_ptr2len((const char_u *)buf) > 1) {
2116 *pp = (const char *)str + str_idx + 1;
2117 return buf;
2118 }
2119
2120 // Bail out quickly for ASCII.
2121 if ((uint8_t)buf[0] < 128) {
2122 break;
2123 }
2124 }
2125 return NULL;
2126 }
2127
2128
2129 /*
2130 * Skip the Vim specific head of a 'encoding' name.
2131 */
enc_skip(char_u * p)2132 char_u *enc_skip(char_u *p)
2133 {
2134 if (STRNCMP(p, "2byte-", 6) == 0) {
2135 return p + 6;
2136 }
2137 if (STRNCMP(p, "8bit-", 5) == 0) {
2138 return p + 5;
2139 }
2140 return p;
2141 }
2142
2143 /*
2144 * Find the canonical name for encoding "enc".
2145 * When the name isn't recognized, returns "enc" itself, but with all lower
2146 * case characters and '_' replaced with '-'.
2147 * Returns an allocated string.
2148 */
enc_canonize(char_u * enc)2149 char_u *enc_canonize(char_u *enc) FUNC_ATTR_NONNULL_RET
2150 {
2151 char_u *p, *s;
2152 int i;
2153
2154 if (STRCMP(enc, "default") == 0) {
2155 // Use the default encoding as found by set_init_1().
2156 return vim_strsave(fenc_default);
2157 }
2158
2159 // copy "enc" to allocated memory, with room for two '-'
2160 char_u *r = xmalloc(STRLEN(enc) + 3);
2161 // Make it all lower case and replace '_' with '-'.
2162 p = r;
2163 for (s = enc; *s != NUL; ++s) {
2164 if (*s == '_') {
2165 *p++ = '-';
2166 } else {
2167 *p++ = TOLOWER_ASC(*s);
2168 }
2169 }
2170 *p = NUL;
2171
2172 // Skip "2byte-" and "8bit-".
2173 p = enc_skip(r);
2174
2175 // Change "microsoft-cp" to "cp". Used in some spell files.
2176 if (STRNCMP(p, "microsoft-cp", 12) == 0) {
2177 STRMOVE(p, p + 10);
2178 }
2179
2180 // "iso8859" -> "iso-8859"
2181 if (STRNCMP(p, "iso8859", 7) == 0) {
2182 STRMOVE(p + 4, p + 3);
2183 p[3] = '-';
2184 }
2185
2186 // "iso-8859n" -> "iso-8859-n"
2187 if (STRNCMP(p, "iso-8859", 8) == 0 && p[8] != '-') {
2188 STRMOVE(p + 9, p + 8);
2189 p[8] = '-';
2190 }
2191
2192 // "latin-N" -> "latinN"
2193 if (STRNCMP(p, "latin-", 6) == 0) {
2194 STRMOVE(p + 5, p + 6);
2195 }
2196
2197 if (enc_canon_search(p) >= 0) {
2198 // canonical name can be used unmodified
2199 if (p != r) {
2200 STRMOVE(r, p);
2201 }
2202 } else if ((i = enc_alias_search(p)) >= 0) {
2203 // alias recognized, get canonical name
2204 xfree(r);
2205 r = vim_strsave((char_u *)enc_canon_table[i].name);
2206 }
2207 return r;
2208 }
2209
2210 /*
2211 * Search for an encoding alias of "name".
2212 * Returns -1 when not found.
2213 */
enc_alias_search(char_u * name)2214 static int enc_alias_search(char_u *name)
2215 {
2216 int i;
2217
2218 for (i = 0; enc_alias_table[i].name != NULL; ++i) {
2219 if (STRCMP(name, enc_alias_table[i].name) == 0) {
2220 return enc_alias_table[i].canon;
2221 }
2222 }
2223 return -1;
2224 }
2225
2226
2227 #ifdef HAVE_LANGINFO_H
2228 # include <langinfo.h>
2229 #endif
2230
2231 /*
2232 * Get the canonicalized encoding of the current locale.
2233 * Returns an allocated string when successful, NULL when not.
2234 */
enc_locale(void)2235 char_u *enc_locale(void)
2236 {
2237 int i;
2238 char buf[50];
2239
2240 const char *s;
2241 #ifdef HAVE_NL_LANGINFO_CODESET
2242 if (!(s = nl_langinfo(CODESET)) || *s == NUL)
2243 #endif
2244 {
2245 #if defined(HAVE_LOCALE_H)
2246 if (!(s = setlocale(LC_CTYPE, NULL)) || *s == NUL)
2247 #endif
2248 {
2249 if ((s = os_getenv("LC_ALL"))) {
2250 if ((s = os_getenv("LC_CTYPE"))) {
2251 s = os_getenv("LANG");
2252 }
2253 }
2254 }
2255 }
2256
2257 if (!s) {
2258 return NULL;
2259 }
2260
2261 // The most generic locale format is:
2262 // language[_territory][.codeset][@modifier][+special][,[sponsor][_revision]]
2263 // If there is a '.' remove the part before it.
2264 // if there is something after the codeset, remove it.
2265 // Make the name lowercase and replace '_' with '-'.
2266 // Exception: "ja_JP.EUC" == "euc-jp", "zh_CN.EUC" = "euc-cn",
2267 // "ko_KR.EUC" == "euc-kr"
2268 const char *p = (char *)vim_strchr((char_u *)s, '.');
2269 if (p != NULL) {
2270 if (p > s + 2 && !STRNICMP(p + 1, "EUC", 3)
2271 && !isalnum((int)p[4]) && p[4] != '-' && p[-3] == '_') {
2272 // Copy "XY.EUC" to "euc-XY" to buf[10].
2273 memmove(buf, "euc-", 4);
2274 buf[4] = (ASCII_ISALNUM(p[-2]) ? TOLOWER_ASC(p[-2]) : 0);
2275 buf[5] = (ASCII_ISALNUM(p[-1]) ? TOLOWER_ASC(p[-1]) : 0);
2276 buf[6] = NUL;
2277 } else {
2278 s = p + 1;
2279 goto enc_locale_copy_enc;
2280 }
2281 } else {
2282 enc_locale_copy_enc:
2283 for (i = 0; i < (int)sizeof(buf) - 1 && s[i] != NUL; i++) {
2284 if (s[i] == '_' || s[i] == '-') {
2285 buf[i] = '-';
2286 } else if (ASCII_ISALNUM((uint8_t)s[i])) {
2287 buf[i] = TOLOWER_ASC(s[i]);
2288 } else {
2289 break;
2290 }
2291 }
2292 buf[i] = NUL;
2293 }
2294
2295 return enc_canonize((char_u *)buf);
2296 }
2297
2298 #if defined(HAVE_ICONV)
2299
2300
2301 /*
2302 * Call iconv_open() with a check if iconv() works properly (there are broken
2303 * versions).
2304 * Returns (void *)-1 if failed.
2305 * (should return iconv_t, but that causes problems with prototypes).
2306 */
my_iconv_open(char_u * to,char_u * from)2307 void *my_iconv_open(char_u *to, char_u *from)
2308 {
2309 iconv_t fd;
2310 # define ICONV_TESTLEN 400
2311 char_u tobuf[ICONV_TESTLEN];
2312 char *p;
2313 size_t tolen;
2314 static WorkingStatus iconv_working = kUnknown;
2315
2316 if (iconv_working == kBroken) {
2317 return (void *)-1; // detected a broken iconv() previously
2318 }
2319 fd = iconv_open((char *)enc_skip(to), (char *)enc_skip(from));
2320
2321 if (fd != (iconv_t)-1 && iconv_working == kUnknown) {
2322 /*
2323 * Do a dummy iconv() call to check if it actually works. There is a
2324 * version of iconv() on Linux that is broken. We can't ignore it,
2325 * because it's wide-spread. The symptoms are that after outputting
2326 * the initial shift state the "to" pointer is NULL and conversion
2327 * stops for no apparent reason after about 8160 characters.
2328 */
2329 p = (char *)tobuf;
2330 tolen = ICONV_TESTLEN;
2331 (void)iconv(fd, NULL, NULL, &p, &tolen);
2332 if (p == NULL) {
2333 iconv_working = kBroken;
2334 iconv_close(fd);
2335 fd = (iconv_t)-1;
2336 } else {
2337 iconv_working = kWorking;
2338 }
2339 }
2340
2341 return (void *)fd;
2342 }
2343
2344 /*
2345 * Convert the string "str[slen]" with iconv().
2346 * If "unconvlenp" is not NULL handle the string ending in an incomplete
2347 * sequence and set "*unconvlenp" to the length of it.
2348 * Returns the converted string in allocated memory. NULL for an error.
2349 * If resultlenp is not NULL, sets it to the result length in bytes.
2350 */
iconv_string(const vimconv_T * const vcp,char_u * str,size_t slen,size_t * unconvlenp,size_t * resultlenp)2351 static char_u *iconv_string(const vimconv_T *const vcp, char_u *str, size_t slen,
2352 size_t *unconvlenp, size_t *resultlenp)
2353 {
2354 const char *from;
2355 size_t fromlen;
2356 char *to;
2357 size_t tolen;
2358 size_t len = 0;
2359 size_t done = 0;
2360 char_u *result = NULL;
2361 char_u *p;
2362 int l;
2363
2364 from = (char *)str;
2365 fromlen = slen;
2366 for (;;) {
2367 if (len == 0 || ICONV_ERRNO == ICONV_E2BIG) {
2368 // Allocate enough room for most conversions. When re-allocating
2369 // increase the buffer size.
2370 len = len + fromlen * 2 + 40;
2371 p = xmalloc(len);
2372 if (done > 0) {
2373 memmove(p, result, done);
2374 }
2375 xfree(result);
2376 result = p;
2377 }
2378
2379 to = (char *)result + done;
2380 tolen = len - done - 2;
2381 // Avoid a warning for systems with a wrong iconv() prototype by
2382 // casting the second argument to void *.
2383 if (iconv(vcp->vc_fd, (void *)&from, &fromlen, &to, &tolen) != SIZE_MAX) {
2384 // Finished, append a NUL.
2385 *to = NUL;
2386 break;
2387 }
2388
2389 // Check both ICONV_EINVAL and EINVAL, because the dynamically loaded
2390 // iconv library may use one of them.
2391 if (!vcp->vc_fail && unconvlenp != NULL
2392 && (ICONV_ERRNO == ICONV_EINVAL || ICONV_ERRNO == EINVAL)) {
2393 // Handle an incomplete sequence at the end.
2394 *to = NUL;
2395 *unconvlenp = fromlen;
2396 break;
2397 } else if (!vcp->vc_fail
2398 && (ICONV_ERRNO == ICONV_EILSEQ || ICONV_ERRNO == EILSEQ
2399 || ICONV_ERRNO == ICONV_EINVAL || ICONV_ERRNO == EINVAL)) {
2400 // Check both ICONV_EILSEQ and EILSEQ, because the dynamically loaded
2401 // iconv library may use one of them.
2402
2403 // Can't convert: insert a '?' and skip a character. This assumes
2404 // conversion from 'encoding' to something else. In other
2405 // situations we don't know what to skip anyway.
2406 *to++ = '?';
2407 if (utf_ptr2cells((char_u *)from) > 1) {
2408 *to++ = '?';
2409 }
2410 l = utfc_ptr2len_len((const char_u *)from, (int)fromlen);
2411 from += l;
2412 fromlen -= l;
2413 } else if (ICONV_ERRNO != ICONV_E2BIG) {
2414 // conversion failed
2415 XFREE_CLEAR(result);
2416 break;
2417 }
2418 // Not enough room or skipping illegal sequence.
2419 done = to - (char *)result;
2420 }
2421
2422 if (resultlenp != NULL && result != NULL) {
2423 *resultlenp = (size_t)(to - (char *)result);
2424 }
2425 return result;
2426 }
2427
2428 #endif // HAVE_ICONV
2429
2430
2431 /*
2432 * Setup "vcp" for conversion from "from" to "to".
2433 * The names must have been made canonical with enc_canonize().
2434 * vcp->vc_type must have been initialized to CONV_NONE.
2435 * Note: cannot be used for conversion from/to ucs-2 and ucs-4 (will use utf-8
2436 * instead).
2437 * Afterwards invoke with "from" and "to" equal to NULL to cleanup.
2438 * Return FAIL when conversion is not supported, OK otherwise.
2439 */
convert_setup(vimconv_T * vcp,char_u * from,char_u * to)2440 int convert_setup(vimconv_T *vcp, char_u *from, char_u *to)
2441 {
2442 return convert_setup_ext(vcp, from, true, to, true);
2443 }
2444
2445 /// As convert_setup(), but only when from_unicode_is_utf8 is true will all
2446 /// "from" unicode charsets be considered utf-8. Same for "to".
convert_setup_ext(vimconv_T * vcp,char_u * from,bool from_unicode_is_utf8,char_u * to,bool to_unicode_is_utf8)2447 int convert_setup_ext(vimconv_T *vcp, char_u *from, bool from_unicode_is_utf8, char_u *to,
2448 bool to_unicode_is_utf8)
2449 {
2450 int from_prop;
2451 int to_prop;
2452 int from_is_utf8;
2453 int to_is_utf8;
2454
2455 // Reset to no conversion.
2456 #ifdef HAVE_ICONV
2457 if (vcp->vc_type == CONV_ICONV && vcp->vc_fd != (iconv_t)-1) {
2458 iconv_close(vcp->vc_fd);
2459 }
2460 #endif
2461 *vcp = (vimconv_T)MBYTE_NONE_CONV;
2462
2463 // No conversion when one of the names is empty or they are equal.
2464 if (from == NULL || *from == NUL || to == NULL || *to == NUL
2465 || STRCMP(from, to) == 0) {
2466 return OK;
2467 }
2468
2469 from_prop = enc_canon_props(from);
2470 to_prop = enc_canon_props(to);
2471 if (from_unicode_is_utf8) {
2472 from_is_utf8 = from_prop & ENC_UNICODE;
2473 } else {
2474 from_is_utf8 = from_prop == ENC_UNICODE;
2475 }
2476 if (to_unicode_is_utf8) {
2477 to_is_utf8 = to_prop & ENC_UNICODE;
2478 } else {
2479 to_is_utf8 = to_prop == ENC_UNICODE;
2480 }
2481
2482 if ((from_prop & ENC_LATIN1) && to_is_utf8) {
2483 // Internal latin1 -> utf-8 conversion.
2484 vcp->vc_type = CONV_TO_UTF8;
2485 vcp->vc_factor = 2; // up to twice as long
2486 } else if ((from_prop & ENC_LATIN9) && to_is_utf8) {
2487 // Internal latin9 -> utf-8 conversion.
2488 vcp->vc_type = CONV_9_TO_UTF8;
2489 vcp->vc_factor = 3; // up to three as long (euro sign)
2490 } else if (from_is_utf8 && (to_prop & ENC_LATIN1)) {
2491 // Internal utf-8 -> latin1 conversion.
2492 vcp->vc_type = CONV_TO_LATIN1;
2493 } else if (from_is_utf8 && (to_prop & ENC_LATIN9)) {
2494 // Internal utf-8 -> latin9 conversion.
2495 vcp->vc_type = CONV_TO_LATIN9;
2496 }
2497 #ifdef HAVE_ICONV
2498 else { // NOLINT(readability/braces)
2499 // Use iconv() for conversion.
2500 vcp->vc_fd = (iconv_t)my_iconv_open(to_is_utf8 ? (char_u *)"utf-8" : to,
2501 from_is_utf8 ? (char_u *)"utf-8" : from);
2502 if (vcp->vc_fd != (iconv_t)-1) {
2503 vcp->vc_type = CONV_ICONV;
2504 vcp->vc_factor = 4; // could be longer too...
2505 }
2506 }
2507 #endif
2508 if (vcp->vc_type == CONV_NONE) {
2509 return FAIL;
2510 }
2511
2512 return OK;
2513 }
2514
2515 /*
2516 * Convert text "ptr[*lenp]" according to "vcp".
2517 * Returns the result in allocated memory and sets "*lenp".
2518 * When "lenp" is NULL, use NUL terminated strings.
2519 * Illegal chars are often changed to "?", unless vcp->vc_fail is set.
2520 * When something goes wrong, NULL is returned and "*lenp" is unchanged.
2521 */
string_convert(const vimconv_T * const vcp,char_u * ptr,size_t * lenp)2522 char_u *string_convert(const vimconv_T *const vcp, char_u *ptr, size_t *lenp)
2523 {
2524 return string_convert_ext(vcp, ptr, lenp, NULL);
2525 }
2526
2527 /*
2528 * Like string_convert(), but when "unconvlenp" is not NULL and there are is
2529 * an incomplete sequence at the end it is not converted and "*unconvlenp" is
2530 * set to the number of remaining bytes.
2531 */
string_convert_ext(const vimconv_T * const vcp,char_u * ptr,size_t * lenp,size_t * unconvlenp)2532 char_u *string_convert_ext(const vimconv_T *const vcp, char_u *ptr, size_t *lenp,
2533 size_t *unconvlenp)
2534 {
2535 char_u *retval = NULL;
2536 char_u *d;
2537 int l;
2538 int c;
2539
2540 size_t len;
2541 if (lenp == NULL) {
2542 len = STRLEN(ptr);
2543 } else {
2544 len = *lenp;
2545 }
2546 if (len == 0) {
2547 return vim_strsave((char_u *)"");
2548 }
2549
2550 switch (vcp->vc_type) {
2551 case CONV_TO_UTF8: // latin1 to utf-8 conversion
2552 retval = xmalloc(len * 2 + 1);
2553 d = retval;
2554 for (size_t i = 0; i < len; ++i) {
2555 c = ptr[i];
2556 if (c < 0x80) {
2557 *d++ = c;
2558 } else {
2559 *d++ = 0xc0 + ((unsigned)c >> 6);
2560 *d++ = 0x80 + (c & 0x3f);
2561 }
2562 }
2563 *d = NUL;
2564 if (lenp != NULL) {
2565 *lenp = (size_t)(d - retval);
2566 }
2567 break;
2568
2569 case CONV_9_TO_UTF8: // latin9 to utf-8 conversion
2570 retval = xmalloc(len * 3 + 1);
2571 d = retval;
2572 for (size_t i = 0; i < len; ++i) {
2573 c = ptr[i];
2574 switch (c) {
2575 case 0xa4:
2576 c = 0x20ac; break; // euro
2577 case 0xa6:
2578 c = 0x0160; break; // S hat
2579 case 0xa8:
2580 c = 0x0161; break; // S -hat
2581 case 0xb4:
2582 c = 0x017d; break; // Z hat
2583 case 0xb8:
2584 c = 0x017e; break; // Z -hat
2585 case 0xbc:
2586 c = 0x0152; break; // OE
2587 case 0xbd:
2588 c = 0x0153; break; // oe
2589 case 0xbe:
2590 c = 0x0178; break; // Y
2591 }
2592 d += utf_char2bytes(c, d);
2593 }
2594 *d = NUL;
2595 if (lenp != NULL) {
2596 *lenp = (size_t)(d - retval);
2597 }
2598 break;
2599
2600 case CONV_TO_LATIN1: // utf-8 to latin1 conversion
2601 case CONV_TO_LATIN9: // utf-8 to latin9 conversion
2602 retval = xmalloc(len + 1);
2603 d = retval;
2604 for (size_t i = 0; i < len; ++i) {
2605 l = utf_ptr2len_len(ptr + i, len - i);
2606 if (l == 0) {
2607 *d++ = NUL;
2608 } else if (l == 1) {
2609 uint8_t l_w = utf8len_tab_zero[ptr[i]];
2610
2611 if (l_w == 0) {
2612 // Illegal utf-8 byte cannot be converted
2613 xfree(retval);
2614 return NULL;
2615 }
2616 if (unconvlenp != NULL && l_w > len - i) {
2617 // Incomplete sequence at the end.
2618 *unconvlenp = len - i;
2619 break;
2620 }
2621 *d++ = ptr[i];
2622 } else {
2623 c = utf_ptr2char(ptr + i);
2624 if (vcp->vc_type == CONV_TO_LATIN9) {
2625 switch (c) {
2626 case 0x20ac:
2627 c = 0xa4; break; // euro
2628 case 0x0160:
2629 c = 0xa6; break; // S hat
2630 case 0x0161:
2631 c = 0xa8; break; // S -hat
2632 case 0x017d:
2633 c = 0xb4; break; // Z hat
2634 case 0x017e:
2635 c = 0xb8; break; // Z -hat
2636 case 0x0152:
2637 c = 0xbc; break; // OE
2638 case 0x0153:
2639 c = 0xbd; break; // oe
2640 case 0x0178:
2641 c = 0xbe; break; // Y
2642 case 0xa4:
2643 case 0xa6:
2644 case 0xa8:
2645 case 0xb4:
2646 case 0xb8:
2647 case 0xbc:
2648 case 0xbd:
2649 case 0xbe:
2650 c = 0x100; break; // not in latin9
2651 }
2652 }
2653 if (!utf_iscomposing(c)) { // skip composing chars
2654 if (c < 0x100) {
2655 *d++ = c;
2656 } else if (vcp->vc_fail) {
2657 xfree(retval);
2658 return NULL;
2659 } else {
2660 *d++ = 0xbf;
2661 if (utf_char2cells(c) > 1) {
2662 *d++ = '?';
2663 }
2664 }
2665 }
2666 i += l - 1;
2667 }
2668 }
2669 *d = NUL;
2670 if (lenp != NULL) {
2671 *lenp = (size_t)(d - retval);
2672 }
2673 break;
2674
2675 #ifdef HAVE_ICONV
2676 case CONV_ICONV: // conversion with vcp->vc_fd
2677 retval = iconv_string(vcp, ptr, len, unconvlenp, lenp);
2678 break;
2679 #endif
2680 }
2681
2682 return retval;
2683 }
2684