1 // This is an open source non-commercial project. Dear PVS-Studio, please check
2 // it. PVS-Studio Static Code Analyzer for C, C++ and C#: http://www.viva64.com
3 
4 /// mbyte.c: Code specifically for handling multi-byte characters.
5 /// Multibyte extensions partly by Sung-Hoon Baek
6 ///
7 /// Strings internal to Nvim are always encoded as UTF-8 (thus the legacy
8 /// 'encoding' option is always "utf-8").
9 ///
10 /// The cell width on the display needs to be determined from the character
11 /// value. Recognizing UTF-8 bytes is easy: 0xxx.xxxx is a single-byte char,
12 /// 10xx.xxxx is a trailing byte, 11xx.xxxx is a leading byte of a multi-byte
13 /// character. To make things complicated, up to six composing characters
14 /// are allowed. These are drawn on top of the first char. For most editing
15 /// the sequence of bytes with composing characters included is considered to
16 /// be one character.
17 ///
18 /// UTF-8 is used everywhere in the core. This is in registers, text
19 /// manipulation, buffers, etc. Nvim core communicates with external plugins
20 /// and GUIs in this encoding.
21 ///
22 /// The encoding of a file is specified with 'fileencoding'.  Conversion
23 /// is to be done when it's different from "utf-8".
24 ///
25 /// Vim scripts may contain an ":scriptencoding" command. This has an effect
26 /// for some commands, like ":menutrans".
27 
28 #include <inttypes.h>
29 #include <stdbool.h>
30 #include <string.h>
31 #include <wchar.h>
32 #include <wctype.h>
33 
34 #include "nvim/ascii.h"
35 #include "nvim/vim.h"
36 #ifdef HAVE_LOCALE_H
37 # include <locale.h>
38 #endif
39 #include "nvim/arabic.h"
40 #include "nvim/charset.h"
41 #include "nvim/cursor.h"
42 #include "nvim/eval.h"
43 #include "nvim/fileio.h"
44 #include "nvim/func_attr.h"
45 #include "nvim/iconv.h"
46 #include "nvim/mark.h"
47 #include "nvim/mbyte.h"
48 #include "nvim/memline.h"
49 #include "nvim/memory.h"
50 #include "nvim/message.h"
51 #include "nvim/misc1.h"
52 #include "nvim/option.h"
53 #include "nvim/os/os.h"
54 #include "nvim/path.h"
55 #include "nvim/screen.h"
56 #include "nvim/spell.h"
57 #include "nvim/strings.h"
58 
// One row of a character-conversion table.
// NOTE(review): the consumers of this type are not visible in this part of
// the file.  Judging by the field names only, a row covers the code points
// rangeStart..rangeEnd, visited every `step`, and `offset` is what gets
// applied to convert them -- confirm against the code that uses the tables.
typedef struct {
  int rangeStart;
  int rangeEnd;
  int step;
  int offset;
} convertStruct;
65 
// Inclusive range of character values, first..last.  Element type of the
// tables that intable() searches.
struct interval {
  long first;  // lowest value in the range
  long last;   // highest value in the range
};
70 
71 #ifdef INCLUDE_GENERATED_DECLARATIONS
72 # include "mbyte.c.generated.h"
73 
74 # include "unicode_tables.generated.h"
75 #endif
76 
// Lookup table used to speed up BYTELEN(): indexed by the first byte of a
// UTF-8 sequence, it gives the length of that sequence in bytes.  A byte that
// is illegal as a first byte (0x80..0xBF, 0xFE, 0xFF) has length 1, and so
// does the NUL byte.
const uint8_t utf8len_tab[256] = {
  // Low nibble of the index runs 0..F along each row.
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 0x00..0x0F
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 0x10..0x1F
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 0x20..0x2F
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 0x30..0x3F
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 0x40..0x4F
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 0x50..0x5F
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 0x60..0x6F
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 0x70..0x7F
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 0x80..0x8F (trailing)
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 0x90..0x9F (trailing)
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 0xA0..0xAF (trailing)
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 0xB0..0xBF (trailing)
  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  // 0xC0..0xCF
  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  // 0xD0..0xDF
  3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,  // 0xE0..0xEF
  4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 1, 1,  // 0xF0..0xFF
};
100 
// Same mapping as utf8len_tab (first byte -> sequence length in bytes), except
// that a byte which is illegal as a first byte (0x80..0xBF, 0xFE, 0xFF) maps
// to 0 instead of 1.
const uint8_t utf8len_tab_zero[256] = {
  // Low nibble of the index runs 0..F along each row.
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 0x00..0x0F
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 0x10..0x1F
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 0x20..0x2F
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 0x30..0x3F
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 0x40..0x4F
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 0x50..0x5F
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 0x60..0x6F
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 0x70..0x7F
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0x80..0x8F (trailing)
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0x90..0x9F (trailing)
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0xA0..0xAF (trailing)
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0xB0..0xBF (trailing)
  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  // 0xC0..0xCF
  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  // 0xD0..0xDF
  3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,  // 0xE0..0xEF
  4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0,  // 0xF0..0xFF
};
121 
122 /*
123  * Canonical encoding names and their properties.
124  * "iso-8859-n" is handled by enc_canonize() directly.
125  */
126 static struct
127 {   const char *name;   int prop;              int codepage; }
128 enc_canon_table[] =
129 {
130 #define IDX_LATIN_1     0
131   { "latin1",          ENC_8BIT + ENC_LATIN1,  1252 },
132 #define IDX_ISO_2       1
133   { "iso-8859-2",      ENC_8BIT,               0 },
134 #define IDX_ISO_3       2
135   { "iso-8859-3",      ENC_8BIT,               0 },
136 #define IDX_ISO_4       3
137   { "iso-8859-4",      ENC_8BIT,               0 },
138 #define IDX_ISO_5       4
139   { "iso-8859-5",      ENC_8BIT,               0 },
140 #define IDX_ISO_6       5
141   { "iso-8859-6",      ENC_8BIT,               0 },
142 #define IDX_ISO_7       6
143   { "iso-8859-7",      ENC_8BIT,               0 },
144 #define IDX_ISO_8       7
145   { "iso-8859-8",      ENC_8BIT,               0 },
146 #define IDX_ISO_9       8
147   { "iso-8859-9",      ENC_8BIT,               0 },
148 #define IDX_ISO_10      9
149   { "iso-8859-10",     ENC_8BIT,               0 },
150 #define IDX_ISO_11      10
151   { "iso-8859-11",     ENC_8BIT,               0 },
152 #define IDX_ISO_13      11
153   { "iso-8859-13",     ENC_8BIT,               0 },
154 #define IDX_ISO_14      12
155   { "iso-8859-14",     ENC_8BIT,               0 },
156 #define IDX_ISO_15      13
157   { "iso-8859-15",     ENC_8BIT + ENC_LATIN9,  0 },
158 #define IDX_KOI8_R      14
159   { "koi8-r",          ENC_8BIT,               0 },
160 #define IDX_KOI8_U      15
161   { "koi8-u",          ENC_8BIT,               0 },
162 #define IDX_UTF8        16
163   { "utf-8",           ENC_UNICODE,            0 },
164 #define IDX_UCS2        17
165   { "ucs-2",           ENC_UNICODE + ENC_ENDIAN_B + ENC_2BYTE, 0 },
166 #define IDX_UCS2LE      18
167   { "ucs-2le",         ENC_UNICODE + ENC_ENDIAN_L + ENC_2BYTE, 0 },
168 #define IDX_UTF16       19
169   { "utf-16",          ENC_UNICODE + ENC_ENDIAN_B + ENC_2WORD, 0 },
170 #define IDX_UTF16LE     20
171   { "utf-16le",        ENC_UNICODE + ENC_ENDIAN_L + ENC_2WORD, 0 },
172 #define IDX_UCS4        21
173   { "ucs-4",           ENC_UNICODE + ENC_ENDIAN_B + ENC_4BYTE, 0 },
174 #define IDX_UCS4LE      22
175   { "ucs-4le",         ENC_UNICODE + ENC_ENDIAN_L + ENC_4BYTE, 0 },
176 
177   // For debugging DBCS encoding on Unix.
178 #define IDX_DEBUG       23
179   { "debug",           ENC_DBCS,               DBCS_DEBUG },
180 #define IDX_EUC_JP      24
181   { "euc-jp",          ENC_DBCS,               DBCS_JPNU },
182 #define IDX_SJIS        25
183   { "sjis",            ENC_DBCS,               DBCS_JPN },
184 #define IDX_EUC_KR      26
185   { "euc-kr",          ENC_DBCS,               DBCS_KORU },
186 #define IDX_EUC_CN      27
187   { "euc-cn",          ENC_DBCS,               DBCS_CHSU },
188 #define IDX_EUC_TW      28
189   { "euc-tw",          ENC_DBCS,               DBCS_CHTU },
190 #define IDX_BIG5        29
191   { "big5",            ENC_DBCS,               DBCS_CHT },
192 
193   // MS-DOS and MS-Windows codepages are included here, so that they can be
194   // used on Unix too.  Most of them are similar to ISO-8859 encodings, but
195   // not exactly the same.
196 #define IDX_CP437       30
197   { "cp437",           ENC_8BIT,               437 },   // like iso-8859-1
198 #define IDX_CP737       31
199   { "cp737",           ENC_8BIT,               737 },   // like iso-8859-7
200 #define IDX_CP775       32
201   { "cp775",           ENC_8BIT,               775 },   // Baltic
202 #define IDX_CP850       33
203   { "cp850",           ENC_8BIT,               850 },   // like iso-8859-4
204 #define IDX_CP852       34
205   { "cp852",           ENC_8BIT,               852 },   // like iso-8859-1
206 #define IDX_CP855       35
207   { "cp855",           ENC_8BIT,               855 },   // like iso-8859-2
208 #define IDX_CP857       36
209   { "cp857",           ENC_8BIT,               857 },   // like iso-8859-5
210 #define IDX_CP860       37
211   { "cp860",           ENC_8BIT,               860 },   // like iso-8859-9
212 #define IDX_CP861       38
213   { "cp861",           ENC_8BIT,               861 },   // like iso-8859-1
214 #define IDX_CP862       39
215   { "cp862",           ENC_8BIT,               862 },   // like iso-8859-1
216 #define IDX_CP863       40
217   { "cp863",           ENC_8BIT,               863 },   // like iso-8859-8
218 #define IDX_CP865       41
219   { "cp865",           ENC_8BIT,               865 },   // like iso-8859-1
220 #define IDX_CP866       42
221   { "cp866",           ENC_8BIT,               866 },   // like iso-8859-5
222 #define IDX_CP869       43
223   { "cp869",           ENC_8BIT,               869 },   // like iso-8859-7
224 #define IDX_CP874       44
225   { "cp874",           ENC_8BIT,               874 },   // Thai
226 #define IDX_CP932       45
227   { "cp932",           ENC_DBCS,               DBCS_JPN },
228 #define IDX_CP936       46
229   { "cp936",           ENC_DBCS,               DBCS_CHS },
230 #define IDX_CP949       47
231   { "cp949",           ENC_DBCS,               DBCS_KOR },
232 #define IDX_CP950       48
233   { "cp950",           ENC_DBCS,               DBCS_CHT },
234 #define IDX_CP1250      49
235   { "cp1250",          ENC_8BIT,               1250 },   // Czech, Polish, etc.
236 #define IDX_CP1251      50
237   { "cp1251",          ENC_8BIT,               1251 },   // Cyrillic
238   // cp1252 is considered to be equal to latin1
239 #define IDX_CP1253      51
240   { "cp1253",          ENC_8BIT,               1253 },   // Greek
241 #define IDX_CP1254      52
242   { "cp1254",          ENC_8BIT,               1254 },   // Turkish
243 #define IDX_CP1255      53
244   { "cp1255",          ENC_8BIT,               1255 },   // Hebrew
245 #define IDX_CP1256      54
246   { "cp1256",          ENC_8BIT,               1256 },   // Arabic
247 #define IDX_CP1257      55
248   { "cp1257",          ENC_8BIT,               1257 },   // Baltic
249 #define IDX_CP1258      56
250   { "cp1258",          ENC_8BIT,               1258 },   // Vietnamese
251 
252 #define IDX_MACROMAN    57
253   { "macroman",        ENC_8BIT + ENC_MACROMAN, 0 },      // Mac OS
254 #define IDX_HPROMAN8    58
255   { "hp-roman8",       ENC_8BIT,               0 },       // HP Roman8
256 #define IDX_COUNT       59
257 };
258 
259 /*
260  * Aliases for encoding names.
261  */
262 static struct
263 {   const char *name; int canon; }
264 enc_alias_table[] =
265 {
266   { "ansi",            IDX_LATIN_1 },
267   { "iso-8859-1",      IDX_LATIN_1 },
268   { "latin2",          IDX_ISO_2 },
269   { "latin3",          IDX_ISO_3 },
270   { "latin4",          IDX_ISO_4 },
271   { "cyrillic",        IDX_ISO_5 },
272   { "arabic",          IDX_ISO_6 },
273   { "greek",           IDX_ISO_7 },
274   { "hebrew",          IDX_ISO_8 },
275   { "latin5",          IDX_ISO_9 },
276   { "turkish",         IDX_ISO_9 },   // ?
277   { "latin6",          IDX_ISO_10 },
278   { "nordic",          IDX_ISO_10 },  // ?
279   { "thai",            IDX_ISO_11 },  // ?
280   { "latin7",          IDX_ISO_13 },
281   { "latin8",          IDX_ISO_14 },
282   { "latin9",          IDX_ISO_15 },
283   { "utf8",            IDX_UTF8 },
284   { "unicode",         IDX_UCS2 },
285   { "ucs2",            IDX_UCS2 },
286   { "ucs2be",          IDX_UCS2 },
287   { "ucs-2be",         IDX_UCS2 },
288   { "ucs2le",          IDX_UCS2LE },
289   { "utf16",           IDX_UTF16 },
290   { "utf16be",         IDX_UTF16 },
291   { "utf-16be",        IDX_UTF16 },
292   { "utf16le",         IDX_UTF16LE },
293   { "ucs4",            IDX_UCS4 },
294   { "ucs4be",          IDX_UCS4 },
295   { "ucs-4be",         IDX_UCS4 },
296   { "ucs4le",          IDX_UCS4LE },
297   { "utf32",           IDX_UCS4 },
298   { "utf-32",          IDX_UCS4 },
299   { "utf32be",         IDX_UCS4 },
300   { "utf-32be",        IDX_UCS4 },
301   { "utf32le",         IDX_UCS4LE },
302   { "utf-32le",        IDX_UCS4LE },
303   { "932",             IDX_CP932 },
304   { "949",             IDX_CP949 },
305   { "936",             IDX_CP936 },
306   { "gbk",             IDX_CP936 },
307   { "950",             IDX_CP950 },
308   { "eucjp",           IDX_EUC_JP },
309   { "unix-jis",        IDX_EUC_JP },
310   { "ujis",            IDX_EUC_JP },
311   { "shift-jis",       IDX_SJIS },
312   { "pck",             IDX_SJIS },        // Sun: PCK
313   { "euckr",           IDX_EUC_KR },
314   { "5601",            IDX_EUC_KR },      // Sun: KS C 5601
315   { "euccn",           IDX_EUC_CN },
316   { "gb2312",          IDX_EUC_CN },
317   { "euctw",           IDX_EUC_TW },
318   { "japan",           IDX_EUC_JP },
319   { "korea",           IDX_EUC_KR },
320   { "prc",             IDX_EUC_CN },
321   { "zh-cn",           IDX_EUC_CN },
322   { "chinese",         IDX_EUC_CN },
323   { "zh-tw",           IDX_EUC_TW },
324   { "taiwan",          IDX_EUC_TW },
325   { "cp950",           IDX_BIG5 },
326   { "950",             IDX_BIG5 },
327   { "mac",             IDX_MACROMAN },
328   { "mac-roman",       IDX_MACROMAN },
329   { NULL,              0 }
330 };
331 
332 /*
333  * Find encoding "name" in the list of canonical encoding names.
334  * Returns -1 if not found.
335  */
enc_canon_search(const char_u * name)336 static int enc_canon_search(const char_u *name)
337 {
338   int i;
339 
340   for (i = 0; i < IDX_COUNT; ++i) {
341     if (STRCMP(name, enc_canon_table[i].name) == 0) {
342       return i;
343     }
344   }
345   return -1;
346 }
347 
348 
349 /*
350  * Find canonical encoding "name" in the list and return its properties.
351  * Returns 0 if not found.
352  */
enc_canon_props(const char_u * name)353 int enc_canon_props(const char_u *name)
354 {
355   int i;
356 
357   i = enc_canon_search(name);
358   if (i >= 0) {
359     return enc_canon_table[i].prop;
360   } else if (STRNCMP(name, "2byte-", 6) == 0) {
361     return ENC_DBCS;
362   } else if (STRNCMP(name, "8bit-", 5) == 0 || STRNCMP(name, "iso-8859-", 9) == 0) {
363     return ENC_8BIT;
364   }
365   return 0;
366 }
367 
368 /*
369  * Return the size of the BOM for the current buffer:
370  * 0 - no BOM
371  * 2 - UCS-2 or UTF-16 BOM
372  * 4 - UCS-4 BOM
373  * 3 - UTF-8 BOM
374  */
bomb_size(void)375 int bomb_size(void)
376 {
377   int n = 0;
378 
379   if (curbuf->b_p_bomb && !curbuf->b_p_bin) {
380     if (*curbuf->b_p_fenc == NUL
381         || STRCMP(curbuf->b_p_fenc, "utf-8") == 0) {
382       n = 3;
383     } else if (STRNCMP(curbuf->b_p_fenc, "ucs-2", 5) == 0
384                || STRNCMP(curbuf->b_p_fenc, "utf-16", 6) == 0) {
385       n = 2;
386     } else if (STRNCMP(curbuf->b_p_fenc, "ucs-4", 5) == 0) {
387       n = 4;
388     }
389   }
390   return n;
391 }
392 
393 /*
394  * Remove all BOM from "s" by moving remaining text.
395  */
remove_bom(char_u * s)396 void remove_bom(char_u *s)
397 {
398   char *p = (char *)s;
399 
400   while ((p = strchr(p, 0xef)) != NULL) {
401     if ((uint8_t)p[1] == 0xbb && (uint8_t)p[2] == 0xbf) {
402       STRMOVE(p, p + 3);
403     } else {
404       p++;
405     }
406   }
407 }
408 
409 /*
410  * Get class of pointer:
411  * 0 for blank or NUL
412  * 1 for punctuation
413  * 2 for an (ASCII) word character
414  * >2 for other word characters
415  */
mb_get_class(const char_u * p)416 int mb_get_class(const char_u *p)
417 {
418   return mb_get_class_tab(p, curbuf->b_chartab);
419 }
420 
mb_get_class_tab(const char_u * p,const uint64_t * const chartab)421 int mb_get_class_tab(const char_u *p, const uint64_t *const chartab)
422 {
423   if (MB_BYTE2LEN(p[0]) == 1) {
424     if (p[0] == NUL || ascii_iswhite(p[0])) {
425       return 0;
426     }
427     if (vim_iswordc_tab(p[0], chartab)) {
428       return 2;
429     }
430     return 1;
431   }
432   return utf_class_tab(utf_ptr2char(p), chartab);
433 }
434 
435 /*
436  * Return true if "c" is in "table".
437  */
intable(const struct interval * table,size_t n_items,int c)438 static bool intable(const struct interval *table, size_t n_items, int c)
439 {
440   int mid, bot, top;
441 
442   // first quick check for Latin1 etc. characters
443   if (c < table[0].first) {
444     return false;
445   }
446 
447   // binary search in table
448   bot = 0;
449   top = (int)(n_items - 1);
450   while (top >= bot) {
451     mid = (bot + top) / 2;
452     if (table[mid].last < c) {
453       bot = mid + 1;
454     } else if (table[mid].first > c) {
455       top = mid - 1;
456     } else {
457       return true;
458     }
459   }
460   return false;
461 }
462 
463 /// For UTF-8 character "c" return 2 for a double-width character, 1 for others.
464 /// Returns 4 or 6 for an unprintable character.
465 /// Is only correct for characters >= 0x80.
466 /// When p_ambw is "double", return 2 for a character with East Asian Width
467 /// class 'A'(mbiguous).
468 ///
469 /// @note Tables `doublewidth` and `ambiguous` are generated by
470 ///       gen_unicode_tables.lua, which must be manually invoked as needed.
utf_char2cells(int c)471 int utf_char2cells(int c)
472 {
473   if (c >= 0x100) {
474 #ifdef USE_WCHAR_FUNCTIONS
475     //
476     // Assume the library function wcwidth() works better than our own
477     // stuff.  It should return 1 for ambiguous width chars!
478     //
479     int n = wcwidth(c);
480 
481     if (n < 0) {
482       return 6;                 // unprintable, displays <xxxx>
483     }
484     if (n > 1) {
485       return n;
486     }
487 #else
488     if (!utf_printable(c)) {
489       return 6;                 // unprintable, displays <xxxx>
490     }
491     if (intable(doublewidth, ARRAY_SIZE(doublewidth), c)) {
492       return 2;
493     }
494 #endif
495     if (p_emoji && intable(emoji_width, ARRAY_SIZE(emoji_width), c)) {
496       return 2;
497     }
498   } else if (c >= 0x80 && !vim_isprintc(c)) {
499     // Characters below 0x100 are influenced by 'isprint' option.
500     return 4;                   // unprintable, displays <xx>
501   }
502 
503   if (c >= 0x80 && *p_ambw == 'd'
504       && intable(ambiguous, ARRAY_SIZE(ambiguous), c)) {
505     return 2;
506   }
507 
508   return 1;
509 }
510 
511 /// Return the number of display cells character at "*p" occupies.
512 /// This doesn't take care of unprintable characters, use ptr2cells() for that.
utf_ptr2cells(const char_u * p)513 int utf_ptr2cells(const char_u *p)
514 {
515   int c;
516 
517   // Need to convert to a character number.
518   if (*p >= 0x80) {
519     c = utf_ptr2char(p);
520     // An illegal byte is displayed as <xx>.
521     if (utf_ptr2len(p) == 1 || c == NUL) {
522       return 4;
523     }
524     // If the char is ASCII it must be an overlong sequence.
525     if (c < 0x80) {
526       return char2cells(c);
527     }
528     return utf_char2cells(c);
529   }
530   return 1;
531 }
532 
533 /// Like utf_ptr2cells(), but limit string length to "size".
534 /// For an empty string or truncated character returns 1.
utf_ptr2cells_len(const char_u * p,int size)535 int utf_ptr2cells_len(const char_u *p, int size)
536 {
537   int c;
538 
539   // Need to convert to a wide character.
540   if (size > 0 && *p >= 0x80) {
541     if (utf_ptr2len_len(p, size) < utf8len_tab[*p]) {
542       return 1;        // truncated
543     }
544     c = utf_ptr2char(p);
545     // An illegal byte is displayed as <xx>.
546     if (utf_ptr2len(p) == 1 || c == NUL) {
547       return 4;
548     }
549     // If the char is ASCII it must be an overlong sequence.
550     if (c < 0x80) {
551       return char2cells(c);
552     }
553     return utf_char2cells(c);
554   }
555   return 1;
556 }
557 
558 /// Calculate the number of cells occupied by string `str`.
559 ///
560 /// @param str The source string, may not be NULL, must be a NUL-terminated
561 ///            string.
562 /// @return The number of cells occupied by string `str`
mb_string2cells(const char_u * str)563 size_t mb_string2cells(const char_u *str)
564 {
565   size_t clen = 0;
566 
567   for (const char_u *p = str; *p != NUL; p += utfc_ptr2len(p)) {
568     clen += utf_ptr2cells(p);
569   }
570 
571   return clen;
572 }
573 
574 /// Get the number of cells occupied by string `str` with maximum length `size`
575 ///
576 /// @param str The source string, may not be NULL, must be a NUL-terminated
577 ///            string.
578 /// @param size maximum length of string. It will terminate on earlier NUL.
579 /// @return The number of cells occupied by string `str`
mb_string2cells_len(const char_u * str,size_t size)580 size_t mb_string2cells_len(const char_u *str, size_t size)
581   FUNC_ATTR_NONNULL_ARG(1)
582 {
583   size_t clen = 0;
584 
585   for (const char_u *p = str; *p != NUL && p < str+size;
586        p += utfc_ptr2len_len(p, size+(p-str))) {
587     clen += utf_ptr2cells(p);
588   }
589 
590   return clen;
591 }
592 
593 /// Convert a UTF-8 byte sequence to a character number.
594 ///
595 /// If the sequence is illegal or truncated by a NUL then the first byte is
596 /// returned.
597 /// For an overlong sequence this may return zero.
598 /// Does not include composing characters for obvious reasons.
599 ///
600 /// @param[in]  p  String to convert.
601 ///
602 /// @return Unicode codepoint or byte value.
utf_ptr2char(const char_u * const p)603 int utf_ptr2char(const char_u *const p)
604   FUNC_ATTR_PURE FUNC_ATTR_WARN_UNUSED_RESULT
605 {
606   if (p[0] < 0x80) {  // Be quick for ASCII.
607     return p[0];
608   }
609 
610   const uint8_t len = utf8len_tab_zero[p[0]];
611   if (len > 1 && (p[1] & 0xc0) == 0x80) {
612     if (len == 2) {
613       return ((p[0] & 0x1f) << 6) + (p[1] & 0x3f);
614     }
615     if ((p[2] & 0xc0) == 0x80) {
616       if (len == 3) {
617         return (((p[0] & 0x0f) << 12) + ((p[1] & 0x3f) << 6)
618                 + (p[2] & 0x3f));
619       }
620       if ((p[3] & 0xc0) == 0x80) {
621         if (len == 4) {
622           return (((p[0] & 0x07) << 18) + ((p[1] & 0x3f) << 12)
623                   + ((p[2] & 0x3f) << 6) + (p[3] & 0x3f));
624         }
625         if ((p[4] & 0xc0) == 0x80) {
626           if (len == 5) {
627             return (((p[0] & 0x03) << 24) + ((p[1] & 0x3f) << 18)
628                     + ((p[2] & 0x3f) << 12) + ((p[3] & 0x3f) << 6)
629                     + (p[4] & 0x3f));
630           }
631           if ((p[5] & 0xc0) == 0x80 && len == 6) {
632             return (((p[0] & 0x01) << 30) + ((p[1] & 0x3f) << 24)
633                     + ((p[2] & 0x3f) << 18) + ((p[3] & 0x3f) << 12)
634                     + ((p[4] & 0x3f) << 6) + (p[5] & 0x3f));
635           }
636         }
637       }
638     }
639   }
640   // Illegal value: just return the first byte.
641   return p[0];
642 }
643 
644 /*
645  * Convert a UTF-8 byte sequence to a wide character.
646  * String is assumed to be terminated by NUL or after "n" bytes, whichever
647  * comes first.
648  * The function is safe in the sense that it never accesses memory beyond the
649  * first "n" bytes of "s".
650  *
651  * On success, returns decoded codepoint, advances "s" to the beginning of
652  * next character and decreases "n" accordingly.
653  *
654  * If end of string was reached, returns 0 and, if "n" > 0, advances "s" past
655  * NUL byte.
656  *
657  * If byte sequence is illegal or incomplete, returns -1 and does not advance
658  * "s".
659  */
utf_safe_read_char_adv(const char_u ** s,size_t * n)660 static int utf_safe_read_char_adv(const char_u **s, size_t *n)
661 {
662   int c;
663 
664   if (*n == 0) {  // end of buffer
665     return 0;
666   }
667 
668   uint8_t k = utf8len_tab_zero[**s];
669 
670   if (k == 1) {
671     // ASCII character or NUL
672     (*n)--;
673     return *(*s)++;
674   }
675 
676   if (k <= *n) {
677     // We have a multibyte sequence and it isn't truncated by buffer
678     // limits so utf_ptr2char() is safe to use. Or the first byte is
679     // illegal (k=0), and it's also safe to use utf_ptr2char().
680     c = utf_ptr2char(*s);
681 
682     // On failure, utf_ptr2char() returns the first byte, so here we
683     // check equality with the first byte. The only non-ASCII character
684     // which equals the first byte of its own UTF-8 representation is
685     // U+00C3 (UTF-8: 0xC3 0x83), so need to check that special case too.
686     // It's safe even if n=1, else we would have k=2 > n.
687     if (c != (int)(**s) || (c == 0xC3 && (*s)[1] == 0x83)) {
688       // byte sequence was successfully decoded
689       *s += k;
690       *n -= k;
691       return c;
692     }
693   }
694 
695   // byte sequence is incomplete or illegal
696   return -1;
697 }
698 
699 /*
700  * Get character at **pp and advance *pp to the next character.
701  * Note: composing characters are skipped!
702  */
mb_ptr2char_adv(const char_u ** const pp)703 int mb_ptr2char_adv(const char_u **const pp)
704 {
705   int c;
706 
707   c = utf_ptr2char(*pp);
708   *pp += utfc_ptr2len(*pp);
709   return c;
710 }
711 
712 /*
713  * Get character at **pp and advance *pp to the next character.
714  * Note: composing characters are returned as separate characters.
715  */
mb_cptr2char_adv(const char_u ** pp)716 int mb_cptr2char_adv(const char_u **pp)
717 {
718   int c;
719 
720   c = utf_ptr2char(*pp);
721   *pp += utf_ptr2len(*pp);
722   return c;
723 }
724 
725 /*
726  * Check if the character pointed to by "p2" is a composing character when it
727  * comes after "p1".  For Arabic sometimes "ab" is replaced with "c", which
728  * behaves like a composing character.
729  */
utf_composinglike(const char_u * p1,const char_u * p2)730 bool utf_composinglike(const char_u *p1, const char_u *p2)
731 {
732   int c2;
733 
734   c2 = utf_ptr2char(p2);
735   if (utf_iscomposing(c2)) {
736     return true;
737   }
738   if (!arabic_maycombine(c2)) {
739     return false;
740   }
741   return arabic_combine(utf_ptr2char(p1), c2);
742 }
743 
744 /// Convert a UTF-8 string to a wide character
745 ///
746 /// Also gets up to #MAX_MCO composing characters.
747 ///
748 /// @param[out]  pcc  Location where to store composing characters. Must have
749 ///                   space at least for #MAX_MCO + 1 elements.
750 ///
751 /// @return leading character.
utfc_ptr2char(const char_u * p,int * pcc)752 int utfc_ptr2char(const char_u *p, int *pcc)
753 {
754   int len;
755   int c;
756   int cc;
757   int i = 0;
758 
759   c = utf_ptr2char(p);
760   len = utf_ptr2len(p);
761 
762   // Only accept a composing char when the first char isn't illegal.
763   if ((len > 1 || *p < 0x80)
764       && p[len] >= 0x80
765       && utf_composinglike(p, p + len)) {
766     cc = utf_ptr2char(p + len);
767     for (;;) {
768       pcc[i++] = cc;
769       if (i == MAX_MCO) {
770         break;
771       }
772       len += utf_ptr2len(p + len);
773       if (p[len] < 0x80 || !utf_iscomposing(cc = utf_ptr2char(p + len))) {
774         break;
775       }
776     }
777   }
778 
779   if (i < MAX_MCO) {    // last composing char must be 0
780     pcc[i] = 0;
781   }
782 
783   return c;
784 }
785 
786 /*
787  * Convert a UTF-8 byte string to a wide character.  Also get up to MAX_MCO
788  * composing characters.  Use no more than p[maxlen].
789  *
790  * @param [out] pcc: composing chars, last one is 0
791  */
utfc_ptr2char_len(const char_u * p,int * pcc,int maxlen)792 int utfc_ptr2char_len(const char_u *p, int *pcc, int maxlen)
793 {
794   assert(maxlen > 0);
795 
796   int i = 0;
797 
798   int len = utf_ptr2len_len(p, maxlen);
799   // Is it safe to use utf_ptr2char()?
800   bool safe = len > 1 && len <= maxlen;
801   int c = safe ? utf_ptr2char(p) : *p;
802 
803   // Only accept a composing char when the first char isn't illegal.
804   if ((safe || c < 0x80) && len < maxlen && p[len] >= 0x80) {
805     for (; i < MAX_MCO; i++) {
806       int len_cc = utf_ptr2len_len(p + len, maxlen - len);
807       safe = len_cc > 1 && len_cc <= maxlen - len;
808       if (!safe || (pcc[i] = utf_ptr2char(p + len)) < 0x80
809           || !(i == 0 ? utf_composinglike(p, p+len) : utf_iscomposing(pcc[i]))) {
810         break;
811       }
812       len += len_cc;
813     }
814   }
815 
816   if (i < MAX_MCO) {
817     // last composing char must be 0
818     pcc[i] = 0;
819   }
820 
821   return c;
822 #undef ISCOMPOSING
823 }
824 
825 /// Get the length of a UTF-8 byte sequence representing a single codepoint
826 ///
827 /// @param[in]  p  UTF-8 string.
828 ///
829 /// @return Sequence length, 0 for empty string and 1 for non-UTF-8 byte
830 ///         sequence.
utf_ptr2len(const char_u * const p)831 int utf_ptr2len(const char_u *const p)
832   FUNC_ATTR_PURE FUNC_ATTR_WARN_UNUSED_RESULT FUNC_ATTR_NONNULL_ALL
833 {
834   if (*p == NUL) {
835     return 0;
836   }
837   const int len = utf8len_tab[*p];
838   for (int i = 1; i < len; i++) {
839     if ((p[i] & 0xc0) != 0x80) {
840       return 1;
841     }
842   }
843   return len;
844 }
845 
846 /*
847  * Return length of UTF-8 character, obtained from the first byte.
848  * "b" must be between 0 and 255!
849  * Returns 1 for an invalid first byte value.
850  */
utf_byte2len(int b)851 int utf_byte2len(int b)
852 {
853   return utf8len_tab[b];
854 }
855 
856 /*
857  * Get the length of UTF-8 byte sequence "p[size]".  Does not include any
858  * following composing characters.
859  * Returns 1 for "".
860  * Returns 1 for an illegal byte sequence (also in incomplete byte seq.).
861  * Returns number > "size" for an incomplete byte sequence.
862  * Never returns zero.
863  */
utf_ptr2len_len(const char_u * p,int size)864 int utf_ptr2len_len(const char_u *p, int size)
865 {
866   int len;
867   int i;
868   int m;
869 
870   len = utf8len_tab[*p];
871   if (len == 1) {
872     return 1;           // NUL, ascii or illegal lead byte
873   }
874   if (len > size) {
875     m = size;           // incomplete byte sequence.
876   } else {
877     m = len;
878   }
879   for (i = 1; i < m; ++i) {
880     if ((p[i] & 0xc0) != 0x80) {
881       return 1;
882     }
883   }
884   return len;
885 }
886 
887 /// Return the number of bytes occupied by a UTF-8 character in a string
888 ///
889 /// This includes following composing characters.
utfc_ptr2len(const char_u * const p)890 int utfc_ptr2len(const char_u *const p)
891   FUNC_ATTR_PURE FUNC_ATTR_WARN_UNUSED_RESULT FUNC_ATTR_NONNULL_ALL
892 {
893   uint8_t b0 = (uint8_t)(*p);
894 
895   if (b0 == NUL) {
896     return 0;
897   }
898   if (b0 < 0x80 && p[1] < 0x80) {  // be quick for ASCII
899     return 1;
900   }
901 
902   // Skip over first UTF-8 char, stopping at a NUL byte.
903   int len = utf_ptr2len(p);
904 
905   // Check for illegal byte.
906   if (len == 1 && b0 >= 0x80) {
907     return 1;
908   }
909 
910   // Check for composing characters.  We can handle only the first six, but
911   // skip all of them (otherwise the cursor would get stuck).
912   int prevlen = 0;
913   for (;;) {
914     if (p[len] < 0x80 || !utf_composinglike(p + prevlen, p + len)) {
915       return len;
916     }
917 
918     // Skip over composing char.
919     prevlen = len;
920     len += utf_ptr2len(p + len);
921   }
922 }
923 
924 /*
925  * Return the number of bytes the UTF-8 encoding of the character at "p[size]"
926  * takes.  This includes following composing characters.
927  * Returns 0 for an empty string.
928  * Returns 1 for an illegal char or an incomplete byte sequence.
929  */
utfc_ptr2len_len(const char_u * p,int size)930 int utfc_ptr2len_len(const char_u *p, int size)
931 {
932   int len;
933   int prevlen;
934 
935   if (size < 1 || *p == NUL) {
936     return 0;
937   }
938   if (p[0] < 0x80 && (size == 1 || p[1] < 0x80)) {  // be quick for ASCII
939     return 1;
940   }
941 
942   // Skip over first UTF-8 char, stopping at a NUL byte.
943   len = utf_ptr2len_len(p, size);
944 
945   // Check for illegal byte and incomplete byte sequence.
946   if ((len == 1 && p[0] >= 0x80) || len > size) {
947     return 1;
948   }
949 
950   /*
951    * Check for composing characters.  We can handle only the first six, but
952    * skip all of them (otherwise the cursor would get stuck).
953    */
954   prevlen = 0;
955   while (len < size) {
956     int len_next_char;
957 
958     if (p[len] < 0x80) {
959       break;
960     }
961 
962     /*
963      * Next character length should not go beyond size to ensure that
964      * utf_composinglike(...) does not read beyond size.
965      */
966     len_next_char = utf_ptr2len_len(p + len, size - len);
967     if (len_next_char > size - len) {
968       break;
969     }
970 
971     if (!utf_composinglike(p + prevlen, p + len)) {
972       break;
973     }
974 
975     // Skip over composing char
976     prevlen = len;
977     len += len_next_char;
978   }
979   return len;
980 }
981 
/// Determine how many bytes the UTF-8 encoding of a codepoint occupies
///
/// @param c  Codepoint to measure.
///
/// @return Number of bytes (1-6).  Every value below 0x80, negative values
///         included, gives 1.
int utf_char2len(const int c)
{
  if (c < 0x80) {
    return 1;  // 7 bits
  }
  if (c < 0x800) {
    return 2;  // 11 bits
  }
  if (c < 0x10000) {
    return 3;  // 16 bits
  }
  if (c < 0x200000) {
    return 4;  // 21 bits
  }
  if (c < 0x4000000) {
    return 5;  // 26 bits
  }
  return 6;  // 31 bits
}
999 
1000 /// Convert Unicode character to UTF-8 string
1001 ///
1002 /// @param c character to convert to \p buf
1003 /// @param[out] buf UTF-8 string generated from \p c, does not add \0
1004 /// @return Number of bytes (1-6).
utf_char2bytes(const int c,char_u * const buf)1005 int utf_char2bytes(const int c, char_u *const buf)
1006 {
1007   if (c < 0x80) {  // 7 bits
1008     buf[0] = c;
1009     return 1;
1010   } else if (c < 0x800) {  // 11 bits
1011     buf[0] = 0xc0 + ((unsigned)c >> 6);
1012     buf[1] = 0x80 + (c & 0x3f);
1013     return 2;
1014   } else if (c < 0x10000) {  // 16 bits
1015     buf[0] = 0xe0 + ((unsigned)c >> 12);
1016     buf[1] = 0x80 + (((unsigned)c >> 6) & 0x3f);
1017     buf[2] = 0x80 + (c & 0x3f);
1018     return 3;
1019   } else if (c < 0x200000) {  // 21 bits
1020     buf[0] = 0xf0 + ((unsigned)c >> 18);
1021     buf[1] = 0x80 + (((unsigned)c >> 12) & 0x3f);
1022     buf[2] = 0x80 + (((unsigned)c >> 6) & 0x3f);
1023     buf[3] = 0x80 + (c & 0x3f);
1024     return 4;
1025   } else if (c < 0x4000000) {  // 26 bits
1026     buf[0] = 0xf8 + ((unsigned)c >> 24);
1027     buf[1] = 0x80 + (((unsigned)c >> 18) & 0x3f);
1028     buf[2] = 0x80 + (((unsigned)c >> 12) & 0x3f);
1029     buf[3] = 0x80 + (((unsigned)c >> 6) & 0x3f);
1030     buf[4] = 0x80 + (c & 0x3f);
1031     return 5;
1032   } else {  // 31 bits
1033     buf[0] = 0xfc + ((unsigned)c >> 30);
1034     buf[1] = 0x80 + (((unsigned)c >> 24) & 0x3f);
1035     buf[2] = 0x80 + (((unsigned)c >> 18) & 0x3f);
1036     buf[3] = 0x80 + (((unsigned)c >> 12) & 0x3f);
1037     buf[4] = 0x80 + (((unsigned)c >> 6) & 0x3f);
1038     buf[5] = 0x80 + (c & 0x3f);
1039     return 6;
1040   }
1041 }
1042 
1043 /*
1044  * Return true if "c" is a composing UTF-8 character.  This means it will be
1045  * drawn on top of the preceding character.
1046  * Based on code from Markus Kuhn.
1047  */
utf_iscomposing(int c)1048 bool utf_iscomposing(int c)
1049 {
1050   return intable(combining, ARRAY_SIZE(combining), c);
1051 }
1052 
1053 /*
1054  * Return true for characters that can be displayed in a normal way.
1055  * Only for characters of 0x100 and above!
1056  */
utf_printable(int c)1057 bool utf_printable(int c)
1058 {
1059 #ifdef USE_WCHAR_FUNCTIONS
1060   /*
1061    * Assume the iswprint() library function works better than our own stuff.
1062    */
1063   return iswprint(c);
1064 #else
1065   // Sorted list of non-overlapping intervals.
1066   // 0xd800-0xdfff is reserved for UTF-16, actually illegal.
1067   static struct interval nonprint[] =
1068   {
1069     { 0x070f, 0x070f }, { 0x180b, 0x180e }, { 0x200b, 0x200f }, { 0x202a, 0x202e },
1070     { 0x206a, 0x206f }, { 0xd800, 0xdfff }, { 0xfeff, 0xfeff }, { 0xfff9, 0xfffb },
1071     { 0xfffe, 0xffff }
1072   };
1073 
1074   return !intable(nonprint, ARRAY_SIZE(nonprint), c);
1075 #endif
1076 }
1077 
1078 /*
1079  * Get class of a Unicode character.
1080  * 0: white space
1081  * 1: punctuation
1082  * 2 or bigger: some class of word character.
1083  */
utf_class(const int c)1084 int utf_class(const int c)
1085 {
1086   return utf_class_tab(c, curbuf->b_chartab);
1087 }
1088 
utf_class_tab(const int c,const uint64_t * const chartab)1089 int utf_class_tab(const int c, const uint64_t *const chartab)
1090 {
1091   // sorted list of non-overlapping intervals
1092   static struct clinterval {
1093     unsigned int first;
1094     unsigned int last;
1095     unsigned int class;
1096   } classes[] = {
1097     { 0x037e, 0x037e, 1 },              // Greek question mark
1098     { 0x0387, 0x0387, 1 },              // Greek ano teleia
1099     { 0x055a, 0x055f, 1 },              // Armenian punctuation
1100     { 0x0589, 0x0589, 1 },              // Armenian full stop
1101     { 0x05be, 0x05be, 1 },
1102     { 0x05c0, 0x05c0, 1 },
1103     { 0x05c3, 0x05c3, 1 },
1104     { 0x05f3, 0x05f4, 1 },
1105     { 0x060c, 0x060c, 1 },
1106     { 0x061b, 0x061b, 1 },
1107     { 0x061f, 0x061f, 1 },
1108     { 0x066a, 0x066d, 1 },
1109     { 0x06d4, 0x06d4, 1 },
1110     { 0x0700, 0x070d, 1 },              // Syriac punctuation
1111     { 0x0964, 0x0965, 1 },
1112     { 0x0970, 0x0970, 1 },
1113     { 0x0df4, 0x0df4, 1 },
1114     { 0x0e4f, 0x0e4f, 1 },
1115     { 0x0e5a, 0x0e5b, 1 },
1116     { 0x0f04, 0x0f12, 1 },
1117     { 0x0f3a, 0x0f3d, 1 },
1118     { 0x0f85, 0x0f85, 1 },
1119     { 0x104a, 0x104f, 1 },              // Myanmar punctuation
1120     { 0x10fb, 0x10fb, 1 },              // Georgian punctuation
1121     { 0x1361, 0x1368, 1 },              // Ethiopic punctuation
1122     { 0x166d, 0x166e, 1 },              // Canadian Syl. punctuation
1123     { 0x1680, 0x1680, 0 },
1124     { 0x169b, 0x169c, 1 },
1125     { 0x16eb, 0x16ed, 1 },
1126     { 0x1735, 0x1736, 1 },
1127     { 0x17d4, 0x17dc, 1 },              // Khmer punctuation
1128     { 0x1800, 0x180a, 1 },              // Mongolian punctuation
1129     { 0x2000, 0x200b, 0 },              // spaces
1130     { 0x200c, 0x2027, 1 },              // punctuation and symbols
1131     { 0x2028, 0x2029, 0 },
1132     { 0x202a, 0x202e, 1 },              // punctuation and symbols
1133     { 0x202f, 0x202f, 0 },
1134     { 0x2030, 0x205e, 1 },              // punctuation and symbols
1135     { 0x205f, 0x205f, 0 },
1136     { 0x2060, 0x27ff, 1 },              // punctuation and symbols
1137     { 0x2070, 0x207f, 0x2070 },         // superscript
1138     { 0x2080, 0x2094, 0x2080 },         // subscript
1139     { 0x20a0, 0x27ff, 1 },              // all kinds of symbols
1140     { 0x2800, 0x28ff, 0x2800 },         // braille
1141     { 0x2900, 0x2998, 1 },              // arrows, brackets, etc.
1142     { 0x29d8, 0x29db, 1 },
1143     { 0x29fc, 0x29fd, 1 },
1144     { 0x2e00, 0x2e7f, 1 },              // supplemental punctuation
1145     { 0x3000, 0x3000, 0 },              // ideographic space
1146     { 0x3001, 0x3020, 1 },              // ideographic punctuation
1147     { 0x3030, 0x3030, 1 },
1148     { 0x303d, 0x303d, 1 },
1149     { 0x3040, 0x309f, 0x3040 },         // Hiragana
1150     { 0x30a0, 0x30ff, 0x30a0 },         // Katakana
1151     { 0x3300, 0x9fff, 0x4e00 },         // CJK Ideographs
1152     { 0xac00, 0xd7a3, 0xac00 },         // Hangul Syllables
1153     { 0xf900, 0xfaff, 0x4e00 },         // CJK Ideographs
1154     { 0xfd3e, 0xfd3f, 1 },
1155     { 0xfe30, 0xfe6b, 1 },              // punctuation forms
1156     { 0xff00, 0xff0f, 1 },              // half/fullwidth ASCII
1157     { 0xff1a, 0xff20, 1 },              // half/fullwidth ASCII
1158     { 0xff3b, 0xff40, 1 },              // half/fullwidth ASCII
1159     { 0xff5b, 0xff65, 1 },              // half/fullwidth ASCII
1160     { 0x1d000, 0x1d24f, 1 },            // Musical notation
1161     { 0x1d400, 0x1d7ff, 1 },            // Mathematical Alphanumeric Symbols
1162     { 0x1f000, 0x1f2ff, 1 },            // Game pieces; enclosed characters
1163     { 0x1f300, 0x1f9ff, 1 },            // Many symbol blocks
1164     { 0x20000, 0x2a6df, 0x4e00 },       // CJK Ideographs
1165     { 0x2a700, 0x2b73f, 0x4e00 },       // CJK Ideographs
1166     { 0x2b740, 0x2b81f, 0x4e00 },       // CJK Ideographs
1167     { 0x2f800, 0x2fa1f, 0x4e00 },       // CJK Ideographs
1168   };
1169   int bot = 0;
1170   int top = ARRAY_SIZE(classes) - 1;
1171   int mid;
1172 
1173   // First quick check for Latin1 characters, use 'iskeyword'.
1174   if (c < 0x100) {
1175     if (c == ' ' || c == '\t' || c == NUL || c == 0xa0) {
1176       return 0;             // blank
1177     }
1178     if (vim_iswordc_tab(c, chartab)) {
1179       return 2;             // word character
1180     }
1181     return 1;               // punctuation
1182   }
1183 
1184   // binary search in table
1185   while (top >= bot) {
1186     mid = (bot + top) / 2;
1187     if (classes[mid].last < (unsigned int)c) {
1188       bot = mid + 1;
1189     } else if (classes[mid].first > (unsigned int)c) {
1190       top = mid - 1;
1191     } else {
1192       return (int)classes[mid].class;
1193     }
1194   }
1195 
1196   // emoji
1197   if (intable(emoji_all, ARRAY_SIZE(emoji_all), c)) {
1198     return 3;
1199   }
1200 
1201   // most other characters are "word" characters
1202   return 2;
1203 }
1204 
utf_ambiguous_width(int c)1205 bool utf_ambiguous_width(int c)
1206 {
1207   return c >= 0x80 && (intable(ambiguous, ARRAY_SIZE(ambiguous), c)
1208                        || intable(emoji_all, ARRAY_SIZE(emoji_all), c));
1209 }
1210 
1211 /*
1212  * Generic conversion function for case operations.
1213  * Return the converted equivalent of "a", which is a UCS-4 character.  Use
1214  * the given conversion "table".  Uses binary search on "table".
1215  */
utf_convert(int a,const convertStruct * const table,size_t n_items)1216 static int utf_convert(int a, const convertStruct *const table, size_t n_items)
1217 {
1218   size_t start, mid, end;   // indices into table
1219 
1220   start = 0;
1221   end = n_items;
1222   while (start < end) {
1223     // need to search further
1224     mid = (end + start) / 2;
1225     if (table[mid].rangeEnd < a) {
1226       start = mid + 1;
1227     } else {
1228       end = mid;
1229     }
1230   }
1231   if (start < n_items
1232       && table[start].rangeStart <= a
1233       && a <= table[start].rangeEnd
1234       && (a - table[start].rangeStart) % table[start].step == 0) {
1235     return a + table[start].offset;
1236   } else {
1237     return a;
1238   }
1239 }
1240 
1241 /*
1242  * Return the folded-case equivalent of "a", which is a UCS-4 character.  Uses
1243  * simple case folding.
1244  */
utf_fold(int a)1245 int utf_fold(int a)
1246 {
1247   if (a < 0x80) {
1248     // be fast for ASCII
1249     return a >= 0x41 && a <= 0x5a ? a + 32 : a;
1250   }
1251   return utf_convert(a, foldCase, ARRAY_SIZE(foldCase));
1252 }
1253 
1254 // Vim's own character class functions.  These exist because many library
1255 // islower()/toupper() etc. do not work properly: they crash when used with
1256 // invalid values or can't handle latin1 when the locale is C.
1257 // Speed is most important here.
1258 
1259 /// Return the upper-case equivalent of "a", which is a UCS-4 character.  Use
1260 /// simple case folding.
mb_toupper(int a)1261 int mb_toupper(int a)
1262 {
1263   // If 'casemap' contains "keepascii" use ASCII style toupper().
1264   if (a < 128 && (cmp_flags & CMP_KEEPASCII)) {
1265     return TOUPPER_ASC(a);
1266   }
1267 
1268 #if defined(__STDC_ISO_10646__)
1269   // If towupper() is available and handles Unicode, use it.
1270   if (!(cmp_flags & CMP_INTERNAL)) {
1271     return towupper(a);
1272   }
1273 #endif
1274 
1275   // For characters below 128 use locale sensitive toupper().
1276   if (a < 128) {
1277     return TOUPPER_LOC(a);
1278   }
1279 
1280   // For any other characters use the above mapping table.
1281   return utf_convert(a, toUpper, ARRAY_SIZE(toUpper));
1282 }
1283 
/// Check whether "a" is a lower-case character.
///
/// @return true when mb_toupper() changes "a", and for 0xdf.
bool mb_islower(int a)
{
  // German sharp s is lower case but has no upper case equivalent.
  if (a == 0xdf) {
    return true;
  }
  return mb_toupper(a) != a;
}
1289 
1290 /// Return the lower-case equivalent of "a", which is a UCS-4 character.  Use
1291 /// simple case folding.
mb_tolower(int a)1292 int mb_tolower(int a)
1293 {
1294   // If 'casemap' contains "keepascii" use ASCII style tolower().
1295   if (a < 128 && (cmp_flags & CMP_KEEPASCII)) {
1296     return TOLOWER_ASC(a);
1297   }
1298 
1299 #if defined(__STDC_ISO_10646__)
1300   // If towlower() is available and handles Unicode, use it.
1301   if (!(cmp_flags & CMP_INTERNAL)) {
1302     return towlower(a);
1303   }
1304 #endif
1305 
1306   // For characters below 128 use locale sensitive tolower().
1307   if (a < 128) {
1308     return TOLOWER_LOC(a);
1309   }
1310 
1311   // For any other characters use the above mapping table.
1312   return utf_convert(a, toLower, ARRAY_SIZE(toLower));
1313 }
1314 
/// Check whether "a" is an upper-case character.
///
/// @return true when mb_tolower() changes "a".
bool mb_isupper(int a)
{
  const int lowered = mb_tolower(a);
  return lowered != a;
}
1319 
utf_strnicmp(const char_u * s1,const char_u * s2,size_t n1,size_t n2)1320 static int utf_strnicmp(const char_u *s1, const char_u *s2, size_t n1, size_t n2)
1321 {
1322   int c1, c2, cdiff;
1323   char_u buffer[6];
1324 
1325   for (;;) {
1326     c1 = utf_safe_read_char_adv(&s1, &n1);
1327     c2 = utf_safe_read_char_adv(&s2, &n2);
1328 
1329     if (c1 <= 0 || c2 <= 0) {
1330       break;
1331     }
1332 
1333     if (c1 == c2) {
1334       continue;
1335     }
1336 
1337     cdiff = utf_fold(c1) - utf_fold(c2);
1338     if (cdiff != 0) {
1339       return cdiff;
1340     }
1341   }
1342 
1343   // some string ended or has an incomplete/illegal character sequence
1344 
1345   if (c1 == 0 || c2 == 0) {
1346     // some string ended. shorter string is smaller
1347     if (c1 == 0 && c2 == 0) {
1348       return 0;
1349     }
1350     return c1 == 0 ? -1 : 1;
1351   }
1352 
1353   // Continue with bytewise comparison to produce some result that
1354   // would make comparison operations involving this function transitive.
1355   //
1356   // If only one string had an error, comparison should be made with
1357   // folded version of the other string. In this case it is enough
1358   // to fold just one character to determine the result of comparison.
1359 
1360   if (c1 != -1 && c2 == -1) {
1361     n1 = utf_char2bytes(utf_fold(c1), buffer);
1362     s1 = buffer;
1363   } else if (c2 != -1 && c1 == -1) {
1364     n2 = utf_char2bytes(utf_fold(c2), buffer);
1365     s2 = buffer;
1366   }
1367 
1368   while (n1 > 0 && n2 > 0 && *s1 != NUL && *s2 != NUL) {
1369     cdiff = (int)(*s1) - (int)(*s2);
1370     if (cdiff != 0) {
1371       return cdiff;
1372     }
1373 
1374     s1++;
1375     s2++;
1376     n1--;
1377     n2--;
1378   }
1379 
1380   if (n1 > 0 && *s1 == NUL) {
1381     n1 = 0;
1382   }
1383   if (n2 > 0 && *s2 == NUL) {
1384     n2 = 0;
1385   }
1386 
1387   if (n1 == 0 && n2 == 0) {
1388     return 0;
1389   }
1390   return n1 == 0 ? -1 : 1;
1391 }
1392 
1393 #ifdef WIN32
1394 # ifndef CP_UTF8
1395 #  define CP_UTF8 65001  // magic number from winnls.h
1396 # endif
1397 
1398 /// Converts string from UTF-8 to UTF-16.
1399 ///
1400 /// @param utf8  UTF-8 string.
1401 /// @param utf8len  Length of `utf8`. May be -1 if `utf8` is NUL-terminated.
1402 /// @param utf16[out,allocated]  NUL-terminated UTF-16 string, or NULL on error
1403 /// @return 0 on success, or libuv error code
utf8_to_utf16(const char * utf8,int utf8len,wchar_t ** utf16)1404 int utf8_to_utf16(const char *utf8, int utf8len, wchar_t **utf16)
1405   FUNC_ATTR_NONNULL_ALL
1406 {
1407   // Compute the length needed for the converted UTF-16 string.
1408   int bufsize = MultiByteToWideChar(CP_UTF8,
1409                                     0,     // dwFlags: must be 0 for UTF-8
1410                                     utf8,  // -1: process up to NUL
1411                                     utf8len,
1412                                     NULL,
1413                                     0);    // 0: get length, don't convert
1414   if (bufsize == 0) {
1415     *utf16 = NULL;
1416     return uv_translate_sys_error(GetLastError());
1417   }
1418 
1419   // Allocate the destination buffer adding an extra byte for the terminating
1420   // NULL. If `utf8len` is not -1 MultiByteToWideChar will not add it, so
1421   // we do it ourselves always, just in case.
1422   *utf16 = xmalloc(sizeof(wchar_t) * (bufsize + 1));
1423 
1424   // Convert to UTF-16.
1425   bufsize = MultiByteToWideChar(CP_UTF8, 0, utf8, utf8len, *utf16, bufsize);
1426   if (bufsize == 0) {
1427     XFREE_CLEAR(*utf16);
1428     return uv_translate_sys_error(GetLastError());
1429   }
1430 
1431   (*utf16)[bufsize] = L'\0';
1432   return 0;
1433 }
1434 
1435 /// Converts string from UTF-16 to UTF-8.
1436 ///
1437 /// @param utf16  UTF-16 string.
1438 /// @param utf16len  Length of `utf16`. May be -1 if `utf16` is NUL-terminated.
1439 /// @param utf8[out,allocated]  NUL-terminated UTF-8 string, or NULL on error
1440 /// @return 0 on success, or libuv error code
utf16_to_utf8(const wchar_t * utf16,int utf16len,char ** utf8)1441 int utf16_to_utf8(const wchar_t *utf16, int utf16len, char **utf8)
1442   FUNC_ATTR_NONNULL_ALL
1443 {
1444   // Compute the space needed for the converted UTF-8 string.
1445   DWORD bufsize = WideCharToMultiByte(CP_UTF8,
1446                                       0,
1447                                       utf16,
1448                                       utf16len,
1449                                       NULL,
1450                                       0,
1451                                       NULL,
1452                                       NULL);
1453   if (bufsize == 0) {
1454     *utf8 = NULL;
1455     return uv_translate_sys_error(GetLastError());
1456   }
1457 
1458   // Allocate the destination buffer adding an extra byte for the terminating
1459   // NULL. If `utf16len` is not -1 WideCharToMultiByte will not add it, so
1460   // we do it ourselves always, just in case.
1461   *utf8 = xmalloc(bufsize + 1);
1462 
1463   // Convert to UTF-8.
1464   bufsize = WideCharToMultiByte(CP_UTF8,
1465                                 0,
1466                                 utf16,
1467                                 utf16len,
1468                                 *utf8,
1469                                 bufsize,
1470                                 NULL,
1471                                 NULL);
1472   if (bufsize == 0) {
1473     XFREE_CLEAR(*utf8);
1474     return uv_translate_sys_error(GetLastError());
1475   }
1476 
1477   (*utf8)[bufsize] = '\0';
1478   return 0;
1479 }
1480 
1481 #endif
1482 
1483 /// Measure the length of a string in corresponding UTF-32 and UTF-16 units.
1484 ///
1485 /// Invalid UTF-8 bytes, or embedded surrogates, count as one code point/unit
1486 /// each.
1487 ///
1488 /// The out parameters are incremented. This is used to measure the size of
1489 /// a buffer region consisting of multiple line segments.
1490 ///
1491 /// @param s the string
1492 /// @param len maximum length (an earlier NUL terminates)
1493 /// @param[out] codepoints incremented with UTF-32 code point size
1494 /// @param[out] codeunits incremented with UTF-16 code unit size
mb_utflen(const char_u * s,size_t len,size_t * codepoints,size_t * codeunits)1495 void mb_utflen(const char_u *s, size_t len, size_t *codepoints, size_t *codeunits)
1496   FUNC_ATTR_NONNULL_ALL
1497 {
1498   size_t count = 0, extra = 0;
1499   size_t clen;
1500   for (size_t i = 0; i < len && s[i] != NUL; i += clen) {
1501     clen = utf_ptr2len_len(s+i, len-i);
1502     // NB: gets the byte value of invalid sequence bytes.
1503     // we only care whether the char fits in the BMP or not
1504     int c = (clen > 1) ? utf_ptr2char(s+i) : s[i];
1505     count++;
1506     if (c > 0xFFFF) {
1507       extra++;
1508     }
1509   }
1510   *codepoints += count;
1511   *codeunits += count + extra;
1512 }
1513 
mb_utf_index_to_bytes(const char_u * s,size_t len,size_t index,bool use_utf16_units)1514 ssize_t mb_utf_index_to_bytes(const char_u *s, size_t len, size_t index, bool use_utf16_units)
1515   FUNC_ATTR_NONNULL_ALL
1516 {
1517   size_t count = 0;
1518   size_t clen, i;
1519   if (index == 0) {
1520     return 0;
1521   }
1522   for (i = 0; i < len && s[i] != NUL; i += clen) {
1523     clen = utf_ptr2len_len(s+i, len-i);
1524     // NB: gets the byte value of invalid sequence bytes.
1525     // we only care whether the char fits in the BMP or not
1526     int c = (clen > 1) ? utf_ptr2char(s+i) : s[i];
1527     count++;
1528     if (use_utf16_units && c > 0xFFFF) {
1529       count++;
1530     }
1531     if (count >= index) {
1532       return i+clen;
1533     }
1534   }
1535   return -1;
1536 }
1537 
1538 
1539 /*
1540  * Version of strnicmp() that handles multi-byte characters.
1541  * Needed for Big5, Shift-JIS and UTF-8 encoding.  Other DBCS encodings can
1542  * probably use strnicmp(), because there are no ASCII characters in the
1543  * second byte.
1544  * Returns zero if s1 and s2 are equal (ignoring case), the difference between
1545  * two characters otherwise.
1546  */
mb_strnicmp(const char_u * s1,const char_u * s2,const size_t nn)1547 int mb_strnicmp(const char_u *s1, const char_u *s2, const size_t nn)
1548 {
1549   return utf_strnicmp(s1, s2, nn, nn);
1550 }
1551 
1552 /// Compare strings case-insensitively
1553 ///
1554 /// @note We need to call mb_stricmp() even when we aren't dealing with
1555 ///       a multi-byte encoding because mb_stricmp() takes care of all ASCII and
1556 ///       non-ascii encodings, including characters with umlauts in latin1,
1557 ///       etc., while STRICMP() only handles the system locale version, which
1558 ///       often does not handle non-ascii properly.
1559 ///
1560 /// @param[in]  s1  First string to compare, not more then #MAXCOL characters.
1561 /// @param[in]  s2  Second string to compare, not more then #MAXCOL characters.
1562 ///
1563 /// @return 0 if strings are equal, <0 if s1 < s2, >0 if s1 > s2.
mb_stricmp(const char * s1,const char * s2)1564 int mb_stricmp(const char *s1, const char *s2)
1565 {
1566   return mb_strnicmp((const char_u *)s1, (const char_u *)s2, MAXCOL);
1567 }
1568 
1569 /*
1570  * "g8": show bytes of the UTF-8 char under the cursor.  Doesn't matter what
1571  * 'encoding' has been set to.
1572  */
show_utf8(void)1573 void show_utf8(void)
1574 {
1575   int len;
1576   int rlen = 0;
1577   char_u *line;
1578   int clen;
1579   int i;
1580 
1581   // Get the byte length of the char under the cursor, including composing
1582   // characters.
1583   line = get_cursor_pos_ptr();
1584   len = utfc_ptr2len(line);
1585   if (len == 0) {
1586     msg("NUL");
1587     return;
1588   }
1589 
1590   clen = 0;
1591   for (i = 0; i < len; ++i) {
1592     if (clen == 0) {
1593       // start of (composing) character, get its length
1594       if (i > 0) {
1595         STRCPY(IObuff + rlen, "+ ");
1596         rlen += 2;
1597       }
1598       clen = utf_ptr2len(line + i);
1599     }
1600     sprintf((char *)IObuff + rlen, "%02x ",
1601             (line[i] == NL) ? NUL : line[i]);          // NUL is stored as NL
1602     --clen;
1603     rlen += (int)STRLEN(IObuff + rlen);
1604     if (rlen > IOSIZE - 20) {
1605       break;
1606     }
1607   }
1608 
1609   msg((char *)IObuff);
1610 }
1611 
1612 /// Return offset from "p" to the first byte of the character it points into.
1613 /// If "p" points to the NUL at the end of the string return 0.
1614 /// Returns 0 when already at the first byte of a character.
utf_head_off(const char_u * base,const char_u * p)1615 int utf_head_off(const char_u *base, const char_u *p)
1616 {
1617   int c;
1618   int len;
1619 
1620   if (*p < 0x80) {              // be quick for ASCII
1621     return 0;
1622   }
1623 
1624   // Skip backwards over trailing bytes: 10xx.xxxx
1625   // Skip backwards again if on a composing char.
1626   const char_u *q;
1627   for (q = p;; --q) {
1628     // Move s to the last byte of this char.
1629     const char_u *s;
1630     for (s = q; (s[1] & 0xc0) == 0x80; ++s) {}
1631 
1632     // Move q to the first byte of this char.
1633     while (q > base && (*q & 0xc0) == 0x80) {
1634       --q;
1635     }
1636     // Check for illegal sequence. Do allow an illegal byte after where we
1637     // started.
1638     len = utf8len_tab[*q];
1639     if (len != (int)(s - q + 1) && len != (int)(p - q + 1)) {
1640       return 0;
1641     }
1642 
1643     if (q <= base) {
1644       break;
1645     }
1646 
1647     c = utf_ptr2char(q);
1648     if (utf_iscomposing(c)) {
1649       continue;
1650     }
1651 
1652     if (arabic_maycombine(c)) {
1653       // Advance to get a sneak-peak at the next char
1654       const char_u *j = q;
1655       --j;
1656       // Move j to the first byte of this char.
1657       while (j > base && (*j & 0xc0) == 0x80) {
1658         --j;
1659       }
1660       if (arabic_combine(utf_ptr2char(j), c)) {
1661         continue;
1662       }
1663     }
1664     break;
1665   }
1666 
1667   return (int)(p - q);
1668 }
1669 
1670 // Whether space is NOT allowed before/after 'c'.
utf_eat_space(int cc)1671 bool utf_eat_space(int cc)
1672   FUNC_ATTR_CONST FUNC_ATTR_WARN_UNUSED_RESULT
1673 {
1674   return (cc >= 0x2000 && cc <= 0x206F)   // General punctuations
1675          || (cc >= 0x2e00 && cc <= 0x2e7f)   // Supplemental punctuations
1676          || (cc >= 0x3000 && cc <= 0x303f)   // CJK symbols and punctuations
1677          || (cc >= 0xff01 && cc <= 0xff0f)   // Full width ASCII punctuations
1678          || (cc >= 0xff1a && cc <= 0xff20)   // ..
1679          || (cc >= 0xff3b && cc <= 0xff40)   // ..
1680          || (cc >= 0xff5b && cc <= 0xff65);  // ..
1681 }
1682 
1683 // Whether line break is allowed before "cc".
utf_allow_break_before(int cc)1684 bool utf_allow_break_before(int cc)
1685   FUNC_ATTR_CONST FUNC_ATTR_WARN_UNUSED_RESULT
1686 {
1687   static const int BOL_prohibition_punct[] = {
1688     '!',
1689     '%',
1690     ')',
1691     ',',
1692     ':',
1693     ';',
1694     '>',
1695     '?',
1696     ']',
1697     '}',
1698     0x2019,  // ’ right single quotation mark
1699     0x201d,  // ” right double quotation mark
1700     0x2020,  // † dagger
1701     0x2021,  // ‡ double dagger
1702     0x2026,  // … horizontal ellipsis
1703     0x2030,  // ‰ per mille sign
1704     0x2031,  // ‱ per then thousand sign
1705     0x203c,  // ‼ double exclamation mark
1706     0x2047,  // ⁇ double question mark
1707     0x2048,  // ⁈ question exclamation mark
1708     0x2049,  // ⁉ exclamation question mark
1709     0x2103,  // ℃ degree celsius
1710     0x2109,  // ℉ degree fahrenheit
1711     0x3001,  // 、 ideographic comma
1712     0x3002,  // 。 ideographic full stop
1713     0x3009,  // 〉 right angle bracket
1714     0x300b,  // 》 right double angle bracket
1715     0x300d,  // 」 right corner bracket
1716     0x300f,  // 』 right white corner bracket
1717     0x3011,  // 】 right black lenticular bracket
1718     0x3015,  // 〕 right tortoise shell bracket
1719     0x3017,  // 〗 right white lenticular bracket
1720     0x3019,  // 〙 right white tortoise shell bracket
1721     0x301b,  // 〛 right white square bracket
1722     0xff01,  // ! fullwidth exclamation mark
1723     0xff09,  // ) fullwidth right parenthesis
1724     0xff0c,  // , fullwidth comma
1725     0xff0e,  // . fullwidth full stop
1726     0xff1a,  // : fullwidth colon
1727     0xff1b,  // ; fullwidth semicolon
1728     0xff1f,  // ? fullwidth question mark
1729     0xff3d,  // ] fullwidth right square bracket
1730     0xff5d,  // } fullwidth right curly bracket
1731   };
1732 
1733   int first = 0;
1734   int last = ARRAY_SIZE(BOL_prohibition_punct) - 1;
1735 
1736   while (first < last) {
1737     const int mid = (first + last) / 2;
1738 
1739     if (cc == BOL_prohibition_punct[mid]) {
1740       return false;
1741     } else if (cc > BOL_prohibition_punct[mid]) {
1742       first = mid + 1;
1743     } else {
1744       last = mid - 1;
1745     }
1746   }
1747 
1748   return cc != BOL_prohibition_punct[first];
1749 }
1750 
1751 // Whether line break is allowed after "cc".
utf_allow_break_after(int cc)1752 bool utf_allow_break_after(int cc)
1753   FUNC_ATTR_CONST FUNC_ATTR_WARN_UNUSED_RESULT
1754 {
1755   static const int EOL_prohibition_punct[] = {
1756     '(',
1757     '<',
1758     '[',
1759     '`',
1760     '{',
1761     // 0x2014,  // — em dash
1762     0x2018,     // ‘ left single quotation mark
1763     0x201c,     // “ left double quotation mark
1764     // 0x2053,  // ~ swung dash
1765     0x3008,     // 〈 left angle bracket
1766     0x300a,     // 《 left double angle bracket
1767     0x300c,     // 「 left corner bracket
1768     0x300e,     // 『 left white corner bracket
1769     0x3010,     // 【 left black lenticular bracket
1770     0x3014,     // 〔 left tortoise shell bracket
1771     0x3016,     // 〖 left white lenticular bracket
1772     0x3018,     // 〘 left white tortoise shell bracket
1773     0x301a,     // 〚 left white square bracket
1774     0xff08,     // ( fullwidth left parenthesis
1775     0xff3b,     // [ fullwidth left square bracket
1776     0xff5b,     // { fullwidth left curly bracket
1777   };
1778 
1779   int first = 0;
1780   int last = ARRAY_SIZE(EOL_prohibition_punct) - 1;
1781 
1782   while (first < last) {
1783     const int mid = (first + last)/2;
1784 
1785     if (cc == EOL_prohibition_punct[mid]) {
1786       return false;
1787     } else if (cc > EOL_prohibition_punct[mid]) {
1788       first = mid + 1;
1789     } else {
1790       last = mid - 1;
1791     }
1792   }
1793 
1794   return cc != EOL_prohibition_punct[first];
1795 }
1796 
1797 // Whether line break is allowed between "cc" and "ncc".
utf_allow_break(int cc,int ncc)1798 bool utf_allow_break(int cc, int ncc)
1799   FUNC_ATTR_CONST FUNC_ATTR_WARN_UNUSED_RESULT
1800 {
1801   // don't break between two-letter punctuations
1802   if (cc == ncc
1803       && (cc == 0x2014         // em dash
1804           || cc == 0x2026)) {  // horizontal ellipsis
1805     return false;
1806   }
1807   return utf_allow_break_after(cc) && utf_allow_break_before(ncc);
1808 }
1809 
1810 /// Copy a character, advancing the pointers
1811 ///
1812 /// @param[in,out]  fp  Source of the character to copy.
1813 /// @param[in,out]  tp  Destination to copy to.
mb_copy_char(const char_u ** const fp,char_u ** const tp)1814 void mb_copy_char(const char_u **const fp, char_u **const tp)
1815 {
1816   const size_t l = (size_t)utfc_ptr2len(*fp);
1817 
1818   memmove(*tp, *fp, l);
1819   *tp += l;
1820   *fp += l;
1821 }
1822 
1823 /*
1824  * Return the offset from "p" to the first byte of a character.  When "p" is
1825  * at the start of a character 0 is returned, otherwise the offset to the next
1826  * character.  Can start anywhere in a stream of bytes.
1827  */
mb_off_next(char_u * base,char_u * p)1828 int mb_off_next(char_u *base, char_u *p)
1829 {
1830   int i;
1831   int j;
1832 
1833   if (*p < 0x80) {              // be quick for ASCII
1834     return 0;
1835   }
1836 
1837   // Find the next character that isn't 10xx.xxxx
1838   for (i = 0; (p[i] & 0xc0) == 0x80; i++) {}
1839   if (i > 0) {
1840     // Check for illegal sequence.
1841     for (j = 0; p - j > base; j++) {
1842       if ((p[-j] & 0xc0) != 0x80) {
1843         break;
1844       }
1845     }
1846     if (utf8len_tab[p[-j]] != i + j) {
1847       return 0;
1848     }
1849   }
1850   return i;
1851 }
1852 
1853 /*
1854  * Return the offset from "p" to the last byte of the character it points
1855  * into.  Can start anywhere in a stream of bytes.
1856  */
mb_tail_off(char_u * base,char_u * p)1857 int mb_tail_off(char_u *base, char_u *p)
1858 {
1859   int i;
1860   int j;
1861 
1862   if (*p == NUL) {
1863     return 0;
1864   }
1865 
1866   // Find the last character that is 10xx.xxxx
1867   for (i = 0; (p[i + 1] & 0xc0) == 0x80; i++) {}
1868 
1869   // Check for illegal sequence.
1870   for (j = 0; p - j > base; j++) {
1871     if ((p[-j] & 0xc0) != 0x80) {
1872       break;
1873     }
1874   }
1875 
1876   if (utf8len_tab[p[-j]] != i + j + 1) {
1877     return 0;
1878   }
1879   return i;
1880 }
1881 
1882 
1883 /// Return the offset from "p" to the first byte of the character it points
1884 /// into. Can start anywhere in a stream of bytes.
1885 ///
1886 /// @param[in] base  Pointer to start of string
1887 /// @param[in] p     Pointer to byte for which to return the offset to the previous codepoint
1888 //
1889 /// @return 0 if invalid sequence, else offset to previous codepoint
mb_head_off(char_u * base,char_u * p)1890 int mb_head_off(char_u *base, char_u *p)
1891 {
1892   int i;
1893   int j;
1894 
1895   if (*p == NUL) {
1896     return 0;
1897   }
1898 
1899   // Find the first character that is not 10xx.xxxx
1900   for (i = 0; p - i > base; i--) {
1901     if ((p[i] & 0xc0) != 0x80) {
1902       break;
1903     }
1904   }
1905 
1906   // Find the last character that is 10xx.xxxx
1907   for (j = 0; (p[j + 1] & 0xc0) == 0x80; j++) {}
1908 
1909   // Check for illegal sequence.
1910   if (utf8len_tab[p[i]] == 1) {
1911     return 0;
1912   }
1913   return i;
1914 }
1915 
/// Find the next illegal byte sequence and put the cursor on it.
///
/// The search starts at the cursor position in the current window and
/// continues line by line to the last line of the current buffer.  When
/// nothing is found the cursor is restored and beep_flush() is called.
void utf_find_illegal(void)
{
  pos_T pos = curwin->w_cursor;  // saved to restore the cursor on failure
  char_u *p;
  int len;
  vimconv_T vimconv;
  char_u *tofree = NULL;  // converted copy of the text being scanned, if any

  vimconv.vc_type = CONV_NONE;
  if (enc_canon_props(curbuf->b_p_fenc) & ENC_8BIT) {
    // 'encoding' is "utf-8" but we are editing a 8-bit encoded file,
    // possibly a utf-8 file with illegal bytes.  Setup for conversion
    // from utf-8 to 'fileencoding'.
    convert_setup(&vimconv, p_enc, curbuf->b_p_fenc);
  }

  curwin->w_cursor.coladd = 0;
  for (;;) {
    // Scan from the cursor to the end of the current line.
    p = get_cursor_pos_ptr();
    if (vimconv.vc_type != CONV_NONE) {
      // Scan the converted text instead of the buffer text.
      xfree(tofree);
      tofree = string_convert(&vimconv, p, NULL);
      if (tofree == NULL) {
        break;
      }
      p = tofree;
    }

    while (*p != NUL) {
      // Illegal means that there are not enough trail bytes (checked by
      // utf_ptr2len()) or too many of them (overlong sequence).
      len = utf_ptr2len(p);
      if (*p >= 0x80 && (len == 1
                         || utf_char2len(utf_ptr2char(p)) != len)) {
        if (vimconv.vc_type == CONV_NONE) {
          // "p" points into the buffer line: advance by the byte distance.
          curwin->w_cursor.col += (colnr_T)(p - get_cursor_pos_ptr());
        } else {
          int l;

          // "p" points into the converted copy.  Its byte offset is used as
          // a character count and the cursor is advanced over that many
          // characters of the buffer line (presumably one converted byte
          // corresponds to one buffer character for an 8-bit encoding).
          len = (int)(p - tofree);
          for (p = get_cursor_pos_ptr(); *p != NUL && len-- > 0; p += l) {
            l = utf_ptr2len(p);
            curwin->w_cursor.col += l;
          }
        }
        goto theend;
      }
      p += len;
    }
    if (curwin->w_cursor.lnum == curbuf->b_ml.ml_line_count) {
      break;
    }
    // Continue at the start of the next line.
    ++curwin->w_cursor.lnum;
    curwin->w_cursor.col = 0;
  }

  // didn't find it: don't move and beep
  curwin->w_cursor = pos;
  beep_flush();

theend:
  // Reached on both success and failure: release the converted copy and
  // clean up the conversion setup.
  xfree(tofree);
  convert_setup(&vimconv, NULL, NULL);
}
1983 
/// If the cursor is on a trail byte, set the cursor on the lead byte.
/// Thus it moves left if necessary.
///
/// Delegates to mark_mb_adjustpos() with the current buffer and the cursor
/// of the current window.
void mb_adjust_cursor(void)
{
  mark_mb_adjustpos(curbuf, &curwin->w_cursor);
}
1992 
1993 /// Checks and adjusts cursor column. Not mode-dependent.
1994 /// @see check_cursor_col_win
1995 ///
1996 /// @param  win_  Places cursor on a valid column for this window.
mb_check_adjust_col(void * win_)1997 void mb_check_adjust_col(void *win_)
1998 {
1999   win_T *win = (win_T *)win_;
2000   colnr_T oldcol = win->w_cursor.col;
2001 
2002   // Column 0 is always valid.
2003   if (oldcol != 0) {
2004     char_u *p = ml_get_buf(win->w_buffer, win->w_cursor.lnum, false);
2005     colnr_T len = (colnr_T)STRLEN(p);
2006 
2007     // Empty line or invalid column?
2008     if (len == 0 || oldcol < 0) {
2009       win->w_cursor.col = 0;
2010     } else {
2011       // Cursor column too big for line?
2012       if (oldcol > len) {
2013         win->w_cursor.col = len - 1;
2014       }
2015       // Move the cursor to the head byte.
2016       win->w_cursor.col -= utf_head_off(p, p + win->w_cursor.col);
2017     }
2018 
2019     // Reset `coladd` when the cursor would be on the right half of a
2020     // double-wide character.
2021     if (win->w_cursor.coladd == 1 && p[win->w_cursor.col] != TAB
2022         && vim_isprintc(utf_ptr2char(p + win->w_cursor.col))
2023         && ptr2cells(p + win->w_cursor.col) > 1) {
2024       win->w_cursor.coladd = 0;
2025     }
2026   }
2027 }
2028 
2029 /// @param line  start of the string
2030 ///
2031 /// @return      a pointer to the character before "*p", if there is one.
mb_prevptr(char_u * line,char_u * p)2032 char_u *mb_prevptr(char_u *line, char_u *p)
2033 {
2034   if (p > line) {
2035     MB_PTR_BACK(line, p);
2036   }
2037   return p;
2038 }
2039 
2040 /*
2041  * Return the character length of "str".  Each multi-byte character (with
2042  * following composing characters) counts as one.
2043  */
mb_charlen(char_u * str)2044 int mb_charlen(char_u *str)
2045 {
2046   char_u *p = str;
2047   int count;
2048 
2049   if (p == NULL) {
2050     return 0;
2051   }
2052 
2053   for (count = 0; *p != NUL; count++) {
2054     p += utfc_ptr2len(p);
2055   }
2056 
2057   return count;
2058 }
2059 
2060 /*
2061  * Like mb_charlen() but for a string with specified length.
2062  */
mb_charlen_len(char_u * str,int len)2063 int mb_charlen_len(char_u *str, int len)
2064 {
2065   char_u *p = str;
2066   int count;
2067 
2068   for (count = 0; *p != NUL && p < str + len; count++) {
2069     p += utfc_ptr2len(p);
2070   }
2071 
2072   return count;
2073 }
2074 
/// Try to unescape a multibyte character
///
/// Used for the rhs and lhs of the mappings.
///
/// @param[in,out]  pp  String to unescape. Is advanced to just after the bytes
///                     that form a multibyte character. Left untouched when
///                     NULL is returned.
///
/// @return Unescaped string if it is a multibyte character, NULL if no
///         multibyte character was found. Returns a static buffer, always one
///         and the same.
const char *mb_unescape(const char **const pp)
  FUNC_ATTR_WARN_UNUSED_RESULT FUNC_ATTR_NONNULL_ALL
{
  static char buf[6];  // result buffer, overwritten by every call
  size_t buf_idx = 0;  // number of unescaped bytes collected in "buf"
  uint8_t *str = (uint8_t *)(*pp);

  // Must translate K_SPECIAL KS_SPECIAL KE_FILLER to K_SPECIAL and CSI
  // KS_EXTRA KE_CSI to CSI.
  // Maximum length of a utf-8 character is 4 bytes.
  for (size_t str_idx = 0; str[str_idx] != NUL && buf_idx < 4; str_idx++) {
    if (str[str_idx] == K_SPECIAL
        && str[str_idx + 1] == KS_SPECIAL
        && str[str_idx + 2] == KE_FILLER) {
      // Three escaped bytes collapse into one K_SPECIAL byte.
      buf[buf_idx++] = (char)K_SPECIAL;
      str_idx += 2;
    } else if ((str[str_idx] == K_SPECIAL)
               && str[str_idx + 1] == KS_EXTRA
               && str[str_idx + 2] == KE_CSI) {
      // Three escaped bytes collapse into one CSI byte.
      buf[buf_idx++] = (char)CSI;
      str_idx += 2;
    } else if (str[str_idx] == K_SPECIAL) {
      break;  // A special key can't be a multibyte char.
    } else {
      buf[buf_idx++] = (char)str[str_idx];
    }
    // Keep "buf" NUL-terminated so it can be measured after every byte.
    buf[buf_idx] = NUL;

    // Return a multi-byte character if it's found.  An illegal sequence
    // will result in a 1 here.
    if (utf_ptr2len((const char_u *)buf) > 1) {
      *pp = (const char *)str + str_idx + 1;
      return buf;
    }

    // Bail out quickly for ASCII.
    if ((uint8_t)buf[0] < 128) {
      break;
    }
  }
  return NULL;
}
2127 
2128 
2129 /*
2130  * Skip the Vim specific head of a 'encoding' name.
2131  */
enc_skip(char_u * p)2132 char_u *enc_skip(char_u *p)
2133 {
2134   if (STRNCMP(p, "2byte-", 6) == 0) {
2135     return p + 6;
2136   }
2137   if (STRNCMP(p, "8bit-", 5) == 0) {
2138     return p + 5;
2139   }
2140   return p;
2141 }
2142 
2143 /*
2144  * Find the canonical name for encoding "enc".
2145  * When the name isn't recognized, returns "enc" itself, but with all lower
2146  * case characters and '_' replaced with '-'.
2147  * Returns an allocated string.
2148  */
enc_canonize(char_u * enc)2149 char_u *enc_canonize(char_u *enc) FUNC_ATTR_NONNULL_RET
2150 {
2151   char_u *p, *s;
2152   int i;
2153 
2154   if (STRCMP(enc, "default") == 0) {
2155     // Use the default encoding as found by set_init_1().
2156     return vim_strsave(fenc_default);
2157   }
2158 
2159   // copy "enc" to allocated memory, with room for two '-'
2160   char_u *r = xmalloc(STRLEN(enc) + 3);
2161   // Make it all lower case and replace '_' with '-'.
2162   p = r;
2163   for (s = enc; *s != NUL; ++s) {
2164     if (*s == '_') {
2165       *p++ = '-';
2166     } else {
2167       *p++ = TOLOWER_ASC(*s);
2168     }
2169   }
2170   *p = NUL;
2171 
2172   // Skip "2byte-" and "8bit-".
2173   p = enc_skip(r);
2174 
2175   // Change "microsoft-cp" to "cp".  Used in some spell files.
2176   if (STRNCMP(p, "microsoft-cp", 12) == 0) {
2177     STRMOVE(p, p + 10);
2178   }
2179 
2180   // "iso8859" -> "iso-8859"
2181   if (STRNCMP(p, "iso8859", 7) == 0) {
2182     STRMOVE(p + 4, p + 3);
2183     p[3] = '-';
2184   }
2185 
2186   // "iso-8859n" -> "iso-8859-n"
2187   if (STRNCMP(p, "iso-8859", 8) == 0 && p[8] != '-') {
2188     STRMOVE(p + 9, p + 8);
2189     p[8] = '-';
2190   }
2191 
2192   // "latin-N" -> "latinN"
2193   if (STRNCMP(p, "latin-", 6) == 0) {
2194     STRMOVE(p + 5, p + 6);
2195   }
2196 
2197   if (enc_canon_search(p) >= 0) {
2198     // canonical name can be used unmodified
2199     if (p != r) {
2200       STRMOVE(r, p);
2201     }
2202   } else if ((i = enc_alias_search(p)) >= 0) {
2203     // alias recognized, get canonical name
2204     xfree(r);
2205     r = vim_strsave((char_u *)enc_canon_table[i].name);
2206   }
2207   return r;
2208 }
2209 
2210 /*
2211  * Search for an encoding alias of "name".
2212  * Returns -1 when not found.
2213  */
enc_alias_search(char_u * name)2214 static int enc_alias_search(char_u *name)
2215 {
2216   int i;
2217 
2218   for (i = 0; enc_alias_table[i].name != NULL; ++i) {
2219     if (STRCMP(name, enc_alias_table[i].name) == 0) {
2220       return enc_alias_table[i].canon;
2221     }
2222   }
2223   return -1;
2224 }
2225 
2226 
2227 #ifdef HAVE_LANGINFO_H
2228 # include <langinfo.h>
2229 #endif
2230 
2231 /*
2232  * Get the canonicalized encoding of the current locale.
2233  * Returns an allocated string when successful, NULL when not.
2234  */
enc_locale(void)2235 char_u *enc_locale(void)
2236 {
2237   int i;
2238   char buf[50];
2239 
2240   const char *s;
2241 #ifdef HAVE_NL_LANGINFO_CODESET
2242   if (!(s = nl_langinfo(CODESET)) || *s == NUL)
2243 #endif
2244   {
2245 #if defined(HAVE_LOCALE_H)
2246     if (!(s = setlocale(LC_CTYPE, NULL)) || *s == NUL)
2247 #endif
2248     {
2249       if ((s = os_getenv("LC_ALL"))) {
2250         if ((s = os_getenv("LC_CTYPE"))) {
2251           s = os_getenv("LANG");
2252         }
2253       }
2254     }
2255   }
2256 
2257   if (!s) {
2258     return NULL;
2259   }
2260 
2261   // The most generic locale format is:
2262   // language[_territory][.codeset][@modifier][+special][,[sponsor][_revision]]
2263   // If there is a '.' remove the part before it.
2264   // if there is something after the codeset, remove it.
2265   // Make the name lowercase and replace '_' with '-'.
2266   // Exception: "ja_JP.EUC" == "euc-jp", "zh_CN.EUC" = "euc-cn",
2267   // "ko_KR.EUC" == "euc-kr"
2268   const char *p = (char *)vim_strchr((char_u *)s, '.');
2269   if (p != NULL) {
2270     if (p > s + 2 && !STRNICMP(p + 1, "EUC", 3)
2271         && !isalnum((int)p[4]) && p[4] != '-' && p[-3] == '_') {
2272       // Copy "XY.EUC" to "euc-XY" to buf[10].
2273       memmove(buf, "euc-", 4);
2274       buf[4] = (ASCII_ISALNUM(p[-2]) ? TOLOWER_ASC(p[-2]) : 0);
2275       buf[5] = (ASCII_ISALNUM(p[-1]) ? TOLOWER_ASC(p[-1]) : 0);
2276       buf[6] = NUL;
2277     } else {
2278       s = p + 1;
2279       goto enc_locale_copy_enc;
2280     }
2281   } else {
2282 enc_locale_copy_enc:
2283     for (i = 0; i < (int)sizeof(buf) - 1 && s[i] != NUL; i++) {
2284       if (s[i] == '_' || s[i] == '-') {
2285         buf[i] = '-';
2286       } else if (ASCII_ISALNUM((uint8_t)s[i])) {
2287         buf[i] = TOLOWER_ASC(s[i]);
2288       } else {
2289         break;
2290       }
2291     }
2292     buf[i] = NUL;
2293   }
2294 
2295   return enc_canonize((char_u *)buf);
2296 }
2297 
2298 #if defined(HAVE_ICONV)
2299 
2300 
2301 /*
2302  * Call iconv_open() with a check if iconv() works properly (there are broken
2303  * versions).
2304  * Returns (void *)-1 if failed.
2305  * (should return iconv_t, but that causes problems with prototypes).
2306  */
my_iconv_open(char_u * to,char_u * from)2307 void *my_iconv_open(char_u *to, char_u *from)
2308 {
2309   iconv_t fd;
2310 # define ICONV_TESTLEN 400
2311   char_u tobuf[ICONV_TESTLEN];
2312   char *p;
2313   size_t tolen;
2314   static WorkingStatus iconv_working = kUnknown;
2315 
2316   if (iconv_working == kBroken) {
2317     return (void *)-1;          // detected a broken iconv() previously
2318   }
2319   fd = iconv_open((char *)enc_skip(to), (char *)enc_skip(from));
2320 
2321   if (fd != (iconv_t)-1 && iconv_working == kUnknown) {
2322     /*
2323      * Do a dummy iconv() call to check if it actually works.  There is a
2324      * version of iconv() on Linux that is broken.  We can't ignore it,
2325      * because it's wide-spread.  The symptoms are that after outputting
2326      * the initial shift state the "to" pointer is NULL and conversion
2327      * stops for no apparent reason after about 8160 characters.
2328      */
2329     p = (char *)tobuf;
2330     tolen = ICONV_TESTLEN;
2331     (void)iconv(fd, NULL, NULL, &p, &tolen);
2332     if (p == NULL) {
2333       iconv_working = kBroken;
2334       iconv_close(fd);
2335       fd = (iconv_t)-1;
2336     } else {
2337       iconv_working = kWorking;
2338     }
2339   }
2340 
2341   return (void *)fd;
2342 }
2343 
/// Convert the string "str[slen]" with iconv().
///
/// @param vcp         conversion setup; vc_fd is the iconv descriptor and
///                    vc_fail selects failing instead of inserting '?'
/// @param str         text to convert
/// @param slen        length of "str" in bytes
/// @param unconvlenp  if not NULL, handle the string ending in an incomplete
///                    sequence and set "*unconvlenp" to the length of it
/// @param resultlenp  if not NULL, set to the result length in bytes
///
/// @return the converted string in allocated memory, NULL for an error.
static char_u *iconv_string(const vimconv_T *const vcp, char_u *str, size_t slen,
                            size_t *unconvlenp, size_t *resultlenp)
{
  const char *from;       // next input byte to convert
  size_t fromlen;         // input bytes left
  char *to;               // next free byte in "result"
  size_t tolen;           // output bytes offered to iconv()
  size_t len = 0;         // allocated size of "result"
  size_t done = 0;        // bytes of "result" already filled
  char_u *result = NULL;
  char_u *p;
  int l;

  from = (char *)str;
  fromlen = slen;
  for (;;) {
    if (len == 0 || ICONV_ERRNO == ICONV_E2BIG) {
      // Allocate enough room for most conversions.  When re-allocating
      // increase the buffer size.
      len = len + fromlen * 2 + 40;
      p = xmalloc(len);
      if (done > 0) {
        memmove(p, result, done);
      }
      xfree(result);
      result = p;
    }

    to = (char *)result + done;
    // Two bytes are held back from iconv(); presumably they cover the '?'
    // replacement(s) and the terminating NUL written below.
    // NOTE(review): when '?' bytes use up this reserve and the error is not
    // E2BIG, "len - done - 2" looks like it could wrap on the next pass;
    // confirm.
    tolen = len - done - 2;
    // Avoid a warning for systems with a wrong iconv() prototype by
    // casting the second argument to void *.
    if (iconv(vcp->vc_fd, (void *)&from, &fromlen, &to, &tolen) != SIZE_MAX) {
      // Finished, append a NUL.
      *to = NUL;
      break;
    }

    // Check both ICONV_EINVAL and EINVAL, because the dynamically loaded
    // iconv library may use one of them.
    if (!vcp->vc_fail && unconvlenp != NULL
        && (ICONV_ERRNO == ICONV_EINVAL || ICONV_ERRNO == EINVAL)) {
      // Handle an incomplete sequence at the end.
      *to = NUL;
      *unconvlenp = fromlen;
      break;
    } else if (!vcp->vc_fail
               && (ICONV_ERRNO == ICONV_EILSEQ || ICONV_ERRNO == EILSEQ
                   || ICONV_ERRNO == ICONV_EINVAL || ICONV_ERRNO == EINVAL)) {
      // Check both ICONV_EILSEQ and EILSEQ, because the dynamically loaded
      // iconv library may use one of them.

      // Can't convert: insert a '?' and skip a character.  This assumes
      // conversion from 'encoding' to something else.  In other
      // situations we don't know what to skip anyway.
      *to++ = '?';
      if (utf_ptr2cells((char_u *)from) > 1) {
        // A character wider than one cell gets a second '?'.
        *to++ = '?';
      }
      l = utfc_ptr2len_len((const char_u *)from, (int)fromlen);
      from += l;
      fromlen -= l;
    } else if (ICONV_ERRNO != ICONV_E2BIG) {
      // conversion failed
      XFREE_CLEAR(result);
      break;
    }
    // Not enough room or skipping illegal sequence.
    done = to - (char *)result;
  }

  if (resultlenp != NULL && result != NULL) {
    *resultlenp = (size_t)(to - (char *)result);
  }
  return result;
}
2427 
2428 #endif  // HAVE_ICONV
2429 
2430 
/// Setup "vcp" for conversion from "from" to "to".
///
/// The names must have been made canonical with enc_canonize().
/// vcp->vc_type must have been initialized to CONV_NONE.
/// Note: cannot be used for conversion from/to ucs-2 and ucs-4 (will use utf-8
/// instead).
/// Afterwards invoke with "from" and "to" equal to NULL to cleanup.
///
/// Calls convert_setup_ext() with both "unicode is utf-8" flags set to true.
///
/// @param vcp   conversion setup to fill in
/// @param from  canonical name of the source encoding, or NULL
/// @param to    canonical name of the target encoding, or NULL
///
/// @return FAIL when conversion is not supported, OK otherwise.
int convert_setup(vimconv_T *vcp, char_u *from, char_u *to)
{
  return convert_setup_ext(vcp, from, true, to, true);
}
2444 
2445 /// As convert_setup(), but only when from_unicode_is_utf8 is true will all
2446 /// "from" unicode charsets be considered utf-8.  Same for "to".
convert_setup_ext(vimconv_T * vcp,char_u * from,bool from_unicode_is_utf8,char_u * to,bool to_unicode_is_utf8)2447 int convert_setup_ext(vimconv_T *vcp, char_u *from, bool from_unicode_is_utf8, char_u *to,
2448                       bool to_unicode_is_utf8)
2449 {
2450   int from_prop;
2451   int to_prop;
2452   int from_is_utf8;
2453   int to_is_utf8;
2454 
2455   // Reset to no conversion.
2456 #ifdef HAVE_ICONV
2457   if (vcp->vc_type == CONV_ICONV && vcp->vc_fd != (iconv_t)-1) {
2458     iconv_close(vcp->vc_fd);
2459   }
2460 #endif
2461   *vcp = (vimconv_T)MBYTE_NONE_CONV;
2462 
2463   // No conversion when one of the names is empty or they are equal.
2464   if (from == NULL || *from == NUL || to == NULL || *to == NUL
2465       || STRCMP(from, to) == 0) {
2466     return OK;
2467   }
2468 
2469   from_prop = enc_canon_props(from);
2470   to_prop = enc_canon_props(to);
2471   if (from_unicode_is_utf8) {
2472     from_is_utf8 = from_prop & ENC_UNICODE;
2473   } else {
2474     from_is_utf8 = from_prop == ENC_UNICODE;
2475   }
2476   if (to_unicode_is_utf8) {
2477     to_is_utf8 = to_prop & ENC_UNICODE;
2478   } else {
2479     to_is_utf8 = to_prop == ENC_UNICODE;
2480   }
2481 
2482   if ((from_prop & ENC_LATIN1) && to_is_utf8) {
2483     // Internal latin1 -> utf-8 conversion.
2484     vcp->vc_type = CONV_TO_UTF8;
2485     vcp->vc_factor = 2;         // up to twice as long
2486   } else if ((from_prop & ENC_LATIN9) && to_is_utf8) {
2487     // Internal latin9 -> utf-8 conversion.
2488     vcp->vc_type = CONV_9_TO_UTF8;
2489     vcp->vc_factor = 3;         // up to three as long (euro sign)
2490   } else if (from_is_utf8 && (to_prop & ENC_LATIN1)) {
2491     // Internal utf-8 -> latin1 conversion.
2492     vcp->vc_type = CONV_TO_LATIN1;
2493   } else if (from_is_utf8 && (to_prop & ENC_LATIN9)) {
2494     // Internal utf-8 -> latin9 conversion.
2495     vcp->vc_type = CONV_TO_LATIN9;
2496   }
2497 #ifdef HAVE_ICONV
2498   else {  // NOLINT(readability/braces)
2499     // Use iconv() for conversion.
2500     vcp->vc_fd = (iconv_t)my_iconv_open(to_is_utf8 ? (char_u *)"utf-8" : to,
2501                                         from_is_utf8 ? (char_u *)"utf-8" : from);
2502     if (vcp->vc_fd != (iconv_t)-1) {
2503       vcp->vc_type = CONV_ICONV;
2504       vcp->vc_factor = 4;       // could be longer too...
2505     }
2506   }
2507 #endif
2508   if (vcp->vc_type == CONV_NONE) {
2509     return FAIL;
2510   }
2511 
2512   return OK;
2513 }
2514 
/// Convert text "ptr[*lenp]" according to "vcp".
///
/// Illegal chars are often changed to "?", unless vcp->vc_fail is set.
/// Calls string_convert_ext() with a NULL "unconvlenp".
///
/// @param vcp   conversion setup
/// @param ptr   text to convert
/// @param lenp  length of "ptr", set to the length of the result; when NULL,
///              use NUL terminated strings
///
/// @return the result in allocated memory.  When something goes wrong, NULL
///         is returned and "*lenp" is unchanged.
char_u *string_convert(const vimconv_T *const vcp, char_u *ptr, size_t *lenp)
{
  return string_convert_ext(vcp, ptr, lenp, NULL);
}
2526 
/// Like string_convert(), but when "unconvlenp" is not NULL and there is
/// an incomplete sequence at the end it is not converted and "*unconvlenp" is
/// set to the number of remaining bytes.
///
/// @param vcp         conversion setup; vc_type selects the conversion
/// @param ptr         text to convert
/// @param lenp        length of "ptr", set to the length of the result; when
///                    NULL, "ptr" is NUL terminated
/// @param unconvlenp  when not NULL, receives the length of an incomplete
///                    sequence at the end
///
/// @return the result in allocated memory (an allocated empty string for
///         empty input), or NULL on failure or when vc_type matches none of
///         the handled conversions.
char_u *string_convert_ext(const vimconv_T *const vcp, char_u *ptr, size_t *lenp,
                           size_t *unconvlenp)
{
  char_u *retval = NULL;
  char_u *d;  // write position in "retval"
  int l;
  int c;

  size_t len;
  if (lenp == NULL) {
    len = STRLEN(ptr);
  } else {
    len = *lenp;
  }
  if (len == 0) {
    return vim_strsave((char_u *)"");
  }

  switch (vcp->vc_type) {
  case CONV_TO_UTF8:            // latin1 to utf-8 conversion
    // Every input byte becomes at most two bytes.
    retval = xmalloc(len * 2 + 1);
    d = retval;
    for (size_t i = 0; i < len; ++i) {
      c = ptr[i];
      if (c < 0x80) {
        *d++ = c;
      } else {
        // Two-byte sequence: 110x.xxxx 10xx.xxxx
        *d++ = 0xc0 + ((unsigned)c >> 6);
        *d++ = 0x80 + (c & 0x3f);
      }
    }
    *d = NUL;
    if (lenp != NULL) {
      *lenp = (size_t)(d - retval);
    }
    break;

  case CONV_9_TO_UTF8:          // latin9 to utf-8 conversion
    // Every input byte becomes at most three bytes.
    retval = xmalloc(len * 3 + 1);
    d = retval;
    for (size_t i = 0; i < len; ++i) {
      c = ptr[i];
      // Map the bytes listed here to their Unicode code points; all other
      // bytes are used as the code point unchanged.
      switch (c) {
      case 0xa4:
        c = 0x20ac; break;                 // euro
      case 0xa6:
        c = 0x0160; break;                 // S hat
      case 0xa8:
        c = 0x0161; break;                 // S -hat
      case 0xb4:
        c = 0x017d; break;                 // Z hat
      case 0xb8:
        c = 0x017e; break;                 // Z -hat
      case 0xbc:
        c = 0x0152; break;                 // OE
      case 0xbd:
        c = 0x0153; break;                 // oe
      case 0xbe:
        c = 0x0178; break;                 // Y
      }
      d += utf_char2bytes(c, d);
    }
    *d = NUL;
    if (lenp != NULL) {
      *lenp = (size_t)(d - retval);
    }
    break;

  case CONV_TO_LATIN1:          // utf-8 to latin1 conversion
  case CONV_TO_LATIN9:          // utf-8 to latin9 conversion
    // The result is allocated with the same number of bytes as the input.
    retval = xmalloc(len + 1);
    d = retval;
    for (size_t i = 0; i < len; ++i) {
      l = utf_ptr2len_len(ptr + i, len - i);
      if (l == 0) {
        *d++ = NUL;
      } else if (l == 1) {
        uint8_t l_w = utf8len_tab_zero[ptr[i]];

        if (l_w == 0) {
          // Illegal utf-8 byte cannot be converted
          xfree(retval);
          return NULL;
        }
        if (unconvlenp != NULL && l_w > len - i) {
          // Incomplete sequence at the end.
          *unconvlenp = len - i;
          break;
        }
        *d++ = ptr[i];
      } else {
        c = utf_ptr2char(ptr + i);
        if (vcp->vc_type == CONV_TO_LATIN9) {
          // Reverse of the latin9 to utf-8 mapping above.
          switch (c) {
          case 0x20ac:
            c = 0xa4; break;                     // euro
          case 0x0160:
            c = 0xa6; break;                     // S hat
          case 0x0161:
            c = 0xa8; break;                     // S -hat
          case 0x017d:
            c = 0xb4; break;                     // Z hat
          case 0x017e:
            c = 0xb8; break;                     // Z -hat
          case 0x0152:
            c = 0xbc; break;                     // OE
          case 0x0153:
            c = 0xbd; break;                     // oe
          case 0x0178:
            c = 0xbe; break;                     // Y
          case 0xa4:
          case 0xa6:
          case 0xa8:
          case 0xb4:
          case 0xb8:
          case 0xbc:
          case 0xbd:
          case 0xbe:
            c = 0x100; break;                   // not in latin9
          }
        }
        if (!utf_iscomposing(c)) {              // skip composing chars
          if (c < 0x100) {
            *d++ = c;
          } else if (vcp->vc_fail) {
            xfree(retval);
            return NULL;
          } else {
            // Not representable: write 0xbf, plus a '?' when the character
            // takes more than one cell.
            *d++ = 0xbf;
            if (utf_char2cells(c) > 1) {
              *d++ = '?';
            }
          }
        }
        // Skip the trail bytes; the loop increment skips the lead byte.
        i += l - 1;
      }
    }
    *d = NUL;
    if (lenp != NULL) {
      *lenp = (size_t)(d - retval);
    }
    break;

#ifdef HAVE_ICONV
  case CONV_ICONV:  // conversion with vcp->vc_fd
    retval = iconv_string(vcp, ptr, len, unconvlenp, lenp);
    break;
#endif
  }

  return retval;
}
2684