1 /* Copyright (c) 2005-2018 Dovecot authors, see the included COPYING file */
2
3 #include "lib.h"
4 #include "array.h"
5 #include "bsearch-insert-pos.h"
6 #include "unichar.h"
7
8 #include "unicodemap.c"
9
10 #define HANGUL_FIRST 0xac00
11 #define HANGUL_LAST 0xd7a3
12
13 const unsigned char utf8_replacement_char[UTF8_REPLACEMENT_CHAR_LEN] =
14 { 0xef, 0xbf, 0xbd }; /* 0xfffd */
15
16 static const uint8_t utf8_non1_bytes[256 - 192 - 2] = {
17 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
18 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,5,5,5,5,6,6,1,1
19 };
20
21 const uint8_t *const uni_utf8_non1_bytes = utf8_non1_bytes;
22
uni_strlen(const unichar_t * str)23 unsigned int uni_strlen(const unichar_t *str)
24 {
25 unsigned int len = 0;
26
27 for (len = 0; str[len] != 0; len++) ;
28
29 return len;
30 }
31
uni_utf8_get_char(const char * input,unichar_t * chr_r)32 int uni_utf8_get_char(const char *input, unichar_t *chr_r)
33 {
34 return uni_utf8_get_char_n((const unsigned char *)input, SIZE_MAX,
35 chr_r);
36 }
37
uni_utf8_get_char_n(const void * _input,size_t max_len,unichar_t * chr_r)38 int uni_utf8_get_char_n(const void *_input, size_t max_len, unichar_t *chr_r)
39 {
40 static unichar_t lowest_valid_chr_table[] =
41 { 0, 0, 0x80, 0x800, 0x10000, 0x200000, 0x4000000 };
42 const unsigned char *input = _input;
43 unichar_t chr, lowest_valid_chr;
44 unsigned int i, len;
45 int ret;
46
47 i_assert(max_len > 0);
48
49 if (*input < 0x80) {
50 *chr_r = *input;
51 return 1;
52 }
53
54 /* first byte has len highest bits set, followed by zero bit.
55 the rest of the bits are used as the highest bits of the value. */
56 chr = *input;
57 len = uni_utf8_char_bytes(*input);
58 switch (len) {
59 case 2:
60 chr &= 0x1f;
61 break;
62 case 3:
63 chr &= 0x0f;
64 break;
65 case 4:
66 chr &= 0x07;
67 break;
68 case 5:
69 chr &= 0x03;
70 break;
71 case 6:
72 chr &= 0x01;
73 break;
74 default:
75 /* only 7bit chars should have len==1 */
76 i_assert(len == 1);
77 return -1;
78 }
79
80 if (len <= max_len) {
81 lowest_valid_chr = lowest_valid_chr_table[len];
82 ret = len;
83 } else {
84 /* check first if the input is invalid before returning 0 */
85 lowest_valid_chr = 0;
86 ret = 0;
87 len = max_len;
88 }
89
90 /* the following bytes must all be 10xxxxxx */
91 for (i = 1; i < len; i++) {
92 if ((input[i] & 0xc0) != 0x80)
93 return input[i] == '\0' ? 0 : -1;
94
95 chr <<= 6;
96 chr |= input[i] & 0x3f;
97 }
98 /* these are specified as invalid encodings by standards
99 see RFC3629 */
100 if (!uni_is_valid_ucs4(chr))
101 return -1;
102 if (chr < lowest_valid_chr) {
103 /* overlong encoding */
104 return -1;
105 }
106
107 *chr_r = chr;
108 return ret;
109 }
110
uni_utf8_to_ucs4(const char * input,ARRAY_TYPE (unichars)* output)111 int uni_utf8_to_ucs4(const char *input, ARRAY_TYPE(unichars) *output)
112 {
113 unichar_t chr;
114
115 while (*input != '\0') {
116 int len = uni_utf8_get_char(input, &chr);
117 if (len <= 0) {
118 /* invalid input */
119 return -1;
120 }
121 input += len;
122
123 array_push_back(output, &chr);
124 }
125 return 0;
126 }
127
uni_utf8_to_ucs4_n(const unsigned char * input,size_t size,ARRAY_TYPE (unichars)* output)128 int uni_utf8_to_ucs4_n(const unsigned char *input, size_t size,
129 ARRAY_TYPE(unichars) *output)
130 {
131 unichar_t chr;
132
133 while (size > 0) {
134 int len = uni_utf8_get_char_n(input, size, &chr);
135 if (len <= 0)
136 return -1; /* invalid input */
137 input += len; size -= len;
138
139 array_push_back(output, &chr);
140 }
141 return 0;
142 }
143
uni_ucs4_to_utf8(const unichar_t * input,size_t len,buffer_t * output)144 void uni_ucs4_to_utf8(const unichar_t *input, size_t len, buffer_t *output)
145 {
146 for (; len > 0 && *input != '\0'; input++, len--)
147 uni_ucs4_to_utf8_c(*input, output);
148 }
149
uni_ucs4_to_utf8_c(unichar_t chr,buffer_t * output)150 void uni_ucs4_to_utf8_c(unichar_t chr, buffer_t *output)
151 {
152 unsigned char first;
153 int bitpos;
154
155 if (chr < 0x80) {
156 buffer_append_c(output, chr);
157 return;
158 }
159
160 i_assert(uni_is_valid_ucs4(chr));
161
162 if (chr < (1 << (6 + 5))) {
163 /* 110xxxxx */
164 bitpos = 6;
165 first = 0x80 | 0x40;
166 } else if (chr < (1 << ((2*6) + 4))) {
167 /* 1110xxxx */
168 bitpos = 2*6;
169 first = 0x80 | 0x40 | 0x20;
170 } else if (chr < (1 << ((3*6) + 3))) {
171 /* 11110xxx */
172 bitpos = 3*6;
173 first = 0x80 | 0x40 | 0x20 | 0x10;
174 } else if (chr < (1 << ((4*6) + 2))) {
175 /* 111110xx */
176 bitpos = 4*6;
177 first = 0x80 | 0x40 | 0x20 | 0x10 | 0x08;
178 } else {
179 /* 1111110x */
180 bitpos = 5*6;
181 first = 0x80 | 0x40 | 0x20 | 0x10 | 0x08 | 0x04;
182 }
183 buffer_append_c(output, first | (chr >> bitpos));
184
185 do {
186 bitpos -= 6;
187 buffer_append_c(output, 0x80 | ((chr >> bitpos) & 0x3f));
188 } while (bitpos > 0);
189 }
190
uni_utf8_strlen(const char * input)191 unsigned int uni_utf8_strlen(const char *input)
192 {
193 return uni_utf8_strlen_n(input, strlen(input));
194 }
195
uni_utf8_strlen_n(const void * input,size_t size)196 unsigned int uni_utf8_strlen_n(const void *input, size_t size)
197 {
198 size_t partial_pos;
199
200 return uni_utf8_partial_strlen_n(input, size, &partial_pos);
201 }
202
uni_utf8_partial_strlen_n(const void * _input,size_t size,size_t * partial_pos_r)203 unsigned int uni_utf8_partial_strlen_n(const void *_input, size_t size,
204 size_t *partial_pos_r)
205 {
206 const unsigned char *input = _input;
207 unsigned int count, len = 0;
208 size_t i;
209
210 for (i = 0; i < size; ) {
211 count = uni_utf8_char_bytes(input[i]);
212 if (i + count > size)
213 break;
214 i += count;
215 len++;
216 }
217 *partial_pos_r = i;
218 return len;
219 }
220
uint16_find(const uint16_t * data,unsigned int count,uint16_t value,unsigned int * idx_r)221 static bool uint16_find(const uint16_t *data, unsigned int count,
222 uint16_t value, unsigned int *idx_r)
223 {
224 BINARY_NUMBER_SEARCH(data, count, value, idx_r);
225 }
226
uint32_find(const uint32_t * data,unsigned int count,uint32_t value,unsigned int * idx_r)227 static bool uint32_find(const uint32_t *data, unsigned int count,
228 uint32_t value, unsigned int *idx_r)
229 {
230 BINARY_NUMBER_SEARCH(data, count, value, idx_r);
231 }
232
uni_ucs4_to_titlecase(unichar_t chr)233 unichar_t uni_ucs4_to_titlecase(unichar_t chr)
234 {
235 unsigned int idx;
236
237 if (chr <= 0xff)
238 return titlecase8_map[chr];
239 else if (chr <= 0xffff) {
240 if (!uint16_find(titlecase16_keys, N_ELEMENTS(titlecase16_keys),
241 chr, &idx))
242 return chr;
243 else
244 return titlecase16_values[idx];
245 } else {
246 if (!uint32_find(titlecase32_keys, N_ELEMENTS(titlecase32_keys),
247 chr, &idx))
248 return chr;
249 else
250 return titlecase32_values[idx];
251 }
252 }
253
uni_ucs4_decompose_uni(unichar_t * chr)254 static bool uni_ucs4_decompose_uni(unichar_t *chr)
255 {
256 unsigned int idx;
257
258 if (*chr <= 0xff) {
259 if (uni8_decomp_map[*chr] == *chr)
260 return FALSE;
261 *chr = uni8_decomp_map[*chr];
262 } else if (*chr <= 0xffff) {
263 if (*chr < uni16_decomp_keys[0])
264 return FALSE;
265
266 if (!uint16_find(uni16_decomp_keys,
267 N_ELEMENTS(uni16_decomp_keys), *chr, &idx))
268 return FALSE;
269 *chr = uni16_decomp_values[idx];
270 } else {
271 if (!uint32_find(uni32_decomp_keys,
272 N_ELEMENTS(uni32_decomp_keys), *chr, &idx))
273 return FALSE;
274 *chr = uni32_decomp_values[idx];
275 }
276 return TRUE;
277 }
278
uni_ucs4_decompose_hangul_utf8(unichar_t chr,buffer_t * output)279 static void uni_ucs4_decompose_hangul_utf8(unichar_t chr, buffer_t *output)
280 {
281 #define SBase HANGUL_FIRST
282 #define LBase 0x1100
283 #define VBase 0x1161
284 #define TBase 0x11A7
285 #define LCount 19
286 #define VCount 21
287 #define TCount 28
288 #define NCount (VCount * TCount)
289 unsigned int SIndex = chr - SBase;
290 unichar_t L = LBase + SIndex / NCount;
291 unichar_t V = VBase + (SIndex % NCount) / TCount;
292 unichar_t T = TBase + SIndex % TCount;
293
294 uni_ucs4_to_utf8_c(L, output);
295 uni_ucs4_to_utf8_c(V, output);
296 if (T != TBase) uni_ucs4_to_utf8_c(T, output);
297 }
298
uni_ucs4_decompose_multi_utf8(unichar_t chr,buffer_t * output)299 static bool uni_ucs4_decompose_multi_utf8(unichar_t chr, buffer_t *output)
300 {
301 const uint32_t *value;
302 unsigned int idx;
303
304 if (chr < multidecomp_keys[0] || chr > 0xffff)
305 return FALSE;
306
307 if (!uint32_find(multidecomp_keys, N_ELEMENTS(multidecomp_keys),
308 chr, &idx))
309 return FALSE;
310
311 value = &multidecomp_values[multidecomp_offsets[idx]];
312 for (; *value != 0; value++)
313 uni_ucs4_to_utf8_c(*value, output);
314 return TRUE;
315 }
316
output_add_replacement_char(buffer_t * output)317 static void output_add_replacement_char(buffer_t *output)
318 {
319 if (output->used >= UTF8_REPLACEMENT_CHAR_LEN &&
320 memcmp(CONST_PTR_OFFSET(output->data,
321 output->used - UTF8_REPLACEMENT_CHAR_LEN),
322 utf8_replacement_char, UTF8_REPLACEMENT_CHAR_LEN) == 0) {
323 /* don't add the replacement char multiple times */
324 return;
325 }
326 buffer_append(output, utf8_replacement_char, UTF8_REPLACEMENT_CHAR_LEN);
327 }
328
uni_utf8_to_decomposed_titlecase(const void * _input,size_t size,buffer_t * output)329 int uni_utf8_to_decomposed_titlecase(const void *_input, size_t size,
330 buffer_t *output)
331 {
332 const unsigned char *input = _input;
333 unichar_t chr;
334 int ret = 0;
335
336 while (size > 0) {
337 int bytes = uni_utf8_get_char_n(input, size, &chr);
338 if (bytes <= 0) {
339 /* invalid input. try the next byte. */
340 ret = -1;
341 input++; size--;
342 output_add_replacement_char(output);
343 continue;
344 }
345 input += bytes;
346 size -= bytes;
347
348 chr = uni_ucs4_to_titlecase(chr);
349 if (chr >= HANGUL_FIRST && chr <= HANGUL_LAST)
350 uni_ucs4_decompose_hangul_utf8(chr, output);
351 else if (uni_ucs4_decompose_uni(&chr) ||
352 !uni_ucs4_decompose_multi_utf8(chr, output))
353 uni_ucs4_to_utf8_c(chr, output);
354 }
355 return ret;
356 }
357
358 static inline unsigned int
is_valid_utf8_seq(const unsigned char * input,unsigned int size)359 is_valid_utf8_seq(const unsigned char *input, unsigned int size)
360 {
361 unichar_t chr;
362 int len = uni_utf8_get_char_n(input, size, &chr);
363 return len <= 0 ? 0 : len;
364 }
365
uni_utf8_find_invalid_pos(const unsigned char * input,size_t size,size_t * pos_r)366 static int uni_utf8_find_invalid_pos(const unsigned char *input, size_t size,
367 size_t *pos_r)
368 {
369 size_t i, len;
370
371 /* find the first invalid utf8 sequence */
372 for (i = 0; i < size;) {
373 if (input[i] < 0x80)
374 i++;
375 else {
376 len = is_valid_utf8_seq(input + i, size-i);
377 if (unlikely(len == 0)) {
378 *pos_r = i;
379 return -1;
380 }
381 i += len;
382 }
383 }
384 return 0;
385 }
386
uni_utf8_get_valid_data(const unsigned char * input,size_t size,buffer_t * buf)387 bool uni_utf8_get_valid_data(const unsigned char *input, size_t size,
388 buffer_t *buf)
389 {
390 size_t i, len;
391
392 if (uni_utf8_find_invalid_pos(input, size, &i) == 0)
393 return TRUE;
394
395 /* broken utf-8 input - skip the broken characters */
396 buffer_append(buf, input, i++);
397
398 output_add_replacement_char(buf);
399 while (i < size) {
400 if (input[i] < 0x80) {
401 buffer_append_c(buf, input[i++]);
402 continue;
403 }
404
405 len = is_valid_utf8_seq(input + i, size-i);
406 if (len == 0) {
407 i++;
408 output_add_replacement_char(buf);
409 continue;
410 }
411 buffer_append(buf, input + i, len);
412 i += len;
413 }
414 return FALSE;
415 }
416
uni_utf8_str_is_valid(const char * str)417 bool uni_utf8_str_is_valid(const char *str)
418 {
419 size_t i;
420
421 return uni_utf8_find_invalid_pos((const unsigned char *)str,
422 strlen(str), &i) == 0;
423 }
424
uni_utf8_data_is_valid(const unsigned char * data,size_t size)425 bool uni_utf8_data_is_valid(const unsigned char *data, size_t size)
426 {
427 size_t i;
428
429 return uni_utf8_find_invalid_pos(data, size, &i) == 0;
430 }
431
uni_utf8_data_truncate(const unsigned char * data,size_t old_size,size_t max_new_size)432 size_t uni_utf8_data_truncate(const unsigned char *data, size_t old_size,
433 size_t max_new_size)
434 {
435 if (max_new_size >= old_size)
436 return old_size;
437 if (max_new_size == 0)
438 return 0;
439
440 if ((data[max_new_size] & 0x80) == 0)
441 return max_new_size;
442 while (max_new_size > 0 && (data[max_new_size-1] & 0xc0) == 0x80)
443 max_new_size--;
444 if (max_new_size > 0 && (data[max_new_size-1] & 0xc0) == 0xc0)
445 max_new_size--;
446 return max_new_size;
447 }
448