1 /* Copyright (c) 2005-2018 Dovecot authors, see the included COPYING file */
2 
3 #include "lib.h"
4 #include "array.h"
5 #include "bsearch-insert-pos.h"
6 #include "unichar.h"
7 
8 #include "unicodemap.c"
9 
10 #define HANGUL_FIRST 0xac00
11 #define HANGUL_LAST 0xd7a3
12 
13 const unsigned char utf8_replacement_char[UTF8_REPLACEMENT_CHAR_LEN] =
14 	{ 0xef, 0xbf, 0xbd }; /* 0xfffd */
15 
16 static const uint8_t utf8_non1_bytes[256 - 192 - 2] = {
17 	2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
18 	3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,5,5,5,5,6,6,1,1
19 };
20 
21 const uint8_t *const uni_utf8_non1_bytes = utf8_non1_bytes;
22 
uni_strlen(const unichar_t * str)23 unsigned int uni_strlen(const unichar_t *str)
24 {
25 	unsigned int len = 0;
26 
27 	for (len = 0; str[len] != 0; len++) ;
28 
29 	return len;
30 }
31 
uni_utf8_get_char(const char * input,unichar_t * chr_r)32 int uni_utf8_get_char(const char *input, unichar_t *chr_r)
33 {
34 	return uni_utf8_get_char_n((const unsigned char *)input, SIZE_MAX,
35 				   chr_r);
36 }
37 
uni_utf8_get_char_n(const void * _input,size_t max_len,unichar_t * chr_r)38 int uni_utf8_get_char_n(const void *_input, size_t max_len, unichar_t *chr_r)
39 {
40 	static unichar_t lowest_valid_chr_table[] =
41 		{ 0, 0, 0x80, 0x800, 0x10000, 0x200000, 0x4000000 };
42 	const unsigned char *input = _input;
43 	unichar_t chr, lowest_valid_chr;
44 	unsigned int i, len;
45 	int ret;
46 
47 	i_assert(max_len > 0);
48 
49 	if (*input < 0x80) {
50 		*chr_r = *input;
51 		return 1;
52 	}
53 
54 	/* first byte has len highest bits set, followed by zero bit.
55 	   the rest of the bits are used as the highest bits of the value. */
56 	chr = *input;
57 	len = uni_utf8_char_bytes(*input);
58 	switch (len) {
59 	case 2:
60 		chr &= 0x1f;
61 		break;
62 	case 3:
63 		chr &= 0x0f;
64 		break;
65 	case 4:
66 		chr &= 0x07;
67 		break;
68 	case 5:
69 		chr &= 0x03;
70 		break;
71 	case 6:
72 		chr &= 0x01;
73 		break;
74 	default:
75 		/* only 7bit chars should have len==1 */
76 		i_assert(len == 1);
77 		return -1;
78 	}
79 
80 	if (len <= max_len) {
81 		lowest_valid_chr = lowest_valid_chr_table[len];
82 		ret = len;
83 	} else {
84 		/* check first if the input is invalid before returning 0 */
85 		lowest_valid_chr = 0;
86 		ret = 0;
87 		len = max_len;
88 	}
89 
90 	/* the following bytes must all be 10xxxxxx */
91 	for (i = 1; i < len; i++) {
92 		if ((input[i] & 0xc0) != 0x80)
93 			return input[i] == '\0' ? 0 : -1;
94 
95 		chr <<= 6;
96 		chr |= input[i] & 0x3f;
97 	}
98 	/* these are specified as invalid encodings by standards
99 	   see RFC3629 */
100 	if (!uni_is_valid_ucs4(chr))
101 		return -1;
102 	if (chr < lowest_valid_chr) {
103 		/* overlong encoding */
104 		return -1;
105 	}
106 
107 	*chr_r = chr;
108 	return ret;
109 }
110 
uni_utf8_to_ucs4(const char * input,ARRAY_TYPE (unichars)* output)111 int uni_utf8_to_ucs4(const char *input, ARRAY_TYPE(unichars) *output)
112 {
113 	unichar_t chr;
114 
115 	while (*input != '\0') {
116 		int len = uni_utf8_get_char(input, &chr);
117 		if (len <= 0) {
118 			/* invalid input */
119 			return -1;
120 		}
121                 input += len;
122 
123 		array_push_back(output, &chr);
124 	}
125 	return 0;
126 }
127 
uni_utf8_to_ucs4_n(const unsigned char * input,size_t size,ARRAY_TYPE (unichars)* output)128 int uni_utf8_to_ucs4_n(const unsigned char *input, size_t size,
129 		       ARRAY_TYPE(unichars) *output)
130 {
131 	unichar_t chr;
132 
133 	while (size > 0) {
134 		int len = uni_utf8_get_char_n(input, size, &chr);
135 		if (len <= 0)
136 			return -1; /* invalid input */
137 		input += len; size -= len;
138 
139 		array_push_back(output, &chr);
140 	}
141 	return 0;
142 }
143 
uni_ucs4_to_utf8(const unichar_t * input,size_t len,buffer_t * output)144 void uni_ucs4_to_utf8(const unichar_t *input, size_t len, buffer_t *output)
145 {
146 	for (; len > 0 && *input != '\0'; input++, len--)
147 		uni_ucs4_to_utf8_c(*input, output);
148 }
149 
uni_ucs4_to_utf8_c(unichar_t chr,buffer_t * output)150 void uni_ucs4_to_utf8_c(unichar_t chr, buffer_t *output)
151 {
152 	unsigned char first;
153 	int bitpos;
154 
155 	if (chr < 0x80) {
156 		buffer_append_c(output, chr);
157 		return;
158 	}
159 
160 	i_assert(uni_is_valid_ucs4(chr));
161 
162 	if (chr < (1 << (6 + 5))) {
163 		/* 110xxxxx */
164 		bitpos = 6;
165 		first = 0x80 | 0x40;
166 	} else if (chr < (1 << ((2*6) + 4))) {
167 		/* 1110xxxx */
168 		bitpos = 2*6;
169 		first = 0x80 | 0x40 | 0x20;
170 	} else if (chr < (1 << ((3*6) + 3))) {
171 		/* 11110xxx */
172 		bitpos = 3*6;
173 		first = 0x80 | 0x40 | 0x20 | 0x10;
174 	} else if (chr < (1 << ((4*6) + 2))) {
175 		/* 111110xx */
176 		bitpos = 4*6;
177 		first = 0x80 | 0x40 | 0x20 | 0x10 | 0x08;
178 	} else {
179 		/* 1111110x */
180 		bitpos = 5*6;
181 		first = 0x80 | 0x40 | 0x20 | 0x10 | 0x08 | 0x04;
182 	}
183 	buffer_append_c(output, first | (chr >> bitpos));
184 
185 	do {
186 		bitpos -= 6;
187 		buffer_append_c(output, 0x80 | ((chr >> bitpos) & 0x3f));
188 	} while (bitpos > 0);
189 }
190 
uni_utf8_strlen(const char * input)191 unsigned int uni_utf8_strlen(const char *input)
192 {
193 	return uni_utf8_strlen_n(input, strlen(input));
194 }
195 
uni_utf8_strlen_n(const void * input,size_t size)196 unsigned int uni_utf8_strlen_n(const void *input, size_t size)
197 {
198 	size_t partial_pos;
199 
200 	return uni_utf8_partial_strlen_n(input, size, &partial_pos);
201 }
202 
uni_utf8_partial_strlen_n(const void * _input,size_t size,size_t * partial_pos_r)203 unsigned int uni_utf8_partial_strlen_n(const void *_input, size_t size,
204 				       size_t *partial_pos_r)
205 {
206 	const unsigned char *input = _input;
207 	unsigned int count, len = 0;
208 	size_t i;
209 
210 	for (i = 0; i < size; ) {
211 		count = uni_utf8_char_bytes(input[i]);
212 		if (i + count > size)
213 			break;
214 		i += count;
215 		len++;
216 	}
217 	*partial_pos_r = i;
218 	return len;
219 }
220 
uint16_find(const uint16_t * data,unsigned int count,uint16_t value,unsigned int * idx_r)221 static bool uint16_find(const uint16_t *data, unsigned int count,
222 			uint16_t value, unsigned int *idx_r)
223 {
224 	BINARY_NUMBER_SEARCH(data, count, value, idx_r);
225 }
226 
uint32_find(const uint32_t * data,unsigned int count,uint32_t value,unsigned int * idx_r)227 static bool uint32_find(const uint32_t *data, unsigned int count,
228 			uint32_t value, unsigned int *idx_r)
229 {
230 	BINARY_NUMBER_SEARCH(data, count, value, idx_r);
231 }
232 
uni_ucs4_to_titlecase(unichar_t chr)233 unichar_t uni_ucs4_to_titlecase(unichar_t chr)
234 {
235 	unsigned int idx;
236 
237 	if (chr <= 0xff)
238 		return titlecase8_map[chr];
239 	else if (chr <= 0xffff) {
240 		if (!uint16_find(titlecase16_keys, N_ELEMENTS(titlecase16_keys),
241 				 chr, &idx))
242 			return chr;
243 		else
244 			return titlecase16_values[idx];
245 	} else {
246 		if (!uint32_find(titlecase32_keys, N_ELEMENTS(titlecase32_keys),
247 				 chr, &idx))
248 			return chr;
249 		else
250 			return titlecase32_values[idx];
251 	}
252 }
253 
uni_ucs4_decompose_uni(unichar_t * chr)254 static bool uni_ucs4_decompose_uni(unichar_t *chr)
255 {
256 	unsigned int idx;
257 
258 	if (*chr <= 0xff) {
259 		if (uni8_decomp_map[*chr] == *chr)
260 			return FALSE;
261 		*chr = uni8_decomp_map[*chr];
262 	} else if (*chr <= 0xffff) {
263 		if (*chr < uni16_decomp_keys[0])
264 			return FALSE;
265 
266 		if (!uint16_find(uni16_decomp_keys,
267 				 N_ELEMENTS(uni16_decomp_keys), *chr, &idx))
268 			return FALSE;
269 		*chr = uni16_decomp_values[idx];
270 	} else {
271 		if (!uint32_find(uni32_decomp_keys,
272 				 N_ELEMENTS(uni32_decomp_keys), *chr, &idx))
273 			return FALSE;
274 		*chr = uni32_decomp_values[idx];
275 	}
276 	return TRUE;
277 }
278 
uni_ucs4_decompose_hangul_utf8(unichar_t chr,buffer_t * output)279 static void uni_ucs4_decompose_hangul_utf8(unichar_t chr, buffer_t *output)
280 {
281 #define SBase HANGUL_FIRST
282 #define LBase 0x1100
283 #define VBase 0x1161
284 #define TBase 0x11A7
285 #define LCount 19
286 #define VCount 21
287 #define TCount 28
288 #define NCount (VCount * TCount)
289 	unsigned int SIndex = chr - SBase;
290         unichar_t L = LBase + SIndex / NCount;
291         unichar_t V = VBase + (SIndex % NCount) / TCount;
292         unichar_t T = TBase + SIndex % TCount;
293 
294 	uni_ucs4_to_utf8_c(L, output);
295 	uni_ucs4_to_utf8_c(V, output);
296 	if (T != TBase) uni_ucs4_to_utf8_c(T, output);
297 }
298 
uni_ucs4_decompose_multi_utf8(unichar_t chr,buffer_t * output)299 static bool uni_ucs4_decompose_multi_utf8(unichar_t chr, buffer_t *output)
300 {
301 	const uint32_t *value;
302 	unsigned int idx;
303 
304 	if (chr < multidecomp_keys[0] || chr > 0xffff)
305 		return FALSE;
306 
307 	if (!uint32_find(multidecomp_keys, N_ELEMENTS(multidecomp_keys),
308 			 chr, &idx))
309 		return FALSE;
310 
311 	value = &multidecomp_values[multidecomp_offsets[idx]];
312 	for (; *value != 0; value++)
313 		uni_ucs4_to_utf8_c(*value, output);
314 	return TRUE;
315 }
316 
output_add_replacement_char(buffer_t * output)317 static void output_add_replacement_char(buffer_t *output)
318 {
319 	if (output->used >= UTF8_REPLACEMENT_CHAR_LEN &&
320 	    memcmp(CONST_PTR_OFFSET(output->data,
321 				    output->used - UTF8_REPLACEMENT_CHAR_LEN),
322 		   utf8_replacement_char, UTF8_REPLACEMENT_CHAR_LEN) == 0) {
323 		/* don't add the replacement char multiple times */
324 		return;
325 	}
326 	buffer_append(output, utf8_replacement_char, UTF8_REPLACEMENT_CHAR_LEN);
327 }
328 
uni_utf8_to_decomposed_titlecase(const void * _input,size_t size,buffer_t * output)329 int uni_utf8_to_decomposed_titlecase(const void *_input, size_t size,
330 				     buffer_t *output)
331 {
332 	const unsigned char *input = _input;
333 	unichar_t chr;
334 	int ret = 0;
335 
336 	while (size > 0) {
337 		int bytes = uni_utf8_get_char_n(input, size, &chr);
338 		if (bytes <= 0) {
339 			/* invalid input. try the next byte. */
340 			ret = -1;
341 			input++; size--;
342 			output_add_replacement_char(output);
343 			continue;
344 		}
345 		input += bytes;
346 		size -= bytes;
347 
348 		chr = uni_ucs4_to_titlecase(chr);
349 		if (chr >= HANGUL_FIRST && chr <= HANGUL_LAST)
350 			uni_ucs4_decompose_hangul_utf8(chr, output);
351 		else if (uni_ucs4_decompose_uni(&chr) ||
352 			 !uni_ucs4_decompose_multi_utf8(chr, output))
353 			uni_ucs4_to_utf8_c(chr, output);
354 	}
355 	return ret;
356 }
357 
358 static inline unsigned int
is_valid_utf8_seq(const unsigned char * input,unsigned int size)359 is_valid_utf8_seq(const unsigned char *input, unsigned int size)
360 {
361 	unichar_t chr;
362 	int len = uni_utf8_get_char_n(input, size, &chr);
363 	return len <= 0 ? 0 : len;
364 }
365 
uni_utf8_find_invalid_pos(const unsigned char * input,size_t size,size_t * pos_r)366 static int uni_utf8_find_invalid_pos(const unsigned char *input, size_t size,
367 				     size_t *pos_r)
368 {
369 	size_t i, len;
370 
371 	/* find the first invalid utf8 sequence */
372 	for (i = 0; i < size;) {
373 		if (input[i] < 0x80)
374 			i++;
375 		else {
376 			len = is_valid_utf8_seq(input + i, size-i);
377 			if (unlikely(len == 0)) {
378 				*pos_r = i;
379 				return -1;
380 			}
381 			i += len;
382 		}
383 	}
384 	return 0;
385 }
386 
uni_utf8_get_valid_data(const unsigned char * input,size_t size,buffer_t * buf)387 bool uni_utf8_get_valid_data(const unsigned char *input, size_t size,
388 			     buffer_t *buf)
389 {
390 	size_t i, len;
391 
392 	if (uni_utf8_find_invalid_pos(input, size, &i) == 0)
393 		return TRUE;
394 
395 	/* broken utf-8 input - skip the broken characters */
396 	buffer_append(buf, input, i++);
397 
398 	output_add_replacement_char(buf);
399 	while (i < size) {
400 		if (input[i] < 0x80) {
401 			buffer_append_c(buf, input[i++]);
402 			continue;
403 		}
404 
405 		len = is_valid_utf8_seq(input + i, size-i);
406 		if (len == 0) {
407 			i++;
408 			output_add_replacement_char(buf);
409 			continue;
410 		}
411 		buffer_append(buf, input + i, len);
412 		i += len;
413 	}
414 	return FALSE;
415 }
416 
uni_utf8_str_is_valid(const char * str)417 bool uni_utf8_str_is_valid(const char *str)
418 {
419 	size_t i;
420 
421 	return uni_utf8_find_invalid_pos((const unsigned char *)str,
422 					 strlen(str), &i) == 0;
423 }
424 
uni_utf8_data_is_valid(const unsigned char * data,size_t size)425 bool uni_utf8_data_is_valid(const unsigned char *data, size_t size)
426 {
427 	size_t i;
428 
429 	return uni_utf8_find_invalid_pos(data, size, &i) == 0;
430 }
431 
uni_utf8_data_truncate(const unsigned char * data,size_t old_size,size_t max_new_size)432 size_t uni_utf8_data_truncate(const unsigned char *data, size_t old_size,
433 			      size_t max_new_size)
434 {
435 	if (max_new_size >= old_size)
436 		return old_size;
437 	if (max_new_size == 0)
438 		return 0;
439 
440 	if ((data[max_new_size] & 0x80) == 0)
441 		return max_new_size;
442 	while (max_new_size > 0 && (data[max_new_size-1] & 0xc0) == 0x80)
443 		max_new_size--;
444 	if (max_new_size > 0 && (data[max_new_size-1] & 0xc0) == 0xc0)
445 		max_new_size--;
446 	return max_new_size;
447 }
448