1 /*
2
3 silcutf8.c
4
5 Author: Pekka Riikonen <priikone@silcnet.org>
6
7 Copyright (C) 2004 - 2007 Pekka Riikonen
8
9 The contents of this file are subject to one of the Licenses specified
10 in the COPYING file; You may not use this file except in compliance
11 with the License.
12
13 The software distributed under the License is distributed on an "AS IS"
14 basis, in the hope that it will be useful, but WITHOUT WARRANTY OF ANY
15 KIND, either expressed or implied. See the COPYING file for more
16 information.
17
18 */
19
20 #include "silc.h"
21 #include "silcutf8.h"
22
23 /* Encodes the string `bin' of which encoding is `bin_encoding' to the
24 UTF-8 encoding into the buffer `utf8' which is of size of `utf8_size'.
25 Returns the length of the UTF-8 encoded string, or zero (0) on error.
26 By default `bin_encoding' is ASCII, and the caller needs to know the
27 encoding of the input string if it is anything else. */
28
silc_utf8_encode(const unsigned char * bin,SilcUInt32 bin_len,SilcStringEncoding bin_encoding,unsigned char * utf8,SilcUInt32 utf8_size)29 SilcUInt32 silc_utf8_encode(const unsigned char *bin, SilcUInt32 bin_len,
30 SilcStringEncoding bin_encoding,
31 unsigned char *utf8, SilcUInt32 utf8_size)
32 {
33 SilcUInt32 enclen = 0, i, charval = 0;
34
35 if (!bin || !bin_len)
36 return 0;
37
38 if (bin_encoding == SILC_STRING_UTF8) {
39 if (!silc_utf8_valid(bin, bin_len))
40 return 0;
41 if (!utf8)
42 return bin_len;
43 if (bin_len > utf8_size)
44 return 0;
45 memcpy(utf8, bin, bin_len);
46 return bin_len;
47 }
48
49 /* The SILC_STRING_LDAP_DN is alredy UTF-8 but it may be escaped. We
50 remove the escaping and we're done. */
51 if (bin_encoding == SILC_STRING_LDAP_DN ||
52 bin_encoding == SILC_STRING_UTF8_ESCAPE) {
53 unsigned char cv;
54
55 for (i = 0; i < bin_len; i++) {
56 if (bin[i] == '\\') {
57 if (i + 1 >= bin_len)
58 return 0;
59
60 /* If escaped character is any of the following no processing is
61 needed, otherwise it is a hex value and we need to read it. */
62 cv = bin[i + 1];
63 if (cv != ',' && cv != '+' && cv != '"' && cv != '\\' && cv != '<' &&
64 cv != '>' && cv != ';' && cv != ' ' && cv != '#') {
65 unsigned int hexval;
66 if (i + 2 >= bin_len)
67 return 0;
68 if (sscanf(&bin[i + 1], "%02X", &hexval) != 1)
69 return 0;
70 if (utf8) {
71 if (enclen + 1 > utf8_size)
72 return 0;
73 utf8[enclen] = (unsigned char)hexval;
74 }
75
76 i += 2;
77 enclen++;
78 continue;
79 }
80 i++;
81 }
82
83 if (utf8) {
84 if (enclen + 1 > utf8_size)
85 return 0;
86 utf8[enclen] = bin[i];
87 }
88 enclen++;
89 }
90
91 return enclen;
92 }
93
94 if (bin_encoding == SILC_STRING_LOCALE) {
95 #if defined(HAVE_ICONV) && defined(HAVE_NL_LANGINFO) && defined(CODESET)
96 char *fromconv, *icp, *ocp;
97 iconv_t icd;
98 size_t inlen, outlen;
99
100 setlocale(LC_CTYPE, "");
101 fromconv = nl_langinfo(CODESET);
102 if (fromconv && strlen(fromconv)) {
103 icd = iconv_open("UTF-8", fromconv);
104 icp = (char *)bin;
105 ocp = (char *)utf8;
106 inlen = bin_len;
107 outlen = utf8_size;
108 if (icp && ocp && icd != (iconv_t)-1) {
109 if (iconv(icd, &icp, &inlen, &ocp, &outlen) != -1) {
110 utf8_size -= outlen;
111 iconv_close(icd);
112 return utf8_size;
113 }
114 }
115 if (icd != (iconv_t)-1)
116 iconv_close(icd);
117 }
118 #endif
119
120 /* Fallback to 8-bit ASCII */
121 bin_encoding = SILC_STRING_ASCII;
122 }
123
124 for (i = 0; i < bin_len; i++) {
125 switch (bin_encoding) {
126 case SILC_STRING_ASCII:
127 case SILC_STRING_TELETEX:
128 charval = bin[i];
129 break;
130 case SILC_STRING_ASCII_ESC:
131 SILC_NOT_IMPLEMENTED("SILC_STRING_ASCII_ESC");
132 return 0;
133 break;
134 case SILC_STRING_BMP:
135 if (i + 1 >= bin_len)
136 return 0;
137 SILC_GET16_MSB(charval, bin + i);
138 i += 1;
139 break;
140 case SILC_STRING_BMP_LSB:
141 if (i + 1 >= bin_len)
142 return 0;
143 SILC_GET16_LSB(charval, bin + i);
144 i += 1;
145 break;
146 case SILC_STRING_UNIVERSAL:
147 if (i + 3 >= bin_len)
148 return 0;
149 SILC_GET32_MSB(charval, bin + i);
150 i += 3;
151 break;
152 case SILC_STRING_UNIVERSAL_LSB:
153 if (i + 3 >= bin_len)
154 return 0;
155 SILC_GET32_LSB(charval, bin + i);
156 i += 3;
157 break;
158 case SILC_STRING_PRINTABLE:
159 case SILC_STRING_VISIBLE:
160 if (!isprint(bin[i]))
161 return 0;
162 charval = bin[i];
163 break;
164 case SILC_STRING_NUMERICAL:
165 if (bin[i] != 0x20 && !isdigit(bin[i]))
166 return 0;
167 charval = bin[i];
168 break;
169 default:
170 return 0;
171 break;
172 }
173
174 if (charval < 0x80) {
175 if (utf8) {
176 if (enclen > utf8_size)
177 return 0;
178
179 utf8[enclen] = (unsigned char)charval;
180 }
181 enclen++;
182 } else if (charval < 0x800) {
183 if (utf8) {
184 if (enclen + 2 > utf8_size)
185 return 0;
186
187 utf8[enclen ] = (unsigned char )(((charval >> 6) & 0x1f) | 0xc0);
188 utf8[enclen + 1] = (unsigned char )((charval & 0x3f) | 0x80);
189 }
190 enclen += 2;
191 } else if (charval < 0x10000) {
192 if (utf8) {
193 if (enclen + 3 > utf8_size)
194 return 0;
195
196 utf8[enclen ] = (unsigned char )(((charval >> 12) & 0xf) | 0xe0);
197 utf8[enclen + 1] = (unsigned char )(((charval >> 6) & 0x3f) | 0x80);
198 utf8[enclen + 2] = (unsigned char )((charval & 0x3f) | 0x80);
199 }
200 enclen += 3;
201 } else if (charval < 0x200000) {
202 if (utf8) {
203 if (enclen + 4 > utf8_size)
204 return 0;
205
206 utf8[enclen ] = (unsigned char )(((charval >> 18) & 0x7) | 0xf0);
207 utf8[enclen + 1] = (unsigned char )(((charval >> 12) & 0x3f) | 0x80);
208 utf8[enclen + 2] = (unsigned char )(((charval >> 6) & 0x3f) | 0x80);
209 utf8[enclen + 3] = (unsigned char )((charval & 0x3f) | 0x80);
210 }
211 enclen += 4;
212 } else if (charval < 0x4000000) {
213 if (utf8) {
214 if (enclen + 5 > utf8_size)
215 return 0;
216
217 utf8[enclen ] = (unsigned char )(((charval >> 24) & 0x3) | 0xf8);
218 utf8[enclen + 1] = (unsigned char )(((charval >> 18) & 0x3f) | 0x80);
219 utf8[enclen + 2] = (unsigned char )(((charval >> 12) & 0x3f) | 0x80);
220 utf8[enclen + 3] = (unsigned char )(((charval >> 6) & 0x3f) | 0x80);
221 utf8[enclen + 4] = (unsigned char )((charval & 0x3f) | 0x80);
222 }
223 enclen += 5;
224 } else {
225 if (utf8) {
226 if (enclen + 6 > utf8_size)
227 return 0;
228
229 utf8[enclen ] = (unsigned char )(((charval >> 30) & 0x1) | 0xfc);
230 utf8[enclen + 1] = (unsigned char )(((charval >> 24) & 0x3f) | 0x80);
231 utf8[enclen + 2] = (unsigned char )(((charval >> 18) & 0x3f) | 0x80);
232 utf8[enclen + 3] = (unsigned char )(((charval >> 12) & 0x3f) | 0x80);
233 utf8[enclen + 4] = (unsigned char )(((charval >> 6) & 0x3f) | 0x80);
234 utf8[enclen + 5] = (unsigned char )((charval & 0x3f) | 0x80);
235 }
236 enclen += 6;
237 }
238 }
239
240 return enclen;
241 }
242
243 /* Decodes UTF-8 encoded string `utf8' to string of which encoding is
244 to be `bin_encoding', into the `bin' buffer of size of `bin_size'.
245 Returns the length of the decoded buffer, or zero (0) on error.
246 By default `bin_encoding' is ASCII, and the caller needs to know to
247 which encoding the output string is to be encoded if ASCII is not
248 desired. */
249
silc_utf8_decode(const unsigned char * utf8,SilcUInt32 utf8_len,SilcStringEncoding bin_encoding,unsigned char * bin,SilcUInt32 bin_size)250 SilcUInt32 silc_utf8_decode(const unsigned char *utf8, SilcUInt32 utf8_len,
251 SilcStringEncoding bin_encoding,
252 unsigned char *bin, SilcUInt32 bin_size)
253 {
254 SilcUInt32 enclen = 0, i, charval, bytes;
255
256 if (!utf8 || !utf8_len)
257 return 0;
258
259 if (bin_encoding == SILC_STRING_UTF8) {
260 if (!silc_utf8_valid(utf8, utf8_len) ||
261 utf8_len > bin_size)
262 return 0;
263 memcpy(bin, utf8, utf8_len);
264 return utf8_len;
265 }
266
267 if (bin_encoding == SILC_STRING_LOCALE) {
268 #if defined(HAVE_ICONV) && defined(HAVE_NL_LANGINFO) && defined(CODESET)
269 char *toconv, *icp, *ocp;
270 iconv_t icd;
271 size_t inlen, outlen;
272
273 setlocale(LC_CTYPE, "");
274 toconv = nl_langinfo(CODESET);
275 if (toconv && strlen(toconv)) {
276 icd = iconv_open(toconv, "UTF-8");
277 icp = (char *)utf8;
278 ocp = (char *)bin;
279 inlen = utf8_len;
280 outlen = bin_size;
281 if (icp && ocp && icd != (iconv_t)-1) {
282 if (iconv(icd, &icp, &inlen, &ocp, &outlen) != -1) {
283 bin_size -= outlen;
284 iconv_close(icd);
285 return bin_size;
286 }
287 }
288 if (icd != (iconv_t)-1)
289 iconv_close(icd);
290 }
291 #endif
292
293 /* Fallback to 8-bit ASCII */
294 bin_encoding = SILC_STRING_ASCII;
295 }
296
297 for (i = 0; i < utf8_len; i++) {
298 if ((utf8[i] & 0x80) == 0x00) {
299 charval = utf8[i] & 0x7f;
300 bytes = 1;
301 } else if ((utf8[i] & 0xe0) == 0xc0) {
302 if (i + 1 >= utf8_len)
303 return 0;
304
305 if ((utf8[i + 1] & 0xc0) != 0x80)
306 return 0;
307
308 charval = (utf8[i++] & 0x1f) << 6;
309 charval |= utf8[i] & 0x3f;
310 if (charval < 0x80)
311 return 0;
312 bytes = 2;
313 } else if ((utf8[i] & 0xf0) == 0xe0) {
314 if (i + 2 >= utf8_len)
315 return 0;
316
317 if (((utf8[i + 1] & 0xc0) != 0x80) ||
318 ((utf8[i + 2] & 0xc0) != 0x80))
319 return 0;
320
321 /* Surrogates not allowed (D800-DFFF) */
322 if (utf8[i] == 0xed &&
323 utf8[i + 1] >= 0xa0 && utf8[i + 1] <= 0xbf &&
324 utf8[i + 2] >= 0x80 && utf8[i + 2] <= 0xbf)
325 return 0;
326
327 charval = (utf8[i++] & 0xf) << 12;
328 charval |= (utf8[i++] & 0x3f) << 6;
329 charval |= utf8[i] & 0x3f;
330 if (charval < 0x800)
331 return 0;
332 bytes = 3;
333 } else if ((utf8[i] & 0xf8) == 0xf0) {
334 if (i + 3 >= utf8_len)
335 return 0;
336
337 if (((utf8[i + 1] & 0xc0) != 0x80) ||
338 ((utf8[i + 2] & 0xc0) != 0x80) ||
339 ((utf8[i + 3] & 0xc0) != 0x80))
340 return 0;
341
342 charval = ((SilcUInt32)(utf8[i++] & 0x7)) << 18;
343 charval |= (utf8[i++] & 0x3f) << 12;
344 charval |= (utf8[i++] & 0x3f) << 6;
345 charval |= utf8[i] & 0x3f;
346 if (charval < 0x10000)
347 return 0;
348 bytes = 4;
349 } else if ((utf8[i] & 0xfc) == 0xf8) {
350 if (i + 4 >= utf8_len)
351 return 0;
352
353 if (((utf8[i + 1] & 0xc0) != 0x80) ||
354 ((utf8[i + 2] & 0xc0) != 0x80) ||
355 ((utf8[i + 3] & 0xc0) != 0x80) ||
356 ((utf8[i + 4] & 0xc0) != 0x80))
357 return 0;
358
359 charval = ((SilcUInt32)(utf8[i++] & 0x3)) << 24;
360 charval |= ((SilcUInt32)(utf8[i++] & 0x3f)) << 18;
361 charval |= ((SilcUInt32)(utf8[i++] & 0x3f)) << 12;
362 charval |= (utf8[i++] & 0x3f) << 6;
363 charval |= utf8[i] & 0x3f;
364 if (charval < 0x200000)
365 return 0;
366 bytes = 5;
367 } else if ((utf8[i] & 0xfe) == 0xfc) {
368 if (i + 5 >= utf8_len)
369 return 0;
370
371 if (((utf8[i + 1] & 0xc0) != 0x80) ||
372 ((utf8[i + 2] & 0xc0) != 0x80) ||
373 ((utf8[i + 3] & 0xc0) != 0x80) ||
374 ((utf8[i + 4] & 0xc0) != 0x80) ||
375 ((utf8[i + 5] & 0xc0) != 0x80))
376 return 0;
377
378 charval = ((SilcUInt32)(utf8[i++] & 0x1)) << 30;
379 charval |= ((SilcUInt32)(utf8[i++] & 0x3f)) << 24;
380 charval |= ((SilcUInt32)(utf8[i++] & 0x3f)) << 18;
381 charval |= ((SilcUInt32)(utf8[i++] & 0x3f)) << 12;
382 charval |= (utf8[i++] & 0x3f) << 6;
383 charval |= utf8[i] & 0x3f;
384 if (charval < 0x4000000)
385 return 0;
386 bytes = 6;
387 } else {
388 return 0;
389 }
390
391 switch (bin_encoding) {
392 case SILC_STRING_ASCII:
393 case SILC_STRING_PRINTABLE:
394 case SILC_STRING_VISIBLE:
395 case SILC_STRING_TELETEX:
396 case SILC_STRING_NUMERICAL:
397 if (bin) {
398 if (enclen + 1 > bin_size)
399 return 0;
400
401 bin[enclen] = (unsigned char)charval;
402 }
403 enclen++;
404 break;
405 case SILC_STRING_ASCII_ESC:
406 SILC_NOT_IMPLEMENTED("SILC_STRING_ASCII_ESC");
407 return 0;
408 break;
409 case SILC_STRING_BMP:
410 if (bin) {
411 if (enclen + 2 > bin_size)
412 return 0;
413 SILC_PUT16_MSB(charval, bin + enclen);
414 }
415 enclen += 2;
416 break;
417 case SILC_STRING_BMP_LSB:
418 if (bin) {
419 if (enclen + 2 > bin_size)
420 return 0;
421 SILC_PUT16_LSB(charval, bin + enclen);
422 }
423 enclen += 2;
424 break;
425 case SILC_STRING_UNIVERSAL:
426 if (bin) {
427 if (enclen + 4 > bin_size)
428 return 0;
429 SILC_PUT32_MSB(charval, bin + enclen);
430 }
431 enclen += 4;
432 break;
433 case SILC_STRING_UNIVERSAL_LSB:
434 if (bin) {
435 if (enclen + 4 > bin_size)
436 return 0;
437 SILC_PUT32_LSB(charval, bin + enclen);
438 }
439 enclen += 4;
440 break;
441 case SILC_STRING_LDAP_DN:
442 {
443 int k;
444 unsigned char cv;
445
446 /* Non-printable UTF-8 characters will be escaped, printable will
447 be as is. We take the bytes directly from the original data. */
448 for (k = 0; k < bytes; k++) {
449 cv = utf8[(i - (bytes - 1)) + k];
450
451 /* If string starts with space or # escape it */
452 if (!enclen && (cv == '#' || cv == ' ')) {
453 if (bin) {
454 if (enclen + 2 > bin_size)
455 return 0;
456 bin[enclen] = '\\';
457 bin[enclen + 1] = cv;
458 }
459 enclen += 2;
460 continue;
461 }
462
463 /* If string ends with space escape it */
464 if (i == utf8_len - 1 && cv == ' ') {
465 if (bin) {
466 if (enclen + 2 > bin_size)
467 return 0;
468 bin[enclen] = '\\';
469 bin[enclen + 1] = cv;
470 }
471 enclen += 2;
472 continue;
473 }
474
475 /* If character is any of following then escape */
476 if (cv == ',' || cv == '+' || cv == '"' || cv == '\\' || cv == '<' ||
477 cv == '>' || cv == ';') {
478 if (bin) {
479 if (enclen + 2 > bin_size)
480 return 0;
481 bin[enclen] = '\\';
482 bin[enclen + 1] = cv;
483 }
484 enclen += 2;
485 continue;
486 }
487
488 /* If character is not printable escape it with hex character */
489 if (!isprint((int)cv)) {
490 if (bin) {
491 if (enclen + 3 > bin_size)
492 return 0;
493 bin[enclen] = '\\';
494 silc_snprintf(bin + enclen + 1, 3, "%02X", cv);
495 }
496 enclen += 3;
497 continue;
498 }
499
500 if (bin) {
501 if (enclen + 1 > bin_size)
502 return 0;
503 bin[enclen] = cv;
504 }
505 enclen++;
506 }
507 }
508 break;
509 default:
510 return 0;
511 break;
512 }
513 }
514
515 return enclen;
516 }
517
518 /* UTF-8 to wide characters */
519
silc_utf8_c2w(const unsigned char * utf8,SilcUInt32 utf8_len,SilcUInt16 * utf8_wide,SilcUInt32 utf8_wide_size)520 SilcUInt32 silc_utf8_c2w(const unsigned char *utf8, SilcUInt32 utf8_len,
521 SilcUInt16 *utf8_wide, SilcUInt32 utf8_wide_size)
522 {
523 unsigned char *tmp;
524 SilcUInt32 tmp_len;
525 int i, k;
526
527 tmp_len = silc_utf8_decoded_len(utf8, utf8_len, SILC_STRING_BMP);
528 if (!tmp_len)
529 return 0;
530
531 if (utf8_wide_size < tmp_len / 2)
532 return 0;
533
534 memset(utf8_wide, 0, utf8_wide_size * 2);
535
536 tmp = silc_malloc(tmp_len);
537 if (!tmp)
538 return 0;
539
540 silc_utf8_decode(utf8, utf8_len, SILC_STRING_BMP, tmp, tmp_len);
541
542 for (i = 0, k = 0; i < tmp_len; i += 2, k++)
543 SILC_GET16_MSB(utf8_wide[k], tmp + i);
544
545 silc_free(tmp);
546 return k + 1;
547 }
548
549 /* Wide characters to UTF-8 */
550
silc_utf8_w2c(const SilcUInt16 * wide_str,SilcUInt32 wide_str_len,unsigned char * utf8,SilcUInt32 utf8_size)551 SilcUInt32 silc_utf8_w2c(const SilcUInt16 *wide_str,
552 SilcUInt32 wide_str_len,
553 unsigned char *utf8, SilcUInt32 utf8_size)
554
555 {
556 unsigned char *tmp;
557 SilcUInt32 tmp_len;
558 int i, k;
559
560 if (utf8_size < wide_str_len * 2)
561 return 0;
562
563 memset(utf8, 0, utf8_size);
564
565 tmp = silc_malloc(wide_str_len * 2);
566 if (!tmp)
567 return 0;
568
569 for (i = 0, k = 0; i < wide_str_len; i += 2, k++)
570 SILC_PUT16_MSB(wide_str[k], tmp + i);
571
572 tmp_len = silc_utf8_encode(tmp, wide_str_len * 2, SILC_STRING_BMP,
573 utf8, utf8_size);
574
575 silc_free(tmp);
576 return tmp_len;
577 }
578
579 /* Returns the length of UTF-8 encoded string if the `bin' of
580 encoding of `bin_encoding' is encoded with silc_utf8_encode. */
581
silc_utf8_encoded_len(const unsigned char * bin,SilcUInt32 bin_len,SilcStringEncoding bin_encoding)582 SilcUInt32 silc_utf8_encoded_len(const unsigned char *bin, SilcUInt32 bin_len,
583 SilcStringEncoding bin_encoding)
584 {
585 return silc_utf8_encode(bin, bin_len, bin_encoding, NULL, 0);
586 }
587
588 /* Returns the length of decoded string if the `bin' of encoding of
589 `bin_encoding' is decoded with silc_utf8_decode. */
590
silc_utf8_decoded_len(const unsigned char * bin,SilcUInt32 bin_len,SilcStringEncoding bin_encoding)591 SilcUInt32 silc_utf8_decoded_len(const unsigned char *bin, SilcUInt32 bin_len,
592 SilcStringEncoding bin_encoding)
593 {
594 return silc_utf8_decode(bin, bin_len, bin_encoding, NULL, 0);
595 }
596
597 /* Returns TRUE if the `utf8' string of length of `utf8_len' is valid
598 UTF-8 encoded string, FALSE if it is not UTF-8 encoded string. */
599
silc_utf8_valid(const unsigned char * utf8,SilcUInt32 utf8_len)600 SilcBool silc_utf8_valid(const unsigned char *utf8, SilcUInt32 utf8_len)
601 {
602 return silc_utf8_decode(utf8, utf8_len, 0, NULL, 0) != 0;
603 }
604
605 /* Pretty close strcasecmp */
606
silc_utf8_strcasecmp(const char * s1,const char * s2)607 SilcBool silc_utf8_strcasecmp(const char *s1, const char *s2)
608 {
609 if (s1 == s2)
610 return TRUE;
611 if (strlen(s1) != strlen(s2))
612 return FALSE;
613
614 return silc_utf8_strncasecmp(s1, s2, strlen(s1));
615 }
616
617 /* Pretty close strcasecmp */
618
silc_utf8_strncasecmp(const char * s1,const char * s2,SilcUInt32 n)619 SilcBool silc_utf8_strncasecmp(const char *s1, const char *s2, SilcUInt32 n)
620 {
621 unsigned char *s1u, *s2u;
622 SilcUInt32 s1u_len, s2u_len;
623 SilcStringprepStatus status;
624 SilcBool ret;
625
626 if (s1 == s2)
627 return TRUE;
628
629 /* Casefold and normalize */
630 status = silc_stringprep(s1, n, SILC_STRING_UTF8,
631 SILC_IDENTIFIERC_PREP, 0, &s1u,
632 &s1u_len, SILC_STRING_UTF8);
633 if (status != SILC_STRINGPREP_OK)
634 return FALSE;
635
636 /* Casefold and normalize */
637 status = silc_stringprep(s2, n, SILC_STRING_UTF8,
638 SILC_IDENTIFIERC_PREP, 0, &s2u,
639 &s2u_len, SILC_STRING_UTF8);
640 if (status != SILC_STRINGPREP_OK)
641 return FALSE;
642
643 ret = !memcmp(s1u, s2u, n);
644
645 silc_free(s1u);
646 silc_free(s2u);
647
648 return ret;
649 }
650