1 /* Copyright (C) 2005 Morten K. Poulsen <morten at afdelingp.dk>
2 *
3 * $Id$
4 *
5 * Permission is hereby granted, free of charge, to any person obtaining a copy
6 * of this software and associated documentation files (the "Software"), to
7 * deal in the Software without restriction, including without limitation the
8 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
9 * sell copies of the Software, and to permit persons to whom the Software is
10 * furnished to do so, subject to the following conditions:
11 *
12 * The above copyright notice and this permission notice shall be included in
13 * all copies or substantial portions of the Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include <unistd.h>
25 #include <string.h>
26 #include <stdio.h>
27 #include <fcntl.h>
28 #include <stdlib.h>
29 #include <sys/types.h>
30 #include <sys/stat.h>
31 #include <sys/wait.h>
32 #include <ctype.h>
33 #include <iconv.h>
34
35 #include "mlmmj.h"
36 #include "unistr.h"
37 #include "log_error.h"
38 #include "memory.h"
39
/* Size, in bytes, of the scratch buffer that unistr_append_iconv() hands to
 * iconv() to receive converted output. The buffer is a local array (i.e. it
 * is allocated on the stack), so it can't be too big. */
#define ICONV_BUFFER_SIZE 160
42
43
unistr_new(void)44 unistr *unistr_new(void)
45 {
46 unistr *ret;
47
48 ret = mymalloc(sizeof(unistr));
49 ret->len = 0;
50 ret->alloc_len = 64;
51 ret->chars = mymalloc(ret->alloc_len * sizeof(unistr_char));
52
53 return ret;
54 }
55
56
/* Release a string created by unistr_new() together with its character
 * buffer. Passing NULL is allowed and does nothing.
 */
void unistr_free(unistr *str)
{
	if (str != NULL) {
		myfree(str->chars);
		myfree(str);
	}
}
64
65
unistr_cmp(const unistr * str1,const unistr * str2)66 int unistr_cmp(const unistr *str1, const unistr *str2)
67 {
68 unsigned int i;
69
70 for (i=0; i<str1->len; i++) {
71 if (str1->chars[i] < str2->chars[i]) {
72 return -1;
73 } else if (str1->chars[i] > str2->chars[i]) {
74 return 1;
75 }
76 }
77 if (str2->len > str1->len) {
78 return 1;
79 }
80 return 0;
81 }
82
83
unistr_dup(const unistr * str)84 unistr *unistr_dup(const unistr *str)
85 {
86 unistr *ret;
87 unsigned int i;
88
89 ret = unistr_new();
90 for (i=0; i<str->len; i++) {
91 unistr_append_char(ret, str->chars[i]);
92 }
93
94 return ret;
95 }
96
97
/* Append the single character uc to str, doubling the size of the character
 * buffer first when it is already full.
 */
void unistr_append_char(unistr *str, unistr_char uc)
{
	if (str->alloc_len <= str->len) {
		/* Buffer exhausted: double the capacity. */
		str->alloc_len += str->alloc_len;
		str->chars = myrealloc(str->chars,
				sizeof(unistr_char) * str->alloc_len);
	}

	str->chars[str->len] = uc;
	str->len += 1;
}
106
107
/* Append bin_len bytes of US-ASCII text to str.
 *
 * Bytes above 0x7F are not US-ASCII; each one is replaced by '?'.
 */
void unistr_append_usascii(unistr *str, const char *binary, size_t bin_len)
{
	unsigned int pos;

	for (pos = 0; pos < bin_len; pos++) {
		unsigned char byte = (unsigned char)binary[pos];

		unistr_append_char(str, (byte > 0x7F) ? '?' : byte);
	}
}
120
121
/* Decode bin_len bytes of UTF-8 and append the resulting characters to str.
 *
 * Sequences of one to six bytes are accepted (the same range that
 * unistr_to_utf8() can produce). Every malformed sequence is replaced by a
 * single '?':
 *   - a byte that cannot start a sequence (stray continuation byte, 0xFE,
 *     0xFF),
 *   - a sequence cut short by the end of the input or by a byte that is not
 *     a continuation byte (that byte is NOT consumed, so it is decoded on
 *     its own afterwards),
 *   - an overlong encoding, i.e. a value that would fit in fewer bytes.
 *
 * No byte beyond binary[bin_len - 1] is ever read.
 */
void unistr_append_utf8(unistr *str, const char *binary, size_t bin_len)
{
	/* Smallest value that legitimately needs N bytes, indexed by N. */
	static const unistr_char min_value[] = {
		0, 0, 0x80, 0x800, 0x10000, 0x200000, 0x4000000
	};
	const unsigned char *bin = (const unsigned char *)binary;
	size_t i = 0;

	while (i < bin_len) {
		unsigned char lead = bin[i++];
		unistr_char ch;
		unsigned int seq_len;
		unsigned int k;
		int ok = 1;

		if (lead <= 0x7F) { /* 1 */
			unistr_append_char(str, lead);
			continue;
		}

		if ((lead & 0xE0) == 0xC0) { /* 2 */
			ch = lead & 0x1F;
			seq_len = 2;
		} else if ((lead & 0xF0) == 0xE0) { /* 3 */
			ch = lead & 0x0F;
			seq_len = 3;
		} else if ((lead & 0xF8) == 0xF0) { /* 4 */
			ch = lead & 0x07;
			seq_len = 4;
		} else if ((lead & 0xFC) == 0xF8) { /* 5 */
			ch = lead & 0x03;
			seq_len = 5;
		} else if ((lead & 0xFE) == 0xFC) { /* 6 */
			ch = lead & 0x01;
			seq_len = 6;
		} else {
			/* not a valid first byte */
			unistr_append_char(str, '?');
			continue;
		}

		for (k = 1; k < seq_len; k++) {
			/* Check the bound BEFORE touching the byte, and leave a
			 * non-continuation byte in place for the next round. */
			if (i >= bin_len || (bin[i] & 0xC0) != 0x80) {
				ok = 0;
				break;
			}
			ch = (ch << 6) | (bin[i] & 0x3F);
			i++;
		}

		/* Overlong forms are detected on the decoded value. Looking only
		 * at the data bits of the first byte would wrongly reject valid
		 * sequences such as E0 A0 80 (U+0800) or F0 90 80 80 (U+10000). */
		if (!ok || ch < min_value[seq_len])
			ch = '?';

		unistr_append_char(str, ch);
	}
}
171
172
/* Append bin_len bytes of ISO-8859-1 text to str.
 *
 * Each byte is appended with its own value as the character; a NUL byte is
 * replaced by '?'.
 */
void unistr_append_iso88591(unistr *str, const char *binary, size_t bin_len)
{
	unsigned int pos;

	for (pos = 0; pos < bin_len; pos++) {
		unsigned char byte = (unsigned char)binary[pos];

		if (byte != 0x00) {
			unistr_append_char(str, byte);
		} else {
			unistr_append_char(str, '?');
		}
	}
}
185
186
/* Convert bin_len bytes from the named charset with iconv() and append the
 * result to str.
 *
 * The input is converted to UTF-8 in chunks of at most ICONV_BUFFER_SIZE
 * output bytes, each of which is passed to unistr_append_utf8().
 *
 * Error handling:
 *   - unknown charset (iconv_open() fails): "???" is appended,
 *   - illegal input sequence: '?' is appended, one input byte is skipped
 *     and conversion continues,
 *   - incomplete sequence at the end of the input: '?' is appended and
 *     conversion stops,
 *   - any other failure: "???" is appended and conversion stops.
 * In every case the output iconv() had produced before the error is kept.
 */
void unistr_append_iconv(unistr *str, char *binary, size_t bin_len,
		const char * charset)
{
	char bytes[ICONV_BUFFER_SIZE];
	char *buffer;
	size_t bufferleft;
	size_t rc;
	int err;
	iconv_t cd;

	cd = iconv_open("UTF-8", charset);
	if (cd == (iconv_t)-1) {
		unistr_append_usascii(str, "???", 3);
		return;
	}

	while (bin_len > 0) {
		buffer = bytes;
		bufferleft = ICONV_BUFFER_SIZE;
		rc = iconv(cd, &binary, &bin_len, &buffer, &bufferleft);
		/* Save errno right away; the calls below may change it. */
		err = errno;

		/* iconv() writes every character it managed to convert before
		 * it reports an error, so flush the buffer unconditionally. */
		unistr_append_utf8(str, bytes, ICONV_BUFFER_SIZE - bufferleft);

		if (rc != (size_t)-1) {
			/* success */
			continue;
		}
		if (err == E2BIG && bufferleft < ICONV_BUFFER_SIZE) {
			/* buffer full; go around again (only if progress was
			 * made, so this can never spin forever) */
			continue;
		}
		if (err == EILSEQ) {
			/* illegal sequence; try to recover */
			unistr_append_usascii(str, "?", 1);
			bin_len--;
			binary++;
			continue;
		}
		if (err == EINVAL) {
			/* incomplete sequence; we're done */
			unistr_append_usascii(str, "?", 1);
		} else {
			/* some other error; abort */
			unistr_append_usascii(str, "???", 3);
		}
		break;
	}
	iconv_close(cd);
}
227
228
unistr_dump(const unistr * str)229 void unistr_dump(const unistr *str)
230 {
231 unsigned int i;
232
233 printf("unistr_dump(%p)\n", (void *)str);
234 printf(" ->len = %lu\n", (unsigned long)str->len);
235 printf(" ->alloc_len = %lu\n", (unsigned long)str->alloc_len);
236 printf(" ->chars [ ");
237 for (i=0; i<str->len; i++) {
238 if ((str->chars[i] <= 0x7F) && (str->chars[i] != '\n')) {
239 printf("'%c' ", str->chars[i]);
240 } else {
241 printf("0x%02X ", str->chars[i]);
242 }
243 }
244 printf("]\n");
245 }
246
247
unistr_to_utf8(const unistr * str)248 char *unistr_to_utf8(const unistr *str)
249 {
250 unsigned int i;
251 size_t len = 0;
252 char *ret;
253 char *p;
254
255 for (i=0; i<str->len; i++) {
256 if (str->chars[i] <= 0x7F) {
257 len++;
258 } else if (str->chars[i] <= 0x7FF) {
259 len += 2;
260 } else if (str->chars[i] <= 0xFFFF) {
261 len += 3;
262 } else if (str->chars[i] <= 0x1FFFFF) {
263 len += 4;
264 } else if (str->chars[i] <= 0x3FFFFFF) {
265 len += 5;
266 } else if (str->chars[i] <= 0x7FFFFFFF) {
267 len += 6;
268 } else {
269 errno = 0;
270 log_error(LOG_ARGS, "unistr_to_utf8(): can not utf-8 encode"
271 "U+%04X", str->chars[i]);
272 return mystrdup("");
273 }
274 }
275 len++; /* NUL */
276
277 ret = mymalloc(len);
278 p = ret;
279
280 for (i=0; i<str->len; i++) {
281 if (str->chars[i] <= 0x7F) { /* 1 */
282 *(p++) = str->chars[i];
283 } else if (str->chars[i] <= 0x7FF) { /* 2 */
284 *(p++) = 192 + ((str->chars[i] & 1984) >> 6);
285 *(p++) = 128 + (str->chars[i] & 63);
286 } else if (str->chars[i] <= 0xFFFF) { /* 3 */
287 *(p++) = 224 + ((str->chars[i] & 61440) >> 12);
288 *(p++) = 128 + ((str->chars[i] & 4032) >> 6);
289 *(p++) = 128 + (str->chars[i] & 63);
290 } else if (str->chars[i] <= 0x1FFFFF) { /* 4 */
291 *(p++) = 240 + ((str->chars[i] & 1835008) >> 18);
292 *(p++) = 128 + ((str->chars[i] & 258048) >> 12);
293 *(p++) = 128 + ((str->chars[i] & 4032) >> 6);
294 *(p++) = 128 + (str->chars[i] & 63);
295 } else if (str->chars[i] <= 0x3FFFFFF) { /* 5 */
296 *(p++) = 248 + ((str->chars[i] & 50331648) >> 24);
297 *(p++) = 128 + ((str->chars[i] & 16515072) >> 18);
298 *(p++) = 128 + ((str->chars[i] & 258048) >> 12);
299 *(p++) = 128 + ((str->chars[i] & 4032) >> 6);
300 *(p++) = 128 + (str->chars[i] & 63);
301 } else if (str->chars[i] <= 0x7FFFFFFF) { /* 6 */
302 *(p++) = 252 + ((str->chars[i] & 1073741824) >> 30);
303 *(p++) = 128 + ((str->chars[i] & 1056964608) >> 24);
304 *(p++) = 128 + ((str->chars[i] & 16515072) >> 18);
305 *(p++) = 128 + ((str->chars[i] & 258048) >> 12);
306 *(p++) = 128 + ((str->chars[i] & 4032) >> 6);
307 *(p++) = 128 + (str->chars[i] & 63);
308 } else {
309 errno = 0;
310 log_error(LOG_ARGS, "unistr_to_utf8(): can not utf-8 encode"
311 "U+%04X", str->chars[i]);
312 }
313 }
314 *(p++) = '\0';
315
316 return ret;
317 }
318
319
/* Return the numeric value (0-15) of the hexadecimal digit ch; upper and
 * lower case letters are both accepted. Any other character yields 0.
 *
 * The digit is classified with plain range checks on an unsigned value, so
 * bytes above 0x7F are handled safely even where char is signed.
 */
static int hexval(char ch)
{
	unsigned char uc = (unsigned char)ch;

	if ((uc >= '0') && (uc <= '9')) {
		return uc - '0';
	}

	if ((uc >= 'a') && (uc <= 'f')) {
		return 10 + (uc - 'a');
	}

	if ((uc >= 'A') && (uc <= 'F')) {
		return 10 + (uc - 'A');
	}

	return 0;
}
334
335
/* Decode the "Q" encoding used in encoded header words.
 *
 * "=XX" (two hexadecimal digits) becomes the byte XX, '_' becomes a space,
 * and everything else, including a '=' that is not followed by two
 * hexadecimal digits, is copied unchanged.
 *
 * On return *binary points to a newly allocated buffer that the caller must
 * release with myfree(), and *bin_len is the number of decoded bytes. A NUL
 * byte is stored after the data as a safety net; it is not counted in
 * *bin_len.
 */
static void decode_qp(char *str, char **binary, size_t *bin_len)
{
	/* Work on unsigned bytes: the <ctype.h> functions must not be given
	 * negative values. */
	const unsigned char *in = (const unsigned char *)str;
	char *out;
	size_t n = 0;

	/* The decoded data is never longer than the input. The extra byte
	 * holds the NUL and keeps the request non-zero for an empty input. */
	out = mymalloc(strlen(str) + 1);

	while (*in != '\0') {
		/* The && chain stops at the terminating NUL, so in[1] and in[2]
		 * are only read while still inside the string. */
		if ((in[0] == '=') && isxdigit(in[1]) && isxdigit(in[2])) {
			out[n++] = (char)((hexval((char)in[1]) << 4)
					+ hexval((char)in[2]));
			in += 3;
		} else if (in[0] == '_') {
			out[n++] = 0x20;
			in++;
		} else {
			out[n++] = (char)in[0];
			in++;
		}
	}
	out[n] = '\0';

	*binary = out;
	*bin_len = n;
}
355
356
/* Value (0-63) of the base64 alphabet character c, or -1 if c is not part
 * of the alphabet (padding, whitespace, anything else). */
static int base64_value(unsigned char c)
{
	if ((c >= 'A') && (c <= 'Z'))
		return c - 'A';
	if ((c >= 'a') && (c <= 'z'))
		return 26 + (c - 'a');
	if ((c >= '0') && (c <= '9'))
		return 52 + (c - '0');
	if (c == '+')
		return 62;
	if (c == '/')
		return 63;
	return -1;
}


/* Decode base64 text.
 *
 * Characters outside the base64 alphabet, including the '=' padding, are
 * skipped. Bits left over at the end that do not make up a whole byte are
 * dropped.
 *
 * On return *binary points to a newly allocated buffer that the caller must
 * release with myfree(), and *bin_len is the number of decoded bytes. A NUL
 * byte is stored after the data as a safety net; it is not counted in
 * *bin_len.
 */
static void decode_base64(char *str, char **binary, size_t *bin_len)
{
	size_t len;
	size_t i;
	size_t n = 0;
	unsigned int acc = 0;	/* bit accumulator, newest bits lowest */
	int acc_bits = 0;	/* number of not yet emitted bits in acc */
	int val;
	char *out;

	/* The decoded data is never longer than the input. The extra byte
	 * holds the NUL and keeps the request non-zero for an empty input. */
	len = strlen(str);
	out = mymalloc(len + 1);

	for (i = 0; i < len; i++) {
		val = base64_value((unsigned char)str[i]);
		if (val == -1)
			continue;
		acc = (acc << 6) | (unsigned int)val;
		acc_bits += 6;
		if (acc_bits >= 8) {
			/* A whole byte is available: emit the oldest 8 bits. */
			acc_bits -= 8;
			out[n++] = (char)((acc >> acc_bits) & 255);
		}
	}
	out[n] = '\0';

	*binary = out;
	*bin_len = n;
}
403
404
405 /* wsp, if not NULL, is an earlier offset into the same string as word,
406 * to whitespace that should only be included if word is not encoded. */
header_decode_word(char * wsp,char * word,unistr * ret)407 static int header_decode_word(char *wsp, char *word, unistr *ret)
408 {
409 char *my_word;
410 char *charset, *encoding, *string, *end;
411 char *binary;
412 size_t bin_len;
413
414 if (wsp == NULL)
415 wsp = word;
416
417 if ((word[0] != '=') || (word[1] != '?')) {
418 unistr_append_usascii(ret, wsp, strlen(wsp));
419 return 0;
420 }
421
422 my_word = mystrdup(word);
423
424 charset = my_word + 2;
425
426 if ((encoding = strchr(charset, '?')) == NULL) {
427 /* missing encoding */
428 unistr_append_usascii(ret, wsp, word-wsp);
429 unistr_append_usascii(ret, "???", 3);
430 myfree(my_word);
431 return 0;
432 }
433 *(encoding++) = '\0';
434
435 if ((string = strchr(encoding, '?')) == NULL) {
436 /* missing string */
437 unistr_append_usascii(ret, wsp, word-wsp);
438 unistr_append_usascii(ret, "???", 3);
439 myfree(my_word);
440 return 0;
441 }
442 *(string++) = '\0';
443
444 if ((end = strchr(string, '?')) == NULL) {
445 /* missing end */
446 unistr_append_usascii(ret, wsp, word-wsp);
447 unistr_append_usascii(ret, "???", 3);
448 myfree(my_word);
449 return 0;
450 }
451 *(end++) = '\0';
452 if ((end[0] != '=') || (end[1] != '\0')) {
453 /* broken end */
454 unistr_append_usascii(ret, wsp, word-wsp);
455 unistr_append_usascii(ret, "???", 3);
456 myfree(my_word);
457 return 0;
458 }
459
460 if (tolower(encoding[0]) == 'q') {
461 decode_qp(string, &binary, &bin_len);
462 } else if (tolower(encoding[0]) == 'b') {
463 decode_base64(string, &binary, &bin_len);
464 } else {
465 /* unknown encoding */
466 unistr_append_usascii(ret, wsp, word-wsp);
467 unistr_append_usascii(ret, "???", 3);
468 myfree(my_word);
469 return 0;
470 }
471
472 if (strcasecmp(charset, "us-ascii") == 0) {
473 unistr_append_usascii(ret, binary, bin_len);
474 } else if (strcasecmp(charset, "utf-8") == 0) {
475 unistr_append_utf8(ret, binary, bin_len);
476 } else if (strcasecmp(charset, "iso-8859-1") == 0) {
477 unistr_append_iso88591(ret, binary, bin_len);
478 } else {
479 unistr_append_iconv(ret, binary, bin_len, charset);
480 }
481
482 myfree(my_word);
483 myfree(binary);
484
485 return 1;
486 }
487
488
489 /* IN: " =?iso-8859-1?Q?hyggem=F8de?= torsdag "
490 * OUT: "hyggem\xC3\xB8de torsdag"
491 */
unistr_header_to_utf8(const char * str)492 char *unistr_header_to_utf8(const char *str)
493 {
494 char *my_str;
495 char *word;
496 char *p;
497 char c;
498 char *wsp = NULL;
499 int decoded = 0;
500 unistr *us;
501 char *ret;
502
503 my_str = mystrdup(str);
504 us = unistr_new();
505
506 p = my_str + strspn(my_str, " \t\n");
507 wsp = p;
508 while (*p) {
509 if (!decoded) {
510 unistr_append_usascii(us, wsp, p-wsp);
511 wsp = NULL;
512 }
513 word = p;
514 p += strcspn(p, " \t\n");
515 c = *p;
516 *p = '\0';
517 decoded = header_decode_word(wsp, word, us);
518 *p = c;
519 wsp = p;
520 p += strspn(p, " \t\n");
521 }
522
523 myfree(my_str);
524
525 ret = unistr_to_utf8(us);
526 unistr_free(us);
527
528 return ret;
529 }
530
531
/* Return 1 if ch may appear unencoded in a header: an ASCII letter, a
 * digit, or one of '.', ',', ':', ';', '-' and space. Return 0 otherwise.
 */
static int is_ok_in_header(char ch)
{
	switch (ch) {
	case '.':
	case ',':
	case ':':
	case ';':
	case '-':
	case ' ':
		return 1;
	default:
		break;
	}

	return ((ch >= 'a') && (ch <= 'z'))
		|| ((ch >= 'A') && (ch <= 'Z'))
		|| ((ch >= '0') && (ch <= '9'));
}
545
546
547 /* IN: " hyggem\xC3\xB8de torsdag "
548 * OUT: "=?utf-8?Q?hyggem=C3=B8de_torsdag?="
549 */
unistr_utf8_to_header(const char * str)550 char *unistr_utf8_to_header(const char *str)
551 {
552 unistr *us;
553 char *my_str;
554 char *ret;
555 char *wsp = NULL;
556 char *p;
557 int clean;
558 char buf[4];
559
560 my_str = mystrdup(str);
561
562 /* trim whitespace and see if the header is clean */
563
564 ret = my_str + strspn(my_str, " \t\n");
565
566 clean = 1;
567 for (p=ret; *p; p++) {
568 if (*p == ' ' || *p == '\t' || *p == '\n') {
569 if (wsp == NULL)
570 wsp = p;
571 } else {
572 wsp = NULL;
573 }
574 if (clean && !is_ok_in_header(*p))
575 clean = 0;
576 }
577 if (wsp != NULL)
578 *wsp = '\0';
579
580 if (clean) {
581 ret = mystrdup(ret);
582 myfree(my_str);
583 return ret;
584 }
585
586 us = unistr_new();
587
588 unistr_append_usascii(us, "=?utf-8?q?", 10);
589 for (p=ret; *p; p++) {
590 if (*p == 0x20) {
591 unistr_append_char(us, '_');
592 } else if (is_ok_in_header(*p)) {
593 unistr_append_char(us, *p);
594 } else {
595 snprintf(buf, sizeof(buf), "=%02X", (unsigned char)*p);
596 unistr_append_usascii(us, buf, 3);
597 }
598 }
599 unistr_append_usascii(us, "?=", 2);
600
601 ret = unistr_to_utf8(us);
602 unistr_free(us);
603 myfree(my_str);
604
605 return ret;
606 }
607
608
609 /* IN: "hyggem\\u00F8de torsdag"
610 * OUT: "hyggem\xC3\xB8de torsdag"
611 */
unistr_escaped_to_utf8(const char * str)612 char *unistr_escaped_to_utf8(const char *str)
613 {
614 unistr_char ch;
615 unistr *us;
616 char *ret;
617 char u[5];
618 int len;
619 int skip = 0;
620
621 us = unistr_new();
622
623 while (*str) {
624 if (*str == '\\') {
625 str++;
626 if (*str == 'u' && !skip) {
627 str++;
628 if (!isxdigit(str[0]) ||
629 !isxdigit(str[1]) ||
630 !isxdigit(str[2]) ||
631 !isxdigit(str[3])) {
632 unistr_append_char(us, '?');
633 continue;
634 }
635 u[0] = *str++;
636 u[1] = *str++;
637 u[2] = *str++;
638 u[3] = *str++;
639 u[4] = '\0';
640 ch = strtol(u, NULL, 16);
641 unistr_append_char(us, ch);
642 continue;
643 } else {
644 unistr_append_char(us, '\\');
645 /* Avoid processing the second backslash of a
646 * double-backslash; but if this was a such a
647 * one, go back to normal */
648 skip = !skip;
649 continue;
650 }
651 } else {
652 u[0] = *str;
653 len = 1;
654 str++;
655 while (*str && (unsigned char)u[0] > 0x7F) {
656 u[0] = *str;
657 len++;
658 str++;
659 }
660 unistr_append_utf8(us, str - len, len);
661 }
662 }
663
664 ret = unistr_to_utf8(us);
665 unistr_free(us);
666
667 return ret;
668 }
669