1 /*:ts=8*/
2 /*****************************************************************************
3 * FIDOGATE --- Gateway UNIX Mail/News <-> FTN NetMail/EchoMail
4 *
5 *
6 * NEW charset.c code using charset.bin mapping file
7 *
8 *****************************************************************************
9 * Copyright (C) 1990-2001
10 * _____ _____
11 * | |___ | Martin Junius FIDO: 2:2452/110
12 * | | | | | | Radiumstr. 18 Internet: mj@fido.de
13 * |_|_|_|@home| D-51069 Koeln, Germany
14 *
15 * This file is part of FIDOGATE.
16 *
17 * FIDOGATE is free software; you can redistribute it and/or modify it
18 * under the terms of the GNU General Public License as published by the
19 * Free Software Foundation; either version 2, or (at your option) any
20 * later version.
21 *
22 * FIDOGATE is distributed in the hope that it will be useful, but
23 * WITHOUT ANY WARRANTY; without even the implied warranty of
24 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
25 * General Public License for more details.
26 *
27 * You should have received a copy of the GNU General Public License
28 * along with FIDOGATE; see the file COPYING. If not, write to the Free
29 * Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
30 *****************************************************************************/
31
32 #include "fidogate.h"
33
34 /* New code reuses original structures */
35
36 /* fidogate.conf aliases to canonical fsc charset name */
37 static CharsetAlias *fsc_aliases;
38 static CharsetAlias *charset_name_map;
39
40 static char *orig_in;
41 static char *orig_out;
42
43 /*
44 * Translate string
45 */
xlat_s(char * s1,char * s2)46 char *xlat_s(char *s1, char *s2)
47 {
48 char *dst = NULL;
49 size_t src_len;
50 size_t dst_len;
51 int rc;
52
53 if (s2 != NULL)
54 free(s2);
55
56 if (s1 == NULL)
57 return NULL;
58
59 src_len = strlen(s1);
60 src_len++; /* recode also final \0 */
61
62 rc = charset_recode_buf(&dst, &dst_len, s1, src_len, orig_in, orig_out);
63 if (rc == OK)
64 return dst;
65
66 free(dst);
67 return NULL;
68 }
69
70 /*
71 * Set character mapping table
72 */
charset_set_in_out(char * in,char * out)73 void charset_set_in_out(char *in, char *out)
74 {
75 if (!in || !out)
76 return;
77
78 debug(5, "charset: in=%s out=%s", in, out);
79
80 orig_in = in;
81 orig_out = out;
82 }
83
charset_fsc_aliases_add(char ** list)84 static void charset_fsc_aliases_add(char **list)
85 {
86 char *name = *list++;
87 char *alias;
88 CharsetAlias *p;
89
90 for (alias = *list; alias; alias = *++list) {
91 p = xmalloc(sizeof(*p));
92 snprintf(p->name, sizeof(p->name), "%s", name);
93 snprintf(p->alias, sizeof(p->alias), "%s", alias);
94
95 p->next = fsc_aliases;
96 fsc_aliases = p;
97
98 debug(15, "Adding FSC alias %s -> %s\n", alias, name);
99 }
100 }
101
charset_fsc_canonize(char * chrs)102 char *charset_fsc_canonize(char *chrs)
103 {
104 CharsetAlias *p;
105
106 for (p = fsc_aliases; p; p = p->next) {
107 if (streq(chrs, p->alias))
108 return p->name;
109 }
110 return chrs;
111 }
112
charset_fsc_aliases_init(void)113 static void charset_fsc_aliases_init(void)
114 {
115 int first = TRUE, next = FALSE;
116 char *p;
117 char **list = NULL;
118
119 for (p = cf_get_string("CharsetAliasesFSC", first);
120 p; p = cf_get_string("CharsetAliasesFSC", next)) {
121
122 list_init(&list, p);
123
124 if (*list == NULL) {
125 fglog("ERROR: CharsetAliasesFSC requires <name> <list>\n");
126 continue;
127 }
128
129 charset_fsc_aliases_add(list);
130 }
131
132 list_free(list);
133 }
134
charset_name_rfc2ftn(char * chrs)135 char *charset_name_rfc2ftn(char *chrs)
136 {
137 CharsetAlias *p;
138
139 for (p = charset_name_map; p; p = p->next) {
140 if (streq(chrs, p->name))
141 return p->alias;
142 }
143 return chrs;
144 }
145
charset_name_map_add(char ** list)146 static void charset_name_map_add(char **list)
147 {
148 char *rfc = list[0];
149 char *fsc = list[1];
150 CharsetAlias *p;
151
152 p = xmalloc(sizeof(*p));
153 snprintf(p->name, sizeof(p->name), "%s", rfc);
154 snprintf(p->alias, sizeof(p->alias), "%s", fsc);
155
156 p->next = charset_name_map;
157 charset_name_map = p;
158
159 debug(15, "Adding charset name map %s -> %s\n", rfc, fsc);
160 }
161
charset_name_map_init(void)162 static void charset_name_map_init(void)
163 {
164 int first = TRUE, next = FALSE;
165 char *p;
166 char **list = NULL;
167
168 for (p = cf_get_string("CharsetNameMap", first);
169 p; p = cf_get_string("CharsetNameMap", next)) {
170
171 list_init(&list, p);
172
173 if ((list[0] == NULL) || (list[1] == 0)) {
174 fglog("ERROR: Syntax CharsetNameMap <RFC name> <FSC name>\n");
175 continue;
176 }
177
178 charset_name_map_add(list);
179 }
180
181 list_free(list);
182 }
183
/*
 * Initialize charset mapping
 *
 * Loads the FSC alias list first, then the RFC <-> FSC name map, both
 * from the configuration.
 */
void charset_init(void)
{
    charset_fsc_aliases_init();     /* CharsetAliasesFSC entries */
    charset_name_map_init();        /* CharsetNameMap entries */
}
192
/* Simple string -> string association */
struct str_to_str {
    char *key;                  /* name to look up */
    char *val;                  /* name it translates to */
};

/*
 * Charset names looked up by charset_level1_to_iconv() (used for
 * FSC-0054 level 1 kludges) and the charset name each one maps to.
 *
 * NOTE(review): "DUTCH" maps to ISO646-DK and "SWISS" to ISO646-CN;
 * DK/CN look like the Danish/Chinese variants -- confirm these are the
 * intended targets.
 */
static struct str_to_str level1_map[] = {
    { .key = "ASCII",    .val = "ASCII" },
    { .key = "DUTCH",    .val = "ISO646-DK" },
    { .key = "FINNISH",  .val = "ISO646-FI" },
    { .key = "FRENCH",   .val = "ISO646-FR" },
    { .key = "CANADIAN", .val = "ISO646-CA" },
    { .key = "GERMAN",   .val = "ISO646-DE" },
    { .key = "ITALIAN",  .val = "ISO646-IT" },
    { .key = "NORWEIG",  .val = "ISO646-NO" },
    { .key = "PORTU",    .val = "ISO646-PT" },
    { .key = "SPANISH",  .val = "ISO646-ES" },
    { .key = "SWEDISH",  .val = "ISO646-SE" },
    { .key = "SWISS",    .val = "ISO646-CN" },
    { .key = "UK",       .val = "ISO646-GB" },
};
213
charset_level1_to_iconv(char * charset)214 static char *charset_level1_to_iconv(char *charset)
215 {
216 int i;
217
218 for (i = 0; i < sizeof(level1_map) / sizeof(level1_map[0]); i++)
219 if (stricmp(level1_map[i].key, charset) == 0)
220 return level1_map[i].val;
221 return NULL;
222 }
223
224 /*
225 * Get charset name from ^ACHRS kludge line
226 */
charset_chrs_name(char * s)227 char *charset_chrs_name(char *s)
228 {
229 static char name[MAXPATH];
230 char *p;
231 int level;
232
233 while (is_space(*s))
234 s++;
235 debug(5, "FSC-0054 ^ACHRS/CHARSET: %s", s);
236
237 BUF_COPY(name, s);
238 p = strtok(name, " \t");
239 if (!p)
240 return NULL;
241
242 p = strtok(NULL, " \t");
243 if (!p)
244 /* In this case it's an FSC-0050 kludge without the class code.
245 * Treat it like FSC-0054 level 2. */
246 level = 2;
247 else
248 level = atoi(p);
249
250 switch (level) {
251 case 1:
252 p = charset_level1_to_iconv(name);
253 debug(5, "FSC-0054 level 1 charset=%s (level 2: %s)", name, p);
254 return p;
255
256 default:
257 debug(5, "FSC-0054 level %d charset=%s", level, name);
258 return name;
259 }
260
261 return NULL;
262 }
263
charset_fsc_aliases_free(void)264 static void charset_fsc_aliases_free(void)
265 {
266 CharsetAlias *pa, *pa1;
267
268 for (pa = fsc_aliases; pa; pa = pa1) {
269 pa1 = pa->next;
270 free(pa);
271 }
272 }
273
charset_name_map_free(void)274 static void charset_name_map_free(void)
275 {
276 CharsetAlias *pa, *pa1;
277
278 for (pa = charset_name_map; pa; pa = pa1) {
279 pa1 = pa->next;
280 free(pa);
281 }
282 }
283
/*
 * Release the lists built by charset_init(): the FSC alias list first,
 * then the charset name map.
 */
void charset_free(void)
{
    charset_fsc_aliases_free();     /* CharsetAliasesFSC nodes */
    charset_name_map_free();        /* CharsetNameMap nodes */
}
289
_charset_recode_iconv(char ** res,size_t * res_len,char * src,size_t src_len,char * from,char * _to)290 static int _charset_recode_iconv(char **res, size_t *res_len,
291 char *src, size_t src_len,
292 char *from, char *_to)
293 {
294 int rc;
295 iconv_t desc;
296 size_t size;
297 char *to;
298 char *dst;
299 size_t dst_size;
300 size_t inc = src_len;
301 char *cur;
302 size_t cur_size;
303 size_t dst_len; /* successfuly converted to dst */
304
305 debug(6, "Using ICONV");
306
307 size = strlen(_to) + sizeof("//TRANSLIT");
308 to = xmalloc(size);
309 sprintf(to, "%s//TRANSLIT", _to);
310
311 desc = iconv_open(to, from);
312 if (desc == (iconv_t) - 1) {
313 debug(6, "WARNING: iconv cannot convert from %s to %s", from, to);
314 return ERROR;
315 }
316
317 dst_size = src_len;
318 dst = xmalloc(dst_size);
319 cur = dst;
320 cur_size = dst_size;
321
322 while (src_len > 0) {
323 rc = iconv(desc, &src, &src_len, &cur, &cur_size);
324 if (rc != -1)
325 break;
326
327 if (errno != E2BIG) {
328 src++;
329 src_len--;
330 *cur++ = '?';
331 cur_size--;
332 continue;
333 }
334
335 /* after iconv call cur_size contains size of unused space */
336 dst_len = dst_size - cur_size;
337 dst = xrealloc(dst, dst_size + inc);
338 dst_size += inc;
339 cur = dst + dst_len;
340 /* unused + new */
341 cur_size += inc;
342 }
343
344 /*
345 * write sequence to get to the initial state if needed
346 * https://www.gnu.org/software/libc/manual/html_node/iconv-Examples.html
347 */
348 iconv(desc, NULL, NULL, &cur, &cur_size);
349 dst_len = dst_size - cur_size;
350 iconv_close(desc);
351 free(to);
352
353 *res = dst;
354 *res_len = dst_len;
355
356 return OK;
357 }
358
charset_recode_iconv(char ** dst,size_t * dstlen,char * src,size_t srclen,char * from,char * to)359 static int charset_recode_iconv(char **dst, size_t *dstlen,
360 char *src, size_t srclen, char *from, char *to)
361 {
362 int rc;
363 char *p;
364 char *buf;
365 size_t len;
366 size_t off;
367
368 rc = _charset_recode_iconv(dst, dstlen, src, srclen, from, to);
369 if (rc == OK)
370 return OK;
371
372 /* Heuristic, LATIN-1 -> LATIN1 */
373 p = strchr(from, '-');
374 if (p == NULL)
375 return ERROR;
376
377 off = p - from;
378 len = strlen(from);
379
380 buf = xmalloc(len + 1);
381 memcpy(buf, from, off);
382 memcpy(buf + off, p + 1, len - off - 1);
383 buf[len - 1] = '\0';
384
385 rc = _charset_recode_iconv(dst, dstlen, src, srclen, buf, to);
386 free(buf);
387
388 return rc;
389 }
390
391 /*
392 * Gets source buffer, lenght of it, allocates buffer for the result.
393 * Return dst -- allocated buffer
394 * dstlen -- number of used bytes in it
395 * The argument's order is like in str/mem functions
396 *
397 * Adjusts given length to string's length
398 */
charset_recode_buf(char ** dst,size_t * dstlen,char * src,size_t srclen,char * from,char * to)399 int charset_recode_buf(char **dst, size_t *dstlen,
400 char *src, size_t srclen, char *from, char *to)
401 {
402 if (src == NULL || dst == NULL)
403 return ERROR;
404
405 if (srclen == 0)
406 return ERROR;
407
408 debug(6, "mime charset: recoding from %s to %s", from, to);
409
410 if (strieq(from, to)) {
411 *dst = xmalloc(srclen);
412 memcpy(*dst, src, srclen);
413 *dstlen = srclen;
414 return OK;
415 }
416
417 return charset_recode_iconv(dst, dstlen, src, srclen, from, to);
418 }
419
charset_is_7bit(char * buffer,size_t len)420 int charset_is_7bit(char *buffer, size_t len)
421 {
422 int i;
423
424 if (buffer == NULL)
425 return TRUE;
426
427 for (i = 0; i < len; i++)
428 if (buffer[i] & 0x80)
429 return FALSE;
430 return TRUE;
431 }
432
/* States of the UTF-8 validator in charset_is_valid_utf8() */
enum utf8_state {
    START_SEQ = 0,              /* at the lead byte of a sequence */
    PROCESS_SEQ = 1,            /* lead byte accepted, check the rest */
    FINISH = 2,                 /* input exhausted, string is valid */
    ERR = 3,                    /* malformed sequence */
};
439
utf8_check_start(unsigned char c,size_t * n)440 static enum utf8_state utf8_check_start(unsigned char c, size_t *n)
441 {
442 size_t num;
443
444 if ((c & 0x80) == 0)
445 num = 1;
446 else if (((c & 0xc0) == 0xc0) && ((c & 0x20) == 0))
447 num = 2;
448 else if (((c & 0xe0) == 0xe0) && ((c & 0x10) == 0))
449 num = 3;
450 else if (((c & 0xf0) == 0xf0) && ((c & 0x08) == 0))
451 num = 4;
452 else
453 return ERR;
454
455 *n = num;
456 return PROCESS_SEQ;
457 }
458
/*
 * Tell whether c is a UTF-8 continuation byte (bit pattern 10xxxxxx).
 */
static bool utf8_check_rest_byte(unsigned char c)
{
    return (c & 0xc0) == 0x80;
}
463
/*
 * Check that the num bytes of s starting at index i are all UTF-8
 * continuation bytes.
 *
 * len is the number of valid bytes in s.  The bound is tested before
 * the byte is read, so s[len] is never touched.  A \0 inside the range
 * also counts as a premature end.
 *
 * Returns true when all num bytes are present and well-formed.
 */
static bool utf8_check_rest_bytes(char *s, size_t len, size_t i, size_t num)
{
    for (; num > 0; num--, i++) {
        if (i >= len || s[i] == '\0')
            return false;
        if (!utf8_check_rest_byte((unsigned char)s[i]))
            return false;
    }

    return true;
}
475
charset_is_valid_utf8(char * s,size_t len)476 bool charset_is_valid_utf8(char *s, size_t len)
477 {
478 enum utf8_state state = START_SEQ;
479 size_t i;
480 size_t num;
481 static const void *const states[] = {
482 [START_SEQ] = &&START_SEQ,
483 [PROCESS_SEQ] = &&PROCESS_SEQ,
484 [FINISH] = &&FINISH,
485 [ERR] = &&ERR,
486 };
487
488 i = 0;
489 goto START_SEQ;
490
491 START_SEQ:
492 if (s[i] == '\0' || i == len)
493 goto FINISH;
494 state = utf8_check_start(s[i], &num);
495 goto *states[state];
496
497 PROCESS_SEQ:
498 i++;
499 num--;
500 if (!utf8_check_rest_bytes(s, len, i, num))
501 goto ERR;
502 i += num;
503 goto START_SEQ;
504
505 FINISH:
506 return true;
507 ERR:
508 return false;
509 }
510