1 /*
2 * This file handles character conversions.
3 *
4 * climm Copyright (C) © 2001-2005 Rüdiger Kuhlmann
5 *
6 * climm is free software; you can redistribute it and/or modify it
7 * under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; version 2 dated June, 1991.
9 *
10 * climm is distributed in the hope that it will be useful, but WITHOUT
11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
12 * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public
13 * License for more details.
14 *
15 * In addition, as a special exception permission is granted to link the
16 * code of this release of climm with the OpenSSL project's "OpenSSL"
17 * library, and distribute the linked executables. You must obey the GNU
18 * General Public License in all respects for all of the code used other
19 * than "OpenSSL". If you modify this file, you may extend this exception
20 * to your version of the file, but you are not obligated to do so. If you
21 * do not wish to do so, delete this exception statement from your version
22 * of this file.
23 *
24 * You should have received a copy of the GNU General Public License
25 * along with this package; if not, write to the Free Software
26 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
27 * 02111-1307, USA.
28 *
29 *
30 * $Id: conv.c 2420 2007-10-10 21:46:48Z kuhlmann $
31 */
32
33 #include "climm.h"
34 #if HAVE_ERRNO_H
35 #include <errno.h>
36 #endif
37 #if HAVE_CTYPE_H
38 #include <ctype.h>
39 #endif
40 #include "conv.h"
41 #include "preferences.h"
42 #if !HAVE_WCWIDTH
43 #undef ENABLE_FALLBACK_WCHART
44 #endif
45
/* Signature shared by all converters: input text and encoding slot id in,
 * converted text out. */
typedef strc_t (iconv_func)(strc_t, UBYTE);

#if HAVE_ICONV
#include <iconv.h>
/* Converters that delegate to the system iconv(). */
static strc_t iconv_from_iconv (strc_t, UBYTE);
static strc_t iconv_to_iconv (strc_t, UBYTE);
#endif
/* Built-in converters; US-ASCII is always present, the others are
 * compiled in according to the ENABLE_FALLBACK_* switches. */
static iconv_func iconv_from_usascii, iconv_to_usascii;
#if ENABLE_FALLBACK_UTF8
static iconv_func iconv_from_utf8, iconv_to_utf8;
#endif
#if ENABLE_FALLBACK_LATIN1
static iconv_func iconv_from_latin1, iconv_to_latin1;
#endif
#if ENABLE_FALLBACK_LATIN9
static iconv_func iconv_from_latin9, iconv_to_latin9;
#endif
#if ENABLE_FALLBACK_KOI8
static iconv_func iconv_from_koi8, iconv_to_koi8;
#endif
#if ENABLE_FALLBACK_WIN1251
static iconv_func iconv_from_win1251, iconv_to_win1251;
#endif
#if ENABLE_FALLBACK_UCS2BE
static iconv_func iconv_from_ucs2be, iconv_to_ucs2be;
#endif
#if ENABLE_FALLBACK_WCHART
static iconv_func iconv_from_wchart, iconv_to_wchart;
#endif
/* One slot of the encoding table: up to four alias names (enca..encd),
 * the iconv descriptors for converting from (iof) and to (ito) the
 * encoding, and the converter functions for both directions (fof, fto). */
typedef struct { const char *enca; const char *encb; const char *encc; const char *encd;
#if HAVE_ICONV
    iconv_t iof; iconv_t ito;
#endif
    iconv_func *fof; iconv_func *fto; } enc_t;

/* Number of slots allocated in conv_encs. */
static int conv_nr = 0;
/* The encoding table itself; allocated by ConvInit(), grown by ConvEnc(). */
static enc_t *conv_encs = NULL;

/* Slot of the last encoding ConvEnc() could not set up; cleared by ConvInit(). */
UBYTE conv_error = 0;
85
86 #if HAVE_ICONV
87 static const char *Utf8Name = "UTF-8";
88 /*
89 * Check whether iconv() can handle it.
90 */
iconv_check(UBYTE enc)91 static BOOL iconv_check (UBYTE enc)
92 {
93 #ifdef ENABLE_TRANSLIT
94 conv_encs[enc].ito = iconv_open (s_sprintf ("%s//TRANSLIT", conv_encs[enc].enca), Utf8Name);
95 if (conv_encs[enc].ito == (iconv_t)-1)
96 #endif
97 conv_encs[enc].ito = iconv_open (conv_encs[enc].enca, Utf8Name);
98 conv_encs[enc].iof = iconv_open (Utf8Name, conv_encs[enc].enca);
99 if ((conv_encs[enc].ito == (iconv_t)-1 || conv_encs[enc].iof == (iconv_t)-1)
100 && conv_encs[enc].encb)
101 {
102 #ifdef ENABLE_TRANSLIT
103 conv_encs[enc].ito = iconv_open (s_sprintf ("%s//TRANSLIT", conv_encs[enc].encb), Utf8Name);
104 if (conv_encs[enc].ito == (iconv_t)-1)
105 #endif
106 conv_encs[enc].ito = iconv_open (conv_encs[enc].encb, Utf8Name);
107 conv_encs[enc].iof = iconv_open (Utf8Name, conv_encs[enc].encb);
108 }
109 if ((conv_encs[enc].ito == (iconv_t)-1 || conv_encs[enc].iof == (iconv_t)-1)
110 && conv_encs[enc].encc)
111 {
112 #ifdef ENABLE_TRANSLIT
113 conv_encs[enc].ito = iconv_open (s_sprintf ("%s//TRANSLIT", conv_encs[enc].encc), Utf8Name);
114 if (conv_encs[enc].ito == (iconv_t)-1)
115 #endif
116 conv_encs[enc].ito = iconv_open (conv_encs[enc].encc, Utf8Name);
117 conv_encs[enc].iof = iconv_open (Utf8Name, conv_encs[enc].encc);
118 }
119 if ((conv_encs[enc].ito == (iconv_t)-1 || conv_encs[enc].iof == (iconv_t)-1)
120 && conv_encs[enc].encd)
121 {
122 #ifdef ENABLE_TRANSLIT
123 conv_encs[enc].ito = iconv_open (s_sprintf ("%s//TRANSLIT", conv_encs[enc].encd), Utf8Name);
124 if (conv_encs[enc].ito == (iconv_t)-1)
125 #endif
126 conv_encs[enc].ito = iconv_open (conv_encs[enc].encd, Utf8Name);
127 conv_encs[enc].iof = iconv_open (Utf8Name, conv_encs[enc].encd);
128 }
129 if (enc == ENC_LATIN1 && conv_encs[enc].ito == (iconv_t)-1)
130 {
131 conv_encs[enc].ito = iconv_open (conv_encs[enc].encc, "utf8");
132 conv_encs[enc].iof = iconv_open ("utf8", conv_encs[enc].encc);
133 if (conv_encs[enc].ito != (iconv_t)-1 && conv_encs[enc].iof != (iconv_t)-1)
134 Utf8Name = "utf8";
135 }
136 if (conv_encs[enc].ito != (iconv_t)-1 && conv_encs[enc].iof != (iconv_t)-1)
137 {
138 conv_encs[enc].fof = &iconv_from_iconv;
139 conv_encs[enc].fto = &iconv_to_iconv;
140 return TRUE;
141 }
142 return FALSE;
143 }
144 #endif
145
146 #if HAVE_ICONV
/*
 * Bring a conversion descriptor back into its initial state by calling
 * iconv() with a NULL input buffer. All other arguments point to real
 * (empty) objects because SunOS 5.7 segfaults if anything other than the
 * input buffer is NULL.
 */
static inline void iconv_reset (iconv_t cd)
{
    char *no_out = NULL;
    size_t no_in_left = 0;
    size_t no_out_left = 0;

    iconv (cd, NULL, &no_in_left, &no_out, &no_out_left);
}
154 #endif
155
/*
 * Initialize encoding table.
 *
 * Allocates the table, fills in the alias names of the built-in encodings,
 * probes iconv() for UTF-8 (when compiled with iconv) and finally gives
 * every built-in encoding that still has no converter either its compiled
 * in fallback or, failing that, the US-ASCII converter.
 */
void ConvInit (void)
{
    conv_error = 0;
    /* NOTE(review): the calloc() result is used without a NULL check. */
    conv_encs = calloc (sizeof (enc_t), conv_nr = 15);
    conv_encs[ENC_ASCII].enca = "US-ASCII";
    conv_encs[ENC_ASCII].encb = "USASCII";
    conv_encs[ENC_ASCII].encc = "ANSI_X3.4-1968";
    conv_encs[ENC_UTF8].enca = "UTF-8";
    conv_encs[ENC_LATIN1].enca = "ISO-8859-1";
    conv_encs[ENC_LATIN1].encb = "ISO8859-1";
    /* iconv_check() uses encc of ENC_LATIN1 explicitly, hence the fixed order */
    conv_encs[ENC_LATIN1].encc = "ISO88591"; /* don't re-sort */
    conv_encs[ENC_LATIN1].encd = "LATIN1";
    conv_encs[ENC_LATIN9].enca = "ISO-8859-15";
    conv_encs[ENC_LATIN9].encb = "ISO8859-15";
    conv_encs[ENC_LATIN9].encc = "ISO885915";
    conv_encs[ENC_LATIN9].encd = "LATIN9";
    conv_encs[ENC_KOI8].enca = "KOI8-U";
    conv_encs[ENC_KOI8].encb = "KOI8-R";
    conv_encs[ENC_KOI8].encc = "KOI8";
    conv_encs[ENC_WIN1251].enca = "CP1251";
    conv_encs[ENC_WIN1251].encb = "WINDOWS-1251";
    conv_encs[ENC_WIN1251].encc = "CP-1251";
    conv_encs[ENC_UCS2BE].enca = "UCS-2BE";
    conv_encs[ENC_UCS2BE].encb = "UNICODEBIG";
    conv_encs[ENC_UCS2BE].encc = "UNICODE-2-0"; /* ICQ sucks */
    conv_encs[ENC_WIN1257].enca = "CP1257";
    conv_encs[ENC_WIN1257].encb = "WINDOWS-1257";
    conv_encs[ENC_WIN1257].encc = "CP-1257";
    conv_encs[ENC_EUC].enca = "EUC-JP";
    conv_encs[ENC_SJIS].enca = "SHIFT-JIS";
    conv_encs[ENC_SJIS].encb = "SJIS";
    conv_encs[ENC_WCHART].enca = "WCHAR_T";

#if HAVE_ICONV
    /* extra check for UTF-8 */
    /* the lookup makes ConvEnc() run iconv_check() on the UTF-8 slot */
    ConvEnc (conv_encs[ENC_UTF8].enca);
    if (conv_encs[ENC_UTF8].fof)
    {
        size_t inl = 2, outl = 10;
        char inb[10], outb[10], *outp = outb;
        ICONV_CONST char *inp = inb;
        /* two 2-byte probes: "\xfc." and "\xc0\xaf" */
        strcpy (inb, "\xfc.\xc0\xaf");
        /* first probe: if iconv() converts it without error, do not use iconv() for UTF-8 */
        if (iconv (conv_encs[ENC_UTF8].ito, &inp, &inl, &outp, &outl) != (size_t)-1)
            conv_encs[ENC_UTF8].fto = conv_encs[ENC_UTF8].fof = NULL;
        else
        {
            /* second probe; inl and outl keep whatever the first call left in them */
            inp = inb + 2;
            iconv_reset (conv_encs[ENC_UTF8].ito);
            /* NOTE(review): after a successful iconv() outp has been advanced past
             * the produced output, so *outp looks at the byte following it -
             * confirm whether outb[0] was meant. */
            if ((iconv (conv_encs[ENC_UTF8].ito, &inp, &inl, &outp, &outl) != (size_t)-1) && *outp != '/')
                conv_encs[ENC_UTF8].fto = conv_encs[ENC_UTF8].fof = NULL;
        }
    }
#endif
    /* From here on: install a converter wherever none is set yet. */
    if (!conv_encs[ENC_ASCII].fof)
    {
        conv_encs[ENC_ASCII].fof = &iconv_from_usascii;
        conv_encs[ENC_ASCII].fto = &iconv_to_usascii;
    }
    if (!conv_encs[ENC_UTF8].fof)
    {
#if ENABLE_FALLBACK_UTF8
        conv_encs[ENC_UTF8].fof = &iconv_from_utf8;
        conv_encs[ENC_UTF8].fto = &iconv_to_utf8;
#else
        conv_encs[ENC_UTF8].fof = conv_encs[ENC_ASCII].fof;
        conv_encs[ENC_UTF8].fto = conv_encs[ENC_ASCII].fto;
#endif
    }
    if (!conv_encs[ENC_LATIN1].fof)
    {
#if ENABLE_FALLBACK_LATIN1
        conv_encs[ENC_LATIN1].fof = &iconv_from_latin1;
        conv_encs[ENC_LATIN1].fto = &iconv_to_latin1;
#else
        conv_encs[ENC_LATIN1].fof = conv_encs[ENC_ASCII].fof;
        conv_encs[ENC_LATIN1].fto = conv_encs[ENC_ASCII].fto;
#endif
    }
    if (!conv_encs[ENC_LATIN9].fof)
    {
#if ENABLE_FALLBACK_LATIN9
        conv_encs[ENC_LATIN9].fof = &iconv_from_latin9;
        conv_encs[ENC_LATIN9].fto = &iconv_to_latin9;
#else
        conv_encs[ENC_LATIN9].fof = conv_encs[ENC_ASCII].fof;
        conv_encs[ENC_LATIN9].fto = conv_encs[ENC_ASCII].fto;
#endif
    }
    if (!conv_encs[ENC_KOI8].fof)
    {
#if ENABLE_FALLBACK_KOI8
        conv_encs[ENC_KOI8].fof = &iconv_from_koi8;
        conv_encs[ENC_KOI8].fto = &iconv_to_koi8;
#else
        conv_encs[ENC_KOI8].fof = conv_encs[ENC_ASCII].fof;
        conv_encs[ENC_KOI8].fto = conv_encs[ENC_ASCII].fto;
#endif
    }
    if (!conv_encs[ENC_WIN1251].fof)
    {
#if ENABLE_FALLBACK_WIN1251
        conv_encs[ENC_WIN1251].fof = &iconv_from_win1251;
        conv_encs[ENC_WIN1251].fto = &iconv_to_win1251;
#else
        conv_encs[ENC_WIN1251].fof = conv_encs[ENC_ASCII].fof;
        conv_encs[ENC_WIN1251].fto = conv_encs[ENC_ASCII].fto;
#endif
    }
    if (!conv_encs[ENC_UCS2BE].fof)
    {
#if ENABLE_FALLBACK_UCS2BE
        conv_encs[ENC_UCS2BE].fof = &iconv_from_ucs2be;
        conv_encs[ENC_UCS2BE].fto = &iconv_to_ucs2be;
#else
        conv_encs[ENC_UCS2BE].fof = conv_encs[ENC_ASCII].fof;
        conv_encs[ENC_UCS2BE].fto = conv_encs[ENC_ASCII].fto;
#endif
    }
    if (!conv_encs[ENC_WCHART].fof)
    {
#if ENABLE_FALLBACK_WCHART
        conv_encs[ENC_WCHART].fof = &iconv_from_wchart;
        conv_encs[ENC_WCHART].fto = &iconv_to_wchart;
#else
        conv_encs[ENC_WCHART].fof = conv_encs[ENC_ASCII].fof;
        conv_encs[ENC_WCHART].fto = conv_encs[ENC_ASCII].fto;
#endif
    }
}
288
289 /*
290 * Give an ID for the given encoding name.
291 */
ConvEnc(const char * enc)292 UBYTE ConvEnc (const char *enc)
293 {
294 UBYTE nr;
295
296 for (nr = 0; conv_encs[nr].enca; nr++)
297 if (!strcasecmp (conv_encs[nr].enca, enc) ||
298 (conv_encs[nr].encb && !strcasecmp (conv_encs[nr].encb, enc)) ||
299 (conv_encs[nr].encc && !strcasecmp (conv_encs[nr].encc, enc)) ||
300 (conv_encs[nr].encd && !strcasecmp (conv_encs[nr].encd, enc)))
301 {
302 #if HAVE_ICONV
303 if (!conv_encs[nr].ito || !conv_encs[nr].iof)
304 iconv_check (nr);
305 if (conv_encs[nr].ito != (iconv_t)(-1) && conv_encs[nr].iof != (iconv_t)(-1))
306 return nr;
307 return ENC_FERR | nr;
308 #endif
309 if (conv_encs[nr].fof && conv_encs[nr].fto)
310 return nr;
311 break;
312 }
313
314 if (nr & ENC_FLAGS)
315 return ENC_FERR;
316 if (nr == conv_nr - 1)
317 {
318 enc_t *newc = realloc (conv_encs, sizeof (enc_t) * (conv_nr + 8));
319 if (!newc)
320 return 0;
321 conv_nr += 8;
322 conv_encs = newc;
323 }
324 if (!conv_encs[nr].enca)
325 {
326 char *p;
327 for (conv_encs[nr].enca = p = strdup (enc); *p; p++)
328 *p = toupper (*p);
329 conv_encs[nr].encb = strdup (enc);
330 conv_encs[nr + 1].enca = NULL;
331 }
332 #if HAVE_ICONV
333 if (iconv_check (nr))
334 return nr;
335 #endif
336 conv_error = nr;
337 return ENC_FERR | nr;
338 }
339
340 /*
341 * Give the encoding name for a given ID
342 */
ConvEncName(UBYTE enc)343 const char *ConvEncName (UBYTE enc)
344 {
345 if ((enc & ~ENC_FLAGS) > conv_nr)
346 return "<auto/undefined>";
347 return conv_encs[enc & ~ENC_FLAGS].enca;
348 }
349
ConvCrush0xFE(const char * in)350 const char *ConvCrush0xFE (const char *in)
351 {
352 static str_s t;
353 char *p;
354
355 if (!in || !*in)
356 return "";
357
358 s_init (&t, in, 0);
359
360 for (p = t.txt; *p; p++)
361 if (*p == Conv0xFE)
362 *p = '*';
363 return t.txt;
364 }
365
366 /*
367 * Convert a single unicode code point to UTF-8
368 */
ConvUTF8(UDWORD ucs)369 const char *ConvUTF8 (UDWORD ucs)
370 {
371 static char b[5];
372
373 if (!(ucs & 0xffffff80))
374 {
375 b[0] = ucs;
376 b[1] = '\0';
377 }
378 else if (!(ucs & 0xfffff800))
379 {
380 b[0] = 0xc0 | (ucs >> 6);
381 b[1] = 0x80 | (ucs & 0x3f);
382 b[2] = '\0';
383 }
384 else if (!(ucs & 0xffff0000))
385 {
386 b[0] = 0xe0 | ( ucs >> 12);
387 b[1] = 0x80 | ((ucs & 0xfc0) >> 6);
388 b[2] = 0x80 | (ucs & 0x3f);
389 b[3] = '\0';
390 }
391 else if (!(ucs & 0xffe00000))
392 {
393 b[0] = 0xf0 | ( ucs >> 18);
394 b[1] = 0x80 | ((ucs & 0x3f000) >> 12);
395 b[2] = 0x80 | ((ucs & 0xfc0) >> 6);
396 b[3] = 0x80 | (ucs & 0x3f);
397 b[4] = '\0';
398 }
399 else
400 {
401 b[0] = CHAR_BROKEN;
402 b[1] = '\0';
403 }
404 return b;
405 }
406
ConvGetUTF8(strc_t in,int * off)407 UDWORD ConvGetUTF8 (strc_t in, int *off)
408 {
409 UDWORD ucs = 0;
410 int i, continuations = 1;
411 UBYTE c = in->txt[(*off)++];
412
413 if (~c & 0x80)
414 return c;
415
416 if (~c & 0x40)
417 return CHAR_BROKEN;
418
419 while (c & 0x20)
420 {
421 continuations++;
422 c <<= 1;
423 }
424
425 c &= 0x3f;
426 c >>= continuations - 1;
427
428 for (i = 0, ucs = c; i < continuations; i++)
429 {
430 if (((c = in->txt[*off + i]) & 0xc0) != 0x80)
431 return c ? CHAR_BROKEN : CHAR_INCOMPLETE;
432
433 c &= 0x3f;
434 ucs <<= 6;
435 ucs |= c;
436 }
437 *off += continuations;
438 return ucs;
439 }
440
/*
 * Convert text from the given encoding by calling the slot's "from"
 * converter. Flag bits in enc are ignored; slots outside the table or
 * without a converter are handled by the US-ASCII converter.
 */
strc_t ConvFrom (strc_t text, UBYTE enc)
{
    UBYTE slot = enc & ~ENC_FLAGS;

#if HAVE_ICONV
    /* descriptor still zero: iconv() has not been probed for this slot yet */
    if (slot < conv_nr && !conv_encs[slot].iof)
        iconv_check (slot);
#endif
    if (slot >= conv_nr || !conv_encs[slot].fof)
        slot = ENC_ASCII;
    return conv_encs[slot].fof (text, slot);
}
452
/*
 * Like ConvFrom(), but the text is cut at every 0xfe byte: each piece is
 * converted on its own and the pieces are joined again with 0xfe bytes.
 * The result is kept in a static buffer.
 */
strc_t ConvFromSplit (strc_t text, UBYTE enc)
{
    static str_s joined;
    str_s part;
    const char *sep;
    size_t rest = text->len;

    s_init (&joined, "", 100);
    part.txt = (char *)text->txt;
    part.max = 0;
    for (;;)
    {
        sep = memchr (part.txt, '\xfe', rest);
        if (!sep)
            break;
        part.len = sep - part.txt;
        s_cat (&joined, ConvFrom (&part, enc)->txt);
        s_catc (&joined, '\xfe');
        /* continue behind the separator */
        rest -= part.len + 1;
        part.txt += part.len + 1;
    }
    /* whatever follows the last separator */
    part.len = rest;
    s_cat (&joined, ConvFrom (&part, enc)->txt);
    return &joined;
}
475
ConvToSplit(const char * text,UBYTE enc)476 strc_t ConvToSplit (const char *text, UBYTE enc)
477 {
478 static str_s str;
479 const char *p;
480 size_t tlen = strlen (text);
481 size_t plen;
482
483 s_init (&str, "", 100);
484 while ((p = memchr (text, '\xfe', tlen)))
485 {
486 plen = p - text;
487 s_cat (&str, ConvToLen (text, enc, plen)->txt);
488 s_catc (&str, '\xfe');
489 text += plen + 1;
490 tlen -= plen + 1;
491 }
492 s_cat (&str, ConvToLen (text, enc, tlen)->txt);
493 return &str;
494 }
495
ConvToLen(const char * ctext,UBYTE enc,size_t len)496 strc_t ConvToLen (const char *ctext, UBYTE enc, size_t len)
497 {
498 str_s text;
499 enc &= ~ENC_FLAGS;
500 #if HAVE_ICONV
501 if ((enc < conv_nr) && !conv_encs[enc].ito)
502 iconv_check (enc);
503 #endif
504 text.txt = (char *)ctext;
505 text.len = len;
506 text.max = 0;
507 if ((enc >= conv_nr) || (!conv_encs[enc].fto))
508 enc = ENC_ASCII;
509 return conv_encs[enc].fto (&text, enc);
510 }
511
ConvTo(const char * ctext,UBYTE enc)512 strc_t ConvTo (const char *ctext, UBYTE enc)
513 {
514 return ConvToLen (ctext, enc, ctext ? strlen (ctext) : 0);
515 }
516
/*
 * Convert a message body according to its MIME type string.
 *
 * The encoding is taken from a "charset=" parameter in mime (optionally
 * quoted, ended by ';'); without one, or if the charset is not usable,
 * US-ASCII is assumed. Text announced in another charset that nevertheless
 * is valid UTF-8 is treated as UTF-8.
 *
 * For the HTML-like types a few tags are stripped resp. turned into line
 * breaks, text/plain is left alone, and for any other type the MIME string
 * is inserted in front of the text. The result is kept in a static buffer.
 */
strc_t ConvFromMime (strc_t mime, strc_t text)
{
    static str_s out;
    strc_t o;
    UBYTE enc = ENC_ASCII;
    const char *cs = strstr (mime->txt, "charset=");
    /* work on a private copy, as the name gets cut in place;
     * if the copy cannot be allocated the charset is simply ignored */
    char *copy = cs ? strdup (cs + 8) : NULL;

    if (copy)
    {
        char *name = copy, *end;

        if (*name == '"')
        {
            name++;
            if ((end = strchr (name, '"')))
                *end = 0;
        }
        if ((end = strchr (name, ';')))
            *end = 0;
        enc = ConvEnc (name);
        if (!enc || ENC_FERR & enc)
            enc = ENC_ASCII;
        free (copy);
        if (enc != ENC_ASCII && enc != ENC_UTF8 && ConvIsUTF8 (text->txt) && text->len == strlen (text->txt))
            enc = ENC_UTF8; /* ICQ 5.10, Build 3000 send l1 even though it _is_ utf8 */
    }
    o = ConvFrom (text, enc);
    s_init (&out, o->txt, 0);

    if (   (!strncmp (mime->txt, "text/x-aolrtf", 13) && (!mime->txt[13] || mime->txt[13] == ';'))
        || (!strncmp (mime->txt, "text/aolrtf", 11) && (!mime->txt[11] || mime->txt[11] == ';'))
        || (!strncmp (mime->txt, "text/html", 9) && (!mime->txt[9] || mime->txt[9] == ';')) )
    {
        /* more or less html */
        s_strrepl (&out, "<html>", "");
        s_strrepl (&out, "</html>", "");
        s_strrepl (&out, "<body>", "");
        s_strrepl (&out, "</body>", "");
        s_strrepl (&out, "<br/>", "\n");
        s_strrepl (&out, "<br>", "\n");
    }
    else if (!strncmp (mime->txt, "text/plain", 10) && (!mime->txt[10] || mime->txt[10] == ';'))
    {
        /* nothing to do */
    }
    else
    {
        /* unknown */
        s_insn (&out, 0, mime->txt, mime->len);
    }
    return &out;
}
568
569 /*
570 * Transliterates manually a string if it doesn't fit into the local
571 * encoding
572 */
ConvTranslit(const char * orig,const char * trans)573 const char *ConvTranslit (const char *orig, const char *trans)
574 {
575 UBYTE enc = prG->enc_loc;
576
577 if (strcmp (orig, ConvFrom (ConvTo (orig, enc), enc)->txt))
578 return trans;
579 return orig;
580 }
581
ConvFits(const char * in,UBYTE enc)582 BOOL ConvFits (const char *in, UBYTE enc)
583 {
584 char *inn, *p;
585 int i;
586
587 inn = strdup (in);
588 if (!inn)
589 return 0;
590 for (p = inn; *p; p++)
591 if (*p == Conv0xFE || *p == CHAR_NOT_AVAILABLE || *p == CHAR_INCOMPLETE || *p == CHAR_BROKEN)
592 *p = ' ';
593 i = strpbrk (ConvFrom (ConvTo (inn, enc), enc)->txt, "?*_") ? 0 : 1;
594 free (inn);
595 return i;
596 }
597
598 #if HAVE_ICONV
/*
 * Convert text from encoding slot enc using the slot's iof descriptor.
 * The result is built in a static buffer that is reused by the next call.
 * Input that iconv() rejects is replaced byte by byte with
 * CHAR_NOT_AVAILABLE; an incomplete sequence ends the conversion with a
 * trailing CHAR_INCOMPLETE.
 */
static strc_t iconv_from_iconv (strc_t text, UBYTE enc)
{
    static str_s str;

    size_t inleft, outleft;
    char *out;
    ICONV_CONST char *in;

    s_init (&str, "", 100);
    out = str.txt;
    /* two bytes of the buffer are kept back from iconv() */
    outleft = str.max - 2;
    in = (ICONV_CONST char *) text->txt;
    inleft = text->len;

    iconv_reset (conv_encs[enc].iof);
    /* each pass handles one reason for iconv() stopping early */
    while (iconv (conv_encs[enc].iof, &in, &inleft, &out, &outleft) == (size_t)(-1))
    {
        /* save errno before any other call can change it */
        UDWORD rc = errno;
        /* iconv() wrote behind the string's back; record how far it got */
        str.len = out - str.txt;

        /* output space (nearly) used up: presumably s_blow() enlarges the buffer */
        if (outleft < 10 || rc == E2BIG)
            s_blow (&str, 50 + inleft);
        else if (rc == EINVAL)
        {
            /* input ends inside a multibyte sequence */
            s_catc (&str, CHAR_INCOMPLETE);
            str.txt[str.len] = '\0';
            return &str;
        }
        else
        {
            /* unconvertible input: emit a marker and skip one input byte */
            s_catc (&str, CHAR_NOT_AVAILABLE);
            in++;
            inleft--;
        }
        /* the buffer may have changed; recompute the output window */
        out = str.txt + str.len;
        outleft = str.max - str.len - 2;
        iconv_reset (conv_encs[enc].iof);
    }
    *out = '\0';
    str.len = out - str.txt;
    return &str;
}
641
/*
 * Convert text to encoding slot enc using the slot's ito descriptor.
 * The result is built in a static buffer that is reused by the next call.
 * Input that iconv() rejects is replaced byte by byte with the converted
 * form of CHAR_NOT_AVAILABLE; an incomplete sequence ends the conversion
 * with the converted form of CHAR_INCOMPLETE.
 */
static strc_t iconv_to_iconv (strc_t text, UBYTE enc)
{
    static str_s str;

    size_t inleft, outleft;
    char *out;
    ICONV_CONST char *in;

    s_init (&str, "", 100 + text->len);
    out = str.txt;
    /* two bytes of the buffer are kept back from iconv() */
    outleft = str.max - 2;
    in = (ICONV_CONST char *) text->txt;
    inleft = text->len;

    iconv_reset (conv_encs[enc].ito);
    /* each pass handles one reason for iconv() stopping early */
    while (iconv (conv_encs[enc].ito, &in, &inleft, &out, &outleft) == (size_t)(-1))
    {
        /* save errno before any other call can change it */
        UDWORD rc = errno;
        /* iconv() wrote behind the string's back; record how far it got */
        str.len = out - str.txt;

        /* output space (nearly) used up: presumably s_blow() enlarges the buffer */
        if (outleft < 10 || rc == E2BIG)
            s_blow (&str, 50 + inleft);
        else if (rc == EINVAL)
        {
            /* input ends inside a multibyte sequence: convert the marker
             * itself; these locals deliberately shadow the outer in/inleft */
            char inc = CHAR_INCOMPLETE;
            ICONV_CONST char *in = &inc;
            size_t inleft = 1;
            iconv_reset (conv_encs[enc].ito);
            iconv (conv_encs[enc].ito, &in, &inleft, &out, &outleft);
            str.len = out - str.txt;
            str.txt[str.len] = '\0';
            return &str;
        }
        else
        {
            /* unconvertible input: convert the marker instead and skip one
             * input byte. NOTE(review): the result of this inner iconv() is
             * not checked, so the marker is lost if it does not fit. */
            char inc = CHAR_NOT_AVAILABLE;
            ICONV_CONST char *inn = &inc;
            size_t innleft = 1;
            iconv_reset (conv_encs[enc].ito);
            iconv (conv_encs[enc].ito, &inn, &innleft, &out, &outleft);
            in++;
            inleft--;
            iconv_reset (conv_encs[enc].ito);
            /* out/outleft are already up to date, the buffer did not change */
            continue;
        }
        /* the buffer may have changed; recompute the output window */
        out = str.txt + str.len;
        outleft = str.max - str.len - 2;
        iconv_reset (conv_encs[enc].ito);
    }
    *out = '\0';
    str.len = out - str.txt;
    return &str;
}
695 #endif
696
iconv_from_usascii(strc_t in,UBYTE enc)697 static strc_t iconv_from_usascii (strc_t in, UBYTE enc)
698 {
699 static str_s str = { NULL, 0, 0 };
700 int off;
701 char c;
702
703 s_init (&str, "", in->len);
704 for (off = 0; off < in->len; off++)
705 s_catc (&str, (c = in->txt[off]) & 0x80 ? CHAR_BROKEN : c);
706 return &str;
707 }
708
iconv_to_usascii(strc_t in,UBYTE enc)709 static strc_t iconv_to_usascii (strc_t in, UBYTE enc)
710 {
711 static str_s str = { NULL, 0, 0 };
712 UDWORD ucs;
713 int off;
714
715 s_init (&str, "", in->len);
716 for (off = 0; off < in->len; )
717 {
718 ucs = ConvGetUTF8 (in, &off);
719 s_catc (&str, ucs >= 0x80 ? CHAR_NOT_AVAILABLE : ucs);
720 }
721 return &str;
722 }
723
724 #if ENABLE_FALLBACK_UCS2BE || ENABLE_FALLBACK_WIN1251 || ENABLE_FALLBACK_KOI8 \
725 || ENABLE_FALLBACK_LATIN9 || ENABLE_FALLBACK_LATIN1 || ENABLE_FALLBACK_UTF8 || ENABLE_FALLBACK_WCHART
726
727 #if ENABLE_FALLBACK_UTF8
/*
 * Copy UTF-8 text into out by decoding each sequence with ConvGetUTF8()
 * and encoding the resulting value again with ConvUTF8(). Returns out.
 */
strc_t iconv_utf8_buf (str_t out, strc_t in, UBYTE enc)
{
    int pos = 0;

    s_init (out, "", in->len);
    while (pos < in->len)
    {
        /* ConvGetUTF8() advances pos itself */
        UDWORD cp = ConvGetUTF8 (in, &pos);

        s_cat (out, ConvUTF8 (cp));
    }
    return out;
}
741
/*
 * Built-in "to UTF-8" converter: runs the text through iconv_utf8_buf()
 * into this function's own static buffer.
 */
strc_t iconv_to_utf8 (strc_t in, UBYTE enc)
{
    static str_s buf = { NULL, 0, 0 };
    strc_t res = iconv_utf8_buf (&buf, in, enc);

    return res;
}
747
/*
 * Built-in "from UTF-8" converter: runs the text through iconv_utf8_buf()
 * into this function's own static buffer.
 */
strc_t iconv_from_utf8 (strc_t in, UBYTE enc)
{
    static str_s buf = { NULL, 0, 0 };
    strc_t res = iconv_utf8_buf (&buf, in, enc);

    return res;
}
753 #endif
754
755 #if ENABLE_FALLBACK_LATIN1
iconv_from_latin1(strc_t in,UBYTE enc)756 static strc_t iconv_from_latin1 (strc_t in, UBYTE enc)
757 {
758 static str_s str = { NULL, 0, 0 };
759 int off;
760
761 s_init (&str, "", in->len);
762 for (off = 0; off < in->len; off++)
763 s_cat (&str, ConvUTF8 ((UBYTE)in->txt[off]));
764 return &str;
765 }
766
iconv_to_latin1(strc_t in,UBYTE enc)767 static strc_t iconv_to_latin1 (strc_t in, UBYTE enc)
768 {
769 static str_s str = { NULL, 0, 0 };
770 UDWORD ucs;
771 int off;
772
773 s_init (&str, "", in->len);
774 for (off = 0; off < in->len; )
775 {
776 ucs = ConvGetUTF8 (in, &off);
777 s_catc (&str, ucs & 0xffffff00 ? CHAR_NOT_AVAILABLE : ucs);
778 }
779 return &str;
780 }
781 #endif
782
783 #if ENABLE_FALLBACK_LATIN9
iconv_from_latin9(strc_t in,UBYTE enc)784 static strc_t iconv_from_latin9 (strc_t in, UBYTE enc)
785 {
786 static str_s str = { NULL, 0, 0 };
787 UDWORD ucs;
788 UBYTE c;
789 int off;
790
791 s_init (&str, "", in->len);
792 for (off = 0; off < in->len; off++)
793 {
794 c = in->txt[off];
795 switch (c)
796 {
797 case 0xa4: ucs = 0x20ac; /* EURO */
798 case 0xa6: ucs = 0x0160; /* SCARON */
799 case 0xa8: ucs = 0x0161; /* SMALL SCARON */
800 case 0xb4: ucs = 0x017d; /* ZCARON */
801 case 0xb8: ucs = 0x017e; /* SMALL ZCARON */
802 case 0xbc: ucs = 0x0152; /* OE */
803 case 0xbd: ucs = 0x0153; /* SMALL OE */
804 case 0xbe: ucs = 0x0178; /* Y DIAERESIS */
805 default: ucs = c;
806 }
807 s_cat (&str, ConvUTF8 (ucs));
808 }
809 return &str;
810 }
811
iconv_to_latin9(strc_t in,UBYTE enc)812 static strc_t iconv_to_latin9 (strc_t in, UBYTE enc)
813 {
814 static str_s str = { NULL, 0, 0 };
815 UDWORD ucs;
816 int off;
817
818 s_init (&str, "", in->len);
819 for (off = 0; off < in->len; )
820 {
821 ucs = ConvGetUTF8 (in, &off);
822 if (!(ucs & 0xffffff00))
823 {
824 switch (ucs)
825 {
826 case 0xa4: case 0xa6: case 0xa8: case 0xb4:
827 case 0xb8: case 0xbc: case 0xbd: case 0xbe:
828 ucs = CHAR_NOT_AVAILABLE;
829 }
830 s_catc (&str, ucs);
831 }
832 else
833 {
834 switch (ucs)
835 {
836 case 0x20ac: s_catc (&str, '\xa4'); /* EURO */
837 case 0x0160: s_catc (&str, '\xa6'); /* SCARON */
838 case 0x0161: s_catc (&str, '\xa8'); /* SMALL SCARON */
839 case 0x017d: s_catc (&str, '\xb4'); /* ZCARON */
840 case 0x017e: s_catc (&str, '\xb8'); /* SMALL ZCARON */
841 case 0x0152: s_catc (&str, '\xbc'); /* OE */
842 case 0x0153: s_catc (&str, '\xbd'); /* SMALL OE */
843 case 0x0178: s_catc (&str, '\xbe'); /* Y DIAERESIS */
844 default: s_catc (&str, CHAR_NOT_AVAILABLE);
845 }
846 }
847 }
848 return &str;
849 }
850 #endif
851
852 #if ENABLE_FALLBACK_KOI8
853 const UDWORD koi8u_utf8[] = { /* 7bit are us-ascii */
854 0x2500, 0x2502, 0x250c, 0x2510, 0x2514, 0x2518, 0x251c, 0x2524,
855 0x252c, 0x2534, 0x253c, 0x2580, 0x2584, 0x2588, 0x258c, 0x2590,
856 0x2591, 0x2592, 0x2593, 0x2320, 0x25a0, 0x2022, 0x221a, 0x2248,
857 0x2264, 0x2265, 0x00a0, 0x2321, 0x00b0, 0x00b2, 0x00b7, 0x00f7,
858 0x2550, 0x2551, 0x2552, 0x0451, 0x0454, 0x2554, 0x0456, 0x0457,
859 0x2557, 0x2558, 0x2559, 0x255a, 0x255b, 0x0491, 0x255d, 0x255e,
860 0x255f, 0x2560, 0x2561, 0x0401, 0x0403, 0x2563, 0x0406, 0x0407,
861 0x2566, 0x2567, 0x2568, 0x2569, 0x256a, 0x0490, 0x256c, 0x00a9,
862 0x044e, 0x0430, 0x0431, 0x0446, 0x0434, 0x0435, 0x0444, 0x0433,
863 0x0445, 0x0438, 0x0439, 0x043a, 0x043b, 0x043c, 0x043d, 0x043e,
864 0x043f, 0x044f, 0x0440, 0x0441, 0x0442, 0x0443, 0x0436, 0x0432,
865 0x044c, 0x044b, 0x0437, 0x0448, 0x044d, 0x0449, 0x0447, 0x044a,
866 0x042e, 0x0410, 0x0411, 0x0426, 0x0414, 0x0415, 0x0424, 0x0413,
867 0x0425, 0x0418, 0x0419, 0x041a, 0x041b, 0x041c, 0x041d, 0x041e,
868 0x041f, 0x042f, 0x0420, 0x0421, 0x0422, 0x0423, 0x0416, 0x0412,
869 0x042c, 0x042b, 0x0417, 0x0428, 0x042d, 0x0429, 0x0427, 0x042a,
870 0x0
871 };
872
iconv_from_koi8(strc_t in,UBYTE enc)873 static strc_t iconv_from_koi8 (strc_t in, UBYTE enc)
874 {
875 static str_s str = { NULL, 0, 0 };
876 UBYTE c;
877 int off;
878
879 s_init (&str, "", in->len);
880 for (off = 0; off < in->len; off++)
881 s_cat (&str, ConvUTF8 ((c = in->txt[off]) & 0x80 ? koi8u_utf8[c & 0x7f] : c));
882 return &str;
883 }
884
iconv_to_koi8(strc_t in,UBYTE enc)885 static strc_t iconv_to_koi8 (strc_t in, UBYTE enc)
886 {
887 static str_s str = { NULL, 0, 0 };
888 UDWORD ucs;
889 UBYTE c;
890 int off;
891
892 s_init (&str, "", in->len);
893 for (off = 0; off < in->len; )
894 {
895 ucs = ConvGetUTF8 (in, &off);
896 if (ucs & 0xffffff80)
897 {
898 for (c = 0; ~c & 0x80; c++)
899 if (koi8u_utf8[c] == ucs)
900 break;
901
902 s_catc (&str, c & 0x80 ? CHAR_NOT_AVAILABLE : c | 0x80);
903 }
904 else
905 s_catc (&str, ucs);
906 }
907 return &str;
908 }
909 #endif
910
911 #if ENABLE_FALLBACK_WIN1251
912 const UDWORD win1251_utf8[] = { /* 7bit are us-ascii */
913 0x0402, 0x0403, 0x201a, 0x0453, 0x201e, 0x2026, 0x2020, 0x2021,
914 0x0088, 0x2030, 0x0409, 0x2039, 0x040a, 0x040c, 0x040b, 0x040f,
915 0x0452, 0x2018, 0x2019, 0x201c, 0x201d, 0x2022, 0x2013, 0x2014,
916 0x0098, 0x2122, 0x0459, 0x203a, 0x045a, 0x045c, 0x045b, 0x045f,
917 0x00a0, 0x040e, 0x045e, 0x0408, 0x00a4, 0x0490, 0x00a6, 0x00a7,
918 0x0401, 0x00a9, 0x0404, 0x00ab, 0x00ac, 0x00ad, 0x00ae, 0x0407,
919 0x00b0, 0x00b1, 0x0406, 0x0456, 0x0491, 0x00b5, 0x00b6, 0x00b7,
920 0x0451, 0x2116, 0x0454, 0x00bb, 0x0458, 0x0405, 0x0455, 0x0457,
921 0x0410, 0x0411, 0x0412, 0x0413, 0x0414, 0x0415, 0x0416, 0x0417,
922 0x0418, 0x0419, 0x041a, 0x041b, 0x041c, 0x041d, 0x041e, 0x041f,
923 0x0420, 0x0421, 0x0422, 0x0423, 0x0424, 0x0425, 0x0426, 0x0427,
924 0x0428, 0x0429, 0x042a, 0x042b, 0x042c, 0x042d, 0x042e, 0x042f,
925 0x0430, 0x0431, 0x0432, 0x0433, 0x0434, 0x0435, 0x0436, 0x0437,
926 0x0438, 0x0439, 0x043a, 0x043b, 0x043c, 0x043d, 0x043e, 0x043f,
927 0x0440, 0x0441, 0x0442, 0x0443, 0x0444, 0x0445, 0x0446, 0x0447,
928 0x0448, 0x0449, 0x044a, 0x044b, 0x044c, 0x044d, 0x044e, 0x044f,
929 0x0
930 };
931
iconv_from_win1251(strc_t in,UBYTE enc)932 static strc_t iconv_from_win1251 (strc_t in, UBYTE enc)
933 {
934 static str_s str = { NULL, 0, 0 };
935 UBYTE c;
936 int off;
937
938 s_init (&str, "", in->len);
939 for (off = 0; off < in->len; off++)
940 s_cat (&str, ConvUTF8 ((c = in->txt[off]) & 0x80 ? win1251_utf8[c & 0x7f] : c));
941 return &str;
942 }
943
iconv_to_win1251(strc_t in,UBYTE enc)944 static strc_t iconv_to_win1251 (strc_t in, UBYTE enc)
945 {
946 static str_s str = { NULL, 0, 0 };
947 UDWORD ucs;
948 UBYTE c;
949 int off;
950
951 s_init (&str, "", in->len);
952 for (off = 0; off < in->len; )
953 {
954 ucs = ConvGetUTF8 (in, &off);
955 if (ucs & 0xffffff80)
956 {
957 for (c = 0; ~c & 0x80; c++)
958 if (win1251_utf8[c] == ucs)
959 break;
960
961 s_catc (&str, c & 0x80 ? CHAR_NOT_AVAILABLE : c | 0x80);
962 }
963 else
964 s_catc (&str, ucs);
965 }
966 return &str;
967 }
968 #endif
969
970 #if ENABLE_FALLBACK_UCS2BE
iconv_from_ucs2be(strc_t in,UBYTE enc)971 static strc_t iconv_from_ucs2be (strc_t in, UBYTE enc)
972 {
973 static str_s str = { NULL, 0, 0 };
974 UDWORD ucs;
975 int off;
976
977 s_init (&str, "", in->len);
978 for (off = 0; off < in->len; )
979 {
980 if (off + 1 >= in->len)
981 {
982 s_catc (&str, CHAR_INCOMPLETE);
983 break;
984 }
985
986 ucs = (UBYTE)in->txt[off++] << 8;
987 ucs |= (UBYTE)in->txt[off++];
988 if ((ucs & 0xf800) == 0xd800)
989 s_catc (&str, CHAR_BROKEN);
990 else
991 s_cat (&str, ConvUTF8 (ucs));
992 }
993 return &str;
994 }
995
iconv_to_ucs2be(strc_t in,UBYTE enc)996 static strc_t iconv_to_ucs2be (strc_t in, UBYTE enc)
997 {
998 static str_s str = { NULL, 0, 0 };
999 UDWORD ucs;
1000 int off;
1001
1002 s_init (&str, "", in->len);
1003 for (off = 0; off < in->len; )
1004 {
1005 ucs = ConvGetUTF8 (in, &off);
1006 if (ucs & 0xffff0000)
1007 {
1008 s_catc (&str, 0);
1009 s_catc (&str, CHAR_NOT_AVAILABLE);
1010 }
1011 else
1012 {
1013 s_catc (&str, (ucs >> 8) & 0xff);
1014 s_catc (&str, ucs & 0xff);
1015 }
1016 }
1017 return &str;
1018 }
1019 #endif
1020
1021 #if ENABLE_FALLBACK_WCHART
iconv_from_wchart(strc_t in,UBYTE enc)1022 static strc_t iconv_from_wchart (strc_t in, UBYTE enc)
1023 {
1024 static str_s str = { NULL, 0, 0 };
1025 UDWORD ucs;
1026 int off;
1027
1028 s_init (&str, "", in->len);
1029 for (off = 0; off < in->len; )
1030 {
1031 if (off + sizeof (wchar_t) > in->len)
1032 {
1033 s_catc (&str, CHAR_INCOMPLETE);
1034 break;
1035 }
1036
1037 ucs = *((wchar_t *)(in->txt + off));
1038 off += sizeof (wchar_t);
1039 if ((ucs & 0xf800) == 0xd800)
1040 s_catc (&str, CHAR_BROKEN);
1041 else
1042 s_cat (&str, ConvUTF8 (ucs));
1043 }
1044 return &str;
1045 }
1046
iconv_to_wchart(strc_t in,UBYTE enc)1047 static strc_t iconv_to_wchart (strc_t in, UBYTE enc)
1048 {
1049 static str_s str = { NULL, 0, 0 };
1050 UDWORD ucs;
1051 wchar_t na = CHAR_NOT_AVAILABLE;
1052 int off;
1053
1054 s_init (&str, "", in->len);
1055 for (off = 0; off < in->len; )
1056 {
1057 ucs = ConvGetUTF8 (in, &off);
1058 if ((ucs & 0xf800) == 0xd800)
1059 s_catc (&str, CHAR_BROKEN);
1060 else if ( (sizeof (wchar_t) <= 1 && ucs & 0xffffff00)
1061 || (sizeof (wchar_t) <= 2 && ucs & 0xffff0000))
1062 s_catn (&str, (const char *)&na, sizeof (wchar_t));
1063 else
1064 s_catn (&str, (const char *)&ucs, sizeof (wchar_t));
1065 }
1066 return &str;
1067 }
1068 #endif
1069 #endif /* ENABLE_FALLBACK_* */
1070