1 /*
2  * This file handles character conversions.
3  *
4  * climm Copyright (C) © 2001-2005 Rüdiger Kuhlmann
5  *
6  * climm is free software; you can redistribute it and/or modify it
7  * under the terms of the GNU General Public License as published by
8  * the Free Software Foundation; version 2 dated June, 1991.
9  *
10  * climm is distributed in the hope that it will be useful, but WITHOUT
11  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
12  * or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
13  * License for more details.
14  *
15  * In addition, as a special exception permission is granted to link the
16  * code of this release of climm with the OpenSSL project's "OpenSSL"
17  * library, and distribute the linked executables.  You must obey the GNU
18  * General Public License in all respects for all of the code used other
19  * than "OpenSSL".  If you modify this file, you may extend this exception
20  * to your version of the file, but you are not obligated to do so.  If you
21  * do not wish to do so, delete this exception statement from your version
22  * of this file.
23  *
24  * You should have received a copy of the GNU General Public License
25  * along with this package; if not, write to the Free Software
26  * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
27  * 02111-1307, USA.
28  *
29  *
30  * $Id: conv.c 2420 2007-10-10 21:46:48Z kuhlmann $
31  */
32 
33 #include "climm.h"
34 #if HAVE_ERRNO_H
35 #include <errno.h>
36 #endif
37 #if HAVE_CTYPE_H
38 #include <ctype.h>
39 #endif
40 #include "conv.h"
41 #include "preferences.h"
42 #if !HAVE_WCWIDTH
43 #undef ENABLE_FALLBACK_WCHART
44 #endif
45 
/* Common signature of every converter in this file: it receives the text
 * to convert and the encoding table index, and returns the converted text. */
typedef strc_t (iconv_func)(strc_t, UBYTE);

#if HAVE_ICONV
#include <iconv.h>
/* Converters that delegate to the system's iconv(). */
static strc_t iconv_from_iconv (strc_t, UBYTE);
static strc_t iconv_to_iconv (strc_t, UBYTE);
#endif
/* Built-in US-ASCII converters; always available. */
static iconv_func iconv_from_usascii, iconv_to_usascii;
/* Optional built-in converters, each compiled in only if its
 * ENABLE_FALLBACK_* switch is set. */
#if ENABLE_FALLBACK_UTF8
static iconv_func iconv_from_utf8, iconv_to_utf8;
#endif
#if ENABLE_FALLBACK_LATIN1
static iconv_func iconv_from_latin1, iconv_to_latin1;
#endif
#if ENABLE_FALLBACK_LATIN9
static iconv_func iconv_from_latin9, iconv_to_latin9;
#endif
#if ENABLE_FALLBACK_KOI8
static iconv_func iconv_from_koi8, iconv_to_koi8;
#endif
#if ENABLE_FALLBACK_WIN1251
static iconv_func iconv_from_win1251, iconv_to_win1251;
#endif
#if ENABLE_FALLBACK_UCS2BE
static iconv_func iconv_from_ucs2be, iconv_to_ucs2be;
#endif
#if ENABLE_FALLBACK_WCHART
static iconv_func iconv_from_wchart, iconv_to_wchart;
#endif
/*
 * One entry of the encoding table.
 *
 *   enca..encd  up to four alternative names of the encoding; enca is the
 *               one reported by ConvEncName(), unused alternatives are NULL
 *   iof, ito    iconv descriptors converting from this encoding to UTF-8
 *               and from UTF-8 to this encoding; (iconv_t)-1 if the open
 *               failed
 *   fof, fto    converter functions for the same two directions
 */
typedef struct { const char *enca; const char *encb; const char *encc; const char *encd;
#if HAVE_ICONV
                 iconv_t     iof; iconv_t      ito;
#endif
                 iconv_func *fof; iconv_func *fto; } enc_t;
80 
/* Number of entries allocated for conv_encs. */
static int conv_nr = 0;
/* The encoding table; allocated by ConvInit() and grown by ConvEnc(). */
static enc_t *conv_encs = NULL;

/* Set by ConvEnc() to the table index of an encoding it could not set up
 * a converter for. */
UBYTE conv_error = 0;
85 
86 #if HAVE_ICONV
87 static const char *Utf8Name = "UTF-8";
88 /*
89  * Check whether iconv() can handle it.
90  */
iconv_check(UBYTE enc)91 static BOOL iconv_check (UBYTE enc)
92 {
93 #ifdef ENABLE_TRANSLIT
94     conv_encs[enc].ito = iconv_open (s_sprintf ("%s//TRANSLIT", conv_encs[enc].enca), Utf8Name);
95     if (conv_encs[enc].ito == (iconv_t)-1)
96 #endif
97         conv_encs[enc].ito = iconv_open (conv_encs[enc].enca, Utf8Name);
98     conv_encs[enc].iof = iconv_open (Utf8Name, conv_encs[enc].enca);
99     if ((conv_encs[enc].ito == (iconv_t)-1 || conv_encs[enc].iof == (iconv_t)-1)
100         && conv_encs[enc].encb)
101     {
102 #ifdef ENABLE_TRANSLIT
103         conv_encs[enc].ito = iconv_open (s_sprintf ("%s//TRANSLIT", conv_encs[enc].encb), Utf8Name);
104         if (conv_encs[enc].ito == (iconv_t)-1)
105 #endif
106             conv_encs[enc].ito = iconv_open (conv_encs[enc].encb, Utf8Name);
107         conv_encs[enc].iof = iconv_open (Utf8Name, conv_encs[enc].encb);
108     }
109     if ((conv_encs[enc].ito == (iconv_t)-1 || conv_encs[enc].iof == (iconv_t)-1)
110         && conv_encs[enc].encc)
111     {
112 #ifdef ENABLE_TRANSLIT
113         conv_encs[enc].ito = iconv_open (s_sprintf ("%s//TRANSLIT", conv_encs[enc].encc), Utf8Name);
114         if (conv_encs[enc].ito == (iconv_t)-1)
115 #endif
116             conv_encs[enc].ito = iconv_open (conv_encs[enc].encc, Utf8Name);
117         conv_encs[enc].iof = iconv_open (Utf8Name, conv_encs[enc].encc);
118     }
119     if ((conv_encs[enc].ito == (iconv_t)-1 || conv_encs[enc].iof == (iconv_t)-1)
120         && conv_encs[enc].encd)
121     {
122 #ifdef ENABLE_TRANSLIT
123         conv_encs[enc].ito = iconv_open (s_sprintf ("%s//TRANSLIT", conv_encs[enc].encd), Utf8Name);
124         if (conv_encs[enc].ito == (iconv_t)-1)
125 #endif
126             conv_encs[enc].ito = iconv_open (conv_encs[enc].encd, Utf8Name);
127         conv_encs[enc].iof = iconv_open (Utf8Name, conv_encs[enc].encd);
128     }
129     if (enc == ENC_LATIN1 && conv_encs[enc].ito == (iconv_t)-1)
130     {
131         conv_encs[enc].ito = iconv_open (conv_encs[enc].encc, "utf8");
132         conv_encs[enc].iof = iconv_open ("utf8", conv_encs[enc].encc);
133         if (conv_encs[enc].ito != (iconv_t)-1 && conv_encs[enc].iof != (iconv_t)-1)
134             Utf8Name = "utf8";
135     }
136     if (conv_encs[enc].ito != (iconv_t)-1 && conv_encs[enc].iof != (iconv_t)-1)
137     {
138         conv_encs[enc].fof = &iconv_from_iconv;
139         conv_encs[enc].fto = &iconv_to_iconv;
140         return TRUE;
141     }
142     return FALSE;
143 }
144 #endif
145 
146 #if HAVE_ICONV
iconv_reset(iconv_t cd)147 static inline void iconv_reset (iconv_t cd)
148 {
149     size_t sunos57_sucks_inl = 0, sunos57_sucks_outl = 0;
150     char *sunos57_sucks_outb = NULL;
151     /* SunOS 5.7 segfaults if anything other than inb is NULL */
152     iconv (cd, NULL, &sunos57_sucks_inl, &sunos57_sucks_outb, &sunos57_sucks_outl);
153 }
154 #endif
155 
/*
 * Initialize encoding table.
 *
 * Allocates the table with 15 zeroed entries and fills in the known names
 * of the built-in encodings.  With iconv available, the system's UTF-8
 * converter is then probed with malformed input and dropped again if it
 * accepts it.  Finally a converter pair is installed for ASCII, UTF-8,
 * Latin-1, Latin-9, KOI8, Windows-1251, UCS-2BE and wchar_t wherever none
 * is set yet: the compiled-in fallback if enabled, otherwise the
 * converters of ENC_ASCII.
 */
void ConvInit (void)
{
    conv_error = 0;
    /* NOTE(review): the calloc() result is used without a NULL check */
    conv_encs = calloc (sizeof (enc_t), conv_nr = 15);
    conv_encs[ENC_ASCII].enca = "US-ASCII";
    conv_encs[ENC_ASCII].encb = "USASCII";
    conv_encs[ENC_ASCII].encc = "ANSI_X3.4-1968";
    conv_encs[ENC_UTF8].enca = "UTF-8";
    conv_encs[ENC_LATIN1].enca = "ISO-8859-1";
    conv_encs[ENC_LATIN1].encb = "ISO8859-1";
    conv_encs[ENC_LATIN1].encc = "ISO88591"; /* don't re-sort */
    conv_encs[ENC_LATIN1].encd = "LATIN1";
    conv_encs[ENC_LATIN9].enca = "ISO-8859-15";
    conv_encs[ENC_LATIN9].encb = "ISO8859-15";
    conv_encs[ENC_LATIN9].encc = "ISO885915";
    conv_encs[ENC_LATIN9].encd = "LATIN9";
    conv_encs[ENC_KOI8].enca = "KOI8-U";
    conv_encs[ENC_KOI8].encb = "KOI8-R";
    conv_encs[ENC_KOI8].encc = "KOI8";
    conv_encs[ENC_WIN1251].enca = "CP1251";
    conv_encs[ENC_WIN1251].encb = "WINDOWS-1251";
    conv_encs[ENC_WIN1251].encc = "CP-1251";
    conv_encs[ENC_UCS2BE].enca = "UCS-2BE";
    conv_encs[ENC_UCS2BE].encb = "UNICODEBIG";
    conv_encs[ENC_UCS2BE].encc = "UNICODE-2-0"; /* ICQ sucks */
    conv_encs[ENC_WIN1257].enca = "CP1257";
    conv_encs[ENC_WIN1257].encb = "WINDOWS-1257";
    conv_encs[ENC_WIN1257].encc = "CP-1257";
    conv_encs[ENC_EUC].enca = "EUC-JP";
    conv_encs[ENC_SJIS].enca = "SHIFT-JIS";
    conv_encs[ENC_SJIS].encb = "SJIS";
    conv_encs[ENC_WCHART].enca = "WCHAR_T";

#if HAVE_ICONV
    /* extra check for UTF-8 */
    /* looking the name up makes ConvEnc() open the iconv descriptors */
    ConvEnc (conv_encs[ENC_UTF8].enca);
    if (conv_encs[ENC_UTF8].fof)
    {
        size_t inl = 2, outl = 10;
        char inb[10], outb[10], *outp = outb;
        ICONV_CONST char *inp = inb;
        /* two probes of two bytes each: "\xfc." is not a valid sequence,
         * "\xc0\xaf" is an overlong encoding of '/' */
        strcpy (inb, "\xfc.\xc0\xaf");
        /* a converter that accepts the first probe is not trusted */
        if (iconv (conv_encs[ENC_UTF8].ito, &inp, &inl, &outp, &outl) != (size_t)-1)
            conv_encs[ENC_UTF8].fto = conv_encs[ENC_UTF8].fof = NULL;
        else
        {
            /* NOTE(review): inl is not reset to 2 before the second probe;
             * this relies on the failed call not having consumed input */
            inp = inb + 2;
            iconv_reset (conv_encs[ENC_UTF8].ito);
            /* NOTE(review): outp has been advanced past anything written,
             * so *outp reads a byte of outb that was never set -- confirm
             * whether outb[0] was meant */
            if ((iconv (conv_encs[ENC_UTF8].ito, &inp, &inl, &outp, &outl) != (size_t)-1) && *outp != '/')
                conv_encs[ENC_UTF8].fto = conv_encs[ENC_UTF8].fof = NULL;
        }
    }
#endif
    /* from here on: install converters wherever none is set yet */
    if (!conv_encs[ENC_ASCII].fof)
    {
        conv_encs[ENC_ASCII].fof  = &iconv_from_usascii;
        conv_encs[ENC_ASCII].fto  = &iconv_to_usascii;
    }
    if (!conv_encs[ENC_UTF8].fof)
    {
#if ENABLE_FALLBACK_UTF8
        conv_encs[ENC_UTF8].fof  = &iconv_from_utf8;
        conv_encs[ENC_UTF8].fto  = &iconv_to_utf8;
#else
        conv_encs[ENC_UTF8].fof  = conv_encs[ENC_ASCII].fof;
        conv_encs[ENC_UTF8].fto  = conv_encs[ENC_ASCII].fto;
#endif
    }
    if (!conv_encs[ENC_LATIN1].fof)
    {
#if ENABLE_FALLBACK_LATIN1
        conv_encs[ENC_LATIN1].fof  = &iconv_from_latin1;
        conv_encs[ENC_LATIN1].fto  = &iconv_to_latin1;
#else
        conv_encs[ENC_LATIN1].fof  = conv_encs[ENC_ASCII].fof;
        conv_encs[ENC_LATIN1].fto  = conv_encs[ENC_ASCII].fto;
#endif
    }
    if (!conv_encs[ENC_LATIN9].fof)
    {
#if ENABLE_FALLBACK_LATIN9
        conv_encs[ENC_LATIN9].fof  = &iconv_from_latin9;
        conv_encs[ENC_LATIN9].fto  = &iconv_to_latin9;
#else
        conv_encs[ENC_LATIN9].fof  = conv_encs[ENC_ASCII].fof;
        conv_encs[ENC_LATIN9].fto  = conv_encs[ENC_ASCII].fto;
#endif
    }
    if (!conv_encs[ENC_KOI8].fof)
    {
#if ENABLE_FALLBACK_KOI8
        conv_encs[ENC_KOI8].fof  = &iconv_from_koi8;
        conv_encs[ENC_KOI8].fto  = &iconv_to_koi8;
#else
        conv_encs[ENC_KOI8].fof  = conv_encs[ENC_ASCII].fof;
        conv_encs[ENC_KOI8].fto  = conv_encs[ENC_ASCII].fto;
#endif
    }
    if (!conv_encs[ENC_WIN1251].fof)
    {
#if ENABLE_FALLBACK_WIN1251
        conv_encs[ENC_WIN1251].fof  = &iconv_from_win1251;
        conv_encs[ENC_WIN1251].fto  = &iconv_to_win1251;
#else
        conv_encs[ENC_WIN1251].fof  = conv_encs[ENC_ASCII].fof;
        conv_encs[ENC_WIN1251].fto  = conv_encs[ENC_ASCII].fto;
#endif
    }
    if (!conv_encs[ENC_UCS2BE].fof)
    {
#if ENABLE_FALLBACK_UCS2BE
        conv_encs[ENC_UCS2BE].fof  = &iconv_from_ucs2be;
        conv_encs[ENC_UCS2BE].fto  = &iconv_to_ucs2be;
#else
        conv_encs[ENC_UCS2BE].fof  = conv_encs[ENC_ASCII].fof;
        conv_encs[ENC_UCS2BE].fto  = conv_encs[ENC_ASCII].fto;
#endif
    }
    if (!conv_encs[ENC_WCHART].fof)
    {
#if ENABLE_FALLBACK_WCHART
        conv_encs[ENC_WCHART].fof  = &iconv_from_wchart;
        conv_encs[ENC_WCHART].fto  = &iconv_to_wchart;
#else
        conv_encs[ENC_WCHART].fof  = conv_encs[ENC_ASCII].fof;
        conv_encs[ENC_WCHART].fto  = conv_encs[ENC_ASCII].fto;
#endif
    }
}
288 
289 /*
290  * Give an ID for the given encoding name.
291  */
ConvEnc(const char * enc)292 UBYTE ConvEnc (const char *enc)
293 {
294     UBYTE nr;
295 
296     for (nr = 0; conv_encs[nr].enca; nr++)
297         if (!strcasecmp (conv_encs[nr].enca, enc) ||
298             (conv_encs[nr].encb && !strcasecmp (conv_encs[nr].encb, enc)) ||
299             (conv_encs[nr].encc && !strcasecmp (conv_encs[nr].encc, enc)) ||
300             (conv_encs[nr].encd && !strcasecmp (conv_encs[nr].encd, enc)))
301         {
302 #if HAVE_ICONV
303             if (!conv_encs[nr].ito || !conv_encs[nr].iof)
304                 iconv_check (nr);
305             if (conv_encs[nr].ito != (iconv_t)(-1) && conv_encs[nr].iof != (iconv_t)(-1))
306                 return nr;
307             return ENC_FERR | nr;
308 #endif
309             if (conv_encs[nr].fof && conv_encs[nr].fto)
310                 return nr;
311             break;
312         }
313 
314     if (nr & ENC_FLAGS)
315         return ENC_FERR;
316     if (nr == conv_nr - 1)
317     {
318         enc_t *newc = realloc (conv_encs, sizeof (enc_t) * (conv_nr + 8));
319         if (!newc)
320             return 0;
321         conv_nr += 8;
322         conv_encs = newc;
323     }
324     if (!conv_encs[nr].enca)
325     {
326         char *p;
327         for (conv_encs[nr].enca = p = strdup (enc); *p; p++)
328             *p = toupper (*p);
329         conv_encs[nr].encb = strdup (enc);
330         conv_encs[nr + 1].enca = NULL;
331     }
332 #if HAVE_ICONV
333     if (iconv_check (nr))
334         return nr;
335 #endif
336     conv_error = nr;
337     return ENC_FERR | nr;
338 }
339 
340 /*
341  * Give the encoding name for a given ID
342  */
ConvEncName(UBYTE enc)343 const char *ConvEncName (UBYTE enc)
344 {
345     if ((enc & ~ENC_FLAGS) > conv_nr)
346         return "<auto/undefined>";
347     return conv_encs[enc & ~ENC_FLAGS].enca;
348 }
349 
ConvCrush0xFE(const char * in)350 const char *ConvCrush0xFE (const char *in)
351 {
352     static str_s t;
353     char *p;
354 
355     if (!in || !*in)
356         return "";
357 
358     s_init (&t, in, 0);
359 
360     for (p = t.txt; *p; p++)
361         if (*p == Conv0xFE)
362             *p = '*';
363     return t.txt;
364 }
365 
366 /*
367  * Convert a single unicode code point to UTF-8
368  */
ConvUTF8(UDWORD ucs)369 const char *ConvUTF8 (UDWORD ucs)
370 {
371     static char b[5];
372 
373     if      (!(ucs & 0xffffff80))
374     {
375         b[0] = ucs;
376         b[1] = '\0';
377     }
378     else if (!(ucs & 0xfffff800))
379     {
380         b[0] = 0xc0 |  (ucs               >>  6);
381         b[1] = 0x80 |  (ucs &       0x3f);
382         b[2] = '\0';
383     }
384     else if (!(ucs & 0xffff0000))
385     {
386         b[0] = 0xe0 | ( ucs               >> 12);
387         b[1] = 0x80 | ((ucs &      0xfc0) >>  6);
388         b[2] = 0x80 |  (ucs &       0x3f);
389         b[3] = '\0';
390     }
391     else if (!(ucs & 0xffe00000))
392     {
393         b[0] = 0xf0 | ( ucs               >> 18);
394         b[1] = 0x80 | ((ucs &    0x3f000) >> 12);
395         b[2] = 0x80 | ((ucs &      0xfc0) >>  6);
396         b[3] = 0x80 |  (ucs &       0x3f);
397         b[4] = '\0';
398     }
399     else
400     {
401         b[0] = CHAR_BROKEN;
402         b[1] = '\0';
403     }
404     return b;
405 }
406 
/*
 * Decode one UTF-8 sequence from in->txt starting at *off.
 *
 * *off is always advanced past the lead byte, and past the continuation
 * bytes as well once the whole sequence has been decoded.  Returns the
 * code point, CHAR_BROKEN for a continuation byte in lead position or a
 * wrong byte where a continuation byte is expected, and CHAR_INCOMPLETE
 * if a NUL byte is found where a continuation byte is expected.
 *
 * NOTE(review): no access is checked against in->len; reading stops at
 * the first byte that is not a continuation byte, so this presumably
 * relies on in->txt being NUL-terminated -- confirm against callers.
 */
UDWORD ConvGetUTF8 (strc_t in, int *off)
{
     UDWORD ucs = 0;
     int i, continuations = 1;
     UBYTE  c = in->txt[(*off)++];

     /* 0xxxxxxx: plain ASCII */
     if (~c & 0x80)
         return c;

     /* 10xxxxxx: a continuation byte cannot start a sequence */
     if (~c & 0x40)
         return CHAR_BROKEN;

     /* every further leading 1 bit announces one more continuation byte;
      * shifting the UBYTE left drops the bit just counted */
     while (c & 0x20)
     {
         continuations++;
         c <<= 1;
     }

     /* strip the remaining marker bits and undo the shifts above, which
      * leaves the payload bits of the lead byte */
     c &= 0x3f;
     c >>= continuations - 1;

     for (i = 0, ucs = c; i < continuations; i++)
     {
         if (((c = in->txt[*off + i]) & 0xc0) != 0x80)
             return c ? CHAR_BROKEN : CHAR_INCOMPLETE;

         /* each continuation byte contributes six payload bits */
         c &= 0x3f;
         ucs <<= 6;
         ucs |= c;
     }
     *off += continuations;
     return ucs;
}
440 
/*
 * Convert text from the given encoding with the entry's "from" converter.
 * Flag bits in enc are ignored.  An index outside the table or an entry
 * without a converter is handled with the ENC_ASCII converter instead.
 */
strc_t ConvFrom (strc_t text, UBYTE enc)
{
    enc &= ~ENC_FLAGS;
#if HAVE_ICONV
    /* open the iconv descriptors of this entry on first use */
    if (enc < conv_nr)
        if (!conv_encs[enc].iof)
            iconv_check (enc);
#endif
    if (enc >= conv_nr || !conv_encs[enc].fof)
        return conv_encs[ENC_ASCII].fof (text, ENC_ASCII);

    return conv_encs[enc].fof (text, enc);
}
452 
/*
 * Like ConvFrom(), but treats every 0xfe byte as a field separator: each
 * field is converted on its own and the separators are copied through
 * unchanged.  The result lives in a function-static string.
 */
strc_t ConvFromSplit (strc_t text, UBYTE enc)
{
    static str_s joined;
    str_s field;
    const char *sep;
    size_t remaining = text->len;

    s_init (&joined, "", 100);
    field.txt = (char *)text->txt;
    field.max = 0;
    for (;;)
    {
        sep = memchr (field.txt, '\xfe', remaining);
        if (!sep)
            break;

        field.len = sep - field.txt;
        s_cat (&joined, ConvFrom (&field, enc)->txt);
        s_catc (&joined, '\xfe');

        /* continue behind the separator */
        remaining -= field.len + 1;
        field.txt += field.len + 1;
    }
    /* whatever follows the last separator */
    field.len = remaining;
    s_cat (&joined, ConvFrom (&field, enc)->txt);
    return &joined;
}
475 
ConvToSplit(const char * text,UBYTE enc)476 strc_t ConvToSplit (const char *text, UBYTE enc)
477 {
478     static str_s str;
479     const char *p;
480     size_t tlen = strlen (text);
481     size_t plen;
482 
483     s_init (&str, "", 100);
484     while ((p = memchr (text, '\xfe', tlen)))
485     {
486         plen = p - text;
487         s_cat (&str, ConvToLen (text, enc, plen)->txt);
488         s_catc (&str, '\xfe');
489         text += plen + 1;
490         tlen -= plen + 1;
491     }
492     s_cat (&str, ConvToLen (text, enc, tlen)->txt);
493     return &str;
494 }
495 
ConvToLen(const char * ctext,UBYTE enc,size_t len)496 strc_t ConvToLen (const char *ctext, UBYTE enc, size_t len)
497 {
498     str_s text;
499     enc &= ~ENC_FLAGS;
500 #if HAVE_ICONV
501     if ((enc < conv_nr) && !conv_encs[enc].ito)
502         iconv_check (enc);
503 #endif
504     text.txt = (char *)ctext;
505     text.len = len;
506     text.max = 0;
507     if ((enc >= conv_nr) || (!conv_encs[enc].fto))
508         enc = ENC_ASCII;
509     return conv_encs[enc].fto (&text, enc);
510 }
511 
ConvTo(const char * ctext,UBYTE enc)512 strc_t ConvTo (const char *ctext, UBYTE enc)
513 {
514     return ConvToLen (ctext, enc, ctext ? strlen (ctext) : 0);
515 }
516 
/*
 * Convert a message body according to its MIME type.
 *
 * The encoding is taken from a "charset=" parameter in mime (optionally
 * quoted, cut at the next ';'); without one, or if the name cannot be
 * resolved, ENC_ASCII is used.  For the HTML-like types the html/body
 * tags are removed and <br> is turned into a newline; text/plain is left
 * alone; for any other type the MIME string is inserted in front of the
 * text.  The result lives in a function-static string.
 */
strc_t ConvFromMime (strc_t mime, strc_t text)
{
    static str_s out;
    strc_t o;
    UBYTE enc = ENC_ASCII;
    const char *e;
    if ((e = strstr (mime->txt, "charset=")))
    {
        char *ee = strdup (e + 8);
        /* without memory for the copy the charset is simply ignored */
        if (ee)
        {
            char *cut;

            e = ee;
            if (*e == '"')
            {
                e++;
                if ((cut = strchr (e, '"')))
                    *cut = 0;
            }
            if ((cut = strchr (e, ';')))
                *cut = 0;
            enc = ConvEnc (e);
            if (!enc || ENC_FERR & enc)
                enc = ENC_ASCII;
            free (ee);
        }
        if (enc != ENC_ASCII && enc != ENC_UTF8 && ConvIsUTF8 (text->txt) && text->len == strlen (text->txt))
            enc = ENC_UTF8; /* ICQ 5.10, Build 3000 send l1 even though it _is_ utf8 */
    }
    o = ConvFrom (text, enc);
    s_init (&out, o->txt, 0);

    if (   (!strncmp (mime->txt, "text/x-aolrtf", 13) && (!mime->txt[13] || mime->txt[13] == ';'))
        || (!strncmp (mime->txt, "text/aolrtf", 11)   && (!mime->txt[11] || mime->txt[11] == ';'))
        || (!strncmp (mime->txt, "text/html", 9)      && (!mime->txt[9]  || mime->txt[9]  == ';')) )
    {
        /* more or less html */
        s_strrepl (&out, "<html>", "");
        s_strrepl (&out, "</html>", "");
        s_strrepl (&out, "<body>", "");
        s_strrepl (&out, "</body>", "");
        s_strrepl (&out, "<br/>", "\n");
        s_strrepl (&out, "<br>", "\n");
    }
    else if (!strncmp (mime->txt, "text/plain", 10)   && (!mime->txt[10] || mime->txt[10] == ';'))
    {
        /* nothing to do */
    }
    else
    {
        /* unknown */
        s_insn (&out, 0, mime->txt, mime->len);
    }
    return &out;
}
568 
569 /*
570  * Transliterates manually a string if it doesn't fit into the local
571  * encoding
572  */
ConvTranslit(const char * orig,const char * trans)573 const char *ConvTranslit (const char *orig, const char *trans)
574 {
575     UBYTE enc = prG->enc_loc;
576 
577     if (strcmp (orig, ConvFrom (ConvTo (orig, enc), enc)->txt))
578         return trans;
579     return orig;
580 }
581 
ConvFits(const char * in,UBYTE enc)582 BOOL ConvFits (const char *in, UBYTE enc)
583 {
584     char *inn, *p;
585     int i;
586 
587     inn = strdup (in);
588     if (!inn)
589         return 0;
590     for (p = inn; *p; p++)
591         if (*p == Conv0xFE || *p == CHAR_NOT_AVAILABLE || *p == CHAR_INCOMPLETE || *p == CHAR_BROKEN)
592             *p = ' ';
593     i = strpbrk (ConvFrom (ConvTo (inn, enc), enc)->txt, "?*_") ? 0 : 1;
594     free (inn);
595     return i;
596 }
597 
598 #if HAVE_ICONV
/*
 * Convert text from encoding enc with the entry's "from" iconv
 * descriptor (opened with UTF-8 as target).  The result lives in a
 * function-static string.
 *
 * iconv() is restarted after every error: a full output buffer makes the
 * string grow, a truncated input sequence (EINVAL) ends the conversion
 * with CHAR_INCOMPLETE appended, and any other error replaces one input
 * byte by CHAR_NOT_AVAILABLE.
 */
static strc_t iconv_from_iconv (strc_t text, UBYTE enc)
{
    static str_s str;

    size_t inleft, outleft;
    char *out;
    ICONV_CONST char *in;

    s_init (&str, "", 100);
    out = str.txt;
    /* two bytes of the buffer are always held back */
    outleft = str.max - 2;
    in = (ICONV_CONST char *) text->txt;
    inleft = text->len;

    iconv_reset (conv_encs[enc].iof);
    while (iconv (conv_encs[enc].iof, &in, &inleft, &out, &outleft) == (size_t)(-1))
    {
        /* save errno before any other call can change it */
        UDWORD rc = errno;
        /* iconv() wrote behind the string's back; resync the length */
        str.len = out - str.txt;

        if (outleft < 10 || rc == E2BIG)
            s_blow (&str, 50 + inleft);
        else if (rc == EINVAL)
        {
            s_catc (&str, CHAR_INCOMPLETE);
            str.txt[str.len] = '\0';
            return &str;
        }
        else
        {
            s_catc (&str, CHAR_NOT_AVAILABLE);
            in++;
            inleft--;
        }
        /* recompute the output window from the string, whose buffer or
         * length was changed above */
        out = str.txt + str.len;
        outleft = str.max - str.len - 2;
        iconv_reset (conv_encs[enc].iof);
    }
    *out = '\0';
    str.len = out - str.txt;
    return &str;
}
641 
/*
 * Convert text to encoding enc with the entry's "to" iconv descriptor
 * (opened with UTF-8 as source).  The result lives in a function-static
 * string.
 *
 * iconv() is restarted after every error: a full output buffer makes the
 * string grow, a truncated input sequence (EINVAL) ends the conversion
 * after CHAR_INCOMPLETE has been converted into the output, and any
 * other error converts CHAR_NOT_AVAILABLE into the output and skips one
 * input byte.
 */
static strc_t iconv_to_iconv (strc_t text, UBYTE enc)
{
    static str_s str;

    size_t inleft, outleft;
    char *out;
    ICONV_CONST char *in;

    s_init (&str, "", 100 + text->len);
    out = str.txt;
    /* two bytes of the buffer are always held back */
    outleft = str.max - 2;
    in = (ICONV_CONST char *) text->txt;
    inleft = text->len;

    iconv_reset (conv_encs[enc].ito);
    while (iconv (conv_encs[enc].ito, &in, &inleft, &out, &outleft) == (size_t)(-1))
    {
        /* save errno before any other call can change it */
        UDWORD rc = errno;
        /* iconv() wrote behind the string's back; resync the length */
        str.len = out - str.txt;

        if (outleft < 10 || rc == E2BIG)
            s_blow (&str, 50 + inleft);
        else if (rc == EINVAL)
        {
            /* these locals deliberately shadow the outer in/inleft: the
             * marker is run through the same descriptor so that it ends
             * up in the target encoding */
            char inc = CHAR_INCOMPLETE;
            ICONV_CONST char *in = &inc;
            size_t inleft = 1;
            iconv_reset (conv_encs[enc].ito);
            iconv (conv_encs[enc].ito, &in, &inleft, &out, &outleft);
            str.len = out - str.txt;
            str.txt[str.len] = '\0';
            return &str;
        }
        else
        {
            char inc = CHAR_NOT_AVAILABLE;
            ICONV_CONST char *inn = &inc;
            size_t innleft = 1;
            iconv_reset (conv_encs[enc].ito);
            iconv (conv_encs[enc].ito, &inn, &innleft, &out, &outleft);
            /* NOTE(review): only one byte is skipped, so a multi-byte
             * input character presumably yields one replacement per
             * byte -- confirm whether that is intended */
            in++;
            inleft--;
            iconv_reset (conv_encs[enc].ito);
            /* out/outleft were advanced by iconv() itself */
            continue;
        }
        /* the buffer may have moved while growing */
        out = str.txt + str.len;
        outleft = str.max - str.len - 2;
        iconv_reset (conv_encs[enc].ito);
    }
    *out = '\0';
    str.len = out - str.txt;
    return &str;
}
695 #endif
696 
iconv_from_usascii(strc_t in,UBYTE enc)697 static strc_t iconv_from_usascii (strc_t in, UBYTE enc)
698 {
699     static str_s str = { NULL, 0, 0 };
700     int off;
701     char c;
702 
703     s_init (&str, "", in->len);
704     for (off = 0; off < in->len; off++)
705         s_catc (&str, (c = in->txt[off]) & 0x80 ? CHAR_BROKEN : c);
706     return &str;
707 }
708 
iconv_to_usascii(strc_t in,UBYTE enc)709 static strc_t iconv_to_usascii (strc_t in, UBYTE enc)
710 {
711     static str_s str = { NULL, 0, 0 };
712     UDWORD ucs;
713     int off;
714 
715     s_init (&str, "", in->len);
716     for (off = 0; off < in->len; )
717     {
718         ucs = ConvGetUTF8 (in, &off);
719         s_catc (&str, ucs >= 0x80 ? CHAR_NOT_AVAILABLE : ucs);
720     }
721     return &str;
722 }
723 
724 #if ENABLE_FALLBACK_UCS2BE || ENABLE_FALLBACK_WIN1251 || ENABLE_FALLBACK_KOI8 \
725   || ENABLE_FALLBACK_LATIN9 || ENABLE_FALLBACK_LATIN1 || ENABLE_FALLBACK_UTF8 || ENABLE_FALLBACK_WCHART
726 
727 #if ENABLE_FALLBACK_UTF8
/*
 * Re-encodes UTF-8 into the caller's string: every sequence is decoded
 * with ConvGetUTF8() and written again with ConvUTF8().  Returns out.
 */
strc_t iconv_utf8_buf (str_t out, strc_t in, UBYTE enc)
{
    int off = 0;

    s_init (out, "", in->len);
    while (off < in->len)
    {
        /* ConvGetUTF8() advances off */
        UDWORD ucs = ConvGetUTF8 (in, &off);
        s_cat (out, ConvUTF8 (ucs));
    }
    return out;
}
741 
/*
 * Built-in converter to UTF-8: re-encodes the input through
 * iconv_utf8_buf() into this function's own static string.
 */
strc_t iconv_to_utf8 (strc_t in, UBYTE enc)
{
    static str_s result = { NULL, 0, 0 };

    return iconv_utf8_buf (&result, in, enc);
}
747 
/*
 * Built-in converter from UTF-8: re-encodes the input through
 * iconv_utf8_buf() into this function's own static string.
 */
strc_t iconv_from_utf8 (strc_t in, UBYTE enc)
{
    static str_s result = { NULL, 0, 0 };

    return iconv_utf8_buf (&result, in, enc);
}
753 #endif
754 
755 #if ENABLE_FALLBACK_LATIN1
iconv_from_latin1(strc_t in,UBYTE enc)756 static strc_t iconv_from_latin1 (strc_t in, UBYTE enc)
757 {
758     static str_s str = { NULL, 0, 0 };
759     int off;
760 
761     s_init (&str, "", in->len);
762     for (off = 0; off < in->len; off++)
763         s_cat (&str, ConvUTF8 ((UBYTE)in->txt[off]));
764     return &str;
765 }
766 
iconv_to_latin1(strc_t in,UBYTE enc)767 static strc_t iconv_to_latin1 (strc_t in, UBYTE enc)
768 {
769     static str_s str = { NULL, 0, 0 };
770     UDWORD ucs;
771     int off;
772 
773     s_init (&str, "", in->len);
774     for (off = 0; off < in->len; )
775     {
776         ucs = ConvGetUTF8 (in, &off);
777         s_catc (&str, ucs & 0xffffff00 ? CHAR_NOT_AVAILABLE : ucs);
778     }
779     return &str;
780 }
781 #endif
782 
783 #if ENABLE_FALLBACK_LATIN9
iconv_from_latin9(strc_t in,UBYTE enc)784 static strc_t iconv_from_latin9 (strc_t in, UBYTE enc)
785 {
786     static str_s str = { NULL, 0, 0 };
787     UDWORD ucs;
788     UBYTE c;
789     int off;
790 
791     s_init (&str, "", in->len);
792     for (off = 0; off < in->len; off++)
793     {
794         c = in->txt[off];
795         switch (c)
796         {
797             case 0xa4: ucs = 0x20ac; /* EURO */
798             case 0xa6: ucs = 0x0160; /* SCARON */
799             case 0xa8: ucs = 0x0161; /* SMALL SCARON */
800             case 0xb4: ucs = 0x017d; /* ZCARON */
801             case 0xb8: ucs = 0x017e; /* SMALL ZCARON */
802             case 0xbc: ucs = 0x0152; /* OE */
803             case 0xbd: ucs = 0x0153; /* SMALL OE */
804             case 0xbe: ucs = 0x0178; /* Y DIAERESIS */
805             default:   ucs = c;
806         }
807         s_cat (&str, ConvUTF8 (ucs));
808     }
809     return &str;
810 }
811 
iconv_to_latin9(strc_t in,UBYTE enc)812 static strc_t iconv_to_latin9 (strc_t in, UBYTE enc)
813 {
814     static str_s str = { NULL, 0, 0 };
815     UDWORD ucs;
816     int off;
817 
818     s_init (&str, "", in->len);
819     for (off = 0; off < in->len; )
820     {
821         ucs = ConvGetUTF8 (in, &off);
822         if (!(ucs & 0xffffff00))
823         {
824             switch (ucs)
825             {
826                 case 0xa4: case 0xa6: case 0xa8: case 0xb4:
827                 case 0xb8: case 0xbc: case 0xbd: case 0xbe:
828                     ucs = CHAR_NOT_AVAILABLE;
829             }
830             s_catc (&str, ucs);
831         }
832         else
833         {
834             switch (ucs)
835             {
836                 case 0x20ac: s_catc (&str, '\xa4'); /* EURO */
837                 case 0x0160: s_catc (&str, '\xa6'); /* SCARON */
838                 case 0x0161: s_catc (&str, '\xa8'); /* SMALL SCARON */
839                 case 0x017d: s_catc (&str, '\xb4'); /* ZCARON */
840                 case 0x017e: s_catc (&str, '\xb8'); /* SMALL ZCARON */
841                 case 0x0152: s_catc (&str, '\xbc'); /* OE */
842                 case 0x0153: s_catc (&str, '\xbd'); /* SMALL OE */
843                 case 0x0178: s_catc (&str, '\xbe'); /* Y DIAERESIS */
844                 default:     s_catc (&str, CHAR_NOT_AVAILABLE);
845             }
846         }
847     }
848     return &str;
849 }
850 #endif
851 
852 #if ENABLE_FALLBACK_KOI8
853 const UDWORD koi8u_utf8[] = { /* 7bit are us-ascii */
854     0x2500, 0x2502, 0x250c, 0x2510, 0x2514, 0x2518, 0x251c, 0x2524,
855     0x252c, 0x2534, 0x253c, 0x2580, 0x2584, 0x2588, 0x258c, 0x2590,
856     0x2591, 0x2592, 0x2593, 0x2320, 0x25a0, 0x2022, 0x221a, 0x2248,
857     0x2264, 0x2265, 0x00a0, 0x2321, 0x00b0, 0x00b2, 0x00b7, 0x00f7,
858     0x2550, 0x2551, 0x2552, 0x0451, 0x0454, 0x2554, 0x0456, 0x0457,
859     0x2557, 0x2558, 0x2559, 0x255a, 0x255b, 0x0491, 0x255d, 0x255e,
860     0x255f, 0x2560, 0x2561, 0x0401, 0x0403, 0x2563, 0x0406, 0x0407,
861     0x2566, 0x2567, 0x2568, 0x2569, 0x256a, 0x0490, 0x256c, 0x00a9,
862     0x044e, 0x0430, 0x0431, 0x0446, 0x0434, 0x0435, 0x0444, 0x0433,
863     0x0445, 0x0438, 0x0439, 0x043a, 0x043b, 0x043c, 0x043d, 0x043e,
864     0x043f, 0x044f, 0x0440, 0x0441, 0x0442, 0x0443, 0x0436, 0x0432,
865     0x044c, 0x044b, 0x0437, 0x0448, 0x044d, 0x0449, 0x0447, 0x044a,
866     0x042e, 0x0410, 0x0411, 0x0426, 0x0414, 0x0415, 0x0424, 0x0413,
867     0x0425, 0x0418, 0x0419, 0x041a, 0x041b, 0x041c, 0x041d, 0x041e,
868     0x041f, 0x042f, 0x0420, 0x0421, 0x0422, 0x0423, 0x0416, 0x0412,
869     0x042c, 0x042b, 0x0417, 0x0428, 0x042d, 0x0429, 0x0427, 0x042a,
870     0x0
871 };
872 
iconv_from_koi8(strc_t in,UBYTE enc)873 static strc_t iconv_from_koi8 (strc_t in, UBYTE enc)
874 {
875     static str_s str = { NULL, 0, 0 };
876     UBYTE c;
877     int off;
878 
879     s_init (&str, "", in->len);
880     for (off = 0; off < in->len; off++)
881         s_cat (&str, ConvUTF8 ((c = in->txt[off]) & 0x80 ? koi8u_utf8[c & 0x7f] : c));
882     return &str;
883 }
884 
iconv_to_koi8(strc_t in,UBYTE enc)885 static strc_t iconv_to_koi8 (strc_t in, UBYTE enc)
886 {
887     static str_s str = { NULL, 0, 0 };
888     UDWORD ucs;
889     UBYTE c;
890     int off;
891 
892     s_init (&str, "", in->len);
893     for (off = 0; off < in->len; )
894     {
895         ucs = ConvGetUTF8 (in, &off);
896         if (ucs & 0xffffff80)
897         {
898             for (c = 0; ~c & 0x80; c++)
899                 if (koi8u_utf8[c] == ucs)
900                     break;
901 
902             s_catc (&str, c & 0x80 ? CHAR_NOT_AVAILABLE : c | 0x80);
903         }
904         else
905             s_catc (&str, ucs);
906     }
907     return &str;
908 }
909 #endif
910 
911 #if ENABLE_FALLBACK_WIN1251
912 const UDWORD win1251_utf8[] = { /* 7bit are us-ascii */
913     0x0402, 0x0403, 0x201a, 0x0453, 0x201e, 0x2026, 0x2020, 0x2021,
914     0x0088, 0x2030, 0x0409, 0x2039, 0x040a, 0x040c, 0x040b, 0x040f,
915     0x0452, 0x2018, 0x2019, 0x201c, 0x201d, 0x2022, 0x2013, 0x2014,
916     0x0098, 0x2122, 0x0459, 0x203a, 0x045a, 0x045c, 0x045b, 0x045f,
917     0x00a0, 0x040e, 0x045e, 0x0408, 0x00a4, 0x0490, 0x00a6, 0x00a7,
918     0x0401, 0x00a9, 0x0404, 0x00ab, 0x00ac, 0x00ad, 0x00ae, 0x0407,
919     0x00b0, 0x00b1, 0x0406, 0x0456, 0x0491, 0x00b5, 0x00b6, 0x00b7,
920     0x0451, 0x2116, 0x0454, 0x00bb, 0x0458, 0x0405, 0x0455, 0x0457,
921     0x0410, 0x0411, 0x0412, 0x0413, 0x0414, 0x0415, 0x0416, 0x0417,
922     0x0418, 0x0419, 0x041a, 0x041b, 0x041c, 0x041d, 0x041e, 0x041f,
923     0x0420, 0x0421, 0x0422, 0x0423, 0x0424, 0x0425, 0x0426, 0x0427,
924     0x0428, 0x0429, 0x042a, 0x042b, 0x042c, 0x042d, 0x042e, 0x042f,
925     0x0430, 0x0431, 0x0432, 0x0433, 0x0434, 0x0435, 0x0436, 0x0437,
926     0x0438, 0x0439, 0x043a, 0x043b, 0x043c, 0x043d, 0x043e, 0x043f,
927     0x0440, 0x0441, 0x0442, 0x0443, 0x0444, 0x0445, 0x0446, 0x0447,
928     0x0448, 0x0449, 0x044a, 0x044b, 0x044c, 0x044d, 0x044e, 0x044f,
929     0x0
930 };
931 
iconv_from_win1251(strc_t in,UBYTE enc)932 static strc_t iconv_from_win1251 (strc_t in, UBYTE enc)
933 {
934     static str_s str = { NULL, 0, 0 };
935     UBYTE c;
936     int off;
937 
938     s_init (&str, "", in->len);
939     for (off = 0; off < in->len; off++)
940         s_cat (&str, ConvUTF8 ((c = in->txt[off]) & 0x80 ? win1251_utf8[c & 0x7f] : c));
941     return &str;
942 }
943 
iconv_to_win1251(strc_t in,UBYTE enc)944 static strc_t iconv_to_win1251 (strc_t in, UBYTE enc)
945 {
946     static str_s str = { NULL, 0, 0 };
947     UDWORD ucs;
948     UBYTE c;
949     int off;
950 
951     s_init (&str, "", in->len);
952     for (off = 0; off < in->len; )
953     {
954         ucs = ConvGetUTF8 (in, &off);
955         if (ucs & 0xffffff80)
956         {
957             for (c = 0; ~c & 0x80; c++)
958                 if (win1251_utf8[c] == ucs)
959                     break;
960 
961             s_catc (&str, c & 0x80 ? CHAR_NOT_AVAILABLE : c | 0x80);
962         }
963         else
964             s_catc (&str, ucs);
965     }
966     return &str;
967 }
968 #endif
969 
970 #if ENABLE_FALLBACK_UCS2BE
iconv_from_ucs2be(strc_t in,UBYTE enc)971 static strc_t iconv_from_ucs2be (strc_t in, UBYTE enc)
972 {
973     static str_s str = { NULL, 0, 0 };
974     UDWORD ucs;
975     int off;
976 
977     s_init (&str, "", in->len);
978     for (off = 0; off < in->len; )
979     {
980         if (off + 1 >= in->len)
981         {
982             s_catc (&str, CHAR_INCOMPLETE);
983             break;
984         }
985 
986         ucs = (UBYTE)in->txt[off++] << 8;
987         ucs |= (UBYTE)in->txt[off++];
988         if ((ucs & 0xf800) == 0xd800)
989             s_catc (&str, CHAR_BROKEN);
990         else
991             s_cat (&str, ConvUTF8 (ucs));
992     }
993     return &str;
994 }
995 
iconv_to_ucs2be(strc_t in,UBYTE enc)996 static strc_t iconv_to_ucs2be (strc_t in, UBYTE enc)
997 {
998     static str_s str = { NULL, 0, 0 };
999     UDWORD ucs;
1000     int off;
1001 
1002     s_init (&str, "", in->len);
1003     for (off = 0; off < in->len; )
1004     {
1005         ucs = ConvGetUTF8 (in, &off);
1006         if (ucs & 0xffff0000)
1007         {
1008             s_catc (&str, 0);
1009             s_catc (&str, CHAR_NOT_AVAILABLE);
1010         }
1011         else
1012         {
1013             s_catc (&str, (ucs >> 8) & 0xff);
1014             s_catc (&str, ucs & 0xff);
1015         }
1016     }
1017     return &str;
1018 }
1019 #endif
1020 
1021 #if ENABLE_FALLBACK_WCHART
iconv_from_wchart(strc_t in,UBYTE enc)1022 static strc_t iconv_from_wchart (strc_t in, UBYTE enc)
1023 {
1024     static str_s str = { NULL, 0, 0 };
1025     UDWORD ucs;
1026     int off;
1027 
1028     s_init (&str, "", in->len);
1029     for (off = 0; off < in->len; )
1030     {
1031         if (off + sizeof (wchar_t) > in->len)
1032         {
1033             s_catc (&str, CHAR_INCOMPLETE);
1034             break;
1035         }
1036 
1037         ucs = *((wchar_t *)(in->txt + off));
1038         off += sizeof (wchar_t);
1039         if ((ucs & 0xf800) == 0xd800)
1040             s_catc (&str, CHAR_BROKEN);
1041         else
1042             s_cat (&str, ConvUTF8 (ucs));
1043     }
1044     return &str;
1045 }
1046 
iconv_to_wchart(strc_t in,UBYTE enc)1047 static strc_t iconv_to_wchart (strc_t in, UBYTE enc)
1048 {
1049     static str_s str = { NULL, 0, 0 };
1050     UDWORD ucs;
1051     wchar_t na = CHAR_NOT_AVAILABLE;
1052     int off;
1053 
1054     s_init (&str, "", in->len);
1055     for (off = 0; off < in->len; )
1056     {
1057         ucs = ConvGetUTF8 (in, &off);
1058         if ((ucs & 0xf800) == 0xd800)
1059             s_catc (&str, CHAR_BROKEN);
1060         else if (   (sizeof (wchar_t) <= 1 && ucs & 0xffffff00)
1061                  || (sizeof (wchar_t) <= 2 && ucs & 0xffff0000))
1062             s_catn (&str, (const char *)&na, sizeof (wchar_t));
1063         else
1064             s_catn (&str, (const char *)&ucs, sizeof (wchar_t));
1065     }
1066     return &str;
1067 }
1068 #endif
1069 #endif /* ENABLE_FALLBACK_* */
1070