1 /*****************************************************************************
2
3 NAME:
4 charset_iconv.c -- provide charset support using iconv().
5
6 Note:
7
8 Character translation is done to make life easier for the lexer.
9 Text is changed only after the message has been saved for
10 passthrough. The end user (mail reader) never sees any changes -
11 only the lexer.
12
13 AUTHOR:
14 David Relson <relson@osagesoftware.com> 2005
15
16 ******************************************************************************/
17
18 #include "common.h"
19
20 #include <ctype.h>
21 #include <errno.h>
22 #include <stdlib.h>
23 #include <string.h>
24
25 #include "charset.h"
26 #include "convert_unicode.h"
27 #include "chUnicodeTo866.h"
28 #include "xmalloc.h"
29 #include "xstrdup.h"
30
31 #define SP ' '
32
33 #include <iconv.h>
/* Shared iconv conversion descriptor; (iconv_t)-1 means none is open.
 * init_charset_table_iconv() closes any open descriptor and stores the
 * result of bf_iconv_open() here. */
iconv_t cd = (iconv_t)-1;
35
map_nonascii_characters(void)36 static void map_nonascii_characters(void)
37 {
38 uint ch;
39 for (ch = 0; ch < COUNTOF(charset_table); ch += 1)
40 {
41 /* convert high-bit characters to '?' */
42 if (ch & 0x80 && casefold_table[ch] == ch)
43 casefold_table[ch] = '?';
44 }
45 }
46
map_default(void)47 static void map_default(void)
48 {
49 unsigned int ch;
50
51 for (ch = 0; ch < COUNTOF(charset_table); ch += 1)
52 {
53 charset_table[ch] = casefold_table[ch] = ch;
54 }
55
56 for (ch=0; ch < COUNTOF(charset_table); ch += 1)
57 {
58 if (iscntrl(ch) && /* convert control characters to blanks */
59 ch != '\t' && ch != '\n') /* except tabs and newlines */
60 charset_table[ch] = SP;
61 }
62 }
63
/* One row of the charsets[] table. */
struct charset_def {
    /* Charset name; init_charset_table_iconv() compares it with
     * strcasecmp(), so case does not matter. */
    const char *name;
    /* When true (and replace_nonascii_characters is set),
     * init_charset_table_iconv() calls map_nonascii_characters()
     * for this charset. */
    bool allow_nonascii_replacement;
};

typedef struct charset_def charset_def_t;
68
69 #define T true
70 #define F false
71
72 static charset_def_t charsets[] = {
73 { "default", T },
74 { "us-ascii", T },
75 { "utf-8", T },
76 { "iso8859-1", T }, /* ISOIEC 8859-1:1998 Latin Alphabet No. 1 */
77 /* tests/t.systest.d/inputs/spam.mbx is iso-8859-1 and contains
78 * 8-bit characters - " �Your Account� "---a typical case of a
79 * message that should have declared windows-1252 instead */
80 { "iso8859-2", F }, /* ISOIEC 8859-2:1999 Latin Alphabet No. 2 */
81 { "iso8859-3", F }, /* ISOIEC 8859-3:1999 Latin Alphabet No. 3 */
82 { "iso8859-4", F }, /* ISOIEC 8859-4:1998 Latin Alphabet No. 4 */
83 { "iso8859-5", F }, /* ISOIEC 8859-5:1999 LatinCyrillic Alphabet */
84 { "iso8859-6", F }, /* ISOIEC 8859-6:1999 LatinArabic Alphabet */
85 { "iso8859-7", F }, /* ISO 8859-7:1987 LatinGreek Alphabet */
86 { "iso8859-8", F }, /* ISOIEC 8859-8:1999 LatinHebrew Alphabet */
87 { "iso8859-9", F }, /* ISOIEC 8859-9:1999 Latin Alphabet No. 5 */
88 { "iso8859-10", F }, /* ISOIEC 8859-10:1998 Latin Alphabet No. 6 */
89 { "iso8859-13", F }, /* ISOIEC 8859-13:1998 Latin Alphabet No. 7 (Baltic Rim)*/
90 { "iso8859-14", F }, /* ISOIEC 8859-14:1998 Latin Alphabet No. 8 (Celtic) */
91 { "iso8859-15", F }, /* ISOIEC 8859-15:1999 Latin Alphabet No. 9 */
92 { "cp866", F },
93 { "koi8-r", F },
94 { "windows-1251", F },
95 { "windows-1252", T },
96 { "windows-1256", T },
97 { "iso2022-jp", T }, /* rfc-1468 - japanese */
98 { "euc-kr", T }, /* extended unix code for korean */
99 { "iso2022-kr", T }, /* korean standard code (7-bit)*/
100 { "ks-c-5601-1987", T }, /* korean standard (default) */
101 { "big5", T },
102 { "csbig5", T },
103 { "gb2312", T },
104 { "csgb2312", T },
105 };
106
bf_iconv_open(const char * to_charset,const char * from_charset)107 iconv_t bf_iconv_open( const char *to_charset, const char *from_charset )
108 {
109 iconv_t xd = iconv_open( to_charset, from_charset );
110
111 if (xd == (iconv_t)-1) {
112 int err = errno;
113 if (err == EINVAL) {
114 if (DEBUG_ICONV(1))
115 fprintf(dbgout, "Conversion from '%s' to '%s' is not supported.\n",
116 from_charset, to_charset );
117 /* error - map default charset to unicode */
118 xd = iconv_open( charset_unicode, charset_default );
119 }
120 }
121
122 return xd;
123 }
124
init_charset_table_iconv(const char * from_charset,const char * to_charset)125 void init_charset_table_iconv(const char *from_charset, const char *to_charset)
126 {
127 uint idx;
128
129 if (cd != (iconv_t)-1)
130 iconv_close(cd);
131
132 if (DEBUG_ICONV(1))
133 fprintf(dbgout, "converting %s to %s\n", from_charset, to_charset);
134
135 if (strcasecmp( from_charset, "default" ) == 0)
136 from_charset = charset_default;
137
138 cd = bf_iconv_open( to_charset, from_charset );
139
140 for (idx = 0; idx < COUNTOF(charsets); idx += 1)
141 {
142 charset_def_t *charset = &charsets[idx];
143 if (strcasecmp(charset->name, to_charset) == 0)
144 {
145 map_default(); /* Setup the table defaults. */
146 if (replace_nonascii_characters)
147 if (charset->allow_nonascii_replacement)
148 map_nonascii_characters();
149 break;
150 }
151 }
152
153 return;
154 }
155