1 /*****************************************************************************
2 
3 NAME:
4    charset_iconv.c -- provide charset support using iconv().
5 
6 Note:
7 
8    Character translation is done to make life easier for the lexer.
9    Text is changed only after the message has been saved for
10    passthrough.  The end user (mail reader) never sees any changes -
11    only the lexer.
12 
13 AUTHOR:
14    David Relson <relson@osagesoftware.com>  2005
15 
16 ******************************************************************************/
17 
18 #include "common.h"
19 
20 #include <ctype.h>
21 #include <errno.h>
22 #include <stdlib.h>
23 #include <string.h>
24 
25 #include "charset.h"
26 #include "convert_unicode.h"
27 #include "chUnicodeTo866.h"
28 #include "xmalloc.h"
29 #include "xstrdup.h"
30 
31 #define	SP	' '
32 
33 #include <iconv.h>
/* Current iconv conversion descriptor.  (iconv_t)-1 marks "none open";
 * init_charset_table_iconv() closes any other value before it stores
 * the descriptor returned by bf_iconv_open(). */
iconv_t cd = (iconv_t)-1;
35 
map_nonascii_characters(void)36 static void map_nonascii_characters(void)
37 {
38     uint ch;
39     for (ch = 0; ch < COUNTOF(charset_table); ch += 1)
40     {
41 	/* convert high-bit characters to '?' */
42 	if (ch & 0x80 && casefold_table[ch] == ch)
43 	    casefold_table[ch] = '?';
44     }
45 }
46 
map_default(void)47 static void map_default(void)
48 {
49     unsigned int ch;
50 
51     for (ch = 0; ch < COUNTOF(charset_table); ch += 1)
52     {
53 	charset_table[ch] = casefold_table[ch] = ch;
54     }
55 
56     for (ch=0; ch < COUNTOF(charset_table); ch += 1)
57     {
58 	if (iscntrl(ch) &&		/* convert control characters to blanks */
59 	    ch != '\t' && ch != '\n')	/* except tabs and newlines		*/
60 	    charset_table[ch] = SP;
61     }
62 }
63 
/* One entry of the charsets[] table searched by init_charset_table_iconv(). */
typedef struct charset_def {
    const char *name;			/* charset name; matched with strcasecmp() */
    bool allow_nonascii_replacement;	/* true: map_nonascii_characters() may be applied */
} charset_def_t;
68 
69 #define	T	true
70 #define	F	false
71 
/* Charsets known to init_charset_table_iconv().
 *
 * The second field (T/F) is allow_nonascii_replacement: when it is true
 * and replace_nonascii_characters is set, high-bit characters that are
 * still unmapped get replaced by '?' (see map_nonascii_characters()).
 */
static charset_def_t charsets[] = {
    { "default",	T },
    { "us-ascii",	T },
    { "utf-8",		T },
    { "iso8859-1",	T },		/* ISOIEC 8859-1:1998 Latin Alphabet No. 1	*/
    /* tests/t.systest.d/inputs/spam.mbx is iso-8859-1 and contains
     * 8-bit characters around the text "Your Account" (presumably
     * windows-1252 quotation marks) --- a typical case of a
     * message that should have declared windows-1252 instead */
    { "iso8859-2",	F },		/* ISOIEC 8859-2:1999 Latin Alphabet No. 2	*/
    { "iso8859-3",	F },		/* ISOIEC 8859-3:1999 Latin Alphabet No. 3	*/
    { "iso8859-4",	F },		/* ISOIEC 8859-4:1998 Latin Alphabet No. 4	*/
    { "iso8859-5",	F },		/* ISOIEC 8859-5:1999 LatinCyrillic Alphabet	*/
    { "iso8859-6",	F },		/* ISOIEC 8859-6:1999 LatinArabic Alphabet	*/
    { "iso8859-7",	F },		/* ISO	  8859-7:1987 LatinGreek Alphabet	*/
    { "iso8859-8",	F },		/* ISOIEC 8859-8:1999 LatinHebrew Alphabet	*/
    { "iso8859-9",	F },		/* ISOIEC 8859-9:1999 Latin Alphabet No. 5	*/
    { "iso8859-10",	F },		/* ISOIEC 8859-10:1998 Latin Alphabet No. 6	*/
    { "iso8859-13",	F },		/* ISOIEC 8859-13:1998 Latin Alphabet No. 7 (Baltic Rim)*/
    { "iso8859-14",	F },		/* ISOIEC 8859-14:1998 Latin Alphabet No. 8 (Celtic)	*/
    { "iso8859-15",	F },		/* ISOIEC 8859-15:1999 Latin Alphabet No. 9		*/
    { "cp866",		F },
    { "koi8-r",		F },
    { "windows-1251",	F },
    { "windows-1252",	T },
    { "windows-1256",	T },
    { "iso2022-jp",	T },		/* rfc-1468 - japanese */
    { "euc-kr",		T },		/* extended unix code for korean */
    { "iso2022-kr",	T },		/* korean standard code (7-bit)*/
    { "ks-c-5601-1987",	T },		/* korean standard (default) */
    { "big5",		T },
    { "csbig5",		T },
    { "gb2312",		T },
    { "csgb2312",	T },
};
106 
bf_iconv_open(const char * to_charset,const char * from_charset)107 iconv_t bf_iconv_open( const char *to_charset, const char *from_charset )
108 {
109     iconv_t xd = iconv_open( to_charset, from_charset );
110 
111     if (xd == (iconv_t)-1) {
112 	int err = errno;
113 	if (err == EINVAL) {
114 	    if (DEBUG_ICONV(1))
115 		fprintf(dbgout, "Conversion from '%s' to '%s' is not supported.\n",
116 			from_charset, to_charset );
117 	    /* error - map default charset to unicode */
118 	    xd = iconv_open( charset_unicode, charset_default );
119 	}
120     }
121 
122     return xd;
123 }
124 
init_charset_table_iconv(const char * from_charset,const char * to_charset)125 void init_charset_table_iconv(const char *from_charset, const char *to_charset)
126 {
127     uint idx;
128 
129     if (cd != (iconv_t)-1)
130 	iconv_close(cd);
131 
132     if (DEBUG_ICONV(1))
133 	fprintf(dbgout, "converting %s to %s\n", from_charset, to_charset);
134 
135     if (strcasecmp( from_charset, "default" ) == 0)
136 	from_charset = charset_default;
137 
138     cd = bf_iconv_open( to_charset, from_charset );
139 
140     for (idx = 0; idx < COUNTOF(charsets); idx += 1)
141     {
142 	charset_def_t *charset = &charsets[idx];
143 	if (strcasecmp(charset->name, to_charset) == 0)
144 	{
145 	    map_default();	/* Setup the table defaults. */
146 	    if (replace_nonascii_characters)
147 		if (charset->allow_nonascii_replacement)
148 		    map_nonascii_characters();
149 	    break;
150 	}
151     }
152 
153     return;
154 }
155