1 /*
2 * Copyright (c) 2011 QUALCOMM Incorporated. All rights reserved.
3 * The file license.txt specifies the terms for use, modification,
4 * and redistribution.
5 *
6 *
7 * Revisions:
8 * 03/09/00 [rcg]
9 * - Cleaned up nits to appease compilers.
10 *
11 * 07/22/98 [py]
12 * - Created.
13 *
14 */
15
16 #include "config.h"
17
18 #include <mime.h>
19 #include <charmangle.h>
20 #include <mangle.h>
21
22 typedef struct tagCHMangle {
23 OutputFn *m_pfOut;
24 void *m_pvOut;
25 TextCharSetType m_partCharSet;
26 TextCharSetType m_reqCharSet;
27 } CHMangle;
28
29 static CHMangle hm;
30
/*
 * CharManglerInit
 *
 * Store the output callback, its state pointer and the source/target
 * character sets in the file-static mangler context.
 *
 * Parameters:
 *    oFn          callback that receives the mangled text.
 *    oFnState     opaque pointer handed back to oFn on every call.
 *    partCharSet  character set of the text that will be fed in.
 *    reqCharSet   character set the output should be in.
 *
 * Returns a pointer to the context.  There is only one context object,
 * so each call overwrites the settings made by the previous one.
 */
void *CharManglerInit ( OutputFn oFn, void *oFnState,
                        TextCharSetType partCharSet,
                        TextCharSetType reqCharSet )
{
    CHMangle *pCtx = &hm;

    pCtx->m_reqCharSet  = reqCharSet;
    pCtx->m_partCharSet = partCharSet;
    pCtx->m_pvOut       = oFnState;
    pCtx->m_pfOut       = oFn;

    return pCtx;
}
40
/* Substitute written in place of an octet that the target charset cannot carry. */
#define BLAT '_'
42
43 /* Define mappings from the ISO-8859-x charsets to Unicode. Since all are
44 * the same from 0x00 - 0xA0, we start at 0xA1, unless otherwise noted.
45 * -1 is the same as Unicode, so there is no need for a mapping for it.
46 * That leaves -2 through -10.
47 */
48
/*
 * ISO-8859-2 (Latin-2) to Unicode.  Entry [n] is the code point for
 * octet 0xA1 + n; 95 entries cover 0xA1 through 0xFF.
 */
static const unsigned int ISO8859_2[] = {
    0x0104, 0x02D8, 0x0141, 0x00A4, 0x013D, 0x015A, 0x00A7, 0x00A8, 0x0160, 0x015E,  /* A1-AA */
    0x0164, 0x0179, 0x00AD, 0x017D, 0x017B, 0x00B0, 0x0105, 0x02DB, 0x0142, 0x00B4,  /* AB-B4 */
    0x013E, 0x015B, 0x02C7, 0x00B8, 0x0161, 0x015F, 0x0165, 0x017A, 0x02DD, 0x017E,  /* B5-BE */
    0x017C, 0x0154, 0x00C1, 0x00C2, 0x0102, 0x00C4, 0x0139, 0x0106, 0x00C7, 0x010C,  /* BF-C8 */
    0x00C9, 0x0118, 0x00CB, 0x011A, 0x00CD, 0x00CE, 0x010E, 0x0110, 0x0143, 0x0147,  /* C9-D2 */
    0x00D3, 0x00D4, 0x0150, 0x00D6, 0x00D7, 0x0158, 0x016E, 0x00DA, 0x0170, 0x00DC,  /* D3-DC */
    0x00DD, 0x0162, 0x00DF, 0x0155, 0x00E1, 0x00E2, 0x0103, 0x00E4, 0x013A, 0x0107,  /* DD-E6 */
    0x00E7, 0x010D, 0x00E9, 0x0119, 0x00EB, 0x011B, 0x00ED, 0x00EE, 0x010F, 0x0111,  /* E7-F0 */
    0x0144, 0x0148, 0x00F3, 0x00F4, 0x0151, 0x00F6, 0x00F7, 0x0159, 0x016F, 0x00FA,  /* F1-FA */
    0x0171, 0x00FC, 0x00FD, 0x0163, 0x02D9                                           /* FB-FF */
};
62
/*
 * ISO-8859-3 (Latin-3) to Unicode.  Entry [n] is the code point for
 * octet 0xA1 + n; 95 entries cover 0xA1 through 0xFF.  Positions the
 * charset leaves unassigned are filled with a plain space.
 */
static const unsigned int ISO8859_3[] = {
    0x0126, 0x02D8, 0x00A3, 0x00A4, ' ',    0x0124, 0x00A7, 0x00A8, 0x0130, 0x015E,  /* A1-AA */
    0x011E, 0x0134, 0x00AD, ' ',    0x017B, 0x00B0, 0x0127, 0x00B2, 0x00B3, 0x00B4,  /* AB-B4 */
    0x00B5, 0x0125, 0x00B7, 0x00B8, 0x0131, 0x015F, 0x011F, 0x0135, 0x00BD, ' ',     /* B5-BE */
    0x017C, 0x00C0, 0x00C1, 0x00C2, ' ',    0x00C4, 0x010A, 0x0108, 0x00C7, 0x00C8,  /* BF-C8 */
    0x00C9, 0x00CA, 0x00CB, 0x00CC, 0x00CD, 0x00CE, 0x00CF, ' ',    0x00D1, 0x00D2,  /* C9-D2 */
    0x00D3, 0x00D4, 0x0120, 0x00D6, 0x00D7, 0x011C, 0x00D9, 0x00DA, 0x00DB, 0x00DC,  /* D3-DC */
    0x016C, 0x015C, 0x00DF, 0x00E0, 0x00E1, 0x00E2, ' ',    0x00E4, 0x010B, 0x0109,  /* DD-E6 */
    0x00E7, 0x00E8, 0x00E9, 0x00EA, 0x00EB, 0x00EC, 0x00ED, 0x00EE, 0x00EF, ' ',     /* E7-F0 */
    0x00F1, 0x00F2, 0x00F3, 0x00F4, 0x0121, 0x00F6, 0x00F7, 0x011D, 0x00F9, 0x00FA,  /* F1-FA */
    0x00FB, 0x00FC, 0x016D, 0x015D, 0x02D9                                           /* FB-FF */
};
76
/*
 * ISO-8859-4 (Latin-4) to Unicode.  Entry [n] is the code point for
 * octet 0xA1 + n; 95 entries cover 0xA1 through 0xFF.
 */
static const unsigned int ISO8859_4[] = {
    0x0104, 0x0138, 0x0156, 0x00A4, 0x0128, 0x013B, 0x00A7, 0x00A8, 0x0160, 0x0112,  /* A1-AA */
    0x0122, 0x0166, 0x00AD, 0x017D, 0x00AF, 0x00B0, 0x0105, 0x02DB, 0x0157, 0x00B4,  /* AB-B4 */
    0x0129, 0x013C, 0x02C7, 0x00B8, 0x0161, 0x0113, 0x0123, 0x0167, 0x014A, 0x017E,  /* B5-BE */
    0x014B, 0x0100, 0x00C1, 0x00C2, 0x00C3, 0x00C4, 0x00C5, 0x00C6, 0x012E, 0x010C,  /* BF-C8 */
    0x00C9, 0x0118, 0x00CB, 0x0116, 0x00CD, 0x00CE, 0x012A, 0x0110, 0x0145, 0x014C,  /* C9-D2 */
    0x0136, 0x00D4, 0x00D5, 0x00D6, 0x00D7, 0x00D8, 0x0172, 0x00DA, 0x00DB, 0x00DC,  /* D3-DC */
    0x0168, 0x016A, 0x00DF, 0x0101, 0x00E1, 0x00E2, 0x00E3, 0x00E4, 0x00E5, 0x00E6,  /* DD-E6 */
    0x012F, 0x010D, 0x00E9, 0x0119, 0x00EB, 0x0117, 0x00ED, 0x00EE, 0x012B, 0x0111,  /* E7-F0 */
    0x0146, 0x014D, 0x0137, 0x00F4, 0x00F5, 0x00F6, 0x00F7, 0x00F8, 0x0173, 0x00FA,  /* F1-FA */
    0x00FB, 0x00FC, 0x0169, 0x016B, 0x02D9                                           /* FB-FF */
};
90
/*
 * ISO-8859-5 (Cyrillic) to Unicode.  Entry [n] is the code point for
 * octet 0xA1 + n; 95 entries cover 0xA1 through 0xFF.
 */
static const unsigned int ISO8859_5[] = {
    0x0401, 0x0402, 0x0403, 0x0404, 0x0405, 0x0406, 0x0407, 0x0408, 0x0409, 0x040A,  /* A1-AA */
    0x040B, 0x040C, 0x00AD, 0x040E, 0x040F, 0x0410, 0x0411, 0x0412, 0x0413, 0x0414,  /* AB-B4 */
    0x0415, 0x0416, 0x0417, 0x0418, 0x0419, 0x041A, 0x041B, 0x041C, 0x041D, 0x041E,  /* B5-BE */
    0x041F, 0x0420, 0x0421, 0x0422, 0x0423, 0x0424, 0x0425, 0x0426, 0x0427, 0x0428,  /* BF-C8 */
    0x0429, 0x042A, 0x042B, 0x042C, 0x042D, 0x042E, 0x042F, 0x0430, 0x0431, 0x0432,  /* C9-D2 */
    0x0433, 0x0434, 0x0435, 0x0436, 0x0437, 0x0438, 0x0439, 0x043A, 0x043B, 0x043C,  /* D3-DC */
    0x043D, 0x043E, 0x043F, 0x0440, 0x0441, 0x0442, 0x0443, 0x0444, 0x0445, 0x0446,  /* DD-E6 */
    0x0447, 0x0448, 0x0449, 0x044A, 0x044B, 0x044C, 0x044D, 0x044E, 0x044F, 0x2116,  /* E7-F0 */
    0x0451, 0x0452, 0x0453, 0x0454, 0x0455, 0x0456, 0x0457, 0x0458, 0x0459, 0x045A,  /* F1-FA */
    0x045B, 0x045C, 0x00A7, 0x045E, 0x045F                                           /* FB-FF */
};
104
/*
 * ISO-8859-6 (Arabic) to Unicode.  Entry [n] is the code point for
 * octet 0xA1 + n; 95 entries cover 0xA1 through 0xFF.  Positions the
 * charset leaves unassigned are filled with a plain space.
 */
static const unsigned int ISO8859_6[] = {
    ' ',    ' ',    ' ',    0x00A4, ' ',    ' ',    ' ',    ' ',    ' ',    ' ',     /* A1-AA */
    ' ',    0x060C, 0x00AD, ' ',    ' ',    ' ',    ' ',    ' ',    ' ',    ' ',     /* AB-B4 */
    ' ',    ' ',    ' ',    ' ',    ' ',    ' ',    0x061B, ' ',    ' ',    ' ',     /* B5-BE */
    0x061F, ' ',    0x0621, 0x0622, 0x0623, 0x0624, 0x0625, 0x0626, 0x0627, 0x0628,  /* BF-C8 */
    0x0629, 0x062A, 0x062B, 0x062C, 0x062D, 0x062E, 0x062F, 0x0630, 0x0631, 0x0632,  /* C9-D2 */
    0x0633, 0x0634, 0x0635, 0x0636, 0x0637, 0x0638, 0x0639, 0x063A, ' ',    ' ',     /* D3-DC */
    ' ',    ' ',    ' ',    0x0640, 0x0641, 0x0642, 0x0643, 0x0644, 0x0645, 0x0646,  /* DD-E6 */
    0x0647, 0x0648, 0x0649, 0x064A, 0x064B, 0x064C, 0x064D, 0x064E, 0x064F, 0x0650,  /* E7-F0 */
    0x0651, 0x0652, ' ',    ' ',    ' ',    ' ',    ' ',    ' ',    ' ',    ' ',     /* F1-FA */
    ' ',    ' ',    ' ',    ' ',    ' '                                              /* FB-FF */
};
119
/*
 * ISO-8859-7 (Greek) to Unicode.  Entry [n] is the code point for
 * octet 0xA1 + n.  Unlike its sibling tables this one holds only 94
 * entries (0xA1 through 0xFE): there is no entry for octet 0xFF, so
 * callers must not index it with 0xFF - 0xA1.  Positions the charset
 * leaves unassigned are filled with a plain space.
 */
static const unsigned int ISO8859_7[] = {
    0x02BD, 0x02BC, 0x00A3, ' ',    ' ',    0x00A6, 0x00A7, 0x00A8, 0x00A9, ' ',     /* A1-AA */
    0x00AB, 0x00AC, 0x00AD, ' ',    0x2015, 0x00B0, 0x00B1, 0x00B2, 0x00B3, 0x0384,  /* AB-B4 */
    0x0385, 0x0386, 0x00B7, 0x0388, 0x0389, 0x038A, 0x00BB, 0x038C, 0x00BD, 0x038E,  /* B5-BE */
    0x038F, 0x0390, 0x0391, 0x0392, 0x0393, 0x0394, 0x0395, 0x0396, 0x0397, 0x0398,  /* BF-C8 */
    0x0399, 0x039A, 0x039B, 0x039C, 0x039D, 0x039E, 0x039F, 0x03A0, 0x03A1, ' ',     /* C9-D2 */
    0x03A3, 0x03A4, 0x03A5, 0x03A6, 0x03A7, 0x03A8, 0x03A9, 0x03AA, 0x03AB, 0x03AC,  /* D3-DC */
    0x03AD, 0x03AE, 0x03AF, 0x03B0, 0x03B1, 0x03B2, 0x03B3, 0x03B4, 0x03B5, 0x03B6,  /* DD-E6 */
    0x03B7, 0x03B8, 0x03B9, 0x03BA, 0x03BB, 0x03BC, 0x03BD, 0x03BE, 0x03BF, 0x03C0,  /* E7-F0 */
    0x03C1, 0x03C2, 0x03C3, 0x03C4, 0x03C5, 0x03C6, 0x03C7, 0x03C8, 0x03C9, 0x03CA,  /* F1-FA */
    0x03CB, 0x03CC, 0x03CD, 0x03CE                                                   /* FB-FE */
};
133
/*
 * Windows code page 1252 to Unicode, for the one range where it differs
 * from ISO-8859-1.  Entry [n] is the code point for octet 0x80 + n;
 * 32 entries cover 0x80 through 0x9F.  Positions the code page leaves
 * unassigned are filled with a plain space.
 */
static const unsigned int CP1252[] = {
    0x20AC, ' ',    0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021, 0x02C6, 0x2030,  /* 80-89 */
    0x0160, 0x2039, 0x0152, ' ',    0x017D, ' ',    ' ',    0x2018, 0x2019, 0x201C,  /* 8A-93 */
    0x201D, 0x2022, 0x2013, 0x2014, 0x02DC, 0x2122, 0x0161, 0x203A, 0x0153, ' ',     /* 94-9D */
    0x017E, 0x0178                                                                   /* 9E-9F */
};
141
142
/*
 * Iso8859_to_Unicode
 *
 * Map one octet of the given single-byte character set to its Unicode
 * code point.
 *
 *    us_ascii           the octet with bit 7 cleared.
 *    iso_8859_1         the octet itself.
 *    iso_8859_2 .. _7   octets below 0xA1 map to themselves; the rest
 *                       are looked up in that charset's own table.
 *    cp1252             0x80-0x9F are looked up in CP1252[]; everything
 *                       else maps to itself.
 *    anything else      no table yet (todo), the octet is returned as is.
 *
 * Each charset now indexes its own table: iso_8859_5 and iso_8859_6 used
 * to read the ISO-8859-4 table, and iso_8859_7 had no case at all.
 *
 * An octet that lies past the end of its table (only 0xFF in ISO-8859-7,
 * whose table stops at 0xFE) yields a space, the same placeholder the
 * tables use for unassigned positions.
 */
static unsigned int Iso8859_to_Unicode( TextCharSetType CharSet, unsigned char character )
{
    const unsigned int *pTable;
    unsigned int        nEntries;
    unsigned int        nIndex;

    switch ( CharSet ) {
    case us_ascii:
        return ( character & 0x7F );
    case iso_8859_1:
        return character;
    case cp1252:
        if ( character >= 0x80 && character <= 0x9F )
            return CP1252[character - 0x80];
        return character;
    case iso_8859_2:
        pTable   = ISO8859_2;
        nEntries = sizeof ISO8859_2 / sizeof ISO8859_2[0];
        break;
    case iso_8859_3:
        pTable   = ISO8859_3;
        nEntries = sizeof ISO8859_3 / sizeof ISO8859_3[0];
        break;
    case iso_8859_4:
        pTable   = ISO8859_4;
        nEntries = sizeof ISO8859_4 / sizeof ISO8859_4[0];
        break;
    case iso_8859_5:
        pTable   = ISO8859_5;
        nEntries = sizeof ISO8859_5 / sizeof ISO8859_5[0];
        break;
    case iso_8859_6:
        pTable   = ISO8859_6;
        nEntries = sizeof ISO8859_6 / sizeof ISO8859_6[0];
        break;
    case iso_8859_7:
        pTable   = ISO8859_7;
        nEntries = sizeof ISO8859_7 / sizeof ISO8859_7[0];
        break;
    default:
        /* todo for the rest of ISO8859. */
        return character;
    }

    /* All the table-driven charsets agree with Unicode below 0xA1. */
    if ( character < 0xA1 )
        return character;

    nIndex = (unsigned int) character - 0xA1;
    if ( nIndex >= nEntries )
        return ' ';                 /* table is shorter than 0xA1..0xFF */

    return pTable[nIndex];
}
169
170
/*
 * Unicode_to_Utf8
 *
 * Encode one code point (actually UCS-4: anything up to 0x7FFFFFFF) in
 * the original 1-to-6 octet form of UTF-8.
 *
 * Parameters:
 *    unicode  the code point to encode.
 *    buf      receives the encoded octets; must have room for 6.
 *    nSize    receives the number of octets written to buf.
 *
 * A value above 0x7FFFFFFF cannot be represented.  In that case nothing
 * is written and *nSize is set to 0, so the caller never acts on a stale
 * length or on uninitialised octets.
 */
static void Unicode_to_Utf8( unsigned int unicode, unsigned char *buf, int *nSize )
{
    int nOctets;
    int i;

    if ( unicode <= 0x0000007F ) {
        buf[0] = (unsigned char) unicode;
        *nSize = 1;
        return;
    }

    if ( unicode <= 0x000007FF )
        nOctets = 2;
    else if ( unicode <= 0x0000FFFF )
        nOctets = 3;
    else if ( unicode <= 0x001FFFFF )
        nOctets = 4;
    else if ( unicode <= 0x03FFFFFF )
        nOctets = 5;
    else if ( unicode <= 0x7FFFFFFF )
        nOctets = 6;
    else {
        *nSize = 0;                 /* not encodable */
        return;
    }

    /*
     * Lead octet: nOctets one-bits, a zero bit, then the topmost payload
     * bits.  (0xFF00 >> nOctets) & 0xFF gives 0xC0, 0xE0, 0xF0, 0xF8, 0xFC
     * for 2..6 octets.  The range checks above guarantee the shifted
     * payload fits in the bits that remain.
     */
    buf[0] = (unsigned char) ( ( ( 0xFF00u >> nOctets ) & 0xFFu ) |
                               ( unicode >> ( 6 * ( nOctets - 1 ) ) ) );

    /* Continuation octets: 10xxxxxx, six payload bits each, high to low. */
    for ( i = 1; i < nOctets; i++ )
        buf[i] = (unsigned char) ( 0x80u |
                                   ( ( unicode >> ( 6 * ( nOctets - 1 - i ) ) ) & 0x3Fu ) );

    *nSize = nOctets;
}
204
CharMangler(void * state,char * szText,long len)205 void CharMangler ( void *state, char *szText, long len )
206 {
207 int i;
208 (void) len;
209 (void) state;
210
211
212 switch ( hm.m_reqCharSet ) {
213
214 case us_ascii:
215 switch ( hm.m_partCharSet ) {
216 case utf_8:
217 for ( i = 0; i < len; i++ ) {
218 switch ( szText[i] & 0xC0 ) {
219 case 0xC0:
220 szText[i] = BLAT;
221 break;
222 case 0x80:
223 break;
224 default:
225 hm.m_pfOut ( hm.m_pvOut, &szText[i], 1 );
226 break;
227 } /* switch ( szText[i] & 0xC0 ) */
228 } /* for loop */
229 break;
230 case us_ascii: /* Pass */
231 hm.m_pfOut ( hm.m_pvOut, szText, len );
232 break;
233 default:
234 /* Filter the octets with b7 set */
235 for ( i = 0; i < len; i++ ) {
236 if ( ( (unsigned char) szText[i] != 0xA0 ) &&
237 ( (unsigned char) szText[i] & 0x80 ) ) {
238 szText [ i ] = BLAT;
239 }
240 } /* for loop */
241 hm.m_pfOut ( hm.m_pvOut, szText, len );
242 } /* switch ( hm.m_partCharSet ) */
243 break;
244
245 case iso_8859_1:
246 case iso_8859_2:
247 case iso_8859_3:
248 case iso_8859_4:
249 case iso_8859_9:
250 case iso_8859_10: /* Latin 1 thru 5 */
251 switch ( hm.m_partCharSet ) {
252 case us_ascii:
253 case iso_8859_1:
254 case iso_8859_2:
255 case iso_8859_3:
256 case iso_8859_4:
257 case iso_8859_9:
258 case iso_8859_10: /* Pass */
259 hm.m_pfOut ( hm.m_pvOut, szText, len );
260 break;
261 case utf_8:
262 /* todo */
263 break;
264 case iso_8859_5:
265 case iso_8859_6:
266 case iso_8859_7:
267 case iso_8859_8: /* Octets with b7 set replaced with 'blat' */
268 default:
269 for ( i = 0; i < len; i++ ) {
270 if ( szText[i] & 0x80 ) {
271 szText[i] = '_';
272 }
273 } /* for loop */
274 hm.m_pfOut ( hm.m_pvOut, szText, len );
275 break;
276 } /* switch ( hm.m_partCharSet ) */
277 break;
278
279 case iso_8859_5:
280 case iso_8859_6:
281 case iso_8859_7:
282 case iso_8859_8:
283 switch ( hm.m_partCharSet ) {
284 case utf_8:
285 break;
286 default:
287 /* Filter the octets with b7 set */
288 for ( i = 0; i < len; i++ ) {
289 if ( szText[i] & 0x80 ) {
290 szText[i] = '_';
291 }
292 } /* for loop */
293 hm.m_pfOut ( hm.m_pvOut, szText, len );
294 } /* switch ( hm.m_partCharSet ) */
295 break;
296
297 case cp1252:
298 switch ( hm.m_partCharSet ) {
299 case us_ascii:
300 case iso_8859_1:
301 case iso_8859_2:
302 case iso_8859_3:
303 case iso_8859_4:
304 case iso_8859_9:
305 case iso_8859_10:
306 hm.m_pfOut ( hm.m_pvOut, szText, len );
307 break;
308 default:
309 /* Filter the octets with b7 set */
310 for ( i = 0; i < len; i++ ) {
311 if ( szText[i] & 0x80 ) {
312 szText[i] = '_';
313 }
314 } /* for loop */
315 hm.m_pfOut ( hm.m_pvOut, szText, len) ;
316 } /* switch ( hm.m_partCharSet ) */
317 break;
318
319 case iso_2022_jp:
320 break;
321
322 case utf_8:
323 switch ( hm.m_partCharSet ) {
324 case utf_8:
325 case us_ascii:
326 /* Pass */
327 hm.m_pfOut ( hm.m_pvOut, szText, len );
328 break;
329 case iso_8859_1:
330 case iso_8859_2:
331 case iso_8859_3:
332 case iso_8859_4:
333 case iso_8859_5:
334 case iso_8859_6:
335 case iso_8859_7:
336 case iso_8859_8:
337 case iso_8859_9:
338 { /* local block */
339 unsigned char szBuf[6];
340 int nS = 6;
341 unsigned int unicode;
342
343 for ( i = 0; i < len; i++ ) {
344 if ( ( (unsigned char) szText[i] ) > 0xA0 ) {
345 unicode = Iso8859_to_Unicode ( hm.m_partCharSet, szText[i] );
346 Unicode_to_Utf8 ( unicode, szBuf, &nS );
347 hm.m_pfOut ( hm.m_pvOut, (char *)szBuf, nS );
348 }
349 else
350 hm.m_pfOut ( hm.m_pvOut, &szText[i], 1 );
351 } /* for loop */
352 } /* local block */
353 break;
354 case iso_8859_10:
355 /* to do */
356 break;
357 case iso_8859_11:
358 /* to do */
359 break;
360 case iso_8859_12:
361 /* to do */
362 break;
363 case iso_8859_13:
364 /* to do */
365 break;
366 case iso_8859_14:
367 /* to do */
368 break;
369 case iso_8859_15:
370 /* to do */
371 break;
372 case cp1252:
373 /* to do */
374 break;
375 case iso_2022_jp:
376 /* to do */
377 break;
378 } /* switch ( hm.m_partCharSet ) */
379 break;
380
381 default:
382 break;
383 } /* switch ( hm.m_reqCharSet ) */
384 }
385
386