1 /*
2  * chartrans.c
3  * Copyright (C) 1999-2004 A.J. van Os; Released under GNU GPL
4  *
5  * Description:
6  * Translate Word characters to local representation
7  */
8 
9 #include <stdlib.h>
10 #include <string.h>
11 #include <ctype.h>
12 #if defined(__STDC_ISO_10646__)
13 #include <wctype.h>
14 #endif /* __STDC_ISO_10646__ */
15 #include "antiword.h"
16 
17 static const USHORT usCp850[] = {	/* DOS implementation of Latin1 */
18 	0x00c7, 0x00fc, 0x00e9, 0x00e2, 0x00e4, 0x00e0, 0x00e5, 0x00e7,
19 	0x00ea, 0x00eb, 0x00e8, 0x00ef, 0x00ee, 0x00ec, 0x00c4, 0x00c5,
20 	0x00c9, 0x00e6, 0x00c6, 0x00f4, 0x00f6, 0x00f2, 0x00fb, 0x00f9,
21 	0x00ff, 0x00d6, 0x00dc, 0x00f8, 0x00a3, 0x00d8, 0x00d7, 0x0192,
22 	0x00e1, 0x00ed, 0x00f3, 0x00fa, 0x00f1, 0x00d1, 0x00aa, 0x00ba,
23 	0x00bf, 0x00ae, 0x00ac, 0x00bd, 0x00bc, 0x00a1, 0x00ab, 0x00bb,
24 	0x2591, 0x2592, 0x2593, 0x2502, 0x2524, 0x00c1, 0x00c2, 0x00c0,
25 	0x00a9, 0x2563, 0x2551, 0x2557, 0x255d, 0x00a2, 0x00a5, 0x2510,
26 	0x2514, 0x2534, 0x252c, 0x251c, 0x2500, 0x253c, 0x00e3, 0x00c3,
27 	0x255a, 0x2554, 0x2569, 0x2566, 0x2560, 0x2550, 0x256c, 0x00a4,
28 	0x00f0, 0x00d0, 0x00ca, 0x00cb, 0x00c8, 0x0131, 0x00cd, 0x00ce,
29 	0x00cf, 0x2518, 0x250c, 0x2588, 0x2584, 0x00a6, 0x00cc, 0x2580,
30 	0x00d3, 0x00df, 0x00d4, 0x00d2, 0x00f5, 0x00d5, 0x00b5, 0x00fe,
31 	0x00de, 0x00da, 0x00db, 0x00d9, 0x00fd, 0x00dd, 0x00af, 0x00b4,
32 	0x00ad, 0x00b1, 0x2017, 0x00be, 0x00b6, 0x00a7, 0x00f7, 0x00b8,
33 	0x00b0, 0x00a8, 0x00b7, 0x00b9, 0x00b3, 0x00b2, 0x25a0, 0x00a0,
34 };
35 
36 static const USHORT usCp1250[] = {	/* Windows implementation of Latin2 */
37 	0x20ac, 0x003f, 0x201a, 0x003f, 0x201e, 0x2026, 0x2020, 0x2021,
38 	0x003f, 0x2030, 0x0160, 0x2039, 0x015a, 0x0164, 0x017d, 0x0179,
39 	0x003f, 0x2018, 0x2019, 0x201c, 0x201d, 0x2022, 0x2013, 0x2014,
40 	0x003f, 0x2122, 0x0161, 0x203a, 0x015b, 0x0165, 0x017e, 0x017a,
41 	0x00a0, 0x02c7, 0x02d8, 0x0141, 0x00a4, 0x0104, 0x00a6, 0x00a7,
42 	0x00a8, 0x00a9, 0x015e, 0x00ab, 0x00ac, 0x00ad, 0x00ae, 0x017b,
43 	0x00b0, 0x00b1, 0x02db, 0x0142, 0x00b4, 0x00b5, 0x00b6, 0x00b7,
44 	0x00b8, 0x0105, 0x015f, 0x00bb, 0x013d, 0x02dd, 0x013e, 0x017c,
45 	0x0154, 0x00c1, 0x00c2, 0x0102, 0x00c4, 0x0139, 0x0106, 0x00c7,
46 	0x010c, 0x00c9, 0x0118, 0x00cb, 0x011a, 0x00cd, 0x00ce, 0x010e,
47 	0x0110, 0x0143, 0x0147, 0x00d3, 0x00d4, 0x0150, 0x00d6, 0x00d7,
48 	0x0158, 0x016e, 0x00da, 0x0170, 0x00dc, 0x00dd, 0x0162, 0x00df,
49 	0x0155, 0x00e1, 0x00e2, 0x0103, 0x00e4, 0x013a, 0x0107, 0x00e7,
50 	0x010d, 0x00e9, 0x0119, 0x00eb, 0x011b, 0x00ed, 0x00ee, 0x010f,
51 	0x0111, 0x0144, 0x0148, 0x00f3, 0x00f4, 0x0151, 0x00f6, 0x00f7,
52 	0x0159, 0x016f, 0x00fa, 0x0171, 0x00fc, 0x00fd, 0x0163, 0x02d9,
53 };
54 
55 static const USHORT usCp1251[] = {	/* Windows implementation of Cyrillic */
56 	0x0402, 0x0403, 0x201a, 0x0453, 0x201e, 0x2026, 0x2020, 0x2021,
57 	0x20ac, 0x2030, 0x0409, 0x2039, 0x040a, 0x040c, 0x040b, 0x040f,
58 	0x0452, 0x2018, 0x2019, 0x201c, 0x201d, 0x2022, 0x2013, 0x2014,
59 	0x00f3, 0x2122, 0x0459, 0x203a, 0x045a, 0x045c, 0x045b, 0x045f,
60 	0x00a0, 0x040e, 0x045e, 0x0408, 0x00a4, 0x0490, 0x00a6, 0x00a7,
61 	0x0401, 0x00a9, 0x0404, 0x00ab, 0x00ac, 0x00ad, 0x00ae, 0x0407,
62 	0x00b0, 0x00b1, 0x0406, 0x0456, 0x0491, 0x00b5, 0x00b6, 0x00b7,
63 	0x0451, 0x2116, 0x0454, 0x00bb, 0x0458, 0x0405, 0x0455, 0x0457,
64 	0x0410, 0x0411, 0x0412, 0x0413, 0x0414, 0x0415, 0x0416, 0x0417,
65 	0x0418, 0x0419, 0x041a, 0x041b, 0x041c, 0x041d, 0x041e, 0x041f,
66 	0x0420, 0x0421, 0x0422, 0x0423, 0x0424, 0x0425, 0x0426, 0x0427,
67 	0x0428, 0x0429, 0x042a, 0x042b, 0x042c, 0x042d, 0x042e, 0x042f,
68 	0x0430, 0x0431, 0x0432, 0x0433, 0x0434, 0x0435, 0x0436, 0x0437,
69 	0x0438, 0x0439, 0x043a, 0x043b, 0x043c, 0x043d, 0x043e, 0x043f,
70 	0x0440, 0x0441, 0x0442, 0x0443, 0x0444, 0x0445, 0x0446, 0x0447,
71 	0x0448, 0x0449, 0x044a, 0x044b, 0x044c, 0x044d, 0x044e, 0x044f,
72 };
73 
74 static const USHORT usCp1252[] = {	/* Windows implementation of Latin1 */
75 	0x20ac, 0x003f, 0x201a, 0x0192, 0x201e, 0x2026, 0x2020, 0x2021,
76 	0x02c6, 0x2030, 0x0160, 0x2039, 0x0152, 0x003f, 0x017d, 0x003f,
77 	0x003f, 0x2018, 0x2019, 0x201c, 0x201d, 0x2022, 0x2013, 0x2014,
78 	0x02dc, 0x2122, 0x0161, 0x203a, 0x0153, 0x003f, 0x017e, 0x0178,
79 	0x00a0, 0x00a1, 0x00a2, 0x00a3, 0x00a4, 0x00a5, 0x00a6, 0x00a7,
80 	0x00a8, 0x00a9, 0x00aa, 0x00ab, 0x00ac, 0x00ad, 0x00ae, 0x00af,
81 	0x00b0, 0x00b1, 0x00b2, 0x00b3, 0x00b4, 0x00b5, 0x00b6, 0x00b7,
82 	0x00b8, 0x00b9, 0x00ba, 0x00bb, 0x00bc, 0x00bd, 0x00be, 0x00bf,
83 	0x00c0, 0x00c1, 0x00c2, 0x00c3, 0x00c4, 0x00c5, 0x00c6, 0x00c7,
84 	0x00c8, 0x00c9, 0x00ca, 0x00cb, 0x00cc, 0x00cd, 0x00ce, 0x00cf,
85 	0x00d0, 0x00d1, 0x00d2, 0x00d3, 0x00d4, 0x00d5, 0x00d6, 0x00d7,
86 	0x00d8, 0x00d9, 0x00da, 0x00db, 0x00dc, 0x00dd, 0x00de, 0x00df,
87 	0x00e0, 0x00e1, 0x00e2, 0x00e3, 0x00e4, 0x00e5, 0x00e6, 0x00e7,
88 	0x00e8, 0x00e9, 0x00ea, 0x00eb, 0x00ec, 0x00ed, 0x00ee, 0x00ef,
89 	0x00f0, 0x00f1, 0x00f2, 0x00f3, 0x00f4, 0x00f5, 0x00f6, 0x00f7,
90 	0x00f8, 0x00f9, 0x00fa, 0x00fb, 0x00fc, 0x00fd, 0x00fe, 0x00ff,
91 };
92 
93 static const USHORT usMacRoman[] = {	/* Apple implementation of Latin1 */
94 	0x00c4, 0x00c5, 0x00c7, 0x00c9, 0x00d1, 0x00d6, 0x00dc, 0x00e1,
95 	0x00e0, 0x00e2, 0x00e4, 0x00e3, 0x00e5, 0x00e7, 0x00e9, 0x00e8,
96 	0x00ea, 0x00eb, 0x00ed, 0x00ec, 0x00ee, 0x00ef, 0x00f1, 0x00f3,
97 	0x00f2, 0x00f4, 0x00f6, 0x00f5, 0x00fa, 0x00f9, 0x00fb, 0x00fc,
98 	0x2020, 0x00b0, 0x00a2, 0x00a3, 0x00a7, 0x2022, 0x00b6, 0x00df,
99 	0x00ae, 0x00a9, 0x2122, 0x00b4, 0x00a8, 0x2260, 0x00c6, 0x00d8,
100 	0x221e, 0x00b1, 0x2264, 0x2265, 0x00a5, 0x00b5, 0x2202, 0x2211,
101 	0x220f, 0x03c0, 0x222b, 0x00aa, 0x00ba, 0x2126, 0x00e6, 0x00f8,
102 	0x00bf, 0x00a1, 0x00ac, 0x221a, 0x0192, 0x2248, 0x2206, 0x00ab,
103 	0x00bb, 0x2026, 0x00a0, 0x00c0, 0x00c3, 0x00d5, 0x0152, 0x0153,
104 	0x2013, 0x2014, 0x201c, 0x201d, 0x2018, 0x2019, 0x00f7, 0x25ca,
105 	0x00ff, 0x0178, 0x2044, 0x00a4, 0x2039, 0x203a, 0xfb01, 0xfb02,
106 	0x2021, 0x00b7, 0x201a, 0x201e, 0x2030, 0x00c2, 0x00ca, 0x00c1,
107 	0x00cb, 0x00c8, 0x00cd, 0x00ce, 0x00cf, 0x00cc, 0x00d3, 0x00d4,
108 	0x003f, 0x00d2, 0x00da, 0x00db, 0x00d9, 0x0131, 0x02c6, 0x02dc,
109 	0x00af, 0x02d8, 0x02d9, 0x02da, 0x00b8, 0x02dd, 0x02db, 0x02c7,
110 };
111 
112 static const USHORT usPrivateArea[] = {
113 	0x0020, 0x0021, 0x2200, 0x0023, 0x2203, 0x0025, 0x0026, 0x220d,
114 	0x0028, 0x0029, 0x2217, 0x002b, 0x002c, 0x2212, 0x002e, 0x002f,
115 	0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037,
116 	0x0038, 0x0039, 0x003a, 0x003b, 0x003c, 0x2019, 0x003e, 0x003f,
117 	0x201d, 0x201c, 0x0392, 0x03a7, 0x0394, 0x0395, 0x03a6, 0x0393,
118 	0x0397, 0x0399, 0x03d1, 0x039a, 0x039b, 0x039c, 0x039d, 0x039f,
119 	0x03a0, 0x0398, 0x03a1, 0x03a3, 0x03a4, 0x03a5, 0x03c2, 0x03a9,
120 	0x039e, 0x03a8, 0x0396, 0x005b, 0x2234, 0x005d, 0x22a5, 0x005f,
121 	0x003f, 0x03b1, 0x03b2, 0x03c7, 0x03b4, 0x03b5, 0x03c6, 0x03b3,
122 	0x03b7, 0x03b9, 0x03d5, 0x03ba, 0x03bb, 0x03bc, 0x03bd, 0x03bf,
123 	0x03c0, 0x03b8, 0x03c1, 0x03c3, 0x03c4, 0x03c5, 0x03d6, 0x03c9,
124 	0x03be, 0x03c8, 0x03b6, 0x007b, 0x007c, 0x007d, 0x223c, 0x003f,
125 	0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
126 	0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
127 	0x003f, 0x003f, 0x003f, 0x2022, 0x003f, 0x003f, 0x003f, 0x003f,
128 	0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
129 	0x20ac, 0x03d2, 0x2032, 0x2264, 0x2044, 0x221e, 0x0192, 0x2663,
130 	0x2666, 0x2665, 0x2660, 0x2194, 0x2190, 0x2191, 0x2192, 0x2193,
131 	0x00b0, 0x00b1, 0x2033, 0x2265, 0x00d7, 0x221d, 0x2202, 0x2022,
132 	0x00f7, 0x2260, 0x2261, 0x2248, 0x2026, 0x007c, 0x23af, 0x21b5,
133 	0x2135, 0x2111, 0x211c, 0x2118, 0x2297, 0x2295, 0x2205, 0x2229,
134 	0x222a, 0x2283, 0x2287, 0x2284, 0x2282, 0x2286, 0x2208, 0x2209,
135 	0x2220, 0x2207, 0x00ae, 0x00a9, 0x2122, 0x220f, 0x221a, 0x22c5,
136 	0x00ac, 0x2227, 0x2228, 0x21d4, 0x21d0, 0x21d1, 0x21d2, 0x21d3,
137 	0x22c4, 0x3008, 0x00ae, 0x00a9, 0x2122, 0x2211, 0x239b, 0x239c,
138 	0x239d, 0x23a1, 0x23a2, 0x23a3, 0x23a7, 0x23a8, 0x23a9, 0x23aa,
139 	0x003f, 0x3009, 0x222b, 0x2320, 0x23ae, 0x2321, 0x239e, 0x239f,
140 	0x23a0, 0x23a4, 0x23a5, 0x23a6, 0x23ab, 0x23ac, 0x23ad, 0x003f,
141 };
142 
143 typedef struct char_table_tag {
144 	UCHAR	ucLocal;
145 	USHORT	usUnicode;
146 } char_table_type;
147 
148 static char_table_type	atCharTable[256];
149 static size_t		tNextPosFree = 0;
150 
151 
152 /*
153  * iCompare - compare two records
154  *
155  * Compares two records. For use by qsort(3C) and bsearch(3C).
156  *
157  * returns -1 if rec1 < rec2, 0 if rec1 == rec2, 1 if rec1 > rec2
158  */
159 static int
iCompare(const void * pvRecord1,const void * pvRecord2)160 iCompare(const void *pvRecord1, const void *pvRecord2)
161 {
162 	USHORT	usUnicode1, usUnicode2;
163 
164 	usUnicode1 = ((char_table_type *)pvRecord1)->usUnicode;
165 	usUnicode2 = ((char_table_type *)pvRecord2)->usUnicode;
166 
167 	if (usUnicode1 < usUnicode2) {
168 		return -1;
169 	}
170 	if (usUnicode1 > usUnicode2) {
171 		return 1;
172 	}
173 	return 0;
174 } /* end of iCompare */
175 
176 /*
177  * pGetCharTableRecord - get the character table record
178  *
179  * returns a pointer to the record when found, otherwise NULL
180  */
181 static const char_table_type *
pGetCharTableRecord(USHORT usUnicode)182 pGetCharTableRecord(USHORT usUnicode)
183 {
184 	char_table_type	tKey;
185 
186 	if (tNextPosFree == 0) {
187 		return NULL;
188 	}
189 	tKey.usUnicode = usUnicode;
190 	tKey.ucLocal = 0;
191 	return (char_table_type *)bsearch(&tKey,
192 			atCharTable,
193 			tNextPosFree, sizeof(atCharTable[0]),
194 			iCompare);
195 } /* end of pGetCharTableRecord */
196 
197 /*
198  * ucGetBulletCharacter - get the local representation of the bullet
199  */
200 UCHAR
ucGetBulletCharacter(conversion_type eConversionType,encoding_type eEncoding)201 ucGetBulletCharacter(conversion_type eConversionType, encoding_type eEncoding)
202 {
203 #if defined(__riscos)
204 	return 0x8f;
205 #else
206 	const char_table_type	*pRec;
207 
208 	fail(eEncoding == encoding_utf_8);
209 
210 	if (eEncoding == encoding_latin_1 &&
211 	    (eConversionType == conversion_ps ||
212 	     eConversionType == conversion_pdf)) {
213 		/* Ugly, but it makes the PostScript and PDF look better */
214 		return (UCHAR)143;
215 	}
216 	if (eConversionType != conversion_text &&
217 	    eConversionType != conversion_fmt_text) {
218 		pRec = pGetCharTableRecord(UNICODE_BULLET);
219 		if (pRec != NULL) {
220 			return pRec->ucLocal;
221 		}
222 		pRec = pGetCharTableRecord(UNICODE_BULLET_OPERATOR);
223 		if (pRec != NULL) {
224 			return pRec->ucLocal;
225 		}
226 		pRec = pGetCharTableRecord(UNICODE_MIDDLE_DOT);
227 		if (pRec != NULL) {
228 			return pRec->ucLocal;
229 		}
230 	}
231 	return (UCHAR)'.';
232 #endif /* __riscos */
233 } /* end of ucGetBulletCharacter */
234 
235 /*
236  * ucGetNbspCharacter - get the local representation of the non-breaking space
237  */
238 UCHAR
ucGetNbspCharacter(void)239 ucGetNbspCharacter(void)
240 {
241 	const char_table_type	*pRec;
242 
243 	pRec = pGetCharTableRecord(0x00a0);	/* Unicode non-breaking space */
244 	if (pRec == NULL) {
245 		DBG_MSG("Non-breaking space record not found");
246 		/* No value found, use the best guess */
247 		return (UCHAR)0xa0;
248 	}
249 	return pRec->ucLocal;
250 } /* end of ucGetNbspCharacter */
251 
252 /*
253  * bReadCharacterMappingTable - read the mapping table
254  *
255  * Read the character mapping table from file and have the contents sorted
256  *
257  * returns TRUE if successful, otherwise FALSE
258  */
259 BOOL
bReadCharacterMappingTable(FILE * pFile)260 bReadCharacterMappingTable(FILE *pFile)
261 {
262 	char	*pcTmp;
263 	ULONG	ulUnicode;
264 	UINT	uiLocal;
265 	int	iFields;
266 	char	szLine[81];
267 
268 	if (pFile == NULL) {
269 		return FALSE;
270 	}
271 
272 	/* Clean the table first */
273 	(void)memset(atCharTable, 0, sizeof(atCharTable));
274 
275 	/* Fill the table */
276 	while (fgets(szLine, (int)sizeof(szLine), pFile)) {
277 		if (szLine[0] == '#' ||
278 		    szLine[0] == '\r' ||
279 		    szLine[0] == '\n') {
280 			/* Comment or empty line */
281 			continue;
282 		}
283 		iFields = sscanf(szLine, "%x %lx %*s", &uiLocal, &ulUnicode);
284 		if (iFields != 2) {
285 			pcTmp = strchr(szLine, '\r');
286 			if (pcTmp != NULL) {
287 				*pcTmp = '\0';
288 			}
289 			pcTmp = strchr(szLine, '\n');
290 			if (pcTmp != NULL) {
291 				*pcTmp = '\0';
292 			}
293 			werr(0, "Syntax error in: '%s'", szLine);
294 			continue;
295 		}
296 		if (uiLocal > 0xff || ulUnicode > 0xffff) {
297 			werr(0, "Syntax error in: '%02x %04lx'",
298 					uiLocal, ulUnicode);
299 			continue;
300 		}
301 		/* Store only the relevant entries */
302 		if (uiLocal != ulUnicode || uiLocal >= 0x80) {
303 			atCharTable[tNextPosFree].ucLocal = (UCHAR)uiLocal;
304 			atCharTable[tNextPosFree].usUnicode = (USHORT)ulUnicode;
305 			tNextPosFree++;
306 		}
307 		if (tNextPosFree >= elementsof(atCharTable)) {
308 			werr(0, "Too many entries in the character mapping "
309 				"file. Ignoring the rest.");
310 			break;
311 		}
312 	}
313 
314 	if (tNextPosFree != 0) {
315 		DBG_HEX(atCharTable[0].usUnicode);
316 		DBG_HEX(atCharTable[tNextPosFree - 1].usUnicode);
317 
318 		qsort(atCharTable,
319 			tNextPosFree, sizeof(atCharTable[0]),
320 			iCompare);
321 
322 		DBG_HEX(atCharTable[0].usUnicode);
323 		DBG_HEX(atCharTable[tNextPosFree - 1].usUnicode);
324 	}
325 
326 	return TRUE;
327 } /* end of bReadCharacterMappingTable */
328 
329 /*
330  * ulTranslateCharacters - Translate characters to local representation
331  *
332  * Translate all characters to local representation
333  *
334  * returns the translated character
335  */
336 ULONG
ulTranslateCharacters(USHORT usChar,ULONG ulFileOffset,int iWordVersion,conversion_type eConversionType,encoding_type eEncoding,BOOL bUseMacCharSet)337 ulTranslateCharacters(USHORT usChar, ULONG ulFileOffset, int iWordVersion,
338 	conversion_type eConversionType, encoding_type eEncoding,
339 	BOOL bUseMacCharSet)
340 {
341 	const char_table_type	*pTmp;
342 	const USHORT	*usCharSet;
343 
344 	usCharSet = NULL;
345 	if (bUseMacCharSet) {
346 		/* Macintosh character set */
347 		usCharSet = usMacRoman;
348 	} else if (iWordVersion == 0) {
349 		/* DOS character set */
350 		usCharSet = usCp850;
351 	} else {
352 		/* Windows character set */
353 		switch (eEncoding) {
354 		case encoding_latin_2:
355 			usCharSet = usCp1250;
356 			break;
357 		case encoding_cyrillic:
358 			usCharSet = usCp1251;
359 			break;
360 		case encoding_latin_1:
361 		default:
362 			usCharSet = usCp1252;
363 			break;
364 		}
365 	}
366 	fail(usCharSet == NULL);
367 	if (usChar >= 0x80 && usChar <= 0x9f) {
368 		/* Translate implementation defined characters */
369 		usChar = usCharSet[usChar - 0x80];
370 	} else if (iWordVersion < 8 && usChar >= 0xa0 && usChar <= 0xff) {
371 		/* Translate old character set to Unixcode */
372 		usChar = usCharSet[usChar - 0x80];
373 	}
374 
375 	/* Microsoft Unicode to real Unicode */
376 	if (usChar >= 0xf020 && usChar <= 0xf0ff) {
377 		DBG_HEX_C(usPrivateArea[usChar - 0xf020] == 0x003f, usChar);
378 		usChar = usPrivateArea[usChar - 0xf020];
379 	}
380 
381 	/* Characters with a special meaning in Word */
382 	switch (usChar) {
383 	case IGNORE_CHARACTER:
384 	case FOOTNOTE_SEPARATOR:
385 	case FOOTNOTE_CONTINUATION:
386 	case ANNOTATION:
387 	case FRAME:
388 	case LINE_FEED:
389 	case WORD_SOFT_HYPHEN:
390 	case UNICODE_HYPHENATION_POINT:
391 		return IGNORE_CHARACTER;
392 	case PICTURE:
393 	case TABLE_SEPARATOR:
394 	case TAB:
395 	case HARD_RETURN:
396 	case PAGE_BREAK:
397 	case PAR_END:
398 	case COLUMN_FEED:
399 		return (ULONG)usChar;
400 	case FOOTNOTE_OR_ENDNOTE:
401 		NO_DBG_HEX(ulFileOffset);
402 		switch (eGetNotetype(ulFileOffset)) {
403 		case notetype_is_footnote:
404 			return FOOTNOTE_CHAR;
405 		case notetype_is_endnote:
406 			return ENDNOTE_CHAR;
407 		default:
408 			return UNKNOWN_NOTE_CHAR;
409 		}
410 	case WORD_UNBREAKABLE_JOIN:
411 		return (ULONG)OUR_UNBREAKABLE_JOIN;
412 	default:
413 		break;
414 	}
415 
416 	if (eEncoding != encoding_utf_8) {
417 		/* Latin characters in an oriental text */
418 		if (usChar >= 0xff01 && usChar <= 0xff5e) {
419 			usChar -= 0xfee0;
420 		}
421 	}
422 
423 	if (eEncoding == encoding_latin_1 &&
424 	    (eConversionType == conversion_ps ||
425 	     eConversionType == conversion_pdf)) {
426 		/* Ugly, but it makes the PostScript and PDF look better */
427 		switch (usChar) {
428 		case UNICODE_ELLIPSIS:
429 			return 140;
430 		case UNICODE_TRADEMARK_SIGN:
431 			return 141;
432 		case UNICODE_PER_MILLE_SIGN:
433 			return 142;
434 		case UNICODE_BULLET:
435 		case UNICODE_BULLET_OPERATOR:
436 		case UNICODE_BLACK_CLUB_SUIT:
437 			return 143;
438 		case UNICODE_LEFT_SINGLE_QMARK:
439 			return 144;
440 		case UNICODE_RIGHT_SINGLE_QMARK:
441 			return 145;
442 		case UNICODE_SINGLE_LEFT_ANGLE_QMARK:
443 			return 146;
444 		case UNICODE_SINGLE_RIGHT_ANGLE_QMARK:
445 			return 147;
446 		case UNICODE_LEFT_DOUBLE_QMARK:
447 			return 148;
448 		case UNICODE_RIGHT_DOUBLE_QMARK:
449 			return 149;
450 		case UNICODE_DOUBLE_LOW_9_QMARK:
451 			return 150;
452 		case UNICODE_EN_DASH:
453 			return 151;
454 		case UNICODE_EM_DASH:
455 			return 152;
456 		case UNICODE_MINUS_SIGN:
457 			return 153;
458 		case UNICODE_CAPITAL_LIGATURE_OE:
459 			return 154;
460 		case UNICODE_SMALL_LIGATURE_OE:
461 			return 155;
462 		case UNICODE_DAGGER:
463 			return 156;
464 		case UNICODE_DOUBLE_DAGGER:
465 			return 157;
466 		case UNICODE_SMALL_LIGATURE_FI:
467 			return 158;
468 		case UNICODE_SMALL_LIGATURE_FL:
469 			return 159;
470 		default:
471 			break;
472 		}
473 	}
474 
475 	if (eConversionType == conversion_pdf) {
476 		if (eEncoding == encoding_latin_1) {
477 			switch (usChar) {
478 			case UNICODE_EURO_SIGN:
479 				return 128;
480 			default:
481 				break;
482 			}
483 		} else if (eEncoding == encoding_latin_2) {
484 			switch (usChar) {
485 			case UNICODE_CAPITAL_D_WITH_STROKE:
486 			case UNICODE_SMALL_D_WITH_STROKE:
487 				return 0x3f;
488 			default:
489 				break;
490 			}
491 		}
492 	}
493 
494 	if (usChar < 0x80) {
495 		/* US ASCII */
496 		if (usChar < 0x20 || usChar == 0x7f) {
497 			/* Ignore control characters */
498 			DBG_HEX(usChar);
499 			DBG_FIXME();
500 			return IGNORE_CHARACTER;
501 		}
502 		return (ULONG)usChar;
503 	}
504 
505 	if (eEncoding == encoding_utf_8) {
506 		/* No need to convert Unicode characters */
507 		return (ULONG)usChar;
508 	}
509 
510 	/* Unicode to local representation */
511 	pTmp = pGetCharTableRecord(usChar);
512 	if (pTmp != NULL) {
513 		DBG_HEX_C(usChar >= 0x7f && usChar <= 0x9f, usChar);
514 		return (ULONG)pTmp->ucLocal;
515 	}
516 
517 	/* Fancy characters to simple US ASCII */
518 	switch (usChar) {
519 	case UNICODE_SMALL_F_HOOK:
520 		return (ULONG)'f';
521 	case UNICODE_GREEK_CAPITAL_CHI:
522 		return (ULONG)'X';
523 	case UNICODE_GREEK_SMALL_UPSILON:
524 		return (ULONG)'v';
525 	case UNICODE_MODIFIER_CIRCUMFLEX:
526 	case UNICODE_UPWARDS_ARROW:
527 		return (ULONG)'^';
528 	case UNICODE_SMALL_TILDE:
529 	case UNICODE_TILDE_OPERATOR:
530 		return (ULONG)'~';
531 	case UNICODE_EN_QUAD:
532 	case UNICODE_EM_QUAD:
533 	case UNICODE_EN_SPACE:
534 	case UNICODE_EM_SPACE:
535 	case UNICODE_THREE_PER_EM_SPACE:
536 	case UNICODE_FOUR_PER_EM_SPACE:
537 	case UNICODE_SIX_PER_EM_SPACE:
538 	case UNICODE_FIGURE_SPACE:
539 	case UNICODE_PUNCTUATION_SPACE:
540 	case UNICODE_THIN_SPACE:
541 	case UNICODE_NARROW_NO_BREAK_SPACE:
542 	case UNICODE_LIGHT_SHADE:
543 	case UNICODE_MEDIUM_SHADE:
544 	case UNICODE_DARK_SHADE:
545 		return (ULONG)' ';
546 	case UNICODE_LEFT_DOUBLE_QMARK:
547 	case UNICODE_RIGHT_DOUBLE_QMARK:
548 	case UNICODE_DOUBLE_LOW_9_QMARK:
549 	case UNICODE_DOUBLE_HIGH_REV_9_QMARK:
550 	case UNICODE_DOUBLE_PRIME:
551 		return (ULONG)'"';
552 	case UNICODE_LEFT_SINGLE_QMARK:
553 	case UNICODE_RIGHT_SINGLE_QMARK:
554 	case UNICODE_SINGLE_LOW_9_QMARK:
555 	case UNICODE_SINGLE_HIGH_REV_9_QMARK:
556 	case UNICODE_PRIME:
557 		return (ULONG)'\'';
558 	case UNICODE_HYPHEN:
559 	case UNICODE_NON_BREAKING_HYPHEN:
560 	case UNICODE_FIGURE_DASH:
561 	case UNICODE_EN_DASH:
562 	case UNICODE_EM_DASH:
563 	case UNICODE_HORIZONTAL_BAR:
564 	case UNICODE_MINUS_SIGN:
565 	case UNICODE_BD_LIGHT_HORIZONTAL:
566 	case UNICODE_BD_DOUBLE_HORIZONTAL:
567 		return (ULONG)'-';
568 	case UNICODE_DOUBLE_VERTICAL_LINE:
569 	case UNICODE_BD_LIGHT_VERTICAL:
570 	case UNICODE_BD_DOUBLE_VERTICAL:
571 		return (ULONG)'|';
572 	case UNICODE_DOUBLE_LOW_LINE:
573 		return (ULONG)'_';
574 	case UNICODE_DAGGER:
575 		return (ULONG)'+';
576 	case UNICODE_DOUBLE_DAGGER:
577 		return (ULONG)'#';
578 	case UNICODE_BULLET:
579 	case UNICODE_BULLET_OPERATOR:
580 	case UNICODE_BLACK_CLUB_SUIT:
581 		return (ULONG)ucGetBulletCharacter(eConversionType, eEncoding);
582 	case UNICODE_ONE_DOT_LEADER:
583 	case UNICODE_TWO_DOT_LEADER:
584 		return (ULONG)'.';
585 	case UNICODE_ELLIPSIS:
586 #if defined(__riscos)
587 		return (ULONG)OUR_ELLIPSIS;
588 #else
589 		if (ulFileOffset == 0) {
590 			return (ULONG)OUR_ELLIPSIS;
591 		}
592 		return UNICODE_ELLIPSIS;
593 #endif /* __riscos */
594 	case UNICODE_DOUBLE_LEFT_ANGLE_QMARK:
595 	case UNICODE_TRIANGULAR_BULLET:
596 	case UNICODE_SINGLE_LEFT_ANGLE_QMARK:
597 	case UNICODE_LEFTWARDS_ARROW:
598 		return (ULONG)'<';
599 	case UNICODE_DOUBLE_RIGHT_ANGLE_QMARK:
600 	case UNICODE_SINGLE_RIGHT_ANGLE_QMARK:
601 	case UNICODE_RIGHTWARDS_ARROW:
602 		return (ULONG)'>';
603 	case UNICODE_UNDERTIE:
604 		return (ULONG)'-';
605 	case UNICODE_N_ARY_SUMMATION:
606 		return (ULONG)'S';
607 	case UNICODE_EURO_SIGN:
608 		return (ULONG)'E';
609 	case UNICODE_CIRCLE:
610 	case UNICODE_SQUARE:
611 		return (ULONG)'O';
612 	case UNICODE_DIAMOND:
613 		return (ULONG)OUR_DIAMOND;
614 	case UNICODE_NUMERO_SIGN:
615 		return (ULONG)'N';
616 	case UNICODE_KELVIN_SIGN:
617 		return (ULONG)'K';
618 	case UNICODE_DOWNWARDS_ARROW:
619 		return (ULONG)'v';
620 	case UNICODE_FRACTION_SLASH:
621 	case UNICODE_DIVISION_SLASH:
622 		return (ULONG)'/';
623 	case UNICODE_ASTERISK_OPERATOR:
624 		return (ULONG)'*';
625 	case UNICODE_RATIO:
626 		return (ULONG)':';
627 	case UNICODE_BD_LIGHT_DOWN_RIGHT:
628 	case UNICODE_BD_LIGHT_DOWN_AND_LEFT:
629 	case UNICODE_BD_LIGHT_UP_AND_RIGHT:
630 	case UNICODE_BD_LIGHT_UP_AND_LEFT:
631 	case UNICODE_BD_LIGHT_VERTICAL_AND_RIGHT:
632 	case UNICODE_BD_LIGHT_VERTICAL_AND_LEFT:
633 	case UNICODE_BD_LIGHT_DOWN_AND_HORIZONTAL:
634 	case UNICODE_BD_LIGHT_UP_AND_HORIZONTAL:
635 	case UNICODE_BD_LIGHT_VERTICAL_AND_HORIZONTAL:
636 	case UNICODE_BD_DOUBLE_DOWN_AND_RIGHT:
637 	case UNICODE_BD_DOUBLE_DOWN_AND_LEFT:
638 	case UNICODE_BD_DOUBLE_UP_AND_RIGHT:
639 	case UNICODE_BD_DOUBLE_UP_AND_LEFT:
640 	case UNICODE_BD_DOUBLE_VERTICAL_AND_RIGHT:
641 	case UNICODE_BD_DOUBLE_VERTICAL_AND_LEFT:
642 	case UNICODE_BD_DOUBLE_DOWN_AND_HORIZONTAL:
643 	case UNICODE_BD_DOUBLE_UP_AND_HORIZONTAL:
644 	case UNICODE_BD_DOUBLE_VERTICAL_AND_HORIZONTAL:
645 	case UNICODE_BLACK_SQUARE:
646 		return (ULONG)'+';
647 	case UNICODE_HAIR_SPACE:
648 	case UNICODE_ZERO_WIDTH_SPACE:
649 	case UNICODE_ZERO_WIDTH_NON_JOINER:
650 	case UNICODE_ZERO_WIDTH_JOINER:
651 	case UNICODE_LEFT_TO_RIGHT_MARK:
652 	case UNICODE_RIGHT_TO_LEFT_MARK:
653 	case UNICODE_LEFT_TO_RIGHT_EMBEDDING:
654 	case UNICODE_RIGHT_TO_LEFT_EMBEDDING:
655 	case UNICODE_POP_DIRECTIONAL_FORMATTING:
656 	case UNICODE_LEFT_TO_RIGHT_OVERRIDE:
657 	case UNICODE_RIGHT_TO_LEFT_OVERRIDE:
658 	case UNICODE_ZERO_WIDTH_NO_BREAK_SPACE:
659 		return IGNORE_CHARACTER;
660 	default:
661 		break;
662 	}
663 
664 	if (usChar == UNICODE_TRADEMARK_SIGN) {
665 		/*
666 		 * No local representation, it doesn't look like anything in
667 		 * US-ASCII and a question mark does more harm than good.
668 		 */
669 		return IGNORE_CHARACTER;
670 	}
671 
672 	if (usChar >= 0xa0 && usChar <= 0xff) {
673 		/* Before Word 97, Word did't use Unicode */
674 		return (ULONG)usChar;
675 	}
676 
677 	DBG_HEX_C(usChar < 0x3000 || usChar >= 0xd800, ulFileOffset);
678 	DBG_HEX_C(usChar < 0x3000 || usChar >= 0xd800, usChar);
679 	DBG_MSG_C(usChar >= 0xe000 && usChar < 0xf900, "Private Use Area");
680 
681 	/* Untranslated Unicode character */
682 	return 0x3f;
683 } /* end of ulTranslateCharacters */
684 
685 /*
686  * ulToUpper - convert letter to upper case
687  *
688  * This function converts a letter to upper case. Unlike toupper(3) this
689  * function is independent from the settings of locale. This comes in handy
690  * for people who have to read Word documents in more than one language or
691  * contain more than one language.
692  *
693  * returns the converted letter, or ulChar if the conversion was not possible.
694  */
695 ULONG
ulToUpper(ULONG ulChar)696 ulToUpper(ULONG ulChar)
697 {
698 	if (ulChar < 0x80) {
699 		/* US ASCII: use standard function */
700 		return (ULONG)toupper((int)ulChar);
701 	}
702 	if (ulChar >= 0xe0 && ulChar <= 0xfe && ulChar != 0xf7) {
703 		/*
704 		 * Lower case accented characters
705 		 * 0xf7 is Division sign; 0xd7 is Multiplication sign
706 		 * 0xff is y with diaeresis; 0xdf is Sharp s
707 		 */
708 		return ulChar & ~0x20;
709 	}
710 #if defined(__STDC_ISO_10646__)
711 	/*
712 	 * If this is ISO C99 and all locales have wchar_t = ISO 10646
713 	 * (e.g., glibc 2.2 or newer), then use standard function
714 	 */
715 	if (ulChar > 0xff) {
716 		return (ULONG)towupper((wint_t)ulChar);
717 	}
718 #endif /* __STDC_ISO_10646__ */
719 	return ulChar;
720 } /* end of ulToUpper */
721