1
2 /***************************************************************************/
3
4 /*
5 * Portions Copyright (c) 1999 GMRS Software GmbH
6 * Carl-von-Linde-Str. 38, D-85716 Unterschleissheim, http://www.gmrs.de
7 * All rights reserved.
8 *
9 * Author: Arno Unkrig <arno@unkrig.de>
10 */
11
12 /* This program is free software; you can redistribute it and/or modify
13 * it under the terms of the GNU General Public License as published by
14 * the Free Software Foundation; either version 2 of the License, or
15 * (at your option) any later version.
16 *
17 * This program is distributed in the hope that it will be useful,
18 * but WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20 * GNU General Public License in the file COPYING for more details.
21 */
22
23 /***************************************************************************/
24
25 /*
26 * Changes to version 1.2.2 were made by Martin Bayer <mbayer@zedat.fu-berlin.de>
27 * Dates and reasons of modifications:
28 * Thu Oct 4 22:14:38 CEST 2001: included EURO-sign
29 * Sat Sep 14 15:23:25 CEST 2002: Added plain ASCII output patch by Bela Lubkin
30 * Thu Nov 20 18:23:59 CET 2003: SGML entities array revised
31 */
32
33 /***************************************************************************/
34
35
36 #include <stdlib.h>
37 #include <string.h>
38 #include <ctype.h>
39
40 #include "html.h"
41 #include "sgml.h"
42
/*
 * nelems(array): number of elements in "array". Only meaningful for true
 * arrays (not pointers). Defined here only if nothing else provided it.
 */
#if !defined(nelems)
#define nelems(array) (sizeof(array) / sizeof(*(array)))
#endif
46
47 /* ------------------------------------------------------------------------- */
48
49 /*
50 * Selected SGML entities, with translations to ISO-8859-1 and to
51 * plain ASCII.
52 */
53
54 /* Straight-ASCII and extra entities partially
55 * added by Bela Lubkin <belal@caldera.com>.
56 */
57
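/*
 * Keep this array sorted in strcmp() order (plain byte order, i.e. all
 * uppercase letters sort before lowercase ones)! It is searched with
 * bsearch(), which silently fails to find entries in an unsorted table.
 */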
61 static const struct TextToInt {
62 char name[8];
63 int iso8859code;
64 char *asciistr;
65 } entities[] = {
66 { "AElig", LATIN1_AElig, "AE" },
67 { "AMP", 0, "&" },
68 { "Aacute", LATIN1_Aacute, "A'" },
69 { "Acirc", LATIN1_Acirc, "A^" },
70 { "Agrave", LATIN1_Agrave, "A`" },
71 { "Alpha", 0, "A" },
72 { "Aring", LATIN1_Aring, "AA" },
73 { "Atilde", LATIN1_Atilde, "A~" },
74 { "Auml", LATIN1_Auml, "A\"" },
75 { "Beta", 0, "B" },
76 { "Ccedil", LATIN1_Ccedil, "C," },
77 { "Chi", 0, "H" },
78 { "Dagger", 0, "++" },
79 { "Delta", 0, "D" },
80 { "ETH", LATIN1_ETH, "D-" },
81 { "Eacute", LATIN1_Eacute, "E'" },
82 { "Ecirc", LATIN1_Ecirc, "E^" },
83 { "Egrave", LATIN1_Egrave, "E`" },
84 { "Epsilon", 0, "E" },
85 { "Eta", 0, "E" },
86 { "Euml", LATIN1_Euml, "E\"" },
87 { "GT", 0, ">" },
88 { "Gamma", 0, "G" },
89 { "Iacute", LATIN1_Iacute, "I'" },
90 { "Icirc", LATIN1_Icirc, "I^" },
91 { "Igrave", LATIN1_Igrave, "I`" },
92 { "Iota", 0, "I" },
93 { "Iuml", LATIN1_Iuml, "I\"" },
94 { "Kappa", 0, "K" },
95 { "LT", 0, "<" },
96 { "Lambda", 0, "L" },
97 { "Mu", 0, "M" },
98 { "Ntilde", LATIN1_Ntilde, "N~" },
99 { "Nu", 0, "N" },
100 { "OElig", 0, "OE" },
101 { "Oacute", LATIN1_Oacute, "O'" },
102 { "Ocirc", LATIN1_Ocirc, "O^" },
103 { "Ograve", LATIN1_Ograve, "O`" },
104 { "Omega", 0, "O" },
105 { "Omicron", 0, "O" },
106 { "Oslash", LATIN1_Oslash, "O/" },
107 { "Otilde", LATIN1_Otilde, "O~" },
108 { "Ouml", LATIN1_Ouml, "O\"" },
109 { "Phi", 0, "F" },
110 { "Pi", 0, "P" },
111 { "Prime", 0, "''" },
112 { "Psi", 0, "PS" },
113 { "QUOT", 0, "\"" },
114 { "Rho", 0, "R" },
115 { "Scaron", 0, "S" },
116 { "Sigma", 0, "S" },
117 { "THORN", LATIN1_THORN, "TH" },
118 { "Tau", 0, "T" },
119 { "Theta", 0, "TH" },
120 { "Uacute", LATIN1_Uacute, "U'" },
121 { "Ucirc", LATIN1_Ucirc, "U^" },
122 { "Ugrave", LATIN1_Ugrave, "U`" },
123 { "Upsilon", 0, "U" },
124 { "Uuml", LATIN1_Uuml, "U\"" },
125 { "Xi", 0, "X" },
126 { "Yacute", LATIN1_Yacute, "Y'" },
127 { "Yuml", 0, "Y\"" },
128 { "Zeta", 0, "Z" },
129 { "aacute", LATIN1_aacute, "a'" },
130 { "acirc", LATIN1_acirc, "a^" },
131 { "acute", LATIN1_acute, "'" },
132 { "aelig", LATIN1_aelig, "ae" },
133 { "agrave", LATIN1_agrave, "a`" },
134 { "alefsym", 0, "Aleph" },
135 { "alpha", 0, "a" },
136 { "amp", 0, "&" },
137 { "and", 0, "AND" },
138 { "ang", 0, "-V" },
139 { "apos", 0, "'" },
140 { "aring", LATIN1_aring, "aa" },
141 { "asymp", 0, "~=" },
142 { "atilde", LATIN1_atilde, "a~" },
143 { "auml", LATIN1_auml, "a\"" },
144 { "bdquo", 0, "\"" },
145 { "beta", 0, "b" },
146 { "brvbar", LATIN1_brvbar, "|" },
147 { "bull", 0, " o " },
148 { "cap", 0, "(U" },
149 { "ccedil", LATIN1_ccedil, "c," },
150 { "cedil", LATIN1_cedil, "," },
151 { "cent", LATIN1_cent, "-c-" },
152 { "chi", 0, "h" },
153 { "circ", 0, "^" },
154 // { "clubs", 0, "[clubs]" },
155 { "cong", 0, "?=" },
156 { "copy", LATIN1_copy, "(c)" },
157 { "crarr", 0, "<-'" },
158 { "cup", 0, ")U" },
159 { "curren", LATIN1_curren, "CUR" },
160 { "dArr", 0, "vv" },
161 { "dagger", 0, "+" },
162 { "darr", 0, "v" },
163 { "deg", LATIN1_deg, "DEG" },
164 { "delta", 0, "d" },
165 // { "diams", 0, "[diamonds]" },
166 { "divide", LATIN1_divide, "/" },
167 { "eacute", LATIN1_eacute, "e'" },
168 { "ecirc", LATIN1_ecirc, "e^" },
169 { "egrave", LATIN1_egrave, "e`" },
170 { "empty", 0, "{}" },
171 { "epsilon", 0, "e" },
172 { "equiv", 0, "==" },
173 { "eta", 0, "e" },
174 { "eth", LATIN1_eth, "d-" },
175 { "euml", LATIN1_euml, "e\"" },
176 { "euro", 0, "EUR" },
177 { "exist", 0, "TE" },
178 { "fnof", 0, "f" },
179 { "forall", 0, "FA" },
180 { "frac12", LATIN1_frac12, " 1/2" },
181 { "frac14", LATIN1_frac14, " 1/4" },
182 { "frac34", LATIN1_frac34, " 3/4" },
183 { "frasl", 0, "/" },
184 { "gamma", 0, "g" },
185 { "ge", 0, ">=" },
186 { "gt", 0, ">" },
187 { "hArr", 0, "<=>" },
188 { "harr", 0, "<->" },
189 // { "hearts", 0, "[hearts]" },
190 { "hellip", 0, "..." },
191 { "iacute", LATIN1_iacute, "i'" },
192 { "icirc", LATIN1_icirc, "i^" },
193 { "iexcl", LATIN1_iexcl, "!" },
194 { "igrave", LATIN1_igrave, "i`" },
195 { "image", 0, "Im" },
196 { "infin", 0, "oo" },
197 { "int", 0, "INT" },
198 { "iota", 0, "i" },
199 { "iquest", LATIN1_iquest, "?" },
200 { "isin", 0, "(-" },
201 { "iuml", LATIN1_iuml, "i\"" },
202 { "kappa", 0, "k" },
203 { "lArr", 0, "<=" },
204 { "lambda", 0, "l" },
205 { "lang", 0, "</" },
206 { "laquo", LATIN1_laquo, "<<" },
207 { "larr", 0, "<-" },
208 // { "lceil", 0, "<|" },
209 { "ldquo", 0, "\"" },
210 { "le", 0, "<=" },
211 // { "lfloor", 0, "|<" },
212 { "lowast", 0, "*" },
213 { "loz", 0, "<>" },
214 { "lsaquo", 0, "<" },
215 { "lsquo", 0, "`" },
216 { "lt", 0, "<" },
217 { "macr", LATIN1_macr, "-" },
218 { "mdash", 0, "--" },
219 { "micro", LATIN1_micro, "my" },
220 { "middot", LATIN1_middot, "." },
221 { "minus", 0, "-" },
222 { "mu", 0, "m" },
223 { "nabla", 0, "Nabla" },
224 { "nbsp", LATIN1_nbsp, " " },
225 { "ndash", 0, "-" },
226 { "ne", 0, "!=" },
227 { "ni", 0, "-)" },
228 { "not", LATIN1_not, "NOT" },
229 { "notin", 0, "!(-" },
230 { "nsub", 0, "!(C" },
231 { "ntilde", LATIN1_ntilde, "n~" },
232 { "nu", 0, "n" },
233 { "oacute", LATIN1_oacute, "o'" },
234 { "ocirc", LATIN1_ocirc, "o^" },
235 { "oelig", 0, "oe" },
236 { "ograve", LATIN1_ograve, "o`" },
237 { "oline", LATIN1_macr, "-" },
238 { "omega", 0, "o" },
239 { "omicron", 0, "o" },
240 { "oplus", 0, "(+)" },
241 { "or", 0, "OR" },
242 { "ordf", LATIN1_ordf, "-a" },
243 { "ordm", LATIN1_ordm, "-o" },
244 { "oslash", LATIN1_oslash, "o/" },
245 { "otilde", LATIN1_otilde, "o~" },
246 { "otimes", 0, "(x)" },
247 { "ouml", LATIN1_ouml, "o\"" },
248 { "para", LATIN1_para, "P:" },
249 { "part", 0, "PART" },
250 { "permil", 0, " 0/00" },
251 { "perp", 0, "-T" },
252 { "phi", 0, "f" },
253 { "pi", 0, "p" },
254 { "piv", 0, "Pi" },
255 { "plusmn", LATIN1_plusmn, "+/-" },
256 { "pound", LATIN1_pound, "-L-" },
257 { "prime", 0, "'" },
258 { "prod", 0, "PROD" },
259 { "prop", 0, "0(" },
260 { "psi", 0, "ps" },
261 { "quot", 0, "\"" },
262 { "rArr", 0, "=>" },
263 { "radic", 0, "SQRT" },
264 { "rang", 0, "/>" },
265 { "raquo", LATIN1_raquo, ">>" },
266 { "rarr", 0, "->" },
267 // { "rceil", 0, ">|" },
268 { "rdquo", 0, "\"" },
269 { "real", 0, "Re" },
270 { "reg", LATIN1_reg, "(R)" },
271 // { "rfloor", 0, "|>" },
272 { "rho", 0, "r" },
273 { "rsaquo", 0, ">" },
274 { "rsquo", 0, "'" },
275 { "sbquo", 0, "'" },
276 { "scaron", 0, "s" },
277 { "sdot", 0, "DOT" },
278 { "sect", LATIN1_sect, "S:" },
279 { "shy", LATIN1_shy, "" },
280 { "sigma", 0, "s" },
281 { "sigmaf", 0, "s" },
282 { "sim", 0, "~" },
283 // { "spades", 0, "[spades]" },
284 { "sub", 0, "(C" },
285 { "sube", 0, "(_" },
286 { "sum", 0, "SUM" },
287 { "sup", 0, ")C" },
288 { "sup1", LATIN1_sup1, "^1" },
289 { "sup2", LATIN1_sup2, "^2" },
290 { "sup3", LATIN1_sup3, "^3" },
291 { "supe", 0, ")_" },
292 { "szlig", LATIN1_szlig, "ss" },
293 { "tau", 0, "t" },
294 { "there4", 0, ".:" },
295 { "theta", 0, "th" },
296 { "thorn", LATIN1_thorn, "th" },
297 { "tilde", 0, "~" },
298 { "times", LATIN1_times, "x" },
299 { "trade", 0, "[TM]" },
300 { "uArr", 0, "^^" },
301 { "uacute", LATIN1_uacute, "u'" },
302 { "uarr", 0, "^" },
303 { "ucirc", LATIN1_ucirc, "u^" },
304 { "ugrave", LATIN1_ugrave, "u`" },
305 { "uml", LATIN1_uml, "\"" },
306 { "upsilon", 0, "u" },
307 { "uuml", LATIN1_uuml, "u\"" },
308 { "weierp", 0, "P" },
309 { "xi", 0, "x" },
310 { "yacute", LATIN1_yacute, "y'" },
311 { "yen", LATIN1_yen, "YEN" },
312 { "yuml", LATIN1_yuml, "y\"" },
313 { "zeta", 0, "z" },
314 };
315
316 extern int use_iso8859;
317
318 /* ------------------------------------------------------------------------- */
319
320 void
replace_sgml_entities(string * s)321 replace_sgml_entities(string *s)
322 {
323 string::size_type j = 0;
324
325 for (;;) {
326 string::size_type l = s->length();
327
328 /*
329 * Skip characters before ampersand.
330 */
331 while (j < l && s->at(j) != '&') ++j;
332 /*
333 * We could convert high-bit chars to "é" here if use_iso8859
334 * is off, then let them be translated or not. Is the purpose of
335 * !use_iso8859 to allow SGML entities to be seen, or to strongly
336 * filter against high-ASCII chars that might blow up a terminal
337 * that doesn't speak ISO8859? For the moment, "allow SGML entities
338 * to be seen" -- no filtering here.
339 */
340 if (j >= l) break;
341
342 /*
343 * So we have an ampersand...
344 */
345
346 /*
347 * Don't process the last three characters; an SGML entity wouldn't fit
348 * in anyway!
349 */
350 if (j + 3 >= l) break; // Watch out! Unsigned arithmetics!
351
352 string::size_type beg = j++; // Skip the ampersand;
353
354 /*
355 * Look at the next character.
356 */
357 char c = s->at(j++);
358 if (c == '#') {
359
360 /*
361 * Decode entities like "é".
362 * Some authors forget the ";", but we tolerate this.
363 */
364 c = s->at(j++);
365 if (isdigit(c)) {
366 int x = c - '0';
367 for (; j < l; ++j) {
368 c = s->at(j);
369 if (c == ';') { ++j; break; }
370 if (!isdigit(c)) break;
371 x = 10 * x + c - '0';
372 }
373 if (use_iso8859 || (x < 128)) {
374 s->replace(beg, j - beg, 1, (char) x);
375 j = beg + 1;
376 } else {
377 for (int i = 0; i < nelems(entities); i++) {
378 if (x == entities[i].iso8859code) {
379 s->replace(beg, j - beg, entities[i].asciistr);
380 break;
381 }
382 }
383 /* else don't replace it at all, we don't have a translation */
384 }
385 }
386 } else
387
388 if (isalpha(c)) {
389
390 /*
391 * Decode entities like " ".
392 * Some authors forget the ";", but we tolerate this.
393 */
394 char name[8];
395 name[0] = c;
396 size_t i = 1;
397 for (; j < l; ++j) {
398 c = s->at(j);
399 if (c == ';') { ++j; break; }
400 if (!isalnum(c)) break;
401 if (i < sizeof(name) - 1) name[i++] = c;
402 }
403 name[i] = '\0';
404
405 const TextToInt *entity = (const TextToInt *) bsearch(
406 name,
407 entities, nelems(entities), sizeof(TextToInt),
408 (int (*)(const void *, const void *)) strcmp
409 );
410 if (entity != NULL) {
411 if (use_iso8859 && entity->iso8859code) {
412 s->replace(beg, j - beg, 1, (char) entity->iso8859code);
413 j = beg + 1;
414 } else if (entity->asciistr) {
415 s->replace(beg, j - beg, entity->asciistr);
416 j = beg + 1;
417 } /* else don't replace it at all, we don't have a translation */
418 }
419 } else {
420 ; /* EXTENSION: Allow literal '&' sometimes. */
421 }
422 }
423 }
424
425 /* ------------------------------------------------------------------------- */
426
427