1 
2  /***************************************************************************/
3 
4 /*
5  * Portions Copyright (c) 1999 GMRS Software GmbH
6  * Carl-von-Linde-Str. 38, D-85716 Unterschleissheim, http://www.gmrs.de
7  * All rights reserved.
8  *
9  * Author: Arno Unkrig <arno@unkrig.de>
10  */
11 
12 /* This program is free software; you can redistribute it and/or modify
13  * it under the terms of the GNU General Public License as published by
14  * the Free Software Foundation; either version 2 of the License, or
15  * (at your option) any later version.
16  *
17  * This program is distributed in the hope that it will be useful,
18  * but WITHOUT ANY WARRANTY; without even the implied warranty of
19  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20  * GNU General Public License in the file COPYING for more details.
21  */
22 
23  /***************************************************************************/
24 
25 /*
26  * Changes to version 1.2.2 were made by Martin Bayer <mbayer@zedat.fu-berlin.de>
27  * Dates and reasons of modifications:
28  * Thu Oct  4 22:14:38 CEST 2001: included EURO-sign
29  * Sat Sep 14 15:23:25 CEST 2002: Added plain ASCII output patch by Bela Lubkin
30  * Thu Nov 20 18:23:59 CET 2003: SGML entities array revised
31  */
32 
33  /***************************************************************************/
34 
35 
36 #include <stdlib.h>
37 #include <string.h>
38 #include <ctype.h>
39 
40 #include "html.h"
41 #include "sgml.h"
42 
43 #ifndef nelems
44 #define nelems(array) (sizeof(array) / sizeof((array)[0]))
45 #endif
46 
47 /* ------------------------------------------------------------------------- */
48 
49 /*
50  * Selected SGML entities, with translations to ISO-8859-1 and to
51  * plain ASCII.
52  */
53 
54 /* Straight-ASCII and extra entities partially
55  * added by Bela Lubkin <belal@caldera.com>.
56  */
57 
58 /*
59  * Keep this array sorted alphabetically!
60  */
61 static const struct TextToInt {
62   char name[8];
63   int  iso8859code;
64   char *asciistr;
65 } entities[] = {
66   { "AElig",   LATIN1_AElig,  "AE"         },
67   { "AMP",     0,             "&"          },
68   { "Aacute",  LATIN1_Aacute, "A'"         },
69   { "Acirc",   LATIN1_Acirc,  "A^"         },
70   { "Agrave",  LATIN1_Agrave, "A`"         },
71   { "Alpha",   0,             "A"          },
72   { "Aring",   LATIN1_Aring,  "AA"         },
73   { "Atilde",  LATIN1_Atilde, "A~"         },
74   { "Auml",    LATIN1_Auml,   "A\""        },
75   { "Beta",    0,             "B"          },
76   { "Ccedil",  LATIN1_Ccedil, "C,"         },
77   { "Chi",     0,             "H"          },
78   { "Dagger",  0,             "++"         },
79   { "Delta",   0,             "D"          },
80   { "ETH",     LATIN1_ETH,    "D-"         },
81   { "Eacute",  LATIN1_Eacute, "E'"         },
82   { "Ecirc",   LATIN1_Ecirc,  "E^"         },
83   { "Egrave",  LATIN1_Egrave, "E`"         },
84   { "Epsilon", 0,             "E"          },
85   { "Eta",     0,             "E"          },
86   { "Euml",    LATIN1_Euml,   "E\""        },
87   { "GT",      0,             ">"          },
88   { "Gamma",   0,             "G"          },
89   { "Iacute",  LATIN1_Iacute, "I'"         },
90   { "Icirc",   LATIN1_Icirc,  "I^"         },
91   { "Igrave",  LATIN1_Igrave, "I`"         },
92   { "Iota",    0,             "I"          },
93   { "Iuml",    LATIN1_Iuml,   "I\""        },
94   { "Kappa",   0,             "K"          },
95   { "LT",      0,             "<"          },
96   { "Lambda",  0,             "L"          },
97   { "Mu",      0,             "M"          },
98   { "Ntilde",  LATIN1_Ntilde, "N~"         },
99   { "Nu",      0,             "N"          },
100   { "OElig",   0,             "OE"         },
101   { "Oacute",  LATIN1_Oacute, "O'"         },
102   { "Ocirc",   LATIN1_Ocirc,  "O^"         },
103   { "Ograve",  LATIN1_Ograve, "O`"         },
104   { "Omega",   0,             "O"          },
105   { "Omicron", 0,             "O"          },
106   { "Oslash",  LATIN1_Oslash, "O/"         },
107   { "Otilde",  LATIN1_Otilde, "O~"         },
108   { "Ouml",    LATIN1_Ouml,   "O\""        },
109   { "Phi",     0,             "F"          },
110   { "Pi",      0,             "P"          },
111   { "Prime",   0,             "''"         },
112   { "Psi",     0,             "PS"         },
113   { "QUOT",    0,             "\""         },
114   { "Rho",     0,             "R"          },
115   { "Scaron",  0,             "S"          },
116   { "Sigma",   0,             "S"          },
117   { "THORN",   LATIN1_THORN,  "TH"         },
118   { "Tau",     0,             "T"          },
119   { "Theta",   0,             "TH"         },
120   { "Uacute",  LATIN1_Uacute, "U'"         },
121   { "Ucirc",   LATIN1_Ucirc,  "U^"         },
122   { "Ugrave",  LATIN1_Ugrave, "U`"         },
123   { "Upsilon", 0,             "U"          },
124   { "Uuml",    LATIN1_Uuml,   "U\""        },
125   { "Xi",      0,             "X"          },
126   { "Yacute",  LATIN1_Yacute, "Y'"         },
127   { "Yuml",    0,             "Y\""        },
128   { "Zeta",    0,             "Z"          },
129   { "aacute",  LATIN1_aacute, "a'"         },
130   { "acirc",   LATIN1_acirc,  "a^"         },
131   { "acute",   LATIN1_acute,  "'"          },
132   { "aelig",   LATIN1_aelig,  "ae"         },
133   { "agrave",  LATIN1_agrave, "a`"         },
134   { "alefsym", 0,             "Aleph"      },
135   { "alpha",   0,             "a"          },
136   { "amp",     0,             "&"          },
137   { "and",     0,             "AND"        },
138   { "ang",     0,             "-V"         },
139   { "apos",    0,             "'"          },
140   { "aring",   LATIN1_aring,  "aa"         },
141   { "asymp",   0,             "~="         },
142   { "atilde",  LATIN1_atilde, "a~"         },
143   { "auml",    LATIN1_auml,   "a\""        },
144   { "bdquo",   0,             "\""         },
145   { "beta",    0,             "b"          },
146   { "brvbar",  LATIN1_brvbar, "|"          },
147   { "bull",    0,             " o "        },
148   { "cap",     0,             "(U"         },
149   { "ccedil",  LATIN1_ccedil, "c,"         },
150   { "cedil",   LATIN1_cedil,  ","          },
151   { "cent",    LATIN1_cent,   "-c-"        },
152   { "chi",     0,             "h"          },
153   { "circ",    0,             "^"          },
154 //  { "clubs",   0,             "[clubs]"    },
155   { "cong",    0,             "?="         },
156   { "copy",    LATIN1_copy,   "(c)"        },
157   { "crarr",   0,             "<-'"        },
158   { "cup",     0,             ")U"         },
159   { "curren",  LATIN1_curren, "CUR"        },
160   { "dArr",    0,             "vv"         },
161   { "dagger",  0,             "+"          },
162   { "darr",    0,             "v"          },
163   { "deg",     LATIN1_deg,    "DEG"        },
164   { "delta",   0,             "d"          },
165 //  { "diams",   0,             "[diamonds]" },
166   { "divide",  LATIN1_divide, "/"          },
167   { "eacute",  LATIN1_eacute, "e'"         },
168   { "ecirc",   LATIN1_ecirc,  "e^"         },
169   { "egrave",  LATIN1_egrave, "e`"         },
170   { "empty",   0,             "{}"         },
171   { "epsilon", 0,             "e"          },
172   { "equiv",   0,             "=="         },
173   { "eta",     0,             "e"          },
174   { "eth",     LATIN1_eth,    "d-"         },
175   { "euml",    LATIN1_euml,   "e\""        },
176   { "euro",    0,             "EUR"        },
177   { "exist",   0,             "TE"         },
178   { "fnof",    0,             "f"          },
179   { "forall",  0,             "FA"         },
180   { "frac12",  LATIN1_frac12, " 1/2"       },
181   { "frac14",  LATIN1_frac14, " 1/4"       },
182   { "frac34",  LATIN1_frac34, " 3/4"       },
183   { "frasl",   0,             "/"          },
184   { "gamma",   0,             "g"          },
185   { "ge",      0,             ">="         },
186   { "gt",      0,             ">"          },
187   { "hArr",    0,             "<=>"        },
188   { "harr",    0,             "<->"        },
189 //  { "hearts",  0,             "[hearts]"   },
190   { "hellip",  0,             "..."        },
191   { "iacute",  LATIN1_iacute, "i'"         },
192   { "icirc",   LATIN1_icirc,  "i^"         },
193   { "iexcl",   LATIN1_iexcl,  "!"          },
194   { "igrave",  LATIN1_igrave, "i`"         },
195   { "image",   0,             "Im"         },
196   { "infin",   0,             "oo"         },
197   { "int",     0,             "INT"        },
198   { "iota",    0,             "i"          },
199   { "iquest",  LATIN1_iquest, "?"          },
200   { "isin",    0,             "(-"         },
201   { "iuml",    LATIN1_iuml,   "i\""        },
202   { "kappa",   0,             "k"          },
203   { "lArr",    0,             "<="         },
204   { "lambda",  0,             "l"          },
205   { "lang",    0,             "</"         },
206   { "laquo",   LATIN1_laquo,  "<<"         },
207   { "larr",    0,             "<-"         },
208 //  { "lceil",   0,             "<|"         },
209   { "ldquo",   0,             "\""         },
210   { "le",      0,             "<="         },
211 //  { "lfloor",  0,             "|<"         },
212   { "lowast",  0,             "*"          },
213   { "loz",     0,             "<>"         },
214   { "lsaquo",  0,             "<"          },
215   { "lsquo",   0,             "`"          },
216   { "lt",      0,             "<"          },
217   { "macr",    LATIN1_macr,   "-"          },
218   { "mdash",   0,             "--"         },
219   { "micro",   LATIN1_micro,  "my"         },
220   { "middot",  LATIN1_middot, "."          },
221   { "minus",   0,             "-"          },
222   { "mu",      0,             "m"          },
223   { "nabla",   0,             "Nabla"      },
224   { "nbsp",    LATIN1_nbsp,   " "          },
225   { "ndash",   0,             "-"          },
226   { "ne",      0,             "!="         },
227   { "ni",      0,             "-)"         },
228   { "not",     LATIN1_not,    "NOT"        },
229   { "notin",   0,             "!(-"        },
230   { "nsub",    0,             "!(C"        },
231   { "ntilde",  LATIN1_ntilde, "n~"         },
232   { "nu",      0,             "n"          },
233   { "oacute",  LATIN1_oacute, "o'"         },
234   { "ocirc",   LATIN1_ocirc,  "o^"         },
235   { "oelig",   0,             "oe"         },
236   { "ograve",  LATIN1_ograve, "o`"         },
237   { "oline",   LATIN1_macr,   "-"          },
238   { "omega",   0,             "o"          },
239   { "omicron", 0,             "o"          },
240   { "oplus",   0,             "(+)"        },
241   { "or",      0,             "OR"         },
242   { "ordf",    LATIN1_ordf,   "-a"         },
243   { "ordm",    LATIN1_ordm,   "-o"         },
244   { "oslash",  LATIN1_oslash, "o/"         },
245   { "otilde",  LATIN1_otilde, "o~"         },
246   { "otimes",  0,             "(x)"        },
247   { "ouml",    LATIN1_ouml,   "o\""        },
248   { "para",    LATIN1_para,   "P:"         },
249   { "part",    0,             "PART"       },
250   { "permil",  0,             " 0/00"      },
251   { "perp",    0,             "-T"         },
252   { "phi",     0,             "f"          },
253   { "pi",      0,             "p"          },
254   { "piv",     0,             "Pi"         },
255   { "plusmn",  LATIN1_plusmn, "+/-"        },
256   { "pound",   LATIN1_pound,  "-L-"        },
257   { "prime",   0,             "'"          },
258   { "prod",    0,             "PROD"       },
259   { "prop",    0,             "0("         },
260   { "psi",     0,             "ps"         },
261   { "quot",    0,             "\""         },
262   { "rArr",    0,             "=>"         },
263   { "radic",   0,             "SQRT"       },
264   { "rang",    0,             "/>"         },
265   { "raquo",   LATIN1_raquo,  ">>"         },
266   { "rarr",    0,             "->"         },
267 //  { "rceil",   0,             ">|"         },
268   { "rdquo",   0,             "\""         },
269   { "real",    0,             "Re"         },
270   { "reg",     LATIN1_reg,    "(R)"        },
271 //  { "rfloor",  0,             "|>"         },
272   { "rho",     0,             "r"          },
273   { "rsaquo",  0,             ">"          },
274   { "rsquo",   0,             "'"          },
275   { "sbquo",   0,             "'"          },
276   { "scaron",  0,             "s"          },
277   { "sdot",    0,             "DOT"        },
278   { "sect",    LATIN1_sect,   "S:"         },
279   { "shy",     LATIN1_shy,    ""           },
280   { "sigma",   0,             "s"          },
281   { "sigmaf",  0,             "s"          },
282   { "sim",     0,             "~"          },
283 //  { "spades",  0,             "[spades]"   },
284   { "sub",     0,             "(C"         },
285   { "sube",    0,             "(_"         },
286   { "sum",     0,             "SUM"        },
287   { "sup",     0,             ")C"         },
288   { "sup1",    LATIN1_sup1,   "^1"         },
289   { "sup2",    LATIN1_sup2,   "^2"         },
290   { "sup3",    LATIN1_sup3,   "^3"         },
291   { "supe",    0,             ")_"         },
292   { "szlig",   LATIN1_szlig,  "ss"         },
293   { "tau",     0,             "t"          },
294   { "there4",  0,             ".:"         },
295   { "theta",   0,             "th"         },
296   { "thorn",   LATIN1_thorn,  "th"         },
297   { "tilde",   0,             "~"          },
298   { "times",   LATIN1_times,  "x"          },
299   { "trade",   0,             "[TM]"       },
300   { "uArr",    0,             "^^"         },
301   { "uacute",  LATIN1_uacute, "u'"         },
302   { "uarr",    0,             "^"          },
303   { "ucirc",   LATIN1_ucirc,  "u^"         },
304   { "ugrave",  LATIN1_ugrave, "u`"         },
305   { "uml",     LATIN1_uml,    "\""         },
306   { "upsilon", 0,             "u"          },
307   { "uuml",    LATIN1_uuml,   "u\""        },
308   { "weierp",  0,             "P"          },
309   { "xi",      0,             "x"          },
310   { "yacute",  LATIN1_yacute, "y'"         },
311   { "yen",     LATIN1_yen,    "YEN"        },
312   { "yuml",    LATIN1_yuml,   "y\""        },
313   { "zeta",    0,             "z"          },
314 };
315 
316 extern int use_iso8859;
317 
318 /* ------------------------------------------------------------------------- */
319 
320 void
replace_sgml_entities(string * s)321 replace_sgml_entities(string *s)
322 {
323   string::size_type j = 0;
324 
325   for (;;) {
326     string::size_type l = s->length();
327 
328     /*
329      * Skip characters before ampersand.
330      */
331     while (j < l && s->at(j) != '&') ++j;
332     /*
333      * We could convert high-bit chars to "&#233;" here if use_iso8859
334      * is off, then let them be translated or not.  Is the purpose of
335      * !use_iso8859 to allow SGML entities to be seen, or to strongly
336      * filter against high-ASCII chars that might blow up a terminal
337      * that doesn't speak ISO8859?  For the moment, "allow SGML entities
338      * to be seen" -- no filtering here.
339      */
340     if (j >= l) break;
341 
342     /*
343      * So we have an ampersand...
344      */
345 
346     /*
347      * Don't process the last three characters; an SGML entity wouldn't fit
348      * in anyway!
349      */
350     if (j + 3 >= l) break;          // Watch out! Unsigned arithmetics!
351 
352     string::size_type beg = j++;    // Skip the ampersand;
353 
354     /*
355      * Look at the next character.
356      */
357     char c = s->at(j++);
358     if (c == '#') {
359 
360       /*
361        * Decode entities like "&#233;".
362        * Some authors forget the ";", but we tolerate this.
363        */
364       c = s->at(j++);
365       if (isdigit(c)) {
366         int x = c - '0';
367         for (; j < l; ++j) {
368           c = s->at(j);
369           if (c == ';') { ++j; break; }
370           if (!isdigit(c)) break;
371           x = 10 * x + c - '0';
372         }
373         if (use_iso8859 || (x < 128)) {
374         s->replace(beg, j - beg, 1, (char) x);
375         j = beg + 1;
376         } else {
377           for (int i = 0; i < nelems(entities); i++) {
378             if (x == entities[i].iso8859code) {
379               s->replace(beg, j - beg, entities[i].asciistr);
380               break;
381             }
382           }
383           /* else don't replace it at all, we don't have a translation */
384         }
385       }
386     } else
387 
388     if (isalpha(c)) {
389 
390       /*
391        * Decode entities like "&nbsp;".
392        * Some authors forget the ";", but we tolerate this.
393        */
394       char name[8];
395       name[0] = c;
396       size_t i = 1;
397       for (; j < l; ++j) {
398         c = s->at(j);
399         if (c == ';') { ++j; break; }
400         if (!isalnum(c)) break;
401         if (i < sizeof(name) - 1) name[i++] = c;
402       }
403       name[i] = '\0';
404 
405       const TextToInt *entity = (const TextToInt *) bsearch(
406         name,
407         entities, nelems(entities), sizeof(TextToInt),
408         (int (*)(const void *, const void *)) strcmp
409       );
410       if (entity != NULL) {
411         if (use_iso8859 && entity->iso8859code) {
412           s->replace(beg, j - beg, 1, (char) entity->iso8859code);
413           j = beg + 1;
414         } else if (entity->asciistr) {
415           s->replace(beg, j - beg, entity->asciistr);
416         j = beg + 1;
417         } /* else don't replace it at all, we don't have a translation */
418       }
419     } else {
420       ;                         /* EXTENSION: Allow literal '&' sometimes. */
421     }
422   }
423 }
424 
425 /* ------------------------------------------------------------------------- */
426 
427