1 /* entities.c -- recognize HTML ISO entities
2 
3   (c) 1998-2004 (W3C) MIT, ERCIM, Keio University
4   See tidy.h for the copyright notice.
5 
6   CVS Info :
7 
8     $Author: terry_teague $
9     $Date: 2004/08/02 02:25:13 $
10     $Revision: 1.15 $
11 
12   Entity handling can be static because there are no config or
13   document-specific values.  Lookup table is 100% defined at
14   compile time.
15 
16 */
17 
#include <errno.h>
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>

#include "entities.h"
#include "tidy-int.h"
#include "tmbstr.h"
22 
23 struct _entity;
24 typedef struct _entity entity;
25 
26 struct _entity
27 {
28     ctmbstr name;
29     uint    versions;
30     uint    code;
31 };
32 
33 
34 static const entity entities[] =
35 {
36     /*
37     ** Markup pre-defined character entities
38     */
39     { "quot",    VERS_ALL|VERS_XML,    34 },
40     { "amp",     VERS_ALL|VERS_XML,    38 },
41     { "apos",    VERS_FROM40|VERS_XML, 39 },
42     { "lt",      VERS_ALL|VERS_XML,    60 },
43     { "gt",      VERS_ALL|VERS_XML,    62 },
44 
45     /*
46     ** Latin-1 character entities
47     */
48     { "nbsp",     VERS_ALL,      160 },
49     { "iexcl",    VERS_ALL,      161 },
50     { "cent",     VERS_ALL,      162 },
51     { "pound",    VERS_ALL,      163 },
52     { "curren",   VERS_ALL,      164 },
53     { "yen",      VERS_ALL,      165 },
54     { "brvbar",   VERS_ALL,      166 },
55     { "sect",     VERS_ALL,      167 },
56     { "uml",      VERS_ALL,      168 },
57     { "copy",     VERS_ALL,      169 },
58     { "ordf",     VERS_ALL,      170 },
59     { "laquo",    VERS_ALL,      171 },
60     { "not",      VERS_ALL,      172 },
61     { "shy",      VERS_ALL,      173 },
62     { "reg",      VERS_ALL,      174 },
63     { "macr",     VERS_ALL,      175 },
64     { "deg",      VERS_ALL,      176 },
65     { "plusmn",   VERS_ALL,      177 },
66     { "sup2",     VERS_ALL,      178 },
67     { "sup3",     VERS_ALL,      179 },
68     { "acute",    VERS_ALL,      180 },
69     { "micro",    VERS_ALL,      181 },
70     { "para",     VERS_ALL,      182 },
71     { "middot",   VERS_ALL,      183 },
72     { "cedil",    VERS_ALL,      184 },
73     { "sup1",     VERS_ALL,      185 },
74     { "ordm",     VERS_ALL,      186 },
75     { "raquo",    VERS_ALL,      187 },
76     { "frac14",   VERS_ALL,      188 },
77     { "frac12",   VERS_ALL,      189 },
78     { "frac34",   VERS_ALL,      190 },
79     { "iquest",   VERS_ALL,      191 },
80     { "Agrave",   VERS_ALL,      192 },
81     { "Aacute",   VERS_ALL,      193 },
82     { "Acirc",    VERS_ALL,      194 },
83     { "Atilde",   VERS_ALL,      195 },
84     { "Auml",     VERS_ALL,      196 },
85     { "Aring",    VERS_ALL,      197 },
86     { "AElig",    VERS_ALL,      198 },
87     { "Ccedil",   VERS_ALL,      199 },
88     { "Egrave",   VERS_ALL,      200 },
89     { "Eacute",   VERS_ALL,      201 },
90     { "Ecirc",    VERS_ALL,      202 },
91     { "Euml",     VERS_ALL,      203 },
92     { "Igrave",   VERS_ALL,      204 },
93     { "Iacute",   VERS_ALL,      205 },
94     { "Icirc",    VERS_ALL,      206 },
95     { "Iuml",     VERS_ALL,      207 },
96     { "ETH",      VERS_ALL,      208 },
97     { "Ntilde",   VERS_ALL,      209 },
98     { "Ograve",   VERS_ALL,      210 },
99     { "Oacute",   VERS_ALL,      211 },
100     { "Ocirc",    VERS_ALL,      212 },
101     { "Otilde",   VERS_ALL,      213 },
102     { "Ouml",     VERS_ALL,      214 },
103     { "times",    VERS_ALL,      215 },
104     { "Oslash",   VERS_ALL,      216 },
105     { "Ugrave",   VERS_ALL,      217 },
106     { "Uacute",   VERS_ALL,      218 },
107     { "Ucirc",    VERS_ALL,      219 },
108     { "Uuml",     VERS_ALL,      220 },
109     { "Yacute",   VERS_ALL,      221 },
110     { "THORN",    VERS_ALL,      222 },
111     { "szlig",    VERS_ALL,      223 },
112     { "agrave",   VERS_ALL,      224 },
113     { "aacute",   VERS_ALL,      225 },
114     { "acirc",    VERS_ALL,      226 },
115     { "atilde",   VERS_ALL,      227 },
116     { "auml",     VERS_ALL,      228 },
117     { "aring",    VERS_ALL,      229 },
118     { "aelig",    VERS_ALL,      230 },
119     { "ccedil",   VERS_ALL,      231 },
120     { "egrave",   VERS_ALL,      232 },
121     { "eacute",   VERS_ALL,      233 },
122     { "ecirc",    VERS_ALL,      234 },
123     { "euml",     VERS_ALL,      235 },
124     { "igrave",   VERS_ALL,      236 },
125     { "iacute",   VERS_ALL,      237 },
126     { "icirc",    VERS_ALL,      238 },
127     { "iuml",     VERS_ALL,      239 },
128     { "eth",      VERS_ALL,      240 },
129     { "ntilde",   VERS_ALL,      241 },
130     { "ograve",   VERS_ALL,      242 },
131     { "oacute",   VERS_ALL,      243 },
132     { "ocirc",    VERS_ALL,      244 },
133     { "otilde",   VERS_ALL,      245 },
134     { "ouml",     VERS_ALL,      246 },
135     { "divide",   VERS_ALL,      247 },
136     { "oslash",   VERS_ALL,      248 },
137     { "ugrave",   VERS_ALL,      249 },
138     { "uacute",   VERS_ALL,      250 },
139     { "ucirc",    VERS_ALL,      251 },
140     { "uuml",     VERS_ALL,      252 },
141     { "yacute",   VERS_ALL,      253 },
142     { "thorn",    VERS_ALL,      254 },
143     { "yuml",     VERS_ALL,      255 },
144 
145     /*
146     ** Extended Entities defined in HTML 4: Symbols
147     */
148     { "fnof",     VERS_FROM40,   402 },
149     { "Alpha",    VERS_FROM40,   913 },
150     { "Beta",     VERS_FROM40,   914 },
151     { "Gamma",    VERS_FROM40,   915 },
152     { "Delta",    VERS_FROM40,   916 },
153     { "Epsilon",  VERS_FROM40,   917 },
154     { "Zeta",     VERS_FROM40,   918 },
155     { "Eta",      VERS_FROM40,   919 },
156     { "Theta",    VERS_FROM40,   920 },
157     { "Iota",     VERS_FROM40,   921 },
158     { "Kappa",    VERS_FROM40,   922 },
159     { "Lambda",   VERS_FROM40,   923 },
160     { "Mu",       VERS_FROM40,   924 },
161     { "Nu",       VERS_FROM40,   925 },
162     { "Xi",       VERS_FROM40,   926 },
163     { "Omicron",  VERS_FROM40,   927 },
164     { "Pi",       VERS_FROM40,   928 },
165     { "Rho",      VERS_FROM40,   929 },
166     { "Sigma",    VERS_FROM40,   931 },
167     { "Tau",      VERS_FROM40,   932 },
168     { "Upsilon",  VERS_FROM40,   933 },
169     { "Phi",      VERS_FROM40,   934 },
170     { "Chi",      VERS_FROM40,   935 },
171     { "Psi",      VERS_FROM40,   936 },
172     { "Omega",    VERS_FROM40,   937 },
173     { "alpha",    VERS_FROM40,   945 },
174     { "beta",     VERS_FROM40,   946 },
175     { "gamma",    VERS_FROM40,   947 },
176     { "delta",    VERS_FROM40,   948 },
177     { "epsilon",  VERS_FROM40,   949 },
178     { "zeta",     VERS_FROM40,   950 },
179     { "eta",      VERS_FROM40,   951 },
180     { "theta",    VERS_FROM40,   952 },
181     { "iota",     VERS_FROM40,   953 },
182     { "kappa",    VERS_FROM40,   954 },
183     { "lambda",   VERS_FROM40,   955 },
184     { "mu",       VERS_FROM40,   956 },
185     { "nu",       VERS_FROM40,   957 },
186     { "xi",       VERS_FROM40,   958 },
187     { "omicron",  VERS_FROM40,   959 },
188     { "pi",       VERS_FROM40,   960 },
189     { "rho",      VERS_FROM40,   961 },
190     { "sigmaf",   VERS_FROM40,   962 },
191     { "sigma",    VERS_FROM40,   963 },
192     { "tau",      VERS_FROM40,   964 },
193     { "upsilon",  VERS_FROM40,   965 },
194     { "phi",      VERS_FROM40,   966 },
195     { "chi",      VERS_FROM40,   967 },
196     { "psi",      VERS_FROM40,   968 },
197     { "omega",    VERS_FROM40,   969 },
198     { "thetasym", VERS_FROM40,   977 },
199     { "upsih",    VERS_FROM40,   978 },
200     { "piv",      VERS_FROM40,   982 },
201     { "bull",     VERS_FROM40,  8226 },
202     { "hellip",   VERS_FROM40,  8230 },
203     { "prime",    VERS_FROM40,  8242 },
204     { "Prime",    VERS_FROM40,  8243 },
205     { "oline",    VERS_FROM40,  8254 },
206     { "frasl",    VERS_FROM40,  8260 },
207     { "weierp",   VERS_FROM40,  8472 },
208     { "image",    VERS_FROM40,  8465 },
209     { "real",     VERS_FROM40,  8476 },
210     { "trade",    VERS_FROM40,  8482 },
211     { "alefsym",  VERS_FROM40,  8501 },
212     { "larr",     VERS_FROM40,  8592 },
213     { "uarr",     VERS_FROM40,  8593 },
214     { "rarr",     VERS_FROM40,  8594 },
215     { "darr",     VERS_FROM40,  8595 },
216     { "harr",     VERS_FROM40,  8596 },
217     { "crarr",    VERS_FROM40,  8629 },
218     { "lArr",     VERS_FROM40,  8656 },
219     { "uArr",     VERS_FROM40,  8657 },
220     { "rArr",     VERS_FROM40,  8658 },
221     { "dArr",     VERS_FROM40,  8659 },
222     { "hArr",     VERS_FROM40,  8660 },
223     { "forall",   VERS_FROM40,  8704 },
224     { "part",     VERS_FROM40,  8706 },
225     { "exist",    VERS_FROM40,  8707 },
226     { "empty",    VERS_FROM40,  8709 },
227     { "nabla",    VERS_FROM40,  8711 },
228     { "isin",     VERS_FROM40,  8712 },
229     { "notin",    VERS_FROM40,  8713 },
230     { "ni",       VERS_FROM40,  8715 },
231     { "prod",     VERS_FROM40,  8719 },
232     { "sum",      VERS_FROM40,  8721 },
233     { "minus",    VERS_FROM40,  8722 },
234     { "lowast",   VERS_FROM40,  8727 },
235     { "radic",    VERS_FROM40,  8730 },
236     { "prop",     VERS_FROM40,  8733 },
237     { "infin",    VERS_FROM40,  8734 },
238     { "ang",      VERS_FROM40,  8736 },
239     { "and",      VERS_FROM40,  8743 },
240     { "or",       VERS_FROM40,  8744 },
241     { "cap",      VERS_FROM40,  8745 },
242     { "cup",      VERS_FROM40,  8746 },
243     { "int",      VERS_FROM40,  8747 },
244     { "there4",   VERS_FROM40,  8756 },
245     { "sim",      VERS_FROM40,  8764 },
246     { "cong",     VERS_FROM40,  8773 },
247     { "asymp",    VERS_FROM40,  8776 },
248     { "ne",       VERS_FROM40,  8800 },
249     { "equiv",    VERS_FROM40,  8801 },
250     { "le",       VERS_FROM40,  8804 },
251     { "ge",       VERS_FROM40,  8805 },
252     { "sub",      VERS_FROM40,  8834 },
253     { "sup",      VERS_FROM40,  8835 },
254     { "nsub",     VERS_FROM40,  8836 },
255     { "sube",     VERS_FROM40,  8838 },
256     { "supe",     VERS_FROM40,  8839 },
257     { "oplus",    VERS_FROM40,  8853 },
258     { "otimes",   VERS_FROM40,  8855 },
259     { "perp",     VERS_FROM40,  8869 },
260     { "sdot",     VERS_FROM40,  8901 },
261     { "lceil",    VERS_FROM40,  8968 },
262     { "rceil",    VERS_FROM40,  8969 },
263     { "lfloor",   VERS_FROM40,  8970 },
264     { "rfloor",   VERS_FROM40,  8971 },
265     { "lang",     VERS_FROM40,  9001 },
266     { "rang",     VERS_FROM40,  9002 },
267     { "loz",      VERS_FROM40,  9674 },
268     { "spades",   VERS_FROM40,  9824 },
269     { "clubs",    VERS_FROM40,  9827 },
270     { "hearts",   VERS_FROM40,  9829 },
271     { "diams",    VERS_FROM40,  9830 },
272 
273     /*
274     ** Extended Entities defined in HTML 4: Special (less Markup at top)
275     */
276     { "OElig",    VERS_FROM40,   338 },
277     { "oelig",    VERS_FROM40,   339 },
278     { "Scaron",   VERS_FROM40,   352 },
279     { "scaron",   VERS_FROM40,   353 },
280     { "Yuml",     VERS_FROM40,   376 },
281     { "circ",     VERS_FROM40,   710 },
282     { "tilde",    VERS_FROM40,   732 },
283     { "ensp",     VERS_FROM40,  8194 },
284     { "emsp",     VERS_FROM40,  8195 },
285     { "thinsp",   VERS_FROM40,  8201 },
286     { "zwnj",     VERS_FROM40,  8204 },
287     { "zwj",      VERS_FROM40,  8205 },
288     { "lrm",      VERS_FROM40,  8206 },
289     { "rlm",      VERS_FROM40,  8207 },
290     { "ndash",    VERS_FROM40,  8211 },
291     { "mdash",    VERS_FROM40,  8212 },
292     { "lsquo",    VERS_FROM40,  8216 },
293     { "rsquo",    VERS_FROM40,  8217 },
294     { "sbquo",    VERS_FROM40,  8218 },
295     { "ldquo",    VERS_FROM40,  8220 },
296     { "rdquo",    VERS_FROM40,  8221 },
297     { "bdquo",    VERS_FROM40,  8222 },
298     { "dagger",   VERS_FROM40,  8224 },
299     { "Dagger",   VERS_FROM40,  8225 },
300     { "permil",   VERS_FROM40,  8240 },
301     { "lsaquo",   VERS_FROM40,  8249 },
302     { "rsaquo",   VERS_FROM40,  8250 },
303     { "euro",     VERS_FROM40,  8364 },
304     { NULL,       0,               0 }
305 };
306 
307 
308 /* Pure static implementation.  Trades off lookup speed
309 ** for faster setup time (well, none actually).
310 ** Optimization of comparing 1st character buys enough
311 ** speed that hash doesn't improve things without > 500
312 ** items in list.
313 */
lookup(ctmbstr s)314 static const entity* lookup( ctmbstr s )
315 {
316     tmbchar ch = (tmbchar)( s ? *s : 0 );
317     const entity *np;
318     for ( np = entities; ch && np && np->name; ++np )
319         if ( ch == *np->name && tmbstrcmp(s, np->name) == 0 )
320             return np;
321     return NULL;
322 }
323 
324 /* entity starting with "&" returns zero on error */
EntityCode(ctmbstr name,uint versions)325 uint EntityCode( ctmbstr name, uint versions )
326 {
327     const entity* np;
328     assert( name && name[0] == '&' );
329 
330     /* numeric entitity: name = "&#" followed by number */
331     if ( name[1] == '#' )
332     {
333         uint c = 0;  /* zero on missing/bad number */
334         Bool isXml = ( (versions & VERS_XML) == VERS_XML );
335 
336         /* 'x' prefix denotes hexadecimal number format */
337         if ( name[2] == 'x' || (!isXml && name[2] == 'X') )
338             sscanf( name+3, "%x", &c );
339         else
340             sscanf( name+2, "%u", &c );
341 
342         return (uint) c;
343     }
344 
345    /* Named entity: name ="&" followed by a name */
346     if ( NULL != (np = lookup(name+1)) )
347     {
348         /* Only recognize entity name if version supports it.  */
349         if ( np->versions & versions )
350             return np->code;
351     }
352 
353     return 0;   /* zero signifies unknown entity name */
354 }
355 
/* Report the character code and version mask of an entity reference.
**
** name     - the reference, which must start with '&' (asserted).
** isXml    - when set, an upper-case "&#X" prefix is not treated
**            as hexadecimal.
** code     - out: character code; zero for an unknown name or for a
**            missing, malformed or out-of-range number.  Must not
**            be NULL (asserted).
** versions - out: VERS_ALL for any numeric reference, the table
**            row's mask for a known name, otherwise VERS_XML when
**            isXml is set and VERS_PROPRIETARY when it is not.
**            Must not be NULL (asserted).
**
** Returns yes for every numeric reference and for known names,
** no for unknown names.
*/
Bool EntityInfo( ctmbstr name, Bool isXml, uint* code, uint* versions )
{
    const entity* np;
    assert( name && name[0] == '&' );
    assert( code != NULL );
    assert( versions != NULL );

    /* numeric entity: name = "&#" followed by number */
    if ( name[1] == '#' )
    {
        /* 'x' prefix denotes hexadecimal number format */
        Bool isHex = ( name[2] == 'x' || (!isXml && name[2] == 'X') );
        unsigned long val;

        /* strtoul() has defined behaviour for numbers that are too
        ** large (sscanf() with %x/%u does not); such numbers are
        ** reported as code zero, like any other bad number.
        */
        errno = 0;
        val = strtoul( name + (isHex ? 3 : 2), NULL, isHex ? 16 : 10 );
        if ( errno == ERANGE || val > UINT_MAX )
            val = 0;

        *code = (uint) val;
        *versions = VERS_ALL;
        return yes;
    }

    /* Named entity: name = "&" followed by a name */
    if ( NULL != (np = lookup(name+1)) )
    {
        *code = np->code;
        *versions = np->versions;
        return yes;
    }

    *code = 0;
    *versions = ( isXml ? VERS_XML : VERS_PROPRIETARY );
    return no;
}
391 
392 
EntityName(uint ch,uint versions)393 ctmbstr EntityName( uint ch, uint versions )
394 {
395     ctmbstr entnam = NULL;
396     const entity *ep;
397 
398     for ( ep = entities; ep->name != NULL; ++ep )
399     {
400         if ( ep->code == ch )
401         {
402             if ( ep->versions & versions )
403                 entnam = ep->name;
404             break; /* Found code. Stop search. */
405         }
406     }
407     return entnam;
408 }
409