1 /* entities.c -- recognize HTML ISO entities
2
3 (c) 1998-2004 (W3C) MIT, ERCIM, Keio University
4 See tidy.h for the copyright notice.
5
6 CVS Info :
7
8 $Author: terry_teague $
9 $Date: 2004/08/02 02:25:13 $
10 $Revision: 1.15 $
11
12 Entity handling can be static because there are no config or
13 document-specific values. Lookup table is 100% defined at
14 compile time.
15
16 */
17
18 #include <stdio.h>
19 #include "entities.h"
20 #include "tidy-int.h"
21 #include "tmbstr.h"
22
23 struct _entity;
24 typedef struct _entity entity;
25
26 struct _entity
27 {
28 ctmbstr name;
29 uint versions;
30 uint code;
31 };
32
33
/* Compile-time table of every named entity known to this file.
**
** Each row is { name, versions, code }:
**   name     - entity name without the leading '&'
**   versions - VERS_* bit mask; EntityCode() and EntityName() only
**              accept a row when this mask intersects the versions
**              mask supplied by the caller
**   code     - numeric character code the entity stands for
**
** The table ends with a { NULL, 0, 0 } sentinel row; every loop over
** the table stops when it reaches a NULL name.
**
** Row order is significant: lookup() returns the first row whose name
** matches and EntityName() stops at the first row whose code matches.
*/
static const entity entities[] =
{
    /*
    ** Markup pre-defined character entities
    */
    { "quot", VERS_ALL|VERS_XML, 34 },
    { "amp", VERS_ALL|VERS_XML, 38 },
    { "apos", VERS_FROM40|VERS_XML, 39 },
    { "lt", VERS_ALL|VERS_XML, 60 },
    { "gt", VERS_ALL|VERS_XML, 62 },

    /*
    ** Latin-1 character entities
    */
    { "nbsp", VERS_ALL, 160 },
    { "iexcl", VERS_ALL, 161 },
    { "cent", VERS_ALL, 162 },
    { "pound", VERS_ALL, 163 },
    { "curren", VERS_ALL, 164 },
    { "yen", VERS_ALL, 165 },
    { "brvbar", VERS_ALL, 166 },
    { "sect", VERS_ALL, 167 },
    { "uml", VERS_ALL, 168 },
    { "copy", VERS_ALL, 169 },
    { "ordf", VERS_ALL, 170 },
    { "laquo", VERS_ALL, 171 },
    { "not", VERS_ALL, 172 },
    { "shy", VERS_ALL, 173 },
    { "reg", VERS_ALL, 174 },
    { "macr", VERS_ALL, 175 },
    { "deg", VERS_ALL, 176 },
    { "plusmn", VERS_ALL, 177 },
    { "sup2", VERS_ALL, 178 },
    { "sup3", VERS_ALL, 179 },
    { "acute", VERS_ALL, 180 },
    { "micro", VERS_ALL, 181 },
    { "para", VERS_ALL, 182 },
    { "middot", VERS_ALL, 183 },
    { "cedil", VERS_ALL, 184 },
    { "sup1", VERS_ALL, 185 },
    { "ordm", VERS_ALL, 186 },
    { "raquo", VERS_ALL, 187 },
    { "frac14", VERS_ALL, 188 },
    { "frac12", VERS_ALL, 189 },
    { "frac34", VERS_ALL, 190 },
    { "iquest", VERS_ALL, 191 },
    { "Agrave", VERS_ALL, 192 },
    { "Aacute", VERS_ALL, 193 },
    { "Acirc", VERS_ALL, 194 },
    { "Atilde", VERS_ALL, 195 },
    { "Auml", VERS_ALL, 196 },
    { "Aring", VERS_ALL, 197 },
    { "AElig", VERS_ALL, 198 },
    { "Ccedil", VERS_ALL, 199 },
    { "Egrave", VERS_ALL, 200 },
    { "Eacute", VERS_ALL, 201 },
    { "Ecirc", VERS_ALL, 202 },
    { "Euml", VERS_ALL, 203 },
    { "Igrave", VERS_ALL, 204 },
    { "Iacute", VERS_ALL, 205 },
    { "Icirc", VERS_ALL, 206 },
    { "Iuml", VERS_ALL, 207 },
    { "ETH", VERS_ALL, 208 },
    { "Ntilde", VERS_ALL, 209 },
    { "Ograve", VERS_ALL, 210 },
    { "Oacute", VERS_ALL, 211 },
    { "Ocirc", VERS_ALL, 212 },
    { "Otilde", VERS_ALL, 213 },
    { "Ouml", VERS_ALL, 214 },
    { "times", VERS_ALL, 215 },
    { "Oslash", VERS_ALL, 216 },
    { "Ugrave", VERS_ALL, 217 },
    { "Uacute", VERS_ALL, 218 },
    { "Ucirc", VERS_ALL, 219 },
    { "Uuml", VERS_ALL, 220 },
    { "Yacute", VERS_ALL, 221 },
    { "THORN", VERS_ALL, 222 },
    { "szlig", VERS_ALL, 223 },
    { "agrave", VERS_ALL, 224 },
    { "aacute", VERS_ALL, 225 },
    { "acirc", VERS_ALL, 226 },
    { "atilde", VERS_ALL, 227 },
    { "auml", VERS_ALL, 228 },
    { "aring", VERS_ALL, 229 },
    { "aelig", VERS_ALL, 230 },
    { "ccedil", VERS_ALL, 231 },
    { "egrave", VERS_ALL, 232 },
    { "eacute", VERS_ALL, 233 },
    { "ecirc", VERS_ALL, 234 },
    { "euml", VERS_ALL, 235 },
    { "igrave", VERS_ALL, 236 },
    { "iacute", VERS_ALL, 237 },
    { "icirc", VERS_ALL, 238 },
    { "iuml", VERS_ALL, 239 },
    { "eth", VERS_ALL, 240 },
    { "ntilde", VERS_ALL, 241 },
    { "ograve", VERS_ALL, 242 },
    { "oacute", VERS_ALL, 243 },
    { "ocirc", VERS_ALL, 244 },
    { "otilde", VERS_ALL, 245 },
    { "ouml", VERS_ALL, 246 },
    { "divide", VERS_ALL, 247 },
    { "oslash", VERS_ALL, 248 },
    { "ugrave", VERS_ALL, 249 },
    { "uacute", VERS_ALL, 250 },
    { "ucirc", VERS_ALL, 251 },
    { "uuml", VERS_ALL, 252 },
    { "yacute", VERS_ALL, 253 },
    { "thorn", VERS_ALL, 254 },
    { "yuml", VERS_ALL, 255 },

    /*
    ** Extended Entities defined in HTML 4: Symbols
    */
    { "fnof", VERS_FROM40, 402 },
    { "Alpha", VERS_FROM40, 913 },
    { "Beta", VERS_FROM40, 914 },
    { "Gamma", VERS_FROM40, 915 },
    { "Delta", VERS_FROM40, 916 },
    { "Epsilon", VERS_FROM40, 917 },
    { "Zeta", VERS_FROM40, 918 },
    { "Eta", VERS_FROM40, 919 },
    { "Theta", VERS_FROM40, 920 },
    { "Iota", VERS_FROM40, 921 },
    { "Kappa", VERS_FROM40, 922 },
    { "Lambda", VERS_FROM40, 923 },
    { "Mu", VERS_FROM40, 924 },
    { "Nu", VERS_FROM40, 925 },
    { "Xi", VERS_FROM40, 926 },
    { "Omicron", VERS_FROM40, 927 },
    { "Pi", VERS_FROM40, 928 },
    { "Rho", VERS_FROM40, 929 },
    { "Sigma", VERS_FROM40, 931 },
    { "Tau", VERS_FROM40, 932 },
    { "Upsilon", VERS_FROM40, 933 },
    { "Phi", VERS_FROM40, 934 },
    { "Chi", VERS_FROM40, 935 },
    { "Psi", VERS_FROM40, 936 },
    { "Omega", VERS_FROM40, 937 },
    { "alpha", VERS_FROM40, 945 },
    { "beta", VERS_FROM40, 946 },
    { "gamma", VERS_FROM40, 947 },
    { "delta", VERS_FROM40, 948 },
    { "epsilon", VERS_FROM40, 949 },
    { "zeta", VERS_FROM40, 950 },
    { "eta", VERS_FROM40, 951 },
    { "theta", VERS_FROM40, 952 },
    { "iota", VERS_FROM40, 953 },
    { "kappa", VERS_FROM40, 954 },
    { "lambda", VERS_FROM40, 955 },
    { "mu", VERS_FROM40, 956 },
    { "nu", VERS_FROM40, 957 },
    { "xi", VERS_FROM40, 958 },
    { "omicron", VERS_FROM40, 959 },
    { "pi", VERS_FROM40, 960 },
    { "rho", VERS_FROM40, 961 },
    { "sigmaf", VERS_FROM40, 962 },
    { "sigma", VERS_FROM40, 963 },
    { "tau", VERS_FROM40, 964 },
    { "upsilon", VERS_FROM40, 965 },
    { "phi", VERS_FROM40, 966 },
    { "chi", VERS_FROM40, 967 },
    { "psi", VERS_FROM40, 968 },
    { "omega", VERS_FROM40, 969 },
    { "thetasym", VERS_FROM40, 977 },
    { "upsih", VERS_FROM40, 978 },
    { "piv", VERS_FROM40, 982 },
    { "bull", VERS_FROM40, 8226 },
    { "hellip", VERS_FROM40, 8230 },
    { "prime", VERS_FROM40, 8242 },
    { "Prime", VERS_FROM40, 8243 },
    { "oline", VERS_FROM40, 8254 },
    { "frasl", VERS_FROM40, 8260 },
    { "weierp", VERS_FROM40, 8472 },
    { "image", VERS_FROM40, 8465 },
    { "real", VERS_FROM40, 8476 },
    { "trade", VERS_FROM40, 8482 },
    { "alefsym", VERS_FROM40, 8501 },
    { "larr", VERS_FROM40, 8592 },
    { "uarr", VERS_FROM40, 8593 },
    { "rarr", VERS_FROM40, 8594 },
    { "darr", VERS_FROM40, 8595 },
    { "harr", VERS_FROM40, 8596 },
    { "crarr", VERS_FROM40, 8629 },
    { "lArr", VERS_FROM40, 8656 },
    { "uArr", VERS_FROM40, 8657 },
    { "rArr", VERS_FROM40, 8658 },
    { "dArr", VERS_FROM40, 8659 },
    { "hArr", VERS_FROM40, 8660 },
    { "forall", VERS_FROM40, 8704 },
    { "part", VERS_FROM40, 8706 },
    { "exist", VERS_FROM40, 8707 },
    { "empty", VERS_FROM40, 8709 },
    { "nabla", VERS_FROM40, 8711 },
    { "isin", VERS_FROM40, 8712 },
    { "notin", VERS_FROM40, 8713 },
    { "ni", VERS_FROM40, 8715 },
    { "prod", VERS_FROM40, 8719 },
    { "sum", VERS_FROM40, 8721 },
    { "minus", VERS_FROM40, 8722 },
    { "lowast", VERS_FROM40, 8727 },
    { "radic", VERS_FROM40, 8730 },
    { "prop", VERS_FROM40, 8733 },
    { "infin", VERS_FROM40, 8734 },
    { "ang", VERS_FROM40, 8736 },
    { "and", VERS_FROM40, 8743 },
    { "or", VERS_FROM40, 8744 },
    { "cap", VERS_FROM40, 8745 },
    { "cup", VERS_FROM40, 8746 },
    { "int", VERS_FROM40, 8747 },
    { "there4", VERS_FROM40, 8756 },
    { "sim", VERS_FROM40, 8764 },
    { "cong", VERS_FROM40, 8773 },
    { "asymp", VERS_FROM40, 8776 },
    { "ne", VERS_FROM40, 8800 },
    { "equiv", VERS_FROM40, 8801 },
    { "le", VERS_FROM40, 8804 },
    { "ge", VERS_FROM40, 8805 },
    { "sub", VERS_FROM40, 8834 },
    { "sup", VERS_FROM40, 8835 },
    { "nsub", VERS_FROM40, 8836 },
    { "sube", VERS_FROM40, 8838 },
    { "supe", VERS_FROM40, 8839 },
    { "oplus", VERS_FROM40, 8853 },
    { "otimes", VERS_FROM40, 8855 },
    { "perp", VERS_FROM40, 8869 },
    { "sdot", VERS_FROM40, 8901 },
    { "lceil", VERS_FROM40, 8968 },
    { "rceil", VERS_FROM40, 8969 },
    { "lfloor", VERS_FROM40, 8970 },
    { "rfloor", VERS_FROM40, 8971 },
    { "lang", VERS_FROM40, 9001 },
    { "rang", VERS_FROM40, 9002 },
    { "loz", VERS_FROM40, 9674 },
    { "spades", VERS_FROM40, 9824 },
    { "clubs", VERS_FROM40, 9827 },
    { "hearts", VERS_FROM40, 9829 },
    { "diams", VERS_FROM40, 9830 },

    /*
    ** Extended Entities defined in HTML 4: Special
    ** (minus the markup entities, which are listed at the top of the table)
    */
    { "OElig", VERS_FROM40, 338 },
    { "oelig", VERS_FROM40, 339 },
    { "Scaron", VERS_FROM40, 352 },
    { "scaron", VERS_FROM40, 353 },
    { "Yuml", VERS_FROM40, 376 },
    { "circ", VERS_FROM40, 710 },
    { "tilde", VERS_FROM40, 732 },
    { "ensp", VERS_FROM40, 8194 },
    { "emsp", VERS_FROM40, 8195 },
    { "thinsp", VERS_FROM40, 8201 },
    { "zwnj", VERS_FROM40, 8204 },
    { "zwj", VERS_FROM40, 8205 },
    { "lrm", VERS_FROM40, 8206 },
    { "rlm", VERS_FROM40, 8207 },
    { "ndash", VERS_FROM40, 8211 },
    { "mdash", VERS_FROM40, 8212 },
    { "lsquo", VERS_FROM40, 8216 },
    { "rsquo", VERS_FROM40, 8217 },
    { "sbquo", VERS_FROM40, 8218 },
    { "ldquo", VERS_FROM40, 8220 },
    { "rdquo", VERS_FROM40, 8221 },
    { "bdquo", VERS_FROM40, 8222 },
    { "dagger", VERS_FROM40, 8224 },
    { "Dagger", VERS_FROM40, 8225 },
    { "permil", VERS_FROM40, 8240 },
    { "lsaquo", VERS_FROM40, 8249 },
    { "rsaquo", VERS_FROM40, 8250 },
    { "euro", VERS_FROM40, 8364 },
    /* Sentinel: a NULL name marks the end of the table. */
    { NULL, 0, 0 }
};
306
307
308 /* Pure static implementation. Trades off lookup speed
309 ** for faster setup time (well, none actually).
310 ** Optimization of comparing 1st character buys enough
311 ** speed that hash doesn't improve things without > 500
312 ** items in list.
313 */
lookup(ctmbstr s)314 static const entity* lookup( ctmbstr s )
315 {
316 tmbchar ch = (tmbchar)( s ? *s : 0 );
317 const entity *np;
318 for ( np = entities; ch && np && np->name; ++np )
319 if ( ch == *np->name && tmbstrcmp(s, np->name) == 0 )
320 return np;
321 return NULL;
322 }
323
324 /* entity starting with "&" returns zero on error */
EntityCode(ctmbstr name,uint versions)325 uint EntityCode( ctmbstr name, uint versions )
326 {
327 const entity* np;
328 assert( name && name[0] == '&' );
329
330 /* numeric entitity: name = "&#" followed by number */
331 if ( name[1] == '#' )
332 {
333 uint c = 0; /* zero on missing/bad number */
334 Bool isXml = ( (versions & VERS_XML) == VERS_XML );
335
336 /* 'x' prefix denotes hexadecimal number format */
337 if ( name[2] == 'x' || (!isXml && name[2] == 'X') )
338 sscanf( name+3, "%x", &c );
339 else
340 sscanf( name+2, "%u", &c );
341
342 return (uint) c;
343 }
344
345 /* Named entity: name ="&" followed by a name */
346 if ( NULL != (np = lookup(name+1)) )
347 {
348 /* Only recognize entity name if version supports it. */
349 if ( np->versions & versions )
350 return np->code;
351 }
352
353 return 0; /* zero signifies unknown entity name */
354 }
355
EntityInfo(ctmbstr name,Bool isXml,uint * code,uint * versions)356 Bool EntityInfo( ctmbstr name, Bool isXml, uint* code, uint* versions )
357 {
358 const entity* np;
359 assert( name && name[0] == '&' );
360 assert( code != NULL );
361 assert( versions != NULL );
362
363 /* numeric entitity: name = "&#" followed by number */
364 if ( name[1] == '#' )
365 {
366 uint c = 0; /* zero on missing/bad number */
367
368 /* 'x' prefix denotes hexadecimal number format */
369 if ( name[2] == 'x' || (!isXml && name[2] == 'X') )
370 sscanf( name+3, "%x", &c );
371 else
372 sscanf( name+2, "%u", &c );
373
374 *code = c;
375 *versions = VERS_ALL;
376 return yes;
377 }
378
379 /* Named entity: name ="&" followed by a name */
380 if ( NULL != (np = lookup(name+1)) )
381 {
382 *code = np->code;
383 *versions = np->versions;
384 return yes;
385 }
386
387 *code = 0;
388 *versions = ( isXml ? VERS_XML : VERS_PROPRIETARY );
389 return no;
390 }
391
392
EntityName(uint ch,uint versions)393 ctmbstr EntityName( uint ch, uint versions )
394 {
395 ctmbstr entnam = NULL;
396 const entity *ep;
397
398 for ( ep = entities; ep->name != NULL; ++ep )
399 {
400 if ( ep->code == ch )
401 {
402 if ( ep->versions & versions )
403 entnam = ep->name;
404 break; /* Found code. Stop search. */
405 }
406 }
407 return entnam;
408 }
409