1 /*
2  * entities.c
3  *
4  * Copyright (c) Chris Putnam 2003-2020
5  *
6  * Source code released under the GPL version 2
7  *
8  */
9 #include <stdio.h>
10 #include <string.h>
11 #include <ctype.h>
12 #include "entities.h"
13 
14 /* HTML 4.0 entities */
15 
/*
 * One row of the named-entity lookup table.
 *
 *    html    - the entity exactly as it appears in text, including the
 *              leading '&' and the trailing ';'
 *    unicode - the code point the entity decodes to
 */
typedef struct entities {
	char html[20];
	unsigned int unicode;
} entities;

/*
 * Named entities recognized by decode_html_entity().
 *
 * The table is searched linearly from the top, so when two rows could
 * match the same input the earlier row wins.
 */
entities html_entities[] = {
	/* Special Entities */
	{ "&quot;",     34 },  /* quotation mark */
	{ "&amp;",      38 },  /* ampersand */
	{ "&apos;",     39 },  /* apostrophe (note not defined in HTML) */
	{ "&lpar;",     40 },  /* left parenthesis */
	{ "&rpar;",     41 },  /* right parenthesis */
	{ "&hyphen;",   45 },  /* hyphen */
	{ "&lt;",       60 },  /* less-than sign */
	{ "&gt;",       62 },  /* greater-than sign */
	{ "&quest;",    63 },  /* question mark */
	{ "&OElig;",   338 },  /* Latin cap ligature OE */
	{ "&oelig;",   339 },  /* Latin small ligature oe */
	{ "&Scaron;",  352 },  /* Latin cap S with caron */
	{ "&scaron;",  353 },  /* Latin small s with caron */
	{ "&Yuml;",    376 },  /* Latin cap Y with diaeresis */
	{ "&circ;",    710 },  /* modifier letter circumflex */
	{ "&tilde;",   732 },  /* small tilde */
	{ "&ensp;",   8194 },  /* en space */
	{ "&emsp;",   8195 },  /* em space */
	{ "&thinsp;", 8201 },  /* thin space */
	{ "&zwnj;",   8204 },  /* zero width non-joiner */
	{ "&zwj;",    8205 },  /* zero width joiner */
	{ "&lrm;",    8206 },  /* left-to-right mark */
	{ "&rlm;",    8207 },  /* right-to-left mark */
	{ "&ndash;",  8211 },  /* en dash */
	{ "&mdash;",  8212 },  /* em dash */
	{ "&lsquo;",  8216 },  /* left single quotation mark */
	{ "&rsquo;",  8217 },  /* right single quot. mark */
	{ "&sbquo;",  8218 },  /* single low-9 quot. mark */
	{ "&ldquo;",  8220 },  /* left double quot. mark */
	{ "&rdquo;",  8221 },  /* right double quot. mark */
	{ "&bdquo;",  8222 },  /* double low-9 quot. mark */
	{ "&dagger;", 8224 },  /* dagger */
	{ "&Dagger;", 8225 },  /* double dagger */
	{ "&permil;", 8240 },  /* per mille sign */
	{ "&lsaquo;", 8249 },  /* sin. left angle quot mark */
	{ "&rsaquo;", 8250 },  /* sin. right angle quot mark */
	{ "&euro;",   8364 },  /* euro sign */
	/* Symbols and Greek characters */
	{ "&fnof;",    402 },  /* small f with hook = function */
	{ "&Alpha;",   913 },  /* capital alpha */
	{ "&Beta;",    914 },  /* capital beta */
	{ "&Gamma;",   915 },  /* capital gamma */
	{ "&Delta;",   916 },  /* capital delta */
	{ "&Epsilon;", 917 },  /* capital epsilon */
	{ "&Zeta;",    918 },  /* capital zeta */
	{ "&Eta;",     919 },  /* capital eta */
	{ "&Theta;",   920 },  /* capital theta */
	{ "&Iota;",    921 },  /* capital iota */
	{ "&Kappa;",   922 },  /* capital kappa */
	{ "&Lambda;",  923 },  /* capital lambda */
	{ "&Mu;",      924 },  /* capital mu */
	{ "&Nu;",      925 },  /* capital nu */
	{ "&Xi;",      926 },  /* capital xi */
	{ "&Omicron;", 927 },  /* capital omicron */
	{ "&Pi;",      928 },  /* capital pi */
	{ "&Rho;",     929 },  /* capital rho */
	{ "&Sigma;",   931 },  /* capital sigma */
	{ "&Tau;",     932 },  /* capital tau */
	{ "&Upsilon;", 933 },  /* capital upsilon */
	{ "&Phi;",     934 },  /* capital phi */
	{ "&Chi;",     935 },  /* capital chi */
	{ "&Psi;",     936 },  /* capital psi */
	{ "&Omega;",   937 },  /* capital omega */
	{ "&alpha;",   945 },  /* small alpha */
	{ "&beta;",    946 },  /* small beta */
	{ "&gamma;",   947 },  /* small gamma */
	{ "&delta;",   948 },  /* small delta */
	{ "&epsilon;", 949 },  /* small epsilon */
	{ "&zeta;",    950 },  /* small zeta */
	{ "&eta;",     951 },  /* small eta */
	{ "&theta;",   952 },  /* small theta */
	{ "&iota;",    953 },  /* small iota */
	{ "&kappa;",   954 },  /* small kappa */
	{ "&lambda;",  955 },  /* small lambda */
	{ "&mu;",      956 },  /* small mu */
	{ "&nu;",      957 },  /* small nu */
	{ "&xi;",      958 },  /* small xi */
	{ "&omicron;", 959 },  /* small omicron */
	{ "&pi;",      960 },  /* small pi */
	{ "&rho;",     961 },  /* small rho */
	{ "&sigmaf;",  962 },  /* small final sigma */
	{ "&sigma;",   963 },  /* small sigma */
	{ "&tau;",     964 },  /* small tau */
	{ "&upsilon;", 965 },  /* small upsilon */
	{ "&phi;",     966 },  /* small phi */
	{ "&chi;",     967 },  /* small chi */
	{ "&psi;",     968 },  /* small psi */
	{ "&omega;",   969 },  /* small omega */
	{ "&thetasym;",977 },  /* small theta symbol */
	{ "&upsih;",   978 },  /* small upsilon with hook */
	{ "&piv;",     982 },  /* pi symbol */
	{ "&bull;",   8226 },  /* bullet = small blk circle */
	{ "&hellip;", 8230 },  /* horizontal ellipsis */
	{ "&prime;",  8242 },  /* prime = minutes = feet */
	{ "&Prime;",  8243 },  /* double prime */
	{ "&oline;",  8254 },  /* overline */
	{ "&frasl;",  8260 },  /* fraction slash */
	{ "&weierp;", 8472 },  /* Weierstrass p = power set */
	{ "&image;",  8465 },  /* imaginary part-black cap I */
	{ "&real;",   8476 },  /* real part-black cap R */
	{ "&trade;",  8482 },  /* trademark sign */
	{ "&alefsym;",8501 },  /* alef symbol */
	{ "&larr;",   8592 },  /* left arrow */
	{ "&uarr;",   8593 },  /* up arrow */
	{ "&rarr;",   8594 },  /* right arrow */
	{ "&darr;",   8595 },  /* down arrow */
	{ "&harr;",   8596 },  /* left/right arrow */
	{ "&crarr;",  8629 },  /* down arrow with corner left */
	{ "&lArr;",   8656 },  /* left double arrow */
	{ "&uArr;",   8657 },  /* up double arrow */
	{ "&rArr;",   8658 },  /* right double arrow */
	{ "&dArr;",   8659 },  /* down double arrow */
	{ "&hArr;",   8660 },  /* left/right double arrow */
	{ "&forall;", 8704 },  /* for all */
	{ "&part;",   8706 },  /* partial differential */
	{ "&exist;",  8707 },  /* there exists */
	{ "&empty;",  8709 },  /* empty set */
	{ "&nabla;",  8711 },  /* nabla=backwards difference */
	{ "&isin;",   8712 },  /* element of */
	{ "&notin;",  8713 },  /* not an element of */
	{ "&ni;",     8715 },  /* contains as member */
	{ "&prod;",   8719 },  /* n-ary product */
	{ "&sum;",    8721 },  /* n-ary summation */
	{ "&minus;",  8722 },  /* minus sign */
	{ "&lowast;", 8727 },  /* asterisk operator */
	{ "&radic;",  8730 },  /* square root */
	{ "&prop;",   8733 },  /* proportional to */
	{ "&infin;",  8734 },  /* infinity */
	{ "&ang;",    8736 },  /* angle */
	{ "&and;",    8743 },  /* logical and */
	{ "&or;",     8744 },  /* logical or */
	{ "&cap;",    8745 },  /* intersection */
	{ "&cup;",    8746 },  /* union */
	{ "&int;",    8747 },  /* integral */
	{ "&there4;", 8756 },  /* therefore */
	{ "&sim;",    8764 },  /* tilde operator */
	{ "&cong;",   8773 },  /* approximately equal to */
	{ "&asymp;",  8776 },  /* asymptotic to */
	{ "&ne;",     8800 },  /* not equal to */
	{ "&equiv;",  8801 },  /* identical to */
	{ "&le;",     8804 },  /* less-than or equal to */
	{ "&ge;",     8805 },  /* greater-than or equal to */
	{ "&sub;",    8834 },  /* subset of */
	{ "&sup;",    8835 },  /* superset of */
	{ "&nsub;",   8836 },  /* not a subset of */
	{ "&sube;",   8838 },  /* subset of or equal to */
	{ "&supe;",   8839 },  /* superset of or equal to */
	{ "&oplus;",  8853 },  /* circled plus = direct sum */
	{ "&otimes;", 8855 },  /* circled times = vec prod */
	{ "&perp;",   8869 },  /* perpendicular */
	{ "&sdot;",   8901 },  /* dot operator */
	{ "&lceil;",  8968 },  /* left ceiling */
	{ "&rceil;",  8969 },  /* right ceiling */
	{ "&lfloor;", 8970 },  /* left floor */
	{ "&rfloor;", 8971 },  /* right floor */
	{ "&lang;",   9001 },  /* left angle bracket */
	{ "&rang;",   9002 },  /* right angle bracket */
	{ "&loz;",    9674 },  /* lozenge */
	{ "&spades;", 9824 },  /* spades */
	{ "&clubs;",  9827 },  /* clubs */
	{ "&hearts;", 9829 },  /* hearts */
	{ "&diams;",  9830 },  /* diamonds */
	/* Latin-1 */
	/* NOTE(review): &nbsp; is stored as 32 (plain space), not U+00A0 (160);
	 * this looks like deliberate normalization -- confirm before changing */
	{ "&nbsp;",    32 },  /* non-breaking space */
	{ "&iexcl;",  161 },  /* inverted exclamation mark */
	{ "&cent;",   162 },  /* cent sign */
	{ "&pound;",  163 },  /* pound sign */
	{ "&curren;", 164 },  /* currency sign */
	{ "&yen;",    165 },  /* yen sign */
	{ "&brvbar;", 166 },  /* broken vertical bar */
	{ "&sect;",   167 },  /* section sign */
	{ "&uml;",    168 },  /* diaeresis - spacing diaeresis */
	{ "&copy;",   169 },  /* copyright sign */
	{ "&ordf;",   170 },  /* feminine ordinal indicator */
	{ "&laquo;",  171 },  /* left-pointing guillemet */
	{ "&not;",    172 },  /* not sign */
	{ "&shy;",    173 },  /* soft (discretionary) hyphen */
	{ "&reg;",    174 },  /* registered sign */
	{ "&macr;",   175 },  /* macron = overline */
	{ "&deg;",    176 },  /* degree sign */
	{ "&plusmn;", 177 },  /* plus-minus sign */
	{ "&sup2;",   178 },  /* superscript two */
	{ "&sup3;",   179 },  /* superscript three */
	{ "&acute;",  180 },  /* acute accent = spacing acute */
	{ "&micro;",  181 },  /* micro sign */
	{ "&para;",   182 },  /* pilcrow (paragraph) sign */
	{ "&middot;", 183 },  /* middle dot (georgian comma) */
	{ "&cedil;",  184 },  /* cedilla = spacing cedilla */
	{ "&sup1;",   185 },  /* superscript one */
	{ "&ordm;",   186 },  /* masculine ordinal indicator */
	{ "&raquo;",  187 },  /* right pointing guillemet */
	{ "&frac14;", 188 },  /* 1/4 */
	{ "&frac12;", 189 },  /* 1/2 */
	{ "&frac34;", 190 },  /* 3/4 */
	{ "&iquest;", 191 },  /* inverted question mark */
	{ "&Agrave;", 192 },  /* cap A with grave */
	{ "&Aacute;", 193 },  /* cap A with acute */
	{ "&Acirc;",  194 },  /* cap A with circumflex */
	{ "&Atilde;", 195 },  /* cap A with tilde */
	{ "&Auml;",   196 },  /* cap A with diaeresis */
	{ "&Aring;",  197 },  /* cap A with ring */
	{ "&AElig;",  198 },  /* cap AE ligature */
	{ "&Ccedil;", 199 },  /* cap C with cedilla */
	{ "&Egrave;", 200 },  /* cap E with grave */
	{ "&Eacute;", 201 },  /* cap E with acute */
	{ "&Ecirc;",  202 },  /* cap E with circumflex */
	{ "&Euml;",   203 },  /* cap E with diaeresis */
	{ "&Igrave;", 204 },  /* cap I with grave */
	{ "&Iacute;", 205 },  /* cap I with acute */
	{ "&Icirc;",  206 },  /* cap I with circumflex */
	{ "&Iuml;",   207 },  /* cap I with diaeresis */
	{ "&ETH;",    208 },  /* cap letter ETH */
	{ "&Ntilde;", 209 },  /* cap N with tilde */
	{ "&Ograve;", 210 },  /* cap O with grave */
	{ "&Oacute;", 211 },  /* cap O with acute */
	{ "&Ocirc;",  212 },  /* cap O with circumflex */
	{ "&Otilde;", 213 },  /* cap O with tilde */
	{ "&Ouml;",   214 },  /* cap O with diaeresis */
	{ "&times;",  215 },  /* multiplication sign */
	{ "&Oslash;", 216 },  /* cap O with stroke */
	{ "&Ugrave;", 217 },  /* cap U with grave */
	{ "&Uacute;", 218 },  /* cap U with acute */
	{ "&Ucirc;",  219 },  /* cap U with circumflex */
	{ "&Uuml;",   220 },  /* cap U with diaeresis */
	{ "&Yacute;", 221 },  /* cap Y with acute */
	{ "&THORN;",  222 },  /* cap letter THORN */
	{ "&szlig;",  223 },  /* small sharp s = ess-zed */
	{ "&agrave;", 224 },  /* small a with grave */
	{ "&aacute;", 225 },  /* small a with acute */
	{ "&acirc;",  226 },  /* small a with circumflex */
	{ "&atilde;", 227 },  /* small a with tilde */
	{ "&auml;",   228 },  /* small a with diaeresis */
	{ "&aring;",  229 },  /* small a with ring */
	{ "&aelig;",  230 },  /* small ligature ae */
	{ "&ccedil;", 231 },  /* small c with cedilla */
	{ "&egrave;", 232 },  /* small e with grave */
	{ "&eacute;", 233 },  /* small e with acute */
	{ "&ecirc;",  234 },  /* small e with circumflex */
	{ "&euml;",   235 },  /* small e with diaeresis */
	{ "&igrave;", 236 },  /* small i with grave */
	{ "&iacute;", 237 },  /* small i with acute */
	{ "&icirc;",  238 },  /* small i with circumflex */
	{ "&iuml;",   239 },  /* small i with diaeresis */
	{ "&eth;",    240 },  /* latin small letter eth */
	{ "&ntilde;", 241 },  /* small n with tilde */
	{ "&ograve;", 242 },  /* small o with grave */
	{ "&oacute;", 243 },  /* small o with acute */
	{ "&ocirc;",  244 },  /* small o with circumflex */
	{ "&otilde;", 245 },  /* small o with tilde */
	{ "&ouml;",   246 },  /* small o with diaeresis */
	{ "&divide;", 247 },  /* division sign */
	{ "&oslash;", 248 },  /* small o with slash */
	{ "&ugrave;", 249 },  /* small u with grave */
	{ "&uacute;", 250 },  /* small u with acute */
	{ "&ucirc;",  251 },  /* small u with circumflex */
	{ "&uuml;",   252 },  /* small u with diaeresis */
	{ "&yacute;", 253 },  /* small y with acute */
	{ "&thorn;",  254 },  /* latin small letter thorn */
	{ "&yuml;",   255 },  /* small y with diaeresis */
};
283 
284 
285 static unsigned int
decode_html_entity(char * s,unsigned int * pi,int * err)286 decode_html_entity( char *s, unsigned int *pi, int *err )
287 {
288 	int nhtml_entities = sizeof( html_entities ) / sizeof( entities );
289 	char *e;
290 	int i, n=-1, len;
291 	for ( i=0; i<nhtml_entities && n==-1; ++i ) {
292 		e = &(html_entities[i].html[0]);
293 		len = strlen( e );
294 		if ( !strncasecmp( &(s[*pi]), e, len ) ) {
295 			n = i;
296 			*pi += len;
297 		}
298 	}
299 	if ( n==-1 ) {
300 		*err = 1;
301 		return '&';
302 	} else {
303 		*err = 0;
304 		return html_entities[n].unicode;
305 	}
306 }
307 
308 
/*
 * decode decimal entity
 *
 *    extract a decimal entity from &#NNNN;
 *    s[*pi] points to the '&' character
 *
 *    On success *err is 0, *pi is moved past the ';' and the
 *    value is returned.  *err is set to 1 and *pi is left
 *    untouched when
 *      - there are no digits at all ("&#;"),
 *      - the value does not fit in an unsigned int, or
 *      - the digits are not followed by ';'.
 *    The value returned on error is whatever was accumulated so
 *    far and should be ignored by the caller.
 */
static unsigned int
decode_decimal_entity( char *s, unsigned int *pi, int *err )
{
	const unsigned int umax = ~0u;
	unsigned int c = 0, d;
	unsigned int i = *pi, j = 2;    /* j skips over "&#" */

	while ( isdigit( (unsigned char)s[i+j] ) ) {
		d = (unsigned int)( s[i+j] - '0' );
		/* would 10*c+d wrap around? */
		if ( c > ( umax - d ) / 10 ) {
			*err = 1;
			return c;
		}
		c = 10 * c + d;
		j++;
	}

	if ( j==2 || s[i+j]!=';' ) {
		*err = 1;
	} else {
		*err = 0;
		*pi = i+j+1;
	}
	return c;
}
329 
/*
 * decode hex entity
 *
 *    extract a hex entity from &#xNNNN;
 *    s[*pi] points to the '&' character
 *
 *    Upper- and lowercase hex digits are both accepted.
 *
 *    On success *err is 0, *pi is moved past the ';' and the
 *    value is returned.  *err is set to 1 and *pi is left
 *    untouched when
 *      - there are no hex digits at all ("&#x;"),
 *      - the value does not fit in an unsigned int, or
 *      - the digits are not followed by ';'.
 *    The value returned on error is whatever was accumulated so
 *    far and should be ignored by the caller.
 */
static unsigned int
decode_hex_entity( char *s, unsigned int *pi, int *err )
{
	const unsigned int umax = ~0u;
	unsigned int c = 0, d;
	unsigned int i = *pi, j = 3;    /* j skips over "&#x" */

	while ( isxdigit( (unsigned char)s[i+j] ) ) {
		if ( isdigit( (unsigned char)s[i+j] ) ) d = (unsigned int)( s[i+j] - '0' );
		else d = (unsigned int)( toupper( (unsigned char)s[i+j] ) - 'A' + 10 );
		/* would 16*c+d wrap around? */
		if ( c > ( umax - d ) / 16 ) {
			*err = 1;
			return c;
		}
		c = 16 * c + d;
		j++;
	}

	if ( j==3 || s[i+j]!=';' ) {
		*err = 1;
	} else {
		*err = 0;
		*pi = i+j+1;
	}
	return c;
}
351 
/*
 * decode numeric entity
 *
 *    extract a numeric entity from &#NNN; or &#xNNNN;
 *    s[*pi] points to the '&' character and s[*pi+1] is '#'
 *
 *    In XML, the "x" in hexadecimal entries should be lowercase,
 *    but we'll be generous and accept "X" as well.
 *
 *    On success *err is 0, *pi has been moved past the ';' by the
 *    decimal/hex helper, and the decoded value is returned.
 *
 *    On failure *err is 1 and a literal '&' is returned.  *pi is
 *    deliberately NOT advanced here: decode_entity() already steps
 *    over the '&' whenever *err is set, and advancing a second
 *    time would silently drop the '#' that follows it.
 */
static unsigned int
decode_numeric_entity( char *s, unsigned int *pi, int *err )
{
	unsigned int c;

	*err = 0;
	if ( s[*pi+2]=='x' || s[*pi+2]=='X' ) c = decode_hex_entity( s, pi, err );
	else c = decode_decimal_entity( s, pi, err );

	if ( *err ) c = '&';

	return c;
}
373 
/*
 * decode entity
 *    extract entity from  &mmmm;  starting at s[*pi]
 *
 * where &mmmm; is one of
 * - &#nnnn; is code point in decimal form
 * - &#xhhhh; is code point in hexadecimal form (note "x" is lowercase in XML)
 * - &mmmm; corresponds to a pre-defined named entity, e.g. &quot; for quotations
 *
 * Outputs:
 * - return value: the decoded character; the character at s[*pi] if it
 *   is not '&'; otherwise whatever the numeric/named decoder returned
 * - *pi: updated by the decoder that handled the entity; advanced by one
 *   more position here whenever *err ends up set
 * - *unicode: set to 1 when the named-entity path was taken (whether or
 *   not the lookup succeeded), 0 otherwise
 * - *err: 0 on success, 1 if s[*pi] is not '&' or the decoder failed
 */
unsigned int
decode_entity( char *s, unsigned int *pi, int *unicode, int *err )
{
	unsigned int ch;

	*unicode = 0;

	/* guard: anything not starting with '&' is passed through as-is */
	if ( s[*pi]!='&' ) {
		*err = 1;
		ch = s[*pi];
		*pi = *pi + 1;
		return ch;
	}

	*err = 0;
	if ( s[*pi+1]=='#' ) {
		ch = decode_numeric_entity( s, pi, err );
	} else {
		ch = decode_html_entity( s, pi, err );
		*unicode = 1;
	}

	/* decoder gave up: move on so the caller makes progress */
	if ( *err ) *pi = *pi + 1;

	return ch;
}
406