1 /*
2 * entities.c
3 *
4 * Copyright (c) Chris Putnam 2003-2020
5 *
6 * Source code released under the GPL version 2
7 *
8 */
9 #include <stdio.h>
10 #include <string.h>
11 #include <ctype.h>
12 #include "entities.h"
13
14 /* HTML 4.0 entities */
15
/*
 * One row of the named-entity table: the entity exactly as it appears in
 * the input text (leading '&' and trailing ';' included) and the code
 * point it stands for.
 */
typedef struct entities {
	char html[20];
	unsigned int unicode;
} entities;

/*
 * Named entities recognised by decode_html_entity().
 *
 * The table is searched linearly from the top, so the order of the rows
 * decides which one wins when more than one could match.
 */
entities html_entities[] = {
	/* Special Entities */
	{ "&quot;",     34 },  /* quotation mark */
	{ "&amp;",      38 },  /* ampersand */
	{ "&apos;",     39 },  /* apostrophe (note not defined in HTML) */
	{ "&lpar;",     40 },  /* left parenthesis */
	{ "&rpar;",     41 },  /* right parenthesis */
	{ "&hyphen;",   45 },  /* hyphen */
	{ "&lt;",       60 },  /* less-than sign */
	{ "&gt;",       62 },  /* greater-than sign */
	{ "&quest;",    63 },  /* question mark */
	{ "&OElig;",   338 },  /* Latin cap ligature OE */
	{ "&oelig;",   339 },  /* Latin small ligature oe */
	{ "&Scaron;",  352 },  /* Latin cap S with caron */
	{ "&scaron;",  353 },  /* Latin small s with caron */
	{ "&Yuml;",    376 },  /* Latin cap Y with diaeresis */
	{ "&circ;",    710 },  /* modifier letter circumflex */
	{ "&tilde;",   732 },  /* small tilde */
	{ "&ensp;",   8194 },  /* en space */
	{ "&emsp;",   8195 },  /* em space */
	{ "&thinsp;", 8201 },  /* thin space */
	{ "&zwnj;",   8204 },  /* zero width non-joiner */
	{ "&zwj;",    8205 },  /* zero width joiner */
	{ "&lrm;",    8206 },  /* left-to-right mark */
	{ "&rlm;",    8207 },  /* right-to-left mark */
	{ "&ndash;",  8211 },  /* en dash */
	{ "&mdash;",  8212 },  /* em dash */
	{ "&lsquo;",  8216 },  /* left single quotation mark */
	{ "&rsquo;",  8217 },  /* right single quot. mark */
	{ "&sbquo;",  8218 },  /* single low-9 quot. mark */
	{ "&ldquo;",  8220 },  /* left double quot. mark */
	{ "&rdquo;",  8221 },  /* right double quot. mark */
	{ "&bdquo;",  8222 },  /* double low-9 quot. mark */
	{ "&dagger;", 8224 },  /* dagger */
	{ "&Dagger;", 8225 },  /* double dagger */
	{ "&permil;", 8240 },  /* per mille sign */
	{ "&lsaquo;", 8249 },  /* sin. left angle quot mark */
	{ "&rsaquo;", 8250 },  /* sin. right angle quot mark */
	{ "&euro;",   8364 },  /* euro sign */
	/* Symbols and Greek characters */
	{ "&fnof;",     402 },  /* small f with hook = function */
	{ "&Alpha;",    913 },  /* capital alpha */
	{ "&Beta;",     914 },  /* capital beta */
	{ "&Gamma;",    915 },  /* capital gamma */
	{ "&Delta;",    916 },  /* capital delta */
	{ "&Epsilon;",  917 },  /* capital epsilon */
	{ "&Zeta;",     918 },  /* capital zeta */
	{ "&Eta;",      919 },  /* capital eta */
	{ "&Theta;",    920 },  /* capital theta */
	{ "&Iota;",     921 },  /* capital iota */
	{ "&Kappa;",    922 },  /* capital kappa */
	{ "&Lambda;",   923 },  /* capital lambda */
	{ "&Mu;",       924 },  /* capital mu */
	{ "&Nu;",       925 },  /* capital nu */
	{ "&Xi;",       926 },  /* capital xi */
	{ "&Omicron;",  927 },  /* capital omicron */
	{ "&Pi;",       928 },  /* capital pi */
	{ "&Rho;",      929 },  /* capital rho */
	{ "&Sigma;",    931 },  /* capital sigma */
	{ "&Tau;",      932 },  /* capital tau */
	{ "&Upsilon;",  933 },  /* capital upsilon */
	{ "&Phi;",      934 },  /* capital phi */
	{ "&Chi;",      935 },  /* capital chi */
	{ "&Psi;",      936 },  /* capital psi */
	{ "&Omega;",    937 },  /* capital omega */
	{ "&alpha;",    945 },  /* small alpha */
	{ "&beta;",     946 },  /* small beta */
	{ "&gamma;",    947 },  /* small gamma */
	{ "&delta;",    948 },  /* small delta */
	{ "&epsilon;",  949 },  /* small epsilon */
	{ "&zeta;",     950 },  /* small zeta */
	{ "&eta;",      951 },  /* small eta */
	{ "&theta;",    952 },  /* small theta */
	{ "&iota;",     953 },  /* small iota */
	{ "&kappa;",    954 },  /* small kappa */
	{ "&lambda;",   955 },  /* small lambda */
	{ "&mu;",       956 },  /* small mu */
	{ "&nu;",       957 },  /* small nu */
	{ "&xi;",       958 },  /* small xi */
	{ "&omicron;",  959 },  /* small omicron */
	{ "&pi;",       960 },  /* small pi */
	{ "&rho;",      961 },  /* small rho */
	{ "&sigmaf;",   962 },  /* small final sigma */
	{ "&sigma;",    963 },  /* small sigma */
	{ "&tau;",      964 },  /* small tau */
	{ "&upsilon;",  965 },  /* small upsilon */
	{ "&phi;",      966 },  /* small phi */
	{ "&chi;",      967 },  /* small chi */
	{ "&psi;",      968 },  /* small psi */
	{ "&omega;",    969 },  /* small omega */
	{ "&thetasym;", 977 },  /* small theta symbol */
	{ "&upsih;",    978 },  /* upsilon with hook symbol */
	{ "&piv;",      982 },  /* pi symbol */
	{ "&bull;",    8226 },  /* bullet = small blk circle */
	{ "&hellip;",  8230 },  /* horizontal ellipsis */
	{ "&prime;",   8242 },  /* prime = minutes = feet */
	{ "&Prime;",   8243 },  /* double prime */
	{ "&oline;",   8254 },  /* overline */
	{ "&frasl;",   8260 },  /* fraction slash */
	{ "&weierp;",  8472 },  /* Weierstrass p = power set */
	{ "&image;",   8465 },  /* imaginary part-black cap I */
	{ "&real;",    8476 },  /* real part-black cap R */
	{ "&trade;",   8482 },  /* trademark sign */
	{ "&alefsym;", 8501 },  /* alef symbol */
	{ "&larr;",    8592 },  /* left arrow */
	{ "&uarr;",    8593 },  /* up arrow */
	{ "&rarr;",    8594 },  /* right arrow */
	{ "&darr;",    8595 },  /* down arrow */
	{ "&harr;",    8596 },  /* left/right arrow */
	{ "&crarr;",   8629 },  /* down arrow with corner left */
	{ "&lArr;",    8656 },  /* left double arrow */
	{ "&uArr;",    8657 },  /* up double arrow */
	{ "&rArr;",    8658 },  /* right double arrow */
	{ "&dArr;",    8659 },  /* down double arrow */
	{ "&hArr;",    8660 },  /* left/right double arrow */
	{ "&forall;",  8704 },  /* for all */
	{ "&part;",    8706 },  /* partial differential */
	{ "&exist;",   8707 },  /* there exists */
	{ "&empty;",   8709 },  /* empty set */
	{ "&nabla;",   8711 },  /* nabla=backwards difference */
	{ "&isin;",    8712 },  /* element of */
	{ "&notin;",   8713 },  /* not an element of */
	{ "&ni;",      8715 },  /* contains as member */
	{ "&prod;",    8719 },  /* n-ary product */
	{ "&sum;",     8721 },  /* n-ary summation */
	{ "&minus;",   8722 },  /* minus sign */
	{ "&lowast;",  8727 },  /* asterisk operator */
	{ "&radic;",   8730 },  /* square root */
	{ "&prop;",    8733 },  /* proportional to */
	{ "&infin;",   8734 },  /* infinity */
	{ "&ang;",     8736 },  /* angle */
	{ "&and;",     8743 },  /* logical and */
	{ "&or;",      8744 },  /* logical or */
	{ "&cap;",     8745 },  /* intersection */
	{ "&cup;",     8746 },  /* union */
	{ "&int;",     8747 },  /* integral */
	{ "&there4;",  8756 },  /* therefore */
	{ "&sim;",     8764 },  /* tilde operator */
	{ "&cong;",    8773 },  /* approximately equal to */
	{ "&asymp;",   8776 },  /* asymptotic to */
	{ "&ne;",      8800 },  /* not equal to */
	{ "&equiv;",   8801 },  /* identical to */
	{ "&le;",      8804 },  /* less-than or equal to */
	{ "&ge;",      8805 },  /* greater-than or equal to */
	{ "&sub;",     8834 },  /* subset of */
	{ "&sup;",     8835 },  /* superset of */
	{ "&nsub;",    8836 },  /* not a subset of */
	{ "&sube;",    8838 },  /* subset of or equal to */
	{ "&supe;",    8839 },  /* superset of or equal to */
	{ "&oplus;",   8853 },  /* circled plus = direct sum */
	{ "&otimes;",  8855 },  /* circled times = vec prod */
	{ "&perp;",    8869 },  /* perpendicular */
	{ "&sdot;",    8901 },  /* dot operator */
	{ "&lceil;",   8968 },  /* left ceiling */
	{ "&rceil;",   8969 },  /* right ceiling */
	{ "&lfloor;",  8970 },  /* left floor */
	{ "&rfloor;",  8971 },  /* right floor */
	{ "&lang;",    9001 },  /* left angle bracket */
	{ "&rang;",    9002 },  /* right angle bracket */
	{ "&loz;",     9674 },  /* lozenge */
	{ "&spades;",  9824 },  /* spades */
	{ "&clubs;",   9827 },  /* clubs */
	{ "&hearts;",  9829 },  /* hearts */
	{ "&diams;",   9830 },  /* diamonds */
	/* Latin-1 */
	{ "&nbsp;",     32 },  /* non-breaking space (stored as 32, a plain space, not 160) */
	{ "&iexcl;",   161 },  /* inverted exclamation mark */
	{ "&cent;",    162 },  /* cent sign */
	{ "&pound;",   163 },  /* pound sign */
	{ "&curren;",  164 },  /* currency sign */
	{ "&yen;",     165 },  /* yen sign */
	{ "&brvbar;",  166 },  /* broken vertical bar */
	{ "&sect;",    167 },  /* section sign */
	{ "&uml;",     168 },  /* diaeresis - spacing diaeresis */
	{ "&copy;",    169 },  /* copyright sign */
	{ "&ordf;",    170 },  /* feminine ordinal indicator */
	{ "&laquo;",   171 },  /* left-pointing guillemet */
	{ "&not;",     172 },  /* not sign */
	{ "&shy;",     173 },  /* soft (discretionary) hyphen */
	{ "&reg;",     174 },  /* registered sign */
	{ "&macr;",    175 },  /* macron = overline */
	{ "&deg;",     176 },  /* degree sign */
	{ "&plusmn;",  177 },  /* plus-minus sign */
	{ "&sup2;",    178 },  /* superscript two */
	{ "&sup3;",    179 },  /* superscript three */
	{ "&acute;",   180 },  /* acute accent = spacing acute */
	{ "&micro;",   181 },  /* micro sign */
	{ "&para;",    182 },  /* pilcrow (paragraph) sign */
	{ "&middot;",  183 },  /* middle dot (georgian comma) */
	{ "&cedil;",   184 },  /* cedilla = spacing cedilla */
	{ "&sup1;",    185 },  /* superscript one */
	{ "&ordm;",    186 },  /* masculine ordinal indicator */
	{ "&raquo;",   187 },  /* right pointing guillemet */
	{ "&frac14;",  188 },  /* 1/4 */
	{ "&frac12;",  189 },  /* 1/2 */
	{ "&frac34;",  190 },  /* 3/4 */
	{ "&iquest;",  191 },  /* inverted question mark */
	{ "&Agrave;",  192 },  /* cap A with grave */
	{ "&Aacute;",  193 },  /* cap A with acute */
	{ "&Acirc;",   194 },  /* cap A with circumflex */
	{ "&Atilde;",  195 },  /* cap A with tilde */
	{ "&Auml;",    196 },  /* cap A with diaeresis */
	{ "&Aring;",   197 },  /* cap A with ring */
	{ "&AElig;",   198 },  /* cap AE ligature */
	{ "&Ccedil;",  199 },  /* cap C with cedilla */
	{ "&Egrave;",  200 },  /* cap E with grave */
	{ "&Eacute;",  201 },  /* cap E with acute */
	{ "&Ecirc;",   202 },  /* cap E with circumflex */
	{ "&Euml;",    203 },  /* cap E with diaeresis */
	{ "&Igrave;",  204 },  /* cap I with grave */
	{ "&Iacute;",  205 },  /* cap I with acute */
	{ "&Icirc;",   206 },  /* cap I with circumflex */
	{ "&Iuml;",    207 },  /* cap I with diaeresis */
	{ "&ETH;",     208 },  /* cap letter ETH */
	{ "&Ntilde;",  209 },  /* cap N with tilde */
	{ "&Ograve;",  210 },  /* cap O with grave */
	{ "&Oacute;",  211 },  /* cap O with acute */
	{ "&Ocirc;",   212 },  /* cap O with circumflex */
	{ "&Otilde;",  213 },  /* cap O with tilde */
	{ "&Ouml;",    214 },  /* cap O with diaeresis */
	{ "&times;",   215 },  /* multiplication sign */
	{ "&Oslash;",  216 },  /* cap O with stroke */
	{ "&Ugrave;",  217 },  /* cap U with grave */
	{ "&Uacute;",  218 },  /* cap U with acute */
	{ "&Ucirc;",   219 },  /* cap U with circumflex */
	{ "&Uuml;",    220 },  /* cap U with diaeresis */
	{ "&Yacute;",  221 },  /* cap Y with acute */
	{ "&THORN;",   222 },  /* cap letter THORN */
	{ "&szlig;",   223 },  /* small sharp s = ess-zed */
	{ "&agrave;",  224 },  /* small a with grave */
	{ "&aacute;",  225 },  /* small a with acute */
	{ "&acirc;",   226 },  /* small a with circumflex */
	{ "&atilde;",  227 },  /* small a with tilde */
	{ "&auml;",    228 },  /* small a with diaeresis */
	{ "&aring;",   229 },  /* small a with ring */
	{ "&aelig;",   230 },  /* small ligature ae */
	{ "&ccedil;",  231 },  /* small c with cedilla */
	{ "&egrave;",  232 },  /* small e with grave */
	{ "&eacute;",  233 },  /* small e with acute */
	{ "&ecirc;",   234 },  /* small e with circumflex */
	{ "&euml;",    235 },  /* small e with diaeresis */
	{ "&igrave;",  236 },  /* small i with grave */
	{ "&iacute;",  237 },  /* small i with acute */
	{ "&icirc;",   238 },  /* small i with circumflex */
	{ "&iuml;",    239 },  /* small i with diaeresis */
	{ "&eth;",     240 },  /* latin small letter eth */
	{ "&ntilde;",  241 },  /* small n with tilde */
	{ "&ograve;",  242 },  /* small o with grave */
	{ "&oacute;",  243 },  /* small o with acute */
	{ "&ocirc;",   244 },  /* small o with circumflex */
	{ "&otilde;",  245 },  /* small o with tilde */
	{ "&ouml;",    246 },  /* small o with diaeresis */
	{ "&divide;",  247 },  /* division sign */
	{ "&oslash;",  248 },  /* small o with slash */
	{ "&ugrave;",  249 },  /* small u with grave */
	{ "&uacute;",  250 },  /* small u with acute */
	{ "&ucirc;",   251 },  /* small u with circumflex */
	{ "&uuml;",    252 },  /* small u with diaeresis */
	{ "&yacute;",  253 },  /* small y with acute */
	{ "&thorn;",   254 },  /* latin small letter thorn */
	{ "&yuml;",    255 },  /* small y with diaeresis */
};
283
284
285 static unsigned int
decode_html_entity(char * s,unsigned int * pi,int * err)286 decode_html_entity( char *s, unsigned int *pi, int *err )
287 {
288 int nhtml_entities = sizeof( html_entities ) / sizeof( entities );
289 char *e;
290 int i, n=-1, len;
291 for ( i=0; i<nhtml_entities && n==-1; ++i ) {
292 e = &(html_entities[i].html[0]);
293 len = strlen( e );
294 if ( !strncasecmp( &(s[*pi]), e, len ) ) {
295 n = i;
296 *pi += len;
297 }
298 }
299 if ( n==-1 ) {
300 *err = 1;
301 return '&';
302 } else {
303 *err = 0;
304 return html_entities[n].unicode;
305 }
306 }
307
308
/*
 * decode decimal entity
 *
 * extract a decimal entity from &#NNNN;
 * s[*pi] points to the '&' character
 *
 * On success *err is set to 0, *pi is moved just past the ';' and the
 * code point is returned.
 *
 * On failure *err is set to 1 and *pi is left unchanged; the returned
 * value should then be ignored. Failure means one of:
 *   - no digit follows "&#" (e.g. "&#;")
 *   - the digits are not terminated by ';'
 *   - the value exceeds 0x10FFFF, the largest Unicode code point
 */
static unsigned int
decode_decimal_entity( char *s, unsigned int *pi, int *err )
{
	const unsigned int max_code_point = 0x10FFFF;
	unsigned int c = 0, d;
	unsigned int i = *pi + 2;	/* first character after "&#" */
	int ndigits = 0, too_big = 0;

	while ( isdigit( (unsigned char) s[i] ) ) {
		d = (unsigned int)( s[i] - '0' );
		/* stop accumulating once out of range so c can never wrap around */
		if ( !too_big ) {
			c = 10 * c + d;
			if ( c > max_code_point ) too_big = 1;
		}
		ndigits++;
		i++;
	}

	if ( ndigits==0 || too_big || s[i]!=';' ) {
		*err = 1;
	} else {
		*err = 0;
		*pi = i + 1;
	}
	return c;
}
329
/*
 * decode hex entity
 *
 * extract a hex entity from &#xNNNN;
 * s[*pi] points to the '&' character
 *
 * The character at s[*pi+2] (the 'x' or 'X') is skipped without being
 * inspected; the caller has already checked it.
 *
 * On success *err is set to 0, *pi is moved just past the ';' and the
 * code point is returned.
 *
 * On failure *err is set to 1 and *pi is left unchanged; the returned
 * value should then be ignored. Failure means one of:
 *   - no hex digit follows "&#x" (e.g. "&#x;")
 *   - the digits are not terminated by ';'
 *   - the value exceeds 0x10FFFF, the largest Unicode code point
 */
static unsigned int
decode_hex_entity( char *s, unsigned int *pi, int *err )
{
	const unsigned int max_code_point = 0x10FFFF;
	unsigned int c = 0, d;
	unsigned int i = *pi + 3;	/* first character after "&#x" */
	int ndigits = 0, too_big = 0;
	unsigned char ch;

	while ( isxdigit( (unsigned char) s[i] ) ) {
		ch = (unsigned char) s[i];
		if ( isdigit( ch ) ) d = (unsigned int)( ch - '0' );
		else d = (unsigned int)( toupper( ch ) - 'A' + 10 );
		/* stop accumulating once out of range so c can never wrap around */
		if ( !too_big ) {
			c = 16 * c + d;
			if ( c > max_code_point ) too_big = 1;
		}
		ndigits++;
		i++;
	}

	if ( ndigits==0 || too_big || s[i]!=';' ) {
		*err = 1;
	} else {
		*err = 0;
		*pi = i + 1;
	}
	return c;
}
351
/*
 * decode numeric entity
 *
 * extract a numeric entity from &#NNN; or &#xNNNN;
 * s[*pi] points to the '&' character and s[*pi+1] is '#'
 *
 * In XML, the "x" in hexadecimal entries should be lowercase,
 * but we'll be generous and accept "X" as well.
 *
 * On success *err is 0, *pi has been moved past the ';' by the decimal
 * or hex decoder, and the code point is returned.
 *
 * On failure *err is nonzero and '&' is returned so that the ampersand
 * can be passed through literally. *pi is deliberately NOT advanced
 * here: decode_entity() already steps one character forward on error,
 * and advancing here as well would skip the '#' after the '&' and drop
 * it from the output.
 */
static unsigned int
decode_numeric_entity( char *s, unsigned int *pi, int *err )
{
	unsigned int c;
	char marker = s[*pi+2];

	*err = 0;
	if ( marker=='x' || marker=='X' ) c = decode_hex_entity( s, pi, err );
	else c = decode_decimal_entity( s, pi, err );

	if ( *err ) c = '&';
	return c;
}
373
/*
 * decode entity
 * extract entity from &mmmm;
 *
 * where &mmmm; is one of
 * - &#nnnn; is code point in decimal form
 * - &#xhhhh; is code point in hexadecimal form (note "x" is lowercase in XML)
 * - &mmmm; corresponds to a pre-defined named entity, e.g. &quot; for a
 *   quotation mark
 *
 * s[*pi] is expected to be the '&' that starts the entity.
 *
 * Outputs:
 *   return value  decoded code point on success. On failure it is '&'
 *                 when s[*pi] was an ampersand, otherwise the byte at
 *                 s[*pi] (taken as unsigned char, so bytes >= 0x80 are
 *                 not sign-extended).
 *   *pi           on success, index of the first character after the
 *                 entity; on failure, advanced by exactly one.
 *   *unicode      1 when the named-entity lookup was used (whether or
 *                 not it found a match), 0 otherwise.
 *   *err          0 on success, nonzero on failure.
 */
unsigned int
decode_entity( char *s, unsigned int *pi, int *unicode, int *err )
{
	unsigned int c;

	*unicode = 0;
	*err = 0;

	if ( s[*pi]!='&' ) {
		*err = 1;  /* need to start with ampersand */
		c = (unsigned char) s[*pi];
	} else if ( s[*pi+1]=='#' ) {
		c = decode_numeric_entity( s, pi, err );
	} else {
		c = decode_html_entity( s, pi, err );
		*unicode = 1;
	}

	/* nothing was consumed on failure: step over the offending character */
	if ( *err ) *pi = *pi + 1;

	return c;
}
406