1 // Use of this software is granted under one of the following two licenses,
2 // to be chosen freely by the user.
3 
4 // 1. Boost Software License - Version 1.0 - August 17th, 2003
5 // ===============================================================================
6 
7 // Copyright (c) 2010 Emweb bv, Herent, Belgium
8 
9 // Permission is hereby granted, free of charge, to any person or organization
10 // obtaining a copy of the software and accompanying documentation covered by
11 // this license (the "Software") to use, reproduce, display, distribute,
12 // execute, and transmit the Software, and to prepare derivative works of the
13 // Software, and to permit third-parties to whom the Software is furnished to
14 // do so, all subject to the following:
15 
16 // The copyright notices in the Software and this entire statement, including
17 // the above license grant, this restriction and the following disclaimer,
18 // must be included in all copies of the Software, in whole or in part, and
19 // all derivative works of the Software, unless such copies or derivative
20 // works are solely in the form of machine-executable object code generated by
21 // a source language processor.
22 
23 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
24 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
25 // FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
26 // SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
27 // FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
28 // ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
29 // DEALINGS IN THE SOFTWARE.
30 
31 // 2. The MIT License
32 // ===============================================================================
33 
34 // Copyright (c) 2010 Emweb bv, Herent, Belgium
35 
36 // Permission is hereby granted, free of charge, to any person obtaining a copy
37 // of this software and associated documentation files (the "Software"), to deal
38 // in the Software without restriction, including without limitation the rights
39 // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
40 // of the Software, and to permit persons to whom the Software is furnished to do so,
41 // subject to the following conditions:
42 
43 // The above copyright notice and this permission notice shall be included in all
44 // copies or substantial portions of the Software.
45 
46 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
47 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
48 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
49 // THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
50 // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
51 // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
52 // IN THE SOFTWARE.
53 #ifndef RAPIDXML_XHTML_HPP_INCLUDED
54 #define RAPIDXML_XHTML_HPP_INCLUDED
55 
56 #include <cstddef>
57 #include <cstring>
58 
namespace Wt {
namespace rapidxml
{
    namespace internal
    {
        // Forward declaration only; the definition is not part of this header.
        template<class Ch> inline std::size_t measure(const Ch *p);
    }

    // Decodes a named XHTML character entity reference (e.g. "&amp;",
    // "&eacute;") into its UTF-8 byte sequence.
    //
    // src   Points at the start of the reference; the name is read starting
    //       at src[1], so src[0] is assumed to be the '&' (this is not
    //       checked here). The text must be zero-terminated.
    // dest  Output cursor; the UTF-8 bytes of the entity are written here.
    //
    // Returns true when the name between '&' and ';' is found in the table.
    // In that case src is advanced past the terminating ';' and dest past the
    // bytes written. Returns false, leaving src and dest untouched, when
    //   - no ';' is found within the longest supported name length,
    //   - the end of the string is reached before a ';', or
    //   - the name is not a known entity.
    //
    // Caveats: the output encoding is assumed to be UTF-8, and because the
    // table stores plain char strings the function only instantiates for
    // Ch = char. Names are matched case-sensitively.
    template<class Ch>
    bool translate_xhtml_entity(Ch *& src, Ch *& dest)
    {
        struct Entity {
            const char *name;
            const char *utf8;
        };

        // Must stay sorted in strcmp() order: the lookup below is a binary
        // search. Static so the table is built once instead of on each call.
        static const Entity entities[] = {
            { "AElig", "\xc3\x86" },
            { "Aacute", "\xc3\x81" },
            { "Acirc", "\xc3\x82" },
            { "Agrave", "\xc3\x80" },
            { "Alpha", "\xce\x91" },
            { "Aring", "\xc3\x85" },
            { "Atilde", "\xc3\x83" },
            { "Auml", "\xc3\x84" },
            { "Beta", "\xce\x92" },
            { "Ccedil", "\xc3\x87" },
            { "Chi", "\xce\xa7" },
            { "Dagger", "\xe2\x80\xa1" },
            { "Delta", "\xce\x94" },
            { "Dstrok", "\xc3\x90" },
            { "ETH", "\xc3\x90" },
            { "Eacute", "\xc3\x89" },
            { "Ecirc", "\xc3\x8a" },
            { "Egrave", "\xc3\x88" },
            { "Epsilon", "\xce\x95" },
            { "Eta", "\xce\x97" },
            { "Euml", "\xc3\x8b" },
            { "Gamma", "\xce\x93" },
            { "Iacute", "\xc3\x8d" },
            { "Icirc", "\xc3\x8e" },
            { "Igrave", "\xc3\x8c" },
            { "Iota", "\xce\x99" },
            { "Iuml", "\xc3\x8f" },
            { "Kappa", "\xce\x9a" },
            { "Lambda", "\xce\x9b" },
            { "Mu", "\xce\x9c" },
            { "Ntilde", "\xc3\x91" },
            { "Nu", "\xce\x9d" },
            { "OElig", "\xc5\x92" },
            { "Oacute", "\xc3\x93" },
            { "Ocirc", "\xc3\x94" },
            { "Ograve", "\xc3\x92" },
            { "Omega", "\xce\xa9" },
            { "Omicron", "\xce\x9f" },
            { "Oslash", "\xc3\x98" },
            { "Otilde", "\xc3\x95" },
            { "Ouml", "\xc3\x96" },
            { "Phi", "\xce\xa6" },
            { "Pi", "\xce\xa0" },
            { "Prime", "\xe2\x80\xb3" },
            { "Psi", "\xce\xa8" },
            { "Rho", "\xce\xa1" },
            { "Scaron", "\xc5\xa0" },
            { "Sigma", "\xce\xa3" },
            { "THORN", "\xc3\x9e" },
            { "Tau", "\xce\xa4" },
            { "Theta", "\xce\x98" },
            { "Uacute", "\xc3\x9a" },
            { "Ucirc", "\xc3\x9b" },
            { "Ugrave", "\xc3\x99" },
            { "Upsilon", "\xce\xa5" },
            { "Uuml", "\xc3\x9c" },
            { "Xi", "\xce\x9e" },
            { "Yacute", "\xc3\x9d" },
            { "Yuml", "\xc5\xb8" },
            { "Zeta", "\xce\x96" },
            { "aacute", "\xc3\xa1" },
            { "acirc", "\xc3\xa2" },
            { "acute", "\xc2\xb4" },
            { "aelig", "\xc3\xa6" },
            { "agrave", "\xc3\xa0" },
            { "alefsym", "\xe2\x84\xb5" },
            { "alpha", "\xce\xb1" },
            { "amp", "\x26" },
            { "and", "\xe2\x88\xa7" },
            { "ang", "\xe2\x88\xa0" },
            { "apos", "\x27" },
            { "aring", "\xc3\xa5" },
            { "asymp", "\xe2\x89\x88" },
            { "atilde", "\xc3\xa3" },
            { "auml", "\xc3\xa4" },
            { "bdquo", "\xe2\x80\x9e" },
            { "beta", "\xce\xb2" },
            { "brkbar", "\xc2\xa6" },
            { "brvbar", "\xc2\xa6" },
            { "bull", "\xe2\x80\xa2" },
            { "cap", "\xe2\x88\xa9" },
            { "ccedil", "\xc3\xa7" },
            { "cedil", "\xc2\xb8" },
            { "cent", "\xc2\xa2" },
            { "chi", "\xcf\x87" },
            { "circ", "\xcb\x86" },
            { "clubs", "\xe2\x99\xa3" },
            { "cong", "\xe2\x89\x85" },
            { "copy", "\xc2\xa9" },
            { "crarr", "\xe2\x86\xb5" },
            { "cup", "\xe2\x88\xaa" },
            { "curren", "\xc2\xa4" },
            { "dArr", "\xe2\x87\x93" },
            { "dagger", "\xe2\x80\xa0" },
            { "darr", "\xe2\x86\x93" },
            { "deg", "\xc2\xb0" },
            { "delta", "\xce\xb4" },
            { "diams", "\xe2\x99\xa6" },
            { "die", "\xc2\xa8" },
            { "divide", "\xc3\xb7" },
            { "eacute", "\xc3\xa9" },
            { "ecirc", "\xc3\xaa" },
            { "egrave", "\xc3\xa8" },
            { "empty", "\xe2\x88\x85" },
            { "emsp", "\xe2\x80\x83" },
            { "ensp", "\xe2\x80\x82" },
            { "epsilon", "\xce\xb5" },
            { "equiv", "\xe2\x89\xa1" },
            { "eta", "\xce\xb7" },
            { "eth", "\xc3\xb0" },
            { "euml", "\xc3\xab" },
            { "euro", "\xe2\x82\xac" },
            { "exist", "\xe2\x88\x83" },
            { "fnof", "\xc6\x92" },
            { "forall", "\xe2\x88\x80" },
            { "frac12", "\xc2\xbd" },
            { "frac14", "\xc2\xbc" },
            { "frac34", "\xc2\xbe" },
            { "frasl", "\xe2\x81\x84" },
            { "gamma", "\xce\xb3" },
            { "ge", "\xe2\x89\xa5" },
            { "gt", "\x3e" },
            { "hArr", "\xe2\x87\x94" },
            { "harr", "\xe2\x86\x94" },
            { "hearts", "\xe2\x99\xa5" },
            { "hellip", "\xe2\x80\xa6" },
            { "hibar", "\xc2\xaf" },
            { "iacute", "\xc3\xad" },
            { "icirc", "\xc3\xae" },
            { "iexcl", "\xc2\xa1" },
            { "igrave", "\xc3\xac" },
            { "image", "\xe2\x84\x91" },
            { "infin", "\xe2\x88\x9e" },
            { "int", "\xe2\x88\xab" },
            { "iota", "\xce\xb9" },
            { "iquest", "\xc2\xbf" },
            { "isin", "\xe2\x88\x88" },
            { "iuml", "\xc3\xaf" },
            { "kappa", "\xce\xba" },
            { "lArr", "\xe2\x87\x90" },
            { "lambda", "\xce\xbb" },
            { "lang", "\xe2\x8c\xa9" },
            { "laquo", "\xc2\xab" },
            { "larr", "\xe2\x86\x90" },
            { "lceil", "\xe2\x8c\x88" },
            { "ldquo", "\xe2\x80\x9c" },
            { "le", "\xe2\x89\xa4" },
            { "lfloor", "\xe2\x8c\x8a" },
            { "lowast", "\xe2\x88\x97" },
            { "loz", "\xe2\x97\x8a" },
            { "lrm", "\xe2\x80\x8e" },
            { "lsaquo", "\xe2\x80\xb9" },
            { "lsquo", "\xe2\x80\x98" },
            { "lt", "\x3c" },
            { "macr", "\xc2\xaf" },
            { "mdash", "\xe2\x80\x94" },
            { "micro", "\xc2\xb5" },
            { "middot", "\xc2\xb7" },
            { "minus", "\xe2\x88\x92" },
            { "mu", "\xce\xbc" },
            { "nabla", "\xe2\x88\x87" },
            { "nbsp", "\xc2\xa0" },
            { "ndash", "\xe2\x80\x93" },
            { "ne", "\xe2\x89\xa0" },
            { "ni", "\xe2\x88\x8b" },
            { "not", "\xc2\xac" },
            { "notin", "\xe2\x88\x89" },
            { "nsub", "\xe2\x8a\x84" },
            { "ntilde", "\xc3\xb1" },
            { "nu", "\xce\xbd" },
            { "oacute", "\xc3\xb3" },
            { "ocirc", "\xc3\xb4" },
            { "oelig", "\xc5\x93" },
            { "ograve", "\xc3\xb2" },
            { "oline", "\xe2\x80\xbe" },
            { "omega", "\xcf\x89" },
            { "omicron", "\xce\xbf" },
            { "oplus", "\xe2\x8a\x95" },
            { "or", "\xe2\x88\xa8" },
            { "ordf", "\xc2\xaa" },
            { "ordm", "\xc2\xba" },
            { "oslash", "\xc3\xb8" },
            { "otilde", "\xc3\xb5" },
            { "otimes", "\xe2\x8a\x97" },
            { "ouml", "\xc3\xb6" },
            { "para", "\xc2\xb6" },
            { "part", "\xe2\x88\x82" },
            { "permil", "\xe2\x80\xb0" },
            { "perp", "\xe2\x8a\xa5" },
            { "phi", "\xcf\x86" },
            { "pi", "\xcf\x80" },
            { "piv", "\xcf\x96" },
            { "plusmn", "\xc2\xb1" },
            { "pound", "\xc2\xa3" },
            { "prime", "\xe2\x80\xb2" },
            { "prod", "\xe2\x88\x8f" },
            { "prop", "\xe2\x88\x9d" },
            { "psi", "\xcf\x88" },
            { "quot", "\x22" },
            { "rArr", "\xe2\x87\x92" },
            { "radic", "\xe2\x88\x9a" },
            { "rang", "\xe2\x8c\xaa" },
            { "raquo", "\xc2\xbb" },
            { "rarr", "\xe2\x86\x92" },
            { "rceil", "\xe2\x8c\x89" },
            { "rdquo", "\xe2\x80\x9d" },
            { "real", "\xe2\x84\x9c" },
            { "reg", "\xc2\xae" },
            { "rfloor", "\xe2\x8c\x8b" },
            { "rho", "\xcf\x81" },
            { "rlm", "\xe2\x80\x8f" },
            { "rsaquo", "\xe2\x80\xba" },
            { "rsquo", "\xe2\x80\x99" },
            { "sbquo", "\xe2\x80\x9a" },
            { "scaron", "\xc5\xa1" },
            { "sdot", "\xe2\x8b\x85" },
            { "sect", "\xc2\xa7" },
            { "shy", "\xc2\xad" },
            { "sigma", "\xcf\x83" },
            { "sigmaf", "\xcf\x82" },
            { "sim", "\xe2\x88\xbc" },
            { "spades", "\xe2\x99\xa0" },
            { "sub", "\xe2\x8a\x82" },
            { "sube", "\xe2\x8a\x86" },
            { "sum", "\xe2\x88\x91" },
            { "sup", "\xe2\x8a\x83" },
            { "sup1", "\xc2\xb9" },
            { "sup2", "\xc2\xb2" },
            { "sup3", "\xc2\xb3" },
            { "supe", "\xe2\x8a\x87" },
            { "szlig", "\xc3\x9f" },
            { "tau", "\xcf\x84" },
            { "there4", "\xe2\x88\xb4" },
            { "theta", "\xce\xb8" },
            { "thetasym", "\xcf\x91" },
            { "thinsp", "\xe2\x80\x89" },
            { "thorn", "\xc3\xbe" },
            { "tilde", "\xcb\x9c" },
            { "times", "\xc3\x97" },
            { "trade", "\xe2\x84\xa2" },
            { "uArr", "\xe2\x87\x91" },
            { "uacute", "\xc3\xba" },
            { "uarr", "\xe2\x86\x91" },
            { "ucirc", "\xc3\xbb" },
            { "ugrave", "\xc3\xb9" },
            { "uml", "\xc2\xa8" },
            { "upsih", "\xcf\x92" },
            { "upsilon", "\xcf\x85" },
            { "uuml", "\xc3\xbc" },
            { "weierp", "\xe2\x84\x98" },
            { "xi", "\xce\xbe" },
            { "yacute", "\xc3\xbd" },
            { "yen", "\xc2\xa5" },
            { "yuml", "\xc3\xbf" },
            { "zeta", "\xce\xb6" },
            { "zwj", "\xe2\x80\x8d" },
            { "zwnj", "\xe2\x80\x8c" }
        };

        // Derived from the table so the two can never get out of sync.
        const int entity_count =
            static_cast<int>(sizeof(entities) / sizeof(entities[0]));

        // Longest entity name in the table is 8 characters ("thetasym").
        const int max_name_length = 8;

        // Copy the name between '&' and ';' into a local zero-terminated
        // buffer. The ';' may sit at most max_name_length characters after
        // the '&'.
        char entity[max_name_length + 1];
        int name_length = -1;
        for (int k = 0; k <= max_name_length; ++k) {
            const Ch ch = src[1 + k];
            if (ch == Ch(';')) {
                entity[k] = 0;
                name_length = k;
                break;
            }
            // End of input before ';': not a reference. Stopping here also
            // keeps the scan from reading beyond the terminator.
            if (ch == Ch(0))
                return false;
            entity[k] = ch;
        }

        if (name_length < 0)
            return false; // no ';' close enough to the '&'

        // Binary search over the half-open range [lo, hi).
        int found = -1;
        int lo = 0;
        int hi = entity_count;
        while (lo < hi) {
            const int mid = lo + (hi - lo) / 2;
            const int cmp = std::strcmp(entity, entities[mid].name);
            if (cmp == 0) {
                found = mid;
                break;
            }
            if (cmp < 0)
                hi = mid;
            else
                lo = mid + 1;
        }

        if (found < 0)
            return false; // unknown entity name

        // Consume '&' + name + ';' and emit the replacement bytes.
        src += name_length + 2;
        for (const Ch *c = entities[found].utf8; *c; ++c)
            *dest++ = *c;

        return true;
    }
}
}
388 
389 #endif
390