1 // Use of this software is granted under one of the following two licenses, 2 // to be chosen freely by the user. 3 4 // 1. Boost Software License - Version 1.0 - August 17th, 2003 5 // =============================================================================== 6 7 // Copyright (c) 2010 Emweb bv, Herent, Belgium 8 9 // Permission is hereby granted, free of charge, to any person or organization 10 // obtaining a copy of the software and accompanying documentation covered by 11 // this license (the "Software") to use, reproduce, display, distribute, 12 // execute, and transmit the Software, and to prepare derivative works of the 13 // Software, and to permit third-parties to whom the Software is furnished to 14 // do so, all subject to the following: 15 16 // The copyright notices in the Software and this entire statement, including 17 // the above license grant, this restriction and the following disclaimer, 18 // must be included in all copies of the Software, in whole or in part, and 19 // all derivative works of the Software, unless such copies or derivative 20 // works are solely in the form of machine-executable object code generated by 21 // a source language processor. 22 23 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 24 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 25 // FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT 26 // SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE 27 // FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, 28 // ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 29 // DEALINGS IN THE SOFTWARE. 30 31 // 2. 
The MIT License 32 // =============================================================================== 33 34 // Copyright (c) 2010 Emweb bv, Herent, Belgium 35 36 // Permission is hereby granted, free of charge, to any person obtaining a copy 37 // of this software and associated documentation files (the "Software"), to deal 38 // in the Software without restriction, including without limitation the rights 39 // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies 40 // of the Software, and to permit persons to whom the Software is furnished to do so, 41 // subject to the following conditions: 42 43 // The above copyright notice and this permission notice shall be included in all 44 // copies or substantial portions of the Software. 45 46 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 47 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 48 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 49 // THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 50 // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 51 // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 52 // IN THE SOFTWARE. 
#ifndef RAPIDXML_XHTML_HPP_INCLUDED
#define RAPIDXML_XHTML_HPP_INCLUDED

#include <cstddef>
#include <cstring>

namespace Wt {
namespace rapidxml
{
  namespace internal
  {
    // Forward declaration only: this header does not define measure().
    // It is kept so that code including this header still sees the
    // declaration; translate_xhtml_entity() below no longer calls it.
    template<class Ch> inline std::size_t measure(const Ch *p);
  }

  // Translates a named XHTML character entity reference into UTF-8.
  //
  // src  - refers to a pointer at the reference; src[0] is taken to be the
  //        '&' (it is not inspected), the name starts at src[1] and must be
  //        terminated by ';' within the first 9 characters after '&'.
  // dest - refers to the output cursor; on success the UTF-8 bytes of the
  //        entity (at most 3 for the entries in this table) are written
  //        through it. No terminating NUL is written.
  //
  // Returns true if src could be interpreted as an entity reference; in
  // that case src is advanced past the ';' and dest past the bytes written.
  // Returns false, leaving src and dest untouched, when no ';' is found in
  // range, when the input ends (NUL) before the ';', or when the name is
  // not in the table.
  //
  // Note: the table holds 'const char *' values that are read through a
  // 'const Ch *', so this template only instantiates for Ch = char.

  template<class Ch>
  bool translate_xhtml_entity(Ch *& src, Ch *& dest)
  {
    // Caveat: assumes output encoding is UTF-8

    struct Entity {
      const char *name;
      const char *utf8;
    };

    // Must stay sorted by std::strcmp() order of 'name': the lookup below
    // is a binary search. Static so the table is not rebuilt on every call.
    static const Entity entities[] = {
      { "AElig", "\xc3\x86" },
      { "Aacute", "\xc3\x81" },
      { "Acirc", "\xc3\x82" },
      { "Agrave", "\xc3\x80" },
      { "Alpha", "\xce\x91" },
      { "Aring", "\xc3\x85" },
      { "Atilde", "\xc3\x83" },
      { "Auml", "\xc3\x84" },
      { "Beta", "\xce\x92" },
      { "Ccedil", "\xc3\x87" },
      { "Chi", "\xce\xa7" },
      { "Dagger", "\xe2\x80\xa1" },
      { "Delta", "\xce\x94" },
      { "Dstrok", "\xc3\x90" },
      { "ETH", "\xc3\x90" },
      { "Eacute", "\xc3\x89" },
      { "Ecirc", "\xc3\x8a" },
      { "Egrave", "\xc3\x88" },
      { "Epsilon", "\xce\x95" },
      { "Eta", "\xce\x97" },
      { "Euml", "\xc3\x8b" },
      { "Gamma", "\xce\x93" },
      { "Iacute", "\xc3\x8d" },
      { "Icirc", "\xc3\x8e" },
      { "Igrave", "\xc3\x8c" },
      { "Iota", "\xce\x99" },
      { "Iuml", "\xc3\x8f" },
      { "Kappa", "\xce\x9a" },
      { "Lambda", "\xce\x9b" },
      { "Mu", "\xce\x9c" },
      { "Ntilde", "\xc3\x91" },
      { "Nu", "\xce\x9d" },
      { "OElig", "\xc5\x92" },
      { "Oacute", "\xc3\x93" },
      { "Ocirc", "\xc3\x94" },
      { "Ograve", "\xc3\x92" },
      { "Omega", "\xce\xa9" },
      { "Omicron", "\xce\x9f" },
      { "Oslash", "\xc3\x98" },
      { "Otilde", "\xc3\x95" },
      { "Ouml", "\xc3\x96" },
      { "Phi", "\xce\xa6" },
      { "Pi", "\xce\xa0" },
      { "Prime", "\xe2\x80\xb3" },
      { "Psi", "\xce\xa8" },
      { "Rho", "\xce\xa1" },
      { "Scaron", "\xc5\xa0" },
      { "Sigma", "\xce\xa3" },
      { "THORN", "\xc3\x9e" },
      { "Tau", "\xce\xa4" },
      { "Theta", "\xce\x98" },
      { "Uacute", "\xc3\x9a" },
      { "Ucirc", "\xc3\x9b" },
      { "Ugrave", "\xc3\x99" },
      { "Upsilon", "\xce\xa5" },
      { "Uuml", "\xc3\x9c" },
      { "Xi", "\xce\x9e" },
      { "Yacute", "\xc3\x9d" },
      { "Yuml", "\xc5\xb8" },
      { "Zeta", "\xce\x96" },
      { "aacute", "\xc3\xa1" },
      { "acirc", "\xc3\xa2" },
      { "acute", "\xc2\xb4" },
      { "aelig", "\xc3\xa6" },
      { "agrave", "\xc3\xa0" },
      { "alefsym", "\xe2\x84\xb5" },
      { "alpha", "\xce\xb1" },
      { "amp", "\x26" },
      { "and", "\xe2\x88\xa7" },
      { "ang", "\xe2\x88\xa0" },
      { "apos", "\x27" },
      { "aring", "\xc3\xa5" },
      { "asymp", "\xe2\x89\x88" },
      { "atilde", "\xc3\xa3" },
      { "auml", "\xc3\xa4" },
      { "bdquo", "\xe2\x80\x9e" },
      { "beta", "\xce\xb2" },
      { "brkbar", "\xc2\xa6" },
      { "brvbar", "\xc2\xa6" },
      { "bull", "\xe2\x80\xa2" },
      { "cap", "\xe2\x88\xa9" },
      { "ccedil", "\xc3\xa7" },
      { "cedil", "\xc2\xb8" },
      { "cent", "\xc2\xa2" },
      { "chi", "\xcf\x87" },
      { "circ", "\xcb\x86" },
      { "clubs", "\xe2\x99\xa3" },
      { "cong", "\xe2\x89\x85" },
      { "copy", "\xc2\xa9" },
      { "crarr", "\xe2\x86\xb5" },
      { "cup", "\xe2\x88\xaa" },
      { "curren", "\xc2\xa4" },
      { "dArr", "\xe2\x87\x93" },
      { "dagger", "\xe2\x80\xa0" },
      { "darr", "\xe2\x86\x93" },
      { "deg", "\xc2\xb0" },
      { "delta", "\xce\xb4" },
      { "diams", "\xe2\x99\xa6" },
      { "die", "\xc2\xa8" },
      { "divide", "\xc3\xb7" },
      { "eacute", "\xc3\xa9" },
      { "ecirc", "\xc3\xaa" },
      { "egrave", "\xc3\xa8" },
      { "empty", "\xe2\x88\x85" },
      { "emsp", "\xe2\x80\x83" },
      { "ensp", "\xe2\x80\x82" },
      { "epsilon", "\xce\xb5" },
      { "equiv", "\xe2\x89\xa1" },
      { "eta", "\xce\xb7" },
      { "eth", "\xc3\xb0" },
      { "euml", "\xc3\xab" },
      { "euro", "\xe2\x82\xac" },
      { "exist", "\xe2\x88\x83" },
      { "fnof", "\xc6\x92" },
      { "forall", "\xe2\x88\x80" },
      { "frac12", "\xc2\xbd" },
      { "frac14", "\xc2\xbc" },
      { "frac34", "\xc2\xbe" },
      { "frasl", "\xe2\x81\x84" },
      { "gamma", "\xce\xb3" },
      { "ge", "\xe2\x89\xa5" },
      { "gt", "\x3e" },
      { "hArr", "\xe2\x87\x94" },
      { "harr", "\xe2\x86\x94" },
      { "hearts", "\xe2\x99\xa5" },
      { "hellip", "\xe2\x80\xa6" },
      { "hibar", "\xc2\xaf" },
      { "iacute", "\xc3\xad" },
      { "icirc", "\xc3\xae" },
      { "iexcl", "\xc2\xa1" },
      { "igrave", "\xc3\xac" },
      { "image", "\xe2\x84\x91" },
      { "infin", "\xe2\x88\x9e" },
      { "int", "\xe2\x88\xab" },
      { "iota", "\xce\xb9" },
      { "iquest", "\xc2\xbf" },
      { "isin", "\xe2\x88\x88" },
      { "iuml", "\xc3\xaf" },
      { "kappa", "\xce\xba" },
      { "lArr", "\xe2\x87\x90" },
      { "lambda", "\xce\xbb" },
      { "lang", "\xe2\x8c\xa9" },
      { "laquo", "\xc2\xab" },
      { "larr", "\xe2\x86\x90" },
      { "lceil", "\xe2\x8c\x88" },
      { "ldquo", "\xe2\x80\x9c" },
      { "le", "\xe2\x89\xa4" },
      { "lfloor", "\xe2\x8c\x8a" },
      { "lowast", "\xe2\x88\x97" },
      { "loz", "\xe2\x97\x8a" },
      { "lrm", "\xe2\x80\x8e" },
      { "lsaquo", "\xe2\x80\xb9" },
      { "lsquo", "\xe2\x80\x98" },
      { "lt", "\x3c" },
      { "macr", "\xc2\xaf" },
      { "mdash", "\xe2\x80\x94" },
      { "micro", "\xc2\xb5" },
      { "middot", "\xc2\xb7" },
      { "minus", "\xe2\x88\x92" },
      { "mu", "\xce\xbc" },
      { "nabla", "\xe2\x88\x87" },
      { "nbsp", "\xc2\xa0" },
      { "ndash", "\xe2\x80\x93" },
      { "ne", "\xe2\x89\xa0" },
      { "ni", "\xe2\x88\x8b" },
      { "not", "\xc2\xac" },
      { "notin", "\xe2\x88\x89" },
      { "nsub", "\xe2\x8a\x84" },
      { "ntilde", "\xc3\xb1" },
      { "nu", "\xce\xbd" },
      { "oacute", "\xc3\xb3" },
      { "ocirc", "\xc3\xb4" },
      { "oelig", "\xc5\x93" },
      { "ograve", "\xc3\xb2" },
      { "oline", "\xe2\x80\xbe" },
      { "omega", "\xcf\x89" },
      { "omicron", "\xce\xbf" },
      { "oplus", "\xe2\x8a\x95" },
      { "or", "\xe2\x88\xa8" },
      { "ordf", "\xc2\xaa" },
      { "ordm", "\xc2\xba" },
      { "oslash", "\xc3\xb8" },
      { "otilde", "\xc3\xb5" },
      { "otimes", "\xe2\x8a\x97" },
      { "ouml", "\xc3\xb6" },
      { "para", "\xc2\xb6" },
      { "part", "\xe2\x88\x82" },
      { "permil", "\xe2\x80\xb0" },
      { "perp", "\xe2\x8a\xa5" },
      { "phi", "\xcf\x86" },
      { "pi", "\xcf\x80" },
      { "piv", "\xcf\x96" },
      { "plusmn", "\xc2\xb1" },
      { "pound", "\xc2\xa3" },
      { "prime", "\xe2\x80\xb2" },
      { "prod", "\xe2\x88\x8f" },
      { "prop", "\xe2\x88\x9d" },
      { "psi", "\xcf\x88" },
      { "quot", "\x22" },
      { "rArr", "\xe2\x87\x92" },
      { "radic", "\xe2\x88\x9a" },
      { "rang", "\xe2\x8c\xaa" },
      { "raquo", "\xc2\xbb" },
      { "rarr", "\xe2\x86\x92" },
      { "rceil", "\xe2\x8c\x89" },
      { "rdquo", "\xe2\x80\x9d" },
      { "real", "\xe2\x84\x9c" },
      { "reg", "\xc2\xae" },
      { "rfloor", "\xe2\x8c\x8b" },
      { "rho", "\xcf\x81" },
      { "rlm", "\xe2\x80\x8f" },
      { "rsaquo", "\xe2\x80\xba" },
      { "rsquo", "\xe2\x80\x99" },
      { "sbquo", "\xe2\x80\x9a" },
      { "scaron", "\xc5\xa1" },
      { "sdot", "\xe2\x8b\x85" },
      { "sect", "\xc2\xa7" },
      { "shy", "\xc2\xad" },
      { "sigma", "\xcf\x83" },
      { "sigmaf", "\xcf\x82" },
      { "sim", "\xe2\x88\xbc" },
      { "spades", "\xe2\x99\xa0" },
      { "sub", "\xe2\x8a\x82" },
      { "sube", "\xe2\x8a\x86" },
      { "sum", "\xe2\x88\x91" },
      { "sup", "\xe2\x8a\x83" },
      { "sup1", "\xc2\xb9" },
      { "sup2", "\xc2\xb2" },
      { "sup3", "\xc2\xb3" },
      { "supe", "\xe2\x8a\x87" },
      { "szlig", "\xc3\x9f" },
      { "tau", "\xcf\x84" },
      { "there4", "\xe2\x88\xb4" },
      { "theta", "\xce\xb8" },
      { "thetasym", "\xcf\x91" },
      { "thinsp", "\xe2\x80\x89" },
      { "thorn", "\xc3\xbe" },
      { "tilde", "\xcb\x9c" },
      { "times", "\xc3\x97" },
      { "trade", "\xe2\x84\xa2" },
      { "uArr", "\xe2\x87\x91" },
      { "uacute", "\xc3\xba" },
      { "uarr", "\xe2\x86\x91" },
      { "ucirc", "\xc3\xbb" },
      { "ugrave", "\xc3\xb9" },
      { "uml", "\xc2\xa8" },
      { "upsih", "\xcf\x92" },
      { "upsilon", "\xcf\x85" },
      { "uuml", "\xc3\xbc" },
      { "weierp", "\xe2\x84\x98" },
      { "xi", "\xce\xbe" },
      { "yacute", "\xc3\xbd" },
      { "yen", "\xc2\xa5" },
      { "yuml", "\xc3\xbf" },
      { "zeta", "\xce\xb6" },
      { "zwj", "\xe2\x80\x8d" },
      { "zwnj", "\xe2\x80\x8c" }
    };

    // Derived from the table itself so the two can never get out of sync.
    const int entity_count
      = static_cast<int>(sizeof(entities) / sizeof(entities[0]));

    // First find ';'. The longest entity name is 8 chars (thetasym), so the
    // ';' must show up within the first 9 characters after the '&'.
    const unsigned max_name_length = 8;
    char entity[max_name_length + 1];
    unsigned length = 0;
    bool entity_ok = false;

    for (; length <= max_name_length; ++length) {
      const Ch ch = src[1 + length];

      if (ch == Ch(';')) {
        entity[length] = 0;
        entity_ok = true;
        break;
      }

      // The input ended before a ';' was seen: stop here instead of
      // reading (and later skipping) past the string terminator.
      if (ch == Ch(0))
        break;

      entity[length] = static_cast<char>(ch);
    }

    if (!entity_ok)
      return false;

    // Binary search for the name in the (sorted) entity table.
    int lo = 0;             // inclusive
    int hi = entity_count;  // exclusive
    int found = -1;

    while (lo < hi) {
      const int mid = lo + (hi - lo) / 2;
      const int cmp = std::strcmp(entity, entities[mid].name);

      if (cmp == 0) {
        found = mid;
        break;
      }

      if (cmp < 0)
        hi = mid;
      else
        lo = mid + 1;
    }

    if (found < 0)
      return false; // not found

    // 'length' is the number of name characters copied before the ';',
    // i.e. the string length of 'entity'; + 2 skips the '&' and the ';'.
    src += length + 2;
    for (const Ch *c = entities[found].utf8; *c; ++c)
      *dest++ = *c;

    return true;
  }
}
}

#endif