1 /* 2 * HTMLparser.c : an HTML 4.0 non-verifying parser 3 * 4 * See Copyright for the status of this software. 5 * 6 * daniel@veillard.com 7 */ 8 9 #define IN_LIBXML 10 #include "libxml.h" 11 #ifdef LIBXML_HTML_ENABLED 12 13 #include <string.h> 14 #ifdef HAVE_CTYPE_H 15 #include <ctype.h> 16 #endif 17 #ifdef HAVE_STDLIB_H 18 #include <stdlib.h> 19 #endif 20 #ifdef HAVE_SYS_STAT_H 21 #include <sys/stat.h> 22 #endif 23 #ifdef HAVE_FCNTL_H 24 #include <fcntl.h> 25 #endif 26 #ifdef HAVE_UNISTD_H 27 #include <unistd.h> 28 #endif 29 #ifdef LIBXML_ZLIB_ENABLED 30 #include <zlib.h> 31 #endif 32 33 #include <libxml/xmlmemory.h> 34 #include <libxml/tree.h> 35 #include <libxml/parser.h> 36 #include <libxml/parserInternals.h> 37 #include <libxml/xmlerror.h> 38 #include <libxml/HTMLparser.h> 39 #include <libxml/HTMLtree.h> 40 #include <libxml/entities.h> 41 #include <libxml/encoding.h> 42 #include <libxml/valid.h> 43 #include <libxml/xmlIO.h> 44 #include <libxml/globals.h> 45 #include <libxml/uri.h> 46 47 #include "buf.h" 48 #include "enc.h" 49 50 #define HTML_MAX_NAMELEN 1000 51 #define HTML_PARSER_BIG_BUFFER_SIZE 1000 52 #define HTML_PARSER_BUFFER_SIZE 100 53 54 /* #define DEBUG */ 55 /* #define DEBUG_PUSH */ 56 57 static int htmlOmittedDefaultValue = 1; 58 59 xmlChar * htmlDecodeEntities(htmlParserCtxtPtr ctxt, int len, 60 xmlChar end, xmlChar end2, xmlChar end3); 61 static void htmlParseComment(htmlParserCtxtPtr ctxt); 62 63 /************************************************************************ 64 * * 65 * Some factorized error routines * 66 * * 67 ************************************************************************/ 68 69 /** 70 * htmlErrMemory: 71 * @ctxt: an HTML parser context 72 * @extra: extra information 73 * 74 * Handle a redefinition of attribute error 75 */ 76 static void 77 htmlErrMemory(xmlParserCtxtPtr ctxt, const char *extra) 78 { 79 if ((ctxt != NULL) && (ctxt->disableSAX != 0) && 80 (ctxt->instate == XML_PARSER_EOF)) 81 return; 82 if (ctxt != NULL) { 
83 ctxt->errNo = XML_ERR_NO_MEMORY; 84 ctxt->instate = XML_PARSER_EOF; 85 ctxt->disableSAX = 1; 86 } 87 if (extra) 88 __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_PARSER, 89 XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, extra, 90 NULL, NULL, 0, 0, 91 "Memory allocation failed : %s\n", extra); 92 else 93 __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_PARSER, 94 XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, NULL, 95 NULL, NULL, 0, 0, "Memory allocation failed\n"); 96 } 97 98 /** 99 * htmlParseErr: 100 * @ctxt: an HTML parser context 101 * @error: the error number 102 * @msg: the error message 103 * @str1: string infor 104 * @str2: string infor 105 * 106 * Handle a fatal parser error, i.e. violating Well-Formedness constraints 107 */ 108 static void LIBXML_ATTR_FORMAT(3,0) 109 htmlParseErr(xmlParserCtxtPtr ctxt, xmlParserErrors error, 110 const char *msg, const xmlChar *str1, const xmlChar *str2) 111 { 112 if ((ctxt != NULL) && (ctxt->disableSAX != 0) && 113 (ctxt->instate == XML_PARSER_EOF)) 114 return; 115 if (ctxt != NULL) 116 ctxt->errNo = error; 117 __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_HTML, error, 118 XML_ERR_ERROR, NULL, 0, 119 (const char *) str1, (const char *) str2, 120 NULL, 0, 0, 121 msg, str1, str2); 122 if (ctxt != NULL) 123 ctxt->wellFormed = 0; 124 } 125 126 /** 127 * htmlParseErrInt: 128 * @ctxt: an HTML parser context 129 * @error: the error number 130 * @msg: the error message 131 * @val: integer info 132 * 133 * Handle a fatal parser error, i.e. 
violating Well-Formedness constraints 134 */ 135 static void LIBXML_ATTR_FORMAT(3,0) 136 htmlParseErrInt(xmlParserCtxtPtr ctxt, xmlParserErrors error, 137 const char *msg, int val) 138 { 139 if ((ctxt != NULL) && (ctxt->disableSAX != 0) && 140 (ctxt->instate == XML_PARSER_EOF)) 141 return; 142 if (ctxt != NULL) 143 ctxt->errNo = error; 144 __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_HTML, error, 145 XML_ERR_ERROR, NULL, 0, NULL, NULL, 146 NULL, val, 0, msg, val); 147 if (ctxt != NULL) 148 ctxt->wellFormed = 0; 149 } 150 151 /************************************************************************ 152 * * 153 * Parser stacks related functions and macros * 154 * * 155 ************************************************************************/ 156 157 /** 158 * htmlnamePush: 159 * @ctxt: an HTML parser context 160 * @value: the element name 161 * 162 * Pushes a new element name on top of the name stack 163 * 164 * Returns 0 in case of error, the index in the stack otherwise 165 */ 166 static int 167 htmlnamePush(htmlParserCtxtPtr ctxt, const xmlChar * value) 168 { 169 if ((ctxt->html < 3) && (xmlStrEqual(value, BAD_CAST "head"))) 170 ctxt->html = 3; 171 if ((ctxt->html < 10) && (xmlStrEqual(value, BAD_CAST "body"))) 172 ctxt->html = 10; 173 if (ctxt->nameNr >= ctxt->nameMax) { 174 ctxt->nameMax *= 2; 175 ctxt->nameTab = (const xmlChar * *) 176 xmlRealloc((xmlChar * *)ctxt->nameTab, 177 ctxt->nameMax * 178 sizeof(ctxt->nameTab[0])); 179 if (ctxt->nameTab == NULL) { 180 htmlErrMemory(ctxt, NULL); 181 return (0); 182 } 183 } 184 ctxt->nameTab[ctxt->nameNr] = value; 185 ctxt->name = value; 186 return (ctxt->nameNr++); 187 } 188 /** 189 * htmlnamePop: 190 * @ctxt: an HTML parser context 191 * 192 * Pops the top element name from the name stack 193 * 194 * Returns the name just removed 195 */ 196 static const xmlChar * 197 htmlnamePop(htmlParserCtxtPtr ctxt) 198 { 199 const xmlChar *ret; 200 201 if (ctxt->nameNr <= 0) 202 return (NULL); 203 ctxt->nameNr--; 204 if 
(ctxt->nameNr < 0) 205 return (NULL); 206 if (ctxt->nameNr > 0) 207 ctxt->name = ctxt->nameTab[ctxt->nameNr - 1]; 208 else 209 ctxt->name = NULL; 210 ret = ctxt->nameTab[ctxt->nameNr]; 211 ctxt->nameTab[ctxt->nameNr] = NULL; 212 return (ret); 213 } 214 215 /** 216 * htmlNodeInfoPush: 217 * @ctxt: an HTML parser context 218 * @value: the node info 219 * 220 * Pushes a new element name on top of the node info stack 221 * 222 * Returns 0 in case of error, the index in the stack otherwise 223 */ 224 static int 225 htmlNodeInfoPush(htmlParserCtxtPtr ctxt, htmlParserNodeInfo *value) 226 { 227 if (ctxt->nodeInfoNr >= ctxt->nodeInfoMax) { 228 if (ctxt->nodeInfoMax == 0) 229 ctxt->nodeInfoMax = 5; 230 ctxt->nodeInfoMax *= 2; 231 ctxt->nodeInfoTab = (htmlParserNodeInfo *) 232 xmlRealloc((htmlParserNodeInfo *)ctxt->nodeInfoTab, 233 ctxt->nodeInfoMax * 234 sizeof(ctxt->nodeInfoTab[0])); 235 if (ctxt->nodeInfoTab == NULL) { 236 htmlErrMemory(ctxt, NULL); 237 return (0); 238 } 239 } 240 ctxt->nodeInfoTab[ctxt->nodeInfoNr] = *value; 241 ctxt->nodeInfo = &ctxt->nodeInfoTab[ctxt->nodeInfoNr]; 242 return (ctxt->nodeInfoNr++); 243 } 244 245 /** 246 * htmlNodeInfoPop: 247 * @ctxt: an HTML parser context 248 * 249 * Pops the top element name from the node info stack 250 * 251 * Returns 0 in case of error, the pointer to NodeInfo otherwise 252 */ 253 static htmlParserNodeInfo * 254 htmlNodeInfoPop(htmlParserCtxtPtr ctxt) 255 { 256 if (ctxt->nodeInfoNr <= 0) 257 return (NULL); 258 ctxt->nodeInfoNr--; 259 if (ctxt->nodeInfoNr < 0) 260 return (NULL); 261 if (ctxt->nodeInfoNr > 0) 262 ctxt->nodeInfo = &ctxt->nodeInfoTab[ctxt->nodeInfoNr - 1]; 263 else 264 ctxt->nodeInfo = NULL; 265 return &ctxt->nodeInfoTab[ctxt->nodeInfoNr]; 266 } 267 268 /* 269 * Macros for accessing the content. Those should be used only by the parser, 270 * and not exported. 271 * 272 * Dirty macros, i.e. 
/*
 * Macros for accessing the content. Those should be used only by the parser,
 * and not exported. All of them expect a variable named `ctxt` to be in
 * scope.
 *
 * Byte-level macros: they look at raw xmlChar values of the input and
 * should only be used to compare against ASCII values.
 *
 *   CUR_PTR    the current pointer to the xmlChar to be parsed.
 *   BASE_PTR   the base pointer of the current input.
 *   CUR        the current xmlChar value, as an int.
 *   CURRENT    same value as CUR.
 *   RAW        the current xmlChar, or -1 when ctxt->token is set.
 *   UPPER      the current xmlChar converted to uppercase.
 *   NXT(n)     the n'th next xmlChar.
 *   UPP(n)     the n'th next xmlChar converted to uppercase.
 *   SKIP(n)    skip n xmlChar, advancing the column by n; must only be
 *              used to skip ASCII strings without newlines.
 *
 * Buffer management:
 *
 *   SHRINK     shrink the input when more than 2 * INPUT_CHUNK bytes were
 *              consumed and fewer than 2 * INPUT_CHUNK remain.
 *   GROW       when not in progressive mode, grow the input if fewer
 *              than INPUT_CHUNK bytes remain.
 *
 * Character-level macros:
 *
 *   SKIP_BLANKS      htmlSkipBlankChars(ctxt).
 *   NEXT             xmlNextChar(ctxt).
 *   CUR_CHAR(l)      htmlCurrentChar(ctxt, &l): current char, length in l.
 *   CUR_SCHAR(s, l)  xmlStringCurrentChar(ctxt, s, &l).
 *   NEXTL(l)         skip the current character of l xmlChars, updating
 *                    line/column and clearing ctxt->token.
 *   COPY_BUF(l,b,i,v) append char v (l bytes long) to b at index i and
 *                    advance i.
 *
 * SHRINK, GROW, NEXTL and COPY_BUF are statement macros wrapped in
 * do { } while (0) so that they behave as one statement, including in
 * an unbraced if/else.
 */

#define UPPER (toupper(*ctxt->input->cur))

#define SKIP(val) (ctxt->input->cur += (val), ctxt->input->col += (val))

#define NXT(val) ctxt->input->cur[(val)]

#define UPP(val) (toupper(ctxt->input->cur[(val)]))

#define CUR_PTR ctxt->input->cur
#define BASE_PTR ctxt->input->base

#define SHRINK do {                                                      \
    if ((ctxt->input->cur - ctxt->input->base > 2 * INPUT_CHUNK) &&      \
        (ctxt->input->end - ctxt->input->cur < 2 * INPUT_CHUNK))         \
        xmlParserInputShrink(ctxt->input);                               \
  } while (0)

#define GROW do {                                                        \
    if ((ctxt->progressive == 0) &&                                      \
        (ctxt->input->end - ctxt->input->cur < INPUT_CHUNK))             \
        xmlParserInputGrow(ctxt->input, INPUT_CHUNK);                    \
  } while (0)

#define CURRENT ((int) (*ctxt->input->cur))

#define SKIP_BLANKS htmlSkipBlankChars(ctxt)

/* Imported from XML */

#define CUR ((int) (*ctxt->input->cur))
#define NEXT xmlNextChar(ctxt)

#define RAW (ctxt->token ? -1 : (*ctxt->input->cur))


#define NEXTL(l) do {                                                    \
    if (*(ctxt->input->cur) == '\n') {                                   \
        ctxt->input->line++; ctxt->input->col = 1;                       \
    } else ctxt->input->col++;                                           \
    ctxt->token = 0; ctxt->input->cur += (l);                            \
  } while (0)

#define CUR_CHAR(l) htmlCurrentChar(ctxt, &(l))
#define CUR_SCHAR(s, l) xmlStringCurrentChar(ctxt, (s), &(l))

#define COPY_BUF(l,b,i,v) do {                                           \
    if ((l) == 1) (b)[(i)++] = (xmlChar) (v);                            \
    else (i) += xmlCopyChar((l), &(b)[(i)], (v));                        \
  } while (0)
356 * That's an heuristic, since it's operating outside of parsing it could 357 * try to use a meta which had been commented out, that's the reason it 358 * should only be used in case of error, not as a default. 359 * 360 * Returns an encoding string or NULL if not found, the string need to 361 * be freed 362 */ 363 static xmlChar * 364 htmlFindEncoding(xmlParserCtxtPtr ctxt) { 365 const xmlChar *start, *cur, *end; 366 367 if ((ctxt == NULL) || (ctxt->input == NULL) || 368 (ctxt->input->encoding != NULL) || (ctxt->input->buf == NULL) || 369 (ctxt->input->buf->encoder != NULL)) 370 return(NULL); 371 if ((ctxt->input->cur == NULL) || (ctxt->input->end == NULL)) 372 return(NULL); 373 374 start = ctxt->input->cur; 375 end = ctxt->input->end; 376 /* we also expect the input buffer to be zero terminated */ 377 if (*end != 0) 378 return(NULL); 379 380 cur = xmlStrcasestr(start, BAD_CAST "HTTP-EQUIV"); 381 if (cur == NULL) 382 return(NULL); 383 cur = xmlStrcasestr(cur, BAD_CAST "CONTENT"); 384 if (cur == NULL) 385 return(NULL); 386 cur = xmlStrcasestr(cur, BAD_CAST "CHARSET="); 387 if (cur == NULL) 388 return(NULL); 389 cur += 8; 390 start = cur; 391 while (((*cur >= 'A') && (*cur <= 'Z')) || 392 ((*cur >= 'a') && (*cur <= 'z')) || 393 ((*cur >= '0') && (*cur <= '9')) || 394 (*cur == '-') || (*cur == '_') || (*cur == ':') || (*cur == '/')) 395 cur++; 396 if (cur == start) 397 return(NULL); 398 return(xmlStrndup(start, cur - start)); 399 } 400 401 /** 402 * htmlCurrentChar: 403 * @ctxt: the HTML parser context 404 * @len: pointer to the length of the char read 405 * 406 * The current char value, if using UTF-8 this may actually span multiple 407 * bytes in the input buffer. Implement the end of line normalization: 408 * 2.11 End-of-Line Handling 409 * If the encoding is unspecified, in the case we find an ISO-Latin-1 410 * char, then the encoding converter is plugged in automatically. 
411 * 412 * Returns the current char value and its length 413 */ 414 415 static int 416 htmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) { 417 const unsigned char *cur; 418 unsigned char c; 419 unsigned int val; 420 421 if (ctxt->instate == XML_PARSER_EOF) 422 return(0); 423 424 if (ctxt->token != 0) { 425 *len = 0; 426 return(ctxt->token); 427 } 428 if (ctxt->charset != XML_CHAR_ENCODING_UTF8) { 429 xmlChar * guess; 430 xmlCharEncodingHandlerPtr handler; 431 432 /* 433 * Assume it's a fixed length encoding (1) with 434 * a compatible encoding for the ASCII set, since 435 * HTML constructs only use < 128 chars 436 */ 437 if ((int) *ctxt->input->cur < 0x80) { 438 *len = 1; 439 if ((*ctxt->input->cur == 0) && 440 (ctxt->input->cur < ctxt->input->end)) { 441 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR, 442 "Char 0x%X out of allowed range\n", 0); 443 return(' '); 444 } 445 return((int) *ctxt->input->cur); 446 } 447 448 /* 449 * Humm this is bad, do an automatic flow conversion 450 */ 451 guess = htmlFindEncoding(ctxt); 452 if (guess == NULL) { 453 xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1); 454 } else { 455 if (ctxt->input->encoding != NULL) 456 xmlFree((xmlChar *) ctxt->input->encoding); 457 ctxt->input->encoding = guess; 458 handler = xmlFindCharEncodingHandler((const char *) guess); 459 if (handler != NULL) { 460 /* 461 * Don't use UTF-8 encoder which isn't required and 462 * can produce invalid UTF-8. 463 */ 464 if (!xmlStrEqual(BAD_CAST handler->name, BAD_CAST "UTF-8")) 465 xmlSwitchToEncoding(ctxt, handler); 466 } else { 467 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING, 468 "Unsupported encoding %s", guess, NULL); 469 } 470 } 471 ctxt->charset = XML_CHAR_ENCODING_UTF8; 472 } 473 474 /* 475 * We are supposed to handle UTF8, check it's valid 476 * From rfc2044: encoding of the Unicode values on UTF-8: 477 * 478 * UCS-4 range (hex.) 
UTF-8 octet sequence (binary) 479 * 0000 0000-0000 007F 0xxxxxxx 480 * 0000 0080-0000 07FF 110xxxxx 10xxxxxx 481 * 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx 482 * 483 * Check for the 0x110000 limit too 484 */ 485 cur = ctxt->input->cur; 486 c = *cur; 487 if (c & 0x80) { 488 if ((c & 0x40) == 0) 489 goto encoding_error; 490 if (cur[1] == 0) { 491 xmlParserInputGrow(ctxt->input, INPUT_CHUNK); 492 cur = ctxt->input->cur; 493 } 494 if ((cur[1] & 0xc0) != 0x80) 495 goto encoding_error; 496 if ((c & 0xe0) == 0xe0) { 497 498 if (cur[2] == 0) { 499 xmlParserInputGrow(ctxt->input, INPUT_CHUNK); 500 cur = ctxt->input->cur; 501 } 502 if ((cur[2] & 0xc0) != 0x80) 503 goto encoding_error; 504 if ((c & 0xf0) == 0xf0) { 505 if (cur[3] == 0) { 506 xmlParserInputGrow(ctxt->input, INPUT_CHUNK); 507 cur = ctxt->input->cur; 508 } 509 if (((c & 0xf8) != 0xf0) || 510 ((cur[3] & 0xc0) != 0x80)) 511 goto encoding_error; 512 /* 4-byte code */ 513 *len = 4; 514 val = (cur[0] & 0x7) << 18; 515 val |= (cur[1] & 0x3f) << 12; 516 val |= (cur[2] & 0x3f) << 6; 517 val |= cur[3] & 0x3f; 518 if (val < 0x10000) 519 goto encoding_error; 520 } else { 521 /* 3-byte code */ 522 *len = 3; 523 val = (cur[0] & 0xf) << 12; 524 val |= (cur[1] & 0x3f) << 6; 525 val |= cur[2] & 0x3f; 526 if (val < 0x800) 527 goto encoding_error; 528 } 529 } else { 530 /* 2-byte code */ 531 *len = 2; 532 val = (cur[0] & 0x1f) << 6; 533 val |= cur[1] & 0x3f; 534 if (val < 0x80) 535 goto encoding_error; 536 } 537 if (!IS_CHAR(val)) { 538 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR, 539 "Char 0x%X out of allowed range\n", val); 540 } 541 return(val); 542 } else { 543 if ((*ctxt->input->cur == 0) && 544 (ctxt->input->cur < ctxt->input->end)) { 545 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR, 546 "Char 0x%X out of allowed range\n", 0); 547 *len = 1; 548 return(' '); 549 } 550 /* 1-byte code */ 551 *len = 1; 552 return((int) *ctxt->input->cur); 553 } 554 555 encoding_error: 556 /* 557 * If we detect an UTF8 error that probably 
mean that the 558 * input encoding didn't get properly advertised in the 559 * declaration header. Report the error and switch the encoding 560 * to ISO-Latin-1 (if you don't like this policy, just declare the 561 * encoding !) 562 */ 563 { 564 char buffer[150]; 565 566 if (ctxt->input->end - ctxt->input->cur >= 4) { 567 snprintf(buffer, 149, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n", 568 ctxt->input->cur[0], ctxt->input->cur[1], 569 ctxt->input->cur[2], ctxt->input->cur[3]); 570 } else { 571 snprintf(buffer, 149, "Bytes: 0x%02X\n", ctxt->input->cur[0]); 572 } 573 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING, 574 "Input is not proper UTF-8, indicate encoding !\n", 575 BAD_CAST buffer, NULL); 576 } 577 578 /* 579 * Don't switch encodings twice. Note that if there's an encoder, we 580 * shouldn't receive invalid UTF-8 anyway. 581 * 582 * Note that if ctxt->input->buf == NULL, switching encodings is 583 * impossible, see Gitlab issue #34. 584 */ 585 if ((ctxt->input->buf != NULL) && 586 (ctxt->input->buf->encoder == NULL)) 587 xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1); 588 *len = 1; 589 return((int) *ctxt->input->cur); 590 } 591 592 /** 593 * htmlSkipBlankChars: 594 * @ctxt: the HTML parser context 595 * 596 * skip all blanks character found at that point in the input streams. 
597 * 598 * Returns the number of space chars skipped 599 */ 600 601 static int 602 htmlSkipBlankChars(xmlParserCtxtPtr ctxt) { 603 int res = 0; 604 605 while (IS_BLANK_CH(*(ctxt->input->cur))) { 606 if ((*ctxt->input->cur == 0) && 607 (xmlParserInputGrow(ctxt->input, INPUT_CHUNK) <= 0)) { 608 xmlPopInput(ctxt); 609 } else { 610 if (*(ctxt->input->cur) == '\n') { 611 ctxt->input->line++; ctxt->input->col = 1; 612 } else ctxt->input->col++; 613 ctxt->input->cur++; 614 if (*ctxt->input->cur == 0) 615 xmlParserInputGrow(ctxt->input, INPUT_CHUNK); 616 } 617 res++; 618 } 619 return(res); 620 } 621 622 623 624 /************************************************************************ 625 * * 626 * The list of HTML elements and their properties * 627 * * 628 ************************************************************************/ 629 630 /* 631 * Start Tag: 1 means the start tag can be omitted 632 * End Tag: 1 means the end tag can be omitted 633 * 2 means it's forbidden (empty elements) 634 * 3 means the tag is stylistic and should be closed easily 635 * Depr: this element is deprecated 636 * DTD: 1 means that this element is valid only in the Loose DTD 637 * 2 means that this element is valid only in the Frameset DTD 638 * 639 * Name,Start Tag,End Tag,Save End,Empty,Deprecated,DTD,inline,Description 640 , subElements , impliedsubelt , Attributes, userdata 641 */ 642 643 /* Definitions and a couple of vars for HTML Elements */ 644 645 #define FONTSTYLE "tt", "i", "b", "u", "s", "strike", "big", "small" 646 #define NB_FONTSTYLE 8 647 #define PHRASE "em", "strong", "dfn", "code", "samp", "kbd", "var", "cite", "abbr", "acronym" 648 #define NB_PHRASE 10 649 #define SPECIAL "a", "img", "applet", "embed", "object", "font", "basefont", "br", "script", "map", "q", "sub", "sup", "span", "bdo", "iframe" 650 #define NB_SPECIAL 16 651 #define INLINE FONTSTYLE, PHRASE, SPECIAL, FORMCTRL 652 #define NB_INLINE NB_PCDATA + NB_FONTSTYLE + NB_PHRASE + NB_SPECIAL + NB_FORMCTRL 653 #define 
BLOCK HEADING, LIST, "pre", "p", "dl", "div", "center", "noscript", "noframes", "blockquote", "form", "isindex", "hr", "table", "fieldset", "address" 654 #define NB_BLOCK NB_HEADING + NB_LIST + 14 655 #define FORMCTRL "input", "select", "textarea", "label", "button" 656 #define NB_FORMCTRL 5 657 #define PCDATA 658 #define NB_PCDATA 0 659 #define HEADING "h1", "h2", "h3", "h4", "h5", "h6" 660 #define NB_HEADING 6 661 #define LIST "ul", "ol", "dir", "menu" 662 #define NB_LIST 4 663 #define MODIFIER 664 #define NB_MODIFIER 0 665 #define FLOW BLOCK,INLINE 666 #define NB_FLOW NB_BLOCK + NB_INLINE 667 #define EMPTY NULL 668 669 670 static const char* const html_flow[] = { FLOW, NULL } ; 671 static const char* const html_inline[] = { INLINE, NULL } ; 672 673 /* placeholders: elts with content but no subelements */ 674 static const char* const html_pcdata[] = { NULL } ; 675 #define html_cdata html_pcdata 676 677 678 /* ... and for HTML Attributes */ 679 680 #define COREATTRS "id", "class", "style", "title" 681 #define NB_COREATTRS 4 682 #define I18N "lang", "dir" 683 #define NB_I18N 2 684 #define EVENTS "onclick", "ondblclick", "onmousedown", "onmouseup", "onmouseover", "onmouseout", "onkeypress", "onkeydown", "onkeyup" 685 #define NB_EVENTS 9 686 #define ATTRS COREATTRS,I18N,EVENTS 687 #define NB_ATTRS NB_NB_COREATTRS + NB_I18N + NB_EVENTS 688 #define CELLHALIGN "align", "char", "charoff" 689 #define NB_CELLHALIGN 3 690 #define CELLVALIGN "valign" 691 #define NB_CELLVALIGN 1 692 693 static const char* const html_attrs[] = { ATTRS, NULL } ; 694 static const char* const core_i18n_attrs[] = { COREATTRS, I18N, NULL } ; 695 static const char* const core_attrs[] = { COREATTRS, NULL } ; 696 static const char* const i18n_attrs[] = { I18N, NULL } ; 697 698 699 /* Other declarations that should go inline ... 
*/ 700 static const char* const a_attrs[] = { ATTRS, "charset", "type", "name", 701 "href", "hreflang", "rel", "rev", "accesskey", "shape", "coords", 702 "tabindex", "onfocus", "onblur", NULL } ; 703 static const char* const target_attr[] = { "target", NULL } ; 704 static const char* const rows_cols_attr[] = { "rows", "cols", NULL } ; 705 static const char* const alt_attr[] = { "alt", NULL } ; 706 static const char* const src_alt_attrs[] = { "src", "alt", NULL } ; 707 static const char* const href_attrs[] = { "href", NULL } ; 708 static const char* const clear_attrs[] = { "clear", NULL } ; 709 static const char* const inline_p[] = { INLINE, "p", NULL } ; 710 711 static const char* const flow_param[] = { FLOW, "param", NULL } ; 712 static const char* const applet_attrs[] = { COREATTRS , "codebase", 713 "archive", "alt", "name", "height", "width", "align", 714 "hspace", "vspace", NULL } ; 715 static const char* const area_attrs[] = { "shape", "coords", "href", "nohref", 716 "tabindex", "accesskey", "onfocus", "onblur", NULL } ; 717 static const char* const basefont_attrs[] = 718 { "id", "size", "color", "face", NULL } ; 719 static const char* const quote_attrs[] = { ATTRS, "cite", NULL } ; 720 static const char* const body_contents[] = { FLOW, "ins", "del", NULL } ; 721 static const char* const body_attrs[] = { ATTRS, "onload", "onunload", NULL } ; 722 static const char* const body_depr[] = { "background", "bgcolor", "text", 723 "link", "vlink", "alink", NULL } ; 724 static const char* const button_attrs[] = { ATTRS, "name", "value", "type", 725 "disabled", "tabindex", "accesskey", "onfocus", "onblur", NULL } ; 726 727 728 static const char* const col_attrs[] = { ATTRS, "span", "width", CELLHALIGN, CELLVALIGN, NULL } ; 729 static const char* const col_elt[] = { "col", NULL } ; 730 static const char* const edit_attrs[] = { ATTRS, "datetime", "cite", NULL } ; 731 static const char* const compact_attrs[] = { ATTRS, "compact", NULL } ; 732 static const char* const 
dl_contents[] = { "dt", "dd", NULL } ; 733 static const char* const compact_attr[] = { "compact", NULL } ; 734 static const char* const label_attr[] = { "label", NULL } ; 735 static const char* const fieldset_contents[] = { FLOW, "legend" } ; 736 static const char* const font_attrs[] = { COREATTRS, I18N, "size", "color", "face" , NULL } ; 737 static const char* const form_contents[] = { HEADING, LIST, INLINE, "pre", "p", "div", "center", "noscript", "noframes", "blockquote", "isindex", "hr", "table", "fieldset", "address", NULL } ; 738 static const char* const form_attrs[] = { ATTRS, "method", "enctype", "accept", "name", "onsubmit", "onreset", "accept-charset", NULL } ; 739 static const char* const frame_attrs[] = { COREATTRS, "longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "noresize", "scrolling" , NULL } ; 740 static const char* const frameset_attrs[] = { COREATTRS, "rows", "cols", "onload", "onunload", NULL } ; 741 static const char* const frameset_contents[] = { "frameset", "frame", "noframes", NULL } ; 742 static const char* const head_attrs[] = { I18N, "profile", NULL } ; 743 static const char* const head_contents[] = { "title", "isindex", "base", "script", "style", "meta", "link", "object", NULL } ; 744 static const char* const hr_depr[] = { "align", "noshade", "size", "width", NULL } ; 745 static const char* const version_attr[] = { "version", NULL } ; 746 static const char* const html_content[] = { "head", "body", "frameset", NULL } ; 747 static const char* const iframe_attrs[] = { COREATTRS, "longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "scrolling", "align", "height", "width", NULL } ; 748 static const char* const img_attrs[] = { ATTRS, "longdesc", "name", "height", "width", "usemap", "ismap", NULL } ; 749 static const char* const embed_attrs[] = { COREATTRS, "align", "alt", "border", "code", "codebase", "frameborder", "height", "hidden", "hspace", "name", "palette", "pluginspace", "pluginurl", "src", 
"type", "units", "vspace", "width", NULL } ; 750 static const char* const input_attrs[] = { ATTRS, "type", "name", "value", "checked", "disabled", "readonly", "size", "maxlength", "src", "alt", "usemap", "ismap", "tabindex", "accesskey", "onfocus", "onblur", "onselect", "onchange", "accept", NULL } ; 751 static const char* const prompt_attrs[] = { COREATTRS, I18N, "prompt", NULL } ; 752 static const char* const label_attrs[] = { ATTRS, "for", "accesskey", "onfocus", "onblur", NULL } ; 753 static const char* const legend_attrs[] = { ATTRS, "accesskey", NULL } ; 754 static const char* const align_attr[] = { "align", NULL } ; 755 static const char* const link_attrs[] = { ATTRS, "charset", "href", "hreflang", "type", "rel", "rev", "media", NULL } ; 756 static const char* const map_contents[] = { BLOCK, "area", NULL } ; 757 static const char* const name_attr[] = { "name", NULL } ; 758 static const char* const action_attr[] = { "action", NULL } ; 759 static const char* const blockli_elt[] = { BLOCK, "li", NULL } ; 760 static const char* const meta_attrs[] = { I18N, "http-equiv", "name", "scheme", "charset", NULL } ; 761 static const char* const content_attr[] = { "content", NULL } ; 762 static const char* const type_attr[] = { "type", NULL } ; 763 static const char* const noframes_content[] = { "body", FLOW MODIFIER, NULL } ; 764 static const char* const object_contents[] = { FLOW, "param", NULL } ; 765 static const char* const object_attrs[] = { ATTRS, "declare", "classid", "codebase", "data", "type", "codetype", "archive", "standby", "height", "width", "usemap", "name", "tabindex", NULL } ; 766 static const char* const object_depr[] = { "align", "border", "hspace", "vspace", NULL } ; 767 static const char* const ol_attrs[] = { "type", "compact", "start", NULL} ; 768 static const char* const option_elt[] = { "option", NULL } ; 769 static const char* const optgroup_attrs[] = { ATTRS, "disabled", NULL } ; 770 static const char* const option_attrs[] = { ATTRS, "disabled", 
"label", "selected", "value", NULL } ; 771 static const char* const param_attrs[] = { "id", "value", "valuetype", "type", NULL } ; 772 static const char* const width_attr[] = { "width", NULL } ; 773 static const char* const pre_content[] = { PHRASE, "tt", "i", "b", "u", "s", "strike", "a", "br", "script", "map", "q", "span", "bdo", "iframe", NULL } ; 774 static const char* const script_attrs[] = { "charset", "src", "defer", "event", "for", NULL } ; 775 static const char* const language_attr[] = { "language", NULL } ; 776 static const char* const select_content[] = { "optgroup", "option", NULL } ; 777 static const char* const select_attrs[] = { ATTRS, "name", "size", "multiple", "disabled", "tabindex", "onfocus", "onblur", "onchange", NULL } ; 778 static const char* const style_attrs[] = { I18N, "media", "title", NULL } ; 779 static const char* const table_attrs[] = { ATTRS, "summary", "width", "border", "frame", "rules", "cellspacing", "cellpadding", "datapagesize", NULL } ; 780 static const char* const table_depr[] = { "align", "bgcolor", NULL } ; 781 static const char* const table_contents[] = { "caption", "col", "colgroup", "thead", "tfoot", "tbody", "tr", NULL} ; 782 static const char* const tr_elt[] = { "tr", NULL } ; 783 static const char* const talign_attrs[] = { ATTRS, CELLHALIGN, CELLVALIGN, NULL} ; 784 static const char* const th_td_depr[] = { "nowrap", "bgcolor", "width", "height", NULL } ; 785 static const char* const th_td_attr[] = { ATTRS, "abbr", "axis", "headers", "scope", "rowspan", "colspan", CELLHALIGN, CELLVALIGN, NULL } ; 786 static const char* const textarea_attrs[] = { ATTRS, "name", "disabled", "readonly", "tabindex", "accesskey", "onfocus", "onblur", "onselect", "onchange", NULL } ; 787 static const char* const tr_contents[] = { "th", "td", NULL } ; 788 static const char* const bgcolor_attr[] = { "bgcolor", NULL } ; 789 static const char* const li_elt[] = { "li", NULL } ; 790 static const char* const ul_depr[] = { "type", "compact", NULL} ; 
791 static const char* const dir_attr[] = { "dir", NULL} ; 792 793 #define DECL (const char**) 794 795 static const htmlElemDesc 796 html40ElementTable[] = { 797 { "a", 0, 0, 0, 0, 0, 0, 1, "anchor ", 798 DECL html_inline , NULL , DECL a_attrs , DECL target_attr, NULL 799 }, 800 { "abbr", 0, 0, 0, 0, 0, 0, 1, "abbreviated form", 801 DECL html_inline , NULL , DECL html_attrs, NULL, NULL 802 }, 803 { "acronym", 0, 0, 0, 0, 0, 0, 1, "", 804 DECL html_inline , NULL , DECL html_attrs, NULL, NULL 805 }, 806 { "address", 0, 0, 0, 0, 0, 0, 0, "information on author ", 807 DECL inline_p , NULL , DECL html_attrs, NULL, NULL 808 }, 809 { "applet", 0, 0, 0, 0, 1, 1, 2, "java applet ", 810 DECL flow_param , NULL , NULL , DECL applet_attrs, NULL 811 }, 812 { "area", 0, 2, 2, 1, 0, 0, 0, "client-side image map area ", 813 EMPTY , NULL , DECL area_attrs , DECL target_attr, DECL alt_attr 814 }, 815 { "b", 0, 3, 0, 0, 0, 0, 1, "bold text style", 816 DECL html_inline , NULL , DECL html_attrs, NULL, NULL 817 }, 818 { "base", 0, 2, 2, 1, 0, 0, 0, "document base uri ", 819 EMPTY , NULL , NULL , DECL target_attr, DECL href_attrs 820 }, 821 { "basefont", 0, 2, 2, 1, 1, 1, 1, "base font size " , 822 EMPTY , NULL , NULL, DECL basefont_attrs, NULL 823 }, 824 { "bdo", 0, 0, 0, 0, 0, 0, 1, "i18n bidi over-ride ", 825 DECL html_inline , NULL , DECL core_i18n_attrs, NULL, DECL dir_attr 826 }, 827 { "big", 0, 3, 0, 0, 0, 0, 1, "large text style", 828 DECL html_inline , NULL , DECL html_attrs, NULL, NULL 829 }, 830 { "blockquote", 0, 0, 0, 0, 0, 0, 0, "long quotation ", 831 DECL html_flow , NULL , DECL quote_attrs , NULL, NULL 832 }, 833 { "body", 1, 1, 0, 0, 0, 0, 0, "document body ", 834 DECL body_contents , "div" , DECL body_attrs, DECL body_depr, NULL 835 }, 836 { "br", 0, 2, 2, 1, 0, 0, 1, "forced line break ", 837 EMPTY , NULL , DECL core_attrs, DECL clear_attrs , NULL 838 }, 839 { "button", 0, 0, 0, 0, 0, 0, 2, "push button ", 840 DECL html_flow MODIFIER , NULL , DECL button_attrs, NULL, 
NULL 841 }, 842 { "caption", 0, 0, 0, 0, 0, 0, 0, "table caption ", 843 DECL html_inline , NULL , DECL html_attrs, NULL, NULL 844 }, 845 { "center", 0, 3, 0, 0, 1, 1, 0, "shorthand for div align=center ", 846 DECL html_flow , NULL , NULL, DECL html_attrs, NULL 847 }, 848 { "cite", 0, 0, 0, 0, 0, 0, 1, "citation", 849 DECL html_inline , NULL , DECL html_attrs, NULL, NULL 850 }, 851 { "code", 0, 0, 0, 0, 0, 0, 1, "computer code fragment", 852 DECL html_inline , NULL , DECL html_attrs, NULL, NULL 853 }, 854 { "col", 0, 2, 2, 1, 0, 0, 0, "table column ", 855 EMPTY , NULL , DECL col_attrs , NULL, NULL 856 }, 857 { "colgroup", 0, 1, 0, 0, 0, 0, 0, "table column group ", 858 DECL col_elt , "col" , DECL col_attrs , NULL, NULL 859 }, 860 { "dd", 0, 1, 0, 0, 0, 0, 0, "definition description ", 861 DECL html_flow , NULL , DECL html_attrs, NULL, NULL 862 }, 863 { "del", 0, 0, 0, 0, 0, 0, 2, "deleted text ", 864 DECL html_flow , NULL , DECL edit_attrs , NULL, NULL 865 }, 866 { "dfn", 0, 0, 0, 0, 0, 0, 1, "instance definition", 867 DECL html_inline , NULL , DECL html_attrs, NULL, NULL 868 }, 869 { "dir", 0, 0, 0, 0, 1, 1, 0, "directory list", 870 DECL blockli_elt, "li" , NULL, DECL compact_attrs, NULL 871 }, 872 { "div", 0, 0, 0, 0, 0, 0, 0, "generic language/style container", 873 DECL html_flow, NULL, DECL html_attrs, DECL align_attr, NULL 874 }, 875 { "dl", 0, 0, 0, 0, 0, 0, 0, "definition list ", 876 DECL dl_contents , "dd" , DECL html_attrs, DECL compact_attr, NULL 877 }, 878 { "dt", 0, 1, 0, 0, 0, 0, 0, "definition term ", 879 DECL html_inline, NULL, DECL html_attrs, NULL, NULL 880 }, 881 { "em", 0, 3, 0, 0, 0, 0, 1, "emphasis", 882 DECL html_inline, NULL, DECL html_attrs, NULL, NULL 883 }, 884 { "embed", 0, 1, 0, 0, 1, 1, 1, "generic embedded object ", 885 EMPTY, NULL, DECL embed_attrs, NULL, NULL 886 }, 887 { "fieldset", 0, 0, 0, 0, 0, 0, 0, "form control group ", 888 DECL fieldset_contents , NULL, DECL html_attrs, NULL, NULL 889 }, 890 { "font", 0, 3, 0, 0, 1, 1, 1, 
"local change to font ", 891 DECL html_inline, NULL, NULL, DECL font_attrs, NULL 892 }, 893 { "form", 0, 0, 0, 0, 0, 0, 0, "interactive form ", 894 DECL form_contents, "fieldset", DECL form_attrs , DECL target_attr, DECL action_attr 895 }, 896 { "frame", 0, 2, 2, 1, 0, 2, 0, "subwindow " , 897 EMPTY, NULL, NULL, DECL frame_attrs, NULL 898 }, 899 { "frameset", 0, 0, 0, 0, 0, 2, 0, "window subdivision" , 900 DECL frameset_contents, "noframes" , NULL , DECL frameset_attrs, NULL 901 }, 902 { "h1", 0, 0, 0, 0, 0, 0, 0, "heading ", 903 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL 904 }, 905 { "h2", 0, 0, 0, 0, 0, 0, 0, "heading ", 906 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL 907 }, 908 { "h3", 0, 0, 0, 0, 0, 0, 0, "heading ", 909 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL 910 }, 911 { "h4", 0, 0, 0, 0, 0, 0, 0, "heading ", 912 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL 913 }, 914 { "h5", 0, 0, 0, 0, 0, 0, 0, "heading ", 915 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL 916 }, 917 { "h6", 0, 0, 0, 0, 0, 0, 0, "heading ", 918 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL 919 }, 920 { "head", 1, 1, 0, 0, 0, 0, 0, "document head ", 921 DECL head_contents, NULL, DECL head_attrs, NULL, NULL 922 }, 923 { "hr", 0, 2, 2, 1, 0, 0, 0, "horizontal rule " , 924 EMPTY, NULL, DECL html_attrs, DECL hr_depr, NULL 925 }, 926 { "html", 1, 1, 0, 0, 0, 0, 0, "document root element ", 927 DECL html_content , NULL , DECL i18n_attrs, DECL version_attr, NULL 928 }, 929 { "i", 0, 3, 0, 0, 0, 0, 1, "italic text style", 930 DECL html_inline, NULL, DECL html_attrs, NULL, NULL 931 }, 932 { "iframe", 0, 0, 0, 0, 0, 1, 2, "inline subwindow ", 933 DECL html_flow, NULL, NULL, DECL iframe_attrs, NULL 934 }, 935 { "img", 0, 2, 2, 1, 0, 0, 1, "embedded image ", 936 EMPTY, NULL, DECL img_attrs, DECL align_attr, DECL src_alt_attrs 937 }, 938 { "input", 0, 2, 2, 1, 0, 0, 1, "form control ", 939 
EMPTY, NULL, DECL input_attrs , DECL align_attr, NULL 940 }, 941 { "ins", 0, 0, 0, 0, 0, 0, 2, "inserted text", 942 DECL html_flow, NULL, DECL edit_attrs, NULL, NULL 943 }, 944 { "isindex", 0, 2, 2, 1, 1, 1, 0, "single line prompt ", 945 EMPTY, NULL, NULL, DECL prompt_attrs, NULL 946 }, 947 { "kbd", 0, 0, 0, 0, 0, 0, 1, "text to be entered by the user", 948 DECL html_inline, NULL, DECL html_attrs, NULL, NULL 949 }, 950 { "label", 0, 0, 0, 0, 0, 0, 1, "form field label text ", 951 DECL html_inline MODIFIER, NULL, DECL label_attrs , NULL, NULL 952 }, 953 { "legend", 0, 0, 0, 0, 0, 0, 0, "fieldset legend ", 954 DECL html_inline, NULL, DECL legend_attrs , DECL align_attr, NULL 955 }, 956 { "li", 0, 1, 1, 0, 0, 0, 0, "list item ", 957 DECL html_flow, NULL, DECL html_attrs, NULL, NULL 958 }, 959 { "link", 0, 2, 2, 1, 0, 0, 0, "a media-independent link ", 960 EMPTY, NULL, DECL link_attrs, DECL target_attr, NULL 961 }, 962 { "map", 0, 0, 0, 0, 0, 0, 2, "client-side image map ", 963 DECL map_contents , NULL, DECL html_attrs , NULL, DECL name_attr 964 }, 965 { "menu", 0, 0, 0, 0, 1, 1, 0, "menu list ", 966 DECL blockli_elt , NULL, NULL, DECL compact_attrs, NULL 967 }, 968 { "meta", 0, 2, 2, 1, 0, 0, 0, "generic metainformation ", 969 EMPTY, NULL, DECL meta_attrs , NULL , DECL content_attr 970 }, 971 { "noframes", 0, 0, 0, 0, 0, 2, 0, "alternate content container for non frame-based rendering ", 972 DECL noframes_content, "body" , DECL html_attrs, NULL, NULL 973 }, 974 { "noscript", 0, 0, 0, 0, 0, 0, 0, "alternate content container for non script-based rendering ", 975 DECL html_flow, "div", DECL html_attrs, NULL, NULL 976 }, 977 { "object", 0, 0, 0, 0, 0, 0, 2, "generic embedded object ", 978 DECL object_contents , "div" , DECL object_attrs, DECL object_depr, NULL 979 }, 980 { "ol", 0, 0, 0, 0, 0, 0, 0, "ordered list ", 981 DECL li_elt , "li" , DECL html_attrs, DECL ol_attrs, NULL 982 }, 983 { "optgroup", 0, 0, 0, 0, 0, 0, 0, "option group ", 984 DECL option_elt , "option", 
DECL optgroup_attrs, NULL, DECL label_attr 985 }, 986 { "option", 0, 1, 0, 0, 0, 0, 0, "selectable choice " , 987 DECL html_pcdata, NULL, DECL option_attrs, NULL, NULL 988 }, 989 { "p", 0, 1, 0, 0, 0, 0, 0, "paragraph ", 990 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL 991 }, 992 { "param", 0, 2, 2, 1, 0, 0, 0, "named property value ", 993 EMPTY, NULL, DECL param_attrs, NULL, DECL name_attr 994 }, 995 { "pre", 0, 0, 0, 0, 0, 0, 0, "preformatted text ", 996 DECL pre_content, NULL, DECL html_attrs, DECL width_attr, NULL 997 }, 998 { "q", 0, 0, 0, 0, 0, 0, 1, "short inline quotation ", 999 DECL html_inline, NULL, DECL quote_attrs, NULL, NULL 1000 }, 1001 { "s", 0, 3, 0, 0, 1, 1, 1, "strike-through text style", 1002 DECL html_inline, NULL, NULL, DECL html_attrs, NULL 1003 }, 1004 { "samp", 0, 0, 0, 0, 0, 0, 1, "sample program output, scripts, etc.", 1005 DECL html_inline, NULL, DECL html_attrs, NULL, NULL 1006 }, 1007 { "script", 0, 0, 0, 0, 0, 0, 2, "script statements ", 1008 DECL html_cdata, NULL, DECL script_attrs, DECL language_attr, DECL type_attr 1009 }, 1010 { "select", 0, 0, 0, 0, 0, 0, 1, "option selector ", 1011 DECL select_content, NULL, DECL select_attrs, NULL, NULL 1012 }, 1013 { "small", 0, 3, 0, 0, 0, 0, 1, "small text style", 1014 DECL html_inline, NULL, DECL html_attrs, NULL, NULL 1015 }, 1016 { "span", 0, 0, 0, 0, 0, 0, 1, "generic language/style container ", 1017 DECL html_inline, NULL, DECL html_attrs, NULL, NULL 1018 }, 1019 { "strike", 0, 3, 0, 0, 1, 1, 1, "strike-through text", 1020 DECL html_inline, NULL, NULL, DECL html_attrs, NULL 1021 }, 1022 { "strong", 0, 3, 0, 0, 0, 0, 1, "strong emphasis", 1023 DECL html_inline, NULL, DECL html_attrs, NULL, NULL 1024 }, 1025 { "style", 0, 0, 0, 0, 0, 0, 0, "style info ", 1026 DECL html_cdata, NULL, DECL style_attrs, NULL, DECL type_attr 1027 }, 1028 { "sub", 0, 3, 0, 0, 0, 0, 1, "subscript", 1029 DECL html_inline, NULL, DECL html_attrs, NULL, NULL 1030 }, 1031 { "sup", 0, 3, 0, 0, 0, 0, 
1, "superscript ", 1032 DECL html_inline, NULL, DECL html_attrs, NULL, NULL 1033 }, 1034 { "table", 0, 0, 0, 0, 0, 0, 0, "", 1035 DECL table_contents , "tr" , DECL table_attrs , DECL table_depr, NULL 1036 }, 1037 { "tbody", 1, 0, 0, 0, 0, 0, 0, "table body ", 1038 DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL 1039 }, 1040 { "td", 0, 0, 0, 0, 0, 0, 0, "table data cell", 1041 DECL html_flow, NULL, DECL th_td_attr, DECL th_td_depr, NULL 1042 }, 1043 { "textarea", 0, 0, 0, 0, 0, 0, 1, "multi-line text field ", 1044 DECL html_pcdata, NULL, DECL textarea_attrs, NULL, DECL rows_cols_attr 1045 }, 1046 { "tfoot", 0, 1, 0, 0, 0, 0, 0, "table footer ", 1047 DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL 1048 }, 1049 { "th", 0, 1, 0, 0, 0, 0, 0, "table header cell", 1050 DECL html_flow, NULL, DECL th_td_attr, DECL th_td_depr, NULL 1051 }, 1052 { "thead", 0, 1, 0, 0, 0, 0, 0, "table header ", 1053 DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL 1054 }, 1055 { "title", 0, 0, 0, 0, 0, 0, 0, "document title ", 1056 DECL html_pcdata, NULL, DECL i18n_attrs, NULL, NULL 1057 }, 1058 { "tr", 0, 0, 0, 0, 0, 0, 0, "table row ", 1059 DECL tr_contents , "td" , DECL talign_attrs, DECL bgcolor_attr, NULL 1060 }, 1061 { "tt", 0, 3, 0, 0, 0, 0, 1, "teletype or monospaced text style", 1062 DECL html_inline, NULL, DECL html_attrs, NULL, NULL 1063 }, 1064 { "u", 0, 3, 0, 0, 1, 1, 1, "underlined text style", 1065 DECL html_inline, NULL, NULL, DECL html_attrs, NULL 1066 }, 1067 { "ul", 0, 0, 0, 0, 0, 0, 0, "unordered list ", 1068 DECL li_elt , "li" , DECL html_attrs, DECL ul_depr, NULL 1069 }, 1070 { "var", 0, 0, 0, 0, 0, 0, 1, "instance of a variable or program argument", 1071 DECL html_inline, NULL, DECL html_attrs, NULL, NULL 1072 } 1073 }; 1074 1075 typedef struct { 1076 const char *oldTag; 1077 const char *newTag; 1078 } htmlStartCloseEntry; 1079 1080 /* 1081 * start tags that imply the end of current element 1082 */ 1083 static const htmlStartCloseEntry htmlStartClose[] = { 1084 { 
"a", "a" }, 1085 { "a", "fieldset" }, 1086 { "a", "table" }, 1087 { "a", "td" }, 1088 { "a", "th" }, 1089 { "address", "dd" }, 1090 { "address", "dl" }, 1091 { "address", "dt" }, 1092 { "address", "form" }, 1093 { "address", "li" }, 1094 { "address", "ul" }, 1095 { "b", "center" }, 1096 { "b", "p" }, 1097 { "b", "td" }, 1098 { "b", "th" }, 1099 { "big", "p" }, 1100 { "caption", "col" }, 1101 { "caption", "colgroup" }, 1102 { "caption", "tbody" }, 1103 { "caption", "tfoot" }, 1104 { "caption", "thead" }, 1105 { "caption", "tr" }, 1106 { "col", "col" }, 1107 { "col", "colgroup" }, 1108 { "col", "tbody" }, 1109 { "col", "tfoot" }, 1110 { "col", "thead" }, 1111 { "col", "tr" }, 1112 { "colgroup", "colgroup" }, 1113 { "colgroup", "tbody" }, 1114 { "colgroup", "tfoot" }, 1115 { "colgroup", "thead" }, 1116 { "colgroup", "tr" }, 1117 { "dd", "dt" }, 1118 { "dir", "dd" }, 1119 { "dir", "dl" }, 1120 { "dir", "dt" }, 1121 { "dir", "form" }, 1122 { "dir", "ul" }, 1123 { "dl", "form" }, 1124 { "dl", "li" }, 1125 { "dt", "dd" }, 1126 { "dt", "dl" }, 1127 { "font", "center" }, 1128 { "font", "td" }, 1129 { "font", "th" }, 1130 { "form", "form" }, 1131 { "h1", "fieldset" }, 1132 { "h1", "form" }, 1133 { "h1", "li" }, 1134 { "h1", "p" }, 1135 { "h1", "table" }, 1136 { "h2", "fieldset" }, 1137 { "h2", "form" }, 1138 { "h2", "li" }, 1139 { "h2", "p" }, 1140 { "h2", "table" }, 1141 { "h3", "fieldset" }, 1142 { "h3", "form" }, 1143 { "h3", "li" }, 1144 { "h3", "p" }, 1145 { "h3", "table" }, 1146 { "h4", "fieldset" }, 1147 { "h4", "form" }, 1148 { "h4", "li" }, 1149 { "h4", "p" }, 1150 { "h4", "table" }, 1151 { "h5", "fieldset" }, 1152 { "h5", "form" }, 1153 { "h5", "li" }, 1154 { "h5", "p" }, 1155 { "h5", "table" }, 1156 { "h6", "fieldset" }, 1157 { "h6", "form" }, 1158 { "h6", "li" }, 1159 { "h6", "p" }, 1160 { "h6", "table" }, 1161 { "head", "a" }, 1162 { "head", "abbr" }, 1163 { "head", "acronym" }, 1164 { "head", "address" }, 1165 { "head", "b" }, 1166 { "head", "bdo" }, 1167 { 
"head", "big" }, 1168 { "head", "blockquote" }, 1169 { "head", "body" }, 1170 { "head", "br" }, 1171 { "head", "center" }, 1172 { "head", "cite" }, 1173 { "head", "code" }, 1174 { "head", "dd" }, 1175 { "head", "dfn" }, 1176 { "head", "dir" }, 1177 { "head", "div" }, 1178 { "head", "dl" }, 1179 { "head", "dt" }, 1180 { "head", "em" }, 1181 { "head", "fieldset" }, 1182 { "head", "font" }, 1183 { "head", "form" }, 1184 { "head", "frameset" }, 1185 { "head", "h1" }, 1186 { "head", "h2" }, 1187 { "head", "h3" }, 1188 { "head", "h4" }, 1189 { "head", "h5" }, 1190 { "head", "h6" }, 1191 { "head", "hr" }, 1192 { "head", "i" }, 1193 { "head", "iframe" }, 1194 { "head", "img" }, 1195 { "head", "kbd" }, 1196 { "head", "li" }, 1197 { "head", "listing" }, 1198 { "head", "map" }, 1199 { "head", "menu" }, 1200 { "head", "ol" }, 1201 { "head", "p" }, 1202 { "head", "pre" }, 1203 { "head", "q" }, 1204 { "head", "s" }, 1205 { "head", "samp" }, 1206 { "head", "small" }, 1207 { "head", "span" }, 1208 { "head", "strike" }, 1209 { "head", "strong" }, 1210 { "head", "sub" }, 1211 { "head", "sup" }, 1212 { "head", "table" }, 1213 { "head", "tt" }, 1214 { "head", "u" }, 1215 { "head", "ul" }, 1216 { "head", "var" }, 1217 { "head", "xmp" }, 1218 { "hr", "form" }, 1219 { "i", "center" }, 1220 { "i", "p" }, 1221 { "i", "td" }, 1222 { "i", "th" }, 1223 { "legend", "fieldset" }, 1224 { "li", "li" }, 1225 { "link", "body" }, 1226 { "link", "frameset" }, 1227 { "listing", "dd" }, 1228 { "listing", "dl" }, 1229 { "listing", "dt" }, 1230 { "listing", "fieldset" }, 1231 { "listing", "form" }, 1232 { "listing", "li" }, 1233 { "listing", "table" }, 1234 { "listing", "ul" }, 1235 { "menu", "dd" }, 1236 { "menu", "dl" }, 1237 { "menu", "dt" }, 1238 { "menu", "form" }, 1239 { "menu", "ul" }, 1240 { "ol", "form" }, 1241 { "ol", "ul" }, 1242 { "option", "optgroup" }, 1243 { "option", "option" }, 1244 { "p", "address" }, 1245 { "p", "blockquote" }, 1246 { "p", "body" }, 1247 { "p", "caption" }, 1248 { "p", 
"center" }, 1249 { "p", "col" }, 1250 { "p", "colgroup" }, 1251 { "p", "dd" }, 1252 { "p", "dir" }, 1253 { "p", "div" }, 1254 { "p", "dl" }, 1255 { "p", "dt" }, 1256 { "p", "fieldset" }, 1257 { "p", "form" }, 1258 { "p", "frameset" }, 1259 { "p", "h1" }, 1260 { "p", "h2" }, 1261 { "p", "h3" }, 1262 { "p", "h4" }, 1263 { "p", "h5" }, 1264 { "p", "h6" }, 1265 { "p", "head" }, 1266 { "p", "hr" }, 1267 { "p", "li" }, 1268 { "p", "listing" }, 1269 { "p", "menu" }, 1270 { "p", "ol" }, 1271 { "p", "p" }, 1272 { "p", "pre" }, 1273 { "p", "table" }, 1274 { "p", "tbody" }, 1275 { "p", "td" }, 1276 { "p", "tfoot" }, 1277 { "p", "th" }, 1278 { "p", "title" }, 1279 { "p", "tr" }, 1280 { "p", "ul" }, 1281 { "p", "xmp" }, 1282 { "pre", "dd" }, 1283 { "pre", "dl" }, 1284 { "pre", "dt" }, 1285 { "pre", "fieldset" }, 1286 { "pre", "form" }, 1287 { "pre", "li" }, 1288 { "pre", "table" }, 1289 { "pre", "ul" }, 1290 { "s", "p" }, 1291 { "script", "noscript" }, 1292 { "small", "p" }, 1293 { "span", "td" }, 1294 { "span", "th" }, 1295 { "strike", "p" }, 1296 { "style", "body" }, 1297 { "style", "frameset" }, 1298 { "tbody", "tbody" }, 1299 { "tbody", "tfoot" }, 1300 { "td", "tbody" }, 1301 { "td", "td" }, 1302 { "td", "tfoot" }, 1303 { "td", "th" }, 1304 { "td", "tr" }, 1305 { "tfoot", "tbody" }, 1306 { "th", "tbody" }, 1307 { "th", "td" }, 1308 { "th", "tfoot" }, 1309 { "th", "th" }, 1310 { "th", "tr" }, 1311 { "thead", "tbody" }, 1312 { "thead", "tfoot" }, 1313 { "title", "body" }, 1314 { "title", "frameset" }, 1315 { "tr", "tbody" }, 1316 { "tr", "tfoot" }, 1317 { "tr", "tr" }, 1318 { "tt", "p" }, 1319 { "u", "p" }, 1320 { "u", "td" }, 1321 { "u", "th" }, 1322 { "ul", "address" }, 1323 { "ul", "form" }, 1324 { "ul", "menu" }, 1325 { "ul", "ol" }, 1326 { "ul", "pre" }, 1327 { "xmp", "dd" }, 1328 { "xmp", "dl" }, 1329 { "xmp", "dt" }, 1330 { "xmp", "fieldset" }, 1331 { "xmp", "form" }, 1332 { "xmp", "li" }, 1333 { "xmp", "table" }, 1334 { "xmp", "ul" } 1335 }; 1336 1337 /* 1338 * The 
list of HTML elements which are supposed not to have 1339 * CDATA content and where a p element will be implied 1340 * 1341 * TODO: extend that list by reading the HTML SGML DTD on 1342 * implied paragraph 1343 */ 1344 static const char *const htmlNoContentElements[] = { 1345 "html", 1346 "head", 1347 NULL 1348 }; 1349 1350 /* 1351 * The list of HTML attributes which are of content %Script; 1352 * NOTE: when adding ones, check htmlIsScriptAttribute() since 1353 * it assumes the name starts with 'on' 1354 */ 1355 static const char *const htmlScriptAttributes[] = { 1356 "onclick", 1357 "ondblclick", 1358 "onmousedown", 1359 "onmouseup", 1360 "onmouseover", 1361 "onmousemove", 1362 "onmouseout", 1363 "onkeypress", 1364 "onkeydown", 1365 "onkeyup", 1366 "onload", 1367 "onunload", 1368 "onfocus", 1369 "onblur", 1370 "onsubmit", 1371 "onreset", 1372 "onchange", 1373 "onselect" 1374 }; 1375 1376 /* 1377 * This table is used by the htmlparser to know what to do with 1378 * broken html pages. By assigning different priorities to different 1379 * elements the parser can decide how to handle extra endtags. 1380 * Endtags are only allowed to close elements with lower or equal 1381 * priority. 1382 */ 1383 1384 typedef struct { 1385 const char *name; 1386 int priority; 1387 } elementPriority; 1388 1389 static const elementPriority htmlEndPriority[] = { 1390 {"div", 150}, 1391 {"td", 160}, 1392 {"th", 160}, 1393 {"tr", 170}, 1394 {"thead", 180}, 1395 {"tbody", 180}, 1396 {"tfoot", 180}, 1397 {"table", 190}, 1398 {"head", 200}, 1399 {"body", 200}, 1400 {"html", 220}, 1401 {NULL, 100} /* Default priority */ 1402 }; 1403 1404 /************************************************************************ 1405 * * 1406 * functions to handle HTML specific data * 1407 * * 1408 ************************************************************************/ 1409 1410 /** 1411 * htmlInitAutoClose: 1412 * 1413 * This is a no-op now. 
1414 */ 1415 void 1416 htmlInitAutoClose(void) { 1417 } 1418 1419 static int 1420 htmlCompareTags(const void *key, const void *member) { 1421 const xmlChar *tag = (const xmlChar *) key; 1422 const htmlElemDesc *desc = (const htmlElemDesc *) member; 1423 1424 return(xmlStrcasecmp(tag, BAD_CAST desc->name)); 1425 } 1426 1427 /** 1428 * htmlTagLookup: 1429 * @tag: The tag name in lowercase 1430 * 1431 * Lookup the HTML tag in the ElementTable 1432 * 1433 * Returns the related htmlElemDescPtr or NULL if not found. 1434 */ 1435 const htmlElemDesc * 1436 htmlTagLookup(const xmlChar *tag) { 1437 if (tag == NULL) 1438 return(NULL); 1439 1440 return((const htmlElemDesc *) bsearch(tag, html40ElementTable, 1441 sizeof(html40ElementTable) / sizeof(htmlElemDesc), 1442 sizeof(htmlElemDesc), htmlCompareTags)); 1443 } 1444 1445 /** 1446 * htmlGetEndPriority: 1447 * @name: The name of the element to look up the priority for. 1448 * 1449 * Return value: The "endtag" priority. 1450 **/ 1451 static int 1452 htmlGetEndPriority (const xmlChar *name) { 1453 int i = 0; 1454 1455 while ((htmlEndPriority[i].name != NULL) && 1456 (!xmlStrEqual((const xmlChar *)htmlEndPriority[i].name, name))) 1457 i++; 1458 1459 return(htmlEndPriority[i].priority); 1460 } 1461 1462 1463 static int 1464 htmlCompareStartClose(const void *vkey, const void *member) { 1465 const htmlStartCloseEntry *key = (const htmlStartCloseEntry *) vkey; 1466 const htmlStartCloseEntry *entry = (const htmlStartCloseEntry *) member; 1467 int ret; 1468 1469 ret = strcmp(key->oldTag, entry->oldTag); 1470 if (ret == 0) 1471 ret = strcmp(key->newTag, entry->newTag); 1472 1473 return(ret); 1474 } 1475 1476 /** 1477 * htmlCheckAutoClose: 1478 * @newtag: The new tag name 1479 * @oldtag: The old tag name 1480 * 1481 * Checks whether the new tag is one of the registered valid tags for 1482 * closing old. 1483 * 1484 * Returns 0 if no, 1 if yes. 
1485 */ 1486 static int 1487 htmlCheckAutoClose(const xmlChar * newtag, const xmlChar * oldtag) 1488 { 1489 htmlStartCloseEntry key; 1490 void *res; 1491 1492 key.oldTag = (const char *) oldtag; 1493 key.newTag = (const char *) newtag; 1494 res = bsearch(&key, htmlStartClose, 1495 sizeof(htmlStartClose) / sizeof(htmlStartCloseEntry), 1496 sizeof(htmlStartCloseEntry), htmlCompareStartClose); 1497 return(res != NULL); 1498 } 1499 1500 /** 1501 * htmlAutoCloseOnClose: 1502 * @ctxt: an HTML parser context 1503 * @newtag: The new tag name 1504 * @force: force the tag closure 1505 * 1506 * The HTML DTD allows an ending tag to implicitly close other tags. 1507 */ 1508 static void 1509 htmlAutoCloseOnClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag) 1510 { 1511 const htmlElemDesc *info; 1512 int i, priority; 1513 1514 priority = htmlGetEndPriority(newtag); 1515 1516 for (i = (ctxt->nameNr - 1); i >= 0; i--) { 1517 1518 if (xmlStrEqual(newtag, ctxt->nameTab[i])) 1519 break; 1520 /* 1521 * A misplaced endtag can only close elements with lower 1522 * or equal priority, so if we find an element with higher 1523 * priority before we find an element with 1524 * matching name, we just ignore this endtag 1525 */ 1526 if (htmlGetEndPriority(ctxt->nameTab[i]) > priority) 1527 return; 1528 } 1529 if (i < 0) 1530 return; 1531 1532 while (!xmlStrEqual(newtag, ctxt->name)) { 1533 info = htmlTagLookup(ctxt->name); 1534 if ((info != NULL) && (info->endTag == 3)) { 1535 htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH, 1536 "Opening and ending tag mismatch: %s and %s\n", 1537 newtag, ctxt->name); 1538 } 1539 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL)) 1540 ctxt->sax->endElement(ctxt->userData, ctxt->name); 1541 htmlnamePop(ctxt); 1542 } 1543 } 1544 1545 /** 1546 * htmlAutoCloseOnEnd: 1547 * @ctxt: an HTML parser context 1548 * 1549 * Close all remaining tags at the end of the stream 1550 */ 1551 static void 1552 htmlAutoCloseOnEnd(htmlParserCtxtPtr ctxt) 1553 { 1554 int i; 
1555 1556 if (ctxt->nameNr == 0) 1557 return; 1558 for (i = (ctxt->nameNr - 1); i >= 0; i--) { 1559 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL)) 1560 ctxt->sax->endElement(ctxt->userData, ctxt->name); 1561 htmlnamePop(ctxt); 1562 } 1563 } 1564 1565 /** 1566 * htmlAutoClose: 1567 * @ctxt: an HTML parser context 1568 * @newtag: The new tag name or NULL 1569 * 1570 * The HTML DTD allows a tag to implicitly close other tags. 1571 * The list is kept in htmlStartClose array. This function is 1572 * called when a new tag has been detected and generates the 1573 * appropriates closes if possible/needed. 1574 * If newtag is NULL this mean we are at the end of the resource 1575 * and we should check 1576 */ 1577 static void 1578 htmlAutoClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag) 1579 { 1580 while ((newtag != NULL) && (ctxt->name != NULL) && 1581 (htmlCheckAutoClose(newtag, ctxt->name))) { 1582 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL)) 1583 ctxt->sax->endElement(ctxt->userData, ctxt->name); 1584 htmlnamePop(ctxt); 1585 } 1586 if (newtag == NULL) { 1587 htmlAutoCloseOnEnd(ctxt); 1588 return; 1589 } 1590 while ((newtag == NULL) && (ctxt->name != NULL) && 1591 ((xmlStrEqual(ctxt->name, BAD_CAST "head")) || 1592 (xmlStrEqual(ctxt->name, BAD_CAST "body")) || 1593 (xmlStrEqual(ctxt->name, BAD_CAST "html")))) { 1594 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL)) 1595 ctxt->sax->endElement(ctxt->userData, ctxt->name); 1596 htmlnamePop(ctxt); 1597 } 1598 } 1599 1600 /** 1601 * htmlAutoCloseTag: 1602 * @doc: the HTML document 1603 * @name: The tag name 1604 * @elem: the HTML element 1605 * 1606 * The HTML DTD allows a tag to implicitly close other tags. 1607 * The list is kept in htmlStartClose array. This function checks 1608 * if the element or one of it's children would autoclose the 1609 * given tag. 
1610 * 1611 * Returns 1 if autoclose, 0 otherwise 1612 */ 1613 int 1614 htmlAutoCloseTag(htmlDocPtr doc, const xmlChar *name, htmlNodePtr elem) { 1615 htmlNodePtr child; 1616 1617 if (elem == NULL) return(1); 1618 if (xmlStrEqual(name, elem->name)) return(0); 1619 if (htmlCheckAutoClose(elem->name, name)) return(1); 1620 child = elem->children; 1621 while (child != NULL) { 1622 if (htmlAutoCloseTag(doc, name, child)) return(1); 1623 child = child->next; 1624 } 1625 return(0); 1626 } 1627 1628 /** 1629 * htmlIsAutoClosed: 1630 * @doc: the HTML document 1631 * @elem: the HTML element 1632 * 1633 * The HTML DTD allows a tag to implicitly close other tags. 1634 * The list is kept in htmlStartClose array. This function checks 1635 * if a tag is autoclosed by one of it's child 1636 * 1637 * Returns 1 if autoclosed, 0 otherwise 1638 */ 1639 int 1640 htmlIsAutoClosed(htmlDocPtr doc, htmlNodePtr elem) { 1641 htmlNodePtr child; 1642 1643 if (elem == NULL) return(1); 1644 child = elem->children; 1645 while (child != NULL) { 1646 if (htmlAutoCloseTag(doc, elem->name, child)) return(1); 1647 child = child->next; 1648 } 1649 return(0); 1650 } 1651 1652 /** 1653 * htmlCheckImplied: 1654 * @ctxt: an HTML parser context 1655 * @newtag: The new tag name 1656 * 1657 * The HTML DTD allows a tag to exists only implicitly 1658 * called when a new tag has been detected and generates the 1659 * appropriates implicit tags if missing 1660 */ 1661 static void 1662 htmlCheckImplied(htmlParserCtxtPtr ctxt, const xmlChar *newtag) { 1663 int i; 1664 1665 if (ctxt->options & HTML_PARSE_NOIMPLIED) 1666 return; 1667 if (!htmlOmittedDefaultValue) 1668 return; 1669 if (xmlStrEqual(newtag, BAD_CAST"html")) 1670 return; 1671 if (ctxt->nameNr <= 0) { 1672 htmlnamePush(ctxt, BAD_CAST"html"); 1673 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL)) 1674 ctxt->sax->startElement(ctxt->userData, BAD_CAST"html", NULL); 1675 } 1676 if ((xmlStrEqual(newtag, BAD_CAST"body")) || (xmlStrEqual(newtag, 
BAD_CAST"head"))) 1677 return; 1678 if ((ctxt->nameNr <= 1) && 1679 ((xmlStrEqual(newtag, BAD_CAST"script")) || 1680 (xmlStrEqual(newtag, BAD_CAST"style")) || 1681 (xmlStrEqual(newtag, BAD_CAST"meta")) || 1682 (xmlStrEqual(newtag, BAD_CAST"link")) || 1683 (xmlStrEqual(newtag, BAD_CAST"title")) || 1684 (xmlStrEqual(newtag, BAD_CAST"base")))) { 1685 if (ctxt->html >= 3) { 1686 /* we already saw or generated an <head> before */ 1687 return; 1688 } 1689 /* 1690 * dropped OBJECT ... i you put it first BODY will be 1691 * assumed ! 1692 */ 1693 htmlnamePush(ctxt, BAD_CAST"head"); 1694 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL)) 1695 ctxt->sax->startElement(ctxt->userData, BAD_CAST"head", NULL); 1696 } else if ((!xmlStrEqual(newtag, BAD_CAST"noframes")) && 1697 (!xmlStrEqual(newtag, BAD_CAST"frame")) && 1698 (!xmlStrEqual(newtag, BAD_CAST"frameset"))) { 1699 if (ctxt->html >= 10) { 1700 /* we already saw or generated a <body> before */ 1701 return; 1702 } 1703 for (i = 0;i < ctxt->nameNr;i++) { 1704 if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"body")) { 1705 return; 1706 } 1707 if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"head")) { 1708 return; 1709 } 1710 } 1711 1712 htmlnamePush(ctxt, BAD_CAST"body"); 1713 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL)) 1714 ctxt->sax->startElement(ctxt->userData, BAD_CAST"body", NULL); 1715 } 1716 } 1717 1718 /** 1719 * htmlCheckParagraph 1720 * @ctxt: an HTML parser context 1721 * 1722 * Check whether a p element need to be implied before inserting 1723 * characters in the current element. 1724 * 1725 * Returns 1 if a paragraph has been inserted, 0 if not and -1 1726 * in case of error. 
1727 */ 1728 1729 static int 1730 htmlCheckParagraph(htmlParserCtxtPtr ctxt) { 1731 const xmlChar *tag; 1732 int i; 1733 1734 if (ctxt == NULL) 1735 return(-1); 1736 tag = ctxt->name; 1737 if (tag == NULL) { 1738 htmlAutoClose(ctxt, BAD_CAST"p"); 1739 htmlCheckImplied(ctxt, BAD_CAST"p"); 1740 htmlnamePush(ctxt, BAD_CAST"p"); 1741 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL)) 1742 ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL); 1743 return(1); 1744 } 1745 if (!htmlOmittedDefaultValue) 1746 return(0); 1747 for (i = 0; htmlNoContentElements[i] != NULL; i++) { 1748 if (xmlStrEqual(tag, BAD_CAST htmlNoContentElements[i])) { 1749 htmlAutoClose(ctxt, BAD_CAST"p"); 1750 htmlCheckImplied(ctxt, BAD_CAST"p"); 1751 htmlnamePush(ctxt, BAD_CAST"p"); 1752 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL)) 1753 ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL); 1754 return(1); 1755 } 1756 } 1757 return(0); 1758 } 1759 1760 /** 1761 * htmlIsScriptAttribute: 1762 * @name: an attribute name 1763 * 1764 * Check if an attribute is of content type Script 1765 * 1766 * Returns 1 is the attribute is a script 0 otherwise 1767 */ 1768 int 1769 htmlIsScriptAttribute(const xmlChar *name) { 1770 unsigned int i; 1771 1772 if (name == NULL) 1773 return(0); 1774 /* 1775 * all script attributes start with 'on' 1776 */ 1777 if ((name[0] != 'o') || (name[1] != 'n')) 1778 return(0); 1779 for (i = 0; 1780 i < sizeof(htmlScriptAttributes)/sizeof(htmlScriptAttributes[0]); 1781 i++) { 1782 if (xmlStrEqual(name, (const xmlChar *) htmlScriptAttributes[i])) 1783 return(1); 1784 } 1785 return(0); 1786 } 1787 1788 /************************************************************************ 1789 * * 1790 * The list of HTML predefined entities * 1791 * * 1792 ************************************************************************/ 1793 1794 1795 static const htmlEntityDesc html40EntitiesTable[] = { 1796 /* 1797 * the 4 absolute ones, plus apostrophe. 
1798 */ 1799 { 34, "quot", "quotation mark = APL quote, U+0022 ISOnum" }, 1800 { 38, "amp", "ampersand, U+0026 ISOnum" }, 1801 { 39, "apos", "single quote" }, 1802 { 60, "lt", "less-than sign, U+003C ISOnum" }, 1803 { 62, "gt", "greater-than sign, U+003E ISOnum" }, 1804 1805 /* 1806 * A bunch still in the 128-255 range 1807 * Replacing them depend really on the charset used. 1808 */ 1809 { 160, "nbsp", "no-break space = non-breaking space, U+00A0 ISOnum" }, 1810 { 161, "iexcl","inverted exclamation mark, U+00A1 ISOnum" }, 1811 { 162, "cent", "cent sign, U+00A2 ISOnum" }, 1812 { 163, "pound","pound sign, U+00A3 ISOnum" }, 1813 { 164, "curren","currency sign, U+00A4 ISOnum" }, 1814 { 165, "yen", "yen sign = yuan sign, U+00A5 ISOnum" }, 1815 { 166, "brvbar","broken bar = broken vertical bar, U+00A6 ISOnum" }, 1816 { 167, "sect", "section sign, U+00A7 ISOnum" }, 1817 { 168, "uml", "diaeresis = spacing diaeresis, U+00A8 ISOdia" }, 1818 { 169, "copy", "copyright sign, U+00A9 ISOnum" }, 1819 { 170, "ordf", "feminine ordinal indicator, U+00AA ISOnum" }, 1820 { 171, "laquo","left-pointing double angle quotation mark = left pointing guillemet, U+00AB ISOnum" }, 1821 { 172, "not", "not sign, U+00AC ISOnum" }, 1822 { 173, "shy", "soft hyphen = discretionary hyphen, U+00AD ISOnum" }, 1823 { 174, "reg", "registered sign = registered trade mark sign, U+00AE ISOnum" }, 1824 { 175, "macr", "macron = spacing macron = overline = APL overbar, U+00AF ISOdia" }, 1825 { 176, "deg", "degree sign, U+00B0 ISOnum" }, 1826 { 177, "plusmn","plus-minus sign = plus-or-minus sign, U+00B1 ISOnum" }, 1827 { 178, "sup2", "superscript two = superscript digit two = squared, U+00B2 ISOnum" }, 1828 { 179, "sup3", "superscript three = superscript digit three = cubed, U+00B3 ISOnum" }, 1829 { 180, "acute","acute accent = spacing acute, U+00B4 ISOdia" }, 1830 { 181, "micro","micro sign, U+00B5 ISOnum" }, 1831 { 182, "para", "pilcrow sign = paragraph sign, U+00B6 ISOnum" }, 1832 { 183, "middot","middle dot 
= Georgian comma Greek middle dot, U+00B7 ISOnum" }, 1833 { 184, "cedil","cedilla = spacing cedilla, U+00B8 ISOdia" }, 1834 { 185, "sup1", "superscript one = superscript digit one, U+00B9 ISOnum" }, 1835 { 186, "ordm", "masculine ordinal indicator, U+00BA ISOnum" }, 1836 { 187, "raquo","right-pointing double angle quotation mark right pointing guillemet, U+00BB ISOnum" }, 1837 { 188, "frac14","vulgar fraction one quarter = fraction one quarter, U+00BC ISOnum" }, 1838 { 189, "frac12","vulgar fraction one half = fraction one half, U+00BD ISOnum" }, 1839 { 190, "frac34","vulgar fraction three quarters = fraction three quarters, U+00BE ISOnum" }, 1840 { 191, "iquest","inverted question mark = turned question mark, U+00BF ISOnum" }, 1841 { 192, "Agrave","latin capital letter A with grave = latin capital letter A grave, U+00C0 ISOlat1" }, 1842 { 193, "Aacute","latin capital letter A with acute, U+00C1 ISOlat1" }, 1843 { 194, "Acirc","latin capital letter A with circumflex, U+00C2 ISOlat1" }, 1844 { 195, "Atilde","latin capital letter A with tilde, U+00C3 ISOlat1" }, 1845 { 196, "Auml", "latin capital letter A with diaeresis, U+00C4 ISOlat1" }, 1846 { 197, "Aring","latin capital letter A with ring above = latin capital letter A ring, U+00C5 ISOlat1" }, 1847 { 198, "AElig","latin capital letter AE = latin capital ligature AE, U+00C6 ISOlat1" }, 1848 { 199, "Ccedil","latin capital letter C with cedilla, U+00C7 ISOlat1" }, 1849 { 200, "Egrave","latin capital letter E with grave, U+00C8 ISOlat1" }, 1850 { 201, "Eacute","latin capital letter E with acute, U+00C9 ISOlat1" }, 1851 { 202, "Ecirc","latin capital letter E with circumflex, U+00CA ISOlat1" }, 1852 { 203, "Euml", "latin capital letter E with diaeresis, U+00CB ISOlat1" }, 1853 { 204, "Igrave","latin capital letter I with grave, U+00CC ISOlat1" }, 1854 { 205, "Iacute","latin capital letter I with acute, U+00CD ISOlat1" }, 1855 { 206, "Icirc","latin capital letter I with circumflex, U+00CE ISOlat1" }, 1856 { 207, "Iuml", 
"latin capital letter I with diaeresis, U+00CF ISOlat1" }, 1857 { 208, "ETH", "latin capital letter ETH, U+00D0 ISOlat1" }, 1858 { 209, "Ntilde","latin capital letter N with tilde, U+00D1 ISOlat1" }, 1859 { 210, "Ograve","latin capital letter O with grave, U+00D2 ISOlat1" }, 1860 { 211, "Oacute","latin capital letter O with acute, U+00D3 ISOlat1" }, 1861 { 212, "Ocirc","latin capital letter O with circumflex, U+00D4 ISOlat1" }, 1862 { 213, "Otilde","latin capital letter O with tilde, U+00D5 ISOlat1" }, 1863 { 214, "Ouml", "latin capital letter O with diaeresis, U+00D6 ISOlat1" }, 1864 { 215, "times","multiplication sign, U+00D7 ISOnum" }, 1865 { 216, "Oslash","latin capital letter O with stroke latin capital letter O slash, U+00D8 ISOlat1" }, 1866 { 217, "Ugrave","latin capital letter U with grave, U+00D9 ISOlat1" }, 1867 { 218, "Uacute","latin capital letter U with acute, U+00DA ISOlat1" }, 1868 { 219, "Ucirc","latin capital letter U with circumflex, U+00DB ISOlat1" }, 1869 { 220, "Uuml", "latin capital letter U with diaeresis, U+00DC ISOlat1" }, 1870 { 221, "Yacute","latin capital letter Y with acute, U+00DD ISOlat1" }, 1871 { 222, "THORN","latin capital letter THORN, U+00DE ISOlat1" }, 1872 { 223, "szlig","latin small letter sharp s = ess-zed, U+00DF ISOlat1" }, 1873 { 224, "agrave","latin small letter a with grave = latin small letter a grave, U+00E0 ISOlat1" }, 1874 { 225, "aacute","latin small letter a with acute, U+00E1 ISOlat1" }, 1875 { 226, "acirc","latin small letter a with circumflex, U+00E2 ISOlat1" }, 1876 { 227, "atilde","latin small letter a with tilde, U+00E3 ISOlat1" }, 1877 { 228, "auml", "latin small letter a with diaeresis, U+00E4 ISOlat1" }, 1878 { 229, "aring","latin small letter a with ring above = latin small letter a ring, U+00E5 ISOlat1" }, 1879 { 230, "aelig","latin small letter ae = latin small ligature ae, U+00E6 ISOlat1" }, 1880 { 231, "ccedil","latin small letter c with cedilla, U+00E7 ISOlat1" }, 1881 { 232, "egrave","latin small 
letter e with grave, U+00E8 ISOlat1" }, 1882 { 233, "eacute","latin small letter e with acute, U+00E9 ISOlat1" }, 1883 { 234, "ecirc","latin small letter e with circumflex, U+00EA ISOlat1" }, 1884 { 235, "euml", "latin small letter e with diaeresis, U+00EB ISOlat1" }, 1885 { 236, "igrave","latin small letter i with grave, U+00EC ISOlat1" }, 1886 { 237, "iacute","latin small letter i with acute, U+00ED ISOlat1" }, 1887 { 238, "icirc","latin small letter i with circumflex, U+00EE ISOlat1" }, 1888 { 239, "iuml", "latin small letter i with diaeresis, U+00EF ISOlat1" }, 1889 { 240, "eth", "latin small letter eth, U+00F0 ISOlat1" }, 1890 { 241, "ntilde","latin small letter n with tilde, U+00F1 ISOlat1" }, 1891 { 242, "ograve","latin small letter o with grave, U+00F2 ISOlat1" }, 1892 { 243, "oacute","latin small letter o with acute, U+00F3 ISOlat1" }, 1893 { 244, "ocirc","latin small letter o with circumflex, U+00F4 ISOlat1" }, 1894 { 245, "otilde","latin small letter o with tilde, U+00F5 ISOlat1" }, 1895 { 246, "ouml", "latin small letter o with diaeresis, U+00F6 ISOlat1" }, 1896 { 247, "divide","division sign, U+00F7 ISOnum" }, 1897 { 248, "oslash","latin small letter o with stroke, = latin small letter o slash, U+00F8 ISOlat1" }, 1898 { 249, "ugrave","latin small letter u with grave, U+00F9 ISOlat1" }, 1899 { 250, "uacute","latin small letter u with acute, U+00FA ISOlat1" }, 1900 { 251, "ucirc","latin small letter u with circumflex, U+00FB ISOlat1" }, 1901 { 252, "uuml", "latin small letter u with diaeresis, U+00FC ISOlat1" }, 1902 { 253, "yacute","latin small letter y with acute, U+00FD ISOlat1" }, 1903 { 254, "thorn","latin small letter thorn with, U+00FE ISOlat1" }, 1904 { 255, "yuml", "latin small letter y with diaeresis, U+00FF ISOlat1" }, 1905 1906 { 338, "OElig","latin capital ligature OE, U+0152 ISOlat2" }, 1907 { 339, "oelig","latin small ligature oe, U+0153 ISOlat2" }, 1908 { 352, "Scaron","latin capital letter S with caron, U+0160 ISOlat2" }, 1909 { 353, 
"scaron","latin small letter s with caron, U+0161 ISOlat2" }, 1910 { 376, "Yuml", "latin capital letter Y with diaeresis, U+0178 ISOlat2" }, 1911 1912 /* 1913 * Anything below should really be kept as entities references 1914 */ 1915 { 402, "fnof", "latin small f with hook = function = florin, U+0192 ISOtech" }, 1916 1917 { 710, "circ", "modifier letter circumflex accent, U+02C6 ISOpub" }, 1918 { 732, "tilde","small tilde, U+02DC ISOdia" }, 1919 1920 { 913, "Alpha","greek capital letter alpha, U+0391" }, 1921 { 914, "Beta", "greek capital letter beta, U+0392" }, 1922 { 915, "Gamma","greek capital letter gamma, U+0393 ISOgrk3" }, 1923 { 916, "Delta","greek capital letter delta, U+0394 ISOgrk3" }, 1924 { 917, "Epsilon","greek capital letter epsilon, U+0395" }, 1925 { 918, "Zeta", "greek capital letter zeta, U+0396" }, 1926 { 919, "Eta", "greek capital letter eta, U+0397" }, 1927 { 920, "Theta","greek capital letter theta, U+0398 ISOgrk3" }, 1928 { 921, "Iota", "greek capital letter iota, U+0399" }, 1929 { 922, "Kappa","greek capital letter kappa, U+039A" }, 1930 { 923, "Lambda", "greek capital letter lambda, U+039B ISOgrk3" }, 1931 { 924, "Mu", "greek capital letter mu, U+039C" }, 1932 { 925, "Nu", "greek capital letter nu, U+039D" }, 1933 { 926, "Xi", "greek capital letter xi, U+039E ISOgrk3" }, 1934 { 927, "Omicron","greek capital letter omicron, U+039F" }, 1935 { 928, "Pi", "greek capital letter pi, U+03A0 ISOgrk3" }, 1936 { 929, "Rho", "greek capital letter rho, U+03A1" }, 1937 { 931, "Sigma","greek capital letter sigma, U+03A3 ISOgrk3" }, 1938 { 932, "Tau", "greek capital letter tau, U+03A4" }, 1939 { 933, "Upsilon","greek capital letter upsilon, U+03A5 ISOgrk3" }, 1940 { 934, "Phi", "greek capital letter phi, U+03A6 ISOgrk3" }, 1941 { 935, "Chi", "greek capital letter chi, U+03A7" }, 1942 { 936, "Psi", "greek capital letter psi, U+03A8 ISOgrk3" }, 1943 { 937, "Omega","greek capital letter omega, U+03A9 ISOgrk3" }, 1944 1945 { 945, "alpha","greek small letter 
alpha, U+03B1 ISOgrk3" }, 1946 { 946, "beta", "greek small letter beta, U+03B2 ISOgrk3" }, 1947 { 947, "gamma","greek small letter gamma, U+03B3 ISOgrk3" }, 1948 { 948, "delta","greek small letter delta, U+03B4 ISOgrk3" }, 1949 { 949, "epsilon","greek small letter epsilon, U+03B5 ISOgrk3" }, 1950 { 950, "zeta", "greek small letter zeta, U+03B6 ISOgrk3" }, 1951 { 951, "eta", "greek small letter eta, U+03B7 ISOgrk3" }, 1952 { 952, "theta","greek small letter theta, U+03B8 ISOgrk3" }, 1953 { 953, "iota", "greek small letter iota, U+03B9 ISOgrk3" }, 1954 { 954, "kappa","greek small letter kappa, U+03BA ISOgrk3" }, 1955 { 955, "lambda","greek small letter lambda, U+03BB ISOgrk3" }, 1956 { 956, "mu", "greek small letter mu, U+03BC ISOgrk3" }, 1957 { 957, "nu", "greek small letter nu, U+03BD ISOgrk3" }, 1958 { 958, "xi", "greek small letter xi, U+03BE ISOgrk3" }, 1959 { 959, "omicron","greek small letter omicron, U+03BF NEW" }, 1960 { 960, "pi", "greek small letter pi, U+03C0 ISOgrk3" }, 1961 { 961, "rho", "greek small letter rho, U+03C1 ISOgrk3" }, 1962 { 962, "sigmaf","greek small letter final sigma, U+03C2 ISOgrk3" }, 1963 { 963, "sigma","greek small letter sigma, U+03C3 ISOgrk3" }, 1964 { 964, "tau", "greek small letter tau, U+03C4 ISOgrk3" }, 1965 { 965, "upsilon","greek small letter upsilon, U+03C5 ISOgrk3" }, 1966 { 966, "phi", "greek small letter phi, U+03C6 ISOgrk3" }, 1967 { 967, "chi", "greek small letter chi, U+03C7 ISOgrk3" }, 1968 { 968, "psi", "greek small letter psi, U+03C8 ISOgrk3" }, 1969 { 969, "omega","greek small letter omega, U+03C9 ISOgrk3" }, 1970 { 977, "thetasym","greek small letter theta symbol, U+03D1 NEW" }, 1971 { 978, "upsih","greek upsilon with hook symbol, U+03D2 NEW" }, 1972 { 982, "piv", "greek pi symbol, U+03D6 ISOgrk3" }, 1973 1974 { 8194, "ensp", "en space, U+2002 ISOpub" }, 1975 { 8195, "emsp", "em space, U+2003 ISOpub" }, 1976 { 8201, "thinsp","thin space, U+2009 ISOpub" }, 1977 { 8204, "zwnj", "zero width non-joiner, U+200C NEW RFC 
2070" }, 1978 { 8205, "zwj", "zero width joiner, U+200D NEW RFC 2070" }, 1979 { 8206, "lrm", "left-to-right mark, U+200E NEW RFC 2070" }, 1980 { 8207, "rlm", "right-to-left mark, U+200F NEW RFC 2070" }, 1981 { 8211, "ndash","en dash, U+2013 ISOpub" }, 1982 { 8212, "mdash","em dash, U+2014 ISOpub" }, 1983 { 8216, "lsquo","left single quotation mark, U+2018 ISOnum" }, 1984 { 8217, "rsquo","right single quotation mark, U+2019 ISOnum" }, 1985 { 8218, "sbquo","single low-9 quotation mark, U+201A NEW" }, 1986 { 8220, "ldquo","left double quotation mark, U+201C ISOnum" }, 1987 { 8221, "rdquo","right double quotation mark, U+201D ISOnum" }, 1988 { 8222, "bdquo","double low-9 quotation mark, U+201E NEW" }, 1989 { 8224, "dagger","dagger, U+2020 ISOpub" }, 1990 { 8225, "Dagger","double dagger, U+2021 ISOpub" }, 1991 1992 { 8226, "bull", "bullet = black small circle, U+2022 ISOpub" }, 1993 { 8230, "hellip","horizontal ellipsis = three dot leader, U+2026 ISOpub" }, 1994 1995 { 8240, "permil","per mille sign, U+2030 ISOtech" }, 1996 1997 { 8242, "prime","prime = minutes = feet, U+2032 ISOtech" }, 1998 { 8243, "Prime","double prime = seconds = inches, U+2033 ISOtech" }, 1999 2000 { 8249, "lsaquo","single left-pointing angle quotation mark, U+2039 ISO proposed" }, 2001 { 8250, "rsaquo","single right-pointing angle quotation mark, U+203A ISO proposed" }, 2002 2003 { 8254, "oline","overline = spacing overscore, U+203E NEW" }, 2004 { 8260, "frasl","fraction slash, U+2044 NEW" }, 2005 2006 { 8364, "euro", "euro sign, U+20AC NEW" }, 2007 2008 { 8465, "image","blackletter capital I = imaginary part, U+2111 ISOamso" }, 2009 { 8472, "weierp","script capital P = power set = Weierstrass p, U+2118 ISOamso" }, 2010 { 8476, "real", "blackletter capital R = real part symbol, U+211C ISOamso" }, 2011 { 8482, "trade","trade mark sign, U+2122 ISOnum" }, 2012 { 8501, "alefsym","alef symbol = first transfinite cardinal, U+2135 NEW" }, 2013 { 8592, "larr", "leftwards arrow, U+2190 ISOnum" }, 2014 { 
8593, "uarr", "upwards arrow, U+2191 ISOnum" }, 2015 { 8594, "rarr", "rightwards arrow, U+2192 ISOnum" }, 2016 { 8595, "darr", "downwards arrow, U+2193 ISOnum" }, 2017 { 8596, "harr", "left right arrow, U+2194 ISOamsa" }, 2018 { 8629, "crarr","downwards arrow with corner leftwards = carriage return, U+21B5 NEW" }, 2019 { 8656, "lArr", "leftwards double arrow, U+21D0 ISOtech" }, 2020 { 8657, "uArr", "upwards double arrow, U+21D1 ISOamsa" }, 2021 { 8658, "rArr", "rightwards double arrow, U+21D2 ISOtech" }, 2022 { 8659, "dArr", "downwards double arrow, U+21D3 ISOamsa" }, 2023 { 8660, "hArr", "left right double arrow, U+21D4 ISOamsa" }, 2024 2025 { 8704, "forall","for all, U+2200 ISOtech" }, 2026 { 8706, "part", "partial differential, U+2202 ISOtech" }, 2027 { 8707, "exist","there exists, U+2203 ISOtech" }, 2028 { 8709, "empty","empty set = null set = diameter, U+2205 ISOamso" }, 2029 { 8711, "nabla","nabla = backward difference, U+2207 ISOtech" }, 2030 { 8712, "isin", "element of, U+2208 ISOtech" }, 2031 { 8713, "notin","not an element of, U+2209 ISOtech" }, 2032 { 8715, "ni", "contains as member, U+220B ISOtech" }, 2033 { 8719, "prod", "n-ary product = product sign, U+220F ISOamsb" }, 2034 { 8721, "sum", "n-ary summation, U+2211 ISOamsb" }, 2035 { 8722, "minus","minus sign, U+2212 ISOtech" }, 2036 { 8727, "lowast","asterisk operator, U+2217 ISOtech" }, 2037 { 8730, "radic","square root = radical sign, U+221A ISOtech" }, 2038 { 8733, "prop", "proportional to, U+221D ISOtech" }, 2039 { 8734, "infin","infinity, U+221E ISOtech" }, 2040 { 8736, "ang", "angle, U+2220 ISOamso" }, 2041 { 8743, "and", "logical and = wedge, U+2227 ISOtech" }, 2042 { 8744, "or", "logical or = vee, U+2228 ISOtech" }, 2043 { 8745, "cap", "intersection = cap, U+2229 ISOtech" }, 2044 { 8746, "cup", "union = cup, U+222A ISOtech" }, 2045 { 8747, "int", "integral, U+222B ISOtech" }, 2046 { 8756, "there4","therefore, U+2234 ISOtech" }, 2047 { 8764, "sim", "tilde operator = varies with = similar to, 
U+223C ISOtech" }, 2048 { 8773, "cong", "approximately equal to, U+2245 ISOtech" }, 2049 { 8776, "asymp","almost equal to = asymptotic to, U+2248 ISOamsr" }, 2050 { 8800, "ne", "not equal to, U+2260 ISOtech" }, 2051 { 8801, "equiv","identical to, U+2261 ISOtech" }, 2052 { 8804, "le", "less-than or equal to, U+2264 ISOtech" }, 2053 { 8805, "ge", "greater-than or equal to, U+2265 ISOtech" }, 2054 { 8834, "sub", "subset of, U+2282 ISOtech" }, 2055 { 8835, "sup", "superset of, U+2283 ISOtech" }, 2056 { 8836, "nsub", "not a subset of, U+2284 ISOamsn" }, 2057 { 8838, "sube", "subset of or equal to, U+2286 ISOtech" }, 2058 { 8839, "supe", "superset of or equal to, U+2287 ISOtech" }, 2059 { 8853, "oplus","circled plus = direct sum, U+2295 ISOamsb" }, 2060 { 8855, "otimes","circled times = vector product, U+2297 ISOamsb" }, 2061 { 8869, "perp", "up tack = orthogonal to = perpendicular, U+22A5 ISOtech" }, 2062 { 8901, "sdot", "dot operator, U+22C5 ISOamsb" }, 2063 { 8968, "lceil","left ceiling = apl upstile, U+2308 ISOamsc" }, 2064 { 8969, "rceil","right ceiling, U+2309 ISOamsc" }, 2065 { 8970, "lfloor","left floor = apl downstile, U+230A ISOamsc" }, 2066 { 8971, "rfloor","right floor, U+230B ISOamsc" }, 2067 { 9001, "lang", "left-pointing angle bracket = bra, U+2329 ISOtech" }, 2068 { 9002, "rang", "right-pointing angle bracket = ket, U+232A ISOtech" }, 2069 { 9674, "loz", "lozenge, U+25CA ISOpub" }, 2070 2071 { 9824, "spades","black spade suit, U+2660 ISOpub" }, 2072 { 9827, "clubs","black club suit = shamrock, U+2663 ISOpub" }, 2073 { 9829, "hearts","black heart suit = valentine, U+2665 ISOpub" }, 2074 { 9830, "diams","black diamond suit, U+2666 ISOpub" }, 2075 2076 }; 2077 2078 /************************************************************************ 2079 * * 2080 * Commodity functions to handle entities * 2081 * * 2082 ************************************************************************/ 2083 2084 /* 2085 * Macro used to grow the current buffer. 
2086 */ 2087 #define growBuffer(buffer) { \ 2088 xmlChar *tmp; \ 2089 buffer##_size *= 2; \ 2090 tmp = (xmlChar *) xmlRealloc(buffer, buffer##_size * sizeof(xmlChar)); \ 2091 if (tmp == NULL) { \ 2092 htmlErrMemory(ctxt, "growing buffer\n"); \ 2093 xmlFree(buffer); \ 2094 return(NULL); \ 2095 } \ 2096 buffer = tmp; \ 2097 } 2098 2099 /** 2100 * htmlEntityLookup: 2101 * @name: the entity name 2102 * 2103 * Lookup the given entity in EntitiesTable 2104 * 2105 * TODO: the linear scan is really ugly, an hash table is really needed. 2106 * 2107 * Returns the associated htmlEntityDescPtr if found, NULL otherwise. 2108 */ 2109 const htmlEntityDesc * 2110 htmlEntityLookup(const xmlChar *name) { 2111 unsigned int i; 2112 2113 for (i = 0;i < (sizeof(html40EntitiesTable)/ 2114 sizeof(html40EntitiesTable[0]));i++) { 2115 if (xmlStrEqual(name, BAD_CAST html40EntitiesTable[i].name)) { 2116 return((htmlEntityDescPtr) &html40EntitiesTable[i]); 2117 } 2118 } 2119 return(NULL); 2120 } 2121 2122 /** 2123 * htmlEntityValueLookup: 2124 * @value: the entity's unicode value 2125 * 2126 * Lookup the given entity in EntitiesTable 2127 * 2128 * TODO: the linear scan is really ugly, an hash table is really needed. 2129 * 2130 * Returns the associated htmlEntityDescPtr if found, NULL otherwise. 
2131 */ 2132 const htmlEntityDesc * 2133 htmlEntityValueLookup(unsigned int value) { 2134 unsigned int i; 2135 2136 for (i = 0;i < (sizeof(html40EntitiesTable)/ 2137 sizeof(html40EntitiesTable[0]));i++) { 2138 if (html40EntitiesTable[i].value >= value) { 2139 if (html40EntitiesTable[i].value > value) 2140 break; 2141 return((htmlEntityDescPtr) &html40EntitiesTable[i]); 2142 } 2143 } 2144 return(NULL); 2145 } 2146 2147 /** 2148 * UTF8ToHtml: 2149 * @out: a pointer to an array of bytes to store the result 2150 * @outlen: the length of @out 2151 * @in: a pointer to an array of UTF-8 chars 2152 * @inlen: the length of @in 2153 * 2154 * Take a block of UTF-8 chars in and try to convert it to an ASCII 2155 * plus HTML entities block of chars out. 2156 * 2157 * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise 2158 * The value of @inlen after return is the number of octets consumed 2159 * as the return value is positive, else unpredictable. 2160 * The value of @outlen after return is the number of octets consumed. 
2161 */ 2162 int 2163 UTF8ToHtml(unsigned char* out, int *outlen, 2164 const unsigned char* in, int *inlen) { 2165 const unsigned char* processed = in; 2166 const unsigned char* outend; 2167 const unsigned char* outstart = out; 2168 const unsigned char* instart = in; 2169 const unsigned char* inend; 2170 unsigned int c, d; 2171 int trailing; 2172 2173 if ((out == NULL) || (outlen == NULL) || (inlen == NULL)) return(-1); 2174 if (in == NULL) { 2175 /* 2176 * initialization nothing to do 2177 */ 2178 *outlen = 0; 2179 *inlen = 0; 2180 return(0); 2181 } 2182 inend = in + (*inlen); 2183 outend = out + (*outlen); 2184 while (in < inend) { 2185 d = *in++; 2186 if (d < 0x80) { c= d; trailing= 0; } 2187 else if (d < 0xC0) { 2188 /* trailing byte in leading position */ 2189 *outlen = out - outstart; 2190 *inlen = processed - instart; 2191 return(-2); 2192 } else if (d < 0xE0) { c= d & 0x1F; trailing= 1; } 2193 else if (d < 0xF0) { c= d & 0x0F; trailing= 2; } 2194 else if (d < 0xF8) { c= d & 0x07; trailing= 3; } 2195 else { 2196 /* no chance for this in Ascii */ 2197 *outlen = out - outstart; 2198 *inlen = processed - instart; 2199 return(-2); 2200 } 2201 2202 if (inend - in < trailing) { 2203 break; 2204 } 2205 2206 for ( ; trailing; trailing--) { 2207 if ((in >= inend) || (((d= *in++) & 0xC0) != 0x80)) 2208 break; 2209 c <<= 6; 2210 c |= d & 0x3F; 2211 } 2212 2213 /* assertion: c is a single UTF-4 value */ 2214 if (c < 0x80) { 2215 if (out + 1 >= outend) 2216 break; 2217 *out++ = c; 2218 } else { 2219 int len; 2220 const htmlEntityDesc * ent; 2221 const char *cp; 2222 char nbuf[16]; 2223 2224 /* 2225 * Try to lookup a predefined HTML entity for it 2226 */ 2227 2228 ent = htmlEntityValueLookup(c); 2229 if (ent == NULL) { 2230 snprintf(nbuf, sizeof(nbuf), "#%u", c); 2231 cp = nbuf; 2232 } 2233 else 2234 cp = ent->name; 2235 len = strlen(cp); 2236 if (out + 2 + len >= outend) 2237 break; 2238 *out++ = '&'; 2239 memcpy(out, cp, len); 2240 out += len; 2241 *out++ = ';'; 2242 } 
2243 processed = in; 2244 } 2245 *outlen = out - outstart; 2246 *inlen = processed - instart; 2247 return(0); 2248 } 2249 2250 /** 2251 * htmlEncodeEntities: 2252 * @out: a pointer to an array of bytes to store the result 2253 * @outlen: the length of @out 2254 * @in: a pointer to an array of UTF-8 chars 2255 * @inlen: the length of @in 2256 * @quoteChar: the quote character to escape (' or ") or zero. 2257 * 2258 * Take a block of UTF-8 chars in and try to convert it to an ASCII 2259 * plus HTML entities block of chars out. 2260 * 2261 * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise 2262 * The value of @inlen after return is the number of octets consumed 2263 * as the return value is positive, else unpredictable. 2264 * The value of @outlen after return is the number of octets consumed. 2265 */ 2266 int 2267 htmlEncodeEntities(unsigned char* out, int *outlen, 2268 const unsigned char* in, int *inlen, int quoteChar) { 2269 const unsigned char* processed = in; 2270 const unsigned char* outend; 2271 const unsigned char* outstart = out; 2272 const unsigned char* instart = in; 2273 const unsigned char* inend; 2274 unsigned int c, d; 2275 int trailing; 2276 2277 if ((out == NULL) || (outlen == NULL) || (inlen == NULL) || (in == NULL)) 2278 return(-1); 2279 outend = out + (*outlen); 2280 inend = in + (*inlen); 2281 while (in < inend) { 2282 d = *in++; 2283 if (d < 0x80) { c= d; trailing= 0; } 2284 else if (d < 0xC0) { 2285 /* trailing byte in leading position */ 2286 *outlen = out - outstart; 2287 *inlen = processed - instart; 2288 return(-2); 2289 } else if (d < 0xE0) { c= d & 0x1F; trailing= 1; } 2290 else if (d < 0xF0) { c= d & 0x0F; trailing= 2; } 2291 else if (d < 0xF8) { c= d & 0x07; trailing= 3; } 2292 else { 2293 /* no chance for this in Ascii */ 2294 *outlen = out - outstart; 2295 *inlen = processed - instart; 2296 return(-2); 2297 } 2298 2299 if (inend - in < trailing) 2300 break; 2301 2302 while (trailing--) { 2303 if (((d= *in++) & 0xC0) 
!= 0x80) { 2304 *outlen = out - outstart; 2305 *inlen = processed - instart; 2306 return(-2); 2307 } 2308 c <<= 6; 2309 c |= d & 0x3F; 2310 } 2311 2312 /* assertion: c is a single UTF-4 value */ 2313 if ((c < 0x80) && (c != (unsigned int) quoteChar) && 2314 (c != '&') && (c != '<') && (c != '>')) { 2315 if (out >= outend) 2316 break; 2317 *out++ = c; 2318 } else { 2319 const htmlEntityDesc * ent; 2320 const char *cp; 2321 char nbuf[16]; 2322 int len; 2323 2324 /* 2325 * Try to lookup a predefined HTML entity for it 2326 */ 2327 ent = htmlEntityValueLookup(c); 2328 if (ent == NULL) { 2329 snprintf(nbuf, sizeof(nbuf), "#%u", c); 2330 cp = nbuf; 2331 } 2332 else 2333 cp = ent->name; 2334 len = strlen(cp); 2335 if (out + 2 + len > outend) 2336 break; 2337 *out++ = '&'; 2338 memcpy(out, cp, len); 2339 out += len; 2340 *out++ = ';'; 2341 } 2342 processed = in; 2343 } 2344 *outlen = out - outstart; 2345 *inlen = processed - instart; 2346 return(0); 2347 } 2348 2349 /************************************************************************ 2350 * * 2351 * Commodity functions to handle streams * 2352 * * 2353 ************************************************************************/ 2354 2355 #ifdef LIBXML_PUSH_ENABLED 2356 /** 2357 * htmlNewInputStream: 2358 * @ctxt: an HTML parser context 2359 * 2360 * Create a new input stream structure 2361 * Returns the new input stream or NULL 2362 */ 2363 static htmlParserInputPtr 2364 htmlNewInputStream(htmlParserCtxtPtr ctxt) { 2365 htmlParserInputPtr input; 2366 2367 input = (xmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput)); 2368 if (input == NULL) { 2369 htmlErrMemory(ctxt, "couldn't allocate a new input stream\n"); 2370 return(NULL); 2371 } 2372 memset(input, 0, sizeof(htmlParserInput)); 2373 input->filename = NULL; 2374 input->directory = NULL; 2375 input->base = NULL; 2376 input->cur = NULL; 2377 input->buf = NULL; 2378 input->line = 1; 2379 input->col = 1; 2380 input->buf = NULL; 2381 input->free = NULL; 2382 
input->version = NULL; 2383 input->consumed = 0; 2384 input->length = 0; 2385 return(input); 2386 } 2387 #endif 2388 2389 2390 /************************************************************************ 2391 * * 2392 * Commodity functions, cleanup needed ? * 2393 * * 2394 ************************************************************************/ 2395 /* 2396 * all tags allowing pc data from the html 4.01 loose dtd 2397 * NOTE: it might be more appropriate to integrate this information 2398 * into the html40ElementTable array but I don't want to risk any 2399 * binary incompatibility 2400 */ 2401 static const char *allowPCData[] = { 2402 "a", "abbr", "acronym", "address", "applet", "b", "bdo", "big", 2403 "blockquote", "body", "button", "caption", "center", "cite", "code", 2404 "dd", "del", "dfn", "div", "dt", "em", "font", "form", "h1", "h2", 2405 "h3", "h4", "h5", "h6", "i", "iframe", "ins", "kbd", "label", "legend", 2406 "li", "noframes", "noscript", "object", "p", "pre", "q", "s", "samp", 2407 "small", "span", "strike", "strong", "td", "th", "tt", "u", "var" 2408 }; 2409 2410 /** 2411 * areBlanks: 2412 * @ctxt: an HTML parser context 2413 * @str: a xmlChar * 2414 * @len: the size of @str 2415 * 2416 * Is this a sequence of blank chars that one can ignore ? 2417 * 2418 * Returns 1 if ignorable 0 otherwise. 
2419 */ 2420 2421 static int areBlanks(htmlParserCtxtPtr ctxt, const xmlChar *str, int len) { 2422 unsigned int i; 2423 int j; 2424 xmlNodePtr lastChild; 2425 xmlDtdPtr dtd; 2426 2427 for (j = 0;j < len;j++) 2428 if (!(IS_BLANK_CH(str[j]))) return(0); 2429 2430 if (CUR == 0) return(1); 2431 if (CUR != '<') return(0); 2432 if (ctxt->name == NULL) 2433 return(1); 2434 if (xmlStrEqual(ctxt->name, BAD_CAST"html")) 2435 return(1); 2436 if (xmlStrEqual(ctxt->name, BAD_CAST"head")) 2437 return(1); 2438 2439 /* Only strip CDATA children of the body tag for strict HTML DTDs */ 2440 if (xmlStrEqual(ctxt->name, BAD_CAST "body") && ctxt->myDoc != NULL) { 2441 dtd = xmlGetIntSubset(ctxt->myDoc); 2442 if (dtd != NULL && dtd->ExternalID != NULL) { 2443 if (!xmlStrcasecmp(dtd->ExternalID, BAD_CAST "-//W3C//DTD HTML 4.01//EN") || 2444 !xmlStrcasecmp(dtd->ExternalID, BAD_CAST "-//W3C//DTD HTML 4//EN")) 2445 return(1); 2446 } 2447 } 2448 2449 if (ctxt->node == NULL) return(0); 2450 lastChild = xmlGetLastChild(ctxt->node); 2451 while ((lastChild) && (lastChild->type == XML_COMMENT_NODE)) 2452 lastChild = lastChild->prev; 2453 if (lastChild == NULL) { 2454 if ((ctxt->node->type != XML_ELEMENT_NODE) && 2455 (ctxt->node->content != NULL)) return(0); 2456 /* keep ws in constructs like ...<b> </b>... 
2457 for all tags "b" allowing PCDATA */ 2458 for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) { 2459 if ( xmlStrEqual(ctxt->name, BAD_CAST allowPCData[i]) ) { 2460 return(0); 2461 } 2462 } 2463 } else if (xmlNodeIsText(lastChild)) { 2464 return(0); 2465 } else { 2466 /* keep ws in constructs like <p><b>xy</b> <i>z</i><p> 2467 for all tags "p" allowing PCDATA */ 2468 for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) { 2469 if ( xmlStrEqual(lastChild->name, BAD_CAST allowPCData[i]) ) { 2470 return(0); 2471 } 2472 } 2473 } 2474 return(1); 2475 } 2476 2477 /** 2478 * htmlNewDocNoDtD: 2479 * @URI: URI for the dtd, or NULL 2480 * @ExternalID: the external ID of the DTD, or NULL 2481 * 2482 * Creates a new HTML document without a DTD node if @URI and @ExternalID 2483 * are NULL 2484 * 2485 * Returns a new document, do not initialize the DTD if not provided 2486 */ 2487 htmlDocPtr 2488 htmlNewDocNoDtD(const xmlChar *URI, const xmlChar *ExternalID) { 2489 xmlDocPtr cur; 2490 2491 /* 2492 * Allocate a new document and fill the fields. 
2493 */ 2494 cur = (xmlDocPtr) xmlMalloc(sizeof(xmlDoc)); 2495 if (cur == NULL) { 2496 htmlErrMemory(NULL, "HTML document creation failed\n"); 2497 return(NULL); 2498 } 2499 memset(cur, 0, sizeof(xmlDoc)); 2500 2501 cur->type = XML_HTML_DOCUMENT_NODE; 2502 cur->version = NULL; 2503 cur->intSubset = NULL; 2504 cur->doc = cur; 2505 cur->name = NULL; 2506 cur->children = NULL; 2507 cur->extSubset = NULL; 2508 cur->oldNs = NULL; 2509 cur->encoding = NULL; 2510 cur->standalone = 1; 2511 cur->compression = 0; 2512 cur->ids = NULL; 2513 cur->refs = NULL; 2514 cur->_private = NULL; 2515 cur->charset = XML_CHAR_ENCODING_UTF8; 2516 cur->properties = XML_DOC_HTML | XML_DOC_USERBUILT; 2517 if ((ExternalID != NULL) || 2518 (URI != NULL)) 2519 xmlCreateIntSubset(cur, BAD_CAST "html", ExternalID, URI); 2520 return(cur); 2521 } 2522 2523 /** 2524 * htmlNewDoc: 2525 * @URI: URI for the dtd, or NULL 2526 * @ExternalID: the external ID of the DTD, or NULL 2527 * 2528 * Creates a new HTML document 2529 * 2530 * Returns a new document 2531 */ 2532 htmlDocPtr 2533 htmlNewDoc(const xmlChar *URI, const xmlChar *ExternalID) { 2534 if ((URI == NULL) && (ExternalID == NULL)) 2535 return(htmlNewDocNoDtD( 2536 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd", 2537 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN")); 2538 2539 return(htmlNewDocNoDtD(URI, ExternalID)); 2540 } 2541 2542 2543 /************************************************************************ 2544 * * 2545 * The parser itself * 2546 * Relates to http://www.w3.org/TR/html40 * 2547 * * 2548 ************************************************************************/ 2549 2550 /************************************************************************ 2551 * * 2552 * The parser itself * 2553 * * 2554 ************************************************************************/ 2555 2556 static const xmlChar * htmlParseNameComplex(xmlParserCtxtPtr ctxt); 2557 2558 /** 2559 * htmlParseHTMLName: 2560 * @ctxt: an HTML parser context 2561 
* 2562 * parse an HTML tag or attribute name, note that we convert it to lowercase 2563 * since HTML names are not case-sensitive. 2564 * 2565 * Returns the Tag Name parsed or NULL 2566 */ 2567 2568 static const xmlChar * 2569 htmlParseHTMLName(htmlParserCtxtPtr ctxt) { 2570 int i = 0; 2571 xmlChar loc[HTML_PARSER_BUFFER_SIZE]; 2572 2573 if (!IS_ASCII_LETTER(CUR) && (CUR != '_') && 2574 (CUR != ':') && (CUR != '.')) return(NULL); 2575 2576 while ((i < HTML_PARSER_BUFFER_SIZE) && 2577 ((IS_ASCII_LETTER(CUR)) || (IS_ASCII_DIGIT(CUR)) || 2578 (CUR == ':') || (CUR == '-') || (CUR == '_') || 2579 (CUR == '.'))) { 2580 if ((CUR >= 'A') && (CUR <= 'Z')) loc[i] = CUR + 0x20; 2581 else loc[i] = CUR; 2582 i++; 2583 2584 NEXT; 2585 } 2586 2587 return(xmlDictLookup(ctxt->dict, loc, i)); 2588 } 2589 2590 2591 /** 2592 * htmlParseHTMLName_nonInvasive: 2593 * @ctxt: an HTML parser context 2594 * 2595 * parse an HTML tag or attribute name, note that we convert it to lowercase 2596 * since HTML names are not case-sensitive, this doesn't consume the data 2597 * from the stream, it's a look-ahead 2598 * 2599 * Returns the Tag Name parsed or NULL 2600 */ 2601 2602 static const xmlChar * 2603 htmlParseHTMLName_nonInvasive(htmlParserCtxtPtr ctxt) { 2604 int i = 0; 2605 xmlChar loc[HTML_PARSER_BUFFER_SIZE]; 2606 2607 if (!IS_ASCII_LETTER(NXT(1)) && (NXT(1) != '_') && 2608 (NXT(1) != ':')) return(NULL); 2609 2610 while ((i < HTML_PARSER_BUFFER_SIZE) && 2611 ((IS_ASCII_LETTER(NXT(1+i))) || (IS_ASCII_DIGIT(NXT(1+i))) || 2612 (NXT(1+i) == ':') || (NXT(1+i) == '-') || (NXT(1+i) == '_'))) { 2613 if ((NXT(1+i) >= 'A') && (NXT(1+i) <= 'Z')) loc[i] = NXT(1+i) + 0x20; 2614 else loc[i] = NXT(1+i); 2615 i++; 2616 } 2617 2618 return(xmlDictLookup(ctxt->dict, loc, i)); 2619 } 2620 2621 2622 /** 2623 * htmlParseName: 2624 * @ctxt: an HTML parser context 2625 * 2626 * parse an HTML name, this routine is case sensitive. 
2627 * 2628 * Returns the Name parsed or NULL 2629 */ 2630 2631 static const xmlChar * 2632 htmlParseName(htmlParserCtxtPtr ctxt) { 2633 const xmlChar *in; 2634 const xmlChar *ret; 2635 int count = 0; 2636 2637 GROW; 2638 2639 /* 2640 * Accelerator for simple ASCII names 2641 */ 2642 in = ctxt->input->cur; 2643 if (((*in >= 0x61) && (*in <= 0x7A)) || 2644 ((*in >= 0x41) && (*in <= 0x5A)) || 2645 (*in == '_') || (*in == ':')) { 2646 in++; 2647 while (((*in >= 0x61) && (*in <= 0x7A)) || 2648 ((*in >= 0x41) && (*in <= 0x5A)) || 2649 ((*in >= 0x30) && (*in <= 0x39)) || 2650 (*in == '_') || (*in == '-') || 2651 (*in == ':') || (*in == '.')) 2652 in++; 2653 2654 if (in == ctxt->input->end) 2655 return(NULL); 2656 2657 if ((*in > 0) && (*in < 0x80)) { 2658 count = in - ctxt->input->cur; 2659 ret = xmlDictLookup(ctxt->dict, ctxt->input->cur, count); 2660 ctxt->input->cur = in; 2661 ctxt->input->col += count; 2662 return(ret); 2663 } 2664 } 2665 return(htmlParseNameComplex(ctxt)); 2666 } 2667 2668 static const xmlChar * 2669 htmlParseNameComplex(xmlParserCtxtPtr ctxt) { 2670 int len = 0, l; 2671 int c; 2672 int count = 0; 2673 const xmlChar *base = ctxt->input->base; 2674 2675 /* 2676 * Handler for more complex cases 2677 */ 2678 GROW; 2679 c = CUR_CHAR(l); 2680 if ((c == ' ') || (c == '>') || (c == '/') || /* accelerators */ 2681 (!IS_LETTER(c) && (c != '_') && 2682 (c != ':'))) { 2683 return(NULL); 2684 } 2685 2686 while ((c != ' ') && (c != '>') && (c != '/') && /* test bigname.xml */ 2687 ((IS_LETTER(c)) || (IS_DIGIT(c)) || 2688 (c == '.') || (c == '-') || 2689 (c == '_') || (c == ':') || 2690 (IS_COMBINING(c)) || 2691 (IS_EXTENDER(c)))) { 2692 if (count++ > 100) { 2693 count = 0; 2694 GROW; 2695 } 2696 len += l; 2697 NEXTL(l); 2698 c = CUR_CHAR(l); 2699 if (ctxt->input->base != base) { 2700 /* 2701 * We changed encoding from an unknown encoding 2702 * Input buffer changed location, so we better start again 2703 */ 2704 return(htmlParseNameComplex(ctxt)); 2705 } 2706 } 
2707 2708 if (ctxt->input->cur - ctxt->input->base < len) { 2709 /* Sanity check */ 2710 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR, 2711 "unexpected change of input buffer", NULL, NULL); 2712 return (NULL); 2713 } 2714 2715 return(xmlDictLookup(ctxt->dict, ctxt->input->cur - len, len)); 2716 } 2717 2718 2719 /** 2720 * htmlParseHTMLAttribute: 2721 * @ctxt: an HTML parser context 2722 * @stop: a char stop value 2723 * 2724 * parse an HTML attribute value till the stop (quote), if 2725 * stop is 0 then it stops at the first space 2726 * 2727 * Returns the attribute parsed or NULL 2728 */ 2729 2730 static xmlChar * 2731 htmlParseHTMLAttribute(htmlParserCtxtPtr ctxt, const xmlChar stop) { 2732 xmlChar *buffer = NULL; 2733 int buffer_size = 0; 2734 xmlChar *out = NULL; 2735 const xmlChar *name = NULL; 2736 const xmlChar *cur = NULL; 2737 const htmlEntityDesc * ent; 2738 2739 /* 2740 * allocate a translation buffer. 2741 */ 2742 buffer_size = HTML_PARSER_BUFFER_SIZE; 2743 buffer = (xmlChar *) xmlMallocAtomic(buffer_size * sizeof(xmlChar)); 2744 if (buffer == NULL) { 2745 htmlErrMemory(ctxt, "buffer allocation failed\n"); 2746 return(NULL); 2747 } 2748 out = buffer; 2749 2750 /* 2751 * Ok loop until we reach one of the ending chars 2752 */ 2753 while ((CUR != 0) && (CUR != stop)) { 2754 if ((stop == 0) && (CUR == '>')) break; 2755 if ((stop == 0) && (IS_BLANK_CH(CUR))) break; 2756 if (CUR == '&') { 2757 if (NXT(1) == '#') { 2758 unsigned int c; 2759 int bits; 2760 2761 c = htmlParseCharRef(ctxt); 2762 if (c < 0x80) 2763 { *out++ = c; bits= -6; } 2764 else if (c < 0x800) 2765 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; } 2766 else if (c < 0x10000) 2767 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; } 2768 else 2769 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; } 2770 2771 for ( ; bits >= 0; bits-= 6) { 2772 *out++ = ((c >> bits) & 0x3F) | 0x80; 2773 } 2774 2775 if (out - buffer > buffer_size - 100) { 2776 int indx = out - buffer; 2777 2778 growBuffer(buffer); 2779 out = 
&buffer[indx]; 2780 } 2781 } else { 2782 ent = htmlParseEntityRef(ctxt, &name); 2783 if (name == NULL) { 2784 *out++ = '&'; 2785 if (out - buffer > buffer_size - 100) { 2786 int indx = out - buffer; 2787 2788 growBuffer(buffer); 2789 out = &buffer[indx]; 2790 } 2791 } else if (ent == NULL) { 2792 *out++ = '&'; 2793 cur = name; 2794 while (*cur != 0) { 2795 if (out - buffer > buffer_size - 100) { 2796 int indx = out - buffer; 2797 2798 growBuffer(buffer); 2799 out = &buffer[indx]; 2800 } 2801 *out++ = *cur++; 2802 } 2803 } else { 2804 unsigned int c; 2805 int bits; 2806 2807 if (out - buffer > buffer_size - 100) { 2808 int indx = out - buffer; 2809 2810 growBuffer(buffer); 2811 out = &buffer[indx]; 2812 } 2813 c = ent->value; 2814 if (c < 0x80) 2815 { *out++ = c; bits= -6; } 2816 else if (c < 0x800) 2817 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; } 2818 else if (c < 0x10000) 2819 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; } 2820 else 2821 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; } 2822 2823 for ( ; bits >= 0; bits-= 6) { 2824 *out++ = ((c >> bits) & 0x3F) | 0x80; 2825 } 2826 } 2827 } 2828 } else { 2829 unsigned int c; 2830 int bits, l; 2831 2832 if (out - buffer > buffer_size - 100) { 2833 int indx = out - buffer; 2834 2835 growBuffer(buffer); 2836 out = &buffer[indx]; 2837 } 2838 c = CUR_CHAR(l); 2839 if (c < 0x80) 2840 { *out++ = c; bits= -6; } 2841 else if (c < 0x800) 2842 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; } 2843 else if (c < 0x10000) 2844 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; } 2845 else 2846 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; } 2847 2848 for ( ; bits >= 0; bits-= 6) { 2849 *out++ = ((c >> bits) & 0x3F) | 0x80; 2850 } 2851 NEXT; 2852 } 2853 } 2854 *out = 0; 2855 return(buffer); 2856 } 2857 2858 /** 2859 * htmlParseEntityRef: 2860 * @ctxt: an HTML parser context 2861 * @str: location to store the entity name 2862 * 2863 * parse an HTML ENTITY references 2864 * 2865 * [68] EntityRef ::= '&' Name ';' 2866 * 2867 * Returns 
the associated htmlEntityDescPtr if found, or NULL otherwise, 2868 * if non-NULL *str will have to be freed by the caller. 2869 */ 2870 const htmlEntityDesc * 2871 htmlParseEntityRef(htmlParserCtxtPtr ctxt, const xmlChar **str) { 2872 const xmlChar *name; 2873 const htmlEntityDesc * ent = NULL; 2874 2875 if (str != NULL) *str = NULL; 2876 if ((ctxt == NULL) || (ctxt->input == NULL)) return(NULL); 2877 2878 if (CUR == '&') { 2879 NEXT; 2880 name = htmlParseName(ctxt); 2881 if (name == NULL) { 2882 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED, 2883 "htmlParseEntityRef: no name\n", NULL, NULL); 2884 } else { 2885 GROW; 2886 if (CUR == ';') { 2887 if (str != NULL) 2888 *str = name; 2889 2890 /* 2891 * Lookup the entity in the table. 2892 */ 2893 ent = htmlEntityLookup(name); 2894 if (ent != NULL) /* OK that's ugly !!! */ 2895 NEXT; 2896 } else { 2897 htmlParseErr(ctxt, XML_ERR_ENTITYREF_SEMICOL_MISSING, 2898 "htmlParseEntityRef: expecting ';'\n", 2899 NULL, NULL); 2900 if (str != NULL) 2901 *str = name; 2902 } 2903 } 2904 } 2905 return(ent); 2906 } 2907 2908 /** 2909 * htmlParseAttValue: 2910 * @ctxt: an HTML parser context 2911 * 2912 * parse a value for an attribute 2913 * Note: the parser won't do substitution of entities here, this 2914 * will be handled later in xmlStringGetNodeList, unless it was 2915 * asked for ctxt->replaceEntities != 0 2916 * 2917 * Returns the AttValue parsed or NULL. 
2918 */ 2919 2920 static xmlChar * 2921 htmlParseAttValue(htmlParserCtxtPtr ctxt) { 2922 xmlChar *ret = NULL; 2923 2924 if (CUR == '"') { 2925 NEXT; 2926 ret = htmlParseHTMLAttribute(ctxt, '"'); 2927 if (CUR != '"') { 2928 htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED, 2929 "AttValue: \" expected\n", NULL, NULL); 2930 } else 2931 NEXT; 2932 } else if (CUR == '\'') { 2933 NEXT; 2934 ret = htmlParseHTMLAttribute(ctxt, '\''); 2935 if (CUR != '\'') { 2936 htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED, 2937 "AttValue: ' expected\n", NULL, NULL); 2938 } else 2939 NEXT; 2940 } else { 2941 /* 2942 * That's an HTMLism, the attribute value may not be quoted 2943 */ 2944 ret = htmlParseHTMLAttribute(ctxt, 0); 2945 if (ret == NULL) { 2946 htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_WITHOUT_VALUE, 2947 "AttValue: no value found\n", NULL, NULL); 2948 } 2949 } 2950 return(ret); 2951 } 2952 2953 /** 2954 * htmlParseSystemLiteral: 2955 * @ctxt: an HTML parser context 2956 * 2957 * parse an HTML Literal 2958 * 2959 * [11] SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'") 2960 * 2961 * Returns the SystemLiteral parsed or NULL 2962 */ 2963 2964 static xmlChar * 2965 htmlParseSystemLiteral(htmlParserCtxtPtr ctxt) { 2966 size_t len = 0, startPosition = 0; 2967 int err = 0; 2968 int quote; 2969 xmlChar *ret = NULL; 2970 2971 if ((CUR != '"') && (CUR != '\'')) { 2972 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED, 2973 "SystemLiteral \" or ' expected\n", NULL, NULL); 2974 return(NULL); 2975 } 2976 quote = CUR; 2977 NEXT; 2978 2979 if (CUR_PTR < BASE_PTR) 2980 return(ret); 2981 startPosition = CUR_PTR - BASE_PTR; 2982 2983 while ((CUR != 0) && (CUR != quote)) { 2984 /* TODO: Handle UTF-8 */ 2985 if (!IS_CHAR_CH(CUR)) { 2986 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR, 2987 "Invalid char in SystemLiteral 0x%X\n", CUR); 2988 err = 1; 2989 } 2990 NEXT; 2991 len++; 2992 } 2993 if (CUR != quote) { 2994 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED, 2995 "Unfinished SystemLiteral\n", NULL, 
NULL); 2996 } else { 2997 NEXT; 2998 if (err == 0) 2999 ret = xmlStrndup((BASE_PTR+startPosition), len); 3000 } 3001 3002 return(ret); 3003 } 3004 3005 /** 3006 * htmlParsePubidLiteral: 3007 * @ctxt: an HTML parser context 3008 * 3009 * parse an HTML public literal 3010 * 3011 * [12] PubidLiteral ::= '"' PubidChar* '"' | "'" (PubidChar - "'")* "'" 3012 * 3013 * Returns the PubidLiteral parsed or NULL. 3014 */ 3015 3016 static xmlChar * 3017 htmlParsePubidLiteral(htmlParserCtxtPtr ctxt) { 3018 size_t len = 0, startPosition = 0; 3019 int err = 0; 3020 int quote; 3021 xmlChar *ret = NULL; 3022 3023 if ((CUR != '"') && (CUR != '\'')) { 3024 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED, 3025 "PubidLiteral \" or ' expected\n", NULL, NULL); 3026 return(NULL); 3027 } 3028 quote = CUR; 3029 NEXT; 3030 3031 /* 3032 * Name ::= (Letter | '_') (NameChar)* 3033 */ 3034 if (CUR_PTR < BASE_PTR) 3035 return(ret); 3036 startPosition = CUR_PTR - BASE_PTR; 3037 3038 while ((CUR != 0) && (CUR != quote)) { 3039 if (!IS_PUBIDCHAR_CH(CUR)) { 3040 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR, 3041 "Invalid char in PubidLiteral 0x%X\n", CUR); 3042 err = 1; 3043 } 3044 len++; 3045 NEXT; 3046 } 3047 3048 if (CUR != '"') { 3049 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED, 3050 "Unfinished PubidLiteral\n", NULL, NULL); 3051 } else { 3052 NEXT; 3053 if (err == 0) 3054 ret = xmlStrndup((BASE_PTR + startPosition), len); 3055 } 3056 3057 return(ret); 3058 } 3059 3060 /** 3061 * htmlParseScript: 3062 * @ctxt: an HTML parser context 3063 * 3064 * parse the content of an HTML SCRIPT or STYLE element 3065 * http://www.w3.org/TR/html4/sgml/dtd.html#Script 3066 * http://www.w3.org/TR/html4/sgml/dtd.html#StyleSheet 3067 * http://www.w3.org/TR/html4/types.html#type-script 3068 * http://www.w3.org/TR/html4/types.html#h-6.15 3069 * http://www.w3.org/TR/html4/appendix/notes.html#h-B.3.2.1 3070 * 3071 * Script data ( %Script; in the DTD) can be the content of the SCRIPT 3072 * element and the value of 
intrinsic event attributes. User agents must
 * not evaluate script data as HTML markup but instead must pass it on as
 * data to a script engine.
 * NOTES:
 * - The content is passed like CDATA
 * - the attributes for style and scripting "onXXX" are also described
 *   as CDATA but SGML allows entities references in attributes so their
 *   processing is identical as other attributes
 *
 * The content is delivered in chunks of at most about
 * HTML_PARSER_BIG_BUFFER_SIZE bytes through the SAX cdataBlock callback,
 * or through characters when cdataBlock is not set. Nothing is delivered
 * when there is no SAX handler or when SAX is disabled.
 */
static void
htmlParseScript(htmlParserCtxtPtr ctxt) {
    xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 5];
    int nbchar = 0;
    int cur,l;

    SHRINK;
    cur = CUR_CHAR(l);
    while (cur != 0) {
        if ((cur == '<') && (NXT(1) == '/')) {
            /*
             * One should break here, the specification is clear:
             * Authors should therefore escape "</" within the content.
             * Escape mechanisms are specific to each scripting or
             * style sheet language.
             *
             * In recovery mode, only break if end tag match the
             * current tag, effectively ignoring all tags inside the
             * script/style block and treating the entire block as
             * CDATA.
             */
            if (ctxt->recovery) {
                if (xmlStrncasecmp(ctxt->name, ctxt->input->cur+2,
                                   xmlStrlen(ctxt->name)) == 0)
                {
                    break; /* while */
                } else {
                    htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
                                 "Element %s embeds close tag\n",
                                 ctxt->name, NULL);
                }
            } else {
                if (((NXT(2) >= 'A') && (NXT(2) <= 'Z')) ||
                    ((NXT(2) >= 'a') && (NXT(2) <= 'z')))
                {
                    break; /* while */
                }
            }
        }
        if (IS_CHAR(cur)) {
            COPY_BUF(l,buf,nbchar,cur);
        } else {
            htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
                            "Invalid char in CDATA 0x%X\n", cur);
        }
        if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
            buf[nbchar] = 0;
            /*
             * Same guard as the final flush below: never dereference a
             * NULL SAX handler and honour disableSAX.
             */
            if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
                if (ctxt->sax->cdataBlock != NULL) {
                    /*
                     * Insert as CDATA, which is the same as
                     * HTML_PRESERVE_NODE
                     */
                    ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
                } else if (ctxt->sax->characters != NULL) {
                    ctxt->sax->characters(ctxt->userData, buf, nbchar);
                }
            }
            nbchar = 0;
        }
        GROW;
        NEXTL(l);
        cur = CUR_CHAR(l);
    }

    /* Flush whatever is left in the local buffer. */
    if ((nbchar != 0) && (ctxt->sax != NULL) && (!ctxt->disableSAX)) {
        buf[nbchar] = 0;
        if (ctxt->sax->cdataBlock != NULL) {
            /*
             * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
             */
            ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
        } else if (ctxt->sax->characters != NULL) {
            ctxt->sax->characters(ctxt->userData, buf, nbchar);
        }
    }
}


/**
 * htmlParseCharDataInternal:
 * @ctxt:  an HTML parser context
 * @readahead: optional read ahead character in ascii range
 *
 * parse a CharData section.
 * if we are within a CDATA section ']]>' marks an end of section.
3164 * 3165 * [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*) 3166 */ 3167 3168 static void 3169 htmlParseCharDataInternal(htmlParserCtxtPtr ctxt, int readahead) { 3170 xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 6]; 3171 int nbchar = 0; 3172 int cur, l; 3173 int chunk = 0; 3174 3175 if (readahead) 3176 buf[nbchar++] = readahead; 3177 3178 SHRINK; 3179 cur = CUR_CHAR(l); 3180 while (((cur != '<') || (ctxt->token == '<')) && 3181 ((cur != '&') || (ctxt->token == '&')) && 3182 (cur != 0)) { 3183 if (!(IS_CHAR(cur))) { 3184 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR, 3185 "Invalid char in CDATA 0x%X\n", cur); 3186 } else { 3187 COPY_BUF(l,buf,nbchar,cur); 3188 } 3189 if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) { 3190 buf[nbchar] = 0; 3191 3192 /* 3193 * Ok the segment is to be consumed as chars. 3194 */ 3195 if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) { 3196 if (areBlanks(ctxt, buf, nbchar)) { 3197 if (ctxt->keepBlanks) { 3198 if (ctxt->sax->characters != NULL) 3199 ctxt->sax->characters(ctxt->userData, buf, nbchar); 3200 } else { 3201 if (ctxt->sax->ignorableWhitespace != NULL) 3202 ctxt->sax->ignorableWhitespace(ctxt->userData, 3203 buf, nbchar); 3204 } 3205 } else { 3206 htmlCheckParagraph(ctxt); 3207 if (ctxt->sax->characters != NULL) 3208 ctxt->sax->characters(ctxt->userData, buf, nbchar); 3209 } 3210 } 3211 nbchar = 0; 3212 } 3213 NEXTL(l); 3214 chunk++; 3215 if (chunk > HTML_PARSER_BUFFER_SIZE) { 3216 chunk = 0; 3217 SHRINK; 3218 GROW; 3219 } 3220 cur = CUR_CHAR(l); 3221 if (cur == 0) { 3222 SHRINK; 3223 GROW; 3224 cur = CUR_CHAR(l); 3225 } 3226 } 3227 if (nbchar != 0) { 3228 buf[nbchar] = 0; 3229 3230 /* 3231 * Ok the segment is to be consumed as chars. 
3232 */ 3233 if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) { 3234 if (areBlanks(ctxt, buf, nbchar)) { 3235 if (ctxt->keepBlanks) { 3236 if (ctxt->sax->characters != NULL) 3237 ctxt->sax->characters(ctxt->userData, buf, nbchar); 3238 } else { 3239 if (ctxt->sax->ignorableWhitespace != NULL) 3240 ctxt->sax->ignorableWhitespace(ctxt->userData, 3241 buf, nbchar); 3242 } 3243 } else { 3244 htmlCheckParagraph(ctxt); 3245 if (ctxt->sax->characters != NULL) 3246 ctxt->sax->characters(ctxt->userData, buf, nbchar); 3247 } 3248 } 3249 } else { 3250 /* 3251 * Loop detection 3252 */ 3253 if (cur == 0) 3254 ctxt->instate = XML_PARSER_EOF; 3255 } 3256 } 3257 3258 /** 3259 * htmlParseCharData: 3260 * @ctxt: an HTML parser context 3261 * 3262 * parse a CharData section. 3263 * if we are within a CDATA section ']]>' marks an end of section. 3264 * 3265 * [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*) 3266 */ 3267 3268 static void 3269 htmlParseCharData(htmlParserCtxtPtr ctxt) { 3270 htmlParseCharDataInternal(ctxt, 0); 3271 } 3272 3273 /** 3274 * htmlParseExternalID: 3275 * @ctxt: an HTML parser context 3276 * @publicID: a xmlChar** receiving PubidLiteral 3277 * 3278 * Parse an External ID or a Public ID 3279 * 3280 * [75] ExternalID ::= 'SYSTEM' S SystemLiteral 3281 * | 'PUBLIC' S PubidLiteral S SystemLiteral 3282 * 3283 * [83] PublicID ::= 'PUBLIC' S PubidLiteral 3284 * 3285 * Returns the function returns SystemLiteral and in the second 3286 * case publicID receives PubidLiteral, is strict is off 3287 * it is possible to return NULL and have publicID set. 
3288 */ 3289 3290 static xmlChar * 3291 htmlParseExternalID(htmlParserCtxtPtr ctxt, xmlChar **publicID) { 3292 xmlChar *URI = NULL; 3293 3294 if ((UPPER == 'S') && (UPP(1) == 'Y') && 3295 (UPP(2) == 'S') && (UPP(3) == 'T') && 3296 (UPP(4) == 'E') && (UPP(5) == 'M')) { 3297 SKIP(6); 3298 if (!IS_BLANK_CH(CUR)) { 3299 htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED, 3300 "Space required after 'SYSTEM'\n", NULL, NULL); 3301 } 3302 SKIP_BLANKS; 3303 URI = htmlParseSystemLiteral(ctxt); 3304 if (URI == NULL) { 3305 htmlParseErr(ctxt, XML_ERR_URI_REQUIRED, 3306 "htmlParseExternalID: SYSTEM, no URI\n", NULL, NULL); 3307 } 3308 } else if ((UPPER == 'P') && (UPP(1) == 'U') && 3309 (UPP(2) == 'B') && (UPP(3) == 'L') && 3310 (UPP(4) == 'I') && (UPP(5) == 'C')) { 3311 SKIP(6); 3312 if (!IS_BLANK_CH(CUR)) { 3313 htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED, 3314 "Space required after 'PUBLIC'\n", NULL, NULL); 3315 } 3316 SKIP_BLANKS; 3317 *publicID = htmlParsePubidLiteral(ctxt); 3318 if (*publicID == NULL) { 3319 htmlParseErr(ctxt, XML_ERR_PUBID_REQUIRED, 3320 "htmlParseExternalID: PUBLIC, no Public Identifier\n", 3321 NULL, NULL); 3322 } 3323 SKIP_BLANKS; 3324 if ((CUR == '"') || (CUR == '\'')) { 3325 URI = htmlParseSystemLiteral(ctxt); 3326 } 3327 } 3328 return(URI); 3329 } 3330 3331 /** 3332 * xmlParsePI: 3333 * @ctxt: an XML parser context 3334 * 3335 * parse an XML Processing Instruction. 3336 * 3337 * [16] PI ::= '<?' PITarget (S (Char* - (Char* '?>' Char*)))? '?>' 3338 */ 3339 static void 3340 htmlParsePI(htmlParserCtxtPtr ctxt) { 3341 xmlChar *buf = NULL; 3342 int len = 0; 3343 int size = HTML_PARSER_BUFFER_SIZE; 3344 int cur, l; 3345 const xmlChar *target; 3346 xmlParserInputState state; 3347 int count = 0; 3348 3349 if ((RAW == '<') && (NXT(1) == '?')) { 3350 state = ctxt->instate; 3351 ctxt->instate = XML_PARSER_PI; 3352 /* 3353 * this is a Processing Instruction. 
3354 */ 3355 SKIP(2); 3356 SHRINK; 3357 3358 /* 3359 * Parse the target name and check for special support like 3360 * namespace. 3361 */ 3362 target = htmlParseName(ctxt); 3363 if (target != NULL) { 3364 if (RAW == '>') { 3365 SKIP(1); 3366 3367 /* 3368 * SAX: PI detected. 3369 */ 3370 if ((ctxt->sax) && (!ctxt->disableSAX) && 3371 (ctxt->sax->processingInstruction != NULL)) 3372 ctxt->sax->processingInstruction(ctxt->userData, 3373 target, NULL); 3374 ctxt->instate = state; 3375 return; 3376 } 3377 buf = (xmlChar *) xmlMallocAtomic(size * sizeof(xmlChar)); 3378 if (buf == NULL) { 3379 htmlErrMemory(ctxt, NULL); 3380 ctxt->instate = state; 3381 return; 3382 } 3383 cur = CUR; 3384 if (!IS_BLANK(cur)) { 3385 htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED, 3386 "ParsePI: PI %s space expected\n", target, NULL); 3387 } 3388 SKIP_BLANKS; 3389 cur = CUR_CHAR(l); 3390 while ((cur != 0) && (cur != '>')) { 3391 if (len + 5 >= size) { 3392 xmlChar *tmp; 3393 3394 size *= 2; 3395 tmp = (xmlChar *) xmlRealloc(buf, size * sizeof(xmlChar)); 3396 if (tmp == NULL) { 3397 htmlErrMemory(ctxt, NULL); 3398 xmlFree(buf); 3399 ctxt->instate = state; 3400 return; 3401 } 3402 buf = tmp; 3403 } 3404 count++; 3405 if (count > 50) { 3406 GROW; 3407 count = 0; 3408 } 3409 if (IS_CHAR(cur)) { 3410 COPY_BUF(l,buf,len,cur); 3411 } else { 3412 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR, 3413 "Invalid char in processing instruction " 3414 "0x%X\n", cur); 3415 } 3416 NEXTL(l); 3417 cur = CUR_CHAR(l); 3418 if (cur == 0) { 3419 SHRINK; 3420 GROW; 3421 cur = CUR_CHAR(l); 3422 } 3423 } 3424 buf[len] = 0; 3425 if (cur != '>') { 3426 htmlParseErr(ctxt, XML_ERR_PI_NOT_FINISHED, 3427 "ParsePI: PI %s never end ...\n", target, NULL); 3428 } else { 3429 SKIP(1); 3430 3431 /* 3432 * SAX: PI detected. 
3433 */ 3434 if ((ctxt->sax) && (!ctxt->disableSAX) && 3435 (ctxt->sax->processingInstruction != NULL)) 3436 ctxt->sax->processingInstruction(ctxt->userData, 3437 target, buf); 3438 } 3439 xmlFree(buf); 3440 } else { 3441 htmlParseErr(ctxt, XML_ERR_PI_NOT_STARTED, 3442 "PI is not started correctly", NULL, NULL); 3443 } 3444 ctxt->instate = state; 3445 } 3446 } 3447 3448 /** 3449 * htmlParseComment: 3450 * @ctxt: an HTML parser context 3451 * 3452 * Parse an XML (SGML) comment <!-- .... --> 3453 * 3454 * [15] Comment ::= '<!--' ((Char - '-') | ('-' (Char - '-')))* '-->' 3455 */ 3456 static void 3457 htmlParseComment(htmlParserCtxtPtr ctxt) { 3458 xmlChar *buf = NULL; 3459 int len; 3460 int size = HTML_PARSER_BUFFER_SIZE; 3461 int q, ql; 3462 int r, rl; 3463 int cur, l; 3464 int next, nl; 3465 xmlParserInputState state; 3466 3467 /* 3468 * Check that there is a comment right here. 3469 */ 3470 if ((RAW != '<') || (NXT(1) != '!') || 3471 (NXT(2) != '-') || (NXT(3) != '-')) return; 3472 3473 state = ctxt->instate; 3474 ctxt->instate = XML_PARSER_COMMENT; 3475 SHRINK; 3476 SKIP(4); 3477 buf = (xmlChar *) xmlMallocAtomic(size * sizeof(xmlChar)); 3478 if (buf == NULL) { 3479 htmlErrMemory(ctxt, "buffer allocation failed\n"); 3480 ctxt->instate = state; 3481 return; 3482 } 3483 len = 0; 3484 buf[len] = 0; 3485 q = CUR_CHAR(ql); 3486 if (q == 0) 3487 goto unfinished; 3488 NEXTL(ql); 3489 r = CUR_CHAR(rl); 3490 if (r == 0) 3491 goto unfinished; 3492 NEXTL(rl); 3493 cur = CUR_CHAR(l); 3494 while ((cur != 0) && 3495 ((cur != '>') || 3496 (r != '-') || (q != '-'))) { 3497 NEXTL(l); 3498 next = CUR_CHAR(nl); 3499 if (next == 0) { 3500 SHRINK; 3501 GROW; 3502 next = CUR_CHAR(nl); 3503 } 3504 3505 if ((q == '-') && (r == '-') && (cur == '!') && (next == '>')) { 3506 htmlParseErr(ctxt, XML_ERR_COMMENT_NOT_FINISHED, 3507 "Comment incorrectly closed by '--!>'", NULL, NULL); 3508 cur = '>'; 3509 break; 3510 } 3511 3512 if (len + 5 >= size) { 3513 xmlChar *tmp; 3514 3515 size *= 2; 
3516 tmp = (xmlChar *) xmlRealloc(buf, size * sizeof(xmlChar)); 3517 if (tmp == NULL) { 3518 xmlFree(buf); 3519 htmlErrMemory(ctxt, "growing buffer failed\n"); 3520 ctxt->instate = state; 3521 return; 3522 } 3523 buf = tmp; 3524 } 3525 if (IS_CHAR(q)) { 3526 COPY_BUF(ql,buf,len,q); 3527 } else { 3528 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR, 3529 "Invalid char in comment 0x%X\n", q); 3530 } 3531 3532 q = r; 3533 ql = rl; 3534 r = cur; 3535 rl = l; 3536 cur = next; 3537 l = nl; 3538 } 3539 buf[len] = 0; 3540 if (cur == '>') { 3541 NEXT; 3542 if ((ctxt->sax != NULL) && (ctxt->sax->comment != NULL) && 3543 (!ctxt->disableSAX)) 3544 ctxt->sax->comment(ctxt->userData, buf); 3545 xmlFree(buf); 3546 ctxt->instate = state; 3547 return; 3548 } 3549 3550 unfinished: 3551 htmlParseErr(ctxt, XML_ERR_COMMENT_NOT_FINISHED, 3552 "Comment not terminated \n<!--%.50s\n", buf, NULL); 3553 xmlFree(buf); 3554 } 3555 3556 /** 3557 * htmlParseCharRef: 3558 * @ctxt: an HTML parser context 3559 * 3560 * parse Reference declarations 3561 * 3562 * [66] CharRef ::= '&#' [0-9]+ ';' | 3563 * '&#x' [0-9a-fA-F]+ ';' 3564 * 3565 * Returns the value parsed (as an int) 3566 */ 3567 int 3568 htmlParseCharRef(htmlParserCtxtPtr ctxt) { 3569 int val = 0; 3570 3571 if ((ctxt == NULL) || (ctxt->input == NULL)) { 3572 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR, 3573 "htmlParseCharRef: context error\n", 3574 NULL, NULL); 3575 return(0); 3576 } 3577 if ((CUR == '&') && (NXT(1) == '#') && 3578 ((NXT(2) == 'x') || NXT(2) == 'X')) { 3579 SKIP(3); 3580 while (CUR != ';') { 3581 if ((CUR >= '0') && (CUR <= '9')) { 3582 if (val < 0x110000) 3583 val = val * 16 + (CUR - '0'); 3584 } else if ((CUR >= 'a') && (CUR <= 'f')) { 3585 if (val < 0x110000) 3586 val = val * 16 + (CUR - 'a') + 10; 3587 } else if ((CUR >= 'A') && (CUR <= 'F')) { 3588 if (val < 0x110000) 3589 val = val * 16 + (CUR - 'A') + 10; 3590 } else { 3591 htmlParseErr(ctxt, XML_ERR_INVALID_HEX_CHARREF, 3592 "htmlParseCharRef: missing semicolon\n", 3593 
NULL, NULL); 3594 break; 3595 } 3596 NEXT; 3597 } 3598 if (CUR == ';') 3599 NEXT; 3600 } else if ((CUR == '&') && (NXT(1) == '#')) { 3601 SKIP(2); 3602 while (CUR != ';') { 3603 if ((CUR >= '0') && (CUR <= '9')) { 3604 if (val < 0x110000) 3605 val = val * 10 + (CUR - '0'); 3606 } else { 3607 htmlParseErr(ctxt, XML_ERR_INVALID_DEC_CHARREF, 3608 "htmlParseCharRef: missing semicolon\n", 3609 NULL, NULL); 3610 break; 3611 } 3612 NEXT; 3613 } 3614 if (CUR == ';') 3615 NEXT; 3616 } else { 3617 htmlParseErr(ctxt, XML_ERR_INVALID_CHARREF, 3618 "htmlParseCharRef: invalid value\n", NULL, NULL); 3619 } 3620 /* 3621 * Check the value IS_CHAR ... 3622 */ 3623 if (IS_CHAR(val)) { 3624 return(val); 3625 } else if (val >= 0x110000) { 3626 htmlParseErr(ctxt, XML_ERR_INVALID_CHAR, 3627 "htmlParseCharRef: value too large\n", NULL, NULL); 3628 } else { 3629 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR, 3630 "htmlParseCharRef: invalid xmlChar value %d\n", 3631 val); 3632 } 3633 return(0); 3634 } 3635 3636 3637 /** 3638 * htmlParseDocTypeDecl: 3639 * @ctxt: an HTML parser context 3640 * 3641 * parse a DOCTYPE declaration 3642 * 3643 * [28] doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S? 3644 * ('[' (markupdecl | PEReference | S)* ']' S?)? '>' 3645 */ 3646 3647 static void 3648 htmlParseDocTypeDecl(htmlParserCtxtPtr ctxt) { 3649 const xmlChar *name; 3650 xmlChar *ExternalID = NULL; 3651 xmlChar *URI = NULL; 3652 3653 /* 3654 * We know that '<!DOCTYPE' has been detected. 3655 */ 3656 SKIP(9); 3657 3658 SKIP_BLANKS; 3659 3660 /* 3661 * Parse the DOCTYPE name. 3662 */ 3663 name = htmlParseName(ctxt); 3664 if (name == NULL) { 3665 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED, 3666 "htmlParseDocTypeDecl : no DOCTYPE name !\n", 3667 NULL, NULL); 3668 } 3669 /* 3670 * Check that upper(name) == "HTML" !!!!!!!!!!!!! 
3671 */ 3672 3673 SKIP_BLANKS; 3674 3675 /* 3676 * Check for SystemID and ExternalID 3677 */ 3678 URI = htmlParseExternalID(ctxt, &ExternalID); 3679 SKIP_BLANKS; 3680 3681 /* 3682 * We should be at the end of the DOCTYPE declaration. 3683 */ 3684 if (CUR != '>') { 3685 htmlParseErr(ctxt, XML_ERR_DOCTYPE_NOT_FINISHED, 3686 "DOCTYPE improperly terminated\n", NULL, NULL); 3687 /* Ignore bogus content */ 3688 while ((CUR != 0) && (CUR != '>')) 3689 NEXT; 3690 } 3691 if (CUR == '>') 3692 NEXT; 3693 3694 /* 3695 * Create or update the document accordingly to the DOCTYPE 3696 */ 3697 if ((ctxt->sax != NULL) && (ctxt->sax->internalSubset != NULL) && 3698 (!ctxt->disableSAX)) 3699 ctxt->sax->internalSubset(ctxt->userData, name, ExternalID, URI); 3700 3701 /* 3702 * Cleanup, since we don't use all those identifiers 3703 */ 3704 if (URI != NULL) xmlFree(URI); 3705 if (ExternalID != NULL) xmlFree(ExternalID); 3706 } 3707 3708 /** 3709 * htmlParseAttribute: 3710 * @ctxt: an HTML parser context 3711 * @value: a xmlChar ** used to store the value of the attribute 3712 * 3713 * parse an attribute 3714 * 3715 * [41] Attribute ::= Name Eq AttValue 3716 * 3717 * [25] Eq ::= S? '=' S? 3718 * 3719 * With namespace: 3720 * 3721 * [NS 11] Attribute ::= QName Eq AttValue 3722 * 3723 * Also the case QName == xmlns:??? is handled independently as a namespace 3724 * definition. 3725 * 3726 * Returns the attribute name, and the value in *value. 
3727 */ 3728 3729 static const xmlChar * 3730 htmlParseAttribute(htmlParserCtxtPtr ctxt, xmlChar **value) { 3731 const xmlChar *name; 3732 xmlChar *val = NULL; 3733 3734 *value = NULL; 3735 name = htmlParseHTMLName(ctxt); 3736 if (name == NULL) { 3737 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED, 3738 "error parsing attribute name\n", NULL, NULL); 3739 return(NULL); 3740 } 3741 3742 /* 3743 * read the value 3744 */ 3745 SKIP_BLANKS; 3746 if (CUR == '=') { 3747 NEXT; 3748 SKIP_BLANKS; 3749 val = htmlParseAttValue(ctxt); 3750 } 3751 3752 *value = val; 3753 return(name); 3754 } 3755 3756 /** 3757 * htmlCheckEncodingDirect: 3758 * @ctxt: an HTML parser context 3759 * @attvalue: the attribute value 3760 * 3761 * Checks an attribute value to detect 3762 * the encoding 3763 * If a new encoding is detected the parser is switched to decode 3764 * it and pass UTF8 3765 */ 3766 static void 3767 htmlCheckEncodingDirect(htmlParserCtxtPtr ctxt, const xmlChar *encoding) { 3768 3769 if ((ctxt == NULL) || (encoding == NULL) || 3770 (ctxt->options & HTML_PARSE_IGNORE_ENC)) 3771 return; 3772 3773 /* do not change encoding */ 3774 if (ctxt->input->encoding != NULL) 3775 return; 3776 3777 if (encoding != NULL) { 3778 xmlCharEncoding enc; 3779 xmlCharEncodingHandlerPtr handler; 3780 3781 while ((*encoding == ' ') || (*encoding == '\t')) encoding++; 3782 3783 if (ctxt->input->encoding != NULL) 3784 xmlFree((xmlChar *) ctxt->input->encoding); 3785 ctxt->input->encoding = xmlStrdup(encoding); 3786 3787 enc = xmlParseCharEncoding((const char *) encoding); 3788 /* 3789 * registered set of known encodings 3790 */ 3791 if (enc != XML_CHAR_ENCODING_ERROR) { 3792 if (((enc == XML_CHAR_ENCODING_UTF16LE) || 3793 (enc == XML_CHAR_ENCODING_UTF16BE) || 3794 (enc == XML_CHAR_ENCODING_UCS4LE) || 3795 (enc == XML_CHAR_ENCODING_UCS4BE)) && 3796 (ctxt->input->buf != NULL) && 3797 (ctxt->input->buf->encoder == NULL)) { 3798 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING, 3799 "htmlCheckEncoding: wrong encoding 
meta\n", 3800 NULL, NULL); 3801 } else { 3802 xmlSwitchEncoding(ctxt, enc); 3803 } 3804 ctxt->charset = XML_CHAR_ENCODING_UTF8; 3805 } else { 3806 /* 3807 * fallback for unknown encodings 3808 */ 3809 handler = xmlFindCharEncodingHandler((const char *) encoding); 3810 if (handler != NULL) { 3811 xmlSwitchToEncoding(ctxt, handler); 3812 ctxt->charset = XML_CHAR_ENCODING_UTF8; 3813 } else { 3814 htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING, 3815 "htmlCheckEncoding: unknown encoding %s\n", 3816 encoding, NULL); 3817 } 3818 } 3819 3820 if ((ctxt->input->buf != NULL) && 3821 (ctxt->input->buf->encoder != NULL) && 3822 (ctxt->input->buf->raw != NULL) && 3823 (ctxt->input->buf->buffer != NULL)) { 3824 int nbchars; 3825 int processed; 3826 3827 /* 3828 * convert as much as possible to the parser reading buffer. 3829 */ 3830 processed = ctxt->input->cur - ctxt->input->base; 3831 xmlBufShrink(ctxt->input->buf->buffer, processed); 3832 nbchars = xmlCharEncInput(ctxt->input->buf, 1); 3833 xmlBufResetInput(ctxt->input->buf->buffer, ctxt->input); 3834 if (nbchars < 0) { 3835 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING, 3836 "htmlCheckEncoding: encoder error\n", 3837 NULL, NULL); 3838 } 3839 } 3840 } 3841 } 3842 3843 /** 3844 * htmlCheckEncoding: 3845 * @ctxt: an HTML parser context 3846 * @attvalue: the attribute value 3847 * 3848 * Checks an http-equiv attribute from a Meta tag to detect 3849 * the encoding 3850 * If a new encoding is detected the parser is switched to decode 3851 * it and pass UTF8 3852 */ 3853 static void 3854 htmlCheckEncoding(htmlParserCtxtPtr ctxt, const xmlChar *attvalue) { 3855 const xmlChar *encoding; 3856 3857 if (!attvalue) 3858 return; 3859 3860 encoding = xmlStrcasestr(attvalue, BAD_CAST"charset"); 3861 if (encoding != NULL) { 3862 encoding += 7; 3863 } 3864 /* 3865 * skip blank 3866 */ 3867 if (encoding && IS_BLANK_CH(*encoding)) 3868 encoding = xmlStrcasestr(attvalue, BAD_CAST"="); 3869 if (encoding && *encoding == '=') { 3870 encoding ++; 3871 
htmlCheckEncodingDirect(ctxt, encoding); 3872 } 3873 } 3874 3875 /** 3876 * htmlCheckMeta: 3877 * @ctxt: an HTML parser context 3878 * @atts: the attributes values 3879 * 3880 * Checks an attributes from a Meta tag 3881 */ 3882 static void 3883 htmlCheckMeta(htmlParserCtxtPtr ctxt, const xmlChar **atts) { 3884 int i; 3885 const xmlChar *att, *value; 3886 int http = 0; 3887 const xmlChar *content = NULL; 3888 3889 if ((ctxt == NULL) || (atts == NULL)) 3890 return; 3891 3892 i = 0; 3893 att = atts[i++]; 3894 while (att != NULL) { 3895 value = atts[i++]; 3896 if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"http-equiv")) 3897 && (!xmlStrcasecmp(value, BAD_CAST"Content-Type"))) 3898 http = 1; 3899 else if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"charset"))) 3900 htmlCheckEncodingDirect(ctxt, value); 3901 else if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"content"))) 3902 content = value; 3903 att = atts[i++]; 3904 } 3905 if ((http) && (content != NULL)) 3906 htmlCheckEncoding(ctxt, content); 3907 3908 } 3909 3910 /** 3911 * htmlParseStartTag: 3912 * @ctxt: an HTML parser context 3913 * 3914 * parse a start of tag either for rule element or 3915 * EmptyElement. In both case we don't parse the tag closing chars. 3916 * 3917 * [40] STag ::= '<' Name (S Attribute)* S? '>' 3918 * 3919 * [44] EmptyElemTag ::= '<' Name (S Attribute)* S? '/>' 3920 * 3921 * With namespace: 3922 * 3923 * [NS 8] STag ::= '<' QName (S Attribute)* S? '>' 3924 * 3925 * [NS 10] EmptyElement ::= '<' QName (S Attribute)* S? 
'/>' 3926 * 3927 * Returns 0 in case of success, -1 in case of error and 1 if discarded 3928 */ 3929 3930 static int 3931 htmlParseStartTag(htmlParserCtxtPtr ctxt) { 3932 const xmlChar *name; 3933 const xmlChar *attname; 3934 xmlChar *attvalue; 3935 const xmlChar **atts; 3936 int nbatts = 0; 3937 int maxatts; 3938 int meta = 0; 3939 int i; 3940 int discardtag = 0; 3941 3942 if ((ctxt == NULL) || (ctxt->input == NULL)) { 3943 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR, 3944 "htmlParseStartTag: context error\n", NULL, NULL); 3945 return -1; 3946 } 3947 if (ctxt->instate == XML_PARSER_EOF) 3948 return(-1); 3949 if (CUR != '<') return -1; 3950 NEXT; 3951 3952 atts = ctxt->atts; 3953 maxatts = ctxt->maxatts; 3954 3955 GROW; 3956 name = htmlParseHTMLName(ctxt); 3957 if (name == NULL) { 3958 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED, 3959 "htmlParseStartTag: invalid element name\n", 3960 NULL, NULL); 3961 /* if recover preserve text on classic misconstructs */ 3962 if ((ctxt->recovery) && ((IS_BLANK_CH(CUR)) || (CUR == '<') || 3963 (CUR == '=') || (CUR == '>') || (((CUR >= '0') && (CUR <= '9'))))) { 3964 htmlParseCharDataInternal(ctxt, '<'); 3965 return(-1); 3966 } 3967 3968 3969 /* Dump the bogus tag like browsers do */ 3970 while ((CUR != 0) && (CUR != '>') && 3971 (ctxt->instate != XML_PARSER_EOF)) 3972 NEXT; 3973 return -1; 3974 } 3975 if (xmlStrEqual(name, BAD_CAST"meta")) 3976 meta = 1; 3977 3978 /* 3979 * Check for auto-closure of HTML elements. 3980 */ 3981 htmlAutoClose(ctxt, name); 3982 3983 /* 3984 * Check for implied HTML elements. 
3985 */ 3986 htmlCheckImplied(ctxt, name); 3987 3988 /* 3989 * Avoid html at any level > 0, head at any level != 1 3990 * or any attempt to recurse body 3991 */ 3992 if ((ctxt->nameNr > 0) && (xmlStrEqual(name, BAD_CAST"html"))) { 3993 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR, 3994 "htmlParseStartTag: misplaced <html> tag\n", 3995 name, NULL); 3996 discardtag = 1; 3997 ctxt->depth++; 3998 } 3999 if ((ctxt->nameNr != 1) && 4000 (xmlStrEqual(name, BAD_CAST"head"))) { 4001 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR, 4002 "htmlParseStartTag: misplaced <head> tag\n", 4003 name, NULL); 4004 discardtag = 1; 4005 ctxt->depth++; 4006 } 4007 if (xmlStrEqual(name, BAD_CAST"body")) { 4008 int indx; 4009 for (indx = 0;indx < ctxt->nameNr;indx++) { 4010 if (xmlStrEqual(ctxt->nameTab[indx], BAD_CAST"body")) { 4011 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR, 4012 "htmlParseStartTag: misplaced <body> tag\n", 4013 name, NULL); 4014 discardtag = 1; 4015 ctxt->depth++; 4016 } 4017 } 4018 } 4019 4020 /* 4021 * Now parse the attributes, it ends up with the ending 4022 * 4023 * (S Attribute)* S? 
4024 */ 4025 SKIP_BLANKS; 4026 while ((CUR != 0) && 4027 (CUR != '>') && 4028 ((CUR != '/') || (NXT(1) != '>'))) { 4029 GROW; 4030 attname = htmlParseAttribute(ctxt, &attvalue); 4031 if (attname != NULL) { 4032 4033 /* 4034 * Well formedness requires at most one declaration of an attribute 4035 */ 4036 for (i = 0; i < nbatts;i += 2) { 4037 if (xmlStrEqual(atts[i], attname)) { 4038 htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_REDEFINED, 4039 "Attribute %s redefined\n", attname, NULL); 4040 if (attvalue != NULL) 4041 xmlFree(attvalue); 4042 goto failed; 4043 } 4044 } 4045 4046 /* 4047 * Add the pair to atts 4048 */ 4049 if (atts == NULL) { 4050 maxatts = 22; /* allow for 10 attrs by default */ 4051 atts = (const xmlChar **) 4052 xmlMalloc(maxatts * sizeof(xmlChar *)); 4053 if (atts == NULL) { 4054 htmlErrMemory(ctxt, NULL); 4055 if (attvalue != NULL) 4056 xmlFree(attvalue); 4057 goto failed; 4058 } 4059 ctxt->atts = atts; 4060 ctxt->maxatts = maxatts; 4061 } else if (nbatts + 4 > maxatts) { 4062 const xmlChar **n; 4063 4064 maxatts *= 2; 4065 n = (const xmlChar **) xmlRealloc((void *) atts, 4066 maxatts * sizeof(const xmlChar *)); 4067 if (n == NULL) { 4068 htmlErrMemory(ctxt, NULL); 4069 if (attvalue != NULL) 4070 xmlFree(attvalue); 4071 goto failed; 4072 } 4073 atts = n; 4074 ctxt->atts = atts; 4075 ctxt->maxatts = maxatts; 4076 } 4077 atts[nbatts++] = attname; 4078 atts[nbatts++] = attvalue; 4079 atts[nbatts] = NULL; 4080 atts[nbatts + 1] = NULL; 4081 } 4082 else { 4083 if (attvalue != NULL) 4084 xmlFree(attvalue); 4085 /* Dump the bogus attribute string up to the next blank or 4086 * the end of the tag. */ 4087 while ((CUR != 0) && 4088 !(IS_BLANK_CH(CUR)) && (CUR != '>') && 4089 ((CUR != '/') || (NXT(1) != '>'))) 4090 NEXT; 4091 } 4092 4093 failed: 4094 SKIP_BLANKS; 4095 } 4096 4097 /* 4098 * Handle specific association to the META tag 4099 */ 4100 if (meta && (nbatts != 0)) 4101 htmlCheckMeta(ctxt, atts); 4102 4103 /* 4104 * SAX: Start of Element ! 
4105 */ 4106 if (!discardtag) { 4107 htmlnamePush(ctxt, name); 4108 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL)) { 4109 if (nbatts != 0) 4110 ctxt->sax->startElement(ctxt->userData, name, atts); 4111 else 4112 ctxt->sax->startElement(ctxt->userData, name, NULL); 4113 } 4114 } 4115 4116 if (atts != NULL) { 4117 for (i = 1;i < nbatts;i += 2) { 4118 if (atts[i] != NULL) 4119 xmlFree((xmlChar *) atts[i]); 4120 } 4121 } 4122 4123 return(discardtag); 4124 } 4125 4126 /** 4127 * htmlParseEndTag: 4128 * @ctxt: an HTML parser context 4129 * 4130 * parse an end of tag 4131 * 4132 * [42] ETag ::= '</' Name S? '>' 4133 * 4134 * With namespace 4135 * 4136 * [NS 9] ETag ::= '</' QName S? '>' 4137 * 4138 * Returns 1 if the current level should be closed. 4139 */ 4140 4141 static int 4142 htmlParseEndTag(htmlParserCtxtPtr ctxt) 4143 { 4144 const xmlChar *name; 4145 const xmlChar *oldname; 4146 int i, ret; 4147 4148 if ((CUR != '<') || (NXT(1) != '/')) { 4149 htmlParseErr(ctxt, XML_ERR_LTSLASH_REQUIRED, 4150 "htmlParseEndTag: '</' not found\n", NULL, NULL); 4151 return (0); 4152 } 4153 SKIP(2); 4154 4155 name = htmlParseHTMLName(ctxt); 4156 if (name == NULL) 4157 return (0); 4158 /* 4159 * We should definitely be at the ending "S? '>'" part 4160 */ 4161 SKIP_BLANKS; 4162 if (CUR != '>') { 4163 htmlParseErr(ctxt, XML_ERR_GT_REQUIRED, 4164 "End tag : expected '>'\n", NULL, NULL); 4165 /* Skip to next '>' */ 4166 while ((CUR != 0) && (CUR != '>')) 4167 NEXT; 4168 } 4169 if (CUR == '>') 4170 NEXT; 4171 4172 /* 4173 * if we ignored misplaced tags in htmlParseStartTag don't pop them 4174 * out now. 4175 */ 4176 if ((ctxt->depth > 0) && 4177 (xmlStrEqual(name, BAD_CAST "html") || 4178 xmlStrEqual(name, BAD_CAST "body") || 4179 xmlStrEqual(name, BAD_CAST "head"))) { 4180 ctxt->depth--; 4181 return (0); 4182 } 4183 4184 /* 4185 * If the name read is not one of the element in the parsing stack 4186 * then return, it's just an error. 
4187 */ 4188 for (i = (ctxt->nameNr - 1); i >= 0; i--) { 4189 if (xmlStrEqual(name, ctxt->nameTab[i])) 4190 break; 4191 } 4192 if (i < 0) { 4193 htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH, 4194 "Unexpected end tag : %s\n", name, NULL); 4195 return (0); 4196 } 4197 4198 4199 /* 4200 * Check for auto-closure of HTML elements. 4201 */ 4202 4203 htmlAutoCloseOnClose(ctxt, name); 4204 4205 /* 4206 * Well formedness constraints, opening and closing must match. 4207 * With the exception that the autoclose may have popped stuff out 4208 * of the stack. 4209 */ 4210 if ((ctxt->name != NULL) && (!xmlStrEqual(ctxt->name, name))) { 4211 htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH, 4212 "Opening and ending tag mismatch: %s and %s\n", 4213 name, ctxt->name); 4214 } 4215 4216 /* 4217 * SAX: End of Tag 4218 */ 4219 oldname = ctxt->name; 4220 if ((oldname != NULL) && (xmlStrEqual(oldname, name))) { 4221 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL)) 4222 ctxt->sax->endElement(ctxt->userData, name); 4223 htmlNodeInfoPop(ctxt); 4224 htmlnamePop(ctxt); 4225 ret = 1; 4226 } else { 4227 ret = 0; 4228 } 4229 4230 return (ret); 4231 } 4232 4233 4234 /** 4235 * htmlParseReference: 4236 * @ctxt: an HTML parser context 4237 * 4238 * parse and handle entity references in content, 4239 * this will end-up in a call to character() since this is either a 4240 * CharRef, or a predefined entity. 
4241 */ 4242 static void 4243 htmlParseReference(htmlParserCtxtPtr ctxt) { 4244 const htmlEntityDesc * ent; 4245 xmlChar out[6]; 4246 const xmlChar *name; 4247 if (CUR != '&') return; 4248 4249 if (NXT(1) == '#') { 4250 unsigned int c; 4251 int bits, i = 0; 4252 4253 c = htmlParseCharRef(ctxt); 4254 if (c == 0) 4255 return; 4256 4257 if (c < 0x80) { out[i++]= c; bits= -6; } 4258 else if (c < 0x800) { out[i++]=((c >> 6) & 0x1F) | 0xC0; bits= 0; } 4259 else if (c < 0x10000) { out[i++]=((c >> 12) & 0x0F) | 0xE0; bits= 6; } 4260 else { out[i++]=((c >> 18) & 0x07) | 0xF0; bits= 12; } 4261 4262 for ( ; bits >= 0; bits-= 6) { 4263 out[i++]= ((c >> bits) & 0x3F) | 0x80; 4264 } 4265 out[i] = 0; 4266 4267 htmlCheckParagraph(ctxt); 4268 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL)) 4269 ctxt->sax->characters(ctxt->userData, out, i); 4270 } else { 4271 ent = htmlParseEntityRef(ctxt, &name); 4272 if (name == NULL) { 4273 htmlCheckParagraph(ctxt); 4274 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL)) 4275 ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1); 4276 return; 4277 } 4278 if ((ent == NULL) || !(ent->value > 0)) { 4279 htmlCheckParagraph(ctxt); 4280 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL)) { 4281 ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1); 4282 ctxt->sax->characters(ctxt->userData, name, xmlStrlen(name)); 4283 /* ctxt->sax->characters(ctxt->userData, BAD_CAST ";", 1); */ 4284 } 4285 } else { 4286 unsigned int c; 4287 int bits, i = 0; 4288 4289 c = ent->value; 4290 if (c < 0x80) 4291 { out[i++]= c; bits= -6; } 4292 else if (c < 0x800) 4293 { out[i++]=((c >> 6) & 0x1F) | 0xC0; bits= 0; } 4294 else if (c < 0x10000) 4295 { out[i++]=((c >> 12) & 0x0F) | 0xE0; bits= 6; } 4296 else 4297 { out[i++]=((c >> 18) & 0x07) | 0xF0; bits= 12; } 4298 4299 for ( ; bits >= 0; bits-= 6) { 4300 out[i++]= ((c >> bits) & 0x3F) | 0x80; 4301 } 4302 out[i] = 0; 4303 4304 htmlCheckParagraph(ctxt); 4305 if ((ctxt->sax != NULL) && 
(ctxt->sax->characters != NULL)) 4306 ctxt->sax->characters(ctxt->userData, out, i); 4307 } 4308 } 4309 } 4310 4311 /** 4312 * htmlParseContent: 4313 * @ctxt: an HTML parser context 4314 * 4315 * Parse a content: comment, sub-element, reference or text. 4316 * Kept for compatibility with old code 4317 */ 4318 4319 static void 4320 htmlParseContent(htmlParserCtxtPtr ctxt) { 4321 xmlChar *currentNode; 4322 int depth; 4323 const xmlChar *name; 4324 4325 currentNode = xmlStrdup(ctxt->name); 4326 depth = ctxt->nameNr; 4327 while (1) { 4328 GROW; 4329 4330 if (ctxt->instate == XML_PARSER_EOF) 4331 break; 4332 4333 /* 4334 * Our tag or one of it's parent or children is ending. 4335 */ 4336 if ((CUR == '<') && (NXT(1) == '/')) { 4337 if (htmlParseEndTag(ctxt) && 4338 ((currentNode != NULL) || (ctxt->nameNr == 0))) { 4339 if (currentNode != NULL) 4340 xmlFree(currentNode); 4341 return; 4342 } 4343 continue; /* while */ 4344 } 4345 4346 else if ((CUR == '<') && 4347 ((IS_ASCII_LETTER(NXT(1))) || 4348 (NXT(1) == '_') || (NXT(1) == ':'))) { 4349 name = htmlParseHTMLName_nonInvasive(ctxt); 4350 if (name == NULL) { 4351 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED, 4352 "htmlParseStartTag: invalid element name\n", 4353 NULL, NULL); 4354 /* Dump the bogus tag like browsers do */ 4355 while ((CUR != 0) && (CUR != '>')) 4356 NEXT; 4357 4358 if (currentNode != NULL) 4359 xmlFree(currentNode); 4360 return; 4361 } 4362 4363 if (ctxt->name != NULL) { 4364 if (htmlCheckAutoClose(name, ctxt->name) == 1) { 4365 htmlAutoClose(ctxt, name); 4366 continue; 4367 } 4368 } 4369 } 4370 4371 /* 4372 * Has this node been popped out during parsing of 4373 * the next element 4374 */ 4375 if ((ctxt->nameNr > 0) && (depth >= ctxt->nameNr) && 4376 (!xmlStrEqual(currentNode, ctxt->name))) 4377 { 4378 if (currentNode != NULL) xmlFree(currentNode); 4379 return; 4380 } 4381 4382 if ((CUR != 0) && ((xmlStrEqual(currentNode, BAD_CAST"script")) || 4383 (xmlStrEqual(currentNode, BAD_CAST"style")))) { 4384 /* 4385 * 
Handle SCRIPT/STYLE separately 4386 */ 4387 htmlParseScript(ctxt); 4388 } else { 4389 /* 4390 * Sometimes DOCTYPE arrives in the middle of the document 4391 */ 4392 if ((CUR == '<') && (NXT(1) == '!') && 4393 (UPP(2) == 'D') && (UPP(3) == 'O') && 4394 (UPP(4) == 'C') && (UPP(5) == 'T') && 4395 (UPP(6) == 'Y') && (UPP(7) == 'P') && 4396 (UPP(8) == 'E')) { 4397 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR, 4398 "Misplaced DOCTYPE declaration\n", 4399 BAD_CAST "DOCTYPE" , NULL); 4400 htmlParseDocTypeDecl(ctxt); 4401 } 4402 4403 /* 4404 * First case : a comment 4405 */ 4406 if ((CUR == '<') && (NXT(1) == '!') && 4407 (NXT(2) == '-') && (NXT(3) == '-')) { 4408 htmlParseComment(ctxt); 4409 } 4410 4411 /* 4412 * Second case : a Processing Instruction. 4413 */ 4414 else if ((CUR == '<') && (NXT(1) == '?')) { 4415 htmlParsePI(ctxt); 4416 } 4417 4418 /* 4419 * Third case : a sub-element. 4420 */ 4421 else if (CUR == '<') { 4422 htmlParseElement(ctxt); 4423 } 4424 4425 /* 4426 * Fourth case : a reference. If if has not been resolved, 4427 * parsing returns it's Name, create the node 4428 */ 4429 else if (CUR == '&') { 4430 htmlParseReference(ctxt); 4431 } 4432 4433 /* 4434 * Fifth case : end of the resource 4435 */ 4436 else if (CUR == 0) { 4437 htmlAutoCloseOnEnd(ctxt); 4438 break; 4439 } 4440 4441 /* 4442 * Last case, text. Note that References are handled directly. 
4443 */ 4444 else { 4445 htmlParseCharData(ctxt); 4446 } 4447 } 4448 GROW; 4449 } 4450 if (currentNode != NULL) xmlFree(currentNode); 4451 } 4452 4453 /** 4454 * htmlParseElement: 4455 * @ctxt: an HTML parser context 4456 * 4457 * parse an HTML element, this is highly recursive 4458 * this is kept for compatibility with previous code versions 4459 * 4460 * [39] element ::= EmptyElemTag | STag content ETag 4461 * 4462 * [41] Attribute ::= Name Eq AttValue 4463 */ 4464 4465 void 4466 htmlParseElement(htmlParserCtxtPtr ctxt) { 4467 const xmlChar *name; 4468 xmlChar *currentNode = NULL; 4469 const htmlElemDesc * info; 4470 htmlParserNodeInfo node_info; 4471 int failed; 4472 int depth; 4473 const xmlChar *oldptr; 4474 4475 if ((ctxt == NULL) || (ctxt->input == NULL)) { 4476 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR, 4477 "htmlParseElement: context error\n", NULL, NULL); 4478 return; 4479 } 4480 4481 if (ctxt->instate == XML_PARSER_EOF) 4482 return; 4483 4484 /* Capture start position */ 4485 if (ctxt->record_info) { 4486 node_info.begin_pos = ctxt->input->consumed + 4487 (CUR_PTR - ctxt->input->base); 4488 node_info.begin_line = ctxt->input->line; 4489 } 4490 4491 failed = htmlParseStartTag(ctxt); 4492 name = ctxt->name; 4493 if ((failed == -1) || (name == NULL)) { 4494 if (CUR == '>') 4495 NEXT; 4496 return; 4497 } 4498 4499 /* 4500 * Lookup the info for that element. 
4501 */ 4502 info = htmlTagLookup(name); 4503 if (info == NULL) { 4504 htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG, 4505 "Tag %s invalid\n", name, NULL); 4506 } 4507 4508 /* 4509 * Check for an Empty Element labeled the XML/SGML way 4510 */ 4511 if ((CUR == '/') && (NXT(1) == '>')) { 4512 SKIP(2); 4513 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL)) 4514 ctxt->sax->endElement(ctxt->userData, name); 4515 htmlnamePop(ctxt); 4516 return; 4517 } 4518 4519 if (CUR == '>') { 4520 NEXT; 4521 } else { 4522 htmlParseErr(ctxt, XML_ERR_GT_REQUIRED, 4523 "Couldn't find end of Start Tag %s\n", name, NULL); 4524 4525 /* 4526 * end of parsing of this node. 4527 */ 4528 if (xmlStrEqual(name, ctxt->name)) { 4529 nodePop(ctxt); 4530 htmlnamePop(ctxt); 4531 } 4532 4533 /* 4534 * Capture end position and add node 4535 */ 4536 if (ctxt->record_info) { 4537 node_info.end_pos = ctxt->input->consumed + 4538 (CUR_PTR - ctxt->input->base); 4539 node_info.end_line = ctxt->input->line; 4540 node_info.node = ctxt->node; 4541 xmlParserAddNodeInfo(ctxt, &node_info); 4542 } 4543 return; 4544 } 4545 4546 /* 4547 * Check for an Empty Element from DTD definition 4548 */ 4549 if ((info != NULL) && (info->empty)) { 4550 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL)) 4551 ctxt->sax->endElement(ctxt->userData, name); 4552 htmlnamePop(ctxt); 4553 return; 4554 } 4555 4556 /* 4557 * Parse the content of the element: 4558 */ 4559 currentNode = xmlStrdup(ctxt->name); 4560 depth = ctxt->nameNr; 4561 while (CUR != 0) { 4562 oldptr = ctxt->input->cur; 4563 htmlParseContent(ctxt); 4564 if (oldptr==ctxt->input->cur) break; 4565 if (ctxt->nameNr < depth) break; 4566 } 4567 4568 /* 4569 * Capture end position and add node 4570 */ 4571 if ( currentNode != NULL && ctxt->record_info ) { 4572 node_info.end_pos = ctxt->input->consumed + 4573 (CUR_PTR - ctxt->input->base); 4574 node_info.end_line = ctxt->input->line; 4575 node_info.node = ctxt->node; 4576 xmlParserAddNodeInfo(ctxt, &node_info); 4577 
} 4578 if (CUR == 0) { 4579 htmlAutoCloseOnEnd(ctxt); 4580 } 4581 4582 if (currentNode != NULL) 4583 xmlFree(currentNode); 4584 } 4585 4586 static void 4587 htmlParserFinishElementParsing(htmlParserCtxtPtr ctxt) { 4588 /* 4589 * Capture end position and add node 4590 */ 4591 if ( ctxt->node != NULL && ctxt->record_info ) { 4592 ctxt->nodeInfo->end_pos = ctxt->input->consumed + 4593 (CUR_PTR - ctxt->input->base); 4594 ctxt->nodeInfo->end_line = ctxt->input->line; 4595 ctxt->nodeInfo->node = ctxt->node; 4596 xmlParserAddNodeInfo(ctxt, ctxt->nodeInfo); 4597 htmlNodeInfoPop(ctxt); 4598 } 4599 if (CUR == 0) { 4600 htmlAutoCloseOnEnd(ctxt); 4601 } 4602 } 4603 4604 /** 4605 * htmlParseElementInternal: 4606 * @ctxt: an HTML parser context 4607 * 4608 * parse an HTML element, new version, non recursive 4609 * 4610 * [39] element ::= EmptyElemTag | STag content ETag 4611 * 4612 * [41] Attribute ::= Name Eq AttValue 4613 */ 4614 4615 static void 4616 htmlParseElementInternal(htmlParserCtxtPtr ctxt) { 4617 const xmlChar *name; 4618 const htmlElemDesc * info; 4619 htmlParserNodeInfo node_info = { NULL, 0, 0, 0, 0 }; 4620 int failed; 4621 4622 if ((ctxt == NULL) || (ctxt->input == NULL)) { 4623 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR, 4624 "htmlParseElementInternal: context error\n", NULL, NULL); 4625 return; 4626 } 4627 4628 if (ctxt->instate == XML_PARSER_EOF) 4629 return; 4630 4631 /* Capture start position */ 4632 if (ctxt->record_info) { 4633 node_info.begin_pos = ctxt->input->consumed + 4634 (CUR_PTR - ctxt->input->base); 4635 node_info.begin_line = ctxt->input->line; 4636 } 4637 4638 failed = htmlParseStartTag(ctxt); 4639 name = ctxt->name; 4640 if ((failed == -1) || (name == NULL)) { 4641 if (CUR == '>') 4642 NEXT; 4643 return; 4644 } 4645 4646 /* 4647 * Lookup the info for that element. 
4648 */ 4649 info = htmlTagLookup(name); 4650 if (info == NULL) { 4651 htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG, 4652 "Tag %s invalid\n", name, NULL); 4653 } 4654 4655 /* 4656 * Check for an Empty Element labeled the XML/SGML way 4657 */ 4658 if ((CUR == '/') && (NXT(1) == '>')) { 4659 SKIP(2); 4660 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL)) 4661 ctxt->sax->endElement(ctxt->userData, name); 4662 htmlnamePop(ctxt); 4663 return; 4664 } 4665 4666 if (CUR == '>') { 4667 NEXT; 4668 } else { 4669 htmlParseErr(ctxt, XML_ERR_GT_REQUIRED, 4670 "Couldn't find end of Start Tag %s\n", name, NULL); 4671 4672 /* 4673 * end of parsing of this node. 4674 */ 4675 if (xmlStrEqual(name, ctxt->name)) { 4676 nodePop(ctxt); 4677 htmlnamePop(ctxt); 4678 } 4679 4680 if (ctxt->record_info) 4681 htmlNodeInfoPush(ctxt, &node_info); 4682 htmlParserFinishElementParsing(ctxt); 4683 return; 4684 } 4685 4686 /* 4687 * Check for an Empty Element from DTD definition 4688 */ 4689 if ((info != NULL) && (info->empty)) { 4690 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL)) 4691 ctxt->sax->endElement(ctxt->userData, name); 4692 htmlnamePop(ctxt); 4693 return; 4694 } 4695 4696 if (ctxt->record_info) 4697 htmlNodeInfoPush(ctxt, &node_info); 4698 } 4699 4700 /** 4701 * htmlParseContentInternal: 4702 * @ctxt: an HTML parser context 4703 * 4704 * Parse a content: comment, sub-element, reference or text. 4705 * New version for non recursive htmlParseElementInternal 4706 */ 4707 4708 static void 4709 htmlParseContentInternal(htmlParserCtxtPtr ctxt) { 4710 xmlChar *currentNode; 4711 int depth; 4712 const xmlChar *name; 4713 4714 currentNode = xmlStrdup(ctxt->name); 4715 depth = ctxt->nameNr; 4716 while (1) { 4717 GROW; 4718 4719 if (ctxt->instate == XML_PARSER_EOF) 4720 break; 4721 4722 /* 4723 * Our tag or one of it's parent or children is ending. 
4724 */ 4725 if ((CUR == '<') && (NXT(1) == '/')) { 4726 if (htmlParseEndTag(ctxt) && 4727 ((currentNode != NULL) || (ctxt->nameNr == 0))) { 4728 if (currentNode != NULL) 4729 xmlFree(currentNode); 4730 4731 currentNode = xmlStrdup(ctxt->name); 4732 depth = ctxt->nameNr; 4733 } 4734 continue; /* while */ 4735 } 4736 4737 else if ((CUR == '<') && 4738 ((IS_ASCII_LETTER(NXT(1))) || 4739 (NXT(1) == '_') || (NXT(1) == ':'))) { 4740 name = htmlParseHTMLName_nonInvasive(ctxt); 4741 if (name == NULL) { 4742 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED, 4743 "htmlParseStartTag: invalid element name\n", 4744 NULL, NULL); 4745 /* Dump the bogus tag like browsers do */ 4746 while ((CUR == 0) && (CUR != '>')) 4747 NEXT; 4748 4749 htmlParserFinishElementParsing(ctxt); 4750 if (currentNode != NULL) 4751 xmlFree(currentNode); 4752 4753 currentNode = xmlStrdup(ctxt->name); 4754 depth = ctxt->nameNr; 4755 continue; 4756 } 4757 4758 if (ctxt->name != NULL) { 4759 if (htmlCheckAutoClose(name, ctxt->name) == 1) { 4760 htmlAutoClose(ctxt, name); 4761 continue; 4762 } 4763 } 4764 } 4765 4766 /* 4767 * Has this node been popped out during parsing of 4768 * the next element 4769 */ 4770 if ((ctxt->nameNr > 0) && (depth >= ctxt->nameNr) && 4771 (!xmlStrEqual(currentNode, ctxt->name))) 4772 { 4773 htmlParserFinishElementParsing(ctxt); 4774 if (currentNode != NULL) xmlFree(currentNode); 4775 4776 currentNode = xmlStrdup(ctxt->name); 4777 depth = ctxt->nameNr; 4778 continue; 4779 } 4780 4781 if ((CUR != 0) && ((xmlStrEqual(currentNode, BAD_CAST"script")) || 4782 (xmlStrEqual(currentNode, BAD_CAST"style")))) { 4783 /* 4784 * Handle SCRIPT/STYLE separately 4785 */ 4786 htmlParseScript(ctxt); 4787 } else { 4788 /* 4789 * Sometimes DOCTYPE arrives in the middle of the document 4790 */ 4791 if ((CUR == '<') && (NXT(1) == '!') && 4792 (UPP(2) == 'D') && (UPP(3) == 'O') && 4793 (UPP(4) == 'C') && (UPP(5) == 'T') && 4794 (UPP(6) == 'Y') && (UPP(7) == 'P') && 4795 (UPP(8) == 'E')) { 4796 
htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR, 4797 "Misplaced DOCTYPE declaration\n", 4798 BAD_CAST "DOCTYPE" , NULL); 4799 htmlParseDocTypeDecl(ctxt); 4800 } 4801 4802 /* 4803 * First case : a comment 4804 */ 4805 if ((CUR == '<') && (NXT(1) == '!') && 4806 (NXT(2) == '-') && (NXT(3) == '-')) { 4807 htmlParseComment(ctxt); 4808 } 4809 4810 /* 4811 * Second case : a Processing Instruction. 4812 */ 4813 else if ((CUR == '<') && (NXT(1) == '?')) { 4814 htmlParsePI(ctxt); 4815 } 4816 4817 /* 4818 * Third case : a sub-element. 4819 */ 4820 else if (CUR == '<') { 4821 htmlParseElementInternal(ctxt); 4822 if (currentNode != NULL) xmlFree(currentNode); 4823 4824 currentNode = xmlStrdup(ctxt->name); 4825 depth = ctxt->nameNr; 4826 } 4827 4828 /* 4829 * Fourth case : a reference. If if has not been resolved, 4830 * parsing returns it's Name, create the node 4831 */ 4832 else if (CUR == '&') { 4833 htmlParseReference(ctxt); 4834 } 4835 4836 /* 4837 * Fifth case : end of the resource 4838 */ 4839 else if (CUR == 0) { 4840 htmlAutoCloseOnEnd(ctxt); 4841 break; 4842 } 4843 4844 /* 4845 * Last case, text. Note that References are handled directly. 4846 */ 4847 else { 4848 htmlParseCharData(ctxt); 4849 } 4850 } 4851 GROW; 4852 } 4853 if (currentNode != NULL) xmlFree(currentNode); 4854 } 4855 4856 /** 4857 * htmlParseContent: 4858 * @ctxt: an HTML parser context 4859 * 4860 * Parse a content: comment, sub-element, reference or text. 4861 * This is the entry point when called from parser.c 4862 */ 4863 4864 void 4865 __htmlParseContent(void *ctxt) { 4866 if (ctxt != NULL) 4867 htmlParseContentInternal((htmlParserCtxtPtr) ctxt); 4868 } 4869 4870 /** 4871 * htmlParseDocument: 4872 * @ctxt: an HTML parser context 4873 * 4874 * parse an HTML document (and build a tree if using the standard SAX 4875 * interface). 4876 * 4877 * Returns 0, -1 in case of error. the parser context is augmented 4878 * as a result of the parsing. 
4879 */ 4880 4881 int 4882 htmlParseDocument(htmlParserCtxtPtr ctxt) { 4883 xmlChar start[4]; 4884 xmlCharEncoding enc; 4885 xmlDtdPtr dtd; 4886 4887 xmlInitParser(); 4888 4889 htmlDefaultSAXHandlerInit(); 4890 4891 if ((ctxt == NULL) || (ctxt->input == NULL)) { 4892 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR, 4893 "htmlParseDocument: context error\n", NULL, NULL); 4894 return(XML_ERR_INTERNAL_ERROR); 4895 } 4896 ctxt->html = 1; 4897 ctxt->linenumbers = 1; 4898 GROW; 4899 /* 4900 * SAX: beginning of the document processing. 4901 */ 4902 if ((ctxt->sax) && (ctxt->sax->setDocumentLocator)) 4903 ctxt->sax->setDocumentLocator(ctxt->userData, &xmlDefaultSAXLocator); 4904 4905 if ((ctxt->encoding == (const xmlChar *)XML_CHAR_ENCODING_NONE) && 4906 ((ctxt->input->end - ctxt->input->cur) >= 4)) { 4907 /* 4908 * Get the 4 first bytes and decode the charset 4909 * if enc != XML_CHAR_ENCODING_NONE 4910 * plug some encoding conversion routines. 4911 */ 4912 start[0] = RAW; 4913 start[1] = NXT(1); 4914 start[2] = NXT(2); 4915 start[3] = NXT(3); 4916 enc = xmlDetectCharEncoding(&start[0], 4); 4917 if (enc != XML_CHAR_ENCODING_NONE) { 4918 xmlSwitchEncoding(ctxt, enc); 4919 } 4920 } 4921 4922 /* 4923 * Wipe out everything which is before the first '<' 4924 */ 4925 SKIP_BLANKS; 4926 if (CUR == 0) { 4927 htmlParseErr(ctxt, XML_ERR_DOCUMENT_EMPTY, 4928 "Document is empty\n", NULL, NULL); 4929 } 4930 4931 if ((ctxt->sax) && (ctxt->sax->startDocument) && (!ctxt->disableSAX)) 4932 ctxt->sax->startDocument(ctxt->userData); 4933 4934 4935 /* 4936 * Parse possible comments and PIs before any content 4937 */ 4938 while (((CUR == '<') && (NXT(1) == '!') && 4939 (NXT(2) == '-') && (NXT(3) == '-')) || 4940 ((CUR == '<') && (NXT(1) == '?'))) { 4941 htmlParseComment(ctxt); 4942 htmlParsePI(ctxt); 4943 SKIP_BLANKS; 4944 } 4945 4946 4947 /* 4948 * Then possibly doc type declaration(s) and more Misc 4949 * (doctypedecl Misc*)? 
4950 */ 4951 if ((CUR == '<') && (NXT(1) == '!') && 4952 (UPP(2) == 'D') && (UPP(3) == 'O') && 4953 (UPP(4) == 'C') && (UPP(5) == 'T') && 4954 (UPP(6) == 'Y') && (UPP(7) == 'P') && 4955 (UPP(8) == 'E')) { 4956 htmlParseDocTypeDecl(ctxt); 4957 } 4958 SKIP_BLANKS; 4959 4960 /* 4961 * Parse possible comments and PIs before any content 4962 */ 4963 while (((CUR == '<') && (NXT(1) == '!') && 4964 (NXT(2) == '-') && (NXT(3) == '-')) || 4965 ((CUR == '<') && (NXT(1) == '?'))) { 4966 htmlParseComment(ctxt); 4967 htmlParsePI(ctxt); 4968 SKIP_BLANKS; 4969 } 4970 4971 /* 4972 * Time to start parsing the tree itself 4973 */ 4974 htmlParseContentInternal(ctxt); 4975 4976 /* 4977 * autoclose 4978 */ 4979 if (CUR == 0) 4980 htmlAutoCloseOnEnd(ctxt); 4981 4982 4983 /* 4984 * SAX: end of the document processing. 4985 */ 4986 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL)) 4987 ctxt->sax->endDocument(ctxt->userData); 4988 4989 if ((!(ctxt->options & HTML_PARSE_NODEFDTD)) && (ctxt->myDoc != NULL)) { 4990 dtd = xmlGetIntSubset(ctxt->myDoc); 4991 if (dtd == NULL) 4992 ctxt->myDoc->intSubset = 4993 xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "html", 4994 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN", 4995 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd"); 4996 } 4997 if (! 
ctxt->wellFormed) return(-1); 4998 return(0); 4999 } 5000 5001 5002 /************************************************************************ 5003 * * 5004 * Parser contexts handling * 5005 * * 5006 ************************************************************************/ 5007 5008 /** 5009 * htmlInitParserCtxt: 5010 * @ctxt: an HTML parser context 5011 * 5012 * Initialize a parser context 5013 * 5014 * Returns 0 in case of success and -1 in case of error 5015 */ 5016 5017 static int 5018 htmlInitParserCtxt(htmlParserCtxtPtr ctxt) 5019 { 5020 htmlSAXHandler *sax; 5021 5022 if (ctxt == NULL) return(-1); 5023 memset(ctxt, 0, sizeof(htmlParserCtxt)); 5024 5025 ctxt->dict = xmlDictCreate(); 5026 if (ctxt->dict == NULL) { 5027 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n"); 5028 return(-1); 5029 } 5030 sax = (htmlSAXHandler *) xmlMalloc(sizeof(htmlSAXHandler)); 5031 if (sax == NULL) { 5032 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n"); 5033 return(-1); 5034 } 5035 else 5036 memset(sax, 0, sizeof(htmlSAXHandler)); 5037 5038 /* Allocate the Input stack */ 5039 ctxt->inputTab = (htmlParserInputPtr *) 5040 xmlMalloc(5 * sizeof(htmlParserInputPtr)); 5041 if (ctxt->inputTab == NULL) { 5042 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n"); 5043 ctxt->inputNr = 0; 5044 ctxt->inputMax = 0; 5045 ctxt->input = NULL; 5046 return(-1); 5047 } 5048 ctxt->inputNr = 0; 5049 ctxt->inputMax = 5; 5050 ctxt->input = NULL; 5051 ctxt->version = NULL; 5052 ctxt->encoding = NULL; 5053 ctxt->standalone = -1; 5054 ctxt->instate = XML_PARSER_START; 5055 5056 /* Allocate the Node stack */ 5057 ctxt->nodeTab = (htmlNodePtr *) xmlMalloc(10 * sizeof(htmlNodePtr)); 5058 if (ctxt->nodeTab == NULL) { 5059 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n"); 5060 ctxt->nodeNr = 0; 5061 ctxt->nodeMax = 0; 5062 ctxt->node = NULL; 5063 ctxt->inputNr = 0; 5064 ctxt->inputMax = 0; 5065 ctxt->input = NULL; 5066 return(-1); 5067 } 5068 ctxt->nodeNr = 0; 5069 
ctxt->nodeMax = 10; 5070 ctxt->node = NULL; 5071 5072 /* Allocate the Name stack */ 5073 ctxt->nameTab = (const xmlChar **) xmlMalloc(10 * sizeof(xmlChar *)); 5074 if (ctxt->nameTab == NULL) { 5075 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n"); 5076 ctxt->nameNr = 0; 5077 ctxt->nameMax = 0; 5078 ctxt->name = NULL; 5079 ctxt->nodeNr = 0; 5080 ctxt->nodeMax = 0; 5081 ctxt->node = NULL; 5082 ctxt->inputNr = 0; 5083 ctxt->inputMax = 0; 5084 ctxt->input = NULL; 5085 return(-1); 5086 } 5087 ctxt->nameNr = 0; 5088 ctxt->nameMax = 10; 5089 ctxt->name = NULL; 5090 5091 ctxt->nodeInfoTab = NULL; 5092 ctxt->nodeInfoNr = 0; 5093 ctxt->nodeInfoMax = 0; 5094 5095 if (sax == NULL) ctxt->sax = (xmlSAXHandlerPtr) &htmlDefaultSAXHandler; 5096 else { 5097 ctxt->sax = sax; 5098 memcpy(sax, &htmlDefaultSAXHandler, sizeof(xmlSAXHandlerV1)); 5099 } 5100 ctxt->userData = ctxt; 5101 ctxt->myDoc = NULL; 5102 ctxt->wellFormed = 1; 5103 ctxt->replaceEntities = 0; 5104 ctxt->linenumbers = xmlLineNumbersDefaultValue; 5105 ctxt->keepBlanks = xmlKeepBlanksDefaultValue; 5106 ctxt->html = 1; 5107 ctxt->vctxt.finishDtd = XML_CTXT_FINISH_DTD_0; 5108 ctxt->vctxt.userData = ctxt; 5109 ctxt->vctxt.error = xmlParserValidityError; 5110 ctxt->vctxt.warning = xmlParserValidityWarning; 5111 ctxt->record_info = 0; 5112 ctxt->validate = 0; 5113 ctxt->checkIndex = 0; 5114 ctxt->catalogs = NULL; 5115 xmlInitNodeInfoSeq(&ctxt->node_seq); 5116 return(0); 5117 } 5118 5119 /** 5120 * htmlFreeParserCtxt: 5121 * @ctxt: an HTML parser context 5122 * 5123 * Free all the memory used by a parser context. However the parsed 5124 * document in ctxt->myDoc is not freed. 5125 */ 5126 5127 void 5128 htmlFreeParserCtxt(htmlParserCtxtPtr ctxt) 5129 { 5130 xmlFreeParserCtxt(ctxt); 5131 } 5132 5133 /** 5134 * htmlNewParserCtxt: 5135 * 5136 * Allocate and initialize a new parser context. 
5137 * 5138 * Returns the htmlParserCtxtPtr or NULL in case of allocation error 5139 */ 5140 5141 htmlParserCtxtPtr 5142 htmlNewParserCtxt(void) 5143 { 5144 xmlParserCtxtPtr ctxt; 5145 5146 ctxt = (xmlParserCtxtPtr) xmlMalloc(sizeof(xmlParserCtxt)); 5147 if (ctxt == NULL) { 5148 htmlErrMemory(NULL, "NewParserCtxt: out of memory\n"); 5149 return(NULL); 5150 } 5151 memset(ctxt, 0, sizeof(xmlParserCtxt)); 5152 if (htmlInitParserCtxt(ctxt) < 0) { 5153 htmlFreeParserCtxt(ctxt); 5154 return(NULL); 5155 } 5156 return(ctxt); 5157 } 5158 5159 /** 5160 * htmlCreateMemoryParserCtxt: 5161 * @buffer: a pointer to a char array 5162 * @size: the size of the array 5163 * 5164 * Create a parser context for an HTML in-memory document. 5165 * 5166 * Returns the new parser context or NULL 5167 */ 5168 htmlParserCtxtPtr 5169 htmlCreateMemoryParserCtxt(const char *buffer, int size) { 5170 xmlParserCtxtPtr ctxt; 5171 xmlParserInputPtr input; 5172 xmlParserInputBufferPtr buf; 5173 5174 if (buffer == NULL) 5175 return(NULL); 5176 if (size <= 0) 5177 return(NULL); 5178 5179 ctxt = htmlNewParserCtxt(); 5180 if (ctxt == NULL) 5181 return(NULL); 5182 5183 buf = xmlParserInputBufferCreateMem(buffer, size, XML_CHAR_ENCODING_NONE); 5184 if (buf == NULL) return(NULL); 5185 5186 input = xmlNewInputStream(ctxt); 5187 if (input == NULL) { 5188 xmlFreeParserCtxt(ctxt); 5189 return(NULL); 5190 } 5191 5192 input->filename = NULL; 5193 input->buf = buf; 5194 xmlBufResetInput(buf->buffer, input); 5195 5196 inputPush(ctxt, input); 5197 return(ctxt); 5198 } 5199 5200 /** 5201 * htmlCreateDocParserCtxt: 5202 * @cur: a pointer to an array of xmlChar 5203 * @encoding: a free form C string describing the HTML document encoding, or NULL 5204 * 5205 * Create a parser context for an HTML document. 
5206 * 5207 * TODO: check the need to add encoding handling there 5208 * 5209 * Returns the new parser context or NULL 5210 */ 5211 static htmlParserCtxtPtr 5212 htmlCreateDocParserCtxt(const xmlChar *cur, const char *encoding) { 5213 int len; 5214 htmlParserCtxtPtr ctxt; 5215 5216 if (cur == NULL) 5217 return(NULL); 5218 len = xmlStrlen(cur); 5219 ctxt = htmlCreateMemoryParserCtxt((char *)cur, len); 5220 if (ctxt == NULL) 5221 return(NULL); 5222 5223 if (encoding != NULL) { 5224 xmlCharEncoding enc; 5225 xmlCharEncodingHandlerPtr handler; 5226 5227 if (ctxt->input->encoding != NULL) 5228 xmlFree((xmlChar *) ctxt->input->encoding); 5229 ctxt->input->encoding = xmlStrdup((const xmlChar *) encoding); 5230 5231 enc = xmlParseCharEncoding(encoding); 5232 /* 5233 * registered set of known encodings 5234 */ 5235 if (enc != XML_CHAR_ENCODING_ERROR) { 5236 xmlSwitchEncoding(ctxt, enc); 5237 if (ctxt->errNo == XML_ERR_UNSUPPORTED_ENCODING) { 5238 htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING, 5239 "Unsupported encoding %s\n", 5240 (const xmlChar *) encoding, NULL); 5241 } 5242 } else { 5243 /* 5244 * fallback for unknown encodings 5245 */ 5246 handler = xmlFindCharEncodingHandler((const char *) encoding); 5247 if (handler != NULL) { 5248 xmlSwitchToEncoding(ctxt, handler); 5249 } else { 5250 htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING, 5251 "Unsupported encoding %s\n", 5252 (const xmlChar *) encoding, NULL); 5253 } 5254 } 5255 } 5256 return(ctxt); 5257 } 5258 5259 #ifdef LIBXML_PUSH_ENABLED 5260 /************************************************************************ 5261 * * 5262 * Progressive parsing interfaces * 5263 * * 5264 ************************************************************************/ 5265 5266 /** 5267 * htmlParseLookupSequence: 5268 * @ctxt: an HTML parser context 5269 * @first: the first char to lookup 5270 * @next: the next char to lookup or zero 5271 * @third: the next char to lookup or zero 5272 * @ignoreattrval: skip over attribute values 5273 
* 5274 * Try to find if a sequence (first, next, third) or just (first next) or 5275 * (first) is available in the input stream. 5276 * This function has a side effect of (possibly) incrementing ctxt->checkIndex 5277 * to avoid rescanning sequences of bytes, it DOES change the state of the 5278 * parser, do not use liberally. 5279 * This is basically similar to xmlParseLookupSequence() 5280 * 5281 * Returns the index to the current parsing point if the full sequence 5282 * is available, -1 otherwise. 5283 */ 5284 static int 5285 htmlParseLookupSequence(htmlParserCtxtPtr ctxt, xmlChar first, 5286 xmlChar next, xmlChar third, int ignoreattrval) 5287 { 5288 int base, len; 5289 htmlParserInputPtr in; 5290 const xmlChar *buf; 5291 int invalue = 0; 5292 char valdellim = 0x0; 5293 5294 in = ctxt->input; 5295 if (in == NULL) 5296 return (-1); 5297 5298 base = in->cur - in->base; 5299 if (base < 0) 5300 return (-1); 5301 5302 if (ctxt->checkIndex > base) { 5303 base = ctxt->checkIndex; 5304 /* Abuse hasPErefs member to restore current state. */ 5305 invalue = ctxt->hasPErefs & 1 ? 
1 : 0; 5306 } 5307 5308 if (in->buf == NULL) { 5309 buf = in->base; 5310 len = in->length; 5311 } else { 5312 buf = xmlBufContent(in->buf->buffer); 5313 len = xmlBufUse(in->buf->buffer); 5314 } 5315 5316 /* take into account the sequence length */ 5317 if (third) 5318 len -= 2; 5319 else if (next) 5320 len--; 5321 for (; base < len; base++) { 5322 if (ignoreattrval) { 5323 if (buf[base] == '"' || buf[base] == '\'') { 5324 if (invalue) { 5325 if (buf[base] == valdellim) { 5326 invalue = 0; 5327 continue; 5328 } 5329 } else { 5330 valdellim = buf[base]; 5331 invalue = 1; 5332 continue; 5333 } 5334 } else if (invalue) { 5335 continue; 5336 } 5337 } 5338 if (buf[base] == first) { 5339 if (third != 0) { 5340 if ((buf[base + 1] != next) || (buf[base + 2] != third)) 5341 continue; 5342 } else if (next != 0) { 5343 if (buf[base + 1] != next) 5344 continue; 5345 } 5346 ctxt->checkIndex = 0; 5347 #ifdef DEBUG_PUSH 5348 if (next == 0) 5349 xmlGenericError(xmlGenericErrorContext, 5350 "HPP: lookup '%c' found at %d\n", 5351 first, base); 5352 else if (third == 0) 5353 xmlGenericError(xmlGenericErrorContext, 5354 "HPP: lookup '%c%c' found at %d\n", 5355 first, next, base); 5356 else 5357 xmlGenericError(xmlGenericErrorContext, 5358 "HPP: lookup '%c%c%c' found at %d\n", 5359 first, next, third, base); 5360 #endif 5361 return (base - (in->cur - in->base)); 5362 } 5363 } 5364 ctxt->checkIndex = base; 5365 /* Abuse hasPErefs member to track current state. 
*/ 5366 if (invalue) 5367 ctxt->hasPErefs |= 1; 5368 else 5369 ctxt->hasPErefs &= ~1; 5370 #ifdef DEBUG_PUSH 5371 if (next == 0) 5372 xmlGenericError(xmlGenericErrorContext, 5373 "HPP: lookup '%c' failed\n", first); 5374 else if (third == 0) 5375 xmlGenericError(xmlGenericErrorContext, 5376 "HPP: lookup '%c%c' failed\n", first, next); 5377 else 5378 xmlGenericError(xmlGenericErrorContext, 5379 "HPP: lookup '%c%c%c' failed\n", first, next, 5380 third); 5381 #endif 5382 return (-1); 5383 } 5384 5385 /** 5386 * htmlParseLookupCommentEnd: 5387 * @ctxt: an HTML parser context 5388 * 5389 * Try to find a comment end tag in the input stream 5390 * The search includes "-->" as well as WHATWG-recommended incorrectly-closed tags. 5391 * (See https://html.spec.whatwg.org/multipage/parsing.html#parse-error-incorrectly-closed-comment) 5392 * This function has a side effect of (possibly) incrementing ctxt->checkIndex 5393 * to avoid rescanning sequences of bytes, it DOES change the state of the 5394 * parser, do not use liberally. 5395 * This wraps to htmlParseLookupSequence() 5396 * 5397 * Returns the index to the current parsing point if the full sequence is available, -1 otherwise. 
5398 */ 5399 static int 5400 htmlParseLookupCommentEnd(htmlParserCtxtPtr ctxt) 5401 { 5402 int mark = 0; 5403 int cur = CUR_PTR - BASE_PTR; 5404 5405 while (mark >= 0) { 5406 mark = htmlParseLookupSequence(ctxt, '-', '-', 0, 0); 5407 if ((mark < 0) || 5408 (NXT(mark+2) == '>') || 5409 ((NXT(mark+2) == '!') && (NXT(mark+3) == '>'))) { 5410 return mark; 5411 } 5412 ctxt->checkIndex = cur + mark + 1; 5413 } 5414 return mark; 5415 } 5416 5417 5418 /** 5419 * htmlParseTryOrFinish: 5420 * @ctxt: an HTML parser context 5421 * @terminate: last chunk indicator 5422 * 5423 * Try to progress on parsing 5424 * 5425 * Returns zero if no parsing was possible 5426 */ 5427 static int 5428 htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) { 5429 int ret = 0; 5430 htmlParserInputPtr in; 5431 ptrdiff_t avail = 0; 5432 xmlChar cur, next; 5433 5434 htmlParserNodeInfo node_info; 5435 5436 #ifdef DEBUG_PUSH 5437 switch (ctxt->instate) { 5438 case XML_PARSER_EOF: 5439 xmlGenericError(xmlGenericErrorContext, 5440 "HPP: try EOF\n"); break; 5441 case XML_PARSER_START: 5442 xmlGenericError(xmlGenericErrorContext, 5443 "HPP: try START\n"); break; 5444 case XML_PARSER_MISC: 5445 xmlGenericError(xmlGenericErrorContext, 5446 "HPP: try MISC\n");break; 5447 case XML_PARSER_COMMENT: 5448 xmlGenericError(xmlGenericErrorContext, 5449 "HPP: try COMMENT\n");break; 5450 case XML_PARSER_PROLOG: 5451 xmlGenericError(xmlGenericErrorContext, 5452 "HPP: try PROLOG\n");break; 5453 case XML_PARSER_START_TAG: 5454 xmlGenericError(xmlGenericErrorContext, 5455 "HPP: try START_TAG\n");break; 5456 case XML_PARSER_CONTENT: 5457 xmlGenericError(xmlGenericErrorContext, 5458 "HPP: try CONTENT\n");break; 5459 case XML_PARSER_CDATA_SECTION: 5460 xmlGenericError(xmlGenericErrorContext, 5461 "HPP: try CDATA_SECTION\n");break; 5462 case XML_PARSER_END_TAG: 5463 xmlGenericError(xmlGenericErrorContext, 5464 "HPP: try END_TAG\n");break; 5465 case XML_PARSER_ENTITY_DECL: 5466 xmlGenericError(xmlGenericErrorContext, 
5467 "HPP: try ENTITY_DECL\n");break; 5468 case XML_PARSER_ENTITY_VALUE: 5469 xmlGenericError(xmlGenericErrorContext, 5470 "HPP: try ENTITY_VALUE\n");break; 5471 case XML_PARSER_ATTRIBUTE_VALUE: 5472 xmlGenericError(xmlGenericErrorContext, 5473 "HPP: try ATTRIBUTE_VALUE\n");break; 5474 case XML_PARSER_DTD: 5475 xmlGenericError(xmlGenericErrorContext, 5476 "HPP: try DTD\n");break; 5477 case XML_PARSER_EPILOG: 5478 xmlGenericError(xmlGenericErrorContext, 5479 "HPP: try EPILOG\n");break; 5480 case XML_PARSER_PI: 5481 xmlGenericError(xmlGenericErrorContext, 5482 "HPP: try PI\n");break; 5483 case XML_PARSER_SYSTEM_LITERAL: 5484 xmlGenericError(xmlGenericErrorContext, 5485 "HPP: try SYSTEM_LITERAL\n");break; 5486 } 5487 #endif 5488 5489 while (1) { 5490 5491 in = ctxt->input; 5492 if (in == NULL) break; 5493 if (in->buf == NULL) 5494 avail = in->length - (in->cur - in->base); 5495 else 5496 avail = (ptrdiff_t)xmlBufUse(in->buf->buffer) - 5497 (in->cur - in->base); 5498 if ((avail == 0) && (terminate)) { 5499 htmlAutoCloseOnEnd(ctxt); 5500 if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) { 5501 /* 5502 * SAX: end of the document processing. 5503 */ 5504 ctxt->instate = XML_PARSER_EOF; 5505 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL)) 5506 ctxt->sax->endDocument(ctxt->userData); 5507 } 5508 } 5509 if (avail < 1) 5510 goto done; 5511 /* 5512 * This is done to make progress and avoid an infinite loop 5513 * if a parsing attempt was aborted by hitting a NUL byte. After 5514 * changing htmlCurrentChar, this probably isn't necessary anymore. 5515 * We should consider removing this check. 5516 */ 5517 cur = in->cur[0]; 5518 if (cur == 0) { 5519 SKIP(1); 5520 continue; 5521 } 5522 5523 switch (ctxt->instate) { 5524 case XML_PARSER_EOF: 5525 /* 5526 * Document parsing is done ! 5527 */ 5528 goto done; 5529 case XML_PARSER_START: 5530 /* 5531 * Very first chars read from the document flow. 
5532 */ 5533 cur = in->cur[0]; 5534 if (IS_BLANK_CH(cur)) { 5535 SKIP_BLANKS; 5536 if (in->buf == NULL) 5537 avail = in->length - (in->cur - in->base); 5538 else 5539 avail = (ptrdiff_t)xmlBufUse(in->buf->buffer) - 5540 (in->cur - in->base); 5541 } 5542 if ((ctxt->sax) && (ctxt->sax->setDocumentLocator)) 5543 ctxt->sax->setDocumentLocator(ctxt->userData, 5544 &xmlDefaultSAXLocator); 5545 if ((ctxt->sax) && (ctxt->sax->startDocument) && 5546 (!ctxt->disableSAX)) 5547 ctxt->sax->startDocument(ctxt->userData); 5548 5549 cur = in->cur[0]; 5550 next = in->cur[1]; 5551 if ((cur == '<') && (next == '!') && 5552 (UPP(2) == 'D') && (UPP(3) == 'O') && 5553 (UPP(4) == 'C') && (UPP(5) == 'T') && 5554 (UPP(6) == 'Y') && (UPP(7) == 'P') && 5555 (UPP(8) == 'E')) { 5556 if ((!terminate) && 5557 (htmlParseLookupSequence(ctxt, '>', 0, 0, 1) < 0)) 5558 goto done; 5559 #ifdef DEBUG_PUSH 5560 xmlGenericError(xmlGenericErrorContext, 5561 "HPP: Parsing internal subset\n"); 5562 #endif 5563 htmlParseDocTypeDecl(ctxt); 5564 ctxt->instate = XML_PARSER_PROLOG; 5565 #ifdef DEBUG_PUSH 5566 xmlGenericError(xmlGenericErrorContext, 5567 "HPP: entering PROLOG\n"); 5568 #endif 5569 } else { 5570 ctxt->instate = XML_PARSER_MISC; 5571 #ifdef DEBUG_PUSH 5572 xmlGenericError(xmlGenericErrorContext, 5573 "HPP: entering MISC\n"); 5574 #endif 5575 } 5576 break; 5577 case XML_PARSER_MISC: 5578 SKIP_BLANKS; 5579 if (in->buf == NULL) 5580 avail = in->length - (in->cur - in->base); 5581 else 5582 avail = (ptrdiff_t)xmlBufUse(in->buf->buffer) - 5583 (in->cur - in->base); 5584 /* 5585 * no chars in buffer 5586 */ 5587 if (avail < 1) 5588 goto done; 5589 /* 5590 * not enough chars in buffer 5591 */ 5592 if (avail < 2) { 5593 if (!terminate) 5594 goto done; 5595 else 5596 next = ' '; 5597 } else { 5598 next = in->cur[1]; 5599 } 5600 cur = in->cur[0]; 5601 if ((cur == '<') && (next == '!') && 5602 (in->cur[2] == '-') && (in->cur[3] == '-')) { 5603 if ((!terminate) && (htmlParseLookupCommentEnd(ctxt) < 0)) 5604 
goto done; 5605 #ifdef DEBUG_PUSH 5606 xmlGenericError(xmlGenericErrorContext, 5607 "HPP: Parsing Comment\n"); 5608 #endif 5609 htmlParseComment(ctxt); 5610 ctxt->instate = XML_PARSER_MISC; 5611 } else if ((cur == '<') && (next == '?')) { 5612 if ((!terminate) && 5613 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0)) 5614 goto done; 5615 #ifdef DEBUG_PUSH 5616 xmlGenericError(xmlGenericErrorContext, 5617 "HPP: Parsing PI\n"); 5618 #endif 5619 htmlParsePI(ctxt); 5620 ctxt->instate = XML_PARSER_MISC; 5621 } else if ((cur == '<') && (next == '!') && 5622 (UPP(2) == 'D') && (UPP(3) == 'O') && 5623 (UPP(4) == 'C') && (UPP(5) == 'T') && 5624 (UPP(6) == 'Y') && (UPP(7) == 'P') && 5625 (UPP(8) == 'E')) { 5626 if ((!terminate) && 5627 (htmlParseLookupSequence(ctxt, '>', 0, 0, 1) < 0)) 5628 goto done; 5629 #ifdef DEBUG_PUSH 5630 xmlGenericError(xmlGenericErrorContext, 5631 "HPP: Parsing internal subset\n"); 5632 #endif 5633 htmlParseDocTypeDecl(ctxt); 5634 ctxt->instate = XML_PARSER_PROLOG; 5635 #ifdef DEBUG_PUSH 5636 xmlGenericError(xmlGenericErrorContext, 5637 "HPP: entering PROLOG\n"); 5638 #endif 5639 } else if ((cur == '<') && (next == '!') && 5640 (avail < 9)) { 5641 goto done; 5642 } else { 5643 ctxt->instate = XML_PARSER_CONTENT; 5644 #ifdef DEBUG_PUSH 5645 xmlGenericError(xmlGenericErrorContext, 5646 "HPP: entering START_TAG\n"); 5647 #endif 5648 } 5649 break; 5650 case XML_PARSER_PROLOG: 5651 SKIP_BLANKS; 5652 if (in->buf == NULL) 5653 avail = in->length - (in->cur - in->base); 5654 else 5655 avail = (ptrdiff_t)xmlBufUse(in->buf->buffer) - 5656 (in->cur - in->base); 5657 if (avail < 2) 5658 goto done; 5659 cur = in->cur[0]; 5660 next = in->cur[1]; 5661 if ((cur == '<') && (next == '!') && 5662 (in->cur[2] == '-') && (in->cur[3] == '-')) { 5663 if ((!terminate) && (htmlParseLookupCommentEnd(ctxt) < 0)) 5664 goto done; 5665 #ifdef DEBUG_PUSH 5666 xmlGenericError(xmlGenericErrorContext, 5667 "HPP: Parsing Comment\n"); 5668 #endif 5669 htmlParseComment(ctxt); 5670 
ctxt->instate = XML_PARSER_PROLOG; 5671 } else if ((cur == '<') && (next == '?')) { 5672 if ((!terminate) && 5673 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0)) 5674 goto done; 5675 #ifdef DEBUG_PUSH 5676 xmlGenericError(xmlGenericErrorContext, 5677 "HPP: Parsing PI\n"); 5678 #endif 5679 htmlParsePI(ctxt); 5680 ctxt->instate = XML_PARSER_PROLOG; 5681 } else if ((cur == '<') && (next == '!') && 5682 (avail < 4)) { 5683 goto done; 5684 } else { 5685 ctxt->instate = XML_PARSER_CONTENT; 5686 #ifdef DEBUG_PUSH 5687 xmlGenericError(xmlGenericErrorContext, 5688 "HPP: entering START_TAG\n"); 5689 #endif 5690 } 5691 break; 5692 case XML_PARSER_EPILOG: 5693 if (in->buf == NULL) 5694 avail = in->length - (in->cur - in->base); 5695 else 5696 avail = (ptrdiff_t)xmlBufUse(in->buf->buffer) - 5697 (in->cur - in->base); 5698 if (avail < 1) 5699 goto done; 5700 cur = in->cur[0]; 5701 if (IS_BLANK_CH(cur)) { 5702 htmlParseCharData(ctxt); 5703 goto done; 5704 } 5705 if (avail < 2) 5706 goto done; 5707 next = in->cur[1]; 5708 if ((cur == '<') && (next == '!') && 5709 (in->cur[2] == '-') && (in->cur[3] == '-')) { 5710 if ((!terminate) && (htmlParseLookupCommentEnd(ctxt) < 0)) 5711 goto done; 5712 #ifdef DEBUG_PUSH 5713 xmlGenericError(xmlGenericErrorContext, 5714 "HPP: Parsing Comment\n"); 5715 #endif 5716 htmlParseComment(ctxt); 5717 ctxt->instate = XML_PARSER_EPILOG; 5718 } else if ((cur == '<') && (next == '?')) { 5719 if ((!terminate) && 5720 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0)) 5721 goto done; 5722 #ifdef DEBUG_PUSH 5723 xmlGenericError(xmlGenericErrorContext, 5724 "HPP: Parsing PI\n"); 5725 #endif 5726 htmlParsePI(ctxt); 5727 ctxt->instate = XML_PARSER_EPILOG; 5728 } else if ((cur == '<') && (next == '!') && 5729 (avail < 4)) { 5730 goto done; 5731 } else { 5732 ctxt->errNo = XML_ERR_DOCUMENT_END; 5733 ctxt->wellFormed = 0; 5734 ctxt->instate = XML_PARSER_EOF; 5735 #ifdef DEBUG_PUSH 5736 xmlGenericError(xmlGenericErrorContext, 5737 "HPP: entering EOF\n"); 5738 
#endif 5739 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL)) 5740 ctxt->sax->endDocument(ctxt->userData); 5741 goto done; 5742 } 5743 break; 5744 case XML_PARSER_START_TAG: { 5745 const xmlChar *name; 5746 int failed; 5747 const htmlElemDesc * info; 5748 5749 /* 5750 * no chars in buffer 5751 */ 5752 if (avail < 1) 5753 goto done; 5754 /* 5755 * not enough chars in buffer 5756 */ 5757 if (avail < 2) { 5758 if (!terminate) 5759 goto done; 5760 else 5761 next = ' '; 5762 } else { 5763 next = in->cur[1]; 5764 } 5765 cur = in->cur[0]; 5766 if (cur != '<') { 5767 ctxt->instate = XML_PARSER_CONTENT; 5768 #ifdef DEBUG_PUSH 5769 xmlGenericError(xmlGenericErrorContext, 5770 "HPP: entering CONTENT\n"); 5771 #endif 5772 break; 5773 } 5774 if (next == '/') { 5775 ctxt->instate = XML_PARSER_END_TAG; 5776 ctxt->checkIndex = 0; 5777 #ifdef DEBUG_PUSH 5778 xmlGenericError(xmlGenericErrorContext, 5779 "HPP: entering END_TAG\n"); 5780 #endif 5781 break; 5782 } 5783 if ((!terminate) && 5784 (htmlParseLookupSequence(ctxt, '>', 0, 0, 1) < 0)) 5785 goto done; 5786 5787 /* Capture start position */ 5788 if (ctxt->record_info) { 5789 node_info.begin_pos = ctxt->input->consumed + 5790 (CUR_PTR - ctxt->input->base); 5791 node_info.begin_line = ctxt->input->line; 5792 } 5793 5794 5795 failed = htmlParseStartTag(ctxt); 5796 name = ctxt->name; 5797 if ((failed == -1) || 5798 (name == NULL)) { 5799 if (CUR == '>') 5800 NEXT; 5801 break; 5802 } 5803 5804 /* 5805 * Lookup the info for that element. 
5806 */ 5807 info = htmlTagLookup(name); 5808 if (info == NULL) { 5809 htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG, 5810 "Tag %s invalid\n", name, NULL); 5811 } 5812 5813 /* 5814 * Check for an Empty Element labeled the XML/SGML way 5815 */ 5816 if ((CUR == '/') && (NXT(1) == '>')) { 5817 SKIP(2); 5818 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL)) 5819 ctxt->sax->endElement(ctxt->userData, name); 5820 htmlnamePop(ctxt); 5821 ctxt->instate = XML_PARSER_CONTENT; 5822 #ifdef DEBUG_PUSH 5823 xmlGenericError(xmlGenericErrorContext, 5824 "HPP: entering CONTENT\n"); 5825 #endif 5826 break; 5827 } 5828 5829 if (CUR == '>') { 5830 NEXT; 5831 } else { 5832 htmlParseErr(ctxt, XML_ERR_GT_REQUIRED, 5833 "Couldn't find end of Start Tag %s\n", 5834 name, NULL); 5835 5836 /* 5837 * end of parsing of this node. 5838 */ 5839 if (xmlStrEqual(name, ctxt->name)) { 5840 nodePop(ctxt); 5841 htmlnamePop(ctxt); 5842 } 5843 5844 if (ctxt->record_info) 5845 htmlNodeInfoPush(ctxt, &node_info); 5846 5847 ctxt->instate = XML_PARSER_CONTENT; 5848 #ifdef DEBUG_PUSH 5849 xmlGenericError(xmlGenericErrorContext, 5850 "HPP: entering CONTENT\n"); 5851 #endif 5852 break; 5853 } 5854 5855 /* 5856 * Check for an Empty Element from DTD definition 5857 */ 5858 if ((info != NULL) && (info->empty)) { 5859 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL)) 5860 ctxt->sax->endElement(ctxt->userData, name); 5861 htmlnamePop(ctxt); 5862 } 5863 5864 if (ctxt->record_info) 5865 htmlNodeInfoPush(ctxt, &node_info); 5866 5867 ctxt->instate = XML_PARSER_CONTENT; 5868 #ifdef DEBUG_PUSH 5869 xmlGenericError(xmlGenericErrorContext, 5870 "HPP: entering CONTENT\n"); 5871 #endif 5872 break; 5873 } 5874 case XML_PARSER_CONTENT: { 5875 xmlChar chr[2] = { 0, 0 }; 5876 5877 /* 5878 * Handle preparsed entities and charRef 5879 */ 5880 if (ctxt->token != 0) { 5881 chr[0] = (xmlChar) ctxt->token; 5882 htmlCheckParagraph(ctxt); 5883 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL)) 5884 
ctxt->sax->characters(ctxt->userData, chr, 1); 5885 ctxt->token = 0; 5886 ctxt->checkIndex = 0; 5887 } 5888 if ((avail == 1) && (terminate)) { 5889 cur = in->cur[0]; 5890 if ((cur != '<') && (cur != '&')) { 5891 if (ctxt->sax != NULL) { 5892 chr[0] = cur; 5893 if (IS_BLANK_CH(cur)) { 5894 if (ctxt->keepBlanks) { 5895 if (ctxt->sax->characters != NULL) 5896 ctxt->sax->characters( 5897 ctxt->userData, chr, 1); 5898 } else { 5899 if (ctxt->sax->ignorableWhitespace != NULL) 5900 ctxt->sax->ignorableWhitespace( 5901 ctxt->userData, chr, 1); 5902 } 5903 } else { 5904 htmlCheckParagraph(ctxt); 5905 if (ctxt->sax->characters != NULL) 5906 ctxt->sax->characters( 5907 ctxt->userData, chr, 1); 5908 } 5909 } 5910 ctxt->token = 0; 5911 ctxt->checkIndex = 0; 5912 in->cur++; 5913 break; 5914 } 5915 } 5916 if (avail < 2) 5917 goto done; 5918 cur = in->cur[0]; 5919 next = in->cur[1]; 5920 if ((xmlStrEqual(ctxt->name, BAD_CAST"script")) || 5921 (xmlStrEqual(ctxt->name, BAD_CAST"style"))) { 5922 /* 5923 * Handle SCRIPT/STYLE separately 5924 */ 5925 if (!terminate) { 5926 int idx; 5927 xmlChar val; 5928 5929 idx = htmlParseLookupSequence(ctxt, '<', '/', 0, 0); 5930 if (idx < 0) 5931 goto done; 5932 val = in->cur[idx + 2]; 5933 if (val == 0) /* bad cut of input */ 5934 goto done; 5935 } 5936 htmlParseScript(ctxt); 5937 if ((cur == '<') && (next == '/')) { 5938 ctxt->instate = XML_PARSER_END_TAG; 5939 ctxt->checkIndex = 0; 5940 #ifdef DEBUG_PUSH 5941 xmlGenericError(xmlGenericErrorContext, 5942 "HPP: entering END_TAG\n"); 5943 #endif 5944 break; 5945 } 5946 } else { 5947 /* 5948 * Sometimes DOCTYPE arrives in the middle of the document 5949 */ 5950 if ((cur == '<') && (next == '!') && 5951 (UPP(2) == 'D') && (UPP(3) == 'O') && 5952 (UPP(4) == 'C') && (UPP(5) == 'T') && 5953 (UPP(6) == 'Y') && (UPP(7) == 'P') && 5954 (UPP(8) == 'E')) { 5955 if ((!terminate) && 5956 (htmlParseLookupSequence(ctxt, '>', 0, 0, 1) < 0)) 5957 goto done; 5958 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR, 5959 
"Misplaced DOCTYPE declaration\n", 5960 BAD_CAST "DOCTYPE" , NULL); 5961 htmlParseDocTypeDecl(ctxt); 5962 } else if ((cur == '<') && (next == '!') && 5963 (in->cur[2] == '-') && (in->cur[3] == '-')) { 5964 if ((!terminate) && (htmlParseLookupCommentEnd(ctxt) < 0)) 5965 goto done; 5966 #ifdef DEBUG_PUSH 5967 xmlGenericError(xmlGenericErrorContext, 5968 "HPP: Parsing Comment\n"); 5969 #endif 5970 htmlParseComment(ctxt); 5971 ctxt->instate = XML_PARSER_CONTENT; 5972 } else if ((cur == '<') && (next == '?')) { 5973 if ((!terminate) && 5974 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0)) 5975 goto done; 5976 #ifdef DEBUG_PUSH 5977 xmlGenericError(xmlGenericErrorContext, 5978 "HPP: Parsing PI\n"); 5979 #endif 5980 htmlParsePI(ctxt); 5981 ctxt->instate = XML_PARSER_CONTENT; 5982 } else if ((cur == '<') && (next == '!') && (avail < 4)) { 5983 goto done; 5984 } else if ((cur == '<') && (next == '/')) { 5985 ctxt->instate = XML_PARSER_END_TAG; 5986 ctxt->checkIndex = 0; 5987 #ifdef DEBUG_PUSH 5988 xmlGenericError(xmlGenericErrorContext, 5989 "HPP: entering END_TAG\n"); 5990 #endif 5991 break; 5992 } else if (cur == '<') { 5993 if ((!terminate) && (next == 0)) 5994 goto done; 5995 /* 5996 * Only switch to START_TAG if the next character 5997 * starts a valid name. Otherwise, htmlParseStartTag 5998 * might return without consuming all characters 5999 * up to the final '>'. 
6000 */ 6001 if ((IS_ASCII_LETTER(next)) || 6002 (next == '_') || (next == ':') || (next == '.')) { 6003 ctxt->instate = XML_PARSER_START_TAG; 6004 ctxt->checkIndex = 0; 6005 #ifdef DEBUG_PUSH 6006 xmlGenericError(xmlGenericErrorContext, 6007 "HPP: entering START_TAG\n"); 6008 #endif 6009 } else { 6010 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED, 6011 "htmlParseTryOrFinish: " 6012 "invalid element name\n", 6013 NULL, NULL); 6014 htmlCheckParagraph(ctxt); 6015 if ((ctxt->sax != NULL) && 6016 (ctxt->sax->characters != NULL)) 6017 ctxt->sax->characters(ctxt->userData, 6018 in->cur, 1); 6019 NEXT; 6020 } 6021 break; 6022 } else { 6023 /* 6024 * check that the text sequence is complete 6025 * before handing out the data to the parser 6026 * to avoid problems with erroneous end of 6027 * data detection. 6028 */ 6029 if ((!terminate) && 6030 (htmlParseLookupSequence(ctxt, '<', 0, 0, 0) < 0)) 6031 goto done; 6032 ctxt->checkIndex = 0; 6033 #ifdef DEBUG_PUSH 6034 xmlGenericError(xmlGenericErrorContext, 6035 "HPP: Parsing char data\n"); 6036 #endif 6037 while ((ctxt->instate != XML_PARSER_EOF) && 6038 (cur != '<') && (in->cur < in->end)) { 6039 if (cur == '&') { 6040 htmlParseReference(ctxt); 6041 } else { 6042 htmlParseCharData(ctxt); 6043 } 6044 cur = in->cur[0]; 6045 } 6046 } 6047 } 6048 6049 break; 6050 } 6051 case XML_PARSER_END_TAG: 6052 if (avail < 2) 6053 goto done; 6054 if ((!terminate) && 6055 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0)) 6056 goto done; 6057 htmlParseEndTag(ctxt); 6058 if (ctxt->nameNr == 0) { 6059 ctxt->instate = XML_PARSER_EPILOG; 6060 } else { 6061 ctxt->instate = XML_PARSER_CONTENT; 6062 } 6063 ctxt->checkIndex = 0; 6064 #ifdef DEBUG_PUSH 6065 xmlGenericError(xmlGenericErrorContext, 6066 "HPP: entering CONTENT\n"); 6067 #endif 6068 break; 6069 case XML_PARSER_CDATA_SECTION: 6070 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR, 6071 "HPP: internal error, state == CDATA\n", 6072 NULL, NULL); 6073 ctxt->instate = XML_PARSER_CONTENT; 6074 
ctxt->checkIndex = 0; 6075 #ifdef DEBUG_PUSH 6076 xmlGenericError(xmlGenericErrorContext, 6077 "HPP: entering CONTENT\n"); 6078 #endif 6079 break; 6080 case XML_PARSER_DTD: 6081 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR, 6082 "HPP: internal error, state == DTD\n", 6083 NULL, NULL); 6084 ctxt->instate = XML_PARSER_CONTENT; 6085 ctxt->checkIndex = 0; 6086 #ifdef DEBUG_PUSH 6087 xmlGenericError(xmlGenericErrorContext, 6088 "HPP: entering CONTENT\n"); 6089 #endif 6090 break; 6091 case XML_PARSER_COMMENT: 6092 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR, 6093 "HPP: internal error, state == COMMENT\n", 6094 NULL, NULL); 6095 ctxt->instate = XML_PARSER_CONTENT; 6096 ctxt->checkIndex = 0; 6097 #ifdef DEBUG_PUSH 6098 xmlGenericError(xmlGenericErrorContext, 6099 "HPP: entering CONTENT\n"); 6100 #endif 6101 break; 6102 case XML_PARSER_PI: 6103 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR, 6104 "HPP: internal error, state == PI\n", 6105 NULL, NULL); 6106 ctxt->instate = XML_PARSER_CONTENT; 6107 ctxt->checkIndex = 0; 6108 #ifdef DEBUG_PUSH 6109 xmlGenericError(xmlGenericErrorContext, 6110 "HPP: entering CONTENT\n"); 6111 #endif 6112 break; 6113 case XML_PARSER_ENTITY_DECL: 6114 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR, 6115 "HPP: internal error, state == ENTITY_DECL\n", 6116 NULL, NULL); 6117 ctxt->instate = XML_PARSER_CONTENT; 6118 ctxt->checkIndex = 0; 6119 #ifdef DEBUG_PUSH 6120 xmlGenericError(xmlGenericErrorContext, 6121 "HPP: entering CONTENT\n"); 6122 #endif 6123 break; 6124 case XML_PARSER_ENTITY_VALUE: 6125 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR, 6126 "HPP: internal error, state == ENTITY_VALUE\n", 6127 NULL, NULL); 6128 ctxt->instate = XML_PARSER_CONTENT; 6129 ctxt->checkIndex = 0; 6130 #ifdef DEBUG_PUSH 6131 xmlGenericError(xmlGenericErrorContext, 6132 "HPP: entering DTD\n"); 6133 #endif 6134 break; 6135 case XML_PARSER_ATTRIBUTE_VALUE: 6136 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR, 6137 "HPP: internal error, state == ATTRIBUTE_VALUE\n", 6138 NULL, NULL); 6139 
ctxt->instate = XML_PARSER_START_TAG; 6140 ctxt->checkIndex = 0; 6141 #ifdef DEBUG_PUSH 6142 xmlGenericError(xmlGenericErrorContext, 6143 "HPP: entering START_TAG\n"); 6144 #endif 6145 break; 6146 case XML_PARSER_SYSTEM_LITERAL: 6147 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR, 6148 "HPP: internal error, state == XML_PARSER_SYSTEM_LITERAL\n", 6149 NULL, NULL); 6150 ctxt->instate = XML_PARSER_CONTENT; 6151 ctxt->checkIndex = 0; 6152 #ifdef DEBUG_PUSH 6153 xmlGenericError(xmlGenericErrorContext, 6154 "HPP: entering CONTENT\n"); 6155 #endif 6156 break; 6157 case XML_PARSER_IGNORE: 6158 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR, 6159 "HPP: internal error, state == XML_PARSER_IGNORE\n", 6160 NULL, NULL); 6161 ctxt->instate = XML_PARSER_CONTENT; 6162 ctxt->checkIndex = 0; 6163 #ifdef DEBUG_PUSH 6164 xmlGenericError(xmlGenericErrorContext, 6165 "HPP: entering CONTENT\n"); 6166 #endif 6167 break; 6168 case XML_PARSER_PUBLIC_LITERAL: 6169 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR, 6170 "HPP: internal error, state == XML_PARSER_LITERAL\n", 6171 NULL, NULL); 6172 ctxt->instate = XML_PARSER_CONTENT; 6173 ctxt->checkIndex = 0; 6174 #ifdef DEBUG_PUSH 6175 xmlGenericError(xmlGenericErrorContext, 6176 "HPP: entering CONTENT\n"); 6177 #endif 6178 break; 6179 6180 } 6181 } 6182 done: 6183 if ((avail == 0) && (terminate)) { 6184 htmlAutoCloseOnEnd(ctxt); 6185 if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) { 6186 /* 6187 * SAX: end of the document processing. 
6188 */ 6189 ctxt->instate = XML_PARSER_EOF; 6190 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL)) 6191 ctxt->sax->endDocument(ctxt->userData); 6192 } 6193 } 6194 if ((!(ctxt->options & HTML_PARSE_NODEFDTD)) && (ctxt->myDoc != NULL) && 6195 ((terminate) || (ctxt->instate == XML_PARSER_EOF) || 6196 (ctxt->instate == XML_PARSER_EPILOG))) { 6197 xmlDtdPtr dtd; 6198 dtd = xmlGetIntSubset(ctxt->myDoc); 6199 if (dtd == NULL) 6200 ctxt->myDoc->intSubset = 6201 xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "html", 6202 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN", 6203 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd"); 6204 } 6205 #ifdef DEBUG_PUSH 6206 xmlGenericError(xmlGenericErrorContext, "HPP: done %d\n", ret); 6207 #endif 6208 return(ret); 6209 } 6210 6211 /** 6212 * htmlParseChunk: 6213 * @ctxt: an HTML parser context 6214 * @chunk: an char array 6215 * @size: the size in byte of the chunk 6216 * @terminate: last chunk indicator 6217 * 6218 * Parse a Chunk of memory 6219 * 6220 * Returns zero if no error, the xmlParserErrors otherwise. 
6221 */ 6222 int 6223 htmlParseChunk(htmlParserCtxtPtr ctxt, const char *chunk, int size, 6224 int terminate) { 6225 if ((ctxt == NULL) || (ctxt->input == NULL)) { 6226 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR, 6227 "htmlParseChunk: context error\n", NULL, NULL); 6228 return(XML_ERR_INTERNAL_ERROR); 6229 } 6230 if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) && 6231 (ctxt->input->buf != NULL) && (ctxt->instate != XML_PARSER_EOF)) { 6232 size_t base = xmlBufGetInputBase(ctxt->input->buf->buffer, ctxt->input); 6233 size_t cur = ctxt->input->cur - ctxt->input->base; 6234 int res; 6235 6236 res = xmlParserInputBufferPush(ctxt->input->buf, size, chunk); 6237 xmlBufSetInputBaseCur(ctxt->input->buf->buffer, ctxt->input, base, cur); 6238 if (res < 0) { 6239 ctxt->errNo = XML_PARSER_EOF; 6240 ctxt->disableSAX = 1; 6241 return (XML_PARSER_EOF); 6242 } 6243 #ifdef DEBUG_PUSH 6244 xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size); 6245 #endif 6246 6247 #if 0 6248 if ((terminate) || (ctxt->input->buf->buffer->use > 80)) 6249 htmlParseTryOrFinish(ctxt, terminate); 6250 #endif 6251 } else if (ctxt->instate != XML_PARSER_EOF) { 6252 if ((ctxt->input != NULL) && ctxt->input->buf != NULL) { 6253 xmlParserInputBufferPtr in = ctxt->input->buf; 6254 if ((in->encoder != NULL) && (in->buffer != NULL) && 6255 (in->raw != NULL)) { 6256 int nbchars; 6257 size_t base = xmlBufGetInputBase(in->buffer, ctxt->input); 6258 size_t current = ctxt->input->cur - ctxt->input->base; 6259 6260 nbchars = xmlCharEncInput(in, terminate); 6261 xmlBufSetInputBaseCur(in->buffer, ctxt->input, base, current); 6262 if (nbchars < 0) { 6263 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING, 6264 "encoder error\n", NULL, NULL); 6265 return(XML_ERR_INVALID_ENCODING); 6266 } 6267 } 6268 } 6269 } 6270 htmlParseTryOrFinish(ctxt, terminate); 6271 if (terminate) { 6272 if ((ctxt->instate != XML_PARSER_EOF) && 6273 (ctxt->instate != XML_PARSER_EPILOG) && 6274 (ctxt->instate != XML_PARSER_MISC)) { 
6275 ctxt->errNo = XML_ERR_DOCUMENT_END; 6276 ctxt->wellFormed = 0; 6277 } 6278 if (ctxt->instate != XML_PARSER_EOF) { 6279 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL)) 6280 ctxt->sax->endDocument(ctxt->userData); 6281 } 6282 ctxt->instate = XML_PARSER_EOF; 6283 } 6284 return((xmlParserErrors) ctxt->errNo); 6285 } 6286 6287 /************************************************************************ 6288 * * 6289 * User entry points * 6290 * * 6291 ************************************************************************/ 6292 6293 /** 6294 * htmlCreatePushParserCtxt: 6295 * @sax: a SAX handler 6296 * @user_data: The user data returned on SAX callbacks 6297 * @chunk: a pointer to an array of chars 6298 * @size: number of chars in the array 6299 * @filename: an optional file name or URI 6300 * @enc: an optional encoding 6301 * 6302 * Create a parser context for using the HTML parser in push mode 6303 * The value of @filename is used for fetching external entities 6304 * and error/warning reports. 
6305 * 6306 * Returns the new parser context or NULL 6307 */ 6308 htmlParserCtxtPtr 6309 htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax, void *user_data, 6310 const char *chunk, int size, const char *filename, 6311 xmlCharEncoding enc) { 6312 htmlParserCtxtPtr ctxt; 6313 htmlParserInputPtr inputStream; 6314 xmlParserInputBufferPtr buf; 6315 6316 xmlInitParser(); 6317 6318 buf = xmlAllocParserInputBuffer(enc); 6319 if (buf == NULL) return(NULL); 6320 6321 ctxt = htmlNewParserCtxt(); 6322 if (ctxt == NULL) { 6323 xmlFreeParserInputBuffer(buf); 6324 return(NULL); 6325 } 6326 if(enc==XML_CHAR_ENCODING_UTF8 || buf->encoder) 6327 ctxt->charset=XML_CHAR_ENCODING_UTF8; 6328 if (sax != NULL) { 6329 if (ctxt->sax != (xmlSAXHandlerPtr) &htmlDefaultSAXHandler) 6330 xmlFree(ctxt->sax); 6331 ctxt->sax = (htmlSAXHandlerPtr) xmlMalloc(sizeof(htmlSAXHandler)); 6332 if (ctxt->sax == NULL) { 6333 xmlFree(buf); 6334 xmlFree(ctxt); 6335 return(NULL); 6336 } 6337 memcpy(ctxt->sax, sax, sizeof(htmlSAXHandler)); 6338 if (user_data != NULL) 6339 ctxt->userData = user_data; 6340 } 6341 if (filename == NULL) { 6342 ctxt->directory = NULL; 6343 } else { 6344 ctxt->directory = xmlParserGetDirectory(filename); 6345 } 6346 6347 inputStream = htmlNewInputStream(ctxt); 6348 if (inputStream == NULL) { 6349 xmlFreeParserCtxt(ctxt); 6350 xmlFree(buf); 6351 return(NULL); 6352 } 6353 6354 if (filename == NULL) 6355 inputStream->filename = NULL; 6356 else 6357 inputStream->filename = (char *) 6358 xmlCanonicPath((const xmlChar *) filename); 6359 inputStream->buf = buf; 6360 xmlBufResetInput(buf->buffer, inputStream); 6361 6362 inputPush(ctxt, inputStream); 6363 6364 if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) && 6365 (ctxt->input->buf != NULL)) { 6366 size_t base = xmlBufGetInputBase(ctxt->input->buf->buffer, ctxt->input); 6367 size_t cur = ctxt->input->cur - ctxt->input->base; 6368 6369 xmlParserInputBufferPush(ctxt->input->buf, size, chunk); 6370 6371 
xmlBufSetInputBaseCur(ctxt->input->buf->buffer, ctxt->input, base, cur); 6372 #ifdef DEBUG_PUSH 6373 xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size); 6374 #endif 6375 } 6376 ctxt->progressive = 1; 6377 6378 return(ctxt); 6379 } 6380 #endif /* LIBXML_PUSH_ENABLED */ 6381 6382 /** 6383 * htmlSAXParseDoc: 6384 * @cur: a pointer to an array of xmlChar 6385 * @encoding: a free form C string describing the HTML document encoding, or NULL 6386 * @sax: the SAX handler block 6387 * @userData: if using SAX, this pointer will be provided on callbacks. 6388 * 6389 * Parse an HTML in-memory document. If sax is not NULL, use the SAX callbacks 6390 * to handle parse events. If sax is NULL, fallback to the default DOM 6391 * behavior and return a tree. 6392 * 6393 * Returns the resulting document tree unless SAX is NULL or the document is 6394 * not well formed. 6395 */ 6396 6397 htmlDocPtr 6398 htmlSAXParseDoc(const xmlChar *cur, const char *encoding, 6399 htmlSAXHandlerPtr sax, void *userData) { 6400 htmlDocPtr ret; 6401 htmlParserCtxtPtr ctxt; 6402 6403 xmlInitParser(); 6404 6405 if (cur == NULL) return(NULL); 6406 6407 6408 ctxt = htmlCreateDocParserCtxt(cur, encoding); 6409 if (ctxt == NULL) return(NULL); 6410 if (sax != NULL) { 6411 if (ctxt->sax != NULL) xmlFree (ctxt->sax); 6412 ctxt->sax = sax; 6413 ctxt->userData = userData; 6414 } 6415 6416 htmlParseDocument(ctxt); 6417 ret = ctxt->myDoc; 6418 if (sax != NULL) { 6419 ctxt->sax = NULL; 6420 ctxt->userData = NULL; 6421 } 6422 htmlFreeParserCtxt(ctxt); 6423 6424 return(ret); 6425 } 6426 6427 /** 6428 * htmlParseDoc: 6429 * @cur: a pointer to an array of xmlChar 6430 * @encoding: a free form C string describing the HTML document encoding, or NULL 6431 * 6432 * parse an HTML in-memory document and build a tree. 
6433 * 6434 * Returns the resulting document tree 6435 */ 6436 6437 htmlDocPtr 6438 htmlParseDoc(const xmlChar *cur, const char *encoding) { 6439 return(htmlSAXParseDoc(cur, encoding, NULL, NULL)); 6440 } 6441 6442 6443 /** 6444 * htmlCreateFileParserCtxt: 6445 * @filename: the filename 6446 * @encoding: a free form C string describing the HTML document encoding, or NULL 6447 * 6448 * Create a parser context for a file content. 6449 * Automatic support for ZLIB/Compress compressed document is provided 6450 * by default if found at compile-time. 6451 * 6452 * Returns the new parser context or NULL 6453 */ 6454 htmlParserCtxtPtr 6455 htmlCreateFileParserCtxt(const char *filename, const char *encoding) 6456 { 6457 htmlParserCtxtPtr ctxt; 6458 htmlParserInputPtr inputStream; 6459 char *canonicFilename; 6460 /* htmlCharEncoding enc; */ 6461 xmlChar *content, *content_line = (xmlChar *) "charset="; 6462 6463 if (filename == NULL) 6464 return(NULL); 6465 6466 ctxt = htmlNewParserCtxt(); 6467 if (ctxt == NULL) { 6468 return(NULL); 6469 } 6470 canonicFilename = (char *) xmlCanonicPath((const xmlChar *) filename); 6471 if (canonicFilename == NULL) { 6472 #ifdef LIBXML_SAX1_ENABLED 6473 if (xmlDefaultSAXHandler.error != NULL) { 6474 xmlDefaultSAXHandler.error(NULL, "out of memory\n"); 6475 } 6476 #endif 6477 xmlFreeParserCtxt(ctxt); 6478 return(NULL); 6479 } 6480 6481 inputStream = xmlLoadExternalEntity(canonicFilename, NULL, ctxt); 6482 xmlFree(canonicFilename); 6483 if (inputStream == NULL) { 6484 xmlFreeParserCtxt(ctxt); 6485 return(NULL); 6486 } 6487 6488 inputPush(ctxt, inputStream); 6489 6490 /* set encoding */ 6491 if (encoding) { 6492 size_t l = strlen(encoding); 6493 6494 if (l < 1000) { 6495 content = xmlMallocAtomic (xmlStrlen(content_line) + l + 1); 6496 if (content) { 6497 strcpy ((char *)content, (char *)content_line); 6498 strcat ((char *)content, (char *)encoding); 6499 htmlCheckEncoding (ctxt, content); 6500 xmlFree (content); 6501 } 6502 } 6503 } 6504 6505 
return(ctxt); 6506 } 6507 6508 /** 6509 * htmlSAXParseFile: 6510 * @filename: the filename 6511 * @encoding: a free form C string describing the HTML document encoding, or NULL 6512 * @sax: the SAX handler block 6513 * @userData: if using SAX, this pointer will be provided on callbacks. 6514 * 6515 * parse an HTML file and build a tree. Automatic support for ZLIB/Compress 6516 * compressed document is provided by default if found at compile-time. 6517 * It use the given SAX function block to handle the parsing callback. 6518 * If sax is NULL, fallback to the default DOM tree building routines. 6519 * 6520 * Returns the resulting document tree unless SAX is NULL or the document is 6521 * not well formed. 6522 */ 6523 6524 htmlDocPtr 6525 htmlSAXParseFile(const char *filename, const char *encoding, htmlSAXHandlerPtr sax, 6526 void *userData) { 6527 htmlDocPtr ret; 6528 htmlParserCtxtPtr ctxt; 6529 htmlSAXHandlerPtr oldsax = NULL; 6530 6531 xmlInitParser(); 6532 6533 ctxt = htmlCreateFileParserCtxt(filename, encoding); 6534 if (ctxt == NULL) return(NULL); 6535 if (sax != NULL) { 6536 oldsax = ctxt->sax; 6537 ctxt->sax = sax; 6538 ctxt->userData = userData; 6539 } 6540 6541 htmlParseDocument(ctxt); 6542 6543 ret = ctxt->myDoc; 6544 if (sax != NULL) { 6545 ctxt->sax = oldsax; 6546 ctxt->userData = NULL; 6547 } 6548 htmlFreeParserCtxt(ctxt); 6549 6550 return(ret); 6551 } 6552 6553 /** 6554 * htmlParseFile: 6555 * @filename: the filename 6556 * @encoding: a free form C string describing the HTML document encoding, or NULL 6557 * 6558 * parse an HTML file and build a tree. Automatic support for ZLIB/Compress 6559 * compressed document is provided by default if found at compile-time. 
6560 * 6561 * Returns the resulting document tree 6562 */ 6563 6564 htmlDocPtr 6565 htmlParseFile(const char *filename, const char *encoding) { 6566 return(htmlSAXParseFile(filename, encoding, NULL, NULL)); 6567 } 6568 6569 /** 6570 * htmlHandleOmittedElem: 6571 * @val: int 0 or 1 6572 * 6573 * Set and return the previous value for handling HTML omitted tags. 6574 * 6575 * Returns the last value for 0 for no handling, 1 for auto insertion. 6576 */ 6577 6578 int 6579 htmlHandleOmittedElem(int val) { 6580 int old = htmlOmittedDefaultValue; 6581 6582 htmlOmittedDefaultValue = val; 6583 return(old); 6584 } 6585 6586 /** 6587 * htmlElementAllowedHere: 6588 * @parent: HTML parent element 6589 * @elt: HTML element 6590 * 6591 * Checks whether an HTML element may be a direct child of a parent element. 6592 * Note - doesn't check for deprecated elements 6593 * 6594 * Returns 1 if allowed; 0 otherwise. 6595 */ 6596 int 6597 htmlElementAllowedHere(const htmlElemDesc* parent, const xmlChar* elt) { 6598 const char** p ; 6599 6600 if ( ! elt || ! parent || ! parent->subelts ) 6601 return 0 ; 6602 6603 for ( p = parent->subelts; *p; ++p ) 6604 if ( !xmlStrcmp((const xmlChar *)*p, elt) ) 6605 return 1 ; 6606 6607 return 0 ; 6608 } 6609 /** 6610 * htmlElementStatusHere: 6611 * @parent: HTML parent element 6612 * @elt: HTML element 6613 * 6614 * Checks whether an HTML element may be a direct child of a parent element. 6615 * and if so whether it is valid or deprecated. 6616 * 6617 * Returns one of HTML_VALID, HTML_DEPRECATED, HTML_INVALID 6618 */ 6619 htmlStatus 6620 htmlElementStatusHere(const htmlElemDesc* parent, const htmlElemDesc* elt) { 6621 if ( ! parent || ! elt ) 6622 return HTML_INVALID ; 6623 if ( ! htmlElementAllowedHere(parent, (const xmlChar*) elt->name ) ) 6624 return HTML_INVALID ; 6625 6626 return ( elt->dtd == 0 ) ? 
HTML_VALID : HTML_DEPRECATED ; 6627 } 6628 /** 6629 * htmlAttrAllowed: 6630 * @elt: HTML element 6631 * @attr: HTML attribute 6632 * @legacy: whether to allow deprecated attributes 6633 * 6634 * Checks whether an attribute is valid for an element 6635 * Has full knowledge of Required and Deprecated attributes 6636 * 6637 * Returns one of HTML_REQUIRED, HTML_VALID, HTML_DEPRECATED, HTML_INVALID 6638 */ 6639 htmlStatus 6640 htmlAttrAllowed(const htmlElemDesc* elt, const xmlChar* attr, int legacy) { 6641 const char** p ; 6642 6643 if ( !elt || ! attr ) 6644 return HTML_INVALID ; 6645 6646 if ( elt->attrs_req ) 6647 for ( p = elt->attrs_req; *p; ++p) 6648 if ( !xmlStrcmp((const xmlChar*)*p, attr) ) 6649 return HTML_REQUIRED ; 6650 6651 if ( elt->attrs_opt ) 6652 for ( p = elt->attrs_opt; *p; ++p) 6653 if ( !xmlStrcmp((const xmlChar*)*p, attr) ) 6654 return HTML_VALID ; 6655 6656 if ( legacy && elt->attrs_depr ) 6657 for ( p = elt->attrs_depr; *p; ++p) 6658 if ( !xmlStrcmp((const xmlChar*)*p, attr) ) 6659 return HTML_DEPRECATED ; 6660 6661 return HTML_INVALID ; 6662 } 6663 /** 6664 * htmlNodeStatus: 6665 * @node: an htmlNodePtr in a tree 6666 * @legacy: whether to allow deprecated elements (YES is faster here 6667 * for Element nodes) 6668 * 6669 * Checks whether the tree node is valid. Experimental (the author 6670 * only uses the HTML enhancements in a SAX parser) 6671 * 6672 * Return: for Element nodes, a return from htmlElementAllowedHere (if 6673 * legacy allowed) or htmlElementStatusHere (otherwise). 6674 * for Attribute nodes, a return from htmlAttrAllowed 6675 * for other nodes, HTML_NA (no checks performed) 6676 */ 6677 htmlStatus 6678 htmlNodeStatus(const htmlNodePtr node, int legacy) { 6679 if ( ! node ) 6680 return HTML_INVALID ; 6681 6682 switch ( node->type ) { 6683 case XML_ELEMENT_NODE: 6684 return legacy 6685 ? ( htmlElementAllowedHere ( 6686 htmlTagLookup(node->parent->name) , node->name 6687 ) ? 
HTML_VALID : HTML_INVALID ) 6688 : htmlElementStatusHere( 6689 htmlTagLookup(node->parent->name) , 6690 htmlTagLookup(node->name) ) 6691 ; 6692 case XML_ATTRIBUTE_NODE: 6693 return htmlAttrAllowed( 6694 htmlTagLookup(node->parent->name) , node->name, legacy) ; 6695 default: return HTML_NA ; 6696 } 6697 } 6698 /************************************************************************ 6699 * * 6700 * New set (2.6.0) of simpler and more flexible APIs * 6701 * * 6702 ************************************************************************/ 6703 /** 6704 * DICT_FREE: 6705 * @str: a string 6706 * 6707 * Free a string if it is not owned by the "dict" dictionary in the 6708 * current scope 6709 */ 6710 #define DICT_FREE(str) \ 6711 if ((str) && ((!dict) || \ 6712 (xmlDictOwns(dict, (const xmlChar *)(str)) == 0))) \ 6713 xmlFree((char *)(str)); 6714 6715 /** 6716 * htmlCtxtReset: 6717 * @ctxt: an HTML parser context 6718 * 6719 * Reset a parser context 6720 */ 6721 void 6722 htmlCtxtReset(htmlParserCtxtPtr ctxt) 6723 { 6724 xmlParserInputPtr input; 6725 xmlDictPtr dict; 6726 6727 if (ctxt == NULL) 6728 return; 6729 6730 xmlInitParser(); 6731 dict = ctxt->dict; 6732 6733 while ((input = inputPop(ctxt)) != NULL) { /* Non consuming */ 6734 xmlFreeInputStream(input); 6735 } 6736 ctxt->inputNr = 0; 6737 ctxt->input = NULL; 6738 6739 ctxt->spaceNr = 0; 6740 if (ctxt->spaceTab != NULL) { 6741 ctxt->spaceTab[0] = -1; 6742 ctxt->space = &ctxt->spaceTab[0]; 6743 } else { 6744 ctxt->space = NULL; 6745 } 6746 6747 6748 ctxt->nodeNr = 0; 6749 ctxt->node = NULL; 6750 6751 ctxt->nameNr = 0; 6752 ctxt->name = NULL; 6753 6754 DICT_FREE(ctxt->version); 6755 ctxt->version = NULL; 6756 DICT_FREE(ctxt->encoding); 6757 ctxt->encoding = NULL; 6758 DICT_FREE(ctxt->directory); 6759 ctxt->directory = NULL; 6760 DICT_FREE(ctxt->extSubURI); 6761 ctxt->extSubURI = NULL; 6762 DICT_FREE(ctxt->extSubSystem); 6763 ctxt->extSubSystem = NULL; 6764 if (ctxt->myDoc != NULL) 6765 xmlFreeDoc(ctxt->myDoc); 6766 
ctxt->myDoc = NULL; 6767 6768 ctxt->standalone = -1; 6769 ctxt->hasExternalSubset = 0; 6770 ctxt->hasPErefs = 0; 6771 ctxt->html = 1; 6772 ctxt->external = 0; 6773 ctxt->instate = XML_PARSER_START; 6774 ctxt->token = 0; 6775 6776 ctxt->wellFormed = 1; 6777 ctxt->nsWellFormed = 1; 6778 ctxt->disableSAX = 0; 6779 ctxt->valid = 1; 6780 ctxt->vctxt.userData = ctxt; 6781 ctxt->vctxt.error = xmlParserValidityError; 6782 ctxt->vctxt.warning = xmlParserValidityWarning; 6783 ctxt->record_info = 0; 6784 ctxt->checkIndex = 0; 6785 ctxt->inSubset = 0; 6786 ctxt->errNo = XML_ERR_OK; 6787 ctxt->depth = 0; 6788 ctxt->charset = XML_CHAR_ENCODING_NONE; 6789 ctxt->catalogs = NULL; 6790 xmlInitNodeInfoSeq(&ctxt->node_seq); 6791 6792 if (ctxt->attsDefault != NULL) { 6793 xmlHashFree(ctxt->attsDefault, xmlHashDefaultDeallocator); 6794 ctxt->attsDefault = NULL; 6795 } 6796 if (ctxt->attsSpecial != NULL) { 6797 xmlHashFree(ctxt->attsSpecial, NULL); 6798 ctxt->attsSpecial = NULL; 6799 } 6800 } 6801 6802 /** 6803 * htmlCtxtUseOptions: 6804 * @ctxt: an HTML parser context 6805 * @options: a combination of htmlParserOption(s) 6806 * 6807 * Applies the options to the parser context 6808 * 6809 * Returns 0 in case of success, the set of unknown or unimplemented options 6810 * in case of error. 
6811 */ 6812 int 6813 htmlCtxtUseOptions(htmlParserCtxtPtr ctxt, int options) 6814 { 6815 if (ctxt == NULL) 6816 return(-1); 6817 6818 if (options & HTML_PARSE_NOWARNING) { 6819 ctxt->sax->warning = NULL; 6820 ctxt->vctxt.warning = NULL; 6821 options -= XML_PARSE_NOWARNING; 6822 ctxt->options |= XML_PARSE_NOWARNING; 6823 } 6824 if (options & HTML_PARSE_NOERROR) { 6825 ctxt->sax->error = NULL; 6826 ctxt->vctxt.error = NULL; 6827 ctxt->sax->fatalError = NULL; 6828 options -= XML_PARSE_NOERROR; 6829 ctxt->options |= XML_PARSE_NOERROR; 6830 } 6831 if (options & HTML_PARSE_PEDANTIC) { 6832 ctxt->pedantic = 1; 6833 options -= XML_PARSE_PEDANTIC; 6834 ctxt->options |= XML_PARSE_PEDANTIC; 6835 } else 6836 ctxt->pedantic = 0; 6837 if (options & XML_PARSE_NOBLANKS) { 6838 ctxt->keepBlanks = 0; 6839 ctxt->sax->ignorableWhitespace = xmlSAX2IgnorableWhitespace; 6840 options -= XML_PARSE_NOBLANKS; 6841 ctxt->options |= XML_PARSE_NOBLANKS; 6842 } else 6843 ctxt->keepBlanks = 1; 6844 if (options & HTML_PARSE_RECOVER) { 6845 ctxt->recovery = 1; 6846 options -= HTML_PARSE_RECOVER; 6847 } else 6848 ctxt->recovery = 0; 6849 if (options & HTML_PARSE_COMPACT) { 6850 ctxt->options |= HTML_PARSE_COMPACT; 6851 options -= HTML_PARSE_COMPACT; 6852 } 6853 if (options & XML_PARSE_HUGE) { 6854 ctxt->options |= XML_PARSE_HUGE; 6855 options -= XML_PARSE_HUGE; 6856 } 6857 if (options & HTML_PARSE_NODEFDTD) { 6858 ctxt->options |= HTML_PARSE_NODEFDTD; 6859 options -= HTML_PARSE_NODEFDTD; 6860 } 6861 if (options & HTML_PARSE_IGNORE_ENC) { 6862 ctxt->options |= HTML_PARSE_IGNORE_ENC; 6863 options -= HTML_PARSE_IGNORE_ENC; 6864 } 6865 if (options & HTML_PARSE_NOIMPLIED) { 6866 ctxt->options |= HTML_PARSE_NOIMPLIED; 6867 options -= HTML_PARSE_NOIMPLIED; 6868 } 6869 ctxt->dictNames = 0; 6870 return (options); 6871 } 6872 6873 /** 6874 * htmlDoRead: 6875 * @ctxt: an HTML parser context 6876 * @URL: the base URL to use for the document 6877 * @encoding: the document encoding, or NULL 6878 * @options: a 
combination of htmlParserOption(s)
 * @reuse:  keep the context for reuse
 *
 * Common front-end for the htmlRead functions
 *
 * Returns the resulting document tree or NULL
 */
static htmlDocPtr
htmlDoRead(htmlParserCtxtPtr ctxt, const char *URL, const char *encoding,
          int options, int reuse)
{
    htmlDocPtr ret;

    htmlCtxtUseOptions(ctxt, options);
    ctxt->html = 1;
    if (encoding != NULL) {
        xmlCharEncodingHandlerPtr hdlr;

        hdlr = xmlFindCharEncodingHandler(encoding);
        if (hdlr != NULL) {
            xmlSwitchToEncoding(ctxt, hdlr);
            /*
             * Record the requested encoding name on the current input.
             * The input pointer is checked first, matching the guard used
             * for the filename below, so a context without an input is
             * not dereferenced.
             */
            if (ctxt->input != NULL) {
                if (ctxt->input->encoding != NULL)
                    xmlFree((xmlChar *) ctxt->input->encoding);
                ctxt->input->encoding = xmlStrdup((xmlChar *)encoding);
            }
        }
    }
    /* Only supply a filename when the input does not already carry one. */
    if ((URL != NULL) && (ctxt->input != NULL) &&
        (ctxt->input->filename == NULL))
        ctxt->input->filename = (char *) xmlStrdup((const xmlChar *) URL);
    htmlParseDocument(ctxt);
    /* Hand the document to the caller and detach it from the context. */
    ret = ctxt->myDoc;
    ctxt->myDoc = NULL;
    if (!reuse) {
        /* If the returned document shares the context dictionary, drop the
         * context's reference before freeing the context. */
        if ((ctxt->dictNames) &&
            (ret != NULL) &&
            (ret->dict == ctxt->dict))
            ctxt->dict = NULL;
        xmlFreeParserCtxt(ctxt);
    }
    return (ret);
}

/**
 * htmlReadDoc:
 * @cur:  a pointer to a zero terminated string
 * @URL:  the base URL to use for the document
 * @encoding:  the document encoding, or NULL
 * @options:  a combination of htmlParserOption(s)
 *
 * parse an HTML in-memory document and build a tree.
6928 * 6929 * Returns the resulting document tree 6930 */ 6931 htmlDocPtr 6932 htmlReadDoc(const xmlChar * cur, const char *URL, const char *encoding, int options) 6933 { 6934 htmlParserCtxtPtr ctxt; 6935 6936 if (cur == NULL) 6937 return (NULL); 6938 6939 xmlInitParser(); 6940 ctxt = htmlCreateDocParserCtxt(cur, NULL); 6941 if (ctxt == NULL) 6942 return (NULL); 6943 return (htmlDoRead(ctxt, URL, encoding, options, 0)); 6944 } 6945 6946 /** 6947 * htmlReadFile: 6948 * @filename: a file or URL 6949 * @encoding: the document encoding, or NULL 6950 * @options: a combination of htmlParserOption(s) 6951 * 6952 * parse an XML file from the filesystem or the network. 6953 * 6954 * Returns the resulting document tree 6955 */ 6956 htmlDocPtr 6957 htmlReadFile(const char *filename, const char *encoding, int options) 6958 { 6959 htmlParserCtxtPtr ctxt; 6960 6961 xmlInitParser(); 6962 ctxt = htmlCreateFileParserCtxt(filename, encoding); 6963 if (ctxt == NULL) 6964 return (NULL); 6965 return (htmlDoRead(ctxt, NULL, NULL, options, 0)); 6966 } 6967 6968 /** 6969 * htmlReadMemory: 6970 * @buffer: a pointer to a char array 6971 * @size: the size of the array 6972 * @URL: the base URL to use for the document 6973 * @encoding: the document encoding, or NULL 6974 * @options: a combination of htmlParserOption(s) 6975 * 6976 * parse an XML in-memory document and build a tree. 
6977 * 6978 * Returns the resulting document tree 6979 */ 6980 htmlDocPtr 6981 htmlReadMemory(const char *buffer, int size, const char *URL, const char *encoding, int options) 6982 { 6983 htmlParserCtxtPtr ctxt; 6984 6985 xmlInitParser(); 6986 ctxt = xmlCreateMemoryParserCtxt(buffer, size); 6987 if (ctxt == NULL) 6988 return (NULL); 6989 htmlDefaultSAXHandlerInit(); 6990 if (ctxt->sax != NULL) 6991 memcpy(ctxt->sax, &htmlDefaultSAXHandler, sizeof(xmlSAXHandlerV1)); 6992 return (htmlDoRead(ctxt, URL, encoding, options, 0)); 6993 } 6994 6995 /** 6996 * htmlReadFd: 6997 * @fd: an open file descriptor 6998 * @URL: the base URL to use for the document 6999 * @encoding: the document encoding, or NULL 7000 * @options: a combination of htmlParserOption(s) 7001 * 7002 * parse an XML from a file descriptor and build a tree. 7003 * 7004 * Returns the resulting document tree 7005 */ 7006 htmlDocPtr 7007 htmlReadFd(int fd, const char *URL, const char *encoding, int options) 7008 { 7009 htmlParserCtxtPtr ctxt; 7010 xmlParserInputBufferPtr input; 7011 xmlParserInputPtr stream; 7012 7013 if (fd < 0) 7014 return (NULL); 7015 xmlInitParser(); 7016 7017 xmlInitParser(); 7018 input = xmlParserInputBufferCreateFd(fd, XML_CHAR_ENCODING_NONE); 7019 if (input == NULL) 7020 return (NULL); 7021 ctxt = xmlNewParserCtxt(); 7022 if (ctxt == NULL) { 7023 xmlFreeParserInputBuffer(input); 7024 return (NULL); 7025 } 7026 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE); 7027 if (stream == NULL) { 7028 xmlFreeParserInputBuffer(input); 7029 xmlFreeParserCtxt(ctxt); 7030 return (NULL); 7031 } 7032 inputPush(ctxt, stream); 7033 return (htmlDoRead(ctxt, URL, encoding, options, 0)); 7034 } 7035 7036 /** 7037 * htmlReadIO: 7038 * @ioread: an I/O read function 7039 * @ioclose: an I/O close function 7040 * @ioctx: an I/O handler 7041 * @URL: the base URL to use for the document 7042 * @encoding: the document encoding, or NULL 7043 * @options: a combination of htmlParserOption(s) 7044 * 
7045 * parse an HTML document from I/O functions and source and build a tree. 7046 * 7047 * Returns the resulting document tree 7048 */ 7049 htmlDocPtr 7050 htmlReadIO(xmlInputReadCallback ioread, xmlInputCloseCallback ioclose, 7051 void *ioctx, const char *URL, const char *encoding, int options) 7052 { 7053 htmlParserCtxtPtr ctxt; 7054 xmlParserInputBufferPtr input; 7055 xmlParserInputPtr stream; 7056 7057 if (ioread == NULL) 7058 return (NULL); 7059 xmlInitParser(); 7060 7061 input = xmlParserInputBufferCreateIO(ioread, ioclose, ioctx, 7062 XML_CHAR_ENCODING_NONE); 7063 if (input == NULL) { 7064 if (ioclose != NULL) 7065 ioclose(ioctx); 7066 return (NULL); 7067 } 7068 ctxt = htmlNewParserCtxt(); 7069 if (ctxt == NULL) { 7070 xmlFreeParserInputBuffer(input); 7071 return (NULL); 7072 } 7073 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE); 7074 if (stream == NULL) { 7075 xmlFreeParserInputBuffer(input); 7076 xmlFreeParserCtxt(ctxt); 7077 return (NULL); 7078 } 7079 inputPush(ctxt, stream); 7080 return (htmlDoRead(ctxt, URL, encoding, options, 0)); 7081 } 7082 7083 /** 7084 * htmlCtxtReadDoc: 7085 * @ctxt: an HTML parser context 7086 * @cur: a pointer to a zero terminated string 7087 * @URL: the base URL to use for the document 7088 * @encoding: the document encoding, or NULL 7089 * @options: a combination of htmlParserOption(s) 7090 * 7091 * parse an XML in-memory document and build a tree. 
7092 * This reuses the existing @ctxt parser context 7093 * 7094 * Returns the resulting document tree 7095 */ 7096 htmlDocPtr 7097 htmlCtxtReadDoc(htmlParserCtxtPtr ctxt, const xmlChar * cur, 7098 const char *URL, const char *encoding, int options) 7099 { 7100 xmlParserInputPtr stream; 7101 7102 if (cur == NULL) 7103 return (NULL); 7104 if (ctxt == NULL) 7105 return (NULL); 7106 xmlInitParser(); 7107 7108 htmlCtxtReset(ctxt); 7109 7110 stream = xmlNewStringInputStream(ctxt, cur); 7111 if (stream == NULL) { 7112 return (NULL); 7113 } 7114 inputPush(ctxt, stream); 7115 return (htmlDoRead(ctxt, URL, encoding, options, 1)); 7116 } 7117 7118 /** 7119 * htmlCtxtReadFile: 7120 * @ctxt: an HTML parser context 7121 * @filename: a file or URL 7122 * @encoding: the document encoding, or NULL 7123 * @options: a combination of htmlParserOption(s) 7124 * 7125 * parse an XML file from the filesystem or the network. 7126 * This reuses the existing @ctxt parser context 7127 * 7128 * Returns the resulting document tree 7129 */ 7130 htmlDocPtr 7131 htmlCtxtReadFile(htmlParserCtxtPtr ctxt, const char *filename, 7132 const char *encoding, int options) 7133 { 7134 xmlParserInputPtr stream; 7135 7136 if (filename == NULL) 7137 return (NULL); 7138 if (ctxt == NULL) 7139 return (NULL); 7140 xmlInitParser(); 7141 7142 htmlCtxtReset(ctxt); 7143 7144 stream = xmlLoadExternalEntity(filename, NULL, ctxt); 7145 if (stream == NULL) { 7146 return (NULL); 7147 } 7148 inputPush(ctxt, stream); 7149 return (htmlDoRead(ctxt, NULL, encoding, options, 1)); 7150 } 7151 7152 /** 7153 * htmlCtxtReadMemory: 7154 * @ctxt: an HTML parser context 7155 * @buffer: a pointer to a char array 7156 * @size: the size of the array 7157 * @URL: the base URL to use for the document 7158 * @encoding: the document encoding, or NULL 7159 * @options: a combination of htmlParserOption(s) 7160 * 7161 * parse an XML in-memory document and build a tree. 
7162 * This reuses the existing @ctxt parser context 7163 * 7164 * Returns the resulting document tree 7165 */ 7166 htmlDocPtr 7167 htmlCtxtReadMemory(htmlParserCtxtPtr ctxt, const char *buffer, int size, 7168 const char *URL, const char *encoding, int options) 7169 { 7170 xmlParserInputBufferPtr input; 7171 xmlParserInputPtr stream; 7172 7173 if (ctxt == NULL) 7174 return (NULL); 7175 if (buffer == NULL) 7176 return (NULL); 7177 xmlInitParser(); 7178 7179 htmlCtxtReset(ctxt); 7180 7181 input = xmlParserInputBufferCreateMem(buffer, size, XML_CHAR_ENCODING_NONE); 7182 if (input == NULL) { 7183 return(NULL); 7184 } 7185 7186 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE); 7187 if (stream == NULL) { 7188 xmlFreeParserInputBuffer(input); 7189 return(NULL); 7190 } 7191 7192 inputPush(ctxt, stream); 7193 return (htmlDoRead(ctxt, URL, encoding, options, 1)); 7194 } 7195 7196 /** 7197 * htmlCtxtReadFd: 7198 * @ctxt: an HTML parser context 7199 * @fd: an open file descriptor 7200 * @URL: the base URL to use for the document 7201 * @encoding: the document encoding, or NULL 7202 * @options: a combination of htmlParserOption(s) 7203 * 7204 * parse an XML from a file descriptor and build a tree. 
7205 * This reuses the existing @ctxt parser context 7206 * 7207 * Returns the resulting document tree 7208 */ 7209 htmlDocPtr 7210 htmlCtxtReadFd(htmlParserCtxtPtr ctxt, int fd, 7211 const char *URL, const char *encoding, int options) 7212 { 7213 xmlParserInputBufferPtr input; 7214 xmlParserInputPtr stream; 7215 7216 if (fd < 0) 7217 return (NULL); 7218 if (ctxt == NULL) 7219 return (NULL); 7220 xmlInitParser(); 7221 7222 htmlCtxtReset(ctxt); 7223 7224 7225 input = xmlParserInputBufferCreateFd(fd, XML_CHAR_ENCODING_NONE); 7226 if (input == NULL) 7227 return (NULL); 7228 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE); 7229 if (stream == NULL) { 7230 xmlFreeParserInputBuffer(input); 7231 return (NULL); 7232 } 7233 inputPush(ctxt, stream); 7234 return (htmlDoRead(ctxt, URL, encoding, options, 1)); 7235 } 7236 7237 /** 7238 * htmlCtxtReadIO: 7239 * @ctxt: an HTML parser context 7240 * @ioread: an I/O read function 7241 * @ioclose: an I/O close function 7242 * @ioctx: an I/O handler 7243 * @URL: the base URL to use for the document 7244 * @encoding: the document encoding, or NULL 7245 * @options: a combination of htmlParserOption(s) 7246 * 7247 * parse an HTML document from I/O functions and source and build a tree. 
7248 * This reuses the existing @ctxt parser context 7249 * 7250 * Returns the resulting document tree 7251 */ 7252 htmlDocPtr 7253 htmlCtxtReadIO(htmlParserCtxtPtr ctxt, xmlInputReadCallback ioread, 7254 xmlInputCloseCallback ioclose, void *ioctx, 7255 const char *URL, 7256 const char *encoding, int options) 7257 { 7258 xmlParserInputBufferPtr input; 7259 xmlParserInputPtr stream; 7260 7261 if (ioread == NULL) 7262 return (NULL); 7263 if (ctxt == NULL) 7264 return (NULL); 7265 xmlInitParser(); 7266 7267 htmlCtxtReset(ctxt); 7268 7269 input = xmlParserInputBufferCreateIO(ioread, ioclose, ioctx, 7270 XML_CHAR_ENCODING_NONE); 7271 if (input == NULL) { 7272 if (ioclose != NULL) 7273 ioclose(ioctx); 7274 return (NULL); 7275 } 7276 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE); 7277 if (stream == NULL) { 7278 xmlFreeParserInputBuffer(input); 7279 return (NULL); 7280 } 7281 inputPush(ctxt, stream); 7282 return (htmlDoRead(ctxt, URL, encoding, options, 1)); 7283 } 7284 7285 #define bottom_HTMLparser 7286 #include "elfgcchack.h" 7287 #endif /* LIBXML_HTML_ENABLED */ 7288