1 /* 2 * HTMLparser.c : an HTML 4.0 non-verifying parser 3 * 4 * See Copyright for the status of this software. 5 * 6 * daniel@veillard.com 7 */ 8 9 #define IN_LIBXML 10 #include "libxml.h" 11 #ifdef LIBXML_HTML_ENABLED 12 13 #include <string.h> 14 #ifdef HAVE_CTYPE_H 15 #include <ctype.h> 16 #endif 17 #ifdef HAVE_STDLIB_H 18 #include <stdlib.h> 19 #endif 20 #ifdef HAVE_SYS_STAT_H 21 #include <sys/stat.h> 22 #endif 23 #ifdef HAVE_FCNTL_H 24 #include <fcntl.h> 25 #endif 26 #ifdef HAVE_UNISTD_H 27 #include <unistd.h> 28 #endif 29 #ifdef HAVE_ZLIB_H 30 #include <zlib.h> 31 #endif 32 33 #include <libxml/xmlmemory.h> 34 #include <libxml/tree.h> 35 #include <libxml/parser.h> 36 #include <libxml/parserInternals.h> 37 #include <libxml/xmlerror.h> 38 #include <libxml/HTMLparser.h> 39 #include <libxml/HTMLtree.h> 40 #include <libxml/entities.h> 41 #include <libxml/encoding.h> 42 #include <libxml/valid.h> 43 #include <libxml/xmlIO.h> 44 #include <libxml/globals.h> 45 #include <libxml/uri.h> 46 47 #include "buf.h" 48 #include "enc.h" 49 50 #define HTML_MAX_NAMELEN 1000 51 #define HTML_PARSER_BIG_BUFFER_SIZE 1000 52 #define HTML_PARSER_BUFFER_SIZE 100 53 54 /* #define DEBUG */ 55 /* #define DEBUG_PUSH */ 56 57 static int htmlOmittedDefaultValue = 1; 58 59 xmlChar * htmlDecodeEntities(htmlParserCtxtPtr ctxt, int len, 60 xmlChar end, xmlChar end2, xmlChar end3); 61 static void htmlParseComment(htmlParserCtxtPtr ctxt); 62 63 /************************************************************************ 64 * * 65 * Some factorized error routines * 66 * * 67 ************************************************************************/ 68 69 /** 70 * htmlErrMemory: 71 * @ctxt: an HTML parser context 72 * @extra: extra informations 73 * 74 * Handle a redefinition of attribute error 75 */ 76 static void 77 htmlErrMemory(xmlParserCtxtPtr ctxt, const char *extra) 78 { 79 if ((ctxt != NULL) && (ctxt->disableSAX != 0) && 80 (ctxt->instate == XML_PARSER_EOF)) 81 return; 82 if (ctxt != NULL) { 83 
ctxt->errNo = XML_ERR_NO_MEMORY; 84 ctxt->instate = XML_PARSER_EOF; 85 ctxt->disableSAX = 1; 86 } 87 if (extra) 88 __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_PARSER, 89 XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, extra, 90 NULL, NULL, 0, 0, 91 "Memory allocation failed : %s\n", extra); 92 else 93 __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_PARSER, 94 XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, NULL, 95 NULL, NULL, 0, 0, "Memory allocation failed\n"); 96 } 97 98 /** 99 * htmlParseErr: 100 * @ctxt: an HTML parser context 101 * @error: the error number 102 * @msg: the error message 103 * @str1: string infor 104 * @str2: string infor 105 * 106 * Handle a fatal parser error, i.e. violating Well-Formedness constraints 107 */ 108 static void LIBXML_ATTR_FORMAT(3,0) 109 htmlParseErr(xmlParserCtxtPtr ctxt, xmlParserErrors error, 110 const char *msg, const xmlChar *str1, const xmlChar *str2) 111 { 112 if ((ctxt != NULL) && (ctxt->disableSAX != 0) && 113 (ctxt->instate == XML_PARSER_EOF)) 114 return; 115 if (ctxt != NULL) 116 ctxt->errNo = error; 117 __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_HTML, error, 118 XML_ERR_ERROR, NULL, 0, 119 (const char *) str1, (const char *) str2, 120 NULL, 0, 0, 121 msg, str1, str2); 122 if (ctxt != NULL) 123 ctxt->wellFormed = 0; 124 } 125 126 /** 127 * htmlParseErrInt: 128 * @ctxt: an HTML parser context 129 * @error: the error number 130 * @msg: the error message 131 * @val: integer info 132 * 133 * Handle a fatal parser error, i.e. 
violating Well-Formedness constraints 134 */ 135 static void LIBXML_ATTR_FORMAT(3,0) 136 htmlParseErrInt(xmlParserCtxtPtr ctxt, xmlParserErrors error, 137 const char *msg, int val) 138 { 139 if ((ctxt != NULL) && (ctxt->disableSAX != 0) && 140 (ctxt->instate == XML_PARSER_EOF)) 141 return; 142 if (ctxt != NULL) 143 ctxt->errNo = error; 144 __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_HTML, error, 145 XML_ERR_ERROR, NULL, 0, NULL, NULL, 146 NULL, val, 0, msg, val); 147 if (ctxt != NULL) 148 ctxt->wellFormed = 0; 149 } 150 151 /************************************************************************ 152 * * 153 * Parser stacks related functions and macros * 154 * * 155 ************************************************************************/ 156 157 /** 158 * htmlnamePush: 159 * @ctxt: an HTML parser context 160 * @value: the element name 161 * 162 * Pushes a new element name on top of the name stack 163 * 164 * Returns 0 in case of error, the index in the stack otherwise 165 */ 166 static int 167 htmlnamePush(htmlParserCtxtPtr ctxt, const xmlChar * value) 168 { 169 if ((ctxt->html < 3) && (xmlStrEqual(value, BAD_CAST "head"))) 170 ctxt->html = 3; 171 if ((ctxt->html < 10) && (xmlStrEqual(value, BAD_CAST "body"))) 172 ctxt->html = 10; 173 if (ctxt->nameNr >= ctxt->nameMax) { 174 ctxt->nameMax *= 2; 175 ctxt->nameTab = (const xmlChar * *) 176 xmlRealloc((xmlChar * *)ctxt->nameTab, 177 ctxt->nameMax * 178 sizeof(ctxt->nameTab[0])); 179 if (ctxt->nameTab == NULL) { 180 htmlErrMemory(ctxt, NULL); 181 return (0); 182 } 183 } 184 ctxt->nameTab[ctxt->nameNr] = value; 185 ctxt->name = value; 186 return (ctxt->nameNr++); 187 } 188 /** 189 * htmlnamePop: 190 * @ctxt: an HTML parser context 191 * 192 * Pops the top element name from the name stack 193 * 194 * Returns the name just removed 195 */ 196 static const xmlChar * 197 htmlnamePop(htmlParserCtxtPtr ctxt) 198 { 199 const xmlChar *ret; 200 201 if (ctxt->nameNr <= 0) 202 return (NULL); 203 ctxt->nameNr--; 204 if 
(ctxt->nameNr < 0) 205 return (NULL); 206 if (ctxt->nameNr > 0) 207 ctxt->name = ctxt->nameTab[ctxt->nameNr - 1]; 208 else 209 ctxt->name = NULL; 210 ret = ctxt->nameTab[ctxt->nameNr]; 211 ctxt->nameTab[ctxt->nameNr] = NULL; 212 return (ret); 213 } 214 215 /** 216 * htmlNodeInfoPush: 217 * @ctxt: an HTML parser context 218 * @value: the node info 219 * 220 * Pushes a new element name on top of the node info stack 221 * 222 * Returns 0 in case of error, the index in the stack otherwise 223 */ 224 static int 225 htmlNodeInfoPush(htmlParserCtxtPtr ctxt, htmlParserNodeInfo *value) 226 { 227 if (ctxt->nodeInfoNr >= ctxt->nodeInfoMax) { 228 if (ctxt->nodeInfoMax == 0) 229 ctxt->nodeInfoMax = 5; 230 ctxt->nodeInfoMax *= 2; 231 ctxt->nodeInfoTab = (htmlParserNodeInfo *) 232 xmlRealloc((htmlParserNodeInfo *)ctxt->nodeInfoTab, 233 ctxt->nodeInfoMax * 234 sizeof(ctxt->nodeInfoTab[0])); 235 if (ctxt->nodeInfoTab == NULL) { 236 htmlErrMemory(ctxt, NULL); 237 return (0); 238 } 239 } 240 ctxt->nodeInfoTab[ctxt->nodeInfoNr] = *value; 241 ctxt->nodeInfo = &ctxt->nodeInfoTab[ctxt->nodeInfoNr]; 242 return (ctxt->nodeInfoNr++); 243 } 244 245 /** 246 * htmlNodeInfoPop: 247 * @ctxt: an HTML parser context 248 * 249 * Pops the top element name from the node info stack 250 * 251 * Returns 0 in case of error, the pointer to NodeInfo otherwise 252 */ 253 static htmlParserNodeInfo * 254 htmlNodeInfoPop(htmlParserCtxtPtr ctxt) 255 { 256 if (ctxt->nodeInfoNr <= 0) 257 return (NULL); 258 ctxt->nodeInfoNr--; 259 if (ctxt->nodeInfoNr < 0) 260 return (NULL); 261 if (ctxt->nodeInfoNr > 0) 262 ctxt->nodeInfo = &ctxt->nodeInfoTab[ctxt->nodeInfoNr - 1]; 263 else 264 ctxt->nodeInfo = NULL; 265 return &ctxt->nodeInfoTab[ctxt->nodeInfoNr]; 266 } 267 268 /* 269 * Macros for accessing the content. Those should be used only by the parser, 270 * and not exported. 271 * 272 * Dirty macros, i.e. 
one needs to make assumptions about the context to use them
 *
 *   CUR_PTR return the current pointer to the xmlChar to be parsed.
 *   CUR     returns the current xmlChar value, i.e. a 8 bit value if compiled
 *           in ISO-Latin or UTF-8, and the current 16 bit value if compiled
 *           in UNICODE mode. This should be used internally by the parser
 *           only to compare to ASCII values otherwise it would break when
 *           running with UTF-8 encoding.
 *   NXT(n)  returns the n'th next xmlChar. Same as CUR, it should be used only
 *           to compare on ASCII based substring.
 *   UPP(n)  returns the n'th next xmlChar converted to uppercase. Same as CUR,
 *           it should be used only to compare on ASCII based substring.
 *   SKIP(n) Skip n xmlChar, and must also be used only to skip ASCII defined
 *           strings without newlines within the parser.
 *
 * Clean macros, not dependent on an ASCII context, expect UTF-8 encoding
 *
 *   CURRENT Returns the current char value, with the full decoding of
 *           UTF-8 if we are using this mode. It returns an int.
 *   NEXT    Skip to the next character, this does the proper decoding
 *           in UTF-8 mode. It also pops up unfinished entities on the fly.
 *   NEXTL(l) Skip the current unicode character of l xmlChars long.
294 * COPY(to) copy one char to *to, increment CUR_PTR and to accordingly 295 */ 296 297 #define UPPER (toupper(*ctxt->input->cur)) 298 299 #define SKIP(val) ctxt->nbChars += (val),ctxt->input->cur += (val),ctxt->input->col+=(val) 300 301 #define NXT(val) ctxt->input->cur[(val)] 302 303 #define UPP(val) (toupper(ctxt->input->cur[(val)])) 304 305 #define CUR_PTR ctxt->input->cur 306 #define BASE_PTR ctxt->input->base 307 308 #define SHRINK if ((ctxt->input->cur - ctxt->input->base > 2 * INPUT_CHUNK) && \ 309 (ctxt->input->end - ctxt->input->cur < 2 * INPUT_CHUNK)) \ 310 xmlParserInputShrink(ctxt->input) 311 312 #define GROW if ((ctxt->progressive == 0) && \ 313 (ctxt->input->end - ctxt->input->cur < INPUT_CHUNK)) \ 314 xmlParserInputGrow(ctxt->input, INPUT_CHUNK) 315 316 #define CURRENT ((int) (*ctxt->input->cur)) 317 318 #define SKIP_BLANKS htmlSkipBlankChars(ctxt) 319 320 /* Inported from XML */ 321 322 /* #define CUR (ctxt->token ? ctxt->token : (int) (*ctxt->input->cur)) */ 323 #define CUR ((int) (*ctxt->input->cur)) 324 #define NEXT xmlNextChar(ctxt) 325 326 #define RAW (ctxt->token ? 
-1 : (*ctxt->input->cur)) 327 328 329 #define NEXTL(l) do { \ 330 if (*(ctxt->input->cur) == '\n') { \ 331 ctxt->input->line++; ctxt->input->col = 1; \ 332 } else ctxt->input->col++; \ 333 ctxt->token = 0; ctxt->input->cur += l; ctxt->nbChars++; \ 334 } while (0) 335 336 /************ 337 \ 338 if (*ctxt->input->cur == '%') xmlParserHandlePEReference(ctxt); \ 339 if (*ctxt->input->cur == '&') xmlParserHandleReference(ctxt); 340 ************/ 341 342 #define CUR_CHAR(l) htmlCurrentChar(ctxt, &l) 343 #define CUR_SCHAR(s, l) xmlStringCurrentChar(ctxt, s, &l) 344 345 #define COPY_BUF(l,b,i,v) \ 346 if (l == 1) b[i++] = (xmlChar) v; \ 347 else i += xmlCopyChar(l,&b[i],v) 348 349 /** 350 * htmlFindEncoding: 351 * @the HTML parser context 352 * 353 * Ty to find and encoding in the current data available in the input 354 * buffer this is needed to try to switch to the proper encoding when 355 * one face a character error. 356 * That's an heuristic, since it's operating outside of parsing it could 357 * try to use a meta which had been commented out, that's the reason it 358 * should only be used in case of error, not as a default. 
359 * 360 * Returns an encoding string or NULL if not found, the string need to 361 * be freed 362 */ 363 static xmlChar * 364 htmlFindEncoding(xmlParserCtxtPtr ctxt) { 365 const xmlChar *start, *cur, *end; 366 367 if ((ctxt == NULL) || (ctxt->input == NULL) || 368 (ctxt->input->encoding != NULL) || (ctxt->input->buf == NULL) || 369 (ctxt->input->buf->encoder != NULL)) 370 return(NULL); 371 if ((ctxt->input->cur == NULL) || (ctxt->input->end == NULL)) 372 return(NULL); 373 374 start = ctxt->input->cur; 375 end = ctxt->input->end; 376 /* we also expect the input buffer to be zero terminated */ 377 if (*end != 0) 378 return(NULL); 379 380 cur = xmlStrcasestr(start, BAD_CAST "HTTP-EQUIV"); 381 if (cur == NULL) 382 return(NULL); 383 cur = xmlStrcasestr(cur, BAD_CAST "CONTENT"); 384 if (cur == NULL) 385 return(NULL); 386 cur = xmlStrcasestr(cur, BAD_CAST "CHARSET="); 387 if (cur == NULL) 388 return(NULL); 389 cur += 8; 390 start = cur; 391 while (((*cur >= 'A') && (*cur <= 'Z')) || 392 ((*cur >= 'a') && (*cur <= 'z')) || 393 ((*cur >= '0') && (*cur <= '9')) || 394 (*cur == '-') || (*cur == '_') || (*cur == ':') || (*cur == '/')) 395 cur++; 396 if (cur == start) 397 return(NULL); 398 return(xmlStrndup(start, cur - start)); 399 } 400 401 /** 402 * htmlCurrentChar: 403 * @ctxt: the HTML parser context 404 * @len: pointer to the length of the char read 405 * 406 * The current char value, if using UTF-8 this may actually span multiple 407 * bytes in the input buffer. Implement the end of line normalization: 408 * 2.11 End-of-Line Handling 409 * If the encoding is unspecified, in the case we find an ISO-Latin-1 410 * char, then the encoding converter is plugged in automatically. 
411 * 412 * Returns the current char value and its length 413 */ 414 415 static int 416 htmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) { 417 if (ctxt->instate == XML_PARSER_EOF) 418 return(0); 419 420 if (ctxt->token != 0) { 421 *len = 0; 422 return(ctxt->token); 423 } 424 if (ctxt->charset == XML_CHAR_ENCODING_UTF8) { 425 /* 426 * We are supposed to handle UTF8, check it's valid 427 * From rfc2044: encoding of the Unicode values on UTF-8: 428 * 429 * UCS-4 range (hex.) UTF-8 octet sequence (binary) 430 * 0000 0000-0000 007F 0xxxxxxx 431 * 0000 0080-0000 07FF 110xxxxx 10xxxxxx 432 * 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx 433 * 434 * Check for the 0x110000 limit too 435 */ 436 const unsigned char *cur = ctxt->input->cur; 437 unsigned char c; 438 unsigned int val; 439 440 c = *cur; 441 if (c & 0x80) { 442 if (cur[1] == 0) { 443 xmlParserInputGrow(ctxt->input, INPUT_CHUNK); 444 cur = ctxt->input->cur; 445 } 446 if ((cur[1] & 0xc0) != 0x80) 447 goto encoding_error; 448 if ((c & 0xe0) == 0xe0) { 449 450 if (cur[2] == 0) { 451 xmlParserInputGrow(ctxt->input, INPUT_CHUNK); 452 cur = ctxt->input->cur; 453 } 454 if ((cur[2] & 0xc0) != 0x80) 455 goto encoding_error; 456 if ((c & 0xf0) == 0xf0) { 457 if (cur[3] == 0) { 458 xmlParserInputGrow(ctxt->input, INPUT_CHUNK); 459 cur = ctxt->input->cur; 460 } 461 if (((c & 0xf8) != 0xf0) || 462 ((cur[3] & 0xc0) != 0x80)) 463 goto encoding_error; 464 /* 4-byte code */ 465 *len = 4; 466 val = (cur[0] & 0x7) << 18; 467 val |= (cur[1] & 0x3f) << 12; 468 val |= (cur[2] & 0x3f) << 6; 469 val |= cur[3] & 0x3f; 470 } else { 471 /* 3-byte code */ 472 *len = 3; 473 val = (cur[0] & 0xf) << 12; 474 val |= (cur[1] & 0x3f) << 6; 475 val |= cur[2] & 0x3f; 476 } 477 } else { 478 /* 2-byte code */ 479 *len = 2; 480 val = (cur[0] & 0x1f) << 6; 481 val |= cur[1] & 0x3f; 482 } 483 if (!IS_CHAR(val)) { 484 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR, 485 "Char 0x%X out of allowed range\n", val); 486 } 487 return(val); 488 } else { 489 if 
((*ctxt->input->cur == 0) && 490 (ctxt->input->cur < ctxt->input->end)) { 491 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR, 492 "Char 0x%X out of allowed range\n", 0); 493 *len = 1; 494 return(' '); 495 } 496 /* 1-byte code */ 497 *len = 1; 498 return((int) *ctxt->input->cur); 499 } 500 } 501 /* 502 * Assume it's a fixed length encoding (1) with 503 * a compatible encoding for the ASCII set, since 504 * XML constructs only use < 128 chars 505 */ 506 *len = 1; 507 if ((int) *ctxt->input->cur < 0x80) 508 return((int) *ctxt->input->cur); 509 510 /* 511 * Humm this is bad, do an automatic flow conversion 512 */ 513 { 514 xmlChar * guess; 515 xmlCharEncodingHandlerPtr handler; 516 517 guess = htmlFindEncoding(ctxt); 518 if (guess == NULL) { 519 xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1); 520 } else { 521 if (ctxt->input->encoding != NULL) 522 xmlFree((xmlChar *) ctxt->input->encoding); 523 ctxt->input->encoding = guess; 524 handler = xmlFindCharEncodingHandler((const char *) guess); 525 if (handler != NULL) { 526 xmlSwitchToEncoding(ctxt, handler); 527 } else { 528 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING, 529 "Unsupported encoding %s", guess, NULL); 530 } 531 } 532 ctxt->charset = XML_CHAR_ENCODING_UTF8; 533 } 534 535 return(xmlCurrentChar(ctxt, len)); 536 537 encoding_error: 538 /* 539 * If we detect an UTF8 error that probably mean that the 540 * input encoding didn't get properly advertized in the 541 * declaration header. Report the error and switch the encoding 542 * to ISO-Latin-1 (if you don't like this policy, just declare the 543 * encoding !) 
544 */ 545 { 546 char buffer[150]; 547 548 if (ctxt->input->end - ctxt->input->cur >= 4) { 549 snprintf(buffer, 149, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n", 550 ctxt->input->cur[0], ctxt->input->cur[1], 551 ctxt->input->cur[2], ctxt->input->cur[3]); 552 } else { 553 snprintf(buffer, 149, "Bytes: 0x%02X\n", ctxt->input->cur[0]); 554 } 555 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING, 556 "Input is not proper UTF-8, indicate encoding !\n", 557 BAD_CAST buffer, NULL); 558 } 559 560 ctxt->charset = XML_CHAR_ENCODING_8859_1; 561 *len = 1; 562 return((int) *ctxt->input->cur); 563 } 564 565 /** 566 * htmlSkipBlankChars: 567 * @ctxt: the HTML parser context 568 * 569 * skip all blanks character found at that point in the input streams. 570 * 571 * Returns the number of space chars skipped 572 */ 573 574 static int 575 htmlSkipBlankChars(xmlParserCtxtPtr ctxt) { 576 int res = 0; 577 578 while (IS_BLANK_CH(*(ctxt->input->cur))) { 579 if ((*ctxt->input->cur == 0) && 580 (xmlParserInputGrow(ctxt->input, INPUT_CHUNK) <= 0)) { 581 xmlPopInput(ctxt); 582 } else { 583 if (*(ctxt->input->cur) == '\n') { 584 ctxt->input->line++; ctxt->input->col = 1; 585 } else ctxt->input->col++; 586 ctxt->input->cur++; 587 ctxt->nbChars++; 588 if (*ctxt->input->cur == 0) 589 xmlParserInputGrow(ctxt->input, INPUT_CHUNK); 590 } 591 res++; 592 } 593 return(res); 594 } 595 596 597 598 /************************************************************************ 599 * * 600 * The list of HTML elements and their properties * 601 * * 602 ************************************************************************/ 603 604 /* 605 * Start Tag: 1 means the start tag can be ommited 606 * End Tag: 1 means the end tag can be ommited 607 * 2 means it's forbidden (empty elements) 608 * 3 means the tag is stylistic and should be closed easily 609 * Depr: this element is deprecated 610 * DTD: 1 means that this element is valid only in the Loose DTD 611 * 2 means that this element is valid only in the Frameset DTD 612 * 613 
* Name,Start Tag,End Tag,Save End,Empty,Deprecated,DTD,inline,Description 614 , subElements , impliedsubelt , Attributes, userdata 615 */ 616 617 /* Definitions and a couple of vars for HTML Elements */ 618 619 #define FONTSTYLE "tt", "i", "b", "u", "s", "strike", "big", "small" 620 #define NB_FONTSTYLE 8 621 #define PHRASE "em", "strong", "dfn", "code", "samp", "kbd", "var", "cite", "abbr", "acronym" 622 #define NB_PHRASE 10 623 #define SPECIAL "a", "img", "applet", "embed", "object", "font", "basefont", "br", "script", "map", "q", "sub", "sup", "span", "bdo", "iframe" 624 #define NB_SPECIAL 16 625 #define INLINE FONTSTYLE, PHRASE, SPECIAL, FORMCTRL 626 #define NB_INLINE NB_PCDATA + NB_FONTSTYLE + NB_PHRASE + NB_SPECIAL + NB_FORMCTRL 627 #define BLOCK HEADING, LIST, "pre", "p", "dl", "div", "center", "noscript", "noframes", "blockquote", "form", "isindex", "hr", "table", "fieldset", "address" 628 #define NB_BLOCK NB_HEADING + NB_LIST + 14 629 #define FORMCTRL "input", "select", "textarea", "label", "button" 630 #define NB_FORMCTRL 5 631 #define PCDATA 632 #define NB_PCDATA 0 633 #define HEADING "h1", "h2", "h3", "h4", "h5", "h6" 634 #define NB_HEADING 6 635 #define LIST "ul", "ol", "dir", "menu" 636 #define NB_LIST 4 637 #define MODIFIER 638 #define NB_MODIFIER 0 639 #define FLOW BLOCK,INLINE 640 #define NB_FLOW NB_BLOCK + NB_INLINE 641 #define EMPTY NULL 642 643 644 static const char* const html_flow[] = { FLOW, NULL } ; 645 static const char* const html_inline[] = { INLINE, NULL } ; 646 647 /* placeholders: elts with content but no subelements */ 648 static const char* const html_pcdata[] = { NULL } ; 649 #define html_cdata html_pcdata 650 651 652 /* ... 
and for HTML Attributes */ 653 654 #define COREATTRS "id", "class", "style", "title" 655 #define NB_COREATTRS 4 656 #define I18N "lang", "dir" 657 #define NB_I18N 2 658 #define EVENTS "onclick", "ondblclick", "onmousedown", "onmouseup", "onmouseover", "onmouseout", "onkeypress", "onkeydown", "onkeyup" 659 #define NB_EVENTS 9 660 #define ATTRS COREATTRS,I18N,EVENTS 661 #define NB_ATTRS NB_NB_COREATTRS + NB_I18N + NB_EVENTS 662 #define CELLHALIGN "align", "char", "charoff" 663 #define NB_CELLHALIGN 3 664 #define CELLVALIGN "valign" 665 #define NB_CELLVALIGN 1 666 667 static const char* const html_attrs[] = { ATTRS, NULL } ; 668 static const char* const core_i18n_attrs[] = { COREATTRS, I18N, NULL } ; 669 static const char* const core_attrs[] = { COREATTRS, NULL } ; 670 static const char* const i18n_attrs[] = { I18N, NULL } ; 671 672 673 /* Other declarations that should go inline ... */ 674 static const char* const a_attrs[] = { ATTRS, "charset", "type", "name", 675 "href", "hreflang", "rel", "rev", "accesskey", "shape", "coords", 676 "tabindex", "onfocus", "onblur", NULL } ; 677 static const char* const target_attr[] = { "target", NULL } ; 678 static const char* const rows_cols_attr[] = { "rows", "cols", NULL } ; 679 static const char* const alt_attr[] = { "alt", NULL } ; 680 static const char* const src_alt_attrs[] = { "src", "alt", NULL } ; 681 static const char* const href_attrs[] = { "href", NULL } ; 682 static const char* const clear_attrs[] = { "clear", NULL } ; 683 static const char* const inline_p[] = { INLINE, "p", NULL } ; 684 685 static const char* const flow_param[] = { FLOW, "param", NULL } ; 686 static const char* const applet_attrs[] = { COREATTRS , "codebase", 687 "archive", "alt", "name", "height", "width", "align", 688 "hspace", "vspace", NULL } ; 689 static const char* const area_attrs[] = { "shape", "coords", "href", "nohref", 690 "tabindex", "accesskey", "onfocus", "onblur", NULL } ; 691 static const char* const basefont_attrs[] = 692 { "id", 
"size", "color", "face", NULL } ; 693 static const char* const quote_attrs[] = { ATTRS, "cite", NULL } ; 694 static const char* const body_contents[] = { FLOW, "ins", "del", NULL } ; 695 static const char* const body_attrs[] = { ATTRS, "onload", "onunload", NULL } ; 696 static const char* const body_depr[] = { "background", "bgcolor", "text", 697 "link", "vlink", "alink", NULL } ; 698 static const char* const button_attrs[] = { ATTRS, "name", "value", "type", 699 "disabled", "tabindex", "accesskey", "onfocus", "onblur", NULL } ; 700 701 702 static const char* const col_attrs[] = { ATTRS, "span", "width", CELLHALIGN, CELLVALIGN, NULL } ; 703 static const char* const col_elt[] = { "col", NULL } ; 704 static const char* const edit_attrs[] = { ATTRS, "datetime", "cite", NULL } ; 705 static const char* const compact_attrs[] = { ATTRS, "compact", NULL } ; 706 static const char* const dl_contents[] = { "dt", "dd", NULL } ; 707 static const char* const compact_attr[] = { "compact", NULL } ; 708 static const char* const label_attr[] = { "label", NULL } ; 709 static const char* const fieldset_contents[] = { FLOW, "legend" } ; 710 static const char* const font_attrs[] = { COREATTRS, I18N, "size", "color", "face" , NULL } ; 711 static const char* const form_contents[] = { HEADING, LIST, INLINE, "pre", "p", "div", "center", "noscript", "noframes", "blockquote", "isindex", "hr", "table", "fieldset", "address", NULL } ; 712 static const char* const form_attrs[] = { ATTRS, "method", "enctype", "accept", "name", "onsubmit", "onreset", "accept-charset", NULL } ; 713 static const char* const frame_attrs[] = { COREATTRS, "longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "noresize", "scrolling" , NULL } ; 714 static const char* const frameset_attrs[] = { COREATTRS, "rows", "cols", "onload", "onunload", NULL } ; 715 static const char* const frameset_contents[] = { "frameset", "frame", "noframes", NULL } ; 716 static const char* const head_attrs[] = { I18N, 
"profile", NULL } ; 717 static const char* const head_contents[] = { "title", "isindex", "base", "script", "style", "meta", "link", "object", NULL } ; 718 static const char* const hr_depr[] = { "align", "noshade", "size", "width", NULL } ; 719 static const char* const version_attr[] = { "version", NULL } ; 720 static const char* const html_content[] = { "head", "body", "frameset", NULL } ; 721 static const char* const iframe_attrs[] = { COREATTRS, "longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "scrolling", "align", "height", "width", NULL } ; 722 static const char* const img_attrs[] = { ATTRS, "longdesc", "name", "height", "width", "usemap", "ismap", NULL } ; 723 static const char* const embed_attrs[] = { COREATTRS, "align", "alt", "border", "code", "codebase", "frameborder", "height", "hidden", "hspace", "name", "palette", "pluginspace", "pluginurl", "src", "type", "units", "vspace", "width", NULL } ; 724 static const char* const input_attrs[] = { ATTRS, "type", "name", "value", "checked", "disabled", "readonly", "size", "maxlength", "src", "alt", "usemap", "ismap", "tabindex", "accesskey", "onfocus", "onblur", "onselect", "onchange", "accept", NULL } ; 725 static const char* const prompt_attrs[] = { COREATTRS, I18N, "prompt", NULL } ; 726 static const char* const label_attrs[] = { ATTRS, "for", "accesskey", "onfocus", "onblur", NULL } ; 727 static const char* const legend_attrs[] = { ATTRS, "accesskey", NULL } ; 728 static const char* const align_attr[] = { "align", NULL } ; 729 static const char* const link_attrs[] = { ATTRS, "charset", "href", "hreflang", "type", "rel", "rev", "media", NULL } ; 730 static const char* const map_contents[] = { BLOCK, "area", NULL } ; 731 static const char* const name_attr[] = { "name", NULL } ; 732 static const char* const action_attr[] = { "action", NULL } ; 733 static const char* const blockli_elt[] = { BLOCK, "li", NULL } ; 734 static const char* const meta_attrs[] = { I18N, "http-equiv", "name", 
"scheme", "charset", NULL } ; 735 static const char* const content_attr[] = { "content", NULL } ; 736 static const char* const type_attr[] = { "type", NULL } ; 737 static const char* const noframes_content[] = { "body", FLOW MODIFIER, NULL } ; 738 static const char* const object_contents[] = { FLOW, "param", NULL } ; 739 static const char* const object_attrs[] = { ATTRS, "declare", "classid", "codebase", "data", "type", "codetype", "archive", "standby", "height", "width", "usemap", "name", "tabindex", NULL } ; 740 static const char* const object_depr[] = { "align", "border", "hspace", "vspace", NULL } ; 741 static const char* const ol_attrs[] = { "type", "compact", "start", NULL} ; 742 static const char* const option_elt[] = { "option", NULL } ; 743 static const char* const optgroup_attrs[] = { ATTRS, "disabled", NULL } ; 744 static const char* const option_attrs[] = { ATTRS, "disabled", "label", "selected", "value", NULL } ; 745 static const char* const param_attrs[] = { "id", "value", "valuetype", "type", NULL } ; 746 static const char* const width_attr[] = { "width", NULL } ; 747 static const char* const pre_content[] = { PHRASE, "tt", "i", "b", "u", "s", "strike", "a", "br", "script", "map", "q", "span", "bdo", "iframe", NULL } ; 748 static const char* const script_attrs[] = { "charset", "src", "defer", "event", "for", NULL } ; 749 static const char* const language_attr[] = { "language", NULL } ; 750 static const char* const select_content[] = { "optgroup", "option", NULL } ; 751 static const char* const select_attrs[] = { ATTRS, "name", "size", "multiple", "disabled", "tabindex", "onfocus", "onblur", "onchange", NULL } ; 752 static const char* const style_attrs[] = { I18N, "media", "title", NULL } ; 753 static const char* const table_attrs[] = { ATTRS, "summary", "width", "border", "frame", "rules", "cellspacing", "cellpadding", "datapagesize", NULL } ; 754 static const char* const table_depr[] = { "align", "bgcolor", NULL } ; 755 static const char* const 
table_contents[] = { "caption", "col", "colgroup", "thead", "tfoot", "tbody", "tr", NULL} ; 756 static const char* const tr_elt[] = { "tr", NULL } ; 757 static const char* const talign_attrs[] = { ATTRS, CELLHALIGN, CELLVALIGN, NULL} ; 758 static const char* const th_td_depr[] = { "nowrap", "bgcolor", "width", "height", NULL } ; 759 static const char* const th_td_attr[] = { ATTRS, "abbr", "axis", "headers", "scope", "rowspan", "colspan", CELLHALIGN, CELLVALIGN, NULL } ; 760 static const char* const textarea_attrs[] = { ATTRS, "name", "disabled", "readonly", "tabindex", "accesskey", "onfocus", "onblur", "onselect", "onchange", NULL } ; 761 static const char* const tr_contents[] = { "th", "td", NULL } ; 762 static const char* const bgcolor_attr[] = { "bgcolor", NULL } ; 763 static const char* const li_elt[] = { "li", NULL } ; 764 static const char* const ul_depr[] = { "type", "compact", NULL} ; 765 static const char* const dir_attr[] = { "dir", NULL} ; 766 767 #define DECL (const char**) 768 769 static const htmlElemDesc 770 html40ElementTable[] = { 771 { "a", 0, 0, 0, 0, 0, 0, 1, "anchor ", 772 DECL html_inline , NULL , DECL a_attrs , DECL target_attr, NULL 773 }, 774 { "abbr", 0, 0, 0, 0, 0, 0, 1, "abbreviated form", 775 DECL html_inline , NULL , DECL html_attrs, NULL, NULL 776 }, 777 { "acronym", 0, 0, 0, 0, 0, 0, 1, "", 778 DECL html_inline , NULL , DECL html_attrs, NULL, NULL 779 }, 780 { "address", 0, 0, 0, 0, 0, 0, 0, "information on author ", 781 DECL inline_p , NULL , DECL html_attrs, NULL, NULL 782 }, 783 { "applet", 0, 0, 0, 0, 1, 1, 2, "java applet ", 784 DECL flow_param , NULL , NULL , DECL applet_attrs, NULL 785 }, 786 { "area", 0, 2, 2, 1, 0, 0, 0, "client-side image map area ", 787 EMPTY , NULL , DECL area_attrs , DECL target_attr, DECL alt_attr 788 }, 789 { "b", 0, 3, 0, 0, 0, 0, 1, "bold text style", 790 DECL html_inline , NULL , DECL html_attrs, NULL, NULL 791 }, 792 { "base", 0, 2, 2, 1, 0, 0, 0, "document base uri ", 793 EMPTY , NULL , NULL , 
DECL target_attr, DECL href_attrs 794 }, 795 { "basefont", 0, 2, 2, 1, 1, 1, 1, "base font size " , 796 EMPTY , NULL , NULL, DECL basefont_attrs, NULL 797 }, 798 { "bdo", 0, 0, 0, 0, 0, 0, 1, "i18n bidi over-ride ", 799 DECL html_inline , NULL , DECL core_i18n_attrs, NULL, DECL dir_attr 800 }, 801 { "big", 0, 3, 0, 0, 0, 0, 1, "large text style", 802 DECL html_inline , NULL , DECL html_attrs, NULL, NULL 803 }, 804 { "blockquote", 0, 0, 0, 0, 0, 0, 0, "long quotation ", 805 DECL html_flow , NULL , DECL quote_attrs , NULL, NULL 806 }, 807 { "body", 1, 1, 0, 0, 0, 0, 0, "document body ", 808 DECL body_contents , "div" , DECL body_attrs, DECL body_depr, NULL 809 }, 810 { "br", 0, 2, 2, 1, 0, 0, 1, "forced line break ", 811 EMPTY , NULL , DECL core_attrs, DECL clear_attrs , NULL 812 }, 813 { "button", 0, 0, 0, 0, 0, 0, 2, "push button ", 814 DECL html_flow MODIFIER , NULL , DECL button_attrs, NULL, NULL 815 }, 816 { "caption", 0, 0, 0, 0, 0, 0, 0, "table caption ", 817 DECL html_inline , NULL , DECL html_attrs, NULL, NULL 818 }, 819 { "center", 0, 3, 0, 0, 1, 1, 0, "shorthand for div align=center ", 820 DECL html_flow , NULL , NULL, DECL html_attrs, NULL 821 }, 822 { "cite", 0, 0, 0, 0, 0, 0, 1, "citation", 823 DECL html_inline , NULL , DECL html_attrs, NULL, NULL 824 }, 825 { "code", 0, 0, 0, 0, 0, 0, 1, "computer code fragment", 826 DECL html_inline , NULL , DECL html_attrs, NULL, NULL 827 }, 828 { "col", 0, 2, 2, 1, 0, 0, 0, "table column ", 829 EMPTY , NULL , DECL col_attrs , NULL, NULL 830 }, 831 { "colgroup", 0, 1, 0, 0, 0, 0, 0, "table column group ", 832 DECL col_elt , "col" , DECL col_attrs , NULL, NULL 833 }, 834 { "dd", 0, 1, 0, 0, 0, 0, 0, "definition description ", 835 DECL html_flow , NULL , DECL html_attrs, NULL, NULL 836 }, 837 { "del", 0, 0, 0, 0, 0, 0, 2, "deleted text ", 838 DECL html_flow , NULL , DECL edit_attrs , NULL, NULL 839 }, 840 { "dfn", 0, 0, 0, 0, 0, 0, 1, "instance definition", 841 DECL html_inline , NULL , DECL html_attrs, NULL, NULL 842 
}, 843 { "dir", 0, 0, 0, 0, 1, 1, 0, "directory list", 844 DECL blockli_elt, "li" , NULL, DECL compact_attrs, NULL 845 }, 846 { "div", 0, 0, 0, 0, 0, 0, 0, "generic language/style container", 847 DECL html_flow, NULL, DECL html_attrs, DECL align_attr, NULL 848 }, 849 { "dl", 0, 0, 0, 0, 0, 0, 0, "definition list ", 850 DECL dl_contents , "dd" , DECL html_attrs, DECL compact_attr, NULL 851 }, 852 { "dt", 0, 1, 0, 0, 0, 0, 0, "definition term ", 853 DECL html_inline, NULL, DECL html_attrs, NULL, NULL 854 }, 855 { "em", 0, 3, 0, 0, 0, 0, 1, "emphasis", 856 DECL html_inline, NULL, DECL html_attrs, NULL, NULL 857 }, 858 { "embed", 0, 1, 0, 0, 1, 1, 1, "generic embedded object ", 859 EMPTY, NULL, DECL embed_attrs, NULL, NULL 860 }, 861 { "fieldset", 0, 0, 0, 0, 0, 0, 0, "form control group ", 862 DECL fieldset_contents , NULL, DECL html_attrs, NULL, NULL 863 }, 864 { "font", 0, 3, 0, 0, 1, 1, 1, "local change to font ", 865 DECL html_inline, NULL, NULL, DECL font_attrs, NULL 866 }, 867 { "form", 0, 0, 0, 0, 0, 0, 0, "interactive form ", 868 DECL form_contents, "fieldset", DECL form_attrs , DECL target_attr, DECL action_attr 869 }, 870 { "frame", 0, 2, 2, 1, 0, 2, 0, "subwindow " , 871 EMPTY, NULL, NULL, DECL frame_attrs, NULL 872 }, 873 { "frameset", 0, 0, 0, 0, 0, 2, 0, "window subdivision" , 874 DECL frameset_contents, "noframes" , NULL , DECL frameset_attrs, NULL 875 }, 876 { "h1", 0, 0, 0, 0, 0, 0, 0, "heading ", 877 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL 878 }, 879 { "h2", 0, 0, 0, 0, 0, 0, 0, "heading ", 880 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL 881 }, 882 { "h3", 0, 0, 0, 0, 0, 0, 0, "heading ", 883 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL 884 }, 885 { "h4", 0, 0, 0, 0, 0, 0, 0, "heading ", 886 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL 887 }, 888 { "h5", 0, 0, 0, 0, 0, 0, 0, "heading ", 889 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL 890 }, 891 { "h6", 0, 
0, 0, 0, 0, 0, 0, "heading ", 892 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL 893 }, 894 { "head", 1, 1, 0, 0, 0, 0, 0, "document head ", 895 DECL head_contents, NULL, DECL head_attrs, NULL, NULL 896 }, 897 { "hr", 0, 2, 2, 1, 0, 0, 0, "horizontal rule " , 898 EMPTY, NULL, DECL html_attrs, DECL hr_depr, NULL 899 }, 900 { "html", 1, 1, 0, 0, 0, 0, 0, "document root element ", 901 DECL html_content , NULL , DECL i18n_attrs, DECL version_attr, NULL 902 }, 903 { "i", 0, 3, 0, 0, 0, 0, 1, "italic text style", 904 DECL html_inline, NULL, DECL html_attrs, NULL, NULL 905 }, 906 { "iframe", 0, 0, 0, 0, 0, 1, 2, "inline subwindow ", 907 DECL html_flow, NULL, NULL, DECL iframe_attrs, NULL 908 }, 909 { "img", 0, 2, 2, 1, 0, 0, 1, "embedded image ", 910 EMPTY, NULL, DECL img_attrs, DECL align_attr, DECL src_alt_attrs 911 }, 912 { "input", 0, 2, 2, 1, 0, 0, 1, "form control ", 913 EMPTY, NULL, DECL input_attrs , DECL align_attr, NULL 914 }, 915 { "ins", 0, 0, 0, 0, 0, 0, 2, "inserted text", 916 DECL html_flow, NULL, DECL edit_attrs, NULL, NULL 917 }, 918 { "isindex", 0, 2, 2, 1, 1, 1, 0, "single line prompt ", 919 EMPTY, NULL, NULL, DECL prompt_attrs, NULL 920 }, 921 { "kbd", 0, 0, 0, 0, 0, 0, 1, "text to be entered by the user", 922 DECL html_inline, NULL, DECL html_attrs, NULL, NULL 923 }, 924 { "label", 0, 0, 0, 0, 0, 0, 1, "form field label text ", 925 DECL html_inline MODIFIER, NULL, DECL label_attrs , NULL, NULL 926 }, 927 { "legend", 0, 0, 0, 0, 0, 0, 0, "fieldset legend ", 928 DECL html_inline, NULL, DECL legend_attrs , DECL align_attr, NULL 929 }, 930 { "li", 0, 1, 1, 0, 0, 0, 0, "list item ", 931 DECL html_flow, NULL, DECL html_attrs, NULL, NULL 932 }, 933 { "link", 0, 2, 2, 1, 0, 0, 0, "a media-independent link ", 934 EMPTY, NULL, DECL link_attrs, DECL target_attr, NULL 935 }, 936 { "map", 0, 0, 0, 0, 0, 0, 2, "client-side image map ", 937 DECL map_contents , NULL, DECL html_attrs , NULL, DECL name_attr 938 }, 939 { "menu", 0, 0, 0, 0, 1, 1, 0, 
"menu list ", 940 DECL blockli_elt , NULL, NULL, DECL compact_attrs, NULL 941 }, 942 { "meta", 0, 2, 2, 1, 0, 0, 0, "generic metainformation ", 943 EMPTY, NULL, DECL meta_attrs , NULL , DECL content_attr 944 }, 945 { "noframes", 0, 0, 0, 0, 0, 2, 0, "alternate content container for non frame-based rendering ", 946 DECL noframes_content, "body" , DECL html_attrs, NULL, NULL 947 }, 948 { "noscript", 0, 0, 0, 0, 0, 0, 0, "alternate content container for non script-based rendering ", 949 DECL html_flow, "div", DECL html_attrs, NULL, NULL 950 }, 951 { "object", 0, 0, 0, 0, 0, 0, 2, "generic embedded object ", 952 DECL object_contents , "div" , DECL object_attrs, DECL object_depr, NULL 953 }, 954 { "ol", 0, 0, 0, 0, 0, 0, 0, "ordered list ", 955 DECL li_elt , "li" , DECL html_attrs, DECL ol_attrs, NULL 956 }, 957 { "optgroup", 0, 0, 0, 0, 0, 0, 0, "option group ", 958 DECL option_elt , "option", DECL optgroup_attrs, NULL, DECL label_attr 959 }, 960 { "option", 0, 1, 0, 0, 0, 0, 0, "selectable choice " , 961 DECL html_pcdata, NULL, DECL option_attrs, NULL, NULL 962 }, 963 { "p", 0, 1, 0, 0, 0, 0, 0, "paragraph ", 964 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL 965 }, 966 { "param", 0, 2, 2, 1, 0, 0, 0, "named property value ", 967 EMPTY, NULL, DECL param_attrs, NULL, DECL name_attr 968 }, 969 { "pre", 0, 0, 0, 0, 0, 0, 0, "preformatted text ", 970 DECL pre_content, NULL, DECL html_attrs, DECL width_attr, NULL 971 }, 972 { "q", 0, 0, 0, 0, 0, 0, 1, "short inline quotation ", 973 DECL html_inline, NULL, DECL quote_attrs, NULL, NULL 974 }, 975 { "s", 0, 3, 0, 0, 1, 1, 1, "strike-through text style", 976 DECL html_inline, NULL, NULL, DECL html_attrs, NULL 977 }, 978 { "samp", 0, 0, 0, 0, 0, 0, 1, "sample program output, scripts, etc.", 979 DECL html_inline, NULL, DECL html_attrs, NULL, NULL 980 }, 981 { "script", 0, 0, 0, 0, 0, 0, 2, "script statements ", 982 DECL html_cdata, NULL, DECL script_attrs, DECL language_attr, DECL type_attr 983 }, 984 { "select", 
0, 0, 0, 0, 0, 0, 1, "option selector ", 985 DECL select_content, NULL, DECL select_attrs, NULL, NULL 986 }, 987 { "small", 0, 3, 0, 0, 0, 0, 1, "small text style", 988 DECL html_inline, NULL, DECL html_attrs, NULL, NULL 989 }, 990 { "span", 0, 0, 0, 0, 0, 0, 1, "generic language/style container ", 991 DECL html_inline, NULL, DECL html_attrs, NULL, NULL 992 }, 993 { "strike", 0, 3, 0, 0, 1, 1, 1, "strike-through text", 994 DECL html_inline, NULL, NULL, DECL html_attrs, NULL 995 }, 996 { "strong", 0, 3, 0, 0, 0, 0, 1, "strong emphasis", 997 DECL html_inline, NULL, DECL html_attrs, NULL, NULL 998 }, 999 { "style", 0, 0, 0, 0, 0, 0, 0, "style info ", 1000 DECL html_cdata, NULL, DECL style_attrs, NULL, DECL type_attr 1001 }, 1002 { "sub", 0, 3, 0, 0, 0, 0, 1, "subscript", 1003 DECL html_inline, NULL, DECL html_attrs, NULL, NULL 1004 }, 1005 { "sup", 0, 3, 0, 0, 0, 0, 1, "superscript ", 1006 DECL html_inline, NULL, DECL html_attrs, NULL, NULL 1007 }, 1008 { "table", 0, 0, 0, 0, 0, 0, 0, "", 1009 DECL table_contents , "tr" , DECL table_attrs , DECL table_depr, NULL 1010 }, 1011 { "tbody", 1, 0, 0, 0, 0, 0, 0, "table body ", 1012 DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL 1013 }, 1014 { "td", 0, 0, 0, 0, 0, 0, 0, "table data cell", 1015 DECL html_flow, NULL, DECL th_td_attr, DECL th_td_depr, NULL 1016 }, 1017 { "textarea", 0, 0, 0, 0, 0, 0, 1, "multi-line text field ", 1018 DECL html_pcdata, NULL, DECL textarea_attrs, NULL, DECL rows_cols_attr 1019 }, 1020 { "tfoot", 0, 1, 0, 0, 0, 0, 0, "table footer ", 1021 DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL 1022 }, 1023 { "th", 0, 1, 0, 0, 0, 0, 0, "table header cell", 1024 DECL html_flow, NULL, DECL th_td_attr, DECL th_td_depr, NULL 1025 }, 1026 { "thead", 0, 1, 0, 0, 0, 0, 0, "table header ", 1027 DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL 1028 }, 1029 { "title", 0, 0, 0, 0, 0, 0, 0, "document title ", 1030 DECL html_pcdata, NULL, DECL i18n_attrs, NULL, NULL 1031 }, 1032 { "tr", 0, 0, 0, 0, 0, 0, 0, 
"table row ", 1033 DECL tr_contents , "td" , DECL talign_attrs, DECL bgcolor_attr, NULL 1034 }, 1035 { "tt", 0, 3, 0, 0, 0, 0, 1, "teletype or monospaced text style", 1036 DECL html_inline, NULL, DECL html_attrs, NULL, NULL 1037 }, 1038 { "u", 0, 3, 0, 0, 1, 1, 1, "underlined text style", 1039 DECL html_inline, NULL, NULL, DECL html_attrs, NULL 1040 }, 1041 { "ul", 0, 0, 0, 0, 0, 0, 0, "unordered list ", 1042 DECL li_elt , "li" , DECL html_attrs, DECL ul_depr, NULL 1043 }, 1044 { "var", 0, 0, 0, 0, 0, 0, 1, "instance of a variable or program argument", 1045 DECL html_inline, NULL, DECL html_attrs, NULL, NULL 1046 } 1047 }; 1048 1049 /* 1050 * start tags that imply the end of current element 1051 */ 1052 static const char * const htmlStartClose[] = { 1053 "form", "form", "p", "hr", "h1", "h2", "h3", "h4", "h5", "h6", 1054 "dl", "ul", "ol", "menu", "dir", "address", "pre", 1055 "listing", "xmp", "head", NULL, 1056 "head", "p", NULL, 1057 "title", "p", NULL, 1058 "body", "head", "style", "link", "title", "p", NULL, 1059 "frameset", "head", "style", "link", "title", "p", NULL, 1060 "li", "p", "h1", "h2", "h3", "h4", "h5", "h6", "dl", "address", 1061 "pre", "listing", "xmp", "head", "li", NULL, 1062 "hr", "p", "head", NULL, 1063 "h1", "p", "head", NULL, 1064 "h2", "p", "head", NULL, 1065 "h3", "p", "head", NULL, 1066 "h4", "p", "head", NULL, 1067 "h5", "p", "head", NULL, 1068 "h6", "p", "head", NULL, 1069 "dir", "p", "head", NULL, 1070 "address", "p", "head", "ul", NULL, 1071 "pre", "p", "head", "ul", NULL, 1072 "listing", "p", "head", NULL, 1073 "xmp", "p", "head", NULL, 1074 "blockquote", "p", "head", NULL, 1075 "dl", "p", "dt", "menu", "dir", "address", "pre", "listing", 1076 "xmp", "head", NULL, 1077 "dt", "p", "menu", "dir", "address", "pre", "listing", "xmp", 1078 "head", "dd", NULL, 1079 "dd", "p", "menu", "dir", "address", "pre", "listing", "xmp", 1080 "head", "dt", NULL, 1081 "ul", "p", "head", "ol", "menu", "dir", "address", "pre", 1082 "listing", "xmp", NULL, 
1083 "ol", "p", "head", "ul", NULL, 1084 "menu", "p", "head", "ul", NULL, 1085 "p", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", FONTSTYLE, NULL, 1086 "div", "p", "head", NULL, 1087 "noscript", "p", NULL, 1088 "center", "font", "b", "i", "p", "head", NULL, 1089 "a", "a", "head", NULL, 1090 "caption", "p", NULL, 1091 "colgroup", "caption", "colgroup", "col", "p", NULL, 1092 "col", "caption", "col", "p", NULL, 1093 "table", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", "pre", 1094 "listing", "xmp", "a", NULL, 1095 "th", "th", "td", "p", "span", "font", "a", "b", "i", "u", NULL, 1096 "td", "th", "td", "p", "span", "font", "a", "b", "i", "u", NULL, 1097 "tr", "th", "td", "tr", "caption", "col", "colgroup", "p", NULL, 1098 "thead", "caption", "col", "colgroup", NULL, 1099 "tfoot", "th", "td", "tr", "caption", "col", "colgroup", "thead", 1100 "tbody", "p", NULL, 1101 "tbody", "th", "td", "tr", "caption", "col", "colgroup", "thead", 1102 "tfoot", "tbody", "p", NULL, 1103 "optgroup", "option", NULL, 1104 "option", "option", NULL, 1105 "fieldset", "legend", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", 1106 "pre", "listing", "xmp", "a", NULL, 1107 /* most tags in in FONTSTYLE, PHRASE and SPECIAL should close <head> */ 1108 "tt", "head", NULL, 1109 "i", "head", NULL, 1110 "b", "head", NULL, 1111 "u", "head", NULL, 1112 "s", "head", NULL, 1113 "strike", "head", NULL, 1114 "big", "head", NULL, 1115 "small", "head", NULL, 1116 1117 "em", "head", NULL, 1118 "strong", "head", NULL, 1119 "dfn", "head", NULL, 1120 "code", "head", NULL, 1121 "samp", "head", NULL, 1122 "kbd", "head", NULL, 1123 "var", "head", NULL, 1124 "cite", "head", NULL, 1125 "abbr", "head", NULL, 1126 "acronym", "head", NULL, 1127 1128 /* "a" */ 1129 "img", "head", NULL, 1130 /* "applet" */ 1131 /* "embed" */ 1132 /* "object" */ 1133 "font", "head", NULL, 1134 /* "basefont" */ 1135 "br", "head", NULL, 1136 /* "script" */ 1137 "map", "head", NULL, 1138 "q", "head", NULL, 1139 "sub", "head", NULL, 1140 
"sup", "head", NULL, 1141 "span", "head", NULL, 1142 "bdo", "head", NULL, 1143 "iframe", "head", NULL, 1144 NULL 1145 }; 1146 1147 /* 1148 * The list of HTML elements which are supposed not to have 1149 * CDATA content and where a p element will be implied 1150 * 1151 * TODO: extend that list by reading the HTML SGML DTD on 1152 * implied paragraph 1153 */ 1154 static const char *const htmlNoContentElements[] = { 1155 "html", 1156 "head", 1157 NULL 1158 }; 1159 1160 /* 1161 * The list of HTML attributes which are of content %Script; 1162 * NOTE: when adding ones, check htmlIsScriptAttribute() since 1163 * it assumes the name starts with 'on' 1164 */ 1165 static const char *const htmlScriptAttributes[] = { 1166 "onclick", 1167 "ondblclick", 1168 "onmousedown", 1169 "onmouseup", 1170 "onmouseover", 1171 "onmousemove", 1172 "onmouseout", 1173 "onkeypress", 1174 "onkeydown", 1175 "onkeyup", 1176 "onload", 1177 "onunload", 1178 "onfocus", 1179 "onblur", 1180 "onsubmit", 1181 "onreset", 1182 "onchange", 1183 "onselect" 1184 }; 1185 1186 /* 1187 * This table is used by the htmlparser to know what to do with 1188 * broken html pages. By assigning different priorities to different 1189 * elements the parser can decide how to handle extra endtags. 1190 * Endtags are only allowed to close elements with lower or equal 1191 * priority. 
1192 */ 1193 1194 typedef struct { 1195 const char *name; 1196 int priority; 1197 } elementPriority; 1198 1199 static const elementPriority htmlEndPriority[] = { 1200 {"div", 150}, 1201 {"td", 160}, 1202 {"th", 160}, 1203 {"tr", 170}, 1204 {"thead", 180}, 1205 {"tbody", 180}, 1206 {"tfoot", 180}, 1207 {"table", 190}, 1208 {"head", 200}, 1209 {"body", 200}, 1210 {"html", 220}, 1211 {NULL, 100} /* Default priority */ 1212 }; 1213 1214 static const char** htmlStartCloseIndex[100]; 1215 static int htmlStartCloseIndexinitialized = 0; 1216 1217 /************************************************************************ 1218 * * 1219 * functions to handle HTML specific data * 1220 * * 1221 ************************************************************************/ 1222 1223 /** 1224 * htmlInitAutoClose: 1225 * 1226 * Initialize the htmlStartCloseIndex for fast lookup of closing tags names. 1227 * This is not reentrant. Call xmlInitParser() once before processing in 1228 * case of use in multithreaded programs. 1229 */ 1230 void 1231 htmlInitAutoClose(void) { 1232 int indx, i = 0; 1233 1234 if (htmlStartCloseIndexinitialized) return; 1235 1236 for (indx = 0;indx < 100;indx ++) htmlStartCloseIndex[indx] = NULL; 1237 indx = 0; 1238 while ((htmlStartClose[i] != NULL) && (indx < 100 - 1)) { 1239 htmlStartCloseIndex[indx++] = (const char**) &htmlStartClose[i]; 1240 while (htmlStartClose[i] != NULL) i++; 1241 i++; 1242 } 1243 htmlStartCloseIndexinitialized = 1; 1244 } 1245 1246 /** 1247 * htmlTagLookup: 1248 * @tag: The tag name in lowercase 1249 * 1250 * Lookup the HTML tag in the ElementTable 1251 * 1252 * Returns the related htmlElemDescPtr or NULL if not found. 
1253 */ 1254 const htmlElemDesc * 1255 htmlTagLookup(const xmlChar *tag) { 1256 unsigned int i; 1257 1258 for (i = 0; i < (sizeof(html40ElementTable) / 1259 sizeof(html40ElementTable[0]));i++) { 1260 if (!xmlStrcasecmp(tag, BAD_CAST html40ElementTable[i].name)) 1261 return((htmlElemDescPtr) &html40ElementTable[i]); 1262 } 1263 return(NULL); 1264 } 1265 1266 /** 1267 * htmlGetEndPriority: 1268 * @name: The name of the element to look up the priority for. 1269 * 1270 * Return value: The "endtag" priority. 1271 **/ 1272 static int 1273 htmlGetEndPriority (const xmlChar *name) { 1274 int i = 0; 1275 1276 while ((htmlEndPriority[i].name != NULL) && 1277 (!xmlStrEqual((const xmlChar *)htmlEndPriority[i].name, name))) 1278 i++; 1279 1280 return(htmlEndPriority[i].priority); 1281 } 1282 1283 1284 /** 1285 * htmlCheckAutoClose: 1286 * @newtag: The new tag name 1287 * @oldtag: The old tag name 1288 * 1289 * Checks whether the new tag is one of the registered valid tags for 1290 * closing old. 1291 * Initialize the htmlStartCloseIndex for fast lookup of closing tags names. 1292 * 1293 * Returns 0 if no, 1 if yes. 
1294 */ 1295 static int 1296 htmlCheckAutoClose(const xmlChar * newtag, const xmlChar * oldtag) 1297 { 1298 int i, indx; 1299 const char **closed = NULL; 1300 1301 if (htmlStartCloseIndexinitialized == 0) 1302 htmlInitAutoClose(); 1303 1304 /* inefficient, but not a big deal */ 1305 for (indx = 0; indx < 100; indx++) { 1306 closed = htmlStartCloseIndex[indx]; 1307 if (closed == NULL) 1308 return (0); 1309 if (xmlStrEqual(BAD_CAST * closed, newtag)) 1310 break; 1311 } 1312 1313 i = closed - htmlStartClose; 1314 i++; 1315 while (htmlStartClose[i] != NULL) { 1316 if (xmlStrEqual(BAD_CAST htmlStartClose[i], oldtag)) { 1317 return (1); 1318 } 1319 i++; 1320 } 1321 return (0); 1322 } 1323 1324 /** 1325 * htmlAutoCloseOnClose: 1326 * @ctxt: an HTML parser context 1327 * @newtag: The new tag name 1328 * @force: force the tag closure 1329 * 1330 * The HTML DTD allows an ending tag to implicitly close other tags. 1331 */ 1332 static void 1333 htmlAutoCloseOnClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag) 1334 { 1335 const htmlElemDesc *info; 1336 int i, priority; 1337 1338 priority = htmlGetEndPriority(newtag); 1339 1340 for (i = (ctxt->nameNr - 1); i >= 0; i--) { 1341 1342 if (xmlStrEqual(newtag, ctxt->nameTab[i])) 1343 break; 1344 /* 1345 * A missplaced endtag can only close elements with lower 1346 * or equal priority, so if we find an element with higher 1347 * priority before we find an element with 1348 * matching name, we just ignore this endtag 1349 */ 1350 if (htmlGetEndPriority(ctxt->nameTab[i]) > priority) 1351 return; 1352 } 1353 if (i < 0) 1354 return; 1355 1356 while (!xmlStrEqual(newtag, ctxt->name)) { 1357 info = htmlTagLookup(ctxt->name); 1358 if ((info != NULL) && (info->endTag == 3)) { 1359 htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH, 1360 "Opening and ending tag mismatch: %s and %s\n", 1361 newtag, ctxt->name); 1362 } 1363 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL)) 1364 ctxt->sax->endElement(ctxt->userData, ctxt->name); 1365 
htmlnamePop(ctxt); 1366 } 1367 } 1368 1369 /** 1370 * htmlAutoCloseOnEnd: 1371 * @ctxt: an HTML parser context 1372 * 1373 * Close all remaining tags at the end of the stream 1374 */ 1375 static void 1376 htmlAutoCloseOnEnd(htmlParserCtxtPtr ctxt) 1377 { 1378 int i; 1379 1380 if (ctxt->nameNr == 0) 1381 return; 1382 for (i = (ctxt->nameNr - 1); i >= 0; i--) { 1383 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL)) 1384 ctxt->sax->endElement(ctxt->userData, ctxt->name); 1385 htmlnamePop(ctxt); 1386 } 1387 } 1388 1389 /** 1390 * htmlAutoClose: 1391 * @ctxt: an HTML parser context 1392 * @newtag: The new tag name or NULL 1393 * 1394 * The HTML DTD allows a tag to implicitly close other tags. 1395 * The list is kept in htmlStartClose array. This function is 1396 * called when a new tag has been detected and generates the 1397 * appropriates closes if possible/needed. 1398 * If newtag is NULL this mean we are at the end of the resource 1399 * and we should check 1400 */ 1401 static void 1402 htmlAutoClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag) 1403 { 1404 while ((newtag != NULL) && (ctxt->name != NULL) && 1405 (htmlCheckAutoClose(newtag, ctxt->name))) { 1406 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL)) 1407 ctxt->sax->endElement(ctxt->userData, ctxt->name); 1408 htmlnamePop(ctxt); 1409 } 1410 if (newtag == NULL) { 1411 htmlAutoCloseOnEnd(ctxt); 1412 return; 1413 } 1414 while ((newtag == NULL) && (ctxt->name != NULL) && 1415 ((xmlStrEqual(ctxt->name, BAD_CAST "head")) || 1416 (xmlStrEqual(ctxt->name, BAD_CAST "body")) || 1417 (xmlStrEqual(ctxt->name, BAD_CAST "html")))) { 1418 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL)) 1419 ctxt->sax->endElement(ctxt->userData, ctxt->name); 1420 htmlnamePop(ctxt); 1421 } 1422 } 1423 1424 /** 1425 * htmlAutoCloseTag: 1426 * @doc: the HTML document 1427 * @name: The tag name 1428 * @elem: the HTML element 1429 * 1430 * The HTML DTD allows a tag to implicitly close other tags. 
1431 * The list is kept in htmlStartClose array. This function checks 1432 * if the element or one of it's children would autoclose the 1433 * given tag. 1434 * 1435 * Returns 1 if autoclose, 0 otherwise 1436 */ 1437 int 1438 htmlAutoCloseTag(htmlDocPtr doc, const xmlChar *name, htmlNodePtr elem) { 1439 htmlNodePtr child; 1440 1441 if (elem == NULL) return(1); 1442 if (xmlStrEqual(name, elem->name)) return(0); 1443 if (htmlCheckAutoClose(elem->name, name)) return(1); 1444 child = elem->children; 1445 while (child != NULL) { 1446 if (htmlAutoCloseTag(doc, name, child)) return(1); 1447 child = child->next; 1448 } 1449 return(0); 1450 } 1451 1452 /** 1453 * htmlIsAutoClosed: 1454 * @doc: the HTML document 1455 * @elem: the HTML element 1456 * 1457 * The HTML DTD allows a tag to implicitly close other tags. 1458 * The list is kept in htmlStartClose array. This function checks 1459 * if a tag is autoclosed by one of it's child 1460 * 1461 * Returns 1 if autoclosed, 0 otherwise 1462 */ 1463 int 1464 htmlIsAutoClosed(htmlDocPtr doc, htmlNodePtr elem) { 1465 htmlNodePtr child; 1466 1467 if (elem == NULL) return(1); 1468 child = elem->children; 1469 while (child != NULL) { 1470 if (htmlAutoCloseTag(doc, elem->name, child)) return(1); 1471 child = child->next; 1472 } 1473 return(0); 1474 } 1475 1476 /** 1477 * htmlCheckImplied: 1478 * @ctxt: an HTML parser context 1479 * @newtag: The new tag name 1480 * 1481 * The HTML DTD allows a tag to exists only implicitly 1482 * called when a new tag has been detected and generates the 1483 * appropriates implicit tags if missing 1484 */ 1485 static void 1486 htmlCheckImplied(htmlParserCtxtPtr ctxt, const xmlChar *newtag) { 1487 int i; 1488 1489 if (ctxt->options & HTML_PARSE_NOIMPLIED) 1490 return; 1491 if (!htmlOmittedDefaultValue) 1492 return; 1493 if (xmlStrEqual(newtag, BAD_CAST"html")) 1494 return; 1495 if (ctxt->nameNr <= 0) { 1496 htmlnamePush(ctxt, BAD_CAST"html"); 1497 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != 
NULL)) 1498 ctxt->sax->startElement(ctxt->userData, BAD_CAST"html", NULL); 1499 } 1500 if ((xmlStrEqual(newtag, BAD_CAST"body")) || (xmlStrEqual(newtag, BAD_CAST"head"))) 1501 return; 1502 if ((ctxt->nameNr <= 1) && 1503 ((xmlStrEqual(newtag, BAD_CAST"script")) || 1504 (xmlStrEqual(newtag, BAD_CAST"style")) || 1505 (xmlStrEqual(newtag, BAD_CAST"meta")) || 1506 (xmlStrEqual(newtag, BAD_CAST"link")) || 1507 (xmlStrEqual(newtag, BAD_CAST"title")) || 1508 (xmlStrEqual(newtag, BAD_CAST"base")))) { 1509 if (ctxt->html >= 3) { 1510 /* we already saw or generated an <head> before */ 1511 return; 1512 } 1513 /* 1514 * dropped OBJECT ... i you put it first BODY will be 1515 * assumed ! 1516 */ 1517 htmlnamePush(ctxt, BAD_CAST"head"); 1518 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL)) 1519 ctxt->sax->startElement(ctxt->userData, BAD_CAST"head", NULL); 1520 } else if ((!xmlStrEqual(newtag, BAD_CAST"noframes")) && 1521 (!xmlStrEqual(newtag, BAD_CAST"frame")) && 1522 (!xmlStrEqual(newtag, BAD_CAST"frameset"))) { 1523 if (ctxt->html >= 10) { 1524 /* we already saw or generated a <body> before */ 1525 return; 1526 } 1527 for (i = 0;i < ctxt->nameNr;i++) { 1528 if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"body")) { 1529 return; 1530 } 1531 if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"head")) { 1532 return; 1533 } 1534 } 1535 1536 htmlnamePush(ctxt, BAD_CAST"body"); 1537 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL)) 1538 ctxt->sax->startElement(ctxt->userData, BAD_CAST"body", NULL); 1539 } 1540 } 1541 1542 /** 1543 * htmlCheckParagraph 1544 * @ctxt: an HTML parser context 1545 * 1546 * Check whether a p element need to be implied before inserting 1547 * characters in the current element. 1548 * 1549 * Returns 1 if a paragraph has been inserted, 0 if not and -1 1550 * in case of error. 
1551 */ 1552 1553 static int 1554 htmlCheckParagraph(htmlParserCtxtPtr ctxt) { 1555 const xmlChar *tag; 1556 int i; 1557 1558 if (ctxt == NULL) 1559 return(-1); 1560 tag = ctxt->name; 1561 if (tag == NULL) { 1562 htmlAutoClose(ctxt, BAD_CAST"p"); 1563 htmlCheckImplied(ctxt, BAD_CAST"p"); 1564 htmlnamePush(ctxt, BAD_CAST"p"); 1565 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL)) 1566 ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL); 1567 return(1); 1568 } 1569 if (!htmlOmittedDefaultValue) 1570 return(0); 1571 for (i = 0; htmlNoContentElements[i] != NULL; i++) { 1572 if (xmlStrEqual(tag, BAD_CAST htmlNoContentElements[i])) { 1573 htmlAutoClose(ctxt, BAD_CAST"p"); 1574 htmlCheckImplied(ctxt, BAD_CAST"p"); 1575 htmlnamePush(ctxt, BAD_CAST"p"); 1576 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL)) 1577 ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL); 1578 return(1); 1579 } 1580 } 1581 return(0); 1582 } 1583 1584 /** 1585 * htmlIsScriptAttribute: 1586 * @name: an attribute name 1587 * 1588 * Check if an attribute is of content type Script 1589 * 1590 * Returns 1 is the attribute is a script 0 otherwise 1591 */ 1592 int 1593 htmlIsScriptAttribute(const xmlChar *name) { 1594 unsigned int i; 1595 1596 if (name == NULL) 1597 return(0); 1598 /* 1599 * all script attributes start with 'on' 1600 */ 1601 if ((name[0] != 'o') || (name[1] != 'n')) 1602 return(0); 1603 for (i = 0; 1604 i < sizeof(htmlScriptAttributes)/sizeof(htmlScriptAttributes[0]); 1605 i++) { 1606 if (xmlStrEqual(name, (const xmlChar *) htmlScriptAttributes[i])) 1607 return(1); 1608 } 1609 return(0); 1610 } 1611 1612 /************************************************************************ 1613 * * 1614 * The list of HTML predefined entities * 1615 * * 1616 ************************************************************************/ 1617 1618 1619 static const htmlEntityDesc html40EntitiesTable[] = { 1620 /* 1621 * the 4 absolute ones, plus apostrophe. 
1622 */ 1623 { 34, "quot", "quotation mark = APL quote, U+0022 ISOnum" }, 1624 { 38, "amp", "ampersand, U+0026 ISOnum" }, 1625 { 39, "apos", "single quote" }, 1626 { 60, "lt", "less-than sign, U+003C ISOnum" }, 1627 { 62, "gt", "greater-than sign, U+003E ISOnum" }, 1628 1629 /* 1630 * A bunch still in the 128-255 range 1631 * Replacing them depend really on the charset used. 1632 */ 1633 { 160, "nbsp", "no-break space = non-breaking space, U+00A0 ISOnum" }, 1634 { 161, "iexcl","inverted exclamation mark, U+00A1 ISOnum" }, 1635 { 162, "cent", "cent sign, U+00A2 ISOnum" }, 1636 { 163, "pound","pound sign, U+00A3 ISOnum" }, 1637 { 164, "curren","currency sign, U+00A4 ISOnum" }, 1638 { 165, "yen", "yen sign = yuan sign, U+00A5 ISOnum" }, 1639 { 166, "brvbar","broken bar = broken vertical bar, U+00A6 ISOnum" }, 1640 { 167, "sect", "section sign, U+00A7 ISOnum" }, 1641 { 168, "uml", "diaeresis = spacing diaeresis, U+00A8 ISOdia" }, 1642 { 169, "copy", "copyright sign, U+00A9 ISOnum" }, 1643 { 170, "ordf", "feminine ordinal indicator, U+00AA ISOnum" }, 1644 { 171, "laquo","left-pointing double angle quotation mark = left pointing guillemet, U+00AB ISOnum" }, 1645 { 172, "not", "not sign, U+00AC ISOnum" }, 1646 { 173, "shy", "soft hyphen = discretionary hyphen, U+00AD ISOnum" }, 1647 { 174, "reg", "registered sign = registered trade mark sign, U+00AE ISOnum" }, 1648 { 175, "macr", "macron = spacing macron = overline = APL overbar, U+00AF ISOdia" }, 1649 { 176, "deg", "degree sign, U+00B0 ISOnum" }, 1650 { 177, "plusmn","plus-minus sign = plus-or-minus sign, U+00B1 ISOnum" }, 1651 { 178, "sup2", "superscript two = superscript digit two = squared, U+00B2 ISOnum" }, 1652 { 179, "sup3", "superscript three = superscript digit three = cubed, U+00B3 ISOnum" }, 1653 { 180, "acute","acute accent = spacing acute, U+00B4 ISOdia" }, 1654 { 181, "micro","micro sign, U+00B5 ISOnum" }, 1655 { 182, "para", "pilcrow sign = paragraph sign, U+00B6 ISOnum" }, 1656 { 183, "middot","middle dot 
= Georgian comma Greek middle dot, U+00B7 ISOnum" }, 1657 { 184, "cedil","cedilla = spacing cedilla, U+00B8 ISOdia" }, 1658 { 185, "sup1", "superscript one = superscript digit one, U+00B9 ISOnum" }, 1659 { 186, "ordm", "masculine ordinal indicator, U+00BA ISOnum" }, 1660 { 187, "raquo","right-pointing double angle quotation mark right pointing guillemet, U+00BB ISOnum" }, 1661 { 188, "frac14","vulgar fraction one quarter = fraction one quarter, U+00BC ISOnum" }, 1662 { 189, "frac12","vulgar fraction one half = fraction one half, U+00BD ISOnum" }, 1663 { 190, "frac34","vulgar fraction three quarters = fraction three quarters, U+00BE ISOnum" }, 1664 { 191, "iquest","inverted question mark = turned question mark, U+00BF ISOnum" }, 1665 { 192, "Agrave","latin capital letter A with grave = latin capital letter A grave, U+00C0 ISOlat1" }, 1666 { 193, "Aacute","latin capital letter A with acute, U+00C1 ISOlat1" }, 1667 { 194, "Acirc","latin capital letter A with circumflex, U+00C2 ISOlat1" }, 1668 { 195, "Atilde","latin capital letter A with tilde, U+00C3 ISOlat1" }, 1669 { 196, "Auml", "latin capital letter A with diaeresis, U+00C4 ISOlat1" }, 1670 { 197, "Aring","latin capital letter A with ring above = latin capital letter A ring, U+00C5 ISOlat1" }, 1671 { 198, "AElig","latin capital letter AE = latin capital ligature AE, U+00C6 ISOlat1" }, 1672 { 199, "Ccedil","latin capital letter C with cedilla, U+00C7 ISOlat1" }, 1673 { 200, "Egrave","latin capital letter E with grave, U+00C8 ISOlat1" }, 1674 { 201, "Eacute","latin capital letter E with acute, U+00C9 ISOlat1" }, 1675 { 202, "Ecirc","latin capital letter E with circumflex, U+00CA ISOlat1" }, 1676 { 203, "Euml", "latin capital letter E with diaeresis, U+00CB ISOlat1" }, 1677 { 204, "Igrave","latin capital letter I with grave, U+00CC ISOlat1" }, 1678 { 205, "Iacute","latin capital letter I with acute, U+00CD ISOlat1" }, 1679 { 206, "Icirc","latin capital letter I with circumflex, U+00CE ISOlat1" }, 1680 { 207, "Iuml", 
"latin capital letter I with diaeresis, U+00CF ISOlat1" }, 1681 { 208, "ETH", "latin capital letter ETH, U+00D0 ISOlat1" }, 1682 { 209, "Ntilde","latin capital letter N with tilde, U+00D1 ISOlat1" }, 1683 { 210, "Ograve","latin capital letter O with grave, U+00D2 ISOlat1" }, 1684 { 211, "Oacute","latin capital letter O with acute, U+00D3 ISOlat1" }, 1685 { 212, "Ocirc","latin capital letter O with circumflex, U+00D4 ISOlat1" }, 1686 { 213, "Otilde","latin capital letter O with tilde, U+00D5 ISOlat1" }, 1687 { 214, "Ouml", "latin capital letter O with diaeresis, U+00D6 ISOlat1" }, 1688 { 215, "times","multiplication sign, U+00D7 ISOnum" }, 1689 { 216, "Oslash","latin capital letter O with stroke latin capital letter O slash, U+00D8 ISOlat1" }, 1690 { 217, "Ugrave","latin capital letter U with grave, U+00D9 ISOlat1" }, 1691 { 218, "Uacute","latin capital letter U with acute, U+00DA ISOlat1" }, 1692 { 219, "Ucirc","latin capital letter U with circumflex, U+00DB ISOlat1" }, 1693 { 220, "Uuml", "latin capital letter U with diaeresis, U+00DC ISOlat1" }, 1694 { 221, "Yacute","latin capital letter Y with acute, U+00DD ISOlat1" }, 1695 { 222, "THORN","latin capital letter THORN, U+00DE ISOlat1" }, 1696 { 223, "szlig","latin small letter sharp s = ess-zed, U+00DF ISOlat1" }, 1697 { 224, "agrave","latin small letter a with grave = latin small letter a grave, U+00E0 ISOlat1" }, 1698 { 225, "aacute","latin small letter a with acute, U+00E1 ISOlat1" }, 1699 { 226, "acirc","latin small letter a with circumflex, U+00E2 ISOlat1" }, 1700 { 227, "atilde","latin small letter a with tilde, U+00E3 ISOlat1" }, 1701 { 228, "auml", "latin small letter a with diaeresis, U+00E4 ISOlat1" }, 1702 { 229, "aring","latin small letter a with ring above = latin small letter a ring, U+00E5 ISOlat1" }, 1703 { 230, "aelig","latin small letter ae = latin small ligature ae, U+00E6 ISOlat1" }, 1704 { 231, "ccedil","latin small letter c with cedilla, U+00E7 ISOlat1" }, 1705 { 232, "egrave","latin small 
letter e with grave, U+00E8 ISOlat1" }, 1706 { 233, "eacute","latin small letter e with acute, U+00E9 ISOlat1" }, 1707 { 234, "ecirc","latin small letter e with circumflex, U+00EA ISOlat1" }, 1708 { 235, "euml", "latin small letter e with diaeresis, U+00EB ISOlat1" }, 1709 { 236, "igrave","latin small letter i with grave, U+00EC ISOlat1" }, 1710 { 237, "iacute","latin small letter i with acute, U+00ED ISOlat1" }, 1711 { 238, "icirc","latin small letter i with circumflex, U+00EE ISOlat1" }, 1712 { 239, "iuml", "latin small letter i with diaeresis, U+00EF ISOlat1" }, 1713 { 240, "eth", "latin small letter eth, U+00F0 ISOlat1" }, 1714 { 241, "ntilde","latin small letter n with tilde, U+00F1 ISOlat1" }, 1715 { 242, "ograve","latin small letter o with grave, U+00F2 ISOlat1" }, 1716 { 243, "oacute","latin small letter o with acute, U+00F3 ISOlat1" }, 1717 { 244, "ocirc","latin small letter o with circumflex, U+00F4 ISOlat1" }, 1718 { 245, "otilde","latin small letter o with tilde, U+00F5 ISOlat1" }, 1719 { 246, "ouml", "latin small letter o with diaeresis, U+00F6 ISOlat1" }, 1720 { 247, "divide","division sign, U+00F7 ISOnum" }, 1721 { 248, "oslash","latin small letter o with stroke, = latin small letter o slash, U+00F8 ISOlat1" }, 1722 { 249, "ugrave","latin small letter u with grave, U+00F9 ISOlat1" }, 1723 { 250, "uacute","latin small letter u with acute, U+00FA ISOlat1" }, 1724 { 251, "ucirc","latin small letter u with circumflex, U+00FB ISOlat1" }, 1725 { 252, "uuml", "latin small letter u with diaeresis, U+00FC ISOlat1" }, 1726 { 253, "yacute","latin small letter y with acute, U+00FD ISOlat1" }, 1727 { 254, "thorn","latin small letter thorn with, U+00FE ISOlat1" }, 1728 { 255, "yuml", "latin small letter y with diaeresis, U+00FF ISOlat1" }, 1729 1730 { 338, "OElig","latin capital ligature OE, U+0152 ISOlat2" }, 1731 { 339, "oelig","latin small ligature oe, U+0153 ISOlat2" }, 1732 { 352, "Scaron","latin capital letter S with caron, U+0160 ISOlat2" }, 1733 { 353, 
"scaron","latin small letter s with caron, U+0161 ISOlat2" }, 1734 { 376, "Yuml", "latin capital letter Y with diaeresis, U+0178 ISOlat2" }, 1735 1736 /* 1737 * Anything below should really be kept as entities references 1738 */ 1739 { 402, "fnof", "latin small f with hook = function = florin, U+0192 ISOtech" }, 1740 1741 { 710, "circ", "modifier letter circumflex accent, U+02C6 ISOpub" }, 1742 { 732, "tilde","small tilde, U+02DC ISOdia" }, 1743 1744 { 913, "Alpha","greek capital letter alpha, U+0391" }, 1745 { 914, "Beta", "greek capital letter beta, U+0392" }, 1746 { 915, "Gamma","greek capital letter gamma, U+0393 ISOgrk3" }, 1747 { 916, "Delta","greek capital letter delta, U+0394 ISOgrk3" }, 1748 { 917, "Epsilon","greek capital letter epsilon, U+0395" }, 1749 { 918, "Zeta", "greek capital letter zeta, U+0396" }, 1750 { 919, "Eta", "greek capital letter eta, U+0397" }, 1751 { 920, "Theta","greek capital letter theta, U+0398 ISOgrk3" }, 1752 { 921, "Iota", "greek capital letter iota, U+0399" }, 1753 { 922, "Kappa","greek capital letter kappa, U+039A" }, 1754 { 923, "Lambda", "greek capital letter lambda, U+039B ISOgrk3" }, 1755 { 924, "Mu", "greek capital letter mu, U+039C" }, 1756 { 925, "Nu", "greek capital letter nu, U+039D" }, 1757 { 926, "Xi", "greek capital letter xi, U+039E ISOgrk3" }, 1758 { 927, "Omicron","greek capital letter omicron, U+039F" }, 1759 { 928, "Pi", "greek capital letter pi, U+03A0 ISOgrk3" }, 1760 { 929, "Rho", "greek capital letter rho, U+03A1" }, 1761 { 931, "Sigma","greek capital letter sigma, U+03A3 ISOgrk3" }, 1762 { 932, "Tau", "greek capital letter tau, U+03A4" }, 1763 { 933, "Upsilon","greek capital letter upsilon, U+03A5 ISOgrk3" }, 1764 { 934, "Phi", "greek capital letter phi, U+03A6 ISOgrk3" }, 1765 { 935, "Chi", "greek capital letter chi, U+03A7" }, 1766 { 936, "Psi", "greek capital letter psi, U+03A8 ISOgrk3" }, 1767 { 937, "Omega","greek capital letter omega, U+03A9 ISOgrk3" }, 1768 1769 { 945, "alpha","greek small letter 
alpha, U+03B1 ISOgrk3" }, 1770 { 946, "beta", "greek small letter beta, U+03B2 ISOgrk3" }, 1771 { 947, "gamma","greek small letter gamma, U+03B3 ISOgrk3" }, 1772 { 948, "delta","greek small letter delta, U+03B4 ISOgrk3" }, 1773 { 949, "epsilon","greek small letter epsilon, U+03B5 ISOgrk3" }, 1774 { 950, "zeta", "greek small letter zeta, U+03B6 ISOgrk3" }, 1775 { 951, "eta", "greek small letter eta, U+03B7 ISOgrk3" }, 1776 { 952, "theta","greek small letter theta, U+03B8 ISOgrk3" }, 1777 { 953, "iota", "greek small letter iota, U+03B9 ISOgrk3" }, 1778 { 954, "kappa","greek small letter kappa, U+03BA ISOgrk3" }, 1779 { 955, "lambda","greek small letter lambda, U+03BB ISOgrk3" }, 1780 { 956, "mu", "greek small letter mu, U+03BC ISOgrk3" }, 1781 { 957, "nu", "greek small letter nu, U+03BD ISOgrk3" }, 1782 { 958, "xi", "greek small letter xi, U+03BE ISOgrk3" }, 1783 { 959, "omicron","greek small letter omicron, U+03BF NEW" }, 1784 { 960, "pi", "greek small letter pi, U+03C0 ISOgrk3" }, 1785 { 961, "rho", "greek small letter rho, U+03C1 ISOgrk3" }, 1786 { 962, "sigmaf","greek small letter final sigma, U+03C2 ISOgrk3" }, 1787 { 963, "sigma","greek small letter sigma, U+03C3 ISOgrk3" }, 1788 { 964, "tau", "greek small letter tau, U+03C4 ISOgrk3" }, 1789 { 965, "upsilon","greek small letter upsilon, U+03C5 ISOgrk3" }, 1790 { 966, "phi", "greek small letter phi, U+03C6 ISOgrk3" }, 1791 { 967, "chi", "greek small letter chi, U+03C7 ISOgrk3" }, 1792 { 968, "psi", "greek small letter psi, U+03C8 ISOgrk3" }, 1793 { 969, "omega","greek small letter omega, U+03C9 ISOgrk3" }, 1794 { 977, "thetasym","greek small letter theta symbol, U+03D1 NEW" }, 1795 { 978, "upsih","greek upsilon with hook symbol, U+03D2 NEW" }, 1796 { 982, "piv", "greek pi symbol, U+03D6 ISOgrk3" }, 1797 1798 { 8194, "ensp", "en space, U+2002 ISOpub" }, 1799 { 8195, "emsp", "em space, U+2003 ISOpub" }, 1800 { 8201, "thinsp","thin space, U+2009 ISOpub" }, 1801 { 8204, "zwnj", "zero width non-joiner, U+200C NEW RFC 
2070" }, 1802 { 8205, "zwj", "zero width joiner, U+200D NEW RFC 2070" }, 1803 { 8206, "lrm", "left-to-right mark, U+200E NEW RFC 2070" }, 1804 { 8207, "rlm", "right-to-left mark, U+200F NEW RFC 2070" }, 1805 { 8211, "ndash","en dash, U+2013 ISOpub" }, 1806 { 8212, "mdash","em dash, U+2014 ISOpub" }, 1807 { 8216, "lsquo","left single quotation mark, U+2018 ISOnum" }, 1808 { 8217, "rsquo","right single quotation mark, U+2019 ISOnum" }, 1809 { 8218, "sbquo","single low-9 quotation mark, U+201A NEW" }, 1810 { 8220, "ldquo","left double quotation mark, U+201C ISOnum" }, 1811 { 8221, "rdquo","right double quotation mark, U+201D ISOnum" }, 1812 { 8222, "bdquo","double low-9 quotation mark, U+201E NEW" }, 1813 { 8224, "dagger","dagger, U+2020 ISOpub" }, 1814 { 8225, "Dagger","double dagger, U+2021 ISOpub" }, 1815 1816 { 8226, "bull", "bullet = black small circle, U+2022 ISOpub" }, 1817 { 8230, "hellip","horizontal ellipsis = three dot leader, U+2026 ISOpub" }, 1818 1819 { 8240, "permil","per mille sign, U+2030 ISOtech" }, 1820 1821 { 8242, "prime","prime = minutes = feet, U+2032 ISOtech" }, 1822 { 8243, "Prime","double prime = seconds = inches, U+2033 ISOtech" }, 1823 1824 { 8249, "lsaquo","single left-pointing angle quotation mark, U+2039 ISO proposed" }, 1825 { 8250, "rsaquo","single right-pointing angle quotation mark, U+203A ISO proposed" }, 1826 1827 { 8254, "oline","overline = spacing overscore, U+203E NEW" }, 1828 { 8260, "frasl","fraction slash, U+2044 NEW" }, 1829 1830 { 8364, "euro", "euro sign, U+20AC NEW" }, 1831 1832 { 8465, "image","blackletter capital I = imaginary part, U+2111 ISOamso" }, 1833 { 8472, "weierp","script capital P = power set = Weierstrass p, U+2118 ISOamso" }, 1834 { 8476, "real", "blackletter capital R = real part symbol, U+211C ISOamso" }, 1835 { 8482, "trade","trade mark sign, U+2122 ISOnum" }, 1836 { 8501, "alefsym","alef symbol = first transfinite cardinal, U+2135 NEW" }, 1837 { 8592, "larr", "leftwards arrow, U+2190 ISOnum" }, 1838 { 
8593, "uarr", "upwards arrow, U+2191 ISOnum" }, 1839 { 8594, "rarr", "rightwards arrow, U+2192 ISOnum" }, 1840 { 8595, "darr", "downwards arrow, U+2193 ISOnum" }, 1841 { 8596, "harr", "left right arrow, U+2194 ISOamsa" }, 1842 { 8629, "crarr","downwards arrow with corner leftwards = carriage return, U+21B5 NEW" }, 1843 { 8656, "lArr", "leftwards double arrow, U+21D0 ISOtech" }, 1844 { 8657, "uArr", "upwards double arrow, U+21D1 ISOamsa" }, 1845 { 8658, "rArr", "rightwards double arrow, U+21D2 ISOtech" }, 1846 { 8659, "dArr", "downwards double arrow, U+21D3 ISOamsa" }, 1847 { 8660, "hArr", "left right double arrow, U+21D4 ISOamsa" }, 1848 1849 { 8704, "forall","for all, U+2200 ISOtech" }, 1850 { 8706, "part", "partial differential, U+2202 ISOtech" }, 1851 { 8707, "exist","there exists, U+2203 ISOtech" }, 1852 { 8709, "empty","empty set = null set = diameter, U+2205 ISOamso" }, 1853 { 8711, "nabla","nabla = backward difference, U+2207 ISOtech" }, 1854 { 8712, "isin", "element of, U+2208 ISOtech" }, 1855 { 8713, "notin","not an element of, U+2209 ISOtech" }, 1856 { 8715, "ni", "contains as member, U+220B ISOtech" }, 1857 { 8719, "prod", "n-ary product = product sign, U+220F ISOamsb" }, 1858 { 8721, "sum", "n-ary summation, U+2211 ISOamsb" }, 1859 { 8722, "minus","minus sign, U+2212 ISOtech" }, 1860 { 8727, "lowast","asterisk operator, U+2217 ISOtech" }, 1861 { 8730, "radic","square root = radical sign, U+221A ISOtech" }, 1862 { 8733, "prop", "proportional to, U+221D ISOtech" }, 1863 { 8734, "infin","infinity, U+221E ISOtech" }, 1864 { 8736, "ang", "angle, U+2220 ISOamso" }, 1865 { 8743, "and", "logical and = wedge, U+2227 ISOtech" }, 1866 { 8744, "or", "logical or = vee, U+2228 ISOtech" }, 1867 { 8745, "cap", "intersection = cap, U+2229 ISOtech" }, 1868 { 8746, "cup", "union = cup, U+222A ISOtech" }, 1869 { 8747, "int", "integral, U+222B ISOtech" }, 1870 { 8756, "there4","therefore, U+2234 ISOtech" }, 1871 { 8764, "sim", "tilde operator = varies with = similar to, 
U+223C ISOtech" }, 1872 { 8773, "cong", "approximately equal to, U+2245 ISOtech" }, 1873 { 8776, "asymp","almost equal to = asymptotic to, U+2248 ISOamsr" }, 1874 { 8800, "ne", "not equal to, U+2260 ISOtech" }, 1875 { 8801, "equiv","identical to, U+2261 ISOtech" }, 1876 { 8804, "le", "less-than or equal to, U+2264 ISOtech" }, 1877 { 8805, "ge", "greater-than or equal to, U+2265 ISOtech" }, 1878 { 8834, "sub", "subset of, U+2282 ISOtech" }, 1879 { 8835, "sup", "superset of, U+2283 ISOtech" }, 1880 { 8836, "nsub", "not a subset of, U+2284 ISOamsn" }, 1881 { 8838, "sube", "subset of or equal to, U+2286 ISOtech" }, 1882 { 8839, "supe", "superset of or equal to, U+2287 ISOtech" }, 1883 { 8853, "oplus","circled plus = direct sum, U+2295 ISOamsb" }, 1884 { 8855, "otimes","circled times = vector product, U+2297 ISOamsb" }, 1885 { 8869, "perp", "up tack = orthogonal to = perpendicular, U+22A5 ISOtech" }, 1886 { 8901, "sdot", "dot operator, U+22C5 ISOamsb" }, 1887 { 8968, "lceil","left ceiling = apl upstile, U+2308 ISOamsc" }, 1888 { 8969, "rceil","right ceiling, U+2309 ISOamsc" }, 1889 { 8970, "lfloor","left floor = apl downstile, U+230A ISOamsc" }, 1890 { 8971, "rfloor","right floor, U+230B ISOamsc" }, 1891 { 9001, "lang", "left-pointing angle bracket = bra, U+2329 ISOtech" }, 1892 { 9002, "rang", "right-pointing angle bracket = ket, U+232A ISOtech" }, 1893 { 9674, "loz", "lozenge, U+25CA ISOpub" }, 1894 1895 { 9824, "spades","black spade suit, U+2660 ISOpub" }, 1896 { 9827, "clubs","black club suit = shamrock, U+2663 ISOpub" }, 1897 { 9829, "hearts","black heart suit = valentine, U+2665 ISOpub" }, 1898 { 9830, "diams","black diamond suit, U+2666 ISOpub" }, 1899 1900 }; 1901 1902 /************************************************************************ 1903 * * 1904 * Commodity functions to handle entities * 1905 * * 1906 ************************************************************************/ 1907 1908 /* 1909 * Macro used to grow the current buffer. 
1910 */ 1911 #define growBuffer(buffer) { \ 1912 xmlChar *tmp; \ 1913 buffer##_size *= 2; \ 1914 tmp = (xmlChar *) xmlRealloc(buffer, buffer##_size * sizeof(xmlChar)); \ 1915 if (tmp == NULL) { \ 1916 htmlErrMemory(ctxt, "growing buffer\n"); \ 1917 xmlFree(buffer); \ 1918 return(NULL); \ 1919 } \ 1920 buffer = tmp; \ 1921 } 1922 1923 /** 1924 * htmlEntityLookup: 1925 * @name: the entity name 1926 * 1927 * Lookup the given entity in EntitiesTable 1928 * 1929 * TODO: the linear scan is really ugly, an hash table is really needed. 1930 * 1931 * Returns the associated htmlEntityDescPtr if found, NULL otherwise. 1932 */ 1933 const htmlEntityDesc * 1934 htmlEntityLookup(const xmlChar *name) { 1935 unsigned int i; 1936 1937 for (i = 0;i < (sizeof(html40EntitiesTable)/ 1938 sizeof(html40EntitiesTable[0]));i++) { 1939 if (xmlStrEqual(name, BAD_CAST html40EntitiesTable[i].name)) { 1940 return((htmlEntityDescPtr) &html40EntitiesTable[i]); 1941 } 1942 } 1943 return(NULL); 1944 } 1945 1946 /** 1947 * htmlEntityValueLookup: 1948 * @value: the entity's unicode value 1949 * 1950 * Lookup the given entity in EntitiesTable 1951 * 1952 * TODO: the linear scan is really ugly, an hash table is really needed. 1953 * 1954 * Returns the associated htmlEntityDescPtr if found, NULL otherwise. 
1955 */ 1956 const htmlEntityDesc * 1957 htmlEntityValueLookup(unsigned int value) { 1958 unsigned int i; 1959 1960 for (i = 0;i < (sizeof(html40EntitiesTable)/ 1961 sizeof(html40EntitiesTable[0]));i++) { 1962 if (html40EntitiesTable[i].value >= value) { 1963 if (html40EntitiesTable[i].value > value) 1964 break; 1965 return((htmlEntityDescPtr) &html40EntitiesTable[i]); 1966 } 1967 } 1968 return(NULL); 1969 } 1970 1971 /** 1972 * UTF8ToHtml: 1973 * @out: a pointer to an array of bytes to store the result 1974 * @outlen: the length of @out 1975 * @in: a pointer to an array of UTF-8 chars 1976 * @inlen: the length of @in 1977 * 1978 * Take a block of UTF-8 chars in and try to convert it to an ASCII 1979 * plus HTML entities block of chars out. 1980 * 1981 * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise 1982 * The value of @inlen after return is the number of octets consumed 1983 * as the return value is positive, else unpredictable. 1984 * The value of @outlen after return is the number of octets consumed. 
1985 */ 1986 int 1987 UTF8ToHtml(unsigned char* out, int *outlen, 1988 const unsigned char* in, int *inlen) { 1989 const unsigned char* processed = in; 1990 const unsigned char* outend; 1991 const unsigned char* outstart = out; 1992 const unsigned char* instart = in; 1993 const unsigned char* inend; 1994 unsigned int c, d; 1995 int trailing; 1996 1997 if ((out == NULL) || (outlen == NULL) || (inlen == NULL)) return(-1); 1998 if (in == NULL) { 1999 /* 2000 * initialization nothing to do 2001 */ 2002 *outlen = 0; 2003 *inlen = 0; 2004 return(0); 2005 } 2006 inend = in + (*inlen); 2007 outend = out + (*outlen); 2008 while (in < inend) { 2009 d = *in++; 2010 if (d < 0x80) { c= d; trailing= 0; } 2011 else if (d < 0xC0) { 2012 /* trailing byte in leading position */ 2013 *outlen = out - outstart; 2014 *inlen = processed - instart; 2015 return(-2); 2016 } else if (d < 0xE0) { c= d & 0x1F; trailing= 1; } 2017 else if (d < 0xF0) { c= d & 0x0F; trailing= 2; } 2018 else if (d < 0xF8) { c= d & 0x07; trailing= 3; } 2019 else { 2020 /* no chance for this in Ascii */ 2021 *outlen = out - outstart; 2022 *inlen = processed - instart; 2023 return(-2); 2024 } 2025 2026 if (inend - in < trailing) { 2027 break; 2028 } 2029 2030 for ( ; trailing; trailing--) { 2031 if ((in >= inend) || (((d= *in++) & 0xC0) != 0x80)) 2032 break; 2033 c <<= 6; 2034 c |= d & 0x3F; 2035 } 2036 2037 /* assertion: c is a single UTF-4 value */ 2038 if (c < 0x80) { 2039 if (out + 1 >= outend) 2040 break; 2041 *out++ = c; 2042 } else { 2043 int len; 2044 const htmlEntityDesc * ent; 2045 const char *cp; 2046 char nbuf[16]; 2047 2048 /* 2049 * Try to lookup a predefined HTML entity for it 2050 */ 2051 2052 ent = htmlEntityValueLookup(c); 2053 if (ent == NULL) { 2054 snprintf(nbuf, sizeof(nbuf), "#%u", c); 2055 cp = nbuf; 2056 } 2057 else 2058 cp = ent->name; 2059 len = strlen(cp); 2060 if (out + 2 + len >= outend) 2061 break; 2062 *out++ = '&'; 2063 memcpy(out, cp, len); 2064 out += len; 2065 *out++ = ';'; 2066 } 
2067 processed = in; 2068 } 2069 *outlen = out - outstart; 2070 *inlen = processed - instart; 2071 return(0); 2072 } 2073 2074 /** 2075 * htmlEncodeEntities: 2076 * @out: a pointer to an array of bytes to store the result 2077 * @outlen: the length of @out 2078 * @in: a pointer to an array of UTF-8 chars 2079 * @inlen: the length of @in 2080 * @quoteChar: the quote character to escape (' or ") or zero. 2081 * 2082 * Take a block of UTF-8 chars in and try to convert it to an ASCII 2083 * plus HTML entities block of chars out. 2084 * 2085 * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise 2086 * The value of @inlen after return is the number of octets consumed 2087 * as the return value is positive, else unpredictable. 2088 * The value of @outlen after return is the number of octets consumed. 2089 */ 2090 int 2091 htmlEncodeEntities(unsigned char* out, int *outlen, 2092 const unsigned char* in, int *inlen, int quoteChar) { 2093 const unsigned char* processed = in; 2094 const unsigned char* outend; 2095 const unsigned char* outstart = out; 2096 const unsigned char* instart = in; 2097 const unsigned char* inend; 2098 unsigned int c, d; 2099 int trailing; 2100 2101 if ((out == NULL) || (outlen == NULL) || (inlen == NULL) || (in == NULL)) 2102 return(-1); 2103 outend = out + (*outlen); 2104 inend = in + (*inlen); 2105 while (in < inend) { 2106 d = *in++; 2107 if (d < 0x80) { c= d; trailing= 0; } 2108 else if (d < 0xC0) { 2109 /* trailing byte in leading position */ 2110 *outlen = out - outstart; 2111 *inlen = processed - instart; 2112 return(-2); 2113 } else if (d < 0xE0) { c= d & 0x1F; trailing= 1; } 2114 else if (d < 0xF0) { c= d & 0x0F; trailing= 2; } 2115 else if (d < 0xF8) { c= d & 0x07; trailing= 3; } 2116 else { 2117 /* no chance for this in Ascii */ 2118 *outlen = out - outstart; 2119 *inlen = processed - instart; 2120 return(-2); 2121 } 2122 2123 if (inend - in < trailing) 2124 break; 2125 2126 while (trailing--) { 2127 if (((d= *in++) & 0xC0) 
!= 0x80) { 2128 *outlen = out - outstart; 2129 *inlen = processed - instart; 2130 return(-2); 2131 } 2132 c <<= 6; 2133 c |= d & 0x3F; 2134 } 2135 2136 /* assertion: c is a single UTF-4 value */ 2137 if ((c < 0x80) && (c != (unsigned int) quoteChar) && 2138 (c != '&') && (c != '<') && (c != '>')) { 2139 if (out >= outend) 2140 break; 2141 *out++ = c; 2142 } else { 2143 const htmlEntityDesc * ent; 2144 const char *cp; 2145 char nbuf[16]; 2146 int len; 2147 2148 /* 2149 * Try to lookup a predefined HTML entity for it 2150 */ 2151 ent = htmlEntityValueLookup(c); 2152 if (ent == NULL) { 2153 snprintf(nbuf, sizeof(nbuf), "#%u", c); 2154 cp = nbuf; 2155 } 2156 else 2157 cp = ent->name; 2158 len = strlen(cp); 2159 if (out + 2 + len > outend) 2160 break; 2161 *out++ = '&'; 2162 memcpy(out, cp, len); 2163 out += len; 2164 *out++ = ';'; 2165 } 2166 processed = in; 2167 } 2168 *outlen = out - outstart; 2169 *inlen = processed - instart; 2170 return(0); 2171 } 2172 2173 /************************************************************************ 2174 * * 2175 * Commodity functions to handle streams * 2176 * * 2177 ************************************************************************/ 2178 2179 /** 2180 * htmlNewInputStream: 2181 * @ctxt: an HTML parser context 2182 * 2183 * Create a new input stream structure 2184 * Returns the new input stream or NULL 2185 */ 2186 static htmlParserInputPtr 2187 htmlNewInputStream(htmlParserCtxtPtr ctxt) { 2188 htmlParserInputPtr input; 2189 2190 input = (xmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput)); 2191 if (input == NULL) { 2192 htmlErrMemory(ctxt, "couldn't allocate a new input stream\n"); 2193 return(NULL); 2194 } 2195 memset(input, 0, sizeof(htmlParserInput)); 2196 input->filename = NULL; 2197 input->directory = NULL; 2198 input->base = NULL; 2199 input->cur = NULL; 2200 input->buf = NULL; 2201 input->line = 1; 2202 input->col = 1; 2203 input->buf = NULL; 2204 input->free = NULL; 2205 input->version = NULL; 2206 input->consumed = 
0; 2207 input->length = 0; 2208 return(input); 2209 } 2210 2211 2212 /************************************************************************ 2213 * * 2214 * Commodity functions, cleanup needed ? * 2215 * * 2216 ************************************************************************/ 2217 /* 2218 * all tags allowing pc data from the html 4.01 loose dtd 2219 * NOTE: it might be more apropriate to integrate this information 2220 * into the html40ElementTable array but I don't want to risk any 2221 * binary incomptibility 2222 */ 2223 static const char *allowPCData[] = { 2224 "a", "abbr", "acronym", "address", "applet", "b", "bdo", "big", 2225 "blockquote", "body", "button", "caption", "center", "cite", "code", 2226 "dd", "del", "dfn", "div", "dt", "em", "font", "form", "h1", "h2", 2227 "h3", "h4", "h5", "h6", "i", "iframe", "ins", "kbd", "label", "legend", 2228 "li", "noframes", "noscript", "object", "p", "pre", "q", "s", "samp", 2229 "small", "span", "strike", "strong", "td", "th", "tt", "u", "var" 2230 }; 2231 2232 /** 2233 * areBlanks: 2234 * @ctxt: an HTML parser context 2235 * @str: a xmlChar * 2236 * @len: the size of @str 2237 * 2238 * Is this a sequence of blank chars that one can ignore ? 2239 * 2240 * Returns 1 if ignorable 0 otherwise. 
2241 */ 2242 2243 static int areBlanks(htmlParserCtxtPtr ctxt, const xmlChar *str, int len) { 2244 unsigned int i; 2245 int j; 2246 xmlNodePtr lastChild; 2247 xmlDtdPtr dtd; 2248 2249 for (j = 0;j < len;j++) 2250 if (!(IS_BLANK_CH(str[j]))) return(0); 2251 2252 if (CUR == 0) return(1); 2253 if (CUR != '<') return(0); 2254 if (ctxt->name == NULL) 2255 return(1); 2256 if (xmlStrEqual(ctxt->name, BAD_CAST"html")) 2257 return(1); 2258 if (xmlStrEqual(ctxt->name, BAD_CAST"head")) 2259 return(1); 2260 2261 /* Only strip CDATA children of the body tag for strict HTML DTDs */ 2262 if (xmlStrEqual(ctxt->name, BAD_CAST "body") && ctxt->myDoc != NULL) { 2263 dtd = xmlGetIntSubset(ctxt->myDoc); 2264 if (dtd != NULL && dtd->ExternalID != NULL) { 2265 if (!xmlStrcasecmp(dtd->ExternalID, BAD_CAST "-//W3C//DTD HTML 4.01//EN") || 2266 !xmlStrcasecmp(dtd->ExternalID, BAD_CAST "-//W3C//DTD HTML 4//EN")) 2267 return(1); 2268 } 2269 } 2270 2271 if (ctxt->node == NULL) return(0); 2272 lastChild = xmlGetLastChild(ctxt->node); 2273 while ((lastChild) && (lastChild->type == XML_COMMENT_NODE)) 2274 lastChild = lastChild->prev; 2275 if (lastChild == NULL) { 2276 if ((ctxt->node->type != XML_ELEMENT_NODE) && 2277 (ctxt->node->content != NULL)) return(0); 2278 /* keep ws in constructs like ...<b> </b>... 
2279 for all tags "b" allowing PCDATA */ 2280 for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) { 2281 if ( xmlStrEqual(ctxt->name, BAD_CAST allowPCData[i]) ) { 2282 return(0); 2283 } 2284 } 2285 } else if (xmlNodeIsText(lastChild)) { 2286 return(0); 2287 } else { 2288 /* keep ws in constructs like <p><b>xy</b> <i>z</i><p> 2289 for all tags "p" allowing PCDATA */ 2290 for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) { 2291 if ( xmlStrEqual(lastChild->name, BAD_CAST allowPCData[i]) ) { 2292 return(0); 2293 } 2294 } 2295 } 2296 return(1); 2297 } 2298 2299 /** 2300 * htmlNewDocNoDtD: 2301 * @URI: URI for the dtd, or NULL 2302 * @ExternalID: the external ID of the DTD, or NULL 2303 * 2304 * Creates a new HTML document without a DTD node if @URI and @ExternalID 2305 * are NULL 2306 * 2307 * Returns a new document, do not initialize the DTD if not provided 2308 */ 2309 htmlDocPtr 2310 htmlNewDocNoDtD(const xmlChar *URI, const xmlChar *ExternalID) { 2311 xmlDocPtr cur; 2312 2313 /* 2314 * Allocate a new document and fill the fields. 
2315 */ 2316 cur = (xmlDocPtr) xmlMalloc(sizeof(xmlDoc)); 2317 if (cur == NULL) { 2318 htmlErrMemory(NULL, "HTML document creation failed\n"); 2319 return(NULL); 2320 } 2321 memset(cur, 0, sizeof(xmlDoc)); 2322 2323 cur->type = XML_HTML_DOCUMENT_NODE; 2324 cur->version = NULL; 2325 cur->intSubset = NULL; 2326 cur->doc = cur; 2327 cur->name = NULL; 2328 cur->children = NULL; 2329 cur->extSubset = NULL; 2330 cur->oldNs = NULL; 2331 cur->encoding = NULL; 2332 cur->standalone = 1; 2333 cur->compression = 0; 2334 cur->ids = NULL; 2335 cur->refs = NULL; 2336 cur->_private = NULL; 2337 cur->charset = XML_CHAR_ENCODING_UTF8; 2338 cur->properties = XML_DOC_HTML | XML_DOC_USERBUILT; 2339 if ((ExternalID != NULL) || 2340 (URI != NULL)) 2341 xmlCreateIntSubset(cur, BAD_CAST "html", ExternalID, URI); 2342 return(cur); 2343 } 2344 2345 /** 2346 * htmlNewDoc: 2347 * @URI: URI for the dtd, or NULL 2348 * @ExternalID: the external ID of the DTD, or NULL 2349 * 2350 * Creates a new HTML document 2351 * 2352 * Returns a new document 2353 */ 2354 htmlDocPtr 2355 htmlNewDoc(const xmlChar *URI, const xmlChar *ExternalID) { 2356 if ((URI == NULL) && (ExternalID == NULL)) 2357 return(htmlNewDocNoDtD( 2358 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd", 2359 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN")); 2360 2361 return(htmlNewDocNoDtD(URI, ExternalID)); 2362 } 2363 2364 2365 /************************************************************************ 2366 * * 2367 * The parser itself * 2368 * Relates to http://www.w3.org/TR/html40 * 2369 * * 2370 ************************************************************************/ 2371 2372 /************************************************************************ 2373 * * 2374 * The parser itself * 2375 * * 2376 ************************************************************************/ 2377 2378 static const xmlChar * htmlParseNameComplex(xmlParserCtxtPtr ctxt); 2379 2380 /** 2381 * htmlParseHTMLName: 2382 * @ctxt: an HTML parser context 2383 
* 2384 * parse an HTML tag or attribute name, note that we convert it to lowercase 2385 * since HTML names are not case-sensitive. 2386 * 2387 * Returns the Tag Name parsed or NULL 2388 */ 2389 2390 static const xmlChar * 2391 htmlParseHTMLName(htmlParserCtxtPtr ctxt) { 2392 int i = 0; 2393 xmlChar loc[HTML_PARSER_BUFFER_SIZE]; 2394 2395 if (!IS_ASCII_LETTER(CUR) && (CUR != '_') && 2396 (CUR != ':') && (CUR != '.')) return(NULL); 2397 2398 while ((i < HTML_PARSER_BUFFER_SIZE) && 2399 ((IS_ASCII_LETTER(CUR)) || (IS_ASCII_DIGIT(CUR)) || 2400 (CUR == ':') || (CUR == '-') || (CUR == '_') || 2401 (CUR == '.'))) { 2402 if ((CUR >= 'A') && (CUR <= 'Z')) loc[i] = CUR + 0x20; 2403 else loc[i] = CUR; 2404 i++; 2405 2406 NEXT; 2407 } 2408 2409 return(xmlDictLookup(ctxt->dict, loc, i)); 2410 } 2411 2412 2413 /** 2414 * htmlParseHTMLName_nonInvasive: 2415 * @ctxt: an HTML parser context 2416 * 2417 * parse an HTML tag or attribute name, note that we convert it to lowercase 2418 * since HTML names are not case-sensitive, this doesn't consume the data 2419 * from the stream, it's a look-ahead 2420 * 2421 * Returns the Tag Name parsed or NULL 2422 */ 2423 2424 static const xmlChar * 2425 htmlParseHTMLName_nonInvasive(htmlParserCtxtPtr ctxt) { 2426 int i = 0; 2427 xmlChar loc[HTML_PARSER_BUFFER_SIZE]; 2428 2429 if (!IS_ASCII_LETTER(NXT(1)) && (NXT(1) != '_') && 2430 (NXT(1) != ':')) return(NULL); 2431 2432 while ((i < HTML_PARSER_BUFFER_SIZE) && 2433 ((IS_ASCII_LETTER(NXT(1+i))) || (IS_ASCII_DIGIT(NXT(1+i))) || 2434 (NXT(1+i) == ':') || (NXT(1+i) == '-') || (NXT(1+i) == '_'))) { 2435 if ((NXT(1+i) >= 'A') && (NXT(1+i) <= 'Z')) loc[i] = NXT(1+i) + 0x20; 2436 else loc[i] = NXT(1+i); 2437 i++; 2438 } 2439 2440 return(xmlDictLookup(ctxt->dict, loc, i)); 2441 } 2442 2443 2444 /** 2445 * htmlParseName: 2446 * @ctxt: an HTML parser context 2447 * 2448 * parse an HTML name, this routine is case sensitive. 
2449 * 2450 * Returns the Name parsed or NULL 2451 */ 2452 2453 static const xmlChar * 2454 htmlParseName(htmlParserCtxtPtr ctxt) { 2455 const xmlChar *in; 2456 const xmlChar *ret; 2457 int count = 0; 2458 2459 GROW; 2460 2461 /* 2462 * Accelerator for simple ASCII names 2463 */ 2464 in = ctxt->input->cur; 2465 if (((*in >= 0x61) && (*in <= 0x7A)) || 2466 ((*in >= 0x41) && (*in <= 0x5A)) || 2467 (*in == '_') || (*in == ':')) { 2468 in++; 2469 while (((*in >= 0x61) && (*in <= 0x7A)) || 2470 ((*in >= 0x41) && (*in <= 0x5A)) || 2471 ((*in >= 0x30) && (*in <= 0x39)) || 2472 (*in == '_') || (*in == '-') || 2473 (*in == ':') || (*in == '.')) 2474 in++; 2475 2476 if (in == ctxt->input->end) 2477 return(NULL); 2478 2479 if ((*in > 0) && (*in < 0x80)) { 2480 count = in - ctxt->input->cur; 2481 ret = xmlDictLookup(ctxt->dict, ctxt->input->cur, count); 2482 ctxt->input->cur = in; 2483 ctxt->nbChars += count; 2484 ctxt->input->col += count; 2485 return(ret); 2486 } 2487 } 2488 return(htmlParseNameComplex(ctxt)); 2489 } 2490 2491 static const xmlChar * 2492 htmlParseNameComplex(xmlParserCtxtPtr ctxt) { 2493 int len = 0, l; 2494 int c; 2495 int count = 0; 2496 const xmlChar *base = ctxt->input->base; 2497 2498 /* 2499 * Handler for more complex cases 2500 */ 2501 GROW; 2502 c = CUR_CHAR(l); 2503 if ((c == ' ') || (c == '>') || (c == '/') || /* accelerators */ 2504 (!IS_LETTER(c) && (c != '_') && 2505 (c != ':'))) { 2506 return(NULL); 2507 } 2508 2509 while ((c != ' ') && (c != '>') && (c != '/') && /* test bigname.xml */ 2510 ((IS_LETTER(c)) || (IS_DIGIT(c)) || 2511 (c == '.') || (c == '-') || 2512 (c == '_') || (c == ':') || 2513 (IS_COMBINING(c)) || 2514 (IS_EXTENDER(c)))) { 2515 if (count++ > 100) { 2516 count = 0; 2517 GROW; 2518 } 2519 len += l; 2520 NEXTL(l); 2521 c = CUR_CHAR(l); 2522 if (ctxt->input->base != base) { 2523 /* 2524 * We changed encoding from an unknown encoding 2525 * Input buffer changed location, so we better start again 2526 */ 2527 
return(htmlParseNameComplex(ctxt)); 2528 } 2529 } 2530 2531 if (ctxt->input->cur - ctxt->input->base < len) { 2532 /* Sanity check */ 2533 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR, 2534 "unexpected change of input buffer", NULL, NULL); 2535 return (NULL); 2536 } 2537 2538 return(xmlDictLookup(ctxt->dict, ctxt->input->cur - len, len)); 2539 } 2540 2541 2542 /** 2543 * htmlParseHTMLAttribute: 2544 * @ctxt: an HTML parser context 2545 * @stop: a char stop value 2546 * 2547 * parse an HTML attribute value till the stop (quote), if 2548 * stop is 0 then it stops at the first space 2549 * 2550 * Returns the attribute parsed or NULL 2551 */ 2552 2553 static xmlChar * 2554 htmlParseHTMLAttribute(htmlParserCtxtPtr ctxt, const xmlChar stop) { 2555 xmlChar *buffer = NULL; 2556 int buffer_size = 0; 2557 xmlChar *out = NULL; 2558 const xmlChar *name = NULL; 2559 const xmlChar *cur = NULL; 2560 const htmlEntityDesc * ent; 2561 2562 /* 2563 * allocate a translation buffer. 2564 */ 2565 buffer_size = HTML_PARSER_BUFFER_SIZE; 2566 buffer = (xmlChar *) xmlMallocAtomic(buffer_size * sizeof(xmlChar)); 2567 if (buffer == NULL) { 2568 htmlErrMemory(ctxt, "buffer allocation failed\n"); 2569 return(NULL); 2570 } 2571 out = buffer; 2572 2573 /* 2574 * Ok loop until we reach one of the ending chars 2575 */ 2576 while ((CUR != 0) && (CUR != stop)) { 2577 if ((stop == 0) && (CUR == '>')) break; 2578 if ((stop == 0) && (IS_BLANK_CH(CUR))) break; 2579 if (CUR == '&') { 2580 if (NXT(1) == '#') { 2581 unsigned int c; 2582 int bits; 2583 2584 c = htmlParseCharRef(ctxt); 2585 if (c < 0x80) 2586 { *out++ = c; bits= -6; } 2587 else if (c < 0x800) 2588 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; } 2589 else if (c < 0x10000) 2590 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; } 2591 else 2592 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; } 2593 2594 for ( ; bits >= 0; bits-= 6) { 2595 *out++ = ((c >> bits) & 0x3F) | 0x80; 2596 } 2597 2598 if (out - buffer > buffer_size - 100) { 2599 int indx = out - 
buffer; 2600 2601 growBuffer(buffer); 2602 out = &buffer[indx]; 2603 } 2604 } else { 2605 ent = htmlParseEntityRef(ctxt, &name); 2606 if (name == NULL) { 2607 *out++ = '&'; 2608 if (out - buffer > buffer_size - 100) { 2609 int indx = out - buffer; 2610 2611 growBuffer(buffer); 2612 out = &buffer[indx]; 2613 } 2614 } else if (ent == NULL) { 2615 *out++ = '&'; 2616 cur = name; 2617 while (*cur != 0) { 2618 if (out - buffer > buffer_size - 100) { 2619 int indx = out - buffer; 2620 2621 growBuffer(buffer); 2622 out = &buffer[indx]; 2623 } 2624 *out++ = *cur++; 2625 } 2626 } else { 2627 unsigned int c; 2628 int bits; 2629 2630 if (out - buffer > buffer_size - 100) { 2631 int indx = out - buffer; 2632 2633 growBuffer(buffer); 2634 out = &buffer[indx]; 2635 } 2636 c = ent->value; 2637 if (c < 0x80) 2638 { *out++ = c; bits= -6; } 2639 else if (c < 0x800) 2640 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; } 2641 else if (c < 0x10000) 2642 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; } 2643 else 2644 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; } 2645 2646 for ( ; bits >= 0; bits-= 6) { 2647 *out++ = ((c >> bits) & 0x3F) | 0x80; 2648 } 2649 } 2650 } 2651 } else { 2652 unsigned int c; 2653 int bits, l; 2654 2655 if (out - buffer > buffer_size - 100) { 2656 int indx = out - buffer; 2657 2658 growBuffer(buffer); 2659 out = &buffer[indx]; 2660 } 2661 c = CUR_CHAR(l); 2662 if (c < 0x80) 2663 { *out++ = c; bits= -6; } 2664 else if (c < 0x800) 2665 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; } 2666 else if (c < 0x10000) 2667 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; } 2668 else 2669 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; } 2670 2671 for ( ; bits >= 0; bits-= 6) { 2672 *out++ = ((c >> bits) & 0x3F) | 0x80; 2673 } 2674 NEXT; 2675 } 2676 } 2677 *out = 0; 2678 return(buffer); 2679 } 2680 2681 /** 2682 * htmlParseEntityRef: 2683 * @ctxt: an HTML parser context 2684 * @str: location to store the entity name 2685 * 2686 * parse an HTML ENTITY references 2687 * 2688 * [68] 
EntityRef ::= '&' Name ';' 2689 * 2690 * Returns the associated htmlEntityDescPtr if found, or NULL otherwise, 2691 * if non-NULL *str will have to be freed by the caller. 2692 */ 2693 const htmlEntityDesc * 2694 htmlParseEntityRef(htmlParserCtxtPtr ctxt, const xmlChar **str) { 2695 const xmlChar *name; 2696 const htmlEntityDesc * ent = NULL; 2697 2698 if (str != NULL) *str = NULL; 2699 if ((ctxt == NULL) || (ctxt->input == NULL)) return(NULL); 2700 2701 if (CUR == '&') { 2702 NEXT; 2703 name = htmlParseName(ctxt); 2704 if (name == NULL) { 2705 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED, 2706 "htmlParseEntityRef: no name\n", NULL, NULL); 2707 } else { 2708 GROW; 2709 if (CUR == ';') { 2710 if (str != NULL) 2711 *str = name; 2712 2713 /* 2714 * Lookup the entity in the table. 2715 */ 2716 ent = htmlEntityLookup(name); 2717 if (ent != NULL) /* OK that's ugly !!! */ 2718 NEXT; 2719 } else { 2720 htmlParseErr(ctxt, XML_ERR_ENTITYREF_SEMICOL_MISSING, 2721 "htmlParseEntityRef: expecting ';'\n", 2722 NULL, NULL); 2723 if (str != NULL) 2724 *str = name; 2725 } 2726 } 2727 } 2728 return(ent); 2729 } 2730 2731 /** 2732 * htmlParseAttValue: 2733 * @ctxt: an HTML parser context 2734 * 2735 * parse a value for an attribute 2736 * Note: the parser won't do substitution of entities here, this 2737 * will be handled later in xmlStringGetNodeList, unless it was 2738 * asked for ctxt->replaceEntities != 0 2739 * 2740 * Returns the AttValue parsed or NULL. 
2741 */ 2742 2743 static xmlChar * 2744 htmlParseAttValue(htmlParserCtxtPtr ctxt) { 2745 xmlChar *ret = NULL; 2746 2747 if (CUR == '"') { 2748 NEXT; 2749 ret = htmlParseHTMLAttribute(ctxt, '"'); 2750 if (CUR != '"') { 2751 htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED, 2752 "AttValue: \" expected\n", NULL, NULL); 2753 } else 2754 NEXT; 2755 } else if (CUR == '\'') { 2756 NEXT; 2757 ret = htmlParseHTMLAttribute(ctxt, '\''); 2758 if (CUR != '\'') { 2759 htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED, 2760 "AttValue: ' expected\n", NULL, NULL); 2761 } else 2762 NEXT; 2763 } else { 2764 /* 2765 * That's an HTMLism, the attribute value may not be quoted 2766 */ 2767 ret = htmlParseHTMLAttribute(ctxt, 0); 2768 if (ret == NULL) { 2769 htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_WITHOUT_VALUE, 2770 "AttValue: no value found\n", NULL, NULL); 2771 } 2772 } 2773 return(ret); 2774 } 2775 2776 /** 2777 * htmlParseSystemLiteral: 2778 * @ctxt: an HTML parser context 2779 * 2780 * parse an HTML Literal 2781 * 2782 * [11] SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'") 2783 * 2784 * Returns the SystemLiteral parsed or NULL 2785 */ 2786 2787 static xmlChar * 2788 htmlParseSystemLiteral(htmlParserCtxtPtr ctxt) { 2789 size_t len = 0, startPosition = 0; 2790 xmlChar *ret = NULL; 2791 2792 if (CUR == '"') { 2793 NEXT; 2794 2795 if (CUR_PTR < BASE_PTR) 2796 return(ret); 2797 startPosition = CUR_PTR - BASE_PTR; 2798 2799 while ((IS_CHAR_CH(CUR)) && (CUR != '"')) { 2800 NEXT; 2801 len++; 2802 } 2803 if (!IS_CHAR_CH(CUR)) { 2804 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED, 2805 "Unfinished SystemLiteral\n", NULL, NULL); 2806 } else { 2807 ret = xmlStrndup((BASE_PTR+startPosition), len); 2808 NEXT; 2809 } 2810 } else if (CUR == '\'') { 2811 NEXT; 2812 2813 if (CUR_PTR < BASE_PTR) 2814 return(ret); 2815 startPosition = CUR_PTR - BASE_PTR; 2816 2817 while ((IS_CHAR_CH(CUR)) && (CUR != '\'')) { 2818 NEXT; 2819 len++; 2820 } 2821 if (!IS_CHAR_CH(CUR)) { 2822 htmlParseErr(ctxt, 
XML_ERR_LITERAL_NOT_FINISHED, 2823 "Unfinished SystemLiteral\n", NULL, NULL); 2824 } else { 2825 ret = xmlStrndup((BASE_PTR+startPosition), len); 2826 NEXT; 2827 } 2828 } else { 2829 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED, 2830 " or ' expected\n", NULL, NULL); 2831 } 2832 2833 return(ret); 2834 } 2835 2836 /** 2837 * htmlParsePubidLiteral: 2838 * @ctxt: an HTML parser context 2839 * 2840 * parse an HTML public literal 2841 * 2842 * [12] PubidLiteral ::= '"' PubidChar* '"' | "'" (PubidChar - "'")* "'" 2843 * 2844 * Returns the PubidLiteral parsed or NULL. 2845 */ 2846 2847 static xmlChar * 2848 htmlParsePubidLiteral(htmlParserCtxtPtr ctxt) { 2849 size_t len = 0, startPosition = 0; 2850 xmlChar *ret = NULL; 2851 /* 2852 * Name ::= (Letter | '_') (NameChar)* 2853 */ 2854 if (CUR == '"') { 2855 NEXT; 2856 2857 if (CUR_PTR < BASE_PTR) 2858 return(ret); 2859 startPosition = CUR_PTR - BASE_PTR; 2860 2861 while (IS_PUBIDCHAR_CH(CUR)) { 2862 len++; 2863 NEXT; 2864 } 2865 2866 if (CUR != '"') { 2867 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED, 2868 "Unfinished PubidLiteral\n", NULL, NULL); 2869 } else { 2870 ret = xmlStrndup((BASE_PTR + startPosition), len); 2871 NEXT; 2872 } 2873 } else if (CUR == '\'') { 2874 NEXT; 2875 2876 if (CUR_PTR < BASE_PTR) 2877 return(ret); 2878 startPosition = CUR_PTR - BASE_PTR; 2879 2880 while ((IS_PUBIDCHAR_CH(CUR)) && (CUR != '\'')){ 2881 len++; 2882 NEXT; 2883 } 2884 2885 if (CUR != '\'') { 2886 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED, 2887 "Unfinished PubidLiteral\n", NULL, NULL); 2888 } else { 2889 ret = xmlStrndup((BASE_PTR + startPosition), len); 2890 NEXT; 2891 } 2892 } else { 2893 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED, 2894 "PubidLiteral \" or ' expected\n", NULL, NULL); 2895 } 2896 2897 return(ret); 2898 } 2899 2900 /** 2901 * htmlParseScript: 2902 * @ctxt: an HTML parser context 2903 * 2904 * parse the content of an HTML SCRIPT or STYLE element 2905 * http://www.w3.org/TR/html4/sgml/dtd.html#Script 2906 * 
http://www.w3.org/TR/html4/sgml/dtd.html#StyleSheet 2907 * http://www.w3.org/TR/html4/types.html#type-script 2908 * http://www.w3.org/TR/html4/types.html#h-6.15 2909 * http://www.w3.org/TR/html4/appendix/notes.html#h-B.3.2.1 2910 * 2911 * Script data ( %Script; in the DTD) can be the content of the SCRIPT 2912 * element and the value of intrinsic event attributes. User agents must 2913 * not evaluate script data as HTML markup but instead must pass it on as 2914 * data to a script engine. 2915 * NOTES: 2916 * - The content is passed like CDATA 2917 * - the attributes for style and scripting "onXXX" are also described 2918 * as CDATA but SGML allows entities references in attributes so their 2919 * processing is identical as other attributes 2920 */ 2921 static void 2922 htmlParseScript(htmlParserCtxtPtr ctxt) { 2923 xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 5]; 2924 int nbchar = 0; 2925 int cur,l; 2926 2927 SHRINK; 2928 cur = CUR_CHAR(l); 2929 while (IS_CHAR_CH(cur)) { 2930 if ((cur == '<') && (NXT(1) == '/')) { 2931 /* 2932 * One should break here, the specification is clear: 2933 * Authors should therefore escape "</" within the content. 2934 * Escape mechanisms are specific to each scripting or 2935 * style sheet language. 2936 * 2937 * In recovery mode, only break if end tag match the 2938 * current tag, effectively ignoring all tags inside the 2939 * script/style block and treating the entire block as 2940 * CDATA. 
2941 */ 2942 if (ctxt->recovery) { 2943 if (xmlStrncasecmp(ctxt->name, ctxt->input->cur+2, 2944 xmlStrlen(ctxt->name)) == 0) 2945 { 2946 break; /* while */ 2947 } else { 2948 htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH, 2949 "Element %s embeds close tag\n", 2950 ctxt->name, NULL); 2951 } 2952 } else { 2953 if (((NXT(2) >= 'A') && (NXT(2) <= 'Z')) || 2954 ((NXT(2) >= 'a') && (NXT(2) <= 'z'))) 2955 { 2956 break; /* while */ 2957 } 2958 } 2959 } 2960 COPY_BUF(l,buf,nbchar,cur); 2961 if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) { 2962 if (ctxt->sax->cdataBlock!= NULL) { 2963 /* 2964 * Insert as CDATA, which is the same as HTML_PRESERVE_NODE 2965 */ 2966 ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar); 2967 } else if (ctxt->sax->characters != NULL) { 2968 ctxt->sax->characters(ctxt->userData, buf, nbchar); 2969 } 2970 nbchar = 0; 2971 } 2972 GROW; 2973 NEXTL(l); 2974 cur = CUR_CHAR(l); 2975 } 2976 2977 if ((!(IS_CHAR_CH(cur))) && (!((cur == 0) && (ctxt->progressive)))) { 2978 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR, 2979 "Invalid char in CDATA 0x%X\n", cur); 2980 if (ctxt->input->cur < ctxt->input->end) { 2981 NEXT; 2982 } 2983 } 2984 2985 if ((nbchar != 0) && (ctxt->sax != NULL) && (!ctxt->disableSAX)) { 2986 if (ctxt->sax->cdataBlock!= NULL) { 2987 /* 2988 * Insert as CDATA, which is the same as HTML_PRESERVE_NODE 2989 */ 2990 ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar); 2991 } else if (ctxt->sax->characters != NULL) { 2992 ctxt->sax->characters(ctxt->userData, buf, nbchar); 2993 } 2994 } 2995 } 2996 2997 2998 /** 2999 * htmlParseCharDataInternal: 3000 * @ctxt: an HTML parser context 3001 * @readahead: optional read ahead character in ascii range 3002 * 3003 * parse a CharData section. 3004 * if we are within a CDATA section ']]>' marks an end of section. 
3005 * 3006 * [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*) 3007 */ 3008 3009 static void 3010 htmlParseCharDataInternal(htmlParserCtxtPtr ctxt, int readahead) { 3011 xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 6]; 3012 int nbchar = 0; 3013 int cur, l; 3014 int chunk = 0; 3015 3016 if (readahead) 3017 buf[nbchar++] = readahead; 3018 3019 SHRINK; 3020 cur = CUR_CHAR(l); 3021 while (((cur != '<') || (ctxt->token == '<')) && 3022 ((cur != '&') || (ctxt->token == '&')) && 3023 (cur != 0)) { 3024 if (!(IS_CHAR(cur))) { 3025 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR, 3026 "Invalid char in CDATA 0x%X\n", cur); 3027 } else { 3028 COPY_BUF(l,buf,nbchar,cur); 3029 } 3030 if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) { 3031 /* 3032 * Ok the segment is to be consumed as chars. 3033 */ 3034 if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) { 3035 if (areBlanks(ctxt, buf, nbchar)) { 3036 if (ctxt->keepBlanks) { 3037 if (ctxt->sax->characters != NULL) 3038 ctxt->sax->characters(ctxt->userData, buf, nbchar); 3039 } else { 3040 if (ctxt->sax->ignorableWhitespace != NULL) 3041 ctxt->sax->ignorableWhitespace(ctxt->userData, 3042 buf, nbchar); 3043 } 3044 } else { 3045 htmlCheckParagraph(ctxt); 3046 if (ctxt->sax->characters != NULL) 3047 ctxt->sax->characters(ctxt->userData, buf, nbchar); 3048 } 3049 } 3050 nbchar = 0; 3051 } 3052 NEXTL(l); 3053 chunk++; 3054 if (chunk > HTML_PARSER_BUFFER_SIZE) { 3055 chunk = 0; 3056 SHRINK; 3057 GROW; 3058 } 3059 cur = CUR_CHAR(l); 3060 if (cur == 0) { 3061 SHRINK; 3062 GROW; 3063 cur = CUR_CHAR(l); 3064 } 3065 } 3066 if (nbchar != 0) { 3067 buf[nbchar] = 0; 3068 3069 /* 3070 * Ok the segment is to be consumed as chars. 
3071 */ 3072 if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) { 3073 if (areBlanks(ctxt, buf, nbchar)) { 3074 if (ctxt->keepBlanks) { 3075 if (ctxt->sax->characters != NULL) 3076 ctxt->sax->characters(ctxt->userData, buf, nbchar); 3077 } else { 3078 if (ctxt->sax->ignorableWhitespace != NULL) 3079 ctxt->sax->ignorableWhitespace(ctxt->userData, 3080 buf, nbchar); 3081 } 3082 } else { 3083 htmlCheckParagraph(ctxt); 3084 if (ctxt->sax->characters != NULL) 3085 ctxt->sax->characters(ctxt->userData, buf, nbchar); 3086 } 3087 } 3088 } else { 3089 /* 3090 * Loop detection 3091 */ 3092 if (cur == 0) 3093 ctxt->instate = XML_PARSER_EOF; 3094 } 3095 } 3096 3097 /** 3098 * htmlParseCharData: 3099 * @ctxt: an HTML parser context 3100 * 3101 * parse a CharData section. 3102 * if we are within a CDATA section ']]>' marks an end of section. 3103 * 3104 * [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*) 3105 */ 3106 3107 static void 3108 htmlParseCharData(htmlParserCtxtPtr ctxt) { 3109 htmlParseCharDataInternal(ctxt, 0); 3110 } 3111 3112 /** 3113 * htmlParseExternalID: 3114 * @ctxt: an HTML parser context 3115 * @publicID: a xmlChar** receiving PubidLiteral 3116 * 3117 * Parse an External ID or a Public ID 3118 * 3119 * [75] ExternalID ::= 'SYSTEM' S SystemLiteral 3120 * | 'PUBLIC' S PubidLiteral S SystemLiteral 3121 * 3122 * [83] PublicID ::= 'PUBLIC' S PubidLiteral 3123 * 3124 * Returns the function returns SystemLiteral and in the second 3125 * case publicID receives PubidLiteral, is strict is off 3126 * it is possible to return NULL and have publicID set. 
3127 */ 3128 3129 static xmlChar * 3130 htmlParseExternalID(htmlParserCtxtPtr ctxt, xmlChar **publicID) { 3131 xmlChar *URI = NULL; 3132 3133 if ((UPPER == 'S') && (UPP(1) == 'Y') && 3134 (UPP(2) == 'S') && (UPP(3) == 'T') && 3135 (UPP(4) == 'E') && (UPP(5) == 'M')) { 3136 SKIP(6); 3137 if (!IS_BLANK_CH(CUR)) { 3138 htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED, 3139 "Space required after 'SYSTEM'\n", NULL, NULL); 3140 } 3141 SKIP_BLANKS; 3142 URI = htmlParseSystemLiteral(ctxt); 3143 if (URI == NULL) { 3144 htmlParseErr(ctxt, XML_ERR_URI_REQUIRED, 3145 "htmlParseExternalID: SYSTEM, no URI\n", NULL, NULL); 3146 } 3147 } else if ((UPPER == 'P') && (UPP(1) == 'U') && 3148 (UPP(2) == 'B') && (UPP(3) == 'L') && 3149 (UPP(4) == 'I') && (UPP(5) == 'C')) { 3150 SKIP(6); 3151 if (!IS_BLANK_CH(CUR)) { 3152 htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED, 3153 "Space required after 'PUBLIC'\n", NULL, NULL); 3154 } 3155 SKIP_BLANKS; 3156 *publicID = htmlParsePubidLiteral(ctxt); 3157 if (*publicID == NULL) { 3158 htmlParseErr(ctxt, XML_ERR_PUBID_REQUIRED, 3159 "htmlParseExternalID: PUBLIC, no Public Identifier\n", 3160 NULL, NULL); 3161 } 3162 SKIP_BLANKS; 3163 if ((CUR == '"') || (CUR == '\'')) { 3164 URI = htmlParseSystemLiteral(ctxt); 3165 } 3166 } 3167 return(URI); 3168 } 3169 3170 /** 3171 * xmlParsePI: 3172 * @ctxt: an XML parser context 3173 * 3174 * parse an XML Processing Instruction. 3175 * 3176 * [16] PI ::= '<?' PITarget (S (Char* - (Char* '?>' Char*)))? '?>' 3177 */ 3178 static void 3179 htmlParsePI(htmlParserCtxtPtr ctxt) { 3180 xmlChar *buf = NULL; 3181 int len = 0; 3182 int size = HTML_PARSER_BUFFER_SIZE; 3183 int cur, l; 3184 const xmlChar *target; 3185 xmlParserInputState state; 3186 int count = 0; 3187 3188 if ((RAW == '<') && (NXT(1) == '?')) { 3189 state = ctxt->instate; 3190 ctxt->instate = XML_PARSER_PI; 3191 /* 3192 * this is a Processing Instruction. 
3193 */ 3194 SKIP(2); 3195 SHRINK; 3196 3197 /* 3198 * Parse the target name and check for special support like 3199 * namespace. 3200 */ 3201 target = htmlParseName(ctxt); 3202 if (target != NULL) { 3203 if (RAW == '>') { 3204 SKIP(1); 3205 3206 /* 3207 * SAX: PI detected. 3208 */ 3209 if ((ctxt->sax) && (!ctxt->disableSAX) && 3210 (ctxt->sax->processingInstruction != NULL)) 3211 ctxt->sax->processingInstruction(ctxt->userData, 3212 target, NULL); 3213 ctxt->instate = state; 3214 return; 3215 } 3216 buf = (xmlChar *) xmlMallocAtomic(size * sizeof(xmlChar)); 3217 if (buf == NULL) { 3218 htmlErrMemory(ctxt, NULL); 3219 ctxt->instate = state; 3220 return; 3221 } 3222 cur = CUR; 3223 if (!IS_BLANK(cur)) { 3224 htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED, 3225 "ParsePI: PI %s space expected\n", target, NULL); 3226 } 3227 SKIP_BLANKS; 3228 cur = CUR_CHAR(l); 3229 while (IS_CHAR(cur) && (cur != '>')) { 3230 if (len + 5 >= size) { 3231 xmlChar *tmp; 3232 3233 size *= 2; 3234 tmp = (xmlChar *) xmlRealloc(buf, size * sizeof(xmlChar)); 3235 if (tmp == NULL) { 3236 htmlErrMemory(ctxt, NULL); 3237 xmlFree(buf); 3238 ctxt->instate = state; 3239 return; 3240 } 3241 buf = tmp; 3242 } 3243 count++; 3244 if (count > 50) { 3245 GROW; 3246 count = 0; 3247 } 3248 COPY_BUF(l,buf,len,cur); 3249 NEXTL(l); 3250 cur = CUR_CHAR(l); 3251 if (cur == 0) { 3252 SHRINK; 3253 GROW; 3254 cur = CUR_CHAR(l); 3255 } 3256 } 3257 buf[len] = 0; 3258 if (cur != '>') { 3259 htmlParseErr(ctxt, XML_ERR_PI_NOT_FINISHED, 3260 "ParsePI: PI %s never end ...\n", target, NULL); 3261 } else { 3262 SKIP(1); 3263 3264 /* 3265 * SAX: PI detected. 
3266 */ 3267 if ((ctxt->sax) && (!ctxt->disableSAX) && 3268 (ctxt->sax->processingInstruction != NULL)) 3269 ctxt->sax->processingInstruction(ctxt->userData, 3270 target, buf); 3271 } 3272 xmlFree(buf); 3273 } else { 3274 htmlParseErr(ctxt, XML_ERR_PI_NOT_STARTED, 3275 "PI is not started correctly", NULL, NULL); 3276 } 3277 ctxt->instate = state; 3278 } 3279 } 3280 3281 /** 3282 * htmlParseComment: 3283 * @ctxt: an HTML parser context 3284 * 3285 * Parse an XML (SGML) comment <!-- .... --> 3286 * 3287 * [15] Comment ::= '<!--' ((Char - '-') | ('-' (Char - '-')))* '-->' 3288 */ 3289 static void 3290 htmlParseComment(htmlParserCtxtPtr ctxt) { 3291 xmlChar *buf = NULL; 3292 int len; 3293 int size = HTML_PARSER_BUFFER_SIZE; 3294 int q, ql; 3295 int r, rl; 3296 int cur, l; 3297 xmlParserInputState state; 3298 3299 /* 3300 * Check that there is a comment right here. 3301 */ 3302 if ((RAW != '<') || (NXT(1) != '!') || 3303 (NXT(2) != '-') || (NXT(3) != '-')) return; 3304 3305 state = ctxt->instate; 3306 ctxt->instate = XML_PARSER_COMMENT; 3307 SHRINK; 3308 SKIP(4); 3309 buf = (xmlChar *) xmlMallocAtomic(size * sizeof(xmlChar)); 3310 if (buf == NULL) { 3311 htmlErrMemory(ctxt, "buffer allocation failed\n"); 3312 ctxt->instate = state; 3313 return; 3314 } 3315 len = 0; 3316 buf[len] = 0; 3317 q = CUR_CHAR(ql); 3318 if (!IS_CHAR(q)) 3319 goto unfinished; 3320 NEXTL(ql); 3321 r = CUR_CHAR(rl); 3322 if (!IS_CHAR(r)) 3323 goto unfinished; 3324 NEXTL(rl); 3325 cur = CUR_CHAR(l); 3326 while (IS_CHAR(cur) && 3327 ((cur != '>') || 3328 (r != '-') || (q != '-'))) { 3329 if (len + 5 >= size) { 3330 xmlChar *tmp; 3331 3332 size *= 2; 3333 tmp = (xmlChar *) xmlRealloc(buf, size * sizeof(xmlChar)); 3334 if (tmp == NULL) { 3335 xmlFree(buf); 3336 htmlErrMemory(ctxt, "growing buffer failed\n"); 3337 ctxt->instate = state; 3338 return; 3339 } 3340 buf = tmp; 3341 } 3342 COPY_BUF(ql,buf,len,q); 3343 q = r; 3344 ql = rl; 3345 r = cur; 3346 rl = l; 3347 NEXTL(l); 3348 cur = CUR_CHAR(l); 3349 
if (cur == 0) { 3350 SHRINK; 3351 GROW; 3352 cur = CUR_CHAR(l); 3353 } 3354 } 3355 buf[len] = 0; 3356 if (IS_CHAR(cur)) { 3357 NEXT; 3358 if ((ctxt->sax != NULL) && (ctxt->sax->comment != NULL) && 3359 (!ctxt->disableSAX)) 3360 ctxt->sax->comment(ctxt->userData, buf); 3361 xmlFree(buf); 3362 ctxt->instate = state; 3363 return; 3364 } 3365 3366 unfinished: 3367 htmlParseErr(ctxt, XML_ERR_COMMENT_NOT_FINISHED, 3368 "Comment not terminated \n<!--%.50s\n", buf, NULL); 3369 xmlFree(buf); 3370 } 3371 3372 /** 3373 * htmlParseCharRef: 3374 * @ctxt: an HTML parser context 3375 * 3376 * parse Reference declarations 3377 * 3378 * [66] CharRef ::= '&#' [0-9]+ ';' | 3379 * '&#x' [0-9a-fA-F]+ ';' 3380 * 3381 * Returns the value parsed (as an int) 3382 */ 3383 int 3384 htmlParseCharRef(htmlParserCtxtPtr ctxt) { 3385 int val = 0; 3386 3387 if ((ctxt == NULL) || (ctxt->input == NULL)) { 3388 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR, 3389 "htmlParseCharRef: context error\n", 3390 NULL, NULL); 3391 return(0); 3392 } 3393 if ((CUR == '&') && (NXT(1) == '#') && 3394 ((NXT(2) == 'x') || NXT(2) == 'X')) { 3395 SKIP(3); 3396 while (CUR != ';') { 3397 if ((CUR >= '0') && (CUR <= '9')) 3398 val = val * 16 + (CUR - '0'); 3399 else if ((CUR >= 'a') && (CUR <= 'f')) 3400 val = val * 16 + (CUR - 'a') + 10; 3401 else if ((CUR >= 'A') && (CUR <= 'F')) 3402 val = val * 16 + (CUR - 'A') + 10; 3403 else { 3404 htmlParseErr(ctxt, XML_ERR_INVALID_HEX_CHARREF, 3405 "htmlParseCharRef: missing semicolon\n", 3406 NULL, NULL); 3407 break; 3408 } 3409 NEXT; 3410 } 3411 if (CUR == ';') 3412 NEXT; 3413 } else if ((CUR == '&') && (NXT(1) == '#')) { 3414 SKIP(2); 3415 while (CUR != ';') { 3416 if ((CUR >= '0') && (CUR <= '9')) 3417 val = val * 10 + (CUR - '0'); 3418 else { 3419 htmlParseErr(ctxt, XML_ERR_INVALID_DEC_CHARREF, 3420 "htmlParseCharRef: missing semicolon\n", 3421 NULL, NULL); 3422 break; 3423 } 3424 NEXT; 3425 } 3426 if (CUR == ';') 3427 NEXT; 3428 } else { 3429 htmlParseErr(ctxt, 
XML_ERR_INVALID_CHARREF, 3430 "htmlParseCharRef: invalid value\n", NULL, NULL); 3431 } 3432 /* 3433 * Check the value IS_CHAR ... 3434 */ 3435 if (IS_CHAR(val)) { 3436 return(val); 3437 } else { 3438 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR, 3439 "htmlParseCharRef: invalid xmlChar value %d\n", 3440 val); 3441 } 3442 return(0); 3443 } 3444 3445 3446 /** 3447 * htmlParseDocTypeDecl: 3448 * @ctxt: an HTML parser context 3449 * 3450 * parse a DOCTYPE declaration 3451 * 3452 * [28] doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S? 3453 * ('[' (markupdecl | PEReference | S)* ']' S?)? '>' 3454 */ 3455 3456 static void 3457 htmlParseDocTypeDecl(htmlParserCtxtPtr ctxt) { 3458 const xmlChar *name; 3459 xmlChar *ExternalID = NULL; 3460 xmlChar *URI = NULL; 3461 3462 /* 3463 * We know that '<!DOCTYPE' has been detected. 3464 */ 3465 SKIP(9); 3466 3467 SKIP_BLANKS; 3468 3469 /* 3470 * Parse the DOCTYPE name. 3471 */ 3472 name = htmlParseName(ctxt); 3473 if (name == NULL) { 3474 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED, 3475 "htmlParseDocTypeDecl : no DOCTYPE name !\n", 3476 NULL, NULL); 3477 } 3478 /* 3479 * Check that upper(name) == "HTML" !!!!!!!!!!!!! 3480 */ 3481 3482 SKIP_BLANKS; 3483 3484 /* 3485 * Check for SystemID and ExternalID 3486 */ 3487 URI = htmlParseExternalID(ctxt, &ExternalID); 3488 SKIP_BLANKS; 3489 3490 /* 3491 * We should be at the end of the DOCTYPE declaration. 3492 */ 3493 if (CUR != '>') { 3494 htmlParseErr(ctxt, XML_ERR_DOCTYPE_NOT_FINISHED, 3495 "DOCTYPE improperly terminated\n", NULL, NULL); 3496 /* We shouldn't try to resynchronize ... 
*/ 3497 } 3498 NEXT; 3499 3500 /* 3501 * Create or update the document accordingly to the DOCTYPE 3502 */ 3503 if ((ctxt->sax != NULL) && (ctxt->sax->internalSubset != NULL) && 3504 (!ctxt->disableSAX)) 3505 ctxt->sax->internalSubset(ctxt->userData, name, ExternalID, URI); 3506 3507 /* 3508 * Cleanup, since we don't use all those identifiers 3509 */ 3510 if (URI != NULL) xmlFree(URI); 3511 if (ExternalID != NULL) xmlFree(ExternalID); 3512 } 3513 3514 /** 3515 * htmlParseAttribute: 3516 * @ctxt: an HTML parser context 3517 * @value: a xmlChar ** used to store the value of the attribute 3518 * 3519 * parse an attribute 3520 * 3521 * [41] Attribute ::= Name Eq AttValue 3522 * 3523 * [25] Eq ::= S? '=' S? 3524 * 3525 * With namespace: 3526 * 3527 * [NS 11] Attribute ::= QName Eq AttValue 3528 * 3529 * Also the case QName == xmlns:??? is handled independently as a namespace 3530 * definition. 3531 * 3532 * Returns the attribute name, and the value in *value. 3533 */ 3534 3535 static const xmlChar * 3536 htmlParseAttribute(htmlParserCtxtPtr ctxt, xmlChar **value) { 3537 const xmlChar *name; 3538 xmlChar *val = NULL; 3539 3540 *value = NULL; 3541 name = htmlParseHTMLName(ctxt); 3542 if (name == NULL) { 3543 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED, 3544 "error parsing attribute name\n", NULL, NULL); 3545 return(NULL); 3546 } 3547 3548 /* 3549 * read the value 3550 */ 3551 SKIP_BLANKS; 3552 if (CUR == '=') { 3553 NEXT; 3554 SKIP_BLANKS; 3555 val = htmlParseAttValue(ctxt); 3556 } 3557 3558 *value = val; 3559 return(name); 3560 } 3561 3562 /** 3563 * htmlCheckEncodingDirect: 3564 * @ctxt: an HTML parser context 3565 * @attvalue: the attribute value 3566 * 3567 * Checks an attribute value to detect 3568 * the encoding 3569 * If a new encoding is detected the parser is switched to decode 3570 * it and pass UTF8 3571 */ 3572 static void 3573 htmlCheckEncodingDirect(htmlParserCtxtPtr ctxt, const xmlChar *encoding) { 3574 3575 if ((ctxt == NULL) || (encoding == NULL) || 3576 
(ctxt->options & HTML_PARSE_IGNORE_ENC)) 3577 return; 3578 3579 /* do not change encoding */ 3580 if (ctxt->input->encoding != NULL) 3581 return; 3582 3583 if (encoding != NULL) { 3584 xmlCharEncoding enc; 3585 xmlCharEncodingHandlerPtr handler; 3586 3587 while ((*encoding == ' ') || (*encoding == '\t')) encoding++; 3588 3589 if (ctxt->input->encoding != NULL) 3590 xmlFree((xmlChar *) ctxt->input->encoding); 3591 ctxt->input->encoding = xmlStrdup(encoding); 3592 3593 enc = xmlParseCharEncoding((const char *) encoding); 3594 /* 3595 * registered set of known encodings 3596 */ 3597 if (enc != XML_CHAR_ENCODING_ERROR) { 3598 if (((enc == XML_CHAR_ENCODING_UTF16LE) || 3599 (enc == XML_CHAR_ENCODING_UTF16BE) || 3600 (enc == XML_CHAR_ENCODING_UCS4LE) || 3601 (enc == XML_CHAR_ENCODING_UCS4BE)) && 3602 (ctxt->input->buf != NULL) && 3603 (ctxt->input->buf->encoder == NULL)) { 3604 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING, 3605 "htmlCheckEncoding: wrong encoding meta\n", 3606 NULL, NULL); 3607 } else { 3608 xmlSwitchEncoding(ctxt, enc); 3609 } 3610 ctxt->charset = XML_CHAR_ENCODING_UTF8; 3611 } else { 3612 /* 3613 * fallback for unknown encodings 3614 */ 3615 handler = xmlFindCharEncodingHandler((const char *) encoding); 3616 if (handler != NULL) { 3617 xmlSwitchToEncoding(ctxt, handler); 3618 ctxt->charset = XML_CHAR_ENCODING_UTF8; 3619 } else { 3620 htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING, 3621 "htmlCheckEncoding: unknown encoding %s\n", 3622 encoding, NULL); 3623 } 3624 } 3625 3626 if ((ctxt->input->buf != NULL) && 3627 (ctxt->input->buf->encoder != NULL) && 3628 (ctxt->input->buf->raw != NULL) && 3629 (ctxt->input->buf->buffer != NULL)) { 3630 int nbchars; 3631 int processed; 3632 3633 /* 3634 * convert as much as possible to the parser reading buffer. 
3635 */ 3636 processed = ctxt->input->cur - ctxt->input->base; 3637 xmlBufShrink(ctxt->input->buf->buffer, processed); 3638 nbchars = xmlCharEncInput(ctxt->input->buf, 1); 3639 if (nbchars < 0) { 3640 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING, 3641 "htmlCheckEncoding: encoder error\n", 3642 NULL, NULL); 3643 } 3644 xmlBufResetInput(ctxt->input->buf->buffer, ctxt->input); 3645 } 3646 } 3647 } 3648 3649 /** 3650 * htmlCheckEncoding: 3651 * @ctxt: an HTML parser context 3652 * @attvalue: the attribute value 3653 * 3654 * Checks an http-equiv attribute from a Meta tag to detect 3655 * the encoding 3656 * If a new encoding is detected the parser is switched to decode 3657 * it and pass UTF8 3658 */ 3659 static void 3660 htmlCheckEncoding(htmlParserCtxtPtr ctxt, const xmlChar *attvalue) { 3661 const xmlChar *encoding; 3662 3663 if (!attvalue) 3664 return; 3665 3666 encoding = xmlStrcasestr(attvalue, BAD_CAST"charset"); 3667 if (encoding != NULL) { 3668 encoding += 7; 3669 } 3670 /* 3671 * skip blank 3672 */ 3673 if (encoding && IS_BLANK_CH(*encoding)) 3674 encoding = xmlStrcasestr(attvalue, BAD_CAST"="); 3675 if (encoding && *encoding == '=') { 3676 encoding ++; 3677 htmlCheckEncodingDirect(ctxt, encoding); 3678 } 3679 } 3680 3681 /** 3682 * htmlCheckMeta: 3683 * @ctxt: an HTML parser context 3684 * @atts: the attributes values 3685 * 3686 * Checks an attributes from a Meta tag 3687 */ 3688 static void 3689 htmlCheckMeta(htmlParserCtxtPtr ctxt, const xmlChar **atts) { 3690 int i; 3691 const xmlChar *att, *value; 3692 int http = 0; 3693 const xmlChar *content = NULL; 3694 3695 if ((ctxt == NULL) || (atts == NULL)) 3696 return; 3697 3698 i = 0; 3699 att = atts[i++]; 3700 while (att != NULL) { 3701 value = atts[i++]; 3702 if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"http-equiv")) 3703 && (!xmlStrcasecmp(value, BAD_CAST"Content-Type"))) 3704 http = 1; 3705 else if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"charset"))) 3706 htmlCheckEncodingDirect(ctxt, 
value); 3707 else if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"content"))) 3708 content = value; 3709 att = atts[i++]; 3710 } 3711 if ((http) && (content != NULL)) 3712 htmlCheckEncoding(ctxt, content); 3713 3714 } 3715 3716 /** 3717 * htmlParseStartTag: 3718 * @ctxt: an HTML parser context 3719 * 3720 * parse a start of tag either for rule element or 3721 * EmptyElement. In both case we don't parse the tag closing chars. 3722 * 3723 * [40] STag ::= '<' Name (S Attribute)* S? '>' 3724 * 3725 * [44] EmptyElemTag ::= '<' Name (S Attribute)* S? '/>' 3726 * 3727 * With namespace: 3728 * 3729 * [NS 8] STag ::= '<' QName (S Attribute)* S? '>' 3730 * 3731 * [NS 10] EmptyElement ::= '<' QName (S Attribute)* S? '/>' 3732 * 3733 * Returns 0 in case of success, -1 in case of error and 1 if discarded 3734 */ 3735 3736 static int 3737 htmlParseStartTag(htmlParserCtxtPtr ctxt) { 3738 const xmlChar *name; 3739 const xmlChar *attname; 3740 xmlChar *attvalue; 3741 const xmlChar **atts; 3742 int nbatts = 0; 3743 int maxatts; 3744 int meta = 0; 3745 int i; 3746 int discardtag = 0; 3747 3748 if ((ctxt == NULL) || (ctxt->input == NULL)) { 3749 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR, 3750 "htmlParseStartTag: context error\n", NULL, NULL); 3751 return -1; 3752 } 3753 if (ctxt->instate == XML_PARSER_EOF) 3754 return(-1); 3755 if (CUR != '<') return -1; 3756 NEXT; 3757 3758 atts = ctxt->atts; 3759 maxatts = ctxt->maxatts; 3760 3761 GROW; 3762 name = htmlParseHTMLName(ctxt); 3763 if (name == NULL) { 3764 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED, 3765 "htmlParseStartTag: invalid element name\n", 3766 NULL, NULL); 3767 /* if recover preserve text on classic misconstructs */ 3768 if ((ctxt->recovery) && ((IS_BLANK_CH(CUR)) || (CUR == '<') || 3769 (CUR == '=') || (CUR == '>') || (((CUR >= '0') && (CUR <= '9'))))) { 3770 htmlParseCharDataInternal(ctxt, '<'); 3771 return(-1); 3772 } 3773 3774 3775 /* Dump the bogus tag like browsers do */ 3776 while ((IS_CHAR_CH(CUR)) && (CUR != '>') 
&& 3777 (ctxt->instate != XML_PARSER_EOF)) 3778 NEXT; 3779 return -1; 3780 } 3781 if (xmlStrEqual(name, BAD_CAST"meta")) 3782 meta = 1; 3783 3784 /* 3785 * Check for auto-closure of HTML elements. 3786 */ 3787 htmlAutoClose(ctxt, name); 3788 3789 /* 3790 * Check for implied HTML elements. 3791 */ 3792 htmlCheckImplied(ctxt, name); 3793 3794 /* 3795 * Avoid html at any level > 0, head at any level != 1 3796 * or any attempt to recurse body 3797 */ 3798 if ((ctxt->nameNr > 0) && (xmlStrEqual(name, BAD_CAST"html"))) { 3799 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR, 3800 "htmlParseStartTag: misplaced <html> tag\n", 3801 name, NULL); 3802 discardtag = 1; 3803 ctxt->depth++; 3804 } 3805 if ((ctxt->nameNr != 1) && 3806 (xmlStrEqual(name, BAD_CAST"head"))) { 3807 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR, 3808 "htmlParseStartTag: misplaced <head> tag\n", 3809 name, NULL); 3810 discardtag = 1; 3811 ctxt->depth++; 3812 } 3813 if (xmlStrEqual(name, BAD_CAST"body")) { 3814 int indx; 3815 for (indx = 0;indx < ctxt->nameNr;indx++) { 3816 if (xmlStrEqual(ctxt->nameTab[indx], BAD_CAST"body")) { 3817 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR, 3818 "htmlParseStartTag: misplaced <body> tag\n", 3819 name, NULL); 3820 discardtag = 1; 3821 ctxt->depth++; 3822 } 3823 } 3824 } 3825 3826 /* 3827 * Now parse the attributes, it ends up with the ending 3828 * 3829 * (S Attribute)* S? 
3830 */ 3831 SKIP_BLANKS; 3832 while ((IS_CHAR_CH(CUR)) && 3833 (CUR != '>') && 3834 ((CUR != '/') || (NXT(1) != '>'))) { 3835 long cons = ctxt->nbChars; 3836 3837 GROW; 3838 attname = htmlParseAttribute(ctxt, &attvalue); 3839 if (attname != NULL) { 3840 3841 /* 3842 * Well formedness requires at most one declaration of an attribute 3843 */ 3844 for (i = 0; i < nbatts;i += 2) { 3845 if (xmlStrEqual(atts[i], attname)) { 3846 htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_REDEFINED, 3847 "Attribute %s redefined\n", attname, NULL); 3848 if (attvalue != NULL) 3849 xmlFree(attvalue); 3850 goto failed; 3851 } 3852 } 3853 3854 /* 3855 * Add the pair to atts 3856 */ 3857 if (atts == NULL) { 3858 maxatts = 22; /* allow for 10 attrs by default */ 3859 atts = (const xmlChar **) 3860 xmlMalloc(maxatts * sizeof(xmlChar *)); 3861 if (atts == NULL) { 3862 htmlErrMemory(ctxt, NULL); 3863 if (attvalue != NULL) 3864 xmlFree(attvalue); 3865 goto failed; 3866 } 3867 ctxt->atts = atts; 3868 ctxt->maxatts = maxatts; 3869 } else if (nbatts + 4 > maxatts) { 3870 const xmlChar **n; 3871 3872 maxatts *= 2; 3873 n = (const xmlChar **) xmlRealloc((void *) atts, 3874 maxatts * sizeof(const xmlChar *)); 3875 if (n == NULL) { 3876 htmlErrMemory(ctxt, NULL); 3877 if (attvalue != NULL) 3878 xmlFree(attvalue); 3879 goto failed; 3880 } 3881 atts = n; 3882 ctxt->atts = atts; 3883 ctxt->maxatts = maxatts; 3884 } 3885 atts[nbatts++] = attname; 3886 atts[nbatts++] = attvalue; 3887 atts[nbatts] = NULL; 3888 atts[nbatts + 1] = NULL; 3889 } 3890 else { 3891 if (attvalue != NULL) 3892 xmlFree(attvalue); 3893 /* Dump the bogus attribute string up to the next blank or 3894 * the end of the tag. 
*/ 3895 while ((IS_CHAR_CH(CUR)) && 3896 !(IS_BLANK_CH(CUR)) && (CUR != '>') && 3897 ((CUR != '/') || (NXT(1) != '>'))) 3898 NEXT; 3899 } 3900 3901 failed: 3902 SKIP_BLANKS; 3903 if (cons == ctxt->nbChars) { 3904 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR, 3905 "htmlParseStartTag: problem parsing attributes\n", 3906 NULL, NULL); 3907 break; 3908 } 3909 } 3910 3911 /* 3912 * Handle specific association to the META tag 3913 */ 3914 if (meta && (nbatts != 0)) 3915 htmlCheckMeta(ctxt, atts); 3916 3917 /* 3918 * SAX: Start of Element ! 3919 */ 3920 if (!discardtag) { 3921 htmlnamePush(ctxt, name); 3922 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL)) { 3923 if (nbatts != 0) 3924 ctxt->sax->startElement(ctxt->userData, name, atts); 3925 else 3926 ctxt->sax->startElement(ctxt->userData, name, NULL); 3927 } 3928 } 3929 3930 if (atts != NULL) { 3931 for (i = 1;i < nbatts;i += 2) { 3932 if (atts[i] != NULL) 3933 xmlFree((xmlChar *) atts[i]); 3934 } 3935 } 3936 3937 return(discardtag); 3938 } 3939 3940 /** 3941 * htmlParseEndTag: 3942 * @ctxt: an HTML parser context 3943 * 3944 * parse an end of tag 3945 * 3946 * [42] ETag ::= '</' Name S? '>' 3947 * 3948 * With namespace 3949 * 3950 * [NS 9] ETag ::= '</' QName S? '>' 3951 * 3952 * Returns 1 if the current level should be closed. 3953 */ 3954 3955 static int 3956 htmlParseEndTag(htmlParserCtxtPtr ctxt) 3957 { 3958 const xmlChar *name; 3959 const xmlChar *oldname; 3960 int i, ret; 3961 3962 if ((CUR != '<') || (NXT(1) != '/')) { 3963 htmlParseErr(ctxt, XML_ERR_LTSLASH_REQUIRED, 3964 "htmlParseEndTag: '</' not found\n", NULL, NULL); 3965 return (0); 3966 } 3967 SKIP(2); 3968 3969 name = htmlParseHTMLName(ctxt); 3970 if (name == NULL) 3971 return (0); 3972 /* 3973 * We should definitely be at the ending "S? 
'>'" part 3974 */ 3975 SKIP_BLANKS; 3976 if ((!IS_CHAR_CH(CUR)) || (CUR != '>')) { 3977 htmlParseErr(ctxt, XML_ERR_GT_REQUIRED, 3978 "End tag : expected '>'\n", NULL, NULL); 3979 if (ctxt->recovery) { 3980 /* 3981 * We're not at the ending > !! 3982 * Error, unless in recover mode where we search forwards 3983 * until we find a > 3984 */ 3985 while (CUR != '\0' && CUR != '>') NEXT; 3986 NEXT; 3987 } 3988 } else 3989 NEXT; 3990 3991 /* 3992 * if we ignored misplaced tags in htmlParseStartTag don't pop them 3993 * out now. 3994 */ 3995 if ((ctxt->depth > 0) && 3996 (xmlStrEqual(name, BAD_CAST "html") || 3997 xmlStrEqual(name, BAD_CAST "body") || 3998 xmlStrEqual(name, BAD_CAST "head"))) { 3999 ctxt->depth--; 4000 return (0); 4001 } 4002 4003 /* 4004 * If the name read is not one of the element in the parsing stack 4005 * then return, it's just an error. 4006 */ 4007 for (i = (ctxt->nameNr - 1); i >= 0; i--) { 4008 if (xmlStrEqual(name, ctxt->nameTab[i])) 4009 break; 4010 } 4011 if (i < 0) { 4012 htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH, 4013 "Unexpected end tag : %s\n", name, NULL); 4014 return (0); 4015 } 4016 4017 4018 /* 4019 * Check for auto-closure of HTML elements. 4020 */ 4021 4022 htmlAutoCloseOnClose(ctxt, name); 4023 4024 /* 4025 * Well formedness constraints, opening and closing must match. 4026 * With the exception that the autoclose may have popped stuff out 4027 * of the stack. 
4028 */ 4029 if (!xmlStrEqual(name, ctxt->name)) { 4030 if ((ctxt->name != NULL) && (!xmlStrEqual(ctxt->name, name))) { 4031 htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH, 4032 "Opening and ending tag mismatch: %s and %s\n", 4033 name, ctxt->name); 4034 } 4035 } 4036 4037 /* 4038 * SAX: End of Tag 4039 */ 4040 oldname = ctxt->name; 4041 if ((oldname != NULL) && (xmlStrEqual(oldname, name))) { 4042 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL)) 4043 ctxt->sax->endElement(ctxt->userData, name); 4044 htmlNodeInfoPop(ctxt); 4045 htmlnamePop(ctxt); 4046 ret = 1; 4047 } else { 4048 ret = 0; 4049 } 4050 4051 return (ret); 4052 } 4053 4054 4055 /** 4056 * htmlParseReference: 4057 * @ctxt: an HTML parser context 4058 * 4059 * parse and handle entity references in content, 4060 * this will end-up in a call to character() since this is either a 4061 * CharRef, or a predefined entity. 4062 */ 4063 static void 4064 htmlParseReference(htmlParserCtxtPtr ctxt) { 4065 const htmlEntityDesc * ent; 4066 xmlChar out[6]; 4067 const xmlChar *name; 4068 if (CUR != '&') return; 4069 4070 if (NXT(1) == '#') { 4071 unsigned int c; 4072 int bits, i = 0; 4073 4074 c = htmlParseCharRef(ctxt); 4075 if (c == 0) 4076 return; 4077 4078 if (c < 0x80) { out[i++]= c; bits= -6; } 4079 else if (c < 0x800) { out[i++]=((c >> 6) & 0x1F) | 0xC0; bits= 0; } 4080 else if (c < 0x10000) { out[i++]=((c >> 12) & 0x0F) | 0xE0; bits= 6; } 4081 else { out[i++]=((c >> 18) & 0x07) | 0xF0; bits= 12; } 4082 4083 for ( ; bits >= 0; bits-= 6) { 4084 out[i++]= ((c >> bits) & 0x3F) | 0x80; 4085 } 4086 out[i] = 0; 4087 4088 htmlCheckParagraph(ctxt); 4089 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL)) 4090 ctxt->sax->characters(ctxt->userData, out, i); 4091 } else { 4092 ent = htmlParseEntityRef(ctxt, &name); 4093 if (name == NULL) { 4094 htmlCheckParagraph(ctxt); 4095 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL)) 4096 ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1); 4097 return; 
4098 } 4099 if ((ent == NULL) || !(ent->value > 0)) { 4100 htmlCheckParagraph(ctxt); 4101 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL)) { 4102 ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1); 4103 ctxt->sax->characters(ctxt->userData, name, xmlStrlen(name)); 4104 /* ctxt->sax->characters(ctxt->userData, BAD_CAST ";", 1); */ 4105 } 4106 } else { 4107 unsigned int c; 4108 int bits, i = 0; 4109 4110 c = ent->value; 4111 if (c < 0x80) 4112 { out[i++]= c; bits= -6; } 4113 else if (c < 0x800) 4114 { out[i++]=((c >> 6) & 0x1F) | 0xC0; bits= 0; } 4115 else if (c < 0x10000) 4116 { out[i++]=((c >> 12) & 0x0F) | 0xE0; bits= 6; } 4117 else 4118 { out[i++]=((c >> 18) & 0x07) | 0xF0; bits= 12; } 4119 4120 for ( ; bits >= 0; bits-= 6) { 4121 out[i++]= ((c >> bits) & 0x3F) | 0x80; 4122 } 4123 out[i] = 0; 4124 4125 htmlCheckParagraph(ctxt); 4126 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL)) 4127 ctxt->sax->characters(ctxt->userData, out, i); 4128 } 4129 } 4130 } 4131 4132 /** 4133 * htmlParseContent: 4134 * @ctxt: an HTML parser context 4135 * 4136 * Parse a content: comment, sub-element, reference or text. 4137 * Kept for compatibility with old code 4138 */ 4139 4140 static void 4141 htmlParseContent(htmlParserCtxtPtr ctxt) { 4142 xmlChar *currentNode; 4143 int depth; 4144 const xmlChar *name; 4145 4146 currentNode = xmlStrdup(ctxt->name); 4147 depth = ctxt->nameNr; 4148 while (1) { 4149 long cons = ctxt->nbChars; 4150 4151 GROW; 4152 4153 if (ctxt->instate == XML_PARSER_EOF) 4154 break; 4155 4156 /* 4157 * Our tag or one of it's parent or children is ending. 
4158 */ 4159 if ((CUR == '<') && (NXT(1) == '/')) { 4160 if (htmlParseEndTag(ctxt) && 4161 ((currentNode != NULL) || (ctxt->nameNr == 0))) { 4162 if (currentNode != NULL) 4163 xmlFree(currentNode); 4164 return; 4165 } 4166 continue; /* while */ 4167 } 4168 4169 else if ((CUR == '<') && 4170 ((IS_ASCII_LETTER(NXT(1))) || 4171 (NXT(1) == '_') || (NXT(1) == ':'))) { 4172 name = htmlParseHTMLName_nonInvasive(ctxt); 4173 if (name == NULL) { 4174 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED, 4175 "htmlParseStartTag: invalid element name\n", 4176 NULL, NULL); 4177 /* Dump the bogus tag like browsers do */ 4178 while ((IS_CHAR_CH(CUR)) && (CUR != '>')) 4179 NEXT; 4180 4181 if (currentNode != NULL) 4182 xmlFree(currentNode); 4183 return; 4184 } 4185 4186 if (ctxt->name != NULL) { 4187 if (htmlCheckAutoClose(name, ctxt->name) == 1) { 4188 htmlAutoClose(ctxt, name); 4189 continue; 4190 } 4191 } 4192 } 4193 4194 /* 4195 * Has this node been popped out during parsing of 4196 * the next element 4197 */ 4198 if ((ctxt->nameNr > 0) && (depth >= ctxt->nameNr) && 4199 (!xmlStrEqual(currentNode, ctxt->name))) 4200 { 4201 if (currentNode != NULL) xmlFree(currentNode); 4202 return; 4203 } 4204 4205 if ((CUR != 0) && ((xmlStrEqual(currentNode, BAD_CAST"script")) || 4206 (xmlStrEqual(currentNode, BAD_CAST"style")))) { 4207 /* 4208 * Handle SCRIPT/STYLE separately 4209 */ 4210 htmlParseScript(ctxt); 4211 } else { 4212 /* 4213 * Sometimes DOCTYPE arrives in the middle of the document 4214 */ 4215 if ((CUR == '<') && (NXT(1) == '!') && 4216 (UPP(2) == 'D') && (UPP(3) == 'O') && 4217 (UPP(4) == 'C') && (UPP(5) == 'T') && 4218 (UPP(6) == 'Y') && (UPP(7) == 'P') && 4219 (UPP(8) == 'E')) { 4220 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR, 4221 "Misplaced DOCTYPE declaration\n", 4222 BAD_CAST "DOCTYPE" , NULL); 4223 htmlParseDocTypeDecl(ctxt); 4224 } 4225 4226 /* 4227 * First case : a comment 4228 */ 4229 if ((CUR == '<') && (NXT(1) == '!') && 4230 (NXT(2) == '-') && (NXT(3) == '-')) { 4231 
htmlParseComment(ctxt); 4232 } 4233 4234 /* 4235 * Second case : a Processing Instruction. 4236 */ 4237 else if ((CUR == '<') && (NXT(1) == '?')) { 4238 htmlParsePI(ctxt); 4239 } 4240 4241 /* 4242 * Third case : a sub-element. 4243 */ 4244 else if (CUR == '<') { 4245 htmlParseElement(ctxt); 4246 } 4247 4248 /* 4249 * Fourth case : a reference. If if has not been resolved, 4250 * parsing returns it's Name, create the node 4251 */ 4252 else if (CUR == '&') { 4253 htmlParseReference(ctxt); 4254 } 4255 4256 /* 4257 * Fifth case : end of the resource 4258 */ 4259 else if (CUR == 0) { 4260 htmlAutoCloseOnEnd(ctxt); 4261 break; 4262 } 4263 4264 /* 4265 * Last case, text. Note that References are handled directly. 4266 */ 4267 else { 4268 htmlParseCharData(ctxt); 4269 } 4270 4271 if (cons == ctxt->nbChars) { 4272 if (ctxt->node != NULL) { 4273 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR, 4274 "detected an error in element content\n", 4275 NULL, NULL); 4276 } 4277 break; 4278 } 4279 } 4280 GROW; 4281 } 4282 if (currentNode != NULL) xmlFree(currentNode); 4283 } 4284 4285 /** 4286 * htmlParseElement: 4287 * @ctxt: an HTML parser context 4288 * 4289 * parse an HTML element, this is highly recursive 4290 * this is kept for compatibility with previous code versions 4291 * 4292 * [39] element ::= EmptyElemTag | STag content ETag 4293 * 4294 * [41] Attribute ::= Name Eq AttValue 4295 */ 4296 4297 void 4298 htmlParseElement(htmlParserCtxtPtr ctxt) { 4299 const xmlChar *name; 4300 xmlChar *currentNode = NULL; 4301 const htmlElemDesc * info; 4302 htmlParserNodeInfo node_info; 4303 int failed; 4304 int depth; 4305 const xmlChar *oldptr; 4306 4307 if ((ctxt == NULL) || (ctxt->input == NULL)) { 4308 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR, 4309 "htmlParseElement: context error\n", NULL, NULL); 4310 return; 4311 } 4312 4313 if (ctxt->instate == XML_PARSER_EOF) 4314 return; 4315 4316 /* Capture start position */ 4317 if (ctxt->record_info) { 4318 node_info.begin_pos = 
ctxt->input->consumed + 4319 (CUR_PTR - ctxt->input->base); 4320 node_info.begin_line = ctxt->input->line; 4321 } 4322 4323 failed = htmlParseStartTag(ctxt); 4324 name = ctxt->name; 4325 if ((failed == -1) || (name == NULL)) { 4326 if (CUR == '>') 4327 NEXT; 4328 return; 4329 } 4330 4331 /* 4332 * Lookup the info for that element. 4333 */ 4334 info = htmlTagLookup(name); 4335 if (info == NULL) { 4336 htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG, 4337 "Tag %s invalid\n", name, NULL); 4338 } 4339 4340 /* 4341 * Check for an Empty Element labeled the XML/SGML way 4342 */ 4343 if ((CUR == '/') && (NXT(1) == '>')) { 4344 SKIP(2); 4345 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL)) 4346 ctxt->sax->endElement(ctxt->userData, name); 4347 htmlnamePop(ctxt); 4348 return; 4349 } 4350 4351 if (CUR == '>') { 4352 NEXT; 4353 } else { 4354 htmlParseErr(ctxt, XML_ERR_GT_REQUIRED, 4355 "Couldn't find end of Start Tag %s\n", name, NULL); 4356 4357 /* 4358 * end of parsing of this node. 4359 */ 4360 if (xmlStrEqual(name, ctxt->name)) { 4361 nodePop(ctxt); 4362 htmlnamePop(ctxt); 4363 } 4364 4365 /* 4366 * Capture end position and add node 4367 */ 4368 if (ctxt->record_info) { 4369 node_info.end_pos = ctxt->input->consumed + 4370 (CUR_PTR - ctxt->input->base); 4371 node_info.end_line = ctxt->input->line; 4372 node_info.node = ctxt->node; 4373 xmlParserAddNodeInfo(ctxt, &node_info); 4374 } 4375 return; 4376 } 4377 4378 /* 4379 * Check for an Empty Element from DTD definition 4380 */ 4381 if ((info != NULL) && (info->empty)) { 4382 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL)) 4383 ctxt->sax->endElement(ctxt->userData, name); 4384 htmlnamePop(ctxt); 4385 return; 4386 } 4387 4388 /* 4389 * Parse the content of the element: 4390 */ 4391 currentNode = xmlStrdup(ctxt->name); 4392 depth = ctxt->nameNr; 4393 while (IS_CHAR_CH(CUR)) { 4394 oldptr = ctxt->input->cur; 4395 htmlParseContent(ctxt); 4396 if (oldptr==ctxt->input->cur) break; 4397 if (ctxt->nameNr < depth) 
break; 4398 } 4399 4400 /* 4401 * Capture end position and add node 4402 */ 4403 if ( currentNode != NULL && ctxt->record_info ) { 4404 node_info.end_pos = ctxt->input->consumed + 4405 (CUR_PTR - ctxt->input->base); 4406 node_info.end_line = ctxt->input->line; 4407 node_info.node = ctxt->node; 4408 xmlParserAddNodeInfo(ctxt, &node_info); 4409 } 4410 if (!IS_CHAR_CH(CUR)) { 4411 htmlAutoCloseOnEnd(ctxt); 4412 } 4413 4414 if (currentNode != NULL) 4415 xmlFree(currentNode); 4416 } 4417 4418 static void 4419 htmlParserFinishElementParsing(htmlParserCtxtPtr ctxt) { 4420 /* 4421 * Capture end position and add node 4422 */ 4423 if ( ctxt->node != NULL && ctxt->record_info ) { 4424 ctxt->nodeInfo->end_pos = ctxt->input->consumed + 4425 (CUR_PTR - ctxt->input->base); 4426 ctxt->nodeInfo->end_line = ctxt->input->line; 4427 ctxt->nodeInfo->node = ctxt->node; 4428 xmlParserAddNodeInfo(ctxt, ctxt->nodeInfo); 4429 htmlNodeInfoPop(ctxt); 4430 } 4431 if (!IS_CHAR_CH(CUR)) { 4432 htmlAutoCloseOnEnd(ctxt); 4433 } 4434 } 4435 4436 /** 4437 * htmlParseElementInternal: 4438 * @ctxt: an HTML parser context 4439 * 4440 * parse an HTML element, new version, non recursive 4441 * 4442 * [39] element ::= EmptyElemTag | STag content ETag 4443 * 4444 * [41] Attribute ::= Name Eq AttValue 4445 */ 4446 4447 static void 4448 htmlParseElementInternal(htmlParserCtxtPtr ctxt) { 4449 const xmlChar *name; 4450 const htmlElemDesc * info; 4451 htmlParserNodeInfo node_info = { NULL, 0, 0, 0, 0 }; 4452 int failed; 4453 4454 if ((ctxt == NULL) || (ctxt->input == NULL)) { 4455 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR, 4456 "htmlParseElementInternal: context error\n", NULL, NULL); 4457 return; 4458 } 4459 4460 if (ctxt->instate == XML_PARSER_EOF) 4461 return; 4462 4463 /* Capture start position */ 4464 if (ctxt->record_info) { 4465 node_info.begin_pos = ctxt->input->consumed + 4466 (CUR_PTR - ctxt->input->base); 4467 node_info.begin_line = ctxt->input->line; 4468 } 4469 4470 failed = 
htmlParseStartTag(ctxt); 4471 name = ctxt->name; 4472 if ((failed == -1) || (name == NULL)) { 4473 if (CUR == '>') 4474 NEXT; 4475 return; 4476 } 4477 4478 /* 4479 * Lookup the info for that element. 4480 */ 4481 info = htmlTagLookup(name); 4482 if (info == NULL) { 4483 htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG, 4484 "Tag %s invalid\n", name, NULL); 4485 } 4486 4487 /* 4488 * Check for an Empty Element labeled the XML/SGML way 4489 */ 4490 if ((CUR == '/') && (NXT(1) == '>')) { 4491 SKIP(2); 4492 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL)) 4493 ctxt->sax->endElement(ctxt->userData, name); 4494 htmlnamePop(ctxt); 4495 return; 4496 } 4497 4498 if (CUR == '>') { 4499 NEXT; 4500 } else { 4501 htmlParseErr(ctxt, XML_ERR_GT_REQUIRED, 4502 "Couldn't find end of Start Tag %s\n", name, NULL); 4503 4504 /* 4505 * end of parsing of this node. 4506 */ 4507 if (xmlStrEqual(name, ctxt->name)) { 4508 nodePop(ctxt); 4509 htmlnamePop(ctxt); 4510 } 4511 4512 if (ctxt->record_info) 4513 htmlNodeInfoPush(ctxt, &node_info); 4514 htmlParserFinishElementParsing(ctxt); 4515 return; 4516 } 4517 4518 /* 4519 * Check for an Empty Element from DTD definition 4520 */ 4521 if ((info != NULL) && (info->empty)) { 4522 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL)) 4523 ctxt->sax->endElement(ctxt->userData, name); 4524 htmlnamePop(ctxt); 4525 return; 4526 } 4527 4528 if (ctxt->record_info) 4529 htmlNodeInfoPush(ctxt, &node_info); 4530 } 4531 4532 /** 4533 * htmlParseContentInternal: 4534 * @ctxt: an HTML parser context 4535 * 4536 * Parse a content: comment, sub-element, reference or text. 
4537 * New version for non recursive htmlParseElementInternal 4538 */ 4539 4540 static void 4541 htmlParseContentInternal(htmlParserCtxtPtr ctxt) { 4542 xmlChar *currentNode; 4543 int depth; 4544 const xmlChar *name; 4545 4546 currentNode = xmlStrdup(ctxt->name); 4547 depth = ctxt->nameNr; 4548 while (1) { 4549 long cons = ctxt->nbChars; 4550 4551 GROW; 4552 4553 if (ctxt->instate == XML_PARSER_EOF) 4554 break; 4555 4556 /* 4557 * Our tag or one of it's parent or children is ending. 4558 */ 4559 if ((CUR == '<') && (NXT(1) == '/')) { 4560 if (htmlParseEndTag(ctxt) && 4561 ((currentNode != NULL) || (ctxt->nameNr == 0))) { 4562 if (currentNode != NULL) 4563 xmlFree(currentNode); 4564 4565 currentNode = xmlStrdup(ctxt->name); 4566 depth = ctxt->nameNr; 4567 } 4568 continue; /* while */ 4569 } 4570 4571 else if ((CUR == '<') && 4572 ((IS_ASCII_LETTER(NXT(1))) || 4573 (NXT(1) == '_') || (NXT(1) == ':'))) { 4574 name = htmlParseHTMLName_nonInvasive(ctxt); 4575 if (name == NULL) { 4576 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED, 4577 "htmlParseStartTag: invalid element name\n", 4578 NULL, NULL); 4579 /* Dump the bogus tag like browsers do */ 4580 while ((IS_CHAR_CH(CUR)) && (CUR != '>')) 4581 NEXT; 4582 4583 htmlParserFinishElementParsing(ctxt); 4584 if (currentNode != NULL) 4585 xmlFree(currentNode); 4586 4587 currentNode = xmlStrdup(ctxt->name); 4588 depth = ctxt->nameNr; 4589 continue; 4590 } 4591 4592 if (ctxt->name != NULL) { 4593 if (htmlCheckAutoClose(name, ctxt->name) == 1) { 4594 htmlAutoClose(ctxt, name); 4595 continue; 4596 } 4597 } 4598 } 4599 4600 /* 4601 * Has this node been popped out during parsing of 4602 * the next element 4603 */ 4604 if ((ctxt->nameNr > 0) && (depth >= ctxt->nameNr) && 4605 (!xmlStrEqual(currentNode, ctxt->name))) 4606 { 4607 htmlParserFinishElementParsing(ctxt); 4608 if (currentNode != NULL) xmlFree(currentNode); 4609 4610 currentNode = xmlStrdup(ctxt->name); 4611 depth = ctxt->nameNr; 4612 continue; 4613 } 4614 4615 if ((CUR != 0) && 
((xmlStrEqual(currentNode, BAD_CAST"script")) || 4616 (xmlStrEqual(currentNode, BAD_CAST"style")))) { 4617 /* 4618 * Handle SCRIPT/STYLE separately 4619 */ 4620 htmlParseScript(ctxt); 4621 } else { 4622 /* 4623 * Sometimes DOCTYPE arrives in the middle of the document 4624 */ 4625 if ((CUR == '<') && (NXT(1) == '!') && 4626 (UPP(2) == 'D') && (UPP(3) == 'O') && 4627 (UPP(4) == 'C') && (UPP(5) == 'T') && 4628 (UPP(6) == 'Y') && (UPP(7) == 'P') && 4629 (UPP(8) == 'E')) { 4630 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR, 4631 "Misplaced DOCTYPE declaration\n", 4632 BAD_CAST "DOCTYPE" , NULL); 4633 htmlParseDocTypeDecl(ctxt); 4634 } 4635 4636 /* 4637 * First case : a comment 4638 */ 4639 if ((CUR == '<') && (NXT(1) == '!') && 4640 (NXT(2) == '-') && (NXT(3) == '-')) { 4641 htmlParseComment(ctxt); 4642 } 4643 4644 /* 4645 * Second case : a Processing Instruction. 4646 */ 4647 else if ((CUR == '<') && (NXT(1) == '?')) { 4648 htmlParsePI(ctxt); 4649 } 4650 4651 /* 4652 * Third case : a sub-element. 4653 */ 4654 else if (CUR == '<') { 4655 htmlParseElementInternal(ctxt); 4656 if (currentNode != NULL) xmlFree(currentNode); 4657 4658 currentNode = xmlStrdup(ctxt->name); 4659 depth = ctxt->nameNr; 4660 } 4661 4662 /* 4663 * Fourth case : a reference. If if has not been resolved, 4664 * parsing returns it's Name, create the node 4665 */ 4666 else if (CUR == '&') { 4667 htmlParseReference(ctxt); 4668 } 4669 4670 /* 4671 * Fifth case : end of the resource 4672 */ 4673 else if (CUR == 0) { 4674 htmlAutoCloseOnEnd(ctxt); 4675 break; 4676 } 4677 4678 /* 4679 * Last case, text. Note that References are handled directly. 
4680 */ 4681 else { 4682 htmlParseCharData(ctxt); 4683 } 4684 4685 if (cons == ctxt->nbChars) { 4686 if (ctxt->node != NULL) { 4687 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR, 4688 "detected an error in element content\n", 4689 NULL, NULL); 4690 } 4691 break; 4692 } 4693 } 4694 GROW; 4695 } 4696 if (currentNode != NULL) xmlFree(currentNode); 4697 } 4698 4699 /** 4700 * htmlParseContent: 4701 * @ctxt: an HTML parser context 4702 * 4703 * Parse a content: comment, sub-element, reference or text. 4704 * This is the entry point when called from parser.c 4705 */ 4706 4707 void 4708 __htmlParseContent(void *ctxt) { 4709 if (ctxt != NULL) 4710 htmlParseContentInternal((htmlParserCtxtPtr) ctxt); 4711 } 4712 4713 /** 4714 * htmlParseDocument: 4715 * @ctxt: an HTML parser context 4716 * 4717 * parse an HTML document (and build a tree if using the standard SAX 4718 * interface). 4719 * 4720 * Returns 0, -1 in case of error. the parser context is augmented 4721 * as a result of the parsing. 4722 */ 4723 4724 int 4725 htmlParseDocument(htmlParserCtxtPtr ctxt) { 4726 xmlChar start[4]; 4727 xmlCharEncoding enc; 4728 xmlDtdPtr dtd; 4729 4730 xmlInitParser(); 4731 4732 htmlDefaultSAXHandlerInit(); 4733 4734 if ((ctxt == NULL) || (ctxt->input == NULL)) { 4735 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR, 4736 "htmlParseDocument: context error\n", NULL, NULL); 4737 return(XML_ERR_INTERNAL_ERROR); 4738 } 4739 ctxt->html = 1; 4740 ctxt->linenumbers = 1; 4741 GROW; 4742 /* 4743 * SAX: beginning of the document processing. 4744 */ 4745 if ((ctxt->sax) && (ctxt->sax->setDocumentLocator)) 4746 ctxt->sax->setDocumentLocator(ctxt->userData, &xmlDefaultSAXLocator); 4747 4748 if ((ctxt->encoding == (const xmlChar *)XML_CHAR_ENCODING_NONE) && 4749 ((ctxt->input->end - ctxt->input->cur) >= 4)) { 4750 /* 4751 * Get the 4 first bytes and decode the charset 4752 * if enc != XML_CHAR_ENCODING_NONE 4753 * plug some encoding conversion routines. 
4754 */ 4755 start[0] = RAW; 4756 start[1] = NXT(1); 4757 start[2] = NXT(2); 4758 start[3] = NXT(3); 4759 enc = xmlDetectCharEncoding(&start[0], 4); 4760 if (enc != XML_CHAR_ENCODING_NONE) { 4761 xmlSwitchEncoding(ctxt, enc); 4762 } 4763 } 4764 4765 /* 4766 * Wipe out everything which is before the first '<' 4767 */ 4768 SKIP_BLANKS; 4769 if (CUR == 0) { 4770 htmlParseErr(ctxt, XML_ERR_DOCUMENT_EMPTY, 4771 "Document is empty\n", NULL, NULL); 4772 } 4773 4774 if ((ctxt->sax) && (ctxt->sax->startDocument) && (!ctxt->disableSAX)) 4775 ctxt->sax->startDocument(ctxt->userData); 4776 4777 4778 /* 4779 * Parse possible comments and PIs before any content 4780 */ 4781 while (((CUR == '<') && (NXT(1) == '!') && 4782 (NXT(2) == '-') && (NXT(3) == '-')) || 4783 ((CUR == '<') && (NXT(1) == '?'))) { 4784 htmlParseComment(ctxt); 4785 htmlParsePI(ctxt); 4786 SKIP_BLANKS; 4787 } 4788 4789 4790 /* 4791 * Then possibly doc type declaration(s) and more Misc 4792 * (doctypedecl Misc*)? 4793 */ 4794 if ((CUR == '<') && (NXT(1) == '!') && 4795 (UPP(2) == 'D') && (UPP(3) == 'O') && 4796 (UPP(4) == 'C') && (UPP(5) == 'T') && 4797 (UPP(6) == 'Y') && (UPP(7) == 'P') && 4798 (UPP(8) == 'E')) { 4799 htmlParseDocTypeDecl(ctxt); 4800 } 4801 SKIP_BLANKS; 4802 4803 /* 4804 * Parse possible comments and PIs before any content 4805 */ 4806 while (((CUR == '<') && (NXT(1) == '!') && 4807 (NXT(2) == '-') && (NXT(3) == '-')) || 4808 ((CUR == '<') && (NXT(1) == '?'))) { 4809 htmlParseComment(ctxt); 4810 htmlParsePI(ctxt); 4811 SKIP_BLANKS; 4812 } 4813 4814 /* 4815 * Time to start parsing the tree itself 4816 */ 4817 htmlParseContentInternal(ctxt); 4818 4819 /* 4820 * autoclose 4821 */ 4822 if (CUR == 0) 4823 htmlAutoCloseOnEnd(ctxt); 4824 4825 4826 /* 4827 * SAX: end of the document processing. 
4828 */ 4829 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL)) 4830 ctxt->sax->endDocument(ctxt->userData); 4831 4832 if ((!(ctxt->options & HTML_PARSE_NODEFDTD)) && (ctxt->myDoc != NULL)) { 4833 dtd = xmlGetIntSubset(ctxt->myDoc); 4834 if (dtd == NULL) 4835 ctxt->myDoc->intSubset = 4836 xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "html", 4837 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN", 4838 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd"); 4839 } 4840 if (! ctxt->wellFormed) return(-1); 4841 return(0); 4842 } 4843 4844 4845 /************************************************************************ 4846 * * 4847 * Parser contexts handling * 4848 * * 4849 ************************************************************************/ 4850 4851 /** 4852 * htmlInitParserCtxt: 4853 * @ctxt: an HTML parser context 4854 * 4855 * Initialize a parser context 4856 * 4857 * Returns 0 in case of success and -1 in case of error 4858 */ 4859 4860 static int 4861 htmlInitParserCtxt(htmlParserCtxtPtr ctxt) 4862 { 4863 htmlSAXHandler *sax; 4864 4865 if (ctxt == NULL) return(-1); 4866 memset(ctxt, 0, sizeof(htmlParserCtxt)); 4867 4868 ctxt->dict = xmlDictCreate(); 4869 if (ctxt->dict == NULL) { 4870 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n"); 4871 return(-1); 4872 } 4873 sax = (htmlSAXHandler *) xmlMalloc(sizeof(htmlSAXHandler)); 4874 if (sax == NULL) { 4875 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n"); 4876 return(-1); 4877 } 4878 else 4879 memset(sax, 0, sizeof(htmlSAXHandler)); 4880 4881 /* Allocate the Input stack */ 4882 ctxt->inputTab = (htmlParserInputPtr *) 4883 xmlMalloc(5 * sizeof(htmlParserInputPtr)); 4884 if (ctxt->inputTab == NULL) { 4885 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n"); 4886 ctxt->inputNr = 0; 4887 ctxt->inputMax = 0; 4888 ctxt->input = NULL; 4889 return(-1); 4890 } 4891 ctxt->inputNr = 0; 4892 ctxt->inputMax = 5; 4893 ctxt->input = NULL; 4894 ctxt->version = NULL; 4895 ctxt->encoding = NULL; 4896 
ctxt->standalone = -1; 4897 ctxt->instate = XML_PARSER_START; 4898 4899 /* Allocate the Node stack */ 4900 ctxt->nodeTab = (htmlNodePtr *) xmlMalloc(10 * sizeof(htmlNodePtr)); 4901 if (ctxt->nodeTab == NULL) { 4902 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n"); 4903 ctxt->nodeNr = 0; 4904 ctxt->nodeMax = 0; 4905 ctxt->node = NULL; 4906 ctxt->inputNr = 0; 4907 ctxt->inputMax = 0; 4908 ctxt->input = NULL; 4909 return(-1); 4910 } 4911 ctxt->nodeNr = 0; 4912 ctxt->nodeMax = 10; 4913 ctxt->node = NULL; 4914 4915 /* Allocate the Name stack */ 4916 ctxt->nameTab = (const xmlChar **) xmlMalloc(10 * sizeof(xmlChar *)); 4917 if (ctxt->nameTab == NULL) { 4918 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n"); 4919 ctxt->nameNr = 0; 4920 ctxt->nameMax = 0; 4921 ctxt->name = NULL; 4922 ctxt->nodeNr = 0; 4923 ctxt->nodeMax = 0; 4924 ctxt->node = NULL; 4925 ctxt->inputNr = 0; 4926 ctxt->inputMax = 0; 4927 ctxt->input = NULL; 4928 return(-1); 4929 } 4930 ctxt->nameNr = 0; 4931 ctxt->nameMax = 10; 4932 ctxt->name = NULL; 4933 4934 ctxt->nodeInfoTab = NULL; 4935 ctxt->nodeInfoNr = 0; 4936 ctxt->nodeInfoMax = 0; 4937 4938 if (sax == NULL) ctxt->sax = (xmlSAXHandlerPtr) &htmlDefaultSAXHandler; 4939 else { 4940 ctxt->sax = sax; 4941 memcpy(sax, &htmlDefaultSAXHandler, sizeof(xmlSAXHandlerV1)); 4942 } 4943 ctxt->userData = ctxt; 4944 ctxt->myDoc = NULL; 4945 ctxt->wellFormed = 1; 4946 ctxt->replaceEntities = 0; 4947 ctxt->linenumbers = xmlLineNumbersDefaultValue; 4948 ctxt->keepBlanks = xmlKeepBlanksDefaultValue; 4949 ctxt->html = 1; 4950 ctxt->vctxt.finishDtd = XML_CTXT_FINISH_DTD_0; 4951 ctxt->vctxt.userData = ctxt; 4952 ctxt->vctxt.error = xmlParserValidityError; 4953 ctxt->vctxt.warning = xmlParserValidityWarning; 4954 ctxt->record_info = 0; 4955 ctxt->validate = 0; 4956 ctxt->nbChars = 0; 4957 ctxt->checkIndex = 0; 4958 ctxt->catalogs = NULL; 4959 xmlInitNodeInfoSeq(&ctxt->node_seq); 4960 return(0); 4961 } 4962 4963 /** 4964 * htmlFreeParserCtxt: 4965 * @ctxt: 
an HTML parser context 4966 * 4967 * Free all the memory used by a parser context. However the parsed 4968 * document in ctxt->myDoc is not freed. 4969 */ 4970 4971 void 4972 htmlFreeParserCtxt(htmlParserCtxtPtr ctxt) 4973 { 4974 xmlFreeParserCtxt(ctxt); 4975 } 4976 4977 /** 4978 * htmlNewParserCtxt: 4979 * 4980 * Allocate and initialize a new parser context. 4981 * 4982 * Returns the htmlParserCtxtPtr or NULL in case of allocation error 4983 */ 4984 4985 htmlParserCtxtPtr 4986 htmlNewParserCtxt(void) 4987 { 4988 xmlParserCtxtPtr ctxt; 4989 4990 ctxt = (xmlParserCtxtPtr) xmlMalloc(sizeof(xmlParserCtxt)); 4991 if (ctxt == NULL) { 4992 htmlErrMemory(NULL, "NewParserCtxt: out of memory\n"); 4993 return(NULL); 4994 } 4995 memset(ctxt, 0, sizeof(xmlParserCtxt)); 4996 if (htmlInitParserCtxt(ctxt) < 0) { 4997 htmlFreeParserCtxt(ctxt); 4998 return(NULL); 4999 } 5000 return(ctxt); 5001 } 5002 5003 /** 5004 * htmlCreateMemoryParserCtxt: 5005 * @buffer: a pointer to a char array 5006 * @size: the size of the array 5007 * 5008 * Create a parser context for an HTML in-memory document. 
5009 * 5010 * Returns the new parser context or NULL 5011 */ 5012 htmlParserCtxtPtr 5013 htmlCreateMemoryParserCtxt(const char *buffer, int size) { 5014 xmlParserCtxtPtr ctxt; 5015 xmlParserInputPtr input; 5016 xmlParserInputBufferPtr buf; 5017 5018 if (buffer == NULL) 5019 return(NULL); 5020 if (size <= 0) 5021 return(NULL); 5022 5023 ctxt = htmlNewParserCtxt(); 5024 if (ctxt == NULL) 5025 return(NULL); 5026 5027 buf = xmlParserInputBufferCreateMem(buffer, size, XML_CHAR_ENCODING_NONE); 5028 if (buf == NULL) return(NULL); 5029 5030 input = xmlNewInputStream(ctxt); 5031 if (input == NULL) { 5032 xmlFreeParserCtxt(ctxt); 5033 return(NULL); 5034 } 5035 5036 input->filename = NULL; 5037 input->buf = buf; 5038 xmlBufResetInput(buf->buffer, input); 5039 5040 inputPush(ctxt, input); 5041 return(ctxt); 5042 } 5043 5044 /** 5045 * htmlCreateDocParserCtxt: 5046 * @cur: a pointer to an array of xmlChar 5047 * @encoding: a free form C string describing the HTML document encoding, or NULL 5048 * 5049 * Create a parser context for an HTML document. 
5050 * 5051 * TODO: check the need to add encoding handling there 5052 * 5053 * Returns the new parser context or NULL 5054 */ 5055 static htmlParserCtxtPtr 5056 htmlCreateDocParserCtxt(const xmlChar *cur, const char *encoding) { 5057 int len; 5058 htmlParserCtxtPtr ctxt; 5059 5060 if (cur == NULL) 5061 return(NULL); 5062 len = xmlStrlen(cur); 5063 ctxt = htmlCreateMemoryParserCtxt((char *)cur, len); 5064 if (ctxt == NULL) 5065 return(NULL); 5066 5067 if (encoding != NULL) { 5068 xmlCharEncoding enc; 5069 xmlCharEncodingHandlerPtr handler; 5070 5071 if (ctxt->input->encoding != NULL) 5072 xmlFree((xmlChar *) ctxt->input->encoding); 5073 ctxt->input->encoding = xmlStrdup((const xmlChar *) encoding); 5074 5075 enc = xmlParseCharEncoding(encoding); 5076 /* 5077 * registered set of known encodings 5078 */ 5079 if (enc != XML_CHAR_ENCODING_ERROR) { 5080 xmlSwitchEncoding(ctxt, enc); 5081 if (ctxt->errNo == XML_ERR_UNSUPPORTED_ENCODING) { 5082 htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING, 5083 "Unsupported encoding %s\n", 5084 (const xmlChar *) encoding, NULL); 5085 } 5086 } else { 5087 /* 5088 * fallback for unknown encodings 5089 */ 5090 handler = xmlFindCharEncodingHandler((const char *) encoding); 5091 if (handler != NULL) { 5092 xmlSwitchToEncoding(ctxt, handler); 5093 } else { 5094 htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING, 5095 "Unsupported encoding %s\n", 5096 (const xmlChar *) encoding, NULL); 5097 } 5098 } 5099 } 5100 return(ctxt); 5101 } 5102 5103 #ifdef LIBXML_PUSH_ENABLED 5104 /************************************************************************ 5105 * * 5106 * Progressive parsing interfaces * 5107 * * 5108 ************************************************************************/ 5109 5110 /** 5111 * htmlParseLookupSequence: 5112 * @ctxt: an HTML parser context 5113 * @first: the first char to lookup 5114 * @next: the next char to lookup or zero 5115 * @third: the next char to lookup or zero 5116 * @comment: flag to force checking inside 
comments 5117 * 5118 * Try to find if a sequence (first, next, third) or just (first next) or 5119 * (first) is available in the input stream. 5120 * This function has a side effect of (possibly) incrementing ctxt->checkIndex 5121 * to avoid rescanning sequences of bytes, it DOES change the state of the 5122 * parser, do not use liberally. 5123 * This is basically similar to xmlParseLookupSequence() 5124 * 5125 * Returns the index to the current parsing point if the full sequence 5126 * is available, -1 otherwise. 5127 */ 5128 static int 5129 htmlParseLookupSequence(htmlParserCtxtPtr ctxt, xmlChar first, 5130 xmlChar next, xmlChar third, int iscomment, 5131 int ignoreattrval) 5132 { 5133 int base, len; 5134 htmlParserInputPtr in; 5135 const xmlChar *buf; 5136 int incomment = 0; 5137 int invalue = 0; 5138 char valdellim = 0x0; 5139 5140 in = ctxt->input; 5141 if (in == NULL) 5142 return (-1); 5143 5144 base = in->cur - in->base; 5145 if (base < 0) 5146 return (-1); 5147 5148 if (ctxt->checkIndex > base) 5149 base = ctxt->checkIndex; 5150 5151 if (in->buf == NULL) { 5152 buf = in->base; 5153 len = in->length; 5154 } else { 5155 buf = xmlBufContent(in->buf->buffer); 5156 len = xmlBufUse(in->buf->buffer); 5157 } 5158 5159 /* take into account the sequence length */ 5160 if (third) 5161 len -= 2; 5162 else if (next) 5163 len--; 5164 for (; base < len; base++) { 5165 if ((!incomment) && (base + 4 < len) && (!iscomment)) { 5166 if ((buf[base] == '<') && (buf[base + 1] == '!') && 5167 (buf[base + 2] == '-') && (buf[base + 3] == '-')) { 5168 incomment = 1; 5169 /* do not increment past <! 
- some people use <!--> */ 5170 base += 2; 5171 } 5172 } 5173 if (ignoreattrval) { 5174 if (buf[base] == '"' || buf[base] == '\'') { 5175 if (invalue) { 5176 if (buf[base] == valdellim) { 5177 invalue = 0; 5178 continue; 5179 } 5180 } else { 5181 valdellim = buf[base]; 5182 invalue = 1; 5183 continue; 5184 } 5185 } else if (invalue) { 5186 continue; 5187 } 5188 } 5189 if (incomment) { 5190 if (base + 3 > len) 5191 return (-1); 5192 if ((buf[base] == '-') && (buf[base + 1] == '-') && 5193 (buf[base + 2] == '>')) { 5194 incomment = 0; 5195 base += 2; 5196 } 5197 continue; 5198 } 5199 if (buf[base] == first) { 5200 if (third != 0) { 5201 if ((buf[base + 1] != next) || (buf[base + 2] != third)) 5202 continue; 5203 } else if (next != 0) { 5204 if (buf[base + 1] != next) 5205 continue; 5206 } 5207 ctxt->checkIndex = 0; 5208 #ifdef DEBUG_PUSH 5209 if (next == 0) 5210 xmlGenericError(xmlGenericErrorContext, 5211 "HPP: lookup '%c' found at %d\n", 5212 first, base); 5213 else if (third == 0) 5214 xmlGenericError(xmlGenericErrorContext, 5215 "HPP: lookup '%c%c' found at %d\n", 5216 first, next, base); 5217 else 5218 xmlGenericError(xmlGenericErrorContext, 5219 "HPP: lookup '%c%c%c' found at %d\n", 5220 first, next, third, base); 5221 #endif 5222 return (base - (in->cur - in->base)); 5223 } 5224 } 5225 if ((!incomment) && (!invalue)) 5226 ctxt->checkIndex = base; 5227 #ifdef DEBUG_PUSH 5228 if (next == 0) 5229 xmlGenericError(xmlGenericErrorContext, 5230 "HPP: lookup '%c' failed\n", first); 5231 else if (third == 0) 5232 xmlGenericError(xmlGenericErrorContext, 5233 "HPP: lookup '%c%c' failed\n", first, next); 5234 else 5235 xmlGenericError(xmlGenericErrorContext, 5236 "HPP: lookup '%c%c%c' failed\n", first, next, 5237 third); 5238 #endif 5239 return (-1); 5240 } 5241 5242 /** 5243 * htmlParseLookupChars: 5244 * @ctxt: an HTML parser context 5245 * @stop: Array of chars, which stop the lookup. 
5246 * @stopLen: Length of stop-Array 5247 * 5248 * Try to find if any char of the stop-Array is available in the input 5249 * stream. 5250 * This function has a side effect of (possibly) incrementing ctxt->checkIndex 5251 * to avoid rescanning sequences of bytes, it DOES change the state of the 5252 * parser, do not use liberally. 5253 * 5254 * Returns the index to the current parsing point if a stopChar 5255 * is available, -1 otherwise. 5256 */ 5257 static int 5258 htmlParseLookupChars(htmlParserCtxtPtr ctxt, const xmlChar * stop, 5259 int stopLen) 5260 { 5261 int base, len; 5262 htmlParserInputPtr in; 5263 const xmlChar *buf; 5264 int incomment = 0; 5265 int i; 5266 5267 in = ctxt->input; 5268 if (in == NULL) 5269 return (-1); 5270 5271 base = in->cur - in->base; 5272 if (base < 0) 5273 return (-1); 5274 5275 if (ctxt->checkIndex > base) 5276 base = ctxt->checkIndex; 5277 5278 if (in->buf == NULL) { 5279 buf = in->base; 5280 len = in->length; 5281 } else { 5282 buf = xmlBufContent(in->buf->buffer); 5283 len = xmlBufUse(in->buf->buffer); 5284 } 5285 5286 for (; base < len; base++) { 5287 if (!incomment && (base + 4 < len)) { 5288 if ((buf[base] == '<') && (buf[base + 1] == '!') && 5289 (buf[base + 2] == '-') && (buf[base + 3] == '-')) { 5290 incomment = 1; 5291 /* do not increment past <! 
- some people use <!--> */ 5292 base += 2; 5293 } 5294 } 5295 if (incomment) { 5296 if (base + 3 > len) 5297 return (-1); 5298 if ((buf[base] == '-') && (buf[base + 1] == '-') && 5299 (buf[base + 2] == '>')) { 5300 incomment = 0; 5301 base += 2; 5302 } 5303 continue; 5304 } 5305 for (i = 0; i < stopLen; ++i) { 5306 if (buf[base] == stop[i]) { 5307 ctxt->checkIndex = 0; 5308 return (base - (in->cur - in->base)); 5309 } 5310 } 5311 } 5312 ctxt->checkIndex = base; 5313 return (-1); 5314 } 5315 5316 /** 5317 * htmlParseTryOrFinish: 5318 * @ctxt: an HTML parser context 5319 * @terminate: last chunk indicator 5320 * 5321 * Try to progress on parsing 5322 * 5323 * Returns zero if no parsing was possible 5324 */ 5325 static int 5326 htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) { 5327 int ret = 0; 5328 htmlParserInputPtr in; 5329 int avail = 0; 5330 xmlChar cur, next; 5331 5332 htmlParserNodeInfo node_info; 5333 5334 #ifdef DEBUG_PUSH 5335 switch (ctxt->instate) { 5336 case XML_PARSER_EOF: 5337 xmlGenericError(xmlGenericErrorContext, 5338 "HPP: try EOF\n"); break; 5339 case XML_PARSER_START: 5340 xmlGenericError(xmlGenericErrorContext, 5341 "HPP: try START\n"); break; 5342 case XML_PARSER_MISC: 5343 xmlGenericError(xmlGenericErrorContext, 5344 "HPP: try MISC\n");break; 5345 case XML_PARSER_COMMENT: 5346 xmlGenericError(xmlGenericErrorContext, 5347 "HPP: try COMMENT\n");break; 5348 case XML_PARSER_PROLOG: 5349 xmlGenericError(xmlGenericErrorContext, 5350 "HPP: try PROLOG\n");break; 5351 case XML_PARSER_START_TAG: 5352 xmlGenericError(xmlGenericErrorContext, 5353 "HPP: try START_TAG\n");break; 5354 case XML_PARSER_CONTENT: 5355 xmlGenericError(xmlGenericErrorContext, 5356 "HPP: try CONTENT\n");break; 5357 case XML_PARSER_CDATA_SECTION: 5358 xmlGenericError(xmlGenericErrorContext, 5359 "HPP: try CDATA_SECTION\n");break; 5360 case XML_PARSER_END_TAG: 5361 xmlGenericError(xmlGenericErrorContext, 5362 "HPP: try END_TAG\n");break; 5363 case XML_PARSER_ENTITY_DECL: 
5364 xmlGenericError(xmlGenericErrorContext, 5365 "HPP: try ENTITY_DECL\n");break; 5366 case XML_PARSER_ENTITY_VALUE: 5367 xmlGenericError(xmlGenericErrorContext, 5368 "HPP: try ENTITY_VALUE\n");break; 5369 case XML_PARSER_ATTRIBUTE_VALUE: 5370 xmlGenericError(xmlGenericErrorContext, 5371 "HPP: try ATTRIBUTE_VALUE\n");break; 5372 case XML_PARSER_DTD: 5373 xmlGenericError(xmlGenericErrorContext, 5374 "HPP: try DTD\n");break; 5375 case XML_PARSER_EPILOG: 5376 xmlGenericError(xmlGenericErrorContext, 5377 "HPP: try EPILOG\n");break; 5378 case XML_PARSER_PI: 5379 xmlGenericError(xmlGenericErrorContext, 5380 "HPP: try PI\n");break; 5381 case XML_PARSER_SYSTEM_LITERAL: 5382 xmlGenericError(xmlGenericErrorContext, 5383 "HPP: try SYSTEM_LITERAL\n");break; 5384 } 5385 #endif 5386 5387 while (1) { 5388 5389 in = ctxt->input; 5390 if (in == NULL) break; 5391 if (in->buf == NULL) 5392 avail = in->length - (in->cur - in->base); 5393 else 5394 avail = xmlBufUse(in->buf->buffer) - (in->cur - in->base); 5395 if ((avail == 0) && (terminate)) { 5396 htmlAutoCloseOnEnd(ctxt); 5397 if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) { 5398 /* 5399 * SAX: end of the document processing. 5400 */ 5401 ctxt->instate = XML_PARSER_EOF; 5402 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL)) 5403 ctxt->sax->endDocument(ctxt->userData); 5404 } 5405 } 5406 if (avail < 1) 5407 goto done; 5408 cur = in->cur[0]; 5409 if (cur == 0) { 5410 SKIP(1); 5411 continue; 5412 } 5413 5414 switch (ctxt->instate) { 5415 case XML_PARSER_EOF: 5416 /* 5417 * Document parsing is done ! 5418 */ 5419 goto done; 5420 case XML_PARSER_START: 5421 /* 5422 * Very first chars read from the document flow. 
5423 */ 5424 cur = in->cur[0]; 5425 if (IS_BLANK_CH(cur)) { 5426 SKIP_BLANKS; 5427 if (in->buf == NULL) 5428 avail = in->length - (in->cur - in->base); 5429 else 5430 avail = xmlBufUse(in->buf->buffer) - (in->cur - in->base); 5431 } 5432 if ((ctxt->sax) && (ctxt->sax->setDocumentLocator)) 5433 ctxt->sax->setDocumentLocator(ctxt->userData, 5434 &xmlDefaultSAXLocator); 5435 if ((ctxt->sax) && (ctxt->sax->startDocument) && 5436 (!ctxt->disableSAX)) 5437 ctxt->sax->startDocument(ctxt->userData); 5438 5439 cur = in->cur[0]; 5440 next = in->cur[1]; 5441 if ((cur == '<') && (next == '!') && 5442 (UPP(2) == 'D') && (UPP(3) == 'O') && 5443 (UPP(4) == 'C') && (UPP(5) == 'T') && 5444 (UPP(6) == 'Y') && (UPP(7) == 'P') && 5445 (UPP(8) == 'E')) { 5446 if ((!terminate) && 5447 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0)) 5448 goto done; 5449 #ifdef DEBUG_PUSH 5450 xmlGenericError(xmlGenericErrorContext, 5451 "HPP: Parsing internal subset\n"); 5452 #endif 5453 htmlParseDocTypeDecl(ctxt); 5454 ctxt->instate = XML_PARSER_PROLOG; 5455 #ifdef DEBUG_PUSH 5456 xmlGenericError(xmlGenericErrorContext, 5457 "HPP: entering PROLOG\n"); 5458 #endif 5459 } else { 5460 ctxt->instate = XML_PARSER_MISC; 5461 #ifdef DEBUG_PUSH 5462 xmlGenericError(xmlGenericErrorContext, 5463 "HPP: entering MISC\n"); 5464 #endif 5465 } 5466 break; 5467 case XML_PARSER_MISC: 5468 SKIP_BLANKS; 5469 if (in->buf == NULL) 5470 avail = in->length - (in->cur - in->base); 5471 else 5472 avail = xmlBufUse(in->buf->buffer) - (in->cur - in->base); 5473 /* 5474 * no chars in buffer 5475 */ 5476 if (avail < 1) 5477 goto done; 5478 /* 5479 * not enouth chars in buffer 5480 */ 5481 if (avail < 2) { 5482 if (!terminate) 5483 goto done; 5484 else 5485 next = ' '; 5486 } else { 5487 next = in->cur[1]; 5488 } 5489 cur = in->cur[0]; 5490 if ((cur == '<') && (next == '!') && 5491 (in->cur[2] == '-') && (in->cur[3] == '-')) { 5492 if ((!terminate) && 5493 (htmlParseLookupSequence(ctxt, '-', '-', '>', 1, 1) < 0)) 5494 goto 
done; 5495 #ifdef DEBUG_PUSH 5496 xmlGenericError(xmlGenericErrorContext, 5497 "HPP: Parsing Comment\n"); 5498 #endif 5499 htmlParseComment(ctxt); 5500 ctxt->instate = XML_PARSER_MISC; 5501 } else if ((cur == '<') && (next == '?')) { 5502 if ((!terminate) && 5503 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0)) 5504 goto done; 5505 #ifdef DEBUG_PUSH 5506 xmlGenericError(xmlGenericErrorContext, 5507 "HPP: Parsing PI\n"); 5508 #endif 5509 htmlParsePI(ctxt); 5510 ctxt->instate = XML_PARSER_MISC; 5511 } else if ((cur == '<') && (next == '!') && 5512 (UPP(2) == 'D') && (UPP(3) == 'O') && 5513 (UPP(4) == 'C') && (UPP(5) == 'T') && 5514 (UPP(6) == 'Y') && (UPP(7) == 'P') && 5515 (UPP(8) == 'E')) { 5516 if ((!terminate) && 5517 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0)) 5518 goto done; 5519 #ifdef DEBUG_PUSH 5520 xmlGenericError(xmlGenericErrorContext, 5521 "HPP: Parsing internal subset\n"); 5522 #endif 5523 htmlParseDocTypeDecl(ctxt); 5524 ctxt->instate = XML_PARSER_PROLOG; 5525 #ifdef DEBUG_PUSH 5526 xmlGenericError(xmlGenericErrorContext, 5527 "HPP: entering PROLOG\n"); 5528 #endif 5529 } else if ((cur == '<') && (next == '!') && 5530 (avail < 9)) { 5531 goto done; 5532 } else { 5533 ctxt->instate = XML_PARSER_START_TAG; 5534 #ifdef DEBUG_PUSH 5535 xmlGenericError(xmlGenericErrorContext, 5536 "HPP: entering START_TAG\n"); 5537 #endif 5538 } 5539 break; 5540 case XML_PARSER_PROLOG: 5541 SKIP_BLANKS; 5542 if (in->buf == NULL) 5543 avail = in->length - (in->cur - in->base); 5544 else 5545 avail = xmlBufUse(in->buf->buffer) - (in->cur - in->base); 5546 if (avail < 2) 5547 goto done; 5548 cur = in->cur[0]; 5549 next = in->cur[1]; 5550 if ((cur == '<') && (next == '!') && 5551 (in->cur[2] == '-') && (in->cur[3] == '-')) { 5552 if ((!terminate) && 5553 (htmlParseLookupSequence(ctxt, '-', '-', '>', 1, 1) < 0)) 5554 goto done; 5555 #ifdef DEBUG_PUSH 5556 xmlGenericError(xmlGenericErrorContext, 5557 "HPP: Parsing Comment\n"); 5558 #endif 5559 
htmlParseComment(ctxt); 5560 ctxt->instate = XML_PARSER_PROLOG; 5561 } else if ((cur == '<') && (next == '?')) { 5562 if ((!terminate) && 5563 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0)) 5564 goto done; 5565 #ifdef DEBUG_PUSH 5566 xmlGenericError(xmlGenericErrorContext, 5567 "HPP: Parsing PI\n"); 5568 #endif 5569 htmlParsePI(ctxt); 5570 ctxt->instate = XML_PARSER_PROLOG; 5571 } else if ((cur == '<') && (next == '!') && 5572 (avail < 4)) { 5573 goto done; 5574 } else { 5575 ctxt->instate = XML_PARSER_START_TAG; 5576 #ifdef DEBUG_PUSH 5577 xmlGenericError(xmlGenericErrorContext, 5578 "HPP: entering START_TAG\n"); 5579 #endif 5580 } 5581 break; 5582 case XML_PARSER_EPILOG: 5583 if (in->buf == NULL) 5584 avail = in->length - (in->cur - in->base); 5585 else 5586 avail = xmlBufUse(in->buf->buffer) - (in->cur - in->base); 5587 if (avail < 1) 5588 goto done; 5589 cur = in->cur[0]; 5590 if (IS_BLANK_CH(cur)) { 5591 htmlParseCharData(ctxt); 5592 goto done; 5593 } 5594 if (avail < 2) 5595 goto done; 5596 next = in->cur[1]; 5597 if ((cur == '<') && (next == '!') && 5598 (in->cur[2] == '-') && (in->cur[3] == '-')) { 5599 if ((!terminate) && 5600 (htmlParseLookupSequence(ctxt, '-', '-', '>', 1, 1) < 0)) 5601 goto done; 5602 #ifdef DEBUG_PUSH 5603 xmlGenericError(xmlGenericErrorContext, 5604 "HPP: Parsing Comment\n"); 5605 #endif 5606 htmlParseComment(ctxt); 5607 ctxt->instate = XML_PARSER_EPILOG; 5608 } else if ((cur == '<') && (next == '?')) { 5609 if ((!terminate) && 5610 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0)) 5611 goto done; 5612 #ifdef DEBUG_PUSH 5613 xmlGenericError(xmlGenericErrorContext, 5614 "HPP: Parsing PI\n"); 5615 #endif 5616 htmlParsePI(ctxt); 5617 ctxt->instate = XML_PARSER_EPILOG; 5618 } else if ((cur == '<') && (next == '!') && 5619 (avail < 4)) { 5620 goto done; 5621 } else { 5622 ctxt->errNo = XML_ERR_DOCUMENT_END; 5623 ctxt->wellFormed = 0; 5624 ctxt->instate = XML_PARSER_EOF; 5625 #ifdef DEBUG_PUSH 5626 
xmlGenericError(xmlGenericErrorContext, 5627 "HPP: entering EOF\n"); 5628 #endif 5629 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL)) 5630 ctxt->sax->endDocument(ctxt->userData); 5631 goto done; 5632 } 5633 break; 5634 case XML_PARSER_START_TAG: { 5635 const xmlChar *name; 5636 int failed; 5637 const htmlElemDesc * info; 5638 5639 /* 5640 * no chars in buffer 5641 */ 5642 if (avail < 1) 5643 goto done; 5644 /* 5645 * not enouth chars in buffer 5646 */ 5647 if (avail < 2) { 5648 if (!terminate) 5649 goto done; 5650 else 5651 next = ' '; 5652 } else { 5653 next = in->cur[1]; 5654 } 5655 cur = in->cur[0]; 5656 if (cur != '<') { 5657 ctxt->instate = XML_PARSER_CONTENT; 5658 #ifdef DEBUG_PUSH 5659 xmlGenericError(xmlGenericErrorContext, 5660 "HPP: entering CONTENT\n"); 5661 #endif 5662 break; 5663 } 5664 if (next == '/') { 5665 ctxt->instate = XML_PARSER_END_TAG; 5666 ctxt->checkIndex = 0; 5667 #ifdef DEBUG_PUSH 5668 xmlGenericError(xmlGenericErrorContext, 5669 "HPP: entering END_TAG\n"); 5670 #endif 5671 break; 5672 } 5673 if ((!terminate) && 5674 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0)) 5675 goto done; 5676 5677 /* Capture start position */ 5678 if (ctxt->record_info) { 5679 node_info.begin_pos = ctxt->input->consumed + 5680 (CUR_PTR - ctxt->input->base); 5681 node_info.begin_line = ctxt->input->line; 5682 } 5683 5684 5685 failed = htmlParseStartTag(ctxt); 5686 name = ctxt->name; 5687 if ((failed == -1) || 5688 (name == NULL)) { 5689 if (CUR == '>') 5690 NEXT; 5691 break; 5692 } 5693 5694 /* 5695 * Lookup the info for that element. 
5696 */ 5697 info = htmlTagLookup(name); 5698 if (info == NULL) { 5699 htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG, 5700 "Tag %s invalid\n", name, NULL); 5701 } 5702 5703 /* 5704 * Check for an Empty Element labeled the XML/SGML way 5705 */ 5706 if ((CUR == '/') && (NXT(1) == '>')) { 5707 SKIP(2); 5708 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL)) 5709 ctxt->sax->endElement(ctxt->userData, name); 5710 htmlnamePop(ctxt); 5711 ctxt->instate = XML_PARSER_CONTENT; 5712 #ifdef DEBUG_PUSH 5713 xmlGenericError(xmlGenericErrorContext, 5714 "HPP: entering CONTENT\n"); 5715 #endif 5716 break; 5717 } 5718 5719 if (CUR == '>') { 5720 NEXT; 5721 } else { 5722 htmlParseErr(ctxt, XML_ERR_GT_REQUIRED, 5723 "Couldn't find end of Start Tag %s\n", 5724 name, NULL); 5725 5726 /* 5727 * end of parsing of this node. 5728 */ 5729 if (xmlStrEqual(name, ctxt->name)) { 5730 nodePop(ctxt); 5731 htmlnamePop(ctxt); 5732 } 5733 5734 if (ctxt->record_info) 5735 htmlNodeInfoPush(ctxt, &node_info); 5736 5737 ctxt->instate = XML_PARSER_CONTENT; 5738 #ifdef DEBUG_PUSH 5739 xmlGenericError(xmlGenericErrorContext, 5740 "HPP: entering CONTENT\n"); 5741 #endif 5742 break; 5743 } 5744 5745 /* 5746 * Check for an Empty Element from DTD definition 5747 */ 5748 if ((info != NULL) && (info->empty)) { 5749 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL)) 5750 ctxt->sax->endElement(ctxt->userData, name); 5751 htmlnamePop(ctxt); 5752 } 5753 5754 if (ctxt->record_info) 5755 htmlNodeInfoPush(ctxt, &node_info); 5756 5757 ctxt->instate = XML_PARSER_CONTENT; 5758 #ifdef DEBUG_PUSH 5759 xmlGenericError(xmlGenericErrorContext, 5760 "HPP: entering CONTENT\n"); 5761 #endif 5762 break; 5763 } 5764 case XML_PARSER_CONTENT: { 5765 long cons; 5766 /* 5767 * Handle preparsed entities and charRef 5768 */ 5769 if (ctxt->token != 0) { 5770 xmlChar chr[2] = { 0 , 0 } ; 5771 5772 chr[0] = (xmlChar) ctxt->token; 5773 htmlCheckParagraph(ctxt); 5774 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL)) 
5775 ctxt->sax->characters(ctxt->userData, chr, 1); 5776 ctxt->token = 0; 5777 ctxt->checkIndex = 0; 5778 } 5779 if ((avail == 1) && (terminate)) { 5780 cur = in->cur[0]; 5781 if ((cur != '<') && (cur != '&')) { 5782 if (ctxt->sax != NULL) { 5783 if (IS_BLANK_CH(cur)) { 5784 if (ctxt->keepBlanks) { 5785 if (ctxt->sax->characters != NULL) 5786 ctxt->sax->characters( 5787 ctxt->userData, &in->cur[0], 1); 5788 } else { 5789 if (ctxt->sax->ignorableWhitespace != NULL) 5790 ctxt->sax->ignorableWhitespace( 5791 ctxt->userData, &in->cur[0], 1); 5792 } 5793 } else { 5794 htmlCheckParagraph(ctxt); 5795 if (ctxt->sax->characters != NULL) 5796 ctxt->sax->characters( 5797 ctxt->userData, &in->cur[0], 1); 5798 } 5799 } 5800 ctxt->token = 0; 5801 ctxt->checkIndex = 0; 5802 in->cur++; 5803 break; 5804 } 5805 } 5806 if (avail < 2) 5807 goto done; 5808 cur = in->cur[0]; 5809 next = in->cur[1]; 5810 cons = ctxt->nbChars; 5811 if ((xmlStrEqual(ctxt->name, BAD_CAST"script")) || 5812 (xmlStrEqual(ctxt->name, BAD_CAST"style"))) { 5813 /* 5814 * Handle SCRIPT/STYLE separately 5815 */ 5816 if (!terminate) { 5817 int idx; 5818 xmlChar val; 5819 5820 idx = htmlParseLookupSequence(ctxt, '<', '/', 0, 0, 0); 5821 if (idx < 0) 5822 goto done; 5823 val = in->cur[idx + 2]; 5824 if (val == 0) /* bad cut of input */ 5825 goto done; 5826 } 5827 htmlParseScript(ctxt); 5828 if ((cur == '<') && (next == '/')) { 5829 ctxt->instate = XML_PARSER_END_TAG; 5830 ctxt->checkIndex = 0; 5831 #ifdef DEBUG_PUSH 5832 xmlGenericError(xmlGenericErrorContext, 5833 "HPP: entering END_TAG\n"); 5834 #endif 5835 break; 5836 } 5837 } else { 5838 /* 5839 * Sometimes DOCTYPE arrives in the middle of the document 5840 */ 5841 if ((cur == '<') && (next == '!') && 5842 (UPP(2) == 'D') && (UPP(3) == 'O') && 5843 (UPP(4) == 'C') && (UPP(5) == 'T') && 5844 (UPP(6) == 'Y') && (UPP(7) == 'P') && 5845 (UPP(8) == 'E')) { 5846 if ((!terminate) && 5847 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0)) 5848 goto done; 5849 
htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR, 5850 "Misplaced DOCTYPE declaration\n", 5851 BAD_CAST "DOCTYPE" , NULL); 5852 htmlParseDocTypeDecl(ctxt); 5853 } else if ((cur == '<') && (next == '!') && 5854 (in->cur[2] == '-') && (in->cur[3] == '-')) { 5855 if ((!terminate) && 5856 (htmlParseLookupSequence( 5857 ctxt, '-', '-', '>', 1, 1) < 0)) 5858 goto done; 5859 #ifdef DEBUG_PUSH 5860 xmlGenericError(xmlGenericErrorContext, 5861 "HPP: Parsing Comment\n"); 5862 #endif 5863 htmlParseComment(ctxt); 5864 ctxt->instate = XML_PARSER_CONTENT; 5865 } else if ((cur == '<') && (next == '?')) { 5866 if ((!terminate) && 5867 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0)) 5868 goto done; 5869 #ifdef DEBUG_PUSH 5870 xmlGenericError(xmlGenericErrorContext, 5871 "HPP: Parsing PI\n"); 5872 #endif 5873 htmlParsePI(ctxt); 5874 ctxt->instate = XML_PARSER_CONTENT; 5875 } else if ((cur == '<') && (next == '!') && (avail < 4)) { 5876 goto done; 5877 } else if ((cur == '<') && (next == '/')) { 5878 ctxt->instate = XML_PARSER_END_TAG; 5879 ctxt->checkIndex = 0; 5880 #ifdef DEBUG_PUSH 5881 xmlGenericError(xmlGenericErrorContext, 5882 "HPP: entering END_TAG\n"); 5883 #endif 5884 break; 5885 } else if (cur == '<') { 5886 ctxt->instate = XML_PARSER_START_TAG; 5887 ctxt->checkIndex = 0; 5888 #ifdef DEBUG_PUSH 5889 xmlGenericError(xmlGenericErrorContext, 5890 "HPP: entering START_TAG\n"); 5891 #endif 5892 break; 5893 } else if (cur == '&') { 5894 if ((!terminate) && 5895 (htmlParseLookupChars(ctxt, 5896 BAD_CAST "; >/", 4) < 0)) 5897 goto done; 5898 #ifdef DEBUG_PUSH 5899 xmlGenericError(xmlGenericErrorContext, 5900 "HPP: Parsing Reference\n"); 5901 #endif 5902 /* TODO: check generation of subtrees if noent !!! */ 5903 htmlParseReference(ctxt); 5904 } else { 5905 /* 5906 * check that the text sequence is complete 5907 * before handing out the data to the parser 5908 * to avoid problems with erroneous end of 5909 * data detection. 
5910 */ 5911 if ((!terminate) && 5912 (htmlParseLookupChars(ctxt, BAD_CAST "<&", 2) < 0)) 5913 goto done; 5914 ctxt->checkIndex = 0; 5915 #ifdef DEBUG_PUSH 5916 xmlGenericError(xmlGenericErrorContext, 5917 "HPP: Parsing char data\n"); 5918 #endif 5919 htmlParseCharData(ctxt); 5920 } 5921 } 5922 if (cons == ctxt->nbChars) { 5923 if (ctxt->node != NULL) { 5924 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR, 5925 "detected an error in element content\n", 5926 NULL, NULL); 5927 } 5928 NEXT; 5929 break; 5930 } 5931 5932 break; 5933 } 5934 case XML_PARSER_END_TAG: 5935 if (avail < 2) 5936 goto done; 5937 if ((!terminate) && 5938 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0)) 5939 goto done; 5940 htmlParseEndTag(ctxt); 5941 if (ctxt->nameNr == 0) { 5942 ctxt->instate = XML_PARSER_EPILOG; 5943 } else { 5944 ctxt->instate = XML_PARSER_CONTENT; 5945 } 5946 ctxt->checkIndex = 0; 5947 #ifdef DEBUG_PUSH 5948 xmlGenericError(xmlGenericErrorContext, 5949 "HPP: entering CONTENT\n"); 5950 #endif 5951 break; 5952 case XML_PARSER_CDATA_SECTION: 5953 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR, 5954 "HPP: internal error, state == CDATA\n", 5955 NULL, NULL); 5956 ctxt->instate = XML_PARSER_CONTENT; 5957 ctxt->checkIndex = 0; 5958 #ifdef DEBUG_PUSH 5959 xmlGenericError(xmlGenericErrorContext, 5960 "HPP: entering CONTENT\n"); 5961 #endif 5962 break; 5963 case XML_PARSER_DTD: 5964 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR, 5965 "HPP: internal error, state == DTD\n", 5966 NULL, NULL); 5967 ctxt->instate = XML_PARSER_CONTENT; 5968 ctxt->checkIndex = 0; 5969 #ifdef DEBUG_PUSH 5970 xmlGenericError(xmlGenericErrorContext, 5971 "HPP: entering CONTENT\n"); 5972 #endif 5973 break; 5974 case XML_PARSER_COMMENT: 5975 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR, 5976 "HPP: internal error, state == COMMENT\n", 5977 NULL, NULL); 5978 ctxt->instate = XML_PARSER_CONTENT; 5979 ctxt->checkIndex = 0; 5980 #ifdef DEBUG_PUSH 5981 xmlGenericError(xmlGenericErrorContext, 5982 "HPP: entering CONTENT\n"); 5983 
#endif 5984 break; 5985 case XML_PARSER_PI: 5986 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR, 5987 "HPP: internal error, state == PI\n", 5988 NULL, NULL); 5989 ctxt->instate = XML_PARSER_CONTENT; 5990 ctxt->checkIndex = 0; 5991 #ifdef DEBUG_PUSH 5992 xmlGenericError(xmlGenericErrorContext, 5993 "HPP: entering CONTENT\n"); 5994 #endif 5995 break; 5996 case XML_PARSER_ENTITY_DECL: 5997 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR, 5998 "HPP: internal error, state == ENTITY_DECL\n", 5999 NULL, NULL); 6000 ctxt->instate = XML_PARSER_CONTENT; 6001 ctxt->checkIndex = 0; 6002 #ifdef DEBUG_PUSH 6003 xmlGenericError(xmlGenericErrorContext, 6004 "HPP: entering CONTENT\n"); 6005 #endif 6006 break; 6007 case XML_PARSER_ENTITY_VALUE: 6008 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR, 6009 "HPP: internal error, state == ENTITY_VALUE\n", 6010 NULL, NULL); 6011 ctxt->instate = XML_PARSER_CONTENT; 6012 ctxt->checkIndex = 0; 6013 #ifdef DEBUG_PUSH 6014 xmlGenericError(xmlGenericErrorContext, 6015 "HPP: entering DTD\n"); 6016 #endif 6017 break; 6018 case XML_PARSER_ATTRIBUTE_VALUE: 6019 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR, 6020 "HPP: internal error, state == ATTRIBUTE_VALUE\n", 6021 NULL, NULL); 6022 ctxt->instate = XML_PARSER_START_TAG; 6023 ctxt->checkIndex = 0; 6024 #ifdef DEBUG_PUSH 6025 xmlGenericError(xmlGenericErrorContext, 6026 "HPP: entering START_TAG\n"); 6027 #endif 6028 break; 6029 case XML_PARSER_SYSTEM_LITERAL: 6030 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR, 6031 "HPP: internal error, state == XML_PARSER_SYSTEM_LITERAL\n", 6032 NULL, NULL); 6033 ctxt->instate = XML_PARSER_CONTENT; 6034 ctxt->checkIndex = 0; 6035 #ifdef DEBUG_PUSH 6036 xmlGenericError(xmlGenericErrorContext, 6037 "HPP: entering CONTENT\n"); 6038 #endif 6039 break; 6040 case XML_PARSER_IGNORE: 6041 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR, 6042 "HPP: internal error, state == XML_PARSER_IGNORE\n", 6043 NULL, NULL); 6044 ctxt->instate = XML_PARSER_CONTENT; 6045 ctxt->checkIndex = 0; 6046 #ifdef DEBUG_PUSH 
6047 xmlGenericError(xmlGenericErrorContext, 6048 "HPP: entering CONTENT\n"); 6049 #endif 6050 break; 6051 case XML_PARSER_PUBLIC_LITERAL: 6052 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR, 6053 "HPP: internal error, state == XML_PARSER_LITERAL\n", 6054 NULL, NULL); 6055 ctxt->instate = XML_PARSER_CONTENT; 6056 ctxt->checkIndex = 0; 6057 #ifdef DEBUG_PUSH 6058 xmlGenericError(xmlGenericErrorContext, 6059 "HPP: entering CONTENT\n"); 6060 #endif 6061 break; 6062 6063 } 6064 } 6065 done: 6066 if ((avail == 0) && (terminate)) { 6067 htmlAutoCloseOnEnd(ctxt); 6068 if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) { 6069 /* 6070 * SAX: end of the document processing. 6071 */ 6072 ctxt->instate = XML_PARSER_EOF; 6073 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL)) 6074 ctxt->sax->endDocument(ctxt->userData); 6075 } 6076 } 6077 if ((!(ctxt->options & HTML_PARSE_NODEFDTD)) && (ctxt->myDoc != NULL) && 6078 ((terminate) || (ctxt->instate == XML_PARSER_EOF) || 6079 (ctxt->instate == XML_PARSER_EPILOG))) { 6080 xmlDtdPtr dtd; 6081 dtd = xmlGetIntSubset(ctxt->myDoc); 6082 if (dtd == NULL) 6083 ctxt->myDoc->intSubset = 6084 xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "html", 6085 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN", 6086 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd"); 6087 } 6088 #ifdef DEBUG_PUSH 6089 xmlGenericError(xmlGenericErrorContext, "HPP: done %d\n", ret); 6090 #endif 6091 return(ret); 6092 } 6093 6094 /** 6095 * htmlParseChunk: 6096 * @ctxt: an HTML parser context 6097 * @chunk: an char array 6098 * @size: the size in byte of the chunk 6099 * @terminate: last chunk indicator 6100 * 6101 * Parse a Chunk of memory 6102 * 6103 * Returns zero if no error, the xmlParserErrors otherwise. 
6104 */ 6105 int 6106 htmlParseChunk(htmlParserCtxtPtr ctxt, const char *chunk, int size, 6107 int terminate) { 6108 if ((ctxt == NULL) || (ctxt->input == NULL)) { 6109 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR, 6110 "htmlParseChunk: context error\n", NULL, NULL); 6111 return(XML_ERR_INTERNAL_ERROR); 6112 } 6113 if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) && 6114 (ctxt->input->buf != NULL) && (ctxt->instate != XML_PARSER_EOF)) { 6115 size_t base = xmlBufGetInputBase(ctxt->input->buf->buffer, ctxt->input); 6116 size_t cur = ctxt->input->cur - ctxt->input->base; 6117 int res; 6118 6119 res = xmlParserInputBufferPush(ctxt->input->buf, size, chunk); 6120 if (res < 0) { 6121 ctxt->errNo = XML_PARSER_EOF; 6122 ctxt->disableSAX = 1; 6123 return (XML_PARSER_EOF); 6124 } 6125 xmlBufSetInputBaseCur(ctxt->input->buf->buffer, ctxt->input, base, cur); 6126 #ifdef DEBUG_PUSH 6127 xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size); 6128 #endif 6129 6130 #if 0 6131 if ((terminate) || (ctxt->input->buf->buffer->use > 80)) 6132 htmlParseTryOrFinish(ctxt, terminate); 6133 #endif 6134 } else if (ctxt->instate != XML_PARSER_EOF) { 6135 if ((ctxt->input != NULL) && ctxt->input->buf != NULL) { 6136 xmlParserInputBufferPtr in = ctxt->input->buf; 6137 if ((in->encoder != NULL) && (in->buffer != NULL) && 6138 (in->raw != NULL)) { 6139 int nbchars; 6140 size_t base = xmlBufGetInputBase(in->buffer, ctxt->input); 6141 size_t current = ctxt->input->cur - ctxt->input->base; 6142 6143 nbchars = xmlCharEncInput(in, terminate); 6144 if (nbchars < 0) { 6145 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING, 6146 "encoder error\n", NULL, NULL); 6147 return(XML_ERR_INVALID_ENCODING); 6148 } 6149 xmlBufSetInputBaseCur(in->buffer, ctxt->input, base, current); 6150 } 6151 } 6152 } 6153 htmlParseTryOrFinish(ctxt, terminate); 6154 if (terminate) { 6155 if ((ctxt->instate != XML_PARSER_EOF) && 6156 (ctxt->instate != XML_PARSER_EPILOG) && 6157 (ctxt->instate != XML_PARSER_MISC)) { 
6158 ctxt->errNo = XML_ERR_DOCUMENT_END; 6159 ctxt->wellFormed = 0; 6160 } 6161 if (ctxt->instate != XML_PARSER_EOF) { 6162 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL)) 6163 ctxt->sax->endDocument(ctxt->userData); 6164 } 6165 ctxt->instate = XML_PARSER_EOF; 6166 } 6167 return((xmlParserErrors) ctxt->errNo); 6168 } 6169 6170 /************************************************************************ 6171 * * 6172 * User entry points * 6173 * * 6174 ************************************************************************/ 6175 6176 /** 6177 * htmlCreatePushParserCtxt: 6178 * @sax: a SAX handler 6179 * @user_data: The user data returned on SAX callbacks 6180 * @chunk: a pointer to an array of chars 6181 * @size: number of chars in the array 6182 * @filename: an optional file name or URI 6183 * @enc: an optional encoding 6184 * 6185 * Create a parser context for using the HTML parser in push mode 6186 * The value of @filename is used for fetching external entities 6187 * and error/warning reports. 
6188 * 6189 * Returns the new parser context or NULL 6190 */ 6191 htmlParserCtxtPtr 6192 htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax, void *user_data, 6193 const char *chunk, int size, const char *filename, 6194 xmlCharEncoding enc) { 6195 htmlParserCtxtPtr ctxt; 6196 htmlParserInputPtr inputStream; 6197 xmlParserInputBufferPtr buf; 6198 6199 xmlInitParser(); 6200 6201 buf = xmlAllocParserInputBuffer(enc); 6202 if (buf == NULL) return(NULL); 6203 6204 ctxt = htmlNewParserCtxt(); 6205 if (ctxt == NULL) { 6206 xmlFreeParserInputBuffer(buf); 6207 return(NULL); 6208 } 6209 if(enc==XML_CHAR_ENCODING_UTF8 || buf->encoder) 6210 ctxt->charset=XML_CHAR_ENCODING_UTF8; 6211 if (sax != NULL) { 6212 if (ctxt->sax != (xmlSAXHandlerPtr) &htmlDefaultSAXHandler) 6213 xmlFree(ctxt->sax); 6214 ctxt->sax = (htmlSAXHandlerPtr) xmlMalloc(sizeof(htmlSAXHandler)); 6215 if (ctxt->sax == NULL) { 6216 xmlFree(buf); 6217 xmlFree(ctxt); 6218 return(NULL); 6219 } 6220 memcpy(ctxt->sax, sax, sizeof(htmlSAXHandler)); 6221 if (user_data != NULL) 6222 ctxt->userData = user_data; 6223 } 6224 if (filename == NULL) { 6225 ctxt->directory = NULL; 6226 } else { 6227 ctxt->directory = xmlParserGetDirectory(filename); 6228 } 6229 6230 inputStream = htmlNewInputStream(ctxt); 6231 if (inputStream == NULL) { 6232 xmlFreeParserCtxt(ctxt); 6233 xmlFree(buf); 6234 return(NULL); 6235 } 6236 6237 if (filename == NULL) 6238 inputStream->filename = NULL; 6239 else 6240 inputStream->filename = (char *) 6241 xmlCanonicPath((const xmlChar *) filename); 6242 inputStream->buf = buf; 6243 xmlBufResetInput(buf->buffer, inputStream); 6244 6245 inputPush(ctxt, inputStream); 6246 6247 if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) && 6248 (ctxt->input->buf != NULL)) { 6249 size_t base = xmlBufGetInputBase(ctxt->input->buf->buffer, ctxt->input); 6250 size_t cur = ctxt->input->cur - ctxt->input->base; 6251 6252 xmlParserInputBufferPush(ctxt->input->buf, size, chunk); 6253 6254 
xmlBufSetInputBaseCur(ctxt->input->buf->buffer, ctxt->input, base, cur); 6255 #ifdef DEBUG_PUSH 6256 xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size); 6257 #endif 6258 } 6259 ctxt->progressive = 1; 6260 6261 return(ctxt); 6262 } 6263 #endif /* LIBXML_PUSH_ENABLED */ 6264 6265 /** 6266 * htmlSAXParseDoc: 6267 * @cur: a pointer to an array of xmlChar 6268 * @encoding: a free form C string describing the HTML document encoding, or NULL 6269 * @sax: the SAX handler block 6270 * @userData: if using SAX, this pointer will be provided on callbacks. 6271 * 6272 * Parse an HTML in-memory document. If sax is not NULL, use the SAX callbacks 6273 * to handle parse events. If sax is NULL, fallback to the default DOM 6274 * behavior and return a tree. 6275 * 6276 * Returns the resulting document tree unless SAX is NULL or the document is 6277 * not well formed. 6278 */ 6279 6280 htmlDocPtr 6281 htmlSAXParseDoc(const xmlChar *cur, const char *encoding, 6282 htmlSAXHandlerPtr sax, void *userData) { 6283 htmlDocPtr ret; 6284 htmlParserCtxtPtr ctxt; 6285 6286 xmlInitParser(); 6287 6288 if (cur == NULL) return(NULL); 6289 6290 6291 ctxt = htmlCreateDocParserCtxt(cur, encoding); 6292 if (ctxt == NULL) return(NULL); 6293 if (sax != NULL) { 6294 if (ctxt->sax != NULL) xmlFree (ctxt->sax); 6295 ctxt->sax = sax; 6296 ctxt->userData = userData; 6297 } 6298 6299 htmlParseDocument(ctxt); 6300 ret = ctxt->myDoc; 6301 if (sax != NULL) { 6302 ctxt->sax = NULL; 6303 ctxt->userData = NULL; 6304 } 6305 htmlFreeParserCtxt(ctxt); 6306 6307 return(ret); 6308 } 6309 6310 /** 6311 * htmlParseDoc: 6312 * @cur: a pointer to an array of xmlChar 6313 * @encoding: a free form C string describing the HTML document encoding, or NULL 6314 * 6315 * parse an HTML in-memory document and build a tree. 
6316 * 6317 * Returns the resulting document tree 6318 */ 6319 6320 htmlDocPtr 6321 htmlParseDoc(const xmlChar *cur, const char *encoding) { 6322 return(htmlSAXParseDoc(cur, encoding, NULL, NULL)); 6323 } 6324 6325 6326 /** 6327 * htmlCreateFileParserCtxt: 6328 * @filename: the filename 6329 * @encoding: a free form C string describing the HTML document encoding, or NULL 6330 * 6331 * Create a parser context for a file content. 6332 * Automatic support for ZLIB/Compress compressed document is provided 6333 * by default if found at compile-time. 6334 * 6335 * Returns the new parser context or NULL 6336 */ 6337 htmlParserCtxtPtr 6338 htmlCreateFileParserCtxt(const char *filename, const char *encoding) 6339 { 6340 htmlParserCtxtPtr ctxt; 6341 htmlParserInputPtr inputStream; 6342 char *canonicFilename; 6343 /* htmlCharEncoding enc; */ 6344 xmlChar *content, *content_line = (xmlChar *) "charset="; 6345 6346 if (filename == NULL) 6347 return(NULL); 6348 6349 ctxt = htmlNewParserCtxt(); 6350 if (ctxt == NULL) { 6351 return(NULL); 6352 } 6353 canonicFilename = (char *) xmlCanonicPath((const xmlChar *) filename); 6354 if (canonicFilename == NULL) { 6355 #ifdef LIBXML_SAX1_ENABLED 6356 if (xmlDefaultSAXHandler.error != NULL) { 6357 xmlDefaultSAXHandler.error(NULL, "out of memory\n"); 6358 } 6359 #endif 6360 xmlFreeParserCtxt(ctxt); 6361 return(NULL); 6362 } 6363 6364 inputStream = xmlLoadExternalEntity(canonicFilename, NULL, ctxt); 6365 xmlFree(canonicFilename); 6366 if (inputStream == NULL) { 6367 xmlFreeParserCtxt(ctxt); 6368 return(NULL); 6369 } 6370 6371 inputPush(ctxt, inputStream); 6372 6373 /* set encoding */ 6374 if (encoding) { 6375 size_t l = strlen(encoding); 6376 6377 if (l < 1000) { 6378 content = xmlMallocAtomic (xmlStrlen(content_line) + l + 1); 6379 if (content) { 6380 strcpy ((char *)content, (char *)content_line); 6381 strcat ((char *)content, (char *)encoding); 6382 htmlCheckEncoding (ctxt, content); 6383 xmlFree (content); 6384 } 6385 } 6386 } 6387 6388 
return(ctxt); 6389 } 6390 6391 /** 6392 * htmlSAXParseFile: 6393 * @filename: the filename 6394 * @encoding: a free form C string describing the HTML document encoding, or NULL 6395 * @sax: the SAX handler block 6396 * @userData: if using SAX, this pointer will be provided on callbacks. 6397 * 6398 * parse an HTML file and build a tree. Automatic support for ZLIB/Compress 6399 * compressed document is provided by default if found at compile-time. 6400 * It use the given SAX function block to handle the parsing callback. 6401 * If sax is NULL, fallback to the default DOM tree building routines. 6402 * 6403 * Returns the resulting document tree unless SAX is NULL or the document is 6404 * not well formed. 6405 */ 6406 6407 htmlDocPtr 6408 htmlSAXParseFile(const char *filename, const char *encoding, htmlSAXHandlerPtr sax, 6409 void *userData) { 6410 htmlDocPtr ret; 6411 htmlParserCtxtPtr ctxt; 6412 htmlSAXHandlerPtr oldsax = NULL; 6413 6414 xmlInitParser(); 6415 6416 ctxt = htmlCreateFileParserCtxt(filename, encoding); 6417 if (ctxt == NULL) return(NULL); 6418 if (sax != NULL) { 6419 oldsax = ctxt->sax; 6420 ctxt->sax = sax; 6421 ctxt->userData = userData; 6422 } 6423 6424 htmlParseDocument(ctxt); 6425 6426 ret = ctxt->myDoc; 6427 if (sax != NULL) { 6428 ctxt->sax = oldsax; 6429 ctxt->userData = NULL; 6430 } 6431 htmlFreeParserCtxt(ctxt); 6432 6433 return(ret); 6434 } 6435 6436 /** 6437 * htmlParseFile: 6438 * @filename: the filename 6439 * @encoding: a free form C string describing the HTML document encoding, or NULL 6440 * 6441 * parse an HTML file and build a tree. Automatic support for ZLIB/Compress 6442 * compressed document is provided by default if found at compile-time. 
6443 * 6444 * Returns the resulting document tree 6445 */ 6446 6447 htmlDocPtr 6448 htmlParseFile(const char *filename, const char *encoding) { 6449 return(htmlSAXParseFile(filename, encoding, NULL, NULL)); 6450 } 6451 6452 /** 6453 * htmlHandleOmittedElem: 6454 * @val: int 0 or 1 6455 * 6456 * Set and return the previous value for handling HTML omitted tags. 6457 * 6458 * Returns the last value for 0 for no handling, 1 for auto insertion. 6459 */ 6460 6461 int 6462 htmlHandleOmittedElem(int val) { 6463 int old = htmlOmittedDefaultValue; 6464 6465 htmlOmittedDefaultValue = val; 6466 return(old); 6467 } 6468 6469 /** 6470 * htmlElementAllowedHere: 6471 * @parent: HTML parent element 6472 * @elt: HTML element 6473 * 6474 * Checks whether an HTML element may be a direct child of a parent element. 6475 * Note - doesn't check for deprecated elements 6476 * 6477 * Returns 1 if allowed; 0 otherwise. 6478 */ 6479 int 6480 htmlElementAllowedHere(const htmlElemDesc* parent, const xmlChar* elt) { 6481 const char** p ; 6482 6483 if ( ! elt || ! parent || ! parent->subelts ) 6484 return 0 ; 6485 6486 for ( p = parent->subelts; *p; ++p ) 6487 if ( !xmlStrcmp((const xmlChar *)*p, elt) ) 6488 return 1 ; 6489 6490 return 0 ; 6491 } 6492 /** 6493 * htmlElementStatusHere: 6494 * @parent: HTML parent element 6495 * @elt: HTML element 6496 * 6497 * Checks whether an HTML element may be a direct child of a parent element. 6498 * and if so whether it is valid or deprecated. 6499 * 6500 * Returns one of HTML_VALID, HTML_DEPRECATED, HTML_INVALID 6501 */ 6502 htmlStatus 6503 htmlElementStatusHere(const htmlElemDesc* parent, const htmlElemDesc* elt) { 6504 if ( ! parent || ! elt ) 6505 return HTML_INVALID ; 6506 if ( ! htmlElementAllowedHere(parent, (const xmlChar*) elt->name ) ) 6507 return HTML_INVALID ; 6508 6509 return ( elt->dtd == 0 ) ? 
HTML_VALID : HTML_DEPRECATED ; 6510 } 6511 /** 6512 * htmlAttrAllowed: 6513 * @elt: HTML element 6514 * @attr: HTML attribute 6515 * @legacy: whether to allow deprecated attributes 6516 * 6517 * Checks whether an attribute is valid for an element 6518 * Has full knowledge of Required and Deprecated attributes 6519 * 6520 * Returns one of HTML_REQUIRED, HTML_VALID, HTML_DEPRECATED, HTML_INVALID 6521 */ 6522 htmlStatus 6523 htmlAttrAllowed(const htmlElemDesc* elt, const xmlChar* attr, int legacy) { 6524 const char** p ; 6525 6526 if ( !elt || ! attr ) 6527 return HTML_INVALID ; 6528 6529 if ( elt->attrs_req ) 6530 for ( p = elt->attrs_req; *p; ++p) 6531 if ( !xmlStrcmp((const xmlChar*)*p, attr) ) 6532 return HTML_REQUIRED ; 6533 6534 if ( elt->attrs_opt ) 6535 for ( p = elt->attrs_opt; *p; ++p) 6536 if ( !xmlStrcmp((const xmlChar*)*p, attr) ) 6537 return HTML_VALID ; 6538 6539 if ( legacy && elt->attrs_depr ) 6540 for ( p = elt->attrs_depr; *p; ++p) 6541 if ( !xmlStrcmp((const xmlChar*)*p, attr) ) 6542 return HTML_DEPRECATED ; 6543 6544 return HTML_INVALID ; 6545 } 6546 /** 6547 * htmlNodeStatus: 6548 * @node: an htmlNodePtr in a tree 6549 * @legacy: whether to allow deprecated elements (YES is faster here 6550 * for Element nodes) 6551 * 6552 * Checks whether the tree node is valid. Experimental (the author 6553 * only uses the HTML enhancements in a SAX parser) 6554 * 6555 * Return: for Element nodes, a return from htmlElementAllowedHere (if 6556 * legacy allowed) or htmlElementStatusHere (otherwise). 6557 * for Attribute nodes, a return from htmlAttrAllowed 6558 * for other nodes, HTML_NA (no checks performed) 6559 */ 6560 htmlStatus 6561 htmlNodeStatus(const htmlNodePtr node, int legacy) { 6562 if ( ! node ) 6563 return HTML_INVALID ; 6564 6565 switch ( node->type ) { 6566 case XML_ELEMENT_NODE: 6567 return legacy 6568 ? ( htmlElementAllowedHere ( 6569 htmlTagLookup(node->parent->name) , node->name 6570 ) ? 
HTML_VALID : HTML_INVALID ) 6571 : htmlElementStatusHere( 6572 htmlTagLookup(node->parent->name) , 6573 htmlTagLookup(node->name) ) 6574 ; 6575 case XML_ATTRIBUTE_NODE: 6576 return htmlAttrAllowed( 6577 htmlTagLookup(node->parent->name) , node->name, legacy) ; 6578 default: return HTML_NA ; 6579 } 6580 } 6581 /************************************************************************ 6582 * * 6583 * New set (2.6.0) of simpler and more flexible APIs * 6584 * * 6585 ************************************************************************/ 6586 /** 6587 * DICT_FREE: 6588 * @str: a string 6589 * 6590 * Free a string if it is not owned by the "dict" dictionary in the 6591 * current scope 6592 */ 6593 #define DICT_FREE(str) \ 6594 if ((str) && ((!dict) || \ 6595 (xmlDictOwns(dict, (const xmlChar *)(str)) == 0))) \ 6596 xmlFree((char *)(str)); 6597 6598 /** 6599 * htmlCtxtReset: 6600 * @ctxt: an HTML parser context 6601 * 6602 * Reset a parser context 6603 */ 6604 void 6605 htmlCtxtReset(htmlParserCtxtPtr ctxt) 6606 { 6607 xmlParserInputPtr input; 6608 xmlDictPtr dict; 6609 6610 if (ctxt == NULL) 6611 return; 6612 6613 xmlInitParser(); 6614 dict = ctxt->dict; 6615 6616 while ((input = inputPop(ctxt)) != NULL) { /* Non consuming */ 6617 xmlFreeInputStream(input); 6618 } 6619 ctxt->inputNr = 0; 6620 ctxt->input = NULL; 6621 6622 ctxt->spaceNr = 0; 6623 if (ctxt->spaceTab != NULL) { 6624 ctxt->spaceTab[0] = -1; 6625 ctxt->space = &ctxt->spaceTab[0]; 6626 } else { 6627 ctxt->space = NULL; 6628 } 6629 6630 6631 ctxt->nodeNr = 0; 6632 ctxt->node = NULL; 6633 6634 ctxt->nameNr = 0; 6635 ctxt->name = NULL; 6636 6637 DICT_FREE(ctxt->version); 6638 ctxt->version = NULL; 6639 DICT_FREE(ctxt->encoding); 6640 ctxt->encoding = NULL; 6641 DICT_FREE(ctxt->directory); 6642 ctxt->directory = NULL; 6643 DICT_FREE(ctxt->extSubURI); 6644 ctxt->extSubURI = NULL; 6645 DICT_FREE(ctxt->extSubSystem); 6646 ctxt->extSubSystem = NULL; 6647 if (ctxt->myDoc != NULL) 6648 xmlFreeDoc(ctxt->myDoc); 6649 
ctxt->myDoc = NULL; 6650 6651 ctxt->standalone = -1; 6652 ctxt->hasExternalSubset = 0; 6653 ctxt->hasPErefs = 0; 6654 ctxt->html = 1; 6655 ctxt->external = 0; 6656 ctxt->instate = XML_PARSER_START; 6657 ctxt->token = 0; 6658 6659 ctxt->wellFormed = 1; 6660 ctxt->nsWellFormed = 1; 6661 ctxt->disableSAX = 0; 6662 ctxt->valid = 1; 6663 ctxt->vctxt.userData = ctxt; 6664 ctxt->vctxt.error = xmlParserValidityError; 6665 ctxt->vctxt.warning = xmlParserValidityWarning; 6666 ctxt->record_info = 0; 6667 ctxt->nbChars = 0; 6668 ctxt->checkIndex = 0; 6669 ctxt->inSubset = 0; 6670 ctxt->errNo = XML_ERR_OK; 6671 ctxt->depth = 0; 6672 ctxt->charset = XML_CHAR_ENCODING_NONE; 6673 ctxt->catalogs = NULL; 6674 xmlInitNodeInfoSeq(&ctxt->node_seq); 6675 6676 if (ctxt->attsDefault != NULL) { 6677 xmlHashFree(ctxt->attsDefault, (xmlHashDeallocator) xmlFree); 6678 ctxt->attsDefault = NULL; 6679 } 6680 if (ctxt->attsSpecial != NULL) { 6681 xmlHashFree(ctxt->attsSpecial, NULL); 6682 ctxt->attsSpecial = NULL; 6683 } 6684 } 6685 6686 /** 6687 * htmlCtxtUseOptions: 6688 * @ctxt: an HTML parser context 6689 * @options: a combination of htmlParserOption(s) 6690 * 6691 * Applies the options to the parser context 6692 * 6693 * Returns 0 in case of success, the set of unknown or unimplemented options 6694 * in case of error. 
6695 */ 6696 int 6697 htmlCtxtUseOptions(htmlParserCtxtPtr ctxt, int options) 6698 { 6699 if (ctxt == NULL) 6700 return(-1); 6701 6702 if (options & HTML_PARSE_NOWARNING) { 6703 ctxt->sax->warning = NULL; 6704 ctxt->vctxt.warning = NULL; 6705 options -= XML_PARSE_NOWARNING; 6706 ctxt->options |= XML_PARSE_NOWARNING; 6707 } 6708 if (options & HTML_PARSE_NOERROR) { 6709 ctxt->sax->error = NULL; 6710 ctxt->vctxt.error = NULL; 6711 ctxt->sax->fatalError = NULL; 6712 options -= XML_PARSE_NOERROR; 6713 ctxt->options |= XML_PARSE_NOERROR; 6714 } 6715 if (options & HTML_PARSE_PEDANTIC) { 6716 ctxt->pedantic = 1; 6717 options -= XML_PARSE_PEDANTIC; 6718 ctxt->options |= XML_PARSE_PEDANTIC; 6719 } else 6720 ctxt->pedantic = 0; 6721 if (options & XML_PARSE_NOBLANKS) { 6722 ctxt->keepBlanks = 0; 6723 ctxt->sax->ignorableWhitespace = xmlSAX2IgnorableWhitespace; 6724 options -= XML_PARSE_NOBLANKS; 6725 ctxt->options |= XML_PARSE_NOBLANKS; 6726 } else 6727 ctxt->keepBlanks = 1; 6728 if (options & HTML_PARSE_RECOVER) { 6729 ctxt->recovery = 1; 6730 options -= HTML_PARSE_RECOVER; 6731 } else 6732 ctxt->recovery = 0; 6733 if (options & HTML_PARSE_COMPACT) { 6734 ctxt->options |= HTML_PARSE_COMPACT; 6735 options -= HTML_PARSE_COMPACT; 6736 } 6737 if (options & XML_PARSE_HUGE) { 6738 ctxt->options |= XML_PARSE_HUGE; 6739 options -= XML_PARSE_HUGE; 6740 } 6741 if (options & HTML_PARSE_NODEFDTD) { 6742 ctxt->options |= HTML_PARSE_NODEFDTD; 6743 options -= HTML_PARSE_NODEFDTD; 6744 } 6745 if (options & HTML_PARSE_IGNORE_ENC) { 6746 ctxt->options |= HTML_PARSE_IGNORE_ENC; 6747 options -= HTML_PARSE_IGNORE_ENC; 6748 } 6749 if (options & HTML_PARSE_NOIMPLIED) { 6750 ctxt->options |= HTML_PARSE_NOIMPLIED; 6751 options -= HTML_PARSE_NOIMPLIED; 6752 } 6753 ctxt->dictNames = 0; 6754 return (options); 6755 } 6756 6757 /** 6758 * htmlDoRead: 6759 * @ctxt: an HTML parser context 6760 * @URL: the base URL to use for the document 6761 * @encoding: the document encoding, or NULL 6762 * @options: a 
combination of htmlParserOption(s)
 * @reuse:  keep the context for reuse
 *
 * Common front-end for the htmlRead functions
 *
 * Ownership of the parsed document is taken away from the context before
 * returning. Unless @reuse is set, the context is freed here.
 *
 * Returns the resulting document tree or NULL
 */
static htmlDocPtr
htmlDoRead(htmlParserCtxtPtr ctxt, const char *URL, const char *encoding,
          int options, int reuse)
{
    htmlDocPtr ret;

    htmlCtxtUseOptions(ctxt, options);
    ctxt->html = 1;
    if (encoding != NULL) {
        xmlCharEncodingHandlerPtr hdlr;

        hdlr = xmlFindCharEncodingHandler(encoding);
        if (hdlr != NULL) {
            xmlSwitchToEncoding(ctxt, hdlr);
            /*
             * Record the forced encoding on the current input. The input
             * is checked for NULL here exactly as it is for the filename
             * below, instead of being dereferenced unconditionally.
             */
            if (ctxt->input != NULL) {
                if (ctxt->input->encoding != NULL)
                    xmlFree((xmlChar *) ctxt->input->encoding);
                ctxt->input->encoding = xmlStrdup((xmlChar *)encoding);
            }
        }
    }
    /* Only name the input after @URL when it has no filename yet. */
    if ((URL != NULL) && (ctxt->input != NULL) &&
        (ctxt->input->filename == NULL))
        ctxt->input->filename = (char *) xmlStrdup((const xmlChar *) URL);
    htmlParseDocument(ctxt);
    ret = ctxt->myDoc;
    ctxt->myDoc = NULL;
    if (!reuse) {
        /*
         * If the document shares the context dictionary, detach the
         * dictionary from the context before the context is freed.
         */
        if ((ctxt->dictNames) &&
            (ret != NULL) &&
            (ret->dict == ctxt->dict))
            ctxt->dict = NULL;
        xmlFreeParserCtxt(ctxt);
    }
    return (ret);
}

/**
 * htmlReadDoc:
 * @cur:  a pointer to a zero terminated string
 * @URL:  the base URL to use for the document
 * @encoding:  the document encoding, or NULL
 * @options:  a combination of htmlParserOption(s)
 *
 * parse an HTML in-memory document and build a tree.
6812 * 6813 * Returns the resulting document tree 6814 */ 6815 htmlDocPtr 6816 htmlReadDoc(const xmlChar * cur, const char *URL, const char *encoding, int options) 6817 { 6818 htmlParserCtxtPtr ctxt; 6819 6820 if (cur == NULL) 6821 return (NULL); 6822 6823 xmlInitParser(); 6824 ctxt = htmlCreateDocParserCtxt(cur, NULL); 6825 if (ctxt == NULL) 6826 return (NULL); 6827 return (htmlDoRead(ctxt, URL, encoding, options, 0)); 6828 } 6829 6830 /** 6831 * htmlReadFile: 6832 * @filename: a file or URL 6833 * @encoding: the document encoding, or NULL 6834 * @options: a combination of htmlParserOption(s) 6835 * 6836 * parse an XML file from the filesystem or the network. 6837 * 6838 * Returns the resulting document tree 6839 */ 6840 htmlDocPtr 6841 htmlReadFile(const char *filename, const char *encoding, int options) 6842 { 6843 htmlParserCtxtPtr ctxt; 6844 6845 xmlInitParser(); 6846 ctxt = htmlCreateFileParserCtxt(filename, encoding); 6847 if (ctxt == NULL) 6848 return (NULL); 6849 return (htmlDoRead(ctxt, NULL, NULL, options, 0)); 6850 } 6851 6852 /** 6853 * htmlReadMemory: 6854 * @buffer: a pointer to a char array 6855 * @size: the size of the array 6856 * @URL: the base URL to use for the document 6857 * @encoding: the document encoding, or NULL 6858 * @options: a combination of htmlParserOption(s) 6859 * 6860 * parse an XML in-memory document and build a tree. 
6861 * 6862 * Returns the resulting document tree 6863 */ 6864 htmlDocPtr 6865 htmlReadMemory(const char *buffer, int size, const char *URL, const char *encoding, int options) 6866 { 6867 htmlParserCtxtPtr ctxt; 6868 6869 xmlInitParser(); 6870 ctxt = xmlCreateMemoryParserCtxt(buffer, size); 6871 if (ctxt == NULL) 6872 return (NULL); 6873 htmlDefaultSAXHandlerInit(); 6874 if (ctxt->sax != NULL) 6875 memcpy(ctxt->sax, &htmlDefaultSAXHandler, sizeof(xmlSAXHandlerV1)); 6876 return (htmlDoRead(ctxt, URL, encoding, options, 0)); 6877 } 6878 6879 /** 6880 * htmlReadFd: 6881 * @fd: an open file descriptor 6882 * @URL: the base URL to use for the document 6883 * @encoding: the document encoding, or NULL 6884 * @options: a combination of htmlParserOption(s) 6885 * 6886 * parse an XML from a file descriptor and build a tree. 6887 * 6888 * Returns the resulting document tree 6889 */ 6890 htmlDocPtr 6891 htmlReadFd(int fd, const char *URL, const char *encoding, int options) 6892 { 6893 htmlParserCtxtPtr ctxt; 6894 xmlParserInputBufferPtr input; 6895 xmlParserInputPtr stream; 6896 6897 if (fd < 0) 6898 return (NULL); 6899 xmlInitParser(); 6900 6901 xmlInitParser(); 6902 input = xmlParserInputBufferCreateFd(fd, XML_CHAR_ENCODING_NONE); 6903 if (input == NULL) 6904 return (NULL); 6905 ctxt = xmlNewParserCtxt(); 6906 if (ctxt == NULL) { 6907 xmlFreeParserInputBuffer(input); 6908 return (NULL); 6909 } 6910 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE); 6911 if (stream == NULL) { 6912 xmlFreeParserInputBuffer(input); 6913 xmlFreeParserCtxt(ctxt); 6914 return (NULL); 6915 } 6916 inputPush(ctxt, stream); 6917 return (htmlDoRead(ctxt, URL, encoding, options, 0)); 6918 } 6919 6920 /** 6921 * htmlReadIO: 6922 * @ioread: an I/O read function 6923 * @ioclose: an I/O close function 6924 * @ioctx: an I/O handler 6925 * @URL: the base URL to use for the document 6926 * @encoding: the document encoding, or NULL 6927 * @options: a combination of htmlParserOption(s) 6928 * 
6929 * parse an HTML document from I/O functions and source and build a tree. 6930 * 6931 * Returns the resulting document tree 6932 */ 6933 htmlDocPtr 6934 htmlReadIO(xmlInputReadCallback ioread, xmlInputCloseCallback ioclose, 6935 void *ioctx, const char *URL, const char *encoding, int options) 6936 { 6937 htmlParserCtxtPtr ctxt; 6938 xmlParserInputBufferPtr input; 6939 xmlParserInputPtr stream; 6940 6941 if (ioread == NULL) 6942 return (NULL); 6943 xmlInitParser(); 6944 6945 input = xmlParserInputBufferCreateIO(ioread, ioclose, ioctx, 6946 XML_CHAR_ENCODING_NONE); 6947 if (input == NULL) { 6948 if (ioclose != NULL) 6949 ioclose(ioctx); 6950 return (NULL); 6951 } 6952 ctxt = htmlNewParserCtxt(); 6953 if (ctxt == NULL) { 6954 xmlFreeParserInputBuffer(input); 6955 return (NULL); 6956 } 6957 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE); 6958 if (stream == NULL) { 6959 xmlFreeParserInputBuffer(input); 6960 xmlFreeParserCtxt(ctxt); 6961 return (NULL); 6962 } 6963 inputPush(ctxt, stream); 6964 return (htmlDoRead(ctxt, URL, encoding, options, 0)); 6965 } 6966 6967 /** 6968 * htmlCtxtReadDoc: 6969 * @ctxt: an HTML parser context 6970 * @cur: a pointer to a zero terminated string 6971 * @URL: the base URL to use for the document 6972 * @encoding: the document encoding, or NULL 6973 * @options: a combination of htmlParserOption(s) 6974 * 6975 * parse an XML in-memory document and build a tree. 
6976 * This reuses the existing @ctxt parser context 6977 * 6978 * Returns the resulting document tree 6979 */ 6980 htmlDocPtr 6981 htmlCtxtReadDoc(htmlParserCtxtPtr ctxt, const xmlChar * cur, 6982 const char *URL, const char *encoding, int options) 6983 { 6984 xmlParserInputPtr stream; 6985 6986 if (cur == NULL) 6987 return (NULL); 6988 if (ctxt == NULL) 6989 return (NULL); 6990 xmlInitParser(); 6991 6992 htmlCtxtReset(ctxt); 6993 6994 stream = xmlNewStringInputStream(ctxt, cur); 6995 if (stream == NULL) { 6996 return (NULL); 6997 } 6998 inputPush(ctxt, stream); 6999 return (htmlDoRead(ctxt, URL, encoding, options, 1)); 7000 } 7001 7002 /** 7003 * htmlCtxtReadFile: 7004 * @ctxt: an HTML parser context 7005 * @filename: a file or URL 7006 * @encoding: the document encoding, or NULL 7007 * @options: a combination of htmlParserOption(s) 7008 * 7009 * parse an XML file from the filesystem or the network. 7010 * This reuses the existing @ctxt parser context 7011 * 7012 * Returns the resulting document tree 7013 */ 7014 htmlDocPtr 7015 htmlCtxtReadFile(htmlParserCtxtPtr ctxt, const char *filename, 7016 const char *encoding, int options) 7017 { 7018 xmlParserInputPtr stream; 7019 7020 if (filename == NULL) 7021 return (NULL); 7022 if (ctxt == NULL) 7023 return (NULL); 7024 xmlInitParser(); 7025 7026 htmlCtxtReset(ctxt); 7027 7028 stream = xmlLoadExternalEntity(filename, NULL, ctxt); 7029 if (stream == NULL) { 7030 return (NULL); 7031 } 7032 inputPush(ctxt, stream); 7033 return (htmlDoRead(ctxt, NULL, encoding, options, 1)); 7034 } 7035 7036 /** 7037 * htmlCtxtReadMemory: 7038 * @ctxt: an HTML parser context 7039 * @buffer: a pointer to a char array 7040 * @size: the size of the array 7041 * @URL: the base URL to use for the document 7042 * @encoding: the document encoding, or NULL 7043 * @options: a combination of htmlParserOption(s) 7044 * 7045 * parse an XML in-memory document and build a tree. 
7046 * This reuses the existing @ctxt parser context 7047 * 7048 * Returns the resulting document tree 7049 */ 7050 htmlDocPtr 7051 htmlCtxtReadMemory(htmlParserCtxtPtr ctxt, const char *buffer, int size, 7052 const char *URL, const char *encoding, int options) 7053 { 7054 xmlParserInputBufferPtr input; 7055 xmlParserInputPtr stream; 7056 7057 if (ctxt == NULL) 7058 return (NULL); 7059 if (buffer == NULL) 7060 return (NULL); 7061 xmlInitParser(); 7062 7063 htmlCtxtReset(ctxt); 7064 7065 input = xmlParserInputBufferCreateMem(buffer, size, XML_CHAR_ENCODING_NONE); 7066 if (input == NULL) { 7067 return(NULL); 7068 } 7069 7070 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE); 7071 if (stream == NULL) { 7072 xmlFreeParserInputBuffer(input); 7073 return(NULL); 7074 } 7075 7076 inputPush(ctxt, stream); 7077 return (htmlDoRead(ctxt, URL, encoding, options, 1)); 7078 } 7079 7080 /** 7081 * htmlCtxtReadFd: 7082 * @ctxt: an HTML parser context 7083 * @fd: an open file descriptor 7084 * @URL: the base URL to use for the document 7085 * @encoding: the document encoding, or NULL 7086 * @options: a combination of htmlParserOption(s) 7087 * 7088 * parse an XML from a file descriptor and build a tree. 
7089 * This reuses the existing @ctxt parser context 7090 * 7091 * Returns the resulting document tree 7092 */ 7093 htmlDocPtr 7094 htmlCtxtReadFd(htmlParserCtxtPtr ctxt, int fd, 7095 const char *URL, const char *encoding, int options) 7096 { 7097 xmlParserInputBufferPtr input; 7098 xmlParserInputPtr stream; 7099 7100 if (fd < 0) 7101 return (NULL); 7102 if (ctxt == NULL) 7103 return (NULL); 7104 xmlInitParser(); 7105 7106 htmlCtxtReset(ctxt); 7107 7108 7109 input = xmlParserInputBufferCreateFd(fd, XML_CHAR_ENCODING_NONE); 7110 if (input == NULL) 7111 return (NULL); 7112 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE); 7113 if (stream == NULL) { 7114 xmlFreeParserInputBuffer(input); 7115 return (NULL); 7116 } 7117 inputPush(ctxt, stream); 7118 return (htmlDoRead(ctxt, URL, encoding, options, 1)); 7119 } 7120 7121 /** 7122 * htmlCtxtReadIO: 7123 * @ctxt: an HTML parser context 7124 * @ioread: an I/O read function 7125 * @ioclose: an I/O close function 7126 * @ioctx: an I/O handler 7127 * @URL: the base URL to use for the document 7128 * @encoding: the document encoding, or NULL 7129 * @options: a combination of htmlParserOption(s) 7130 * 7131 * parse an HTML document from I/O functions and source and build a tree. 
7132 * This reuses the existing @ctxt parser context 7133 * 7134 * Returns the resulting document tree 7135 */ 7136 htmlDocPtr 7137 htmlCtxtReadIO(htmlParserCtxtPtr ctxt, xmlInputReadCallback ioread, 7138 xmlInputCloseCallback ioclose, void *ioctx, 7139 const char *URL, 7140 const char *encoding, int options) 7141 { 7142 xmlParserInputBufferPtr input; 7143 xmlParserInputPtr stream; 7144 7145 if (ioread == NULL) 7146 return (NULL); 7147 if (ctxt == NULL) 7148 return (NULL); 7149 xmlInitParser(); 7150 7151 htmlCtxtReset(ctxt); 7152 7153 input = xmlParserInputBufferCreateIO(ioread, ioclose, ioctx, 7154 XML_CHAR_ENCODING_NONE); 7155 if (input == NULL) { 7156 if (ioclose != NULL) 7157 ioclose(ioctx); 7158 return (NULL); 7159 } 7160 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE); 7161 if (stream == NULL) { 7162 xmlFreeParserInputBuffer(input); 7163 return (NULL); 7164 } 7165 inputPush(ctxt, stream); 7166 return (htmlDoRead(ctxt, URL, encoding, options, 1)); 7167 } 7168 7169 #define bottom_HTMLparser 7170 #include "elfgcchack.h" 7171 #endif /* LIBXML_HTML_ENABLED */ 7172