1 /* 2 * HTMLparser.c : an HTML 4.0 non-verifying parser 3 * 4 * See Copyright for the status of this software. 5 * 6 * daniel@veillard.com 7 */ 8 9 #define IN_LIBXML 10 #include "libxml.h" 11 #ifdef LIBXML_HTML_ENABLED 12 13 #include <string.h> 14 #ifdef HAVE_CTYPE_H 15 #include <ctype.h> 16 #endif 17 #ifdef HAVE_STDLIB_H 18 #include <stdlib.h> 19 #endif 20 #ifdef HAVE_SYS_STAT_H 21 #include <sys/stat.h> 22 #endif 23 #ifdef HAVE_FCNTL_H 24 #include <fcntl.h> 25 #endif 26 #ifdef HAVE_UNISTD_H 27 #include <unistd.h> 28 #endif 29 #ifdef LIBXML_ZLIB_ENABLED 30 #include <zlib.h> 31 #endif 32 33 #include <libxml/xmlmemory.h> 34 #include <libxml/tree.h> 35 #include <libxml/parser.h> 36 #include <libxml/parserInternals.h> 37 #include <libxml/xmlerror.h> 38 #include <libxml/HTMLparser.h> 39 #include <libxml/HTMLtree.h> 40 #include <libxml/entities.h> 41 #include <libxml/encoding.h> 42 #include <libxml/valid.h> 43 #include <libxml/xmlIO.h> 44 #include <libxml/globals.h> 45 #include <libxml/uri.h> 46 47 #include "buf.h" 48 #include "enc.h" 49 50 #define HTML_MAX_NAMELEN 1000 51 #define HTML_PARSER_BIG_BUFFER_SIZE 1000 52 #define HTML_PARSER_BUFFER_SIZE 100 53 54 /* #define DEBUG */ 55 /* #define DEBUG_PUSH */ 56 57 static int htmlOmittedDefaultValue = 1; 58 59 xmlChar * htmlDecodeEntities(htmlParserCtxtPtr ctxt, int len, 60 xmlChar end, xmlChar end2, xmlChar end3); 61 static void htmlParseComment(htmlParserCtxtPtr ctxt); 62 63 /************************************************************************ 64 * * 65 * Some factorized error routines * 66 * * 67 ************************************************************************/ 68 69 /** 70 * htmlErrMemory: 71 * @ctxt: an HTML parser context 72 * @extra: extra informations 73 * 74 * Handle a redefinition of attribute error 75 */ 76 static void 77 htmlErrMemory(xmlParserCtxtPtr ctxt, const char *extra) 78 { 79 if ((ctxt != NULL) && (ctxt->disableSAX != 0) && 80 (ctxt->instate == XML_PARSER_EOF)) 81 return; 82 if (ctxt != NULL) 
{ 83 ctxt->errNo = XML_ERR_NO_MEMORY; 84 ctxt->instate = XML_PARSER_EOF; 85 ctxt->disableSAX = 1; 86 } 87 if (extra) 88 __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_PARSER, 89 XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, extra, 90 NULL, NULL, 0, 0, 91 "Memory allocation failed : %s\n", extra); 92 else 93 __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_PARSER, 94 XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, NULL, 95 NULL, NULL, 0, 0, "Memory allocation failed\n"); 96 } 97 98 /** 99 * htmlParseErr: 100 * @ctxt: an HTML parser context 101 * @error: the error number 102 * @msg: the error message 103 * @str1: string infor 104 * @str2: string infor 105 * 106 * Handle a fatal parser error, i.e. violating Well-Formedness constraints 107 */ 108 static void LIBXML_ATTR_FORMAT(3,0) 109 htmlParseErr(xmlParserCtxtPtr ctxt, xmlParserErrors error, 110 const char *msg, const xmlChar *str1, const xmlChar *str2) 111 { 112 if ((ctxt != NULL) && (ctxt->disableSAX != 0) && 113 (ctxt->instate == XML_PARSER_EOF)) 114 return; 115 if (ctxt != NULL) 116 ctxt->errNo = error; 117 __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_HTML, error, 118 XML_ERR_ERROR, NULL, 0, 119 (const char *) str1, (const char *) str2, 120 NULL, 0, 0, 121 msg, str1, str2); 122 if (ctxt != NULL) 123 ctxt->wellFormed = 0; 124 } 125 126 /** 127 * htmlParseErrInt: 128 * @ctxt: an HTML parser context 129 * @error: the error number 130 * @msg: the error message 131 * @val: integer info 132 * 133 * Handle a fatal parser error, i.e. 
violating Well-Formedness constraints 134 */ 135 static void LIBXML_ATTR_FORMAT(3,0) 136 htmlParseErrInt(xmlParserCtxtPtr ctxt, xmlParserErrors error, 137 const char *msg, int val) 138 { 139 if ((ctxt != NULL) && (ctxt->disableSAX != 0) && 140 (ctxt->instate == XML_PARSER_EOF)) 141 return; 142 if (ctxt != NULL) 143 ctxt->errNo = error; 144 __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_HTML, error, 145 XML_ERR_ERROR, NULL, 0, NULL, NULL, 146 NULL, val, 0, msg, val); 147 if (ctxt != NULL) 148 ctxt->wellFormed = 0; 149 } 150 151 /************************************************************************ 152 * * 153 * Parser stacks related functions and macros * 154 * * 155 ************************************************************************/ 156 157 /** 158 * htmlnamePush: 159 * @ctxt: an HTML parser context 160 * @value: the element name 161 * 162 * Pushes a new element name on top of the name stack 163 * 164 * Returns 0 in case of error, the index in the stack otherwise 165 */ 166 static int 167 htmlnamePush(htmlParserCtxtPtr ctxt, const xmlChar * value) 168 { 169 if ((ctxt->html < 3) && (xmlStrEqual(value, BAD_CAST "head"))) 170 ctxt->html = 3; 171 if ((ctxt->html < 10) && (xmlStrEqual(value, BAD_CAST "body"))) 172 ctxt->html = 10; 173 if (ctxt->nameNr >= ctxt->nameMax) { 174 ctxt->nameMax *= 2; 175 ctxt->nameTab = (const xmlChar * *) 176 xmlRealloc((xmlChar * *)ctxt->nameTab, 177 ctxt->nameMax * 178 sizeof(ctxt->nameTab[0])); 179 if (ctxt->nameTab == NULL) { 180 htmlErrMemory(ctxt, NULL); 181 return (0); 182 } 183 } 184 ctxt->nameTab[ctxt->nameNr] = value; 185 ctxt->name = value; 186 return (ctxt->nameNr++); 187 } 188 /** 189 * htmlnamePop: 190 * @ctxt: an HTML parser context 191 * 192 * Pops the top element name from the name stack 193 * 194 * Returns the name just removed 195 */ 196 static const xmlChar * 197 htmlnamePop(htmlParserCtxtPtr ctxt) 198 { 199 const xmlChar *ret; 200 201 if (ctxt->nameNr <= 0) 202 return (NULL); 203 ctxt->nameNr--; 204 if 
(ctxt->nameNr < 0) 205 return (NULL); 206 if (ctxt->nameNr > 0) 207 ctxt->name = ctxt->nameTab[ctxt->nameNr - 1]; 208 else 209 ctxt->name = NULL; 210 ret = ctxt->nameTab[ctxt->nameNr]; 211 ctxt->nameTab[ctxt->nameNr] = NULL; 212 return (ret); 213 } 214 215 /** 216 * htmlNodeInfoPush: 217 * @ctxt: an HTML parser context 218 * @value: the node info 219 * 220 * Pushes a new element name on top of the node info stack 221 * 222 * Returns 0 in case of error, the index in the stack otherwise 223 */ 224 static int 225 htmlNodeInfoPush(htmlParserCtxtPtr ctxt, htmlParserNodeInfo *value) 226 { 227 if (ctxt->nodeInfoNr >= ctxt->nodeInfoMax) { 228 if (ctxt->nodeInfoMax == 0) 229 ctxt->nodeInfoMax = 5; 230 ctxt->nodeInfoMax *= 2; 231 ctxt->nodeInfoTab = (htmlParserNodeInfo *) 232 xmlRealloc((htmlParserNodeInfo *)ctxt->nodeInfoTab, 233 ctxt->nodeInfoMax * 234 sizeof(ctxt->nodeInfoTab[0])); 235 if (ctxt->nodeInfoTab == NULL) { 236 htmlErrMemory(ctxt, NULL); 237 return (0); 238 } 239 } 240 ctxt->nodeInfoTab[ctxt->nodeInfoNr] = *value; 241 ctxt->nodeInfo = &ctxt->nodeInfoTab[ctxt->nodeInfoNr]; 242 return (ctxt->nodeInfoNr++); 243 } 244 245 /** 246 * htmlNodeInfoPop: 247 * @ctxt: an HTML parser context 248 * 249 * Pops the top element name from the node info stack 250 * 251 * Returns 0 in case of error, the pointer to NodeInfo otherwise 252 */ 253 static htmlParserNodeInfo * 254 htmlNodeInfoPop(htmlParserCtxtPtr ctxt) 255 { 256 if (ctxt->nodeInfoNr <= 0) 257 return (NULL); 258 ctxt->nodeInfoNr--; 259 if (ctxt->nodeInfoNr < 0) 260 return (NULL); 261 if (ctxt->nodeInfoNr > 0) 262 ctxt->nodeInfo = &ctxt->nodeInfoTab[ctxt->nodeInfoNr - 1]; 263 else 264 ctxt->nodeInfo = NULL; 265 return &ctxt->nodeInfoTab[ctxt->nodeInfoNr]; 266 } 267 268 /* 269 * Macros for accessing the content. Those should be used only by the parser, 270 * and not exported. 271 * 272 * Dirty macros, i.e. 
one need to make assumption on the context to use them 273 * 274 * CUR_PTR return the current pointer to the xmlChar to be parsed. 275 * CUR returns the current xmlChar value, i.e. a 8 bit value if compiled 276 * in ISO-Latin or UTF-8, and the current 16 bit value if compiled 277 * in UNICODE mode. This should be used internally by the parser 278 * only to compare to ASCII values otherwise it would break when 279 * running with UTF-8 encoding. 280 * NXT(n) returns the n'th next xmlChar. Same as CUR is should be used only 281 * to compare on ASCII based substring. 282 * UPP(n) returns the n'th next xmlChar converted to uppercase. Same as CUR 283 * it should be used only to compare on ASCII based substring. 284 * SKIP(n) Skip n xmlChar, and must also be used only to skip ASCII defined 285 * strings without newlines within the parser. 286 * 287 * Clean macros, not dependent of an ASCII context, expect UTF-8 encoding 288 * 289 * CURRENT Returns the current char value, with the full decoding of 290 * UTF-8 if we are using this mode. It returns an int. 291 * NEXT Skip to the next character, this does the proper decoding 292 * in UTF-8 mode. It also pop-up unfinished entities on the fly. 293 * NEXTL(l) Skip the current unicode character of l xmlChars long. 
294 * COPY(to) copy one char to *to, increment CUR_PTR and to accordingly 295 */ 296 297 #define UPPER (toupper(*ctxt->input->cur)) 298 299 #define SKIP(val) ctxt->nbChars += (val),ctxt->input->cur += (val),ctxt->input->col+=(val) 300 301 #define NXT(val) ctxt->input->cur[(val)] 302 303 #define UPP(val) (toupper(ctxt->input->cur[(val)])) 304 305 #define CUR_PTR ctxt->input->cur 306 #define BASE_PTR ctxt->input->base 307 308 #define SHRINK if ((ctxt->input->cur - ctxt->input->base > 2 * INPUT_CHUNK) && \ 309 (ctxt->input->end - ctxt->input->cur < 2 * INPUT_CHUNK)) \ 310 xmlParserInputShrink(ctxt->input) 311 312 #define GROW if ((ctxt->progressive == 0) && \ 313 (ctxt->input->end - ctxt->input->cur < INPUT_CHUNK)) \ 314 xmlParserInputGrow(ctxt->input, INPUT_CHUNK) 315 316 #define CURRENT ((int) (*ctxt->input->cur)) 317 318 #define SKIP_BLANKS htmlSkipBlankChars(ctxt) 319 320 /* Imported from XML */ 321 322 /* #define CUR (ctxt->token ? ctxt->token : (int) (*ctxt->input->cur)) */ 323 #define CUR ((int) (*ctxt->input->cur)) 324 #define NEXT xmlNextChar(ctxt) 325 326 #define RAW (ctxt->token ? 
-1 : (*ctxt->input->cur)) 327 328 329 #define NEXTL(l) do { \ 330 if (*(ctxt->input->cur) == '\n') { \ 331 ctxt->input->line++; ctxt->input->col = 1; \ 332 } else ctxt->input->col++; \ 333 ctxt->token = 0; ctxt->input->cur += l; ctxt->nbChars++; \ 334 } while (0) 335 336 /************ 337 \ 338 if (*ctxt->input->cur == '%') xmlParserHandlePEReference(ctxt); \ 339 if (*ctxt->input->cur == '&') xmlParserHandleReference(ctxt); 340 ************/ 341 342 #define CUR_CHAR(l) htmlCurrentChar(ctxt, &l) 343 #define CUR_SCHAR(s, l) xmlStringCurrentChar(ctxt, s, &l) 344 345 #define COPY_BUF(l,b,i,v) \ 346 if (l == 1) b[i++] = (xmlChar) v; \ 347 else i += xmlCopyChar(l,&b[i],v) 348 349 /** 350 * htmlFindEncoding: 351 * @the HTML parser context 352 * 353 * Ty to find and encoding in the current data available in the input 354 * buffer this is needed to try to switch to the proper encoding when 355 * one face a character error. 356 * That's an heuristic, since it's operating outside of parsing it could 357 * try to use a meta which had been commented out, that's the reason it 358 * should only be used in case of error, not as a default. 
359 * 360 * Returns an encoding string or NULL if not found, the string need to 361 * be freed 362 */ 363 static xmlChar * 364 htmlFindEncoding(xmlParserCtxtPtr ctxt) { 365 const xmlChar *start, *cur, *end; 366 367 if ((ctxt == NULL) || (ctxt->input == NULL) || 368 (ctxt->input->encoding != NULL) || (ctxt->input->buf == NULL) || 369 (ctxt->input->buf->encoder != NULL)) 370 return(NULL); 371 if ((ctxt->input->cur == NULL) || (ctxt->input->end == NULL)) 372 return(NULL); 373 374 start = ctxt->input->cur; 375 end = ctxt->input->end; 376 /* we also expect the input buffer to be zero terminated */ 377 if (*end != 0) 378 return(NULL); 379 380 cur = xmlStrcasestr(start, BAD_CAST "HTTP-EQUIV"); 381 if (cur == NULL) 382 return(NULL); 383 cur = xmlStrcasestr(cur, BAD_CAST "CONTENT"); 384 if (cur == NULL) 385 return(NULL); 386 cur = xmlStrcasestr(cur, BAD_CAST "CHARSET="); 387 if (cur == NULL) 388 return(NULL); 389 cur += 8; 390 start = cur; 391 while (((*cur >= 'A') && (*cur <= 'Z')) || 392 ((*cur >= 'a') && (*cur <= 'z')) || 393 ((*cur >= '0') && (*cur <= '9')) || 394 (*cur == '-') || (*cur == '_') || (*cur == ':') || (*cur == '/')) 395 cur++; 396 if (cur == start) 397 return(NULL); 398 return(xmlStrndup(start, cur - start)); 399 } 400 401 /** 402 * htmlCurrentChar: 403 * @ctxt: the HTML parser context 404 * @len: pointer to the length of the char read 405 * 406 * The current char value, if using UTF-8 this may actually span multiple 407 * bytes in the input buffer. Implement the end of line normalization: 408 * 2.11 End-of-Line Handling 409 * If the encoding is unspecified, in the case we find an ISO-Latin-1 410 * char, then the encoding converter is plugged in automatically. 
411 * 412 * Returns the current char value and its length 413 */ 414 415 static int 416 htmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) { 417 if (ctxt->instate == XML_PARSER_EOF) 418 return(0); 419 420 if (ctxt->token != 0) { 421 *len = 0; 422 return(ctxt->token); 423 } 424 if (ctxt->charset == XML_CHAR_ENCODING_UTF8) { 425 /* 426 * We are supposed to handle UTF8, check it's valid 427 * From rfc2044: encoding of the Unicode values on UTF-8: 428 * 429 * UCS-4 range (hex.) UTF-8 octet sequence (binary) 430 * 0000 0000-0000 007F 0xxxxxxx 431 * 0000 0080-0000 07FF 110xxxxx 10xxxxxx 432 * 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx 433 * 434 * Check for the 0x110000 limit too 435 */ 436 const unsigned char *cur = ctxt->input->cur; 437 unsigned char c; 438 unsigned int val; 439 440 c = *cur; 441 if (c & 0x80) { 442 if (cur[1] == 0) { 443 xmlParserInputGrow(ctxt->input, INPUT_CHUNK); 444 cur = ctxt->input->cur; 445 } 446 if ((cur[1] & 0xc0) != 0x80) 447 goto encoding_error; 448 if ((c & 0xe0) == 0xe0) { 449 450 if (cur[2] == 0) { 451 xmlParserInputGrow(ctxt->input, INPUT_CHUNK); 452 cur = ctxt->input->cur; 453 } 454 if ((cur[2] & 0xc0) != 0x80) 455 goto encoding_error; 456 if ((c & 0xf0) == 0xf0) { 457 if (cur[3] == 0) { 458 xmlParserInputGrow(ctxt->input, INPUT_CHUNK); 459 cur = ctxt->input->cur; 460 } 461 if (((c & 0xf8) != 0xf0) || 462 ((cur[3] & 0xc0) != 0x80)) 463 goto encoding_error; 464 /* 4-byte code */ 465 *len = 4; 466 val = (cur[0] & 0x7) << 18; 467 val |= (cur[1] & 0x3f) << 12; 468 val |= (cur[2] & 0x3f) << 6; 469 val |= cur[3] & 0x3f; 470 } else { 471 /* 3-byte code */ 472 *len = 3; 473 val = (cur[0] & 0xf) << 12; 474 val |= (cur[1] & 0x3f) << 6; 475 val |= cur[2] & 0x3f; 476 } 477 } else { 478 /* 2-byte code */ 479 *len = 2; 480 val = (cur[0] & 0x1f) << 6; 481 val |= cur[1] & 0x3f; 482 } 483 if (!IS_CHAR(val)) { 484 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR, 485 "Char 0x%X out of allowed range\n", val); 486 } 487 return(val); 488 } else { 489 if 
((*ctxt->input->cur == 0) && 490 (ctxt->input->cur < ctxt->input->end)) { 491 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR, 492 "Char 0x%X out of allowed range\n", 0); 493 *len = 1; 494 return(' '); 495 } 496 /* 1-byte code */ 497 *len = 1; 498 return((int) *ctxt->input->cur); 499 } 500 } 501 /* 502 * Assume it's a fixed length encoding (1) with 503 * a compatible encoding for the ASCII set, since 504 * XML constructs only use < 128 chars 505 */ 506 *len = 1; 507 if ((int) *ctxt->input->cur < 0x80) 508 return((int) *ctxt->input->cur); 509 510 /* 511 * Humm this is bad, do an automatic flow conversion 512 */ 513 { 514 xmlChar * guess; 515 xmlCharEncodingHandlerPtr handler; 516 517 guess = htmlFindEncoding(ctxt); 518 if (guess == NULL) { 519 xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1); 520 } else { 521 if (ctxt->input->encoding != NULL) 522 xmlFree((xmlChar *) ctxt->input->encoding); 523 ctxt->input->encoding = guess; 524 handler = xmlFindCharEncodingHandler((const char *) guess); 525 if (handler != NULL) { 526 xmlSwitchToEncoding(ctxt, handler); 527 } else { 528 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING, 529 "Unsupported encoding %s", guess, NULL); 530 } 531 } 532 ctxt->charset = XML_CHAR_ENCODING_UTF8; 533 } 534 535 return(xmlCurrentChar(ctxt, len)); 536 537 encoding_error: 538 /* 539 * If we detect an UTF8 error that probably mean that the 540 * input encoding didn't get properly advertised in the 541 * declaration header. Report the error and switch the encoding 542 * to ISO-Latin-1 (if you don't like this policy, just declare the 543 * encoding !) 
544 */ 545 { 546 char buffer[150]; 547 548 if (ctxt->input->end - ctxt->input->cur >= 4) { 549 snprintf(buffer, 149, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n", 550 ctxt->input->cur[0], ctxt->input->cur[1], 551 ctxt->input->cur[2], ctxt->input->cur[3]); 552 } else { 553 snprintf(buffer, 149, "Bytes: 0x%02X\n", ctxt->input->cur[0]); 554 } 555 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING, 556 "Input is not proper UTF-8, indicate encoding !\n", 557 BAD_CAST buffer, NULL); 558 } 559 560 ctxt->charset = XML_CHAR_ENCODING_8859_1; 561 *len = 1; 562 return((int) *ctxt->input->cur); 563 } 564 565 /** 566 * htmlSkipBlankChars: 567 * @ctxt: the HTML parser context 568 * 569 * skip all blanks character found at that point in the input streams. 570 * 571 * Returns the number of space chars skipped 572 */ 573 574 static int 575 htmlSkipBlankChars(xmlParserCtxtPtr ctxt) { 576 int res = 0; 577 578 while (IS_BLANK_CH(*(ctxt->input->cur))) { 579 if ((*ctxt->input->cur == 0) && 580 (xmlParserInputGrow(ctxt->input, INPUT_CHUNK) <= 0)) { 581 xmlPopInput(ctxt); 582 } else { 583 if (*(ctxt->input->cur) == '\n') { 584 ctxt->input->line++; ctxt->input->col = 1; 585 } else ctxt->input->col++; 586 ctxt->input->cur++; 587 ctxt->nbChars++; 588 if (*ctxt->input->cur == 0) 589 xmlParserInputGrow(ctxt->input, INPUT_CHUNK); 590 } 591 res++; 592 } 593 return(res); 594 } 595 596 597 598 /************************************************************************ 599 * * 600 * The list of HTML elements and their properties * 601 * * 602 ************************************************************************/ 603 604 /* 605 * Start Tag: 1 means the start tag can be omitted 606 * End Tag: 1 means the end tag can be omitted 607 * 2 means it's forbidden (empty elements) 608 * 3 means the tag is stylistic and should be closed easily 609 * Depr: this element is deprecated 610 * DTD: 1 means that this element is valid only in the Loose DTD 611 * 2 means that this element is valid only in the Frameset DTD 612 * 613 
* Name,Start Tag,End Tag,Save End,Empty,Deprecated,DTD,inline,Description 614 , subElements , impliedsubelt , Attributes, userdata 615 */ 616 617 /* Definitions and a couple of vars for HTML Elements */ 618 619 #define FONTSTYLE "tt", "i", "b", "u", "s", "strike", "big", "small" 620 #define NB_FONTSTYLE 8 621 #define PHRASE "em", "strong", "dfn", "code", "samp", "kbd", "var", "cite", "abbr", "acronym" 622 #define NB_PHRASE 10 623 #define SPECIAL "a", "img", "applet", "embed", "object", "font", "basefont", "br", "script", "map", "q", "sub", "sup", "span", "bdo", "iframe" 624 #define NB_SPECIAL 16 625 #define INLINE FONTSTYLE, PHRASE, SPECIAL, FORMCTRL 626 #define NB_INLINE NB_PCDATA + NB_FONTSTYLE + NB_PHRASE + NB_SPECIAL + NB_FORMCTRL 627 #define BLOCK HEADING, LIST, "pre", "p", "dl", "div", "center", "noscript", "noframes", "blockquote", "form", "isindex", "hr", "table", "fieldset", "address" 628 #define NB_BLOCK NB_HEADING + NB_LIST + 14 629 #define FORMCTRL "input", "select", "textarea", "label", "button" 630 #define NB_FORMCTRL 5 631 #define PCDATA 632 #define NB_PCDATA 0 633 #define HEADING "h1", "h2", "h3", "h4", "h5", "h6" 634 #define NB_HEADING 6 635 #define LIST "ul", "ol", "dir", "menu" 636 #define NB_LIST 4 637 #define MODIFIER 638 #define NB_MODIFIER 0 639 #define FLOW BLOCK,INLINE 640 #define NB_FLOW NB_BLOCK + NB_INLINE 641 #define EMPTY NULL 642 643 644 static const char* const html_flow[] = { FLOW, NULL } ; 645 static const char* const html_inline[] = { INLINE, NULL } ; 646 647 /* placeholders: elts with content but no subelements */ 648 static const char* const html_pcdata[] = { NULL } ; 649 #define html_cdata html_pcdata 650 651 652 /* ... 
and for HTML Attributes */ 653 654 #define COREATTRS "id", "class", "style", "title" 655 #define NB_COREATTRS 4 656 #define I18N "lang", "dir" 657 #define NB_I18N 2 658 #define EVENTS "onclick", "ondblclick", "onmousedown", "onmouseup", "onmouseover", "onmouseout", "onkeypress", "onkeydown", "onkeyup" 659 #define NB_EVENTS 9 660 #define ATTRS COREATTRS,I18N,EVENTS 661 #define NB_ATTRS NB_NB_COREATTRS + NB_I18N + NB_EVENTS 662 #define CELLHALIGN "align", "char", "charoff" 663 #define NB_CELLHALIGN 3 664 #define CELLVALIGN "valign" 665 #define NB_CELLVALIGN 1 666 667 static const char* const html_attrs[] = { ATTRS, NULL } ; 668 static const char* const core_i18n_attrs[] = { COREATTRS, I18N, NULL } ; 669 static const char* const core_attrs[] = { COREATTRS, NULL } ; 670 static const char* const i18n_attrs[] = { I18N, NULL } ; 671 672 673 /* Other declarations that should go inline ... */ 674 static const char* const a_attrs[] = { ATTRS, "charset", "type", "name", 675 "href", "hreflang", "rel", "rev", "accesskey", "shape", "coords", 676 "tabindex", "onfocus", "onblur", NULL } ; 677 static const char* const target_attr[] = { "target", NULL } ; 678 static const char* const rows_cols_attr[] = { "rows", "cols", NULL } ; 679 static const char* const alt_attr[] = { "alt", NULL } ; 680 static const char* const src_alt_attrs[] = { "src", "alt", NULL } ; 681 static const char* const href_attrs[] = { "href", NULL } ; 682 static const char* const clear_attrs[] = { "clear", NULL } ; 683 static const char* const inline_p[] = { INLINE, "p", NULL } ; 684 685 static const char* const flow_param[] = { FLOW, "param", NULL } ; 686 static const char* const applet_attrs[] = { COREATTRS , "codebase", 687 "archive", "alt", "name", "height", "width", "align", 688 "hspace", "vspace", NULL } ; 689 static const char* const area_attrs[] = { "shape", "coords", "href", "nohref", 690 "tabindex", "accesskey", "onfocus", "onblur", NULL } ; 691 static const char* const basefont_attrs[] = 692 { "id", 
"size", "color", "face", NULL } ; 693 static const char* const quote_attrs[] = { ATTRS, "cite", NULL } ; 694 static const char* const body_contents[] = { FLOW, "ins", "del", NULL } ; 695 static const char* const body_attrs[] = { ATTRS, "onload", "onunload", NULL } ; 696 static const char* const body_depr[] = { "background", "bgcolor", "text", 697 "link", "vlink", "alink", NULL } ; 698 static const char* const button_attrs[] = { ATTRS, "name", "value", "type", 699 "disabled", "tabindex", "accesskey", "onfocus", "onblur", NULL } ; 700 701 702 static const char* const col_attrs[] = { ATTRS, "span", "width", CELLHALIGN, CELLVALIGN, NULL } ; 703 static const char* const col_elt[] = { "col", NULL } ; 704 static const char* const edit_attrs[] = { ATTRS, "datetime", "cite", NULL } ; 705 static const char* const compact_attrs[] = { ATTRS, "compact", NULL } ; 706 static const char* const dl_contents[] = { "dt", "dd", NULL } ; 707 static const char* const compact_attr[] = { "compact", NULL } ; 708 static const char* const label_attr[] = { "label", NULL } ; 709 static const char* const fieldset_contents[] = { FLOW, "legend" } ; 710 static const char* const font_attrs[] = { COREATTRS, I18N, "size", "color", "face" , NULL } ; 711 static const char* const form_contents[] = { HEADING, LIST, INLINE, "pre", "p", "div", "center", "noscript", "noframes", "blockquote", "isindex", "hr", "table", "fieldset", "address", NULL } ; 712 static const char* const form_attrs[] = { ATTRS, "method", "enctype", "accept", "name", "onsubmit", "onreset", "accept-charset", NULL } ; 713 static const char* const frame_attrs[] = { COREATTRS, "longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "noresize", "scrolling" , NULL } ; 714 static const char* const frameset_attrs[] = { COREATTRS, "rows", "cols", "onload", "onunload", NULL } ; 715 static const char* const frameset_contents[] = { "frameset", "frame", "noframes", NULL } ; 716 static const char* const head_attrs[] = { I18N, 
"profile", NULL } ; 717 static const char* const head_contents[] = { "title", "isindex", "base", "script", "style", "meta", "link", "object", NULL } ; 718 static const char* const hr_depr[] = { "align", "noshade", "size", "width", NULL } ; 719 static const char* const version_attr[] = { "version", NULL } ; 720 static const char* const html_content[] = { "head", "body", "frameset", NULL } ; 721 static const char* const iframe_attrs[] = { COREATTRS, "longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "scrolling", "align", "height", "width", NULL } ; 722 static const char* const img_attrs[] = { ATTRS, "longdesc", "name", "height", "width", "usemap", "ismap", NULL } ; 723 static const char* const embed_attrs[] = { COREATTRS, "align", "alt", "border", "code", "codebase", "frameborder", "height", "hidden", "hspace", "name", "palette", "pluginspace", "pluginurl", "src", "type", "units", "vspace", "width", NULL } ; 724 static const char* const input_attrs[] = { ATTRS, "type", "name", "value", "checked", "disabled", "readonly", "size", "maxlength", "src", "alt", "usemap", "ismap", "tabindex", "accesskey", "onfocus", "onblur", "onselect", "onchange", "accept", NULL } ; 725 static const char* const prompt_attrs[] = { COREATTRS, I18N, "prompt", NULL } ; 726 static const char* const label_attrs[] = { ATTRS, "for", "accesskey", "onfocus", "onblur", NULL } ; 727 static const char* const legend_attrs[] = { ATTRS, "accesskey", NULL } ; 728 static const char* const align_attr[] = { "align", NULL } ; 729 static const char* const link_attrs[] = { ATTRS, "charset", "href", "hreflang", "type", "rel", "rev", "media", NULL } ; 730 static const char* const map_contents[] = { BLOCK, "area", NULL } ; 731 static const char* const name_attr[] = { "name", NULL } ; 732 static const char* const action_attr[] = { "action", NULL } ; 733 static const char* const blockli_elt[] = { BLOCK, "li", NULL } ; 734 static const char* const meta_attrs[] = { I18N, "http-equiv", "name", 
"scheme", "charset", NULL } ; 735 static const char* const content_attr[] = { "content", NULL } ; 736 static const char* const type_attr[] = { "type", NULL } ; 737 static const char* const noframes_content[] = { "body", FLOW MODIFIER, NULL } ; 738 static const char* const object_contents[] = { FLOW, "param", NULL } ; 739 static const char* const object_attrs[] = { ATTRS, "declare", "classid", "codebase", "data", "type", "codetype", "archive", "standby", "height", "width", "usemap", "name", "tabindex", NULL } ; 740 static const char* const object_depr[] = { "align", "border", "hspace", "vspace", NULL } ; 741 static const char* const ol_attrs[] = { "type", "compact", "start", NULL} ; 742 static const char* const option_elt[] = { "option", NULL } ; 743 static const char* const optgroup_attrs[] = { ATTRS, "disabled", NULL } ; 744 static const char* const option_attrs[] = { ATTRS, "disabled", "label", "selected", "value", NULL } ; 745 static const char* const param_attrs[] = { "id", "value", "valuetype", "type", NULL } ; 746 static const char* const width_attr[] = { "width", NULL } ; 747 static const char* const pre_content[] = { PHRASE, "tt", "i", "b", "u", "s", "strike", "a", "br", "script", "map", "q", "span", "bdo", "iframe", NULL } ; 748 static const char* const script_attrs[] = { "charset", "src", "defer", "event", "for", NULL } ; 749 static const char* const language_attr[] = { "language", NULL } ; 750 static const char* const select_content[] = { "optgroup", "option", NULL } ; 751 static const char* const select_attrs[] = { ATTRS, "name", "size", "multiple", "disabled", "tabindex", "onfocus", "onblur", "onchange", NULL } ; 752 static const char* const style_attrs[] = { I18N, "media", "title", NULL } ; 753 static const char* const table_attrs[] = { ATTRS, "summary", "width", "border", "frame", "rules", "cellspacing", "cellpadding", "datapagesize", NULL } ; 754 static const char* const table_depr[] = { "align", "bgcolor", NULL } ; 755 static const char* const 
table_contents[] = { "caption", "col", "colgroup", "thead", "tfoot", "tbody", "tr", NULL} ; 756 static const char* const tr_elt[] = { "tr", NULL } ; 757 static const char* const talign_attrs[] = { ATTRS, CELLHALIGN, CELLVALIGN, NULL} ; 758 static const char* const th_td_depr[] = { "nowrap", "bgcolor", "width", "height", NULL } ; 759 static const char* const th_td_attr[] = { ATTRS, "abbr", "axis", "headers", "scope", "rowspan", "colspan", CELLHALIGN, CELLVALIGN, NULL } ; 760 static const char* const textarea_attrs[] = { ATTRS, "name", "disabled", "readonly", "tabindex", "accesskey", "onfocus", "onblur", "onselect", "onchange", NULL } ; 761 static const char* const tr_contents[] = { "th", "td", NULL } ; 762 static const char* const bgcolor_attr[] = { "bgcolor", NULL } ; 763 static const char* const li_elt[] = { "li", NULL } ; 764 static const char* const ul_depr[] = { "type", "compact", NULL} ; 765 static const char* const dir_attr[] = { "dir", NULL} ; 766 767 #define DECL (const char**) 768 769 static const htmlElemDesc 770 html40ElementTable[] = { 771 { "a", 0, 0, 0, 0, 0, 0, 1, "anchor ", 772 DECL html_inline , NULL , DECL a_attrs , DECL target_attr, NULL 773 }, 774 { "abbr", 0, 0, 0, 0, 0, 0, 1, "abbreviated form", 775 DECL html_inline , NULL , DECL html_attrs, NULL, NULL 776 }, 777 { "acronym", 0, 0, 0, 0, 0, 0, 1, "", 778 DECL html_inline , NULL , DECL html_attrs, NULL, NULL 779 }, 780 { "address", 0, 0, 0, 0, 0, 0, 0, "information on author ", 781 DECL inline_p , NULL , DECL html_attrs, NULL, NULL 782 }, 783 { "applet", 0, 0, 0, 0, 1, 1, 2, "java applet ", 784 DECL flow_param , NULL , NULL , DECL applet_attrs, NULL 785 }, 786 { "area", 0, 2, 2, 1, 0, 0, 0, "client-side image map area ", 787 EMPTY , NULL , DECL area_attrs , DECL target_attr, DECL alt_attr 788 }, 789 { "b", 0, 3, 0, 0, 0, 0, 1, "bold text style", 790 DECL html_inline , NULL , DECL html_attrs, NULL, NULL 791 }, 792 { "base", 0, 2, 2, 1, 0, 0, 0, "document base uri ", 793 EMPTY , NULL , NULL , 
DECL target_attr, DECL href_attrs 794 }, 795 { "basefont", 0, 2, 2, 1, 1, 1, 1, "base font size " , 796 EMPTY , NULL , NULL, DECL basefont_attrs, NULL 797 }, 798 { "bdo", 0, 0, 0, 0, 0, 0, 1, "i18n bidi over-ride ", 799 DECL html_inline , NULL , DECL core_i18n_attrs, NULL, DECL dir_attr 800 }, 801 { "big", 0, 3, 0, 0, 0, 0, 1, "large text style", 802 DECL html_inline , NULL , DECL html_attrs, NULL, NULL 803 }, 804 { "blockquote", 0, 0, 0, 0, 0, 0, 0, "long quotation ", 805 DECL html_flow , NULL , DECL quote_attrs , NULL, NULL 806 }, 807 { "body", 1, 1, 0, 0, 0, 0, 0, "document body ", 808 DECL body_contents , "div" , DECL body_attrs, DECL body_depr, NULL 809 }, 810 { "br", 0, 2, 2, 1, 0, 0, 1, "forced line break ", 811 EMPTY , NULL , DECL core_attrs, DECL clear_attrs , NULL 812 }, 813 { "button", 0, 0, 0, 0, 0, 0, 2, "push button ", 814 DECL html_flow MODIFIER , NULL , DECL button_attrs, NULL, NULL 815 }, 816 { "caption", 0, 0, 0, 0, 0, 0, 0, "table caption ", 817 DECL html_inline , NULL , DECL html_attrs, NULL, NULL 818 }, 819 { "center", 0, 3, 0, 0, 1, 1, 0, "shorthand for div align=center ", 820 DECL html_flow , NULL , NULL, DECL html_attrs, NULL 821 }, 822 { "cite", 0, 0, 0, 0, 0, 0, 1, "citation", 823 DECL html_inline , NULL , DECL html_attrs, NULL, NULL 824 }, 825 { "code", 0, 0, 0, 0, 0, 0, 1, "computer code fragment", 826 DECL html_inline , NULL , DECL html_attrs, NULL, NULL 827 }, 828 { "col", 0, 2, 2, 1, 0, 0, 0, "table column ", 829 EMPTY , NULL , DECL col_attrs , NULL, NULL 830 }, 831 { "colgroup", 0, 1, 0, 0, 0, 0, 0, "table column group ", 832 DECL col_elt , "col" , DECL col_attrs , NULL, NULL 833 }, 834 { "dd", 0, 1, 0, 0, 0, 0, 0, "definition description ", 835 DECL html_flow , NULL , DECL html_attrs, NULL, NULL 836 }, 837 { "del", 0, 0, 0, 0, 0, 0, 2, "deleted text ", 838 DECL html_flow , NULL , DECL edit_attrs , NULL, NULL 839 }, 840 { "dfn", 0, 0, 0, 0, 0, 0, 1, "instance definition", 841 DECL html_inline , NULL , DECL html_attrs, NULL, NULL 842 
}, 843 { "dir", 0, 0, 0, 0, 1, 1, 0, "directory list", 844 DECL blockli_elt, "li" , NULL, DECL compact_attrs, NULL 845 }, 846 { "div", 0, 0, 0, 0, 0, 0, 0, "generic language/style container", 847 DECL html_flow, NULL, DECL html_attrs, DECL align_attr, NULL 848 }, 849 { "dl", 0, 0, 0, 0, 0, 0, 0, "definition list ", 850 DECL dl_contents , "dd" , DECL html_attrs, DECL compact_attr, NULL 851 }, 852 { "dt", 0, 1, 0, 0, 0, 0, 0, "definition term ", 853 DECL html_inline, NULL, DECL html_attrs, NULL, NULL 854 }, 855 { "em", 0, 3, 0, 0, 0, 0, 1, "emphasis", 856 DECL html_inline, NULL, DECL html_attrs, NULL, NULL 857 }, 858 { "embed", 0, 1, 0, 0, 1, 1, 1, "generic embedded object ", 859 EMPTY, NULL, DECL embed_attrs, NULL, NULL 860 }, 861 { "fieldset", 0, 0, 0, 0, 0, 0, 0, "form control group ", 862 DECL fieldset_contents , NULL, DECL html_attrs, NULL, NULL 863 }, 864 { "font", 0, 3, 0, 0, 1, 1, 1, "local change to font ", 865 DECL html_inline, NULL, NULL, DECL font_attrs, NULL 866 }, 867 { "form", 0, 0, 0, 0, 0, 0, 0, "interactive form ", 868 DECL form_contents, "fieldset", DECL form_attrs , DECL target_attr, DECL action_attr 869 }, 870 { "frame", 0, 2, 2, 1, 0, 2, 0, "subwindow " , 871 EMPTY, NULL, NULL, DECL frame_attrs, NULL 872 }, 873 { "frameset", 0, 0, 0, 0, 0, 2, 0, "window subdivision" , 874 DECL frameset_contents, "noframes" , NULL , DECL frameset_attrs, NULL 875 }, 876 { "h1", 0, 0, 0, 0, 0, 0, 0, "heading ", 877 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL 878 }, 879 { "h2", 0, 0, 0, 0, 0, 0, 0, "heading ", 880 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL 881 }, 882 { "h3", 0, 0, 0, 0, 0, 0, 0, "heading ", 883 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL 884 }, 885 { "h4", 0, 0, 0, 0, 0, 0, 0, "heading ", 886 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL 887 }, 888 { "h5", 0, 0, 0, 0, 0, 0, 0, "heading ", 889 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL 890 }, 891 { "h6", 0, 
0, 0, 0, 0, 0, 0, "heading ", 892 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL 893 }, 894 { "head", 1, 1, 0, 0, 0, 0, 0, "document head ", 895 DECL head_contents, NULL, DECL head_attrs, NULL, NULL 896 }, 897 { "hr", 0, 2, 2, 1, 0, 0, 0, "horizontal rule " , 898 EMPTY, NULL, DECL html_attrs, DECL hr_depr, NULL 899 }, 900 { "html", 1, 1, 0, 0, 0, 0, 0, "document root element ", 901 DECL html_content , NULL , DECL i18n_attrs, DECL version_attr, NULL 902 }, 903 { "i", 0, 3, 0, 0, 0, 0, 1, "italic text style", 904 DECL html_inline, NULL, DECL html_attrs, NULL, NULL 905 }, 906 { "iframe", 0, 0, 0, 0, 0, 1, 2, "inline subwindow ", 907 DECL html_flow, NULL, NULL, DECL iframe_attrs, NULL 908 }, 909 { "img", 0, 2, 2, 1, 0, 0, 1, "embedded image ", 910 EMPTY, NULL, DECL img_attrs, DECL align_attr, DECL src_alt_attrs 911 }, 912 { "input", 0, 2, 2, 1, 0, 0, 1, "form control ", 913 EMPTY, NULL, DECL input_attrs , DECL align_attr, NULL 914 }, 915 { "ins", 0, 0, 0, 0, 0, 0, 2, "inserted text", 916 DECL html_flow, NULL, DECL edit_attrs, NULL, NULL 917 }, 918 { "isindex", 0, 2, 2, 1, 1, 1, 0, "single line prompt ", 919 EMPTY, NULL, NULL, DECL prompt_attrs, NULL 920 }, 921 { "kbd", 0, 0, 0, 0, 0, 0, 1, "text to be entered by the user", 922 DECL html_inline, NULL, DECL html_attrs, NULL, NULL 923 }, 924 { "label", 0, 0, 0, 0, 0, 0, 1, "form field label text ", 925 DECL html_inline MODIFIER, NULL, DECL label_attrs , NULL, NULL 926 }, 927 { "legend", 0, 0, 0, 0, 0, 0, 0, "fieldset legend ", 928 DECL html_inline, NULL, DECL legend_attrs , DECL align_attr, NULL 929 }, 930 { "li", 0, 1, 1, 0, 0, 0, 0, "list item ", 931 DECL html_flow, NULL, DECL html_attrs, NULL, NULL 932 }, 933 { "link", 0, 2, 2, 1, 0, 0, 0, "a media-independent link ", 934 EMPTY, NULL, DECL link_attrs, DECL target_attr, NULL 935 }, 936 { "map", 0, 0, 0, 0, 0, 0, 2, "client-side image map ", 937 DECL map_contents , NULL, DECL html_attrs , NULL, DECL name_attr 938 }, 939 { "menu", 0, 0, 0, 0, 1, 1, 0, 
"menu list ", 940 DECL blockli_elt , NULL, NULL, DECL compact_attrs, NULL 941 }, 942 { "meta", 0, 2, 2, 1, 0, 0, 0, "generic metainformation ", 943 EMPTY, NULL, DECL meta_attrs , NULL , DECL content_attr 944 }, 945 { "noframes", 0, 0, 0, 0, 0, 2, 0, "alternate content container for non frame-based rendering ", 946 DECL noframes_content, "body" , DECL html_attrs, NULL, NULL 947 }, 948 { "noscript", 0, 0, 0, 0, 0, 0, 0, "alternate content container for non script-based rendering ", 949 DECL html_flow, "div", DECL html_attrs, NULL, NULL 950 }, 951 { "object", 0, 0, 0, 0, 0, 0, 2, "generic embedded object ", 952 DECL object_contents , "div" , DECL object_attrs, DECL object_depr, NULL 953 }, 954 { "ol", 0, 0, 0, 0, 0, 0, 0, "ordered list ", 955 DECL li_elt , "li" , DECL html_attrs, DECL ol_attrs, NULL 956 }, 957 { "optgroup", 0, 0, 0, 0, 0, 0, 0, "option group ", 958 DECL option_elt , "option", DECL optgroup_attrs, NULL, DECL label_attr 959 }, 960 { "option", 0, 1, 0, 0, 0, 0, 0, "selectable choice " , 961 DECL html_pcdata, NULL, DECL option_attrs, NULL, NULL 962 }, 963 { "p", 0, 1, 0, 0, 0, 0, 0, "paragraph ", 964 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL 965 }, 966 { "param", 0, 2, 2, 1, 0, 0, 0, "named property value ", 967 EMPTY, NULL, DECL param_attrs, NULL, DECL name_attr 968 }, 969 { "pre", 0, 0, 0, 0, 0, 0, 0, "preformatted text ", 970 DECL pre_content, NULL, DECL html_attrs, DECL width_attr, NULL 971 }, 972 { "q", 0, 0, 0, 0, 0, 0, 1, "short inline quotation ", 973 DECL html_inline, NULL, DECL quote_attrs, NULL, NULL 974 }, 975 { "s", 0, 3, 0, 0, 1, 1, 1, "strike-through text style", 976 DECL html_inline, NULL, NULL, DECL html_attrs, NULL 977 }, 978 { "samp", 0, 0, 0, 0, 0, 0, 1, "sample program output, scripts, etc.", 979 DECL html_inline, NULL, DECL html_attrs, NULL, NULL 980 }, 981 { "script", 0, 0, 0, 0, 0, 0, 2, "script statements ", 982 DECL html_cdata, NULL, DECL script_attrs, DECL language_attr, DECL type_attr 983 }, 984 { "select", 
0, 0, 0, 0, 0, 0, 1, "option selector ", 985 DECL select_content, NULL, DECL select_attrs, NULL, NULL 986 }, 987 { "small", 0, 3, 0, 0, 0, 0, 1, "small text style", 988 DECL html_inline, NULL, DECL html_attrs, NULL, NULL 989 }, 990 { "span", 0, 0, 0, 0, 0, 0, 1, "generic language/style container ", 991 DECL html_inline, NULL, DECL html_attrs, NULL, NULL 992 }, 993 { "strike", 0, 3, 0, 0, 1, 1, 1, "strike-through text", 994 DECL html_inline, NULL, NULL, DECL html_attrs, NULL 995 }, 996 { "strong", 0, 3, 0, 0, 0, 0, 1, "strong emphasis", 997 DECL html_inline, NULL, DECL html_attrs, NULL, NULL 998 }, 999 { "style", 0, 0, 0, 0, 0, 0, 0, "style info ", 1000 DECL html_cdata, NULL, DECL style_attrs, NULL, DECL type_attr 1001 }, 1002 { "sub", 0, 3, 0, 0, 0, 0, 1, "subscript", 1003 DECL html_inline, NULL, DECL html_attrs, NULL, NULL 1004 }, 1005 { "sup", 0, 3, 0, 0, 0, 0, 1, "superscript ", 1006 DECL html_inline, NULL, DECL html_attrs, NULL, NULL 1007 }, 1008 { "table", 0, 0, 0, 0, 0, 0, 0, "", 1009 DECL table_contents , "tr" , DECL table_attrs , DECL table_depr, NULL 1010 }, 1011 { "tbody", 1, 0, 0, 0, 0, 0, 0, "table body ", 1012 DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL 1013 }, 1014 { "td", 0, 0, 0, 0, 0, 0, 0, "table data cell", 1015 DECL html_flow, NULL, DECL th_td_attr, DECL th_td_depr, NULL 1016 }, 1017 { "textarea", 0, 0, 0, 0, 0, 0, 1, "multi-line text field ", 1018 DECL html_pcdata, NULL, DECL textarea_attrs, NULL, DECL rows_cols_attr 1019 }, 1020 { "tfoot", 0, 1, 0, 0, 0, 0, 0, "table footer ", 1021 DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL 1022 }, 1023 { "th", 0, 1, 0, 0, 0, 0, 0, "table header cell", 1024 DECL html_flow, NULL, DECL th_td_attr, DECL th_td_depr, NULL 1025 }, 1026 { "thead", 0, 1, 0, 0, 0, 0, 0, "table header ", 1027 DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL 1028 }, 1029 { "title", 0, 0, 0, 0, 0, 0, 0, "document title ", 1030 DECL html_pcdata, NULL, DECL i18n_attrs, NULL, NULL 1031 }, 1032 { "tr", 0, 0, 0, 0, 0, 0, 0, 
        "table row ",
        DECL tr_contents , "td" , DECL talign_attrs, DECL bgcolor_attr, NULL
},
{ "tt", 0, 3, 0, 0, 0, 0, 1, "teletype or monospaced text style",
        DECL html_inline, NULL, DECL html_attrs, NULL, NULL
},
{ "u", 0, 3, 0, 0, 1, 1, 1, "underlined text style",
        DECL html_inline, NULL, NULL, DECL html_attrs, NULL
},
{ "ul", 0, 0, 0, 0, 0, 0, 0, "unordered list ",
        DECL li_elt , "li" , DECL html_attrs, DECL ul_depr, NULL
},
{ "var", 0, 0, 0, 0, 0, 0, 1, "instance of a variable or program argument",
        DECL html_inline, NULL, DECL html_attrs, NULL, NULL
}
};

/*
 * start tags that imply the end of current element
 *
 * Layout: the array is a sequence of NULL-terminated groups. The first
 * string of each group is a start tag name; the remaining strings of
 * the group are the names of open elements that this start tag closes
 * implicitly. One extra NULL after the last group terminates the whole
 * array. htmlInitAutoClose() records the address of the first string of
 * each group and htmlCheckAutoClose() scans the group that follows it.
 */
static const char * const htmlStartClose[] = {
"form",         "form", "p", "hr", "h1", "h2", "h3", "h4", "h5", "h6",
                "dl", "ul", "ol", "menu", "dir", "address", "pre",
                "listing", "xmp", "head", NULL,
"head",         "p", NULL,
"title",        "p", NULL,
"body",         "head", "style", "link", "title", "p", NULL,
"frameset",     "head", "style", "link", "title", "p", NULL,
"li",           "p", "h1", "h2", "h3", "h4", "h5", "h6", "dl", "address",
                "pre", "listing", "xmp", "head", "li", NULL,
"hr",           "p", "head", NULL,
"h1",           "p", "head", NULL,
"h2",           "p", "head", NULL,
"h3",           "p", "head", NULL,
"h4",           "p", "head", NULL,
"h5",           "p", "head", NULL,
"h6",           "p", "head", NULL,
"dir",          "p", "head", NULL,
"address",      "p", "head", "ul", NULL,
"pre",          "p", "head", "ul", NULL,
"listing",      "p", "head", NULL,
"xmp",          "p", "head", NULL,
"blockquote",   "p", "head", NULL,
"dl",           "p", "dt", "menu", "dir", "address", "pre", "listing",
                "xmp", "head", NULL,
"dt",           "p", "menu", "dir", "address", "pre", "listing", "xmp",
                "head", "dd", NULL,
"dd",           "p", "menu", "dir", "address", "pre", "listing", "xmp",
                "head", "dt", NULL,
"ul",           "p", "head", "ol", "menu", "dir", "address", "pre",
                "listing", "xmp", NULL,
"ol",           "p", "head", "ul", NULL,
"menu",         "p", "head", "ul", NULL,
"p",            "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", FONTSTYLE, NULL,
"div",          "p", "head", NULL,
"noscript",     "script", NULL,
"center",       "font", "b", "i", "p", "head", NULL,
"a",            "a", "head", NULL,
"caption",      "p", NULL,
"colgroup",     "caption", "colgroup", "col", "p", NULL,
"col",          "caption", "col", "p", NULL,
"table",        "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", "pre",
                "listing", "xmp", "a", NULL,
"th",           "th", "td", "p", "span", "font", "a", "b", "i", "u", NULL,
"td",           "th", "td", "p", "span", "font", "a", "b", "i", "u", NULL,
"tr",           "th", "td", "tr", "caption", "col", "colgroup", "p", NULL,
"thead",        "caption", "col", "colgroup", NULL,
"tfoot",        "th", "td", "tr", "caption", "col", "colgroup", "thead",
                "tbody", "p", NULL,
"tbody",        "th", "td", "tr", "caption", "col", "colgroup", "thead",
                "tfoot", "tbody", "p", NULL,
"optgroup",     "option", NULL,
"option",       "option", NULL,
"fieldset",     "legend", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6",
                "pre", "listing", "xmp", "a", NULL,
/* most tags in FONTSTYLE, PHRASE and SPECIAL should close <head> */
"tt",           "head", NULL,
"i",            "head", NULL,
"b",            "head", NULL,
"u",            "head", NULL,
"s",            "head", NULL,
"strike",       "head", NULL,
"big",          "head", NULL,
"small",        "head", NULL,

"em",           "head", NULL,
"strong",       "head", NULL,
"dfn",          "head", NULL,
"code",         "head", NULL,
"samp",         "head", NULL,
"kbd",          "head", NULL,
"var",          "head", NULL,
"cite",         "head", NULL,
"abbr",         "head", NULL,
"acronym",      "head", NULL,

/* "a" */
"img",          "head", NULL,
/* "applet" */
/* "embed" */
/* "object" */
"font",         "head", NULL,
/* "basefont" */
"br",           "head", NULL,
/* "script" */
"map",          "head", NULL,
"q",            "head", NULL,
"sub",          "head", NULL,
"sup",          "head", NULL,
"span",         "head", NULL,
"bdo",          "head", NULL,
"iframe",       "head", NULL,
NULL
};

/*
 * The list of HTML elements which are supposed not to have
 * CDATA content and where a p element will be implied
 *
 * The list is NULL-terminated; htmlCheckParagraph() walks it until it
 * reaches the NULL entry.
 *
 * TODO: extend that list by reading the HTML SGML DTD on
 *       implied paragraph
 */
static const char *const htmlNoContentElements[] = {
    "html",
    "head",
    NULL
};

/*
 * The list of HTML attributes which are of content %Script;
 * NOTE: when adding ones, check htmlIsScriptAttribute() since
 *       it assumes the name starts with 'on'
 *
 * Unlike the tables above this one has no NULL terminator:
 * htmlIsScriptAttribute() derives the entry count with sizeof.
 */
static const char *const htmlScriptAttributes[] = {
    "onclick",
    "ondblclick",
    "onmousedown",
    "onmouseup",
    "onmouseover",
    "onmousemove",
    "onmouseout",
    "onkeypress",
    "onkeydown",
    "onkeyup",
    "onload",
    "onunload",
    "onfocus",
    "onblur",
    "onsubmit",
    "onreset",
    "onchange",
    "onselect"
};

/*
 * This table is used by the htmlparser to know what to do with
 * broken html pages. By assigning different priorities to different
 * elements the parser can decide how to handle extra endtags.
 * Endtags are only allowed to close elements with lower or equal
 * priority.
1192 */ 1193 1194 typedef struct { 1195 const char *name; 1196 int priority; 1197 } elementPriority; 1198 1199 static const elementPriority htmlEndPriority[] = { 1200 {"div", 150}, 1201 {"td", 160}, 1202 {"th", 160}, 1203 {"tr", 170}, 1204 {"thead", 180}, 1205 {"tbody", 180}, 1206 {"tfoot", 180}, 1207 {"table", 190}, 1208 {"head", 200}, 1209 {"body", 200}, 1210 {"html", 220}, 1211 {NULL, 100} /* Default priority */ 1212 }; 1213 1214 static const char** htmlStartCloseIndex[100]; 1215 static int htmlStartCloseIndexinitialized = 0; 1216 1217 /************************************************************************ 1218 * * 1219 * functions to handle HTML specific data * 1220 * * 1221 ************************************************************************/ 1222 1223 /** 1224 * htmlInitAutoClose: 1225 * 1226 * Initialize the htmlStartCloseIndex for fast lookup of closing tags names. 1227 * This is not reentrant. Call xmlInitParser() once before processing in 1228 * case of use in multithreaded programs. 1229 */ 1230 void 1231 htmlInitAutoClose(void) { 1232 int indx, i = 0; 1233 1234 if (htmlStartCloseIndexinitialized) return; 1235 1236 for (indx = 0;indx < 100;indx ++) htmlStartCloseIndex[indx] = NULL; 1237 indx = 0; 1238 while ((htmlStartClose[i] != NULL) && (indx < 100 - 1)) { 1239 htmlStartCloseIndex[indx++] = (const char**) &htmlStartClose[i]; 1240 while (htmlStartClose[i] != NULL) i++; 1241 i++; 1242 } 1243 htmlStartCloseIndexinitialized = 1; 1244 } 1245 1246 /** 1247 * htmlTagLookup: 1248 * @tag: The tag name in lowercase 1249 * 1250 * Lookup the HTML tag in the ElementTable 1251 * 1252 * Returns the related htmlElemDescPtr or NULL if not found. 
1253 */ 1254 const htmlElemDesc * 1255 htmlTagLookup(const xmlChar *tag) { 1256 unsigned int i; 1257 1258 for (i = 0; i < (sizeof(html40ElementTable) / 1259 sizeof(html40ElementTable[0]));i++) { 1260 if (!xmlStrcasecmp(tag, BAD_CAST html40ElementTable[i].name)) 1261 return((htmlElemDescPtr) &html40ElementTable[i]); 1262 } 1263 return(NULL); 1264 } 1265 1266 /** 1267 * htmlGetEndPriority: 1268 * @name: The name of the element to look up the priority for. 1269 * 1270 * Return value: The "endtag" priority. 1271 **/ 1272 static int 1273 htmlGetEndPriority (const xmlChar *name) { 1274 int i = 0; 1275 1276 while ((htmlEndPriority[i].name != NULL) && 1277 (!xmlStrEqual((const xmlChar *)htmlEndPriority[i].name, name))) 1278 i++; 1279 1280 return(htmlEndPriority[i].priority); 1281 } 1282 1283 1284 /** 1285 * htmlCheckAutoClose: 1286 * @newtag: The new tag name 1287 * @oldtag: The old tag name 1288 * 1289 * Checks whether the new tag is one of the registered valid tags for 1290 * closing old. 1291 * Initialize the htmlStartCloseIndex for fast lookup of closing tags names. 1292 * 1293 * Returns 0 if no, 1 if yes. 
1294 */ 1295 static int 1296 htmlCheckAutoClose(const xmlChar * newtag, const xmlChar * oldtag) 1297 { 1298 int i, indx; 1299 const char **closed = NULL; 1300 1301 if (htmlStartCloseIndexinitialized == 0) 1302 htmlInitAutoClose(); 1303 1304 /* inefficient, but not a big deal */ 1305 for (indx = 0; indx < 100; indx++) { 1306 closed = htmlStartCloseIndex[indx]; 1307 if (closed == NULL) 1308 return (0); 1309 if (xmlStrEqual(BAD_CAST * closed, newtag)) 1310 break; 1311 } 1312 1313 i = closed - htmlStartClose; 1314 i++; 1315 while (htmlStartClose[i] != NULL) { 1316 if (xmlStrEqual(BAD_CAST htmlStartClose[i], oldtag)) { 1317 return (1); 1318 } 1319 i++; 1320 } 1321 return (0); 1322 } 1323 1324 /** 1325 * htmlAutoCloseOnClose: 1326 * @ctxt: an HTML parser context 1327 * @newtag: The new tag name 1328 * @force: force the tag closure 1329 * 1330 * The HTML DTD allows an ending tag to implicitly close other tags. 1331 */ 1332 static void 1333 htmlAutoCloseOnClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag) 1334 { 1335 const htmlElemDesc *info; 1336 int i, priority; 1337 1338 priority = htmlGetEndPriority(newtag); 1339 1340 for (i = (ctxt->nameNr - 1); i >= 0; i--) { 1341 1342 if (xmlStrEqual(newtag, ctxt->nameTab[i])) 1343 break; 1344 /* 1345 * A misplaced endtag can only close elements with lower 1346 * or equal priority, so if we find an element with higher 1347 * priority before we find an element with 1348 * matching name, we just ignore this endtag 1349 */ 1350 if (htmlGetEndPriority(ctxt->nameTab[i]) > priority) 1351 return; 1352 } 1353 if (i < 0) 1354 return; 1355 1356 while (!xmlStrEqual(newtag, ctxt->name)) { 1357 info = htmlTagLookup(ctxt->name); 1358 if ((info != NULL) && (info->endTag == 3)) { 1359 htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH, 1360 "Opening and ending tag mismatch: %s and %s\n", 1361 newtag, ctxt->name); 1362 } 1363 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL)) 1364 ctxt->sax->endElement(ctxt->userData, ctxt->name); 1365 
htmlnamePop(ctxt); 1366 } 1367 } 1368 1369 /** 1370 * htmlAutoCloseOnEnd: 1371 * @ctxt: an HTML parser context 1372 * 1373 * Close all remaining tags at the end of the stream 1374 */ 1375 static void 1376 htmlAutoCloseOnEnd(htmlParserCtxtPtr ctxt) 1377 { 1378 int i; 1379 1380 if (ctxt->nameNr == 0) 1381 return; 1382 for (i = (ctxt->nameNr - 1); i >= 0; i--) { 1383 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL)) 1384 ctxt->sax->endElement(ctxt->userData, ctxt->name); 1385 htmlnamePop(ctxt); 1386 } 1387 } 1388 1389 /** 1390 * htmlAutoClose: 1391 * @ctxt: an HTML parser context 1392 * @newtag: The new tag name or NULL 1393 * 1394 * The HTML DTD allows a tag to implicitly close other tags. 1395 * The list is kept in htmlStartClose array. This function is 1396 * called when a new tag has been detected and generates the 1397 * appropriates closes if possible/needed. 1398 * If newtag is NULL this mean we are at the end of the resource 1399 * and we should check 1400 */ 1401 static void 1402 htmlAutoClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag) 1403 { 1404 while ((newtag != NULL) && (ctxt->name != NULL) && 1405 (htmlCheckAutoClose(newtag, ctxt->name))) { 1406 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL)) 1407 ctxt->sax->endElement(ctxt->userData, ctxt->name); 1408 htmlnamePop(ctxt); 1409 } 1410 if (newtag == NULL) { 1411 htmlAutoCloseOnEnd(ctxt); 1412 return; 1413 } 1414 while ((newtag == NULL) && (ctxt->name != NULL) && 1415 ((xmlStrEqual(ctxt->name, BAD_CAST "head")) || 1416 (xmlStrEqual(ctxt->name, BAD_CAST "body")) || 1417 (xmlStrEqual(ctxt->name, BAD_CAST "html")))) { 1418 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL)) 1419 ctxt->sax->endElement(ctxt->userData, ctxt->name); 1420 htmlnamePop(ctxt); 1421 } 1422 } 1423 1424 /** 1425 * htmlAutoCloseTag: 1426 * @doc: the HTML document 1427 * @name: The tag name 1428 * @elem: the HTML element 1429 * 1430 * The HTML DTD allows a tag to implicitly close other tags. 
1431 * The list is kept in htmlStartClose array. This function checks 1432 * if the element or one of it's children would autoclose the 1433 * given tag. 1434 * 1435 * Returns 1 if autoclose, 0 otherwise 1436 */ 1437 int 1438 htmlAutoCloseTag(htmlDocPtr doc, const xmlChar *name, htmlNodePtr elem) { 1439 htmlNodePtr child; 1440 1441 if (elem == NULL) return(1); 1442 if (xmlStrEqual(name, elem->name)) return(0); 1443 if (htmlCheckAutoClose(elem->name, name)) return(1); 1444 child = elem->children; 1445 while (child != NULL) { 1446 if (htmlAutoCloseTag(doc, name, child)) return(1); 1447 child = child->next; 1448 } 1449 return(0); 1450 } 1451 1452 /** 1453 * htmlIsAutoClosed: 1454 * @doc: the HTML document 1455 * @elem: the HTML element 1456 * 1457 * The HTML DTD allows a tag to implicitly close other tags. 1458 * The list is kept in htmlStartClose array. This function checks 1459 * if a tag is autoclosed by one of it's child 1460 * 1461 * Returns 1 if autoclosed, 0 otherwise 1462 */ 1463 int 1464 htmlIsAutoClosed(htmlDocPtr doc, htmlNodePtr elem) { 1465 htmlNodePtr child; 1466 1467 if (elem == NULL) return(1); 1468 child = elem->children; 1469 while (child != NULL) { 1470 if (htmlAutoCloseTag(doc, elem->name, child)) return(1); 1471 child = child->next; 1472 } 1473 return(0); 1474 } 1475 1476 /** 1477 * htmlCheckImplied: 1478 * @ctxt: an HTML parser context 1479 * @newtag: The new tag name 1480 * 1481 * The HTML DTD allows a tag to exists only implicitly 1482 * called when a new tag has been detected and generates the 1483 * appropriates implicit tags if missing 1484 */ 1485 static void 1486 htmlCheckImplied(htmlParserCtxtPtr ctxt, const xmlChar *newtag) { 1487 int i; 1488 1489 if (ctxt->options & HTML_PARSE_NOIMPLIED) 1490 return; 1491 if (!htmlOmittedDefaultValue) 1492 return; 1493 if (xmlStrEqual(newtag, BAD_CAST"html")) 1494 return; 1495 if (ctxt->nameNr <= 0) { 1496 htmlnamePush(ctxt, BAD_CAST"html"); 1497 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != 
NULL)) 1498 ctxt->sax->startElement(ctxt->userData, BAD_CAST"html", NULL); 1499 } 1500 if ((xmlStrEqual(newtag, BAD_CAST"body")) || (xmlStrEqual(newtag, BAD_CAST"head"))) 1501 return; 1502 if ((ctxt->nameNr <= 1) && 1503 ((xmlStrEqual(newtag, BAD_CAST"script")) || 1504 (xmlStrEqual(newtag, BAD_CAST"style")) || 1505 (xmlStrEqual(newtag, BAD_CAST"meta")) || 1506 (xmlStrEqual(newtag, BAD_CAST"link")) || 1507 (xmlStrEqual(newtag, BAD_CAST"title")) || 1508 (xmlStrEqual(newtag, BAD_CAST"base")))) { 1509 if (ctxt->html >= 3) { 1510 /* we already saw or generated an <head> before */ 1511 return; 1512 } 1513 /* 1514 * dropped OBJECT ... i you put it first BODY will be 1515 * assumed ! 1516 */ 1517 htmlnamePush(ctxt, BAD_CAST"head"); 1518 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL)) 1519 ctxt->sax->startElement(ctxt->userData, BAD_CAST"head", NULL); 1520 } else if ((!xmlStrEqual(newtag, BAD_CAST"noframes")) && 1521 (!xmlStrEqual(newtag, BAD_CAST"frame")) && 1522 (!xmlStrEqual(newtag, BAD_CAST"frameset"))) { 1523 if (ctxt->html >= 10) { 1524 /* we already saw or generated a <body> before */ 1525 return; 1526 } 1527 for (i = 0;i < ctxt->nameNr;i++) { 1528 if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"body")) { 1529 return; 1530 } 1531 if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"head")) { 1532 return; 1533 } 1534 } 1535 1536 htmlnamePush(ctxt, BAD_CAST"body"); 1537 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL)) 1538 ctxt->sax->startElement(ctxt->userData, BAD_CAST"body", NULL); 1539 } 1540 } 1541 1542 /** 1543 * htmlCheckParagraph 1544 * @ctxt: an HTML parser context 1545 * 1546 * Check whether a p element need to be implied before inserting 1547 * characters in the current element. 1548 * 1549 * Returns 1 if a paragraph has been inserted, 0 if not and -1 1550 * in case of error. 
1551 */ 1552 1553 static int 1554 htmlCheckParagraph(htmlParserCtxtPtr ctxt) { 1555 const xmlChar *tag; 1556 int i; 1557 1558 if (ctxt == NULL) 1559 return(-1); 1560 tag = ctxt->name; 1561 if (tag == NULL) { 1562 htmlAutoClose(ctxt, BAD_CAST"p"); 1563 htmlCheckImplied(ctxt, BAD_CAST"p"); 1564 htmlnamePush(ctxt, BAD_CAST"p"); 1565 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL)) 1566 ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL); 1567 return(1); 1568 } 1569 if (!htmlOmittedDefaultValue) 1570 return(0); 1571 for (i = 0; htmlNoContentElements[i] != NULL; i++) { 1572 if (xmlStrEqual(tag, BAD_CAST htmlNoContentElements[i])) { 1573 htmlAutoClose(ctxt, BAD_CAST"p"); 1574 htmlCheckImplied(ctxt, BAD_CAST"p"); 1575 htmlnamePush(ctxt, BAD_CAST"p"); 1576 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL)) 1577 ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL); 1578 return(1); 1579 } 1580 } 1581 return(0); 1582 } 1583 1584 /** 1585 * htmlIsScriptAttribute: 1586 * @name: an attribute name 1587 * 1588 * Check if an attribute is of content type Script 1589 * 1590 * Returns 1 is the attribute is a script 0 otherwise 1591 */ 1592 int 1593 htmlIsScriptAttribute(const xmlChar *name) { 1594 unsigned int i; 1595 1596 if (name == NULL) 1597 return(0); 1598 /* 1599 * all script attributes start with 'on' 1600 */ 1601 if ((name[0] != 'o') || (name[1] != 'n')) 1602 return(0); 1603 for (i = 0; 1604 i < sizeof(htmlScriptAttributes)/sizeof(htmlScriptAttributes[0]); 1605 i++) { 1606 if (xmlStrEqual(name, (const xmlChar *) htmlScriptAttributes[i])) 1607 return(1); 1608 } 1609 return(0); 1610 } 1611 1612 /************************************************************************ 1613 * * 1614 * The list of HTML predefined entities * 1615 * * 1616 ************************************************************************/ 1617 1618 1619 static const htmlEntityDesc html40EntitiesTable[] = { 1620 /* 1621 * the 4 absolute ones, plus apostrophe. 
1622 */ 1623 { 34, "quot", "quotation mark = APL quote, U+0022 ISOnum" }, 1624 { 38, "amp", "ampersand, U+0026 ISOnum" }, 1625 { 39, "apos", "single quote" }, 1626 { 60, "lt", "less-than sign, U+003C ISOnum" }, 1627 { 62, "gt", "greater-than sign, U+003E ISOnum" }, 1628 1629 /* 1630 * A bunch still in the 128-255 range 1631 * Replacing them depend really on the charset used. 1632 */ 1633 { 160, "nbsp", "no-break space = non-breaking space, U+00A0 ISOnum" }, 1634 { 161, "iexcl","inverted exclamation mark, U+00A1 ISOnum" }, 1635 { 162, "cent", "cent sign, U+00A2 ISOnum" }, 1636 { 163, "pound","pound sign, U+00A3 ISOnum" }, 1637 { 164, "curren","currency sign, U+00A4 ISOnum" }, 1638 { 165, "yen", "yen sign = yuan sign, U+00A5 ISOnum" }, 1639 { 166, "brvbar","broken bar = broken vertical bar, U+00A6 ISOnum" }, 1640 { 167, "sect", "section sign, U+00A7 ISOnum" }, 1641 { 168, "uml", "diaeresis = spacing diaeresis, U+00A8 ISOdia" }, 1642 { 169, "copy", "copyright sign, U+00A9 ISOnum" }, 1643 { 170, "ordf", "feminine ordinal indicator, U+00AA ISOnum" }, 1644 { 171, "laquo","left-pointing double angle quotation mark = left pointing guillemet, U+00AB ISOnum" }, 1645 { 172, "not", "not sign, U+00AC ISOnum" }, 1646 { 173, "shy", "soft hyphen = discretionary hyphen, U+00AD ISOnum" }, 1647 { 174, "reg", "registered sign = registered trade mark sign, U+00AE ISOnum" }, 1648 { 175, "macr", "macron = spacing macron = overline = APL overbar, U+00AF ISOdia" }, 1649 { 176, "deg", "degree sign, U+00B0 ISOnum" }, 1650 { 177, "plusmn","plus-minus sign = plus-or-minus sign, U+00B1 ISOnum" }, 1651 { 178, "sup2", "superscript two = superscript digit two = squared, U+00B2 ISOnum" }, 1652 { 179, "sup3", "superscript three = superscript digit three = cubed, U+00B3 ISOnum" }, 1653 { 180, "acute","acute accent = spacing acute, U+00B4 ISOdia" }, 1654 { 181, "micro","micro sign, U+00B5 ISOnum" }, 1655 { 182, "para", "pilcrow sign = paragraph sign, U+00B6 ISOnum" }, 1656 { 183, "middot","middle dot 
= Georgian comma Greek middle dot, U+00B7 ISOnum" }, 1657 { 184, "cedil","cedilla = spacing cedilla, U+00B8 ISOdia" }, 1658 { 185, "sup1", "superscript one = superscript digit one, U+00B9 ISOnum" }, 1659 { 186, "ordm", "masculine ordinal indicator, U+00BA ISOnum" }, 1660 { 187, "raquo","right-pointing double angle quotation mark right pointing guillemet, U+00BB ISOnum" }, 1661 { 188, "frac14","vulgar fraction one quarter = fraction one quarter, U+00BC ISOnum" }, 1662 { 189, "frac12","vulgar fraction one half = fraction one half, U+00BD ISOnum" }, 1663 { 190, "frac34","vulgar fraction three quarters = fraction three quarters, U+00BE ISOnum" }, 1664 { 191, "iquest","inverted question mark = turned question mark, U+00BF ISOnum" }, 1665 { 192, "Agrave","latin capital letter A with grave = latin capital letter A grave, U+00C0 ISOlat1" }, 1666 { 193, "Aacute","latin capital letter A with acute, U+00C1 ISOlat1" }, 1667 { 194, "Acirc","latin capital letter A with circumflex, U+00C2 ISOlat1" }, 1668 { 195, "Atilde","latin capital letter A with tilde, U+00C3 ISOlat1" }, 1669 { 196, "Auml", "latin capital letter A with diaeresis, U+00C4 ISOlat1" }, 1670 { 197, "Aring","latin capital letter A with ring above = latin capital letter A ring, U+00C5 ISOlat1" }, 1671 { 198, "AElig","latin capital letter AE = latin capital ligature AE, U+00C6 ISOlat1" }, 1672 { 199, "Ccedil","latin capital letter C with cedilla, U+00C7 ISOlat1" }, 1673 { 200, "Egrave","latin capital letter E with grave, U+00C8 ISOlat1" }, 1674 { 201, "Eacute","latin capital letter E with acute, U+00C9 ISOlat1" }, 1675 { 202, "Ecirc","latin capital letter E with circumflex, U+00CA ISOlat1" }, 1676 { 203, "Euml", "latin capital letter E with diaeresis, U+00CB ISOlat1" }, 1677 { 204, "Igrave","latin capital letter I with grave, U+00CC ISOlat1" }, 1678 { 205, "Iacute","latin capital letter I with acute, U+00CD ISOlat1" }, 1679 { 206, "Icirc","latin capital letter I with circumflex, U+00CE ISOlat1" }, 1680 { 207, "Iuml", 
"latin capital letter I with diaeresis, U+00CF ISOlat1" }, 1681 { 208, "ETH", "latin capital letter ETH, U+00D0 ISOlat1" }, 1682 { 209, "Ntilde","latin capital letter N with tilde, U+00D1 ISOlat1" }, 1683 { 210, "Ograve","latin capital letter O with grave, U+00D2 ISOlat1" }, 1684 { 211, "Oacute","latin capital letter O with acute, U+00D3 ISOlat1" }, 1685 { 212, "Ocirc","latin capital letter O with circumflex, U+00D4 ISOlat1" }, 1686 { 213, "Otilde","latin capital letter O with tilde, U+00D5 ISOlat1" }, 1687 { 214, "Ouml", "latin capital letter O with diaeresis, U+00D6 ISOlat1" }, 1688 { 215, "times","multiplication sign, U+00D7 ISOnum" }, 1689 { 216, "Oslash","latin capital letter O with stroke latin capital letter O slash, U+00D8 ISOlat1" }, 1690 { 217, "Ugrave","latin capital letter U with grave, U+00D9 ISOlat1" }, 1691 { 218, "Uacute","latin capital letter U with acute, U+00DA ISOlat1" }, 1692 { 219, "Ucirc","latin capital letter U with circumflex, U+00DB ISOlat1" }, 1693 { 220, "Uuml", "latin capital letter U with diaeresis, U+00DC ISOlat1" }, 1694 { 221, "Yacute","latin capital letter Y with acute, U+00DD ISOlat1" }, 1695 { 222, "THORN","latin capital letter THORN, U+00DE ISOlat1" }, 1696 { 223, "szlig","latin small letter sharp s = ess-zed, U+00DF ISOlat1" }, 1697 { 224, "agrave","latin small letter a with grave = latin small letter a grave, U+00E0 ISOlat1" }, 1698 { 225, "aacute","latin small letter a with acute, U+00E1 ISOlat1" }, 1699 { 226, "acirc","latin small letter a with circumflex, U+00E2 ISOlat1" }, 1700 { 227, "atilde","latin small letter a with tilde, U+00E3 ISOlat1" }, 1701 { 228, "auml", "latin small letter a with diaeresis, U+00E4 ISOlat1" }, 1702 { 229, "aring","latin small letter a with ring above = latin small letter a ring, U+00E5 ISOlat1" }, 1703 { 230, "aelig","latin small letter ae = latin small ligature ae, U+00E6 ISOlat1" }, 1704 { 231, "ccedil","latin small letter c with cedilla, U+00E7 ISOlat1" }, 1705 { 232, "egrave","latin small 
letter e with grave, U+00E8 ISOlat1" }, 1706 { 233, "eacute","latin small letter e with acute, U+00E9 ISOlat1" }, 1707 { 234, "ecirc","latin small letter e with circumflex, U+00EA ISOlat1" }, 1708 { 235, "euml", "latin small letter e with diaeresis, U+00EB ISOlat1" }, 1709 { 236, "igrave","latin small letter i with grave, U+00EC ISOlat1" }, 1710 { 237, "iacute","latin small letter i with acute, U+00ED ISOlat1" }, 1711 { 238, "icirc","latin small letter i with circumflex, U+00EE ISOlat1" }, 1712 { 239, "iuml", "latin small letter i with diaeresis, U+00EF ISOlat1" }, 1713 { 240, "eth", "latin small letter eth, U+00F0 ISOlat1" }, 1714 { 241, "ntilde","latin small letter n with tilde, U+00F1 ISOlat1" }, 1715 { 242, "ograve","latin small letter o with grave, U+00F2 ISOlat1" }, 1716 { 243, "oacute","latin small letter o with acute, U+00F3 ISOlat1" }, 1717 { 244, "ocirc","latin small letter o with circumflex, U+00F4 ISOlat1" }, 1718 { 245, "otilde","latin small letter o with tilde, U+00F5 ISOlat1" }, 1719 { 246, "ouml", "latin small letter o with diaeresis, U+00F6 ISOlat1" }, 1720 { 247, "divide","division sign, U+00F7 ISOnum" }, 1721 { 248, "oslash","latin small letter o with stroke, = latin small letter o slash, U+00F8 ISOlat1" }, 1722 { 249, "ugrave","latin small letter u with grave, U+00F9 ISOlat1" }, 1723 { 250, "uacute","latin small letter u with acute, U+00FA ISOlat1" }, 1724 { 251, "ucirc","latin small letter u with circumflex, U+00FB ISOlat1" }, 1725 { 252, "uuml", "latin small letter u with diaeresis, U+00FC ISOlat1" }, 1726 { 253, "yacute","latin small letter y with acute, U+00FD ISOlat1" }, 1727 { 254, "thorn","latin small letter thorn with, U+00FE ISOlat1" }, 1728 { 255, "yuml", "latin small letter y with diaeresis, U+00FF ISOlat1" }, 1729 1730 { 338, "OElig","latin capital ligature OE, U+0152 ISOlat2" }, 1731 { 339, "oelig","latin small ligature oe, U+0153 ISOlat2" }, 1732 { 352, "Scaron","latin capital letter S with caron, U+0160 ISOlat2" }, 1733 { 353, 
"scaron","latin small letter s with caron, U+0161 ISOlat2" }, 1734 { 376, "Yuml", "latin capital letter Y with diaeresis, U+0178 ISOlat2" }, 1735 1736 /* 1737 * Anything below should really be kept as entities references 1738 */ 1739 { 402, "fnof", "latin small f with hook = function = florin, U+0192 ISOtech" }, 1740 1741 { 710, "circ", "modifier letter circumflex accent, U+02C6 ISOpub" }, 1742 { 732, "tilde","small tilde, U+02DC ISOdia" }, 1743 1744 { 913, "Alpha","greek capital letter alpha, U+0391" }, 1745 { 914, "Beta", "greek capital letter beta, U+0392" }, 1746 { 915, "Gamma","greek capital letter gamma, U+0393 ISOgrk3" }, 1747 { 916, "Delta","greek capital letter delta, U+0394 ISOgrk3" }, 1748 { 917, "Epsilon","greek capital letter epsilon, U+0395" }, 1749 { 918, "Zeta", "greek capital letter zeta, U+0396" }, 1750 { 919, "Eta", "greek capital letter eta, U+0397" }, 1751 { 920, "Theta","greek capital letter theta, U+0398 ISOgrk3" }, 1752 { 921, "Iota", "greek capital letter iota, U+0399" }, 1753 { 922, "Kappa","greek capital letter kappa, U+039A" }, 1754 { 923, "Lambda", "greek capital letter lambda, U+039B ISOgrk3" }, 1755 { 924, "Mu", "greek capital letter mu, U+039C" }, 1756 { 925, "Nu", "greek capital letter nu, U+039D" }, 1757 { 926, "Xi", "greek capital letter xi, U+039E ISOgrk3" }, 1758 { 927, "Omicron","greek capital letter omicron, U+039F" }, 1759 { 928, "Pi", "greek capital letter pi, U+03A0 ISOgrk3" }, 1760 { 929, "Rho", "greek capital letter rho, U+03A1" }, 1761 { 931, "Sigma","greek capital letter sigma, U+03A3 ISOgrk3" }, 1762 { 932, "Tau", "greek capital letter tau, U+03A4" }, 1763 { 933, "Upsilon","greek capital letter upsilon, U+03A5 ISOgrk3" }, 1764 { 934, "Phi", "greek capital letter phi, U+03A6 ISOgrk3" }, 1765 { 935, "Chi", "greek capital letter chi, U+03A7" }, 1766 { 936, "Psi", "greek capital letter psi, U+03A8 ISOgrk3" }, 1767 { 937, "Omega","greek capital letter omega, U+03A9 ISOgrk3" }, 1768 1769 { 945, "alpha","greek small letter 
alpha, U+03B1 ISOgrk3" }, 1770 { 946, "beta", "greek small letter beta, U+03B2 ISOgrk3" }, 1771 { 947, "gamma","greek small letter gamma, U+03B3 ISOgrk3" }, 1772 { 948, "delta","greek small letter delta, U+03B4 ISOgrk3" }, 1773 { 949, "epsilon","greek small letter epsilon, U+03B5 ISOgrk3" }, 1774 { 950, "zeta", "greek small letter zeta, U+03B6 ISOgrk3" }, 1775 { 951, "eta", "greek small letter eta, U+03B7 ISOgrk3" }, 1776 { 952, "theta","greek small letter theta, U+03B8 ISOgrk3" }, 1777 { 953, "iota", "greek small letter iota, U+03B9 ISOgrk3" }, 1778 { 954, "kappa","greek small letter kappa, U+03BA ISOgrk3" }, 1779 { 955, "lambda","greek small letter lambda, U+03BB ISOgrk3" }, 1780 { 956, "mu", "greek small letter mu, U+03BC ISOgrk3" }, 1781 { 957, "nu", "greek small letter nu, U+03BD ISOgrk3" }, 1782 { 958, "xi", "greek small letter xi, U+03BE ISOgrk3" }, 1783 { 959, "omicron","greek small letter omicron, U+03BF NEW" }, 1784 { 960, "pi", "greek small letter pi, U+03C0 ISOgrk3" }, 1785 { 961, "rho", "greek small letter rho, U+03C1 ISOgrk3" }, 1786 { 962, "sigmaf","greek small letter final sigma, U+03C2 ISOgrk3" }, 1787 { 963, "sigma","greek small letter sigma, U+03C3 ISOgrk3" }, 1788 { 964, "tau", "greek small letter tau, U+03C4 ISOgrk3" }, 1789 { 965, "upsilon","greek small letter upsilon, U+03C5 ISOgrk3" }, 1790 { 966, "phi", "greek small letter phi, U+03C6 ISOgrk3" }, 1791 { 967, "chi", "greek small letter chi, U+03C7 ISOgrk3" }, 1792 { 968, "psi", "greek small letter psi, U+03C8 ISOgrk3" }, 1793 { 969, "omega","greek small letter omega, U+03C9 ISOgrk3" }, 1794 { 977, "thetasym","greek small letter theta symbol, U+03D1 NEW" }, 1795 { 978, "upsih","greek upsilon with hook symbol, U+03D2 NEW" }, 1796 { 982, "piv", "greek pi symbol, U+03D6 ISOgrk3" }, 1797 1798 { 8194, "ensp", "en space, U+2002 ISOpub" }, 1799 { 8195, "emsp", "em space, U+2003 ISOpub" }, 1800 { 8201, "thinsp","thin space, U+2009 ISOpub" }, 1801 { 8204, "zwnj", "zero width non-joiner, U+200C NEW RFC 
2070" }, 1802 { 8205, "zwj", "zero width joiner, U+200D NEW RFC 2070" }, 1803 { 8206, "lrm", "left-to-right mark, U+200E NEW RFC 2070" }, 1804 { 8207, "rlm", "right-to-left mark, U+200F NEW RFC 2070" }, 1805 { 8211, "ndash","en dash, U+2013 ISOpub" }, 1806 { 8212, "mdash","em dash, U+2014 ISOpub" }, 1807 { 8216, "lsquo","left single quotation mark, U+2018 ISOnum" }, 1808 { 8217, "rsquo","right single quotation mark, U+2019 ISOnum" }, 1809 { 8218, "sbquo","single low-9 quotation mark, U+201A NEW" }, 1810 { 8220, "ldquo","left double quotation mark, U+201C ISOnum" }, 1811 { 8221, "rdquo","right double quotation mark, U+201D ISOnum" }, 1812 { 8222, "bdquo","double low-9 quotation mark, U+201E NEW" }, 1813 { 8224, "dagger","dagger, U+2020 ISOpub" }, 1814 { 8225, "Dagger","double dagger, U+2021 ISOpub" }, 1815 1816 { 8226, "bull", "bullet = black small circle, U+2022 ISOpub" }, 1817 { 8230, "hellip","horizontal ellipsis = three dot leader, U+2026 ISOpub" }, 1818 1819 { 8240, "permil","per mille sign, U+2030 ISOtech" }, 1820 1821 { 8242, "prime","prime = minutes = feet, U+2032 ISOtech" }, 1822 { 8243, "Prime","double prime = seconds = inches, U+2033 ISOtech" }, 1823 1824 { 8249, "lsaquo","single left-pointing angle quotation mark, U+2039 ISO proposed" }, 1825 { 8250, "rsaquo","single right-pointing angle quotation mark, U+203A ISO proposed" }, 1826 1827 { 8254, "oline","overline = spacing overscore, U+203E NEW" }, 1828 { 8260, "frasl","fraction slash, U+2044 NEW" }, 1829 1830 { 8364, "euro", "euro sign, U+20AC NEW" }, 1831 1832 { 8465, "image","blackletter capital I = imaginary part, U+2111 ISOamso" }, 1833 { 8472, "weierp","script capital P = power set = Weierstrass p, U+2118 ISOamso" }, 1834 { 8476, "real", "blackletter capital R = real part symbol, U+211C ISOamso" }, 1835 { 8482, "trade","trade mark sign, U+2122 ISOnum" }, 1836 { 8501, "alefsym","alef symbol = first transfinite cardinal, U+2135 NEW" }, 1837 { 8592, "larr", "leftwards arrow, U+2190 ISOnum" }, 1838 { 
8593, "uarr", "upwards arrow, U+2191 ISOnum" }, 1839 { 8594, "rarr", "rightwards arrow, U+2192 ISOnum" }, 1840 { 8595, "darr", "downwards arrow, U+2193 ISOnum" }, 1841 { 8596, "harr", "left right arrow, U+2194 ISOamsa" }, 1842 { 8629, "crarr","downwards arrow with corner leftwards = carriage return, U+21B5 NEW" }, 1843 { 8656, "lArr", "leftwards double arrow, U+21D0 ISOtech" }, 1844 { 8657, "uArr", "upwards double arrow, U+21D1 ISOamsa" }, 1845 { 8658, "rArr", "rightwards double arrow, U+21D2 ISOtech" }, 1846 { 8659, "dArr", "downwards double arrow, U+21D3 ISOamsa" }, 1847 { 8660, "hArr", "left right double arrow, U+21D4 ISOamsa" }, 1848 1849 { 8704, "forall","for all, U+2200 ISOtech" }, 1850 { 8706, "part", "partial differential, U+2202 ISOtech" }, 1851 { 8707, "exist","there exists, U+2203 ISOtech" }, 1852 { 8709, "empty","empty set = null set = diameter, U+2205 ISOamso" }, 1853 { 8711, "nabla","nabla = backward difference, U+2207 ISOtech" }, 1854 { 8712, "isin", "element of, U+2208 ISOtech" }, 1855 { 8713, "notin","not an element of, U+2209 ISOtech" }, 1856 { 8715, "ni", "contains as member, U+220B ISOtech" }, 1857 { 8719, "prod", "n-ary product = product sign, U+220F ISOamsb" }, 1858 { 8721, "sum", "n-ary summation, U+2211 ISOamsb" }, 1859 { 8722, "minus","minus sign, U+2212 ISOtech" }, 1860 { 8727, "lowast","asterisk operator, U+2217 ISOtech" }, 1861 { 8730, "radic","square root = radical sign, U+221A ISOtech" }, 1862 { 8733, "prop", "proportional to, U+221D ISOtech" }, 1863 { 8734, "infin","infinity, U+221E ISOtech" }, 1864 { 8736, "ang", "angle, U+2220 ISOamso" }, 1865 { 8743, "and", "logical and = wedge, U+2227 ISOtech" }, 1866 { 8744, "or", "logical or = vee, U+2228 ISOtech" }, 1867 { 8745, "cap", "intersection = cap, U+2229 ISOtech" }, 1868 { 8746, "cup", "union = cup, U+222A ISOtech" }, 1869 { 8747, "int", "integral, U+222B ISOtech" }, 1870 { 8756, "there4","therefore, U+2234 ISOtech" }, 1871 { 8764, "sim", "tilde operator = varies with = similar to, 
U+223C ISOtech" }, 1872 { 8773, "cong", "approximately equal to, U+2245 ISOtech" }, 1873 { 8776, "asymp","almost equal to = asymptotic to, U+2248 ISOamsr" }, 1874 { 8800, "ne", "not equal to, U+2260 ISOtech" }, 1875 { 8801, "equiv","identical to, U+2261 ISOtech" }, 1876 { 8804, "le", "less-than or equal to, U+2264 ISOtech" }, 1877 { 8805, "ge", "greater-than or equal to, U+2265 ISOtech" }, 1878 { 8834, "sub", "subset of, U+2282 ISOtech" }, 1879 { 8835, "sup", "superset of, U+2283 ISOtech" }, 1880 { 8836, "nsub", "not a subset of, U+2284 ISOamsn" }, 1881 { 8838, "sube", "subset of or equal to, U+2286 ISOtech" }, 1882 { 8839, "supe", "superset of or equal to, U+2287 ISOtech" }, 1883 { 8853, "oplus","circled plus = direct sum, U+2295 ISOamsb" }, 1884 { 8855, "otimes","circled times = vector product, U+2297 ISOamsb" }, 1885 { 8869, "perp", "up tack = orthogonal to = perpendicular, U+22A5 ISOtech" }, 1886 { 8901, "sdot", "dot operator, U+22C5 ISOamsb" }, 1887 { 8968, "lceil","left ceiling = apl upstile, U+2308 ISOamsc" }, 1888 { 8969, "rceil","right ceiling, U+2309 ISOamsc" }, 1889 { 8970, "lfloor","left floor = apl downstile, U+230A ISOamsc" }, 1890 { 8971, "rfloor","right floor, U+230B ISOamsc" }, 1891 { 9001, "lang", "left-pointing angle bracket = bra, U+2329 ISOtech" }, 1892 { 9002, "rang", "right-pointing angle bracket = ket, U+232A ISOtech" }, 1893 { 9674, "loz", "lozenge, U+25CA ISOpub" }, 1894 1895 { 9824, "spades","black spade suit, U+2660 ISOpub" }, 1896 { 9827, "clubs","black club suit = shamrock, U+2663 ISOpub" }, 1897 { 9829, "hearts","black heart suit = valentine, U+2665 ISOpub" }, 1898 { 9830, "diams","black diamond suit, U+2666 ISOpub" }, 1899 1900 }; 1901 1902 /************************************************************************ 1903 * * 1904 * Commodity functions to handle entities * 1905 * * 1906 ************************************************************************/ 1907 1908 /* 1909 * Macro used to grow the current buffer. 
1910 */ 1911 #define growBuffer(buffer) { \ 1912 xmlChar *tmp; \ 1913 buffer##_size *= 2; \ 1914 tmp = (xmlChar *) xmlRealloc(buffer, buffer##_size * sizeof(xmlChar)); \ 1915 if (tmp == NULL) { \ 1916 htmlErrMemory(ctxt, "growing buffer\n"); \ 1917 xmlFree(buffer); \ 1918 return(NULL); \ 1919 } \ 1920 buffer = tmp; \ 1921 } 1922 1923 /** 1924 * htmlEntityLookup: 1925 * @name: the entity name 1926 * 1927 * Lookup the given entity in EntitiesTable 1928 * 1929 * TODO: the linear scan is really ugly, an hash table is really needed. 1930 * 1931 * Returns the associated htmlEntityDescPtr if found, NULL otherwise. 1932 */ 1933 const htmlEntityDesc * 1934 htmlEntityLookup(const xmlChar *name) { 1935 unsigned int i; 1936 1937 for (i = 0;i < (sizeof(html40EntitiesTable)/ 1938 sizeof(html40EntitiesTable[0]));i++) { 1939 if (xmlStrEqual(name, BAD_CAST html40EntitiesTable[i].name)) { 1940 return((htmlEntityDescPtr) &html40EntitiesTable[i]); 1941 } 1942 } 1943 return(NULL); 1944 } 1945 1946 /** 1947 * htmlEntityValueLookup: 1948 * @value: the entity's unicode value 1949 * 1950 * Lookup the given entity in EntitiesTable 1951 * 1952 * TODO: the linear scan is really ugly, an hash table is really needed. 1953 * 1954 * Returns the associated htmlEntityDescPtr if found, NULL otherwise. 
1955 */ 1956 const htmlEntityDesc * 1957 htmlEntityValueLookup(unsigned int value) { 1958 unsigned int i; 1959 1960 for (i = 0;i < (sizeof(html40EntitiesTable)/ 1961 sizeof(html40EntitiesTable[0]));i++) { 1962 if (html40EntitiesTable[i].value >= value) { 1963 if (html40EntitiesTable[i].value > value) 1964 break; 1965 return((htmlEntityDescPtr) &html40EntitiesTable[i]); 1966 } 1967 } 1968 return(NULL); 1969 } 1970 1971 /** 1972 * UTF8ToHtml: 1973 * @out: a pointer to an array of bytes to store the result 1974 * @outlen: the length of @out 1975 * @in: a pointer to an array of UTF-8 chars 1976 * @inlen: the length of @in 1977 * 1978 * Take a block of UTF-8 chars in and try to convert it to an ASCII 1979 * plus HTML entities block of chars out. 1980 * 1981 * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise 1982 * The value of @inlen after return is the number of octets consumed 1983 * as the return value is positive, else unpredictable. 1984 * The value of @outlen after return is the number of octets consumed. 
1985 */ 1986 int 1987 UTF8ToHtml(unsigned char* out, int *outlen, 1988 const unsigned char* in, int *inlen) { 1989 const unsigned char* processed = in; 1990 const unsigned char* outend; 1991 const unsigned char* outstart = out; 1992 const unsigned char* instart = in; 1993 const unsigned char* inend; 1994 unsigned int c, d; 1995 int trailing; 1996 1997 if ((out == NULL) || (outlen == NULL) || (inlen == NULL)) return(-1); 1998 if (in == NULL) { 1999 /* 2000 * initialization nothing to do 2001 */ 2002 *outlen = 0; 2003 *inlen = 0; 2004 return(0); 2005 } 2006 inend = in + (*inlen); 2007 outend = out + (*outlen); 2008 while (in < inend) { 2009 d = *in++; 2010 if (d < 0x80) { c= d; trailing= 0; } 2011 else if (d < 0xC0) { 2012 /* trailing byte in leading position */ 2013 *outlen = out - outstart; 2014 *inlen = processed - instart; 2015 return(-2); 2016 } else if (d < 0xE0) { c= d & 0x1F; trailing= 1; } 2017 else if (d < 0xF0) { c= d & 0x0F; trailing= 2; } 2018 else if (d < 0xF8) { c= d & 0x07; trailing= 3; } 2019 else { 2020 /* no chance for this in Ascii */ 2021 *outlen = out - outstart; 2022 *inlen = processed - instart; 2023 return(-2); 2024 } 2025 2026 if (inend - in < trailing) { 2027 break; 2028 } 2029 2030 for ( ; trailing; trailing--) { 2031 if ((in >= inend) || (((d= *in++) & 0xC0) != 0x80)) 2032 break; 2033 c <<= 6; 2034 c |= d & 0x3F; 2035 } 2036 2037 /* assertion: c is a single UTF-4 value */ 2038 if (c < 0x80) { 2039 if (out + 1 >= outend) 2040 break; 2041 *out++ = c; 2042 } else { 2043 int len; 2044 const htmlEntityDesc * ent; 2045 const char *cp; 2046 char nbuf[16]; 2047 2048 /* 2049 * Try to lookup a predefined HTML entity for it 2050 */ 2051 2052 ent = htmlEntityValueLookup(c); 2053 if (ent == NULL) { 2054 snprintf(nbuf, sizeof(nbuf), "#%u", c); 2055 cp = nbuf; 2056 } 2057 else 2058 cp = ent->name; 2059 len = strlen(cp); 2060 if (out + 2 + len >= outend) 2061 break; 2062 *out++ = '&'; 2063 memcpy(out, cp, len); 2064 out += len; 2065 *out++ = ';'; 2066 } 
2067 processed = in; 2068 } 2069 *outlen = out - outstart; 2070 *inlen = processed - instart; 2071 return(0); 2072 } 2073 2074 /** 2075 * htmlEncodeEntities: 2076 * @out: a pointer to an array of bytes to store the result 2077 * @outlen: the length of @out 2078 * @in: a pointer to an array of UTF-8 chars 2079 * @inlen: the length of @in 2080 * @quoteChar: the quote character to escape (' or ") or zero. 2081 * 2082 * Take a block of UTF-8 chars in and try to convert it to an ASCII 2083 * plus HTML entities block of chars out. 2084 * 2085 * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise 2086 * The value of @inlen after return is the number of octets consumed 2087 * as the return value is positive, else unpredictable. 2088 * The value of @outlen after return is the number of octets consumed. 2089 */ 2090 int 2091 htmlEncodeEntities(unsigned char* out, int *outlen, 2092 const unsigned char* in, int *inlen, int quoteChar) { 2093 const unsigned char* processed = in; 2094 const unsigned char* outend; 2095 const unsigned char* outstart = out; 2096 const unsigned char* instart = in; 2097 const unsigned char* inend; 2098 unsigned int c, d; 2099 int trailing; 2100 2101 if ((out == NULL) || (outlen == NULL) || (inlen == NULL) || (in == NULL)) 2102 return(-1); 2103 outend = out + (*outlen); 2104 inend = in + (*inlen); 2105 while (in < inend) { 2106 d = *in++; 2107 if (d < 0x80) { c= d; trailing= 0; } 2108 else if (d < 0xC0) { 2109 /* trailing byte in leading position */ 2110 *outlen = out - outstart; 2111 *inlen = processed - instart; 2112 return(-2); 2113 } else if (d < 0xE0) { c= d & 0x1F; trailing= 1; } 2114 else if (d < 0xF0) { c= d & 0x0F; trailing= 2; } 2115 else if (d < 0xF8) { c= d & 0x07; trailing= 3; } 2116 else { 2117 /* no chance for this in Ascii */ 2118 *outlen = out - outstart; 2119 *inlen = processed - instart; 2120 return(-2); 2121 } 2122 2123 if (inend - in < trailing) 2124 break; 2125 2126 while (trailing--) { 2127 if (((d= *in++) & 0xC0) 
!= 0x80) { 2128 *outlen = out - outstart; 2129 *inlen = processed - instart; 2130 return(-2); 2131 } 2132 c <<= 6; 2133 c |= d & 0x3F; 2134 } 2135 2136 /* assertion: c is a single UTF-4 value */ 2137 if ((c < 0x80) && (c != (unsigned int) quoteChar) && 2138 (c != '&') && (c != '<') && (c != '>')) { 2139 if (out >= outend) 2140 break; 2141 *out++ = c; 2142 } else { 2143 const htmlEntityDesc * ent; 2144 const char *cp; 2145 char nbuf[16]; 2146 int len; 2147 2148 /* 2149 * Try to lookup a predefined HTML entity for it 2150 */ 2151 ent = htmlEntityValueLookup(c); 2152 if (ent == NULL) { 2153 snprintf(nbuf, sizeof(nbuf), "#%u", c); 2154 cp = nbuf; 2155 } 2156 else 2157 cp = ent->name; 2158 len = strlen(cp); 2159 if (out + 2 + len > outend) 2160 break; 2161 *out++ = '&'; 2162 memcpy(out, cp, len); 2163 out += len; 2164 *out++ = ';'; 2165 } 2166 processed = in; 2167 } 2168 *outlen = out - outstart; 2169 *inlen = processed - instart; 2170 return(0); 2171 } 2172 2173 /************************************************************************ 2174 * * 2175 * Commodity functions to handle streams * 2176 * * 2177 ************************************************************************/ 2178 2179 #ifdef LIBXML_PUSH_ENABLED 2180 /** 2181 * htmlNewInputStream: 2182 * @ctxt: an HTML parser context 2183 * 2184 * Create a new input stream structure 2185 * Returns the new input stream or NULL 2186 */ 2187 static htmlParserInputPtr 2188 htmlNewInputStream(htmlParserCtxtPtr ctxt) { 2189 htmlParserInputPtr input; 2190 2191 input = (xmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput)); 2192 if (input == NULL) { 2193 htmlErrMemory(ctxt, "couldn't allocate a new input stream\n"); 2194 return(NULL); 2195 } 2196 memset(input, 0, sizeof(htmlParserInput)); 2197 input->filename = NULL; 2198 input->directory = NULL; 2199 input->base = NULL; 2200 input->cur = NULL; 2201 input->buf = NULL; 2202 input->line = 1; 2203 input->col = 1; 2204 input->buf = NULL; 2205 input->free = NULL; 2206 
input->version = NULL; 2207 input->consumed = 0; 2208 input->length = 0; 2209 return(input); 2210 } 2211 #endif 2212 2213 2214 /************************************************************************ 2215 * * 2216 * Commodity functions, cleanup needed ? * 2217 * * 2218 ************************************************************************/ 2219 /* 2220 * all tags allowing pc data from the html 4.01 loose dtd 2221 * NOTE: it might be more appropriate to integrate this information 2222 * into the html40ElementTable array but I don't want to risk any 2223 * binary incompatibility 2224 */ 2225 static const char *allowPCData[] = { 2226 "a", "abbr", "acronym", "address", "applet", "b", "bdo", "big", 2227 "blockquote", "body", "button", "caption", "center", "cite", "code", 2228 "dd", "del", "dfn", "div", "dt", "em", "font", "form", "h1", "h2", 2229 "h3", "h4", "h5", "h6", "i", "iframe", "ins", "kbd", "label", "legend", 2230 "li", "noframes", "noscript", "object", "p", "pre", "q", "s", "samp", 2231 "small", "span", "strike", "strong", "td", "th", "tt", "u", "var" 2232 }; 2233 2234 /** 2235 * areBlanks: 2236 * @ctxt: an HTML parser context 2237 * @str: a xmlChar * 2238 * @len: the size of @str 2239 * 2240 * Is this a sequence of blank chars that one can ignore ? 2241 * 2242 * Returns 1 if ignorable 0 otherwise. 
2243 */ 2244 2245 static int areBlanks(htmlParserCtxtPtr ctxt, const xmlChar *str, int len) { 2246 unsigned int i; 2247 int j; 2248 xmlNodePtr lastChild; 2249 xmlDtdPtr dtd; 2250 2251 for (j = 0;j < len;j++) 2252 if (!(IS_BLANK_CH(str[j]))) return(0); 2253 2254 if (CUR == 0) return(1); 2255 if (CUR != '<') return(0); 2256 if (ctxt->name == NULL) 2257 return(1); 2258 if (xmlStrEqual(ctxt->name, BAD_CAST"html")) 2259 return(1); 2260 if (xmlStrEqual(ctxt->name, BAD_CAST"head")) 2261 return(1); 2262 2263 /* Only strip CDATA children of the body tag for strict HTML DTDs */ 2264 if (xmlStrEqual(ctxt->name, BAD_CAST "body") && ctxt->myDoc != NULL) { 2265 dtd = xmlGetIntSubset(ctxt->myDoc); 2266 if (dtd != NULL && dtd->ExternalID != NULL) { 2267 if (!xmlStrcasecmp(dtd->ExternalID, BAD_CAST "-//W3C//DTD HTML 4.01//EN") || 2268 !xmlStrcasecmp(dtd->ExternalID, BAD_CAST "-//W3C//DTD HTML 4//EN")) 2269 return(1); 2270 } 2271 } 2272 2273 if (ctxt->node == NULL) return(0); 2274 lastChild = xmlGetLastChild(ctxt->node); 2275 while ((lastChild) && (lastChild->type == XML_COMMENT_NODE)) 2276 lastChild = lastChild->prev; 2277 if (lastChild == NULL) { 2278 if ((ctxt->node->type != XML_ELEMENT_NODE) && 2279 (ctxt->node->content != NULL)) return(0); 2280 /* keep ws in constructs like ...<b> </b>... 
2281 for all tags "b" allowing PCDATA */ 2282 for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) { 2283 if ( xmlStrEqual(ctxt->name, BAD_CAST allowPCData[i]) ) { 2284 return(0); 2285 } 2286 } 2287 } else if (xmlNodeIsText(lastChild)) { 2288 return(0); 2289 } else { 2290 /* keep ws in constructs like <p><b>xy</b> <i>z</i><p> 2291 for all tags "p" allowing PCDATA */ 2292 for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) { 2293 if ( xmlStrEqual(lastChild->name, BAD_CAST allowPCData[i]) ) { 2294 return(0); 2295 } 2296 } 2297 } 2298 return(1); 2299 } 2300 2301 /** 2302 * htmlNewDocNoDtD: 2303 * @URI: URI for the dtd, or NULL 2304 * @ExternalID: the external ID of the DTD, or NULL 2305 * 2306 * Creates a new HTML document without a DTD node if @URI and @ExternalID 2307 * are NULL 2308 * 2309 * Returns a new document, do not initialize the DTD if not provided 2310 */ 2311 htmlDocPtr 2312 htmlNewDocNoDtD(const xmlChar *URI, const xmlChar *ExternalID) { 2313 xmlDocPtr cur; 2314 2315 /* 2316 * Allocate a new document and fill the fields. 
2317 */ 2318 cur = (xmlDocPtr) xmlMalloc(sizeof(xmlDoc)); 2319 if (cur == NULL) { 2320 htmlErrMemory(NULL, "HTML document creation failed\n"); 2321 return(NULL); 2322 } 2323 memset(cur, 0, sizeof(xmlDoc)); 2324 2325 cur->type = XML_HTML_DOCUMENT_NODE; 2326 cur->version = NULL; 2327 cur->intSubset = NULL; 2328 cur->doc = cur; 2329 cur->name = NULL; 2330 cur->children = NULL; 2331 cur->extSubset = NULL; 2332 cur->oldNs = NULL; 2333 cur->encoding = NULL; 2334 cur->standalone = 1; 2335 cur->compression = 0; 2336 cur->ids = NULL; 2337 cur->refs = NULL; 2338 cur->_private = NULL; 2339 cur->charset = XML_CHAR_ENCODING_UTF8; 2340 cur->properties = XML_DOC_HTML | XML_DOC_USERBUILT; 2341 if ((ExternalID != NULL) || 2342 (URI != NULL)) 2343 xmlCreateIntSubset(cur, BAD_CAST "html", ExternalID, URI); 2344 return(cur); 2345 } 2346 2347 /** 2348 * htmlNewDoc: 2349 * @URI: URI for the dtd, or NULL 2350 * @ExternalID: the external ID of the DTD, or NULL 2351 * 2352 * Creates a new HTML document 2353 * 2354 * Returns a new document 2355 */ 2356 htmlDocPtr 2357 htmlNewDoc(const xmlChar *URI, const xmlChar *ExternalID) { 2358 if ((URI == NULL) && (ExternalID == NULL)) 2359 return(htmlNewDocNoDtD( 2360 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd", 2361 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN")); 2362 2363 return(htmlNewDocNoDtD(URI, ExternalID)); 2364 } 2365 2366 2367 /************************************************************************ 2368 * * 2369 * The parser itself * 2370 * Relates to http://www.w3.org/TR/html40 * 2371 * * 2372 ************************************************************************/ 2373 2374 /************************************************************************ 2375 * * 2376 * The parser itself * 2377 * * 2378 ************************************************************************/ 2379 2380 static const xmlChar * htmlParseNameComplex(xmlParserCtxtPtr ctxt); 2381 2382 /** 2383 * htmlParseHTMLName: 2384 * @ctxt: an HTML parser context 2385 
* 2386 * parse an HTML tag or attribute name, note that we convert it to lowercase 2387 * since HTML names are not case-sensitive. 2388 * 2389 * Returns the Tag Name parsed or NULL 2390 */ 2391 2392 static const xmlChar * 2393 htmlParseHTMLName(htmlParserCtxtPtr ctxt) { 2394 int i = 0; 2395 xmlChar loc[HTML_PARSER_BUFFER_SIZE]; 2396 2397 if (!IS_ASCII_LETTER(CUR) && (CUR != '_') && 2398 (CUR != ':') && (CUR != '.')) return(NULL); 2399 2400 while ((i < HTML_PARSER_BUFFER_SIZE) && 2401 ((IS_ASCII_LETTER(CUR)) || (IS_ASCII_DIGIT(CUR)) || 2402 (CUR == ':') || (CUR == '-') || (CUR == '_') || 2403 (CUR == '.'))) { 2404 if ((CUR >= 'A') && (CUR <= 'Z')) loc[i] = CUR + 0x20; 2405 else loc[i] = CUR; 2406 i++; 2407 2408 NEXT; 2409 } 2410 2411 return(xmlDictLookup(ctxt->dict, loc, i)); 2412 } 2413 2414 2415 /** 2416 * htmlParseHTMLName_nonInvasive: 2417 * @ctxt: an HTML parser context 2418 * 2419 * parse an HTML tag or attribute name, note that we convert it to lowercase 2420 * since HTML names are not case-sensitive, this doesn't consume the data 2421 * from the stream, it's a look-ahead 2422 * 2423 * Returns the Tag Name parsed or NULL 2424 */ 2425 2426 static const xmlChar * 2427 htmlParseHTMLName_nonInvasive(htmlParserCtxtPtr ctxt) { 2428 int i = 0; 2429 xmlChar loc[HTML_PARSER_BUFFER_SIZE]; 2430 2431 if (!IS_ASCII_LETTER(NXT(1)) && (NXT(1) != '_') && 2432 (NXT(1) != ':')) return(NULL); 2433 2434 while ((i < HTML_PARSER_BUFFER_SIZE) && 2435 ((IS_ASCII_LETTER(NXT(1+i))) || (IS_ASCII_DIGIT(NXT(1+i))) || 2436 (NXT(1+i) == ':') || (NXT(1+i) == '-') || (NXT(1+i) == '_'))) { 2437 if ((NXT(1+i) >= 'A') && (NXT(1+i) <= 'Z')) loc[i] = NXT(1+i) + 0x20; 2438 else loc[i] = NXT(1+i); 2439 i++; 2440 } 2441 2442 return(xmlDictLookup(ctxt->dict, loc, i)); 2443 } 2444 2445 2446 /** 2447 * htmlParseName: 2448 * @ctxt: an HTML parser context 2449 * 2450 * parse an HTML name, this routine is case sensitive. 
 *
 * Returns the Name parsed or NULL
 */

static const xmlChar *
htmlParseName(htmlParserCtxtPtr ctxt) {
    const xmlChar *in;
    const xmlChar *ret;
    int count = 0;

    GROW;

    /*
     * Accelerator for simple ASCII names: scan [A-Za-z_:] followed by
     * [A-Za-z0-9_.:-]* directly in the input buffer.
     */
    in = ctxt->input->cur;
    if (((*in >= 0x61) && (*in <= 0x7A)) ||
        ((*in >= 0x41) && (*in <= 0x5A)) ||
        (*in == '_') || (*in == ':')) {
        in++;
        while (((*in >= 0x61) && (*in <= 0x7A)) ||
               ((*in >= 0x41) && (*in <= 0x5A)) ||
               ((*in >= 0x30) && (*in <= 0x39)) ||
               (*in == '_') || (*in == '-') ||
               (*in == ':') || (*in == '.'))
            in++;

        /* the scan ran up to the end of the buffer: no name returned */
        if (in == ctxt->input->end)
            return(NULL);

        /*
         * The name is terminated by a non-NUL ASCII byte: intern it in
         * the dictionary and move the input position past it.
         */
        if ((*in > 0) && (*in < 0x80)) {
            count = in - ctxt->input->cur;
            ret = xmlDictLookup(ctxt->dict, ctxt->input->cur, count);
            ctxt->input->cur = in;
            ctxt->nbChars += count;
            ctxt->input->col += count;
            return(ret);
        }
    }
    /* anything else goes through the character-by-character parser */
    return(htmlParseNameComplex(ctxt));
}

/*
 * htmlParseNameComplex:
 * @ctxt:  an HTML parser context
 *
 * Slow path of htmlParseName: reads the name one character at a time
 * with CUR_CHAR/NEXTL, accepting letters, digits, '.', '-', '_', ':',
 * combining characters and extenders, and stopping at ' ', '>' or '/'.
 *
 * Returns the name looked up in ctxt->dict, or NULL if no name starts
 * at the current position or the sanity check on the buffer fails.
 */
static const xmlChar *
htmlParseNameComplex(xmlParserCtxtPtr ctxt) {
    int len = 0, l;
    int c;
    int count = 0;
    /* remembered to detect a relocation of the input buffer */
    const xmlChar *base = ctxt->input->base;

    /*
     * Handler for more complex cases
     */
    GROW;
    c = CUR_CHAR(l);
    if ((c == ' ') || (c == '>') || (c == '/') || /* accelerators */
        (!IS_LETTER(c) && (c != '_') &&
         (c != ':'))) {
        return(NULL);
    }

    while ((c != ' ') && (c != '>') && (c != '/') && /* test bigname.xml */
           ((IS_LETTER(c)) || (IS_DIGIT(c)) ||
            (c == '.') || (c == '-') ||
            (c == '_') || (c == ':') ||
            (IS_COMBINING(c)) ||
            (IS_EXTENDER(c)))) {
        /* run GROW again every hundred or so characters */
        if (count++ > 100) {
            count = 0;
            GROW;
        }
        /* len is the byte length of the name read so far */
        len += l;
        NEXTL(l);
        c = CUR_CHAR(l);
        if (ctxt->input->base != base) {
            /*
             * We changed encoding from an unknown encoding
             * Input buffer changed location, so we better start again
             */
            return(htmlParseNameComplex(ctxt));
        }
    }

    if (ctxt->input->cur - ctxt->input->base < len) {
        /* Sanity check: the name must lie inside the current buffer */
        htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
                     "unexpected change of input buffer", NULL, NULL);
        return (NULL);
    }

    /* the name is the len bytes just before the current position */
    return(xmlDictLookup(ctxt->dict, ctxt->input->cur - len, len));
}


/**
 * htmlParseHTMLAttribute:
 * @ctxt:  an HTML parser context
 * @stop:  a char stop value
 *
 * parse an HTML attribute value till the stop (quote), if
 * stop is 0 then it stops at the first space or at '>'.
 * Character references and entity references found in
 * html40EntitiesTable are replaced by their UTF-8 encoding; other
 * references are copied as '&' followed by the name.
 *
 * Returns the attribute parsed in a newly allocated buffer, or NULL
 * if an allocation failed.
 */

static xmlChar *
htmlParseHTMLAttribute(htmlParserCtxtPtr ctxt, const xmlChar stop) {
    xmlChar *buffer = NULL;
    int buffer_size = 0;
    xmlChar *out = NULL;
    const xmlChar *name = NULL;
    const xmlChar *cur = NULL;
    const htmlEntityDesc * ent;

    /*
     * allocate a translation buffer.
     */
    buffer_size = HTML_PARSER_BUFFER_SIZE;
    buffer = (xmlChar *) xmlMallocAtomic(buffer_size * sizeof(xmlChar));
    if (buffer == NULL) {
        htmlErrMemory(ctxt, "buffer allocation failed\n");
        return(NULL);
    }
    out = buffer;

    /*
     * Ok loop until we reach one of the ending chars
     *
     * Every branch below grows the buffer as soon as fewer than 100
     * bytes are left; on failure growBuffer frees the buffer and makes
     * this function return NULL.
     */
    while ((CUR != 0) && (CUR != stop)) {
        /* unquoted value: '>' or a blank ends it */
        if ((stop == 0) && (CUR == '>')) break;
        if ((stop == 0) && (IS_BLANK_CH(CUR))) break;
        if (CUR == '&') {
            if (NXT(1) == '#') {
                /* character reference: store its value as UTF-8 */
                unsigned int c;
                int bits;

                c = htmlParseCharRef(ctxt);
                /* lead byte first; bits is the shift of the next byte */
                if      (c <    0x80)
                        { *out++  = c;                bits= -6; }
                else if (c <   0x800)
                        { *out++  =((c >>  6) & 0x1F) | 0xC0;  bits=  0; }
                else if (c < 0x10000)
                        { *out++  =((c >> 12) & 0x0F) | 0xE0;  bits=  6; }
                else
                        { *out++  =((c >> 18) & 0x07) | 0xF0;  bits= 12; }

                /* then the continuation bytes, 6 bits each */
                for ( ; bits >= 0; bits-= 6) {
                    *out++  = ((c >> bits) & 0x3F) | 0x80;
                }

                if (out - buffer > buffer_size - 100) {
                    int indx = out - buffer;

                    growBuffer(buffer);
                    out = &buffer[indx];
                }
            } else {
                ent = htmlParseEntityRef(ctxt, &name);
                if (name == NULL) {
                    /* no name after the '&': keep the '&' itself */
                    *out++ = '&';
                    if (out - buffer > buffer_size - 100) {
                        int indx = out - buffer;

                        growBuffer(buffer);
                        out = &buffer[indx];
                    }
                } else if (ent == NULL) {
                    /* no entity returned: copy '&' and the name as is */
                    *out++ = '&';
                    cur = name;
                    while (*cur != 0) {
                        if (out - buffer > buffer_size - 100) {
                            int indx = out - buffer;

                            growBuffer(buffer);
                            out = &buffer[indx];
                        }
                        *out++ = *cur++;
                    }
                } else {
                    /* known entity: store its value as UTF-8 */
                    unsigned int c;
                    int bits;

                    if (out - buffer > buffer_size - 100) {
                        int indx = out - buffer;

                        growBuffer(buffer);
                        out = &buffer[indx];
                    }
                    c = ent->value;
                    if      (c <    0x80)
                        { *out++  = c;                bits= -6; }
                    else if (c <   0x800)
                        { *out++  =((c >>  6) & 0x1F) | 0xC0;  bits=  0; }
                    else if (c < 0x10000)
                        { *out++  =((c >> 12) & 0x0F) | 0xE0;  bits=  6; }
                    else
                        { *out++  =((c >> 18) & 0x07) | 0xF0;  bits= 12; }

                    for ( ; bits >= 0; bits-= 6) {
                        *out++  = ((c >> bits) & 0x3F) | 0x80;
                    }
                }
            }
        } else {
            /* ordinary character: decode it and store it as UTF-8 */
            unsigned int c;
            int bits, l;

            if (out - buffer > buffer_size - 100) {
                int indx = out - buffer;

                growBuffer(buffer);
                out = &buffer[indx];
            }
            c = CUR_CHAR(l);
            if      (c <    0x80)
                    { *out++  = c;                bits= -6; }
            else if (c <   0x800)
                    { *out++  =((c >>  6) & 0x1F) | 0xC0;  bits=  0; }
            else if (c < 0x10000)
                    { *out++  =((c >> 12) & 0x0F) | 0xE0;  bits=  6; }
            else
                    { *out++  =((c >> 18) & 0x07) | 0xF0;  bits= 12; }

            for ( ; bits >= 0; bits-= 6) {
                *out++  = ((c >> bits) & 0x3F) | 0x80;
            }
            NEXT;
        }
    }
    /* NUL-terminate; the stop char, if any, is left for the caller */
    *out = 0;
    return(buffer);
}

/**
 * htmlParseEntityRef:
 * @ctxt:  an HTML parser context
 * @str:  location to store the entity name
 *
 * parse an HTML ENTITY references
 *
 * [68]
EntityRef ::= '&' Name ';' 2691 * 2692 * Returns the associated htmlEntityDescPtr if found, or NULL otherwise, 2693 * if non-NULL *str will have to be freed by the caller. 2694 */ 2695 const htmlEntityDesc * 2696 htmlParseEntityRef(htmlParserCtxtPtr ctxt, const xmlChar **str) { 2697 const xmlChar *name; 2698 const htmlEntityDesc * ent = NULL; 2699 2700 if (str != NULL) *str = NULL; 2701 if ((ctxt == NULL) || (ctxt->input == NULL)) return(NULL); 2702 2703 if (CUR == '&') { 2704 NEXT; 2705 name = htmlParseName(ctxt); 2706 if (name == NULL) { 2707 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED, 2708 "htmlParseEntityRef: no name\n", NULL, NULL); 2709 } else { 2710 GROW; 2711 if (CUR == ';') { 2712 if (str != NULL) 2713 *str = name; 2714 2715 /* 2716 * Lookup the entity in the table. 2717 */ 2718 ent = htmlEntityLookup(name); 2719 if (ent != NULL) /* OK that's ugly !!! */ 2720 NEXT; 2721 } else { 2722 htmlParseErr(ctxt, XML_ERR_ENTITYREF_SEMICOL_MISSING, 2723 "htmlParseEntityRef: expecting ';'\n", 2724 NULL, NULL); 2725 if (str != NULL) 2726 *str = name; 2727 } 2728 } 2729 } 2730 return(ent); 2731 } 2732 2733 /** 2734 * htmlParseAttValue: 2735 * @ctxt: an HTML parser context 2736 * 2737 * parse a value for an attribute 2738 * Note: the parser won't do substitution of entities here, this 2739 * will be handled later in xmlStringGetNodeList, unless it was 2740 * asked for ctxt->replaceEntities != 0 2741 * 2742 * Returns the AttValue parsed or NULL. 
2743 */ 2744 2745 static xmlChar * 2746 htmlParseAttValue(htmlParserCtxtPtr ctxt) { 2747 xmlChar *ret = NULL; 2748 2749 if (CUR == '"') { 2750 NEXT; 2751 ret = htmlParseHTMLAttribute(ctxt, '"'); 2752 if (CUR != '"') { 2753 htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED, 2754 "AttValue: \" expected\n", NULL, NULL); 2755 } else 2756 NEXT; 2757 } else if (CUR == '\'') { 2758 NEXT; 2759 ret = htmlParseHTMLAttribute(ctxt, '\''); 2760 if (CUR != '\'') { 2761 htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED, 2762 "AttValue: ' expected\n", NULL, NULL); 2763 } else 2764 NEXT; 2765 } else { 2766 /* 2767 * That's an HTMLism, the attribute value may not be quoted 2768 */ 2769 ret = htmlParseHTMLAttribute(ctxt, 0); 2770 if (ret == NULL) { 2771 htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_WITHOUT_VALUE, 2772 "AttValue: no value found\n", NULL, NULL); 2773 } 2774 } 2775 return(ret); 2776 } 2777 2778 /** 2779 * htmlParseSystemLiteral: 2780 * @ctxt: an HTML parser context 2781 * 2782 * parse an HTML Literal 2783 * 2784 * [11] SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'") 2785 * 2786 * Returns the SystemLiteral parsed or NULL 2787 */ 2788 2789 static xmlChar * 2790 htmlParseSystemLiteral(htmlParserCtxtPtr ctxt) { 2791 size_t len = 0, startPosition = 0; 2792 xmlChar *ret = NULL; 2793 2794 if (CUR == '"') { 2795 NEXT; 2796 2797 if (CUR_PTR < BASE_PTR) 2798 return(ret); 2799 startPosition = CUR_PTR - BASE_PTR; 2800 2801 while ((IS_CHAR_CH(CUR)) && (CUR != '"')) { 2802 NEXT; 2803 len++; 2804 } 2805 if (!IS_CHAR_CH(CUR)) { 2806 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED, 2807 "Unfinished SystemLiteral\n", NULL, NULL); 2808 } else { 2809 ret = xmlStrndup((BASE_PTR+startPosition), len); 2810 NEXT; 2811 } 2812 } else if (CUR == '\'') { 2813 NEXT; 2814 2815 if (CUR_PTR < BASE_PTR) 2816 return(ret); 2817 startPosition = CUR_PTR - BASE_PTR; 2818 2819 while ((IS_CHAR_CH(CUR)) && (CUR != '\'')) { 2820 NEXT; 2821 len++; 2822 } 2823 if (!IS_CHAR_CH(CUR)) { 2824 htmlParseErr(ctxt, 
XML_ERR_LITERAL_NOT_FINISHED, 2825 "Unfinished SystemLiteral\n", NULL, NULL); 2826 } else { 2827 ret = xmlStrndup((BASE_PTR+startPosition), len); 2828 NEXT; 2829 } 2830 } else { 2831 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED, 2832 " or ' expected\n", NULL, NULL); 2833 } 2834 2835 return(ret); 2836 } 2837 2838 /** 2839 * htmlParsePubidLiteral: 2840 * @ctxt: an HTML parser context 2841 * 2842 * parse an HTML public literal 2843 * 2844 * [12] PubidLiteral ::= '"' PubidChar* '"' | "'" (PubidChar - "'")* "'" 2845 * 2846 * Returns the PubidLiteral parsed or NULL. 2847 */ 2848 2849 static xmlChar * 2850 htmlParsePubidLiteral(htmlParserCtxtPtr ctxt) { 2851 size_t len = 0, startPosition = 0; 2852 xmlChar *ret = NULL; 2853 /* 2854 * Name ::= (Letter | '_') (NameChar)* 2855 */ 2856 if (CUR == '"') { 2857 NEXT; 2858 2859 if (CUR_PTR < BASE_PTR) 2860 return(ret); 2861 startPosition = CUR_PTR - BASE_PTR; 2862 2863 while (IS_PUBIDCHAR_CH(CUR)) { 2864 len++; 2865 NEXT; 2866 } 2867 2868 if (CUR != '"') { 2869 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED, 2870 "Unfinished PubidLiteral\n", NULL, NULL); 2871 } else { 2872 ret = xmlStrndup((BASE_PTR + startPosition), len); 2873 NEXT; 2874 } 2875 } else if (CUR == '\'') { 2876 NEXT; 2877 2878 if (CUR_PTR < BASE_PTR) 2879 return(ret); 2880 startPosition = CUR_PTR - BASE_PTR; 2881 2882 while ((IS_PUBIDCHAR_CH(CUR)) && (CUR != '\'')){ 2883 len++; 2884 NEXT; 2885 } 2886 2887 if (CUR != '\'') { 2888 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED, 2889 "Unfinished PubidLiteral\n", NULL, NULL); 2890 } else { 2891 ret = xmlStrndup((BASE_PTR + startPosition), len); 2892 NEXT; 2893 } 2894 } else { 2895 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED, 2896 "PubidLiteral \" or ' expected\n", NULL, NULL); 2897 } 2898 2899 return(ret); 2900 } 2901 2902 /** 2903 * htmlParseScript: 2904 * @ctxt: an HTML parser context 2905 * 2906 * parse the content of an HTML SCRIPT or STYLE element 2907 * http://www.w3.org/TR/html4/sgml/dtd.html#Script 2908 * 
http://www.w3.org/TR/html4/sgml/dtd.html#StyleSheet 2909 * http://www.w3.org/TR/html4/types.html#type-script 2910 * http://www.w3.org/TR/html4/types.html#h-6.15 2911 * http://www.w3.org/TR/html4/appendix/notes.html#h-B.3.2.1 2912 * 2913 * Script data ( %Script; in the DTD) can be the content of the SCRIPT 2914 * element and the value of intrinsic event attributes. User agents must 2915 * not evaluate script data as HTML markup but instead must pass it on as 2916 * data to a script engine. 2917 * NOTES: 2918 * - The content is passed like CDATA 2919 * - the attributes for style and scripting "onXXX" are also described 2920 * as CDATA but SGML allows entities references in attributes so their 2921 * processing is identical as other attributes 2922 */ 2923 static void 2924 htmlParseScript(htmlParserCtxtPtr ctxt) { 2925 xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 5]; 2926 int nbchar = 0; 2927 int cur,l; 2928 2929 SHRINK; 2930 cur = CUR_CHAR(l); 2931 while (IS_CHAR_CH(cur)) { 2932 if ((cur == '<') && (NXT(1) == '/')) { 2933 /* 2934 * One should break here, the specification is clear: 2935 * Authors should therefore escape "</" within the content. 2936 * Escape mechanisms are specific to each scripting or 2937 * style sheet language. 2938 * 2939 * In recovery mode, only break if end tag match the 2940 * current tag, effectively ignoring all tags inside the 2941 * script/style block and treating the entire block as 2942 * CDATA. 
2943 */ 2944 if (ctxt->recovery) { 2945 if (xmlStrncasecmp(ctxt->name, ctxt->input->cur+2, 2946 xmlStrlen(ctxt->name)) == 0) 2947 { 2948 break; /* while */ 2949 } else { 2950 htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH, 2951 "Element %s embeds close tag\n", 2952 ctxt->name, NULL); 2953 } 2954 } else { 2955 if (((NXT(2) >= 'A') && (NXT(2) <= 'Z')) || 2956 ((NXT(2) >= 'a') && (NXT(2) <= 'z'))) 2957 { 2958 break; /* while */ 2959 } 2960 } 2961 } 2962 COPY_BUF(l,buf,nbchar,cur); 2963 if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) { 2964 buf[nbchar] = 0; 2965 if (ctxt->sax->cdataBlock!= NULL) { 2966 /* 2967 * Insert as CDATA, which is the same as HTML_PRESERVE_NODE 2968 */ 2969 ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar); 2970 } else if (ctxt->sax->characters != NULL) { 2971 ctxt->sax->characters(ctxt->userData, buf, nbchar); 2972 } 2973 nbchar = 0; 2974 } 2975 GROW; 2976 NEXTL(l); 2977 cur = CUR_CHAR(l); 2978 } 2979 2980 if ((!(IS_CHAR_CH(cur))) && (!((cur == 0) && (ctxt->progressive)))) { 2981 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR, 2982 "Invalid char in CDATA 0x%X\n", cur); 2983 if (ctxt->input->cur < ctxt->input->end) { 2984 NEXT; 2985 } 2986 } 2987 2988 if ((nbchar != 0) && (ctxt->sax != NULL) && (!ctxt->disableSAX)) { 2989 buf[nbchar] = 0; 2990 if (ctxt->sax->cdataBlock!= NULL) { 2991 /* 2992 * Insert as CDATA, which is the same as HTML_PRESERVE_NODE 2993 */ 2994 ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar); 2995 } else if (ctxt->sax->characters != NULL) { 2996 ctxt->sax->characters(ctxt->userData, buf, nbchar); 2997 } 2998 } 2999 } 3000 3001 3002 /** 3003 * htmlParseCharDataInternal: 3004 * @ctxt: an HTML parser context 3005 * @readahead: optional read ahead character in ascii range 3006 * 3007 * parse a CharData section. 3008 * if we are within a CDATA section ']]>' marks an end of section. 
3009 * 3010 * [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*) 3011 */ 3012 3013 static void 3014 htmlParseCharDataInternal(htmlParserCtxtPtr ctxt, int readahead) { 3015 xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 6]; 3016 int nbchar = 0; 3017 int cur, l; 3018 int chunk = 0; 3019 3020 if (readahead) 3021 buf[nbchar++] = readahead; 3022 3023 SHRINK; 3024 cur = CUR_CHAR(l); 3025 while (((cur != '<') || (ctxt->token == '<')) && 3026 ((cur != '&') || (ctxt->token == '&')) && 3027 (cur != 0)) { 3028 if (!(IS_CHAR(cur))) { 3029 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR, 3030 "Invalid char in CDATA 0x%X\n", cur); 3031 } else { 3032 COPY_BUF(l,buf,nbchar,cur); 3033 } 3034 if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) { 3035 buf[nbchar] = 0; 3036 3037 /* 3038 * Ok the segment is to be consumed as chars. 3039 */ 3040 if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) { 3041 if (areBlanks(ctxt, buf, nbchar)) { 3042 if (ctxt->keepBlanks) { 3043 if (ctxt->sax->characters != NULL) 3044 ctxt->sax->characters(ctxt->userData, buf, nbchar); 3045 } else { 3046 if (ctxt->sax->ignorableWhitespace != NULL) 3047 ctxt->sax->ignorableWhitespace(ctxt->userData, 3048 buf, nbchar); 3049 } 3050 } else { 3051 htmlCheckParagraph(ctxt); 3052 if (ctxt->sax->characters != NULL) 3053 ctxt->sax->characters(ctxt->userData, buf, nbchar); 3054 } 3055 } 3056 nbchar = 0; 3057 } 3058 NEXTL(l); 3059 chunk++; 3060 if (chunk > HTML_PARSER_BUFFER_SIZE) { 3061 chunk = 0; 3062 SHRINK; 3063 GROW; 3064 } 3065 cur = CUR_CHAR(l); 3066 if (cur == 0) { 3067 SHRINK; 3068 GROW; 3069 cur = CUR_CHAR(l); 3070 } 3071 } 3072 if (nbchar != 0) { 3073 buf[nbchar] = 0; 3074 3075 /* 3076 * Ok the segment is to be consumed as chars. 
3077 */ 3078 if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) { 3079 if (areBlanks(ctxt, buf, nbchar)) { 3080 if (ctxt->keepBlanks) { 3081 if (ctxt->sax->characters != NULL) 3082 ctxt->sax->characters(ctxt->userData, buf, nbchar); 3083 } else { 3084 if (ctxt->sax->ignorableWhitespace != NULL) 3085 ctxt->sax->ignorableWhitespace(ctxt->userData, 3086 buf, nbchar); 3087 } 3088 } else { 3089 htmlCheckParagraph(ctxt); 3090 if (ctxt->sax->characters != NULL) 3091 ctxt->sax->characters(ctxt->userData, buf, nbchar); 3092 } 3093 } 3094 } else { 3095 /* 3096 * Loop detection 3097 */ 3098 if (cur == 0) 3099 ctxt->instate = XML_PARSER_EOF; 3100 } 3101 } 3102 3103 /** 3104 * htmlParseCharData: 3105 * @ctxt: an HTML parser context 3106 * 3107 * parse a CharData section. 3108 * if we are within a CDATA section ']]>' marks an end of section. 3109 * 3110 * [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*) 3111 */ 3112 3113 static void 3114 htmlParseCharData(htmlParserCtxtPtr ctxt) { 3115 htmlParseCharDataInternal(ctxt, 0); 3116 } 3117 3118 /** 3119 * htmlParseExternalID: 3120 * @ctxt: an HTML parser context 3121 * @publicID: a xmlChar** receiving PubidLiteral 3122 * 3123 * Parse an External ID or a Public ID 3124 * 3125 * [75] ExternalID ::= 'SYSTEM' S SystemLiteral 3126 * | 'PUBLIC' S PubidLiteral S SystemLiteral 3127 * 3128 * [83] PublicID ::= 'PUBLIC' S PubidLiteral 3129 * 3130 * Returns the function returns SystemLiteral and in the second 3131 * case publicID receives PubidLiteral, is strict is off 3132 * it is possible to return NULL and have publicID set. 
3133 */ 3134 3135 static xmlChar * 3136 htmlParseExternalID(htmlParserCtxtPtr ctxt, xmlChar **publicID) { 3137 xmlChar *URI = NULL; 3138 3139 if ((UPPER == 'S') && (UPP(1) == 'Y') && 3140 (UPP(2) == 'S') && (UPP(3) == 'T') && 3141 (UPP(4) == 'E') && (UPP(5) == 'M')) { 3142 SKIP(6); 3143 if (!IS_BLANK_CH(CUR)) { 3144 htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED, 3145 "Space required after 'SYSTEM'\n", NULL, NULL); 3146 } 3147 SKIP_BLANKS; 3148 URI = htmlParseSystemLiteral(ctxt); 3149 if (URI == NULL) { 3150 htmlParseErr(ctxt, XML_ERR_URI_REQUIRED, 3151 "htmlParseExternalID: SYSTEM, no URI\n", NULL, NULL); 3152 } 3153 } else if ((UPPER == 'P') && (UPP(1) == 'U') && 3154 (UPP(2) == 'B') && (UPP(3) == 'L') && 3155 (UPP(4) == 'I') && (UPP(5) == 'C')) { 3156 SKIP(6); 3157 if (!IS_BLANK_CH(CUR)) { 3158 htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED, 3159 "Space required after 'PUBLIC'\n", NULL, NULL); 3160 } 3161 SKIP_BLANKS; 3162 *publicID = htmlParsePubidLiteral(ctxt); 3163 if (*publicID == NULL) { 3164 htmlParseErr(ctxt, XML_ERR_PUBID_REQUIRED, 3165 "htmlParseExternalID: PUBLIC, no Public Identifier\n", 3166 NULL, NULL); 3167 } 3168 SKIP_BLANKS; 3169 if ((CUR == '"') || (CUR == '\'')) { 3170 URI = htmlParseSystemLiteral(ctxt); 3171 } 3172 } 3173 return(URI); 3174 } 3175 3176 /** 3177 * xmlParsePI: 3178 * @ctxt: an XML parser context 3179 * 3180 * parse an XML Processing Instruction. 3181 * 3182 * [16] PI ::= '<?' PITarget (S (Char* - (Char* '?>' Char*)))? '?>' 3183 */ 3184 static void 3185 htmlParsePI(htmlParserCtxtPtr ctxt) { 3186 xmlChar *buf = NULL; 3187 int len = 0; 3188 int size = HTML_PARSER_BUFFER_SIZE; 3189 int cur, l; 3190 const xmlChar *target; 3191 xmlParserInputState state; 3192 int count = 0; 3193 3194 if ((RAW == '<') && (NXT(1) == '?')) { 3195 state = ctxt->instate; 3196 ctxt->instate = XML_PARSER_PI; 3197 /* 3198 * this is a Processing Instruction. 
3199 */ 3200 SKIP(2); 3201 SHRINK; 3202 3203 /* 3204 * Parse the target name and check for special support like 3205 * namespace. 3206 */ 3207 target = htmlParseName(ctxt); 3208 if (target != NULL) { 3209 if (RAW == '>') { 3210 SKIP(1); 3211 3212 /* 3213 * SAX: PI detected. 3214 */ 3215 if ((ctxt->sax) && (!ctxt->disableSAX) && 3216 (ctxt->sax->processingInstruction != NULL)) 3217 ctxt->sax->processingInstruction(ctxt->userData, 3218 target, NULL); 3219 ctxt->instate = state; 3220 return; 3221 } 3222 buf = (xmlChar *) xmlMallocAtomic(size * sizeof(xmlChar)); 3223 if (buf == NULL) { 3224 htmlErrMemory(ctxt, NULL); 3225 ctxt->instate = state; 3226 return; 3227 } 3228 cur = CUR; 3229 if (!IS_BLANK(cur)) { 3230 htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED, 3231 "ParsePI: PI %s space expected\n", target, NULL); 3232 } 3233 SKIP_BLANKS; 3234 cur = CUR_CHAR(l); 3235 while (IS_CHAR(cur) && (cur != '>')) { 3236 if (len + 5 >= size) { 3237 xmlChar *tmp; 3238 3239 size *= 2; 3240 tmp = (xmlChar *) xmlRealloc(buf, size * sizeof(xmlChar)); 3241 if (tmp == NULL) { 3242 htmlErrMemory(ctxt, NULL); 3243 xmlFree(buf); 3244 ctxt->instate = state; 3245 return; 3246 } 3247 buf = tmp; 3248 } 3249 count++; 3250 if (count > 50) { 3251 GROW; 3252 count = 0; 3253 } 3254 COPY_BUF(l,buf,len,cur); 3255 NEXTL(l); 3256 cur = CUR_CHAR(l); 3257 if (cur == 0) { 3258 SHRINK; 3259 GROW; 3260 cur = CUR_CHAR(l); 3261 } 3262 } 3263 buf[len] = 0; 3264 if (cur != '>') { 3265 htmlParseErr(ctxt, XML_ERR_PI_NOT_FINISHED, 3266 "ParsePI: PI %s never end ...\n", target, NULL); 3267 } else { 3268 SKIP(1); 3269 3270 /* 3271 * SAX: PI detected. 
3272 */ 3273 if ((ctxt->sax) && (!ctxt->disableSAX) && 3274 (ctxt->sax->processingInstruction != NULL)) 3275 ctxt->sax->processingInstruction(ctxt->userData, 3276 target, buf); 3277 } 3278 xmlFree(buf); 3279 } else { 3280 htmlParseErr(ctxt, XML_ERR_PI_NOT_STARTED, 3281 "PI is not started correctly", NULL, NULL); 3282 } 3283 ctxt->instate = state; 3284 } 3285 } 3286 3287 /** 3288 * htmlParseComment: 3289 * @ctxt: an HTML parser context 3290 * 3291 * Parse an XML (SGML) comment <!-- .... --> 3292 * 3293 * [15] Comment ::= '<!--' ((Char - '-') | ('-' (Char - '-')))* '-->' 3294 */ 3295 static void 3296 htmlParseComment(htmlParserCtxtPtr ctxt) { 3297 xmlChar *buf = NULL; 3298 int len; 3299 int size = HTML_PARSER_BUFFER_SIZE; 3300 int q, ql; 3301 int r, rl; 3302 int cur, l; 3303 xmlParserInputState state; 3304 3305 /* 3306 * Check that there is a comment right here. 3307 */ 3308 if ((RAW != '<') || (NXT(1) != '!') || 3309 (NXT(2) != '-') || (NXT(3) != '-')) return; 3310 3311 state = ctxt->instate; 3312 ctxt->instate = XML_PARSER_COMMENT; 3313 SHRINK; 3314 SKIP(4); 3315 buf = (xmlChar *) xmlMallocAtomic(size * sizeof(xmlChar)); 3316 if (buf == NULL) { 3317 htmlErrMemory(ctxt, "buffer allocation failed\n"); 3318 ctxt->instate = state; 3319 return; 3320 } 3321 len = 0; 3322 buf[len] = 0; 3323 q = CUR_CHAR(ql); 3324 if (!IS_CHAR(q)) 3325 goto unfinished; 3326 NEXTL(ql); 3327 r = CUR_CHAR(rl); 3328 if (!IS_CHAR(r)) 3329 goto unfinished; 3330 NEXTL(rl); 3331 cur = CUR_CHAR(l); 3332 while (IS_CHAR(cur) && 3333 ((cur != '>') || 3334 (r != '-') || (q != '-'))) { 3335 if (len + 5 >= size) { 3336 xmlChar *tmp; 3337 3338 size *= 2; 3339 tmp = (xmlChar *) xmlRealloc(buf, size * sizeof(xmlChar)); 3340 if (tmp == NULL) { 3341 xmlFree(buf); 3342 htmlErrMemory(ctxt, "growing buffer failed\n"); 3343 ctxt->instate = state; 3344 return; 3345 } 3346 buf = tmp; 3347 } 3348 COPY_BUF(ql,buf,len,q); 3349 q = r; 3350 ql = rl; 3351 r = cur; 3352 rl = l; 3353 NEXTL(l); 3354 cur = CUR_CHAR(l); 3355 
if (cur == 0) { 3356 SHRINK; 3357 GROW; 3358 cur = CUR_CHAR(l); 3359 } 3360 } 3361 buf[len] = 0; 3362 if (IS_CHAR(cur)) { 3363 NEXT; 3364 if ((ctxt->sax != NULL) && (ctxt->sax->comment != NULL) && 3365 (!ctxt->disableSAX)) 3366 ctxt->sax->comment(ctxt->userData, buf); 3367 xmlFree(buf); 3368 ctxt->instate = state; 3369 return; 3370 } 3371 3372 unfinished: 3373 htmlParseErr(ctxt, XML_ERR_COMMENT_NOT_FINISHED, 3374 "Comment not terminated \n<!--%.50s\n", buf, NULL); 3375 xmlFree(buf); 3376 } 3377 3378 /** 3379 * htmlParseCharRef: 3380 * @ctxt: an HTML parser context 3381 * 3382 * parse Reference declarations 3383 * 3384 * [66] CharRef ::= '&#' [0-9]+ ';' | 3385 * '&#x' [0-9a-fA-F]+ ';' 3386 * 3387 * Returns the value parsed (as an int) 3388 */ 3389 int 3390 htmlParseCharRef(htmlParserCtxtPtr ctxt) { 3391 int val = 0; 3392 3393 if ((ctxt == NULL) || (ctxt->input == NULL)) { 3394 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR, 3395 "htmlParseCharRef: context error\n", 3396 NULL, NULL); 3397 return(0); 3398 } 3399 if ((CUR == '&') && (NXT(1) == '#') && 3400 ((NXT(2) == 'x') || NXT(2) == 'X')) { 3401 SKIP(3); 3402 while (CUR != ';') { 3403 if ((CUR >= '0') && (CUR <= '9')) 3404 val = val * 16 + (CUR - '0'); 3405 else if ((CUR >= 'a') && (CUR <= 'f')) 3406 val = val * 16 + (CUR - 'a') + 10; 3407 else if ((CUR >= 'A') && (CUR <= 'F')) 3408 val = val * 16 + (CUR - 'A') + 10; 3409 else { 3410 htmlParseErr(ctxt, XML_ERR_INVALID_HEX_CHARREF, 3411 "htmlParseCharRef: missing semicolon\n", 3412 NULL, NULL); 3413 break; 3414 } 3415 NEXT; 3416 } 3417 if (CUR == ';') 3418 NEXT; 3419 } else if ((CUR == '&') && (NXT(1) == '#')) { 3420 SKIP(2); 3421 while (CUR != ';') { 3422 if ((CUR >= '0') && (CUR <= '9')) 3423 val = val * 10 + (CUR - '0'); 3424 else { 3425 htmlParseErr(ctxt, XML_ERR_INVALID_DEC_CHARREF, 3426 "htmlParseCharRef: missing semicolon\n", 3427 NULL, NULL); 3428 break; 3429 } 3430 NEXT; 3431 } 3432 if (CUR == ';') 3433 NEXT; 3434 } else { 3435 htmlParseErr(ctxt, 
XML_ERR_INVALID_CHARREF, 3436 "htmlParseCharRef: invalid value\n", NULL, NULL); 3437 } 3438 /* 3439 * Check the value IS_CHAR ... 3440 */ 3441 if (IS_CHAR(val)) { 3442 return(val); 3443 } else { 3444 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR, 3445 "htmlParseCharRef: invalid xmlChar value %d\n", 3446 val); 3447 } 3448 return(0); 3449 } 3450 3451 3452 /** 3453 * htmlParseDocTypeDecl: 3454 * @ctxt: an HTML parser context 3455 * 3456 * parse a DOCTYPE declaration 3457 * 3458 * [28] doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S? 3459 * ('[' (markupdecl | PEReference | S)* ']' S?)? '>' 3460 */ 3461 3462 static void 3463 htmlParseDocTypeDecl(htmlParserCtxtPtr ctxt) { 3464 const xmlChar *name; 3465 xmlChar *ExternalID = NULL; 3466 xmlChar *URI = NULL; 3467 3468 /* 3469 * We know that '<!DOCTYPE' has been detected. 3470 */ 3471 SKIP(9); 3472 3473 SKIP_BLANKS; 3474 3475 /* 3476 * Parse the DOCTYPE name. 3477 */ 3478 name = htmlParseName(ctxt); 3479 if (name == NULL) { 3480 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED, 3481 "htmlParseDocTypeDecl : no DOCTYPE name !\n", 3482 NULL, NULL); 3483 } 3484 /* 3485 * Check that upper(name) == "HTML" !!!!!!!!!!!!! 3486 */ 3487 3488 SKIP_BLANKS; 3489 3490 /* 3491 * Check for SystemID and ExternalID 3492 */ 3493 URI = htmlParseExternalID(ctxt, &ExternalID); 3494 SKIP_BLANKS; 3495 3496 /* 3497 * We should be at the end of the DOCTYPE declaration. 3498 */ 3499 if (CUR != '>') { 3500 htmlParseErr(ctxt, XML_ERR_DOCTYPE_NOT_FINISHED, 3501 "DOCTYPE improperly terminated\n", NULL, NULL); 3502 /* We shouldn't try to resynchronize ... 
*/ 3503 } 3504 NEXT; 3505 3506 /* 3507 * Create or update the document accordingly to the DOCTYPE 3508 */ 3509 if ((ctxt->sax != NULL) && (ctxt->sax->internalSubset != NULL) && 3510 (!ctxt->disableSAX)) 3511 ctxt->sax->internalSubset(ctxt->userData, name, ExternalID, URI); 3512 3513 /* 3514 * Cleanup, since we don't use all those identifiers 3515 */ 3516 if (URI != NULL) xmlFree(URI); 3517 if (ExternalID != NULL) xmlFree(ExternalID); 3518 } 3519 3520 /** 3521 * htmlParseAttribute: 3522 * @ctxt: an HTML parser context 3523 * @value: a xmlChar ** used to store the value of the attribute 3524 * 3525 * parse an attribute 3526 * 3527 * [41] Attribute ::= Name Eq AttValue 3528 * 3529 * [25] Eq ::= S? '=' S? 3530 * 3531 * With namespace: 3532 * 3533 * [NS 11] Attribute ::= QName Eq AttValue 3534 * 3535 * Also the case QName == xmlns:??? is handled independently as a namespace 3536 * definition. 3537 * 3538 * Returns the attribute name, and the value in *value. 3539 */ 3540 3541 static const xmlChar * 3542 htmlParseAttribute(htmlParserCtxtPtr ctxt, xmlChar **value) { 3543 const xmlChar *name; 3544 xmlChar *val = NULL; 3545 3546 *value = NULL; 3547 name = htmlParseHTMLName(ctxt); 3548 if (name == NULL) { 3549 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED, 3550 "error parsing attribute name\n", NULL, NULL); 3551 return(NULL); 3552 } 3553 3554 /* 3555 * read the value 3556 */ 3557 SKIP_BLANKS; 3558 if (CUR == '=') { 3559 NEXT; 3560 SKIP_BLANKS; 3561 val = htmlParseAttValue(ctxt); 3562 } 3563 3564 *value = val; 3565 return(name); 3566 } 3567 3568 /** 3569 * htmlCheckEncodingDirect: 3570 * @ctxt: an HTML parser context 3571 * @attvalue: the attribute value 3572 * 3573 * Checks an attribute value to detect 3574 * the encoding 3575 * If a new encoding is detected the parser is switched to decode 3576 * it and pass UTF8 3577 */ 3578 static void 3579 htmlCheckEncodingDirect(htmlParserCtxtPtr ctxt, const xmlChar *encoding) { 3580 3581 if ((ctxt == NULL) || (encoding == NULL) || 3582 
(ctxt->options & HTML_PARSE_IGNORE_ENC)) 3583 return; 3584 3585 /* do not change encoding */ 3586 if (ctxt->input->encoding != NULL) 3587 return; 3588 3589 if (encoding != NULL) { 3590 xmlCharEncoding enc; 3591 xmlCharEncodingHandlerPtr handler; 3592 3593 while ((*encoding == ' ') || (*encoding == '\t')) encoding++; 3594 3595 if (ctxt->input->encoding != NULL) 3596 xmlFree((xmlChar *) ctxt->input->encoding); 3597 ctxt->input->encoding = xmlStrdup(encoding); 3598 3599 enc = xmlParseCharEncoding((const char *) encoding); 3600 /* 3601 * registered set of known encodings 3602 */ 3603 if (enc != XML_CHAR_ENCODING_ERROR) { 3604 if (((enc == XML_CHAR_ENCODING_UTF16LE) || 3605 (enc == XML_CHAR_ENCODING_UTF16BE) || 3606 (enc == XML_CHAR_ENCODING_UCS4LE) || 3607 (enc == XML_CHAR_ENCODING_UCS4BE)) && 3608 (ctxt->input->buf != NULL) && 3609 (ctxt->input->buf->encoder == NULL)) { 3610 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING, 3611 "htmlCheckEncoding: wrong encoding meta\n", 3612 NULL, NULL); 3613 } else { 3614 xmlSwitchEncoding(ctxt, enc); 3615 } 3616 ctxt->charset = XML_CHAR_ENCODING_UTF8; 3617 } else { 3618 /* 3619 * fallback for unknown encodings 3620 */ 3621 handler = xmlFindCharEncodingHandler((const char *) encoding); 3622 if (handler != NULL) { 3623 xmlSwitchToEncoding(ctxt, handler); 3624 ctxt->charset = XML_CHAR_ENCODING_UTF8; 3625 } else { 3626 htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING, 3627 "htmlCheckEncoding: unknown encoding %s\n", 3628 encoding, NULL); 3629 } 3630 } 3631 3632 if ((ctxt->input->buf != NULL) && 3633 (ctxt->input->buf->encoder != NULL) && 3634 (ctxt->input->buf->raw != NULL) && 3635 (ctxt->input->buf->buffer != NULL)) { 3636 int nbchars; 3637 int processed; 3638 3639 /* 3640 * convert as much as possible to the parser reading buffer. 
3641 */ 3642 processed = ctxt->input->cur - ctxt->input->base; 3643 xmlBufShrink(ctxt->input->buf->buffer, processed); 3644 nbchars = xmlCharEncInput(ctxt->input->buf, 1); 3645 xmlBufResetInput(ctxt->input->buf->buffer, ctxt->input); 3646 if (nbchars < 0) { 3647 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING, 3648 "htmlCheckEncoding: encoder error\n", 3649 NULL, NULL); 3650 } 3651 } 3652 } 3653 } 3654 3655 /** 3656 * htmlCheckEncoding: 3657 * @ctxt: an HTML parser context 3658 * @attvalue: the attribute value 3659 * 3660 * Checks an http-equiv attribute from a Meta tag to detect 3661 * the encoding 3662 * If a new encoding is detected the parser is switched to decode 3663 * it and pass UTF8 3664 */ 3665 static void 3666 htmlCheckEncoding(htmlParserCtxtPtr ctxt, const xmlChar *attvalue) { 3667 const xmlChar *encoding; 3668 3669 if (!attvalue) 3670 return; 3671 3672 encoding = xmlStrcasestr(attvalue, BAD_CAST"charset"); 3673 if (encoding != NULL) { 3674 encoding += 7; 3675 } 3676 /* 3677 * skip blank 3678 */ 3679 if (encoding && IS_BLANK_CH(*encoding)) 3680 encoding = xmlStrcasestr(attvalue, BAD_CAST"="); 3681 if (encoding && *encoding == '=') { 3682 encoding ++; 3683 htmlCheckEncodingDirect(ctxt, encoding); 3684 } 3685 } 3686 3687 /** 3688 * htmlCheckMeta: 3689 * @ctxt: an HTML parser context 3690 * @atts: the attributes values 3691 * 3692 * Checks an attributes from a Meta tag 3693 */ 3694 static void 3695 htmlCheckMeta(htmlParserCtxtPtr ctxt, const xmlChar **atts) { 3696 int i; 3697 const xmlChar *att, *value; 3698 int http = 0; 3699 const xmlChar *content = NULL; 3700 3701 if ((ctxt == NULL) || (atts == NULL)) 3702 return; 3703 3704 i = 0; 3705 att = atts[i++]; 3706 while (att != NULL) { 3707 value = atts[i++]; 3708 if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"http-equiv")) 3709 && (!xmlStrcasecmp(value, BAD_CAST"Content-Type"))) 3710 http = 1; 3711 else if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"charset"))) 3712 htmlCheckEncodingDirect(ctxt, 
value); 3713 else if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"content"))) 3714 content = value; 3715 att = atts[i++]; 3716 } 3717 if ((http) && (content != NULL)) 3718 htmlCheckEncoding(ctxt, content); 3719 3720 } 3721 3722 /** 3723 * htmlParseStartTag: 3724 * @ctxt: an HTML parser context 3725 * 3726 * parse a start of tag either for rule element or 3727 * EmptyElement. In both case we don't parse the tag closing chars. 3728 * 3729 * [40] STag ::= '<' Name (S Attribute)* S? '>' 3730 * 3731 * [44] EmptyElemTag ::= '<' Name (S Attribute)* S? '/>' 3732 * 3733 * With namespace: 3734 * 3735 * [NS 8] STag ::= '<' QName (S Attribute)* S? '>' 3736 * 3737 * [NS 10] EmptyElement ::= '<' QName (S Attribute)* S? '/>' 3738 * 3739 * Returns 0 in case of success, -1 in case of error and 1 if discarded 3740 */ 3741 3742 static int 3743 htmlParseStartTag(htmlParserCtxtPtr ctxt) { 3744 const xmlChar *name; 3745 const xmlChar *attname; 3746 xmlChar *attvalue; 3747 const xmlChar **atts; 3748 int nbatts = 0; 3749 int maxatts; 3750 int meta = 0; 3751 int i; 3752 int discardtag = 0; 3753 3754 if ((ctxt == NULL) || (ctxt->input == NULL)) { 3755 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR, 3756 "htmlParseStartTag: context error\n", NULL, NULL); 3757 return -1; 3758 } 3759 if (ctxt->instate == XML_PARSER_EOF) 3760 return(-1); 3761 if (CUR != '<') return -1; 3762 NEXT; 3763 3764 atts = ctxt->atts; 3765 maxatts = ctxt->maxatts; 3766 3767 GROW; 3768 name = htmlParseHTMLName(ctxt); 3769 if (name == NULL) { 3770 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED, 3771 "htmlParseStartTag: invalid element name\n", 3772 NULL, NULL); 3773 /* if recover preserve text on classic misconstructs */ 3774 if ((ctxt->recovery) && ((IS_BLANK_CH(CUR)) || (CUR == '<') || 3775 (CUR == '=') || (CUR == '>') || (((CUR >= '0') && (CUR <= '9'))))) { 3776 htmlParseCharDataInternal(ctxt, '<'); 3777 return(-1); 3778 } 3779 3780 3781 /* Dump the bogus tag like browsers do */ 3782 while ((IS_CHAR_CH(CUR)) && (CUR != '>') 
&& 3783 (ctxt->instate != XML_PARSER_EOF)) 3784 NEXT; 3785 return -1; 3786 } 3787 if (xmlStrEqual(name, BAD_CAST"meta")) 3788 meta = 1; 3789 3790 /* 3791 * Check for auto-closure of HTML elements. 3792 */ 3793 htmlAutoClose(ctxt, name); 3794 3795 /* 3796 * Check for implied HTML elements. 3797 */ 3798 htmlCheckImplied(ctxt, name); 3799 3800 /* 3801 * Avoid html at any level > 0, head at any level != 1 3802 * or any attempt to recurse body 3803 */ 3804 if ((ctxt->nameNr > 0) && (xmlStrEqual(name, BAD_CAST"html"))) { 3805 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR, 3806 "htmlParseStartTag: misplaced <html> tag\n", 3807 name, NULL); 3808 discardtag = 1; 3809 ctxt->depth++; 3810 } 3811 if ((ctxt->nameNr != 1) && 3812 (xmlStrEqual(name, BAD_CAST"head"))) { 3813 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR, 3814 "htmlParseStartTag: misplaced <head> tag\n", 3815 name, NULL); 3816 discardtag = 1; 3817 ctxt->depth++; 3818 } 3819 if (xmlStrEqual(name, BAD_CAST"body")) { 3820 int indx; 3821 for (indx = 0;indx < ctxt->nameNr;indx++) { 3822 if (xmlStrEqual(ctxt->nameTab[indx], BAD_CAST"body")) { 3823 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR, 3824 "htmlParseStartTag: misplaced <body> tag\n", 3825 name, NULL); 3826 discardtag = 1; 3827 ctxt->depth++; 3828 } 3829 } 3830 } 3831 3832 /* 3833 * Now parse the attributes, it ends up with the ending 3834 * 3835 * (S Attribute)* S? 
3836 */ 3837 SKIP_BLANKS; 3838 while ((IS_CHAR_CH(CUR)) && 3839 (CUR != '>') && 3840 ((CUR != '/') || (NXT(1) != '>'))) { 3841 long cons = ctxt->nbChars; 3842 3843 GROW; 3844 attname = htmlParseAttribute(ctxt, &attvalue); 3845 if (attname != NULL) { 3846 3847 /* 3848 * Well formedness requires at most one declaration of an attribute 3849 */ 3850 for (i = 0; i < nbatts;i += 2) { 3851 if (xmlStrEqual(atts[i], attname)) { 3852 htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_REDEFINED, 3853 "Attribute %s redefined\n", attname, NULL); 3854 if (attvalue != NULL) 3855 xmlFree(attvalue); 3856 goto failed; 3857 } 3858 } 3859 3860 /* 3861 * Add the pair to atts 3862 */ 3863 if (atts == NULL) { 3864 maxatts = 22; /* allow for 10 attrs by default */ 3865 atts = (const xmlChar **) 3866 xmlMalloc(maxatts * sizeof(xmlChar *)); 3867 if (atts == NULL) { 3868 htmlErrMemory(ctxt, NULL); 3869 if (attvalue != NULL) 3870 xmlFree(attvalue); 3871 goto failed; 3872 } 3873 ctxt->atts = atts; 3874 ctxt->maxatts = maxatts; 3875 } else if (nbatts + 4 > maxatts) { 3876 const xmlChar **n; 3877 3878 maxatts *= 2; 3879 n = (const xmlChar **) xmlRealloc((void *) atts, 3880 maxatts * sizeof(const xmlChar *)); 3881 if (n == NULL) { 3882 htmlErrMemory(ctxt, NULL); 3883 if (attvalue != NULL) 3884 xmlFree(attvalue); 3885 goto failed; 3886 } 3887 atts = n; 3888 ctxt->atts = atts; 3889 ctxt->maxatts = maxatts; 3890 } 3891 atts[nbatts++] = attname; 3892 atts[nbatts++] = attvalue; 3893 atts[nbatts] = NULL; 3894 atts[nbatts + 1] = NULL; 3895 } 3896 else { 3897 if (attvalue != NULL) 3898 xmlFree(attvalue); 3899 /* Dump the bogus attribute string up to the next blank or 3900 * the end of the tag. 
*/ 3901 while ((IS_CHAR_CH(CUR)) && 3902 !(IS_BLANK_CH(CUR)) && (CUR != '>') && 3903 ((CUR != '/') || (NXT(1) != '>'))) 3904 NEXT; 3905 } 3906 3907 failed: 3908 SKIP_BLANKS; 3909 if (cons == ctxt->nbChars) { 3910 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR, 3911 "htmlParseStartTag: problem parsing attributes\n", 3912 NULL, NULL); 3913 break; 3914 } 3915 } 3916 3917 /* 3918 * Handle specific association to the META tag 3919 */ 3920 if (meta && (nbatts != 0)) 3921 htmlCheckMeta(ctxt, atts); 3922 3923 /* 3924 * SAX: Start of Element ! 3925 */ 3926 if (!discardtag) { 3927 htmlnamePush(ctxt, name); 3928 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL)) { 3929 if (nbatts != 0) 3930 ctxt->sax->startElement(ctxt->userData, name, atts); 3931 else 3932 ctxt->sax->startElement(ctxt->userData, name, NULL); 3933 } 3934 } 3935 3936 if (atts != NULL) { 3937 for (i = 1;i < nbatts;i += 2) { 3938 if (atts[i] != NULL) 3939 xmlFree((xmlChar *) atts[i]); 3940 } 3941 } 3942 3943 return(discardtag); 3944 } 3945 3946 /** 3947 * htmlParseEndTag: 3948 * @ctxt: an HTML parser context 3949 * 3950 * parse an end of tag 3951 * 3952 * [42] ETag ::= '</' Name S? '>' 3953 * 3954 * With namespace 3955 * 3956 * [NS 9] ETag ::= '</' QName S? '>' 3957 * 3958 * Returns 1 if the current level should be closed. 3959 */ 3960 3961 static int 3962 htmlParseEndTag(htmlParserCtxtPtr ctxt) 3963 { 3964 const xmlChar *name; 3965 const xmlChar *oldname; 3966 int i, ret; 3967 3968 if ((CUR != '<') || (NXT(1) != '/')) { 3969 htmlParseErr(ctxt, XML_ERR_LTSLASH_REQUIRED, 3970 "htmlParseEndTag: '</' not found\n", NULL, NULL); 3971 return (0); 3972 } 3973 SKIP(2); 3974 3975 name = htmlParseHTMLName(ctxt); 3976 if (name == NULL) 3977 return (0); 3978 /* 3979 * We should definitely be at the ending "S? 
'>'" part 3980 */ 3981 SKIP_BLANKS; 3982 if ((!IS_CHAR_CH(CUR)) || (CUR != '>')) { 3983 htmlParseErr(ctxt, XML_ERR_GT_REQUIRED, 3984 "End tag : expected '>'\n", NULL, NULL); 3985 if (ctxt->recovery) { 3986 /* 3987 * We're not at the ending > !! 3988 * Error, unless in recover mode where we search forwards 3989 * until we find a > 3990 */ 3991 while (CUR != '\0' && CUR != '>') NEXT; 3992 NEXT; 3993 } 3994 } else 3995 NEXT; 3996 3997 /* 3998 * if we ignored misplaced tags in htmlParseStartTag don't pop them 3999 * out now. 4000 */ 4001 if ((ctxt->depth > 0) && 4002 (xmlStrEqual(name, BAD_CAST "html") || 4003 xmlStrEqual(name, BAD_CAST "body") || 4004 xmlStrEqual(name, BAD_CAST "head"))) { 4005 ctxt->depth--; 4006 return (0); 4007 } 4008 4009 /* 4010 * If the name read is not one of the element in the parsing stack 4011 * then return, it's just an error. 4012 */ 4013 for (i = (ctxt->nameNr - 1); i >= 0; i--) { 4014 if (xmlStrEqual(name, ctxt->nameTab[i])) 4015 break; 4016 } 4017 if (i < 0) { 4018 htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH, 4019 "Unexpected end tag : %s\n", name, NULL); 4020 return (0); 4021 } 4022 4023 4024 /* 4025 * Check for auto-closure of HTML elements. 4026 */ 4027 4028 htmlAutoCloseOnClose(ctxt, name); 4029 4030 /* 4031 * Well formedness constraints, opening and closing must match. 4032 * With the exception that the autoclose may have popped stuff out 4033 * of the stack. 
4034 */ 4035 if (!xmlStrEqual(name, ctxt->name)) { 4036 if ((ctxt->name != NULL) && (!xmlStrEqual(ctxt->name, name))) { 4037 htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH, 4038 "Opening and ending tag mismatch: %s and %s\n", 4039 name, ctxt->name); 4040 } 4041 } 4042 4043 /* 4044 * SAX: End of Tag 4045 */ 4046 oldname = ctxt->name; 4047 if ((oldname != NULL) && (xmlStrEqual(oldname, name))) { 4048 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL)) 4049 ctxt->sax->endElement(ctxt->userData, name); 4050 htmlNodeInfoPop(ctxt); 4051 htmlnamePop(ctxt); 4052 ret = 1; 4053 } else { 4054 ret = 0; 4055 } 4056 4057 return (ret); 4058 } 4059 4060 4061 /** 4062 * htmlParseReference: 4063 * @ctxt: an HTML parser context 4064 * 4065 * parse and handle entity references in content, 4066 * this will end-up in a call to character() since this is either a 4067 * CharRef, or a predefined entity. 4068 */ 4069 static void 4070 htmlParseReference(htmlParserCtxtPtr ctxt) { 4071 const htmlEntityDesc * ent; 4072 xmlChar out[6]; 4073 const xmlChar *name; 4074 if (CUR != '&') return; 4075 4076 if (NXT(1) == '#') { 4077 unsigned int c; 4078 int bits, i = 0; 4079 4080 c = htmlParseCharRef(ctxt); 4081 if (c == 0) 4082 return; 4083 4084 if (c < 0x80) { out[i++]= c; bits= -6; } 4085 else if (c < 0x800) { out[i++]=((c >> 6) & 0x1F) | 0xC0; bits= 0; } 4086 else if (c < 0x10000) { out[i++]=((c >> 12) & 0x0F) | 0xE0; bits= 6; } 4087 else { out[i++]=((c >> 18) & 0x07) | 0xF0; bits= 12; } 4088 4089 for ( ; bits >= 0; bits-= 6) { 4090 out[i++]= ((c >> bits) & 0x3F) | 0x80; 4091 } 4092 out[i] = 0; 4093 4094 htmlCheckParagraph(ctxt); 4095 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL)) 4096 ctxt->sax->characters(ctxt->userData, out, i); 4097 } else { 4098 ent = htmlParseEntityRef(ctxt, &name); 4099 if (name == NULL) { 4100 htmlCheckParagraph(ctxt); 4101 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL)) 4102 ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1); 4103 return; 
4104 } 4105 if ((ent == NULL) || !(ent->value > 0)) { 4106 htmlCheckParagraph(ctxt); 4107 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL)) { 4108 ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1); 4109 ctxt->sax->characters(ctxt->userData, name, xmlStrlen(name)); 4110 /* ctxt->sax->characters(ctxt->userData, BAD_CAST ";", 1); */ 4111 } 4112 } else { 4113 unsigned int c; 4114 int bits, i = 0; 4115 4116 c = ent->value; 4117 if (c < 0x80) 4118 { out[i++]= c; bits= -6; } 4119 else if (c < 0x800) 4120 { out[i++]=((c >> 6) & 0x1F) | 0xC0; bits= 0; } 4121 else if (c < 0x10000) 4122 { out[i++]=((c >> 12) & 0x0F) | 0xE0; bits= 6; } 4123 else 4124 { out[i++]=((c >> 18) & 0x07) | 0xF0; bits= 12; } 4125 4126 for ( ; bits >= 0; bits-= 6) { 4127 out[i++]= ((c >> bits) & 0x3F) | 0x80; 4128 } 4129 out[i] = 0; 4130 4131 htmlCheckParagraph(ctxt); 4132 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL)) 4133 ctxt->sax->characters(ctxt->userData, out, i); 4134 } 4135 } 4136 } 4137 4138 /** 4139 * htmlParseContent: 4140 * @ctxt: an HTML parser context 4141 * 4142 * Parse a content: comment, sub-element, reference or text. 4143 * Kept for compatibility with old code 4144 */ 4145 4146 static void 4147 htmlParseContent(htmlParserCtxtPtr ctxt) { 4148 xmlChar *currentNode; 4149 int depth; 4150 const xmlChar *name; 4151 4152 currentNode = xmlStrdup(ctxt->name); 4153 depth = ctxt->nameNr; 4154 while (1) { 4155 long cons = ctxt->nbChars; 4156 4157 GROW; 4158 4159 if (ctxt->instate == XML_PARSER_EOF) 4160 break; 4161 4162 /* 4163 * Our tag or one of it's parent or children is ending. 
4164 */ 4165 if ((CUR == '<') && (NXT(1) == '/')) { 4166 if (htmlParseEndTag(ctxt) && 4167 ((currentNode != NULL) || (ctxt->nameNr == 0))) { 4168 if (currentNode != NULL) 4169 xmlFree(currentNode); 4170 return; 4171 } 4172 continue; /* while */ 4173 } 4174 4175 else if ((CUR == '<') && 4176 ((IS_ASCII_LETTER(NXT(1))) || 4177 (NXT(1) == '_') || (NXT(1) == ':'))) { 4178 name = htmlParseHTMLName_nonInvasive(ctxt); 4179 if (name == NULL) { 4180 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED, 4181 "htmlParseStartTag: invalid element name\n", 4182 NULL, NULL); 4183 /* Dump the bogus tag like browsers do */ 4184 while ((IS_CHAR_CH(CUR)) && (CUR != '>')) 4185 NEXT; 4186 4187 if (currentNode != NULL) 4188 xmlFree(currentNode); 4189 return; 4190 } 4191 4192 if (ctxt->name != NULL) { 4193 if (htmlCheckAutoClose(name, ctxt->name) == 1) { 4194 htmlAutoClose(ctxt, name); 4195 continue; 4196 } 4197 } 4198 } 4199 4200 /* 4201 * Has this node been popped out during parsing of 4202 * the next element 4203 */ 4204 if ((ctxt->nameNr > 0) && (depth >= ctxt->nameNr) && 4205 (!xmlStrEqual(currentNode, ctxt->name))) 4206 { 4207 if (currentNode != NULL) xmlFree(currentNode); 4208 return; 4209 } 4210 4211 if ((CUR != 0) && ((xmlStrEqual(currentNode, BAD_CAST"script")) || 4212 (xmlStrEqual(currentNode, BAD_CAST"style")))) { 4213 /* 4214 * Handle SCRIPT/STYLE separately 4215 */ 4216 htmlParseScript(ctxt); 4217 } else { 4218 /* 4219 * Sometimes DOCTYPE arrives in the middle of the document 4220 */ 4221 if ((CUR == '<') && (NXT(1) == '!') && 4222 (UPP(2) == 'D') && (UPP(3) == 'O') && 4223 (UPP(4) == 'C') && (UPP(5) == 'T') && 4224 (UPP(6) == 'Y') && (UPP(7) == 'P') && 4225 (UPP(8) == 'E')) { 4226 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR, 4227 "Misplaced DOCTYPE declaration\n", 4228 BAD_CAST "DOCTYPE" , NULL); 4229 htmlParseDocTypeDecl(ctxt); 4230 } 4231 4232 /* 4233 * First case : a comment 4234 */ 4235 if ((CUR == '<') && (NXT(1) == '!') && 4236 (NXT(2) == '-') && (NXT(3) == '-')) { 4237 
htmlParseComment(ctxt); 4238 } 4239 4240 /* 4241 * Second case : a Processing Instruction. 4242 */ 4243 else if ((CUR == '<') && (NXT(1) == '?')) { 4244 htmlParsePI(ctxt); 4245 } 4246 4247 /* 4248 * Third case : a sub-element. 4249 */ 4250 else if (CUR == '<') { 4251 htmlParseElement(ctxt); 4252 } 4253 4254 /* 4255 * Fourth case : a reference. If if has not been resolved, 4256 * parsing returns it's Name, create the node 4257 */ 4258 else if (CUR == '&') { 4259 htmlParseReference(ctxt); 4260 } 4261 4262 /* 4263 * Fifth case : end of the resource 4264 */ 4265 else if (CUR == 0) { 4266 htmlAutoCloseOnEnd(ctxt); 4267 break; 4268 } 4269 4270 /* 4271 * Last case, text. Note that References are handled directly. 4272 */ 4273 else { 4274 htmlParseCharData(ctxt); 4275 } 4276 4277 if (cons == ctxt->nbChars) { 4278 if (ctxt->node != NULL) { 4279 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR, 4280 "detected an error in element content\n", 4281 NULL, NULL); 4282 } 4283 break; 4284 } 4285 } 4286 GROW; 4287 } 4288 if (currentNode != NULL) xmlFree(currentNode); 4289 } 4290 4291 /** 4292 * htmlParseElement: 4293 * @ctxt: an HTML parser context 4294 * 4295 * parse an HTML element, this is highly recursive 4296 * this is kept for compatibility with previous code versions 4297 * 4298 * [39] element ::= EmptyElemTag | STag content ETag 4299 * 4300 * [41] Attribute ::= Name Eq AttValue 4301 */ 4302 4303 void 4304 htmlParseElement(htmlParserCtxtPtr ctxt) { 4305 const xmlChar *name; 4306 xmlChar *currentNode = NULL; 4307 const htmlElemDesc * info; 4308 htmlParserNodeInfo node_info; 4309 int failed; 4310 int depth; 4311 const xmlChar *oldptr; 4312 4313 if ((ctxt == NULL) || (ctxt->input == NULL)) { 4314 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR, 4315 "htmlParseElement: context error\n", NULL, NULL); 4316 return; 4317 } 4318 4319 if (ctxt->instate == XML_PARSER_EOF) 4320 return; 4321 4322 /* Capture start position */ 4323 if (ctxt->record_info) { 4324 node_info.begin_pos = 
ctxt->input->consumed + 4325 (CUR_PTR - ctxt->input->base); 4326 node_info.begin_line = ctxt->input->line; 4327 } 4328 4329 failed = htmlParseStartTag(ctxt); 4330 name = ctxt->name; 4331 if ((failed == -1) || (name == NULL)) { 4332 if (CUR == '>') 4333 NEXT; 4334 return; 4335 } 4336 4337 /* 4338 * Lookup the info for that element. 4339 */ 4340 info = htmlTagLookup(name); 4341 if (info == NULL) { 4342 htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG, 4343 "Tag %s invalid\n", name, NULL); 4344 } 4345 4346 /* 4347 * Check for an Empty Element labeled the XML/SGML way 4348 */ 4349 if ((CUR == '/') && (NXT(1) == '>')) { 4350 SKIP(2); 4351 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL)) 4352 ctxt->sax->endElement(ctxt->userData, name); 4353 htmlnamePop(ctxt); 4354 return; 4355 } 4356 4357 if (CUR == '>') { 4358 NEXT; 4359 } else { 4360 htmlParseErr(ctxt, XML_ERR_GT_REQUIRED, 4361 "Couldn't find end of Start Tag %s\n", name, NULL); 4362 4363 /* 4364 * end of parsing of this node. 4365 */ 4366 if (xmlStrEqual(name, ctxt->name)) { 4367 nodePop(ctxt); 4368 htmlnamePop(ctxt); 4369 } 4370 4371 /* 4372 * Capture end position and add node 4373 */ 4374 if (ctxt->record_info) { 4375 node_info.end_pos = ctxt->input->consumed + 4376 (CUR_PTR - ctxt->input->base); 4377 node_info.end_line = ctxt->input->line; 4378 node_info.node = ctxt->node; 4379 xmlParserAddNodeInfo(ctxt, &node_info); 4380 } 4381 return; 4382 } 4383 4384 /* 4385 * Check for an Empty Element from DTD definition 4386 */ 4387 if ((info != NULL) && (info->empty)) { 4388 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL)) 4389 ctxt->sax->endElement(ctxt->userData, name); 4390 htmlnamePop(ctxt); 4391 return; 4392 } 4393 4394 /* 4395 * Parse the content of the element: 4396 */ 4397 currentNode = xmlStrdup(ctxt->name); 4398 depth = ctxt->nameNr; 4399 while (IS_CHAR_CH(CUR)) { 4400 oldptr = ctxt->input->cur; 4401 htmlParseContent(ctxt); 4402 if (oldptr==ctxt->input->cur) break; 4403 if (ctxt->nameNr < depth) 
break; 4404 } 4405 4406 /* 4407 * Capture end position and add node 4408 */ 4409 if ( currentNode != NULL && ctxt->record_info ) { 4410 node_info.end_pos = ctxt->input->consumed + 4411 (CUR_PTR - ctxt->input->base); 4412 node_info.end_line = ctxt->input->line; 4413 node_info.node = ctxt->node; 4414 xmlParserAddNodeInfo(ctxt, &node_info); 4415 } 4416 if (!IS_CHAR_CH(CUR)) { 4417 htmlAutoCloseOnEnd(ctxt); 4418 } 4419 4420 if (currentNode != NULL) 4421 xmlFree(currentNode); 4422 } 4423 4424 static void 4425 htmlParserFinishElementParsing(htmlParserCtxtPtr ctxt) { 4426 /* 4427 * Capture end position and add node 4428 */ 4429 if ( ctxt->node != NULL && ctxt->record_info ) { 4430 ctxt->nodeInfo->end_pos = ctxt->input->consumed + 4431 (CUR_PTR - ctxt->input->base); 4432 ctxt->nodeInfo->end_line = ctxt->input->line; 4433 ctxt->nodeInfo->node = ctxt->node; 4434 xmlParserAddNodeInfo(ctxt, ctxt->nodeInfo); 4435 htmlNodeInfoPop(ctxt); 4436 } 4437 if (!IS_CHAR_CH(CUR)) { 4438 htmlAutoCloseOnEnd(ctxt); 4439 } 4440 } 4441 4442 /** 4443 * htmlParseElementInternal: 4444 * @ctxt: an HTML parser context 4445 * 4446 * parse an HTML element, new version, non recursive 4447 * 4448 * [39] element ::= EmptyElemTag | STag content ETag 4449 * 4450 * [41] Attribute ::= Name Eq AttValue 4451 */ 4452 4453 static void 4454 htmlParseElementInternal(htmlParserCtxtPtr ctxt) { 4455 const xmlChar *name; 4456 const htmlElemDesc * info; 4457 htmlParserNodeInfo node_info = { NULL, 0, 0, 0, 0 }; 4458 int failed; 4459 4460 if ((ctxt == NULL) || (ctxt->input == NULL)) { 4461 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR, 4462 "htmlParseElementInternal: context error\n", NULL, NULL); 4463 return; 4464 } 4465 4466 if (ctxt->instate == XML_PARSER_EOF) 4467 return; 4468 4469 /* Capture start position */ 4470 if (ctxt->record_info) { 4471 node_info.begin_pos = ctxt->input->consumed + 4472 (CUR_PTR - ctxt->input->base); 4473 node_info.begin_line = ctxt->input->line; 4474 } 4475 4476 failed = 
htmlParseStartTag(ctxt); 4477 name = ctxt->name; 4478 if ((failed == -1) || (name == NULL)) { 4479 if (CUR == '>') 4480 NEXT; 4481 return; 4482 } 4483 4484 /* 4485 * Lookup the info for that element. 4486 */ 4487 info = htmlTagLookup(name); 4488 if (info == NULL) { 4489 htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG, 4490 "Tag %s invalid\n", name, NULL); 4491 } 4492 4493 /* 4494 * Check for an Empty Element labeled the XML/SGML way 4495 */ 4496 if ((CUR == '/') && (NXT(1) == '>')) { 4497 SKIP(2); 4498 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL)) 4499 ctxt->sax->endElement(ctxt->userData, name); 4500 htmlnamePop(ctxt); 4501 return; 4502 } 4503 4504 if (CUR == '>') { 4505 NEXT; 4506 } else { 4507 htmlParseErr(ctxt, XML_ERR_GT_REQUIRED, 4508 "Couldn't find end of Start Tag %s\n", name, NULL); 4509 4510 /* 4511 * end of parsing of this node. 4512 */ 4513 if (xmlStrEqual(name, ctxt->name)) { 4514 nodePop(ctxt); 4515 htmlnamePop(ctxt); 4516 } 4517 4518 if (ctxt->record_info) 4519 htmlNodeInfoPush(ctxt, &node_info); 4520 htmlParserFinishElementParsing(ctxt); 4521 return; 4522 } 4523 4524 /* 4525 * Check for an Empty Element from DTD definition 4526 */ 4527 if ((info != NULL) && (info->empty)) { 4528 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL)) 4529 ctxt->sax->endElement(ctxt->userData, name); 4530 htmlnamePop(ctxt); 4531 return; 4532 } 4533 4534 if (ctxt->record_info) 4535 htmlNodeInfoPush(ctxt, &node_info); 4536 } 4537 4538 /** 4539 * htmlParseContentInternal: 4540 * @ctxt: an HTML parser context 4541 * 4542 * Parse a content: comment, sub-element, reference or text. 
4543 * New version for non recursive htmlParseElementInternal 4544 */ 4545 4546 static void 4547 htmlParseContentInternal(htmlParserCtxtPtr ctxt) { 4548 xmlChar *currentNode; 4549 int depth; 4550 const xmlChar *name; 4551 4552 currentNode = xmlStrdup(ctxt->name); 4553 depth = ctxt->nameNr; 4554 while (1) { 4555 long cons = ctxt->nbChars; 4556 4557 GROW; 4558 4559 if (ctxt->instate == XML_PARSER_EOF) 4560 break; 4561 4562 /* 4563 * Our tag or one of it's parent or children is ending. 4564 */ 4565 if ((CUR == '<') && (NXT(1) == '/')) { 4566 if (htmlParseEndTag(ctxt) && 4567 ((currentNode != NULL) || (ctxt->nameNr == 0))) { 4568 if (currentNode != NULL) 4569 xmlFree(currentNode); 4570 4571 currentNode = xmlStrdup(ctxt->name); 4572 depth = ctxt->nameNr; 4573 } 4574 continue; /* while */ 4575 } 4576 4577 else if ((CUR == '<') && 4578 ((IS_ASCII_LETTER(NXT(1))) || 4579 (NXT(1) == '_') || (NXT(1) == ':'))) { 4580 name = htmlParseHTMLName_nonInvasive(ctxt); 4581 if (name == NULL) { 4582 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED, 4583 "htmlParseStartTag: invalid element name\n", 4584 NULL, NULL); 4585 /* Dump the bogus tag like browsers do */ 4586 while ((IS_CHAR_CH(CUR)) && (CUR != '>')) 4587 NEXT; 4588 4589 htmlParserFinishElementParsing(ctxt); 4590 if (currentNode != NULL) 4591 xmlFree(currentNode); 4592 4593 currentNode = xmlStrdup(ctxt->name); 4594 depth = ctxt->nameNr; 4595 continue; 4596 } 4597 4598 if (ctxt->name != NULL) { 4599 if (htmlCheckAutoClose(name, ctxt->name) == 1) { 4600 htmlAutoClose(ctxt, name); 4601 continue; 4602 } 4603 } 4604 } 4605 4606 /* 4607 * Has this node been popped out during parsing of 4608 * the next element 4609 */ 4610 if ((ctxt->nameNr > 0) && (depth >= ctxt->nameNr) && 4611 (!xmlStrEqual(currentNode, ctxt->name))) 4612 { 4613 htmlParserFinishElementParsing(ctxt); 4614 if (currentNode != NULL) xmlFree(currentNode); 4615 4616 currentNode = xmlStrdup(ctxt->name); 4617 depth = ctxt->nameNr; 4618 continue; 4619 } 4620 4621 if ((CUR != 0) && 
((xmlStrEqual(currentNode, BAD_CAST"script")) || 4622 (xmlStrEqual(currentNode, BAD_CAST"style")))) { 4623 /* 4624 * Handle SCRIPT/STYLE separately 4625 */ 4626 htmlParseScript(ctxt); 4627 } else { 4628 /* 4629 * Sometimes DOCTYPE arrives in the middle of the document 4630 */ 4631 if ((CUR == '<') && (NXT(1) == '!') && 4632 (UPP(2) == 'D') && (UPP(3) == 'O') && 4633 (UPP(4) == 'C') && (UPP(5) == 'T') && 4634 (UPP(6) == 'Y') && (UPP(7) == 'P') && 4635 (UPP(8) == 'E')) { 4636 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR, 4637 "Misplaced DOCTYPE declaration\n", 4638 BAD_CAST "DOCTYPE" , NULL); 4639 htmlParseDocTypeDecl(ctxt); 4640 } 4641 4642 /* 4643 * First case : a comment 4644 */ 4645 if ((CUR == '<') && (NXT(1) == '!') && 4646 (NXT(2) == '-') && (NXT(3) == '-')) { 4647 htmlParseComment(ctxt); 4648 } 4649 4650 /* 4651 * Second case : a Processing Instruction. 4652 */ 4653 else if ((CUR == '<') && (NXT(1) == '?')) { 4654 htmlParsePI(ctxt); 4655 } 4656 4657 /* 4658 * Third case : a sub-element. 4659 */ 4660 else if (CUR == '<') { 4661 htmlParseElementInternal(ctxt); 4662 if (currentNode != NULL) xmlFree(currentNode); 4663 4664 currentNode = xmlStrdup(ctxt->name); 4665 depth = ctxt->nameNr; 4666 } 4667 4668 /* 4669 * Fourth case : a reference. If if has not been resolved, 4670 * parsing returns it's Name, create the node 4671 */ 4672 else if (CUR == '&') { 4673 htmlParseReference(ctxt); 4674 } 4675 4676 /* 4677 * Fifth case : end of the resource 4678 */ 4679 else if (CUR == 0) { 4680 htmlAutoCloseOnEnd(ctxt); 4681 break; 4682 } 4683 4684 /* 4685 * Last case, text. Note that References are handled directly. 
4686 */ 4687 else { 4688 htmlParseCharData(ctxt); 4689 } 4690 4691 if (cons == ctxt->nbChars) { 4692 if (ctxt->node != NULL) { 4693 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR, 4694 "detected an error in element content\n", 4695 NULL, NULL); 4696 } 4697 break; 4698 } 4699 } 4700 GROW; 4701 } 4702 if (currentNode != NULL) xmlFree(currentNode); 4703 } 4704 4705 /** 4706 * htmlParseContent: 4707 * @ctxt: an HTML parser context 4708 * 4709 * Parse a content: comment, sub-element, reference or text. 4710 * This is the entry point when called from parser.c 4711 */ 4712 4713 void 4714 __htmlParseContent(void *ctxt) { 4715 if (ctxt != NULL) 4716 htmlParseContentInternal((htmlParserCtxtPtr) ctxt); 4717 } 4718 4719 /** 4720 * htmlParseDocument: 4721 * @ctxt: an HTML parser context 4722 * 4723 * parse an HTML document (and build a tree if using the standard SAX 4724 * interface). 4725 * 4726 * Returns 0, -1 in case of error. the parser context is augmented 4727 * as a result of the parsing. 4728 */ 4729 4730 int 4731 htmlParseDocument(htmlParserCtxtPtr ctxt) { 4732 xmlChar start[4]; 4733 xmlCharEncoding enc; 4734 xmlDtdPtr dtd; 4735 4736 xmlInitParser(); 4737 4738 htmlDefaultSAXHandlerInit(); 4739 4740 if ((ctxt == NULL) || (ctxt->input == NULL)) { 4741 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR, 4742 "htmlParseDocument: context error\n", NULL, NULL); 4743 return(XML_ERR_INTERNAL_ERROR); 4744 } 4745 ctxt->html = 1; 4746 ctxt->linenumbers = 1; 4747 GROW; 4748 /* 4749 * SAX: beginning of the document processing. 4750 */ 4751 if ((ctxt->sax) && (ctxt->sax->setDocumentLocator)) 4752 ctxt->sax->setDocumentLocator(ctxt->userData, &xmlDefaultSAXLocator); 4753 4754 if ((ctxt->encoding == (const xmlChar *)XML_CHAR_ENCODING_NONE) && 4755 ((ctxt->input->end - ctxt->input->cur) >= 4)) { 4756 /* 4757 * Get the 4 first bytes and decode the charset 4758 * if enc != XML_CHAR_ENCODING_NONE 4759 * plug some encoding conversion routines. 
4760 */ 4761 start[0] = RAW; 4762 start[1] = NXT(1); 4763 start[2] = NXT(2); 4764 start[3] = NXT(3); 4765 enc = xmlDetectCharEncoding(&start[0], 4); 4766 if (enc != XML_CHAR_ENCODING_NONE) { 4767 xmlSwitchEncoding(ctxt, enc); 4768 } 4769 } 4770 4771 /* 4772 * Wipe out everything which is before the first '<' 4773 */ 4774 SKIP_BLANKS; 4775 if (CUR == 0) { 4776 htmlParseErr(ctxt, XML_ERR_DOCUMENT_EMPTY, 4777 "Document is empty\n", NULL, NULL); 4778 } 4779 4780 if ((ctxt->sax) && (ctxt->sax->startDocument) && (!ctxt->disableSAX)) 4781 ctxt->sax->startDocument(ctxt->userData); 4782 4783 4784 /* 4785 * Parse possible comments and PIs before any content 4786 */ 4787 while (((CUR == '<') && (NXT(1) == '!') && 4788 (NXT(2) == '-') && (NXT(3) == '-')) || 4789 ((CUR == '<') && (NXT(1) == '?'))) { 4790 htmlParseComment(ctxt); 4791 htmlParsePI(ctxt); 4792 SKIP_BLANKS; 4793 } 4794 4795 4796 /* 4797 * Then possibly doc type declaration(s) and more Misc 4798 * (doctypedecl Misc*)? 4799 */ 4800 if ((CUR == '<') && (NXT(1) == '!') && 4801 (UPP(2) == 'D') && (UPP(3) == 'O') && 4802 (UPP(4) == 'C') && (UPP(5) == 'T') && 4803 (UPP(6) == 'Y') && (UPP(7) == 'P') && 4804 (UPP(8) == 'E')) { 4805 htmlParseDocTypeDecl(ctxt); 4806 } 4807 SKIP_BLANKS; 4808 4809 /* 4810 * Parse possible comments and PIs before any content 4811 */ 4812 while (((CUR == '<') && (NXT(1) == '!') && 4813 (NXT(2) == '-') && (NXT(3) == '-')) || 4814 ((CUR == '<') && (NXT(1) == '?'))) { 4815 htmlParseComment(ctxt); 4816 htmlParsePI(ctxt); 4817 SKIP_BLANKS; 4818 } 4819 4820 /* 4821 * Time to start parsing the tree itself 4822 */ 4823 htmlParseContentInternal(ctxt); 4824 4825 /* 4826 * autoclose 4827 */ 4828 if (CUR == 0) 4829 htmlAutoCloseOnEnd(ctxt); 4830 4831 4832 /* 4833 * SAX: end of the document processing. 
4834 */ 4835 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL)) 4836 ctxt->sax->endDocument(ctxt->userData); 4837 4838 if ((!(ctxt->options & HTML_PARSE_NODEFDTD)) && (ctxt->myDoc != NULL)) { 4839 dtd = xmlGetIntSubset(ctxt->myDoc); 4840 if (dtd == NULL) 4841 ctxt->myDoc->intSubset = 4842 xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "html", 4843 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN", 4844 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd"); 4845 } 4846 if (! ctxt->wellFormed) return(-1); 4847 return(0); 4848 } 4849 4850 4851 /************************************************************************ 4852 * * 4853 * Parser contexts handling * 4854 * * 4855 ************************************************************************/ 4856 4857 /** 4858 * htmlInitParserCtxt: 4859 * @ctxt: an HTML parser context 4860 * 4861 * Initialize a parser context 4862 * 4863 * Returns 0 in case of success and -1 in case of error 4864 */ 4865 4866 static int 4867 htmlInitParserCtxt(htmlParserCtxtPtr ctxt) 4868 { 4869 htmlSAXHandler *sax; 4870 4871 if (ctxt == NULL) return(-1); 4872 memset(ctxt, 0, sizeof(htmlParserCtxt)); 4873 4874 ctxt->dict = xmlDictCreate(); 4875 if (ctxt->dict == NULL) { 4876 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n"); 4877 return(-1); 4878 } 4879 sax = (htmlSAXHandler *) xmlMalloc(sizeof(htmlSAXHandler)); 4880 if (sax == NULL) { 4881 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n"); 4882 return(-1); 4883 } 4884 else 4885 memset(sax, 0, sizeof(htmlSAXHandler)); 4886 4887 /* Allocate the Input stack */ 4888 ctxt->inputTab = (htmlParserInputPtr *) 4889 xmlMalloc(5 * sizeof(htmlParserInputPtr)); 4890 if (ctxt->inputTab == NULL) { 4891 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n"); 4892 ctxt->inputNr = 0; 4893 ctxt->inputMax = 0; 4894 ctxt->input = NULL; 4895 return(-1); 4896 } 4897 ctxt->inputNr = 0; 4898 ctxt->inputMax = 5; 4899 ctxt->input = NULL; 4900 ctxt->version = NULL; 4901 ctxt->encoding = NULL; 4902 
ctxt->standalone = -1; 4903 ctxt->instate = XML_PARSER_START; 4904 4905 /* Allocate the Node stack */ 4906 ctxt->nodeTab = (htmlNodePtr *) xmlMalloc(10 * sizeof(htmlNodePtr)); 4907 if (ctxt->nodeTab == NULL) { 4908 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n"); 4909 ctxt->nodeNr = 0; 4910 ctxt->nodeMax = 0; 4911 ctxt->node = NULL; 4912 ctxt->inputNr = 0; 4913 ctxt->inputMax = 0; 4914 ctxt->input = NULL; 4915 return(-1); 4916 } 4917 ctxt->nodeNr = 0; 4918 ctxt->nodeMax = 10; 4919 ctxt->node = NULL; 4920 4921 /* Allocate the Name stack */ 4922 ctxt->nameTab = (const xmlChar **) xmlMalloc(10 * sizeof(xmlChar *)); 4923 if (ctxt->nameTab == NULL) { 4924 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n"); 4925 ctxt->nameNr = 0; 4926 ctxt->nameMax = 0; 4927 ctxt->name = NULL; 4928 ctxt->nodeNr = 0; 4929 ctxt->nodeMax = 0; 4930 ctxt->node = NULL; 4931 ctxt->inputNr = 0; 4932 ctxt->inputMax = 0; 4933 ctxt->input = NULL; 4934 return(-1); 4935 } 4936 ctxt->nameNr = 0; 4937 ctxt->nameMax = 10; 4938 ctxt->name = NULL; 4939 4940 ctxt->nodeInfoTab = NULL; 4941 ctxt->nodeInfoNr = 0; 4942 ctxt->nodeInfoMax = 0; 4943 4944 if (sax == NULL) ctxt->sax = (xmlSAXHandlerPtr) &htmlDefaultSAXHandler; 4945 else { 4946 ctxt->sax = sax; 4947 memcpy(sax, &htmlDefaultSAXHandler, sizeof(xmlSAXHandlerV1)); 4948 } 4949 ctxt->userData = ctxt; 4950 ctxt->myDoc = NULL; 4951 ctxt->wellFormed = 1; 4952 ctxt->replaceEntities = 0; 4953 ctxt->linenumbers = xmlLineNumbersDefaultValue; 4954 ctxt->keepBlanks = xmlKeepBlanksDefaultValue; 4955 ctxt->html = 1; 4956 ctxt->vctxt.finishDtd = XML_CTXT_FINISH_DTD_0; 4957 ctxt->vctxt.userData = ctxt; 4958 ctxt->vctxt.error = xmlParserValidityError; 4959 ctxt->vctxt.warning = xmlParserValidityWarning; 4960 ctxt->record_info = 0; 4961 ctxt->validate = 0; 4962 ctxt->nbChars = 0; 4963 ctxt->checkIndex = 0; 4964 ctxt->catalogs = NULL; 4965 xmlInitNodeInfoSeq(&ctxt->node_seq); 4966 return(0); 4967 } 4968 4969 /** 4970 * htmlFreeParserCtxt: 4971 * @ctxt: 
an HTML parser context 4972 * 4973 * Free all the memory used by a parser context. However the parsed 4974 * document in ctxt->myDoc is not freed. 4975 */ 4976 4977 void 4978 htmlFreeParserCtxt(htmlParserCtxtPtr ctxt) 4979 { 4980 xmlFreeParserCtxt(ctxt); 4981 } 4982 4983 /** 4984 * htmlNewParserCtxt: 4985 * 4986 * Allocate and initialize a new parser context. 4987 * 4988 * Returns the htmlParserCtxtPtr or NULL in case of allocation error 4989 */ 4990 4991 htmlParserCtxtPtr 4992 htmlNewParserCtxt(void) 4993 { 4994 xmlParserCtxtPtr ctxt; 4995 4996 ctxt = (xmlParserCtxtPtr) xmlMalloc(sizeof(xmlParserCtxt)); 4997 if (ctxt == NULL) { 4998 htmlErrMemory(NULL, "NewParserCtxt: out of memory\n"); 4999 return(NULL); 5000 } 5001 memset(ctxt, 0, sizeof(xmlParserCtxt)); 5002 if (htmlInitParserCtxt(ctxt) < 0) { 5003 htmlFreeParserCtxt(ctxt); 5004 return(NULL); 5005 } 5006 return(ctxt); 5007 } 5008 5009 /** 5010 * htmlCreateMemoryParserCtxt: 5011 * @buffer: a pointer to a char array 5012 * @size: the size of the array 5013 * 5014 * Create a parser context for an HTML in-memory document. 
5015 * 5016 * Returns the new parser context or NULL 5017 */ 5018 htmlParserCtxtPtr 5019 htmlCreateMemoryParserCtxt(const char *buffer, int size) { 5020 xmlParserCtxtPtr ctxt; 5021 xmlParserInputPtr input; 5022 xmlParserInputBufferPtr buf; 5023 5024 if (buffer == NULL) 5025 return(NULL); 5026 if (size <= 0) 5027 return(NULL); 5028 5029 ctxt = htmlNewParserCtxt(); 5030 if (ctxt == NULL) 5031 return(NULL); 5032 5033 buf = xmlParserInputBufferCreateMem(buffer, size, XML_CHAR_ENCODING_NONE); 5034 if (buf == NULL) return(NULL); 5035 5036 input = xmlNewInputStream(ctxt); 5037 if (input == NULL) { 5038 xmlFreeParserCtxt(ctxt); 5039 return(NULL); 5040 } 5041 5042 input->filename = NULL; 5043 input->buf = buf; 5044 xmlBufResetInput(buf->buffer, input); 5045 5046 inputPush(ctxt, input); 5047 return(ctxt); 5048 } 5049 5050 /** 5051 * htmlCreateDocParserCtxt: 5052 * @cur: a pointer to an array of xmlChar 5053 * @encoding: a free form C string describing the HTML document encoding, or NULL 5054 * 5055 * Create a parser context for an HTML document. 
5056 * 5057 * TODO: check the need to add encoding handling there 5058 * 5059 * Returns the new parser context or NULL 5060 */ 5061 static htmlParserCtxtPtr 5062 htmlCreateDocParserCtxt(const xmlChar *cur, const char *encoding) { 5063 int len; 5064 htmlParserCtxtPtr ctxt; 5065 5066 if (cur == NULL) 5067 return(NULL); 5068 len = xmlStrlen(cur); 5069 ctxt = htmlCreateMemoryParserCtxt((char *)cur, len); 5070 if (ctxt == NULL) 5071 return(NULL); 5072 5073 if (encoding != NULL) { 5074 xmlCharEncoding enc; 5075 xmlCharEncodingHandlerPtr handler; 5076 5077 if (ctxt->input->encoding != NULL) 5078 xmlFree((xmlChar *) ctxt->input->encoding); 5079 ctxt->input->encoding = xmlStrdup((const xmlChar *) encoding); 5080 5081 enc = xmlParseCharEncoding(encoding); 5082 /* 5083 * registered set of known encodings 5084 */ 5085 if (enc != XML_CHAR_ENCODING_ERROR) { 5086 xmlSwitchEncoding(ctxt, enc); 5087 if (ctxt->errNo == XML_ERR_UNSUPPORTED_ENCODING) { 5088 htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING, 5089 "Unsupported encoding %s\n", 5090 (const xmlChar *) encoding, NULL); 5091 } 5092 } else { 5093 /* 5094 * fallback for unknown encodings 5095 */ 5096 handler = xmlFindCharEncodingHandler((const char *) encoding); 5097 if (handler != NULL) { 5098 xmlSwitchToEncoding(ctxt, handler); 5099 } else { 5100 htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING, 5101 "Unsupported encoding %s\n", 5102 (const xmlChar *) encoding, NULL); 5103 } 5104 } 5105 } 5106 return(ctxt); 5107 } 5108 5109 #ifdef LIBXML_PUSH_ENABLED 5110 /************************************************************************ 5111 * * 5112 * Progressive parsing interfaces * 5113 * * 5114 ************************************************************************/ 5115 5116 /** 5117 * htmlParseLookupSequence: 5118 * @ctxt: an HTML parser context 5119 * @first: the first char to lookup 5120 * @next: the next char to lookup or zero 5121 * @third: the next char to lookup or zero 5122 * @comment: flag to force checking inside 
comments
 *           (this describes the @iscomment argument: when it is non-zero
 *           a "<!--" met during the scan does not open a skipped region)
 * @ignoreattrval: when non-zero, everything between a quote character
 *           (single or double) and the next identical quote is skipped
 *           and cannot match
 *
 * Try to find if a sequence (first, next, third) or just (first next) or
 * (first) is available in the input stream.
 * This function has a side effect of (possibly) incrementing ctxt->checkIndex
 * to avoid rescanning sequences of bytes, it DOES change the state of the
 * parser, do not use liberally.
 * This is basically similar to xmlParseLookupSequence()
 *
 * Returns the index to the current parsing point if the full sequence
 * is available, -1 otherwise.
 */
static int
htmlParseLookupSequence(htmlParserCtxtPtr ctxt, xmlChar first,
                        xmlChar next, xmlChar third, int iscomment,
                        int ignoreattrval)
{
    int base, len;
    htmlParserInputPtr in;
    const xmlChar *buf;
    int incomment = 0;      /* currently between "<!--" and "-->" */
    int invalue = 0;        /* currently inside a quoted value */
    char valdellim = 0x0;   /* quote character that opened the value */

    in = ctxt->input;
    if (in == NULL)
        return (-1);

    /* offset of the current parsing point inside the input */
    base = in->cur - in->base;
    if (base < 0)
        return (-1);

    /* resume where a previous unsuccessful lookup stopped */
    if (ctxt->checkIndex > base)
        base = ctxt->checkIndex;

    if (in->buf == NULL) {
        buf = in->base;
        len = in->length;
    } else {
        buf = xmlBufContent(in->buf->buffer);
        len = xmlBufUse(in->buf->buffer);
    }

    /* take into account the sequence length */
    if (third)
        len -= 2;
    else if (next)
        len--;
    for (; base < len; base++) {
        /* comment opener, only looked for when the caller did not ask to
         * search inside comments */
        if ((!incomment) && (base + 4 < len) && (!iscomment)) {
            if ((buf[base] == '<') && (buf[base + 1] == '!') &&
                (buf[base + 2] == '-') && (buf[base + 3] == '-')) {
                incomment = 1;
                /* do not increment past <! - some people use <!--> */
                base += 2;
            }
        }
        if (ignoreattrval) {
            if (buf[base] == '"' || buf[base] == '\'') {
                if (invalue) {
                    /* only the quote that opened the value closes it; the
                     * other quote character falls through to the checks
                     * below */
                    if (buf[base] == valdellim) {
                        invalue = 0;
                        continue;
                    }
                } else {
                    valdellim = buf[base];
                    invalue = 1;
                    continue;
                }
            } else if (invalue) {
                continue;
            }
        }
        if (incomment) {
            /* not enough bytes left to hold "-->" */
            if (base + 3 > len)
                return (-1);
            if ((buf[base] == '-') && (buf[base + 1] == '-') &&
                (buf[base + 2] == '>')) {
                incomment = 0;
                base += 2;
            }
            continue;
        }
        if (buf[base] == first) {
            if (third != 0) {
                if ((buf[base + 1] != next) || (buf[base + 2] != third))
                    continue;
            } else if (next != 0) {
                if (buf[base + 1] != next)
                    continue;
            }
            /* found: the next lookup starts from the parsing point again */
            ctxt->checkIndex = 0;
#ifdef DEBUG_PUSH
            if (next == 0)
                xmlGenericError(xmlGenericErrorContext,
                                "HPP: lookup '%c' found at %d\n",
                                first, base);
            else if (third == 0)
                xmlGenericError(xmlGenericErrorContext,
                                "HPP: lookup '%c%c' found at %d\n",
                                first, next, base);
            else
                xmlGenericError(xmlGenericErrorContext,
                                "HPP: lookup '%c%c%c' found at %d\n",
                                first, next, third, base);
#endif
            /* distance from the current parsing point */
            return (base - (in->cur - in->base));
        }
    }
    /* NOTE(review): the resume position is only recorded outside of a
     * comment or quoted value, presumably because incomment/invalue are
     * locals and would be lost on the next call - confirm */
    if ((!incomment) && (!invalue))
        ctxt->checkIndex = base;
#ifdef DEBUG_PUSH
    if (next == 0)
        xmlGenericError(xmlGenericErrorContext,
                        "HPP: lookup '%c' failed\n", first);
    else if (third == 0)
        xmlGenericError(xmlGenericErrorContext,
                        "HPP: lookup '%c%c' failed\n", first, next);
    else
        xmlGenericError(xmlGenericErrorContext,
                        "HPP: lookup '%c%c%c' failed\n", first, next,
                        third);
#endif
    return (-1);
}

/**
 * htmlParseLookupChars:
 * @ctxt: an HTML parser context
 * @stop: Array of chars, which stop the lookup.
* @stopLen: Length of stop-Array
 *
 * Try to find if any char of the stop-Array is available in the input
 * stream. Bytes between "<!--" and "-->" are skipped and never match.
 * This function has a side effect of (possibly) incrementing ctxt->checkIndex
 * to avoid rescanning sequences of bytes, it DOES change the state of the
 * parser, do not use liberally.
 *
 * Returns the index to the current parsing point if a stopChar
 * is available, -1 otherwise.
 */
static int
htmlParseLookupChars(htmlParserCtxtPtr ctxt, const xmlChar * stop,
                     int stopLen)
{
    int base, len;
    htmlParserInputPtr in;
    const xmlChar *buf;
    int incomment = 0;      /* currently between "<!--" and "-->" */
    int i;

    in = ctxt->input;
    if (in == NULL)
        return (-1);

    /* offset of the current parsing point inside the input */
    base = in->cur - in->base;
    if (base < 0)
        return (-1);

    /* resume where a previous unsuccessful lookup stopped */
    if (ctxt->checkIndex > base)
        base = ctxt->checkIndex;

    if (in->buf == NULL) {
        buf = in->base;
        len = in->length;
    } else {
        buf = xmlBufContent(in->buf->buffer);
        len = xmlBufUse(in->buf->buffer);
    }

    for (; base < len; base++) {
        if (!incomment && (base + 4 < len)) {
            if ((buf[base] == '<') && (buf[base + 1] == '!') &&
                (buf[base + 2] == '-') && (buf[base + 3] == '-')) {
                incomment = 1;
                /* do not increment past <! - some people use <!--> */
                base += 2;
            }
        }
        if (incomment) {
            /* not enough bytes left to hold "-->" */
            if (base + 3 > len)
                return (-1);
            if ((buf[base] == '-') && (buf[base + 1] == '-') &&
                (buf[base + 2] == '>')) {
                incomment = 0;
                base += 2;
            }
            continue;
        }
        for (i = 0; i < stopLen; ++i) {
            if (buf[base] == stop[i]) {
                /* found: the next lookup starts from the parsing point */
                ctxt->checkIndex = 0;
                /* distance from the current parsing point */
                return (base - (in->cur - in->base));
            }
        }
    }
    /* nothing found: remember how far the scan got */
    ctxt->checkIndex = base;
    return (-1);
}

/**
 * htmlParseTryOrFinish:
 * @ctxt:  an HTML parser context
 * @terminate:  last chunk indicator
 *
 * Try to progress on parsing
 *
 * Returns zero if no parsing was possible
 */
static int
htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
    int ret = 0;
    htmlParserInputPtr in;
    int avail = 0;
    xmlChar cur, next;

    htmlParserNodeInfo node_info;

#ifdef DEBUG_PUSH
    switch (ctxt->instate) {
        case XML_PARSER_EOF:
            xmlGenericError(xmlGenericErrorContext,
                    "HPP: try EOF\n"); break;
        case XML_PARSER_START:
            xmlGenericError(xmlGenericErrorContext,
                    "HPP: try START\n"); break;
        case XML_PARSER_MISC:
            xmlGenericError(xmlGenericErrorContext,
                    "HPP: try MISC\n");break;
        case XML_PARSER_COMMENT:
            xmlGenericError(xmlGenericErrorContext,
                    "HPP: try COMMENT\n");break;
        case XML_PARSER_PROLOG:
            xmlGenericError(xmlGenericErrorContext,
                    "HPP: try PROLOG\n");break;
        case XML_PARSER_START_TAG:
            xmlGenericError(xmlGenericErrorContext,
                    "HPP: try START_TAG\n");break;
        case XML_PARSER_CONTENT:
            xmlGenericError(xmlGenericErrorContext,
                    "HPP: try CONTENT\n");break;
        case XML_PARSER_CDATA_SECTION:
            xmlGenericError(xmlGenericErrorContext,
                    "HPP: try CDATA_SECTION\n");break;
        case XML_PARSER_END_TAG:
            xmlGenericError(xmlGenericErrorContext,
                    "HPP: try END_TAG\n");break;
        case XML_PARSER_ENTITY_DECL:
5370 xmlGenericError(xmlGenericErrorContext, 5371 "HPP: try ENTITY_DECL\n");break; 5372 case XML_PARSER_ENTITY_VALUE: 5373 xmlGenericError(xmlGenericErrorContext, 5374 "HPP: try ENTITY_VALUE\n");break; 5375 case XML_PARSER_ATTRIBUTE_VALUE: 5376 xmlGenericError(xmlGenericErrorContext, 5377 "HPP: try ATTRIBUTE_VALUE\n");break; 5378 case XML_PARSER_DTD: 5379 xmlGenericError(xmlGenericErrorContext, 5380 "HPP: try DTD\n");break; 5381 case XML_PARSER_EPILOG: 5382 xmlGenericError(xmlGenericErrorContext, 5383 "HPP: try EPILOG\n");break; 5384 case XML_PARSER_PI: 5385 xmlGenericError(xmlGenericErrorContext, 5386 "HPP: try PI\n");break; 5387 case XML_PARSER_SYSTEM_LITERAL: 5388 xmlGenericError(xmlGenericErrorContext, 5389 "HPP: try SYSTEM_LITERAL\n");break; 5390 } 5391 #endif 5392 5393 while (1) { 5394 5395 in = ctxt->input; 5396 if (in == NULL) break; 5397 if (in->buf == NULL) 5398 avail = in->length - (in->cur - in->base); 5399 else 5400 avail = xmlBufUse(in->buf->buffer) - (in->cur - in->base); 5401 if ((avail == 0) && (terminate)) { 5402 htmlAutoCloseOnEnd(ctxt); 5403 if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) { 5404 /* 5405 * SAX: end of the document processing. 5406 */ 5407 ctxt->instate = XML_PARSER_EOF; 5408 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL)) 5409 ctxt->sax->endDocument(ctxt->userData); 5410 } 5411 } 5412 if (avail < 1) 5413 goto done; 5414 cur = in->cur[0]; 5415 if (cur == 0) { 5416 SKIP(1); 5417 continue; 5418 } 5419 5420 switch (ctxt->instate) { 5421 case XML_PARSER_EOF: 5422 /* 5423 * Document parsing is done ! 5424 */ 5425 goto done; 5426 case XML_PARSER_START: 5427 /* 5428 * Very first chars read from the document flow. 
5429 */ 5430 cur = in->cur[0]; 5431 if (IS_BLANK_CH(cur)) { 5432 SKIP_BLANKS; 5433 if (in->buf == NULL) 5434 avail = in->length - (in->cur - in->base); 5435 else 5436 avail = xmlBufUse(in->buf->buffer) - (in->cur - in->base); 5437 } 5438 if ((ctxt->sax) && (ctxt->sax->setDocumentLocator)) 5439 ctxt->sax->setDocumentLocator(ctxt->userData, 5440 &xmlDefaultSAXLocator); 5441 if ((ctxt->sax) && (ctxt->sax->startDocument) && 5442 (!ctxt->disableSAX)) 5443 ctxt->sax->startDocument(ctxt->userData); 5444 5445 cur = in->cur[0]; 5446 next = in->cur[1]; 5447 if ((cur == '<') && (next == '!') && 5448 (UPP(2) == 'D') && (UPP(3) == 'O') && 5449 (UPP(4) == 'C') && (UPP(5) == 'T') && 5450 (UPP(6) == 'Y') && (UPP(7) == 'P') && 5451 (UPP(8) == 'E')) { 5452 if ((!terminate) && 5453 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0)) 5454 goto done; 5455 #ifdef DEBUG_PUSH 5456 xmlGenericError(xmlGenericErrorContext, 5457 "HPP: Parsing internal subset\n"); 5458 #endif 5459 htmlParseDocTypeDecl(ctxt); 5460 ctxt->instate = XML_PARSER_PROLOG; 5461 #ifdef DEBUG_PUSH 5462 xmlGenericError(xmlGenericErrorContext, 5463 "HPP: entering PROLOG\n"); 5464 #endif 5465 } else { 5466 ctxt->instate = XML_PARSER_MISC; 5467 #ifdef DEBUG_PUSH 5468 xmlGenericError(xmlGenericErrorContext, 5469 "HPP: entering MISC\n"); 5470 #endif 5471 } 5472 break; 5473 case XML_PARSER_MISC: 5474 SKIP_BLANKS; 5475 if (in->buf == NULL) 5476 avail = in->length - (in->cur - in->base); 5477 else 5478 avail = xmlBufUse(in->buf->buffer) - (in->cur - in->base); 5479 /* 5480 * no chars in buffer 5481 */ 5482 if (avail < 1) 5483 goto done; 5484 /* 5485 * not enouth chars in buffer 5486 */ 5487 if (avail < 2) { 5488 if (!terminate) 5489 goto done; 5490 else 5491 next = ' '; 5492 } else { 5493 next = in->cur[1]; 5494 } 5495 cur = in->cur[0]; 5496 if ((cur == '<') && (next == '!') && 5497 (in->cur[2] == '-') && (in->cur[3] == '-')) { 5498 if ((!terminate) && 5499 (htmlParseLookupSequence(ctxt, '-', '-', '>', 1, 1) < 0)) 5500 goto 
done; 5501 #ifdef DEBUG_PUSH 5502 xmlGenericError(xmlGenericErrorContext, 5503 "HPP: Parsing Comment\n"); 5504 #endif 5505 htmlParseComment(ctxt); 5506 ctxt->instate = XML_PARSER_MISC; 5507 } else if ((cur == '<') && (next == '?')) { 5508 if ((!terminate) && 5509 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0)) 5510 goto done; 5511 #ifdef DEBUG_PUSH 5512 xmlGenericError(xmlGenericErrorContext, 5513 "HPP: Parsing PI\n"); 5514 #endif 5515 htmlParsePI(ctxt); 5516 ctxt->instate = XML_PARSER_MISC; 5517 } else if ((cur == '<') && (next == '!') && 5518 (UPP(2) == 'D') && (UPP(3) == 'O') && 5519 (UPP(4) == 'C') && (UPP(5) == 'T') && 5520 (UPP(6) == 'Y') && (UPP(7) == 'P') && 5521 (UPP(8) == 'E')) { 5522 if ((!terminate) && 5523 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0)) 5524 goto done; 5525 #ifdef DEBUG_PUSH 5526 xmlGenericError(xmlGenericErrorContext, 5527 "HPP: Parsing internal subset\n"); 5528 #endif 5529 htmlParseDocTypeDecl(ctxt); 5530 ctxt->instate = XML_PARSER_PROLOG; 5531 #ifdef DEBUG_PUSH 5532 xmlGenericError(xmlGenericErrorContext, 5533 "HPP: entering PROLOG\n"); 5534 #endif 5535 } else if ((cur == '<') && (next == '!') && 5536 (avail < 9)) { 5537 goto done; 5538 } else { 5539 ctxt->instate = XML_PARSER_START_TAG; 5540 #ifdef DEBUG_PUSH 5541 xmlGenericError(xmlGenericErrorContext, 5542 "HPP: entering START_TAG\n"); 5543 #endif 5544 } 5545 break; 5546 case XML_PARSER_PROLOG: 5547 SKIP_BLANKS; 5548 if (in->buf == NULL) 5549 avail = in->length - (in->cur - in->base); 5550 else 5551 avail = xmlBufUse(in->buf->buffer) - (in->cur - in->base); 5552 if (avail < 2) 5553 goto done; 5554 cur = in->cur[0]; 5555 next = in->cur[1]; 5556 if ((cur == '<') && (next == '!') && 5557 (in->cur[2] == '-') && (in->cur[3] == '-')) { 5558 if ((!terminate) && 5559 (htmlParseLookupSequence(ctxt, '-', '-', '>', 1, 1) < 0)) 5560 goto done; 5561 #ifdef DEBUG_PUSH 5562 xmlGenericError(xmlGenericErrorContext, 5563 "HPP: Parsing Comment\n"); 5564 #endif 5565 
htmlParseComment(ctxt); 5566 ctxt->instate = XML_PARSER_PROLOG; 5567 } else if ((cur == '<') && (next == '?')) { 5568 if ((!terminate) && 5569 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0)) 5570 goto done; 5571 #ifdef DEBUG_PUSH 5572 xmlGenericError(xmlGenericErrorContext, 5573 "HPP: Parsing PI\n"); 5574 #endif 5575 htmlParsePI(ctxt); 5576 ctxt->instate = XML_PARSER_PROLOG; 5577 } else if ((cur == '<') && (next == '!') && 5578 (avail < 4)) { 5579 goto done; 5580 } else { 5581 ctxt->instate = XML_PARSER_START_TAG; 5582 #ifdef DEBUG_PUSH 5583 xmlGenericError(xmlGenericErrorContext, 5584 "HPP: entering START_TAG\n"); 5585 #endif 5586 } 5587 break; 5588 case XML_PARSER_EPILOG: 5589 if (in->buf == NULL) 5590 avail = in->length - (in->cur - in->base); 5591 else 5592 avail = xmlBufUse(in->buf->buffer) - (in->cur - in->base); 5593 if (avail < 1) 5594 goto done; 5595 cur = in->cur[0]; 5596 if (IS_BLANK_CH(cur)) { 5597 htmlParseCharData(ctxt); 5598 goto done; 5599 } 5600 if (avail < 2) 5601 goto done; 5602 next = in->cur[1]; 5603 if ((cur == '<') && (next == '!') && 5604 (in->cur[2] == '-') && (in->cur[3] == '-')) { 5605 if ((!terminate) && 5606 (htmlParseLookupSequence(ctxt, '-', '-', '>', 1, 1) < 0)) 5607 goto done; 5608 #ifdef DEBUG_PUSH 5609 xmlGenericError(xmlGenericErrorContext, 5610 "HPP: Parsing Comment\n"); 5611 #endif 5612 htmlParseComment(ctxt); 5613 ctxt->instate = XML_PARSER_EPILOG; 5614 } else if ((cur == '<') && (next == '?')) { 5615 if ((!terminate) && 5616 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0)) 5617 goto done; 5618 #ifdef DEBUG_PUSH 5619 xmlGenericError(xmlGenericErrorContext, 5620 "HPP: Parsing PI\n"); 5621 #endif 5622 htmlParsePI(ctxt); 5623 ctxt->instate = XML_PARSER_EPILOG; 5624 } else if ((cur == '<') && (next == '!') && 5625 (avail < 4)) { 5626 goto done; 5627 } else { 5628 ctxt->errNo = XML_ERR_DOCUMENT_END; 5629 ctxt->wellFormed = 0; 5630 ctxt->instate = XML_PARSER_EOF; 5631 #ifdef DEBUG_PUSH 5632 
xmlGenericError(xmlGenericErrorContext, 5633 "HPP: entering EOF\n"); 5634 #endif 5635 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL)) 5636 ctxt->sax->endDocument(ctxt->userData); 5637 goto done; 5638 } 5639 break; 5640 case XML_PARSER_START_TAG: { 5641 const xmlChar *name; 5642 int failed; 5643 const htmlElemDesc * info; 5644 5645 /* 5646 * no chars in buffer 5647 */ 5648 if (avail < 1) 5649 goto done; 5650 /* 5651 * not enouth chars in buffer 5652 */ 5653 if (avail < 2) { 5654 if (!terminate) 5655 goto done; 5656 else 5657 next = ' '; 5658 } else { 5659 next = in->cur[1]; 5660 } 5661 cur = in->cur[0]; 5662 if (cur != '<') { 5663 ctxt->instate = XML_PARSER_CONTENT; 5664 #ifdef DEBUG_PUSH 5665 xmlGenericError(xmlGenericErrorContext, 5666 "HPP: entering CONTENT\n"); 5667 #endif 5668 break; 5669 } 5670 if (next == '/') { 5671 ctxt->instate = XML_PARSER_END_TAG; 5672 ctxt->checkIndex = 0; 5673 #ifdef DEBUG_PUSH 5674 xmlGenericError(xmlGenericErrorContext, 5675 "HPP: entering END_TAG\n"); 5676 #endif 5677 break; 5678 } 5679 if ((!terminate) && 5680 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0)) 5681 goto done; 5682 5683 /* Capture start position */ 5684 if (ctxt->record_info) { 5685 node_info.begin_pos = ctxt->input->consumed + 5686 (CUR_PTR - ctxt->input->base); 5687 node_info.begin_line = ctxt->input->line; 5688 } 5689 5690 5691 failed = htmlParseStartTag(ctxt); 5692 name = ctxt->name; 5693 if ((failed == -1) || 5694 (name == NULL)) { 5695 if (CUR == '>') 5696 NEXT; 5697 break; 5698 } 5699 5700 /* 5701 * Lookup the info for that element. 
5702 */ 5703 info = htmlTagLookup(name); 5704 if (info == NULL) { 5705 htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG, 5706 "Tag %s invalid\n", name, NULL); 5707 } 5708 5709 /* 5710 * Check for an Empty Element labeled the XML/SGML way 5711 */ 5712 if ((CUR == '/') && (NXT(1) == '>')) { 5713 SKIP(2); 5714 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL)) 5715 ctxt->sax->endElement(ctxt->userData, name); 5716 htmlnamePop(ctxt); 5717 ctxt->instate = XML_PARSER_CONTENT; 5718 #ifdef DEBUG_PUSH 5719 xmlGenericError(xmlGenericErrorContext, 5720 "HPP: entering CONTENT\n"); 5721 #endif 5722 break; 5723 } 5724 5725 if (CUR == '>') { 5726 NEXT; 5727 } else { 5728 htmlParseErr(ctxt, XML_ERR_GT_REQUIRED, 5729 "Couldn't find end of Start Tag %s\n", 5730 name, NULL); 5731 5732 /* 5733 * end of parsing of this node. 5734 */ 5735 if (xmlStrEqual(name, ctxt->name)) { 5736 nodePop(ctxt); 5737 htmlnamePop(ctxt); 5738 } 5739 5740 if (ctxt->record_info) 5741 htmlNodeInfoPush(ctxt, &node_info); 5742 5743 ctxt->instate = XML_PARSER_CONTENT; 5744 #ifdef DEBUG_PUSH 5745 xmlGenericError(xmlGenericErrorContext, 5746 "HPP: entering CONTENT\n"); 5747 #endif 5748 break; 5749 } 5750 5751 /* 5752 * Check for an Empty Element from DTD definition 5753 */ 5754 if ((info != NULL) && (info->empty)) { 5755 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL)) 5756 ctxt->sax->endElement(ctxt->userData, name); 5757 htmlnamePop(ctxt); 5758 } 5759 5760 if (ctxt->record_info) 5761 htmlNodeInfoPush(ctxt, &node_info); 5762 5763 ctxt->instate = XML_PARSER_CONTENT; 5764 #ifdef DEBUG_PUSH 5765 xmlGenericError(xmlGenericErrorContext, 5766 "HPP: entering CONTENT\n"); 5767 #endif 5768 break; 5769 } 5770 case XML_PARSER_CONTENT: { 5771 xmlChar chr[2] = { 0, 0 }; 5772 long cons; 5773 5774 /* 5775 * Handle preparsed entities and charRef 5776 */ 5777 if (ctxt->token != 0) { 5778 chr[0] = (xmlChar) ctxt->token; 5779 htmlCheckParagraph(ctxt); 5780 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL)) 5781 
ctxt->sax->characters(ctxt->userData, chr, 1); 5782 ctxt->token = 0; 5783 ctxt->checkIndex = 0; 5784 } 5785 if ((avail == 1) && (terminate)) { 5786 cur = in->cur[0]; 5787 if ((cur != '<') && (cur != '&')) { 5788 if (ctxt->sax != NULL) { 5789 chr[0] = cur; 5790 if (IS_BLANK_CH(cur)) { 5791 if (ctxt->keepBlanks) { 5792 if (ctxt->sax->characters != NULL) 5793 ctxt->sax->characters( 5794 ctxt->userData, chr, 1); 5795 } else { 5796 if (ctxt->sax->ignorableWhitespace != NULL) 5797 ctxt->sax->ignorableWhitespace( 5798 ctxt->userData, chr, 1); 5799 } 5800 } else { 5801 htmlCheckParagraph(ctxt); 5802 if (ctxt->sax->characters != NULL) 5803 ctxt->sax->characters( 5804 ctxt->userData, chr, 1); 5805 } 5806 } 5807 ctxt->token = 0; 5808 ctxt->checkIndex = 0; 5809 in->cur++; 5810 break; 5811 } 5812 } 5813 if (avail < 2) 5814 goto done; 5815 cur = in->cur[0]; 5816 next = in->cur[1]; 5817 cons = ctxt->nbChars; 5818 if ((xmlStrEqual(ctxt->name, BAD_CAST"script")) || 5819 (xmlStrEqual(ctxt->name, BAD_CAST"style"))) { 5820 /* 5821 * Handle SCRIPT/STYLE separately 5822 */ 5823 if (!terminate) { 5824 int idx; 5825 xmlChar val; 5826 5827 idx = htmlParseLookupSequence(ctxt, '<', '/', 0, 0, 0); 5828 if (idx < 0) 5829 goto done; 5830 val = in->cur[idx + 2]; 5831 if (val == 0) /* bad cut of input */ 5832 goto done; 5833 } 5834 htmlParseScript(ctxt); 5835 if ((cur == '<') && (next == '/')) { 5836 ctxt->instate = XML_PARSER_END_TAG; 5837 ctxt->checkIndex = 0; 5838 #ifdef DEBUG_PUSH 5839 xmlGenericError(xmlGenericErrorContext, 5840 "HPP: entering END_TAG\n"); 5841 #endif 5842 break; 5843 } 5844 } else { 5845 /* 5846 * Sometimes DOCTYPE arrives in the middle of the document 5847 */ 5848 if ((cur == '<') && (next == '!') && 5849 (UPP(2) == 'D') && (UPP(3) == 'O') && 5850 (UPP(4) == 'C') && (UPP(5) == 'T') && 5851 (UPP(6) == 'Y') && (UPP(7) == 'P') && 5852 (UPP(8) == 'E')) { 5853 if ((!terminate) && 5854 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0)) 5855 goto done; 5856 htmlParseErr(ctxt, 
XML_HTML_STRUCURE_ERROR, 5857 "Misplaced DOCTYPE declaration\n", 5858 BAD_CAST "DOCTYPE" , NULL); 5859 htmlParseDocTypeDecl(ctxt); 5860 } else if ((cur == '<') && (next == '!') && 5861 (in->cur[2] == '-') && (in->cur[3] == '-')) { 5862 if ((!terminate) && 5863 (htmlParseLookupSequence( 5864 ctxt, '-', '-', '>', 1, 1) < 0)) 5865 goto done; 5866 #ifdef DEBUG_PUSH 5867 xmlGenericError(xmlGenericErrorContext, 5868 "HPP: Parsing Comment\n"); 5869 #endif 5870 htmlParseComment(ctxt); 5871 ctxt->instate = XML_PARSER_CONTENT; 5872 } else if ((cur == '<') && (next == '?')) { 5873 if ((!terminate) && 5874 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0)) 5875 goto done; 5876 #ifdef DEBUG_PUSH 5877 xmlGenericError(xmlGenericErrorContext, 5878 "HPP: Parsing PI\n"); 5879 #endif 5880 htmlParsePI(ctxt); 5881 ctxt->instate = XML_PARSER_CONTENT; 5882 } else if ((cur == '<') && (next == '!') && (avail < 4)) { 5883 goto done; 5884 } else if ((cur == '<') && (next == '/')) { 5885 ctxt->instate = XML_PARSER_END_TAG; 5886 ctxt->checkIndex = 0; 5887 #ifdef DEBUG_PUSH 5888 xmlGenericError(xmlGenericErrorContext, 5889 "HPP: entering END_TAG\n"); 5890 #endif 5891 break; 5892 } else if (cur == '<') { 5893 ctxt->instate = XML_PARSER_START_TAG; 5894 ctxt->checkIndex = 0; 5895 #ifdef DEBUG_PUSH 5896 xmlGenericError(xmlGenericErrorContext, 5897 "HPP: entering START_TAG\n"); 5898 #endif 5899 break; 5900 } else if (cur == '&') { 5901 if ((!terminate) && 5902 (htmlParseLookupChars(ctxt, 5903 BAD_CAST "; >/", 4) < 0)) 5904 goto done; 5905 #ifdef DEBUG_PUSH 5906 xmlGenericError(xmlGenericErrorContext, 5907 "HPP: Parsing Reference\n"); 5908 #endif 5909 /* TODO: check generation of subtrees if noent !!! */ 5910 htmlParseReference(ctxt); 5911 } else { 5912 /* 5913 * check that the text sequence is complete 5914 * before handing out the data to the parser 5915 * to avoid problems with erroneous end of 5916 * data detection. 
5917 */ 5918 if ((!terminate) && 5919 (htmlParseLookupChars(ctxt, BAD_CAST "<&", 2) < 0)) 5920 goto done; 5921 ctxt->checkIndex = 0; 5922 #ifdef DEBUG_PUSH 5923 xmlGenericError(xmlGenericErrorContext, 5924 "HPP: Parsing char data\n"); 5925 #endif 5926 htmlParseCharData(ctxt); 5927 } 5928 } 5929 if (cons == ctxt->nbChars) { 5930 if (ctxt->node != NULL) { 5931 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR, 5932 "detected an error in element content\n", 5933 NULL, NULL); 5934 } 5935 NEXT; 5936 break; 5937 } 5938 5939 break; 5940 } 5941 case XML_PARSER_END_TAG: 5942 if (avail < 2) 5943 goto done; 5944 if ((!terminate) && 5945 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0)) 5946 goto done; 5947 htmlParseEndTag(ctxt); 5948 if (ctxt->nameNr == 0) { 5949 ctxt->instate = XML_PARSER_EPILOG; 5950 } else { 5951 ctxt->instate = XML_PARSER_CONTENT; 5952 } 5953 ctxt->checkIndex = 0; 5954 #ifdef DEBUG_PUSH 5955 xmlGenericError(xmlGenericErrorContext, 5956 "HPP: entering CONTENT\n"); 5957 #endif 5958 break; 5959 case XML_PARSER_CDATA_SECTION: 5960 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR, 5961 "HPP: internal error, state == CDATA\n", 5962 NULL, NULL); 5963 ctxt->instate = XML_PARSER_CONTENT; 5964 ctxt->checkIndex = 0; 5965 #ifdef DEBUG_PUSH 5966 xmlGenericError(xmlGenericErrorContext, 5967 "HPP: entering CONTENT\n"); 5968 #endif 5969 break; 5970 case XML_PARSER_DTD: 5971 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR, 5972 "HPP: internal error, state == DTD\n", 5973 NULL, NULL); 5974 ctxt->instate = XML_PARSER_CONTENT; 5975 ctxt->checkIndex = 0; 5976 #ifdef DEBUG_PUSH 5977 xmlGenericError(xmlGenericErrorContext, 5978 "HPP: entering CONTENT\n"); 5979 #endif 5980 break; 5981 case XML_PARSER_COMMENT: 5982 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR, 5983 "HPP: internal error, state == COMMENT\n", 5984 NULL, NULL); 5985 ctxt->instate = XML_PARSER_CONTENT; 5986 ctxt->checkIndex = 0; 5987 #ifdef DEBUG_PUSH 5988 xmlGenericError(xmlGenericErrorContext, 5989 "HPP: entering CONTENT\n"); 5990 
#endif 5991 break; 5992 case XML_PARSER_PI: 5993 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR, 5994 "HPP: internal error, state == PI\n", 5995 NULL, NULL); 5996 ctxt->instate = XML_PARSER_CONTENT; 5997 ctxt->checkIndex = 0; 5998 #ifdef DEBUG_PUSH 5999 xmlGenericError(xmlGenericErrorContext, 6000 "HPP: entering CONTENT\n"); 6001 #endif 6002 break; 6003 case XML_PARSER_ENTITY_DECL: 6004 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR, 6005 "HPP: internal error, state == ENTITY_DECL\n", 6006 NULL, NULL); 6007 ctxt->instate = XML_PARSER_CONTENT; 6008 ctxt->checkIndex = 0; 6009 #ifdef DEBUG_PUSH 6010 xmlGenericError(xmlGenericErrorContext, 6011 "HPP: entering CONTENT\n"); 6012 #endif 6013 break; 6014 case XML_PARSER_ENTITY_VALUE: 6015 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR, 6016 "HPP: internal error, state == ENTITY_VALUE\n", 6017 NULL, NULL); 6018 ctxt->instate = XML_PARSER_CONTENT; 6019 ctxt->checkIndex = 0; 6020 #ifdef DEBUG_PUSH 6021 xmlGenericError(xmlGenericErrorContext, 6022 "HPP: entering DTD\n"); 6023 #endif 6024 break; 6025 case XML_PARSER_ATTRIBUTE_VALUE: 6026 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR, 6027 "HPP: internal error, state == ATTRIBUTE_VALUE\n", 6028 NULL, NULL); 6029 ctxt->instate = XML_PARSER_START_TAG; 6030 ctxt->checkIndex = 0; 6031 #ifdef DEBUG_PUSH 6032 xmlGenericError(xmlGenericErrorContext, 6033 "HPP: entering START_TAG\n"); 6034 #endif 6035 break; 6036 case XML_PARSER_SYSTEM_LITERAL: 6037 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR, 6038 "HPP: internal error, state == XML_PARSER_SYSTEM_LITERAL\n", 6039 NULL, NULL); 6040 ctxt->instate = XML_PARSER_CONTENT; 6041 ctxt->checkIndex = 0; 6042 #ifdef DEBUG_PUSH 6043 xmlGenericError(xmlGenericErrorContext, 6044 "HPP: entering CONTENT\n"); 6045 #endif 6046 break; 6047 case XML_PARSER_IGNORE: 6048 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR, 6049 "HPP: internal error, state == XML_PARSER_IGNORE\n", 6050 NULL, NULL); 6051 ctxt->instate = XML_PARSER_CONTENT; 6052 ctxt->checkIndex = 0; 6053 #ifdef DEBUG_PUSH 
6054 xmlGenericError(xmlGenericErrorContext, 6055 "HPP: entering CONTENT\n"); 6056 #endif 6057 break; 6058 case XML_PARSER_PUBLIC_LITERAL: 6059 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR, 6060 "HPP: internal error, state == XML_PARSER_LITERAL\n", 6061 NULL, NULL); 6062 ctxt->instate = XML_PARSER_CONTENT; 6063 ctxt->checkIndex = 0; 6064 #ifdef DEBUG_PUSH 6065 xmlGenericError(xmlGenericErrorContext, 6066 "HPP: entering CONTENT\n"); 6067 #endif 6068 break; 6069 6070 } 6071 } 6072 done: 6073 if ((avail == 0) && (terminate)) { 6074 htmlAutoCloseOnEnd(ctxt); 6075 if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) { 6076 /* 6077 * SAX: end of the document processing. 6078 */ 6079 ctxt->instate = XML_PARSER_EOF; 6080 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL)) 6081 ctxt->sax->endDocument(ctxt->userData); 6082 } 6083 } 6084 if ((!(ctxt->options & HTML_PARSE_NODEFDTD)) && (ctxt->myDoc != NULL) && 6085 ((terminate) || (ctxt->instate == XML_PARSER_EOF) || 6086 (ctxt->instate == XML_PARSER_EPILOG))) { 6087 xmlDtdPtr dtd; 6088 dtd = xmlGetIntSubset(ctxt->myDoc); 6089 if (dtd == NULL) 6090 ctxt->myDoc->intSubset = 6091 xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "html", 6092 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN", 6093 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd"); 6094 } 6095 #ifdef DEBUG_PUSH 6096 xmlGenericError(xmlGenericErrorContext, "HPP: done %d\n", ret); 6097 #endif 6098 return(ret); 6099 } 6100 6101 /** 6102 * htmlParseChunk: 6103 * @ctxt: an HTML parser context 6104 * @chunk: an char array 6105 * @size: the size in byte of the chunk 6106 * @terminate: last chunk indicator 6107 * 6108 * Parse a Chunk of memory 6109 * 6110 * Returns zero if no error, the xmlParserErrors otherwise. 
6111 */ 6112 int 6113 htmlParseChunk(htmlParserCtxtPtr ctxt, const char *chunk, int size, 6114 int terminate) { 6115 if ((ctxt == NULL) || (ctxt->input == NULL)) { 6116 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR, 6117 "htmlParseChunk: context error\n", NULL, NULL); 6118 return(XML_ERR_INTERNAL_ERROR); 6119 } 6120 if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) && 6121 (ctxt->input->buf != NULL) && (ctxt->instate != XML_PARSER_EOF)) { 6122 size_t base = xmlBufGetInputBase(ctxt->input->buf->buffer, ctxt->input); 6123 size_t cur = ctxt->input->cur - ctxt->input->base; 6124 int res; 6125 6126 res = xmlParserInputBufferPush(ctxt->input->buf, size, chunk); 6127 if (res < 0) { 6128 ctxt->errNo = XML_PARSER_EOF; 6129 ctxt->disableSAX = 1; 6130 return (XML_PARSER_EOF); 6131 } 6132 xmlBufSetInputBaseCur(ctxt->input->buf->buffer, ctxt->input, base, cur); 6133 #ifdef DEBUG_PUSH 6134 xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size); 6135 #endif 6136 6137 #if 0 6138 if ((terminate) || (ctxt->input->buf->buffer->use > 80)) 6139 htmlParseTryOrFinish(ctxt, terminate); 6140 #endif 6141 } else if (ctxt->instate != XML_PARSER_EOF) { 6142 if ((ctxt->input != NULL) && ctxt->input->buf != NULL) { 6143 xmlParserInputBufferPtr in = ctxt->input->buf; 6144 if ((in->encoder != NULL) && (in->buffer != NULL) && 6145 (in->raw != NULL)) { 6146 int nbchars; 6147 size_t base = xmlBufGetInputBase(in->buffer, ctxt->input); 6148 size_t current = ctxt->input->cur - ctxt->input->base; 6149 6150 nbchars = xmlCharEncInput(in, terminate); 6151 if (nbchars < 0) { 6152 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING, 6153 "encoder error\n", NULL, NULL); 6154 return(XML_ERR_INVALID_ENCODING); 6155 } 6156 xmlBufSetInputBaseCur(in->buffer, ctxt->input, base, current); 6157 } 6158 } 6159 } 6160 htmlParseTryOrFinish(ctxt, terminate); 6161 if (terminate) { 6162 if ((ctxt->instate != XML_PARSER_EOF) && 6163 (ctxt->instate != XML_PARSER_EPILOG) && 6164 (ctxt->instate != XML_PARSER_MISC)) { 
6165 ctxt->errNo = XML_ERR_DOCUMENT_END; 6166 ctxt->wellFormed = 0; 6167 } 6168 if (ctxt->instate != XML_PARSER_EOF) { 6169 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL)) 6170 ctxt->sax->endDocument(ctxt->userData); 6171 } 6172 ctxt->instate = XML_PARSER_EOF; 6173 } 6174 return((xmlParserErrors) ctxt->errNo); 6175 } 6176 6177 /************************************************************************ 6178 * * 6179 * User entry points * 6180 * * 6181 ************************************************************************/ 6182 6183 /** 6184 * htmlCreatePushParserCtxt: 6185 * @sax: a SAX handler 6186 * @user_data: The user data returned on SAX callbacks 6187 * @chunk: a pointer to an array of chars 6188 * @size: number of chars in the array 6189 * @filename: an optional file name or URI 6190 * @enc: an optional encoding 6191 * 6192 * Create a parser context for using the HTML parser in push mode 6193 * The value of @filename is used for fetching external entities 6194 * and error/warning reports. 
6195 * 6196 * Returns the new parser context or NULL 6197 */ 6198 htmlParserCtxtPtr 6199 htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax, void *user_data, 6200 const char *chunk, int size, const char *filename, 6201 xmlCharEncoding enc) { 6202 htmlParserCtxtPtr ctxt; 6203 htmlParserInputPtr inputStream; 6204 xmlParserInputBufferPtr buf; 6205 6206 xmlInitParser(); 6207 6208 buf = xmlAllocParserInputBuffer(enc); 6209 if (buf == NULL) return(NULL); 6210 6211 ctxt = htmlNewParserCtxt(); 6212 if (ctxt == NULL) { 6213 xmlFreeParserInputBuffer(buf); 6214 return(NULL); 6215 } 6216 if(enc==XML_CHAR_ENCODING_UTF8 || buf->encoder) 6217 ctxt->charset=XML_CHAR_ENCODING_UTF8; 6218 if (sax != NULL) { 6219 if (ctxt->sax != (xmlSAXHandlerPtr) &htmlDefaultSAXHandler) 6220 xmlFree(ctxt->sax); 6221 ctxt->sax = (htmlSAXHandlerPtr) xmlMalloc(sizeof(htmlSAXHandler)); 6222 if (ctxt->sax == NULL) { 6223 xmlFree(buf); 6224 xmlFree(ctxt); 6225 return(NULL); 6226 } 6227 memcpy(ctxt->sax, sax, sizeof(htmlSAXHandler)); 6228 if (user_data != NULL) 6229 ctxt->userData = user_data; 6230 } 6231 if (filename == NULL) { 6232 ctxt->directory = NULL; 6233 } else { 6234 ctxt->directory = xmlParserGetDirectory(filename); 6235 } 6236 6237 inputStream = htmlNewInputStream(ctxt); 6238 if (inputStream == NULL) { 6239 xmlFreeParserCtxt(ctxt); 6240 xmlFree(buf); 6241 return(NULL); 6242 } 6243 6244 if (filename == NULL) 6245 inputStream->filename = NULL; 6246 else 6247 inputStream->filename = (char *) 6248 xmlCanonicPath((const xmlChar *) filename); 6249 inputStream->buf = buf; 6250 xmlBufResetInput(buf->buffer, inputStream); 6251 6252 inputPush(ctxt, inputStream); 6253 6254 if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) && 6255 (ctxt->input->buf != NULL)) { 6256 size_t base = xmlBufGetInputBase(ctxt->input->buf->buffer, ctxt->input); 6257 size_t cur = ctxt->input->cur - ctxt->input->base; 6258 6259 xmlParserInputBufferPush(ctxt->input->buf, size, chunk); 6260 6261 
xmlBufSetInputBaseCur(ctxt->input->buf->buffer, ctxt->input, base, cur); 6262 #ifdef DEBUG_PUSH 6263 xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size); 6264 #endif 6265 } 6266 ctxt->progressive = 1; 6267 6268 return(ctxt); 6269 } 6270 #endif /* LIBXML_PUSH_ENABLED */ 6271 6272 /** 6273 * htmlSAXParseDoc: 6274 * @cur: a pointer to an array of xmlChar 6275 * @encoding: a free form C string describing the HTML document encoding, or NULL 6276 * @sax: the SAX handler block 6277 * @userData: if using SAX, this pointer will be provided on callbacks. 6278 * 6279 * Parse an HTML in-memory document. If sax is not NULL, use the SAX callbacks 6280 * to handle parse events. If sax is NULL, fallback to the default DOM 6281 * behavior and return a tree. 6282 * 6283 * Returns the resulting document tree unless SAX is NULL or the document is 6284 * not well formed. 6285 */ 6286 6287 htmlDocPtr 6288 htmlSAXParseDoc(const xmlChar *cur, const char *encoding, 6289 htmlSAXHandlerPtr sax, void *userData) { 6290 htmlDocPtr ret; 6291 htmlParserCtxtPtr ctxt; 6292 6293 xmlInitParser(); 6294 6295 if (cur == NULL) return(NULL); 6296 6297 6298 ctxt = htmlCreateDocParserCtxt(cur, encoding); 6299 if (ctxt == NULL) return(NULL); 6300 if (sax != NULL) { 6301 if (ctxt->sax != NULL) xmlFree (ctxt->sax); 6302 ctxt->sax = sax; 6303 ctxt->userData = userData; 6304 } 6305 6306 htmlParseDocument(ctxt); 6307 ret = ctxt->myDoc; 6308 if (sax != NULL) { 6309 ctxt->sax = NULL; 6310 ctxt->userData = NULL; 6311 } 6312 htmlFreeParserCtxt(ctxt); 6313 6314 return(ret); 6315 } 6316 6317 /** 6318 * htmlParseDoc: 6319 * @cur: a pointer to an array of xmlChar 6320 * @encoding: a free form C string describing the HTML document encoding, or NULL 6321 * 6322 * parse an HTML in-memory document and build a tree. 
6323 * 6324 * Returns the resulting document tree 6325 */ 6326 6327 htmlDocPtr 6328 htmlParseDoc(const xmlChar *cur, const char *encoding) { 6329 return(htmlSAXParseDoc(cur, encoding, NULL, NULL)); 6330 } 6331 6332 6333 /** 6334 * htmlCreateFileParserCtxt: 6335 * @filename: the filename 6336 * @encoding: a free form C string describing the HTML document encoding, or NULL 6337 * 6338 * Create a parser context for a file content. 6339 * Automatic support for ZLIB/Compress compressed document is provided 6340 * by default if found at compile-time. 6341 * 6342 * Returns the new parser context or NULL 6343 */ 6344 htmlParserCtxtPtr 6345 htmlCreateFileParserCtxt(const char *filename, const char *encoding) 6346 { 6347 htmlParserCtxtPtr ctxt; 6348 htmlParserInputPtr inputStream; 6349 char *canonicFilename; 6350 /* htmlCharEncoding enc; */ 6351 xmlChar *content, *content_line = (xmlChar *) "charset="; 6352 6353 if (filename == NULL) 6354 return(NULL); 6355 6356 ctxt = htmlNewParserCtxt(); 6357 if (ctxt == NULL) { 6358 return(NULL); 6359 } 6360 canonicFilename = (char *) xmlCanonicPath((const xmlChar *) filename); 6361 if (canonicFilename == NULL) { 6362 #ifdef LIBXML_SAX1_ENABLED 6363 if (xmlDefaultSAXHandler.error != NULL) { 6364 xmlDefaultSAXHandler.error(NULL, "out of memory\n"); 6365 } 6366 #endif 6367 xmlFreeParserCtxt(ctxt); 6368 return(NULL); 6369 } 6370 6371 inputStream = xmlLoadExternalEntity(canonicFilename, NULL, ctxt); 6372 xmlFree(canonicFilename); 6373 if (inputStream == NULL) { 6374 xmlFreeParserCtxt(ctxt); 6375 return(NULL); 6376 } 6377 6378 inputPush(ctxt, inputStream); 6379 6380 /* set encoding */ 6381 if (encoding) { 6382 size_t l = strlen(encoding); 6383 6384 if (l < 1000) { 6385 content = xmlMallocAtomic (xmlStrlen(content_line) + l + 1); 6386 if (content) { 6387 strcpy ((char *)content, (char *)content_line); 6388 strcat ((char *)content, (char *)encoding); 6389 htmlCheckEncoding (ctxt, content); 6390 xmlFree (content); 6391 } 6392 } 6393 } 6394 6395 
return(ctxt); 6396 } 6397 6398 /** 6399 * htmlSAXParseFile: 6400 * @filename: the filename 6401 * @encoding: a free form C string describing the HTML document encoding, or NULL 6402 * @sax: the SAX handler block 6403 * @userData: if using SAX, this pointer will be provided on callbacks. 6404 * 6405 * parse an HTML file and build a tree. Automatic support for ZLIB/Compress 6406 * compressed document is provided by default if found at compile-time. 6407 * It use the given SAX function block to handle the parsing callback. 6408 * If sax is NULL, fallback to the default DOM tree building routines. 6409 * 6410 * Returns the resulting document tree unless SAX is NULL or the document is 6411 * not well formed. 6412 */ 6413 6414 htmlDocPtr 6415 htmlSAXParseFile(const char *filename, const char *encoding, htmlSAXHandlerPtr sax, 6416 void *userData) { 6417 htmlDocPtr ret; 6418 htmlParserCtxtPtr ctxt; 6419 htmlSAXHandlerPtr oldsax = NULL; 6420 6421 xmlInitParser(); 6422 6423 ctxt = htmlCreateFileParserCtxt(filename, encoding); 6424 if (ctxt == NULL) return(NULL); 6425 if (sax != NULL) { 6426 oldsax = ctxt->sax; 6427 ctxt->sax = sax; 6428 ctxt->userData = userData; 6429 } 6430 6431 htmlParseDocument(ctxt); 6432 6433 ret = ctxt->myDoc; 6434 if (sax != NULL) { 6435 ctxt->sax = oldsax; 6436 ctxt->userData = NULL; 6437 } 6438 htmlFreeParserCtxt(ctxt); 6439 6440 return(ret); 6441 } 6442 6443 /** 6444 * htmlParseFile: 6445 * @filename: the filename 6446 * @encoding: a free form C string describing the HTML document encoding, or NULL 6447 * 6448 * parse an HTML file and build a tree. Automatic support for ZLIB/Compress 6449 * compressed document is provided by default if found at compile-time. 
6450 * 6451 * Returns the resulting document tree 6452 */ 6453 6454 htmlDocPtr 6455 htmlParseFile(const char *filename, const char *encoding) { 6456 return(htmlSAXParseFile(filename, encoding, NULL, NULL)); 6457 } 6458 6459 /** 6460 * htmlHandleOmittedElem: 6461 * @val: int 0 or 1 6462 * 6463 * Set and return the previous value for handling HTML omitted tags. 6464 * 6465 * Returns the last value for 0 for no handling, 1 for auto insertion. 6466 */ 6467 6468 int 6469 htmlHandleOmittedElem(int val) { 6470 int old = htmlOmittedDefaultValue; 6471 6472 htmlOmittedDefaultValue = val; 6473 return(old); 6474 } 6475 6476 /** 6477 * htmlElementAllowedHere: 6478 * @parent: HTML parent element 6479 * @elt: HTML element 6480 * 6481 * Checks whether an HTML element may be a direct child of a parent element. 6482 * Note - doesn't check for deprecated elements 6483 * 6484 * Returns 1 if allowed; 0 otherwise. 6485 */ 6486 int 6487 htmlElementAllowedHere(const htmlElemDesc* parent, const xmlChar* elt) { 6488 const char** p ; 6489 6490 if ( ! elt || ! parent || ! parent->subelts ) 6491 return 0 ; 6492 6493 for ( p = parent->subelts; *p; ++p ) 6494 if ( !xmlStrcmp((const xmlChar *)*p, elt) ) 6495 return 1 ; 6496 6497 return 0 ; 6498 } 6499 /** 6500 * htmlElementStatusHere: 6501 * @parent: HTML parent element 6502 * @elt: HTML element 6503 * 6504 * Checks whether an HTML element may be a direct child of a parent element. 6505 * and if so whether it is valid or deprecated. 6506 * 6507 * Returns one of HTML_VALID, HTML_DEPRECATED, HTML_INVALID 6508 */ 6509 htmlStatus 6510 htmlElementStatusHere(const htmlElemDesc* parent, const htmlElemDesc* elt) { 6511 if ( ! parent || ! elt ) 6512 return HTML_INVALID ; 6513 if ( ! htmlElementAllowedHere(parent, (const xmlChar*) elt->name ) ) 6514 return HTML_INVALID ; 6515 6516 return ( elt->dtd == 0 ) ? 
HTML_VALID : HTML_DEPRECATED ; 6517 } 6518 /** 6519 * htmlAttrAllowed: 6520 * @elt: HTML element 6521 * @attr: HTML attribute 6522 * @legacy: whether to allow deprecated attributes 6523 * 6524 * Checks whether an attribute is valid for an element 6525 * Has full knowledge of Required and Deprecated attributes 6526 * 6527 * Returns one of HTML_REQUIRED, HTML_VALID, HTML_DEPRECATED, HTML_INVALID 6528 */ 6529 htmlStatus 6530 htmlAttrAllowed(const htmlElemDesc* elt, const xmlChar* attr, int legacy) { 6531 const char** p ; 6532 6533 if ( !elt || ! attr ) 6534 return HTML_INVALID ; 6535 6536 if ( elt->attrs_req ) 6537 for ( p = elt->attrs_req; *p; ++p) 6538 if ( !xmlStrcmp((const xmlChar*)*p, attr) ) 6539 return HTML_REQUIRED ; 6540 6541 if ( elt->attrs_opt ) 6542 for ( p = elt->attrs_opt; *p; ++p) 6543 if ( !xmlStrcmp((const xmlChar*)*p, attr) ) 6544 return HTML_VALID ; 6545 6546 if ( legacy && elt->attrs_depr ) 6547 for ( p = elt->attrs_depr; *p; ++p) 6548 if ( !xmlStrcmp((const xmlChar*)*p, attr) ) 6549 return HTML_DEPRECATED ; 6550 6551 return HTML_INVALID ; 6552 } 6553 /** 6554 * htmlNodeStatus: 6555 * @node: an htmlNodePtr in a tree 6556 * @legacy: whether to allow deprecated elements (YES is faster here 6557 * for Element nodes) 6558 * 6559 * Checks whether the tree node is valid. Experimental (the author 6560 * only uses the HTML enhancements in a SAX parser) 6561 * 6562 * Return: for Element nodes, a return from htmlElementAllowedHere (if 6563 * legacy allowed) or htmlElementStatusHere (otherwise). 6564 * for Attribute nodes, a return from htmlAttrAllowed 6565 * for other nodes, HTML_NA (no checks performed) 6566 */ 6567 htmlStatus 6568 htmlNodeStatus(const htmlNodePtr node, int legacy) { 6569 if ( ! node ) 6570 return HTML_INVALID ; 6571 6572 switch ( node->type ) { 6573 case XML_ELEMENT_NODE: 6574 return legacy 6575 ? ( htmlElementAllowedHere ( 6576 htmlTagLookup(node->parent->name) , node->name 6577 ) ? 
HTML_VALID : HTML_INVALID ) 6578 : htmlElementStatusHere( 6579 htmlTagLookup(node->parent->name) , 6580 htmlTagLookup(node->name) ) 6581 ; 6582 case XML_ATTRIBUTE_NODE: 6583 return htmlAttrAllowed( 6584 htmlTagLookup(node->parent->name) , node->name, legacy) ; 6585 default: return HTML_NA ; 6586 } 6587 } 6588 /************************************************************************ 6589 * * 6590 * New set (2.6.0) of simpler and more flexible APIs * 6591 * * 6592 ************************************************************************/ 6593 /** 6594 * DICT_FREE: 6595 * @str: a string 6596 * 6597 * Free a string if it is not owned by the "dict" dictionary in the 6598 * current scope 6599 */ 6600 #define DICT_FREE(str) \ 6601 if ((str) && ((!dict) || \ 6602 (xmlDictOwns(dict, (const xmlChar *)(str)) == 0))) \ 6603 xmlFree((char *)(str)); 6604 6605 /** 6606 * htmlCtxtReset: 6607 * @ctxt: an HTML parser context 6608 * 6609 * Reset a parser context 6610 */ 6611 void 6612 htmlCtxtReset(htmlParserCtxtPtr ctxt) 6613 { 6614 xmlParserInputPtr input; 6615 xmlDictPtr dict; 6616 6617 if (ctxt == NULL) 6618 return; 6619 6620 xmlInitParser(); 6621 dict = ctxt->dict; 6622 6623 while ((input = inputPop(ctxt)) != NULL) { /* Non consuming */ 6624 xmlFreeInputStream(input); 6625 } 6626 ctxt->inputNr = 0; 6627 ctxt->input = NULL; 6628 6629 ctxt->spaceNr = 0; 6630 if (ctxt->spaceTab != NULL) { 6631 ctxt->spaceTab[0] = -1; 6632 ctxt->space = &ctxt->spaceTab[0]; 6633 } else { 6634 ctxt->space = NULL; 6635 } 6636 6637 6638 ctxt->nodeNr = 0; 6639 ctxt->node = NULL; 6640 6641 ctxt->nameNr = 0; 6642 ctxt->name = NULL; 6643 6644 DICT_FREE(ctxt->version); 6645 ctxt->version = NULL; 6646 DICT_FREE(ctxt->encoding); 6647 ctxt->encoding = NULL; 6648 DICT_FREE(ctxt->directory); 6649 ctxt->directory = NULL; 6650 DICT_FREE(ctxt->extSubURI); 6651 ctxt->extSubURI = NULL; 6652 DICT_FREE(ctxt->extSubSystem); 6653 ctxt->extSubSystem = NULL; 6654 if (ctxt->myDoc != NULL) 6655 xmlFreeDoc(ctxt->myDoc); 6656 
ctxt->myDoc = NULL; 6657 6658 ctxt->standalone = -1; 6659 ctxt->hasExternalSubset = 0; 6660 ctxt->hasPErefs = 0; 6661 ctxt->html = 1; 6662 ctxt->external = 0; 6663 ctxt->instate = XML_PARSER_START; 6664 ctxt->token = 0; 6665 6666 ctxt->wellFormed = 1; 6667 ctxt->nsWellFormed = 1; 6668 ctxt->disableSAX = 0; 6669 ctxt->valid = 1; 6670 ctxt->vctxt.userData = ctxt; 6671 ctxt->vctxt.error = xmlParserValidityError; 6672 ctxt->vctxt.warning = xmlParserValidityWarning; 6673 ctxt->record_info = 0; 6674 ctxt->nbChars = 0; 6675 ctxt->checkIndex = 0; 6676 ctxt->inSubset = 0; 6677 ctxt->errNo = XML_ERR_OK; 6678 ctxt->depth = 0; 6679 ctxt->charset = XML_CHAR_ENCODING_NONE; 6680 ctxt->catalogs = NULL; 6681 xmlInitNodeInfoSeq(&ctxt->node_seq); 6682 6683 if (ctxt->attsDefault != NULL) { 6684 xmlHashFree(ctxt->attsDefault, xmlHashDefaultDeallocator); 6685 ctxt->attsDefault = NULL; 6686 } 6687 if (ctxt->attsSpecial != NULL) { 6688 xmlHashFree(ctxt->attsSpecial, NULL); 6689 ctxt->attsSpecial = NULL; 6690 } 6691 } 6692 6693 /** 6694 * htmlCtxtUseOptions: 6695 * @ctxt: an HTML parser context 6696 * @options: a combination of htmlParserOption(s) 6697 * 6698 * Applies the options to the parser context 6699 * 6700 * Returns 0 in case of success, the set of unknown or unimplemented options 6701 * in case of error. 
6702 */ 6703 int 6704 htmlCtxtUseOptions(htmlParserCtxtPtr ctxt, int options) 6705 { 6706 if (ctxt == NULL) 6707 return(-1); 6708 6709 if (options & HTML_PARSE_NOWARNING) { 6710 ctxt->sax->warning = NULL; 6711 ctxt->vctxt.warning = NULL; 6712 options -= XML_PARSE_NOWARNING; 6713 ctxt->options |= XML_PARSE_NOWARNING; 6714 } 6715 if (options & HTML_PARSE_NOERROR) { 6716 ctxt->sax->error = NULL; 6717 ctxt->vctxt.error = NULL; 6718 ctxt->sax->fatalError = NULL; 6719 options -= XML_PARSE_NOERROR; 6720 ctxt->options |= XML_PARSE_NOERROR; 6721 } 6722 if (options & HTML_PARSE_PEDANTIC) { 6723 ctxt->pedantic = 1; 6724 options -= XML_PARSE_PEDANTIC; 6725 ctxt->options |= XML_PARSE_PEDANTIC; 6726 } else 6727 ctxt->pedantic = 0; 6728 if (options & XML_PARSE_NOBLANKS) { 6729 ctxt->keepBlanks = 0; 6730 ctxt->sax->ignorableWhitespace = xmlSAX2IgnorableWhitespace; 6731 options -= XML_PARSE_NOBLANKS; 6732 ctxt->options |= XML_PARSE_NOBLANKS; 6733 } else 6734 ctxt->keepBlanks = 1; 6735 if (options & HTML_PARSE_RECOVER) { 6736 ctxt->recovery = 1; 6737 options -= HTML_PARSE_RECOVER; 6738 } else 6739 ctxt->recovery = 0; 6740 if (options & HTML_PARSE_COMPACT) { 6741 ctxt->options |= HTML_PARSE_COMPACT; 6742 options -= HTML_PARSE_COMPACT; 6743 } 6744 if (options & XML_PARSE_HUGE) { 6745 ctxt->options |= XML_PARSE_HUGE; 6746 options -= XML_PARSE_HUGE; 6747 } 6748 if (options & HTML_PARSE_NODEFDTD) { 6749 ctxt->options |= HTML_PARSE_NODEFDTD; 6750 options -= HTML_PARSE_NODEFDTD; 6751 } 6752 if (options & HTML_PARSE_IGNORE_ENC) { 6753 ctxt->options |= HTML_PARSE_IGNORE_ENC; 6754 options -= HTML_PARSE_IGNORE_ENC; 6755 } 6756 if (options & HTML_PARSE_NOIMPLIED) { 6757 ctxt->options |= HTML_PARSE_NOIMPLIED; 6758 options -= HTML_PARSE_NOIMPLIED; 6759 } 6760 ctxt->dictNames = 0; 6761 return (options); 6762 } 6763 6764 /** 6765 * htmlDoRead: 6766 * @ctxt: an HTML parser context 6767 * @URL: the base URL to use for the document 6768 * @encoding: the document encoding, or NULL 6769 * @options: a 
combination of htmlParserOption(s)
 * @reuse:  keep the context for reuse
 *
 * Common front-end for the htmlRead functions
 *
 * Returns the resulting document tree or NULL
 */
static htmlDocPtr
htmlDoRead(htmlParserCtxtPtr ctxt, const char *URL, const char *encoding,
          int options, int reuse)
{
    htmlDocPtr ret;

    htmlCtxtUseOptions(ctxt, options);
    ctxt->html = 1;
    if (encoding != NULL) {
        xmlCharEncodingHandlerPtr hdlr;

        hdlr = xmlFindCharEncodingHandler(encoding);
        if (hdlr != NULL) {
            xmlSwitchToEncoding(ctxt, hdlr);
            /*
             * The callers push their input without checking the result of
             * inputPush(), so the context may have no current input here;
             * only record the encoding name when there is one.
             */
            if (ctxt->input != NULL) {
                if (ctxt->input->encoding != NULL)
                    xmlFree((xmlChar *) ctxt->input->encoding);
                ctxt->input->encoding = xmlStrdup((const xmlChar *) encoding);
            }
        }
    }
    /* The URL only becomes the input's filename when none is set yet. */
    if ((URL != NULL) && (ctxt->input != NULL) &&
        (ctxt->input->filename == NULL))
        ctxt->input->filename = (char *) xmlStrdup((const xmlChar *) URL);
    htmlParseDocument(ctxt);
    /* Take the document out of the context so the caller ends up holding it. */
    ret = ctxt->myDoc;
    ctxt->myDoc = NULL;
    if (!reuse) {
        /*
         * When the returned document shares the context dictionary, the
         * dictionary is detached from the context before it is freed.
         */
        if ((ctxt->dictNames) &&
            (ret != NULL) &&
            (ret->dict == ctxt->dict))
            ctxt->dict = NULL;
        xmlFreeParserCtxt(ctxt);
    }
    return (ret);
}

/**
 * htmlReadDoc:
 * @cur:  a pointer to a zero terminated string
 * @URL:  the base URL to use for the document
 * @encoding:  the document encoding, or NULL
 * @options:  a combination of htmlParserOption(s)
 *
 * parse an HTML in-memory document and build a tree.
6819 * 6820 * Returns the resulting document tree 6821 */ 6822 htmlDocPtr 6823 htmlReadDoc(const xmlChar * cur, const char *URL, const char *encoding, int options) 6824 { 6825 htmlParserCtxtPtr ctxt; 6826 6827 if (cur == NULL) 6828 return (NULL); 6829 6830 xmlInitParser(); 6831 ctxt = htmlCreateDocParserCtxt(cur, NULL); 6832 if (ctxt == NULL) 6833 return (NULL); 6834 return (htmlDoRead(ctxt, URL, encoding, options, 0)); 6835 } 6836 6837 /** 6838 * htmlReadFile: 6839 * @filename: a file or URL 6840 * @encoding: the document encoding, or NULL 6841 * @options: a combination of htmlParserOption(s) 6842 * 6843 * parse an XML file from the filesystem or the network. 6844 * 6845 * Returns the resulting document tree 6846 */ 6847 htmlDocPtr 6848 htmlReadFile(const char *filename, const char *encoding, int options) 6849 { 6850 htmlParserCtxtPtr ctxt; 6851 6852 xmlInitParser(); 6853 ctxt = htmlCreateFileParserCtxt(filename, encoding); 6854 if (ctxt == NULL) 6855 return (NULL); 6856 return (htmlDoRead(ctxt, NULL, NULL, options, 0)); 6857 } 6858 6859 /** 6860 * htmlReadMemory: 6861 * @buffer: a pointer to a char array 6862 * @size: the size of the array 6863 * @URL: the base URL to use for the document 6864 * @encoding: the document encoding, or NULL 6865 * @options: a combination of htmlParserOption(s) 6866 * 6867 * parse an XML in-memory document and build a tree. 
6868 * 6869 * Returns the resulting document tree 6870 */ 6871 htmlDocPtr 6872 htmlReadMemory(const char *buffer, int size, const char *URL, const char *encoding, int options) 6873 { 6874 htmlParserCtxtPtr ctxt; 6875 6876 xmlInitParser(); 6877 ctxt = xmlCreateMemoryParserCtxt(buffer, size); 6878 if (ctxt == NULL) 6879 return (NULL); 6880 htmlDefaultSAXHandlerInit(); 6881 if (ctxt->sax != NULL) 6882 memcpy(ctxt->sax, &htmlDefaultSAXHandler, sizeof(xmlSAXHandlerV1)); 6883 return (htmlDoRead(ctxt, URL, encoding, options, 0)); 6884 } 6885 6886 /** 6887 * htmlReadFd: 6888 * @fd: an open file descriptor 6889 * @URL: the base URL to use for the document 6890 * @encoding: the document encoding, or NULL 6891 * @options: a combination of htmlParserOption(s) 6892 * 6893 * parse an XML from a file descriptor and build a tree. 6894 * 6895 * Returns the resulting document tree 6896 */ 6897 htmlDocPtr 6898 htmlReadFd(int fd, const char *URL, const char *encoding, int options) 6899 { 6900 htmlParserCtxtPtr ctxt; 6901 xmlParserInputBufferPtr input; 6902 xmlParserInputPtr stream; 6903 6904 if (fd < 0) 6905 return (NULL); 6906 xmlInitParser(); 6907 6908 xmlInitParser(); 6909 input = xmlParserInputBufferCreateFd(fd, XML_CHAR_ENCODING_NONE); 6910 if (input == NULL) 6911 return (NULL); 6912 ctxt = xmlNewParserCtxt(); 6913 if (ctxt == NULL) { 6914 xmlFreeParserInputBuffer(input); 6915 return (NULL); 6916 } 6917 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE); 6918 if (stream == NULL) { 6919 xmlFreeParserInputBuffer(input); 6920 xmlFreeParserCtxt(ctxt); 6921 return (NULL); 6922 } 6923 inputPush(ctxt, stream); 6924 return (htmlDoRead(ctxt, URL, encoding, options, 0)); 6925 } 6926 6927 /** 6928 * htmlReadIO: 6929 * @ioread: an I/O read function 6930 * @ioclose: an I/O close function 6931 * @ioctx: an I/O handler 6932 * @URL: the base URL to use for the document 6933 * @encoding: the document encoding, or NULL 6934 * @options: a combination of htmlParserOption(s) 6935 * 
6936 * parse an HTML document from I/O functions and source and build a tree. 6937 * 6938 * Returns the resulting document tree 6939 */ 6940 htmlDocPtr 6941 htmlReadIO(xmlInputReadCallback ioread, xmlInputCloseCallback ioclose, 6942 void *ioctx, const char *URL, const char *encoding, int options) 6943 { 6944 htmlParserCtxtPtr ctxt; 6945 xmlParserInputBufferPtr input; 6946 xmlParserInputPtr stream; 6947 6948 if (ioread == NULL) 6949 return (NULL); 6950 xmlInitParser(); 6951 6952 input = xmlParserInputBufferCreateIO(ioread, ioclose, ioctx, 6953 XML_CHAR_ENCODING_NONE); 6954 if (input == NULL) { 6955 if (ioclose != NULL) 6956 ioclose(ioctx); 6957 return (NULL); 6958 } 6959 ctxt = htmlNewParserCtxt(); 6960 if (ctxt == NULL) { 6961 xmlFreeParserInputBuffer(input); 6962 return (NULL); 6963 } 6964 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE); 6965 if (stream == NULL) { 6966 xmlFreeParserInputBuffer(input); 6967 xmlFreeParserCtxt(ctxt); 6968 return (NULL); 6969 } 6970 inputPush(ctxt, stream); 6971 return (htmlDoRead(ctxt, URL, encoding, options, 0)); 6972 } 6973 6974 /** 6975 * htmlCtxtReadDoc: 6976 * @ctxt: an HTML parser context 6977 * @cur: a pointer to a zero terminated string 6978 * @URL: the base URL to use for the document 6979 * @encoding: the document encoding, or NULL 6980 * @options: a combination of htmlParserOption(s) 6981 * 6982 * parse an XML in-memory document and build a tree. 
6983 * This reuses the existing @ctxt parser context 6984 * 6985 * Returns the resulting document tree 6986 */ 6987 htmlDocPtr 6988 htmlCtxtReadDoc(htmlParserCtxtPtr ctxt, const xmlChar * cur, 6989 const char *URL, const char *encoding, int options) 6990 { 6991 xmlParserInputPtr stream; 6992 6993 if (cur == NULL) 6994 return (NULL); 6995 if (ctxt == NULL) 6996 return (NULL); 6997 xmlInitParser(); 6998 6999 htmlCtxtReset(ctxt); 7000 7001 stream = xmlNewStringInputStream(ctxt, cur); 7002 if (stream == NULL) { 7003 return (NULL); 7004 } 7005 inputPush(ctxt, stream); 7006 return (htmlDoRead(ctxt, URL, encoding, options, 1)); 7007 } 7008 7009 /** 7010 * htmlCtxtReadFile: 7011 * @ctxt: an HTML parser context 7012 * @filename: a file or URL 7013 * @encoding: the document encoding, or NULL 7014 * @options: a combination of htmlParserOption(s) 7015 * 7016 * parse an XML file from the filesystem or the network. 7017 * This reuses the existing @ctxt parser context 7018 * 7019 * Returns the resulting document tree 7020 */ 7021 htmlDocPtr 7022 htmlCtxtReadFile(htmlParserCtxtPtr ctxt, const char *filename, 7023 const char *encoding, int options) 7024 { 7025 xmlParserInputPtr stream; 7026 7027 if (filename == NULL) 7028 return (NULL); 7029 if (ctxt == NULL) 7030 return (NULL); 7031 xmlInitParser(); 7032 7033 htmlCtxtReset(ctxt); 7034 7035 stream = xmlLoadExternalEntity(filename, NULL, ctxt); 7036 if (stream == NULL) { 7037 return (NULL); 7038 } 7039 inputPush(ctxt, stream); 7040 return (htmlDoRead(ctxt, NULL, encoding, options, 1)); 7041 } 7042 7043 /** 7044 * htmlCtxtReadMemory: 7045 * @ctxt: an HTML parser context 7046 * @buffer: a pointer to a char array 7047 * @size: the size of the array 7048 * @URL: the base URL to use for the document 7049 * @encoding: the document encoding, or NULL 7050 * @options: a combination of htmlParserOption(s) 7051 * 7052 * parse an XML in-memory document and build a tree. 
7053 * This reuses the existing @ctxt parser context 7054 * 7055 * Returns the resulting document tree 7056 */ 7057 htmlDocPtr 7058 htmlCtxtReadMemory(htmlParserCtxtPtr ctxt, const char *buffer, int size, 7059 const char *URL, const char *encoding, int options) 7060 { 7061 xmlParserInputBufferPtr input; 7062 xmlParserInputPtr stream; 7063 7064 if (ctxt == NULL) 7065 return (NULL); 7066 if (buffer == NULL) 7067 return (NULL); 7068 xmlInitParser(); 7069 7070 htmlCtxtReset(ctxt); 7071 7072 input = xmlParserInputBufferCreateMem(buffer, size, XML_CHAR_ENCODING_NONE); 7073 if (input == NULL) { 7074 return(NULL); 7075 } 7076 7077 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE); 7078 if (stream == NULL) { 7079 xmlFreeParserInputBuffer(input); 7080 return(NULL); 7081 } 7082 7083 inputPush(ctxt, stream); 7084 return (htmlDoRead(ctxt, URL, encoding, options, 1)); 7085 } 7086 7087 /** 7088 * htmlCtxtReadFd: 7089 * @ctxt: an HTML parser context 7090 * @fd: an open file descriptor 7091 * @URL: the base URL to use for the document 7092 * @encoding: the document encoding, or NULL 7093 * @options: a combination of htmlParserOption(s) 7094 * 7095 * parse an XML from a file descriptor and build a tree. 
7096 * This reuses the existing @ctxt parser context 7097 * 7098 * Returns the resulting document tree 7099 */ 7100 htmlDocPtr 7101 htmlCtxtReadFd(htmlParserCtxtPtr ctxt, int fd, 7102 const char *URL, const char *encoding, int options) 7103 { 7104 xmlParserInputBufferPtr input; 7105 xmlParserInputPtr stream; 7106 7107 if (fd < 0) 7108 return (NULL); 7109 if (ctxt == NULL) 7110 return (NULL); 7111 xmlInitParser(); 7112 7113 htmlCtxtReset(ctxt); 7114 7115 7116 input = xmlParserInputBufferCreateFd(fd, XML_CHAR_ENCODING_NONE); 7117 if (input == NULL) 7118 return (NULL); 7119 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE); 7120 if (stream == NULL) { 7121 xmlFreeParserInputBuffer(input); 7122 return (NULL); 7123 } 7124 inputPush(ctxt, stream); 7125 return (htmlDoRead(ctxt, URL, encoding, options, 1)); 7126 } 7127 7128 /** 7129 * htmlCtxtReadIO: 7130 * @ctxt: an HTML parser context 7131 * @ioread: an I/O read function 7132 * @ioclose: an I/O close function 7133 * @ioctx: an I/O handler 7134 * @URL: the base URL to use for the document 7135 * @encoding: the document encoding, or NULL 7136 * @options: a combination of htmlParserOption(s) 7137 * 7138 * parse an HTML document from I/O functions and source and build a tree. 
7139 * This reuses the existing @ctxt parser context 7140 * 7141 * Returns the resulting document tree 7142 */ 7143 htmlDocPtr 7144 htmlCtxtReadIO(htmlParserCtxtPtr ctxt, xmlInputReadCallback ioread, 7145 xmlInputCloseCallback ioclose, void *ioctx, 7146 const char *URL, 7147 const char *encoding, int options) 7148 { 7149 xmlParserInputBufferPtr input; 7150 xmlParserInputPtr stream; 7151 7152 if (ioread == NULL) 7153 return (NULL); 7154 if (ctxt == NULL) 7155 return (NULL); 7156 xmlInitParser(); 7157 7158 htmlCtxtReset(ctxt); 7159 7160 input = xmlParserInputBufferCreateIO(ioread, ioclose, ioctx, 7161 XML_CHAR_ENCODING_NONE); 7162 if (input == NULL) { 7163 if (ioclose != NULL) 7164 ioclose(ioctx); 7165 return (NULL); 7166 } 7167 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE); 7168 if (stream == NULL) { 7169 xmlFreeParserInputBuffer(input); 7170 return (NULL); 7171 } 7172 inputPush(ctxt, stream); 7173 return (htmlDoRead(ctxt, URL, encoding, options, 1)); 7174 } 7175 7176 #define bottom_HTMLparser 7177 #include "elfgcchack.h" 7178 #endif /* LIBXML_HTML_ENABLED */ 7179