1 /* 2 * HTMLtree.c : implementation of access function for an HTML tree. 3 * 4 * See Copyright for the status of this software. 5 * 6 * daniel@veillard.com 7 */ 8 9 10 #define IN_LIBXML 11 #include "libxml.h" 12 #ifdef LIBXML_HTML_ENABLED 13 14 #include <string.h> /* for memset() only ! */ 15 #include <ctype.h> 16 #include <stdlib.h> 17 18 #include <libxml/xmlmemory.h> 19 #include <libxml/HTMLparser.h> 20 #include <libxml/HTMLtree.h> 21 #include <libxml/entities.h> 22 #include <libxml/valid.h> 23 #include <libxml/xmlerror.h> 24 #include <libxml/parserInternals.h> 25 #include <libxml/globals.h> 26 #include <libxml/uri.h> 27 28 #include "buf.h" 29 30 /************************************************************************ 31 * * 32 * Getting/Setting encoding meta tags * 33 * * 34 ************************************************************************/ 35 36 /** 37 * htmlGetMetaEncoding: 38 * @doc: the document 39 * 40 * Encoding definition lookup in the Meta tags 41 * 42 * Returns the current encoding as flagged in the HTML source 43 */ 44 const xmlChar * 45 htmlGetMetaEncoding(htmlDocPtr doc) { 46 htmlNodePtr cur; 47 const xmlChar *content; 48 const xmlChar *encoding; 49 50 if (doc == NULL) 51 return(NULL); 52 cur = doc->children; 53 54 /* 55 * Search the html 56 */ 57 while (cur != NULL) { 58 if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) { 59 if (xmlStrEqual(cur->name, BAD_CAST"html")) 60 break; 61 if (xmlStrEqual(cur->name, BAD_CAST"head")) 62 goto found_head; 63 if (xmlStrEqual(cur->name, BAD_CAST"meta")) 64 goto found_meta; 65 } 66 cur = cur->next; 67 } 68 if (cur == NULL) 69 return(NULL); 70 cur = cur->children; 71 72 /* 73 * Search the head 74 */ 75 while (cur != NULL) { 76 if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) { 77 if (xmlStrEqual(cur->name, BAD_CAST"head")) 78 break; 79 if (xmlStrEqual(cur->name, BAD_CAST"meta")) 80 goto found_meta; 81 } 82 cur = cur->next; 83 } 84 if (cur == NULL) 85 return(NULL); 86 found_head: 87 cur = cur->children; 88 89 /* 90 * Search the meta elements 91 */ 92 found_meta: 93 while (cur != NULL) { 94 if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) { 95 if (xmlStrEqual(cur->name, BAD_CAST"meta")) { 96 xmlAttrPtr attr = cur->properties; 97 int http; 98 const xmlChar *value; 99 100 content = NULL; 101 http = 0; 102 while (attr != NULL) { 103 if ((attr->children != NULL) && 104 (attr->children->type == XML_TEXT_NODE) && 105 (attr->children->next == NULL)) { 106 value = attr->children->content; 107 if ((!xmlStrcasecmp(attr->name, BAD_CAST"http-equiv")) 108 && (!xmlStrcasecmp(value, BAD_CAST"Content-Type"))) 109 http = 1; 110 else if ((value != NULL) 111 && (!xmlStrcasecmp(attr->name, BAD_CAST"content"))) 112 content = value; 113 if ((http != 0) && (content != NULL)) 114 goto found_content; 115 } 116 attr = attr->next; 117 } 118 } 119 } 120 cur = cur->next; 121 } 122 return(NULL); 123 124 found_content: 125 encoding = xmlStrstr(content, BAD_CAST"charset="); 126 if (encoding == NULL) 127 encoding = xmlStrstr(content, BAD_CAST"Charset="); 128 if (encoding == NULL) 129 encoding = xmlStrstr(content, BAD_CAST"CHARSET="); 130 if (encoding != NULL) { 131 encoding += 8; 132 } else { 133 encoding = xmlStrstr(content, BAD_CAST"charset ="); 134 if (encoding == NULL) 135 encoding = xmlStrstr(content, BAD_CAST"Charset ="); 136 if (encoding == NULL) 137 encoding = xmlStrstr(content, BAD_CAST"CHARSET ="); 138 if (encoding != NULL) 139 encoding += 9; 140 } 141 if (encoding != NULL) { 142 while ((*encoding == ' ') || (*encoding == '\t')) encoding++; 143 } 144 return(encoding); 145 } 146 147 /** 148 * htmlSetMetaEncoding: 149 * @doc: the document 150 * @encoding: the encoding string 151 * 152 * Sets the current encoding in the Meta tags 153 * NOTE: this will not change the document content encoding, just 154 * the META flag associated. 155 * 156 * Returns 0 in case of success and -1 in case of error 157 */ 158 int 159 htmlSetMetaEncoding(htmlDocPtr doc, const xmlChar *encoding) { 160 htmlNodePtr cur, meta = NULL, head = NULL; 161 const xmlChar *content = NULL; 162 char newcontent[100]; 163 164 newcontent[0] = 0; 165 166 if (doc == NULL) 167 return(-1); 168 169 /* html isn't a real encoding it's just libxml2 way to get entities */ 170 if (!xmlStrcasecmp(encoding, BAD_CAST "html")) 171 return(-1); 172 173 if (encoding != NULL) { 174 snprintf(newcontent, sizeof(newcontent), "text/html; charset=%s", 175 (char *)encoding); 176 newcontent[sizeof(newcontent) - 1] = 0; 177 } 178 179 cur = doc->children; 180 181 /* 182 * Search the html 183 */ 184 while (cur != NULL) { 185 if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) { 186 if (xmlStrcasecmp(cur->name, BAD_CAST"html") == 0) 187 break; 188 if (xmlStrcasecmp(cur->name, BAD_CAST"head") == 0) 189 goto found_head; 190 if (xmlStrcasecmp(cur->name, BAD_CAST"meta") == 0) 191 goto found_meta; 192 } 193 cur = cur->next; 194 } 195 if (cur == NULL) 196 return(-1); 197 cur = cur->children; 198 199 /* 200 * Search the head 201 */ 202 while (cur != NULL) { 203 if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) { 204 if (xmlStrcasecmp(cur->name, BAD_CAST"head") == 0) 205 break; 206 if (xmlStrcasecmp(cur->name, BAD_CAST"meta") == 0) { 207 head = cur->parent; 208 goto found_meta; 209 } 210 } 211 cur = cur->next; 212 } 213 if (cur == NULL) 214 return(-1); 215 found_head: 216 head = cur; 217 if (cur->children == NULL) 218 goto create; 219 cur = cur->children; 220 221 found_meta: 222 /* 223 * Search and update all the remaining the meta elements carrying 224 * encoding information 225 */ 226 while (cur != NULL) { 227 if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) { 228 if (xmlStrcasecmp(cur->name, BAD_CAST"meta") == 0) { 229 xmlAttrPtr attr = cur->properties; 230 int http; 231 const xmlChar *value; 232 233 content = NULL; 234 http = 0; 235 while (attr != NULL) { 236 if ((attr->children != NULL) && 237 (attr->children->type == XML_TEXT_NODE) && 238 (attr->children->next == NULL)) { 239 value = attr->children->content; 240 if ((!xmlStrcasecmp(attr->name, BAD_CAST"http-equiv")) 241 && (!xmlStrcasecmp(value, BAD_CAST"Content-Type"))) 242 http = 1; 243 else 244 { 245 if ((value != NULL) && 246 (!xmlStrcasecmp(attr->name, BAD_CAST"content"))) 247 content = value; 248 } 249 if ((http != 0) && (content != NULL)) 250 break; 251 } 252 attr = attr->next; 253 } 254 if ((http != 0) && (content != NULL)) { 255 meta = cur; 256 break; 257 } 258 259 } 260 } 261 cur = cur->next; 262 } 263 create: 264 if (meta == NULL) { 265 if ((encoding != NULL) && (head != NULL)) { 266 /* 267 * Create a new Meta element with the right attributes 268 */ 269 270 meta = xmlNewDocNode(doc, NULL, BAD_CAST"meta", NULL); 271 if (head->children == NULL) 272 xmlAddChild(head, meta); 273 else 274 xmlAddPrevSibling(head->children, meta); 275 xmlNewProp(meta, BAD_CAST"http-equiv", BAD_CAST"Content-Type"); 276 xmlNewProp(meta, BAD_CAST"content", BAD_CAST newcontent); 277 } 278 } else { 279 /* remove the meta tag if NULL is passed */ 280 if (encoding == NULL) { 281 xmlUnlinkNode(meta); 282 xmlFreeNode(meta); 283 } 284 /* change the document only if there is a real encoding change */ 285 else if (xmlStrcasestr(content, encoding) == NULL) { 286 xmlSetProp(meta, BAD_CAST"content", BAD_CAST newcontent); 287 } 288 } 289 290 291 return(0); 292 } 293 294 /** 295 * booleanHTMLAttrs: 296 * 297 * These are the HTML attributes which will be output 298 * in minimized form, i.e. <option selected="selected"> will be 299 * output as <option selected>, as per XSLT 1.0 16.2 "HTML Output Method" 300 * 301 */ 302 static const char* const htmlBooleanAttrs[] = { 303 "checked", "compact", "declare", "defer", "disabled", "ismap", 304 "multiple", "nohref", "noresize", "noshade", "nowrap", "readonly", 305 "selected", NULL 306 }; 307 308 309 /** 310 * htmlIsBooleanAttr: 311 * @name: the name of the attribute to check 312 * 313 * Determine if a given attribute is a boolean attribute. 314 * 315 * returns: false if the attribute is not boolean, true otherwise. 316 */ 317 int 318 htmlIsBooleanAttr(const xmlChar *name) 319 { 320 int i = 0; 321 322 while (htmlBooleanAttrs[i] != NULL) { 323 if (xmlStrcasecmp((const xmlChar *)htmlBooleanAttrs[i], name) == 0) 324 return 1; 325 i++; 326 } 327 return 0; 328 } 329 330 #ifdef LIBXML_OUTPUT_ENABLED 331 /* 332 * private routine exported from xmlIO.c 333 */ 334 xmlOutputBufferPtr 335 xmlAllocOutputBufferInternal(xmlCharEncodingHandlerPtr encoder); 336 /************************************************************************ 337 * * 338 * Output error handlers * 339 * * 340 ************************************************************************/ 341 /** 342 * htmlSaveErrMemory: 343 * @extra: extra information 344 * 345 * Handle an out of memory condition 346 */ 347 static void 348 htmlSaveErrMemory(const char *extra) 349 { 350 __xmlSimpleError(XML_FROM_OUTPUT, XML_ERR_NO_MEMORY, NULL, NULL, extra); 351 } 352 353 /** 354 * htmlSaveErr: 355 * @code: the error number 356 * @node: the location of the error. 357 * @extra: extra information 358 * 359 * Handle an out of memory condition 360 */ 361 static void 362 htmlSaveErr(int code, xmlNodePtr node, const char *extra) 363 { 364 const char *msg = NULL; 365 366 switch(code) { 367 case XML_SAVE_NOT_UTF8: 368 msg = "string is not in UTF-8\n"; 369 break; 370 case XML_SAVE_CHAR_INVALID: 371 msg = "invalid character value\n"; 372 break; 373 case XML_SAVE_UNKNOWN_ENCODING: 374 msg = "unknown encoding %s\n"; 375 break; 376 case XML_SAVE_NO_DOCTYPE: 377 msg = "HTML has no DOCTYPE\n"; 378 break; 379 default: 380 msg = "unexpected error number\n"; 381 } 382 __xmlSimpleError(XML_FROM_OUTPUT, code, node, msg, extra); 383 } 384 385 /************************************************************************ 386 * * 387 * Dumping HTML tree content to a simple buffer * 388 * * 389 ************************************************************************/ 390 391 /** 392 * htmlBufNodeDumpFormat: 393 * @buf: the xmlBufPtr output 394 * @doc: the document 395 * @cur: the current node 396 * @format: should formatting spaces been added 397 * 398 * Dump an HTML node, recursive behaviour,children are printed too. 399 * 400 * Returns the number of byte written or -1 in case of error 401 */ 402 static size_t 403 htmlBufNodeDumpFormat(xmlBufPtr buf, xmlDocPtr doc, xmlNodePtr cur, 404 int format) { 405 size_t use; 406 int ret; 407 xmlOutputBufferPtr outbuf; 408 409 if (cur == NULL) { 410 return (-1); 411 } 412 if (buf == NULL) { 413 return (-1); 414 } 415 outbuf = (xmlOutputBufferPtr) xmlMalloc(sizeof(xmlOutputBuffer)); 416 if (outbuf == NULL) { 417 htmlSaveErrMemory("allocating HTML output buffer"); 418 return (-1); 419 } 420 memset(outbuf, 0, (size_t) sizeof(xmlOutputBuffer)); 421 outbuf->buffer = buf; 422 outbuf->encoder = NULL; 423 outbuf->writecallback = NULL; 424 outbuf->closecallback = NULL; 425 outbuf->context = NULL; 426 outbuf->written = 0; 427 428 use = xmlBufUse(buf); 429 htmlNodeDumpFormatOutput(outbuf, doc, cur, NULL, format); 430 xmlFree(outbuf); 431 ret = xmlBufUse(buf) - use; 432 return (ret); 433 } 434 435 /** 436 * htmlNodeDump: 437 * @buf: the HTML buffer output 438 * @doc: the document 439 * @cur: the current node 440 * 441 * Dump an HTML node, recursive behaviour,children are printed too, 442 * and formatting returns are added. 443 * 444 * Returns the number of byte written or -1 in case of error 445 */ 446 int 447 htmlNodeDump(xmlBufferPtr buf, xmlDocPtr doc, xmlNodePtr cur) { 448 xmlBufPtr buffer; 449 size_t ret; 450 451 if ((buf == NULL) || (cur == NULL)) 452 return(-1); 453 454 xmlInitParser(); 455 buffer = xmlBufFromBuffer(buf); 456 if (buffer == NULL) 457 return(-1); 458 459 ret = htmlBufNodeDumpFormat(buffer, doc, cur, 1); 460 461 xmlBufBackToBuffer(buffer); 462 463 if (ret > INT_MAX) 464 return(-1); 465 return((int) ret); 466 } 467 468 /** 469 * htmlNodeDumpFileFormat: 470 * @out: the FILE pointer 471 * @doc: the document 472 * @cur: the current node 473 * @encoding: the document encoding 474 * @format: should formatting spaces been added 475 * 476 * Dump an HTML node, recursive behaviour,children are printed too. 477 * 478 * TODO: if encoding == NULL try to save in the doc encoding 479 * 480 * returns: the number of byte written or -1 in case of failure. 481 */ 482 int 483 htmlNodeDumpFileFormat(FILE *out, xmlDocPtr doc, 484 xmlNodePtr cur, const char *encoding, int format) { 485 xmlOutputBufferPtr buf; 486 xmlCharEncodingHandlerPtr handler = NULL; 487 int ret; 488 489 xmlInitParser(); 490 491 if (encoding != NULL) { 492 xmlCharEncoding enc; 493 494 enc = xmlParseCharEncoding(encoding); 495 if (enc != XML_CHAR_ENCODING_UTF8) { 496 handler = xmlFindCharEncodingHandler(encoding); 497 if (handler == NULL) 498 htmlSaveErr(XML_SAVE_UNKNOWN_ENCODING, NULL, encoding); 499 } 500 } else { 501 /* 502 * Fallback to HTML or ASCII when the encoding is unspecified 503 */ 504 if (handler == NULL) 505 handler = xmlFindCharEncodingHandler("HTML"); 506 if (handler == NULL) 507 handler = xmlFindCharEncodingHandler("ascii"); 508 } 509 510 /* 511 * save the content to a temp buffer. 512 */ 513 buf = xmlOutputBufferCreateFile(out, handler); 514 if (buf == NULL) return(0); 515 516 htmlNodeDumpFormatOutput(buf, doc, cur, NULL, format); 517 518 ret = xmlOutputBufferClose(buf); 519 return(ret); 520 } 521 522 /** 523 * htmlNodeDumpFile: 524 * @out: the FILE pointer 525 * @doc: the document 526 * @cur: the current node 527 * 528 * Dump an HTML node, recursive behaviour,children are printed too, 529 * and formatting returns are added. 530 */ 531 void 532 htmlNodeDumpFile(FILE *out, xmlDocPtr doc, xmlNodePtr cur) { 533 htmlNodeDumpFileFormat(out, doc, cur, NULL, 1); 534 } 535 536 /** 537 * htmlDocDumpMemoryFormat: 538 * @cur: the document 539 * @mem: OUT: the memory pointer 540 * @size: OUT: the memory length 541 * @format: should formatting spaces been added 542 * 543 * Dump an HTML document in memory and return the xmlChar * and it's size. 544 * It's up to the caller to free the memory. 545 */ 546 void 547 htmlDocDumpMemoryFormat(xmlDocPtr cur, xmlChar**mem, int *size, int format) { 548 xmlOutputBufferPtr buf; 549 xmlCharEncodingHandlerPtr handler = NULL; 550 const char *encoding; 551 552 xmlInitParser(); 553 554 if ((mem == NULL) || (size == NULL)) 555 return; 556 if (cur == NULL) { 557 *mem = NULL; 558 *size = 0; 559 return; 560 } 561 562 encoding = (const char *) htmlGetMetaEncoding(cur); 563 564 if (encoding != NULL) { 565 xmlCharEncoding enc; 566 567 enc = xmlParseCharEncoding(encoding); 568 if (enc != XML_CHAR_ENCODING_UTF8) { 569 handler = xmlFindCharEncodingHandler(encoding); 570 if (handler == NULL) 571 htmlSaveErr(XML_SAVE_UNKNOWN_ENCODING, NULL, encoding); 572 573 } 574 } else { 575 /* 576 * Fallback to HTML or ASCII when the encoding is unspecified 577 */ 578 if (handler == NULL) 579 handler = xmlFindCharEncodingHandler("HTML"); 580 if (handler == NULL) 581 handler = xmlFindCharEncodingHandler("ascii"); 582 } 583 584 buf = xmlAllocOutputBufferInternal(handler); 585 if (buf == NULL) { 586 *mem = NULL; 587 *size = 0; 588 return; 589 } 590 591 htmlDocContentDumpFormatOutput(buf, cur, NULL, format); 592 593 xmlOutputBufferFlush(buf); 594 if (buf->conv != NULL) { 595 *size = xmlBufUse(buf->conv); 596 *mem = xmlStrndup(xmlBufContent(buf->conv), *size); 597 } else { 598 *size = xmlBufUse(buf->buffer); 599 *mem = xmlStrndup(xmlBufContent(buf->buffer), *size); 600 } 601 (void)xmlOutputBufferClose(buf); 602 } 603 604 /** 605 * htmlDocDumpMemory: 606 * @cur: the document 607 * @mem: OUT: the memory pointer 608 * @size: OUT: the memory length 609 * 610 * Dump an HTML document in memory and return the xmlChar * and it's size. 611 * It's up to the caller to free the memory. 612 */ 613 void 614 htmlDocDumpMemory(xmlDocPtr cur, xmlChar**mem, int *size) { 615 htmlDocDumpMemoryFormat(cur, mem, size, 1); 616 } 617 618 619 /************************************************************************ 620 * * 621 * Dumping HTML tree content to an I/O output buffer * 622 * * 623 ************************************************************************/ 624 625 void xmlNsListDumpOutput(xmlOutputBufferPtr buf, xmlNsPtr cur); 626 627 /** 628 * htmlDtdDumpOutput: 629 * @buf: the HTML buffer output 630 * @doc: the document 631 * @encoding: the encoding string 632 * 633 * TODO: check whether encoding is needed 634 * 635 * Dump the HTML document DTD, if any. 636 */ 637 static void 638 htmlDtdDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, 639 const char *encoding ATTRIBUTE_UNUSED) { 640 xmlDtdPtr cur = doc->intSubset; 641 642 if (cur == NULL) { 643 htmlSaveErr(XML_SAVE_NO_DOCTYPE, (xmlNodePtr) doc, NULL); 644 return; 645 } 646 xmlOutputBufferWriteString(buf, "<!DOCTYPE "); 647 xmlOutputBufferWriteString(buf, (const char *)cur->name); 648 if (cur->ExternalID != NULL) { 649 xmlOutputBufferWriteString(buf, " PUBLIC "); 650 xmlBufWriteQuotedString(buf->buffer, cur->ExternalID); 651 if (cur->SystemID != NULL) { 652 xmlOutputBufferWriteString(buf, " "); 653 xmlBufWriteQuotedString(buf->buffer, cur->SystemID); 654 } 655 } else if (cur->SystemID != NULL && 656 xmlStrcmp(cur->SystemID, BAD_CAST "about:legacy-compat")) { 657 xmlOutputBufferWriteString(buf, " SYSTEM "); 658 xmlBufWriteQuotedString(buf->buffer, cur->SystemID); 659 } 660 xmlOutputBufferWriteString(buf, ">\n"); 661 } 662 663 /** 664 * htmlAttrDumpOutput: 665 * @buf: the HTML buffer output 666 * @doc: the document 667 * @cur: the attribute pointer 668 * 669 * Dump an HTML attribute 670 */ 671 static void 672 htmlAttrDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, xmlAttrPtr cur) { 673 xmlChar *value; 674 675 /* 676 * The html output method should not escape a & character 677 * occurring in an attribute value immediately followed by 678 * a { character (see Section B.7.1 of the HTML 4.0 Recommendation). 679 * This is implemented in xmlEncodeEntitiesReentrant 680 */ 681 682 if (cur == NULL) { 683 return; 684 } 685 xmlOutputBufferWriteString(buf, " "); 686 if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) { 687 xmlOutputBufferWriteString(buf, (const char *)cur->ns->prefix); 688 xmlOutputBufferWriteString(buf, ":"); 689 } 690 xmlOutputBufferWriteString(buf, (const char *)cur->name); 691 if ((cur->children != NULL) && (!htmlIsBooleanAttr(cur->name))) { 692 value = xmlNodeListGetString(doc, cur->children, 0); 693 if (value) { 694 xmlOutputBufferWriteString(buf, "="); 695 if ((cur->ns == NULL) && (cur->parent != NULL) && 696 (cur->parent->ns == NULL) && 697 ((!xmlStrcasecmp(cur->name, BAD_CAST "href")) || 698 (!xmlStrcasecmp(cur->name, BAD_CAST "action")) || 699 (!xmlStrcasecmp(cur->name, BAD_CAST "src")) || 700 ((!xmlStrcasecmp(cur->name, BAD_CAST "name")) && 701 (!xmlStrcasecmp(cur->parent->name, BAD_CAST "a"))))) { 702 xmlChar *escaped; 703 xmlChar *tmp = value; 704 705 while (IS_BLANK_CH(*tmp)) tmp++; 706 707 /* 708 * the < and > have already been escaped at the entity level 709 * And doing so here breaks server side includes 710 */ 711 escaped = xmlURIEscapeStr(tmp, BAD_CAST"@/:=?;#%&,+<>"); 712 if (escaped != NULL) { 713 xmlBufWriteQuotedString(buf->buffer, escaped); 714 xmlFree(escaped); 715 } else { 716 xmlBufWriteQuotedString(buf->buffer, value); 717 } 718 } else { 719 xmlBufWriteQuotedString(buf->buffer, value); 720 } 721 xmlFree(value); 722 } else { 723 xmlOutputBufferWriteString(buf, "=\"\""); 724 } 725 } 726 } 727 728 /** 729 * htmlNodeDumpFormatOutput: 730 * @buf: the HTML buffer output 731 * @doc: the document 732 * @cur: the current node 733 * @encoding: the encoding string (unused) 734 * @format: should formatting spaces been added 735 * 736 * Dump an HTML node, recursive behaviour,children are printed too. 737 */ 738 void 739 htmlNodeDumpFormatOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, 740 xmlNodePtr cur, const char *encoding ATTRIBUTE_UNUSED, 741 int format) { 742 xmlNodePtr root, parent; 743 xmlAttrPtr attr; 744 const htmlElemDesc * info; 745 746 xmlInitParser(); 747 748 if ((cur == NULL) || (buf == NULL)) { 749 return; 750 } 751 752 root = cur; 753 parent = cur->parent; 754 while (1) { 755 switch (cur->type) { 756 case XML_HTML_DOCUMENT_NODE: 757 case XML_DOCUMENT_NODE: 758 if (((xmlDocPtr) cur)->intSubset != NULL) { 759 htmlDtdDumpOutput(buf, (xmlDocPtr) cur, NULL); 760 } 761 if (cur->children != NULL) { 762 /* Always validate cur->parent when descending. */ 763 if (cur->parent == parent) { 764 parent = cur; 765 cur = cur->children; 766 continue; 767 } 768 } else { 769 xmlOutputBufferWriteString(buf, "\n"); 770 } 771 break; 772 773 case XML_ELEMENT_NODE: 774 /* 775 * Some users like lxml are known to pass nodes with a corrupted 776 * tree structure. Fall back to a recursive call to handle this 777 * case. 778 */ 779 if ((cur->parent != parent) && (cur->children != NULL)) { 780 htmlNodeDumpFormatOutput(buf, doc, cur, encoding, format); 781 break; 782 } 783 784 /* 785 * Get specific HTML info for that node. 786 */ 787 if (cur->ns == NULL) 788 info = htmlTagLookup(cur->name); 789 else 790 info = NULL; 791 792 xmlOutputBufferWriteString(buf, "<"); 793 if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) { 794 xmlOutputBufferWriteString(buf, (const char *)cur->ns->prefix); 795 xmlOutputBufferWriteString(buf, ":"); 796 } 797 xmlOutputBufferWriteString(buf, (const char *)cur->name); 798 if (cur->nsDef) 799 xmlNsListDumpOutput(buf, cur->nsDef); 800 attr = cur->properties; 801 while (attr != NULL) { 802 htmlAttrDumpOutput(buf, doc, attr); 803 attr = attr->next; 804 } 805 806 if ((info != NULL) && (info->empty)) { 807 xmlOutputBufferWriteString(buf, ">"); 808 } else if (cur->children == NULL) { 809 if ((info != NULL) && (info->saveEndTag != 0) && 810 (xmlStrcmp(BAD_CAST info->name, BAD_CAST "html")) && 811 (xmlStrcmp(BAD_CAST info->name, BAD_CAST "body"))) { 812 xmlOutputBufferWriteString(buf, ">"); 813 } else { 814 xmlOutputBufferWriteString(buf, "></"); 815 if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) { 816 xmlOutputBufferWriteString(buf, 817 (const char *)cur->ns->prefix); 818 xmlOutputBufferWriteString(buf, ":"); 819 } 820 xmlOutputBufferWriteString(buf, (const char *)cur->name); 821 xmlOutputBufferWriteString(buf, ">"); 822 } 823 } else { 824 xmlOutputBufferWriteString(buf, ">"); 825 if ((format) && (info != NULL) && (!info->isinline) && 826 (cur->children->type != HTML_TEXT_NODE) && 827 (cur->children->type != HTML_ENTITY_REF_NODE) && 828 (cur->children != cur->last) && 829 (cur->name != NULL) && 830 (cur->name[0] != 'p')) /* p, pre, param */ 831 xmlOutputBufferWriteString(buf, "\n"); 832 parent = cur; 833 cur = cur->children; 834 continue; 835 } 836 837 if ((format) && (cur->next != NULL) && 838 (info != NULL) && (!info->isinline)) { 839 if ((cur->next->type != HTML_TEXT_NODE) && 840 (cur->next->type != HTML_ENTITY_REF_NODE) && 841 (parent != NULL) && 842 (parent->name != NULL) && 843 (parent->name[0] != 'p')) /* p, pre, param */ 844 xmlOutputBufferWriteString(buf, "\n"); 845 } 846 847 break; 848 849 case XML_ATTRIBUTE_NODE: 850 htmlAttrDumpOutput(buf, doc, (xmlAttrPtr) cur); 851 break; 852 853 case HTML_TEXT_NODE: 854 if (cur->content == NULL) 855 break; 856 if (((cur->name == (const xmlChar *)xmlStringText) || 857 (cur->name != (const xmlChar *)xmlStringTextNoenc)) && 858 ((parent == NULL) || 859 ((xmlStrcasecmp(parent->name, BAD_CAST "script")) && 860 (xmlStrcasecmp(parent->name, BAD_CAST "style"))))) { 861 xmlChar *buffer; 862 863 buffer = xmlEncodeEntitiesReentrant(doc, cur->content); 864 if (buffer != NULL) { 865 xmlOutputBufferWriteString(buf, (const char *)buffer); 866 xmlFree(buffer); 867 } 868 } else { 869 xmlOutputBufferWriteString(buf, (const char *)cur->content); 870 } 871 break; 872 873 case HTML_COMMENT_NODE: 874 if (cur->content != NULL) { 875 xmlOutputBufferWriteString(buf, "<!--"); 876 xmlOutputBufferWriteString(buf, (const char *)cur->content); 877 xmlOutputBufferWriteString(buf, "-->"); 878 } 879 break; 880 881 case HTML_PI_NODE: 882 if (cur->name != NULL) { 883 xmlOutputBufferWriteString(buf, "<?"); 884 xmlOutputBufferWriteString(buf, (const char *)cur->name); 885 if (cur->content != NULL) { 886 xmlOutputBufferWriteString(buf, " "); 887 xmlOutputBufferWriteString(buf, 888 (const char *)cur->content); 889 } 890 xmlOutputBufferWriteString(buf, ">"); 891 } 892 break; 893 894 case HTML_ENTITY_REF_NODE: 895 xmlOutputBufferWriteString(buf, "&"); 896 xmlOutputBufferWriteString(buf, (const char *)cur->name); 897 xmlOutputBufferWriteString(buf, ";"); 898 break; 899 900 case HTML_PRESERVE_NODE: 901 if (cur->content != NULL) { 902 xmlOutputBufferWriteString(buf, (const char *)cur->content); 903 } 904 break; 905 906 default: 907 break; 908 } 909 910 while (1) { 911 if (cur == root) 912 return; 913 if (cur->next != NULL) { 914 cur = cur->next; 915 break; 916 } 917 918 cur = parent; 919 /* cur->parent was validated when descending. */ 920 parent = cur->parent; 921 922 if ((cur->type == XML_HTML_DOCUMENT_NODE) || 923 (cur->type == XML_DOCUMENT_NODE)) { 924 xmlOutputBufferWriteString(buf, "\n"); 925 } else { 926 if ((format) && (cur->ns == NULL)) 927 info = htmlTagLookup(cur->name); 928 else 929 info = NULL; 930 931 if ((format) && (info != NULL) && (!info->isinline) && 932 (cur->last->type != HTML_TEXT_NODE) && 933 (cur->last->type != HTML_ENTITY_REF_NODE) && 934 (cur->children != cur->last) && 935 (cur->name != NULL) && 936 (cur->name[0] != 'p')) /* p, pre, param */ 937 xmlOutputBufferWriteString(buf, "\n"); 938 939 xmlOutputBufferWriteString(buf, "</"); 940 if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) { 941 xmlOutputBufferWriteString(buf, (const char *)cur->ns->prefix); 942 xmlOutputBufferWriteString(buf, ":"); 943 } 944 xmlOutputBufferWriteString(buf, (const char *)cur->name); 945 xmlOutputBufferWriteString(buf, ">"); 946 947 if ((format) && (info != NULL) && (!info->isinline) && 948 (cur->next != NULL)) { 949 if ((cur->next->type != HTML_TEXT_NODE) && 950 (cur->next->type != HTML_ENTITY_REF_NODE) && 951 (parent != NULL) && 952 (parent->name != NULL) && 953 (parent->name[0] != 'p')) /* p, pre, param */ 954 xmlOutputBufferWriteString(buf, "\n"); 955 } 956 } 957 } 958 } 959 } 960 961 /** 962 * htmlNodeDumpOutput: 963 * @buf: the HTML buffer output 964 * @doc: the document 965 * @cur: the current node 966 * @encoding: the encoding string (unused) 967 * 968 * Dump an HTML node, recursive behaviour,children are printed too, 969 * and formatting returns/spaces are added. 970 */ 971 void 972 htmlNodeDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, 973 xmlNodePtr cur, const char *encoding ATTRIBUTE_UNUSED) { 974 htmlNodeDumpFormatOutput(buf, doc, cur, NULL, 1); 975 } 976 977 /** 978 * htmlDocContentDumpFormatOutput: 979 * @buf: the HTML buffer output 980 * @cur: the document 981 * @encoding: the encoding string (unused) 982 * @format: should formatting spaces been added 983 * 984 * Dump an HTML document. 985 */ 986 void 987 htmlDocContentDumpFormatOutput(xmlOutputBufferPtr buf, xmlDocPtr cur, 988 const char *encoding ATTRIBUTE_UNUSED, 989 int format) { 990 int type = 0; 991 if (cur) { 992 type = cur->type; 993 cur->type = XML_HTML_DOCUMENT_NODE; 994 } 995 htmlNodeDumpFormatOutput(buf, cur, (xmlNodePtr) cur, NULL, format); 996 if (cur) 997 cur->type = (xmlElementType) type; 998 } 999 1000 /** 1001 * htmlDocContentDumpOutput: 1002 * @buf: the HTML buffer output 1003 * @cur: the document 1004 * @encoding: the encoding string (unused) 1005 * 1006 * Dump an HTML document. Formatting return/spaces are added. 1007 */ 1008 void 1009 htmlDocContentDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr cur, 1010 const char *encoding ATTRIBUTE_UNUSED) { 1011 htmlNodeDumpFormatOutput(buf, cur, (xmlNodePtr) cur, NULL, 1); 1012 } 1013 1014 /************************************************************************ 1015 * * 1016 * Saving functions front-ends * 1017 * * 1018 ************************************************************************/ 1019 1020 /** 1021 * htmlDocDump: 1022 * @f: the FILE* 1023 * @cur: the document 1024 * 1025 * Dump an HTML document to an open FILE. 1026 * 1027 * returns: the number of byte written or -1 in case of failure. 1028 */ 1029 int 1030 htmlDocDump(FILE *f, xmlDocPtr cur) { 1031 xmlOutputBufferPtr buf; 1032 xmlCharEncodingHandlerPtr handler = NULL; 1033 const char *encoding; 1034 int ret; 1035 1036 xmlInitParser(); 1037 1038 if ((cur == NULL) || (f == NULL)) { 1039 return(-1); 1040 } 1041 1042 encoding = (const char *) htmlGetMetaEncoding(cur); 1043 1044 if (encoding != NULL) { 1045 xmlCharEncoding enc; 1046 1047 enc = xmlParseCharEncoding(encoding); 1048 if (enc != XML_CHAR_ENCODING_UTF8) { 1049 handler = xmlFindCharEncodingHandler(encoding); 1050 if (handler == NULL) 1051 htmlSaveErr(XML_SAVE_UNKNOWN_ENCODING, NULL, encoding); 1052 } 1053 } else { 1054 /* 1055 * Fallback to HTML or ASCII when the encoding is unspecified 1056 */ 1057 if (handler == NULL) 1058 handler = xmlFindCharEncodingHandler("HTML"); 1059 if (handler == NULL) 1060 handler = xmlFindCharEncodingHandler("ascii"); 1061 } 1062 1063 buf = xmlOutputBufferCreateFile(f, handler); 1064 if (buf == NULL) return(-1); 1065 htmlDocContentDumpOutput(buf, cur, NULL); 1066 1067 ret = xmlOutputBufferClose(buf); 1068 return(ret); 1069 } 1070 1071 /** 1072 * htmlSaveFile: 1073 * @filename: the filename (or URL) 1074 * @cur: the document 1075 * 1076 * Dump an HTML document to a file. If @filename is "-" the stdout file is 1077 * used. 1078 * returns: the number of byte written or -1 in case of failure. 1079 */ 1080 int 1081 htmlSaveFile(const char *filename, xmlDocPtr cur) { 1082 xmlOutputBufferPtr buf; 1083 xmlCharEncodingHandlerPtr handler = NULL; 1084 const char *encoding; 1085 int ret; 1086 1087 if ((cur == NULL) || (filename == NULL)) 1088 return(-1); 1089 1090 xmlInitParser(); 1091 1092 encoding = (const char *) htmlGetMetaEncoding(cur); 1093 1094 if (encoding != NULL) { 1095 xmlCharEncoding enc; 1096 1097 enc = xmlParseCharEncoding(encoding); 1098 if (enc != XML_CHAR_ENCODING_UTF8) { 1099 handler = xmlFindCharEncodingHandler(encoding); 1100 if (handler == NULL) 1101 htmlSaveErr(XML_SAVE_UNKNOWN_ENCODING, NULL, encoding); 1102 } 1103 } else { 1104 /* 1105 * Fallback to HTML or ASCII when the encoding is unspecified 1106 */ 1107 if (handler == NULL) 1108 handler = xmlFindCharEncodingHandler("HTML"); 1109 if (handler == NULL) 1110 handler = xmlFindCharEncodingHandler("ascii"); 1111 } 1112 1113 /* 1114 * save the content to a temp buffer. 1115 */ 1116 buf = xmlOutputBufferCreateFilename(filename, handler, cur->compression); 1117 if (buf == NULL) return(0); 1118 1119 htmlDocContentDumpOutput(buf, cur, NULL); 1120 1121 ret = xmlOutputBufferClose(buf); 1122 return(ret); 1123 } 1124 1125 /** 1126 * htmlSaveFileFormat: 1127 * @filename: the filename 1128 * @cur: the document 1129 * @format: should formatting spaces been added 1130 * @encoding: the document encoding 1131 * 1132 * Dump an HTML document to a file using a given encoding. 1133 * 1134 * returns: the number of byte written or -1 in case of failure. 1135 */ 1136 int 1137 htmlSaveFileFormat(const char *filename, xmlDocPtr cur, 1138 const char *encoding, int format) { 1139 xmlOutputBufferPtr buf; 1140 xmlCharEncodingHandlerPtr handler = NULL; 1141 int ret; 1142 1143 if ((cur == NULL) || (filename == NULL)) 1144 return(-1); 1145 1146 xmlInitParser(); 1147 1148 if (encoding != NULL) { 1149 xmlCharEncoding enc; 1150 1151 enc = xmlParseCharEncoding(encoding); 1152 if (enc != XML_CHAR_ENCODING_UTF8) { 1153 handler = xmlFindCharEncodingHandler(encoding); 1154 if (handler == NULL) 1155 htmlSaveErr(XML_SAVE_UNKNOWN_ENCODING, NULL, encoding); 1156 } 1157 htmlSetMetaEncoding(cur, (const xmlChar *) encoding); 1158 } else { 1159 htmlSetMetaEncoding(cur, (const xmlChar *) "UTF-8"); 1160 1161 /* 1162 * Fallback to HTML or ASCII when the encoding is unspecified 1163 */ 1164 if (handler == NULL) 1165 handler = xmlFindCharEncodingHandler("HTML"); 1166 if (handler == NULL) 1167 handler = xmlFindCharEncodingHandler("ascii"); 1168 } 1169 1170 /* 1171 * save the content to a temp buffer. 1172 */ 1173 buf = xmlOutputBufferCreateFilename(filename, handler, 0); 1174 if (buf == NULL) return(0); 1175 1176 htmlDocContentDumpFormatOutput(buf, cur, encoding, format); 1177 1178 ret = xmlOutputBufferClose(buf); 1179 return(ret); 1180 } 1181 1182 /** 1183 * htmlSaveFileEnc: 1184 * @filename: the filename 1185 * @cur: the document 1186 * @encoding: the document encoding 1187 * 1188 * Dump an HTML document to a file using a given encoding 1189 * and formatting returns/spaces are added. 1190 * 1191 * returns: the number of byte written or -1 in case of failure. 1192 */ 1193 int 1194 htmlSaveFileEnc(const char *filename, xmlDocPtr cur, const char *encoding) { 1195 return(htmlSaveFileFormat(filename, cur, encoding, 1)); 1196 } 1197 1198 #endif /* LIBXML_OUTPUT_ENABLED */ 1199 1200 #endif /* LIBXML_HTML_ENABLED */ 1201