1 /* 2 * HTMLtree.c : implementation of access function for an HTML tree. 3 * 4 * See Copyright for the status of this software. 5 * 6 * daniel@veillard.com 7 */ 8 9 10 #define IN_LIBXML 11 #include "libxml.h" 12 #ifdef LIBXML_HTML_ENABLED 13 14 #include <string.h> /* for memset() only ! */ 15 16 #ifdef HAVE_CTYPE_H 17 #include <ctype.h> 18 #endif 19 #ifdef HAVE_STDLIB_H 20 #include <stdlib.h> 21 #endif 22 23 #include <libxml/xmlmemory.h> 24 #include <libxml/HTMLparser.h> 25 #include <libxml/HTMLtree.h> 26 #include <libxml/entities.h> 27 #include <libxml/valid.h> 28 #include <libxml/xmlerror.h> 29 #include <libxml/parserInternals.h> 30 #include <libxml/globals.h> 31 #include <libxml/uri.h> 32 33 #include "buf.h" 34 35 /************************************************************************ 36 * * 37 * Getting/Setting encoding meta tags * 38 * * 39 ************************************************************************/ 40 41 /** 42 * htmlGetMetaEncoding: 43 * @doc: the document 44 * 45 * Encoding definition lookup in the Meta tags 46 * 47 * Returns the current encoding as flagged in the HTML source 48 */ 49 const xmlChar * 50 htmlGetMetaEncoding(htmlDocPtr doc) { 51 htmlNodePtr cur; 52 const xmlChar *content; 53 const xmlChar *encoding; 54 55 if (doc == NULL) 56 return(NULL); 57 cur = doc->children; 58 59 /* 60 * Search the html 61 */ 62 while (cur != NULL) { 63 if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) { 64 if (xmlStrEqual(cur->name, BAD_CAST"html")) 65 break; 66 if (xmlStrEqual(cur->name, BAD_CAST"head")) 67 goto found_head; 68 if (xmlStrEqual(cur->name, BAD_CAST"meta")) 69 goto found_meta; 70 } 71 cur = cur->next; 72 } 73 if (cur == NULL) 74 return(NULL); 75 cur = cur->children; 76 77 /* 78 * Search the head 79 */ 80 while (cur != NULL) { 81 if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) { 82 if (xmlStrEqual(cur->name, BAD_CAST"head")) 83 break; 84 if (xmlStrEqual(cur->name, BAD_CAST"meta")) 85 goto found_meta; 86 } 87 cur = cur->next; 88 } 89 if (cur == NULL) 90 return(NULL); 91 found_head: 92 cur = cur->children; 93 94 /* 95 * Search the meta elements 96 */ 97 found_meta: 98 while (cur != NULL) { 99 if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) { 100 if (xmlStrEqual(cur->name, BAD_CAST"meta")) { 101 xmlAttrPtr attr = cur->properties; 102 int http; 103 const xmlChar *value; 104 105 content = NULL; 106 http = 0; 107 while (attr != NULL) { 108 if ((attr->children != NULL) && 109 (attr->children->type == XML_TEXT_NODE) && 110 (attr->children->next == NULL)) { 111 value = attr->children->content; 112 if ((!xmlStrcasecmp(attr->name, BAD_CAST"http-equiv")) 113 && (!xmlStrcasecmp(value, BAD_CAST"Content-Type"))) 114 http = 1; 115 else if ((value != NULL) 116 && (!xmlStrcasecmp(attr->name, BAD_CAST"content"))) 117 content = value; 118 if ((http != 0) && (content != NULL)) 119 goto found_content; 120 } 121 attr = attr->next; 122 } 123 } 124 } 125 cur = cur->next; 126 } 127 return(NULL); 128 129 found_content: 130 encoding = xmlStrstr(content, BAD_CAST"charset="); 131 if (encoding == NULL) 132 encoding = xmlStrstr(content, BAD_CAST"Charset="); 133 if (encoding == NULL) 134 encoding = xmlStrstr(content, BAD_CAST"CHARSET="); 135 if (encoding != NULL) { 136 encoding += 8; 137 } else { 138 encoding = xmlStrstr(content, BAD_CAST"charset ="); 139 if (encoding == NULL) 140 encoding = xmlStrstr(content, BAD_CAST"Charset ="); 141 if (encoding == NULL) 142 encoding = xmlStrstr(content, BAD_CAST"CHARSET ="); 143 if (encoding != NULL) 144 encoding += 9; 145 } 146 if (encoding != NULL) { 147 while ((*encoding == ' ') || (*encoding == '\t')) encoding++; 148 } 149 return(encoding); 150 } 151 152 /** 153 * htmlSetMetaEncoding: 154 * @doc: the document 155 * @encoding: the encoding string 156 * 157 * Sets the current encoding in the Meta tags 158 * NOTE: this will not change the document content encoding, just 159 * the META flag associated. 160 * 161 * Returns 0 in case of success and -1 in case of error 162 */ 163 int 164 htmlSetMetaEncoding(htmlDocPtr doc, const xmlChar *encoding) { 165 htmlNodePtr cur, meta = NULL, head = NULL; 166 const xmlChar *content = NULL; 167 char newcontent[100]; 168 169 newcontent[0] = 0; 170 171 if (doc == NULL) 172 return(-1); 173 174 /* html isn't a real encoding it's just libxml2 way to get entities */ 175 if (!xmlStrcasecmp(encoding, BAD_CAST "html")) 176 return(-1); 177 178 if (encoding != NULL) { 179 snprintf(newcontent, sizeof(newcontent), "text/html; charset=%s", 180 (char *)encoding); 181 newcontent[sizeof(newcontent) - 1] = 0; 182 } 183 184 cur = doc->children; 185 186 /* 187 * Search the html 188 */ 189 while (cur != NULL) { 190 if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) { 191 if (xmlStrcasecmp(cur->name, BAD_CAST"html") == 0) 192 break; 193 if (xmlStrcasecmp(cur->name, BAD_CAST"head") == 0) 194 goto found_head; 195 if (xmlStrcasecmp(cur->name, BAD_CAST"meta") == 0) 196 goto found_meta; 197 } 198 cur = cur->next; 199 } 200 if (cur == NULL) 201 return(-1); 202 cur = cur->children; 203 204 /* 205 * Search the head 206 */ 207 while (cur != NULL) { 208 if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) { 209 if (xmlStrcasecmp(cur->name, BAD_CAST"head") == 0) 210 break; 211 if (xmlStrcasecmp(cur->name, BAD_CAST"meta") == 0) { 212 head = cur->parent; 213 goto found_meta; 214 } 215 } 216 cur = cur->next; 217 } 218 if (cur == NULL) 219 return(-1); 220 found_head: 221 head = cur; 222 if (cur->children == NULL) 223 goto create; 224 cur = cur->children; 225 226 found_meta: 227 /* 228 * Search and update all the remaining the meta elements carrying 229 * encoding information 230 */ 231 while (cur != NULL) { 232 if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) { 233 if (xmlStrcasecmp(cur->name, BAD_CAST"meta") == 0) { 234 xmlAttrPtr attr = cur->properties; 235 int http; 236 const xmlChar *value; 237 238 content = NULL; 239 http = 0; 240 while (attr != NULL) { 241 if ((attr->children != NULL) && 242 (attr->children->type == XML_TEXT_NODE) && 243 (attr->children->next == NULL)) { 244 value = attr->children->content; 245 if ((!xmlStrcasecmp(attr->name, BAD_CAST"http-equiv")) 246 && (!xmlStrcasecmp(value, BAD_CAST"Content-Type"))) 247 http = 1; 248 else 249 { 250 if ((value != NULL) && 251 (!xmlStrcasecmp(attr->name, BAD_CAST"content"))) 252 content = value; 253 } 254 if ((http != 0) && (content != NULL)) 255 break; 256 } 257 attr = attr->next; 258 } 259 if ((http != 0) && (content != NULL)) { 260 meta = cur; 261 break; 262 } 263 264 } 265 } 266 cur = cur->next; 267 } 268 create: 269 if (meta == NULL) { 270 if ((encoding != NULL) && (head != NULL)) { 271 /* 272 * Create a new Meta element with the right attributes 273 */ 274 275 meta = xmlNewDocNode(doc, NULL, BAD_CAST"meta", NULL); 276 if (head->children == NULL) 277 xmlAddChild(head, meta); 278 else 279 xmlAddPrevSibling(head->children, meta); 280 xmlNewProp(meta, BAD_CAST"http-equiv", BAD_CAST"Content-Type"); 281 xmlNewProp(meta, BAD_CAST"content", BAD_CAST newcontent); 282 } 283 } else { 284 /* remove the meta tag if NULL is passed */ 285 if (encoding == NULL) { 286 xmlUnlinkNode(meta); 287 xmlFreeNode(meta); 288 } 289 /* change the document only if there is a real encoding change */ 290 else if (xmlStrcasestr(content, encoding) == NULL) { 291 xmlSetProp(meta, BAD_CAST"content", BAD_CAST newcontent); 292 } 293 } 294 295 296 return(0); 297 } 298 299 /** 300 * booleanHTMLAttrs: 301 * 302 * These are the HTML attributes which will be output 303 * in minimized form, i.e. <option selected="selected"> will be 304 * output as <option selected>, as per XSLT 1.0 16.2 "HTML Output Method" 305 * 306 */ 307 static const char* htmlBooleanAttrs[] = { 308 "checked", "compact", "declare", "defer", "disabled", "ismap", 309 "multiple", "nohref", "noresize", "noshade", "nowrap", "readonly", 310 "selected", NULL 311 }; 312 313 314 /** 315 * htmlIsBooleanAttr: 316 * @name: the name of the attribute to check 317 * 318 * Determine if a given attribute is a boolean attribute. 319 * 320 * returns: false if the attribute is not boolean, true otherwise. 321 */ 322 int 323 htmlIsBooleanAttr(const xmlChar *name) 324 { 325 int i = 0; 326 327 while (htmlBooleanAttrs[i] != NULL) { 328 if (xmlStrcasecmp((const xmlChar *)htmlBooleanAttrs[i], name) == 0) 329 return 1; 330 i++; 331 } 332 return 0; 333 } 334 335 #ifdef LIBXML_OUTPUT_ENABLED 336 /* 337 * private routine exported from xmlIO.c 338 */ 339 xmlOutputBufferPtr 340 xmlAllocOutputBufferInternal(xmlCharEncodingHandlerPtr encoder); 341 /************************************************************************ 342 * * 343 * Output error handlers * 344 * * 345 ************************************************************************/ 346 /** 347 * htmlSaveErrMemory: 348 * @extra: extra information 349 * 350 * Handle an out of memory condition 351 */ 352 static void 353 htmlSaveErrMemory(const char *extra) 354 { 355 __xmlSimpleError(XML_FROM_OUTPUT, XML_ERR_NO_MEMORY, NULL, NULL, extra); 356 } 357 358 /** 359 * htmlSaveErr: 360 * @code: the error number 361 * @node: the location of the error. 362 * @extra: extra information 363 * 364 * Handle an out of memory condition 365 */ 366 static void 367 htmlSaveErr(int code, xmlNodePtr node, const char *extra) 368 { 369 const char *msg = NULL; 370 371 switch(code) { 372 case XML_SAVE_NOT_UTF8: 373 msg = "string is not in UTF-8\n"; 374 break; 375 case XML_SAVE_CHAR_INVALID: 376 msg = "invalid character value\n"; 377 break; 378 case XML_SAVE_UNKNOWN_ENCODING: 379 msg = "unknown encoding %s\n"; 380 break; 381 case XML_SAVE_NO_DOCTYPE: 382 msg = "HTML has no DOCTYPE\n"; 383 break; 384 default: 385 msg = "unexpected error number\n"; 386 } 387 __xmlSimpleError(XML_FROM_OUTPUT, code, node, msg, extra); 388 } 389 390 /************************************************************************ 391 * * 392 * Dumping HTML tree content to a simple buffer * 393 * * 394 ************************************************************************/ 395 396 /** 397 * htmlBufNodeDumpFormat: 398 * @buf: the xmlBufPtr output 399 * @doc: the document 400 * @cur: the current node 401 * @format: should formatting spaces been added 402 * 403 * Dump an HTML node, recursive behaviour,children are printed too. 404 * 405 * Returns the number of byte written or -1 in case of error 406 */ 407 static size_t 408 htmlBufNodeDumpFormat(xmlBufPtr buf, xmlDocPtr doc, xmlNodePtr cur, 409 int format) { 410 size_t use; 411 int ret; 412 xmlOutputBufferPtr outbuf; 413 414 if (cur == NULL) { 415 return (-1); 416 } 417 if (buf == NULL) { 418 return (-1); 419 } 420 outbuf = (xmlOutputBufferPtr) xmlMalloc(sizeof(xmlOutputBuffer)); 421 if (outbuf == NULL) { 422 htmlSaveErrMemory("allocating HTML output buffer"); 423 return (-1); 424 } 425 memset(outbuf, 0, (size_t) sizeof(xmlOutputBuffer)); 426 outbuf->buffer = buf; 427 outbuf->encoder = NULL; 428 outbuf->writecallback = NULL; 429 outbuf->closecallback = NULL; 430 outbuf->context = NULL; 431 outbuf->written = 0; 432 433 use = xmlBufUse(buf); 434 htmlNodeDumpFormatOutput(outbuf, doc, cur, NULL, format); 435 xmlFree(outbuf); 436 ret = xmlBufUse(buf) - use; 437 return (ret); 438 } 439 440 /** 441 * htmlNodeDump: 442 * @buf: the HTML buffer output 443 * @doc: the document 444 * @cur: the current node 445 * 446 * Dump an HTML node, recursive behaviour,children are printed too, 447 * and formatting returns are added. 448 * 449 * Returns the number of byte written or -1 in case of error 450 */ 451 int 452 htmlNodeDump(xmlBufferPtr buf, xmlDocPtr doc, xmlNodePtr cur) { 453 xmlBufPtr buffer; 454 size_t ret; 455 456 if ((buf == NULL) || (cur == NULL)) 457 return(-1); 458 459 xmlInitParser(); 460 buffer = xmlBufFromBuffer(buf); 461 if (buffer == NULL) 462 return(-1); 463 464 ret = htmlBufNodeDumpFormat(buffer, doc, cur, 1); 465 466 xmlBufBackToBuffer(buffer); 467 468 if (ret > INT_MAX) 469 return(-1); 470 return((int) ret); 471 } 472 473 /** 474 * htmlNodeDumpFileFormat: 475 * @out: the FILE pointer 476 * @doc: the document 477 * @cur: the current node 478 * @encoding: the document encoding 479 * @format: should formatting spaces been added 480 * 481 * Dump an HTML node, recursive behaviour,children are printed too. 482 * 483 * TODO: if encoding == NULL try to save in the doc encoding 484 * 485 * returns: the number of byte written or -1 in case of failure. 486 */ 487 int 488 htmlNodeDumpFileFormat(FILE *out, xmlDocPtr doc, 489 xmlNodePtr cur, const char *encoding, int format) { 490 xmlOutputBufferPtr buf; 491 xmlCharEncodingHandlerPtr handler = NULL; 492 int ret; 493 494 xmlInitParser(); 495 496 if (encoding != NULL) { 497 xmlCharEncoding enc; 498 499 enc = xmlParseCharEncoding(encoding); 500 if (enc != XML_CHAR_ENCODING_UTF8) { 501 handler = xmlFindCharEncodingHandler(encoding); 502 if (handler == NULL) 503 htmlSaveErr(XML_SAVE_UNKNOWN_ENCODING, NULL, encoding); 504 } 505 } else { 506 /* 507 * Fallback to HTML or ASCII when the encoding is unspecified 508 */ 509 if (handler == NULL) 510 handler = xmlFindCharEncodingHandler("HTML"); 511 if (handler == NULL) 512 handler = xmlFindCharEncodingHandler("ascii"); 513 } 514 515 /* 516 * save the content to a temp buffer. 517 */ 518 buf = xmlOutputBufferCreateFile(out, handler); 519 if (buf == NULL) return(0); 520 521 htmlNodeDumpFormatOutput(buf, doc, cur, NULL, format); 522 523 ret = xmlOutputBufferClose(buf); 524 return(ret); 525 } 526 527 /** 528 * htmlNodeDumpFile: 529 * @out: the FILE pointer 530 * @doc: the document 531 * @cur: the current node 532 * 533 * Dump an HTML node, recursive behaviour,children are printed too, 534 * and formatting returns are added. 535 */ 536 void 537 htmlNodeDumpFile(FILE *out, xmlDocPtr doc, xmlNodePtr cur) { 538 htmlNodeDumpFileFormat(out, doc, cur, NULL, 1); 539 } 540 541 /** 542 * htmlDocDumpMemoryFormat: 543 * @cur: the document 544 * @mem: OUT: the memory pointer 545 * @size: OUT: the memory length 546 * @format: should formatting spaces been added 547 * 548 * Dump an HTML document in memory and return the xmlChar * and it's size. 549 * It's up to the caller to free the memory. 550 */ 551 void 552 htmlDocDumpMemoryFormat(xmlDocPtr cur, xmlChar**mem, int *size, int format) { 553 xmlOutputBufferPtr buf; 554 xmlCharEncodingHandlerPtr handler = NULL; 555 const char *encoding; 556 557 xmlInitParser(); 558 559 if ((mem == NULL) || (size == NULL)) 560 return; 561 if (cur == NULL) { 562 *mem = NULL; 563 *size = 0; 564 return; 565 } 566 567 encoding = (const char *) htmlGetMetaEncoding(cur); 568 569 if (encoding != NULL) { 570 xmlCharEncoding enc; 571 572 enc = xmlParseCharEncoding(encoding); 573 if (enc != XML_CHAR_ENCODING_UTF8) { 574 handler = xmlFindCharEncodingHandler(encoding); 575 if (handler == NULL) 576 htmlSaveErr(XML_SAVE_UNKNOWN_ENCODING, NULL, encoding); 577 578 } 579 } else { 580 /* 581 * Fallback to HTML or ASCII when the encoding is unspecified 582 */ 583 if (handler == NULL) 584 handler = xmlFindCharEncodingHandler("HTML"); 585 if (handler == NULL) 586 handler = xmlFindCharEncodingHandler("ascii"); 587 } 588 589 buf = xmlAllocOutputBufferInternal(handler); 590 if (buf == NULL) { 591 *mem = NULL; 592 *size = 0; 593 return; 594 } 595 596 htmlDocContentDumpFormatOutput(buf, cur, NULL, format); 597 598 xmlOutputBufferFlush(buf); 599 if (buf->conv != NULL) { 600 *size = xmlBufUse(buf->conv); 601 *mem = xmlStrndup(xmlBufContent(buf->conv), *size); 602 } else { 603 *size = xmlBufUse(buf->buffer); 604 *mem = xmlStrndup(xmlBufContent(buf->buffer), *size); 605 } 606 (void)xmlOutputBufferClose(buf); 607 } 608 609 /** 610 * htmlDocDumpMemory: 611 * @cur: the document 612 * @mem: OUT: the memory pointer 613 * @size: OUT: the memory length 614 * 615 * Dump an HTML document in memory and return the xmlChar * and it's size. 616 * It's up to the caller to free the memory. 617 */ 618 void 619 htmlDocDumpMemory(xmlDocPtr cur, xmlChar**mem, int *size) { 620 htmlDocDumpMemoryFormat(cur, mem, size, 1); 621 } 622 623 624 /************************************************************************ 625 * * 626 * Dumping HTML tree content to an I/O output buffer * 627 * * 628 ************************************************************************/ 629 630 void xmlNsListDumpOutput(xmlOutputBufferPtr buf, xmlNsPtr cur); 631 632 /** 633 * htmlDtdDumpOutput: 634 * @buf: the HTML buffer output 635 * @doc: the document 636 * @encoding: the encoding string 637 * 638 * TODO: check whether encoding is needed 639 * 640 * Dump the HTML document DTD, if any. 641 */ 642 static void 643 htmlDtdDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, 644 const char *encoding ATTRIBUTE_UNUSED) { 645 xmlDtdPtr cur = doc->intSubset; 646 647 if (cur == NULL) { 648 htmlSaveErr(XML_SAVE_NO_DOCTYPE, (xmlNodePtr) doc, NULL); 649 return; 650 } 651 xmlOutputBufferWriteString(buf, "<!DOCTYPE "); 652 xmlOutputBufferWriteString(buf, (const char *)cur->name); 653 if (cur->ExternalID != NULL) { 654 xmlOutputBufferWriteString(buf, " PUBLIC "); 655 xmlBufWriteQuotedString(buf->buffer, cur->ExternalID); 656 if (cur->SystemID != NULL) { 657 xmlOutputBufferWriteString(buf, " "); 658 xmlBufWriteQuotedString(buf->buffer, cur->SystemID); 659 } 660 } else if (cur->SystemID != NULL && 661 xmlStrcmp(cur->SystemID, BAD_CAST "about:legacy-compat")) { 662 xmlOutputBufferWriteString(buf, " SYSTEM "); 663 xmlBufWriteQuotedString(buf->buffer, cur->SystemID); 664 } 665 xmlOutputBufferWriteString(buf, ">\n"); 666 } 667 668 /** 669 * htmlAttrDumpOutput: 670 * @buf: the HTML buffer output 671 * @doc: the document 672 * @cur: the attribute pointer 673 * 674 * Dump an HTML attribute 675 */ 676 static void 677 htmlAttrDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, xmlAttrPtr cur) { 678 xmlChar *value; 679 680 /* 681 * The html output method should not escape a & character 682 * occurring in an attribute value immediately followed by 683 * a { character (see Section B.7.1 of the HTML 4.0 Recommendation). 684 * This is implemented in xmlEncodeEntitiesReentrant 685 */ 686 687 if (cur == NULL) { 688 return; 689 } 690 xmlOutputBufferWriteString(buf, " "); 691 if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) { 692 xmlOutputBufferWriteString(buf, (const char *)cur->ns->prefix); 693 xmlOutputBufferWriteString(buf, ":"); 694 } 695 xmlOutputBufferWriteString(buf, (const char *)cur->name); 696 if ((cur->children != NULL) && (!htmlIsBooleanAttr(cur->name))) { 697 value = xmlNodeListGetString(doc, cur->children, 0); 698 if (value) { 699 xmlOutputBufferWriteString(buf, "="); 700 if ((cur->ns == NULL) && (cur->parent != NULL) && 701 (cur->parent->ns == NULL) && 702 ((!xmlStrcasecmp(cur->name, BAD_CAST "href")) || 703 (!xmlStrcasecmp(cur->name, BAD_CAST "action")) || 704 (!xmlStrcasecmp(cur->name, BAD_CAST "src")) || 705 ((!xmlStrcasecmp(cur->name, BAD_CAST "name")) && 706 (!xmlStrcasecmp(cur->parent->name, BAD_CAST "a"))))) { 707 xmlChar *escaped; 708 xmlChar *tmp = value; 709 710 while (IS_BLANK_CH(*tmp)) tmp++; 711 712 /* 713 * the < and > have already been escaped at the entity level 714 * And doing so here breaks server side includes 715 */ 716 escaped = xmlURIEscapeStr(tmp, BAD_CAST"@/:=?;#%&,+<>"); 717 if (escaped != NULL) { 718 xmlBufWriteQuotedString(buf->buffer, escaped); 719 xmlFree(escaped); 720 } else { 721 xmlBufWriteQuotedString(buf->buffer, value); 722 } 723 } else { 724 xmlBufWriteQuotedString(buf->buffer, value); 725 } 726 xmlFree(value); 727 } else { 728 xmlOutputBufferWriteString(buf, "=\"\""); 729 } 730 } 731 } 732 733 /** 734 * htmlNodeDumpFormatOutput: 735 * @buf: the HTML buffer output 736 * @doc: the document 737 * @cur: the current node 738 * @encoding: the encoding string (unused) 739 * @format: should formatting spaces been added 740 * 741 * Dump an HTML node, recursive behaviour,children are printed too. 742 */ 743 void 744 htmlNodeDumpFormatOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, 745 xmlNodePtr cur, const char *encoding ATTRIBUTE_UNUSED, 746 int format) { 747 xmlNodePtr root; 748 xmlAttrPtr attr; 749 const htmlElemDesc * info; 750 751 xmlInitParser(); 752 753 if ((cur == NULL) || (buf == NULL)) { 754 return; 755 } 756 757 root = cur; 758 while (1) { 759 switch (cur->type) { 760 case XML_HTML_DOCUMENT_NODE: 761 case XML_DOCUMENT_NODE: 762 if (((xmlDocPtr) cur)->intSubset != NULL) { 763 htmlDtdDumpOutput(buf, (xmlDocPtr) cur, NULL); 764 } 765 if (cur->children != NULL) { 766 cur = cur->children; 767 continue; 768 } 769 break; 770 771 case XML_ELEMENT_NODE: 772 /* 773 * Get specific HTML info for that node. 774 */ 775 if (cur->ns == NULL) 776 info = htmlTagLookup(cur->name); 777 else 778 info = NULL; 779 780 xmlOutputBufferWriteString(buf, "<"); 781 if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) { 782 xmlOutputBufferWriteString(buf, (const char *)cur->ns->prefix); 783 xmlOutputBufferWriteString(buf, ":"); 784 } 785 xmlOutputBufferWriteString(buf, (const char *)cur->name); 786 if (cur->nsDef) 787 xmlNsListDumpOutput(buf, cur->nsDef); 788 attr = cur->properties; 789 while (attr != NULL) { 790 htmlAttrDumpOutput(buf, doc, attr); 791 attr = attr->next; 792 } 793 794 if ((info != NULL) && (info->empty)) { 795 xmlOutputBufferWriteString(buf, ">"); 796 } else if (cur->children == NULL) { 797 if ((info != NULL) && (info->saveEndTag != 0) && 798 (xmlStrcmp(BAD_CAST info->name, BAD_CAST "html")) && 799 (xmlStrcmp(BAD_CAST info->name, BAD_CAST "body"))) { 800 xmlOutputBufferWriteString(buf, ">"); 801 } else { 802 xmlOutputBufferWriteString(buf, "></"); 803 if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) { 804 xmlOutputBufferWriteString(buf, 805 (const char *)cur->ns->prefix); 806 xmlOutputBufferWriteString(buf, ":"); 807 } 808 xmlOutputBufferWriteString(buf, (const char *)cur->name); 809 xmlOutputBufferWriteString(buf, ">"); 810 } 811 } else { 812 xmlOutputBufferWriteString(buf, ">"); 813 if ((format) && (info != NULL) && (!info->isinline) && 814 (cur->children->type != HTML_TEXT_NODE) && 815 (cur->children->type != HTML_ENTITY_REF_NODE) && 816 (cur->children != cur->last) && 817 (cur->name != NULL) && 818 (cur->name[0] != 'p')) /* p, pre, param */ 819 xmlOutputBufferWriteString(buf, "\n"); 820 cur = cur->children; 821 continue; 822 } 823 824 if ((format) && (cur->next != NULL) && 825 (info != NULL) && (!info->isinline)) { 826 if ((cur->next->type != HTML_TEXT_NODE) && 827 (cur->next->type != HTML_ENTITY_REF_NODE) && 828 (cur->parent != NULL) && 829 (cur->parent->name != NULL) && 830 (cur->parent->name[0] != 'p')) /* p, pre, param */ 831 xmlOutputBufferWriteString(buf, "\n"); 832 } 833 834 break; 835 836 case XML_ATTRIBUTE_NODE: 837 htmlAttrDumpOutput(buf, doc, (xmlAttrPtr) cur); 838 break; 839 840 case HTML_TEXT_NODE: 841 if (cur->content == NULL) 842 break; 843 if (((cur->name == (const xmlChar *)xmlStringText) || 844 (cur->name != (const xmlChar *)xmlStringTextNoenc)) && 845 ((cur->parent == NULL) || 846 ((xmlStrcasecmp(cur->parent->name, BAD_CAST "script")) && 847 (xmlStrcasecmp(cur->parent->name, BAD_CAST "style"))))) { 848 xmlChar *buffer; 849 850 buffer = xmlEncodeEntitiesReentrant(doc, cur->content); 851 if (buffer != NULL) { 852 xmlOutputBufferWriteString(buf, (const char *)buffer); 853 xmlFree(buffer); 854 } 855 } else { 856 xmlOutputBufferWriteString(buf, (const char *)cur->content); 857 } 858 break; 859 860 case HTML_COMMENT_NODE: 861 if (cur->content != NULL) { 862 xmlOutputBufferWriteString(buf, "<!--"); 863 xmlOutputBufferWriteString(buf, (const char *)cur->content); 864 xmlOutputBufferWriteString(buf, "-->"); 865 } 866 break; 867 868 case HTML_PI_NODE: 869 if (cur->name != NULL) { 870 xmlOutputBufferWriteString(buf, "<?"); 871 xmlOutputBufferWriteString(buf, (const char *)cur->name); 872 if (cur->content != NULL) { 873 xmlOutputBufferWriteString(buf, " "); 874 xmlOutputBufferWriteString(buf, 875 (const char *)cur->content); 876 } 877 xmlOutputBufferWriteString(buf, ">"); 878 } 879 break; 880 881 case HTML_ENTITY_REF_NODE: 882 xmlOutputBufferWriteString(buf, "&"); 883 xmlOutputBufferWriteString(buf, (const char *)cur->name); 884 xmlOutputBufferWriteString(buf, ";"); 885 break; 886 887 case HTML_PRESERVE_NODE: 888 if (cur->content != NULL) { 889 xmlOutputBufferWriteString(buf, (const char *)cur->content); 890 } 891 break; 892 893 default: 894 break; 895 } 896 897 while (1) { 898 if (cur == root) 899 return; 900 if (cur->next != NULL) { 901 cur = cur->next; 902 break; 903 } 904 905 /* 906 * The parent should never be NULL here but we want to handle 907 * corrupted documents gracefully. 908 */ 909 if (cur->parent == NULL) 910 return; 911 cur = cur->parent; 912 913 if ((cur->type == XML_HTML_DOCUMENT_NODE) || 914 (cur->type == XML_DOCUMENT_NODE)) { 915 xmlOutputBufferWriteString(buf, "\n"); 916 } else { 917 if ((format) && (cur->ns == NULL)) 918 info = htmlTagLookup(cur->name); 919 else 920 info = NULL; 921 922 if ((format) && (info != NULL) && (!info->isinline) && 923 (cur->last->type != HTML_TEXT_NODE) && 924 (cur->last->type != HTML_ENTITY_REF_NODE) && 925 (cur->children != cur->last) && 926 (cur->name != NULL) && 927 (cur->name[0] != 'p')) /* p, pre, param */ 928 xmlOutputBufferWriteString(buf, "\n"); 929 930 xmlOutputBufferWriteString(buf, "</"); 931 if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) { 932 xmlOutputBufferWriteString(buf, (const char *)cur->ns->prefix); 933 xmlOutputBufferWriteString(buf, ":"); 934 } 935 xmlOutputBufferWriteString(buf, (const char *)cur->name); 936 xmlOutputBufferWriteString(buf, ">"); 937 938 if ((format) && (info != NULL) && (!info->isinline) && 939 (cur->next != NULL)) { 940 if ((cur->next->type != HTML_TEXT_NODE) && 941 (cur->next->type != HTML_ENTITY_REF_NODE) && 942 (cur->parent != NULL) && 943 (cur->parent->name != NULL) && 944 (cur->parent->name[0] != 'p')) /* p, pre, param */ 945 xmlOutputBufferWriteString(buf, "\n"); 946 } 947 } 948 } 949 } 950 } 951 952 /** 953 * htmlNodeDumpOutput: 954 * @buf: the HTML buffer output 955 * @doc: the document 956 * @cur: the current node 957 * @encoding: the encoding string (unused) 958 * 959 * Dump an HTML node, recursive behaviour,children are printed too, 960 * and formatting returns/spaces are added. 961 */ 962 void 963 htmlNodeDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, 964 xmlNodePtr cur, const char *encoding ATTRIBUTE_UNUSED) { 965 htmlNodeDumpFormatOutput(buf, doc, cur, NULL, 1); 966 } 967 968 /** 969 * htmlDocContentDumpFormatOutput: 970 * @buf: the HTML buffer output 971 * @cur: the document 972 * @encoding: the encoding string (unused) 973 * @format: should formatting spaces been added 974 * 975 * Dump an HTML document. 976 */ 977 void 978 htmlDocContentDumpFormatOutput(xmlOutputBufferPtr buf, xmlDocPtr cur, 979 const char *encoding ATTRIBUTE_UNUSED, 980 int format) { 981 htmlNodeDumpFormatOutput(buf, cur, (xmlNodePtr) cur, NULL, format); 982 } 983 984 /** 985 * htmlDocContentDumpOutput: 986 * @buf: the HTML buffer output 987 * @cur: the document 988 * @encoding: the encoding string (unused) 989 * 990 * Dump an HTML document. Formatting return/spaces are added. 991 */ 992 void 993 htmlDocContentDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr cur, 994 const char *encoding ATTRIBUTE_UNUSED) { 995 htmlNodeDumpFormatOutput(buf, cur, (xmlNodePtr) cur, NULL, 1); 996 } 997 998 /************************************************************************ 999 * * 1000 * Saving functions front-ends * 1001 * * 1002 ************************************************************************/ 1003 1004 /** 1005 * htmlDocDump: 1006 * @f: the FILE* 1007 * @cur: the document 1008 * 1009 * Dump an HTML document to an open FILE. 1010 * 1011 * returns: the number of byte written or -1 in case of failure. 1012 */ 1013 int 1014 htmlDocDump(FILE *f, xmlDocPtr cur) { 1015 xmlOutputBufferPtr buf; 1016 xmlCharEncodingHandlerPtr handler = NULL; 1017 const char *encoding; 1018 int ret; 1019 1020 xmlInitParser(); 1021 1022 if ((cur == NULL) || (f == NULL)) { 1023 return(-1); 1024 } 1025 1026 encoding = (const char *) htmlGetMetaEncoding(cur); 1027 1028 if (encoding != NULL) { 1029 xmlCharEncoding enc; 1030 1031 enc = xmlParseCharEncoding(encoding); 1032 if (enc != XML_CHAR_ENCODING_UTF8) { 1033 handler = xmlFindCharEncodingHandler(encoding); 1034 if (handler == NULL) 1035 htmlSaveErr(XML_SAVE_UNKNOWN_ENCODING, NULL, encoding); 1036 } 1037 } else { 1038 /* 1039 * Fallback to HTML or ASCII when the encoding is unspecified 1040 */ 1041 if (handler == NULL) 1042 handler = xmlFindCharEncodingHandler("HTML"); 1043 if (handler == NULL) 1044 handler = xmlFindCharEncodingHandler("ascii"); 1045 } 1046 1047 buf = xmlOutputBufferCreateFile(f, handler); 1048 if (buf == NULL) return(-1); 1049 htmlDocContentDumpOutput(buf, cur, NULL); 1050 1051 ret = xmlOutputBufferClose(buf); 1052 return(ret); 1053 } 1054 1055 /** 1056 * htmlSaveFile: 1057 * @filename: the filename (or URL) 1058 * @cur: the document 1059 * 1060 * Dump an HTML document to a file. If @filename is "-" the stdout file is 1061 * used. 1062 * returns: the number of byte written or -1 in case of failure. 1063 */ 1064 int 1065 htmlSaveFile(const char *filename, xmlDocPtr cur) { 1066 xmlOutputBufferPtr buf; 1067 xmlCharEncodingHandlerPtr handler = NULL; 1068 const char *encoding; 1069 int ret; 1070 1071 if ((cur == NULL) || (filename == NULL)) 1072 return(-1); 1073 1074 xmlInitParser(); 1075 1076 encoding = (const char *) htmlGetMetaEncoding(cur); 1077 1078 if (encoding != NULL) { 1079 xmlCharEncoding enc; 1080 1081 enc = xmlParseCharEncoding(encoding); 1082 if (enc != XML_CHAR_ENCODING_UTF8) { 1083 handler = xmlFindCharEncodingHandler(encoding); 1084 if (handler == NULL) 1085 htmlSaveErr(XML_SAVE_UNKNOWN_ENCODING, NULL, encoding); 1086 } 1087 } else { 1088 /* 1089 * Fallback to HTML or ASCII when the encoding is unspecified 1090 */ 1091 if (handler == NULL) 1092 handler = xmlFindCharEncodingHandler("HTML"); 1093 if (handler == NULL) 1094 handler = xmlFindCharEncodingHandler("ascii"); 1095 } 1096 1097 /* 1098 * save the content to a temp buffer. 1099 */ 1100 buf = xmlOutputBufferCreateFilename(filename, handler, cur->compression); 1101 if (buf == NULL) return(0); 1102 1103 htmlDocContentDumpOutput(buf, cur, NULL); 1104 1105 ret = xmlOutputBufferClose(buf); 1106 return(ret); 1107 } 1108 1109 /** 1110 * htmlSaveFileFormat: 1111 * @filename: the filename 1112 * @cur: the document 1113 * @format: should formatting spaces been added 1114 * @encoding: the document encoding 1115 * 1116 * Dump an HTML document to a file using a given encoding. 1117 * 1118 * returns: the number of byte written or -1 in case of failure. 1119 */ 1120 int 1121 htmlSaveFileFormat(const char *filename, xmlDocPtr cur, 1122 const char *encoding, int format) { 1123 xmlOutputBufferPtr buf; 1124 xmlCharEncodingHandlerPtr handler = NULL; 1125 int ret; 1126 1127 if ((cur == NULL) || (filename == NULL)) 1128 return(-1); 1129 1130 xmlInitParser(); 1131 1132 if (encoding != NULL) { 1133 xmlCharEncoding enc; 1134 1135 enc = xmlParseCharEncoding(encoding); 1136 if (enc != XML_CHAR_ENCODING_UTF8) { 1137 handler = xmlFindCharEncodingHandler(encoding); 1138 if (handler == NULL) 1139 htmlSaveErr(XML_SAVE_UNKNOWN_ENCODING, NULL, encoding); 1140 } 1141 htmlSetMetaEncoding(cur, (const xmlChar *) encoding); 1142 } else { 1143 htmlSetMetaEncoding(cur, (const xmlChar *) "UTF-8"); 1144 1145 /* 1146 * Fallback to HTML or ASCII when the encoding is unspecified 1147 */ 1148 if (handler == NULL) 1149 handler = xmlFindCharEncodingHandler("HTML"); 1150 if (handler == NULL) 1151 handler = xmlFindCharEncodingHandler("ascii"); 1152 } 1153 1154 /* 1155 * save the content to a temp buffer. 1156 */ 1157 buf = xmlOutputBufferCreateFilename(filename, handler, 0); 1158 if (buf == NULL) return(0); 1159 1160 htmlDocContentDumpFormatOutput(buf, cur, encoding, format); 1161 1162 ret = xmlOutputBufferClose(buf); 1163 return(ret); 1164 } 1165 1166 /** 1167 * htmlSaveFileEnc: 1168 * @filename: the filename 1169 * @cur: the document 1170 * @encoding: the document encoding 1171 * 1172 * Dump an HTML document to a file using a given encoding 1173 * and formatting returns/spaces are added. 1174 * 1175 * returns: the number of byte written or -1 in case of failure. 1176 */ 1177 int 1178 htmlSaveFileEnc(const char *filename, xmlDocPtr cur, const char *encoding) { 1179 return(htmlSaveFileFormat(filename, cur, encoding, 1)); 1180 } 1181 1182 #endif /* LIBXML_OUTPUT_ENABLED */ 1183 1184 #define bottom_HTMLtree 1185 #include "elfgcchack.h" 1186 #endif /* LIBXML_HTML_ENABLED */ 1187