1 /* 2 * HTMLtree.c : implementation of access function for an HTML tree. 3 * 4 * See Copyright for the status of this software. 5 * 6 * daniel@veillard.com 7 */ 8 9 10 #define IN_LIBXML 11 #include "libxml.h" 12 #ifdef LIBXML_HTML_ENABLED 13 14 #include <string.h> /* for memset() only ! */ 15 16 #ifdef HAVE_CTYPE_H 17 #include <ctype.h> 18 #endif 19 #ifdef HAVE_STDLIB_H 20 #include <stdlib.h> 21 #endif 22 23 #include <libxml/xmlmemory.h> 24 #include <libxml/HTMLparser.h> 25 #include <libxml/HTMLtree.h> 26 #include <libxml/entities.h> 27 #include <libxml/valid.h> 28 #include <libxml/xmlerror.h> 29 #include <libxml/parserInternals.h> 30 #include <libxml/globals.h> 31 #include <libxml/uri.h> 32 33 #include "buf.h" 34 35 /************************************************************************ 36 * * 37 * Getting/Setting encoding meta tags * 38 * * 39 ************************************************************************/ 40 41 /** 42 * htmlGetMetaEncoding: 43 * @doc: the document 44 * 45 * Encoding definition lookup in the Meta tags 46 * 47 * Returns the current encoding as flagged in the HTML source 48 */ 49 const xmlChar * 50 htmlGetMetaEncoding(htmlDocPtr doc) { 51 htmlNodePtr cur; 52 const xmlChar *content; 53 const xmlChar *encoding; 54 55 if (doc == NULL) 56 return(NULL); 57 cur = doc->children; 58 59 /* 60 * Search the html 61 */ 62 while (cur != NULL) { 63 if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) { 64 if (xmlStrEqual(cur->name, BAD_CAST"html")) 65 break; 66 if (xmlStrEqual(cur->name, BAD_CAST"head")) 67 goto found_head; 68 if (xmlStrEqual(cur->name, BAD_CAST"meta")) 69 goto found_meta; 70 } 71 cur = cur->next; 72 } 73 if (cur == NULL) 74 return(NULL); 75 cur = cur->children; 76 77 /* 78 * Search the head 79 */ 80 while (cur != NULL) { 81 if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) { 82 if (xmlStrEqual(cur->name, BAD_CAST"head")) 83 break; 84 if (xmlStrEqual(cur->name, BAD_CAST"meta")) 85 goto found_meta; 86 } 87 cur = cur->next; 88 } 89 if (cur == NULL) 90 return(NULL); 91 found_head: 92 cur = cur->children; 93 94 /* 95 * Search the meta elements 96 */ 97 found_meta: 98 while (cur != NULL) { 99 if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) { 100 if (xmlStrEqual(cur->name, BAD_CAST"meta")) { 101 xmlAttrPtr attr = cur->properties; 102 int http; 103 const xmlChar *value; 104 105 content = NULL; 106 http = 0; 107 while (attr != NULL) { 108 if ((attr->children != NULL) && 109 (attr->children->type == XML_TEXT_NODE) && 110 (attr->children->next == NULL)) { 111 value = attr->children->content; 112 if ((!xmlStrcasecmp(attr->name, BAD_CAST"http-equiv")) 113 && (!xmlStrcasecmp(value, BAD_CAST"Content-Type"))) 114 http = 1; 115 else if ((value != NULL) 116 && (!xmlStrcasecmp(attr->name, BAD_CAST"content"))) 117 content = value; 118 if ((http != 0) && (content != NULL)) 119 goto found_content; 120 } 121 attr = attr->next; 122 } 123 } 124 } 125 cur = cur->next; 126 } 127 return(NULL); 128 129 found_content: 130 encoding = xmlStrstr(content, BAD_CAST"charset="); 131 if (encoding == NULL) 132 encoding = xmlStrstr(content, BAD_CAST"Charset="); 133 if (encoding == NULL) 134 encoding = xmlStrstr(content, BAD_CAST"CHARSET="); 135 if (encoding != NULL) { 136 encoding += 8; 137 } else { 138 encoding = xmlStrstr(content, BAD_CAST"charset ="); 139 if (encoding == NULL) 140 encoding = xmlStrstr(content, BAD_CAST"Charset ="); 141 if (encoding == NULL) 142 encoding = xmlStrstr(content, BAD_CAST"CHARSET ="); 143 if (encoding != NULL) 144 encoding += 9; 145 } 146 if (encoding != NULL) { 147 while ((*encoding == ' ') || (*encoding == '\t')) encoding++; 148 } 149 return(encoding); 150 } 151 152 /** 153 * htmlSetMetaEncoding: 154 * @doc: the document 155 * @encoding: the encoding string 156 * 157 * Sets the current encoding in the Meta tags 158 * NOTE: this will not change the document content encoding, just 159 * the META flag associated. 160 * 161 * Returns 0 in case of success and -1 in case of error 162 */ 163 int 164 htmlSetMetaEncoding(htmlDocPtr doc, const xmlChar *encoding) { 165 htmlNodePtr cur, meta = NULL, head = NULL; 166 const xmlChar *content = NULL; 167 char newcontent[100]; 168 169 newcontent[0] = 0; 170 171 if (doc == NULL) 172 return(-1); 173 174 /* html isn't a real encoding it's just libxml2 way to get entities */ 175 if (!xmlStrcasecmp(encoding, BAD_CAST "html")) 176 return(-1); 177 178 if (encoding != NULL) { 179 snprintf(newcontent, sizeof(newcontent), "text/html; charset=%s", 180 (char *)encoding); 181 newcontent[sizeof(newcontent) - 1] = 0; 182 } 183 184 cur = doc->children; 185 186 /* 187 * Search the html 188 */ 189 while (cur != NULL) { 190 if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) { 191 if (xmlStrcasecmp(cur->name, BAD_CAST"html") == 0) 192 break; 193 if (xmlStrcasecmp(cur->name, BAD_CAST"head") == 0) 194 goto found_head; 195 if (xmlStrcasecmp(cur->name, BAD_CAST"meta") == 0) 196 goto found_meta; 197 } 198 cur = cur->next; 199 } 200 if (cur == NULL) 201 return(-1); 202 cur = cur->children; 203 204 /* 205 * Search the head 206 */ 207 while (cur != NULL) { 208 if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) { 209 if (xmlStrcasecmp(cur->name, BAD_CAST"head") == 0) 210 break; 211 if (xmlStrcasecmp(cur->name, BAD_CAST"meta") == 0) { 212 head = cur->parent; 213 goto found_meta; 214 } 215 } 216 cur = cur->next; 217 } 218 if (cur == NULL) 219 return(-1); 220 found_head: 221 head = cur; 222 if (cur->children == NULL) 223 goto create; 224 cur = cur->children; 225 226 found_meta: 227 /* 228 * Search and update all the remaining the meta elements carrying 229 * encoding informations 230 */ 231 while (cur != NULL) { 232 if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) { 233 if (xmlStrcasecmp(cur->name, BAD_CAST"meta") == 0) { 234 xmlAttrPtr attr = cur->properties; 235 int http; 236 const xmlChar *value; 237 238 content = NULL; 239 http = 0; 240 while (attr != NULL) { 241 if ((attr->children != NULL) && 242 (attr->children->type == XML_TEXT_NODE) && 243 (attr->children->next == NULL)) { 244 value = attr->children->content; 245 if ((!xmlStrcasecmp(attr->name, BAD_CAST"http-equiv")) 246 && (!xmlStrcasecmp(value, BAD_CAST"Content-Type"))) 247 http = 1; 248 else 249 { 250 if ((value != NULL) && 251 (!xmlStrcasecmp(attr->name, BAD_CAST"content"))) 252 content = value; 253 } 254 if ((http != 0) && (content != NULL)) 255 break; 256 } 257 attr = attr->next; 258 } 259 if ((http != 0) && (content != NULL)) { 260 meta = cur; 261 break; 262 } 263 264 } 265 } 266 cur = cur->next; 267 } 268 create: 269 if (meta == NULL) { 270 if ((encoding != NULL) && (head != NULL)) { 271 /* 272 * Create a new Meta element with the right attributes 273 */ 274 275 meta = xmlNewDocNode(doc, NULL, BAD_CAST"meta", NULL); 276 if (head->children == NULL) 277 xmlAddChild(head, meta); 278 else 279 xmlAddPrevSibling(head->children, meta); 280 xmlNewProp(meta, BAD_CAST"http-equiv", BAD_CAST"Content-Type"); 281 xmlNewProp(meta, BAD_CAST"content", BAD_CAST newcontent); 282 } 283 } else { 284 /* remove the meta tag if NULL is passed */ 285 if (encoding == NULL) { 286 xmlUnlinkNode(meta); 287 xmlFreeNode(meta); 288 } 289 /* change the document only if there is a real encoding change */ 290 else if (xmlStrcasestr(content, encoding) == NULL) { 291 xmlSetProp(meta, BAD_CAST"content", BAD_CAST newcontent); 292 } 293 } 294 295 296 return(0); 297 } 298 299 /** 300 * booleanHTMLAttrs: 301 * 302 * These are the HTML attributes which will be output 303 * in minimized form, i.e. <option selected="selected"> will be 304 * output as <option selected>, as per XSLT 1.0 16.2 "HTML Output Method" 305 * 306 */ 307 static const char* htmlBooleanAttrs[] = { 308 "checked", "compact", "declare", "defer", "disabled", "ismap", 309 "multiple", "nohref", "noresize", "noshade", "nowrap", "readonly", 310 "selected", NULL 311 }; 312 313 314 /** 315 * htmlIsBooleanAttr: 316 * @name: the name of the attribute to check 317 * 318 * Determine if a given attribute is a boolean attribute. 319 * 320 * returns: false if the attribute is not boolean, true otherwise. 321 */ 322 int 323 htmlIsBooleanAttr(const xmlChar *name) 324 { 325 int i = 0; 326 327 while (htmlBooleanAttrs[i] != NULL) { 328 if (xmlStrcasecmp((const xmlChar *)htmlBooleanAttrs[i], name) == 0) 329 return 1; 330 i++; 331 } 332 return 0; 333 } 334 335 #ifdef LIBXML_OUTPUT_ENABLED 336 /* 337 * private routine exported from xmlIO.c 338 */ 339 xmlOutputBufferPtr 340 xmlAllocOutputBufferInternal(xmlCharEncodingHandlerPtr encoder); 341 /************************************************************************ 342 * * 343 * Output error handlers * 344 * * 345 ************************************************************************/ 346 /** 347 * htmlSaveErrMemory: 348 * @extra: extra informations 349 * 350 * Handle an out of memory condition 351 */ 352 static void 353 htmlSaveErrMemory(const char *extra) 354 { 355 __xmlSimpleError(XML_FROM_OUTPUT, XML_ERR_NO_MEMORY, NULL, NULL, extra); 356 } 357 358 /** 359 * htmlSaveErr: 360 * @code: the error number 361 * @node: the location of the error. 362 * @extra: extra informations 363 * 364 * Handle an out of memory condition 365 */ 366 static void 367 htmlSaveErr(int code, xmlNodePtr node, const char *extra) 368 { 369 const char *msg = NULL; 370 371 switch(code) { 372 case XML_SAVE_NOT_UTF8: 373 msg = "string is not in UTF-8\n"; 374 break; 375 case XML_SAVE_CHAR_INVALID: 376 msg = "invalid character value\n"; 377 break; 378 case XML_SAVE_UNKNOWN_ENCODING: 379 msg = "unknown encoding %s\n"; 380 break; 381 case XML_SAVE_NO_DOCTYPE: 382 msg = "HTML has no DOCTYPE\n"; 383 break; 384 default: 385 msg = "unexpected error number\n"; 386 } 387 __xmlSimpleError(XML_FROM_OUTPUT, code, node, msg, extra); 388 } 389 390 /************************************************************************ 391 * * 392 * Dumping HTML tree content to a simple buffer * 393 * * 394 ************************************************************************/ 395 396 /** 397 * htmlBufNodeDumpFormat: 398 * @buf: the xmlBufPtr output 399 * @doc: the document 400 * @cur: the current node 401 * @format: should formatting spaces been added 402 * 403 * Dump an HTML node, recursive behaviour,children are printed too. 404 * 405 * Returns the number of byte written or -1 in case of error 406 */ 407 static size_t 408 htmlBufNodeDumpFormat(xmlBufPtr buf, xmlDocPtr doc, xmlNodePtr cur, 409 int format) { 410 size_t use; 411 int ret; 412 xmlOutputBufferPtr outbuf; 413 414 if (cur == NULL) { 415 return (-1); 416 } 417 if (buf == NULL) { 418 return (-1); 419 } 420 outbuf = (xmlOutputBufferPtr) xmlMalloc(sizeof(xmlOutputBuffer)); 421 if (outbuf == NULL) { 422 htmlSaveErrMemory("allocating HTML output buffer"); 423 return (-1); 424 } 425 memset(outbuf, 0, (size_t) sizeof(xmlOutputBuffer)); 426 outbuf->buffer = buf; 427 outbuf->encoder = NULL; 428 outbuf->writecallback = NULL; 429 outbuf->closecallback = NULL; 430 outbuf->context = NULL; 431 outbuf->written = 0; 432 433 use = xmlBufUse(buf); 434 htmlNodeDumpFormatOutput(outbuf, doc, cur, NULL, format); 435 xmlFree(outbuf); 436 ret = xmlBufUse(buf) - use; 437 return (ret); 438 } 439 440 /** 441 * htmlNodeDump: 442 * @buf: the HTML buffer output 443 * @doc: the document 444 * @cur: the current node 445 * 446 * Dump an HTML node, recursive behaviour,children are printed too, 447 * and formatting returns are added. 448 * 449 * Returns the number of byte written or -1 in case of error 450 */ 451 int 452 htmlNodeDump(xmlBufferPtr buf, xmlDocPtr doc, xmlNodePtr cur) { 453 xmlBufPtr buffer; 454 size_t ret; 455 456 if ((buf == NULL) || (cur == NULL)) 457 return(-1); 458 459 xmlInitParser(); 460 buffer = xmlBufFromBuffer(buf); 461 if (buffer == NULL) 462 return(-1); 463 464 ret = htmlBufNodeDumpFormat(buffer, doc, cur, 1); 465 466 xmlBufBackToBuffer(buffer); 467 468 if (ret > INT_MAX) 469 return(-1); 470 return((int) ret); 471 } 472 473 /** 474 * htmlNodeDumpFileFormat: 475 * @out: the FILE pointer 476 * @doc: the document 477 * @cur: the current node 478 * @encoding: the document encoding 479 * @format: should formatting spaces been added 480 * 481 * Dump an HTML node, recursive behaviour,children are printed too. 482 * 483 * TODO: if encoding == NULL try to save in the doc encoding 484 * 485 * returns: the number of byte written or -1 in case of failure. 486 */ 487 int 488 htmlNodeDumpFileFormat(FILE *out, xmlDocPtr doc, 489 xmlNodePtr cur, const char *encoding, int format) { 490 xmlOutputBufferPtr buf; 491 xmlCharEncodingHandlerPtr handler = NULL; 492 int ret; 493 494 xmlInitParser(); 495 496 if (encoding != NULL) { 497 xmlCharEncoding enc; 498 499 enc = xmlParseCharEncoding(encoding); 500 if (enc != XML_CHAR_ENCODING_UTF8) { 501 handler = xmlFindCharEncodingHandler(encoding); 502 if (handler == NULL) 503 htmlSaveErr(XML_SAVE_UNKNOWN_ENCODING, NULL, encoding); 504 } 505 } else { 506 /* 507 * Fallback to HTML or ASCII when the encoding is unspecified 508 */ 509 if (handler == NULL) 510 handler = xmlFindCharEncodingHandler("HTML"); 511 if (handler == NULL) 512 handler = xmlFindCharEncodingHandler("ascii"); 513 } 514 515 /* 516 * save the content to a temp buffer. 517 */ 518 buf = xmlOutputBufferCreateFile(out, handler); 519 if (buf == NULL) return(0); 520 521 htmlNodeDumpFormatOutput(buf, doc, cur, encoding, format); 522 523 ret = xmlOutputBufferClose(buf); 524 return(ret); 525 } 526 527 /** 528 * htmlNodeDumpFile: 529 * @out: the FILE pointer 530 * @doc: the document 531 * @cur: the current node 532 * 533 * Dump an HTML node, recursive behaviour,children are printed too, 534 * and formatting returns are added. 535 */ 536 void 537 htmlNodeDumpFile(FILE *out, xmlDocPtr doc, xmlNodePtr cur) { 538 htmlNodeDumpFileFormat(out, doc, cur, NULL, 1); 539 } 540 541 /** 542 * htmlDocDumpMemoryFormat: 543 * @cur: the document 544 * @mem: OUT: the memory pointer 545 * @size: OUT: the memory length 546 * @format: should formatting spaces been added 547 * 548 * Dump an HTML document in memory and return the xmlChar * and it's size. 549 * It's up to the caller to free the memory. 550 */ 551 void 552 htmlDocDumpMemoryFormat(xmlDocPtr cur, xmlChar**mem, int *size, int format) { 553 xmlOutputBufferPtr buf; 554 xmlCharEncodingHandlerPtr handler = NULL; 555 const char *encoding; 556 557 xmlInitParser(); 558 559 if ((mem == NULL) || (size == NULL)) 560 return; 561 if (cur == NULL) { 562 *mem = NULL; 563 *size = 0; 564 return; 565 } 566 567 encoding = (const char *) htmlGetMetaEncoding(cur); 568 569 if (encoding != NULL) { 570 xmlCharEncoding enc; 571 572 enc = xmlParseCharEncoding(encoding); 573 if (enc != XML_CHAR_ENCODING_UTF8) { 574 handler = xmlFindCharEncodingHandler(encoding); 575 if (handler == NULL) 576 htmlSaveErr(XML_SAVE_UNKNOWN_ENCODING, NULL, encoding); 577 578 } 579 } else { 580 /* 581 * Fallback to HTML or ASCII when the encoding is unspecified 582 */ 583 if (handler == NULL) 584 handler = xmlFindCharEncodingHandler("HTML"); 585 if (handler == NULL) 586 handler = xmlFindCharEncodingHandler("ascii"); 587 } 588 589 buf = xmlAllocOutputBufferInternal(handler); 590 if (buf == NULL) { 591 *mem = NULL; 592 *size = 0; 593 return; 594 } 595 596 htmlDocContentDumpFormatOutput(buf, cur, NULL, format); 597 598 xmlOutputBufferFlush(buf); 599 if (buf->conv != NULL) { 600 *size = xmlBufUse(buf->conv); 601 *mem = xmlStrndup(xmlBufContent(buf->conv), *size); 602 } else { 603 *size = xmlBufUse(buf->buffer); 604 *mem = xmlStrndup(xmlBufContent(buf->buffer), *size); 605 } 606 (void)xmlOutputBufferClose(buf); 607 } 608 609 /** 610 * htmlDocDumpMemory: 611 * @cur: the document 612 * @mem: OUT: the memory pointer 613 * @size: OUT: the memory length 614 * 615 * Dump an HTML document in memory and return the xmlChar * and it's size. 616 * It's up to the caller to free the memory. 617 */ 618 void 619 htmlDocDumpMemory(xmlDocPtr cur, xmlChar**mem, int *size) { 620 htmlDocDumpMemoryFormat(cur, mem, size, 1); 621 } 622 623 624 /************************************************************************ 625 * * 626 * Dumping HTML tree content to an I/O output buffer * 627 * * 628 ************************************************************************/ 629 630 void xmlNsListDumpOutput(xmlOutputBufferPtr buf, xmlNsPtr cur); 631 632 /** 633 * htmlDtdDumpOutput: 634 * @buf: the HTML buffer output 635 * @doc: the document 636 * @encoding: the encoding string 637 * 638 * TODO: check whether encoding is needed 639 * 640 * Dump the HTML document DTD, if any. 641 */ 642 static void 643 htmlDtdDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, 644 const char *encoding ATTRIBUTE_UNUSED) { 645 xmlDtdPtr cur = doc->intSubset; 646 647 if (cur == NULL) { 648 htmlSaveErr(XML_SAVE_NO_DOCTYPE, (xmlNodePtr) doc, NULL); 649 return; 650 } 651 xmlOutputBufferWriteString(buf, "<!DOCTYPE "); 652 xmlOutputBufferWriteString(buf, (const char *)cur->name); 653 if (cur->ExternalID != NULL) { 654 xmlOutputBufferWriteString(buf, " PUBLIC "); 655 xmlBufWriteQuotedString(buf->buffer, cur->ExternalID); 656 if (cur->SystemID != NULL) { 657 xmlOutputBufferWriteString(buf, " "); 658 xmlBufWriteQuotedString(buf->buffer, cur->SystemID); 659 } 660 } else if (cur->SystemID != NULL && 661 xmlStrcmp(cur->SystemID, BAD_CAST "about:legacy-compat")) { 662 xmlOutputBufferWriteString(buf, " SYSTEM "); 663 xmlBufWriteQuotedString(buf->buffer, cur->SystemID); 664 } 665 xmlOutputBufferWriteString(buf, ">\n"); 666 } 667 668 /** 669 * htmlAttrDumpOutput: 670 * @buf: the HTML buffer output 671 * @doc: the document 672 * @cur: the attribute pointer 673 * @encoding: the encoding string 674 * 675 * Dump an HTML attribute 676 */ 677 static void 678 htmlAttrDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, xmlAttrPtr cur, 679 const char *encoding ATTRIBUTE_UNUSED) { 680 xmlChar *value; 681 682 /* 683 * The html output method should not escape a & character 684 * occurring in an attribute value immediately followed by 685 * a { character (see Section B.7.1 of the HTML 4.0 Recommendation). 686 * This is implemented in xmlEncodeEntitiesReentrant 687 */ 688 689 if (cur == NULL) { 690 return; 691 } 692 xmlOutputBufferWriteString(buf, " "); 693 if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) { 694 xmlOutputBufferWriteString(buf, (const char *)cur->ns->prefix); 695 xmlOutputBufferWriteString(buf, ":"); 696 } 697 xmlOutputBufferWriteString(buf, (const char *)cur->name); 698 if ((cur->children != NULL) && (!htmlIsBooleanAttr(cur->name))) { 699 value = xmlNodeListGetString(doc, cur->children, 0); 700 if (value) { 701 xmlOutputBufferWriteString(buf, "="); 702 if ((cur->ns == NULL) && (cur->parent != NULL) && 703 (cur->parent->ns == NULL) && 704 ((!xmlStrcasecmp(cur->name, BAD_CAST "href")) || 705 (!xmlStrcasecmp(cur->name, BAD_CAST "action")) || 706 (!xmlStrcasecmp(cur->name, BAD_CAST "src")) || 707 ((!xmlStrcasecmp(cur->name, BAD_CAST "name")) && 708 (!xmlStrcasecmp(cur->parent->name, BAD_CAST "a"))))) { 709 xmlChar *tmp = value; 710 /* xmlURIEscapeStr() escapes '"' so it can be safely used. */ 711 xmlBufCCat(buf->buffer, "\""); 712 713 while (IS_BLANK_CH(*tmp)) tmp++; 714 715 /* URI Escape everything, except server side includes. */ 716 for ( ; ; ) { 717 xmlChar *escaped; 718 xmlChar endChar; 719 xmlChar *end = NULL; 720 xmlChar *start = (xmlChar *)xmlStrstr(tmp, BAD_CAST "<!--"); 721 if (start != NULL) { 722 end = (xmlChar *)xmlStrstr(tmp, BAD_CAST "-->"); 723 if (end != NULL) { 724 *start = '\0'; 725 } 726 } 727 728 /* Escape the whole string, or until start (set to '\0'). */ 729 escaped = xmlURIEscapeStr(tmp, BAD_CAST"@/:=?;#%&,+"); 730 if (escaped != NULL) { 731 xmlBufCat(buf->buffer, escaped); 732 xmlFree(escaped); 733 } else { 734 xmlBufCat(buf->buffer, tmp); 735 } 736 737 if (end == NULL) { /* Everything has been written. */ 738 break; 739 } 740 741 /* Do not escape anything within server side includes. */ 742 *start = '<'; /* Restore the first character of "<!--". */ 743 end += 3; /* strlen("-->") */ 744 endChar = *end; 745 *end = '\0'; 746 xmlBufCat(buf->buffer, start); 747 *end = endChar; 748 tmp = end; 749 } 750 751 xmlBufCCat(buf->buffer, "\""); 752 } else { 753 xmlBufWriteQuotedString(buf->buffer, value); 754 } 755 xmlFree(value); 756 } else { 757 xmlOutputBufferWriteString(buf, "=\"\""); 758 } 759 } 760 } 761 762 /** 763 * htmlAttrListDumpOutput: 764 * @buf: the HTML buffer output 765 * @doc: the document 766 * @cur: the first attribute pointer 767 * @encoding: the encoding string 768 * 769 * Dump a list of HTML attributes 770 */ 771 static void 772 htmlAttrListDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, xmlAttrPtr cur, const char *encoding) { 773 if (cur == NULL) { 774 return; 775 } 776 while (cur != NULL) { 777 htmlAttrDumpOutput(buf, doc, cur, encoding); 778 cur = cur->next; 779 } 780 } 781 782 783 784 /** 785 * htmlNodeListDumpOutput: 786 * @buf: the HTML buffer output 787 * @doc: the document 788 * @cur: the first node 789 * @encoding: the encoding string 790 * @format: should formatting spaces been added 791 * 792 * Dump an HTML node list, recursive behaviour,children are printed too. 793 */ 794 static void 795 htmlNodeListDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, 796 xmlNodePtr cur, const char *encoding, int format) { 797 if (cur == NULL) { 798 return; 799 } 800 while (cur != NULL) { 801 htmlNodeDumpFormatOutput(buf, doc, cur, encoding, format); 802 cur = cur->next; 803 } 804 } 805 806 /** 807 * htmlNodeDumpFormatOutput: 808 * @buf: the HTML buffer output 809 * @doc: the document 810 * @cur: the current node 811 * @encoding: the encoding string 812 * @format: should formatting spaces been added 813 * 814 * Dump an HTML node, recursive behaviour,children are printed too. 815 */ 816 void 817 htmlNodeDumpFormatOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, 818 xmlNodePtr cur, const char *encoding, int format) { 819 const htmlElemDesc * info; 820 821 xmlInitParser(); 822 823 if ((cur == NULL) || (buf == NULL)) { 824 return; 825 } 826 /* 827 * Special cases. 828 */ 829 if (cur->type == XML_DTD_NODE) 830 return; 831 if ((cur->type == XML_HTML_DOCUMENT_NODE) || 832 (cur->type == XML_DOCUMENT_NODE)){ 833 htmlDocContentDumpOutput(buf, (xmlDocPtr) cur, encoding); 834 return; 835 } 836 if (cur->type == XML_ATTRIBUTE_NODE) { 837 htmlAttrDumpOutput(buf, doc, (xmlAttrPtr) cur, encoding); 838 return; 839 } 840 if (cur->type == HTML_TEXT_NODE) { 841 if (cur->content != NULL) { 842 if (((cur->name == (const xmlChar *)xmlStringText) || 843 (cur->name != (const xmlChar *)xmlStringTextNoenc)) && 844 ((cur->parent == NULL) || 845 ((xmlStrcasecmp(cur->parent->name, BAD_CAST "script")) && 846 (xmlStrcasecmp(cur->parent->name, BAD_CAST "style"))))) { 847 xmlChar *buffer; 848 849 buffer = xmlEncodeEntitiesReentrant(doc, cur->content); 850 if (buffer != NULL) { 851 xmlOutputBufferWriteString(buf, (const char *)buffer); 852 xmlFree(buffer); 853 } 854 } else { 855 xmlOutputBufferWriteString(buf, (const char *)cur->content); 856 } 857 } 858 return; 859 } 860 if (cur->type == HTML_COMMENT_NODE) { 861 if (cur->content != NULL) { 862 xmlOutputBufferWriteString(buf, "<!--"); 863 xmlOutputBufferWriteString(buf, (const char *)cur->content); 864 xmlOutputBufferWriteString(buf, "-->"); 865 } 866 return; 867 } 868 if (cur->type == HTML_PI_NODE) { 869 if (cur->name == NULL) 870 return; 871 xmlOutputBufferWriteString(buf, "<?"); 872 xmlOutputBufferWriteString(buf, (const char *)cur->name); 873 if (cur->content != NULL) { 874 xmlOutputBufferWriteString(buf, " "); 875 xmlOutputBufferWriteString(buf, (const char *)cur->content); 876 } 877 xmlOutputBufferWriteString(buf, ">"); 878 return; 879 } 880 if (cur->type == HTML_ENTITY_REF_NODE) { 881 xmlOutputBufferWriteString(buf, "&"); 882 xmlOutputBufferWriteString(buf, (const char *)cur->name); 883 xmlOutputBufferWriteString(buf, ";"); 884 return; 885 } 886 if (cur->type == HTML_PRESERVE_NODE) { 887 if (cur->content != NULL) { 888 xmlOutputBufferWriteString(buf, (const char *)cur->content); 889 } 890 return; 891 } 892 893 /* 894 * Get specific HTML info for that node. 895 */ 896 if (cur->ns == NULL) 897 info = htmlTagLookup(cur->name); 898 else 899 info = NULL; 900 901 xmlOutputBufferWriteString(buf, "<"); 902 if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) { 903 xmlOutputBufferWriteString(buf, (const char *)cur->ns->prefix); 904 xmlOutputBufferWriteString(buf, ":"); 905 } 906 xmlOutputBufferWriteString(buf, (const char *)cur->name); 907 if (cur->nsDef) 908 xmlNsListDumpOutput(buf, cur->nsDef); 909 if (cur->properties != NULL) 910 htmlAttrListDumpOutput(buf, doc, cur->properties, encoding); 911 912 if ((info != NULL) && (info->empty)) { 913 xmlOutputBufferWriteString(buf, ">"); 914 if ((format) && (!info->isinline) && (cur->next != NULL)) { 915 if ((cur->next->type != HTML_TEXT_NODE) && 916 (cur->next->type != HTML_ENTITY_REF_NODE) && 917 (cur->parent != NULL) && 918 (cur->parent->name != NULL) && 919 (cur->parent->name[0] != 'p')) /* p, pre, param */ 920 xmlOutputBufferWriteString(buf, "\n"); 921 } 922 return; 923 } 924 if (((cur->type == XML_ELEMENT_NODE) || (cur->content == NULL)) && 925 (cur->children == NULL)) { 926 if ((info != NULL) && (info->saveEndTag != 0) && 927 (xmlStrcmp(BAD_CAST info->name, BAD_CAST "html")) && 928 (xmlStrcmp(BAD_CAST info->name, BAD_CAST "body"))) { 929 xmlOutputBufferWriteString(buf, ">"); 930 } else { 931 xmlOutputBufferWriteString(buf, "></"); 932 if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) { 933 xmlOutputBufferWriteString(buf, (const char *)cur->ns->prefix); 934 xmlOutputBufferWriteString(buf, ":"); 935 } 936 xmlOutputBufferWriteString(buf, (const char *)cur->name); 937 xmlOutputBufferWriteString(buf, ">"); 938 } 939 if ((format) && (cur->next != NULL) && 940 (info != NULL) && (!info->isinline)) { 941 if ((cur->next->type != HTML_TEXT_NODE) && 942 (cur->next->type != HTML_ENTITY_REF_NODE) && 943 (cur->parent != NULL) && 944 (cur->parent->name != NULL) && 945 (cur->parent->name[0] != 'p')) /* p, pre, param */ 946 xmlOutputBufferWriteString(buf, "\n"); 947 } 948 return; 949 } 950 xmlOutputBufferWriteString(buf, ">"); 951 if ((cur->type != XML_ELEMENT_NODE) && 952 (cur->content != NULL)) { 953 /* 954 * Uses the OutputBuffer property to automatically convert 955 * invalids to charrefs 956 */ 957 958 xmlOutputBufferWriteString(buf, (const char *) cur->content); 959 } 960 if (cur->children != NULL) { 961 if ((format) && (info != NULL) && (!info->isinline) && 962 (cur->children->type != HTML_TEXT_NODE) && 963 (cur->children->type != HTML_ENTITY_REF_NODE) && 964 (cur->children != cur->last) && 965 (cur->name != NULL) && 966 (cur->name[0] != 'p')) /* p, pre, param */ 967 xmlOutputBufferWriteString(buf, "\n"); 968 htmlNodeListDumpOutput(buf, doc, cur->children, encoding, format); 969 if ((format) && (info != NULL) && (!info->isinline) && 970 (cur->last->type != HTML_TEXT_NODE) && 971 (cur->last->type != HTML_ENTITY_REF_NODE) && 972 (cur->children != cur->last) && 973 (cur->name != NULL) && 974 (cur->name[0] != 'p')) /* p, pre, param */ 975 xmlOutputBufferWriteString(buf, "\n"); 976 } 977 xmlOutputBufferWriteString(buf, "</"); 978 if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) { 979 xmlOutputBufferWriteString(buf, (const char *)cur->ns->prefix); 980 xmlOutputBufferWriteString(buf, ":"); 981 } 982 xmlOutputBufferWriteString(buf, (const char *)cur->name); 983 xmlOutputBufferWriteString(buf, ">"); 984 if ((format) && (info != NULL) && (!info->isinline) && 985 (cur->next != NULL)) { 986 if ((cur->next->type != HTML_TEXT_NODE) && 987 (cur->next->type != HTML_ENTITY_REF_NODE) && 988 (cur->parent != NULL) && 989 (cur->parent->name != NULL) && 990 (cur->parent->name[0] != 'p')) /* p, pre, param */ 991 xmlOutputBufferWriteString(buf, "\n"); 992 } 993 } 994 995 /** 996 * htmlNodeDumpOutput: 997 * @buf: the HTML buffer output 998 * @doc: the document 999 * @cur: the current node 1000 * @encoding: the encoding string 1001 * 1002 * Dump an HTML node, recursive behaviour,children are printed too, 1003 * and formatting returns/spaces are added. 1004 */ 1005 void 1006 htmlNodeDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, 1007 xmlNodePtr cur, const char *encoding) { 1008 htmlNodeDumpFormatOutput(buf, doc, cur, encoding, 1); 1009 } 1010 1011 /** 1012 * htmlDocContentDumpFormatOutput: 1013 * @buf: the HTML buffer output 1014 * @cur: the document 1015 * @encoding: the encoding string 1016 * @format: should formatting spaces been added 1017 * 1018 * Dump an HTML document. 1019 */ 1020 void 1021 htmlDocContentDumpFormatOutput(xmlOutputBufferPtr buf, xmlDocPtr cur, 1022 const char *encoding, int format) { 1023 int type; 1024 1025 xmlInitParser(); 1026 1027 if ((buf == NULL) || (cur == NULL)) 1028 return; 1029 1030 /* 1031 * force to output the stuff as HTML, especially for entities 1032 */ 1033 type = cur->type; 1034 cur->type = XML_HTML_DOCUMENT_NODE; 1035 if (cur->intSubset != NULL) { 1036 htmlDtdDumpOutput(buf, cur, NULL); 1037 } 1038 if (cur->children != NULL) { 1039 htmlNodeListDumpOutput(buf, cur, cur->children, encoding, format); 1040 } 1041 xmlOutputBufferWriteString(buf, "\n"); 1042 cur->type = (xmlElementType) type; 1043 } 1044 1045 /** 1046 * htmlDocContentDumpOutput: 1047 * @buf: the HTML buffer output 1048 * @cur: the document 1049 * @encoding: the encoding string 1050 * 1051 * Dump an HTML document. Formating return/spaces are added. 1052 */ 1053 void 1054 htmlDocContentDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr cur, 1055 const char *encoding) { 1056 htmlDocContentDumpFormatOutput(buf, cur, encoding, 1); 1057 } 1058 1059 /************************************************************************ 1060 * * 1061 * Saving functions front-ends * 1062 * * 1063 ************************************************************************/ 1064 1065 /** 1066 * htmlDocDump: 1067 * @f: the FILE* 1068 * @cur: the document 1069 * 1070 * Dump an HTML document to an open FILE. 1071 * 1072 * returns: the number of byte written or -1 in case of failure. 1073 */ 1074 int 1075 htmlDocDump(FILE *f, xmlDocPtr cur) { 1076 xmlOutputBufferPtr buf; 1077 xmlCharEncodingHandlerPtr handler = NULL; 1078 const char *encoding; 1079 int ret; 1080 1081 xmlInitParser(); 1082 1083 if ((cur == NULL) || (f == NULL)) { 1084 return(-1); 1085 } 1086 1087 encoding = (const char *) htmlGetMetaEncoding(cur); 1088 1089 if (encoding != NULL) { 1090 xmlCharEncoding enc; 1091 1092 enc = xmlParseCharEncoding(encoding); 1093 if (enc != XML_CHAR_ENCODING_UTF8) { 1094 handler = xmlFindCharEncodingHandler(encoding); 1095 if (handler == NULL) 1096 htmlSaveErr(XML_SAVE_UNKNOWN_ENCODING, NULL, encoding); 1097 } 1098 } else { 1099 /* 1100 * Fallback to HTML or ASCII when the encoding is unspecified 1101 */ 1102 if (handler == NULL) 1103 handler = xmlFindCharEncodingHandler("HTML"); 1104 if (handler == NULL) 1105 handler = xmlFindCharEncodingHandler("ascii"); 1106 } 1107 1108 buf = xmlOutputBufferCreateFile(f, handler); 1109 if (buf == NULL) return(-1); 1110 htmlDocContentDumpOutput(buf, cur, NULL); 1111 1112 ret = xmlOutputBufferClose(buf); 1113 return(ret); 1114 } 1115 1116 /** 1117 * htmlSaveFile: 1118 * @filename: the filename (or URL) 1119 * @cur: the document 1120 * 1121 * Dump an HTML document to a file. If @filename is "-" the stdout file is 1122 * used. 1123 * returns: the number of byte written or -1 in case of failure. 1124 */ 1125 int 1126 htmlSaveFile(const char *filename, xmlDocPtr cur) { 1127 xmlOutputBufferPtr buf; 1128 xmlCharEncodingHandlerPtr handler = NULL; 1129 const char *encoding; 1130 int ret; 1131 1132 if ((cur == NULL) || (filename == NULL)) 1133 return(-1); 1134 1135 xmlInitParser(); 1136 1137 encoding = (const char *) htmlGetMetaEncoding(cur); 1138 1139 if (encoding != NULL) { 1140 xmlCharEncoding enc; 1141 1142 enc = xmlParseCharEncoding(encoding); 1143 if (enc != XML_CHAR_ENCODING_UTF8) { 1144 handler = xmlFindCharEncodingHandler(encoding); 1145 if (handler == NULL) 1146 htmlSaveErr(XML_SAVE_UNKNOWN_ENCODING, NULL, encoding); 1147 } 1148 } else { 1149 /* 1150 * Fallback to HTML or ASCII when the encoding is unspecified 1151 */ 1152 if (handler == NULL) 1153 handler = xmlFindCharEncodingHandler("HTML"); 1154 if (handler == NULL) 1155 handler = xmlFindCharEncodingHandler("ascii"); 1156 } 1157 1158 /* 1159 * save the content to a temp buffer. 1160 */ 1161 buf = xmlOutputBufferCreateFilename(filename, handler, cur->compression); 1162 if (buf == NULL) return(0); 1163 1164 htmlDocContentDumpOutput(buf, cur, NULL); 1165 1166 ret = xmlOutputBufferClose(buf); 1167 return(ret); 1168 } 1169 1170 /** 1171 * htmlSaveFileFormat: 1172 * @filename: the filename 1173 * @cur: the document 1174 * @format: should formatting spaces been added 1175 * @encoding: the document encoding 1176 * 1177 * Dump an HTML document to a file using a given encoding. 1178 * 1179 * returns: the number of byte written or -1 in case of failure. 1180 */ 1181 int 1182 htmlSaveFileFormat(const char *filename, xmlDocPtr cur, 1183 const char *encoding, int format) { 1184 xmlOutputBufferPtr buf; 1185 xmlCharEncodingHandlerPtr handler = NULL; 1186 int ret; 1187 1188 if ((cur == NULL) || (filename == NULL)) 1189 return(-1); 1190 1191 xmlInitParser(); 1192 1193 if (encoding != NULL) { 1194 xmlCharEncoding enc; 1195 1196 enc = xmlParseCharEncoding(encoding); 1197 if (enc != XML_CHAR_ENCODING_UTF8) { 1198 handler = xmlFindCharEncodingHandler(encoding); 1199 if (handler == NULL) 1200 htmlSaveErr(XML_SAVE_UNKNOWN_ENCODING, NULL, encoding); 1201 } 1202 htmlSetMetaEncoding(cur, (const xmlChar *) encoding); 1203 } else { 1204 htmlSetMetaEncoding(cur, (const xmlChar *) "UTF-8"); 1205 1206 /* 1207 * Fallback to HTML or ASCII when the encoding is unspecified 1208 */ 1209 if (handler == NULL) 1210 handler = xmlFindCharEncodingHandler("HTML"); 1211 if (handler == NULL) 1212 handler = xmlFindCharEncodingHandler("ascii"); 1213 } 1214 1215 /* 1216 * save the content to a temp buffer. 1217 */ 1218 buf = xmlOutputBufferCreateFilename(filename, handler, 0); 1219 if (buf == NULL) return(0); 1220 1221 htmlDocContentDumpFormatOutput(buf, cur, encoding, format); 1222 1223 ret = xmlOutputBufferClose(buf); 1224 return(ret); 1225 } 1226 1227 /** 1228 * htmlSaveFileEnc: 1229 * @filename: the filename 1230 * @cur: the document 1231 * @encoding: the document encoding 1232 * 1233 * Dump an HTML document to a file using a given encoding 1234 * and formatting returns/spaces are added. 1235 * 1236 * returns: the number of byte written or -1 in case of failure. 1237 */ 1238 int 1239 htmlSaveFileEnc(const char *filename, xmlDocPtr cur, const char *encoding) { 1240 return(htmlSaveFileFormat(filename, cur, encoding, 1)); 1241 } 1242 1243 #endif /* LIBXML_OUTPUT_ENABLED */ 1244 1245 #define bottom_HTMLtree 1246 #include "elfgcchack.h" 1247 #endif /* LIBXML_HTML_ENABLED */ 1248