1 /*
2 * HTMLtree.c : implementation of access function for an HTML tree.
3 *
4 * See Copyright for the status of this software.
5 *
6 * daniel@veillard.com
7 */
8
9
10 #define IN_LIBXML
11 #include "libxml.h"
12 #ifdef LIBXML_HTML_ENABLED
13
#include <string.h> /* for memset() only ! */
#include <ctype.h>
#include <limits.h> /* for INT_MAX */
#include <stdlib.h>
17
18 #include <libxml/xmlmemory.h>
19 #include <libxml/HTMLparser.h>
20 #include <libxml/HTMLtree.h>
21 #include <libxml/entities.h>
22 #include <libxml/valid.h>
23 #include <libxml/xmlerror.h>
24 #include <libxml/parserInternals.h>
25 #include <libxml/globals.h>
26 #include <libxml/uri.h>
27
28 #include "buf.h"
29
30 /************************************************************************
31 * *
32 * Getting/Setting encoding meta tags *
33 * *
34 ************************************************************************/
35
36 /**
37 * htmlGetMetaEncoding:
38 * @doc: the document
39 *
40 * Encoding definition lookup in the Meta tags
41 *
42 * Returns the current encoding as flagged in the HTML source
43 */
44 const xmlChar *
htmlGetMetaEncoding(htmlDocPtr doc)45 htmlGetMetaEncoding(htmlDocPtr doc) {
46 htmlNodePtr cur;
47 const xmlChar *content;
48 const xmlChar *encoding;
49
50 if (doc == NULL)
51 return(NULL);
52 cur = doc->children;
53
54 /*
55 * Search the html
56 */
57 while (cur != NULL) {
58 if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
59 if (xmlStrEqual(cur->name, BAD_CAST"html"))
60 break;
61 if (xmlStrEqual(cur->name, BAD_CAST"head"))
62 goto found_head;
63 if (xmlStrEqual(cur->name, BAD_CAST"meta"))
64 goto found_meta;
65 }
66 cur = cur->next;
67 }
68 if (cur == NULL)
69 return(NULL);
70 cur = cur->children;
71
72 /*
73 * Search the head
74 */
75 while (cur != NULL) {
76 if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
77 if (xmlStrEqual(cur->name, BAD_CAST"head"))
78 break;
79 if (xmlStrEqual(cur->name, BAD_CAST"meta"))
80 goto found_meta;
81 }
82 cur = cur->next;
83 }
84 if (cur == NULL)
85 return(NULL);
86 found_head:
87 cur = cur->children;
88
89 /*
90 * Search the meta elements
91 */
92 found_meta:
93 while (cur != NULL) {
94 if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
95 if (xmlStrEqual(cur->name, BAD_CAST"meta")) {
96 xmlAttrPtr attr = cur->properties;
97 int http;
98 const xmlChar *value;
99
100 content = NULL;
101 http = 0;
102 while (attr != NULL) {
103 if ((attr->children != NULL) &&
104 (attr->children->type == XML_TEXT_NODE) &&
105 (attr->children->next == NULL)) {
106 value = attr->children->content;
107 if ((!xmlStrcasecmp(attr->name, BAD_CAST"http-equiv"))
108 && (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
109 http = 1;
110 else if ((value != NULL)
111 && (!xmlStrcasecmp(attr->name, BAD_CAST"content")))
112 content = value;
113 if ((http != 0) && (content != NULL))
114 goto found_content;
115 }
116 attr = attr->next;
117 }
118 }
119 }
120 cur = cur->next;
121 }
122 return(NULL);
123
124 found_content:
125 encoding = xmlStrstr(content, BAD_CAST"charset=");
126 if (encoding == NULL)
127 encoding = xmlStrstr(content, BAD_CAST"Charset=");
128 if (encoding == NULL)
129 encoding = xmlStrstr(content, BAD_CAST"CHARSET=");
130 if (encoding != NULL) {
131 encoding += 8;
132 } else {
133 encoding = xmlStrstr(content, BAD_CAST"charset =");
134 if (encoding == NULL)
135 encoding = xmlStrstr(content, BAD_CAST"Charset =");
136 if (encoding == NULL)
137 encoding = xmlStrstr(content, BAD_CAST"CHARSET =");
138 if (encoding != NULL)
139 encoding += 9;
140 }
141 if (encoding != NULL) {
142 while ((*encoding == ' ') || (*encoding == '\t')) encoding++;
143 }
144 return(encoding);
145 }
146
147 /**
148 * htmlSetMetaEncoding:
149 * @doc: the document
150 * @encoding: the encoding string
151 *
152 * Sets the current encoding in the Meta tags
153 * NOTE: this will not change the document content encoding, just
154 * the META flag associated.
155 *
156 * Returns 0 in case of success and -1 in case of error
157 */
158 int
htmlSetMetaEncoding(htmlDocPtr doc,const xmlChar * encoding)159 htmlSetMetaEncoding(htmlDocPtr doc, const xmlChar *encoding) {
160 htmlNodePtr cur, meta = NULL, head = NULL;
161 const xmlChar *content = NULL;
162 char newcontent[100];
163
164 newcontent[0] = 0;
165
166 if (doc == NULL)
167 return(-1);
168
169 /* html isn't a real encoding it's just libxml2 way to get entities */
170 if (!xmlStrcasecmp(encoding, BAD_CAST "html"))
171 return(-1);
172
173 if (encoding != NULL) {
174 snprintf(newcontent, sizeof(newcontent), "text/html; charset=%s",
175 (char *)encoding);
176 newcontent[sizeof(newcontent) - 1] = 0;
177 }
178
179 cur = doc->children;
180
181 /*
182 * Search the html
183 */
184 while (cur != NULL) {
185 if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
186 if (xmlStrcasecmp(cur->name, BAD_CAST"html") == 0)
187 break;
188 if (xmlStrcasecmp(cur->name, BAD_CAST"head") == 0)
189 goto found_head;
190 if (xmlStrcasecmp(cur->name, BAD_CAST"meta") == 0)
191 goto found_meta;
192 }
193 cur = cur->next;
194 }
195 if (cur == NULL)
196 return(-1);
197 cur = cur->children;
198
199 /*
200 * Search the head
201 */
202 while (cur != NULL) {
203 if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
204 if (xmlStrcasecmp(cur->name, BAD_CAST"head") == 0)
205 break;
206 if (xmlStrcasecmp(cur->name, BAD_CAST"meta") == 0) {
207 head = cur->parent;
208 goto found_meta;
209 }
210 }
211 cur = cur->next;
212 }
213 if (cur == NULL)
214 return(-1);
215 found_head:
216 head = cur;
217 if (cur->children == NULL)
218 goto create;
219 cur = cur->children;
220
221 found_meta:
222 /*
223 * Search and update all the remaining the meta elements carrying
224 * encoding information
225 */
226 while (cur != NULL) {
227 if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
228 if (xmlStrcasecmp(cur->name, BAD_CAST"meta") == 0) {
229 xmlAttrPtr attr = cur->properties;
230 int http;
231 const xmlChar *value;
232
233 content = NULL;
234 http = 0;
235 while (attr != NULL) {
236 if ((attr->children != NULL) &&
237 (attr->children->type == XML_TEXT_NODE) &&
238 (attr->children->next == NULL)) {
239 value = attr->children->content;
240 if ((!xmlStrcasecmp(attr->name, BAD_CAST"http-equiv"))
241 && (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
242 http = 1;
243 else
244 {
245 if ((value != NULL) &&
246 (!xmlStrcasecmp(attr->name, BAD_CAST"content")))
247 content = value;
248 }
249 if ((http != 0) && (content != NULL))
250 break;
251 }
252 attr = attr->next;
253 }
254 if ((http != 0) && (content != NULL)) {
255 meta = cur;
256 break;
257 }
258
259 }
260 }
261 cur = cur->next;
262 }
263 create:
264 if (meta == NULL) {
265 if ((encoding != NULL) && (head != NULL)) {
266 /*
267 * Create a new Meta element with the right attributes
268 */
269
270 meta = xmlNewDocNode(doc, NULL, BAD_CAST"meta", NULL);
271 if (head->children == NULL)
272 xmlAddChild(head, meta);
273 else
274 xmlAddPrevSibling(head->children, meta);
275 xmlNewProp(meta, BAD_CAST"http-equiv", BAD_CAST"Content-Type");
276 xmlNewProp(meta, BAD_CAST"content", BAD_CAST newcontent);
277 }
278 } else {
279 /* remove the meta tag if NULL is passed */
280 if (encoding == NULL) {
281 xmlUnlinkNode(meta);
282 xmlFreeNode(meta);
283 }
284 /* change the document only if there is a real encoding change */
285 else if (xmlStrcasestr(content, encoding) == NULL) {
286 xmlSetProp(meta, BAD_CAST"content", BAD_CAST newcontent);
287 }
288 }
289
290
291 return(0);
292 }
293
/**
 * htmlBooleanAttrs:
 *
 * These are the HTML attributes which will be output
 * in minimized form, i.e. <option selected="selected"> will be
 * output as <option selected>, as per XSLT 1.0 16.2 "HTML Output Method"
 *
 * The table is terminated by a NULL entry.
 */
static const char* const htmlBooleanAttrs[] = {
    "checked",
    "compact",
    "declare",
    "defer",
    "disabled",
    "ismap",
    "multiple",
    "nohref",
    "noresize",
    "noshade",
    "nowrap",
    "readonly",
    "selected",
    NULL
};
307
308
309 /**
310 * htmlIsBooleanAttr:
311 * @name: the name of the attribute to check
312 *
313 * Determine if a given attribute is a boolean attribute.
314 *
315 * returns: false if the attribute is not boolean, true otherwise.
316 */
317 int
htmlIsBooleanAttr(const xmlChar * name)318 htmlIsBooleanAttr(const xmlChar *name)
319 {
320 int i = 0;
321
322 while (htmlBooleanAttrs[i] != NULL) {
323 if (xmlStrcasecmp((const xmlChar *)htmlBooleanAttrs[i], name) == 0)
324 return 1;
325 i++;
326 }
327 return 0;
328 }
329
330 #ifdef LIBXML_OUTPUT_ENABLED
331 /*
332 * private routine exported from xmlIO.c
333 */
334 xmlOutputBufferPtr
335 xmlAllocOutputBufferInternal(xmlCharEncodingHandlerPtr encoder);
336 /************************************************************************
337 * *
338 * Output error handlers *
339 * *
340 ************************************************************************/
341 /**
342 * htmlSaveErrMemory:
343 * @extra: extra information
344 *
345 * Handle an out of memory condition
346 */
347 static void
htmlSaveErrMemory(const char * extra)348 htmlSaveErrMemory(const char *extra)
349 {
350 __xmlSimpleError(XML_FROM_OUTPUT, XML_ERR_NO_MEMORY, NULL, NULL, extra);
351 }
352
353 /**
354 * htmlSaveErr:
355 * @code: the error number
356 * @node: the location of the error.
357 * @extra: extra information
358 *
359 * Handle an out of memory condition
360 */
361 static void
htmlSaveErr(int code,xmlNodePtr node,const char * extra)362 htmlSaveErr(int code, xmlNodePtr node, const char *extra)
363 {
364 const char *msg = NULL;
365
366 switch(code) {
367 case XML_SAVE_NOT_UTF8:
368 msg = "string is not in UTF-8\n";
369 break;
370 case XML_SAVE_CHAR_INVALID:
371 msg = "invalid character value\n";
372 break;
373 case XML_SAVE_UNKNOWN_ENCODING:
374 msg = "unknown encoding %s\n";
375 break;
376 case XML_SAVE_NO_DOCTYPE:
377 msg = "HTML has no DOCTYPE\n";
378 break;
379 default:
380 msg = "unexpected error number\n";
381 }
382 __xmlSimpleError(XML_FROM_OUTPUT, code, node, msg, extra);
383 }
384
385 /************************************************************************
386 * *
387 * Dumping HTML tree content to a simple buffer *
388 * *
389 ************************************************************************/
390
391 /**
392 * htmlBufNodeDumpFormat:
393 * @buf: the xmlBufPtr output
394 * @doc: the document
395 * @cur: the current node
396 * @format: should formatting spaces been added
397 *
398 * Dump an HTML node, recursive behaviour,children are printed too.
399 *
400 * Returns the number of byte written or -1 in case of error
401 */
402 static size_t
htmlBufNodeDumpFormat(xmlBufPtr buf,xmlDocPtr doc,xmlNodePtr cur,int format)403 htmlBufNodeDumpFormat(xmlBufPtr buf, xmlDocPtr doc, xmlNodePtr cur,
404 int format) {
405 size_t use;
406 int ret;
407 xmlOutputBufferPtr outbuf;
408
409 if (cur == NULL) {
410 return (-1);
411 }
412 if (buf == NULL) {
413 return (-1);
414 }
415 outbuf = (xmlOutputBufferPtr) xmlMalloc(sizeof(xmlOutputBuffer));
416 if (outbuf == NULL) {
417 htmlSaveErrMemory("allocating HTML output buffer");
418 return (-1);
419 }
420 memset(outbuf, 0, (size_t) sizeof(xmlOutputBuffer));
421 outbuf->buffer = buf;
422 outbuf->encoder = NULL;
423 outbuf->writecallback = NULL;
424 outbuf->closecallback = NULL;
425 outbuf->context = NULL;
426 outbuf->written = 0;
427
428 use = xmlBufUse(buf);
429 htmlNodeDumpFormatOutput(outbuf, doc, cur, NULL, format);
430 xmlFree(outbuf);
431 ret = xmlBufUse(buf) - use;
432 return (ret);
433 }
434
435 /**
436 * htmlNodeDump:
437 * @buf: the HTML buffer output
438 * @doc: the document
439 * @cur: the current node
440 *
441 * Dump an HTML node, recursive behaviour,children are printed too,
442 * and formatting returns are added.
443 *
444 * Returns the number of byte written or -1 in case of error
445 */
446 int
htmlNodeDump(xmlBufferPtr buf,xmlDocPtr doc,xmlNodePtr cur)447 htmlNodeDump(xmlBufferPtr buf, xmlDocPtr doc, xmlNodePtr cur) {
448 xmlBufPtr buffer;
449 size_t ret;
450
451 if ((buf == NULL) || (cur == NULL))
452 return(-1);
453
454 xmlInitParser();
455 buffer = xmlBufFromBuffer(buf);
456 if (buffer == NULL)
457 return(-1);
458
459 ret = htmlBufNodeDumpFormat(buffer, doc, cur, 1);
460
461 xmlBufBackToBuffer(buffer);
462
463 if (ret > INT_MAX)
464 return(-1);
465 return((int) ret);
466 }
467
468 /**
469 * htmlNodeDumpFileFormat:
470 * @out: the FILE pointer
471 * @doc: the document
472 * @cur: the current node
473 * @encoding: the document encoding
474 * @format: should formatting spaces been added
475 *
476 * Dump an HTML node, recursive behaviour,children are printed too.
477 *
478 * TODO: if encoding == NULL try to save in the doc encoding
479 *
480 * returns: the number of byte written or -1 in case of failure.
481 */
482 int
htmlNodeDumpFileFormat(FILE * out,xmlDocPtr doc,xmlNodePtr cur,const char * encoding,int format)483 htmlNodeDumpFileFormat(FILE *out, xmlDocPtr doc,
484 xmlNodePtr cur, const char *encoding, int format) {
485 xmlOutputBufferPtr buf;
486 xmlCharEncodingHandlerPtr handler = NULL;
487 int ret;
488
489 xmlInitParser();
490
491 if (encoding != NULL) {
492 xmlCharEncoding enc;
493
494 enc = xmlParseCharEncoding(encoding);
495 if (enc != XML_CHAR_ENCODING_UTF8) {
496 handler = xmlFindCharEncodingHandler(encoding);
497 if (handler == NULL)
498 htmlSaveErr(XML_SAVE_UNKNOWN_ENCODING, NULL, encoding);
499 }
500 } else {
501 /*
502 * Fallback to HTML or ASCII when the encoding is unspecified
503 */
504 if (handler == NULL)
505 handler = xmlFindCharEncodingHandler("HTML");
506 if (handler == NULL)
507 handler = xmlFindCharEncodingHandler("ascii");
508 }
509
510 /*
511 * save the content to a temp buffer.
512 */
513 buf = xmlOutputBufferCreateFile(out, handler);
514 if (buf == NULL) return(0);
515
516 htmlNodeDumpFormatOutput(buf, doc, cur, NULL, format);
517
518 ret = xmlOutputBufferClose(buf);
519 return(ret);
520 }
521
522 /**
523 * htmlNodeDumpFile:
524 * @out: the FILE pointer
525 * @doc: the document
526 * @cur: the current node
527 *
528 * Dump an HTML node, recursive behaviour,children are printed too,
529 * and formatting returns are added.
530 */
531 void
htmlNodeDumpFile(FILE * out,xmlDocPtr doc,xmlNodePtr cur)532 htmlNodeDumpFile(FILE *out, xmlDocPtr doc, xmlNodePtr cur) {
533 htmlNodeDumpFileFormat(out, doc, cur, NULL, 1);
534 }
535
536 /**
537 * htmlDocDumpMemoryFormat:
538 * @cur: the document
539 * @mem: OUT: the memory pointer
540 * @size: OUT: the memory length
541 * @format: should formatting spaces been added
542 *
543 * Dump an HTML document in memory and return the xmlChar * and it's size.
544 * It's up to the caller to free the memory.
545 */
546 void
htmlDocDumpMemoryFormat(xmlDocPtr cur,xmlChar ** mem,int * size,int format)547 htmlDocDumpMemoryFormat(xmlDocPtr cur, xmlChar**mem, int *size, int format) {
548 xmlOutputBufferPtr buf;
549 xmlCharEncodingHandlerPtr handler = NULL;
550 const char *encoding;
551
552 xmlInitParser();
553
554 if ((mem == NULL) || (size == NULL))
555 return;
556 if (cur == NULL) {
557 *mem = NULL;
558 *size = 0;
559 return;
560 }
561
562 encoding = (const char *) htmlGetMetaEncoding(cur);
563
564 if (encoding != NULL) {
565 xmlCharEncoding enc;
566
567 enc = xmlParseCharEncoding(encoding);
568 if (enc != XML_CHAR_ENCODING_UTF8) {
569 handler = xmlFindCharEncodingHandler(encoding);
570 if (handler == NULL)
571 htmlSaveErr(XML_SAVE_UNKNOWN_ENCODING, NULL, encoding);
572
573 }
574 } else {
575 /*
576 * Fallback to HTML or ASCII when the encoding is unspecified
577 */
578 if (handler == NULL)
579 handler = xmlFindCharEncodingHandler("HTML");
580 if (handler == NULL)
581 handler = xmlFindCharEncodingHandler("ascii");
582 }
583
584 buf = xmlAllocOutputBufferInternal(handler);
585 if (buf == NULL) {
586 *mem = NULL;
587 *size = 0;
588 return;
589 }
590
591 htmlDocContentDumpFormatOutput(buf, cur, NULL, format);
592
593 xmlOutputBufferFlush(buf);
594 if (buf->conv != NULL) {
595 *size = xmlBufUse(buf->conv);
596 *mem = xmlStrndup(xmlBufContent(buf->conv), *size);
597 } else {
598 *size = xmlBufUse(buf->buffer);
599 *mem = xmlStrndup(xmlBufContent(buf->buffer), *size);
600 }
601 (void)xmlOutputBufferClose(buf);
602 }
603
604 /**
605 * htmlDocDumpMemory:
606 * @cur: the document
607 * @mem: OUT: the memory pointer
608 * @size: OUT: the memory length
609 *
610 * Dump an HTML document in memory and return the xmlChar * and it's size.
611 * It's up to the caller to free the memory.
612 */
613 void
htmlDocDumpMemory(xmlDocPtr cur,xmlChar ** mem,int * size)614 htmlDocDumpMemory(xmlDocPtr cur, xmlChar**mem, int *size) {
615 htmlDocDumpMemoryFormat(cur, mem, size, 1);
616 }
617
618
619 /************************************************************************
620 * *
621 * Dumping HTML tree content to an I/O output buffer *
622 * *
623 ************************************************************************/
624
625 void xmlNsListDumpOutput(xmlOutputBufferPtr buf, xmlNsPtr cur);
626
627 /**
628 * htmlDtdDumpOutput:
629 * @buf: the HTML buffer output
630 * @doc: the document
631 * @encoding: the encoding string
632 *
633 * TODO: check whether encoding is needed
634 *
635 * Dump the HTML document DTD, if any.
636 */
637 static void
htmlDtdDumpOutput(xmlOutputBufferPtr buf,xmlDocPtr doc,const char * encoding ATTRIBUTE_UNUSED)638 htmlDtdDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc,
639 const char *encoding ATTRIBUTE_UNUSED) {
640 xmlDtdPtr cur = doc->intSubset;
641
642 if (cur == NULL) {
643 htmlSaveErr(XML_SAVE_NO_DOCTYPE, (xmlNodePtr) doc, NULL);
644 return;
645 }
646 xmlOutputBufferWriteString(buf, "<!DOCTYPE ");
647 xmlOutputBufferWriteString(buf, (const char *)cur->name);
648 if (cur->ExternalID != NULL) {
649 xmlOutputBufferWriteString(buf, " PUBLIC ");
650 xmlBufWriteQuotedString(buf->buffer, cur->ExternalID);
651 if (cur->SystemID != NULL) {
652 xmlOutputBufferWriteString(buf, " ");
653 xmlBufWriteQuotedString(buf->buffer, cur->SystemID);
654 }
655 } else if (cur->SystemID != NULL &&
656 xmlStrcmp(cur->SystemID, BAD_CAST "about:legacy-compat")) {
657 xmlOutputBufferWriteString(buf, " SYSTEM ");
658 xmlBufWriteQuotedString(buf->buffer, cur->SystemID);
659 }
660 xmlOutputBufferWriteString(buf, ">\n");
661 }
662
663 /**
664 * htmlAttrDumpOutput:
665 * @buf: the HTML buffer output
666 * @doc: the document
667 * @cur: the attribute pointer
668 *
669 * Dump an HTML attribute
670 */
671 static void
htmlAttrDumpOutput(xmlOutputBufferPtr buf,xmlDocPtr doc,xmlAttrPtr cur)672 htmlAttrDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, xmlAttrPtr cur) {
673 xmlChar *value;
674
675 /*
676 * The html output method should not escape a & character
677 * occurring in an attribute value immediately followed by
678 * a { character (see Section B.7.1 of the HTML 4.0 Recommendation).
679 * This is implemented in xmlEncodeEntitiesReentrant
680 */
681
682 if (cur == NULL) {
683 return;
684 }
685 xmlOutputBufferWriteString(buf, " ");
686 if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) {
687 xmlOutputBufferWriteString(buf, (const char *)cur->ns->prefix);
688 xmlOutputBufferWriteString(buf, ":");
689 }
690 xmlOutputBufferWriteString(buf, (const char *)cur->name);
691 if ((cur->children != NULL) && (!htmlIsBooleanAttr(cur->name))) {
692 value = xmlNodeListGetString(doc, cur->children, 0);
693 if (value) {
694 xmlOutputBufferWriteString(buf, "=");
695 if ((cur->ns == NULL) && (cur->parent != NULL) &&
696 (cur->parent->ns == NULL) &&
697 ((!xmlStrcasecmp(cur->name, BAD_CAST "href")) ||
698 (!xmlStrcasecmp(cur->name, BAD_CAST "action")) ||
699 (!xmlStrcasecmp(cur->name, BAD_CAST "src")) ||
700 ((!xmlStrcasecmp(cur->name, BAD_CAST "name")) &&
701 (!xmlStrcasecmp(cur->parent->name, BAD_CAST "a"))))) {
702 xmlChar *escaped;
703 xmlChar *tmp = value;
704
705 while (IS_BLANK_CH(*tmp)) tmp++;
706
707 /*
708 * the < and > have already been escaped at the entity level
709 * And doing so here breaks server side includes
710 */
711 escaped = xmlURIEscapeStr(tmp, BAD_CAST"@/:=?;#%&,+<>");
712 if (escaped != NULL) {
713 xmlBufWriteQuotedString(buf->buffer, escaped);
714 xmlFree(escaped);
715 } else {
716 xmlBufWriteQuotedString(buf->buffer, value);
717 }
718 } else {
719 xmlBufWriteQuotedString(buf->buffer, value);
720 }
721 xmlFree(value);
722 } else {
723 xmlOutputBufferWriteString(buf, "=\"\"");
724 }
725 }
726 }
727
728 /**
729 * htmlNodeDumpFormatOutput:
730 * @buf: the HTML buffer output
731 * @doc: the document
732 * @cur: the current node
733 * @encoding: the encoding string (unused)
734 * @format: should formatting spaces been added
735 *
736 * Dump an HTML node, recursive behaviour,children are printed too.
737 */
738 void
htmlNodeDumpFormatOutput(xmlOutputBufferPtr buf,xmlDocPtr doc,xmlNodePtr cur,const char * encoding ATTRIBUTE_UNUSED,int format)739 htmlNodeDumpFormatOutput(xmlOutputBufferPtr buf, xmlDocPtr doc,
740 xmlNodePtr cur, const char *encoding ATTRIBUTE_UNUSED,
741 int format) {
742 xmlNodePtr root, parent;
743 xmlAttrPtr attr;
744 const htmlElemDesc * info;
745
746 xmlInitParser();
747
748 if ((cur == NULL) || (buf == NULL)) {
749 return;
750 }
751
752 root = cur;
753 parent = cur->parent;
754 while (1) {
755 switch (cur->type) {
756 case XML_HTML_DOCUMENT_NODE:
757 case XML_DOCUMENT_NODE:
758 if (((xmlDocPtr) cur)->intSubset != NULL) {
759 htmlDtdDumpOutput(buf, (xmlDocPtr) cur, NULL);
760 }
761 if (cur->children != NULL) {
762 /* Always validate cur->parent when descending. */
763 if (cur->parent == parent) {
764 parent = cur;
765 cur = cur->children;
766 continue;
767 }
768 } else {
769 xmlOutputBufferWriteString(buf, "\n");
770 }
771 break;
772
773 case XML_ELEMENT_NODE:
774 /*
775 * Some users like lxml are known to pass nodes with a corrupted
776 * tree structure. Fall back to a recursive call to handle this
777 * case.
778 */
779 if ((cur->parent != parent) && (cur->children != NULL)) {
780 htmlNodeDumpFormatOutput(buf, doc, cur, encoding, format);
781 break;
782 }
783
784 /*
785 * Get specific HTML info for that node.
786 */
787 if (cur->ns == NULL)
788 info = htmlTagLookup(cur->name);
789 else
790 info = NULL;
791
792 xmlOutputBufferWriteString(buf, "<");
793 if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) {
794 xmlOutputBufferWriteString(buf, (const char *)cur->ns->prefix);
795 xmlOutputBufferWriteString(buf, ":");
796 }
797 xmlOutputBufferWriteString(buf, (const char *)cur->name);
798 if (cur->nsDef)
799 xmlNsListDumpOutput(buf, cur->nsDef);
800 attr = cur->properties;
801 while (attr != NULL) {
802 htmlAttrDumpOutput(buf, doc, attr);
803 attr = attr->next;
804 }
805
806 if ((info != NULL) && (info->empty)) {
807 xmlOutputBufferWriteString(buf, ">");
808 } else if (cur->children == NULL) {
809 if ((info != NULL) && (info->saveEndTag != 0) &&
810 (xmlStrcmp(BAD_CAST info->name, BAD_CAST "html")) &&
811 (xmlStrcmp(BAD_CAST info->name, BAD_CAST "body"))) {
812 xmlOutputBufferWriteString(buf, ">");
813 } else {
814 xmlOutputBufferWriteString(buf, "></");
815 if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) {
816 xmlOutputBufferWriteString(buf,
817 (const char *)cur->ns->prefix);
818 xmlOutputBufferWriteString(buf, ":");
819 }
820 xmlOutputBufferWriteString(buf, (const char *)cur->name);
821 xmlOutputBufferWriteString(buf, ">");
822 }
823 } else {
824 xmlOutputBufferWriteString(buf, ">");
825 if ((format) && (info != NULL) && (!info->isinline) &&
826 (cur->children->type != HTML_TEXT_NODE) &&
827 (cur->children->type != HTML_ENTITY_REF_NODE) &&
828 (cur->children != cur->last) &&
829 (cur->name != NULL) &&
830 (cur->name[0] != 'p')) /* p, pre, param */
831 xmlOutputBufferWriteString(buf, "\n");
832 parent = cur;
833 cur = cur->children;
834 continue;
835 }
836
837 if ((format) && (cur->next != NULL) &&
838 (info != NULL) && (!info->isinline)) {
839 if ((cur->next->type != HTML_TEXT_NODE) &&
840 (cur->next->type != HTML_ENTITY_REF_NODE) &&
841 (parent != NULL) &&
842 (parent->name != NULL) &&
843 (parent->name[0] != 'p')) /* p, pre, param */
844 xmlOutputBufferWriteString(buf, "\n");
845 }
846
847 break;
848
849 case XML_ATTRIBUTE_NODE:
850 htmlAttrDumpOutput(buf, doc, (xmlAttrPtr) cur);
851 break;
852
853 case HTML_TEXT_NODE:
854 if (cur->content == NULL)
855 break;
856 if (((cur->name == (const xmlChar *)xmlStringText) ||
857 (cur->name != (const xmlChar *)xmlStringTextNoenc)) &&
858 ((parent == NULL) ||
859 ((xmlStrcasecmp(parent->name, BAD_CAST "script")) &&
860 (xmlStrcasecmp(parent->name, BAD_CAST "style"))))) {
861 xmlChar *buffer;
862
863 buffer = xmlEncodeEntitiesReentrant(doc, cur->content);
864 if (buffer != NULL) {
865 xmlOutputBufferWriteString(buf, (const char *)buffer);
866 xmlFree(buffer);
867 }
868 } else {
869 xmlOutputBufferWriteString(buf, (const char *)cur->content);
870 }
871 break;
872
873 case HTML_COMMENT_NODE:
874 if (cur->content != NULL) {
875 xmlOutputBufferWriteString(buf, "<!--");
876 xmlOutputBufferWriteString(buf, (const char *)cur->content);
877 xmlOutputBufferWriteString(buf, "-->");
878 }
879 break;
880
881 case HTML_PI_NODE:
882 if (cur->name != NULL) {
883 xmlOutputBufferWriteString(buf, "<?");
884 xmlOutputBufferWriteString(buf, (const char *)cur->name);
885 if (cur->content != NULL) {
886 xmlOutputBufferWriteString(buf, " ");
887 xmlOutputBufferWriteString(buf,
888 (const char *)cur->content);
889 }
890 xmlOutputBufferWriteString(buf, ">");
891 }
892 break;
893
894 case HTML_ENTITY_REF_NODE:
895 xmlOutputBufferWriteString(buf, "&");
896 xmlOutputBufferWriteString(buf, (const char *)cur->name);
897 xmlOutputBufferWriteString(buf, ";");
898 break;
899
900 case HTML_PRESERVE_NODE:
901 if (cur->content != NULL) {
902 xmlOutputBufferWriteString(buf, (const char *)cur->content);
903 }
904 break;
905
906 default:
907 break;
908 }
909
910 while (1) {
911 if (cur == root)
912 return;
913 if (cur->next != NULL) {
914 cur = cur->next;
915 break;
916 }
917
918 cur = parent;
919 /* cur->parent was validated when descending. */
920 parent = cur->parent;
921
922 if ((cur->type == XML_HTML_DOCUMENT_NODE) ||
923 (cur->type == XML_DOCUMENT_NODE)) {
924 xmlOutputBufferWriteString(buf, "\n");
925 } else {
926 if ((format) && (cur->ns == NULL))
927 info = htmlTagLookup(cur->name);
928 else
929 info = NULL;
930
931 if ((format) && (info != NULL) && (!info->isinline) &&
932 (cur->last->type != HTML_TEXT_NODE) &&
933 (cur->last->type != HTML_ENTITY_REF_NODE) &&
934 (cur->children != cur->last) &&
935 (cur->name != NULL) &&
936 (cur->name[0] != 'p')) /* p, pre, param */
937 xmlOutputBufferWriteString(buf, "\n");
938
939 xmlOutputBufferWriteString(buf, "</");
940 if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) {
941 xmlOutputBufferWriteString(buf, (const char *)cur->ns->prefix);
942 xmlOutputBufferWriteString(buf, ":");
943 }
944 xmlOutputBufferWriteString(buf, (const char *)cur->name);
945 xmlOutputBufferWriteString(buf, ">");
946
947 if ((format) && (info != NULL) && (!info->isinline) &&
948 (cur->next != NULL)) {
949 if ((cur->next->type != HTML_TEXT_NODE) &&
950 (cur->next->type != HTML_ENTITY_REF_NODE) &&
951 (parent != NULL) &&
952 (parent->name != NULL) &&
953 (parent->name[0] != 'p')) /* p, pre, param */
954 xmlOutputBufferWriteString(buf, "\n");
955 }
956 }
957 }
958 }
959 }
960
961 /**
962 * htmlNodeDumpOutput:
963 * @buf: the HTML buffer output
964 * @doc: the document
965 * @cur: the current node
966 * @encoding: the encoding string (unused)
967 *
968 * Dump an HTML node, recursive behaviour,children are printed too,
969 * and formatting returns/spaces are added.
970 */
971 void
htmlNodeDumpOutput(xmlOutputBufferPtr buf,xmlDocPtr doc,xmlNodePtr cur,const char * encoding ATTRIBUTE_UNUSED)972 htmlNodeDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc,
973 xmlNodePtr cur, const char *encoding ATTRIBUTE_UNUSED) {
974 htmlNodeDumpFormatOutput(buf, doc, cur, NULL, 1);
975 }
976
977 /**
978 * htmlDocContentDumpFormatOutput:
979 * @buf: the HTML buffer output
980 * @cur: the document
981 * @encoding: the encoding string (unused)
982 * @format: should formatting spaces been added
983 *
984 * Dump an HTML document.
985 */
986 void
htmlDocContentDumpFormatOutput(xmlOutputBufferPtr buf,xmlDocPtr cur,const char * encoding ATTRIBUTE_UNUSED,int format)987 htmlDocContentDumpFormatOutput(xmlOutputBufferPtr buf, xmlDocPtr cur,
988 const char *encoding ATTRIBUTE_UNUSED,
989 int format) {
990 int type = 0;
991 if (cur) {
992 type = cur->type;
993 cur->type = XML_HTML_DOCUMENT_NODE;
994 }
995 htmlNodeDumpFormatOutput(buf, cur, (xmlNodePtr) cur, NULL, format);
996 if (cur)
997 cur->type = (xmlElementType) type;
998 }
999
1000 /**
1001 * htmlDocContentDumpOutput:
1002 * @buf: the HTML buffer output
1003 * @cur: the document
1004 * @encoding: the encoding string (unused)
1005 *
1006 * Dump an HTML document. Formatting return/spaces are added.
1007 */
1008 void
htmlDocContentDumpOutput(xmlOutputBufferPtr buf,xmlDocPtr cur,const char * encoding ATTRIBUTE_UNUSED)1009 htmlDocContentDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr cur,
1010 const char *encoding ATTRIBUTE_UNUSED) {
1011 htmlNodeDumpFormatOutput(buf, cur, (xmlNodePtr) cur, NULL, 1);
1012 }
1013
1014 /************************************************************************
1015 * *
1016 * Saving functions front-ends *
1017 * *
1018 ************************************************************************/
1019
1020 /**
1021 * htmlDocDump:
1022 * @f: the FILE*
1023 * @cur: the document
1024 *
1025 * Dump an HTML document to an open FILE.
1026 *
1027 * returns: the number of byte written or -1 in case of failure.
1028 */
1029 int
htmlDocDump(FILE * f,xmlDocPtr cur)1030 htmlDocDump(FILE *f, xmlDocPtr cur) {
1031 xmlOutputBufferPtr buf;
1032 xmlCharEncodingHandlerPtr handler = NULL;
1033 const char *encoding;
1034 int ret;
1035
1036 xmlInitParser();
1037
1038 if ((cur == NULL) || (f == NULL)) {
1039 return(-1);
1040 }
1041
1042 encoding = (const char *) htmlGetMetaEncoding(cur);
1043
1044 if (encoding != NULL) {
1045 xmlCharEncoding enc;
1046
1047 enc = xmlParseCharEncoding(encoding);
1048 if (enc != XML_CHAR_ENCODING_UTF8) {
1049 handler = xmlFindCharEncodingHandler(encoding);
1050 if (handler == NULL)
1051 htmlSaveErr(XML_SAVE_UNKNOWN_ENCODING, NULL, encoding);
1052 }
1053 } else {
1054 /*
1055 * Fallback to HTML or ASCII when the encoding is unspecified
1056 */
1057 if (handler == NULL)
1058 handler = xmlFindCharEncodingHandler("HTML");
1059 if (handler == NULL)
1060 handler = xmlFindCharEncodingHandler("ascii");
1061 }
1062
1063 buf = xmlOutputBufferCreateFile(f, handler);
1064 if (buf == NULL) return(-1);
1065 htmlDocContentDumpOutput(buf, cur, NULL);
1066
1067 ret = xmlOutputBufferClose(buf);
1068 return(ret);
1069 }
1070
1071 /**
1072 * htmlSaveFile:
1073 * @filename: the filename (or URL)
1074 * @cur: the document
1075 *
1076 * Dump an HTML document to a file. If @filename is "-" the stdout file is
1077 * used.
1078 * returns: the number of byte written or -1 in case of failure.
1079 */
1080 int
htmlSaveFile(const char * filename,xmlDocPtr cur)1081 htmlSaveFile(const char *filename, xmlDocPtr cur) {
1082 xmlOutputBufferPtr buf;
1083 xmlCharEncodingHandlerPtr handler = NULL;
1084 const char *encoding;
1085 int ret;
1086
1087 if ((cur == NULL) || (filename == NULL))
1088 return(-1);
1089
1090 xmlInitParser();
1091
1092 encoding = (const char *) htmlGetMetaEncoding(cur);
1093
1094 if (encoding != NULL) {
1095 xmlCharEncoding enc;
1096
1097 enc = xmlParseCharEncoding(encoding);
1098 if (enc != XML_CHAR_ENCODING_UTF8) {
1099 handler = xmlFindCharEncodingHandler(encoding);
1100 if (handler == NULL)
1101 htmlSaveErr(XML_SAVE_UNKNOWN_ENCODING, NULL, encoding);
1102 }
1103 } else {
1104 /*
1105 * Fallback to HTML or ASCII when the encoding is unspecified
1106 */
1107 if (handler == NULL)
1108 handler = xmlFindCharEncodingHandler("HTML");
1109 if (handler == NULL)
1110 handler = xmlFindCharEncodingHandler("ascii");
1111 }
1112
1113 /*
1114 * save the content to a temp buffer.
1115 */
1116 buf = xmlOutputBufferCreateFilename(filename, handler, cur->compression);
1117 if (buf == NULL) return(0);
1118
1119 htmlDocContentDumpOutput(buf, cur, NULL);
1120
1121 ret = xmlOutputBufferClose(buf);
1122 return(ret);
1123 }
1124
1125 /**
1126 * htmlSaveFileFormat:
1127 * @filename: the filename
1128 * @cur: the document
1129 * @format: should formatting spaces been added
1130 * @encoding: the document encoding
1131 *
1132 * Dump an HTML document to a file using a given encoding.
1133 *
1134 * returns: the number of byte written or -1 in case of failure.
1135 */
1136 int
htmlSaveFileFormat(const char * filename,xmlDocPtr cur,const char * encoding,int format)1137 htmlSaveFileFormat(const char *filename, xmlDocPtr cur,
1138 const char *encoding, int format) {
1139 xmlOutputBufferPtr buf;
1140 xmlCharEncodingHandlerPtr handler = NULL;
1141 int ret;
1142
1143 if ((cur == NULL) || (filename == NULL))
1144 return(-1);
1145
1146 xmlInitParser();
1147
1148 if (encoding != NULL) {
1149 xmlCharEncoding enc;
1150
1151 enc = xmlParseCharEncoding(encoding);
1152 if (enc != XML_CHAR_ENCODING_UTF8) {
1153 handler = xmlFindCharEncodingHandler(encoding);
1154 if (handler == NULL)
1155 htmlSaveErr(XML_SAVE_UNKNOWN_ENCODING, NULL, encoding);
1156 }
1157 htmlSetMetaEncoding(cur, (const xmlChar *) encoding);
1158 } else {
1159 htmlSetMetaEncoding(cur, (const xmlChar *) "UTF-8");
1160
1161 /*
1162 * Fallback to HTML or ASCII when the encoding is unspecified
1163 */
1164 if (handler == NULL)
1165 handler = xmlFindCharEncodingHandler("HTML");
1166 if (handler == NULL)
1167 handler = xmlFindCharEncodingHandler("ascii");
1168 }
1169
1170 /*
1171 * save the content to a temp buffer.
1172 */
1173 buf = xmlOutputBufferCreateFilename(filename, handler, 0);
1174 if (buf == NULL) return(0);
1175
1176 htmlDocContentDumpFormatOutput(buf, cur, encoding, format);
1177
1178 ret = xmlOutputBufferClose(buf);
1179 return(ret);
1180 }
1181
1182 /**
1183 * htmlSaveFileEnc:
1184 * @filename: the filename
1185 * @cur: the document
1186 * @encoding: the document encoding
1187 *
1188 * Dump an HTML document to a file using a given encoding
1189 * and formatting returns/spaces are added.
1190 *
1191 * returns: the number of byte written or -1 in case of failure.
1192 */
1193 int
htmlSaveFileEnc(const char * filename,xmlDocPtr cur,const char * encoding)1194 htmlSaveFileEnc(const char *filename, xmlDocPtr cur, const char *encoding) {
1195 return(htmlSaveFileFormat(filename, cur, encoding, 1));
1196 }
1197
1198 #endif /* LIBXML_OUTPUT_ENABLED */
1199
1200 #endif /* LIBXML_HTML_ENABLED */
1201