xref: /reactos/sdk/lib/3rdparty/libxml2/HTMLtree.c (revision 911153da)
1 /*
2  * HTMLtree.c : implementation of access function for an HTML tree.
3  *
4  * See Copyright for the status of this software.
5  *
6  * daniel@veillard.com
7  */
8 
9 
10 #define IN_LIBXML
11 #include "libxml.h"
12 #ifdef LIBXML_HTML_ENABLED
13 
14 #include <string.h> /* for memset() only ! */
15 #include <ctype.h>
16 #include <stdlib.h>
17 
18 #include <libxml/xmlmemory.h>
19 #include <libxml/HTMLparser.h>
20 #include <libxml/HTMLtree.h>
21 #include <libxml/entities.h>
22 #include <libxml/valid.h>
23 #include <libxml/xmlerror.h>
24 #include <libxml/parserInternals.h>
25 #include <libxml/globals.h>
26 #include <libxml/uri.h>
27 
28 #include "buf.h"
29 
30 /************************************************************************
31  *									*
32  *		Getting/Setting encoding meta tags			*
33  *									*
34  ************************************************************************/
35 
36 /**
37  * htmlGetMetaEncoding:
38  * @doc:  the document
39  *
40  * Encoding definition lookup in the Meta tags
41  *
42  * Returns the current encoding as flagged in the HTML source
43  */
44 const xmlChar *
htmlGetMetaEncoding(htmlDocPtr doc)45 htmlGetMetaEncoding(htmlDocPtr doc) {
46     htmlNodePtr cur;
47     const xmlChar *content;
48     const xmlChar *encoding;
49 
50     if (doc == NULL)
51 	return(NULL);
52     cur = doc->children;
53 
54     /*
55      * Search the html
56      */
57     while (cur != NULL) {
58 	if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
59 	    if (xmlStrEqual(cur->name, BAD_CAST"html"))
60 		break;
61 	    if (xmlStrEqual(cur->name, BAD_CAST"head"))
62 		goto found_head;
63 	    if (xmlStrEqual(cur->name, BAD_CAST"meta"))
64 		goto found_meta;
65 	}
66 	cur = cur->next;
67     }
68     if (cur == NULL)
69 	return(NULL);
70     cur = cur->children;
71 
72     /*
73      * Search the head
74      */
75     while (cur != NULL) {
76 	if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
77 	    if (xmlStrEqual(cur->name, BAD_CAST"head"))
78 		break;
79 	    if (xmlStrEqual(cur->name, BAD_CAST"meta"))
80 		goto found_meta;
81 	}
82 	cur = cur->next;
83     }
84     if (cur == NULL)
85 	return(NULL);
86 found_head:
87     cur = cur->children;
88 
89     /*
90      * Search the meta elements
91      */
92 found_meta:
93     while (cur != NULL) {
94 	if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
95 	    if (xmlStrEqual(cur->name, BAD_CAST"meta")) {
96 		xmlAttrPtr attr = cur->properties;
97 		int http;
98 		const xmlChar *value;
99 
100 		content = NULL;
101 		http = 0;
102 		while (attr != NULL) {
103 		    if ((attr->children != NULL) &&
104 		        (attr->children->type == XML_TEXT_NODE) &&
105 		        (attr->children->next == NULL)) {
106 			value = attr->children->content;
107 			if ((!xmlStrcasecmp(attr->name, BAD_CAST"http-equiv"))
108 			 && (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
109 			    http = 1;
110 			else if ((value != NULL)
111 			 && (!xmlStrcasecmp(attr->name, BAD_CAST"content")))
112 			    content = value;
113 			if ((http != 0) && (content != NULL))
114 			    goto found_content;
115 		    }
116 		    attr = attr->next;
117 		}
118 	    }
119 	}
120 	cur = cur->next;
121     }
122     return(NULL);
123 
124 found_content:
125     encoding = xmlStrstr(content, BAD_CAST"charset=");
126     if (encoding == NULL)
127 	encoding = xmlStrstr(content, BAD_CAST"Charset=");
128     if (encoding == NULL)
129 	encoding = xmlStrstr(content, BAD_CAST"CHARSET=");
130     if (encoding != NULL) {
131 	encoding += 8;
132     } else {
133 	encoding = xmlStrstr(content, BAD_CAST"charset =");
134 	if (encoding == NULL)
135 	    encoding = xmlStrstr(content, BAD_CAST"Charset =");
136 	if (encoding == NULL)
137 	    encoding = xmlStrstr(content, BAD_CAST"CHARSET =");
138 	if (encoding != NULL)
139 	    encoding += 9;
140     }
141     if (encoding != NULL) {
142 	while ((*encoding == ' ') || (*encoding == '\t')) encoding++;
143     }
144     return(encoding);
145 }
146 
147 /**
148  * htmlSetMetaEncoding:
149  * @doc:  the document
150  * @encoding:  the encoding string
151  *
152  * Sets the current encoding in the Meta tags
153  * NOTE: this will not change the document content encoding, just
154  * the META flag associated.
155  *
156  * Returns 0 in case of success and -1 in case of error
157  */
158 int
htmlSetMetaEncoding(htmlDocPtr doc,const xmlChar * encoding)159 htmlSetMetaEncoding(htmlDocPtr doc, const xmlChar *encoding) {
160     htmlNodePtr cur, meta = NULL, head = NULL;
161     const xmlChar *content = NULL;
162     char newcontent[100];
163 
164     newcontent[0] = 0;
165 
166     if (doc == NULL)
167 	return(-1);
168 
169     /* html isn't a real encoding it's just libxml2 way to get entities */
170     if (!xmlStrcasecmp(encoding, BAD_CAST "html"))
171         return(-1);
172 
173     if (encoding != NULL) {
174 	snprintf(newcontent, sizeof(newcontent), "text/html; charset=%s",
175                 (char *)encoding);
176 	newcontent[sizeof(newcontent) - 1] = 0;
177     }
178 
179     cur = doc->children;
180 
181     /*
182      * Search the html
183      */
184     while (cur != NULL) {
185 	if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
186 	    if (xmlStrcasecmp(cur->name, BAD_CAST"html") == 0)
187 		break;
188 	    if (xmlStrcasecmp(cur->name, BAD_CAST"head") == 0)
189 		goto found_head;
190 	    if (xmlStrcasecmp(cur->name, BAD_CAST"meta") == 0)
191 		goto found_meta;
192 	}
193 	cur = cur->next;
194     }
195     if (cur == NULL)
196 	return(-1);
197     cur = cur->children;
198 
199     /*
200      * Search the head
201      */
202     while (cur != NULL) {
203 	if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
204 	    if (xmlStrcasecmp(cur->name, BAD_CAST"head") == 0)
205 		break;
206 	    if (xmlStrcasecmp(cur->name, BAD_CAST"meta") == 0) {
207                 head = cur->parent;
208 		goto found_meta;
209             }
210 	}
211 	cur = cur->next;
212     }
213     if (cur == NULL)
214 	return(-1);
215 found_head:
216     head = cur;
217     if (cur->children == NULL)
218         goto create;
219     cur = cur->children;
220 
221 found_meta:
222     /*
223      * Search and update all the remaining the meta elements carrying
224      * encoding information
225      */
226     while (cur != NULL) {
227 	if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
228 	    if (xmlStrcasecmp(cur->name, BAD_CAST"meta") == 0) {
229 		xmlAttrPtr attr = cur->properties;
230 		int http;
231 		const xmlChar *value;
232 
233 		content = NULL;
234 		http = 0;
235 		while (attr != NULL) {
236 		    if ((attr->children != NULL) &&
237 		        (attr->children->type == XML_TEXT_NODE) &&
238 		        (attr->children->next == NULL)) {
239 			value = attr->children->content;
240 			if ((!xmlStrcasecmp(attr->name, BAD_CAST"http-equiv"))
241 			 && (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
242 			    http = 1;
243 			else
244                         {
245                            if ((value != NULL) &&
246                                (!xmlStrcasecmp(attr->name, BAD_CAST"content")))
247 			       content = value;
248                         }
249 		        if ((http != 0) && (content != NULL))
250 			    break;
251 		    }
252 		    attr = attr->next;
253 		}
254 		if ((http != 0) && (content != NULL)) {
255 		    meta = cur;
256 		    break;
257 		}
258 
259 	    }
260 	}
261 	cur = cur->next;
262     }
263 create:
264     if (meta == NULL) {
265         if ((encoding != NULL) && (head != NULL)) {
266             /*
267              * Create a new Meta element with the right attributes
268              */
269 
270             meta = xmlNewDocNode(doc, NULL, BAD_CAST"meta", NULL);
271             if (head->children == NULL)
272                 xmlAddChild(head, meta);
273             else
274                 xmlAddPrevSibling(head->children, meta);
275             xmlNewProp(meta, BAD_CAST"http-equiv", BAD_CAST"Content-Type");
276             xmlNewProp(meta, BAD_CAST"content", BAD_CAST newcontent);
277         }
278     } else {
279         /* remove the meta tag if NULL is passed */
280         if (encoding == NULL) {
281             xmlUnlinkNode(meta);
282             xmlFreeNode(meta);
283         }
284         /* change the document only if there is a real encoding change */
285         else if (xmlStrcasestr(content, encoding) == NULL) {
286             xmlSetProp(meta, BAD_CAST"content", BAD_CAST newcontent);
287         }
288     }
289 
290 
291     return(0);
292 }
293 
/**
 * htmlBooleanAttrs:
 *
 * These are the HTML attributes which will be output
 * in minimized form, i.e. <option selected="selected"> will be
 * output as <option selected>, as per XSLT 1.0 16.2 "HTML Output Method"
 *
 * The table is terminated by a NULL entry; htmlIsBooleanAttr() walks it
 * until that sentinel and compares names case-insensitively.
 */
static const char* const htmlBooleanAttrs[] = {
  "checked", "compact", "declare", "defer", "disabled", "ismap",
  "multiple", "nohref", "noresize", "noshade", "nowrap", "readonly",
  "selected", NULL
};
307 
308 
309 /**
310  * htmlIsBooleanAttr:
311  * @name:  the name of the attribute to check
312  *
313  * Determine if a given attribute is a boolean attribute.
314  *
315  * returns: false if the attribute is not boolean, true otherwise.
316  */
317 int
htmlIsBooleanAttr(const xmlChar * name)318 htmlIsBooleanAttr(const xmlChar *name)
319 {
320     int i = 0;
321 
322     while (htmlBooleanAttrs[i] != NULL) {
323         if (xmlStrcasecmp((const xmlChar *)htmlBooleanAttrs[i], name) == 0)
324             return 1;
325         i++;
326     }
327     return 0;
328 }
329 
330 #ifdef LIBXML_OUTPUT_ENABLED
/*
 * private routine exported from xmlIO.c
 *
 * Declared here because this file calls it from
 * htmlDocDumpMemoryFormat() to obtain an output buffer bound to an
 * (optional) encoding handler.
 */
xmlOutputBufferPtr
xmlAllocOutputBufferInternal(xmlCharEncodingHandlerPtr encoder);
336 /************************************************************************
337  *									*
338  *			Output error handlers				*
339  *									*
340  ************************************************************************/
341 /**
342  * htmlSaveErrMemory:
343  * @extra:  extra information
344  *
345  * Handle an out of memory condition
346  */
347 static void
htmlSaveErrMemory(const char * extra)348 htmlSaveErrMemory(const char *extra)
349 {
350     __xmlSimpleError(XML_FROM_OUTPUT, XML_ERR_NO_MEMORY, NULL, NULL, extra);
351 }
352 
353 /**
354  * htmlSaveErr:
355  * @code:  the error number
356  * @node:  the location of the error.
357  * @extra:  extra information
358  *
359  * Handle an out of memory condition
360  */
361 static void
htmlSaveErr(int code,xmlNodePtr node,const char * extra)362 htmlSaveErr(int code, xmlNodePtr node, const char *extra)
363 {
364     const char *msg = NULL;
365 
366     switch(code) {
367         case XML_SAVE_NOT_UTF8:
368 	    msg = "string is not in UTF-8\n";
369 	    break;
370 	case XML_SAVE_CHAR_INVALID:
371 	    msg = "invalid character value\n";
372 	    break;
373 	case XML_SAVE_UNKNOWN_ENCODING:
374 	    msg = "unknown encoding %s\n";
375 	    break;
376 	case XML_SAVE_NO_DOCTYPE:
377 	    msg = "HTML has no DOCTYPE\n";
378 	    break;
379 	default:
380 	    msg = "unexpected error number\n";
381     }
382     __xmlSimpleError(XML_FROM_OUTPUT, code, node, msg, extra);
383 }
384 
385 /************************************************************************
386  *									*
387  *		Dumping HTML tree content to a simple buffer		*
388  *									*
389  ************************************************************************/
390 
391 /**
392  * htmlBufNodeDumpFormat:
393  * @buf:  the xmlBufPtr output
394  * @doc:  the document
395  * @cur:  the current node
396  * @format:  should formatting spaces been added
397  *
398  * Dump an HTML node, recursive behaviour,children are printed too.
399  *
400  * Returns the number of byte written or -1 in case of error
401  */
402 static size_t
htmlBufNodeDumpFormat(xmlBufPtr buf,xmlDocPtr doc,xmlNodePtr cur,int format)403 htmlBufNodeDumpFormat(xmlBufPtr buf, xmlDocPtr doc, xmlNodePtr cur,
404 	           int format) {
405     size_t use;
406     int ret;
407     xmlOutputBufferPtr outbuf;
408 
409     if (cur == NULL) {
410 	return (-1);
411     }
412     if (buf == NULL) {
413 	return (-1);
414     }
415     outbuf = (xmlOutputBufferPtr) xmlMalloc(sizeof(xmlOutputBuffer));
416     if (outbuf == NULL) {
417         htmlSaveErrMemory("allocating HTML output buffer");
418 	return (-1);
419     }
420     memset(outbuf, 0, (size_t) sizeof(xmlOutputBuffer));
421     outbuf->buffer = buf;
422     outbuf->encoder = NULL;
423     outbuf->writecallback = NULL;
424     outbuf->closecallback = NULL;
425     outbuf->context = NULL;
426     outbuf->written = 0;
427 
428     use = xmlBufUse(buf);
429     htmlNodeDumpFormatOutput(outbuf, doc, cur, NULL, format);
430     xmlFree(outbuf);
431     ret = xmlBufUse(buf) - use;
432     return (ret);
433 }
434 
435 /**
436  * htmlNodeDump:
437  * @buf:  the HTML buffer output
438  * @doc:  the document
439  * @cur:  the current node
440  *
441  * Dump an HTML node, recursive behaviour,children are printed too,
442  * and formatting returns are added.
443  *
444  * Returns the number of byte written or -1 in case of error
445  */
446 int
htmlNodeDump(xmlBufferPtr buf,xmlDocPtr doc,xmlNodePtr cur)447 htmlNodeDump(xmlBufferPtr buf, xmlDocPtr doc, xmlNodePtr cur) {
448     xmlBufPtr buffer;
449     size_t ret;
450 
451     if ((buf == NULL) || (cur == NULL))
452         return(-1);
453 
454     xmlInitParser();
455     buffer = xmlBufFromBuffer(buf);
456     if (buffer == NULL)
457         return(-1);
458 
459     ret = htmlBufNodeDumpFormat(buffer, doc, cur, 1);
460 
461     xmlBufBackToBuffer(buffer);
462 
463     if (ret > INT_MAX)
464         return(-1);
465     return((int) ret);
466 }
467 
468 /**
469  * htmlNodeDumpFileFormat:
470  * @out:  the FILE pointer
471  * @doc:  the document
472  * @cur:  the current node
473  * @encoding: the document encoding
474  * @format:  should formatting spaces been added
475  *
476  * Dump an HTML node, recursive behaviour,children are printed too.
477  *
478  * TODO: if encoding == NULL try to save in the doc encoding
479  *
480  * returns: the number of byte written or -1 in case of failure.
481  */
482 int
htmlNodeDumpFileFormat(FILE * out,xmlDocPtr doc,xmlNodePtr cur,const char * encoding,int format)483 htmlNodeDumpFileFormat(FILE *out, xmlDocPtr doc,
484 	               xmlNodePtr cur, const char *encoding, int format) {
485     xmlOutputBufferPtr buf;
486     xmlCharEncodingHandlerPtr handler = NULL;
487     int ret;
488 
489     xmlInitParser();
490 
491     if (encoding != NULL) {
492 	xmlCharEncoding enc;
493 
494 	enc = xmlParseCharEncoding(encoding);
495 	if (enc != XML_CHAR_ENCODING_UTF8) {
496 	    handler = xmlFindCharEncodingHandler(encoding);
497 	    if (handler == NULL)
498 		htmlSaveErr(XML_SAVE_UNKNOWN_ENCODING, NULL, encoding);
499 	}
500     } else {
501         /*
502          * Fallback to HTML or ASCII when the encoding is unspecified
503          */
504         if (handler == NULL)
505             handler = xmlFindCharEncodingHandler("HTML");
506         if (handler == NULL)
507             handler = xmlFindCharEncodingHandler("ascii");
508     }
509 
510     /*
511      * save the content to a temp buffer.
512      */
513     buf = xmlOutputBufferCreateFile(out, handler);
514     if (buf == NULL) return(0);
515 
516     htmlNodeDumpFormatOutput(buf, doc, cur, NULL, format);
517 
518     ret = xmlOutputBufferClose(buf);
519     return(ret);
520 }
521 
522 /**
523  * htmlNodeDumpFile:
524  * @out:  the FILE pointer
525  * @doc:  the document
526  * @cur:  the current node
527  *
528  * Dump an HTML node, recursive behaviour,children are printed too,
529  * and formatting returns are added.
530  */
531 void
htmlNodeDumpFile(FILE * out,xmlDocPtr doc,xmlNodePtr cur)532 htmlNodeDumpFile(FILE *out, xmlDocPtr doc, xmlNodePtr cur) {
533     htmlNodeDumpFileFormat(out, doc, cur, NULL, 1);
534 }
535 
536 /**
537  * htmlDocDumpMemoryFormat:
538  * @cur:  the document
539  * @mem:  OUT: the memory pointer
540  * @size:  OUT: the memory length
541  * @format:  should formatting spaces been added
542  *
543  * Dump an HTML document in memory and return the xmlChar * and it's size.
544  * It's up to the caller to free the memory.
545  */
546 void
htmlDocDumpMemoryFormat(xmlDocPtr cur,xmlChar ** mem,int * size,int format)547 htmlDocDumpMemoryFormat(xmlDocPtr cur, xmlChar**mem, int *size, int format) {
548     xmlOutputBufferPtr buf;
549     xmlCharEncodingHandlerPtr handler = NULL;
550     const char *encoding;
551 
552     xmlInitParser();
553 
554     if ((mem == NULL) || (size == NULL))
555         return;
556     if (cur == NULL) {
557 	*mem = NULL;
558 	*size = 0;
559 	return;
560     }
561 
562     encoding = (const char *) htmlGetMetaEncoding(cur);
563 
564     if (encoding != NULL) {
565 	xmlCharEncoding enc;
566 
567 	enc = xmlParseCharEncoding(encoding);
568 	if (enc != XML_CHAR_ENCODING_UTF8) {
569 	    handler = xmlFindCharEncodingHandler(encoding);
570 	    if (handler == NULL)
571                 htmlSaveErr(XML_SAVE_UNKNOWN_ENCODING, NULL, encoding);
572 
573 	}
574     } else {
575         /*
576          * Fallback to HTML or ASCII when the encoding is unspecified
577          */
578         if (handler == NULL)
579             handler = xmlFindCharEncodingHandler("HTML");
580         if (handler == NULL)
581             handler = xmlFindCharEncodingHandler("ascii");
582     }
583 
584     buf = xmlAllocOutputBufferInternal(handler);
585     if (buf == NULL) {
586 	*mem = NULL;
587 	*size = 0;
588 	return;
589     }
590 
591     htmlDocContentDumpFormatOutput(buf, cur, NULL, format);
592 
593     xmlOutputBufferFlush(buf);
594     if (buf->conv != NULL) {
595 	*size = xmlBufUse(buf->conv);
596 	*mem = xmlStrndup(xmlBufContent(buf->conv), *size);
597     } else {
598 	*size = xmlBufUse(buf->buffer);
599 	*mem = xmlStrndup(xmlBufContent(buf->buffer), *size);
600     }
601     (void)xmlOutputBufferClose(buf);
602 }
603 
604 /**
605  * htmlDocDumpMemory:
606  * @cur:  the document
607  * @mem:  OUT: the memory pointer
608  * @size:  OUT: the memory length
609  *
610  * Dump an HTML document in memory and return the xmlChar * and it's size.
611  * It's up to the caller to free the memory.
612  */
613 void
htmlDocDumpMemory(xmlDocPtr cur,xmlChar ** mem,int * size)614 htmlDocDumpMemory(xmlDocPtr cur, xmlChar**mem, int *size) {
615 	htmlDocDumpMemoryFormat(cur, mem, size, 1);
616 }
617 
618 
619 /************************************************************************
620  *									*
621  *		Dumping HTML tree content to an I/O output buffer	*
622  *									*
623  ************************************************************************/
624 
/*
 * Forward declaration only: no definition is visible in this part of the
 * file (presumably it lives in another source file -- confirm). It is
 * called by htmlNodeDumpFormatOutput() to write an element's nsDef list.
 */
void xmlNsListDumpOutput(xmlOutputBufferPtr buf, xmlNsPtr cur);
626 
627 /**
628  * htmlDtdDumpOutput:
629  * @buf:  the HTML buffer output
630  * @doc:  the document
631  * @encoding:  the encoding string
632  *
633  * TODO: check whether encoding is needed
634  *
635  * Dump the HTML document DTD, if any.
636  */
637 static void
htmlDtdDumpOutput(xmlOutputBufferPtr buf,xmlDocPtr doc,const char * encoding ATTRIBUTE_UNUSED)638 htmlDtdDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc,
639 	          const char *encoding ATTRIBUTE_UNUSED) {
640     xmlDtdPtr cur = doc->intSubset;
641 
642     if (cur == NULL) {
643 	htmlSaveErr(XML_SAVE_NO_DOCTYPE, (xmlNodePtr) doc, NULL);
644 	return;
645     }
646     xmlOutputBufferWriteString(buf, "<!DOCTYPE ");
647     xmlOutputBufferWriteString(buf, (const char *)cur->name);
648     if (cur->ExternalID != NULL) {
649 	xmlOutputBufferWriteString(buf, " PUBLIC ");
650 	xmlBufWriteQuotedString(buf->buffer, cur->ExternalID);
651 	if (cur->SystemID != NULL) {
652 	    xmlOutputBufferWriteString(buf, " ");
653 	    xmlBufWriteQuotedString(buf->buffer, cur->SystemID);
654 	}
655     } else if (cur->SystemID != NULL &&
656 	       xmlStrcmp(cur->SystemID, BAD_CAST "about:legacy-compat")) {
657 	xmlOutputBufferWriteString(buf, " SYSTEM ");
658 	xmlBufWriteQuotedString(buf->buffer, cur->SystemID);
659     }
660     xmlOutputBufferWriteString(buf, ">\n");
661 }
662 
663 /**
664  * htmlAttrDumpOutput:
665  * @buf:  the HTML buffer output
666  * @doc:  the document
667  * @cur:  the attribute pointer
668  *
669  * Dump an HTML attribute
670  */
671 static void
htmlAttrDumpOutput(xmlOutputBufferPtr buf,xmlDocPtr doc,xmlAttrPtr cur)672 htmlAttrDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, xmlAttrPtr cur) {
673     xmlChar *value;
674 
675     /*
676      * The html output method should not escape a & character
677      * occurring in an attribute value immediately followed by
678      * a { character (see Section B.7.1 of the HTML 4.0 Recommendation).
679      * This is implemented in xmlEncodeEntitiesReentrant
680      */
681 
682     if (cur == NULL) {
683 	return;
684     }
685     xmlOutputBufferWriteString(buf, " ");
686     if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) {
687         xmlOutputBufferWriteString(buf, (const char *)cur->ns->prefix);
688 	xmlOutputBufferWriteString(buf, ":");
689     }
690     xmlOutputBufferWriteString(buf, (const char *)cur->name);
691     if ((cur->children != NULL) && (!htmlIsBooleanAttr(cur->name))) {
692 	value = xmlNodeListGetString(doc, cur->children, 0);
693 	if (value) {
694 	    xmlOutputBufferWriteString(buf, "=");
695 	    if ((cur->ns == NULL) && (cur->parent != NULL) &&
696 		(cur->parent->ns == NULL) &&
697 		((!xmlStrcasecmp(cur->name, BAD_CAST "href")) ||
698 	         (!xmlStrcasecmp(cur->name, BAD_CAST "action")) ||
699 		 (!xmlStrcasecmp(cur->name, BAD_CAST "src")) ||
700 		 ((!xmlStrcasecmp(cur->name, BAD_CAST "name")) &&
701 		  (!xmlStrcasecmp(cur->parent->name, BAD_CAST "a"))))) {
702 		xmlChar *escaped;
703 		xmlChar *tmp = value;
704 
705 		while (IS_BLANK_CH(*tmp)) tmp++;
706 
707 		/*
708 		 * the < and > have already been escaped at the entity level
709 		 * And doing so here breaks server side includes
710 		 */
711 		escaped = xmlURIEscapeStr(tmp, BAD_CAST"@/:=?;#%&,+<>");
712 		if (escaped != NULL) {
713 		    xmlBufWriteQuotedString(buf->buffer, escaped);
714 		    xmlFree(escaped);
715 		} else {
716 		    xmlBufWriteQuotedString(buf->buffer, value);
717 		}
718 	    } else {
719 		xmlBufWriteQuotedString(buf->buffer, value);
720 	    }
721 	    xmlFree(value);
722 	} else  {
723 	    xmlOutputBufferWriteString(buf, "=\"\"");
724 	}
725     }
726 }
727 
/**
 * htmlNodeDumpFormatOutput:
 * @buf:  the HTML buffer output
 * @doc:  the document
 * @cur:  the current node
 * @encoding:  the encoding string (only forwarded to the recursive
 *             fallback call, otherwise unused)
 * @format:  should formatting spaces been added
 *
 * Dump an HTML node, children are printed too.
 *
 * The subtree rooted at @cur is walked iteratively: the outer loop
 * writes the current node and descends into its children, the inner
 * loop moves on to the next sibling or climbs back to the parent while
 * writing end tags. The function only calls itself for element nodes
 * whose parent pointer does not match the node the walk came from.
 * Nothing is written if @cur or @buf is NULL.
 */
void
htmlNodeDumpFormatOutput(xmlOutputBufferPtr buf, xmlDocPtr doc,
	                 xmlNodePtr cur, const char *encoding ATTRIBUTE_UNUSED,
                         int format) {
    xmlNodePtr root, parent;
    xmlAttrPtr attr;
    const htmlElemDesc * info;

    xmlInitParser();

    if ((cur == NULL) || (buf == NULL)) {
	return;
    }

    /*
     * root is where the walk stops; parent is the node the walk
     * considers to be the parent of cur (updated when descending and
     * when climbing back up).
     */
    root = cur;
    parent = cur->parent;
    while (1) {
        switch (cur->type) {
        case XML_HTML_DOCUMENT_NODE:
        case XML_DOCUMENT_NODE:
            /* the DOCTYPE comes first, if the document has one */
            if (((xmlDocPtr) cur)->intSubset != NULL) {
                htmlDtdDumpOutput(buf, (xmlDocPtr) cur, NULL);
            }
            if (cur->children != NULL) {
                /* Always validate cur->parent when descending. */
                if (cur->parent == parent) {
                    parent = cur;
                    cur = cur->children;
                    continue;
                }
            } else {
                xmlOutputBufferWriteString(buf, "\n");
            }
            break;

        case XML_ELEMENT_NODE:
            /*
             * Some users like lxml are known to pass nodes with a corrupted
             * tree structure. Fall back to a recursive call to handle this
             * case.
             */
            if ((cur->parent != parent) && (cur->children != NULL)) {
                htmlNodeDumpFormatOutput(buf, doc, cur, encoding, format);
                break;
            }

            /*
             * Get specific HTML info for that node.
             */
            if (cur->ns == NULL)
                info = htmlTagLookup(cur->name);
            else
                info = NULL;

            /* start tag: "<prefix:name", namespace definitions, attributes */
            xmlOutputBufferWriteString(buf, "<");
            if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) {
                xmlOutputBufferWriteString(buf, (const char *)cur->ns->prefix);
                xmlOutputBufferWriteString(buf, ":");
            }
            xmlOutputBufferWriteString(buf, (const char *)cur->name);
            if (cur->nsDef)
                xmlNsListDumpOutput(buf, cur->nsDef);
            attr = cur->properties;
            while (attr != NULL) {
                htmlAttrDumpOutput(buf, doc, attr);
                attr = attr->next;
            }

            if ((info != NULL) && (info->empty)) {
                /* element flagged empty: no content and no end tag written */
                xmlOutputBufferWriteString(buf, ">");
            } else if (cur->children == NULL) {
                /*
                 * No children: the end tag is left out when the element
                 * description has saveEndTag set and the element is neither
                 * html nor body; otherwise "></name>" is written.
                 */
                if ((info != NULL) && (info->saveEndTag != 0) &&
                    (xmlStrcmp(BAD_CAST info->name, BAD_CAST "html")) &&
                    (xmlStrcmp(BAD_CAST info->name, BAD_CAST "body"))) {
                    xmlOutputBufferWriteString(buf, ">");
                } else {
                    xmlOutputBufferWriteString(buf, "></");
                    if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) {
                        xmlOutputBufferWriteString(buf,
                                (const char *)cur->ns->prefix);
                        xmlOutputBufferWriteString(buf, ":");
                    }
                    xmlOutputBufferWriteString(buf, (const char *)cur->name);
                    xmlOutputBufferWriteString(buf, ">");
                }
            } else {
                /*
                 * Element with children: close the start tag, optionally
                 * add a newline, then descend. The end tag is written by
                 * the inner loop below when the walk climbs back up.
                 */
                xmlOutputBufferWriteString(buf, ">");
                if ((format) && (info != NULL) && (!info->isinline) &&
                    (cur->children->type != HTML_TEXT_NODE) &&
                    (cur->children->type != HTML_ENTITY_REF_NODE) &&
                    (cur->children != cur->last) &&
                    (cur->name != NULL) &&
                    (cur->name[0] != 'p')) /* p, pre, param */
                    xmlOutputBufferWriteString(buf, "\n");
                parent = cur;
                cur = cur->children;
                continue;
            }

            /*
             * Only reached for elements written without descending:
             * optional newline before the next sibling.
             */
            if ((format) && (cur->next != NULL) &&
                (info != NULL) && (!info->isinline)) {
                if ((cur->next->type != HTML_TEXT_NODE) &&
                    (cur->next->type != HTML_ENTITY_REF_NODE) &&
                    (parent != NULL) &&
                    (parent->name != NULL) &&
                    (parent->name[0] != 'p')) /* p, pre, param */
                    xmlOutputBufferWriteString(buf, "\n");
            }

            break;

        case XML_ATTRIBUTE_NODE:
            htmlAttrDumpOutput(buf, doc, (xmlAttrPtr) cur);
            break;

        case HTML_TEXT_NODE:
            if (cur->content == NULL)
                break;
            /*
             * The content is entity-escaped unless the node name is
             * xmlStringTextNoenc or the parent is a script or style
             * element; in those cases it is written verbatim.
             */
            if (((cur->name == (const xmlChar *)xmlStringText) ||
                 (cur->name != (const xmlChar *)xmlStringTextNoenc)) &&
                ((parent == NULL) ||
                 ((xmlStrcasecmp(parent->name, BAD_CAST "script")) &&
                  (xmlStrcasecmp(parent->name, BAD_CAST "style"))))) {
                xmlChar *buffer;

                buffer = xmlEncodeEntitiesReentrant(doc, cur->content);
                if (buffer != NULL) {
                    xmlOutputBufferWriteString(buf, (const char *)buffer);
                    xmlFree(buffer);
                }
            } else {
                xmlOutputBufferWriteString(buf, (const char *)cur->content);
            }
            break;

        case HTML_COMMENT_NODE:
            if (cur->content != NULL) {
                xmlOutputBufferWriteString(buf, "<!--");
                xmlOutputBufferWriteString(buf, (const char *)cur->content);
                xmlOutputBufferWriteString(buf, "-->");
            }
            break;

        case HTML_PI_NODE:
            /* written as "<?name content>", closed by '>' alone */
            if (cur->name != NULL) {
                xmlOutputBufferWriteString(buf, "<?");
                xmlOutputBufferWriteString(buf, (const char *)cur->name);
                if (cur->content != NULL) {
                    xmlOutputBufferWriteString(buf, " ");
                    xmlOutputBufferWriteString(buf,
                            (const char *)cur->content);
                }
                xmlOutputBufferWriteString(buf, ">");
            }
            break;

        case HTML_ENTITY_REF_NODE:
            xmlOutputBufferWriteString(buf, "&");
            xmlOutputBufferWriteString(buf, (const char *)cur->name);
            xmlOutputBufferWriteString(buf, ";");
            break;

        case HTML_PRESERVE_NODE:
            /* content written verbatim, without any escaping */
            if (cur->content != NULL) {
                xmlOutputBufferWriteString(buf, (const char *)cur->content);
            }
            break;

        default:
            /* other node types produce no output */
            break;
        }

        /*
         * The current node is done: move to its next sibling, or climb
         * up, writing the end tag of each finished ancestor, until a
         * sibling is found or the starting node is reached.
         */
        while (1) {
            if (cur == root)
                return;
            if (cur->next != NULL) {
                cur = cur->next;
                break;
            }

            cur = parent;
            /* cur->parent was validated when descending. */
            parent = cur->parent;

            if ((cur->type == XML_HTML_DOCUMENT_NODE) ||
                (cur->type == XML_DOCUMENT_NODE)) {
                xmlOutputBufferWriteString(buf, "\n");
            } else {
                /* the element description only matters for formatting here */
                if ((format) && (cur->ns == NULL))
                    info = htmlTagLookup(cur->name);
                else
                    info = NULL;

                /* optional newline before the end tag */
                if ((format) && (info != NULL) && (!info->isinline) &&
                    (cur->last->type != HTML_TEXT_NODE) &&
                    (cur->last->type != HTML_ENTITY_REF_NODE) &&
                    (cur->children != cur->last) &&
                    (cur->name != NULL) &&
                    (cur->name[0] != 'p')) /* p, pre, param */
                    xmlOutputBufferWriteString(buf, "\n");

                xmlOutputBufferWriteString(buf, "</");
                if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) {
                    xmlOutputBufferWriteString(buf, (const char *)cur->ns->prefix);
                    xmlOutputBufferWriteString(buf, ":");
                }
                xmlOutputBufferWriteString(buf, (const char *)cur->name);
                xmlOutputBufferWriteString(buf, ">");

                /* optional newline after the end tag */
                if ((format) && (info != NULL) && (!info->isinline) &&
                    (cur->next != NULL)) {
                    if ((cur->next->type != HTML_TEXT_NODE) &&
                        (cur->next->type != HTML_ENTITY_REF_NODE) &&
                        (parent != NULL) &&
                        (parent->name != NULL) &&
                        (parent->name[0] != 'p')) /* p, pre, param */
                        xmlOutputBufferWriteString(buf, "\n");
                }
            }
        }
    }
}
960 
961 /**
962  * htmlNodeDumpOutput:
963  * @buf:  the HTML buffer output
964  * @doc:  the document
965  * @cur:  the current node
966  * @encoding:  the encoding string (unused)
967  *
968  * Dump an HTML node, recursive behaviour,children are printed too,
969  * and formatting returns/spaces are added.
970  */
971 void
htmlNodeDumpOutput(xmlOutputBufferPtr buf,xmlDocPtr doc,xmlNodePtr cur,const char * encoding ATTRIBUTE_UNUSED)972 htmlNodeDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc,
973 	           xmlNodePtr cur, const char *encoding ATTRIBUTE_UNUSED) {
974     htmlNodeDumpFormatOutput(buf, doc, cur, NULL, 1);
975 }
976 
977 /**
978  * htmlDocContentDumpFormatOutput:
979  * @buf:  the HTML buffer output
980  * @cur:  the document
981  * @encoding:  the encoding string (unused)
982  * @format:  should formatting spaces been added
983  *
984  * Dump an HTML document.
985  */
986 void
htmlDocContentDumpFormatOutput(xmlOutputBufferPtr buf,xmlDocPtr cur,const char * encoding ATTRIBUTE_UNUSED,int format)987 htmlDocContentDumpFormatOutput(xmlOutputBufferPtr buf, xmlDocPtr cur,
988 	                       const char *encoding ATTRIBUTE_UNUSED,
989                                int format) {
990     int type = 0;
991     if (cur) {
992         type = cur->type;
993         cur->type = XML_HTML_DOCUMENT_NODE;
994     }
995     htmlNodeDumpFormatOutput(buf, cur, (xmlNodePtr) cur, NULL, format);
996     if (cur)
997         cur->type = (xmlElementType) type;
998 }
999 
1000 /**
1001  * htmlDocContentDumpOutput:
1002  * @buf:  the HTML buffer output
1003  * @cur:  the document
1004  * @encoding:  the encoding string (unused)
1005  *
1006  * Dump an HTML document. Formatting return/spaces are added.
1007  */
1008 void
htmlDocContentDumpOutput(xmlOutputBufferPtr buf,xmlDocPtr cur,const char * encoding ATTRIBUTE_UNUSED)1009 htmlDocContentDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr cur,
1010 	                 const char *encoding ATTRIBUTE_UNUSED) {
1011     htmlNodeDumpFormatOutput(buf, cur, (xmlNodePtr) cur, NULL, 1);
1012 }
1013 
1014 /************************************************************************
1015  *									*
1016  *		Saving functions front-ends				*
1017  *									*
1018  ************************************************************************/
1019 
1020 /**
1021  * htmlDocDump:
1022  * @f:  the FILE*
1023  * @cur:  the document
1024  *
1025  * Dump an HTML document to an open FILE.
1026  *
1027  * returns: the number of byte written or -1 in case of failure.
1028  */
1029 int
htmlDocDump(FILE * f,xmlDocPtr cur)1030 htmlDocDump(FILE *f, xmlDocPtr cur) {
1031     xmlOutputBufferPtr buf;
1032     xmlCharEncodingHandlerPtr handler = NULL;
1033     const char *encoding;
1034     int ret;
1035 
1036     xmlInitParser();
1037 
1038     if ((cur == NULL) || (f == NULL)) {
1039 	return(-1);
1040     }
1041 
1042     encoding = (const char *) htmlGetMetaEncoding(cur);
1043 
1044     if (encoding != NULL) {
1045 	xmlCharEncoding enc;
1046 
1047 	enc = xmlParseCharEncoding(encoding);
1048 	if (enc != XML_CHAR_ENCODING_UTF8) {
1049 	    handler = xmlFindCharEncodingHandler(encoding);
1050 	    if (handler == NULL)
1051 		htmlSaveErr(XML_SAVE_UNKNOWN_ENCODING, NULL, encoding);
1052 	}
1053     } else {
1054         /*
1055          * Fallback to HTML or ASCII when the encoding is unspecified
1056          */
1057         if (handler == NULL)
1058             handler = xmlFindCharEncodingHandler("HTML");
1059         if (handler == NULL)
1060             handler = xmlFindCharEncodingHandler("ascii");
1061     }
1062 
1063     buf = xmlOutputBufferCreateFile(f, handler);
1064     if (buf == NULL) return(-1);
1065     htmlDocContentDumpOutput(buf, cur, NULL);
1066 
1067     ret = xmlOutputBufferClose(buf);
1068     return(ret);
1069 }
1070 
1071 /**
1072  * htmlSaveFile:
1073  * @filename:  the filename (or URL)
1074  * @cur:  the document
1075  *
1076  * Dump an HTML document to a file. If @filename is "-" the stdout file is
1077  * used.
1078  * returns: the number of byte written or -1 in case of failure.
1079  */
1080 int
htmlSaveFile(const char * filename,xmlDocPtr cur)1081 htmlSaveFile(const char *filename, xmlDocPtr cur) {
1082     xmlOutputBufferPtr buf;
1083     xmlCharEncodingHandlerPtr handler = NULL;
1084     const char *encoding;
1085     int ret;
1086 
1087     if ((cur == NULL) || (filename == NULL))
1088         return(-1);
1089 
1090     xmlInitParser();
1091 
1092     encoding = (const char *) htmlGetMetaEncoding(cur);
1093 
1094     if (encoding != NULL) {
1095 	xmlCharEncoding enc;
1096 
1097 	enc = xmlParseCharEncoding(encoding);
1098 	if (enc != XML_CHAR_ENCODING_UTF8) {
1099 	    handler = xmlFindCharEncodingHandler(encoding);
1100 	    if (handler == NULL)
1101 		htmlSaveErr(XML_SAVE_UNKNOWN_ENCODING, NULL, encoding);
1102 	}
1103     } else {
1104         /*
1105          * Fallback to HTML or ASCII when the encoding is unspecified
1106          */
1107         if (handler == NULL)
1108             handler = xmlFindCharEncodingHandler("HTML");
1109         if (handler == NULL)
1110             handler = xmlFindCharEncodingHandler("ascii");
1111     }
1112 
1113     /*
1114      * save the content to a temp buffer.
1115      */
1116     buf = xmlOutputBufferCreateFilename(filename, handler, cur->compression);
1117     if (buf == NULL) return(0);
1118 
1119     htmlDocContentDumpOutput(buf, cur, NULL);
1120 
1121     ret = xmlOutputBufferClose(buf);
1122     return(ret);
1123 }
1124 
1125 /**
1126  * htmlSaveFileFormat:
1127  * @filename:  the filename
1128  * @cur:  the document
1129  * @format:  should formatting spaces been added
1130  * @encoding: the document encoding
1131  *
1132  * Dump an HTML document to a file using a given encoding.
1133  *
1134  * returns: the number of byte written or -1 in case of failure.
1135  */
1136 int
htmlSaveFileFormat(const char * filename,xmlDocPtr cur,const char * encoding,int format)1137 htmlSaveFileFormat(const char *filename, xmlDocPtr cur,
1138 	           const char *encoding, int format) {
1139     xmlOutputBufferPtr buf;
1140     xmlCharEncodingHandlerPtr handler = NULL;
1141     int ret;
1142 
1143     if ((cur == NULL) || (filename == NULL))
1144         return(-1);
1145 
1146     xmlInitParser();
1147 
1148     if (encoding != NULL) {
1149 	xmlCharEncoding enc;
1150 
1151 	enc = xmlParseCharEncoding(encoding);
1152 	if (enc != XML_CHAR_ENCODING_UTF8) {
1153 	    handler = xmlFindCharEncodingHandler(encoding);
1154 	    if (handler == NULL)
1155 		htmlSaveErr(XML_SAVE_UNKNOWN_ENCODING, NULL, encoding);
1156 	}
1157         htmlSetMetaEncoding(cur, (const xmlChar *) encoding);
1158     } else {
1159 	htmlSetMetaEncoding(cur, (const xmlChar *) "UTF-8");
1160 
1161         /*
1162          * Fallback to HTML or ASCII when the encoding is unspecified
1163          */
1164         if (handler == NULL)
1165             handler = xmlFindCharEncodingHandler("HTML");
1166         if (handler == NULL)
1167             handler = xmlFindCharEncodingHandler("ascii");
1168     }
1169 
1170     /*
1171      * save the content to a temp buffer.
1172      */
1173     buf = xmlOutputBufferCreateFilename(filename, handler, 0);
1174     if (buf == NULL) return(0);
1175 
1176     htmlDocContentDumpFormatOutput(buf, cur, encoding, format);
1177 
1178     ret = xmlOutputBufferClose(buf);
1179     return(ret);
1180 }
1181 
1182 /**
1183  * htmlSaveFileEnc:
1184  * @filename:  the filename
1185  * @cur:  the document
1186  * @encoding: the document encoding
1187  *
1188  * Dump an HTML document to a file using a given encoding
1189  * and formatting returns/spaces are added.
1190  *
1191  * returns: the number of byte written or -1 in case of failure.
1192  */
1193 int
htmlSaveFileEnc(const char * filename,xmlDocPtr cur,const char * encoding)1194 htmlSaveFileEnc(const char *filename, xmlDocPtr cur, const char *encoding) {
1195     return(htmlSaveFileFormat(filename, cur, encoding, 1));
1196 }
1197 
1198 #endif /* LIBXML_OUTPUT_ENABLED */
1199 
1200 #endif /* LIBXML_HTML_ENABLED */
1201