1 /* libxml2 - Library for parsing XML documents
2  * Copyright (C) 2006-2019 Free Software Foundation, Inc.
3  *
4  * This file is not part of the GNU gettext program, but is used with
5  * GNU gettext.
6  *
7  * The original copyright notice is as follows:
8  */
9 
10 /*
11  * Copyright (C) 1998-2012 Daniel Veillard.  All Rights Reserved.
12  *
13  * Permission is hereby granted, free of charge, to any person obtaining a copy
14  * of this software and associated documentation files (the "Software"), to deal
15  * in the Software without restriction, including without limitation the rights
16  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
17  * copies of the Software, and to permit persons to whom the Software is fur-
18  * nished to do so, subject to the following conditions:
19  *
20  * The above copyright notice and this permission notice shall be included in
21  * all copies or substantial portions of the Software.
22  *
23  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
24  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FIT-
25  * NESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
26  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
27  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
28  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
29  * THE SOFTWARE.
30  *
31  * daniel@veillard.com
32  */
33 
34 /*
35  * HTMLtree.c : implementation of access function for an HTML tree.
36  */
37 
38 #define IN_LIBXML
39 #include "libxml.h"
40 #ifdef LIBXML_HTML_ENABLED
41 
42 #include <string.h> /* for memset() only ! */
43 
44 #ifdef HAVE_CTYPE_H
45 #include <ctype.h>
46 #endif
47 #ifdef HAVE_STDLIB_H
48 #include <stdlib.h>
49 #endif
50 
51 #include <libxml/xmlmemory.h>
52 #include <libxml/HTMLparser.h>
53 #include <libxml/HTMLtree.h>
54 #include <libxml/entities.h>
55 #include <libxml/valid.h>
56 #include <libxml/xmlerror.h>
57 #include <libxml/parserInternals.h>
58 #include <libxml/globals.h>
59 #include <libxml/uri.h>
60 
61 #include "buf.h"
62 
63 /************************************************************************
64  *									*
65  *		Getting/Setting encoding meta tags			*
66  *									*
67  ************************************************************************/
68 
69 /**
70  * htmlGetMetaEncoding:
71  * @doc:  the document
72  *
73  * Encoding definition lookup in the Meta tags
74  *
75  * Returns the current encoding as flagged in the HTML source
76  */
77 const xmlChar *
htmlGetMetaEncoding(htmlDocPtr doc)78 htmlGetMetaEncoding(htmlDocPtr doc) {
79     htmlNodePtr cur;
80     const xmlChar *content;
81     const xmlChar *encoding;
82 
83     if (doc == NULL)
84 	return(NULL);
85     cur = doc->children;
86 
87     /*
88      * Search the html
89      */
90     while (cur != NULL) {
91 	if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
92 	    if (xmlStrEqual(cur->name, BAD_CAST"html"))
93 		break;
94 	    if (xmlStrEqual(cur->name, BAD_CAST"head"))
95 		goto found_head;
96 	    if (xmlStrEqual(cur->name, BAD_CAST"meta"))
97 		goto found_meta;
98 	}
99 	cur = cur->next;
100     }
101     if (cur == NULL)
102 	return(NULL);
103     cur = cur->children;
104 
105     /*
106      * Search the head
107      */
108     while (cur != NULL) {
109 	if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
110 	    if (xmlStrEqual(cur->name, BAD_CAST"head"))
111 		break;
112 	    if (xmlStrEqual(cur->name, BAD_CAST"meta"))
113 		goto found_meta;
114 	}
115 	cur = cur->next;
116     }
117     if (cur == NULL)
118 	return(NULL);
119 found_head:
120     cur = cur->children;
121 
122     /*
123      * Search the meta elements
124      */
125 found_meta:
126     while (cur != NULL) {
127 	if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
128 	    if (xmlStrEqual(cur->name, BAD_CAST"meta")) {
129 		xmlAttrPtr attr = cur->properties;
130 		int http;
131 		const xmlChar *value;
132 
133 		content = NULL;
134 		http = 0;
135 		while (attr != NULL) {
136 		    if ((attr->children != NULL) &&
137 		        (attr->children->type == XML_TEXT_NODE) &&
138 		        (attr->children->next == NULL)) {
139 			value = attr->children->content;
140 			if ((!xmlStrcasecmp(attr->name, BAD_CAST"http-equiv"))
141 			 && (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
142 			    http = 1;
143 			else if ((value != NULL)
144 			 && (!xmlStrcasecmp(attr->name, BAD_CAST"content")))
145 			    content = value;
146 			if ((http != 0) && (content != NULL))
147 			    goto found_content;
148 		    }
149 		    attr = attr->next;
150 		}
151 	    }
152 	}
153 	cur = cur->next;
154     }
155     return(NULL);
156 
157 found_content:
158     encoding = xmlStrstr(content, BAD_CAST"charset=");
159     if (encoding == NULL)
160 	encoding = xmlStrstr(content, BAD_CAST"Charset=");
161     if (encoding == NULL)
162 	encoding = xmlStrstr(content, BAD_CAST"CHARSET=");
163     if (encoding != NULL) {
164 	encoding += 8;
165     } else {
166 	encoding = xmlStrstr(content, BAD_CAST"charset =");
167 	if (encoding == NULL)
168 	    encoding = xmlStrstr(content, BAD_CAST"Charset =");
169 	if (encoding == NULL)
170 	    encoding = xmlStrstr(content, BAD_CAST"CHARSET =");
171 	if (encoding != NULL)
172 	    encoding += 9;
173     }
174     if (encoding != NULL) {
175 	while ((*encoding == ' ') || (*encoding == '\t')) encoding++;
176     }
177     return(encoding);
178 }
179 
180 /**
181  * htmlSetMetaEncoding:
182  * @doc:  the document
183  * @encoding:  the encoding string
184  *
185  * Sets the current encoding in the Meta tags
186  * NOTE: this will not change the document content encoding, just
187  * the META flag associated.
188  *
189  * Returns 0 in case of success and -1 in case of error
190  */
191 int
htmlSetMetaEncoding(htmlDocPtr doc,const xmlChar * encoding)192 htmlSetMetaEncoding(htmlDocPtr doc, const xmlChar *encoding) {
193     htmlNodePtr cur, meta = NULL, head = NULL;
194     const xmlChar *content = NULL;
195     char newcontent[100];
196 
197     newcontent[0] = 0;
198 
199     if (doc == NULL)
200 	return(-1);
201 
202     /* html isn't a real encoding it's just libxml2 way to get entities */
203     if (!xmlStrcasecmp(encoding, BAD_CAST "html"))
204         return(-1);
205 
206     if (encoding != NULL) {
207 	snprintf(newcontent, sizeof(newcontent), "text/html; charset=%s",
208                 (char *)encoding);
209 	newcontent[sizeof(newcontent) - 1] = 0;
210     }
211 
212     cur = doc->children;
213 
214     /*
215      * Search the html
216      */
217     while (cur != NULL) {
218 	if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
219 	    if (xmlStrcasecmp(cur->name, BAD_CAST"html") == 0)
220 		break;
221 	    if (xmlStrcasecmp(cur->name, BAD_CAST"head") == 0)
222 		goto found_head;
223 	    if (xmlStrcasecmp(cur->name, BAD_CAST"meta") == 0)
224 		goto found_meta;
225 	}
226 	cur = cur->next;
227     }
228     if (cur == NULL)
229 	return(-1);
230     cur = cur->children;
231 
232     /*
233      * Search the head
234      */
235     while (cur != NULL) {
236 	if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
237 	    if (xmlStrcasecmp(cur->name, BAD_CAST"head") == 0)
238 		break;
239 	    if (xmlStrcasecmp(cur->name, BAD_CAST"meta") == 0) {
240                 head = cur->parent;
241 		goto found_meta;
242             }
243 	}
244 	cur = cur->next;
245     }
246     if (cur == NULL)
247 	return(-1);
248 found_head:
249     head = cur;
250     if (cur->children == NULL)
251         goto create;
252     cur = cur->children;
253 
254 found_meta:
255     /*
256      * Search and update all the remaining the meta elements carrying
257      * encoding informations
258      */
259     while (cur != NULL) {
260 	if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
261 	    if (xmlStrcasecmp(cur->name, BAD_CAST"meta") == 0) {
262 		xmlAttrPtr attr = cur->properties;
263 		int http;
264 		const xmlChar *value;
265 
266 		content = NULL;
267 		http = 0;
268 		while (attr != NULL) {
269 		    if ((attr->children != NULL) &&
270 		        (attr->children->type == XML_TEXT_NODE) &&
271 		        (attr->children->next == NULL)) {
272 			value = attr->children->content;
273 			if ((!xmlStrcasecmp(attr->name, BAD_CAST"http-equiv"))
274 			 && (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
275 			    http = 1;
276 			else
277                         {
278                            if ((value != NULL) &&
279                                (!xmlStrcasecmp(attr->name, BAD_CAST"content")))
280 			       content = value;
281                         }
282 		        if ((http != 0) && (content != NULL))
283 			    break;
284 		    }
285 		    attr = attr->next;
286 		}
287 		if ((http != 0) && (content != NULL)) {
288 		    meta = cur;
289 		    break;
290 		}
291 
292 	    }
293 	}
294 	cur = cur->next;
295     }
296 create:
297     if (meta == NULL) {
298         if ((encoding != NULL) && (head != NULL)) {
299             /*
300              * Create a new Meta element with the right attributes
301              */
302 
303             meta = xmlNewDocNode(doc, NULL, BAD_CAST"meta", NULL);
304             if (head->children == NULL)
305                 xmlAddChild(head, meta);
306             else
307                 xmlAddPrevSibling(head->children, meta);
308             xmlNewProp(meta, BAD_CAST"http-equiv", BAD_CAST"Content-Type");
309             xmlNewProp(meta, BAD_CAST"content", BAD_CAST newcontent);
310         }
311     } else {
312         /* remove the meta tag if NULL is passed */
313         if (encoding == NULL) {
314             xmlUnlinkNode(meta);
315             xmlFreeNode(meta);
316         }
317         /* change the document only if there is a real encoding change */
318         else if (xmlStrcasestr(content, encoding) == NULL) {
319             xmlSetProp(meta, BAD_CAST"content", BAD_CAST newcontent);
320         }
321     }
322 
323 
324     return(0);
325 }
326 
/**
 * htmlBooleanAttrs:
 *
 * These are the HTML attributes which will be output
 * in minimized form, i.e. <option selected="selected"> will be
 * output as <option selected>, as per XSLT 1.0 16.2 "HTML Output Method"
 *
 * The list is NULL terminated; both the strings and the table itself
 * are read-only.
 */
static const char * const htmlBooleanAttrs[] = {
  "checked", "compact", "declare", "defer", "disabled", "ismap",
  "multiple", "nohref", "noresize", "noshade", "nowrap", "readonly",
  "selected", NULL
};
340 
341 
342 /**
343  * htmlIsBooleanAttr:
344  * @name:  the name of the attribute to check
345  *
346  * Determine if a given attribute is a boolean attribute.
347  *
348  * returns: false if the attribute is not boolean, true otherwise.
349  */
350 int
htmlIsBooleanAttr(const xmlChar * name)351 htmlIsBooleanAttr(const xmlChar *name)
352 {
353     int i = 0;
354 
355     while (htmlBooleanAttrs[i] != NULL) {
356         if (xmlStrcasecmp((const xmlChar *)htmlBooleanAttrs[i], name) == 0)
357             return 1;
358         i++;
359     }
360     return 0;
361 }
362 
363 #ifdef LIBXML_OUTPUT_ENABLED
364 /*
365  * private routine exported from xmlIO.c
366  */
367 xmlOutputBufferPtr
368 xmlAllocOutputBufferInternal(xmlCharEncodingHandlerPtr encoder);
369 /************************************************************************
370  *									*
371  *			Output error handlers				*
372  *									*
373  ************************************************************************/
374 /**
375  * htmlSaveErrMemory:
376  * @extra:  extra informations
377  *
378  * Handle an out of memory condition
379  */
380 static void
htmlSaveErrMemory(const char * extra)381 htmlSaveErrMemory(const char *extra)
382 {
383     __xmlSimpleError(XML_FROM_OUTPUT, XML_ERR_NO_MEMORY, NULL, NULL, extra);
384 }
385 
386 /**
387  * htmlSaveErr:
388  * @code:  the error number
389  * @node:  the location of the error.
390  * @extra:  extra informations
391  *
392  * Handle an out of memory condition
393  */
394 static void
htmlSaveErr(int code,xmlNodePtr node,const char * extra)395 htmlSaveErr(int code, xmlNodePtr node, const char *extra)
396 {
397     const char *msg = NULL;
398 
399     switch(code) {
400         case XML_SAVE_NOT_UTF8:
401 	    msg = "string is not in UTF-8\n";
402 	    break;
403 	case XML_SAVE_CHAR_INVALID:
404 	    msg = "invalid character value\n";
405 	    break;
406 	case XML_SAVE_UNKNOWN_ENCODING:
407 	    msg = "unknown encoding %s\n";
408 	    break;
409 	case XML_SAVE_NO_DOCTYPE:
410 	    msg = "HTML has no DOCTYPE\n";
411 	    break;
412 	default:
413 	    msg = "unexpected error number\n";
414     }
415     __xmlSimpleError(XML_FROM_OUTPUT, code, node, msg, extra);
416 }
417 
418 /************************************************************************
419  *									*
420  *		Dumping HTML tree content to a simple buffer		*
421  *									*
422  ************************************************************************/
423 
424 /**
425  * htmlBufNodeDumpFormat:
426  * @buf:  the xmlBufPtr output
427  * @doc:  the document
428  * @cur:  the current node
429  * @format:  should formatting spaces been added
430  *
431  * Dump an HTML node, recursive behaviour,children are printed too.
432  *
433  * Returns the number of byte written or -1 in case of error
434  */
435 static size_t
htmlBufNodeDumpFormat(xmlBufPtr buf,xmlDocPtr doc,xmlNodePtr cur,int format)436 htmlBufNodeDumpFormat(xmlBufPtr buf, xmlDocPtr doc, xmlNodePtr cur,
437 	           int format) {
438     size_t use;
439     int ret;
440     xmlOutputBufferPtr outbuf;
441 
442     if (cur == NULL) {
443 	return (-1);
444     }
445     if (buf == NULL) {
446 	return (-1);
447     }
448     outbuf = (xmlOutputBufferPtr) xmlMalloc(sizeof(xmlOutputBuffer));
449     if (outbuf == NULL) {
450         htmlSaveErrMemory("allocating HTML output buffer");
451 	return (-1);
452     }
453     memset(outbuf, 0, (size_t) sizeof(xmlOutputBuffer));
454     outbuf->buffer = buf;
455     outbuf->encoder = NULL;
456     outbuf->writecallback = NULL;
457     outbuf->closecallback = NULL;
458     outbuf->context = NULL;
459     outbuf->written = 0;
460 
461     use = xmlBufUse(buf);
462     htmlNodeDumpFormatOutput(outbuf, doc, cur, NULL, format);
463     xmlFree(outbuf);
464     ret = xmlBufUse(buf) - use;
465     return (ret);
466 }
467 
468 /**
469  * htmlNodeDump:
470  * @buf:  the HTML buffer output
471  * @doc:  the document
472  * @cur:  the current node
473  *
474  * Dump an HTML node, recursive behaviour,children are printed too,
475  * and formatting returns are added.
476  *
477  * Returns the number of byte written or -1 in case of error
478  */
479 int
htmlNodeDump(xmlBufferPtr buf,xmlDocPtr doc,xmlNodePtr cur)480 htmlNodeDump(xmlBufferPtr buf, xmlDocPtr doc, xmlNodePtr cur) {
481     xmlBufPtr buffer;
482     size_t ret;
483 
484     if ((buf == NULL) || (cur == NULL))
485         return(-1);
486 
487     xmlInitParser();
488     buffer = xmlBufFromBuffer(buf);
489     if (buffer == NULL)
490         return(-1);
491 
492     ret = htmlBufNodeDumpFormat(buffer, doc, cur, 1);
493 
494     xmlBufBackToBuffer(buffer);
495 
496     if (ret > INT_MAX)
497         return(-1);
498     return((int) ret);
499 }
500 
501 /**
502  * htmlNodeDumpFileFormat:
503  * @out:  the FILE pointer
504  * @doc:  the document
505  * @cur:  the current node
506  * @encoding: the document encoding
507  * @format:  should formatting spaces been added
508  *
509  * Dump an HTML node, recursive behaviour,children are printed too.
510  *
511  * TODO: if encoding == NULL try to save in the doc encoding
512  *
513  * returns: the number of byte written or -1 in case of failure.
514  */
515 int
htmlNodeDumpFileFormat(FILE * out,xmlDocPtr doc,xmlNodePtr cur,const char * encoding,int format)516 htmlNodeDumpFileFormat(FILE *out, xmlDocPtr doc,
517 	               xmlNodePtr cur, const char *encoding, int format) {
518     xmlOutputBufferPtr buf;
519     xmlCharEncodingHandlerPtr handler = NULL;
520     int ret;
521 
522     xmlInitParser();
523 
524     if (encoding != NULL) {
525 	xmlCharEncoding enc;
526 
527 	enc = xmlParseCharEncoding(encoding);
528 	if (enc != XML_CHAR_ENCODING_UTF8) {
529 	    handler = xmlFindCharEncodingHandler(encoding);
530 	    if (handler == NULL)
531 		htmlSaveErr(XML_SAVE_UNKNOWN_ENCODING, NULL, encoding);
532 	}
533     } else {
534         /*
535          * Fallback to HTML or ASCII when the encoding is unspecified
536          */
537         if (handler == NULL)
538             handler = xmlFindCharEncodingHandler("HTML");
539         if (handler == NULL)
540             handler = xmlFindCharEncodingHandler("ascii");
541     }
542 
543     /*
544      * save the content to a temp buffer.
545      */
546     buf = xmlOutputBufferCreateFile(out, handler);
547     if (buf == NULL) return(0);
548 
549     htmlNodeDumpFormatOutput(buf, doc, cur, encoding, format);
550 
551     ret = xmlOutputBufferClose(buf);
552     return(ret);
553 }
554 
555 /**
556  * htmlNodeDumpFile:
557  * @out:  the FILE pointer
558  * @doc:  the document
559  * @cur:  the current node
560  *
561  * Dump an HTML node, recursive behaviour,children are printed too,
562  * and formatting returns are added.
563  */
564 void
htmlNodeDumpFile(FILE * out,xmlDocPtr doc,xmlNodePtr cur)565 htmlNodeDumpFile(FILE *out, xmlDocPtr doc, xmlNodePtr cur) {
566     htmlNodeDumpFileFormat(out, doc, cur, NULL, 1);
567 }
568 
569 /**
570  * htmlDocDumpMemoryFormat:
571  * @cur:  the document
572  * @mem:  OUT: the memory pointer
573  * @size:  OUT: the memory length
574  * @format:  should formatting spaces been added
575  *
576  * Dump an HTML document in memory and return the xmlChar * and it's size.
577  * It's up to the caller to free the memory.
578  */
579 void
htmlDocDumpMemoryFormat(xmlDocPtr cur,xmlChar ** mem,int * size,int format)580 htmlDocDumpMemoryFormat(xmlDocPtr cur, xmlChar**mem, int *size, int format) {
581     xmlOutputBufferPtr buf;
582     xmlCharEncodingHandlerPtr handler = NULL;
583     const char *encoding;
584 
585     xmlInitParser();
586 
587     if ((mem == NULL) || (size == NULL))
588         return;
589     if (cur == NULL) {
590 	*mem = NULL;
591 	*size = 0;
592 	return;
593     }
594 
595     encoding = (const char *) htmlGetMetaEncoding(cur);
596 
597     if (encoding != NULL) {
598 	xmlCharEncoding enc;
599 
600 	enc = xmlParseCharEncoding(encoding);
601 	if (enc != XML_CHAR_ENCODING_UTF8) {
602 	    handler = xmlFindCharEncodingHandler(encoding);
603 	    if (handler == NULL)
604                 htmlSaveErr(XML_SAVE_UNKNOWN_ENCODING, NULL, encoding);
605 
606 	}
607     } else {
608         /*
609          * Fallback to HTML or ASCII when the encoding is unspecified
610          */
611         if (handler == NULL)
612             handler = xmlFindCharEncodingHandler("HTML");
613         if (handler == NULL)
614             handler = xmlFindCharEncodingHandler("ascii");
615     }
616 
617     buf = xmlAllocOutputBufferInternal(handler);
618     if (buf == NULL) {
619 	*mem = NULL;
620 	*size = 0;
621 	return;
622     }
623 
624     htmlDocContentDumpFormatOutput(buf, cur, NULL, format);
625 
626     xmlOutputBufferFlush(buf);
627     if (buf->conv != NULL) {
628 	*size = xmlBufUse(buf->conv);
629 	*mem = xmlStrndup(xmlBufContent(buf->conv), *size);
630     } else {
631 	*size = xmlBufUse(buf->buffer);
632 	*mem = xmlStrndup(xmlBufContent(buf->buffer), *size);
633     }
634     (void)xmlOutputBufferClose(buf);
635 }
636 
637 /**
638  * htmlDocDumpMemory:
639  * @cur:  the document
640  * @mem:  OUT: the memory pointer
641  * @size:  OUT: the memory length
642  *
643  * Dump an HTML document in memory and return the xmlChar * and it's size.
644  * It's up to the caller to free the memory.
645  */
646 void
htmlDocDumpMemory(xmlDocPtr cur,xmlChar ** mem,int * size)647 htmlDocDumpMemory(xmlDocPtr cur, xmlChar**mem, int *size) {
648 	htmlDocDumpMemoryFormat(cur, mem, size, 1);
649 }
650 
651 
652 /************************************************************************
653  *									*
654  *		Dumping HTML tree content to an I/O output buffer	*
655  *									*
656  ************************************************************************/
657 
658 void xmlNsListDumpOutput(xmlOutputBufferPtr buf, xmlNsPtr cur);
659 
660 /**
661  * htmlDtdDumpOutput:
662  * @buf:  the HTML buffer output
663  * @doc:  the document
664  * @encoding:  the encoding string
665  *
666  * TODO: check whether encoding is needed
667  *
668  * Dump the HTML document DTD, if any.
669  */
670 static void
htmlDtdDumpOutput(xmlOutputBufferPtr buf,xmlDocPtr doc,const char * encoding ATTRIBUTE_UNUSED)671 htmlDtdDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc,
672 	          const char *encoding ATTRIBUTE_UNUSED) {
673     xmlDtdPtr cur = doc->intSubset;
674 
675     if (cur == NULL) {
676 	htmlSaveErr(XML_SAVE_NO_DOCTYPE, (xmlNodePtr) doc, NULL);
677 	return;
678     }
679     xmlOutputBufferWriteString(buf, "<!DOCTYPE ");
680     xmlOutputBufferWriteString(buf, (const char *)cur->name);
681     if (cur->ExternalID != NULL) {
682 	xmlOutputBufferWriteString(buf, " PUBLIC ");
683 	xmlBufWriteQuotedString(buf->buffer, cur->ExternalID);
684 	if (cur->SystemID != NULL) {
685 	    xmlOutputBufferWriteString(buf, " ");
686 	    xmlBufWriteQuotedString(buf->buffer, cur->SystemID);
687 	}
688     } else if (cur->SystemID != NULL &&
689 	       xmlStrcmp(cur->SystemID, BAD_CAST "about:legacy-compat")) {
690 	xmlOutputBufferWriteString(buf, " SYSTEM ");
691 	xmlBufWriteQuotedString(buf->buffer, cur->SystemID);
692     }
693     xmlOutputBufferWriteString(buf, ">\n");
694 }
695 
696 /**
697  * htmlAttrDumpOutput:
698  * @buf:  the HTML buffer output
699  * @doc:  the document
700  * @cur:  the attribute pointer
701  * @encoding:  the encoding string
702  *
703  * Dump an HTML attribute
704  */
705 static void
htmlAttrDumpOutput(xmlOutputBufferPtr buf,xmlDocPtr doc,xmlAttrPtr cur,const char * encoding ATTRIBUTE_UNUSED)706 htmlAttrDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, xmlAttrPtr cur,
707 	           const char *encoding ATTRIBUTE_UNUSED) {
708     xmlChar *value;
709 
710     /*
711      * The html output method should not escape a & character
712      * occurring in an attribute value immediately followed by
713      * a { character (see Section B.7.1 of the HTML 4.0 Recommendation).
714      * This is implemented in xmlEncodeEntitiesReentrant
715      */
716 
717     if (cur == NULL) {
718 	return;
719     }
720     xmlOutputBufferWriteString(buf, " ");
721     if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) {
722         xmlOutputBufferWriteString(buf, (const char *)cur->ns->prefix);
723 	xmlOutputBufferWriteString(buf, ":");
724     }
725     xmlOutputBufferWriteString(buf, (const char *)cur->name);
726     if ((cur->children != NULL) && (!htmlIsBooleanAttr(cur->name))) {
727 	value = xmlNodeListGetString(doc, cur->children, 0);
728 	if (value) {
729 	    xmlOutputBufferWriteString(buf, "=");
730 	    if ((cur->ns == NULL) && (cur->parent != NULL) &&
731 		(cur->parent->ns == NULL) &&
732 		((!xmlStrcasecmp(cur->name, BAD_CAST "href")) ||
733 	         (!xmlStrcasecmp(cur->name, BAD_CAST "action")) ||
734 		 (!xmlStrcasecmp(cur->name, BAD_CAST "src")) ||
735 		 ((!xmlStrcasecmp(cur->name, BAD_CAST "name")) &&
736 		  (!xmlStrcasecmp(cur->parent->name, BAD_CAST "a"))))) {
737 		xmlChar *tmp = value;
738 		/* xmlURIEscapeStr() escapes '"' so it can be safely used. */
739 		xmlBufCCat(buf->buffer, "\"");
740 
741 		while (IS_BLANK_CH(*tmp)) tmp++;
742 
743 		/* URI Escape everything, except server side includes. */
744 		for ( ; ; ) {
745 		    xmlChar *escaped;
746 		    xmlChar endChar;
747 		    xmlChar *end = NULL;
748 		    xmlChar *start = (xmlChar *)xmlStrstr(tmp, BAD_CAST "<!--");
749 		    if (start != NULL) {
750 			end = (xmlChar *)xmlStrstr(tmp, BAD_CAST "-->");
751 			if (end != NULL) {
752 			    *start = '\0';
753 			}
754 		    }
755 
756 		    /* Escape the whole string, or until start (set to '\0'). */
757 		    escaped = xmlURIEscapeStr(tmp, BAD_CAST"@/:=?;#%&,+");
758 		    if (escaped != NULL) {
759 		        xmlBufCat(buf->buffer, escaped);
760 		        xmlFree(escaped);
761 		    } else {
762 		        xmlBufCat(buf->buffer, tmp);
763 		    }
764 
765 		    if (end == NULL) { /* Everything has been written. */
766 			break;
767 		    }
768 
769 		    /* Do not escape anything within server side includes. */
770 		    *start = '<'; /* Restore the first character of "<!--". */
771 		    end += 3; /* strlen("-->") */
772 		    endChar = *end;
773 		    *end = '\0';
774 		    xmlBufCat(buf->buffer, start);
775 		    *end = endChar;
776 		    tmp = end;
777 		}
778 
779 		xmlBufCCat(buf->buffer, "\"");
780 	    } else {
781 		xmlBufWriteQuotedString(buf->buffer, value);
782 	    }
783 	    xmlFree(value);
784 	} else  {
785 	    xmlOutputBufferWriteString(buf, "=\"\"");
786 	}
787     }
788 }
789 
790 /**
791  * htmlAttrListDumpOutput:
792  * @buf:  the HTML buffer output
793  * @doc:  the document
794  * @cur:  the first attribute pointer
795  * @encoding:  the encoding string
796  *
797  * Dump a list of HTML attributes
798  */
799 static void
htmlAttrListDumpOutput(xmlOutputBufferPtr buf,xmlDocPtr doc,xmlAttrPtr cur,const char * encoding)800 htmlAttrListDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, xmlAttrPtr cur, const char *encoding) {
801     if (cur == NULL) {
802 	return;
803     }
804     while (cur != NULL) {
805         htmlAttrDumpOutput(buf, doc, cur, encoding);
806 	cur = cur->next;
807     }
808 }
809 
810 
811 
812 /**
813  * htmlNodeListDumpOutput:
814  * @buf:  the HTML buffer output
815  * @doc:  the document
816  * @cur:  the first node
817  * @encoding:  the encoding string
818  * @format:  should formatting spaces been added
819  *
820  * Dump an HTML node list, recursive behaviour,children are printed too.
821  */
822 static void
htmlNodeListDumpOutput(xmlOutputBufferPtr buf,xmlDocPtr doc,xmlNodePtr cur,const char * encoding,int format)823 htmlNodeListDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc,
824 	               xmlNodePtr cur, const char *encoding, int format) {
825     if (cur == NULL) {
826 	return;
827     }
828     while (cur != NULL) {
829         htmlNodeDumpFormatOutput(buf, doc, cur, encoding, format);
830 	cur = cur->next;
831     }
832 }
833 
834 /**
835  * htmlNodeDumpFormatOutput:
836  * @buf:  the HTML buffer output
837  * @doc:  the document
838  * @cur:  the current node
839  * @encoding:  the encoding string
840  * @format:  should formatting spaces been added
841  *
842  * Dump an HTML node, recursive behaviour,children are printed too.
843  */
844 void
htmlNodeDumpFormatOutput(xmlOutputBufferPtr buf,xmlDocPtr doc,xmlNodePtr cur,const char * encoding,int format)845 htmlNodeDumpFormatOutput(xmlOutputBufferPtr buf, xmlDocPtr doc,
846 	                 xmlNodePtr cur, const char *encoding, int format) {
847     const htmlElemDesc * info;
848 
849     xmlInitParser();
850 
851     if ((cur == NULL) || (buf == NULL)) {
852 	return;
853     }
854     /*
855      * Special cases.
856      */
857     if (cur->type == XML_DTD_NODE)
858 	return;
859     if ((cur->type == XML_HTML_DOCUMENT_NODE) ||
860         (cur->type == XML_DOCUMENT_NODE)){
861 	htmlDocContentDumpOutput(buf, (xmlDocPtr) cur, encoding);
862 	return;
863     }
864     if (cur->type == XML_ATTRIBUTE_NODE) {
865         htmlAttrDumpOutput(buf, doc, (xmlAttrPtr) cur, encoding);
866 	return;
867     }
868     if (cur->type == HTML_TEXT_NODE) {
869 	if (cur->content != NULL) {
870 	    if (((cur->name == (const xmlChar *)xmlStringText) ||
871 		 (cur->name != (const xmlChar *)xmlStringTextNoenc)) &&
872 		((cur->parent == NULL) ||
873 		 ((xmlStrcasecmp(cur->parent->name, BAD_CAST "script")) &&
874 		  (xmlStrcasecmp(cur->parent->name, BAD_CAST "style"))))) {
875 		xmlChar *buffer;
876 
877 		buffer = xmlEncodeEntitiesReentrant(doc, cur->content);
878 		if (buffer != NULL) {
879 		    xmlOutputBufferWriteString(buf, (const char *)buffer);
880 		    xmlFree(buffer);
881 		}
882 	    } else {
883 		xmlOutputBufferWriteString(buf, (const char *)cur->content);
884 	    }
885 	}
886 	return;
887     }
888     if (cur->type == HTML_COMMENT_NODE) {
889 	if (cur->content != NULL) {
890 	    xmlOutputBufferWriteString(buf, "<!--");
891 	    xmlOutputBufferWriteString(buf, (const char *)cur->content);
892 	    xmlOutputBufferWriteString(buf, "-->");
893 	}
894 	return;
895     }
896     if (cur->type == HTML_PI_NODE) {
897 	if (cur->name == NULL)
898 	    return;
899 	xmlOutputBufferWriteString(buf, "<?");
900 	xmlOutputBufferWriteString(buf, (const char *)cur->name);
901 	if (cur->content != NULL) {
902 	    xmlOutputBufferWriteString(buf, " ");
903 	    xmlOutputBufferWriteString(buf, (const char *)cur->content);
904 	}
905 	xmlOutputBufferWriteString(buf, ">");
906 	return;
907     }
908     if (cur->type == HTML_ENTITY_REF_NODE) {
909         xmlOutputBufferWriteString(buf, "&");
910 	xmlOutputBufferWriteString(buf, (const char *)cur->name);
911         xmlOutputBufferWriteString(buf, ";");
912 	return;
913     }
914     if (cur->type == HTML_PRESERVE_NODE) {
915 	if (cur->content != NULL) {
916 	    xmlOutputBufferWriteString(buf, (const char *)cur->content);
917 	}
918 	return;
919     }
920 
921     /*
922      * Get specific HTML info for that node.
923      */
924     if (cur->ns == NULL)
925 	info = htmlTagLookup(cur->name);
926     else
927 	info = NULL;
928 
929     xmlOutputBufferWriteString(buf, "<");
930     if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) {
931         xmlOutputBufferWriteString(buf, (const char *)cur->ns->prefix);
932 	xmlOutputBufferWriteString(buf, ":");
933     }
934     xmlOutputBufferWriteString(buf, (const char *)cur->name);
935     if (cur->nsDef)
936 	xmlNsListDumpOutput(buf, cur->nsDef);
937     if (cur->properties != NULL)
938         htmlAttrListDumpOutput(buf, doc, cur->properties, encoding);
939 
940     if ((info != NULL) && (info->empty)) {
941         xmlOutputBufferWriteString(buf, ">");
942 	if ((format) && (!info->isinline) && (cur->next != NULL)) {
943 	    if ((cur->next->type != HTML_TEXT_NODE) &&
944 		(cur->next->type != HTML_ENTITY_REF_NODE) &&
945 		(cur->parent != NULL) &&
946 		(cur->parent->name != NULL) &&
947 		(cur->parent->name[0] != 'p')) /* p, pre, param */
948 		xmlOutputBufferWriteString(buf, "\n");
949 	}
950 	return;
951     }
952     if (((cur->type == XML_ELEMENT_NODE) || (cur->content == NULL)) &&
953 	(cur->children == NULL)) {
954         if ((info != NULL) && (info->saveEndTag != 0) &&
955 	    (xmlStrcmp(BAD_CAST info->name, BAD_CAST "html")) &&
956 	    (xmlStrcmp(BAD_CAST info->name, BAD_CAST "body"))) {
957 	    xmlOutputBufferWriteString(buf, ">");
958 	} else {
959 	    xmlOutputBufferWriteString(buf, "></");
960             if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) {
961                 xmlOutputBufferWriteString(buf, (const char *)cur->ns->prefix);
962                 xmlOutputBufferWriteString(buf, ":");
963             }
964 	    xmlOutputBufferWriteString(buf, (const char *)cur->name);
965 	    xmlOutputBufferWriteString(buf, ">");
966 	}
967 	if ((format) && (cur->next != NULL) &&
968             (info != NULL) && (!info->isinline)) {
969 	    if ((cur->next->type != HTML_TEXT_NODE) &&
970 		(cur->next->type != HTML_ENTITY_REF_NODE) &&
971 		(cur->parent != NULL) &&
972 		(cur->parent->name != NULL) &&
973 		(cur->parent->name[0] != 'p')) /* p, pre, param */
974 		xmlOutputBufferWriteString(buf, "\n");
975 	}
976 	return;
977     }
978     xmlOutputBufferWriteString(buf, ">");
979     if ((cur->type != XML_ELEMENT_NODE) &&
980 	(cur->content != NULL)) {
981 	    /*
982 	     * Uses the OutputBuffer property to automatically convert
983 	     * invalids to charrefs
984 	     */
985 
986             xmlOutputBufferWriteString(buf, (const char *) cur->content);
987     }
988     if (cur->children != NULL) {
989         if ((format) && (info != NULL) && (!info->isinline) &&
990 	    (cur->children->type != HTML_TEXT_NODE) &&
991 	    (cur->children->type != HTML_ENTITY_REF_NODE) &&
992 	    (cur->children != cur->last) &&
993 	    (cur->name != NULL) &&
994 	    (cur->name[0] != 'p')) /* p, pre, param */
995 	    xmlOutputBufferWriteString(buf, "\n");
996 	htmlNodeListDumpOutput(buf, doc, cur->children, encoding, format);
997         if ((format) && (info != NULL) && (!info->isinline) &&
998 	    (cur->last->type != HTML_TEXT_NODE) &&
999 	    (cur->last->type != HTML_ENTITY_REF_NODE) &&
1000 	    (cur->children != cur->last) &&
1001 	    (cur->name != NULL) &&
1002 	    (cur->name[0] != 'p')) /* p, pre, param */
1003 	    xmlOutputBufferWriteString(buf, "\n");
1004     }
1005     xmlOutputBufferWriteString(buf, "</");
1006     if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) {
1007         xmlOutputBufferWriteString(buf, (const char *)cur->ns->prefix);
1008 	xmlOutputBufferWriteString(buf, ":");
1009     }
1010     xmlOutputBufferWriteString(buf, (const char *)cur->name);
1011     xmlOutputBufferWriteString(buf, ">");
1012     if ((format) && (info != NULL) && (!info->isinline) &&
1013 	(cur->next != NULL)) {
1014         if ((cur->next->type != HTML_TEXT_NODE) &&
1015 	    (cur->next->type != HTML_ENTITY_REF_NODE) &&
1016 	    (cur->parent != NULL) &&
1017 	    (cur->parent->name != NULL) &&
1018 	    (cur->parent->name[0] != 'p')) /* p, pre, param */
1019 	    xmlOutputBufferWriteString(buf, "\n");
1020     }
1021 }
1022 
1023 /**
1024  * htmlNodeDumpOutput:
1025  * @buf:  the HTML buffer output
1026  * @doc:  the document
1027  * @cur:  the current node
1028  * @encoding:  the encoding string
1029  *
1030  * Dump an HTML node, recursive behaviour,children are printed too,
1031  * and formatting returns/spaces are added.
1032  */
1033 void
htmlNodeDumpOutput(xmlOutputBufferPtr buf,xmlDocPtr doc,xmlNodePtr cur,const char * encoding)1034 htmlNodeDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc,
1035 	           xmlNodePtr cur, const char *encoding) {
1036     htmlNodeDumpFormatOutput(buf, doc, cur, encoding, 1);
1037 }
1038 
1039 /**
1040  * htmlDocContentDumpFormatOutput:
1041  * @buf:  the HTML buffer output
1042  * @cur:  the document
1043  * @encoding:  the encoding string
1044  * @format:  should formatting spaces been added
1045  *
1046  * Dump an HTML document.
1047  */
1048 void
htmlDocContentDumpFormatOutput(xmlOutputBufferPtr buf,xmlDocPtr cur,const char * encoding,int format)1049 htmlDocContentDumpFormatOutput(xmlOutputBufferPtr buf, xmlDocPtr cur,
1050 	                       const char *encoding, int format) {
1051     int type;
1052 
1053     xmlInitParser();
1054 
1055     if ((buf == NULL) || (cur == NULL))
1056         return;
1057 
1058     /*
1059      * force to output the stuff as HTML, especially for entities
1060      */
1061     type = cur->type;
1062     cur->type = XML_HTML_DOCUMENT_NODE;
1063     if (cur->intSubset != NULL) {
1064         htmlDtdDumpOutput(buf, cur, NULL);
1065     }
1066     if (cur->children != NULL) {
1067         htmlNodeListDumpOutput(buf, cur, cur->children, encoding, format);
1068     }
1069     xmlOutputBufferWriteString(buf, "\n");
1070     cur->type = (xmlElementType) type;
1071 }
1072 
1073 /**
1074  * htmlDocContentDumpOutput:
1075  * @buf:  the HTML buffer output
1076  * @cur:  the document
1077  * @encoding:  the encoding string
1078  *
1079  * Dump an HTML document. Formating return/spaces are added.
1080  */
1081 void
htmlDocContentDumpOutput(xmlOutputBufferPtr buf,xmlDocPtr cur,const char * encoding)1082 htmlDocContentDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr cur,
1083 	                 const char *encoding) {
1084     htmlDocContentDumpFormatOutput(buf, cur, encoding, 1);
1085 }
1086 
1087 /************************************************************************
1088  *									*
1089  *		Saving functions front-ends				*
1090  *									*
1091  ************************************************************************/
1092 
1093 /**
1094  * htmlDocDump:
1095  * @f:  the FILE*
1096  * @cur:  the document
1097  *
1098  * Dump an HTML document to an open FILE.
1099  *
1100  * returns: the number of byte written or -1 in case of failure.
1101  */
1102 int
htmlDocDump(FILE * f,xmlDocPtr cur)1103 htmlDocDump(FILE *f, xmlDocPtr cur) {
1104     xmlOutputBufferPtr buf;
1105     xmlCharEncodingHandlerPtr handler = NULL;
1106     const char *encoding;
1107     int ret;
1108 
1109     xmlInitParser();
1110 
1111     if ((cur == NULL) || (f == NULL)) {
1112 	return(-1);
1113     }
1114 
1115     encoding = (const char *) htmlGetMetaEncoding(cur);
1116 
1117     if (encoding != NULL) {
1118 	xmlCharEncoding enc;
1119 
1120 	enc = xmlParseCharEncoding(encoding);
1121 	if (enc != XML_CHAR_ENCODING_UTF8) {
1122 	    handler = xmlFindCharEncodingHandler(encoding);
1123 	    if (handler == NULL)
1124 		htmlSaveErr(XML_SAVE_UNKNOWN_ENCODING, NULL, encoding);
1125 	}
1126     } else {
1127         /*
1128          * Fallback to HTML or ASCII when the encoding is unspecified
1129          */
1130         if (handler == NULL)
1131             handler = xmlFindCharEncodingHandler("HTML");
1132         if (handler == NULL)
1133             handler = xmlFindCharEncodingHandler("ascii");
1134     }
1135 
1136     buf = xmlOutputBufferCreateFile(f, handler);
1137     if (buf == NULL) return(-1);
1138     htmlDocContentDumpOutput(buf, cur, NULL);
1139 
1140     ret = xmlOutputBufferClose(buf);
1141     return(ret);
1142 }
1143 
1144 /**
1145  * htmlSaveFile:
1146  * @filename:  the filename (or URL)
1147  * @cur:  the document
1148  *
1149  * Dump an HTML document to a file. If @filename is "-" the stdout file is
1150  * used.
1151  * returns: the number of byte written or -1 in case of failure.
1152  */
1153 int
htmlSaveFile(const char * filename,xmlDocPtr cur)1154 htmlSaveFile(const char *filename, xmlDocPtr cur) {
1155     xmlOutputBufferPtr buf;
1156     xmlCharEncodingHandlerPtr handler = NULL;
1157     const char *encoding;
1158     int ret;
1159 
1160     if ((cur == NULL) || (filename == NULL))
1161         return(-1);
1162 
1163     xmlInitParser();
1164 
1165     encoding = (const char *) htmlGetMetaEncoding(cur);
1166 
1167     if (encoding != NULL) {
1168 	xmlCharEncoding enc;
1169 
1170 	enc = xmlParseCharEncoding(encoding);
1171 	if (enc != XML_CHAR_ENCODING_UTF8) {
1172 	    handler = xmlFindCharEncodingHandler(encoding);
1173 	    if (handler == NULL)
1174 		htmlSaveErr(XML_SAVE_UNKNOWN_ENCODING, NULL, encoding);
1175 	}
1176     } else {
1177         /*
1178          * Fallback to HTML or ASCII when the encoding is unspecified
1179          */
1180         if (handler == NULL)
1181             handler = xmlFindCharEncodingHandler("HTML");
1182         if (handler == NULL)
1183             handler = xmlFindCharEncodingHandler("ascii");
1184     }
1185 
1186     /*
1187      * save the content to a temp buffer.
1188      */
1189     buf = xmlOutputBufferCreateFilename(filename, handler, cur->compression);
1190     if (buf == NULL) return(0);
1191 
1192     htmlDocContentDumpOutput(buf, cur, NULL);
1193 
1194     ret = xmlOutputBufferClose(buf);
1195     return(ret);
1196 }
1197 
1198 /**
1199  * htmlSaveFileFormat:
1200  * @filename:  the filename
1201  * @cur:  the document
1202  * @format:  should formatting spaces been added
1203  * @encoding: the document encoding
1204  *
1205  * Dump an HTML document to a file using a given encoding.
1206  *
1207  * returns: the number of byte written or -1 in case of failure.
1208  */
1209 int
htmlSaveFileFormat(const char * filename,xmlDocPtr cur,const char * encoding,int format)1210 htmlSaveFileFormat(const char *filename, xmlDocPtr cur,
1211 	           const char *encoding, int format) {
1212     xmlOutputBufferPtr buf;
1213     xmlCharEncodingHandlerPtr handler = NULL;
1214     int ret;
1215 
1216     if ((cur == NULL) || (filename == NULL))
1217         return(-1);
1218 
1219     xmlInitParser();
1220 
1221     if (encoding != NULL) {
1222 	xmlCharEncoding enc;
1223 
1224 	enc = xmlParseCharEncoding(encoding);
1225 	if (enc != XML_CHAR_ENCODING_UTF8) {
1226 	    handler = xmlFindCharEncodingHandler(encoding);
1227 	    if (handler == NULL)
1228 		htmlSaveErr(XML_SAVE_UNKNOWN_ENCODING, NULL, encoding);
1229 	}
1230         htmlSetMetaEncoding(cur, (const xmlChar *) encoding);
1231     } else {
1232 	htmlSetMetaEncoding(cur, (const xmlChar *) "UTF-8");
1233 
1234         /*
1235          * Fallback to HTML or ASCII when the encoding is unspecified
1236          */
1237         if (handler == NULL)
1238             handler = xmlFindCharEncodingHandler("HTML");
1239         if (handler == NULL)
1240             handler = xmlFindCharEncodingHandler("ascii");
1241     }
1242 
1243     /*
1244      * save the content to a temp buffer.
1245      */
1246     buf = xmlOutputBufferCreateFilename(filename, handler, 0);
1247     if (buf == NULL) return(0);
1248 
1249     htmlDocContentDumpFormatOutput(buf, cur, encoding, format);
1250 
1251     ret = xmlOutputBufferClose(buf);
1252     return(ret);
1253 }
1254 
1255 /**
1256  * htmlSaveFileEnc:
1257  * @filename:  the filename
1258  * @cur:  the document
1259  * @encoding: the document encoding
1260  *
1261  * Dump an HTML document to a file using a given encoding
1262  * and formatting returns/spaces are added.
1263  *
1264  * returns: the number of byte written or -1 in case of failure.
1265  */
1266 int
htmlSaveFileEnc(const char * filename,xmlDocPtr cur,const char * encoding)1267 htmlSaveFileEnc(const char *filename, xmlDocPtr cur, const char *encoding) {
1268     return(htmlSaveFileFormat(filename, cur, encoding, 1));
1269 }
1270 
1271 #endif /* LIBXML_OUTPUT_ENABLED */
1272 
1273 #define bottom_HTMLtree
1274 #include "elfgcchack.h"
1275 #endif /* LIBXML_HTML_ENABLED */
1276