1 /* libxml2 - Library for parsing XML documents
2 * Copyright (C) 2006-2019 Free Software Foundation, Inc.
3 *
4 * This file is not part of the GNU gettext program, but is used with
5 * GNU gettext.
6 *
7 * The original copyright notice is as follows:
8 */
9
10 /*
11 * Copyright (C) 1998-2012 Daniel Veillard. All Rights Reserved.
12 *
13 * Permission is hereby granted, free of charge, to any person obtaining a copy
14 * of this software and associated documentation files (the "Software"), to deal
15 * in the Software without restriction, including without limitation the rights
16 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
17 * copies of the Software, and to permit persons to whom the Software is fur-
18 * nished to do so, subject to the following conditions:
19 *
20 * The above copyright notice and this permission notice shall be included in
21 * all copies or substantial portions of the Software.
22 *
23 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
24 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FIT-
25 * NESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
26 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
27 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
28 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
29 * THE SOFTWARE.
30 *
31 * daniel@veillard.com
32 */
33
34 /*
35 * HTMLtree.c : implementation of access function for an HTML tree.
36 */
37
38 #define IN_LIBXML
39 #include "libxml.h"
40 #ifdef LIBXML_HTML_ENABLED
41
42 #include <string.h> /* for memset() only ! */
43
44 #ifdef HAVE_CTYPE_H
45 #include <ctype.h>
46 #endif
47 #ifdef HAVE_STDLIB_H
48 #include <stdlib.h>
49 #endif
50
51 #include <libxml/xmlmemory.h>
52 #include <libxml/HTMLparser.h>
53 #include <libxml/HTMLtree.h>
54 #include <libxml/entities.h>
55 #include <libxml/valid.h>
56 #include <libxml/xmlerror.h>
57 #include <libxml/parserInternals.h>
58 #include <libxml/globals.h>
59 #include <libxml/uri.h>
60
61 #include "buf.h"
62
63 /************************************************************************
64 * *
65 * Getting/Setting encoding meta tags *
66 * *
67 ************************************************************************/
68
69 /**
70 * htmlGetMetaEncoding:
71 * @doc: the document
72 *
73 * Encoding definition lookup in the Meta tags
74 *
75 * Returns the current encoding as flagged in the HTML source
76 */
77 const xmlChar *
htmlGetMetaEncoding(htmlDocPtr doc)78 htmlGetMetaEncoding(htmlDocPtr doc) {
79 htmlNodePtr cur;
80 const xmlChar *content;
81 const xmlChar *encoding;
82
83 if (doc == NULL)
84 return(NULL);
85 cur = doc->children;
86
87 /*
88 * Search the html
89 */
90 while (cur != NULL) {
91 if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
92 if (xmlStrEqual(cur->name, BAD_CAST"html"))
93 break;
94 if (xmlStrEqual(cur->name, BAD_CAST"head"))
95 goto found_head;
96 if (xmlStrEqual(cur->name, BAD_CAST"meta"))
97 goto found_meta;
98 }
99 cur = cur->next;
100 }
101 if (cur == NULL)
102 return(NULL);
103 cur = cur->children;
104
105 /*
106 * Search the head
107 */
108 while (cur != NULL) {
109 if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
110 if (xmlStrEqual(cur->name, BAD_CAST"head"))
111 break;
112 if (xmlStrEqual(cur->name, BAD_CAST"meta"))
113 goto found_meta;
114 }
115 cur = cur->next;
116 }
117 if (cur == NULL)
118 return(NULL);
119 found_head:
120 cur = cur->children;
121
122 /*
123 * Search the meta elements
124 */
125 found_meta:
126 while (cur != NULL) {
127 if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
128 if (xmlStrEqual(cur->name, BAD_CAST"meta")) {
129 xmlAttrPtr attr = cur->properties;
130 int http;
131 const xmlChar *value;
132
133 content = NULL;
134 http = 0;
135 while (attr != NULL) {
136 if ((attr->children != NULL) &&
137 (attr->children->type == XML_TEXT_NODE) &&
138 (attr->children->next == NULL)) {
139 value = attr->children->content;
140 if ((!xmlStrcasecmp(attr->name, BAD_CAST"http-equiv"))
141 && (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
142 http = 1;
143 else if ((value != NULL)
144 && (!xmlStrcasecmp(attr->name, BAD_CAST"content")))
145 content = value;
146 if ((http != 0) && (content != NULL))
147 goto found_content;
148 }
149 attr = attr->next;
150 }
151 }
152 }
153 cur = cur->next;
154 }
155 return(NULL);
156
157 found_content:
158 encoding = xmlStrstr(content, BAD_CAST"charset=");
159 if (encoding == NULL)
160 encoding = xmlStrstr(content, BAD_CAST"Charset=");
161 if (encoding == NULL)
162 encoding = xmlStrstr(content, BAD_CAST"CHARSET=");
163 if (encoding != NULL) {
164 encoding += 8;
165 } else {
166 encoding = xmlStrstr(content, BAD_CAST"charset =");
167 if (encoding == NULL)
168 encoding = xmlStrstr(content, BAD_CAST"Charset =");
169 if (encoding == NULL)
170 encoding = xmlStrstr(content, BAD_CAST"CHARSET =");
171 if (encoding != NULL)
172 encoding += 9;
173 }
174 if (encoding != NULL) {
175 while ((*encoding == ' ') || (*encoding == '\t')) encoding++;
176 }
177 return(encoding);
178 }
179
180 /**
181 * htmlSetMetaEncoding:
182 * @doc: the document
183 * @encoding: the encoding string
184 *
185 * Sets the current encoding in the Meta tags
186 * NOTE: this will not change the document content encoding, just
187 * the META flag associated.
188 *
189 * Returns 0 in case of success and -1 in case of error
190 */
191 int
htmlSetMetaEncoding(htmlDocPtr doc,const xmlChar * encoding)192 htmlSetMetaEncoding(htmlDocPtr doc, const xmlChar *encoding) {
193 htmlNodePtr cur, meta = NULL, head = NULL;
194 const xmlChar *content = NULL;
195 char newcontent[100];
196
197 newcontent[0] = 0;
198
199 if (doc == NULL)
200 return(-1);
201
202 /* html isn't a real encoding it's just libxml2 way to get entities */
203 if (!xmlStrcasecmp(encoding, BAD_CAST "html"))
204 return(-1);
205
206 if (encoding != NULL) {
207 snprintf(newcontent, sizeof(newcontent), "text/html; charset=%s",
208 (char *)encoding);
209 newcontent[sizeof(newcontent) - 1] = 0;
210 }
211
212 cur = doc->children;
213
214 /*
215 * Search the html
216 */
217 while (cur != NULL) {
218 if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
219 if (xmlStrcasecmp(cur->name, BAD_CAST"html") == 0)
220 break;
221 if (xmlStrcasecmp(cur->name, BAD_CAST"head") == 0)
222 goto found_head;
223 if (xmlStrcasecmp(cur->name, BAD_CAST"meta") == 0)
224 goto found_meta;
225 }
226 cur = cur->next;
227 }
228 if (cur == NULL)
229 return(-1);
230 cur = cur->children;
231
232 /*
233 * Search the head
234 */
235 while (cur != NULL) {
236 if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
237 if (xmlStrcasecmp(cur->name, BAD_CAST"head") == 0)
238 break;
239 if (xmlStrcasecmp(cur->name, BAD_CAST"meta") == 0) {
240 head = cur->parent;
241 goto found_meta;
242 }
243 }
244 cur = cur->next;
245 }
246 if (cur == NULL)
247 return(-1);
248 found_head:
249 head = cur;
250 if (cur->children == NULL)
251 goto create;
252 cur = cur->children;
253
254 found_meta:
255 /*
256 * Search and update all the remaining the meta elements carrying
257 * encoding informations
258 */
259 while (cur != NULL) {
260 if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
261 if (xmlStrcasecmp(cur->name, BAD_CAST"meta") == 0) {
262 xmlAttrPtr attr = cur->properties;
263 int http;
264 const xmlChar *value;
265
266 content = NULL;
267 http = 0;
268 while (attr != NULL) {
269 if ((attr->children != NULL) &&
270 (attr->children->type == XML_TEXT_NODE) &&
271 (attr->children->next == NULL)) {
272 value = attr->children->content;
273 if ((!xmlStrcasecmp(attr->name, BAD_CAST"http-equiv"))
274 && (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
275 http = 1;
276 else
277 {
278 if ((value != NULL) &&
279 (!xmlStrcasecmp(attr->name, BAD_CAST"content")))
280 content = value;
281 }
282 if ((http != 0) && (content != NULL))
283 break;
284 }
285 attr = attr->next;
286 }
287 if ((http != 0) && (content != NULL)) {
288 meta = cur;
289 break;
290 }
291
292 }
293 }
294 cur = cur->next;
295 }
296 create:
297 if (meta == NULL) {
298 if ((encoding != NULL) && (head != NULL)) {
299 /*
300 * Create a new Meta element with the right attributes
301 */
302
303 meta = xmlNewDocNode(doc, NULL, BAD_CAST"meta", NULL);
304 if (head->children == NULL)
305 xmlAddChild(head, meta);
306 else
307 xmlAddPrevSibling(head->children, meta);
308 xmlNewProp(meta, BAD_CAST"http-equiv", BAD_CAST"Content-Type");
309 xmlNewProp(meta, BAD_CAST"content", BAD_CAST newcontent);
310 }
311 } else {
312 /* remove the meta tag if NULL is passed */
313 if (encoding == NULL) {
314 xmlUnlinkNode(meta);
315 xmlFreeNode(meta);
316 }
317 /* change the document only if there is a real encoding change */
318 else if (xmlStrcasestr(content, encoding) == NULL) {
319 xmlSetProp(meta, BAD_CAST"content", BAD_CAST newcontent);
320 }
321 }
322
323
324 return(0);
325 }
326
/**
 * htmlBooleanAttrs:
 *
 * NULL-terminated list of the HTML attributes which will be output
 * in minimized form, i.e. <option selected="selected"> will be
 * output as <option selected>, as per XSLT 1.0 16.2 "HTML Output Method"
 */
static const char* htmlBooleanAttrs[] = {
    "checked",
    "compact",
    "declare",
    "defer",
    "disabled",
    "ismap",
    "multiple",
    "nohref",
    "noresize",
    "noshade",
    "nowrap",
    "readonly",
    "selected",
    NULL    /* end of list marker */
};
340
341
342 /**
343 * htmlIsBooleanAttr:
344 * @name: the name of the attribute to check
345 *
346 * Determine if a given attribute is a boolean attribute.
347 *
348 * returns: false if the attribute is not boolean, true otherwise.
349 */
350 int
htmlIsBooleanAttr(const xmlChar * name)351 htmlIsBooleanAttr(const xmlChar *name)
352 {
353 int i = 0;
354
355 while (htmlBooleanAttrs[i] != NULL) {
356 if (xmlStrcasecmp((const xmlChar *)htmlBooleanAttrs[i], name) == 0)
357 return 1;
358 i++;
359 }
360 return 0;
361 }
362
363 #ifdef LIBXML_OUTPUT_ENABLED
364 /*
365 * private routine exported from xmlIO.c
366 */
367 xmlOutputBufferPtr
368 xmlAllocOutputBufferInternal(xmlCharEncodingHandlerPtr encoder);
369 /************************************************************************
370 * *
371 * Output error handlers *
372 * *
373 ************************************************************************/
374 /**
375 * htmlSaveErrMemory:
376 * @extra: extra informations
377 *
378 * Handle an out of memory condition
379 */
380 static void
htmlSaveErrMemory(const char * extra)381 htmlSaveErrMemory(const char *extra)
382 {
383 __xmlSimpleError(XML_FROM_OUTPUT, XML_ERR_NO_MEMORY, NULL, NULL, extra);
384 }
385
386 /**
387 * htmlSaveErr:
388 * @code: the error number
389 * @node: the location of the error.
390 * @extra: extra informations
391 *
392 * Handle an out of memory condition
393 */
394 static void
htmlSaveErr(int code,xmlNodePtr node,const char * extra)395 htmlSaveErr(int code, xmlNodePtr node, const char *extra)
396 {
397 const char *msg = NULL;
398
399 switch(code) {
400 case XML_SAVE_NOT_UTF8:
401 msg = "string is not in UTF-8\n";
402 break;
403 case XML_SAVE_CHAR_INVALID:
404 msg = "invalid character value\n";
405 break;
406 case XML_SAVE_UNKNOWN_ENCODING:
407 msg = "unknown encoding %s\n";
408 break;
409 case XML_SAVE_NO_DOCTYPE:
410 msg = "HTML has no DOCTYPE\n";
411 break;
412 default:
413 msg = "unexpected error number\n";
414 }
415 __xmlSimpleError(XML_FROM_OUTPUT, code, node, msg, extra);
416 }
417
418 /************************************************************************
419 * *
420 * Dumping HTML tree content to a simple buffer *
421 * *
422 ************************************************************************/
423
424 /**
425 * htmlBufNodeDumpFormat:
426 * @buf: the xmlBufPtr output
427 * @doc: the document
428 * @cur: the current node
429 * @format: should formatting spaces been added
430 *
431 * Dump an HTML node, recursive behaviour,children are printed too.
432 *
433 * Returns the number of byte written or -1 in case of error
434 */
435 static size_t
htmlBufNodeDumpFormat(xmlBufPtr buf,xmlDocPtr doc,xmlNodePtr cur,int format)436 htmlBufNodeDumpFormat(xmlBufPtr buf, xmlDocPtr doc, xmlNodePtr cur,
437 int format) {
438 size_t use;
439 int ret;
440 xmlOutputBufferPtr outbuf;
441
442 if (cur == NULL) {
443 return (-1);
444 }
445 if (buf == NULL) {
446 return (-1);
447 }
448 outbuf = (xmlOutputBufferPtr) xmlMalloc(sizeof(xmlOutputBuffer));
449 if (outbuf == NULL) {
450 htmlSaveErrMemory("allocating HTML output buffer");
451 return (-1);
452 }
453 memset(outbuf, 0, (size_t) sizeof(xmlOutputBuffer));
454 outbuf->buffer = buf;
455 outbuf->encoder = NULL;
456 outbuf->writecallback = NULL;
457 outbuf->closecallback = NULL;
458 outbuf->context = NULL;
459 outbuf->written = 0;
460
461 use = xmlBufUse(buf);
462 htmlNodeDumpFormatOutput(outbuf, doc, cur, NULL, format);
463 xmlFree(outbuf);
464 ret = xmlBufUse(buf) - use;
465 return (ret);
466 }
467
468 /**
469 * htmlNodeDump:
470 * @buf: the HTML buffer output
471 * @doc: the document
472 * @cur: the current node
473 *
474 * Dump an HTML node, recursive behaviour,children are printed too,
475 * and formatting returns are added.
476 *
477 * Returns the number of byte written or -1 in case of error
478 */
479 int
htmlNodeDump(xmlBufferPtr buf,xmlDocPtr doc,xmlNodePtr cur)480 htmlNodeDump(xmlBufferPtr buf, xmlDocPtr doc, xmlNodePtr cur) {
481 xmlBufPtr buffer;
482 size_t ret;
483
484 if ((buf == NULL) || (cur == NULL))
485 return(-1);
486
487 xmlInitParser();
488 buffer = xmlBufFromBuffer(buf);
489 if (buffer == NULL)
490 return(-1);
491
492 ret = htmlBufNodeDumpFormat(buffer, doc, cur, 1);
493
494 xmlBufBackToBuffer(buffer);
495
496 if (ret > INT_MAX)
497 return(-1);
498 return((int) ret);
499 }
500
501 /**
502 * htmlNodeDumpFileFormat:
503 * @out: the FILE pointer
504 * @doc: the document
505 * @cur: the current node
506 * @encoding: the document encoding
507 * @format: should formatting spaces been added
508 *
509 * Dump an HTML node, recursive behaviour,children are printed too.
510 *
511 * TODO: if encoding == NULL try to save in the doc encoding
512 *
513 * returns: the number of byte written or -1 in case of failure.
514 */
515 int
htmlNodeDumpFileFormat(FILE * out,xmlDocPtr doc,xmlNodePtr cur,const char * encoding,int format)516 htmlNodeDumpFileFormat(FILE *out, xmlDocPtr doc,
517 xmlNodePtr cur, const char *encoding, int format) {
518 xmlOutputBufferPtr buf;
519 xmlCharEncodingHandlerPtr handler = NULL;
520 int ret;
521
522 xmlInitParser();
523
524 if (encoding != NULL) {
525 xmlCharEncoding enc;
526
527 enc = xmlParseCharEncoding(encoding);
528 if (enc != XML_CHAR_ENCODING_UTF8) {
529 handler = xmlFindCharEncodingHandler(encoding);
530 if (handler == NULL)
531 htmlSaveErr(XML_SAVE_UNKNOWN_ENCODING, NULL, encoding);
532 }
533 } else {
534 /*
535 * Fallback to HTML or ASCII when the encoding is unspecified
536 */
537 if (handler == NULL)
538 handler = xmlFindCharEncodingHandler("HTML");
539 if (handler == NULL)
540 handler = xmlFindCharEncodingHandler("ascii");
541 }
542
543 /*
544 * save the content to a temp buffer.
545 */
546 buf = xmlOutputBufferCreateFile(out, handler);
547 if (buf == NULL) return(0);
548
549 htmlNodeDumpFormatOutput(buf, doc, cur, encoding, format);
550
551 ret = xmlOutputBufferClose(buf);
552 return(ret);
553 }
554
555 /**
556 * htmlNodeDumpFile:
557 * @out: the FILE pointer
558 * @doc: the document
559 * @cur: the current node
560 *
561 * Dump an HTML node, recursive behaviour,children are printed too,
562 * and formatting returns are added.
563 */
564 void
htmlNodeDumpFile(FILE * out,xmlDocPtr doc,xmlNodePtr cur)565 htmlNodeDumpFile(FILE *out, xmlDocPtr doc, xmlNodePtr cur) {
566 htmlNodeDumpFileFormat(out, doc, cur, NULL, 1);
567 }
568
569 /**
570 * htmlDocDumpMemoryFormat:
571 * @cur: the document
572 * @mem: OUT: the memory pointer
573 * @size: OUT: the memory length
574 * @format: should formatting spaces been added
575 *
576 * Dump an HTML document in memory and return the xmlChar * and it's size.
577 * It's up to the caller to free the memory.
578 */
579 void
htmlDocDumpMemoryFormat(xmlDocPtr cur,xmlChar ** mem,int * size,int format)580 htmlDocDumpMemoryFormat(xmlDocPtr cur, xmlChar**mem, int *size, int format) {
581 xmlOutputBufferPtr buf;
582 xmlCharEncodingHandlerPtr handler = NULL;
583 const char *encoding;
584
585 xmlInitParser();
586
587 if ((mem == NULL) || (size == NULL))
588 return;
589 if (cur == NULL) {
590 *mem = NULL;
591 *size = 0;
592 return;
593 }
594
595 encoding = (const char *) htmlGetMetaEncoding(cur);
596
597 if (encoding != NULL) {
598 xmlCharEncoding enc;
599
600 enc = xmlParseCharEncoding(encoding);
601 if (enc != XML_CHAR_ENCODING_UTF8) {
602 handler = xmlFindCharEncodingHandler(encoding);
603 if (handler == NULL)
604 htmlSaveErr(XML_SAVE_UNKNOWN_ENCODING, NULL, encoding);
605
606 }
607 } else {
608 /*
609 * Fallback to HTML or ASCII when the encoding is unspecified
610 */
611 if (handler == NULL)
612 handler = xmlFindCharEncodingHandler("HTML");
613 if (handler == NULL)
614 handler = xmlFindCharEncodingHandler("ascii");
615 }
616
617 buf = xmlAllocOutputBufferInternal(handler);
618 if (buf == NULL) {
619 *mem = NULL;
620 *size = 0;
621 return;
622 }
623
624 htmlDocContentDumpFormatOutput(buf, cur, NULL, format);
625
626 xmlOutputBufferFlush(buf);
627 if (buf->conv != NULL) {
628 *size = xmlBufUse(buf->conv);
629 *mem = xmlStrndup(xmlBufContent(buf->conv), *size);
630 } else {
631 *size = xmlBufUse(buf->buffer);
632 *mem = xmlStrndup(xmlBufContent(buf->buffer), *size);
633 }
634 (void)xmlOutputBufferClose(buf);
635 }
636
637 /**
638 * htmlDocDumpMemory:
639 * @cur: the document
640 * @mem: OUT: the memory pointer
641 * @size: OUT: the memory length
642 *
643 * Dump an HTML document in memory and return the xmlChar * and it's size.
644 * It's up to the caller to free the memory.
645 */
646 void
htmlDocDumpMemory(xmlDocPtr cur,xmlChar ** mem,int * size)647 htmlDocDumpMemory(xmlDocPtr cur, xmlChar**mem, int *size) {
648 htmlDocDumpMemoryFormat(cur, mem, size, 1);
649 }
650
651
652 /************************************************************************
653 * *
654 * Dumping HTML tree content to an I/O output buffer *
655 * *
656 ************************************************************************/
657
658 void xmlNsListDumpOutput(xmlOutputBufferPtr buf, xmlNsPtr cur);
659
660 /**
661 * htmlDtdDumpOutput:
662 * @buf: the HTML buffer output
663 * @doc: the document
664 * @encoding: the encoding string
665 *
666 * TODO: check whether encoding is needed
667 *
668 * Dump the HTML document DTD, if any.
669 */
670 static void
htmlDtdDumpOutput(xmlOutputBufferPtr buf,xmlDocPtr doc,const char * encoding ATTRIBUTE_UNUSED)671 htmlDtdDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc,
672 const char *encoding ATTRIBUTE_UNUSED) {
673 xmlDtdPtr cur = doc->intSubset;
674
675 if (cur == NULL) {
676 htmlSaveErr(XML_SAVE_NO_DOCTYPE, (xmlNodePtr) doc, NULL);
677 return;
678 }
679 xmlOutputBufferWriteString(buf, "<!DOCTYPE ");
680 xmlOutputBufferWriteString(buf, (const char *)cur->name);
681 if (cur->ExternalID != NULL) {
682 xmlOutputBufferWriteString(buf, " PUBLIC ");
683 xmlBufWriteQuotedString(buf->buffer, cur->ExternalID);
684 if (cur->SystemID != NULL) {
685 xmlOutputBufferWriteString(buf, " ");
686 xmlBufWriteQuotedString(buf->buffer, cur->SystemID);
687 }
688 } else if (cur->SystemID != NULL &&
689 xmlStrcmp(cur->SystemID, BAD_CAST "about:legacy-compat")) {
690 xmlOutputBufferWriteString(buf, " SYSTEM ");
691 xmlBufWriteQuotedString(buf->buffer, cur->SystemID);
692 }
693 xmlOutputBufferWriteString(buf, ">\n");
694 }
695
696 /**
697 * htmlAttrDumpOutput:
698 * @buf: the HTML buffer output
699 * @doc: the document
700 * @cur: the attribute pointer
701 * @encoding: the encoding string
702 *
703 * Dump an HTML attribute
704 */
705 static void
htmlAttrDumpOutput(xmlOutputBufferPtr buf,xmlDocPtr doc,xmlAttrPtr cur,const char * encoding ATTRIBUTE_UNUSED)706 htmlAttrDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, xmlAttrPtr cur,
707 const char *encoding ATTRIBUTE_UNUSED) {
708 xmlChar *value;
709
710 /*
711 * The html output method should not escape a & character
712 * occurring in an attribute value immediately followed by
713 * a { character (see Section B.7.1 of the HTML 4.0 Recommendation).
714 * This is implemented in xmlEncodeEntitiesReentrant
715 */
716
717 if (cur == NULL) {
718 return;
719 }
720 xmlOutputBufferWriteString(buf, " ");
721 if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) {
722 xmlOutputBufferWriteString(buf, (const char *)cur->ns->prefix);
723 xmlOutputBufferWriteString(buf, ":");
724 }
725 xmlOutputBufferWriteString(buf, (const char *)cur->name);
726 if ((cur->children != NULL) && (!htmlIsBooleanAttr(cur->name))) {
727 value = xmlNodeListGetString(doc, cur->children, 0);
728 if (value) {
729 xmlOutputBufferWriteString(buf, "=");
730 if ((cur->ns == NULL) && (cur->parent != NULL) &&
731 (cur->parent->ns == NULL) &&
732 ((!xmlStrcasecmp(cur->name, BAD_CAST "href")) ||
733 (!xmlStrcasecmp(cur->name, BAD_CAST "action")) ||
734 (!xmlStrcasecmp(cur->name, BAD_CAST "src")) ||
735 ((!xmlStrcasecmp(cur->name, BAD_CAST "name")) &&
736 (!xmlStrcasecmp(cur->parent->name, BAD_CAST "a"))))) {
737 xmlChar *tmp = value;
738 /* xmlURIEscapeStr() escapes '"' so it can be safely used. */
739 xmlBufCCat(buf->buffer, "\"");
740
741 while (IS_BLANK_CH(*tmp)) tmp++;
742
743 /* URI Escape everything, except server side includes. */
744 for ( ; ; ) {
745 xmlChar *escaped;
746 xmlChar endChar;
747 xmlChar *end = NULL;
748 xmlChar *start = (xmlChar *)xmlStrstr(tmp, BAD_CAST "<!--");
749 if (start != NULL) {
750 end = (xmlChar *)xmlStrstr(tmp, BAD_CAST "-->");
751 if (end != NULL) {
752 *start = '\0';
753 }
754 }
755
756 /* Escape the whole string, or until start (set to '\0'). */
757 escaped = xmlURIEscapeStr(tmp, BAD_CAST"@/:=?;#%&,+");
758 if (escaped != NULL) {
759 xmlBufCat(buf->buffer, escaped);
760 xmlFree(escaped);
761 } else {
762 xmlBufCat(buf->buffer, tmp);
763 }
764
765 if (end == NULL) { /* Everything has been written. */
766 break;
767 }
768
769 /* Do not escape anything within server side includes. */
770 *start = '<'; /* Restore the first character of "<!--". */
771 end += 3; /* strlen("-->") */
772 endChar = *end;
773 *end = '\0';
774 xmlBufCat(buf->buffer, start);
775 *end = endChar;
776 tmp = end;
777 }
778
779 xmlBufCCat(buf->buffer, "\"");
780 } else {
781 xmlBufWriteQuotedString(buf->buffer, value);
782 }
783 xmlFree(value);
784 } else {
785 xmlOutputBufferWriteString(buf, "=\"\"");
786 }
787 }
788 }
789
790 /**
791 * htmlAttrListDumpOutput:
792 * @buf: the HTML buffer output
793 * @doc: the document
794 * @cur: the first attribute pointer
795 * @encoding: the encoding string
796 *
797 * Dump a list of HTML attributes
798 */
799 static void
htmlAttrListDumpOutput(xmlOutputBufferPtr buf,xmlDocPtr doc,xmlAttrPtr cur,const char * encoding)800 htmlAttrListDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, xmlAttrPtr cur, const char *encoding) {
801 if (cur == NULL) {
802 return;
803 }
804 while (cur != NULL) {
805 htmlAttrDumpOutput(buf, doc, cur, encoding);
806 cur = cur->next;
807 }
808 }
809
810
811
812 /**
813 * htmlNodeListDumpOutput:
814 * @buf: the HTML buffer output
815 * @doc: the document
816 * @cur: the first node
817 * @encoding: the encoding string
818 * @format: should formatting spaces been added
819 *
820 * Dump an HTML node list, recursive behaviour,children are printed too.
821 */
822 static void
htmlNodeListDumpOutput(xmlOutputBufferPtr buf,xmlDocPtr doc,xmlNodePtr cur,const char * encoding,int format)823 htmlNodeListDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc,
824 xmlNodePtr cur, const char *encoding, int format) {
825 if (cur == NULL) {
826 return;
827 }
828 while (cur != NULL) {
829 htmlNodeDumpFormatOutput(buf, doc, cur, encoding, format);
830 cur = cur->next;
831 }
832 }
833
834 /**
835 * htmlNodeDumpFormatOutput:
836 * @buf: the HTML buffer output
837 * @doc: the document
838 * @cur: the current node
839 * @encoding: the encoding string
840 * @format: should formatting spaces been added
841 *
842 * Dump an HTML node, recursive behaviour,children are printed too.
843 */
844 void
htmlNodeDumpFormatOutput(xmlOutputBufferPtr buf,xmlDocPtr doc,xmlNodePtr cur,const char * encoding,int format)845 htmlNodeDumpFormatOutput(xmlOutputBufferPtr buf, xmlDocPtr doc,
846 xmlNodePtr cur, const char *encoding, int format) {
847 const htmlElemDesc * info;
848
849 xmlInitParser();
850
851 if ((cur == NULL) || (buf == NULL)) {
852 return;
853 }
854 /*
855 * Special cases.
856 */
857 if (cur->type == XML_DTD_NODE)
858 return;
859 if ((cur->type == XML_HTML_DOCUMENT_NODE) ||
860 (cur->type == XML_DOCUMENT_NODE)){
861 htmlDocContentDumpOutput(buf, (xmlDocPtr) cur, encoding);
862 return;
863 }
864 if (cur->type == XML_ATTRIBUTE_NODE) {
865 htmlAttrDumpOutput(buf, doc, (xmlAttrPtr) cur, encoding);
866 return;
867 }
868 if (cur->type == HTML_TEXT_NODE) {
869 if (cur->content != NULL) {
870 if (((cur->name == (const xmlChar *)xmlStringText) ||
871 (cur->name != (const xmlChar *)xmlStringTextNoenc)) &&
872 ((cur->parent == NULL) ||
873 ((xmlStrcasecmp(cur->parent->name, BAD_CAST "script")) &&
874 (xmlStrcasecmp(cur->parent->name, BAD_CAST "style"))))) {
875 xmlChar *buffer;
876
877 buffer = xmlEncodeEntitiesReentrant(doc, cur->content);
878 if (buffer != NULL) {
879 xmlOutputBufferWriteString(buf, (const char *)buffer);
880 xmlFree(buffer);
881 }
882 } else {
883 xmlOutputBufferWriteString(buf, (const char *)cur->content);
884 }
885 }
886 return;
887 }
888 if (cur->type == HTML_COMMENT_NODE) {
889 if (cur->content != NULL) {
890 xmlOutputBufferWriteString(buf, "<!--");
891 xmlOutputBufferWriteString(buf, (const char *)cur->content);
892 xmlOutputBufferWriteString(buf, "-->");
893 }
894 return;
895 }
896 if (cur->type == HTML_PI_NODE) {
897 if (cur->name == NULL)
898 return;
899 xmlOutputBufferWriteString(buf, "<?");
900 xmlOutputBufferWriteString(buf, (const char *)cur->name);
901 if (cur->content != NULL) {
902 xmlOutputBufferWriteString(buf, " ");
903 xmlOutputBufferWriteString(buf, (const char *)cur->content);
904 }
905 xmlOutputBufferWriteString(buf, ">");
906 return;
907 }
908 if (cur->type == HTML_ENTITY_REF_NODE) {
909 xmlOutputBufferWriteString(buf, "&");
910 xmlOutputBufferWriteString(buf, (const char *)cur->name);
911 xmlOutputBufferWriteString(buf, ";");
912 return;
913 }
914 if (cur->type == HTML_PRESERVE_NODE) {
915 if (cur->content != NULL) {
916 xmlOutputBufferWriteString(buf, (const char *)cur->content);
917 }
918 return;
919 }
920
921 /*
922 * Get specific HTML info for that node.
923 */
924 if (cur->ns == NULL)
925 info = htmlTagLookup(cur->name);
926 else
927 info = NULL;
928
929 xmlOutputBufferWriteString(buf, "<");
930 if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) {
931 xmlOutputBufferWriteString(buf, (const char *)cur->ns->prefix);
932 xmlOutputBufferWriteString(buf, ":");
933 }
934 xmlOutputBufferWriteString(buf, (const char *)cur->name);
935 if (cur->nsDef)
936 xmlNsListDumpOutput(buf, cur->nsDef);
937 if (cur->properties != NULL)
938 htmlAttrListDumpOutput(buf, doc, cur->properties, encoding);
939
940 if ((info != NULL) && (info->empty)) {
941 xmlOutputBufferWriteString(buf, ">");
942 if ((format) && (!info->isinline) && (cur->next != NULL)) {
943 if ((cur->next->type != HTML_TEXT_NODE) &&
944 (cur->next->type != HTML_ENTITY_REF_NODE) &&
945 (cur->parent != NULL) &&
946 (cur->parent->name != NULL) &&
947 (cur->parent->name[0] != 'p')) /* p, pre, param */
948 xmlOutputBufferWriteString(buf, "\n");
949 }
950 return;
951 }
952 if (((cur->type == XML_ELEMENT_NODE) || (cur->content == NULL)) &&
953 (cur->children == NULL)) {
954 if ((info != NULL) && (info->saveEndTag != 0) &&
955 (xmlStrcmp(BAD_CAST info->name, BAD_CAST "html")) &&
956 (xmlStrcmp(BAD_CAST info->name, BAD_CAST "body"))) {
957 xmlOutputBufferWriteString(buf, ">");
958 } else {
959 xmlOutputBufferWriteString(buf, "></");
960 if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) {
961 xmlOutputBufferWriteString(buf, (const char *)cur->ns->prefix);
962 xmlOutputBufferWriteString(buf, ":");
963 }
964 xmlOutputBufferWriteString(buf, (const char *)cur->name);
965 xmlOutputBufferWriteString(buf, ">");
966 }
967 if ((format) && (cur->next != NULL) &&
968 (info != NULL) && (!info->isinline)) {
969 if ((cur->next->type != HTML_TEXT_NODE) &&
970 (cur->next->type != HTML_ENTITY_REF_NODE) &&
971 (cur->parent != NULL) &&
972 (cur->parent->name != NULL) &&
973 (cur->parent->name[0] != 'p')) /* p, pre, param */
974 xmlOutputBufferWriteString(buf, "\n");
975 }
976 return;
977 }
978 xmlOutputBufferWriteString(buf, ">");
979 if ((cur->type != XML_ELEMENT_NODE) &&
980 (cur->content != NULL)) {
981 /*
982 * Uses the OutputBuffer property to automatically convert
983 * invalids to charrefs
984 */
985
986 xmlOutputBufferWriteString(buf, (const char *) cur->content);
987 }
988 if (cur->children != NULL) {
989 if ((format) && (info != NULL) && (!info->isinline) &&
990 (cur->children->type != HTML_TEXT_NODE) &&
991 (cur->children->type != HTML_ENTITY_REF_NODE) &&
992 (cur->children != cur->last) &&
993 (cur->name != NULL) &&
994 (cur->name[0] != 'p')) /* p, pre, param */
995 xmlOutputBufferWriteString(buf, "\n");
996 htmlNodeListDumpOutput(buf, doc, cur->children, encoding, format);
997 if ((format) && (info != NULL) && (!info->isinline) &&
998 (cur->last->type != HTML_TEXT_NODE) &&
999 (cur->last->type != HTML_ENTITY_REF_NODE) &&
1000 (cur->children != cur->last) &&
1001 (cur->name != NULL) &&
1002 (cur->name[0] != 'p')) /* p, pre, param */
1003 xmlOutputBufferWriteString(buf, "\n");
1004 }
1005 xmlOutputBufferWriteString(buf, "</");
1006 if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) {
1007 xmlOutputBufferWriteString(buf, (const char *)cur->ns->prefix);
1008 xmlOutputBufferWriteString(buf, ":");
1009 }
1010 xmlOutputBufferWriteString(buf, (const char *)cur->name);
1011 xmlOutputBufferWriteString(buf, ">");
1012 if ((format) && (info != NULL) && (!info->isinline) &&
1013 (cur->next != NULL)) {
1014 if ((cur->next->type != HTML_TEXT_NODE) &&
1015 (cur->next->type != HTML_ENTITY_REF_NODE) &&
1016 (cur->parent != NULL) &&
1017 (cur->parent->name != NULL) &&
1018 (cur->parent->name[0] != 'p')) /* p, pre, param */
1019 xmlOutputBufferWriteString(buf, "\n");
1020 }
1021 }
1022
1023 /**
1024 * htmlNodeDumpOutput:
1025 * @buf: the HTML buffer output
1026 * @doc: the document
1027 * @cur: the current node
1028 * @encoding: the encoding string
1029 *
1030 * Dump an HTML node, recursive behaviour,children are printed too,
1031 * and formatting returns/spaces are added.
1032 */
1033 void
htmlNodeDumpOutput(xmlOutputBufferPtr buf,xmlDocPtr doc,xmlNodePtr cur,const char * encoding)1034 htmlNodeDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc,
1035 xmlNodePtr cur, const char *encoding) {
1036 htmlNodeDumpFormatOutput(buf, doc, cur, encoding, 1);
1037 }
1038
1039 /**
1040 * htmlDocContentDumpFormatOutput:
1041 * @buf: the HTML buffer output
1042 * @cur: the document
1043 * @encoding: the encoding string
1044 * @format: should formatting spaces been added
1045 *
1046 * Dump an HTML document.
1047 */
1048 void
htmlDocContentDumpFormatOutput(xmlOutputBufferPtr buf,xmlDocPtr cur,const char * encoding,int format)1049 htmlDocContentDumpFormatOutput(xmlOutputBufferPtr buf, xmlDocPtr cur,
1050 const char *encoding, int format) {
1051 int type;
1052
1053 xmlInitParser();
1054
1055 if ((buf == NULL) || (cur == NULL))
1056 return;
1057
1058 /*
1059 * force to output the stuff as HTML, especially for entities
1060 */
1061 type = cur->type;
1062 cur->type = XML_HTML_DOCUMENT_NODE;
1063 if (cur->intSubset != NULL) {
1064 htmlDtdDumpOutput(buf, cur, NULL);
1065 }
1066 if (cur->children != NULL) {
1067 htmlNodeListDumpOutput(buf, cur, cur->children, encoding, format);
1068 }
1069 xmlOutputBufferWriteString(buf, "\n");
1070 cur->type = (xmlElementType) type;
1071 }
1072
1073 /**
1074 * htmlDocContentDumpOutput:
1075 * @buf: the HTML buffer output
1076 * @cur: the document
1077 * @encoding: the encoding string
1078 *
1079 * Dump an HTML document. Formating return/spaces are added.
1080 */
1081 void
htmlDocContentDumpOutput(xmlOutputBufferPtr buf,xmlDocPtr cur,const char * encoding)1082 htmlDocContentDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr cur,
1083 const char *encoding) {
1084 htmlDocContentDumpFormatOutput(buf, cur, encoding, 1);
1085 }
1086
1087 /************************************************************************
1088 * *
1089 * Saving functions front-ends *
1090 * *
1091 ************************************************************************/
1092
1093 /**
1094 * htmlDocDump:
1095 * @f: the FILE*
1096 * @cur: the document
1097 *
1098 * Dump an HTML document to an open FILE.
1099 *
1100 * returns: the number of byte written or -1 in case of failure.
1101 */
1102 int
htmlDocDump(FILE * f,xmlDocPtr cur)1103 htmlDocDump(FILE *f, xmlDocPtr cur) {
1104 xmlOutputBufferPtr buf;
1105 xmlCharEncodingHandlerPtr handler = NULL;
1106 const char *encoding;
1107 int ret;
1108
1109 xmlInitParser();
1110
1111 if ((cur == NULL) || (f == NULL)) {
1112 return(-1);
1113 }
1114
1115 encoding = (const char *) htmlGetMetaEncoding(cur);
1116
1117 if (encoding != NULL) {
1118 xmlCharEncoding enc;
1119
1120 enc = xmlParseCharEncoding(encoding);
1121 if (enc != XML_CHAR_ENCODING_UTF8) {
1122 handler = xmlFindCharEncodingHandler(encoding);
1123 if (handler == NULL)
1124 htmlSaveErr(XML_SAVE_UNKNOWN_ENCODING, NULL, encoding);
1125 }
1126 } else {
1127 /*
1128 * Fallback to HTML or ASCII when the encoding is unspecified
1129 */
1130 if (handler == NULL)
1131 handler = xmlFindCharEncodingHandler("HTML");
1132 if (handler == NULL)
1133 handler = xmlFindCharEncodingHandler("ascii");
1134 }
1135
1136 buf = xmlOutputBufferCreateFile(f, handler);
1137 if (buf == NULL) return(-1);
1138 htmlDocContentDumpOutput(buf, cur, NULL);
1139
1140 ret = xmlOutputBufferClose(buf);
1141 return(ret);
1142 }
1143
1144 /**
1145 * htmlSaveFile:
1146 * @filename: the filename (or URL)
1147 * @cur: the document
1148 *
1149 * Dump an HTML document to a file. If @filename is "-" the stdout file is
1150 * used.
1151 * returns: the number of byte written or -1 in case of failure.
1152 */
1153 int
htmlSaveFile(const char * filename,xmlDocPtr cur)1154 htmlSaveFile(const char *filename, xmlDocPtr cur) {
1155 xmlOutputBufferPtr buf;
1156 xmlCharEncodingHandlerPtr handler = NULL;
1157 const char *encoding;
1158 int ret;
1159
1160 if ((cur == NULL) || (filename == NULL))
1161 return(-1);
1162
1163 xmlInitParser();
1164
1165 encoding = (const char *) htmlGetMetaEncoding(cur);
1166
1167 if (encoding != NULL) {
1168 xmlCharEncoding enc;
1169
1170 enc = xmlParseCharEncoding(encoding);
1171 if (enc != XML_CHAR_ENCODING_UTF8) {
1172 handler = xmlFindCharEncodingHandler(encoding);
1173 if (handler == NULL)
1174 htmlSaveErr(XML_SAVE_UNKNOWN_ENCODING, NULL, encoding);
1175 }
1176 } else {
1177 /*
1178 * Fallback to HTML or ASCII when the encoding is unspecified
1179 */
1180 if (handler == NULL)
1181 handler = xmlFindCharEncodingHandler("HTML");
1182 if (handler == NULL)
1183 handler = xmlFindCharEncodingHandler("ascii");
1184 }
1185
1186 /*
1187 * save the content to a temp buffer.
1188 */
1189 buf = xmlOutputBufferCreateFilename(filename, handler, cur->compression);
1190 if (buf == NULL) return(0);
1191
1192 htmlDocContentDumpOutput(buf, cur, NULL);
1193
1194 ret = xmlOutputBufferClose(buf);
1195 return(ret);
1196 }
1197
1198 /**
1199 * htmlSaveFileFormat:
1200 * @filename: the filename
1201 * @cur: the document
1202 * @format: should formatting spaces been added
1203 * @encoding: the document encoding
1204 *
1205 * Dump an HTML document to a file using a given encoding.
1206 *
1207 * returns: the number of byte written or -1 in case of failure.
1208 */
1209 int
htmlSaveFileFormat(const char * filename,xmlDocPtr cur,const char * encoding,int format)1210 htmlSaveFileFormat(const char *filename, xmlDocPtr cur,
1211 const char *encoding, int format) {
1212 xmlOutputBufferPtr buf;
1213 xmlCharEncodingHandlerPtr handler = NULL;
1214 int ret;
1215
1216 if ((cur == NULL) || (filename == NULL))
1217 return(-1);
1218
1219 xmlInitParser();
1220
1221 if (encoding != NULL) {
1222 xmlCharEncoding enc;
1223
1224 enc = xmlParseCharEncoding(encoding);
1225 if (enc != XML_CHAR_ENCODING_UTF8) {
1226 handler = xmlFindCharEncodingHandler(encoding);
1227 if (handler == NULL)
1228 htmlSaveErr(XML_SAVE_UNKNOWN_ENCODING, NULL, encoding);
1229 }
1230 htmlSetMetaEncoding(cur, (const xmlChar *) encoding);
1231 } else {
1232 htmlSetMetaEncoding(cur, (const xmlChar *) "UTF-8");
1233
1234 /*
1235 * Fallback to HTML or ASCII when the encoding is unspecified
1236 */
1237 if (handler == NULL)
1238 handler = xmlFindCharEncodingHandler("HTML");
1239 if (handler == NULL)
1240 handler = xmlFindCharEncodingHandler("ascii");
1241 }
1242
1243 /*
1244 * save the content to a temp buffer.
1245 */
1246 buf = xmlOutputBufferCreateFilename(filename, handler, 0);
1247 if (buf == NULL) return(0);
1248
1249 htmlDocContentDumpFormatOutput(buf, cur, encoding, format);
1250
1251 ret = xmlOutputBufferClose(buf);
1252 return(ret);
1253 }
1254
1255 /**
1256 * htmlSaveFileEnc:
1257 * @filename: the filename
1258 * @cur: the document
1259 * @encoding: the document encoding
1260 *
1261 * Dump an HTML document to a file using a given encoding
1262 * and formatting returns/spaces are added.
1263 *
1264 * returns: the number of byte written or -1 in case of failure.
1265 */
1266 int
htmlSaveFileEnc(const char * filename,xmlDocPtr cur,const char * encoding)1267 htmlSaveFileEnc(const char *filename, xmlDocPtr cur, const char *encoding) {
1268 return(htmlSaveFileFormat(filename, cur, encoding, 1));
1269 }
1270
1271 #endif /* LIBXML_OUTPUT_ENABLED */
1272
1273 #define bottom_HTMLtree
1274 #include "elfgcchack.h"
1275 #endif /* LIBXML_HTML_ENABLED */
1276