1 /*
2  * Copyright (C) 2004, 2005, 2006, 2007, 2008, 2009 Apple Inc. All rights reserved.
3  * Copyright (C) 2009, 2010 Google Inc. All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *    notice, this list of conditions and the following disclaimer in the
12  *    documentation and/or other materials provided with the distribution.
13  *
14  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
15  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
16  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
17  * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
18  * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
19  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
20  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
21  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
22  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
24  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25  */
26 
27 #include "config.h"
28 #include "MarkupAccumulator.h"
29 
30 #include "CDATASection.h"
31 #include "Comment.h"
32 #include "DocumentFragment.h"
33 #include "DocumentType.h"
34 #include "Editor.h"
35 #include "HTMLElement.h"
36 #include "HTMLNames.h"
37 #include "KURL.h"
38 #include "ProcessingInstruction.h"
39 #include "XMLNSNames.h"
40 #include <wtf/unicode/CharacterNames.h>
41 
42 namespace WebCore {
43 
44 using namespace HTMLNames;
45 
appendCharactersReplacingEntities(Vector<UChar> & out,const UChar * content,size_t length,EntityMask entityMask)46 void appendCharactersReplacingEntities(Vector<UChar>& out, const UChar* content, size_t length, EntityMask entityMask)
47 {
48     DEFINE_STATIC_LOCAL(const String, ampReference, ("&amp;"));
49     DEFINE_STATIC_LOCAL(const String, ltReference, ("&lt;"));
50     DEFINE_STATIC_LOCAL(const String, gtReference, ("&gt;"));
51     DEFINE_STATIC_LOCAL(const String, quotReference, ("&quot;"));
52     DEFINE_STATIC_LOCAL(const String, nbspReference, ("&nbsp;"));
53 
54     static const EntityDescription entityMaps[] = {
55         { '&', ampReference, EntityAmp },
56         { '<', ltReference, EntityLt },
57         { '>', gtReference, EntityGt },
58         { '"', quotReference, EntityQuot },
59         { noBreakSpace, nbspReference, EntityNbsp },
60     };
61 
62     size_t positionAfterLastEntity = 0;
63     for (size_t i = 0; i < length; ++i) {
64         for (size_t m = 0; m < WTF_ARRAY_LENGTH(entityMaps); ++m) {
65             if (content[i] == entityMaps[m].entity && entityMaps[m].mask & entityMask) {
66                 out.append(content + positionAfterLastEntity, i - positionAfterLastEntity);
67                 append(out, entityMaps[m].reference);
68                 positionAfterLastEntity = i + 1;
69                 break;
70             }
71         }
72     }
73     out.append(content + positionAfterLastEntity, length - positionAfterLastEntity);
74 }
75 
MarkupAccumulator(Vector<Node * > * nodes,EAbsoluteURLs shouldResolveURLs,const Range * range)76 MarkupAccumulator::MarkupAccumulator(Vector<Node*>* nodes, EAbsoluteURLs shouldResolveURLs, const Range* range)
77     : m_nodes(nodes)
78     , m_range(range)
79     , m_shouldResolveURLs(shouldResolveURLs)
80 {
81 }
82 
~MarkupAccumulator()83 MarkupAccumulator::~MarkupAccumulator()
84 {
85 }
86 
serializeNodes(Node * node,Node * nodeToSkip,EChildrenOnly childrenOnly)87 String MarkupAccumulator::serializeNodes(Node* node, Node* nodeToSkip, EChildrenOnly childrenOnly)
88 {
89     Vector<UChar> out;
90     serializeNodesWithNamespaces(node, nodeToSkip, childrenOnly, 0);
91     out.reserveInitialCapacity(length());
92     concatenateMarkup(out);
93     return String::adopt(out);
94 }
95 
serializeNodesWithNamespaces(Node * node,Node * nodeToSkip,EChildrenOnly childrenOnly,const Namespaces * namespaces)96 void MarkupAccumulator::serializeNodesWithNamespaces(Node* node, Node* nodeToSkip, EChildrenOnly childrenOnly, const Namespaces* namespaces)
97 {
98     if (node == nodeToSkip)
99         return;
100 
101     Namespaces namespaceHash;
102     if (namespaces)
103         namespaceHash = *namespaces;
104 
105     if (!childrenOnly)
106         appendStartTag(node, &namespaceHash);
107 
108     if (!(node->document()->isHTMLDocument() && elementCannotHaveEndTag(node))) {
109         for (Node* current = node->firstChild(); current; current = current->nextSibling())
110             serializeNodesWithNamespaces(current, nodeToSkip, IncludeNode, &namespaceHash);
111     }
112 
113     if (!childrenOnly)
114         appendEndTag(node);
115 }
116 
appendString(const String & string)117 void MarkupAccumulator::appendString(const String& string)
118 {
119     m_succeedingMarkup.append(string);
120 }
121 
appendStartTag(Node * node,Namespaces * namespaces)122 void MarkupAccumulator::appendStartTag(Node* node, Namespaces* namespaces)
123 {
124     Vector<UChar> markup;
125     appendStartMarkup(markup, node, namespaces);
126     appendString(String::adopt(markup));
127     if (m_nodes)
128         m_nodes->append(node);
129 }
130 
appendEndTag(Node * node)131 void MarkupAccumulator::appendEndTag(Node* node)
132 {
133     Vector<UChar> markup;
134     appendEndMarkup(markup, node);
135     appendString(String::adopt(markup));
136 }
137 
totalLength(const Vector<String> & strings)138 size_t MarkupAccumulator::totalLength(const Vector<String>& strings)
139 {
140     size_t length = 0;
141     for (size_t i = 0; i < strings.size(); ++i)
142         length += strings[i].length();
143     return length;
144 }
145 
146 // FIXME: This is a very inefficient way of accumulating the markup.
147 // We're converting results of appendStartMarkup and appendEndMarkup from Vector<UChar> to String
148 // and then back to Vector<UChar> and again to String here.
concatenateMarkup(Vector<UChar> & out)149 void MarkupAccumulator::concatenateMarkup(Vector<UChar>& out)
150 {
151     for (size_t i = 0; i < m_succeedingMarkup.size(); ++i)
152         append(out, m_succeedingMarkup[i]);
153 }
154 
appendAttributeValue(Vector<UChar> & result,const String & attribute,bool documentIsHTML)155 void MarkupAccumulator::appendAttributeValue(Vector<UChar>& result, const String& attribute, bool documentIsHTML)
156 {
157     appendCharactersReplacingEntities(result, attribute.characters(), attribute.length(),
158         documentIsHTML ? EntityMaskInHTMLAttributeValue : EntityMaskInAttributeValue);
159 }
160 
appendCustomAttributes(Vector<UChar> &,Element *,Namespaces *)161 void MarkupAccumulator::appendCustomAttributes(Vector<UChar>&, Element*, Namespaces*)
162 {
163 }
164 
appendQuotedURLAttributeValue(Vector<UChar> & result,const String & urlString)165 void MarkupAccumulator::appendQuotedURLAttributeValue(Vector<UChar>& result, const String& urlString)
166 {
167     UChar quoteChar = '\"';
168     String strippedURLString = urlString.stripWhiteSpace();
169     if (protocolIsJavaScript(strippedURLString)) {
170         // minimal escaping for javascript urls
171         if (strippedURLString.contains('"')) {
172             if (strippedURLString.contains('\''))
173                 strippedURLString.replace('\"', "&quot;");
174             else
175                 quoteChar = '\'';
176         }
177         result.append(quoteChar);
178         append(result, strippedURLString);
179         result.append(quoteChar);
180         return;
181     }
182 
183     // FIXME: This does not fully match other browsers. Firefox percent-escapes non-ASCII characters for innerHTML.
184     result.append(quoteChar);
185     appendAttributeValue(result, urlString, false);
186     result.append(quoteChar);
187 }
188 
appendNodeValue(Vector<UChar> & out,const Node * node,const Range * range,EntityMask entityMask)189 void MarkupAccumulator::appendNodeValue(Vector<UChar>& out, const Node* node, const Range* range, EntityMask entityMask)
190 {
191     String str = node->nodeValue();
192     const UChar* characters = str.characters();
193     size_t length = str.length();
194 
195     if (range) {
196         ExceptionCode ec;
197         if (node == range->endContainer(ec))
198             length = range->endOffset(ec);
199         if (node == range->startContainer(ec)) {
200             size_t start = range->startOffset(ec);
201             characters += start;
202             length -= start;
203         }
204     }
205 
206     appendCharactersReplacingEntities(out, characters, length, entityMask);
207 }
208 
shouldAddNamespaceElement(const Element * element)209 bool MarkupAccumulator::shouldAddNamespaceElement(const Element* element)
210 {
211     // Don't add namespace attribute if it is already defined for this elem.
212     const AtomicString& prefix = element->prefix();
213     AtomicString attr = !prefix.isEmpty() ? "xmlns:" + prefix : "xmlns";
214     return !element->hasAttribute(attr);
215 }
216 
shouldAddNamespaceAttribute(const Attribute & attribute,Namespaces & namespaces)217 bool MarkupAccumulator::shouldAddNamespaceAttribute(const Attribute& attribute, Namespaces& namespaces)
218 {
219     namespaces.checkConsistency();
220 
221     // Don't add namespace attributes twice
222     if (attribute.name() == XMLNSNames::xmlnsAttr) {
223         namespaces.set(emptyAtom.impl(), attribute.value().impl());
224         return false;
225     }
226 
227     QualifiedName xmlnsPrefixAttr(xmlnsAtom, attribute.localName(), XMLNSNames::xmlnsNamespaceURI);
228     if (attribute.name() == xmlnsPrefixAttr) {
229         namespaces.set(attribute.localName().impl(), attribute.value().impl());
230         return false;
231     }
232 
233     return true;
234 }
235 
appendNamespace(Vector<UChar> & result,const AtomicString & prefix,const AtomicString & namespaceURI,Namespaces & namespaces)236 void MarkupAccumulator::appendNamespace(Vector<UChar>& result, const AtomicString& prefix, const AtomicString& namespaceURI, Namespaces& namespaces)
237 {
238     namespaces.checkConsistency();
239     if (namespaceURI.isEmpty())
240         return;
241 
242     // Use emptyAtoms's impl() for both null and empty strings since the HashMap can't handle 0 as a key
243     AtomicStringImpl* pre = prefix.isEmpty() ? emptyAtom.impl() : prefix.impl();
244     AtomicStringImpl* foundNS = namespaces.get(pre);
245     if (foundNS != namespaceURI.impl()) {
246         namespaces.set(pre, namespaceURI.impl());
247         result.append(' ');
248         append(result, xmlnsAtom.string());
249         if (!prefix.isEmpty()) {
250             result.append(':');
251             append(result, prefix);
252         }
253 
254         result.append('=');
255         result.append('"');
256         appendAttributeValue(result, namespaceURI, false);
257         result.append('"');
258     }
259 }
260 
entityMaskForText(Text * text) const261 EntityMask MarkupAccumulator::entityMaskForText(Text* text) const
262 {
263     const QualifiedName* parentName = 0;
264     if (text->parentElement())
265         parentName = &static_cast<Element*>(text->parentElement())->tagQName();
266 
267     if (parentName && (*parentName == scriptTag || *parentName == styleTag || *parentName == xmpTag))
268         return EntityMaskInCDATA;
269 
270     return text->document()->isHTMLDocument() ? EntityMaskInHTMLPCDATA : EntityMaskInPCDATA;
271 }
272 
appendText(Vector<UChar> & out,Text * text)273 void MarkupAccumulator::appendText(Vector<UChar>& out, Text* text)
274 {
275     appendNodeValue(out, text, m_range, entityMaskForText(text));
276 }
277 
appendComment(Vector<UChar> & out,const String & comment)278 void MarkupAccumulator::appendComment(Vector<UChar>& out, const String& comment)
279 {
280     // FIXME: Comment content is not escaped, but XMLSerializer (and possibly other callers) should raise an exception if it includes "-->".
281     append(out, "<!--");
282     append(out, comment);
283     append(out, "-->");
284 }
285 
appendDocumentType(Vector<UChar> & result,const DocumentType * n)286 void MarkupAccumulator::appendDocumentType(Vector<UChar>& result, const DocumentType* n)
287 {
288     if (n->name().isEmpty())
289         return;
290 
291     append(result, "<!DOCTYPE ");
292     append(result, n->name());
293     if (!n->publicId().isEmpty()) {
294         append(result, " PUBLIC \"");
295         append(result, n->publicId());
296         append(result, "\"");
297         if (!n->systemId().isEmpty()) {
298             append(result, " \"");
299             append(result, n->systemId());
300             append(result, "\"");
301         }
302     } else if (!n->systemId().isEmpty()) {
303         append(result, " SYSTEM \"");
304         append(result, n->systemId());
305         append(result, "\"");
306     }
307     if (!n->internalSubset().isEmpty()) {
308         append(result, " [");
309         append(result, n->internalSubset());
310         append(result, "]");
311     }
312     append(result, ">");
313 }
314 
appendProcessingInstruction(Vector<UChar> & out,const String & target,const String & data)315 void MarkupAccumulator::appendProcessingInstruction(Vector<UChar>& out, const String& target, const String& data)
316 {
317     // FIXME: PI data is not escaped, but XMLSerializer (and possibly other callers) this should raise an exception if it includes "?>".
318     append(out, "<?");
319     append(out, target);
320     append(out, " ");
321     append(out, data);
322     append(out, "?>");
323 }
324 
appendElement(Vector<UChar> & out,Element * element,Namespaces * namespaces)325 void MarkupAccumulator::appendElement(Vector<UChar>& out, Element* element, Namespaces* namespaces)
326 {
327     appendOpenTag(out, element, namespaces);
328 
329     NamedNodeMap* attributes = element->attributes();
330     unsigned length = attributes->length();
331     for (unsigned int i = 0; i < length; i++)
332         appendAttribute(out, element, *attributes->attributeItem(i), namespaces);
333 
334     // Give an opportunity to subclasses to add their own attributes.
335     appendCustomAttributes(out, element, namespaces);
336 
337     appendCloseTag(out, element);
338 }
339 
appendOpenTag(Vector<UChar> & out,Element * element,Namespaces * namespaces)340 void MarkupAccumulator::appendOpenTag(Vector<UChar>& out, Element* element, Namespaces* namespaces)
341 {
342     out.append('<');
343     append(out, element->nodeNamePreservingCase());
344     if (!element->document()->isHTMLDocument() && namespaces && shouldAddNamespaceElement(element))
345         appendNamespace(out, element->prefix(), element->namespaceURI(), *namespaces);
346 }
347 
appendCloseTag(Vector<UChar> & out,Element * element)348 void MarkupAccumulator::appendCloseTag(Vector<UChar>& out, Element* element)
349 {
350     if (shouldSelfClose(element)) {
351         if (element->isHTMLElement())
352             out.append(' '); // XHTML 1.0 <-> HTML compatibility.
353         out.append('/');
354     }
355     out.append('>');
356 }
357 
appendAttribute(Vector<UChar> & out,Element * element,const Attribute & attribute,Namespaces * namespaces)358 void MarkupAccumulator::appendAttribute(Vector<UChar>& out, Element* element, const Attribute& attribute, Namespaces* namespaces)
359 {
360     bool documentIsHTML = element->document()->isHTMLDocument();
361 
362     out.append(' ');
363 
364     if (documentIsHTML)
365         append(out, attribute.name().localName());
366     else
367         append(out, attribute.name().toString());
368 
369     out.append('=');
370 
371     if (element->isURLAttribute(const_cast<Attribute*>(&attribute))) {
372         // We don't want to complete file:/// URLs because it may contain sensitive information
373         // about the user's system.
374         if (shouldResolveURLs() && !element->document()->url().isLocalFile())
375             appendQuotedURLAttributeValue(out, element->document()->completeURL(attribute.value()).string());
376         else
377             appendQuotedURLAttributeValue(out, attribute.value());
378     } else {
379         out.append('\"');
380         appendAttributeValue(out, attribute.value(), documentIsHTML);
381         out.append('\"');
382     }
383 
384     if (!documentIsHTML && namespaces && shouldAddNamespaceAttribute(attribute, *namespaces))
385         appendNamespace(out, attribute.prefix(), attribute.namespaceURI(), *namespaces);
386 }
387 
appendCDATASection(Vector<UChar> & out,const String & section)388 void MarkupAccumulator::appendCDATASection(Vector<UChar>& out, const String& section)
389 {
390     // FIXME: CDATA content is not escaped, but XMLSerializer (and possibly other callers) should raise an exception if it includes "]]>".
391     append(out, "<![CDATA[");
392     append(out, section);
393     append(out, "]]>");
394 }
395 
appendStartMarkup(Vector<UChar> & result,const Node * node,Namespaces * namespaces)396 void MarkupAccumulator::appendStartMarkup(Vector<UChar>& result, const Node* node, Namespaces* namespaces)
397 {
398     if (namespaces)
399         namespaces->checkConsistency();
400 
401     switch (node->nodeType()) {
402     case Node::TEXT_NODE:
403         appendText(result, static_cast<Text*>(const_cast<Node*>(node)));
404         break;
405     case Node::COMMENT_NODE:
406         appendComment(result, static_cast<const Comment*>(node)->data());
407         break;
408     case Node::DOCUMENT_NODE:
409     case Node::DOCUMENT_FRAGMENT_NODE:
410         break;
411     case Node::DOCUMENT_TYPE_NODE:
412         appendDocumentType(result, static_cast<const DocumentType*>(node));
413         break;
414     case Node::PROCESSING_INSTRUCTION_NODE:
415         appendProcessingInstruction(result, static_cast<const ProcessingInstruction*>(node)->target(), static_cast<const ProcessingInstruction*>(node)->data());
416         break;
417     case Node::ELEMENT_NODE:
418         appendElement(result, static_cast<Element*>(const_cast<Node*>(node)), namespaces);
419         break;
420     case Node::CDATA_SECTION_NODE:
421         appendCDATASection(result, static_cast<const CDATASection*>(node)->data());
422         break;
423     case Node::ATTRIBUTE_NODE:
424     case Node::ENTITY_NODE:
425     case Node::ENTITY_REFERENCE_NODE:
426     case Node::NOTATION_NODE:
427     case Node::XPATH_NAMESPACE_NODE:
428     case Node::SHADOW_ROOT_NODE:
429         ASSERT_NOT_REACHED();
430         break;
431     }
432 }
433 
434 // Rules of self-closure
435 // 1. No elements in HTML documents use the self-closing syntax.
436 // 2. Elements w/ children never self-close because they use a separate end tag.
437 // 3. HTML elements which do not have a "forbidden" end tag will close with a separate end tag.
438 // 4. Other elements self-close.
shouldSelfClose(const Node * node)439 bool MarkupAccumulator::shouldSelfClose(const Node* node)
440 {
441     if (node->document()->isHTMLDocument())
442         return false;
443     if (node->hasChildNodes())
444         return false;
445     if (node->isHTMLElement() && !elementCannotHaveEndTag(node))
446         return false;
447     return true;
448 }
449 
elementCannotHaveEndTag(const Node * node)450 bool MarkupAccumulator::elementCannotHaveEndTag(const Node* node)
451 {
452     if (!node->isHTMLElement())
453         return false;
454 
455     // FIXME: ieForbidsInsertHTML may not be the right function to call here
456     // ieForbidsInsertHTML is used to disallow setting innerHTML/outerHTML
457     // or createContextualFragment.  It does not necessarily align with
458     // which elements should be serialized w/o end tags.
459     return static_cast<const HTMLElement*>(node)->ieForbidsInsertHTML();
460 }
461 
appendEndMarkup(Vector<UChar> & result,const Node * node)462 void MarkupAccumulator::appendEndMarkup(Vector<UChar>& result, const Node* node)
463 {
464     if (!node->isElementNode() || shouldSelfClose(node) || (!node->hasChildNodes() && elementCannotHaveEndTag(node)))
465         return;
466 
467     result.append('<');
468     result.append('/');
469     append(result, static_cast<const Element*>(node)->nodeNamePreservingCase());
470     result.append('>');
471 }
472 
473 }
474