/*
 * Copyright (C) 2011 Google Inc. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are
 * met:
 *
 *     * Redistributions of source code must retain the above copyright
 * notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above
 * copyright notice, this list of conditions and the following disclaimer
 * in the documentation and/or other materials provided with the
 * distribution.
 *     * Neither the name of Google Inc. nor the names of its
 * contributors may be used to endorse or promote products derived from
 * this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
30 
31 #include "config.h"
32 #include "PageSerializer.h"
33 
34 #include "CSSImageValue.h"
35 #include "CSSImportRule.h"
36 #include "CSSStyleRule.h"
37 #include "CachedImage.h"
38 #include "Document.h"
39 #include "Element.h"
40 #include "Frame.h"
41 #include "HTMLFrameOwnerElement.h"
42 #include "HTMLHeadElement.h"
43 #include "HTMLImageElement.h"
44 #include "HTMLLinkElement.h"
45 #include "HTMLMetaCharsetParser.h"
46 #include "HTMLNames.h"
47 #include "HTMLStyleElement.h"
48 #include "HTTPParsers.h"
49 #include "Image.h"
50 #include "MIMETypeRegistry.h"
51 #include "MarkupAccumulator.h"
52 #include "Page.h"
53 #include "StyleCachedImage.h"
54 #include "StyleImage.h"
55 #include "Text.h"
56 #include "TextEncoding.h"
57 #include <wtf/text/StringBuilder.h>
58 #include <wtf/text/StringConcatenate.h>
59 
60 namespace WebCore {
61 
isCharsetSpecifyingNode(Node * node)62 static bool isCharsetSpecifyingNode(Node* node)
63 {
64     if (!node->isHTMLElement())
65         return false;
66 
67     HTMLElement* element = toHTMLElement(node);
68     if (!element->hasTagName(HTMLNames::metaTag))
69         return false;
70     HTMLMetaCharsetParser::AttributeList attributes;
71     const NamedNodeMap* attributesMap = element->attributes(true);
72     for (unsigned i = 0; i < attributesMap->length(); ++i) {
73         Attribute* item = attributesMap->attributeItem(i);
74         // FIXME: We should deal appropriately with the attribute if they have a namespace.
75         attributes.append(make_pair(item->name().toString(), item->value().string()));
76     }
77     TextEncoding textEncoding = HTMLMetaCharsetParser::encodingFromMetaAttributes(attributes);
78     return textEncoding.isValid();
79 }
80 
shouldIgnoreElement(Element * element)81 static bool shouldIgnoreElement(Element* element)
82 {
83     return element->hasTagName(HTMLNames::scriptTag) || element->hasTagName(HTMLNames::noscriptTag) || isCharsetSpecifyingNode(element);
84 }
85 
frameOwnerURLAttributeName(const HTMLFrameOwnerElement & frameOwner)86 static const QualifiedName& frameOwnerURLAttributeName(const HTMLFrameOwnerElement& frameOwner)
87 {
88     // FIXME: We should support all frame owners including applets.
89     return frameOwner.hasTagName(HTMLNames::objectTag) ? HTMLNames::dataAttr : HTMLNames::srcAttr;
90 }
91 
92 class SerializerMarkupAccumulator : public WebCore::MarkupAccumulator {
93 public:
94     SerializerMarkupAccumulator(PageSerializer*, Document*, Vector<Node*>*);
95     virtual ~SerializerMarkupAccumulator();
96 
97 protected:
98     virtual void appendText(Vector<UChar>& out, Text*);
99     virtual void appendElement(Vector<UChar>& out, Element*, Namespaces*);
100     virtual void appendCustomAttributes(Vector<UChar>& out, Element*, Namespaces*);
101     virtual void appendEndTag(Node*);
102 
103 private:
104     PageSerializer* m_serializer;
105     Document* m_document;
106 };
107 
SerializerMarkupAccumulator(PageSerializer * serializer,Document * document,Vector<Node * > * nodes)108 SerializerMarkupAccumulator::SerializerMarkupAccumulator(PageSerializer* serializer, Document* document, Vector<Node*>* nodes)
109     : MarkupAccumulator(nodes, AbsoluteURLs)
110     , m_serializer(serializer)
111     , m_document(document)
112 {
113     // MarkupAccumulator does not serialize the <?xml ... line, so we add it explicitely to ensure the right encoding is specified.
114     if (m_document->isXHTMLDocument() || m_document->xmlStandalone() || m_document->isSVGDocument())
115         appendString(makeString("<?xml version=\"", m_document->xmlVersion(), "\" encoding=\"", m_document->charset(), "\"?>"));
116 }
117 
~SerializerMarkupAccumulator()118 SerializerMarkupAccumulator::~SerializerMarkupAccumulator()
119 {
120 }
121 
appendText(Vector<UChar> & out,Text * text)122 void SerializerMarkupAccumulator::appendText(Vector<UChar>& out, Text* text)
123 {
124     Element* parent = text->parentElement();
125     if (parent && !shouldIgnoreElement(parent))
126         MarkupAccumulator::appendText(out, text);
127 }
128 
appendElement(Vector<UChar> & out,Element * element,Namespaces * namespaces)129 void SerializerMarkupAccumulator::appendElement(Vector<UChar>& out, Element* element, Namespaces* namespaces)
130 {
131     if (!shouldIgnoreElement(element))
132         MarkupAccumulator::appendElement(out, element, namespaces);
133 
134     if (element->hasTagName(HTMLNames::headTag)) {
135         String meta = makeString("<meta charset=\"", m_document->charset(), "\">");
136         out.append(meta.characters(), meta.length());
137     }
138 
139     // FIXME: For object (plugins) tags and video tag we could replace them by an image of their current contents.
140 }
141 
appendCustomAttributes(Vector<UChar> & out,Element * element,Namespaces * namespaces)142 void SerializerMarkupAccumulator::appendCustomAttributes(Vector<UChar>& out, Element* element, Namespaces* namespaces)
143 {
144     if (!element->isFrameOwnerElement())
145         return;
146 
147     HTMLFrameOwnerElement* frameOwner = static_cast<HTMLFrameOwnerElement*>(element);
148     Frame* frame = frameOwner->contentFrame();
149     if (!frame)
150         return;
151 
152     KURL url = frame->document()->url();
153     if (url.isValid() && !url.protocolIs("about"))
154         return;
155 
156     // We need to give a fake location to blank frames so they can be referenced by the serialized frame.
157     url = m_serializer->urlForBlankFrame(frame);
158     RefPtr<Attribute> attribute = Attribute::create(frameOwnerURLAttributeName(*frameOwner), url.string());
159     appendAttribute(out, element, *attribute, namespaces);
160 }
161 
appendEndTag(Node * node)162 void SerializerMarkupAccumulator::appendEndTag(Node* node)
163 {
164     if (node->isElementNode() && !shouldIgnoreElement(toElement(node)))
165         MarkupAccumulator::appendEndTag(node);
166 }
167 
Resource()168 PageSerializer::Resource::Resource()
169 {
170 }
171 
Resource(const KURL & url,const String & mimeType,PassRefPtr<SharedBuffer> data)172 PageSerializer::Resource::Resource(const KURL& url, const String& mimeType, PassRefPtr<SharedBuffer> data)
173     : url(url)
174     , mimeType(mimeType)
175     , data(data)
176 {
177 }
178 
PageSerializer(Vector<PageSerializer::Resource> * resources)179 PageSerializer::PageSerializer(Vector<PageSerializer::Resource>* resources)
180     : m_resources(resources)
181     , m_blankFrameCounter(0)
182 {
183 }
184 
serialize(Page * page)185 void PageSerializer::serialize(Page* page)
186 {
187     serializeFrame(page->mainFrame());
188 }
189 
serializeFrame(Frame * frame)190 void PageSerializer::serializeFrame(Frame* frame)
191 {
192     Document* document = frame->document();
193     KURL url = document->url();
194     if (!url.isValid() || url.protocolIs("about")) {
195         // For blank frames we generate a fake URL so they can be referenced by their containing frame.
196         url = urlForBlankFrame(frame);
197     }
198 
199     if (m_resourceURLs.contains(url)) {
200         // FIXME: We could have 2 frame with the same URL but which were dynamically changed and have now
201         // different content. So we should serialize both and somehow rename the frame src in the containing
202         // frame. Arg!
203         return;
204     }
205 
206     Vector<Node*> nodes;
207     SerializerMarkupAccumulator accumulator(this, document, &nodes);
208     TextEncoding textEncoding(TextEncoding(document->charset()));
209     ASSERT(textEncoding.isValid());
210     String text = accumulator.serializeNodes(document->documentElement(), 0, IncludeNode);
211     CString frameHTML = textEncoding.encode(text.characters(), text.length(), EntitiesForUnencodables);
212     m_resources->append(Resource(url, document->suggestedMIMEType(), SharedBuffer::create(frameHTML.data(), frameHTML.length())));
213     m_resourceURLs.add(url);
214 
215     for (Vector<Node*>::iterator iter = nodes.begin(); iter != nodes.end(); ++iter) {
216         Node* node = *iter;
217         if (!node->isElementNode())
218             continue;
219 
220         Element* element = toElement(node);
221         // We have to process in-line style as it might contain some resources (typically background images).
222         retrieveResourcesForCSSDeclaration(element->style());
223 
224         if (element->hasTagName(HTMLNames::imgTag)) {
225             HTMLImageElement* imageElement = static_cast<HTMLImageElement*>(element);
226             KURL url = document->completeURL(imageElement->getAttribute(HTMLNames::srcAttr));
227             CachedImage* cachedImage = imageElement->cachedImage();
228             addImageToResources(cachedImage, url);
229         } else if (element->hasTagName(HTMLNames::linkTag)) {
230             HTMLLinkElement* linkElement = static_cast<HTMLLinkElement*>(element);
231             StyleSheet* sheet = linkElement->sheet();
232             if (sheet && sheet->isCSSStyleSheet()) {
233                 KURL url = document->completeURL(linkElement->getAttribute(HTMLNames::hrefAttr));
234                 serializeCSSStyleSheet(static_cast<CSSStyleSheet*>(sheet), url);
235                 ASSERT(m_resourceURLs.contains(url));
236             }
237         } else if (element->hasTagName(HTMLNames::styleTag)) {
238             HTMLStyleElement* styleElement = static_cast<HTMLStyleElement*>(element);
239             StyleSheet* sheet = styleElement->sheet();
240             if (sheet && sheet->isCSSStyleSheet())
241                 serializeCSSStyleSheet(static_cast<CSSStyleSheet*>(sheet), KURL());
242         }
243     }
244 
245     for (Frame* childFrame = frame->tree()->firstChild(); childFrame; childFrame = childFrame->tree()->nextSibling())
246         serializeFrame(childFrame);
247 }
248 
serializeCSSStyleSheet(CSSStyleSheet * styleSheet,const KURL & url)249 void PageSerializer::serializeCSSStyleSheet(CSSStyleSheet* styleSheet, const KURL& url)
250 {
251     StringBuilder cssText;
252     for (unsigned i = 0; i < styleSheet->length(); ++i) {
253         StyleBase* item = styleSheet->item(i);
254         String itemText = item->cssText();
255         if (!itemText.isEmpty()) {
256             cssText.append(itemText);
257             if (i < styleSheet->length() - 1)
258                 cssText.append("\n\n");
259         }
260         // Some rules have resources associated with them that we need to retrieve.
261         if (item->isImportRule()) {
262             CSSImportRule* importRule = static_cast<CSSImportRule*>(item);
263             KURL importURL = styleSheet->document()->completeURL(importRule->href());
264             if (m_resourceURLs.contains(importURL))
265                 continue;
266             serializeCSSStyleSheet(importRule->styleSheet(), importURL);
267         } else if (item->isFontFaceRule()) {
268             // FIXME: Add support for font face rule. It is not clear to me at this point if the actual otf/eot file can
269             // be retrieved from the CSSFontFaceRule object.
270         } else if (item->isStyleRule())
271             retrieveResourcesForCSSRule(static_cast<CSSStyleRule*>(item));
272     }
273 
274     if (url.isValid() && !m_resourceURLs.contains(url)) {
275         // FIXME: We should check whether a charset has been specified and if none was found add one.
276         TextEncoding textEncoding = TextEncoding(styleSheet->charset());
277         ASSERT(textEncoding.isValid());
278         String textString = cssText.toString();
279         CString text = textEncoding.encode(textString.characters(), textString.length(), EntitiesForUnencodables);
280         m_resources->append(Resource(url, String("text/css"), SharedBuffer::create(text.data(), text.length())));
281         m_resourceURLs.add(url);
282     }
283 }
284 
addImageToResources(CachedImage * image,const KURL & url)285 void PageSerializer::addImageToResources(CachedImage* image, const KURL& url)
286 {
287     if (!url.isValid() || m_resourceURLs.contains(url))
288         return;
289 
290     if (!image || image->image() == Image::nullImage())
291         return;
292 
293     String mimeType = image->response().mimeType();
294     m_resources->append(Resource(url, mimeType, image->image()->data()));
295     m_resourceURLs.add(url);
296 }
297 
retrieveResourcesForCSSRule(CSSStyleRule * rule)298 void PageSerializer::retrieveResourcesForCSSRule(CSSStyleRule* rule)
299 {
300     retrieveResourcesForCSSDeclaration(rule->style());
301 }
302 
retrieveResourcesForCSSDeclaration(CSSStyleDeclaration * styleDeclaration)303 void PageSerializer::retrieveResourcesForCSSDeclaration(CSSStyleDeclaration* styleDeclaration)
304 {
305     if (!styleDeclaration)
306         return;
307 
308     if (!styleDeclaration->stylesheet()->isCSSStyleSheet())
309         return;
310 
311     CSSStyleSheet* cssStyleSheet = static_cast<CSSStyleSheet*>(styleDeclaration->stylesheet());
312 
313     // The background-image and list-style-image (for ul or ol) are the CSS properties
314     // that make use of images. We iterate to make sure we include any other
315     // image properties there might be.
316     for (unsigned i = 0; i < styleDeclaration->length(); ++i) {
317         // FIXME: It's kind of ridiculous to get the property name and then get
318         // the value out of the name. Ideally we would get the value out of the
319         // property ID, but CSSStyleDeclaration only gives access to property
320         // names, not IDs.
321         RefPtr<CSSValue> cssValue = styleDeclaration->getPropertyCSSValue(styleDeclaration->item(i));
322         if (!cssValue->isImageValue())
323             continue;
324 
325         CSSImageValue* imageValue = static_cast<CSSImageValue*>(cssValue.get());
326         StyleImage* styleImage = imageValue->cachedOrPendingImage();
327         // Non cached-images are just place-holders and do not contain data.
328         if (!styleImage || !styleImage->isCachedImage())
329             continue;
330 
331         CachedImage* image = static_cast<StyleCachedImage*>(styleImage)->cachedImage();
332 
333         KURL url = cssStyleSheet->document()->completeURL(image->url());
334         addImageToResources(image, url);
335     }
336 }
337 
urlForBlankFrame(Frame * frame)338 KURL PageSerializer::urlForBlankFrame(Frame* frame)
339 {
340     HashMap<Frame*, KURL>::iterator iter = m_blankFrameURLs.find(frame);
341     if (iter != m_blankFrameURLs.end())
342         return iter->second;
343     String url = makeString("wyciwyg://frame/", String::number(m_blankFrameCounter++));
344     KURL fakeURL(ParsedURLString, url);
345     m_blankFrameURLs.add(frame, fakeURL);
346 
347     return fakeURL;
348 }
349 
350 }
351