1 /*
2  * Copyright (C) 2010 Google, Inc. All Rights Reserved.
3  * Copyright (C) 2011 Apple Inc. All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *    notice, this list of conditions and the following disclaimer in the
12  *    documentation and/or other materials provided with the distribution.
13  *
14  * THIS SOFTWARE IS PROVIDED BY GOOGLE INC. ``AS IS'' AND ANY
15  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
17  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL GOOGLE INC. OR
18  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
19  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
20  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
21  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
22  * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
24  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25  */
26 
27 #ifndef HTMLTreeBuilder_h
28 #define HTMLTreeBuilder_h
29 
30 #include "Element.h"
31 #include "FragmentScriptingPermission.h"
32 #include "HTMLConstructionSite.h"
33 #include "HTMLElementStack.h"
34 #include "HTMLFormattingElementList.h"
35 #include "HTMLTokenizer.h"
36 #include <wtf/text/TextPosition.h>
37 #include <wtf/Noncopyable.h>
38 #include <wtf/OwnPtr.h>
39 #include <wtf/PassOwnPtr.h>
40 #include <wtf/PassRefPtr.h>
41 #include <wtf/RefPtr.h>
42 #include <wtf/unicode/Unicode.h>
43 
44 namespace WebCore {
45 
// Forward declarations for types that this header only names through
// pointers or references; their full definitions are not needed here.
class AtomicHTMLToken;
class Document;
class DocumentFragment;
class Frame;
class HTMLToken;
class HTMLDocument;
class Node;
class HTMLDocumentParser;
54 
55 class HTMLTreeBuilder {
56     WTF_MAKE_NONCOPYABLE(HTMLTreeBuilder); WTF_MAKE_FAST_ALLOCATED;
57 public:
create(HTMLDocumentParser * parser,HTMLDocument * document,bool reportErrors,bool usePreHTML5ParserQuirks)58     static PassOwnPtr<HTMLTreeBuilder> create(HTMLDocumentParser* parser, HTMLDocument* document, bool reportErrors, bool usePreHTML5ParserQuirks)
59     {
60         return adoptPtr(new HTMLTreeBuilder(parser, document, reportErrors, usePreHTML5ParserQuirks));
61     }
create(HTMLDocumentParser * parser,DocumentFragment * fragment,Element * contextElement,FragmentScriptingPermission scriptingPermission,bool usePreHTML5ParserQuirks)62     static PassOwnPtr<HTMLTreeBuilder> create(HTMLDocumentParser* parser, DocumentFragment* fragment, Element* contextElement, FragmentScriptingPermission scriptingPermission, bool usePreHTML5ParserQuirks)
63     {
64         return adoptPtr(new HTMLTreeBuilder(parser, fragment, contextElement, scriptingPermission, usePreHTML5ParserQuirks));
65     }
66     ~HTMLTreeBuilder();
67 
isParsingFragment()68     bool isParsingFragment() const { return !!m_fragmentContext.fragment(); }
69 
70     void detach();
71 
setPaused(bool paused)72     void setPaused(bool paused) { m_isPaused = paused; }
isPaused()73     bool isPaused() const { return m_isPaused; }
74 
75     // The token really should be passed as a const& since it's never modified.
76     void constructTreeFromToken(HTMLToken&);
77     void constructTreeFromAtomicToken(AtomicHTMLToken&);
78 
79     // Must be called when parser is paused before calling the parser again.
80     PassRefPtr<Element> takeScriptToProcess(TextPosition1& scriptStartPosition);
81 
82     // Done, close any open tags, etc.
83     void finished();
84 
85     static bool scriptEnabled(Frame*);
86     static bool pluginsEnabled(Frame*);
87 
88 private:
89     class FakeInsertionMode;
90     class ExternalCharacterTokenBuffer;
91     // Represents HTML5 "insertion mode"
92     // http://www.whatwg.org/specs/web-apps/current-work/multipage/parsing.html#insertion-mode
93     enum InsertionMode {
94         InitialMode,
95         BeforeHTMLMode,
96         BeforeHeadMode,
97         InHeadMode,
98         InHeadNoscriptMode,
99         AfterHeadMode,
100         InBodyMode,
101         TextMode,
102         InTableMode,
103         InTableTextMode,
104         InCaptionMode,
105         InColumnGroupMode,
106         InTableBodyMode,
107         InRowMode,
108         InCellMode,
109         InSelectMode,
110         InSelectInTableMode,
111         InForeignContentMode,
112         AfterBodyMode,
113         InFramesetMode,
114         AfterFramesetMode,
115         AfterAfterBodyMode,
116         AfterAfterFramesetMode,
117     };
118 
119     HTMLTreeBuilder(HTMLDocumentParser* parser, HTMLDocument*, bool reportErrors, bool usePreHTML5ParserQuirks);
120     HTMLTreeBuilder(HTMLDocumentParser* parser, DocumentFragment*, Element* contextElement, FragmentScriptingPermission, bool usePreHTML5ParserQuirks);
121 
122     void processToken(AtomicHTMLToken&);
123 
124     void processDoctypeToken(AtomicHTMLToken&);
125     void processStartTag(AtomicHTMLToken&);
126     void processEndTag(AtomicHTMLToken&);
127     void processComment(AtomicHTMLToken&);
128     void processCharacter(AtomicHTMLToken&);
129     void processEndOfFile(AtomicHTMLToken&);
130 
131     bool processStartTagForInHead(AtomicHTMLToken&);
132     void processStartTagForInBody(AtomicHTMLToken&);
133     void processStartTagForInTable(AtomicHTMLToken&);
134     void processEndTagForInBody(AtomicHTMLToken&);
135     void processEndTagForInTable(AtomicHTMLToken&);
136     void processEndTagForInTableBody(AtomicHTMLToken&);
137     void processEndTagForInRow(AtomicHTMLToken&);
138     void processEndTagForInCell(AtomicHTMLToken&);
139 
140     void processIsindexStartTagForInBody(AtomicHTMLToken&);
141     bool processBodyEndTagForInBody(AtomicHTMLToken&);
142     bool processTableEndTagForInTable();
143     bool processCaptionEndTagForInCaption();
144     bool processColgroupEndTagForInColumnGroup();
145     bool processTrEndTagForInRow();
146     // FIXME: This function should be inlined into its one call site or it
147     // needs to assert which tokens it can be called with.
148     void processAnyOtherEndTagForInBody(AtomicHTMLToken&);
149 
150     void processCharacterBuffer(ExternalCharacterTokenBuffer&);
151 
152     void processFakeStartTag(const QualifiedName&, PassRefPtr<NamedNodeMap> attributes = 0);
153     void processFakeEndTag(const QualifiedName&);
154     void processFakeCharacters(const String&);
155     void processFakePEndTagIfPInButtonScope();
156 
157     void processGenericRCDATAStartTag(AtomicHTMLToken&);
158     void processGenericRawTextStartTag(AtomicHTMLToken&);
159     void processScriptStartTag(AtomicHTMLToken&);
160 
161     // Default processing for the different insertion modes.
162     void defaultForInitial();
163     void defaultForBeforeHTML();
164     void defaultForBeforeHead();
165     void defaultForInHead();
166     void defaultForInHeadNoscript();
167     void defaultForAfterHead();
168     void defaultForInTableText();
169 
170     void prepareToReprocessToken();
171 
172     void reprocessStartTag(AtomicHTMLToken&);
173     void reprocessEndTag(AtomicHTMLToken&);
174 
175     PassRefPtr<NamedNodeMap> attributesForIsindexInput(AtomicHTMLToken&);
176 
177     HTMLElementStack::ElementRecord* furthestBlockForFormattingElement(Element*);
178     void callTheAdoptionAgency(AtomicHTMLToken&);
179 
180     void closeTheCell();
181 
182     template <bool shouldClose(const ContainerNode*)>
183     void processCloseWhenNestedTag(AtomicHTMLToken&);
184 
185     bool m_framesetOk;
186 
187     void parseError(AtomicHTMLToken&);
188 
insertionMode()189     InsertionMode insertionMode() const { return m_insertionMode; }
setInsertionMode(InsertionMode mode)190     void setInsertionMode(InsertionMode mode)
191     {
192         m_insertionMode = mode;
193         m_isFakeInsertionMode = false;
194     }
195 
isFakeInsertionMode()196     bool isFakeInsertionMode() { return m_isFakeInsertionMode; }
setFakeInsertionMode(InsertionMode mode)197     void setFakeInsertionMode(InsertionMode mode)
198     {
199         m_insertionMode = mode;
200         m_isFakeInsertionMode = true;
201     }
202 
203     void resetInsertionModeAppropriately();
204 
205     void processForeignContentUsingInBodyModeAndResetMode(AtomicHTMLToken& token);
206     void resetForeignInsertionMode();
207 
208     class FragmentParsingContext {
209         WTF_MAKE_NONCOPYABLE(FragmentParsingContext);
210     public:
211         FragmentParsingContext();
212         FragmentParsingContext(DocumentFragment*, Element* contextElement, FragmentScriptingPermission);
213         ~FragmentParsingContext();
214 
fragment()215         DocumentFragment* fragment() const { return m_fragment; }
contextElement()216         Element* contextElement() const { ASSERT(m_fragment); return m_contextElement; }
scriptingPermission()217         FragmentScriptingPermission scriptingPermission() const { ASSERT(m_fragment); return m_scriptingPermission; }
218 
219     private:
220         DocumentFragment* m_fragment;
221         Element* m_contextElement;
222 
223         // FragmentScriptingNotAllowed causes the Parser to remove children
224         // from <script> tags (so javascript doesn't show up in pastes).
225         FragmentScriptingPermission m_scriptingPermission;
226     };
227 
228     FragmentParsingContext m_fragmentContext;
229 
230     Document* m_document;
231     HTMLConstructionSite m_tree;
232 
233     bool m_reportErrors;
234     bool m_isPaused;
235     bool m_isFakeInsertionMode;
236 
237     // FIXME: InsertionModes should be a separate object to prevent direct
238     // manipulation of these variables.  For now, be careful to always use
239     // setInsertionMode and never set m_insertionMode directly.
240     InsertionMode m_insertionMode;
241     InsertionMode m_originalInsertionMode;
242 
243     // http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html#pending-table-character-tokens
244     Vector<UChar> m_pendingTableCharacters;
245 
246     // We access parser because HTML5 spec requires that we be able to change the state of the tokenizer
247     // from within parser actions. We also need it to track the current position.
248     HTMLDocumentParser* m_parser;
249 
250     RefPtr<Element> m_scriptToProcess; // <script> tag which needs processing before resuming the parser.
251     TextPosition1 m_scriptToProcessStartPosition; // Starting line number of the script tag needing processing.
252 
253     // FIXME: We probably want to remove this member.  Originally, it was
254     // created to service the legacy tree builder, but it seems to be used for
255     // some other things now.
256     TextPosition0 m_lastScriptElementStartPosition;
257 
258     bool m_usePreHTML5ParserQuirks;
259 
260     bool m_hasPendingForeignInsertionModeSteps;
261 };
262 
263 }
264 
265 #endif
266