1 /*
2  * Copyright (C) 2010 Google, Inc. All Rights Reserved.
3  * Copyright (C) 2011 Apple Inc. All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *    notice, this list of conditions and the following disclaimer in the
12  *    documentation and/or other materials provided with the distribution.
13  *
14  * THIS SOFTWARE IS PROVIDED BY GOOGLE INC. ``AS IS'' AND ANY
15  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
17  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL GOOGLE INC. OR
18  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
19  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
20  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
21  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
22  * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
24  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25  */
26 
27 #ifndef THIRD_PARTY_BLINK_RENDERER_CORE_HTML_PARSER_HTML_TREE_BUILDER_H_
28 #define THIRD_PARTY_BLINK_RENDERER_CORE_HTML_PARSER_HTML_TREE_BUILDER_H_
29 
30 #include "base/macros.h"
31 #include "base/memory/scoped_refptr.h"
32 #include "third_party/blink/renderer/core/html/parser/html_construction_site.h"
33 #include "third_party/blink/renderer/core/html/parser/html_element_stack.h"
34 #include "third_party/blink/renderer/core/html/parser/html_parser_options.h"
35 #include "third_party/blink/renderer/platform/heap/handle.h"
36 #include "third_party/blink/renderer/platform/wtf/text/string_builder.h"
37 #include "third_party/blink/renderer/platform/wtf/text/text_position.h"
38 #include "third_party/blink/renderer/platform/wtf/vector.h"
39 
40 namespace blink {
41 
// Forward declarations: the declarations in this header do not need the
// complete definitions of these types.
class AtomicHTMLToken;
class DocumentFragment;
class Element;
class HTMLDocument;
class HTMLDocumentParser;
47 
48 class HTMLTreeBuilder final : public GarbageCollected<HTMLTreeBuilder> {
49  public:
50   // HTMLTreeBuilder can be created for non-HTMLDocument (XHTMLDocument) from
51   // editing code.
52   // TODO(kouhei): Fix editing code to always invoke HTML parser on
53   // HTMLDocument.
54   HTMLTreeBuilder(HTMLDocumentParser*,
55                   Document&,
56                   ParserContentPolicy,
57                   const HTMLParserOptions&,
58                   bool allow_shadow_root);
59   HTMLTreeBuilder(HTMLDocumentParser*,
60                   DocumentFragment*,
61                   Element* context_element,
62                   ParserContentPolicy,
63                   const HTMLParserOptions&,
64                   bool allow_shadow_root);
65   ~HTMLTreeBuilder();
66   void Trace(Visitor*) const;
67 
OpenElements()68   const HTMLElementStack* OpenElements() const { return tree_.OpenElements(); }
69 
IsParsingFragment()70   bool IsParsingFragment() const { return !!fragment_context_.Fragment(); }
IsParsingTemplateContents()71   bool IsParsingTemplateContents() const {
72     return tree_.OpenElements()->HasTemplateInHTMLScope();
73   }
IsParsingFragmentOrTemplateContents()74   bool IsParsingFragmentOrTemplateContents() const {
75     return IsParsingFragment() || IsParsingTemplateContents();
76   }
77 
78   void Detach();
79 
80   void ConstructTree(AtomicHTMLToken*);
81 
HasParserBlockingScript()82   bool HasParserBlockingScript() const { return !!script_to_process_; }
83   // Must be called to take the parser-blocking script before calling the parser
84   // again.
85   Element* TakeScriptToProcess(TextPosition& script_start_position);
86 
87   // Done, close any open tags, etc.
88   void Finished();
89 
90   // Synchronously flush pending text and queued tasks, possibly creating more
91   // DOM nodes. Flushing pending text depends on |mode|.
Flush(FlushMode mode)92   void Flush(FlushMode mode) { tree_.Flush(mode); }
93 
SetShouldSkipLeadingNewline(bool should_skip)94   void SetShouldSkipLeadingNewline(bool should_skip) {
95     should_skip_leading_newline_ = should_skip;
96   }
97 
98  private:
99   class CharacterTokenBuffer;
100   // Represents HTML5 "insertion mode"
101   // http://www.whatwg.org/specs/web-apps/current-work/multipage/parsing.html#insertion-mode
102   enum InsertionMode {
103     kInitialMode,
104     kBeforeHTMLMode,
105     kBeforeHeadMode,
106     kInHeadMode,
107     kInHeadNoscriptMode,
108     kAfterHeadMode,
109     kTemplateContentsMode,
110     kInBodyMode,
111     kTextMode,
112     kInTableMode,
113     kInTableTextMode,
114     kInCaptionMode,
115     kInColumnGroupMode,
116     kInTableBodyMode,
117     kInRowMode,
118     kInCellMode,
119     kInSelectMode,
120     kInSelectInTableMode,
121     kAfterBodyMode,
122     kInFramesetMode,
123     kAfterFramesetMode,
124     kAfterAfterBodyMode,
125     kAfterAfterFramesetMode,
126   };
127 #ifndef DEBUG
128   static const char* ToString(InsertionMode);
129 #endif
130 
131   void ProcessToken(AtomicHTMLToken*);
132 
133   void ProcessDoctypeToken(AtomicHTMLToken*);
134   void ProcessStartTag(AtomicHTMLToken*);
135   void ProcessEndTag(AtomicHTMLToken*);
136   void ProcessComment(AtomicHTMLToken*);
137   void ProcessCharacter(AtomicHTMLToken*);
138   void ProcessEndOfFile(AtomicHTMLToken*);
139 
140   bool ProcessStartTagForInHead(AtomicHTMLToken*);
141   void ProcessStartTagForInBody(AtomicHTMLToken*);
142   void ProcessStartTagForInTable(AtomicHTMLToken*);
143   void ProcessEndTagForInBody(AtomicHTMLToken*);
144   void ProcessEndTagForInTable(AtomicHTMLToken*);
145   void ProcessEndTagForInTableBody(AtomicHTMLToken*);
146   void ProcessEndTagForInRow(AtomicHTMLToken*);
147   void ProcessEndTagForInCell(AtomicHTMLToken*);
148 
149   void ProcessHtmlStartTagForInBody(AtomicHTMLToken*);
150   bool ProcessBodyEndTagForInBody(AtomicHTMLToken*);
151   bool ProcessTableEndTagForInTable();
152   bool ProcessCaptionEndTagForInCaption();
153   bool ProcessColgroupEndTagForInColumnGroup();
154   bool ProcessTrEndTagForInRow();
155   // FIXME: This function should be inlined into its one call site or it
156   // needs to assert which tokens it can be called with.
157   void ProcessAnyOtherEndTagForInBody(AtomicHTMLToken*);
158 
159   void ProcessCharacterBuffer(CharacterTokenBuffer&);
160   inline void ProcessCharacterBufferForInBody(CharacterTokenBuffer&);
161 
162   void ProcessFakeStartTag(
163       const QualifiedName&,
164       const Vector<Attribute>& attributes = Vector<Attribute>());
165   void ProcessFakeEndTag(const QualifiedName&);
166   void ProcessFakeEndTag(const AtomicString&);
167   void ProcessFakePEndTagIfPInButtonScope();
168 
169   void ProcessGenericRCDATAStartTag(AtomicHTMLToken*);
170   void ProcessGenericRawTextStartTag(AtomicHTMLToken*);
171   void ProcessScriptStartTag(AtomicHTMLToken*);
172 
173   // Default processing for the different insertion modes.
174   void DefaultForInitial();
175   void DefaultForBeforeHTML();
176   void DefaultForBeforeHead();
177   void DefaultForInHead();
178   void DefaultForInHeadNoscript();
179   void DefaultForAfterHead();
180   void DefaultForInTableText();
181 
182   inline HTMLStackItem* AdjustedCurrentStackItem() const;
183   inline bool ShouldProcessTokenInForeignContent(AtomicHTMLToken*);
184   void ProcessTokenInForeignContent(AtomicHTMLToken*);
185 
186   void CallTheAdoptionAgency(AtomicHTMLToken*);
187 
188   void CloseTheCell();
189 
190   template <bool shouldClose(const HTMLStackItem*)>
191   void ProcessCloseWhenNestedTag(AtomicHTMLToken*);
192 
193   void ParseError(AtomicHTMLToken*);
194 
GetInsertionMode()195   InsertionMode GetInsertionMode() const { return insertion_mode_; }
SetInsertionMode(InsertionMode mode)196   void SetInsertionMode(InsertionMode mode) { insertion_mode_ = mode; }
197 
198   void ResetInsertionModeAppropriately();
199 
200   void ProcessTemplateStartTag(AtomicHTMLToken*);
201   bool ProcessTemplateEndTag(AtomicHTMLToken*);
202   bool ProcessEndOfFileForInTemplateContents(AtomicHTMLToken*);
203 
204   class FragmentParsingContext {
205     DISALLOW_NEW();
206 
207    public:
208     FragmentParsingContext() = default;
209     void Init(DocumentFragment*, Element* context_element);
210 
Fragment()211     DocumentFragment* Fragment() const { return fragment_; }
ContextElement()212     Element* ContextElement() const {
213       DCHECK(fragment_);
214       return context_element_stack_item_->GetElement();
215     }
ContextElementStackItem()216     HTMLStackItem* ContextElementStackItem() const {
217       DCHECK(fragment_);
218       return context_element_stack_item_.Get();
219     }
220 
221     void Trace(Visitor*) const;
222 
223    private:
224     Member<DocumentFragment> fragment_;
225     Member<HTMLStackItem> context_element_stack_item_;
226 
227     DISALLOW_COPY_AND_ASSIGN(FragmentParsingContext);
228   };
229 
230   // https://html.spec.whatwg.org/C/#frameset-ok-flag
231   bool frameset_ok_;
232 #if DCHECK_IS_ON()
233   bool is_attached_ = true;
234 #endif
235   FragmentParsingContext fragment_context_;
236   HTMLConstructionSite tree_;
237 
238   // http://www.whatwg.org/specs/web-apps/current-work/multipage/parsing.html#insertion-mode
239   InsertionMode insertion_mode_;
240 
241   // http://www.whatwg.org/specs/web-apps/current-work/multipage/parsing.html#original-insertion-mode
242   InsertionMode original_insertion_mode_;
243 
244   Vector<InsertionMode> template_insertion_modes_;
245 
246   // http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html#pending-table-character-tokens
247   StringBuilder pending_table_characters_;
248 
249   bool should_skip_leading_newline_;
250 
251   const bool allow_shadow_root_;
252 
253   // We access parser because HTML5 spec requires that we be able to change the
254   // state of the tokenizer from within parser actions. We also need it to track
255   // the current position.
256   Member<HTMLDocumentParser> parser_;
257 
258   // <script> tag which needs processing before resuming the parser.
259   Member<Element> script_to_process_;
260 
261   // Starting line number of the script tag needing processing.
262   TextPosition script_to_process_start_position_;
263 
264   HTMLParserOptions options_;
265 
266   DISALLOW_COPY_AND_ASSIGN(HTMLTreeBuilder);
267 };
268 
269 }  // namespace blink
270 
271 #endif
272