1 /*
2  * Copyright (C) 2008 Apple Inc. All Rights Reserved.
3  * Copyright (C) 2010 Google, Inc. All Rights Reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *    notice, this list of conditions and the following disclaimer in the
12  *    documentation and/or other materials provided with the distribution.
13  *
14  * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
15  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
17  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE INC. OR
18  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
19  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
20  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
21  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
22  * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
24  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25  */
26 
27 #ifndef THIRD_PARTY_BLINK_RENDERER_CORE_HTML_PARSER_HTML_TOKENIZER_H_
28 #define THIRD_PARTY_BLINK_RENDERER_CORE_HTML_PARSER_HTML_TOKENIZER_H_
29 
30 #include <memory>
31 
32 #include "base/macros.h"
33 #include "base/memory/ptr_util.h"
34 #include "third_party/blink/renderer/core/core_export.h"
35 #include "third_party/blink/renderer/core/html/parser/html_parser_options.h"
36 #include "third_party/blink/renderer/core/html/parser/html_token.h"
37 #include "third_party/blink/renderer/core/html/parser/input_stream_preprocessor.h"
38 #include "third_party/blink/renderer/platform/text/segmented_string.h"
39 
40 namespace blink {
41 
42 class CORE_EXPORT HTMLTokenizer {
43   USING_FAST_MALLOC(HTMLTokenizer);
44 
45  public:
46   explicit HTMLTokenizer(const HTMLParserOptions&);
47   ~HTMLTokenizer();
48 
49   void Reset();
50 
51   enum State {
52     kDataState,
53     kCharacterReferenceInDataState,
54     kRCDATAState,
55     kCharacterReferenceInRCDATAState,
56     kRAWTEXTState,
57     kScriptDataState,
58     kPLAINTEXTState,
59     kTagOpenState,
60     kEndTagOpenState,
61     kTagNameState,
62     kRCDATALessThanSignState,
63     kRCDATAEndTagOpenState,
64     kRCDATAEndTagNameState,
65     kRAWTEXTLessThanSignState,
66     kRAWTEXTEndTagOpenState,
67     kRAWTEXTEndTagNameState,
68     kScriptDataLessThanSignState,
69     kScriptDataEndTagOpenState,
70     kScriptDataEndTagNameState,
71     kScriptDataEscapeStartState,
72     kScriptDataEscapeStartDashState,
73     kScriptDataEscapedState,
74     kScriptDataEscapedDashState,
75     kScriptDataEscapedDashDashState,
76     kScriptDataEscapedLessThanSignState,
77     kScriptDataEscapedEndTagOpenState,
78     kScriptDataEscapedEndTagNameState,
79     kScriptDataDoubleEscapeStartState,
80     kScriptDataDoubleEscapedState,
81     kScriptDataDoubleEscapedDashState,
82     kScriptDataDoubleEscapedDashDashState,
83     kScriptDataDoubleEscapedLessThanSignState,
84     kScriptDataDoubleEscapeEndState,
85     kBeforeAttributeNameState,
86     kAttributeNameState,
87     kAfterAttributeNameState,
88     kBeforeAttributeValueState,
89     kAttributeValueDoubleQuotedState,
90     kAttributeValueSingleQuotedState,
91     kAttributeValueUnquotedState,
92     kCharacterReferenceInAttributeValueState,
93     kAfterAttributeValueQuotedState,
94     kSelfClosingStartTagState,
95     kBogusCommentState,
96     // The ContinueBogusCommentState is not in the HTML5 spec, but we use
97     // it internally to keep track of whether we've started the bogus
98     // comment token yet.
99     kContinueBogusCommentState,
100     kMarkupDeclarationOpenState,
101     kCommentStartState,
102     kCommentStartDashState,
103     kCommentState,
104     kCommentEndDashState,
105     kCommentEndState,
106     kCommentEndBangState,
107     kDOCTYPEState,
108     kBeforeDOCTYPENameState,
109     kDOCTYPENameState,
110     kAfterDOCTYPENameState,
111     kAfterDOCTYPEPublicKeywordState,
112     kBeforeDOCTYPEPublicIdentifierState,
113     kDOCTYPEPublicIdentifierDoubleQuotedState,
114     kDOCTYPEPublicIdentifierSingleQuotedState,
115     kAfterDOCTYPEPublicIdentifierState,
116     kBetweenDOCTYPEPublicAndSystemIdentifiersState,
117     kAfterDOCTYPESystemKeywordState,
118     kBeforeDOCTYPESystemIdentifierState,
119     kDOCTYPESystemIdentifierDoubleQuotedState,
120     kDOCTYPESystemIdentifierSingleQuotedState,
121     kAfterDOCTYPESystemIdentifierState,
122     kBogusDOCTYPEState,
123     kCDATASectionState,
124     kCDATASectionBracketState,
125     kCDATASectionEndState,
126   };
127 
128   // This function returns true if it emits a token. Otherwise, callers
129   // must provide the same (in progress) token on the next call (unless
130   // they call reset() first).
131   bool NextToken(SegmentedString&, HTMLToken&);
132 
133   // Returns a copy of any characters buffered internally by the tokenizer.
134   // The tokenizer buffers characters when searching for the </script> token
135   // that terminates a script element.
136   String BufferedCharacters() const;
137 
NumberOfBufferedCharacters()138   wtf_size_t NumberOfBufferedCharacters() const {
139     // Notice that we add 2 to the length of the temporary_buffer_ to
140     // account for the "</" characters, which are effectively buffered in
141     // the tokenizer's state machine.
142     return temporary_buffer_.size() ? temporary_buffer_.size() + 2 : 0;
143   }
144 
145   // Updates the tokenizer's state according to the given tag name. This is
146   // an approximation of how the tree builder would update the tokenizer's
147   // state. This method is useful for approximating HTML tokenization. To
148   // get exactly the correct tokenization, you need the real tree builder.
149   //
150   // The main failures in the approximation are as follows:
151   //
152   //  * The first set of character tokens emitted for a <pre> element might
153   //    contain an extra leading newline.
154   //  * The replacement of U+0000 with U+FFFD will not be sensitive to the
155   //    tree builder's insertion mode.
156   //  * CDATA sections in foreign content will be tokenized as bogus comments
157   //    instead of as character tokens.
158   //
159   void UpdateStateFor(const String& tag_name);
160 
ForceNullCharacterReplacement()161   bool ForceNullCharacterReplacement() const {
162     return force_null_character_replacement_;
163   }
SetForceNullCharacterReplacement(bool value)164   void SetForceNullCharacterReplacement(bool value) {
165     force_null_character_replacement_ = value;
166   }
167 
ShouldAllowCDATA()168   bool ShouldAllowCDATA() const { return should_allow_cdata_; }
SetShouldAllowCDATA(bool value)169   void SetShouldAllowCDATA(bool value) { should_allow_cdata_ = value; }
170 
GetState()171   State GetState() const { return state_; }
SetState(State state)172   void SetState(State state) { state_ = state; }
173 
ShouldSkipNullCharacters()174   inline bool ShouldSkipNullCharacters() const {
175     return !force_null_character_replacement_ &&
176            (state_ == HTMLTokenizer::kDataState ||
177             state_ == HTMLTokenizer::kRCDATAState ||
178             state_ == HTMLTokenizer::kRAWTEXTState);
179   }
180 
IsEndTagBufferingState(HTMLTokenizer::State state)181   inline static bool IsEndTagBufferingState(HTMLTokenizer::State state) {
182     switch (state) {
183       case HTMLTokenizer::kRCDATAEndTagOpenState:
184       case HTMLTokenizer::kRCDATAEndTagNameState:
185       case HTMLTokenizer::kRAWTEXTEndTagOpenState:
186       case HTMLTokenizer::kRAWTEXTEndTagNameState:
187       case HTMLTokenizer::kScriptDataEndTagOpenState:
188       case HTMLTokenizer::kScriptDataEndTagNameState:
189       case HTMLTokenizer::kScriptDataEscapedEndTagOpenState:
190       case HTMLTokenizer::kScriptDataEscapedEndTagNameState:
191         return true;
192       default:
193         return false;
194     }
195   }
196 
197  private:
198   inline bool ProcessEntity(SegmentedString&);
199 
200   inline void ParseError();
201 
BufferCharacter(UChar character)202   inline void BufferCharacter(UChar character) {
203     DCHECK_NE(character, kEndOfFileMarker);
204     token_->EnsureIsCharacterToken();
205     token_->AppendToCharacter(character);
206   }
207 
EmitAndResumeIn(SegmentedString & source,State state)208   inline bool EmitAndResumeIn(SegmentedString& source, State state) {
209     SaveEndTagNameIfNeeded();
210     state_ = state;
211     source.AdvanceAndUpdateLineNumber();
212     return true;
213   }
214 
EmitAndReconsumeIn(SegmentedString &,State state)215   inline bool EmitAndReconsumeIn(SegmentedString&, State state) {
216     SaveEndTagNameIfNeeded();
217     state_ = state;
218     return true;
219   }
220 
EmitEndOfFile(SegmentedString & source)221   inline bool EmitEndOfFile(SegmentedString& source) {
222     if (HaveBufferedCharacterToken())
223       return true;
224     state_ = HTMLTokenizer::kDataState;
225     source.AdvanceAndUpdateLineNumber();
226     token_->Clear();
227     token_->MakeEndOfFile();
228     return true;
229   }
230 
231   inline bool FlushEmitAndResumeIn(SegmentedString&, State);
232 
233   // Return whether we need to emit a character token before dealing with
234   // the buffered end tag.
235   inline bool FlushBufferedEndTag(SegmentedString&);
236   inline bool TemporaryBufferIs(const String&);
237 
238   // Sometimes we speculatively consume input characters and we don't
239   // know whether they represent end tags or RCDATA, etc. These
240   // functions help manage these state.
241   inline void AddToPossibleEndTag(LChar cc);
242 
SaveEndTagNameIfNeeded()243   inline void SaveEndTagNameIfNeeded() {
244     DCHECK_NE(token_->GetType(), HTMLToken::kUninitialized);
245     if (token_->GetType() == HTMLToken::kStartTag)
246       appropriate_end_tag_name_ = token_->GetName();
247   }
248   inline bool IsAppropriateEndTag();
249 
HaveBufferedCharacterToken()250   inline bool HaveBufferedCharacterToken() {
251     return token_->GetType() == HTMLToken::kCharacter;
252   }
253 
254   State state_;
255   bool force_null_character_replacement_;
256   bool should_allow_cdata_;
257 
258   // token_ is owned by the caller. If NextToken is not on the stack,
259   // this member might be pointing to unallocated memory.
260   HTMLToken* token_;
261 
262   // http://www.whatwg.org/specs/web-apps/current-work/#additional-allowed-character
263   UChar additional_allowed_character_;
264 
265   // http://www.whatwg.org/specs/web-apps/current-work/#preprocessing-the-input-stream
266   InputStreamPreprocessor<HTMLTokenizer> input_stream_preprocessor_;
267 
268   Vector<UChar, 32> appropriate_end_tag_name_;
269 
270   // http://www.whatwg.org/specs/web-apps/current-work/#temporary-buffer
271   Vector<LChar, 32> temporary_buffer_;
272 
273   // We occationally want to emit both a character token and an end tag
274   // token (e.g., when lexing script). We buffer the name of the end tag
275   // token here so we remember it next time we re-enter the tokenizer.
276   Vector<LChar, 32> buffered_end_tag_name_;
277 
278   HTMLParserOptions options_;
279 
280   DISALLOW_COPY_AND_ASSIGN(HTMLTokenizer);
281 };
282 
283 }  // namespace blink
284 
285 #endif
286