1 /* 2 * Copyright (C) 2008 Apple Inc. All Rights Reserved. 3 * Copyright (C) 2010 Google, Inc. All Rights Reserved. 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions 7 * are met: 8 * 1. Redistributions of source code must retain the above copyright 9 * notice, this list of conditions and the following disclaimer. 10 * 2. Redistributions in binary form must reproduce the above copyright 11 * notice, this list of conditions and the following disclaimer in the 12 * documentation and/or other materials provided with the distribution. 13 * 14 * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY 15 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 17 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR 18 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 19 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 20 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 21 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 22 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 24 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 25 */ 26 27 #ifndef THIRD_PARTY_BLINK_RENDERER_CORE_HTML_PARSER_HTML_TOKENIZER_H_ 28 #define THIRD_PARTY_BLINK_RENDERER_CORE_HTML_PARSER_HTML_TOKENIZER_H_ 29 30 #include <memory> 31 32 #include "base/macros.h" 33 #include "base/memory/ptr_util.h" 34 #include "third_party/blink/renderer/core/core_export.h" 35 #include "third_party/blink/renderer/core/html/parser/html_parser_options.h" 36 #include "third_party/blink/renderer/core/html/parser/html_token.h" 37 #include "third_party/blink/renderer/core/html/parser/input_stream_preprocessor.h" 38 #include "third_party/blink/renderer/platform/text/segmented_string.h" 39 40 namespace blink { 41 42 class CORE_EXPORT HTMLTokenizer { 43 USING_FAST_MALLOC(HTMLTokenizer); 44 45 public: 46 explicit HTMLTokenizer(const HTMLParserOptions&); 47 ~HTMLTokenizer(); 48 49 void Reset(); 50 51 enum State { 52 kDataState, 53 kCharacterReferenceInDataState, 54 kRCDATAState, 55 kCharacterReferenceInRCDATAState, 56 kRAWTEXTState, 57 kScriptDataState, 58 kPLAINTEXTState, 59 kTagOpenState, 60 kEndTagOpenState, 61 kTagNameState, 62 kRCDATALessThanSignState, 63 kRCDATAEndTagOpenState, 64 kRCDATAEndTagNameState, 65 kRAWTEXTLessThanSignState, 66 kRAWTEXTEndTagOpenState, 67 kRAWTEXTEndTagNameState, 68 kScriptDataLessThanSignState, 69 kScriptDataEndTagOpenState, 70 kScriptDataEndTagNameState, 71 kScriptDataEscapeStartState, 72 kScriptDataEscapeStartDashState, 73 kScriptDataEscapedState, 74 kScriptDataEscapedDashState, 75 kScriptDataEscapedDashDashState, 76 kScriptDataEscapedLessThanSignState, 77 kScriptDataEscapedEndTagOpenState, 78 kScriptDataEscapedEndTagNameState, 79 kScriptDataDoubleEscapeStartState, 80 kScriptDataDoubleEscapedState, 81 kScriptDataDoubleEscapedDashState, 82 kScriptDataDoubleEscapedDashDashState, 83 kScriptDataDoubleEscapedLessThanSignState, 84 kScriptDataDoubleEscapeEndState, 85 kBeforeAttributeNameState, 86 kAttributeNameState, 87 kAfterAttributeNameState, 88 kBeforeAttributeValueState, 89 kAttributeValueDoubleQuotedState, 90 kAttributeValueSingleQuotedState, 91 kAttributeValueUnquotedState, 92 kCharacterReferenceInAttributeValueState, 93 kAfterAttributeValueQuotedState, 94 kSelfClosingStartTagState, 95 kBogusCommentState, 96 // The ContinueBogusCommentState is not in the HTML5 spec, but we use 97 // it internally to keep track of whether we've started the bogus 98 // comment token yet. 99 kContinueBogusCommentState, 100 kMarkupDeclarationOpenState, 101 kCommentStartState, 102 kCommentStartDashState, 103 kCommentState, 104 kCommentEndDashState, 105 kCommentEndState, 106 kCommentEndBangState, 107 kDOCTYPEState, 108 kBeforeDOCTYPENameState, 109 kDOCTYPENameState, 110 kAfterDOCTYPENameState, 111 kAfterDOCTYPEPublicKeywordState, 112 kBeforeDOCTYPEPublicIdentifierState, 113 kDOCTYPEPublicIdentifierDoubleQuotedState, 114 kDOCTYPEPublicIdentifierSingleQuotedState, 115 kAfterDOCTYPEPublicIdentifierState, 116 kBetweenDOCTYPEPublicAndSystemIdentifiersState, 117 kAfterDOCTYPESystemKeywordState, 118 kBeforeDOCTYPESystemIdentifierState, 119 kDOCTYPESystemIdentifierDoubleQuotedState, 120 kDOCTYPESystemIdentifierSingleQuotedState, 121 kAfterDOCTYPESystemIdentifierState, 122 kBogusDOCTYPEState, 123 kCDATASectionState, 124 kCDATASectionBracketState, 125 kCDATASectionEndState, 126 }; 127 128 // This function returns true if it emits a token. Otherwise, callers 129 // must provide the same (in progress) token on the next call (unless 130 // they call reset() first). 131 bool NextToken(SegmentedString&, HTMLToken&); 132 133 // Returns a copy of any characters buffered internally by the tokenizer. 134 // The tokenizer buffers characters when searching for the </script> token 135 // that terminates a script element. 136 String BufferedCharacters() const; 137 NumberOfBufferedCharacters()138 wtf_size_t NumberOfBufferedCharacters() const { 139 // Notice that we add 2 to the length of the temporary_buffer_ to 140 // account for the "</" characters, which are effectively buffered in 141 // the tokenizer's state machine. 142 return temporary_buffer_.size() ? temporary_buffer_.size() + 2 : 0; 143 } 144 145 // Updates the tokenizer's state according to the given tag name. This is 146 // an approximation of how the tree builder would update the tokenizer's 147 // state. This method is useful for approximating HTML tokenization. To 148 // get exactly the correct tokenization, you need the real tree builder. 149 // 150 // The main failures in the approximation are as follows: 151 // 152 // * The first set of character tokens emitted for a <pre> element might 153 // contain an extra leading newline. 154 // * The replacement of U+0000 with U+FFFD will not be sensitive to the 155 // tree builder's insertion mode. 156 // * CDATA sections in foreign content will be tokenized as bogus comments 157 // instead of as character tokens. 158 // 159 void UpdateStateFor(const String& tag_name); 160 ForceNullCharacterReplacement()161 bool ForceNullCharacterReplacement() const { 162 return force_null_character_replacement_; 163 } SetForceNullCharacterReplacement(bool value)164 void SetForceNullCharacterReplacement(bool value) { 165 force_null_character_replacement_ = value; 166 } 167 ShouldAllowCDATA()168 bool ShouldAllowCDATA() const { return should_allow_cdata_; } SetShouldAllowCDATA(bool value)169 void SetShouldAllowCDATA(bool value) { should_allow_cdata_ = value; } 170 GetState()171 State GetState() const { return state_; } SetState(State state)172 void SetState(State state) { state_ = state; } 173 ShouldSkipNullCharacters()174 inline bool ShouldSkipNullCharacters() const { 175 return !force_null_character_replacement_ && 176 (state_ == HTMLTokenizer::kDataState || 177 state_ == HTMLTokenizer::kRCDATAState || 178 state_ == HTMLTokenizer::kRAWTEXTState); 179 } 180 IsEndTagBufferingState(HTMLTokenizer::State state)181 inline static bool IsEndTagBufferingState(HTMLTokenizer::State state) { 182 switch (state) { 183 case HTMLTokenizer::kRCDATAEndTagOpenState: 184 case HTMLTokenizer::kRCDATAEndTagNameState: 185 case HTMLTokenizer::kRAWTEXTEndTagOpenState: 186 case HTMLTokenizer::kRAWTEXTEndTagNameState: 187 case HTMLTokenizer::kScriptDataEndTagOpenState: 188 case HTMLTokenizer::kScriptDataEndTagNameState: 189 case HTMLTokenizer::kScriptDataEscapedEndTagOpenState: 190 case HTMLTokenizer::kScriptDataEscapedEndTagNameState: 191 return true; 192 default: 193 return false; 194 } 195 } 196 197 private: 198 inline bool ProcessEntity(SegmentedString&); 199 200 inline void ParseError(); 201 BufferCharacter(UChar character)202 inline void BufferCharacter(UChar character) { 203 DCHECK_NE(character, kEndOfFileMarker); 204 token_->EnsureIsCharacterToken(); 205 token_->AppendToCharacter(character); 206 } 207 EmitAndResumeIn(SegmentedString & source,State state)208 inline bool EmitAndResumeIn(SegmentedString& source, State state) { 209 SaveEndTagNameIfNeeded(); 210 state_ = state; 211 source.AdvanceAndUpdateLineNumber(); 212 return true; 213 } 214 EmitAndReconsumeIn(SegmentedString &,State state)215 inline bool EmitAndReconsumeIn(SegmentedString&, State state) { 216 SaveEndTagNameIfNeeded(); 217 state_ = state; 218 return true; 219 } 220 EmitEndOfFile(SegmentedString & source)221 inline bool EmitEndOfFile(SegmentedString& source) { 222 if (HaveBufferedCharacterToken()) 223 return true; 224 state_ = HTMLTokenizer::kDataState; 225 source.AdvanceAndUpdateLineNumber(); 226 token_->Clear(); 227 token_->MakeEndOfFile(); 228 return true; 229 } 230 231 inline bool FlushEmitAndResumeIn(SegmentedString&, State); 232 233 // Return whether we need to emit a character token before dealing with 234 // the buffered end tag. 235 inline bool FlushBufferedEndTag(SegmentedString&); 236 inline bool TemporaryBufferIs(const String&); 237 238 // Sometimes we speculatively consume input characters and we don't 239 // know whether they represent end tags or RCDATA, etc. These 240 // functions help manage these state. 241 inline void AddToPossibleEndTag(LChar cc); 242 SaveEndTagNameIfNeeded()243 inline void SaveEndTagNameIfNeeded() { 244 DCHECK_NE(token_->GetType(), HTMLToken::kUninitialized); 245 if (token_->GetType() == HTMLToken::kStartTag) 246 appropriate_end_tag_name_ = token_->GetName(); 247 } 248 inline bool IsAppropriateEndTag(); 249 HaveBufferedCharacterToken()250 inline bool HaveBufferedCharacterToken() { 251 return token_->GetType() == HTMLToken::kCharacter; 252 } 253 254 State state_; 255 bool force_null_character_replacement_; 256 bool should_allow_cdata_; 257 258 // token_ is owned by the caller. If NextToken is not on the stack, 259 // this member might be pointing to unallocated memory. 260 HTMLToken* token_; 261 262 // http://www.whatwg.org/specs/web-apps/current-work/#additional-allowed-character 263 UChar additional_allowed_character_; 264 265 // http://www.whatwg.org/specs/web-apps/current-work/#preprocessing-the-input-stream 266 InputStreamPreprocessor<HTMLTokenizer> input_stream_preprocessor_; 267 268 Vector<UChar, 32> appropriate_end_tag_name_; 269 270 // http://www.whatwg.org/specs/web-apps/current-work/#temporary-buffer 271 Vector<LChar, 32> temporary_buffer_; 272 273 // We occationally want to emit both a character token and an end tag 274 // token (e.g., when lexing script). We buffer the name of the end tag 275 // token here so we remember it next time we re-enter the tokenizer. 276 Vector<LChar, 32> buffered_end_tag_name_; 277 278 HTMLParserOptions options_; 279 280 DISALLOW_COPY_AND_ASSIGN(HTMLTokenizer); 281 }; 282 283 } // namespace blink 284 285 #endif 286