html/parser/html_tokenizer.h

/*
 * Copyright (C) 2008 Apple Inc. All Rights Reserved.
 * Copyright (C) 2010 Google, Inc. All Rights Reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE INC. OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#ifndef THIRD_PARTY_BLINK_RENDERER_CORE_HTML_PARSER_HTML_TOKENIZER_H_
#define THIRD_PARTY_BLINK_RENDERER_CORE_HTML_PARSER_HTML_TOKENIZER_H_

#include <memory>

#include "base/macros.h"
#include "base/memory/ptr_util.h"
#include "third_party/blink/renderer/core/core_export.h"
#include "third_party/blink/renderer/core/html/parser/html_parser_options.h"
#include "third_party/blink/renderer/core/html/parser/html_token.h"
#include "third_party/blink/renderer/core/html/parser/input_stream_preprocessor.h"
#include "third_party/blink/renderer/platform/text/segmented_string.h"

namespace blink {

class CORE_EXPORT HTMLTokenizer {
  USING_FAST_MALLOC(HTMLTokenizer);

 public:
  explicit HTMLTokenizer(const HTMLParserOptions&);
  ~HTMLTokenizer();

  void Reset();

  enum State {
    kDataState,
    kCharacterReferenceInDataState,
    kRCDATAState,
    kCharacterReferenceInRCDATAState,
    kRAWTEXTState,
    kScriptDataState,
    kPLAINTEXTState,
    kTagOpenState,
    kEndTagOpenState,
    kTagNameState,
    kRCDATALessThanSignState,
    kRCDATAEndTagOpenState,
    kRCDATAEndTagNameState,
    kRAWTEXTLessThanSignState,
    kRAWTEXTEndTagOpenState,
    kRAWTEXTEndTagNameState,
    kScriptDataLessThanSignState,
    kScriptDataEndTagOpenState,
    kScriptDataEndTagNameState,
    kScriptDataEscapeStartState,
    kScriptDataEscapeStartDashState,
    kScriptDataEscapedState,
    kScriptDataEscapedDashState,
    kScriptDataEscapedDashDashState,
    kScriptDataEscapedLessThanSignState,
    kScriptDataEscapedEndTagOpenState,
    kScriptDataEscapedEndTagNameState,
    kScriptDataDoubleEscapeStartState,
    kScriptDataDoubleEscapedState,
    kScriptDataDoubleEscapedDashState,
    kScriptDataDoubleEscapedDashDashState,
    kScriptDataDoubleEscapedLessThanSignState,
    kScriptDataDoubleEscapeEndState,
    kBeforeAttributeNameState,
    kAttributeNameState,
    kAfterAttributeNameState,
    kBeforeAttributeValueState,
    kAttributeValueDoubleQuotedState,
    kAttributeValueSingleQuotedState,
    kAttributeValueUnquotedState,
    kCharacterReferenceInAttributeValueState,
    kAfterAttributeValueQuotedState,
    kSelfClosingStartTagState,
    kBogusCommentState,
    // The ContinueBogusCommentState is not in the HTML5 spec, but we use
    // it internally to keep track of whether we've started the bogus
    // comment token yet.
    kContinueBogusCommentState,
    kMarkupDeclarationOpenState,
    kCommentStartState,
    kCommentStartDashState,
    kCommentState,
    kCommentEndDashState,
    kCommentEndState,
    kCommentEndBangState,
    kDOCTYPEState,
    kBeforeDOCTYPENameState,
    kDOCTYPENameState,
    kAfterDOCTYPENameState,
    kAfterDOCTYPEPublicKeywordState,
    kBeforeDOCTYPEPublicIdentifierState,
    kDOCTYPEPublicIdentifierDoubleQuotedState,
    kDOCTYPEPublicIdentifierSingleQuotedState,
    kAfterDOCTYPEPublicIdentifierState,
    kBetweenDOCTYPEPublicAndSystemIdentifiersState,
    kAfterDOCTYPESystemKeywordState,
    kBeforeDOCTYPESystemIdentifierState,
    kDOCTYPESystemIdentifierDoubleQuotedState,
    kDOCTYPESystemIdentifierSingleQuotedState,
    kAfterDOCTYPESystemIdentifierState,
    kBogusDOCTYPEState,
    kCDATASectionState,
    kCDATASectionBracketState,
    kCDATASectionEndState,
  };

  // This function returns true if it emits a token. Otherwise, callers
  // must provide the same (in progress) token on the next call (unless
  // they call reset() first).
  bool NextToken(SegmentedString&, HTMLToken&);

  // Returns a copy of any characters buffered internally by the tokenizer.
  // The tokenizer buffers characters when searching for the </script> token
  // that terminates a script element.
  String BufferedCharacters() const;

  wtf_size_t NumberOfBufferedCharacters() const {
    // Notice that we add 2 to the length of the temporary_buffer_ to
    // account for the "</" characters, which are effectively buffered in
    // the tokenizer's state machine.
    return temporary_buffer_.size() ? temporary_buffer_.size() + 2 : 0;
  }

  // Updates the tokenizer's state according to the given tag name. This is
  // an approximation of how the tree builder would update the tokenizer's
  // state. This method is useful for approximating HTML tokenization. To
  // get exactly the correct tokenization, you need the real tree builder.
  //
  // The main failures in the approximation are as follows:
  //
  //  * The first set of character tokens emitted for a <pre> element might
  //    contain an extra leading newline.
  //  * The replacement of U+0000 with U+FFFD will not be sensitive to the
  //    tree builder's insertion mode.
  //  * CDATA sections in foreign content will be tokenized as bogus comments
  //    instead of as character tokens.
  //
  void UpdateStateFor(const String& tag_name);

  bool ForceNullCharacterReplacement() const {
    return force_null_character_replacement_;
  }
  void SetForceNullCharacterReplacement(bool value) {
    force_null_character_replacement_ = value;
  }

  bool ShouldAllowCDATA() const { return should_allow_cdata_; }
  void SetShouldAllowCDATA(bool value) { should_allow_cdata_ = value; }

  State GetState() const { return state_; }
  void SetState(State state) { state_ = state; }

  inline bool ShouldSkipNullCharacters() const {
    return !force_null_character_replacement_ &&
           (state_ == HTMLTokenizer::kDataState ||
            state_ == HTMLTokenizer::kRCDATAState ||
            state_ == HTMLTokenizer::kRAWTEXTState);
  }

  inline static bool IsEndTagBufferingState(HTMLTokenizer::State state) {
    switch (state) {
      case HTMLTokenizer::kRCDATAEndTagOpenState:
      case HTMLTokenizer::kRCDATAEndTagNameState:
      case HTMLTokenizer::kRAWTEXTEndTagOpenState:
      case HTMLTokenizer::kRAWTEXTEndTagNameState:
      case HTMLTokenizer::kScriptDataEndTagOpenState:
      case HTMLTokenizer::kScriptDataEndTagNameState:
      case HTMLTokenizer::kScriptDataEscapedEndTagOpenState:
      case HTMLTokenizer::kScriptDataEscapedEndTagNameState:
        return true;
      default:
        return false;
    }
  }

 private:
  inline bool ProcessEntity(SegmentedString&);

  inline void ParseError();

  inline void BufferCharacter(UChar character) {
    DCHECK_NE(character, kEndOfFileMarker);
    token_->EnsureIsCharacterToken();
    token_->AppendToCharacter(character);
  }

  inline bool EmitAndResumeIn(SegmentedString& source, State state) {
    SaveEndTagNameIfNeeded();
    state_ = state;
    source.AdvanceAndUpdateLineNumber();
    return true;
  }

  inline bool EmitAndReconsumeIn(SegmentedString&, State state) {
    SaveEndTagNameIfNeeded();
    state_ = state;
    return true;
  }

  inline bool EmitEndOfFile(SegmentedString& source) {
    if (HaveBufferedCharacterToken())
      return true;
    state_ = HTMLTokenizer::kDataState;
    source.AdvanceAndUpdateLineNumber();
    token_->Clear();
    token_->MakeEndOfFile();
    return true;
  }

  inline bool FlushEmitAndResumeIn(SegmentedString&, State);

  // Return whether we need to emit a character token before dealing with
  // the buffered end tag.
  inline bool FlushBufferedEndTag(SegmentedString&);
  inline bool TemporaryBufferIs(const String&);

  // Sometimes we speculatively consume input characters and we don't
  // know whether they represent end tags or RCDATA, etc. These
  // functions help manage these state.
  inline void AddToPossibleEndTag(LChar cc);

  inline void SaveEndTagNameIfNeeded() {
    DCHECK_NE(token_->GetType(), HTMLToken::kUninitialized);
    if (token_->GetType() == HTMLToken::kStartTag)
      appropriate_end_tag_name_ = token_->GetName();
  }
  inline bool IsAppropriateEndTag();

  inline bool HaveBufferedCharacterToken() {
    return token_->GetType() == HTMLToken::kCharacter;
  }

  State state_;
  bool force_null_character_replacement_;
  bool should_allow_cdata_;

  // token_ is owned by the caller. If NextToken is not on the stack,
  // this member might be pointing to unallocated memory.
  HTMLToken* token_;

  // http://www.whatwg.org/specs/web-apps/current-work/#additional-allowed-character
  UChar additional_allowed_character_;

  // http://www.whatwg.org/specs/web-apps/current-work/#preprocessing-the-input-stream
  InputStreamPreprocessor<HTMLTokenizer> input_stream_preprocessor_;

  Vector<UChar, 32> appropriate_end_tag_name_;

  // http://www.whatwg.org/specs/web-apps/current-work/#temporary-buffer
  Vector<LChar, 32> temporary_buffer_;

  // We occationally want to emit both a character token and an end tag
  // token (e.g., when lexing script). We buffer the name of the end tag
  // token here so we remember it next time we re-enter the tokenizer.
  Vector<LChar, 32> buffered_end_tag_name_;

  HTMLParserOptions options_;

  DISALLOW_COPY_AND_ASSIGN(HTMLTokenizer);
};

}  // namespace blink

#endif