// Copyright 2014-2017 The html5ever Project Developers. See the
// COPYRIGHT file at the top-level directory of this distribution.
//
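// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your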
// option. This file may not be copied, modified, or distributed
// except according to those terms.
//! High-level interface to the parser.
use crate::buffer_queue::BufferQueue;
use crate::tokenizer::{Tokenizer, TokenizerOpts, TokenizerResult};
use crate::tree_builder::{create_element, TreeBuilder, TreeBuilderOpts, TreeSink};
use crate::{Attribute, QualName};
use std::borrow::Cow;
use crate::tendril;
use crate::tendril::stream::{TendrilSink, Utf8LossyDecoder};
use crate::tendril::StrTendril;
/// Options for the whole parser, bundling the settings of both stages.
///
/// `ParseOpts::default()` uses the default value of each field.
#[derive(Default, Clone)]
pub struct ParseOpts {
    /// Options passed to the tokenizer.
    pub tokenizer: TokenizerOpts,

    /// Options passed to the tree builder.
    pub tree_builder: TreeBuilderOpts,
}
/// Parse an HTML document.
///
/// Wraps `sink` in a tree builder, wraps that in a tokenizer, and returns a
/// [`Parser`] whose input queue starts out empty.
///
/// The returned value implements `tendril::TendrilSink`
/// so that Unicode input may be provided incrementally,
/// or all at once with the `one` method.
///
/// If your input is bytes, use `Parser::from_utf8`.
pub fn parse_document<Sink>(sink: Sink, opts: ParseOpts) -> Parser<Sink>
where
    Sink: TreeSink,
{
    let tb = TreeBuilder::new(sink, opts.tree_builder);
    let tok = Tokenizer::new(tb, opts.tokenizer);
    Parser {
        tokenizer: tok,
        input_buffer: BufferQueue::new(),
    }
}
/// Parse an HTML fragment
///
/// The returned value implements `tendril::TendrilSink`
/// so that Unicode input may be provided incrementally,
/// or all at once with the `one` method.
///
/// If your input is bytes, use `Parser::from_utf8`.
pub fn parse_fragment(
mut sink: Sink,
opts: ParseOpts,
context_name: QualName,
context_attrs: Vec,
) -> Parser
where
Sink: TreeSink,
{
let context_elem = create_element(&mut sink, context_name, context_attrs);
parse_fragment_for_element(sink, opts, context_elem, None)
}
/// Like `parse_fragment`, but with an existing context element
/// and optionally a form element.
///
/// The tokenizer's initial state is overridden with the state the tree
/// builder reports for the context element; every other tokenizer option is
/// taken unchanged from `opts.tokenizer`.
pub fn parse_fragment_for_element<Sink>(
    sink: Sink,
    opts: ParseOpts,
    context_element: Sink::Handle,
    form_element: Option<Sink::Handle>,
) -> Parser<Sink>
where
    Sink: TreeSink,
{
    let tb = TreeBuilder::new_for_fragment(sink, context_element, form_element, opts.tree_builder);
    // The tree builder must be asked for the state before it is moved into
    // the tokenizer below.
    let tok_opts = TokenizerOpts {
        initial_state: Some(tb.tokenizer_state_for_context_elem()),
        ..opts.tokenizer
    };
    let tok = Tokenizer::new(tb, tok_opts);
    Parser {
        tokenizer: tok,
        input_buffer: BufferQueue::new(),
    }
}
/// An HTML parser,
/// ready to receive Unicode input through the `tendril::TendrilSink` trait’s methods.
pub struct Parser
where
Sink: TreeSink,
{
pub tokenizer: Tokenizer>,
pub input_buffer: BufferQueue,
}
impl TendrilSink for Parser {
fn process(&mut self, t: StrTendril) {
self.input_buffer.push_back(t);
// FIXME: Properly support somehow.
while let TokenizerResult::Script(_) = self.tokenizer.feed(&mut self.input_buffer) {}
}
// FIXME: Is it too noisy to report every character decoding error?
fn error(&mut self, desc: Cow<'static, str>) {
self.tokenizer.sink.sink.parse_error(desc)
}
type Output = Sink::Output;
fn finish(mut self) -> Self::Output {
// FIXME: Properly support somehow.
while let TokenizerResult::Script(_) = self.tokenizer.feed(&mut self.input_buffer) {}
assert!(self.input_buffer.is_empty());
self.tokenizer.end();
self.tokenizer.sink.sink.finish()
}
}
impl Parser {
/// Wrap this parser into a `TendrilSink` that accepts UTF-8 bytes.
///
/// Use this when your input is bytes that are known to be in the UTF-8 encoding.
/// Decoding is lossy, like `String::from_utf8_lossy`.
pub fn from_utf8(self) -> Utf8LossyDecoder {
Utf8LossyDecoder::new(self)
}
}