1 // Copyright 2014-2017 The html5ever Project Developers. See the
2 // COPYRIGHT file at the top-level directory of this distribution.
3 //
4 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
5 // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
6 // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
7 // option. This file may not be copied, modified, or distributed
8 // except according to those terms.
9 
10 //! High-level interface to the parser.
11 
12 use crate::buffer_queue::BufferQueue;
13 use crate::tokenizer::{Tokenizer, TokenizerOpts, TokenizerResult};
14 use crate::tree_builder::{create_element, TreeBuilder, TreeBuilderOpts, TreeSink};
15 use crate::{Attribute, QualName};
16 
17 use std::borrow::Cow;
18 
19 use crate::tendril;
20 use crate::tendril::stream::{TendrilSink, Utf8LossyDecoder};
21 use crate::tendril::StrTendril;
22 
23 /// All-encompassing options struct for the parser.
24 #[derive(Clone, Default)]
25 pub struct ParseOpts {
26     /// Tokenizer options.
27     pub tokenizer: TokenizerOpts,
28 
29     /// Tree builder options.
30     pub tree_builder: TreeBuilderOpts,
31 }
32 
33 /// Parse an HTML document
34 ///
35 /// The returned value implements `tendril::TendrilSink`
36 /// so that Unicode input may be provided incrementally,
37 /// or all at once with the `one` method.
38 ///
39 /// If your input is bytes, use `Parser::from_utf8`.
parse_document<Sink>(sink: Sink, opts: ParseOpts) -> Parser<Sink> where Sink: TreeSink,40 pub fn parse_document<Sink>(sink: Sink, opts: ParseOpts) -> Parser<Sink>
41 where
42     Sink: TreeSink,
43 {
44     let tb = TreeBuilder::new(sink, opts.tree_builder);
45     let tok = Tokenizer::new(tb, opts.tokenizer);
46     Parser {
47         tokenizer: tok,
48         input_buffer: BufferQueue::new(),
49     }
50 }
51 
52 /// Parse an HTML fragment
53 ///
54 /// The returned value implements `tendril::TendrilSink`
55 /// so that Unicode input may be provided incrementally,
56 /// or all at once with the `one` method.
57 ///
58 /// If your input is bytes, use `Parser::from_utf8`.
parse_fragment<Sink>( mut sink: Sink, opts: ParseOpts, context_name: QualName, context_attrs: Vec<Attribute>, ) -> Parser<Sink> where Sink: TreeSink,59 pub fn parse_fragment<Sink>(
60     mut sink: Sink,
61     opts: ParseOpts,
62     context_name: QualName,
63     context_attrs: Vec<Attribute>,
64 ) -> Parser<Sink>
65 where
66     Sink: TreeSink,
67 {
68     let context_elem = create_element(&mut sink, context_name, context_attrs);
69     parse_fragment_for_element(sink, opts, context_elem, None)
70 }
71 
72 /// Like `parse_fragment`, but with an existing context element
73 /// and optionally a form element.
parse_fragment_for_element<Sink>( sink: Sink, opts: ParseOpts, context_element: Sink::Handle, form_element: Option<Sink::Handle>, ) -> Parser<Sink> where Sink: TreeSink,74 pub fn parse_fragment_for_element<Sink>(
75     sink: Sink,
76     opts: ParseOpts,
77     context_element: Sink::Handle,
78     form_element: Option<Sink::Handle>,
79 ) -> Parser<Sink>
80 where
81     Sink: TreeSink,
82 {
83     let tb = TreeBuilder::new_for_fragment(sink, context_element, form_element, opts.tree_builder);
84     let tok_opts = TokenizerOpts {
85         initial_state: Some(tb.tokenizer_state_for_context_elem()),
86         ..opts.tokenizer
87     };
88     let tok = Tokenizer::new(tb, tok_opts);
89     Parser {
90         tokenizer: tok,
91         input_buffer: BufferQueue::new(),
92     }
93 }
94 
95 /// An HTML parser,
96 /// ready to receive Unicode input through the `tendril::TendrilSink` trait’s methods.
97 pub struct Parser<Sink>
98 where
99     Sink: TreeSink,
100 {
101     pub tokenizer: Tokenizer<TreeBuilder<Sink::Handle, Sink>>,
102     pub input_buffer: BufferQueue,
103 }
104 
105 impl<Sink: TreeSink> TendrilSink<tendril::fmt::UTF8> for Parser<Sink> {
process(&mut self, t: StrTendril)106     fn process(&mut self, t: StrTendril) {
107         self.input_buffer.push_back(t);
108         // FIXME: Properly support </script> somehow.
109         while let TokenizerResult::Script(_) = self.tokenizer.feed(&mut self.input_buffer) {}
110     }
111 
112     // FIXME: Is it too noisy to report every character decoding error?
error(&mut self, desc: Cow<'static, str>)113     fn error(&mut self, desc: Cow<'static, str>) {
114         self.tokenizer.sink.sink.parse_error(desc)
115     }
116 
117     type Output = Sink::Output;
118 
finish(mut self) -> Self::Output119     fn finish(mut self) -> Self::Output {
120         // FIXME: Properly support </script> somehow.
121         while let TokenizerResult::Script(_) = self.tokenizer.feed(&mut self.input_buffer) {}
122         assert!(self.input_buffer.is_empty());
123         self.tokenizer.end();
124         self.tokenizer.sink.sink.finish()
125     }
126 }
127 
128 impl<Sink: TreeSink> Parser<Sink> {
129     /// Wrap this parser into a `TendrilSink` that accepts UTF-8 bytes.
130     ///
131     /// Use this when your input is bytes that are known to be in the UTF-8 encoding.
132     /// Decoding is lossy, like `String::from_utf8_lossy`.
from_utf8(self) -> Utf8LossyDecoder<Self>133     pub fn from_utf8(self) -> Utf8LossyDecoder<Self> {
134         Utf8LossyDecoder::new(self)
135     }
136 }
137