1# encoding: utf-8
2"""Use the HTMLParser library to parse HTML files that aren't too bad."""
3
4# Use of this source code is governed by the MIT license.
5__license__ = "MIT"
6
7__all__ = [
8    'HTMLParserTreeBuilder',
9    ]
10
11from html.parser import HTMLParser
12
# HTMLParseError disappeared from html.parser in Python 3.5. feed()
# below still catches it, so define a placeholder when the import fails.
try:
    from html.parser import HTMLParseError
except ImportError as e:
    # HTMLParseError is removed in Python 3.5. Since it can never be
    # thrown in 3.5, we can just define our own class as a placeholder.
    class HTMLParseError(Exception):
        pass
20
21import sys
22import warnings
23
# Starting in Python 3.2, the HTMLParser constructor takes a 'strict'
# argument, which we'd like to set to False. Unfortunately,
# http://bugs.python.org/issue13273 makes strict=True a better bet
# before Python 3.2.3.
#
# At the end of this file, we monkeypatch HTMLParser so that
# strict=True works well on Python 3.2.2.
major, minor, release = sys.version_info[:3]
# True only for 3.2.3+ within the 3.2 line; the monkeypatch at the end
# of this file sets this to True for earlier 3.2 releases as well.
CONSTRUCTOR_TAKES_STRICT = major == 3 and minor == 2 and release >= 3
# On the 3.3 line, passing 'strict' triggers a deprecation warning, so
# the constructor below avoids it there.
CONSTRUCTOR_STRICT_IS_DEPRECATED = major == 3 and minor == 3
# 'convert_charrefs' was added in 3.4; when available we pass False so
# character references are delivered to our handlers as events.
CONSTRUCTOR_TAKES_CONVERT_CHARREFS = major == 3 and minor >= 4
35
36
37from bs4.element import (
38    CData,
39    Comment,
40    Declaration,
41    Doctype,
42    ProcessingInstruction,
43    )
44from bs4.dammit import EntitySubstitution, UnicodeDammit
45
46from bs4.builder import (
47    HTML,
48    HTMLTreeBuilder,
49    STRICT,
50    )
51
52
# Identifier for this tree builder; used as its NAME and in its
# 'features' list below.
HTMLPARSER = 'html.parser'
54
class BeautifulSoupHTMLParser(HTMLParser):
    """A subclass of the Python standard library's HTMLParser class, which
    listens for HTMLParser events and translates them into calls
    to Beautiful Soup's tree construction API.
    """

    # Strategies for handling duplicate attributes within a single tag.
    IGNORE = 'ignore'
    REPLACE = 'replace'

    def __init__(self, *args, **kwargs):
        """Constructor.

        :param on_duplicate_attribute: A strategy for what to do if a
            tag includes the same attribute more than once. Accepted
            values are: REPLACE (replace earlier values with later
            ones, the default), IGNORE (keep the earliest value
            encountered), or a callable. A callable must take three
            arguments: the dictionary of attributes already processed,
            the name of the duplicate attribute, and the most recent value
            encountered.
        """
        self.on_duplicate_attribute = kwargs.pop(
            'on_duplicate_attribute', self.REPLACE
        )
        HTMLParser.__init__(self, *args, **kwargs)

        # Keep a list of empty-element tags that were encountered
        # without an explicit closing tag. If we encounter a closing tag
        # of this type, we'll associate it with one of those entries.
        #
        # This isn't a stack because we don't care about the
        # order. It's a list of closing tags we've already handled and
        # will ignore, assuming they ever show up.
        self.already_closed_empty_element = []

    def error(self, msg):
        """In Python 3, HTMLParser subclasses must implement error(), although
        this requirement doesn't appear to be documented.

        In Python 2, HTMLParser implements error() by raising an exception,
        which we don't want to do.

        In any event, this method is called only on very strange
        markup and our best strategy is to pretend it didn't happen
        and keep going.
        """
        warnings.warn(msg)

    def handle_startendtag(self, name, attrs):
        """Handle an incoming empty-element tag.

        This is only called when the markup looks like <tag/>.

        :param name: Name of the tag.
        :param attrs: A list of (name, value) attribute pairs.
        """
        # handle_empty_element=False tells handle_starttag not to close
        # the tag just because its name matches a known empty-element
        # tag. We know that this is an empty-element tag and we want to
        # call handle_endtag ourselves.
        self.handle_starttag(name, attrs, handle_empty_element=False)
        self.handle_endtag(name)

    def handle_starttag(self, name, attrs, handle_empty_element=True):
        """Handle an opening tag, e.g. '<tag>'

        :param name: Name of the tag.
        :param attrs: A list of (name, value) attribute pairs, as
            delivered by HTMLParser. A value may be None for a bare
            attribute like '<input disabled>'.
        :param handle_empty_element: True if this tag is known to be
            an empty-element tag (i.e. there is not expected to be any
            closing tag).
        """
        # XXX namespace
        attr_dict = {}
        for key, value in attrs:
            # Change None attribute values to the empty string
            # for consistency with the other tree builders.
            if value is None:
                value = ''
            if key in attr_dict:
                # A single attribute shows up multiple times in this
                # tag. How to handle it depends on the
                # on_duplicate_attribute setting.
                on_dupe = self.on_duplicate_attribute
                if on_dupe == self.IGNORE:
                    pass
                elif on_dupe in (None, self.REPLACE):
                    attr_dict[key] = value
                else:
                    # A callable decides how to combine the values.
                    on_dupe(attr_dict, key, value)
            else:
                attr_dict[key] = value
        #print("START", name)
        sourceline, sourcepos = self.getpos()
        tag = self.soup.handle_starttag(
            name, None, None, attr_dict, sourceline=sourceline,
            sourcepos=sourcepos
        )
        if tag and tag.is_empty_element and handle_empty_element:
            # Unlike other parsers, html.parser doesn't send separate end tag
            # events for empty-element tags. (It's handled in
            # handle_startendtag, but only if the original markup looked like
            # <tag/>.)
            #
            # So we need to call handle_endtag() ourselves. Since we
            # know the start event is identical to the end event, we
            # don't want handle_endtag() to cross off any previous end
            # events for tags of this name.
            self.handle_endtag(name, check_already_closed=False)

            # But we might encounter an explicit closing tag for this tag
            # later on. If so, we want to ignore it.
            self.already_closed_empty_element.append(name)

    def handle_endtag(self, name, check_already_closed=True):
        """Handle a closing tag, e.g. '</tag>'

        :param name: A tag name.
        :param check_already_closed: If True, check whether this end
            tag is the redundant closing portion of an empty-element
            tag whose end event was already generated (e.g. the
            '</br>' in '<br></br>'), and if so discard it.
        """
        #print("END", name)
        if check_already_closed and name in self.already_closed_empty_element:
            # This is a redundant end tag for an empty-element tag.
            # We've already called handle_endtag() for it, so just
            # check it off the list.
            #print("ALREADY CLOSED", name)
            self.already_closed_empty_element.remove(name)
        else:
            self.soup.handle_endtag(name)

    def handle_data(self, data):
        """Handle some textual data that shows up between tags."""
        self.soup.handle_data(data)

    def handle_charref(self, name):
        """Handle a numeric character reference by converting it to the
        corresponding Unicode character and treating it as textual
        data.

        :param name: Character number, possibly in hexadecimal.
        """
        # XXX workaround for a bug in HTMLParser. Remove this once
        # it's fixed in all supported versions.
        # http://bugs.python.org/issue13633
        if name.startswith('x'):
            real_name = int(name.lstrip('x'), 16)
        elif name.startswith('X'):
            real_name = int(name.lstrip('X'), 16)
        else:
            real_name = int(name)

        data = None
        if real_name < 256:
            # HTML numeric entities are supposed to reference Unicode
            # code points, but sometimes they reference code points in
            # some other encoding (ahem, Windows-1252). E.g. &#147;
            # instead of &#x201C; for LEFT DOUBLE QUOTATION MARK. This
            # code tries to detect this situation and compensate.
            for encoding in (self.soup.original_encoding, 'windows-1252'):
                if not encoding:
                    continue
                try:
                    data = bytearray([real_name]).decode(encoding)
                except UnicodeDecodeError:
                    pass
                # NOTE: there is deliberately no break here. If
                # windows-1252 can decode the byte, its result
                # overwrites the original encoding's; for references
                # in the C1 range (&#128;-&#159;) this appears to
                # match HTML5's windows-1252 mapping -- confirm before
                # changing.
        if not data:
            try:
                data = chr(real_name)
            except (ValueError, OverflowError):
                pass
        # If nothing could represent the reference, substitute U+FFFD.
        data = data or "\N{REPLACEMENT CHARACTER}"
        self.handle_data(data)

    def handle_entityref(self, name):
        """Handle a named entity reference by converting it to the
        corresponding Unicode character(s) and treating it as textual
        data.

        :param name: Name of the entity reference.
        """
        character = EntitySubstitution.HTML_ENTITY_TO_CHARACTER.get(name)
        if character is not None:
            data = character
        else:
            # If this were XML, it would be ambiguous whether "&foo"
            # was a character entity reference with a missing
            # semicolon or the literal string "&foo". Since this is
            # HTML, we have a complete list of all character entity references,
            # and this one wasn't found, so assume it's the literal string "&foo".
            data = "&%s" % name
        self.handle_data(data)

    def handle_comment(self, data):
        """Handle an HTML comment.

        :param data: The text of the comment.
        """
        self.soup.endData()
        self.soup.handle_data(data)
        self.soup.endData(Comment)

    def handle_decl(self, data):
        """Handle a DOCTYPE declaration.

        :param data: The text of the declaration.
        """
        self.soup.endData()
        # Strip the leading "DOCTYPE " so only the declaration's
        # content becomes the Doctype node.
        data = data[len("DOCTYPE "):]
        self.soup.handle_data(data)
        self.soup.endData(Doctype)

    def unknown_decl(self, data):
        """Handle a declaration of unknown type -- probably a CDATA block.

        :param data: The text of the declaration.
        """
        if data.upper().startswith('CDATA['):
            cls = CData
            data = data[len('CDATA['):]
        else:
            cls = Declaration
        self.soup.endData()
        self.soup.handle_data(data)
        self.soup.endData(cls)

    def handle_pi(self, data):
        """Handle a processing instruction.

        :param data: The text of the instruction.
        """
        self.soup.endData()
        self.soup.handle_data(data)
        self.soup.endData(ProcessingInstruction)
292
293
class HTMLParserTreeBuilder(HTMLTreeBuilder):
    """A Beautiful soup `TreeBuilder` that uses the `HTMLParser` parser,
    found in the Python standard library.
    """
    is_xml = False
    picklable = True
    NAME = HTMLPARSER
    features = [NAME, HTML, STRICT]

    # The html.parser knows which line number and position in the
    # original file is the source of an element.
    TRACKS_LINE_NUMBERS = True

    def __init__(self, parser_args=None, parser_kwargs=None, **kwargs):
        """Constructor.

        :param parser_args: Positional arguments to pass into
            the BeautifulSoupHTMLParser constructor, once it's
            invoked.
        :param parser_kwargs: Keyword arguments to pass into
            the BeautifulSoupHTMLParser constructor, once it's
            invoked.
        :param kwargs: Keyword arguments for the superclass constructor.
        """
        # Some keyword arguments will be pulled out of kwargs and placed
        # into parser_kwargs.
        extra_parser_kwargs = dict()
        for arg in ('on_duplicate_attribute',):
            if arg in kwargs:
                value = kwargs.pop(arg)
                extra_parser_kwargs[arg] = value
        super(HTMLParserTreeBuilder, self).__init__(**kwargs)
        parser_args = parser_args or []
        parser_kwargs = parser_kwargs or {}
        parser_kwargs.update(extra_parser_kwargs)
        if CONSTRUCTOR_TAKES_STRICT and not CONSTRUCTOR_STRICT_IS_DEPRECATED:
            parser_kwargs['strict'] = False
        if CONSTRUCTOR_TAKES_CONVERT_CHARREFS:
            # Make html.parser deliver character references as events
            # instead of converting them to text itself, so our
            # handle_charref/handle_entityref handlers run.
            parser_kwargs['convert_charrefs'] = False
        self.parser_args = (parser_args, parser_kwargs)

    def prepare_markup(self, markup, user_specified_encoding=None,
                       document_declared_encoding=None, exclude_encodings=None):
        """Run any preliminary steps necessary to make incoming markup
        acceptable to the parser.

        :param markup: Some markup -- probably a bytestring.
        :param user_specified_encoding: The user asked to try this encoding.
        :param document_declared_encoding: The markup itself claims to be
            in this encoding.
        :param exclude_encodings: The user asked _not_ to try any of
            these encodings.

        :yield: A series of 4-tuples:
         (markup, encoding, declared encoding,
          has undergone character replacement)

         Each 4-tuple represents a strategy for converting the
         document to Unicode and parsing it. Each strategy will be tried
         in turn.
        """
        if isinstance(markup, str):
            # Parse Unicode as-is.
            yield (markup, None, None, False)
            return

        # Ask UnicodeDammit to sniff the most likely encoding.

        # This was provided by the end-user; treat it as a known
        # definite encoding per the algorithm laid out in the HTML5
        # spec.  (See the EncodingDetector class for details.)
        known_definite_encodings = [user_specified_encoding]

        # This was found in the document; treat it as a slightly lower-priority
        # user encoding.
        user_encodings = [document_declared_encoding]

        dammit = UnicodeDammit(
            markup,
            known_definite_encodings=known_definite_encodings,
            user_encodings=user_encodings,
            is_html=True,
            exclude_encodings=exclude_encodings
        )
        yield (dammit.markup, dammit.original_encoding,
               dammit.declared_html_encoding,
               dammit.contains_replacement_characters)

    def feed(self, markup):
        """Run some incoming markup through some parsing process,
        populating the `BeautifulSoup` object in self.soup.

        :param markup: A string of markup, already converted to Unicode
            by prepare_markup().
        """
        args, kwargs = self.parser_args
        parser = BeautifulSoupHTMLParser(*args, **kwargs)
        parser.soup = self.soup
        try:
            parser.feed(markup)
            parser.close()
        except HTMLParseError as e:
            warnings.warn(RuntimeWarning(
                "Python's built-in HTMLParser cannot parse the given document. This is not a bug in Beautiful Soup. The best solution is to install an external parser (lxml or html5lib), and use Beautiful Soup with that parser. See http://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser for help."))
            raise e
        # Discard leftover bookkeeping from this parse.
        parser.already_closed_empty_element = []
399
# Patch 3.2 versions of HTMLParser earlier than 3.2.3 to use some
# 3.2.3 code. This ensures they don't treat markup like <p></p> as a
# string.
#
# XXX This code can be removed once most Python 3 users are on 3.2.3.
if major == 3 and minor == 2 and not CONSTRUCTOR_TAKES_STRICT:
    import re
    # Lenient attribute-matching regular expression, backported from
    # the Python 3.2.3 standard library's html.parser module.
    attrfind_tolerant = re.compile(
        r'\s*((?<=[\'"\s])[^\s/>][^\s/=>]*)(\s*=+\s*'
        r'(\'[^\']*\'|"[^"]*"|(?![\'"])[^>\s]*))?')
    # NOTE(review): this attaches the regex to HTMLParserTreeBuilder,
    # but parse_starttag() below reads the module-level name directly;
    # the attribute assignment looks vestigial -- confirm.
    HTMLParserTreeBuilder.attrfind_tolerant = attrfind_tolerant

    # Regular expression locating the end of a start tag, also taken
    # from the 3.2.3 standard library.
    locatestarttagend = re.compile(r"""
  <[a-zA-Z][-.a-zA-Z0-9:_]*          # tag name
  (?:\s+                             # whitespace before attribute name
    (?:[a-zA-Z_][-.:a-zA-Z0-9_]*     # attribute name
      (?:\s*=\s*                     # value indicator
        (?:'[^']*'                   # LITA-enclosed value
          |\"[^\"]*\"                # LIT-enclosed value
          |[^'\">\s]+                # bare value
         )
       )?
     )
   )*
  \s*                                # trailing whitespace
""", re.VERBOSE)
    BeautifulSoupHTMLParser.locatestarttagend = locatestarttagend

    from html.parser import tagfind, attrfind

    def parse_starttag(self, i):
        """Backport of HTMLParser.parse_starttag from Python 3.2.3.

        Parses the start tag beginning at index i of self.rawdata and
        dispatches to handle_starttag/handle_startendtag, falling back
        to treating junk as data instead of always raising.

        :param i: Index into self.rawdata where the tag begins.
        :return: The index just past the tag, or a negative number if
            the tag is incomplete.
        """
        # NOTE(review): defined at module level, so self.__starttag_text
        # is NOT name-mangled to _HTMLParser__starttag_text; the name is
        # only read within this function, so it is self-consistent.
        self.__starttag_text = None
        endpos = self.check_for_whole_start_tag(i)
        if endpos < 0:
            return endpos
        rawdata = self.rawdata
        self.__starttag_text = rawdata[i:endpos]

        # Now parse the data between i+1 and j into a tag and attrs
        attrs = []
        match = tagfind.match(rawdata, i+1)
        assert match, 'unexpected call to parse_starttag()'
        k = match.end()
        self.lasttag = tag = rawdata[i+1:k].lower()
        while k < endpos:
            # In strict mode, use the stock attribute regex; otherwise
            # use the tolerant 3.2.3 backport defined above.
            if self.strict:
                m = attrfind.match(rawdata, k)
            else:
                m = attrfind_tolerant.match(rawdata, k)
            if not m:
                break
            attrname, rest, attrvalue = m.group(1, 2, 3)
            if not rest:
                # Bare attribute with no '=value' part.
                attrvalue = None
            elif attrvalue[:1] == '\'' == attrvalue[-1:] or \
                 attrvalue[:1] == '"' == attrvalue[-1:]:
                # Strip matching single or double quotes.
                attrvalue = attrvalue[1:-1]
            if attrvalue:
                attrvalue = self.unescape(attrvalue)
            attrs.append((attrname.lower(), attrvalue))
            k = m.end()

        end = rawdata[k:endpos].strip()
        if end not in (">", "/>"):
            # Junk between the last attribute and the closing bracket.
            # Compute the position for the error message, then treat
            # the whole tag as data (unless strict, which raises).
            lineno, offset = self.getpos()
            if "\n" in self.__starttag_text:
                lineno = lineno + self.__starttag_text.count("\n")
                offset = len(self.__starttag_text) \
                         - self.__starttag_text.rfind("\n")
            else:
                offset = offset + len(self.__starttag_text)
            if self.strict:
                self.error("junk characters in start tag: %r"
                           % (rawdata[k:endpos][:20],))
            self.handle_data(rawdata[i:endpos])
            return endpos
        if end.endswith('/>'):
            # XHTML-style empty tag: <span attr="value" />
            self.handle_startendtag(tag, attrs)
        else:
            self.handle_starttag(tag, attrs)
            if tag in self.CDATA_CONTENT_ELEMENTS:
                self.set_cdata_mode(tag)
        return endpos

    def set_cdata_mode(self, elem):
        """Enter CDATA mode: treat everything as text until the
        matching end tag for `elem` is seen.
        """
        self.cdata_elem = elem.lower()
        self.interesting = re.compile(r'</\s*%s\s*>' % self.cdata_elem, re.I)

    # Install the backported methods on our parser subclass.
    BeautifulSoupHTMLParser.parse_starttag = parse_starttag
    BeautifulSoupHTMLParser.set_cdata_mode = set_cdata_mode

    # With the patch applied, strict=True is now safe on this release.
    CONSTRUCTOR_TAKES_STRICT = True