#!/usr/bin/env python
# vim:fileencoding=utf-8
# License: Apache 2.0 Copyright: 2017, Kovid Goyal <kovid at kovidgoyal.net>

from __future__ import absolute_import, division, print_function, unicode_literals

import codecs
import importlib
import sys
from collections import namedtuple
from locale import getpreferredencoding

if not hasattr(sys, 'generating_docs_via_sphinx'):
    from lxml import etree  # Must be imported before html_parser to initialize libxml

    from . import html_parser

    version = namedtuple('Version', 'major minor patch')(
        html_parser.MAJOR, html_parser.MINOR, html_parser.PATCH)

    if not hasattr(etree, 'adopt_external_document'):
        raise ImportError('Your version of lxml is too old, at least version 3.8.0 is required')

    # libxml2 encodes its version as major*10000 + minor*100 + micro, e.g. 20904 is 2.9.4
    LIBXML_VERSION = ((html_parser.LIBXML_VERSION // 10000) % 100,
                      (html_parser.LIBXML_VERSION // 100) % 100,
                      html_parser.LIBXML_VERSION % 100)
    if LIBXML_VERSION[:2] != etree.LIBXML_VERSION[:2]:
        raise RuntimeError(
            'html5-parser and lxml are using different versions of libxml2.'
            ' This commonly happens with pip-installed versions of lxml.'
            ' Use "pip install --no-binary lxml lxml" instead.'
            ' libxml2 versions: html5-parser: {} != lxml: {}'.format(
                LIBXML_VERSION, etree.LIBXML_VERSION))

BOMS = (codecs.BOM_UTF8, codecs.BOM_UTF16_BE, codecs.BOM_UTF16_LE)


def check_bom(data):
    for bom in BOMS:
        if data.startswith(bom):
            return bom


def check_for_meta_charset(raw):
    from .encoding_parser import EncodingParser  # delay load
    q = raw[:10 * 1024]
    parser = EncodingParser(q)
    encoding = parser()
    if encoding in ("utf-16", "utf-16be", "utf-16le"):
        # Per the HTML encoding sniffing algorithm, a UTF-16 encoding declared
        # in a <meta> tag is treated as UTF-8
        encoding = "utf-8"
    return encoding


def detect_encoding(raw):
    from chardet import detect  # delay load
    q = raw[:50 * 1024]
    return detect(q)['encoding']


passthrough_encodings = frozenset(('utf-8', 'utf8', 'ascii'))


def safe_get_preferred_encoding():
    try:
        ans = getpreferredencoding(False)
    except Exception:
        pass
    else:
        try:
            return codecs.lookup(ans).name
        except LookupError:
            pass


def as_utf8(bytes_or_unicode, transport_encoding=None, fallback_encoding=None):
    if isinstance(bytes_or_unicode, bytes):
        data = bytes_or_unicode
        if transport_encoding:
            if transport_encoding.lower() not in passthrough_encodings:
                data = bytes_or_unicode.decode(transport_encoding).encode('utf-8')
        else:
            # See
            # https://www.w3.org/TR/2011/WD-html5-20110113/parsing.html#determining-the-character-encoding
            bom = check_bom(data)
            if bom is not None:
                data = data[len(bom):]
                if bom is not codecs.BOM_UTF8:
                    bom_encoding = 'utf-16-be' if bom is codecs.BOM_UTF16_BE else 'utf-16-le'
                    data = data.decode(bom_encoding).encode('utf-8')
            else:
                encoding = (
                    check_for_meta_charset(data) or detect_encoding(data) or fallback_encoding or
                    safe_get_preferred_encoding() or 'cp1252')
                if encoding and encoding.lower() not in passthrough_encodings:
                    if encoding == 'x-user-defined':
                        # https://encoding.spec.whatwg.org/#x-user-defined
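                        # Per that spec, bytes 0x00-0x7F map to the same code points and
                        # bytes 0x80-0xFF map to U+F780-U+F7FF, e.g. 0x80 -> U+F780.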
                        buf = (b if b <= 0x7F else 0xF780 + b - 0x80 for b in bytearray(data))
                        try:
                            unicode_chr = unichr  # Python 2
                        except NameError:
                            unicode_chr = chr  # Python 3
                        data = ''.join(map(unicode_chr, buf)).encode('utf-8')
                    else:
                        data = data.decode(encoding).encode('utf-8')
    else:
        data = bytes_or_unicode.encode('utf-8')
    return data


def normalize_treebuilder(x):
    if hasattr(x, 'lower'):
        x = x.lower()
    return {'lxml.etree': 'lxml', 'etree': 'stdlib_etree'}.get(x, x)


NAMESPACE_SUPPORTING_BUILDERS = frozenset('lxml stdlib_etree dom lxml_html'.split())


def parse(
    html,
    transport_encoding=None,
    namespace_elements=False,
    treebuilder='lxml',
    fallback_encoding=None,
    keep_doctype=True,
    maybe_xhtml=False,
    return_root=True,
    line_number_attr=None,
    sanitize_names=True,
    stack_size=16 * 1024,
    fragment_context=None,
):
    '''
    Parse the specified :attr:`html` and return the parsed representation.

    :param html: The HTML to be parsed. Can be either bytes or a unicode string.

    :param transport_encoding: If specified, assume the passed in bytes are in this encoding.
        Ignored if :attr:`html` is unicode.

    :param namespace_elements:
        Add XML namespaces when parsing so that the resulting tree is XHTML.

    :param treebuilder:
        The type of tree to return. Note that only the lxml treebuilder is fast, as all
        other treebuilders are implemented in Python, not C. Supported values are:

          * `lxml <https://lxml.de>`_ -- the default, and fastest
          * `lxml_html <https://lxml.de>`_ -- tree of lxml.html.HtmlElement, same speed as lxml
            (new in *0.4.10*)
          * etree (the Python stdlib :mod:`xml.etree.ElementTree`)
          * dom (the Python stdlib :mod:`xml.dom.minidom`)
          * `soup <https://www.crummy.com/software/BeautifulSoup>`_ -- BeautifulSoup,
            which must be installed, otherwise an :class:`ImportError` is raised

    :param fallback_encoding: If no encoding could be detected, then use this encoding.
        Defaults to an encoding based on the system locale.

    :param keep_doctype: Keep the <!DOCTYPE> (if any).

    :param maybe_xhtml: Useful when it is unknown if the HTML to be parsed is
        actually XHTML. Changes the HTML 5 parsing algorithm to be more
        suitable for XHTML. In particular, it handles self-closed CDATA elements,
        so a ``<title/>`` or ``<style/>`` in the HTML will not completely break
        parsing. It also preserves namespaced tags and attributes, even for namespaces
        not supported by HTML 5 (this works only with the ``lxml`` and ``lxml_html``
        treebuilders).
        Note that setting this also implicitly sets ``namespace_elements``.

    :param return_root: If True, return the root node of the document, otherwise
        return the tree object for the document.

    :param line_number_attr: The optional name of an attribute used to store the line number
        of every element. If set, this attribute will be added to each element with the
        element's line number.

    :param sanitize_names: Ensure tag and attribute names contain only ASCII alphanumeric
        characters, underscores, hyphens and periods. This ensures that the resulting
        tree is also valid XML. Any characters outside this set are replaced by
        underscores. Note that this is not strictly HTML 5 spec compliant, so turn it
        off if you need strict spec compliance.

    :param stack_size: The initial size (number of items) of the parser stack. The
        default is sufficient to avoid memory allocations for all but the
        largest documents.

    :param fragment_context: The tag name under which to parse the HTML when the html
        is a fragment. Common choices are ``div`` or ``body``. To use SVG or MathML tags,
        prefix the tag name with ``svg:`` or ``math:`` respectively. Note that currently
        using a non-HTML fragment_context is not supported. New in *0.4.10*.
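
    A minimal usage sketch (``tostring`` is used here only to show the result)::

        from html5_parser import parse
        from lxml.etree import tostring

        root = parse('<p>Hello, world!</p>')
        print(tostring(root, encoding='unicode'))

        # parse a fragment as it would appear inside a <ul> element
        fragment = parse('<li>One</li><li>Two</li>', fragment_context='ul')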
    '''
    data = as_utf8(html or b'', transport_encoding, fallback_encoding)
    treebuilder = normalize_treebuilder(treebuilder)
    if treebuilder == 'soup':
        from .soup import parse
        return parse(
            data, return_root=return_root, keep_doctype=keep_doctype, stack_size=stack_size)
    if treebuilder not in NAMESPACE_SUPPORTING_BUILDERS:
        namespace_elements = False
    fragment_namespace = html_parser.GUMBO_NAMESPACE_HTML
    if fragment_context:
        fragment_context = fragment_context.lower()
        if ':' in fragment_context:
            ns, fragment_context = fragment_context.split(':', 1)
            fragment_namespace = {
                'svg': html_parser.GUMBO_NAMESPACE_SVG, 'math': html_parser.GUMBO_NAMESPACE_MATHML,
                'html': html_parser.GUMBO_NAMESPACE_HTML
            }[ns]

    capsule = html_parser.parse(
        data,
        namespace_elements=namespace_elements or maybe_xhtml,
        keep_doctype=keep_doctype,
        maybe_xhtml=maybe_xhtml,
        line_number_attr=line_number_attr,
        sanitize_names=sanitize_names,
        stack_size=stack_size,
        fragment_context=fragment_context,
        fragment_namespace=fragment_namespace,
    )

    interpreter = None
    if treebuilder == 'lxml_html':
        from lxml.html import HTMLParser
        interpreter = HTMLParser()
    ans = etree.adopt_external_document(capsule, parser=interpreter)
    if treebuilder in ('lxml', 'lxml_html'):
        return ans.getroot() if return_root else ans
    m = importlib.import_module('html5_parser.' + treebuilder)
    return m.adapt(ans, return_root=return_root)