1#!/usr/bin/env python
2#
3# Copyright 2009 Facebook
4#
5# Licensed under the Apache License, Version 2.0 (the "License"); you may
6# not use this file except in compliance with the License. You may obtain
7# a copy of the License at
8#
9#     http://www.apache.org/licenses/LICENSE-2.0
10#
11# Unless required by applicable law or agreed to in writing, software
12# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
13# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
14# License for the specific language governing permissions and limitations
15# under the License.
16
17"""Escaping/unescaping methods for HTML, JSON, URLs, and others.
18
19Also includes a few other miscellaneous string manipulation functions that
20have crept in over time.
21"""
22
23from __future__ import absolute_import, division, print_function
24
25import json
26import re
27
28from tornado.util import PY3, unicode_type, basestring_type
29
30if PY3:
31    from urllib.parse import parse_qs as _parse_qs
32    import html.entities as htmlentitydefs
33    import urllib.parse as urllib_parse
34    unichr = chr
35else:
36    from urlparse import parse_qs as _parse_qs
37    import htmlentitydefs
38    import urllib as urllib_parse
39
40try:
41    import typing  # noqa
42except ImportError:
43    pass
44
45
# Matches every character that must be escaped before embedding text in
# HTML/XML content or a quoted attribute value.
_XHTML_ESCAPE_RE = re.compile('[&<>"\']')
# Replacement entity for each character matched above.
_XHTML_ESCAPE_DICT = {'&': '&amp;', '<': '&lt;', '>': '&gt;', '"': '&quot;',
                      '\'': '&#39;'}
49
50
def xhtml_escape(value):
    """Escapes a string so it is valid within HTML or XML.

    Escapes the characters ``<``, ``>``, ``"``, ``'``, and ``&``.
    Escaped strings used in attribute values must additionally be
    enclosed in quotes.

    .. versionchanged:: 3.2

       Added the single quote to the list of escaped characters.
    """
    def replace_char(match):
        # Look up the entity for the single escaped character.
        return _XHTML_ESCAPE_DICT[match.group(0)]
    return _XHTML_ESCAPE_RE.sub(replace_char, to_basestring(value))
64
65
def xhtml_unescape(value):
    """Un-escapes an XML-escaped string.

    Replaces named and numeric character references (e.g. ``&amp;``,
    ``&#39;``) with the characters they represent.
    """
    text = _unicode(value)
    return re.sub(r"&(#?)(\w+?);", _convert_entity, text)
69
70
71# The fact that json_encode wraps json.dumps is an implementation detail.
72# Please see https://github.com/tornadoweb/tornado/pull/706
73# before sending a pull request that adds **kwargs to this function.
def json_encode(value):
    """JSON-encodes the given Python object."""
    # JSON permits (but does not require) escaping forward slashes.
    # Doing so lets the output be embedded inside an HTML <script>
    # element without a literal "</script>" prematurely closing the tag.
    # Some json libraries escape slashes by default; Python's standard
    # library does not, so it is done explicitly here.
    # http://stackoverflow.com/questions/1580647/json-why-are-forward-slashes-escaped
    encoded = json.dumps(value)
    return encoded.replace("</", "<\\/")
83
84
def json_decode(value):
    """Returns Python objects for the given JSON string."""
    # Accept either bytes or unicode input; json.loads wants text.
    text = to_basestring(value)
    return json.loads(text)
88
89
def squeeze(value):
    """Replace all sequences of whitespace chars with a single space.

    Note that the range ``\\x00-\\x20`` also covers ASCII control
    characters, which are collapsed along with ordinary whitespace.
    Leading and trailing whitespace is removed.
    """
    collapsed = re.sub(r"[\x00-\x20]+", " ", value)
    return collapsed.strip()
93
94
def url_escape(value, plus=True):
    """Returns a URL-encoded version of the given value.

    If ``plus`` is true (the default), spaces will be represented
    as "+" instead of "%20".  This is appropriate for query strings
    but not for the path component of a URL.  Note that this default
    is the reverse of Python's urllib module.

    .. versionadded:: 3.1
        The ``plus`` argument
    """
    if plus:
        return urllib_parse.quote_plus(utf8(value))
    return urllib_parse.quote(utf8(value))
108
109
110# python 3 changed things around enough that we need two separate
111# implementations of url_unescape.  We also need our own implementation
112# of parse_qs since python 3's version insists on decoding everything.
if not PY3:
    def url_unescape(value, encoding='utf-8', plus=True):
        """Decodes the given value from a URL.

        The argument may be either a byte or unicode string.

        If encoding is None, the result will be a byte string.  Otherwise,
        the result is a unicode string in the specified encoding.

        If ``plus`` is true (the default), plus signs will be interpreted
        as spaces (literal plus signs must be represented as "%2B").  This
        is appropriate for query strings and form-encoded values but not
        for the path component of a URL.  Note that this default is the
        reverse of Python's urllib module.

        .. versionadded:: 3.1
           The ``plus`` argument
        """
        unquote = (urllib_parse.unquote_plus if plus else urllib_parse.unquote)
        if encoding is None:
            return unquote(utf8(value))
        else:
            # Unquote as bytes first, then decode the result.
            return unicode_type(unquote(utf8(value)), encoding)

    # python 2's parse_qs already returns byte strings, so it can be
    # used directly.
    parse_qs_bytes = _parse_qs
else:
    def url_unescape(value, encoding='utf-8', plus=True):
        """Decodes the given value from a URL.

        The argument may be either a byte or unicode string.

        If encoding is None, the result will be a byte string.  Otherwise,
        the result is a unicode string in the specified encoding.

        If ``plus`` is true (the default), plus signs will be interpreted
        as spaces (literal plus signs must be represented as "%2B").  This
        is appropriate for query strings and form-encoded values but not
        for the path component of a URL.  Note that this default is the
        reverse of Python's urllib module.

        .. versionadded:: 3.1
           The ``plus`` argument
        """
        if encoding is None:
            if plus:
                # unquote_to_bytes doesn't have a _plus variant
                value = to_basestring(value).replace('+', ' ')
            return urllib_parse.unquote_to_bytes(value)
        else:
            unquote = (urllib_parse.unquote_plus if plus
                       else urllib_parse.unquote)
            return unquote(to_basestring(value), encoding=encoding)

    def parse_qs_bytes(qs, keep_blank_values=False, strict_parsing=False):
        """Parses a query string like urlparse.parse_qs, but returns the
        values as byte strings.

        Keys still become type str (interpreted as latin1 in python3!)
        because it's too painful to keep them as byte strings in
        python3 and in practice they're nearly always ascii anyway.
        """
        # This is gross, but python3 doesn't give us another way.
        # Latin1 is the universal donor of character encodings.
        result = _parse_qs(qs, keep_blank_values, strict_parsing,
                           encoding='latin1', errors='strict')
        encoded = {}
        for k, v in result.items():
            # Re-encoding as latin1 is lossless: it maps code points
            # 0-255 directly back to the original bytes.
            encoded[k] = [i.encode('latin1') for i in v]
        return encoded
182
183
# Types that utf8() passes through unchanged.
_UTF8_TYPES = (bytes, type(None))


def utf8(value):
    # type: (typing.Union[bytes,unicode_type,None])->typing.Union[bytes,None]
    """Converts a string argument to a byte string.

    If the argument is already a byte string or None, it is returned unchanged.
    Otherwise it must be a unicode string and is encoded as utf8.
    """
    if isinstance(value, _UTF8_TYPES):
        return value
    if isinstance(value, unicode_type):
        return value.encode("utf-8")
    raise TypeError(
        "Expected bytes, unicode, or None; got %r" % type(value)
    )
201
202
# Types that to_unicode() passes through unchanged.
_TO_UNICODE_TYPES = (unicode_type, type(None))


def to_unicode(value):
    """Converts a string argument to a unicode string.

    If the argument is already a unicode string or None, it is returned
    unchanged.  Otherwise it must be a byte string and is decoded as utf8.
    """
    if isinstance(value, _TO_UNICODE_TYPES):
        return value
    if isinstance(value, bytes):
        return value.decode("utf-8")
    raise TypeError(
        "Expected bytes, unicode, or None; got %r" % type(value)
    )
219
220
# to_unicode was previously named _unicode not because it was private,
# but to avoid conflicts with the built-in unicode() function/type
_unicode = to_unicode

# When dealing with the standard library across python 2 and 3 it is
# sometimes useful to have a direct conversion to the native string type
# (unicode on python 3, bytes on python 2).
if str is unicode_type:
    native_str = to_unicode
else:
    native_str = utf8

# Types that to_basestring() passes through unchanged.
_BASESTRING_TYPES = (basestring_type, type(None))
233
234
def to_basestring(value):
    """Converts a string argument to a subclass of basestring.

    In python2, byte and unicode strings are mostly interchangeable,
    so functions that deal with a user-supplied argument in combination
    with ascii string constants can use either and should return the type
    the user supplied.  In python3, the two types are not interchangeable,
    so this method is needed to convert byte strings to unicode.
    """
    if isinstance(value, _BASESTRING_TYPES):
        return value
    if isinstance(value, bytes):
        return value.decode("utf-8")
    raise TypeError(
        "Expected bytes, unicode, or None; got %r" % type(value)
    )
251
252
def recursive_unicode(obj):
    """Walks a simple data structure, converting byte strings to unicode.

    Supports lists, tuples, and dictionaries.
    """
    # The isinstance checks are mutually exclusive, so their order
    # does not affect the result.
    if isinstance(obj, bytes):
        return to_unicode(obj)
    if isinstance(obj, dict):
        return dict((recursive_unicode(k), recursive_unicode(v))
                    for (k, v) in obj.items())
    if isinstance(obj, list):
        return [recursive_unicode(item) for item in obj]
    if isinstance(obj, tuple):
        return tuple(recursive_unicode(item) for item in obj)
    return obj
268
269
# I originally used the regex from
# http://daringfireball.net/2010/07/improved_regex_for_matching_urls
# but it gets all exponential on certain patterns (such as too many trailing
# dots), causing the regex matcher to never return.
# This regex should avoid those problems.
# Use to_unicode instead of tornado.util.u - we don't want backslashes getting
# processed as escapes.
# Capture groups (relied on by linkify's make_link callback):
#   group(1) = the whole url, group(2) = the protocol (if any),
#   group(3) = the slashes following the protocol.
_URL_RE = re.compile(to_unicode(r"""\b((?:([\w-]+):(/{1,3})|www[.])(?:(?:(?:[^\s&()]|&amp;|&quot;)*(?:[^!"#$%&'()*+,.:;<=>?@\[\]^`{|}~\s]))|(?:\((?:[^\s&()]|&amp;|&quot;)*\)))+)"""))
278
279
def linkify(text, shorten=False, extra_params="",
            require_protocol=False, permitted_protocols=["http", "https"]):
    """Converts plain text into HTML with links.

    For example: ``linkify("Hello http://tornadoweb.org!")`` would return
    ``Hello <a href="http://tornadoweb.org">http://tornadoweb.org</a>!``

    Parameters:

    * ``shorten``: Long urls will be shortened for display.

    * ``extra_params``: Extra text to include in the link tag, or a callable
        taking the link as an argument and returning the extra text
        e.g. ``linkify(text, extra_params='rel="nofollow" class="external"')``,
        or::

            def extra_params_cb(url):
                if url.startswith("http://example.com"):
                    return 'class="internal"'
                else:
                    return 'class="external" rel="nofollow"'
            linkify(text, extra_params=extra_params_cb)

    * ``require_protocol``: Only linkify urls which include a protocol. If
        this is False, urls such as www.facebook.com will also be linkified.

    * ``permitted_protocols``: List (or set) of protocols which should be
        linkified, e.g. ``linkify(text, permitted_protocols=["http", "ftp",
        "mailto"])``. It is very unsafe to include protocols such as
        ``javascript``.
    """
    if extra_params and not callable(extra_params):
        extra_params = " " + extra_params.strip()

    def make_link(m):
        # Regex callback for _URL_RE: m.group(1) is the full url,
        # m.group(2) the protocol (if any), m.group(3) the slashes
        # following the protocol.
        url = m.group(1)
        proto = m.group(2)
        if require_protocol and not proto:
            return url  # no protocol, no linkify

        if proto and proto not in permitted_protocols:
            return url  # bad protocol, no linkify

        href = m.group(1)
        if not proto:
            href = "http://" + href   # no proto specified, use http

        if callable(extra_params):
            params = " " + extra_params(href).strip()
        else:
            params = extra_params

        # clip long urls. max_len is just an approximation
        max_len = 30
        if shorten and len(url) > max_len:
            before_clip = url
            if proto:
                proto_len = len(proto) + 1 + len(m.group(3) or "")  # +1 for :
            else:
                proto_len = 0

            parts = url[proto_len:].split("/")
            if len(parts) > 1:
                # Grab the whole host part plus the first bit of the path
                # The path is usually not that interesting once shortened
                # (no more slug, etc), so it really just provides a little
                # extra indication of shortening.
                url = url[:proto_len] + parts[0] + "/" + \
                    parts[1][:8].split('?')[0].split('.')[0]

            if len(url) > max_len * 1.5:  # still too long
                url = url[:max_len]

            if url != before_clip:
                amp = url.rfind('&')
                # avoid splitting html char entities
                if amp > max_len - 5:
                    url = url[:amp]
                url += "..."

                if len(url) >= len(before_clip):
                    # Clipping didn't actually save any space; show the
                    # original text instead.
                    url = before_clip
                else:
                    # full url is visible on mouse-over (for those who don't
                    # have a status bar, such as Safari by default)
                    params += ' title="%s"' % href

        return u'<a href="%s"%s>%s</a>' % (href, params, url)

    # First HTML-escape so that our strings are all safe.
    # The regex is modified to avoid character entities other than &amp; so
    # that we won't pick up &quot;, etc.
    text = _unicode(xhtml_escape(text))
    return _URL_RE.sub(make_link, text)
374
375
def _convert_entity(m):
    """Regex callback for xhtml_unescape: returns the replacement text
    for a single matched entity.

    Unknown entities and malformed numeric references are returned in
    their original escaped form.
    """
    body = m.group(2)
    if m.group(1) == "#":
        # Numeric character reference, hexadecimal ("&#x..;") or decimal.
        try:
            if body[:1].lower() == 'x':
                codepoint = int(body[1:], 16)
            else:
                codepoint = int(body)
            return unichr(codepoint)
        except ValueError:
            return "&#%s;" % body
    # Named entity; fall back to the literal text if unrecognized.
    try:
        return _HTML_UNICODE_MAP[body]
    except KeyError:
        return "&%s;" % body
389
390
def _build_unicode_map():
    """Returns a dict mapping HTML entity names (e.g. "nbsp") to the
    unicode characters they represent."""
    return dict((name, unichr(codepoint))
                for name, codepoint in htmlentitydefs.name2codepoint.items())


# Built once at import time; used by _convert_entity.
_HTML_UNICODE_MAP = _build_unicode_map()
399