1#
2# Copyright 2009 Facebook
3#
4# Licensed under the Apache License, Version 2.0 (the "License"); you may
5# not use this file except in compliance with the License. You may obtain
6# a copy of the License at
7#
8#     http://www.apache.org/licenses/LICENSE-2.0
9#
10# Unless required by applicable law or agreed to in writing, software
11# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
12# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
13# License for the specific language governing permissions and limitations
14# under the License.
15
16"""Escaping/unescaping methods for HTML, JSON, URLs, and others.
17
18Also includes a few other miscellaneous string manipulation functions that
19have crept in over time.
20"""
21
22import html.entities
23import json
24import re
25import urllib.parse
26
27from tornado.util import unicode_type
28
29import typing
30from typing import Union, Any, Optional, Dict, List, Callable
31
32
33_XHTML_ESCAPE_RE = re.compile("[&<>\"']")
34_XHTML_ESCAPE_DICT = {
35    "&": "&amp;",
36    "<": "&lt;",
37    ">": "&gt;",
38    '"': "&quot;",
39    "'": "&#39;",
40}
41
42
43def xhtml_escape(value: Union[str, bytes]) -> str:
44    """Escapes a string so it is valid within HTML or XML.
45
46    Escapes the characters ``<``, ``>``, ``"``, ``'``, and ``&``.
47    When used in attribute values the escaped strings must be enclosed
48    in quotes.
49
50    .. versionchanged:: 3.2
51
52       Added the single quote to the list of escaped characters.
53    """
54    return _XHTML_ESCAPE_RE.sub(
55        lambda match: _XHTML_ESCAPE_DICT[match.group(0)], to_basestring(value)
56    )
57
58
59def xhtml_unescape(value: Union[str, bytes]) -> str:
60    """Un-escapes an XML-escaped string."""
61    return re.sub(r"&(#?)(\w+?);", _convert_entity, _unicode(value))
62
63
64# The fact that json_encode wraps json.dumps is an implementation detail.
65# Please see https://github.com/tornadoweb/tornado/pull/706
66# before sending a pull request that adds **kwargs to this function.
67def json_encode(value: Any) -> str:
68    """JSON-encodes the given Python object."""
69    # JSON permits but does not require forward slashes to be escaped.
70    # This is useful when json data is emitted in a <script> tag
71    # in HTML, as it prevents </script> tags from prematurely terminating
72    # the JavaScript.  Some json libraries do this escaping by default,
73    # although python's standard library does not, so we do it here.
74    # http://stackoverflow.com/questions/1580647/json-why-are-forward-slashes-escaped
75    return json.dumps(value).replace("</", "<\\/")
76
77
78def json_decode(value: Union[str, bytes]) -> Any:
79    """Returns Python objects for the given JSON string.
80
81    Supports both `str` and `bytes` inputs.
82    """
83    return json.loads(to_basestring(value))
84
85
86def squeeze(value: str) -> str:
87    """Replace all sequences of whitespace chars with a single space."""
88    return re.sub(r"[\x00-\x20]+", " ", value).strip()
89
90
91def url_escape(value: Union[str, bytes], plus: bool = True) -> str:
92    """Returns a URL-encoded version of the given value.
93
94    If ``plus`` is true (the default), spaces will be represented
95    as "+" instead of "%20".  This is appropriate for query strings
96    but not for the path component of a URL.  Note that this default
97    is the reverse of Python's urllib module.
98
99    .. versionadded:: 3.1
100        The ``plus`` argument
101    """
102    quote = urllib.parse.quote_plus if plus else urllib.parse.quote
103    return quote(utf8(value))
104
105
106@typing.overload
107def url_unescape(value: Union[str, bytes], encoding: None, plus: bool = True) -> bytes:
108    pass
109
110
111@typing.overload  # noqa: F811
112def url_unescape(
113    value: Union[str, bytes], encoding: str = "utf-8", plus: bool = True
114) -> str:
115    pass
116
117
118def url_unescape(  # noqa: F811
119    value: Union[str, bytes], encoding: Optional[str] = "utf-8", plus: bool = True
120) -> Union[str, bytes]:
121    """Decodes the given value from a URL.
122
123    The argument may be either a byte or unicode string.
124
125    If encoding is None, the result will be a byte string.  Otherwise,
126    the result is a unicode string in the specified encoding.
127
128    If ``plus`` is true (the default), plus signs will be interpreted
129    as spaces (literal plus signs must be represented as "%2B").  This
130    is appropriate for query strings and form-encoded values but not
131    for the path component of a URL.  Note that this default is the
132    reverse of Python's urllib module.
133
134    .. versionadded:: 3.1
135       The ``plus`` argument
136    """
137    if encoding is None:
138        if plus:
139            # unquote_to_bytes doesn't have a _plus variant
140            value = to_basestring(value).replace("+", " ")
141        return urllib.parse.unquote_to_bytes(value)
142    else:
143        unquote = urllib.parse.unquote_plus if plus else urllib.parse.unquote
144        return unquote(to_basestring(value), encoding=encoding)
145
146
147def parse_qs_bytes(
148    qs: Union[str, bytes], keep_blank_values: bool = False, strict_parsing: bool = False
149) -> Dict[str, List[bytes]]:
150    """Parses a query string like urlparse.parse_qs,
151    but takes bytes and returns the values as byte strings.
152
153    Keys still become type str (interpreted as latin1 in python3!)
154    because it's too painful to keep them as byte strings in
155    python3 and in practice they're nearly always ascii anyway.
156    """
157    # This is gross, but python3 doesn't give us another way.
158    # Latin1 is the universal donor of character encodings.
159    if isinstance(qs, bytes):
160        qs = qs.decode("latin1")
161    result = urllib.parse.parse_qs(
162        qs, keep_blank_values, strict_parsing, encoding="latin1", errors="strict"
163    )
164    encoded = {}
165    for k, v in result.items():
166        encoded[k] = [i.encode("latin1") for i in v]
167    return encoded
168
169
170_UTF8_TYPES = (bytes, type(None))
171
172
173@typing.overload
174def utf8(value: bytes) -> bytes:
175    pass
176
177
178@typing.overload  # noqa: F811
179def utf8(value: str) -> bytes:
180    pass
181
182
183@typing.overload  # noqa: F811
184def utf8(value: None) -> None:
185    pass
186
187
188def utf8(value: Union[None, str, bytes]) -> Optional[bytes]:  # noqa: F811
189    """Converts a string argument to a byte string.
190
191    If the argument is already a byte string or None, it is returned unchanged.
192    Otherwise it must be a unicode string and is encoded as utf8.
193    """
194    if isinstance(value, _UTF8_TYPES):
195        return value
196    if not isinstance(value, unicode_type):
197        raise TypeError("Expected bytes, unicode, or None; got %r" % type(value))
198    return value.encode("utf-8")
199
200
201_TO_UNICODE_TYPES = (unicode_type, type(None))
202
203
204@typing.overload
205def to_unicode(value: str) -> str:
206    pass
207
208
209@typing.overload  # noqa: F811
210def to_unicode(value: bytes) -> str:
211    pass
212
213
214@typing.overload  # noqa: F811
215def to_unicode(value: None) -> None:
216    pass
217
218
219def to_unicode(value: Union[None, str, bytes]) -> Optional[str]:  # noqa: F811
220    """Converts a string argument to a unicode string.
221
222    If the argument is already a unicode string or None, it is returned
223    unchanged.  Otherwise it must be a byte string and is decoded as utf8.
224    """
225    if isinstance(value, _TO_UNICODE_TYPES):
226        return value
227    if not isinstance(value, bytes):
228        raise TypeError("Expected bytes, unicode, or None; got %r" % type(value))
229    return value.decode("utf-8")
230
231
232# to_unicode was previously named _unicode not because it was private,
233# but to avoid conflicts with the built-in unicode() function/type
234_unicode = to_unicode
235
236# When dealing with the standard library across python 2 and 3 it is
237# sometimes useful to have a direct conversion to the native string type
238native_str = to_unicode
239to_basestring = to_unicode
240
241
242def recursive_unicode(obj: Any) -> Any:
243    """Walks a simple data structure, converting byte strings to unicode.
244
245    Supports lists, tuples, and dictionaries.
246    """
247    if isinstance(obj, dict):
248        return dict(
249            (recursive_unicode(k), recursive_unicode(v)) for (k, v) in obj.items()
250        )
251    elif isinstance(obj, list):
252        return list(recursive_unicode(i) for i in obj)
253    elif isinstance(obj, tuple):
254        return tuple(recursive_unicode(i) for i in obj)
255    elif isinstance(obj, bytes):
256        return to_unicode(obj)
257    else:
258        return obj
259
260
261# I originally used the regex from
262# http://daringfireball.net/2010/07/improved_regex_for_matching_urls
263# but it gets all exponential on certain patterns (such as too many trailing
264# dots), causing the regex matcher to never return.
265# This regex should avoid those problems.
266# Use to_unicode instead of tornado.util.u - we don't want backslashes getting
267# processed as escapes.
268_URL_RE = re.compile(
269    to_unicode(
270        r"""\b((?:([\w-]+):(/{1,3})|www[.])(?:(?:(?:[^\s&()]|&amp;|&quot;)*(?:[^!"#$%&'()*+,.:;<=>?@\[\]^`{|}~\s]))|(?:\((?:[^\s&()]|&amp;|&quot;)*\)))+)"""  # noqa: E501
271    )
272)
273
274
275def linkify(
276    text: Union[str, bytes],
277    shorten: bool = False,
278    extra_params: Union[str, Callable[[str], str]] = "",
279    require_protocol: bool = False,
280    permitted_protocols: List[str] = ["http", "https"],
281) -> str:
282    """Converts plain text into HTML with links.
283
284    For example: ``linkify("Hello http://tornadoweb.org!")`` would return
285    ``Hello <a href="http://tornadoweb.org">http://tornadoweb.org</a>!``
286
287    Parameters:
288
289    * ``shorten``: Long urls will be shortened for display.
290
291    * ``extra_params``: Extra text to include in the link tag, or a callable
292      taking the link as an argument and returning the extra text
293      e.g. ``linkify(text, extra_params='rel="nofollow" class="external"')``,
294      or::
295
296          def extra_params_cb(url):
297              if url.startswith("http://example.com"):
298                  return 'class="internal"'
299              else:
300                  return 'class="external" rel="nofollow"'
301          linkify(text, extra_params=extra_params_cb)
302
303    * ``require_protocol``: Only linkify urls which include a protocol. If
304      this is False, urls such as www.facebook.com will also be linkified.
305
306    * ``permitted_protocols``: List (or set) of protocols which should be
307      linkified, e.g. ``linkify(text, permitted_protocols=["http", "ftp",
308      "mailto"])``. It is very unsafe to include protocols such as
309      ``javascript``.
310    """
311    if extra_params and not callable(extra_params):
312        extra_params = " " + extra_params.strip()
313
314    def make_link(m: typing.Match) -> str:
315        url = m.group(1)
316        proto = m.group(2)
317        if require_protocol and not proto:
318            return url  # not protocol, no linkify
319
320        if proto and proto not in permitted_protocols:
321            return url  # bad protocol, no linkify
322
323        href = m.group(1)
324        if not proto:
325            href = "http://" + href  # no proto specified, use http
326
327        if callable(extra_params):
328            params = " " + extra_params(href).strip()
329        else:
330            params = extra_params
331
332        # clip long urls. max_len is just an approximation
333        max_len = 30
334        if shorten and len(url) > max_len:
335            before_clip = url
336            if proto:
337                proto_len = len(proto) + 1 + len(m.group(3) or "")  # +1 for :
338            else:
339                proto_len = 0
340
341            parts = url[proto_len:].split("/")
342            if len(parts) > 1:
343                # Grab the whole host part plus the first bit of the path
344                # The path is usually not that interesting once shortened
345                # (no more slug, etc), so it really just provides a little
346                # extra indication of shortening.
347                url = (
348                    url[:proto_len]
349                    + parts[0]
350                    + "/"
351                    + parts[1][:8].split("?")[0].split(".")[0]
352                )
353
354            if len(url) > max_len * 1.5:  # still too long
355                url = url[:max_len]
356
357            if url != before_clip:
358                amp = url.rfind("&")
359                # avoid splitting html char entities
360                if amp > max_len - 5:
361                    url = url[:amp]
362                url += "..."
363
364                if len(url) >= len(before_clip):
365                    url = before_clip
366                else:
367                    # full url is visible on mouse-over (for those who don't
368                    # have a status bar, such as Safari by default)
369                    params += ' title="%s"' % href
370
371        return u'<a href="%s"%s>%s</a>' % (href, params, url)
372
373    # First HTML-escape so that our strings are all safe.
374    # The regex is modified to avoid character entites other than &amp; so
375    # that we won't pick up &quot;, etc.
376    text = _unicode(xhtml_escape(text))
377    return _URL_RE.sub(make_link, text)
378
379
380def _convert_entity(m: typing.Match) -> str:
381    if m.group(1) == "#":
382        try:
383            if m.group(2)[:1].lower() == "x":
384                return chr(int(m.group(2)[1:], 16))
385            else:
386                return chr(int(m.group(2)))
387        except ValueError:
388            return "&#%s;" % m.group(2)
389    try:
390        return _HTML_UNICODE_MAP[m.group(2)]
391    except KeyError:
392        return "&%s;" % m.group(2)
393
394
395def _build_unicode_map() -> Dict[str, str]:
396    unicode_map = {}
397    for name, value in html.entities.name2codepoint.items():
398        unicode_map[name] = chr(value)
399    return unicode_map
400
401
402_HTML_UNICODE_MAP = _build_unicode_map()
403