#
# Copyright 2009 Facebook
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.

"""Escaping/unescaping methods for HTML, JSON, URLs, and others.

Also includes a few other miscellaneous string manipulation functions that
have crept in over time.
"""

import html.entities
import json
import re
import urllib.parse

from tornado.util import unicode_type

import typing
from typing import Union, Any, Optional, Dict, List, Callable


_XHTML_ESCAPE_RE = re.compile("[&<>\"']")
_XHTML_ESCAPE_DICT = {
    "&": "&amp;",
    "<": "&lt;",
    ">": "&gt;",
    '"': "&quot;",
    "'": "&#39;",
}


def xhtml_escape(value: Union[str, bytes]) -> str:
    """Escapes a string so it is valid within HTML or XML.

    Escapes the characters ``<``, ``>``, ``"``, ``'``, and ``&``.
    When used in attribute values the escaped strings must be enclosed
    in quotes.

    .. versionchanged:: 3.2

       Added the single quote to the list of escaped characters.
    """
    return _XHTML_ESCAPE_RE.sub(
        lambda match: _XHTML_ESCAPE_DICT[match.group(0)], to_basestring(value)
    )


def xhtml_unescape(value: Union[str, bytes]) -> str:
    """Un-escapes an XML-escaped string."""
    return re.sub(r"&(#?)(\w+?);", _convert_entity, _unicode(value))


# The fact that json_encode wraps json.dumps is an implementation detail.
# Please see https://github.com/tornadoweb/tornado/pull/706
# before sending a pull request that adds **kwargs to this function.
def json_encode(value: Any) -> str:
    """JSON-encodes the given Python object."""
    # JSON permits but does not require forward slashes to be escaped.
    # This is useful when json data is emitted in a <script> tag
    # in HTML, as it prevents </script> tags from prematurely terminating
    # the JavaScript. Some json libraries do this escaping by default,
    # although python's standard library does not, so we do it here.
    # http://stackoverflow.com/questions/1580647/json-why-are-forward-slashes-escaped
    return json.dumps(value).replace("</", "<\\/")


def json_decode(value: Union[str, bytes]) -> Any:
    """Returns Python objects for the given JSON string.

    Supports both `str` and `bytes` inputs.
    """
    return json.loads(to_basestring(value))


def squeeze(value: str) -> str:
    """Replace all sequences of whitespace chars with a single space."""
    return re.sub(r"[\x00-\x20]+", " ", value).strip()


def url_escape(value: Union[str, bytes], plus: bool = True) -> str:
    """Returns a URL-encoded version of the given value.

    If ``plus`` is true (the default), spaces will be represented
    as "+" instead of "%20". This is appropriate for query strings
    but not for the path component of a URL. Note that this default
    is the reverse of Python's urllib module.

    .. versionadded:: 3.1
        The ``plus`` argument
    """
    quote = urllib.parse.quote_plus if plus else urllib.parse.quote
    return quote(utf8(value))


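# A minimal usage sketch of the escaping helpers above (illustrative only;
# the input values are assumptions, not taken from this module):
#
#     >>> xhtml_escape('<div class="x">')
#     '&lt;div class=&quot;x&quot;&gt;'
#     >>> json_encode({"tag": "</script>"})      # "</" is escaped for <script> safety
#     '{"tag": "<\\/script>"}'
#     >>> url_escape("a b&c")                    # query-string style (plus=True)
#     'a+b%26c'
#     >>> url_escape("a b&c", plus=False)        # path-segment style
#     'a%20b%26c'

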
@typing.overload
def url_unescape(value: Union[str, bytes], encoding: None, plus: bool = True) -> bytes:
    pass


@typing.overload  # noqa: F811
def url_unescape(
    value: Union[str, bytes], encoding: str = "utf-8", plus: bool = True
) -> str:
    pass


def url_unescape(  # noqa: F811
    value: Union[str, bytes], encoding: Optional[str] = "utf-8", plus: bool = True
) -> Union[str, bytes]:
    """Decodes the given value from a URL.

    The argument may be either a byte or unicode string.

    If encoding is None, the result will be a byte string. Otherwise,
    the result is a unicode string in the specified encoding.

    If ``plus`` is true (the default), plus signs will be interpreted
    as spaces (literal plus signs must be represented as "%2B"). This
    is appropriate for query strings and form-encoded values but not
    for the path component of a URL. Note that this default is the
    reverse of Python's urllib module.

    .. versionadded:: 3.1
        The ``plus`` argument
    """
    if encoding is None:
        if plus:
            # unquote_to_bytes doesn't have a _plus variant
            value = to_basestring(value).replace("+", " ")
        return urllib.parse.unquote_to_bytes(value)
    else:
        unquote = urllib.parse.unquote_plus if plus else urllib.parse.unquote
        return unquote(to_basestring(value), encoding=encoding)


def parse_qs_bytes(
    qs: Union[str, bytes], keep_blank_values: bool = False, strict_parsing: bool = False
) -> Dict[str, List[bytes]]:
    """Parses a query string like urlparse.parse_qs,
    but takes bytes and returns the values as byte strings.

    Keys still become type str (interpreted as latin1 in python3!)
    because it's too painful to keep them as byte strings in
    python3 and in practice they're nearly always ascii anyway.
    """
    # This is gross, but python3 doesn't give us another way.
    # Latin1 is the universal donor of character encodings.
    if isinstance(qs, bytes):
        qs = qs.decode("latin1")
    result = urllib.parse.parse_qs(
        qs, keep_blank_values, strict_parsing, encoding="latin1", errors="strict"
    )
    encoded = {}
    for k, v in result.items():
        encoded[k] = [i.encode("latin1") for i in v]
    return encoded


_UTF8_TYPES = (bytes, type(None))


@typing.overload
def utf8(value: bytes) -> bytes:
    pass


@typing.overload  # noqa: F811
def utf8(value: str) -> bytes:
    pass


@typing.overload  # noqa: F811
def utf8(value: None) -> None:
    pass


def utf8(value: Union[None, str, bytes]) -> Optional[bytes]:  # noqa: F811
    """Converts a string argument to a byte string.

    If the argument is already a byte string or None, it is returned unchanged.
    Otherwise it must be a unicode string and is encoded as utf8.
    """
    if isinstance(value, _UTF8_TYPES):
        return value
    if not isinstance(value, unicode_type):
        raise TypeError("Expected bytes, unicode, or None; got %r" % type(value))
    return value.encode("utf-8")


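# A minimal usage sketch of the decoding helpers above (illustrative only;
# the input values are assumptions):
#
#     >>> url_unescape("a+b%26c")
#     'a b&c'
#     >>> url_unescape("a+b%26c", encoding=None)   # bytes out when encoding is None
#     b'a b&c'
#     >>> parse_qs_bytes("a=1&a=2")                # str keys, bytes values
#     {'a': [b'1', b'2']}
#     >>> utf8("caf\u00e9")
#     b'caf\xc3\xa9'

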
_TO_UNICODE_TYPES = (unicode_type, type(None))


@typing.overload
def to_unicode(value: str) -> str:
    pass


@typing.overload  # noqa: F811
def to_unicode(value: bytes) -> str:
    pass


@typing.overload  # noqa: F811
def to_unicode(value: None) -> None:
    pass


def to_unicode(value: Union[None, str, bytes]) -> Optional[str]:  # noqa: F811
    """Converts a string argument to a unicode string.

    If the argument is already a unicode string or None, it is returned
    unchanged. Otherwise it must be a byte string and is decoded as utf8.
    """
    if isinstance(value, _TO_UNICODE_TYPES):
        return value
    if not isinstance(value, bytes):
        raise TypeError("Expected bytes, unicode, or None; got %r" % type(value))
    return value.decode("utf-8")


# to_unicode was previously named _unicode not because it was private,
# but to avoid conflicts with the built-in unicode() function/type
_unicode = to_unicode

# When dealing with the standard library across python 2 and 3 it is
# sometimes useful to have a direct conversion to the native string type
native_str = to_unicode
to_basestring = to_unicode


def recursive_unicode(obj: Any) -> Any:
    """Walks a simple data structure, converting byte strings to unicode.

    Supports lists, tuples, and dictionaries.
    """
    if isinstance(obj, dict):
        return dict(
            (recursive_unicode(k), recursive_unicode(v)) for (k, v) in obj.items()
        )
    elif isinstance(obj, list):
        return list(recursive_unicode(i) for i in obj)
    elif isinstance(obj, tuple):
        return tuple(recursive_unicode(i) for i in obj)
    elif isinstance(obj, bytes):
        return to_unicode(obj)
    else:
        return obj


# I originally used the regex from
# http://daringfireball.net/2010/07/improved_regex_for_matching_urls
# but it gets all exponential on certain patterns (such as too many trailing
# dots), causing the regex matcher to never return.
# This regex should avoid those problems.
# Use to_unicode instead of tornado.util.u - we don't want backslashes getting
# processed as escapes.
_URL_RE = re.compile(
    to_unicode(
        r"""\b((?:([\w-]+):(/{1,3})|www[.])(?:(?:(?:[^\s&()]|&amp;|&quot;)*(?:[^!"#$%&'()*+,.:;<=>?@\[\]^`{|}~\s]))|(?:\((?:[^\s&()]|&amp;|&quot;)*\)))+)"""  # noqa: E501
    )
)


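# A minimal usage sketch of the conversion helpers above (illustrative only;
# the input values are assumptions):
#
#     >>> to_unicode(b"tornado")
#     'tornado'
#     >>> recursive_unicode({b"name": [b"tornado", (b"a", b"b")]})
#     {'name': ['tornado', ('a', 'b')]}

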
def linkify(
    text: Union[str, bytes],
    shorten: bool = False,
    extra_params: Union[str, Callable[[str], str]] = "",
    require_protocol: bool = False,
    permitted_protocols: List[str] = ["http", "https"],
) -> str:
    """Converts plain text into HTML with links.

    For example: ``linkify("Hello http://tornadoweb.org!")`` would return
    ``Hello <a href="http://tornadoweb.org">http://tornadoweb.org</a>!``

    Parameters:

    * ``shorten``: Long urls will be shortened for display.

    * ``extra_params``: Extra text to include in the link tag, or a callable
      taking the link as an argument and returning the extra text
      e.g. ``linkify(text, extra_params='rel="nofollow" class="external"')``,
      or::

          def extra_params_cb(url):
              if url.startswith("http://example.com"):
                  return 'class="internal"'
              else:
                  return 'class="external" rel="nofollow"'
          linkify(text, extra_params=extra_params_cb)

    * ``require_protocol``: Only linkify urls which include a protocol. If
      this is False, urls such as www.facebook.com will also be linkified.

    * ``permitted_protocols``: List (or set) of protocols which should be
      linkified, e.g. ``linkify(text, permitted_protocols=["http", "ftp",
      "mailto"])``. It is very unsafe to include protocols such as
      ``javascript``.
    """
    if extra_params and not callable(extra_params):
        extra_params = " " + extra_params.strip()

    def make_link(m: typing.Match) -> str:
        url = m.group(1)
        proto = m.group(2)
        if require_protocol and not proto:
            return url  # no protocol, no linkify

        if proto and proto not in permitted_protocols:
            return url  # bad protocol, no linkify

        href = m.group(1)
        if not proto:
            href = "http://" + href  # no proto specified, use http

        if callable(extra_params):
            params = " " + extra_params(href).strip()
        else:
            params = extra_params

        # clip long urls. max_len is just an approximation
        max_len = 30
        if shorten and len(url) > max_len:
            before_clip = url
            if proto:
                proto_len = len(proto) + 1 + len(m.group(3) or "")  # +1 for :
            else:
                proto_len = 0

            parts = url[proto_len:].split("/")
            if len(parts) > 1:
                # Grab the whole host part plus the first bit of the path
                # The path is usually not that interesting once shortened
                # (no more slug, etc), so it really just provides a little
                # extra indication of shortening.
                url = (
                    url[:proto_len]
                    + parts[0]
                    + "/"
                    + parts[1][:8].split("?")[0].split(".")[0]
                )

            if len(url) > max_len * 1.5:  # still too long
                url = url[:max_len]

            if url != before_clip:
                amp = url.rfind("&")
                # avoid splitting html char entities
                if amp > max_len - 5:
                    url = url[:amp]
                url += "..."

                if len(url) >= len(before_clip):
                    url = before_clip
                else:
                    # full url is visible on mouse-over (for those who don't
                    # have a status bar, such as Safari by default)
                    params += ' title="%s"' % href

        return u'<a href="%s"%s>%s</a>' % (href, params, url)

    # First HTML-escape so that our strings are all safe.
    # The regex is modified to avoid character entities other than &amp; so
    # that we won't pick up &quot;, etc.
    text = _unicode(xhtml_escape(text))
    return _URL_RE.sub(make_link, text)


def _convert_entity(m: typing.Match) -> str:
    if m.group(1) == "#":
        try:
            if m.group(2)[:1].lower() == "x":
                return chr(int(m.group(2)[1:], 16))
            else:
                return chr(int(m.group(2)))
        except ValueError:
            return "&#%s;" % m.group(2)
    try:
        return _HTML_UNICODE_MAP[m.group(2)]
    except KeyError:
        return "&%s;" % m.group(2)


def _build_unicode_map() -> Dict[str, str]:
    unicode_map = {}
    for name, value in html.entities.name2codepoint.items():
        unicode_map[name] = chr(value)
    return unicode_map


_HTML_UNICODE_MAP = _build_unicode_map()
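

# A minimal usage sketch of linkify and xhtml_unescape (illustrative only;
# the first linkify output mirrors the docstring's own example, the other
# inputs are assumptions):
#
#     >>> linkify("Hello http://tornadoweb.org!")
#     'Hello <a href="http://tornadoweb.org">http://tornadoweb.org</a>!'
#     >>> xhtml_unescape("&lt;b&gt;&quot;hi&quot; &amp; bye&lt;/b&gt;")
#     '<b>"hi" & bye</b>'
#     >>> xhtml_unescape("&#65;&#x42;")           # numeric and hex entities
#     'AB'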