1#!/usr/bin/env python 2# 3# Copyright 2009 Facebook 4# 5# Licensed under the Apache License, Version 2.0 (the "License"); you may 6# not use this file except in compliance with the License. You may obtain 7# a copy of the License at 8# 9# http://www.apache.org/licenses/LICENSE-2.0 10# 11# Unless required by applicable law or agreed to in writing, software 12# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 13# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 14# License for the specific language governing permissions and limitations 15# under the License. 16 17"""Escaping/unescaping methods for HTML, JSON, URLs, and others. 18 19Also includes a few other miscellaneous string manipulation functions that 20have crept in over time. 21""" 22 23from __future__ import absolute_import, division, print_function 24 25import json 26import re 27 28from tornado.util import PY3, unicode_type, basestring_type 29 30if PY3: 31 from urllib.parse import parse_qs as _parse_qs 32 import html.entities as htmlentitydefs 33 import urllib.parse as urllib_parse 34 unichr = chr 35else: 36 from urlparse import parse_qs as _parse_qs 37 import htmlentitydefs 38 import urllib as urllib_parse 39 40try: 41 import typing # noqa 42except ImportError: 43 pass 44 45 46_XHTML_ESCAPE_RE = re.compile('[&<>"\']') 47_XHTML_ESCAPE_DICT = {'&': '&', '<': '<', '>': '>', '"': '"', 48 '\'': '''} 49 50 51def xhtml_escape(value): 52 """Escapes a string so it is valid within HTML or XML. 53 54 Escapes the characters ``<``, ``>``, ``"``, ``'``, and ``&``. 55 When used in attribute values the escaped strings must be enclosed 56 in quotes. 57 58 .. versionchanged:: 3.2 59 60 Added the single quote to the list of escaped characters. 61 """ 62 return _XHTML_ESCAPE_RE.sub(lambda match: _XHTML_ESCAPE_DICT[match.group(0)], 63 to_basestring(value)) 64 65 66def xhtml_unescape(value): 67 """Un-escapes an XML-escaped string.""" 68 return re.sub(r"&(#?)(\w+?);", _convert_entity, _unicode(value)) 69 70 71# The fact that json_encode wraps json.dumps is an implementation detail. 72# Please see https://github.com/tornadoweb/tornado/pull/706 73# before sending a pull request that adds **kwargs to this function. 74def json_encode(value): 75 """JSON-encodes the given Python object.""" 76 # JSON permits but does not require forward slashes to be escaped. 77 # This is useful when json data is emitted in a <script> tag 78 # in HTML, as it prevents </script> tags from prematurely terminating 79 # the javascript. Some json libraries do this escaping by default, 80 # although python's standard library does not, so we do it here. 81 # http://stackoverflow.com/questions/1580647/json-why-are-forward-slashes-escaped 82 return json.dumps(value).replace("</", "<\\/") 83 84 85def json_decode(value): 86 """Returns Python objects for the given JSON string.""" 87 return json.loads(to_basestring(value)) 88 89 90def squeeze(value): 91 """Replace all sequences of whitespace chars with a single space.""" 92 return re.sub(r"[\x00-\x20]+", " ", value).strip() 93 94 95def url_escape(value, plus=True): 96 """Returns a URL-encoded version of the given value. 97 98 If ``plus`` is true (the default), spaces will be represented 99 as "+" instead of "%20". This is appropriate for query strings 100 but not for the path component of a URL. Note that this default 101 is the reverse of Python's urllib module. 102 103 .. versionadded:: 3.1 104 The ``plus`` argument 105 """ 106 quote = urllib_parse.quote_plus if plus else urllib_parse.quote 107 return quote(utf8(value)) 108 109 110# python 3 changed things around enough that we need two separate 111# implementations of url_unescape. We also need our own implementation 112# of parse_qs since python 3's version insists on decoding everything. 113if not PY3: 114 def url_unescape(value, encoding='utf-8', plus=True): 115 """Decodes the given value from a URL. 116 117 The argument may be either a byte or unicode string. 118 119 If encoding is None, the result will be a byte string. Otherwise, 120 the result is a unicode string in the specified encoding. 121 122 If ``plus`` is true (the default), plus signs will be interpreted 123 as spaces (literal plus signs must be represented as "%2B"). This 124 is appropriate for query strings and form-encoded values but not 125 for the path component of a URL. Note that this default is the 126 reverse of Python's urllib module. 127 128 .. versionadded:: 3.1 129 The ``plus`` argument 130 """ 131 unquote = (urllib_parse.unquote_plus if plus else urllib_parse.unquote) 132 if encoding is None: 133 return unquote(utf8(value)) 134 else: 135 return unicode_type(unquote(utf8(value)), encoding) 136 137 parse_qs_bytes = _parse_qs 138else: 139 def url_unescape(value, encoding='utf-8', plus=True): 140 """Decodes the given value from a URL. 141 142 The argument may be either a byte or unicode string. 143 144 If encoding is None, the result will be a byte string. Otherwise, 145 the result is a unicode string in the specified encoding. 146 147 If ``plus`` is true (the default), plus signs will be interpreted 148 as spaces (literal plus signs must be represented as "%2B"). This 149 is appropriate for query strings and form-encoded values but not 150 for the path component of a URL. Note that this default is the 151 reverse of Python's urllib module. 152 153 .. versionadded:: 3.1 154 The ``plus`` argument 155 """ 156 if encoding is None: 157 if plus: 158 # unquote_to_bytes doesn't have a _plus variant 159 value = to_basestring(value).replace('+', ' ') 160 return urllib_parse.unquote_to_bytes(value) 161 else: 162 unquote = (urllib_parse.unquote_plus if plus 163 else urllib_parse.unquote) 164 return unquote(to_basestring(value), encoding=encoding) 165 166 def parse_qs_bytes(qs, keep_blank_values=False, strict_parsing=False): 167 """Parses a query string like urlparse.parse_qs, but returns the 168 values as byte strings. 169 170 Keys still become type str (interpreted as latin1 in python3!) 171 because it's too painful to keep them as byte strings in 172 python3 and in practice they're nearly always ascii anyway. 173 """ 174 # This is gross, but python3 doesn't give us another way. 175 # Latin1 is the universal donor of character encodings. 176 result = _parse_qs(qs, keep_blank_values, strict_parsing, 177 encoding='latin1', errors='strict') 178 encoded = {} 179 for k, v in result.items(): 180 encoded[k] = [i.encode('latin1') for i in v] 181 return encoded 182 183 184_UTF8_TYPES = (bytes, type(None)) 185 186 187def utf8(value): 188 # type: (typing.Union[bytes,unicode_type,None])->typing.Union[bytes,None] 189 """Converts a string argument to a byte string. 190 191 If the argument is already a byte string or None, it is returned unchanged. 192 Otherwise it must be a unicode string and is encoded as utf8. 193 """ 194 if isinstance(value, _UTF8_TYPES): 195 return value 196 if not isinstance(value, unicode_type): 197 raise TypeError( 198 "Expected bytes, unicode, or None; got %r" % type(value) 199 ) 200 return value.encode("utf-8") 201 202 203_TO_UNICODE_TYPES = (unicode_type, type(None)) 204 205 206def to_unicode(value): 207 """Converts a string argument to a unicode string. 208 209 If the argument is already a unicode string or None, it is returned 210 unchanged. Otherwise it must be a byte string and is decoded as utf8. 211 """ 212 if isinstance(value, _TO_UNICODE_TYPES): 213 return value 214 if not isinstance(value, bytes): 215 raise TypeError( 216 "Expected bytes, unicode, or None; got %r" % type(value) 217 ) 218 return value.decode("utf-8") 219 220 221# to_unicode was previously named _unicode not because it was private, 222# but to avoid conflicts with the built-in unicode() function/type 223_unicode = to_unicode 224 225# When dealing with the standard library across python 2 and 3 it is 226# sometimes useful to have a direct conversion to the native string type 227if str is unicode_type: 228 native_str = to_unicode 229else: 230 native_str = utf8 231 232_BASESTRING_TYPES = (basestring_type, type(None)) 233 234 235def to_basestring(value): 236 """Converts a string argument to a subclass of basestring. 237 238 In python2, byte and unicode strings are mostly interchangeable, 239 so functions that deal with a user-supplied argument in combination 240 with ascii string constants can use either and should return the type 241 the user supplied. In python3, the two types are not interchangeable, 242 so this method is needed to convert byte strings to unicode. 243 """ 244 if isinstance(value, _BASESTRING_TYPES): 245 return value 246 if not isinstance(value, bytes): 247 raise TypeError( 248 "Expected bytes, unicode, or None; got %r" % type(value) 249 ) 250 return value.decode("utf-8") 251 252 253def recursive_unicode(obj): 254 """Walks a simple data structure, converting byte strings to unicode. 255 256 Supports lists, tuples, and dictionaries. 257 """ 258 if isinstance(obj, dict): 259 return dict((recursive_unicode(k), recursive_unicode(v)) for (k, v) in obj.items()) 260 elif isinstance(obj, list): 261 return list(recursive_unicode(i) for i in obj) 262 elif isinstance(obj, tuple): 263 return tuple(recursive_unicode(i) for i in obj) 264 elif isinstance(obj, bytes): 265 return to_unicode(obj) 266 else: 267 return obj 268 269 270# I originally used the regex from 271# http://daringfireball.net/2010/07/improved_regex_for_matching_urls 272# but it gets all exponential on certain patterns (such as too many trailing 273# dots), causing the regex matcher to never return. 274# This regex should avoid those problems. 275# Use to_unicode instead of tornado.util.u - we don't want backslashes getting 276# processed as escapes. 277_URL_RE = re.compile(to_unicode(r"""\b((?:([\w-]+):(/{1,3})|www[.])(?:(?:(?:[^\s&()]|&|")*(?:[^!"#$%&'()*+,.:;<=>?@\[\]^`{|}~\s]))|(?:\((?:[^\s&()]|&|")*\)))+)""")) 278 279 280def linkify(text, shorten=False, extra_params="", 281 require_protocol=False, permitted_protocols=["http", "https"]): 282 """Converts plain text into HTML with links. 283 284 For example: ``linkify("Hello http://tornadoweb.org!")`` would return 285 ``Hello <a href="http://tornadoweb.org">http://tornadoweb.org</a>!`` 286 287 Parameters: 288 289 * ``shorten``: Long urls will be shortened for display. 290 291 * ``extra_params``: Extra text to include in the link tag, or a callable 292 taking the link as an argument and returning the extra text 293 e.g. ``linkify(text, extra_params='rel="nofollow" class="external"')``, 294 or:: 295 296 def extra_params_cb(url): 297 if url.startswith("http://example.com"): 298 return 'class="internal"' 299 else: 300 return 'class="external" rel="nofollow"' 301 linkify(text, extra_params=extra_params_cb) 302 303 * ``require_protocol``: Only linkify urls which include a protocol. If 304 this is False, urls such as www.facebook.com will also be linkified. 305 306 * ``permitted_protocols``: List (or set) of protocols which should be 307 linkified, e.g. ``linkify(text, permitted_protocols=["http", "ftp", 308 "mailto"])``. It is very unsafe to include protocols such as 309 ``javascript``. 310 """ 311 if extra_params and not callable(extra_params): 312 extra_params = " " + extra_params.strip() 313 314 def make_link(m): 315 url = m.group(1) 316 proto = m.group(2) 317 if require_protocol and not proto: 318 return url # not protocol, no linkify 319 320 if proto and proto not in permitted_protocols: 321 return url # bad protocol, no linkify 322 323 href = m.group(1) 324 if not proto: 325 href = "http://" + href # no proto specified, use http 326 327 if callable(extra_params): 328 params = " " + extra_params(href).strip() 329 else: 330 params = extra_params 331 332 # clip long urls. max_len is just an approximation 333 max_len = 30 334 if shorten and len(url) > max_len: 335 before_clip = url 336 if proto: 337 proto_len = len(proto) + 1 + len(m.group(3) or "") # +1 for : 338 else: 339 proto_len = 0 340 341 parts = url[proto_len:].split("/") 342 if len(parts) > 1: 343 # Grab the whole host part plus the first bit of the path 344 # The path is usually not that interesting once shortened 345 # (no more slug, etc), so it really just provides a little 346 # extra indication of shortening. 347 url = url[:proto_len] + parts[0] + "/" + \ 348 parts[1][:8].split('?')[0].split('.')[0] 349 350 if len(url) > max_len * 1.5: # still too long 351 url = url[:max_len] 352 353 if url != before_clip: 354 amp = url.rfind('&') 355 # avoid splitting html char entities 356 if amp > max_len - 5: 357 url = url[:amp] 358 url += "..." 359 360 if len(url) >= len(before_clip): 361 url = before_clip 362 else: 363 # full url is visible on mouse-over (for those who don't 364 # have a status bar, such as Safari by default) 365 params += ' title="%s"' % href 366 367 return u'<a href="%s"%s>%s</a>' % (href, params, url) 368 369 # First HTML-escape so that our strings are all safe. 370 # The regex is modified to avoid character entites other than & so 371 # that we won't pick up ", etc. 372 text = _unicode(xhtml_escape(text)) 373 return _URL_RE.sub(make_link, text) 374 375 376def _convert_entity(m): 377 if m.group(1) == "#": 378 try: 379 if m.group(2)[:1].lower() == 'x': 380 return unichr(int(m.group(2)[1:], 16)) 381 else: 382 return unichr(int(m.group(2))) 383 except ValueError: 384 return "&#%s;" % m.group(2) 385 try: 386 return _HTML_UNICODE_MAP[m.group(2)] 387 except KeyError: 388 return "&%s;" % m.group(2) 389 390 391def _build_unicode_map(): 392 unicode_map = {} 393 for name, value in htmlentitydefs.name2codepoint.items(): 394 unicode_map[name] = unichr(value) 395 return unicode_map 396 397 398_HTML_UNICODE_MAP = _build_unicode_map() 399