"""Unit tests for the helpers in ``tornado.escape``."""

import unittest

import tornado.escape
from tornado.escape import (
    utf8,
    xhtml_escape,
    xhtml_unescape,
    url_escape,
    url_unescape,
    to_unicode,
    json_decode,
    json_encode,
    squeeze,
    recursive_unicode,
)
from tornado.util import unicode_type

from typing import List, Tuple, Union, Dict, Any  # noqa: F401

# Table of linkify cases consumed by EscapeTestCase.test_linkify.
# linkify() escapes its input before inserting anchors, so any "<", ">" or
# "&" in the input must show up as an entity in the expected output.
linkify_tests = [
    # (input, linkify_kwargs, expected_output)
    (
        "hello http://world.com/!",
        {},
        u'hello <a href="http://world.com/">http://world.com/</a>!',
    ),
    (
        "hello http://world.com/with?param=true&stuff=yes",
        {},
        u'hello <a href="http://world.com/with?param=true&amp;stuff=yes">http://world.com/with?param=true&amp;stuff=yes</a>',  # noqa: E501
    ),
    # an opened paren followed by many chars killed Gruber's regex
    (
        "http://url.com/w(aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
        {},
        u'<a href="http://url.com/w">http://url.com/w</a>(aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa',  # noqa: E501
    ),
    # as did too many dots at the end
    (
        "http://url.com/withmany.......................................",
        {},
        u'<a href="http://url.com/withmany">http://url.com/withmany</a>.......................................',  # noqa: E501
    ),
    (
        "http://url.com/withmany((((((((((((((((((((((((((((((((((a)",
        {},
        u'<a href="http://url.com/withmany">http://url.com/withmany</a>((((((((((((((((((((((((((((((((((a)',  # noqa: E501
    ),
    # some examples from http://daringfireball.net/2009/11/liberal_regex_for_matching_urls
    # plus a few extras (such as multiple parentheses).
    (
        "http://foo.com/blah_blah",
        {},
        u'<a href="http://foo.com/blah_blah">http://foo.com/blah_blah</a>',
    ),
    (
        "http://foo.com/blah_blah/",
        {},
        u'<a href="http://foo.com/blah_blah/">http://foo.com/blah_blah/</a>',
    ),
    (
        "(Something like http://foo.com/blah_blah)",
        {},
        u'(Something like <a href="http://foo.com/blah_blah">http://foo.com/blah_blah</a>)',
    ),
    (
        "http://foo.com/blah_blah_(wikipedia)",
        {},
        u'<a href="http://foo.com/blah_blah_(wikipedia)">http://foo.com/blah_blah_(wikipedia)</a>',
    ),
    (
        "http://foo.com/blah_(blah)_(wikipedia)_blah",
        {},
        u'<a href="http://foo.com/blah_(blah)_(wikipedia)_blah">http://foo.com/blah_(blah)_(wikipedia)_blah</a>',  # noqa: E501
    ),
    (
        "(Something like http://foo.com/blah_blah_(wikipedia))",
        {},
        u'(Something like <a href="http://foo.com/blah_blah_(wikipedia)">http://foo.com/blah_blah_(wikipedia)</a>)',  # noqa: E501
    ),
    (
        "http://foo.com/blah_blah.",
        {},
        u'<a href="http://foo.com/blah_blah">http://foo.com/blah_blah</a>.',
    ),
    (
        "http://foo.com/blah_blah/.",
        {},
        u'<a href="http://foo.com/blah_blah/">http://foo.com/blah_blah/</a>.',
    ),
    (
        "<http://foo.com/blah_blah>",
        {},
        u'&lt;<a href="http://foo.com/blah_blah">http://foo.com/blah_blah</a>&gt;',
    ),
    (
        "<http://foo.com/blah_blah/>",
        {},
        u'&lt;<a href="http://foo.com/blah_blah/">http://foo.com/blah_blah/</a>&gt;',
    ),
    (
        "http://foo.com/blah_blah,",
        {},
        u'<a href="http://foo.com/blah_blah">http://foo.com/blah_blah</a>,',
    ),
    (
        "http://www.example.com/wpstyle/?p=364.",
        {},
        u'<a href="http://www.example.com/wpstyle/?p=364">http://www.example.com/wpstyle/?p=364</a>.',  # noqa: E501
    ),
    (
        "rdar://1234",
        {"permitted_protocols": ["http", "rdar"]},
        u'<a href="rdar://1234">rdar://1234</a>',
    ),
    (
        "rdar:/1234",
        {"permitted_protocols": ["rdar"]},
        u'<a href="rdar:/1234">rdar:/1234</a>',
    ),
    (
        "http://userid:password@example.com:8080",
        {},
        u'<a href="http://userid:password@example.com:8080">http://userid:password@example.com:8080</a>',  # noqa: E501
    ),
    (
        "http://userid@example.com",
        {},
        u'<a href="http://userid@example.com">http://userid@example.com</a>',
    ),
    (
        "http://userid@example.com:8080",
        {},
        u'<a href="http://userid@example.com:8080">http://userid@example.com:8080</a>',
    ),
    (
        "http://userid:password@example.com",
        {},
        u'<a href="http://userid:password@example.com">http://userid:password@example.com</a>',
    ),
    (
        "message://%3c330e7f8409726r6a4ba78dkf1fd71420c1bf6ff@mail.gmail.com%3e",
        {"permitted_protocols": ["http", "message"]},
        u'<a href="message://%3c330e7f8409726r6a4ba78dkf1fd71420c1bf6ff@mail.gmail.com%3e">'
        u"message://%3c330e7f8409726r6a4ba78dkf1fd71420c1bf6ff@mail.gmail.com%3e</a>",
    ),
    (
        u"http://\u27a1.ws/\u4a39",
        {},
        u'<a href="http://\u27a1.ws/\u4a39">http://\u27a1.ws/\u4a39</a>',
    ),
    (
        "<tag>http://example.com</tag>",
        {},
        u'&lt;tag&gt;<a href="http://example.com">http://example.com</a>&lt;/tag&gt;',
    ),
    (
        "Just a www.example.com link.",
        {},
        u'Just a <a href="http://www.example.com">www.example.com</a> link.',
    ),
    (
        "Just a www.example.com link.",
        {"require_protocol": True},
        u"Just a www.example.com link.",
    ),
    (
        "A http://reallylong.com/link/that/exceedsthelenglimit.html",
        {"require_protocol": True, "shorten": True},
        u'A <a href="http://reallylong.com/link/that/exceedsthelenglimit.html"'
        u' title="http://reallylong.com/link/that/exceedsthelenglimit.html">http://reallylong.com/link...</a>',  # noqa: E501
    ),
    (
        "A http://reallylongdomainnamethatwillbetoolong.com/hi!",
        {"shorten": True},
        u'A <a href="http://reallylongdomainnamethatwillbetoolong.com/hi"'
        u' title="http://reallylongdomainnamethatwillbetoolong.com/hi">http://reallylongdomainnametha...</a>!',  # noqa: E501
    ),
    (
        "A file:///passwords.txt and http://web.com link",
        {},
        u'A file:///passwords.txt and <a href="http://web.com">http://web.com</a> link',
    ),
    (
        "A file:///passwords.txt and http://web.com link",
        {"permitted_protocols": ["file"]},
        u'A <a href="file:///passwords.txt">file:///passwords.txt</a> and http://web.com link',
    ),
    (
        "www.external-link.com",
        {"extra_params": 'rel="nofollow" class="external"'},
        u'<a href="http://www.external-link.com" rel="nofollow" class="external">www.external-link.com</a>',  # noqa: E501
    ),
    (
        "www.external-link.com and www.internal-link.com/blogs extra",
        {
            "extra_params": lambda href: 'class="internal"'
            if href.startswith("http://www.internal-link.com")
            else 'rel="nofollow" class="external"'
        },
        u'<a href="http://www.external-link.com" rel="nofollow" class="external">www.external-link.com</a>'  # noqa: E501
        u' and <a href="http://www.internal-link.com/blogs" class="internal">www.internal-link.com/blogs</a> extra',  # noqa: E501
    ),
    (
        # Surrounding whitespace returned by the callable is not expected to
        # appear in the generated tag.
        "www.external-link.com",
        {"extra_params": lambda href: ' rel="nofollow" class="external" '},
        u'<a href="http://www.external-link.com" rel="nofollow" class="external">www.external-link.com</a>',  # noqa: E501
    ),
]  # type: List[Tuple[Union[str, bytes], Dict[str, Any], str]]


class EscapeTestCase(unittest.TestCase):
    """Exercises the escaping, unescaping and encoding helpers imported above."""

    def test_linkify(self):
        """Run every ``linkify_tests`` row through ``tornado.escape.linkify``."""
        for text, kwargs, html in linkify_tests:
            # subTest keeps iterating after a failure, so one bad row does
            # not hide the rows that follow it.
            with self.subTest(text=text):
                linked = tornado.escape.linkify(text, **kwargs)
                self.assertEqual(linked, html)

    def test_xhtml_escape(self):
        """Check that escape and unescape round-trip for str and bytes input."""
        tests = [
            ("<foo>", "&lt;foo&gt;"),
            (u"<foo>", u"&lt;foo&gt;"),
            (b"<foo>", b"&lt;foo&gt;"),
            # NOTE(review): the apostrophe is expected as "&#39;" here; some
            # Tornado releases may emit "&#x27;" instead -- confirm against
            # the Tornado version this suite targets.
            ("<>&\"'", "&lt;&gt;&amp;&quot;&#39;"),
            # An already-escaped ampersand is escaped again.
            ("&amp;", "&amp;amp;"),
            (u"<\u00e9>", u"&lt;\u00e9&gt;"),
            (b"<\xc3\xa9>", b"&lt;\xc3\xa9&gt;"),
        ]  # type: List[Tuple[Union[str, bytes], Union[str, bytes]]]
        for unescaped, escaped in tests:
            # Compare as utf8 bytes so str and bytes rows share one assertion.
            self.assertEqual(utf8(xhtml_escape(unescaped)), utf8(escaped))
            self.assertEqual(utf8(unescaped), utf8(xhtml_unescape(escaped)))

    def test_xhtml_unescape_numeric(self):
        """Check decoding of decimal and hexadecimal character references."""
        tests = [
            ("foo&#32;bar", "foo bar"),
            ("foo&#x20;bar", "foo bar"),
            ("foo&#X20;bar", "foo bar"),
            ("foo&#xabc;bar", u"foo\u0abcbar"),
            ("foo&#xyz;bar", "foo&#xyz;bar"),  # invalid encoding
            ("foo&#;bar", "foo&#;bar"),  # invalid encoding
            ("foo&#x;bar", "foo&#x;bar"),  # invalid encoding
        ]
        for escaped, unescaped in tests:
            self.assertEqual(unescaped, xhtml_unescape(escaped))

    def test_url_escape_unicode(self):
        """Check ``url_escape`` output for byte-string and unicode input."""
        tests = [
            # byte strings are passed through as-is
            (u"\u00e9".encode("utf8"), "%C3%A9"),
            (u"\u00e9".encode("latin1"), "%E9"),
            # unicode strings become utf8
            (u"\u00e9", "%C3%A9"),
        ]  # type: List[Tuple[Union[str, bytes], str]]
        for unescaped, escaped in tests:
            self.assertEqual(url_escape(unescaped), escaped)

    def test_url_unescape_unicode(self):
        """Check ``url_unescape`` with utf8, latin1 and no (``None``) encoding."""
        tests = [
            ("%C3%A9", u"\u00e9", "utf8"),
            ("%C3%A9", u"\u00c3\u00a9", "latin1"),
            ("%C3%A9", utf8(u"\u00e9"), None),
        ]
        for escaped, unescaped, encoding in tests:
            # input strings to url_unescape should only contain ascii
            # characters, but make sure the function accepts both byte
            # and unicode strings.
            self.assertEqual(url_unescape(to_unicode(escaped), encoding), unescaped)
            self.assertEqual(url_unescape(utf8(escaped), encoding), unescaped)

    def test_url_escape_quote_plus(self):
        """Check the ``plus`` flag of ``url_escape`` and ``url_unescape``."""
        unescaped = "+ #%"
        plus_escaped = "%2B+%23%25"
        escaped = "%2B%20%23%25"
        self.assertEqual(url_escape(unescaped), plus_escaped)
        self.assertEqual(url_escape(unescaped, plus=False), escaped)
        self.assertEqual(url_unescape(plus_escaped), unescaped)
        self.assertEqual(url_unescape(escaped, plus=False), unescaped)
        self.assertEqual(url_unescape(plus_escaped, encoding=None), utf8(unescaped))
        self.assertEqual(
            url_unescape(escaped, encoding=None, plus=False), utf8(unescaped)
        )

    def test_escape_return_types(self):
        """Check the result type of ``xhtml_escape`` for native and unicode input."""
        # On python2 the escape methods should generally return the same
        # type as their argument
        self.assertEqual(type(xhtml_escape("foo")), str)
        self.assertEqual(type(xhtml_escape(u"foo")), unicode_type)

    def test_json_decode(self):
        """Check that ``json_decode`` accepts bytes and unicode input."""
        # json_decode accepts both bytes and unicode, but strings it returns
        # are always unicode.
        self.assertEqual(json_decode(b'"foo"'), u"foo")
        self.assertEqual(json_decode(u'"foo"'), u"foo")

        # Non-ascii bytes are interpreted as utf8
        self.assertEqual(json_decode(utf8(u'"\u00e9"')), u"\u00e9")

    def test_json_encode(self):
        """Check that a non-ascii string survives an encode/decode round trip."""
        # json deals with strings, not bytes.  On python 2 byte strings will
        # convert automatically if they are utf8; on python 3 byte strings
        # are not allowed.
        self.assertEqual(json_decode(json_encode(u"\u00e9")), u"\u00e9")
        if bytes is str:
            # Only true on python 2, where bytes and str are the same type.
            self.assertEqual(json_decode(json_encode(utf8(u"\u00e9"))), u"\u00e9")
            self.assertRaises(UnicodeDecodeError, json_encode, b"\xe9")

    def test_squeeze(self):
        """Check that runs of whitespace collapse to a single space."""
        self.assertEqual(
            squeeze(u"sequences     of    whitespace   chars"),
            u"sequences of whitespace chars",
        )

    def test_recursive_unicode(self):
        """Check that bytes inside dicts, lists and tuples are decoded."""
        tests = {
            "dict": {b"foo": b"bar"},
            "list": [b"foo", b"bar"],
            "tuple": (b"foo", b"bar"),
            "bytes": b"foo",
        }
        self.assertEqual(recursive_unicode(tests["dict"]), {u"foo": u"bar"})
        self.assertEqual(recursive_unicode(tests["list"]), [u"foo", u"bar"])
        self.assertEqual(recursive_unicode(tests["tuple"]), (u"foo", u"bar"))
        self.assertEqual(recursive_unicode(tests["bytes"]), u"foo")