1import unittest
2
3import tornado.escape
4from tornado.escape import (
5    utf8,
6    xhtml_escape,
7    xhtml_unescape,
8    url_escape,
9    url_unescape,
10    to_unicode,
11    json_decode,
12    json_encode,
13    squeeze,
14    recursive_unicode,
15)
16from tornado.util import unicode_type
17
18from typing import List, Tuple, Union, Dict, Any  # noqa: F401
19
# Table of linkify() cases consumed by EscapeTestCase.test_linkify.
# Each entry is (input text, keyword arguments for linkify, expected HTML).
linkify_tests = [
    # (input, linkify_kwargs, expected_output)
    # trailing punctuation stays outside the generated link
    (
        "hello http://world.com/!",
        {},
        u'hello <a href="http://world.com/">http://world.com/</a>!',
    ),
    # "&" in the URL appears as "&amp;" in both the href and the link text
    (
        "hello http://world.com/with?param=true&stuff=yes",
        {},
        u'hello <a href="http://world.com/with?param=true&amp;stuff=yes">http://world.com/with?param=true&amp;stuff=yes</a>',  # noqa: E501
    ),
    # an opened paren followed by many chars killed Gruber's regex
    (
        "http://url.com/w(aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
        {},
        u'<a href="http://url.com/w">http://url.com/w</a>(aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa',  # noqa: E501
    ),
    # as did too many dots at the end
    (
        "http://url.com/withmany.......................................",
        {},
        u'<a href="http://url.com/withmany">http://url.com/withmany</a>.......................................',  # noqa: E501
    ),
    (
        "http://url.com/withmany((((((((((((((((((((((((((((((((((a)",
        {},
        u'<a href="http://url.com/withmany">http://url.com/withmany</a>((((((((((((((((((((((((((((((((((a)',  # noqa: E501
    ),
    # some examples from http://daringfireball.net/2009/11/liberal_regex_for_matching_urls
    # plus a few extras (such as multiple parentheses).
    (
        "http://foo.com/blah_blah",
        {},
        u'<a href="http://foo.com/blah_blah">http://foo.com/blah_blah</a>',
    ),
    (
        "http://foo.com/blah_blah/",
        {},
        u'<a href="http://foo.com/blah_blah/">http://foo.com/blah_blah/</a>',
    ),
    (
        "(Something like http://foo.com/blah_blah)",
        {},
        u'(Something like <a href="http://foo.com/blah_blah">http://foo.com/blah_blah</a>)',
    ),
    (
        "http://foo.com/blah_blah_(wikipedia)",
        {},
        u'<a href="http://foo.com/blah_blah_(wikipedia)">http://foo.com/blah_blah_(wikipedia)</a>',
    ),
    (
        "http://foo.com/blah_(blah)_(wikipedia)_blah",
        {},
        u'<a href="http://foo.com/blah_(blah)_(wikipedia)_blah">http://foo.com/blah_(blah)_(wikipedia)_blah</a>',  # noqa: E501
    ),
    (
        "(Something like http://foo.com/blah_blah_(wikipedia))",
        {},
        u'(Something like <a href="http://foo.com/blah_blah_(wikipedia)">http://foo.com/blah_blah_(wikipedia)</a>)',  # noqa: E501
    ),
    (
        "http://foo.com/blah_blah.",
        {},
        u'<a href="http://foo.com/blah_blah">http://foo.com/blah_blah</a>.',
    ),
    (
        "http://foo.com/blah_blah/.",
        {},
        u'<a href="http://foo.com/blah_blah/">http://foo.com/blah_blah/</a>.',
    ),
    (
        "<http://foo.com/blah_blah>",
        {},
        u'&lt;<a href="http://foo.com/blah_blah">http://foo.com/blah_blah</a>&gt;',
    ),
    (
        "<http://foo.com/blah_blah/>",
        {},
        u'&lt;<a href="http://foo.com/blah_blah/">http://foo.com/blah_blah/</a>&gt;',
    ),
    (
        "http://foo.com/blah_blah,",
        {},
        u'<a href="http://foo.com/blah_blah">http://foo.com/blah_blah</a>,',
    ),
    (
        "http://www.example.com/wpstyle/?p=364.",
        {},
        u'<a href="http://www.example.com/wpstyle/?p=364">http://www.example.com/wpstyle/?p=364</a>.',  # noqa: E501
    ),
    # custom schemes listed in permitted_protocols
    (
        "rdar://1234",
        {"permitted_protocols": ["http", "rdar"]},
        u'<a href="rdar://1234">rdar://1234</a>',
    ),
    (
        "rdar:/1234",
        {"permitted_protocols": ["rdar"]},
        u'<a href="rdar:/1234">rdar:/1234</a>',
    ),
    # userinfo and port components
    (
        "http://userid:password@example.com:8080",
        {},
        u'<a href="http://userid:password@example.com:8080">http://userid:password@example.com:8080</a>',  # noqa: E501
    ),
    (
        "http://userid@example.com",
        {},
        u'<a href="http://userid@example.com">http://userid@example.com</a>',
    ),
    (
        "http://userid@example.com:8080",
        {},
        u'<a href="http://userid@example.com:8080">http://userid@example.com:8080</a>',
    ),
    (
        "http://userid:password@example.com",
        {},
        u'<a href="http://userid:password@example.com">http://userid:password@example.com</a>',
    ),
    (
        "message://%3c330e7f8409726r6a4ba78dkf1fd71420c1bf6ff@mail.gmail.com%3e",
        {"permitted_protocols": ["http", "message"]},
        u'<a href="message://%3c330e7f8409726r6a4ba78dkf1fd71420c1bf6ff@mail.gmail.com%3e">'
        u"message://%3c330e7f8409726r6a4ba78dkf1fd71420c1bf6ff@mail.gmail.com%3e</a>",
    ),
    # non-ASCII characters in the host and path
    (
        u"http://\u27a1.ws/\u4a39",
        {},
        u'<a href="http://\u27a1.ws/\u4a39">http://\u27a1.ws/\u4a39</a>',
    ),
    # markup surrounding the URL is escaped in the output
    (
        "<tag>http://example.com</tag>",
        {},
        u'&lt;tag&gt;<a href="http://example.com">http://example.com</a>&lt;/tag&gt;',
    ),
    # bare "www." host: the href gets an http:// prefix, the link text does not
    (
        "Just a www.example.com link.",
        {},
        u'Just a <a href="http://www.example.com">www.example.com</a> link.',
    ),
    # with require_protocol=True the same input is left as plain text
    (
        "Just a www.example.com link.",
        {"require_protocol": True},
        u"Just a www.example.com link.",
    ),
    # shorten=True: link text is truncated with "..." and the full URL is
    # carried in the title attribute
    (
        "A http://reallylong.com/link/that/exceedsthelenglimit.html",
        {"require_protocol": True, "shorten": True},
        u'A <a href="http://reallylong.com/link/that/exceedsthelenglimit.html"'
        u' title="http://reallylong.com/link/that/exceedsthelenglimit.html">http://reallylong.com/link...</a>',  # noqa: E501
    ),
    (
        "A http://reallylongdomainnamethatwillbetoolong.com/hi!",
        {"shorten": True},
        u'A <a href="http://reallylongdomainnamethatwillbetoolong.com/hi"'
        u' title="http://reallylongdomainnamethatwillbetoolong.com/hi">http://reallylongdomainnametha...</a>!',  # noqa: E501
    ),
    # file: URLs are not linked with the default kwargs
    (
        "A file:///passwords.txt and http://web.com link",
        {},
        u'A file:///passwords.txt and <a href="http://web.com">http://web.com</a> link',
    ),
    # with permitted_protocols=["file"], file: is linked and http: is not
    (
        "A file:///passwords.txt and http://web.com link",
        {"permitted_protocols": ["file"]},
        u'A <a href="file:///passwords.txt">file:///passwords.txt</a> and http://web.com link',
    ),
    # extra_params given as a string is added to the <a> tag
    (
        "www.external-link.com",
        {"extra_params": 'rel="nofollow" class="external"'},
        u'<a href="http://www.external-link.com" rel="nofollow" class="external">www.external-link.com</a>',  # noqa: E501
    ),
    # extra_params given as a callable is called with the href
    (
        "www.external-link.com and www.internal-link.com/blogs extra",
        {
            "extra_params": lambda href: 'class="internal"'
            if href.startswith("http://www.internal-link.com")
            else 'rel="nofollow" class="external"'
        },
        u'<a href="http://www.external-link.com" rel="nofollow" class="external">www.external-link.com</a>'  # noqa: E501
        u' and <a href="http://www.internal-link.com/blogs" class="internal">www.internal-link.com/blogs</a> extra',  # noqa: E501
    ),
    # whitespace around the callable's return value does not appear in the output
    (
        "www.external-link.com",
        {"extra_params": lambda href: '    rel="nofollow" class="external"  '},
        u'<a href="http://www.external-link.com" rel="nofollow" class="external">www.external-link.com</a>',  # noqa: E501
    ),
]  # type: List[Tuple[Union[str, bytes], Dict[str, Any], str]]
210
211
class EscapeTestCase(unittest.TestCase):
    """Tests for the helper functions imported from ``tornado.escape``.

    Table-driven tests run each case inside ``subTest`` so that a failure
    names the offending input and the remaining cases are still executed.
    """

    def test_linkify(self):
        """Each ``linkify_tests`` entry must produce its expected HTML."""
        for text, kwargs, html in linkify_tests:
            with self.subTest(text=text, kwargs=kwargs):
                linked = tornado.escape.linkify(text, **kwargs)
                self.assertEqual(linked, html)

    def test_xhtml_escape(self):
        """``xhtml_escape`` and ``xhtml_unescape`` are checked in both directions.

        Inputs are a mix of str and bytes; both sides are passed through
        ``utf8`` before comparison so the two kinds can share one table.
        """
        tests = [
            ("<foo>", "&lt;foo&gt;"),
            (u"<foo>", u"&lt;foo&gt;"),
            (b"<foo>", b"&lt;foo&gt;"),
            ("<>&\"'", "&lt;&gt;&amp;&quot;&#39;"),
            ("&amp;", "&amp;amp;"),
            (u"<\u00e9>", u"&lt;\u00e9&gt;"),
            (b"<\xc3\xa9>", b"&lt;\xc3\xa9&gt;"),
        ]  # type: List[Tuple[Union[str, bytes], Union[str, bytes]]]
        for unescaped, escaped in tests:
            with self.subTest(unescaped=unescaped):
                self.assertEqual(utf8(xhtml_escape(unescaped)), utf8(escaped))
                self.assertEqual(utf8(unescaped), utf8(xhtml_unescape(escaped)))

    def test_xhtml_unescape_numeric(self):
        """Numeric character references are decoded; malformed ones pass through."""
        tests = [
            ("foo&#32;bar", "foo bar"),
            ("foo&#x20;bar", "foo bar"),
            ("foo&#X20;bar", "foo bar"),
            ("foo&#xabc;bar", u"foo\u0abcbar"),
            ("foo&#xyz;bar", "foo&#xyz;bar"),  # invalid encoding
            ("foo&#;bar", "foo&#;bar"),  # invalid encoding
            ("foo&#x;bar", "foo&#x;bar"),  # invalid encoding
        ]
        for escaped, unescaped in tests:
            with self.subTest(escaped=escaped):
                self.assertEqual(unescaped, xhtml_unescape(escaped))

    def test_url_escape_unicode(self):
        """``url_escape`` accepts both byte strings and unicode strings."""
        tests = [
            # byte strings are passed through as-is
            (u"\u00e9".encode("utf8"), "%C3%A9"),
            (u"\u00e9".encode("latin1"), "%E9"),
            # unicode strings become utf8
            (u"\u00e9", "%C3%A9"),
        ]  # type: List[Tuple[Union[str, bytes], str]]
        for unescaped, escaped in tests:
            with self.subTest(unescaped=unescaped):
                self.assertEqual(url_escape(unescaped), escaped)

    def test_url_unescape_unicode(self):
        """``url_unescape`` honours the ``encoding`` argument, including ``None``."""
        tests = [
            ("%C3%A9", u"\u00e9", "utf8"),
            ("%C3%A9", u"\u00c3\u00a9", "latin1"),
            ("%C3%A9", utf8(u"\u00e9"), None),
        ]
        for escaped, unescaped, encoding in tests:
            with self.subTest(escaped=escaped, encoding=encoding):
                # input strings to url_unescape should only contain ascii
                # characters, but make sure the function accepts both byte
                # and unicode strings.
                self.assertEqual(
                    url_unescape(to_unicode(escaped), encoding), unescaped
                )
                self.assertEqual(url_unescape(utf8(escaped), encoding), unescaped)

    def test_url_escape_quote_plus(self):
        """The ``plus`` flag selects between ``+`` and ``%20`` for spaces."""
        unescaped = "+ #%"
        plus_escaped = "%2B+%23%25"
        escaped = "%2B%20%23%25"
        self.assertEqual(url_escape(unescaped), plus_escaped)
        self.assertEqual(url_escape(unescaped, plus=False), escaped)
        self.assertEqual(url_unescape(plus_escaped), unescaped)
        self.assertEqual(url_unescape(escaped, plus=False), unescaped)
        self.assertEqual(url_unescape(plus_escaped, encoding=None), utf8(unescaped))
        self.assertEqual(
            url_unescape(escaped, encoding=None, plus=False), utf8(unescaped)
        )

    def test_escape_return_types(self):
        """``xhtml_escape`` returns the expected type for str and unicode input."""
        # On python2 the escape methods should generally return the same
        # type as their argument
        self.assertIs(type(xhtml_escape("foo")), str)
        self.assertIs(type(xhtml_escape(u"foo")), unicode_type)

    def test_json_decode(self):
        """``json_decode`` accepts bytes and unicode and returns unicode strings."""
        # json_decode accepts both bytes and unicode, but strings it returns
        # are always unicode.
        self.assertEqual(json_decode(b'"foo"'), u"foo")
        self.assertEqual(json_decode(u'"foo"'), u"foo")

        # Non-ascii bytes are interpreted as utf8
        self.assertEqual(json_decode(utf8(u'"\u00e9"')), u"\u00e9")

    def test_json_encode(self):
        """A non-ASCII string survives a ``json_encode``/``json_decode`` round trip."""
        # json deals with strings, not bytes.  On python 2 byte strings will
        # convert automatically if they are utf8; on python 3 byte strings
        # are not allowed.
        self.assertEqual(json_decode(json_encode(u"\u00e9")), u"\u00e9")
        if bytes is str:
            # Only reached where bytes and str are the same type (python 2).
            self.assertEqual(json_decode(json_encode(utf8(u"\u00e9"))), u"\u00e9")
            self.assertRaises(UnicodeDecodeError, json_encode, b"\xe9")

    def test_squeeze(self):
        """``squeeze`` collapses runs of whitespace into single spaces."""
        self.assertEqual(
            squeeze(u"sequences     of    whitespace   chars"),
            u"sequences of whitespace chars",
        )

    def test_recursive_unicode(self):
        """``recursive_unicode`` decodes bytes inside dicts, lists and tuples."""
        tests = [
            ("dict", {b"foo": b"bar"}, {u"foo": u"bar"}),
            ("list", [b"foo", b"bar"], [u"foo", u"bar"]),
            ("tuple", (b"foo", b"bar"), (u"foo", u"bar")),
            ("bytes", b"foo", u"foo"),
        ]
        for kind, value, expected in tests:
            with self.subTest(kind=kind):
                self.assertEqual(recursive_unicode(value), expected)
323