1from __future__ import unicode_literals
2
3import re
4
5import pytest
6from six.moves.urllib_parse import quote_plus
7
8from bleach import linkify, DEFAULT_CALLBACKS as DC
9from bleach.linkifier import Linker, LinkifyFilter
10from bleach.sanitizer import Cleaner
11
12
def test_empty():
    """Linkifying the empty string gives back the empty string."""
    result = linkify('')
    assert result == ''
15
16
def test_simple_link():
    """http, https and scheme-less URLs embedded in text are each linked."""
    cases = [
        (
            'a http://example.com link',
            'a <a href="http://example.com" rel="nofollow">http://example.com</a> link',
        ),
        (
            'a https://example.com link',
            'a <a href="https://example.com" rel="nofollow">https://example.com</a> link',
        ),
        # No scheme in the text: the href is given an http:// prefix.
        (
            'a example.com link',
            'a <a href="http://example.com" rel="nofollow">example.com</a> link',
        ),
    ]
    for text, expected in cases:
        assert linkify(text) == expected
30
31
def test_trailing_slash():
    """A trailing slash is kept as part of both the href and the link text."""
    cases = [
        (
            'http://examp.com/',
            '<a href="http://examp.com/" rel="nofollow">http://examp.com/</a>',
        ),
        (
            'http://example.com/foo/',
            '<a href="http://example.com/foo/" rel="nofollow">http://example.com/foo/</a>',
        ),
        (
            'http://example.com/foo/bar/',
            '<a href="http://example.com/foo/bar/" rel="nofollow">http://example.com/foo/bar/</a>',
        ),
    ]
    for url, expected in cases:
        assert linkify(url) == expected
45
46
def test_mangle_link():
    """A callback may rewrite the href attribute of a link."""
    href_key = (None, 'href')

    def filter_url(attrs, new=False):
        # Links already pointing at the bouncer are left untouched.
        if attrs.get(href_key, '').startswith('http://bouncer'):
            return attrs
        quoted = quote_plus(attrs[href_key])
        attrs[href_key] = 'http://bouncer/?u={0!s}'.format(quoted)
        return attrs

    result = linkify('http://example.com', callbacks=DC + [filter_url])
    expected = (
        '<a href="http://bouncer/?u=http%3A%2F%2Fexample.com" rel="nofollow">http://example.com</a>'
    )
    assert result == expected
59
60
def test_mangle_text():
    """A callback may replace the inner text of new and existing links."""

    def replace_text(attrs, new=False):
        attrs['_text'] = 'bar'
        return attrs

    text = 'http://ex.mp <a href="http://ex.mp/foo">foo</a>'
    expected = '<a href="http://ex.mp">bar</a> <a href="http://ex.mp/foo">bar</a>'

    result = linkify(text, callbacks=[replace_text])
    assert result == expected
72
73
@pytest.mark.parametrize('data,parse_email,expected', [
    # parse_email off: addresses stay as plain text.
    ('a james@example.com mailto', False, 'a james@example.com mailto'),
    ('a james@example.com.au mailto', False, 'a james@example.com.au mailto'),

    # parse_email on: addresses become mailto: links.
    (
        'a james@example.com mailto',
        True,
        'a <a href="mailto:james@example.com">james@example.com</a> mailto',
    ),
    (
        'aussie james@example.com.au mailto',
        True,
        'aussie <a href="mailto:james@example.com.au">james@example.com.au</a> mailto',
    ),

    # Pathological case: an existing anchor whose href is a bare address.
    # Best effort only.
    (
        'email to <a href="james@example.com">james@example.com</a>',
        True,
        'email to <a href="james@example.com" rel="nofollow">james@example.com</a>',
    ),

    # Address immediately following a tag.
    (
        '<br>jinkyun@example.com',
        True,
        '<br><a href="mailto:jinkyun@example.com">jinkyun@example.com</a>',
    ),

    # Address at the end of a sentence: the final period stays outside.
    (
        'mailto james@example.com.au.',
        True,
        'mailto <a href="mailto:james@example.com.au">james@example.com.au</a>.',
    ),

    # Invalid address: not linked.
    ('"\\\n"@opa.ru', True, '"\\\n"@opa.ru'),
])
def test_email_link(data, parse_email, expected):
    """Email addresses are linked only when parse_email is enabled."""
    result = linkify(data, parse_email=parse_email)
    assert result == expected
122
123
@pytest.mark.parametrize('data, expected', [
    ('"james"@example.com', '''<a href='mailto:"james"@example.com'>"james"@example.com</a>'''),
    (
        '"j\'ames"@example.com',
        '''<a href="mailto:&quot;j'ames&quot;@example.com">"j'ames"@example.com</a>''',
    ),
    ('"ja>mes"@example.com', '''<a href='mailto:"ja>mes"@example.com'>"ja&gt;mes"@example.com</a>'''),
])
def test_email_link_escaping(data, expected):
    """Quoted local parts are escaped/quoted as expected in href and text."""
    result = linkify(data, parse_email=True)
    assert result == expected
140
141
def no_new_links(attrs, new=False):
    """Callback: return None when ``new`` is true, otherwise ``attrs``."""
    return None if new else attrs
146
147
def no_old_links(attrs, new=False):
    """Callback: return ``attrs`` when ``new`` is true, otherwise None."""
    return attrs if new else None
152
153
def noop(attrs, new=False):
    """Callback that hands ``attrs`` back untouched, ignoring ``new``."""
    return attrs
156
157
@pytest.mark.parametrize('callback,expected', [
    # No callback rejects anything: both links are present.
    ([noop], 'a <a href="http://ex.mp">ex.mp</a> <a href="http://example.com">example</a>'),

    # New links rejected, regardless of callback order.
    ([no_new_links, noop], 'a ex.mp <a href="http://example.com">example</a>'),
    ([noop, no_new_links], 'a ex.mp <a href="http://example.com">example</a>'),

    # Existing links rejected, regardless of callback order.
    ([no_old_links, noop], 'a <a href="http://ex.mp">ex.mp</a> example'),
    ([noop, no_old_links], 'a <a href="http://ex.mp">ex.mp</a> example'),

    # Both kinds rejected.
    ([no_old_links, no_new_links], 'a ex.mp example'),
])
def test_prevent_links(callback, expected):
    """A None result from any callback removes the link or stops it from
    being created."""
    source = 'a ex.mp <a href="http://example.com">example</a>'
    result = linkify(source, callbacks=callback)
    assert result == expected
189
190
def test_set_attrs():
    """A callback may add arbitrary attributes to a link."""

    def add_rev(attrs, new=False):
        attrs[(None, 'rev')] = 'canonical'
        return attrs

    result = linkify('ex.mp', callbacks=[add_rev])
    expected = '<a href="http://ex.mp" rev="canonical">ex.mp</a>'
    assert result == expected
202
203
def test_only_proto_links():
    """New links are created only when the text carries a protocol."""

    def only_proto(attrs, new=False):
        # Existing links pass through untouched.
        if not new:
            return attrs
        if attrs['_text'].startswith(('http:', 'https:')):
            return attrs
        return None

    source = 'a ex.mp http://ex.mp <a href="/foo">bar</a>'
    expected = 'a ex.mp <a href="http://ex.mp">http://ex.mp</a> <a href="/foo">bar</a>'

    result = linkify(source, callbacks=[only_proto])
    assert result == expected
216
217
def test_stop_email():
    """A callback returning None for mailto: hrefs keeps the address unlinked."""

    def reject_mailto(attrs, new=False):
        is_mailto = attrs[(None, 'href')].startswith('mailto:')
        return None if is_mailto else attrs

    source = 'do not link james@example.com'
    result = linkify(source, parse_email=True, callbacks=[reject_mailto])
    assert result == source
227
228
@pytest.mark.parametrize('data,expected', [
    # Recognised top-level domains are linked.
    ('example.com', '<a href="http://example.com" rel="nofollow">example.com</a>'),
    ('example.co', '<a href="http://example.co" rel="nofollow">example.co</a>'),
    ('example.co.uk', '<a href="http://example.co.uk" rel="nofollow">example.co.uk</a>'),
    ('example.edu', '<a href="http://example.edu" rel="nofollow">example.edu</a>'),
    ('example.xxx', '<a href="http://example.xxx" rel="nofollow">example.xxx</a>'),
    ('bit.ly/fun', '<a href="http://bit.ly/fun" rel="nofollow">bit.ly/fun</a>'),

    # Anything else is left as plain text.
    ('example.yyy', 'example.yyy'),
    ('brie', 'brie'),
])
def test_tlds(data, expected):
    """Scheme-less text is linked only when it ends in a recognised TLD."""
    result = linkify(data)
    assert result == expected
244
245
def test_escaping():
    """A stray ``<`` in the text comes out escaped as ``&lt;``."""
    result = linkify('< unrelated')
    assert result == '&lt; unrelated'
248
249
def test_nofollow_off():
    """With an empty callback list no rel="nofollow" is added."""
    result = linkify('example.com', callbacks=[])
    expected = '<a href="http://example.com">example.com</a>'
    assert result == expected
252
253
def test_link_in_html():
    """URLs nested inside other elements are linked in place."""
    cases = [
        (
            '<i>http://yy.com</i>',
            '<i><a href="http://yy.com" rel="nofollow">http://yy.com</a></i>',
        ),
        (
            '<em><strong>http://xx.com</strong></em>',
            '<em><strong><a href="http://xx.com" rel="nofollow">http://xx.com</a></strong></em>',
        ),
    ]
    for markup, expected in cases:
        assert linkify(markup) == expected
263
264
def test_links_https():
    """An https URL is linked with its scheme preserved."""
    result = linkify('https://yy.com')
    expected = '<a href="https://yy.com" rel="nofollow">https://yy.com</a>'
    assert result == expected
270
271
def test_add_rel_nofollow():
    """An already-present link gains rel="nofollow"."""
    existing = '<a href="http://yy.com">http://yy.com</a>'
    expected = '<a href="http://yy.com" rel="nofollow">http://yy.com</a>'
    assert linkify(existing) == expected
278
279
def test_url_with_path():
    """A URL with a multi-segment path is linked in full."""
    url = 'http://example.com/path/to/file'
    expected = (
        '<a href="http://example.com/path/to/file" rel="nofollow">'
        'http://example.com/path/to/file</a>'
    )
    assert linkify(url) == expected
286
287
def test_link_ftp():
    """An ftp:// URL is linked with its scheme preserved."""
    url = 'ftp://ftp.mozilla.org/some/file'
    expected = (
        '<a href="ftp://ftp.mozilla.org/some/file" rel="nofollow">'
        'ftp://ftp.mozilla.org/some/file</a>'
    )
    assert linkify(url) == expected
294
295
def test_link_query():
    """Query strings are included in the link, with or without scheme/slash."""
    cases = [
        (
            'http://xx.com/?test=win',
            '<a href="http://xx.com/?test=win" rel="nofollow">http://xx.com/?test=win</a>',
        ),
        (
            'xx.com/?test=win',
            '<a href="http://xx.com/?test=win" rel="nofollow">xx.com/?test=win</a>',
        ),
        (
            'xx.com?test=win',
            '<a href="http://xx.com?test=win" rel="nofollow">xx.com?test=win</a>',
        ),
    ]
    for url, expected in cases:
        assert linkify(url) == expected
309
310
def test_link_fragment():
    """A ``#fragment`` is included in the link."""
    result = linkify('http://xx.com/path#frag')
    expected = '<a href="http://xx.com/path#frag" rel="nofollow">http://xx.com/path#frag</a>'
    assert result == expected
316
317
def test_link_entities():
    """A raw ``&`` in a URL is emitted as ``&amp;`` in href and text."""
    result = linkify('http://xx.com/?a=1&b=2')
    expected = (
        '<a href="http://xx.com/?a=1&amp;b=2" rel="nofollow">http://xx.com/?a=1&amp;b=2</a>'
    )
    assert result == expected
323
324
def test_escaped_html():
    """Already-escaped HTML passed in comes back out still escaped."""
    escaped = '&lt;em&gt;strong&lt;/em&gt;'
    result = linkify(escaped)
    assert result == escaped
329
330
def test_link_http_complete():
    """A URL with credentials, path, query and fragment is linked whole."""
    url = 'https://user:pass@ftp.mozilla.org/x/y.exe?a=b&c=d&e#f'
    expected = (
        '<a href="https://user:pass@ftp.mozilla.org/x/y.exe?a=b&amp;c=d&amp;e#f" rel="nofollow">'
        'https://user:pass@ftp.mozilla.org/x/y.exe?a=b&amp;c=d&amp;e#f</a>'
    )
    assert linkify(url) == expected
337
338
def test_non_url():
    """``document.vulnerable`` must never be turned into a link."""
    text = 'document.vulnerable'
    result = linkify(text)
    assert result == text
343
344
def test_javascript_url():
    """``javascript:`` URLs must never be turned into links."""
    text = 'javascript:document.vulnerable'
    result = linkify(text)
    assert result == text
349
350
def test_unsafe_url():
    """Unsafe characters ({}[]<>, etc.) around a URL are kept out of the link."""
    text = 'All your{"xx.yy.com/grover.png"}base are'
    expected = (
        'All your{"<a href="http://xx.yy.com/grover.png" rel="nofollow">xx.yy.com/grover.png</a>"}'
        'base are'
    )
    assert linkify(text) == expected
358
359
def test_skip_tags():
    """Text inside tags listed in skip_tags is not linkified."""
    # Plain URL outside and inside <pre>.
    source = 'http://xx.com <pre>http://xx.com</pre>'
    outside_only = (
        '<a href="http://xx.com" rel="nofollow">http://xx.com</a> '
        '<pre>http://xx.com</pre>'
    )
    everywhere = (
        '<a href="http://xx.com" rel="nofollow">http://xx.com</a> '
        '<pre><a href="http://xx.com" rel="nofollow">http://xx.com'
        '</a></pre>'
    )
    assert linkify(source, skip_tags=['pre']) == outside_only
    assert linkify(source) == everywhere

    # An existing link inside <pre> gets nofollow with or without skip_tags.
    existing = '<pre><a href="http://xx.com">xx</a></pre>'
    existing_nofollow = '<pre><a href="http://xx.com" rel="nofollow">xx</a></pre>'
    assert linkify(existing) == existing_nofollow
    assert linkify(existing, skip_tags=['pre']) == existing_nofollow

    # Skipping also covers elements nested inside the skipped tag, and
    # linkification resumes after the skipped tag closes.
    nested = '<pre><code>http://example.com</code></pre>http://example.com'
    nested_expected = (
        '<pre><code>http://example.com</code></pre>'
        '<a href="http://example.com" rel="nofollow">http://example.com</a>'
    )
    assert linkify(nested, skip_tags=['pre']) == nested_expected
383
384
def test_libgl():
    """``libgl.so.1`` is left as plain text rather than linked."""
    text = 'libgl.so.1'
    result = linkify(text)
    assert result == text
389
390
@pytest.mark.parametrize('url,periods', [
    ('example.com', '.'),
    ('example.com', '...'),
    ('ex.com/foo', '.'),
    ('ex.com/foo', '....'),
])
def test_end_of_sentence(url, periods):
    """Trailing periods after a URL stay outside the generated link."""
    text = '{0!s}{1!s}'.format(url, periods)
    result = linkify(text)

    expected = '<a href="http://{0!s}" rel="nofollow">{0!s}</a>{1!s}'.format(url, periods)
    assert result == expected
403
404
def test_end_of_clause():
    """A comma directly after a URL is not swallowed into the link."""
    result = linkify('ex.com/foo, bar')
    expected = '<a href="http://ex.com/foo" rel="nofollow">ex.com/foo</a>, bar'
    assert result == expected
411
412
def test_sarcasm():
    """An unknown ``<sarcasm/>`` tag is escaped in the output."""
    result = linkify('Yeah right <sarcasm/>')
    assert result == 'Yeah right &lt;sarcasm/&gt;'
416
417
@pytest.mark.parametrize('data,expected_data', [
    # Each expected_data is (prefix, href without "http://", link text, suffix).
    ('(example.com)', ('(', 'example.com', 'example.com', ')')),
    ('(example.com/)', ('(', 'example.com/', 'example.com/', ')')),
    ('(example.com/foo)', ('(', 'example.com/foo', 'example.com/foo', ')')),
    ('(((example.com/))))', ('(((', 'example.com/', 'example.com/', '))))')),
    ('example.com/))', ('', 'example.com/', 'example.com/', '))')),
    ('(foo http://example.com/)', ('(foo ', 'example.com/', 'http://example.com/', ')')),
    ('(foo http://example.com)', ('(foo ', 'example.com', 'http://example.com', ')')),
    (
        'http://en.wikipedia.org/wiki/Test_(assessment)',
        (
            '',
            'en.wikipedia.org/wiki/Test_(assessment)',
            'http://en.wikipedia.org/wiki/Test_(assessment)',
            '',
        ),
    ),
    (
        '(http://en.wikipedia.org/wiki/Test_(assessment))',
        (
            '(',
            'en.wikipedia.org/wiki/Test_(assessment)',
            'http://en.wikipedia.org/wiki/Test_(assessment)',
            ')',
        ),
    ),
    (
        '((http://en.wikipedia.org/wiki/Test_(assessment))',
        (
            '((',
            'en.wikipedia.org/wiki/Test_(assessment',
            'http://en.wikipedia.org/wiki/Test_(assessment',
            '))',
        ),
    ),
    (
        '(http://en.wikipedia.org/wiki/Test_(assessment)))',
        (
            '(',
            'en.wikipedia.org/wiki/Test_(assessment))',
            'http://en.wikipedia.org/wiki/Test_(assessment))',
            ')',
        ),
    ),
    (
        '(http://en.wikipedia.org/wiki/)Test_(assessment',
        (
            '(',
            'en.wikipedia.org/wiki/)Test_(assessment',
            'http://en.wikipedia.org/wiki/)Test_(assessment',
            '',
        ),
    ),
    (
        'hello (http://www.mu.de/blah.html) world',
        ('hello (', 'www.mu.de/blah.html', 'http://www.mu.de/blah.html', ') world'),
    ),
    (
        'hello (http://www.mu.de/blah.html). world',
        ('hello (', 'www.mu.de/blah.html', 'http://www.mu.de/blah.html', '). world'),
    ),
])
def test_wrapping_parentheses(data, expected_data):
    """URLs wrapped in parentheses should not include them."""
    prefix, href_tail, link_text, suffix = expected_data
    expected = '{0!s}<a href="http://{1!s}" rel="nofollow">{2!s}</a>{3!s}'.format(
        prefix, href_tail, link_text, suffix
    )

    assert linkify(data) == expected
486
487
def test_parentheses_with_removing():
    """Parenthesised text is unchanged when the callback rejects the link."""

    def reject(*args):
        return None

    text = '(test.py)'
    result = linkify(text, callbacks=[reject])
    assert result == text
491
492
@pytest.mark.parametrize('data,expected_data', [
    # Each expected_data is (linked URL, trailing unlinked text).

    # Valid ports are part of the link.
    ('http://foo.com:8000', ('http://foo.com:8000', '')),
    ('http://foo.com:8000/', ('http://foo.com:8000/', '')),

    # Things that are not ports end the link before the colon.
    ('http://bar.com:xkcd', ('http://bar.com', ':xkcd')),
    ('http://foo.com:81/bar', ('http://foo.com:81/bar', '')),
    ('http://foo.com:', ('http://foo.com', ':')),

    # Non-ASCII digits are not accepted as a port.
    ('http://foo.com:\u0663\u0669/', ('http://foo.com', ':\u0663\u0669/')),
    ('http://foo.com:\U0001d7e0\U0001d7d8/', ('http://foo.com', ':\U0001d7e0\U0001d7d8/')),
])
def test_ports(data, expected_data):
    """Port numbers are accepted as part of a URL."""
    linked_url, remainder = expected_data
    expected = '<a href="{0}" rel="nofollow">{0}</a>{1}'.format(linked_url, remainder)
    assert linkify(data) == expected
511
512
def test_ignore_bad_protocols():
    """Unknown scheme prefixes are not treated as part of a link."""
    cases = [
        ('foohttp://bar', 'foohttp://bar'),
        # Only the domain after the bogus scheme is linked.
        (
            'fohttp://exampl.com',
            'fohttp://<a href="http://exampl.com" rel="nofollow">exampl.com</a>',
        ),
    ]
    for text, expected in cases:
        assert linkify(text) == expected
522
523
def test_link_emails_and_urls():
    """With parse_email=True, URLs in the same text are still linkified."""
    text = 'http://example.com person@example.com'
    expected = (
        '<a href="http://example.com" rel="nofollow">'
        'http://example.com</a> <a href="mailto:person@example.com">'
        'person@example.com</a>'
    )
    result = linkify(text, parse_email=True)
    assert result == expected
534
535
def test_links_case_insensitive():
    """Upper-case protocols and domain names are linked, case preserved."""
    result = linkify('HTTP://EXAMPLE.COM')
    expected = '<a href="HTTP://EXAMPLE.COM" rel="nofollow">HTTP://EXAMPLE.COM</a>'
    assert result == expected
540
541
def test_elements_inside_links():
    """Elements nested inside an existing link are kept intact."""
    cases = [
        (
            '<a href="#">hello<br></a>',
            '<a href="#" rel="nofollow">hello<br></a>',
        ),
        (
            '<a href="#"><strong>bold</strong> hello<br></a>',
            '<a href="#" rel="nofollow"><strong>bold</strong> hello<br></a>',
        ),
    ]
    for markup, expected in cases:
        assert linkify(markup) == expected
552
553
def test_drop_link_tags():
    """Dropping a link removes only the tag; its content stays in place."""

    def drop_link(attrs, new):
        return None

    html = (
        'first <a href="http://example.com/1/">second</a> third <a href="http://example.com/2/">'
        'fourth</a> fifth'
    )
    result = linkify(html, callbacks=[drop_link])
    assert result == 'first second third fourth fifth'
564
565
@pytest.mark.parametrize('text, expected', [
    ('&lt;br&gt;', '&lt;br&gt;'),
    (
        '&lt;br&gt; http://example.com',
        '&lt;br&gt; <a href="http://example.com" rel="nofollow">http://example.com</a>',
    ),
    (
        '&lt;br&gt; <br> http://example.com',
        '&lt;br&gt; <br> <a href="http://example.com" rel="nofollow">http://example.com</a>',
    ),
])
def test_naughty_unescaping(text, expected):
    """Escaped entities in the input must stay escaped after linkify."""
    result = linkify(text)
    assert result == expected
580
581
def test_hang():
    """Regression test for issue #200: this input used to hang linkify."""
    text = "an@email.com<mailto:an@email.com>"
    expected = '<a href="mailto:an@email.com">an@email.com</a>&lt;mailto:<a href="mailto:an@email.com">an@email.com</a>&gt;'  # noqa

    result = linkify(text, parse_email=True)
    assert result == expected
588
589
def test_hyphen_in_mail():
    """Hyphens (`-`) in an email domain are accepted. Issue #300."""
    result = linkify('ex@am-ple.com', parse_email=True)
    expected = '<a href="mailto:ex@am-ple.com">ex@am-ple.com</a>'
    assert result == expected
596
597
def test_url_re_arg():
    """A url_re passed to Linker replaces the default URL matching."""
    linker = Linker(url_re=re.compile(r"""(fred\.com)"""))

    # The custom pattern matches, so the text is linked.
    matched = linker.linkify('a b c fred.com d e f')
    assert matched == 'a b c <a href="http://fred.com" rel="nofollow">fred.com</a> d e f'

    # A URL the custom pattern does not match is left untouched.
    unmatched_text = 'a b c http://example.com d e f'
    unmatched = linker.linkify(unmatched_text)
    assert unmatched == 'a b c http://example.com d e f'
612
613
def test_email_re_arg():
    """An email_re passed to Linker replaces the default email matching."""
    linker = Linker(parse_email=True, email_re=re.compile(r"""(fred@example\.com)"""))

    # The custom pattern matches, so the address is linked.
    matched = linker.linkify('a b c fred@example.com d e f')
    assert matched == 'a b c <a href="mailto:fred@example.com">fred@example.com</a> d e f'

    # An address the custom pattern does not match is left untouched.
    unmatched = linker.linkify('a b c jim@example.com d e f')
    assert unmatched == 'a b c jim@example.com d e f'
628
629
def test_recognized_tags_arg():
    """The recognized_tags argument controls which tags are kept as tags."""
    markup = '<p>http://example.com/</p><sarcasm>'

    # "sarcasm" is not in recognized_tags, so it comes out escaped.
    only_p = Linker(recognized_tags=['p'])
    escaped = '<p><a href="http://example.com/" rel="nofollow">http://example.com/</a></p>&lt;sarcasm&gt;'  # noqa
    assert only_p.linkify(markup) == escaped

    # "sarcasm" is recognized, so it is kept as a tag and closed.
    with_sarcasm = Linker(recognized_tags=['p', 'sarcasm'])
    closed = '<p><a href="http://example.com/" rel="nofollow">http://example.com/</a></p><sarcasm></sarcasm>'  # noqa
    assert with_sarcasm.linkify(markup) == closed
645
646
def test_linkify_idempotent():
    """Running linkify on its own output gives the same result as one pass."""
    dirty = '<span>invalid & </span> < extra http://link.com<em>'
    twice = linkify(linkify(dirty))
    once = linkify(dirty)
    assert twice == once
650
651
class TestLinkify:
    """Grouped linkify checks: href-less anchors, rel merging, input types."""

    def test_no_href_links(self):
        """An anchor without an href is returned unchanged."""
        anchor = '<a name="anchor">x</a>'
        result = linkify(anchor)
        assert result == anchor

    def test_rel_already_there(self):
        """An existing rel value is extended with nofollow, not replaced."""
        original = (
            'Click <a href="http://example.com" rel="tooltip">'
            'here</a>.'
        )
        with_nofollow = 'Click <a href="http://example.com" rel="tooltip nofollow">here</a>.'

        assert linkify(original) == with_nofollow
        # Already containing nofollow: nothing further is added.
        assert linkify(with_nofollow) == with_nofollow

    def test_only_text_is_linkified(self):
        """Text input is accepted; a type object or None raises TypeError."""
        plain = 'text'
        assert linkify(plain) == plain

        for not_text in (int, None):
            with pytest.raises(TypeError):
                linkify(not_text)
679
680
@pytest.mark.parametrize('text, expected', [
    ('abc', 'abc'),
    ('example.com', '<a href="http://example.com">example.com</a>'),

    # Raw and pre-escaped ampersands both come out as &amp;.
    (
        'http://example.com?b=1&c=2',
        '<a href="http://example.com?b=1&amp;c=2">http://example.com?b=1&amp;c=2</a>',
    ),
    (
        'http://example.com?b=1&amp;c=2',
        '<a href="http://example.com?b=1&amp;c=2">http://example.com?b=1&amp;c=2</a>',
    ),

    (
        'link: https://example.com/watch#anchor',
        'link: <a href="https://example.com/watch#anchor">https://example.com/watch#anchor</a>',
    ),
])
def test_linkify_filter(text, expected):
    """LinkifyFilter used as a Cleaner filter linkifies the cleaned text."""
    cleaner = Cleaner(filters=[LinkifyFilter])
    result = cleaner.clean(text)
    assert result == expected
700