1from __future__ import unicode_literals 2 3import re 4 5import pytest 6from six.moves.urllib_parse import quote_plus 7 8from bleach import linkify, DEFAULT_CALLBACKS as DC 9from bleach.linkifier import Linker, LinkifyFilter 10from bleach.sanitizer import Cleaner 11 12 13def test_empty(): 14 assert linkify('') == '' 15 16 17def test_simple_link(): 18 assert ( 19 linkify('a http://example.com link') == 20 'a <a href="http://example.com" rel="nofollow">http://example.com</a> link' 21 ) 22 assert ( 23 linkify('a https://example.com link') == 24 'a <a href="https://example.com" rel="nofollow">https://example.com</a> link' 25 ) 26 assert ( 27 linkify('a example.com link') == 28 'a <a href="http://example.com" rel="nofollow">example.com</a> link' 29 ) 30 31 32def test_trailing_slash(): 33 assert ( 34 linkify('http://examp.com/') == 35 '<a href="http://examp.com/" rel="nofollow">http://examp.com/</a>' 36 ) 37 assert ( 38 linkify('http://example.com/foo/') == 39 '<a href="http://example.com/foo/" rel="nofollow">http://example.com/foo/</a>' 40 ) 41 assert ( 42 linkify('http://example.com/foo/bar/') == 43 '<a href="http://example.com/foo/bar/" rel="nofollow">http://example.com/foo/bar/</a>' 44 ) 45 46 47def test_mangle_link(): 48 """We can muck with the href attribute of the link.""" 49 def filter_url(attrs, new=False): 50 if not attrs.get((None, 'href'), '').startswith('http://bouncer'): 51 quoted = quote_plus(attrs[(None, 'href')]) 52 attrs[(None, 'href')] = 'http://bouncer/?u={0!s}'.format(quoted) 53 return attrs 54 55 assert ( 56 linkify('http://example.com', callbacks=DC + [filter_url]) == 57 '<a href="http://bouncer/?u=http%3A%2F%2Fexample.com" rel="nofollow">http://example.com</a>' 58 ) 59 60 61def test_mangle_text(): 62 """We can muck with the inner text of a link.""" 63 64 def ft(attrs, new=False): 65 attrs['_text'] = 'bar' 66 return attrs 67 68 assert ( 69 linkify('http://ex.mp <a href="http://ex.mp/foo">foo</a>', callbacks=[ft]) == 70 '<a 
href="http://ex.mp">bar</a> <a href="http://ex.mp/foo">bar</a>' 71 ) 72 73 74@pytest.mark.parametrize('data,parse_email,expected', [ 75 ( 76 'a james@example.com mailto', 77 False, 78 'a james@example.com mailto' 79 ), 80 ( 81 'a james@example.com.au mailto', 82 False, 83 'a james@example.com.au mailto' 84 ), 85 ( 86 'a james@example.com mailto', 87 True, 88 'a <a href="mailto:james@example.com">james@example.com</a> mailto' 89 ), 90 ( 91 'aussie james@example.com.au mailto', 92 True, 93 'aussie <a href="mailto:james@example.com.au">james@example.com.au</a> mailto' 94 ), 95 # This is kind of a pathological case. I guess we do our best here. 96 ( 97 'email to <a href="james@example.com">james@example.com</a>', 98 True, 99 'email to <a href="james@example.com" rel="nofollow">james@example.com</a>' 100 ), 101 ( 102 '<br>jinkyun@example.com', 103 True, 104 '<br><a href="mailto:jinkyun@example.com">jinkyun@example.com</a>' 105 ), 106 # Mailto links at the end of a sentence. 107 ( 108 'mailto james@example.com.au.', 109 True, 110 'mailto <a href="mailto:james@example.com.au">james@example.com.au</a>.' 
111 ), 112 # Incorrect email 113 ( 114 '"\\\n"@opa.ru', 115 True, 116 '"\\\n"@opa.ru' 117 ), 118 119]) 120def test_email_link(data, parse_email, expected): 121 assert linkify(data, parse_email=parse_email) == expected 122 123 124@pytest.mark.parametrize('data, expected', [ 125 ( 126 '"james"@example.com', 127 '''<a href='mailto:"james"@example.com'>"james"@example.com</a>''' 128 ), 129 ( 130 '"j\'ames"@example.com', 131 '''<a href="mailto:"j'ames"@example.com">"j'ames"@example.com</a>''' 132 ), 133 ( 134 '"ja>mes"@example.com', 135 '''<a href='mailto:"ja>mes"@example.com'>"ja>mes"@example.com</a>''' 136 ), 137]) 138def test_email_link_escaping(data, expected): 139 assert linkify(data, parse_email=True) == expected 140 141 142def no_new_links(attrs, new=False): 143 if new: 144 return None 145 return attrs 146 147 148def no_old_links(attrs, new=False): 149 if not new: 150 return None 151 return attrs 152 153 154def noop(attrs, new=False): 155 return attrs 156 157 158@pytest.mark.parametrize('callback,expected', [ 159 ( 160 [noop], 161 'a <a href="http://ex.mp">ex.mp</a> <a href="http://example.com">example</a>' 162 ), 163 ( 164 [no_new_links, noop], 165 'a ex.mp <a href="http://example.com">example</a>' 166 ), 167 ( 168 [noop, no_new_links], 169 'a ex.mp <a href="http://example.com">example</a>' 170 ), 171 ( 172 [no_old_links, noop], 173 'a <a href="http://ex.mp">ex.mp</a> example' 174 ), 175 ( 176 [noop, no_old_links], 177 'a <a href="http://ex.mp">ex.mp</a> example' 178 ), 179 ( 180 [no_old_links, no_new_links], 181 'a ex.mp example' 182 ) 183]) 184def test_prevent_links(callback, expected): 185 """Returning None from any callback should remove links or prevent them 186 from being created.""" 187 text = 'a ex.mp <a href="http://example.com">example</a>' 188 assert linkify(text, callbacks=callback) == expected 189 190 191def test_set_attrs(): 192 """We can set random attributes on links.""" 193 194 def set_attr(attrs, new=False): 195 attrs[(None, 'rev')] = 
'canonical' 196 return attrs 197 198 assert ( 199 linkify('ex.mp', callbacks=[set_attr]) == 200 '<a href="http://ex.mp" rev="canonical">ex.mp</a>' 201 ) 202 203 204def test_only_proto_links(): 205 """Only create links if there's a protocol.""" 206 def only_proto(attrs, new=False): 207 if new and not attrs['_text'].startswith(('http:', 'https:')): 208 return None 209 return attrs 210 211 in_text = 'a ex.mp http://ex.mp <a href="/foo">bar</a>' 212 assert ( 213 linkify(in_text, callbacks=[only_proto]) == 214 'a ex.mp <a href="http://ex.mp">http://ex.mp</a> <a href="/foo">bar</a>' 215 ) 216 217 218def test_stop_email(): 219 """Returning None should prevent a link from being created.""" 220 def no_email(attrs, new=False): 221 if attrs[(None, 'href')].startswith('mailto:'): 222 return None 223 return attrs 224 text = 'do not link james@example.com' 225 226 assert linkify(text, parse_email=True, callbacks=[no_email]) == text 227 228 229@pytest.mark.parametrize('data,expected', [ 230 # tlds 231 ('example.com', '<a href="http://example.com" rel="nofollow">example.com</a>'), 232 ('example.co', '<a href="http://example.co" rel="nofollow">example.co</a>'), 233 ('example.co.uk', '<a href="http://example.co.uk" rel="nofollow">example.co.uk</a>'), 234 ('example.edu', '<a href="http://example.edu" rel="nofollow">example.edu</a>'), 235 ('example.xxx', '<a href="http://example.xxx" rel="nofollow">example.xxx</a>'), 236 ('bit.ly/fun', '<a href="http://bit.ly/fun" rel="nofollow">bit.ly/fun</a>'), 237 238 # non-tlds 239 ('example.yyy', 'example.yyy'), 240 ('brie', 'brie'), 241]) 242def test_tlds(data, expected): 243 assert linkify(data) == expected 244 245 246def test_escaping(): 247 assert linkify('< unrelated') == '< unrelated' 248 249 250def test_nofollow_off(): 251 assert linkify('example.com', callbacks=[]) == '<a href="http://example.com">example.com</a>' 252 253 254def test_link_in_html(): 255 assert ( 256 linkify('<i>http://yy.com</i>') == 257 '<i><a href="http://yy.com" 
rel="nofollow">http://yy.com</a></i>' 258 ) 259 assert ( 260 linkify('<em><strong>http://xx.com</strong></em>') == 261 '<em><strong><a href="http://xx.com" rel="nofollow">http://xx.com</a></strong></em>' 262 ) 263 264 265def test_links_https(): 266 assert ( 267 linkify('https://yy.com') == 268 '<a href="https://yy.com" rel="nofollow">https://yy.com</a>' 269 ) 270 271 272def test_add_rel_nofollow(): 273 """Verify that rel="nofollow" is added to an existing link""" 274 assert ( 275 linkify('<a href="http://yy.com">http://yy.com</a>') == 276 '<a href="http://yy.com" rel="nofollow">http://yy.com</a>' 277 ) 278 279 280def test_url_with_path(): 281 assert ( 282 linkify('http://example.com/path/to/file') == 283 '<a href="http://example.com/path/to/file" rel="nofollow">' 284 'http://example.com/path/to/file</a>' 285 ) 286 287 288def test_link_ftp(): 289 assert ( 290 linkify('ftp://ftp.mozilla.org/some/file') == 291 '<a href="ftp://ftp.mozilla.org/some/file" rel="nofollow">' 292 'ftp://ftp.mozilla.org/some/file</a>' 293 ) 294 295 296def test_link_query(): 297 assert ( 298 linkify('http://xx.com/?test=win') == 299 '<a href="http://xx.com/?test=win" rel="nofollow">http://xx.com/?test=win</a>' 300 ) 301 assert ( 302 linkify('xx.com/?test=win') == 303 '<a href="http://xx.com/?test=win" rel="nofollow">xx.com/?test=win</a>' 304 ) 305 assert ( 306 linkify('xx.com?test=win') == 307 '<a href="http://xx.com?test=win" rel="nofollow">xx.com?test=win</a>' 308 ) 309 310 311def test_link_fragment(): 312 assert ( 313 linkify('http://xx.com/path#frag') == 314 '<a href="http://xx.com/path#frag" rel="nofollow">http://xx.com/path#frag</a>' 315 ) 316 317 318def test_link_entities(): 319 assert ( 320 linkify('http://xx.com/?a=1&b=2') == 321 '<a href="http://xx.com/?a=1&b=2" rel="nofollow">http://xx.com/?a=1&b=2</a>' 322 ) 323 324 325def test_escaped_html(): 326 """If I pass in escaped HTML, it should probably come out escaped.""" 327 s = '<em>strong</em>' 328 assert linkify(s) == s 329 330 
def test_link_http_complete():
    """A URL with credentials, path, query and fragment is linkified whole."""
    assert (
        linkify('https://user:pass@ftp.mozilla.org/x/y.exe?a=b&c=d&e#f') ==
        '<a href="https://user:pass@ftp.mozilla.org/x/y.exe?a=b&c=d&e#f" rel="nofollow">'
        'https://user:pass@ftp.mozilla.org/x/y.exe?a=b&c=d&e#f</a>'
    )


def test_non_url():
    """document.vulnerable should absolutely not be linkified."""
    s = 'document.vulnerable'
    assert linkify(s) == s


def test_javascript_url():
    """javascript: urls should never be linkified."""
    s = 'javascript:document.vulnerable'
    assert linkify(s) == s


def test_unsafe_url():
    """Any unsafe char ({}[]<>, etc.) in the path should end URL scanning."""
    assert (
        linkify('All your{"xx.yy.com/grover.png"}base are') ==
        'All your{"<a href="http://xx.yy.com/grover.png" rel="nofollow">xx.yy.com/grover.png</a>"}'
        'base are'
    )


def test_skip_tags():
    """Skip linkification in skip tags"""
    simple = 'http://xx.com <pre>http://xx.com</pre>'
    linked = ('<a href="http://xx.com" rel="nofollow">http://xx.com</a> '
              '<pre>http://xx.com</pre>')
    all_linked = ('<a href="http://xx.com" rel="nofollow">http://xx.com</a> '
                  '<pre><a href="http://xx.com" rel="nofollow">http://xx.com'
                  '</a></pre>')
    assert linkify(simple, skip_tags=['pre']) == linked
    assert linkify(simple) == all_linked

    # Existing links inside a skipped tag still get rel="nofollow".
    already_linked = '<pre><a href="http://xx.com">xx</a></pre>'
    nofollowed = '<pre><a href="http://xx.com" rel="nofollow">xx</a></pre>'
    assert linkify(already_linked) == nofollowed
    assert linkify(already_linked, skip_tags=['pre']) == nofollowed

    # Text nested deeper inside the skipped tag is skipped too.
    assert (
        linkify('<pre><code>http://example.com</code></pre>http://example.com', skip_tags=['pre']) ==
        (
            '<pre><code>http://example.com</code></pre>'
            '<a href="http://example.com" rel="nofollow">http://example.com</a>'
        )
    )


def test_libgl():
    """libgl.so.1 should not be linkified."""
    s = 'libgl.so.1'
    assert linkify(s) == s


@pytest.mark.parametrize('url,periods', [
    ('example.com', '.'),
    ('example.com', '...'),
    ('ex.com/foo', '.'),
    ('ex.com/foo', '....'),
])
def test_end_of_sentence(url, periods):
    """example.com. should match."""
    out = '<a href="http://{0!s}" rel="nofollow">{0!s}</a>{1!s}'
    intxt = '{0!s}{1!s}'

    assert linkify(intxt.format(url, periods)) == out.format(url, periods)


def test_end_of_clause():
    """example.com/foo, shouldn't include the ,"""
    assert (
        linkify('ex.com/foo, bar') ==
        '<a href="http://ex.com/foo" rel="nofollow">ex.com/foo</a>, bar'
    )


def test_sarcasm():
    """Jokes should crash.<sarcasm/>"""
    assert linkify('Yeah right <sarcasm/>') == 'Yeah right <sarcasm/>'


# Each expected_data tuple is (prefix, href without "http://", link text, suffix).
@pytest.mark.parametrize('data,expected_data', [
    (
        '(example.com)',
        ('(', 'example.com', 'example.com', ')')
    ),
    (
        '(example.com/)',
        ('(', 'example.com/', 'example.com/', ')')
    ),
    (
        '(example.com/foo)',
        ('(', 'example.com/foo', 'example.com/foo', ')')
    ),
    (
        '(((example.com/))))',
        ('(((', 'example.com/', 'example.com/', '))))')
    ),
    (
        'example.com/))',
        ('', 'example.com/', 'example.com/', '))')
    ),
    (
        '(foo http://example.com/)',
        ('(foo ', 'example.com/', 'http://example.com/', ')')
    ),
    (
        '(foo http://example.com)',
        ('(foo ', 'example.com', 'http://example.com', ')')
    ),
    (
        'http://en.wikipedia.org/wiki/Test_(assessment)',
        ('', 'en.wikipedia.org/wiki/Test_(assessment)',
         'http://en.wikipedia.org/wiki/Test_(assessment)', '')
    ),
    (
        '(http://en.wikipedia.org/wiki/Test_(assessment))',
        ('(', 'en.wikipedia.org/wiki/Test_(assessment)',
         'http://en.wikipedia.org/wiki/Test_(assessment)', ')')
    ),
    (
        '((http://en.wikipedia.org/wiki/Test_(assessment))',
        ('((', 'en.wikipedia.org/wiki/Test_(assessment',
         'http://en.wikipedia.org/wiki/Test_(assessment', '))')
    ),
    (
        '(http://en.wikipedia.org/wiki/Test_(assessment)))',
        ('(', 'en.wikipedia.org/wiki/Test_(assessment))',
         'http://en.wikipedia.org/wiki/Test_(assessment))', ')')
    ),
    (
        '(http://en.wikipedia.org/wiki/)Test_(assessment',
        ('(', 'en.wikipedia.org/wiki/)Test_(assessment',
         'http://en.wikipedia.org/wiki/)Test_(assessment', '')
    ),
    (
        'hello (http://www.mu.de/blah.html) world',
        ('hello (', 'www.mu.de/blah.html', 'http://www.mu.de/blah.html', ') world')
    ),
    (
        'hello (http://www.mu.de/blah.html). world',
        ('hello (', 'www.mu.de/blah.html', 'http://www.mu.de/blah.html', '). world')
    )
])
def test_wrapping_parentheses(data, expected_data):
    """URLs wrapped in parentheses should not include them."""
    out = '{0!s}<a href="http://{1!s}" rel="nofollow">{2!s}</a>{3!s}'

    assert linkify(data) == out.format(*expected_data)


def test_parentheses_with_removing():
    """Parenthesised text is unchanged when the callback rejects every link."""
    expected = '(test.py)'
    assert linkify(expected, callbacks=[lambda *a: None]) == expected


# Each expected_data tuple is (linkified URL, trailing text left outside the link).
@pytest.mark.parametrize('data,expected_data', [
    # Test valid ports
    ('http://foo.com:8000', ('http://foo.com:8000', '')),
    ('http://foo.com:8000/', ('http://foo.com:8000/', '')),

    # Test non ports
    ('http://bar.com:xkcd', ('http://bar.com', ':xkcd')),
    ('http://foo.com:81/bar', ('http://foo.com:81/bar', '')),
    ('http://foo.com:', ('http://foo.com', ':')),

    # Test non-ascii ports
    ('http://foo.com:\u0663\u0669/', ('http://foo.com', ':\u0663\u0669/')),
    ('http://foo.com:\U0001d7e0\U0001d7d8/', ('http://foo.com', ':\U0001d7e0\U0001d7d8/')),
])
def test_ports(data, expected_data):
    """URLs can contain port numbers."""
    out = '<a href="{0}" rel="nofollow">{0}</a>{1}'
    assert linkify(data) == out.format(*expected_data)


def test_ignore_bad_protocols():
    """Text glued in front of 'http://' does not form a linkable protocol."""
    assert (
        linkify('foohttp://bar') ==
        'foohttp://bar'
    )
    assert (
        linkify('fohttp://exampl.com') ==
        'fohttp://<a href="http://exampl.com" rel="nofollow">exampl.com</a>'
    )


def test_link_emails_and_urls():
    """parse_email=True shouldn't prevent URLs from getting linkified."""
    assert (
        linkify('http://example.com person@example.com', parse_email=True) ==
        (
            '<a href="http://example.com" rel="nofollow">'
            'http://example.com</a> <a href="mailto:person@example.com">'
            'person@example.com</a>'
        )
    )


def test_links_case_insensitive():
    """Protocols and domain names are case insensitive."""
    expect = '<a href="HTTP://EXAMPLE.COM" rel="nofollow">HTTP://EXAMPLE.COM</a>'
    assert linkify('HTTP://EXAMPLE.COM') == expect


def test_elements_inside_links():
    """Child elements of an existing link are preserved."""
    assert (
        linkify('<a href="#">hello<br></a>') ==
        '<a href="#" rel="nofollow">hello<br></a>'
    )

    assert (
        linkify('<a href="#"><strong>bold</strong> hello<br></a>') ==
        '<a href="#" rel="nofollow"><strong>bold</strong> hello<br></a>'
    )


def test_drop_link_tags():
    """Verify that dropping link tags *just* drops the tag and not the content"""
    html = (
        'first <a href="http://example.com/1/">second</a> third <a href="http://example.com/2/">'
        'fourth</a> fifth'
    )
    assert (
        linkify(html, callbacks=[lambda attrs, new: None]) ==
        'first second third fourth fifth'
    )


@pytest.mark.parametrize('text, expected', [
    ('<br>', '<br>'),
    (
        '<br> http://example.com',
        '<br> <a href="http://example.com" rel="nofollow">http://example.com</a>'
    ),
    (
        '<br> <br> http://example.com',
        '<br> <br> <a href="http://example.com" rel="nofollow">http://example.com</a>'
    )
])
def test_naughty_unescaping(text, expected):
    """Verify that linkify is not unescaping things it shouldn't be"""
    assert linkify(text) == expected


def test_hang():
    """This string would hang linkify. Issue #200"""
    assert (
        linkify("an@email.com<mailto:an@email.com>", parse_email=True) ==
        '<a href="mailto:an@email.com">an@email.com</a><mailto:<a href="mailto:an@email.com">an@email.com</a>>'  # noqa
    )


def test_hyphen_in_mail():
    """Test hyphens `-` in mails. Issue #300."""
    assert (
        linkify('ex@am-ple.com', parse_email=True) ==
        '<a href="mailto:ex@am-ple.com">ex@am-ple.com</a>'
    )


def test_url_re_arg():
    """Verifies that a specified url_re is used"""
    fred_re = re.compile(r"""(fred\.com)""")

    linker = Linker(url_re=fred_re)
    assert (
        linker.linkify('a b c fred.com d e f') ==
        'a b c <a href="http://fred.com" rel="nofollow">fred.com</a> d e f'
    )

    # URLs that the custom pattern does not match are left untouched.
    assert (
        linker.linkify('a b c http://example.com d e f') ==
        'a b c http://example.com d e f'
    )


def test_email_re_arg():
    """Verifies that a specified email_re is used"""
    fred_re = re.compile(r"""(fred@example\.com)""")

    linker = Linker(parse_email=True, email_re=fred_re)
    assert (
        linker.linkify('a b c fred@example.com d e f') ==
        'a b c <a href="mailto:fred@example.com">fred@example.com</a> d e f'
    )

    # Addresses that the custom pattern does not match are left untouched.
    assert (
        linker.linkify('a b c jim@example.com d e f') ==
        'a b c jim@example.com d e f'
    )


def test_recognized_tags_arg():
    """Verifies that recognized_tags works"""
    # The html parser doesn't recognize "sarcasm" as a tag, so it escapes it
    # NOTE(review): the expected string below shows a raw <sarcasm>, which
    # contradicts "escapes it"; entities may have been decoded -- confirm.
    linker = Linker(recognized_tags=['p'])
    assert (
        linker.linkify('<p>http://example.com/</p><sarcasm>') ==
        '<p><a href="http://example.com/" rel="nofollow">http://example.com/</a></p><sarcasm>'  # noqa
    )

    # The html parser recognizes "sarcasm" as a tag and fixes it
    linker = Linker(recognized_tags=['p', 'sarcasm'])
    assert (
        linker.linkify('<p>http://example.com/</p><sarcasm>') ==
        '<p><a href="http://example.com/" rel="nofollow">http://example.com/</a></p><sarcasm></sarcasm>'  # noqa
    )


def test_linkify_idempotent():
    """Running linkify on its own output gives the same output."""
    dirty = '<span>invalid & </span> < extra http://link.com<em>'
    assert linkify(linkify(dirty)) == linkify(dirty)


class TestLinkify:
    """Miscellaneous linkify() checks grouped in a class."""

    def test_no_href_links(self):
        """An anchor without an href is returned unchanged."""
        s = '<a name="anchor">x</a>'
        assert linkify(s) == s

    def test_rel_already_there(self):
        """Make sure rel attribute is updated not replaced"""
        linked = ('Click <a href="http://example.com" rel="tooltip">'
                  'here</a>.')

        link_good = 'Click <a href="http://example.com" rel="tooltip nofollow">here</a>.'

        assert linkify(linked) == link_good
        assert linkify(link_good) == link_good

    def test_only_text_is_linkified(self):
        """Non-text input (a type object, None) raises TypeError."""
        some_text = 'text'
        some_type = int
        no_type = None

        assert linkify(some_text) == some_text

        with pytest.raises(TypeError):
            linkify(some_type)

        with pytest.raises(TypeError):
            linkify(no_type)


# NOTE(review): the third and fourth cases are byte-identical; one of them
# presumably used an HTML entity (&amp;) in its input originally -- confirm.
@pytest.mark.parametrize('text, expected', [
    ('abc', 'abc'),
    ('example.com', '<a href="http://example.com">example.com</a>'),
    (
        'http://example.com?b=1&c=2',
        '<a href="http://example.com?b=1&c=2">http://example.com?b=1&c=2</a>'
    ),
    (
        'http://example.com?b=1&c=2',
        '<a href="http://example.com?b=1&c=2">http://example.com?b=1&c=2</a>'
    ),
    (
        'link: https://example.com/watch#anchor',
        'link: <a href="https://example.com/watch#anchor">https://example.com/watch#anchor</a>'
    )
])
def test_linkify_filter(text, expected):
    """LinkifyFilter plugged into a Cleaner linkifies the cleaned text."""
    cleaner = Cleaner(filters=[LinkifyFilter])
    assert cleaner.clean(text) == expected
