1""" 2Run chardet on a bunch of documents and see that we get the correct encodings. 3 4:author: Dan Blanchard 5:author: Ian Cordasco 6""" 7 8from __future__ import with_statement 9 10import textwrap 11from difflib import ndiff 12from io import open 13from os import listdir 14from os.path import dirname, isdir, join, realpath, relpath, splitext 15 16try: 17 import hypothesis.strategies as st 18 from hypothesis import given, assume, settings, Verbosity 19 HAVE_HYPOTHESIS = True 20except ImportError: 21 HAVE_HYPOTHESIS = False 22import pytest 23 24import chardet 25 26 27# TODO: Restore Hungarian encodings (iso-8859-2 and windows-1250) after we 28# retrain model. 29MISSING_ENCODINGS = {'iso-8859-2', 'iso-8859-6', 'windows-1250', 30 'windows-1254', 'windows-1256'} 31EXPECTED_FAILURES = {'tests/iso-8859-7-greek/disabled.gr.xml', 32 'tests/iso-8859-9-turkish/divxplanet.com.xml', 33 'tests/iso-8859-9-turkish/subtitle.srt', 34 'tests/iso-8859-9-turkish/wikitop_tr_ISO-8859-9.txt'} 35 36def gen_test_params(): 37 """Yields tuples of paths and encodings to use for test_encoding_detection""" 38 base_path = relpath(join(dirname(realpath(__file__)), 'tests')) 39 for encoding in listdir(base_path): 40 path = join(base_path, encoding) 41 # Skip files in tests directory 42 if not isdir(path): 43 continue 44 # Remove language suffixes from encoding if pressent 45 encoding = encoding.lower() 46 for postfix in ['-arabic', '-bulgarian', '-cyrillic', '-greek', 47 '-hebrew', '-hungarian', '-turkish']: 48 if encoding.endswith(postfix): 49 encoding = encoding.rpartition(postfix)[0] 50 break 51 # Skip directories for encodings we don't handle yet. 52 if encoding in MISSING_ENCODINGS: 53 continue 54 # Test encoding detection for each file we have of encoding for 55 for file_name in listdir(path): 56 ext = splitext(file_name)[1].lower() 57 if ext not in ['.html', '.txt', '.xml', '.srt']: 58 continue 59 full_path = join(path, file_name) 60 test_case = full_path, encoding 61 if full_path in EXPECTED_FAILURES: 62 test_case = pytest.param(*test_case, marks=pytest.mark.xfail) 63 yield test_case 64 65 66@pytest.mark.parametrize ('file_name, encoding', gen_test_params()) 67def test_encoding_detection(file_name, encoding): 68 with open(file_name, 'rb') as f: 69 input_bytes = f.read() 70 result = chardet.detect(input_bytes) 71 try: 72 expected_unicode = input_bytes.decode(encoding) 73 except LookupError: 74 expected_unicode = '' 75 try: 76 detected_unicode = input_bytes.decode(result['encoding']) 77 except (LookupError, UnicodeDecodeError, TypeError): 78 detected_unicode = '' 79 if result: 80 encoding_match = (result['encoding'] or '').lower() == encoding 81 else: 82 encoding_match = False 83 # Only care about mismatches that would actually result in different 84 # behavior when decoding 85 if not encoding_match and expected_unicode != detected_unicode: 86 wrapped_expected = '\n'.join(textwrap.wrap(expected_unicode, 100)) + '\n' 87 wrapped_detected = '\n'.join(textwrap.wrap(detected_unicode, 100)) + '\n' 88 diff = ''.join(ndiff(wrapped_expected.splitlines(True), 89 wrapped_detected.splitlines(True))) 90 else: 91 diff = '' 92 encoding_match = True 93 assert encoding_match, ("Expected %s, but got %s for %s. Character " 94 "differences: \n%s" % (encoding, 95 result, 96 file_name, 97 diff)) 98 99 100if HAVE_HYPOTHESIS: 101 class JustALengthIssue(Exception): 102 pass 103 104 105 @pytest.mark.xfail 106 @given(st.text(min_size=1), st.sampled_from(['ascii', 'utf-8', 'utf-16', 107 'utf-32', 'iso-8859-7', 108 'iso-8859-8', 'windows-1255']), 109 st.randoms()) 110 @settings(max_examples=200) 111 def test_never_fails_to_detect_if_there_is_a_valid_encoding(txt, enc, rnd): 112 try: 113 data = txt.encode(enc) 114 except UnicodeEncodeError: 115 assume(False) 116 detected = chardet.detect(data)['encoding'] 117 if detected is None: 118 with pytest.raises(JustALengthIssue): 119 @given(st.text(), random=rnd) 120 @settings(verbosity=Verbosity.quiet, max_shrinks=0, max_examples=50) 121 def string_poisons_following_text(suffix): 122 try: 123 extended = (txt + suffix).encode(enc) 124 except UnicodeEncodeError: 125 assume(False) 126 result = chardet.detect(extended) 127 if result and result['encoding'] is not None: 128 raise JustALengthIssue() 129 130 131 @given(st.text(min_size=1), st.sampled_from(['ascii', 'utf-8', 'utf-16', 132 'utf-32', 'iso-8859-7', 133 'iso-8859-8', 'windows-1255']), 134 st.randoms()) 135 @settings(max_examples=200) 136 def test_detect_all_and_detect_one_should_agree(txt, enc, rnd): 137 try: 138 data = txt.encode(enc) 139 except UnicodeEncodeError: 140 assume(False) 141 try: 142 result = chardet.detect(data) 143 results = chardet.detect_all(data) 144 assert result['encoding'] == results[0]['encoding'] 145 except Exception: 146 raise Exception('%s != %s' % (result, results)) 147