1"""
2Run chardet on a bunch of documents and see that we get the correct encodings.
3
4:author: Dan Blanchard
5:author: Ian Cordasco
6"""
7
8from __future__ import with_statement
9
10import textwrap
11from difflib import ndiff
12from io import open
13from os import listdir
14from os.path import dirname, isdir, join, realpath, relpath, splitext
15
16try:
17    import hypothesis.strategies as st
18    from hypothesis import given, assume, settings, Verbosity
19    HAVE_HYPOTHESIS = True
20except ImportError:
21    HAVE_HYPOTHESIS = False
22import pytest
23
24import chardet
25
26
# TODO: Restore Hungarian encodings (iso-8859-2 and windows-1250) after we
#       retrain model.
# Encodings whose entire test directories are skipped by gen_test_params
# because the current models cannot reliably detect them yet.
MISSING_ENCODINGS = {'iso-8859-2', 'iso-8859-6', 'windows-1250',
                     'windows-1254', 'windows-1256'}
# Individual test files known to be misdetected.  These are still collected,
# but parametrized with pytest.mark.xfail rather than skipped.  Paths are
# written with forward slashes, relative to this file's directory.
EXPECTED_FAILURES = {'tests/iso-8859-7-greek/disabled.gr.xml',
                     'tests/iso-8859-9-turkish/divxplanet.com.xml',
                     'tests/iso-8859-9-turkish/subtitle.srt',
                     'tests/iso-8859-9-turkish/wikitop_tr_ISO-8859-9.txt'}
35
def gen_test_params():
    """Yield ``(file_path, expected_encoding)`` tuples for test_encoding_detection.

    Walks the ``tests`` directory next to this file; each subdirectory is
    named after an encoding (optionally with a language suffix), and every
    recognized text file inside becomes one test case.  Files listed in
    EXPECTED_FAILURES are wrapped in pytest.param with an xfail mark.
    """
    base_path = relpath(join(dirname(realpath(__file__)), 'tests'))
    for encoding in listdir(base_path):
        path = join(base_path, encoding)
        # Skip files in tests directory
        if not isdir(path):
            continue
        # Remove language suffixes from encoding if present
        encoding = encoding.lower()
        for postfix in ['-arabic', '-bulgarian', '-cyrillic', '-greek',
                        '-hebrew', '-hungarian', '-turkish']:
            if encoding.endswith(postfix):
                encoding = encoding.rpartition(postfix)[0]
                break
        # Skip directories for encodings we don't handle yet.
        if encoding in MISSING_ENCODINGS:
            continue
        # Test encoding detection for each file we have of encoding for
        for file_name in listdir(path):
            ext = splitext(file_name)[1].lower()
            if ext not in ['.html', '.txt', '.xml', '.srt']:
                continue
            full_path = join(path, file_name)
            test_case = full_path, encoding
            # EXPECTED_FAILURES stores POSIX-style paths, but join() uses the
            # OS separator; normalize so known failures are xfailed on
            # Windows too (previously the lookup silently never matched
            # there).
            if full_path.replace('\\', '/') in EXPECTED_FAILURES:
                test_case = pytest.param(*test_case, marks=pytest.mark.xfail)
            yield test_case
64
65
@pytest.mark.parametrize('file_name, encoding', gen_test_params())
def test_encoding_detection(file_name, encoding):
    """Assert that chardet's guess for *file_name* matches *encoding*.

    A differently-named guess is still accepted when decoding the file with
    the guessed encoding produces exactly the same text as decoding with the
    expected one, since callers would observe no behavioral difference.
    """
    with open(file_name, 'rb') as handle:
        input_bytes = handle.read()
    result = chardet.detect(input_bytes)

    # Decode with the expected encoding; unknown codecs yield empty text.
    try:
        expected_unicode = input_bytes.decode(encoding)
    except LookupError:
        expected_unicode = ''
    # Decode with the detected encoding; a None/unknown/invalid detection
    # also yields empty text.
    try:
        detected_unicode = input_bytes.decode(result['encoding'])
    except (LookupError, UnicodeDecodeError, TypeError):
        detected_unicode = ''

    encoding_match = bool(result) and (result['encoding'] or '').lower() == encoding

    # Only care about mismatches that would actually result in different
    # behavior when decoding.
    if encoding_match or expected_unicode == detected_unicode:
        encoding_match = True
        diff = ''
    else:
        expected_wrapped = '\n'.join(textwrap.wrap(expected_unicode, 100)) + '\n'
        detected_wrapped = '\n'.join(textwrap.wrap(detected_unicode, 100)) + '\n'
        diff = ''.join(ndiff(expected_wrapped.splitlines(True),
                             detected_wrapped.splitlines(True)))

    assert encoding_match, ("Expected %s, but got %s for %s.  Character "
                            "differences: \n%s" % (encoding, result,
                                                   file_name, diff))
98
99
if HAVE_HYPOTHESIS:
    class JustALengthIssue(Exception):
        """Raised when detection failed only because the input was too short."""
        pass


    @pytest.mark.xfail
    @given(st.text(min_size=1), st.sampled_from(['ascii', 'utf-8', 'utf-16',
                                                 'utf-32', 'iso-8859-7',
                                                 'iso-8859-8', 'windows-1255']),
           st.randoms())
    @settings(max_examples=200)
    def test_never_fails_to_detect_if_there_is_a_valid_encoding(txt, enc, rnd):
        """Property test: any text encodable in *enc* should be detectable.

        If detection returns None, probe whether appending extra text makes
        detection succeed; if so, the failure was "just a length issue" and
        the inner property raises JustALengthIssue, which is the expected
        outcome asserted by pytest.raises.
        """
        try:
            data = txt.encode(enc)
        except UnicodeEncodeError:
            assume(False)
        detected = chardet.detect(data)['encoding']
        if detected is None:
            with pytest.raises(JustALengthIssue):
                @given(st.text(), random=rnd)
                @settings(verbosity=Verbosity.quiet, max_shrinks=0,
                          max_examples=50)
                def string_poisons_following_text(suffix):
                    try:
                        extended = (txt + suffix).encode(enc)
                    except UnicodeEncodeError:
                        assume(False)
                    result = chardet.detect(extended)
                    if result and result['encoding'] is not None:
                        raise JustALengthIssue()

                # BUG FIX: the inner property must actually be invoked —
                # previously it was only defined, so pytest.raises always
                # failed with "DID NOT RAISE".
                string_poisons_following_text()


    @given(st.text(min_size=1), st.sampled_from(['ascii', 'utf-8', 'utf-16',
                                                 'utf-32', 'iso-8859-7',
                                                 'iso-8859-8', 'windows-1255']),
           st.randoms())
    @settings(max_examples=200)
    def test_detect_all_and_detect_one_should_agree(txt, enc, rnd):
        """Property test: detect() and detect_all()[0] report the same encoding."""
        try:
            data = txt.encode(enc)
        except UnicodeEncodeError:
            assume(False)
        # Pre-bind so the except clause below cannot hit a NameError when
        # chardet.detect itself raises before either assignment runs.
        result = None
        results = None
        try:
            result = chardet.detect(data)
            results = chardet.detect_all(data)
            assert result['encoding'] == results[0]['encoding']
        except Exception:
            raise Exception('%s != %s' % (result, results))