1"""
2Simple test suite for acora.
3"""
4
5import acora
6
# When True, the machine-building test helpers print progress headers and
# pass the builder tree / built machine to tree_to_dot() / machine_to_dot().
DOTDEBUG = False

if any(impl is acora.PyAcora for impl in (acora.BytesAcora, acora.UnicodeAcora)):
    print("WARNING: '_acora' C extension not imported, only testing Python implementation")

try:
    from acora._acora import tree_to_dot
except ImportError:
    def tree_to_dot(x):
        """No-op stand-in used when ``acora._acora`` cannot be imported."""
        return None

try:
    from acora._cacora import machine_to_dot
except ImportError:
    def machine_to_dot(x):
        """No-op stand-in used when ``acora._cacora`` cannot be imported."""
        return None
21
22import sys
23import unittest
24import codecs
25import string
26
# Compatibility shims: give the rest of the module one set of names
# (unicode, bytes, _StringIO, _BytesIO) regardless of the Python version.

try:
    unicode
except NameError:
    # No built-in 'unicode' name available: text strings are plain 'str'.
    unicode = str

try:
    bytes
except NameError:
    # No built-in 'bytes' name available: byte strings are plain 'str'.
    bytes = str

try:
    # Python 2.6+ (and Python 3): separate text and binary in-memory files.
    from io import StringIO as _StringIO, BytesIO as _BytesIO
except ImportError:
    # No 'io' classes to import: fall back to the Python 2 StringIO module
    # and use the same class for both text and binary buffers.
    from StringIO import StringIO as _StringIO
    _BytesIO = _StringIO
46
def BytesIO(*args):
    """Create a binary in-memory file, accepting text as initial content.

    When the first positional argument is a ``unicode`` string, it is
    encoded as UTF-8 and passed on as the only argument.  Any other
    arguments are handed to the underlying ``_BytesIO`` unchanged.
    """
    if not args:
        return _BytesIO()
    initial = args[0]
    if isinstance(initial, unicode):
        return _BytesIO(initial.encode("UTF-8"))
    return _BytesIO(*args)
51
def StringIO(*args):
    """Create a text in-memory file, accepting bytes as initial content.

    When the first positional argument is a ``bytes`` string, it is
    decoded as UTF-8 and passed on as the only argument.  Any other
    arguments are handed to the underlying ``_StringIO`` unchanged.

    Returns the ``_StringIO`` instance.
    """
    if args and isinstance(args[0], bytes):
        args = (args[0].decode("UTF-8"),)
    # The text buffer is required here: the content was just decoded to
    # text, which the binary buffer class refuses as initial value.
    return _StringIO(*args)
56
# Codec entry for "unicode_escape", looked up once when the module loads.
unicode_unescaper = codecs.lookup("unicode_escape")


def unescape_unicode(s):
    """Decode *s* with the "unicode_escape" codec and return the text.

    The codec's decode function returns a pair; only its first element
    (the decoded string) is returned, the second one is dropped.
    """
    decoded, _consumed = unicode_unescaper.decode(s)
    return decoded
60
61
def prepare_test_data():
    """Build the shared fixture: a large search text and a keyword list.

    The search text is a fixed base string in lower case, as written and
    in upper case, concatenated and then repeated 1000 times.  On
    Python 2 both the text and the keywords are converted to ``unicode``.

    Returns a ``(search_string, all_keywords)`` tuple.
    """
    fragments = (
        'bdfdaskdjfhaslkdhfsadhfklashdflabcasdabcdJAKHDBVDFLNFCBLSADHFCALKSJ',
        'jklhcnajskbhfasjhancfksjdfhbvaliuradefhzcbdegnashdgfbcjaabesdhgkfcnash',
        'fdkhbdegxcbgjsvdhabcabcfcgbnxahsdbgfbcakjsdhgnfcxsababcmdabe',
    )
    base = ''.join(fragments)
    haystack = (base.lower() + base + base.upper()) * 1000

    keywords = [
        'ab', 'abc', 'abcd', 'abcabc', 'ababc', 'ABBBC', 'ABCABC',
        'bdfd', 'ade', 'abe', 'bdeg', 'fklash',
        'gnfcxsababcmdabe', 'SADHFCAL',
        'notthere', 'not-to-be-found', 'not-to-be-found-either',
    ]

    if sys.version_info[0] < 3:
        keywords = [unicode(keyword) for keyword in keywords]
        haystack = unicode(haystack)

    return haystack, keywords
81
82
class AcoraTest(object):
    """Search tests shared by every machine implementation under test.

    This class is a mixin.  A concrete test case combines it with
    ``unittest.TestCase`` and has to provide two things that the methods
    below use but do not define:

    * ``acora``  -- the machine class passed to ``AcoraBuilder.build()``
    * ``_swrap`` -- converts a literal into the string type under test
    """
    search_string, all_keywords = prepare_test_data()

    # helpers

    def _build_machine(self, keywords, **builder_options):
        """Build a search machine for *keywords*.

        Each keyword is passed through ``_swrap`` first.  *builder_options*
        are forwarded unchanged to ``acora.AcoraBuilder``.  With DOTDEBUG
        enabled, the builder tree is handed to ``tree_to_dot()`` before and
        after the build, and the machine to ``machine_to_dot()`` unless it
        is a ``PyAcora`` instance.

        Returns the machine created by ``builder.build(acora=self.acora)``.
        """
        wrapped = [self._swrap(keyword) for keyword in keywords]
        builder = acora.AcoraBuilder(*wrapped, **builder_options)
        if DOTDEBUG:
            print('Initial tree:')
            tree_to_dot(builder.tree)
        machine = builder.build(acora=self.acora)
        if DOTDEBUG:
            print('\nProcessed tree:')
            tree_to_dot(builder.tree)
            if not isinstance(machine, acora.PyAcora):
                print('\nMachine:')
                machine_to_dot(machine)
        return machine

    def _build(self, *keywords):
        """Build a machine for *keywords* with the builder's default options."""
        return self._build_machine(keywords)

    def _build_ignore_case(self, *keywords):
        """Build a machine for *keywords* with ``ignore_case=True``."""
        return self._build_machine(keywords, ignore_case=True)

    def _result(self, result):
        """Return *result* with the keyword of each pair passed through ``_swrap``."""
        wrap = self._swrap
        return [(wrap(keyword), pos) for keyword, pos in result]

    def _assert_matches(self, machine, text, expected):
        """Assert that the sorted ``finditer()`` matches in *text* equal *expected*.

        *text* and the keywords in *expected* are plain literals; both are
        converted with ``_swrap`` before the comparison.
        """
        found = sorted(machine.finditer(self._swrap(text)))
        self.assertEqual(found, self._result(expected))

    def _check_machine_copy(self, copy_machine):
        """Assert that a machine and its copy report the same matches.

        *copy_machine* is called with the freshly built machine and must
        return the copy to compare against.
        """
        s = self._swrap
        builder = acora.AcoraBuilder(*[s(kw) for kw in ('a', 'b', 'c')])
        original = builder.build(acora=self.acora)
        duplicate = copy_machine(original)
        for machine in (original, duplicate):
            self._assert_matches(
                machine, 'abcd', [('a', 0), ('b', 1), ('c', 2)])

    # basic tests
    #
    # NOTE: the test methods carry comments instead of docstrings on
    # purpose, so that unittest keeps printing the method names.

    def test_finditer_empty(self):
        # A machine built without keywords finds nothing.
        self._assert_matches(self._build(), 'abcd', [])

    def test_finditer_single_keyword(self):
        self._assert_matches(self._build('bc'), 'abcd', [('bc', 1)])

    def test_finditer_many_keywords(self):
        # Every ASCII letter is its own keyword.
        self._assert_matches(
            self._build(*string.ascii_letters), 'abcd',
            [('a', 0), ('b', 1), ('c', 2), ('d', 3)])

    def test_finditer_many_keywords_not_found(self):
        # Letter keywords against a text consisting of digits only.
        self._assert_matches(
            self._build(*string.ascii_letters), string.digits*100, [])

    def test_finditer_sequential(self):
        self._assert_matches(
            self._build('a', 'b', 'c', 'd'), 'abcd',
            [('a', 0), ('b', 1), ('c', 2), ('d', 3)])

    def test_finditer_redundant(self):
        # Upper and lower case keywords are distinct in a default build.
        self._assert_matches(
            self._build('a', 'b', 'A', 'B'), 'AaBb',
            [('A', 0), ('B', 2), ('a', 1), ('b', 3)])

    def test_finditer_overlap(self):
        # All keywords share the same start position.
        self._assert_matches(
            self._build('a', 'ab', 'abc', 'abcd'), 'abcd',
            [('a', 0), ('ab', 0), ('abc', 0), ('abcd', 0)])

    def test_finditer_reverse_overlap(self):
        # All keywords share the same end position.
        self._assert_matches(
            self._build('d', 'cd', 'bcd', 'abcd'), 'abcd',
            [('abcd', 0), ('bcd', 1), ('cd', 2), ('d', 3)])

    def test_deepcopy_builder(self):
        # Keywords added to a deep-copied builder must only show up in the
        # machine built from the copy.
        from copy import deepcopy
        s = self._swrap

        builder1 = acora.AcoraBuilder(*[s(kw) for kw in ('a', 'b', 'c')])
        builder2 = deepcopy(builder1)
        builder2.add(s('ab'), s('bc'))

        machine1 = builder1.build(acora=self.acora)
        machine2 = builder2.build(acora=self.acora)

        self._assert_matches(
            machine1, 'abcd', [('a', 0), ('b', 1), ('c', 2)])
        self._assert_matches(
            machine2, 'abcd',
            [('a', 0), ('ab', 0), ('b', 1), ('bc', 1), ('c', 2)])

    def test_deepcopy_machine(self):
        from copy import deepcopy
        self._check_machine_copy(deepcopy)

    def test_pickle_machine(self):
        # Round trip with the default pickle protocol.
        import pickle
        self._check_machine_copy(
            lambda machine: pickle.loads(pickle.dumps(machine)))

    def test_pickle2_machine(self):
        # Round trip with the highest pickle protocol available.
        import pickle
        self._check_machine_copy(
            lambda machine: pickle.loads(
                pickle.dumps(machine, protocol=pickle.HIGHEST_PROTOCOL)))

    def test_pickle_machine_new(self):
        # The pickle is loaded only after the builder and the original
        # machine were deleted and a garbage collection was run.
        import gc
        import pickle
        s = self._swrap

        builder = acora.AcoraBuilder(*[s(kw) for kw in ('a', 'bc', 'c')])
        ac = builder.build(acora=self.acora)
        pickled = pickle.dumps(ac)

        del builder, ac
        gc.collect()

        self._assert_matches(
            pickle.loads(pickled), 'abcd', [('a', 0), ('bc', 1), ('c', 2)])
272
273
class UnicodeAcoraTest(unittest.TestCase, AcoraTest):
    """Runs the shared tests plus unicode-only tests with ``UnicodeAcora``."""
    # only unicode data tests
    from acora import UnicodeAcora as acora

    def _swrap(self, s):
        """Convert an ASCII literal, possibly containing escapes, to text.

        Unicode input is encoded as ASCII first; the result is then decoded
        by ``unescape_unicode()``.
        """
        raw = s.encode('ascii') if isinstance(s, unicode) else s
        return unescape_unicode(raw)

    def test_finditer_line_endings(self):
        # Group the matched letters by line, counting a "\n" that directly
        # follows a "\r" line break as part of that same break.
        machine = self._build_ignore_case('a', 'b', 'c', 'd', '\r', '\n')
        text = self._swrap('Aa\r\nB\nbC\n\rcD\r\nd')

        line_count = 0
        finished_lines = []
        pending = []
        previous_break = None
        for keyword, _pos in machine.finditer(text):
            if keyword not in '\r\n':
                previous_break = None
                pending.append(keyword)
                continue
            if previous_break == '\r' and keyword == '\n':
                # second half of a "\r\n" pair, already counted
                continue
            finished_lines.append(tuple(pending))
            pending = []
            previous_break = keyword
            line_count += 1

        # the letters after the last line break
        finished_lines.append(tuple(pending))

        self.assertEqual(line_count, 5)
        self.assertEqual(
            finished_lines,
            [('a', 'a'), ('b',), ('b', 'c'), (), ('c', 'd'), ('d',)])

    def test_finditer_single_keyword_unicode(self):
        # Keyword and text are written as escapes and decoded by _swrap().
        machine = self._build("\\uF8D2")
        found = list(machine.finditer(self._swrap("\\uF8D1\\uF8D2\\uF8D3")))
        self.assertEqual(found, self._result([("\\uF8D2", 1)]))

    def test_finditer_single_keyword_non_bmp(self):
        # Same as above, with characters written as 8-digit escapes.
        machine = self._build("\\U0001F8D2")
        found = list(
            machine.finditer(self._swrap("\\U0001F8D1\\U0001F8D2\\uF8D3")))
        self.assertEqual(found, self._result([("\\U0001F8D2", 1)]))

    def test_finditer_ignore_case_single_char(self):
        machine = self._build_ignore_case('a', 'b', 'c', 'd')
        found = sorted(machine.finditer(self._swrap('AaBbCcDd')))
        expected = self._result([
            ('a', 0), ('a', 1), ('b', 2), ('b', 3),
            ('c', 4), ('c', 5), ('d', 6), ('d', 7),
        ])
        self.assertEqual(found, expected)

    def test_finditer_ignore_case_words(self):
        machine = self._build_ignore_case(
            'aAbb', 'bc', 'cc', 'Cd', 'ccD', 'bbb', 'cB')
        found = sorted(machine.finditer(self._swrap('AaBbCcDd')))
        expected = self._result([
            ('Cd', 5), ('aAbb', 0), ('bc', 3), ('cc', 4), ('ccD', 4),
        ])
        self.assertEqual(found, expected)

    def test_finditer_ignore_case_redundant(self):
        # Keywords that differ only in case are each expected at every position.
        machine = self._build_ignore_case('a', 'b', 'A', 'B')
        found = sorted(machine.finditer(self._swrap('AaBb')))
        expected = self._result([
            ('A', 0), ('A', 1), ('B', 2), ('B', 3),
            ('a', 0), ('a', 1), ('b', 2), ('b', 3),
        ])
        self.assertEqual(found, expected)
346
347
class BytesAcoraTest(unittest.TestCase, AcoraTest):
    """Runs the shared tests plus byte-string and file tests with ``BytesAcora``."""
    # only byte data tests
    from acora import BytesAcora as acora

    # Fixture for the "*_check" tests: 'abc' at the very start, 'abcde' at
    # the very end, and a long run of 'a's and 'b's in between.
    simple_kwds = [
        'abc'.encode('ASCII'),
        'abcde'.encode('ASCII'),
    ]
    simple_data = ''.join(['abc', ('a'*100+'b'*100)*1000, 'abcde'])
    last_match_pos = len(simple_data) - 5
    expected_result = [
        (simple_kwds[0], 0),
        (simple_kwds[0], last_match_pos),
        (simple_kwds[1], last_match_pos),
    ]

    def _swrap(self, s):
        """Return *s* as a byte string, encoding unicode input as ISO-8859-1."""
        if not isinstance(s, unicode):
            return s
        return s.encode('ISO-8859-1')

    def _search_in_file(self, ac, data):
        """Write *data* (ASCII text) to a temporary file and search that file.

        Returns the list of results of ``ac.filefind()`` on the rewound
        file; the file is closed before returning.
        """
        import tempfile
        with tempfile.TemporaryFile() as tmp:
            tmp.write(data.encode('ASCII'))
            tmp.seek(0)
            return list(ac.filefind(tmp))

    def test_filefind_empty(self):
        # A machine without keywords finds nothing in the large fixture.
        machine = self._build()
        stream = BytesIO(self.search_string)
        self.assertEqual(list(machine.filefind(stream)), [])

    def test_large_filelike_searching(self):
        machine = self._build('SADHFCAL'.encode('ASCII'),
                              'bdeg'.encode('ASCII'))
        stream = BytesIO(self.search_string)
        matches = list(machine.filefind(stream))
        self.assertEqual(len(matches), 6000)

    def test_large_filelike_searching_check(self):
        machine = self._build(*self.simple_kwds)
        stream = BytesIO(self.simple_data)
        self.assertEqual(list(machine.filefind(stream)), self.expected_result)

    def test_file_searching(self):
        # The keywords are passed as ONE list argument here, not unpacked.
        keywords = [kw.encode('ASCII') for kw in ('a', 'b', 'ab', 'abc')]
        machine = self._build(keywords)
        matches = self._search_in_file(machine, 'abbabc')
        self.assertEqual(len(matches), 8)

    def test_large_file_searching(self):
        machine = self._build('SADHFCAL'.encode('ASCII'),
                              'bdeg'.encode('ASCII'))
        matches = self._search_in_file(machine, self.search_string)
        self.assertEqual(len(matches), 6000)

    def test_large_file_searching_check(self):
        machine = self._build(*self.simple_kwds)
        matches = self._search_in_file(machine, self.simple_data)
        self.assertEqual(matches, self.expected_result)

    def test_binary_data_search(self):
        # Non-ASCII pattern surrounded by ten padding bytes on each side.
        pattern = self._swrap('\xa5\x66\x80')
        padding = self._swrap(10 * '\xf0')
        machine = self._build(pattern)
        self.assertEqual(
            machine.findall(padding + pattern + padding), [(pattern, 10)])

    def test_binary_data_search_start(self):
        # Pattern at the very beginning of the data.
        pattern = self._swrap('\xa5\x66\x80')
        padding = self._swrap(10 * '\xf0')
        machine = self._build(pattern)
        self.assertEqual(machine.findall(pattern + padding), [(pattern, 0)])

    def test_binary_data_search_end(self):
        # Pattern at the very end of the data.
        pattern = self._swrap('\xa5\x66\x80')
        padding = self._swrap(10 * '\xf0')
        machine = self._build(pattern)
        self.assertEqual(machine.findall(padding + pattern), [(pattern, 10)])
430
431
class PyUnicodeAcoraTest(UnicodeAcoraTest):
    # Inherits every UnicodeAcoraTest test and only swaps the machine class:
    # the 'acora' class attribute (read as self.acora by the build helpers)
    # is rebound to PyAcora.
    from acora import PyAcora as acora
434
435
class PyBytesAcoraTest(BytesAcoraTest):
    # Inherits every BytesAcoraTest test and only swaps the machine class:
    # the 'acora' class attribute (read as self.acora by the build helpers)
    # is rebound to PyAcora.
    from acora import PyAcora as acora
438
439
def suite():
    """Collect all tests of this module into one ``unittest.TestSuite``.

    The suite contains the four test case classes defined in this module,
    the doctests of this module and the doctests of ``README.rst``.

    Returns the assembled ``TestSuite``.
    """
    import doctest
    # unittest.makeSuite() was deprecated in Python 3.11 and removed in
    # 3.13; the default loader does the same job on every version.
    load_tests_from = unittest.defaultTestLoader.loadTestsFromTestCase
    # The doctest factories are called right here, not in a helper,
    # because without an explicit module they inspect their caller.
    tests = unittest.TestSuite([
        load_tests_from(UnicodeAcoraTest),
        load_tests_from(PyUnicodeAcoraTest),
        load_tests_from(BytesAcoraTest),
        load_tests_from(PyBytesAcoraTest),
        doctest.DocTestSuite(),
        doctest.DocFileSuite('README.rst'),
    ])
    return tests
451
452
if __name__ == "__main__":
    # Verbosity is derived from the command line: each '-v' adds one level,
    # each '-vv' adds two, capped at 2.
    args = sys.argv[1:]
    verbosity = min(2, args.count('-v') + args.count('-vv')*2)
    result = unittest.TextTestRunner(verbosity=verbosity).run(suite())
    # Report failures through the exit status so that shells and CI jobs
    # running this script can detect a broken test run.
    sys.exit(0 if result.wasSuccessful() else 1)
457