"""
Simple test suite for acora.

The shared test cases live in the ``AcoraTest`` mixin.  Concrete
``unittest.TestCase`` subclasses select the search engine under test via
their ``acora`` class attribute and provide ``_swrap()`` to convert the
literal test strings into the string type that engine is fed with.
"""

import acora

# Enables the debug output (tree/machine dumps) in AcoraTest._build_machine().
DOTDEBUG = False

if acora.BytesAcora is acora.PyAcora or acora.UnicodeAcora is acora.PyAcora:
    print("WARNING: '_acora' C extension not imported, only testing Python implementation")

# The dot-dump helpers are optional; fall back to no-ops when unavailable.
try:
    from acora._acora import tree_to_dot
except ImportError:
    tree_to_dot = lambda x: None

try:
    from acora._cacora import machine_to_dot
except ImportError:
    machine_to_dot = lambda x: None

import sys
import unittest
import codecs
import string

# compat stuff ...

try:
    unicode
except NameError:
    unicode = str

try:
    bytes
except NameError:
    bytes = str

try:
    # Python 2.6+
    from io import StringIO as _StringIO, BytesIO as _BytesIO
except ImportError:
    # Python 2
    from StringIO import StringIO as _StringIO
    _BytesIO = _StringIO


def BytesIO(*args):
    """Create a bytes buffer; a unicode initial value is encoded as UTF-8."""
    if args and isinstance(args[0], unicode):
        args = (args[0].encode("UTF-8"),)
    return _BytesIO(*args)


def StringIO(*args):
    """Create a text buffer; a bytes initial value is decoded as UTF-8."""
    if args and isinstance(args[0], bytes):
        args = (args[0].decode("UTF-8"),)
    # Must be the text buffer: the argument was just converted to text, and
    # handing text to the bytes buffer fails on Python 3.
    return _StringIO(*args)


unicode_unescaper = codecs.lookup("unicode_escape")


def unescape_unicode(s):
    """Decode *s* with the 'unicode_escape' codec and return the text."""
    return unicode_unescaper.decode(s)[0]


def prepare_test_data():
    """Build the shared test fixture.

    Returns a ``(search_string, all_keywords)`` tuple: a long text made of
    lower-case, original and upper-case copies of a base string, repeated
    1000 times, and a list of keywords.  On Python 2, both are converted
    to unicode.
    """
    s = ('bdfdaskdjfhaslkdhfsadhfklashdflabcasdabcdJAKHDBVDFLNFCBLSADHFCALKSJ'
         'jklhcnajskbhfasjhancfksjdfhbvaliuradefhzcbdegnashdgfbcjaabesdhgkfcnash'
         'fdkhbdegxcbgjsvdhabcabcfcgbnxahsdbgfbcakjsdhgnfcxsababcmdabe')
    s = s.lower() + s + s.upper()
    search_string = s * 1000

    all_keywords = [
        'ab', 'abc', 'abcd', 'abcabc', 'ababc', 'ABBBC', 'ABCABC',
        'bdfd', 'ade', 'abe', 'bdeg', 'fklash',
        'gnfcxsababcmdabe', 'SADHFCAL',
        'notthere', 'not-to-be-found', 'not-to-be-found-either',
    ]

    if sys.version_info[0] < 3:
        all_keywords = list(map(unicode, all_keywords))
        search_string = unicode(search_string)

    return search_string, all_keywords


class AcoraTest(object):
    """Mixin with the test cases shared by all engine/string type variants.

    Subclasses must provide an ``acora`` class attribute (passed to
    ``AcoraBuilder.build()``) and a ``_swrap(s)`` method that converts a
    literal test string into the string type under test.
    """
    search_string, all_keywords = prepare_test_data()

    def _build_machine(self, keywords, **builder_options):
        """Wrap *keywords*, feed them to an AcoraBuilder and build a machine.

        *builder_options* are passed through to ``acora.AcoraBuilder``.
        When ``DOTDEBUG`` is set, the tree is dumped before and after the
        build, and so is the machine unless it is a ``PyAcora`` instance.
        """
        wrapped = [self._swrap(keyword) for keyword in keywords]
        builder = acora.AcoraBuilder(*wrapped, **builder_options)
        if DOTDEBUG:
            print('Initial tree:')
            tree_to_dot(builder.tree)
        machine = builder.build(acora=self.acora)
        if DOTDEBUG:
            print('\nProcessed tree:')
            tree_to_dot(builder.tree)
            if not isinstance(machine, acora.PyAcora):
                print('\nMachine:')
                machine_to_dot(machine)
        return machine

    def _build(self, *keywords):
        """Build a search machine for *keywords* with default options."""
        return self._build_machine(keywords)

    def _build_ignore_case(self, *keywords):
        """Build a search machine for *keywords* with ``ignore_case=True``."""
        return self._build_machine(keywords, ignore_case=True)

    def _result(self, result):
        """Wrap the keywords of expected ``(keyword, position)`` pairs."""
        s = self._swrap
        return [(s(k), pos) for k, pos in result]

    # basic tests
    #
    # NOTE: test methods are documented with comments rather than docstrings
    # because unittest prints the first docstring line instead of the test
    # name in verbose mode.

    def test_finditer_empty(self):
        # A machine without keywords finds nothing.
        s = self._swrap
        finditer = self._build().finditer
        self.assertEqual(
            sorted(finditer(s('abcd'))),
            self._result([]))

    def test_finditer_single_keyword(self):
        s = self._swrap
        finditer = self._build('bc').finditer
        self.assertEqual(
            sorted(finditer(s('abcd'))),
            self._result([('bc', 1)]))

    def test_finditer_many_keywords(self):
        # One keyword per ASCII letter.
        s = self._swrap
        finditer = self._build(*string.ascii_letters).finditer
        self.assertEqual(
            sorted(finditer(s('abcd'))),
            self._result([('a', 0), ('b', 1), ('c', 2), ('d', 3)]))

    def test_finditer_many_keywords_not_found(self):
        # Letters as keywords, digits as data: no match expected.
        s = self._swrap
        finditer = self._build(*string.ascii_letters).finditer
        self.assertEqual(sorted(finditer(s(string.digits*100))), [])

    def test_finditer_sequential(self):
        s = self._swrap
        finditer = self._build('a', 'b', 'c', 'd').finditer
        self.assertEqual(
            sorted(finditer(s('abcd'))),
            self._result([('a', 0), ('b', 1), ('c', 2), ('d', 3)]))

    def test_finditer_redundant(self):
        # Keywords that differ only in case are distinct in a
        # case-sensitive search.
        s = self._swrap
        finditer = self._build('a', 'b', 'A', 'B').finditer
        self.assertEqual(
            sorted(finditer(s('AaBb'))),
            self._result([('A', 0), ('B', 2), ('a', 1), ('b', 3)]))

    def test_finditer_overlap(self):
        # Every keyword is a prefix of the next one.
        s = self._swrap
        finditer = self._build('a', 'ab', 'abc', 'abcd').finditer
        self.assertEqual(
            sorted(finditer(s('abcd'))),
            self._result([('a', 0), ('ab', 0), ('abc', 0), ('abcd', 0)]))

    def test_finditer_reverse_overlap(self):
        # Every keyword is a suffix of the next one.
        s = self._swrap
        finditer = self._build('d', 'cd', 'bcd', 'abcd').finditer
        self.assertEqual(
            sorted(finditer(s('abcd'))),
            self._result([('abcd', 0), ('bcd', 1), ('cd', 2), ('d', 3)]))

    def test_deepcopy_builder(self):
        # Keywords added to a deep-copied builder must not show up in the
        # machine built from the original builder.
        from copy import deepcopy
        s = self._swrap

        builder1 = acora.AcoraBuilder(*list(map(s, ['a', 'b', 'c'])))
        builder2 = deepcopy(builder1)
        builder2.add(s('ab'), s('bc'))

        finditer1 = builder1.build(acora=self.acora).finditer
        finditer2 = builder2.build(acora=self.acora).finditer

        self.assertEqual(
            sorted(finditer1(s('abcd'))),
            self._result([('a', 0), ('b', 1), ('c', 2)]))

        self.assertEqual(
            sorted(finditer2(s('abcd'))),
            self._result([('a', 0), ('ab', 0), ('b', 1), ('bc', 1), ('c', 2)]))

    def test_deepcopy_machine(self):
        # A deep-copied machine finds the same matches as the original.
        from copy import deepcopy
        s = self._swrap

        builder = acora.AcoraBuilder(*list(map(s, ['a', 'b', 'c'])))
        ac1 = builder.build(acora=self.acora)
        ac2 = deepcopy(ac1)

        self.assertEqual(
            sorted(ac1.finditer(s('abcd'))),
            self._result([('a', 0), ('b', 1), ('c', 2)]))

        self.assertEqual(
            sorted(ac2.finditer(s('abcd'))),
            self._result([('a', 0), ('b', 1), ('c', 2)]))

    def test_pickle_machine(self):
        # Pickle round-trip using the default protocol.
        import pickle
        s = self._swrap

        builder = acora.AcoraBuilder(*list(map(s, ['a', 'b', 'c'])))
        ac1 = builder.build(acora=self.acora)
        ac2 = pickle.loads(pickle.dumps(ac1))

        self.assertEqual(
            sorted(ac1.finditer(s('abcd'))),
            self._result([('a', 0), ('b', 1), ('c', 2)]))

        self.assertEqual(
            sorted(ac2.finditer(s('abcd'))),
            self._result([('a', 0), ('b', 1), ('c', 2)]))

    def test_pickle2_machine(self):
        # Pickle round-trip using the highest available protocol.
        import pickle
        s = self._swrap

        builder = acora.AcoraBuilder(*list(map(s, ['a', 'b', 'c'])))
        ac1 = builder.build(acora=self.acora)
        ac2 = pickle.loads(pickle.dumps(ac1, protocol=pickle.HIGHEST_PROTOCOL))

        self.assertEqual(
            sorted(ac1.finditer(s('abcd'))),
            self._result([('a', 0), ('b', 1), ('c', 2)]))

        self.assertEqual(
            sorted(ac2.finditer(s('abcd'))),
            self._result([('a', 0), ('bc', 1), ('c', 2)][:1]
                         + [('b', 1), ('c', 2)]))

    def test_pickle_machine_new(self):
        # Unpickle after the builder and the original machine are gone.
        s = self._swrap

        builder = acora.AcoraBuilder(*list(map(s, ['a', 'bc', 'c'])))
        ac = builder.build(acora=self.acora)

        import pickle
        p = pickle.dumps(ac)

        # Drop the originals and collect garbage before unpickling.
        del builder, ac
        import gc
        gc.collect()

        ac = pickle.loads(p)
        self.assertEqual(
            sorted(ac.finditer(s('abcd'))),
            self._result([('a', 0), ('bc', 1), ('c', 2)]))


class UnicodeAcoraTest(unittest.TestCase, AcoraTest):
    """Shared tests plus unicode-only tests, run against ``UnicodeAcora``."""
    # only unicode data tests
    from acora import UnicodeAcora as acora

    def _swrap(self, s):
        """Return *s* as text with its backslash escapes resolved.

        Unicode input is encoded to ASCII first; the result is then decoded
        with the 'unicode_escape' codec, which is why the tests below spell
        non-ASCII characters as escaped ``\\\\uXXXX`` literals.
        """
        if isinstance(s, unicode):
            s = s.encode('ascii')
        return unescape_unicode(s)

    def test_finditer_line_endings(self):
        # Group the keyword matches by input line, using the '\r' and '\n'
        # matches as line separators.
        s = self._swrap
        finditer = self._build_ignore_case('a', 'b', 'c', 'd', '\r', '\n').finditer

        line = 0
        line_matches = []
        current_line_matches = []
        last_ending = None
        for kw, pos in finditer(s('Aa\r\nB\nbC\n\rcD\r\nd')):
            if kw in '\r\n':
                if last_ending == '\r' and kw == '\n':
                    # '\r\n' counts as a single line ending
                    continue
                line_matches.append(tuple(current_line_matches))
                del current_line_matches[:]
                last_ending = kw
                line += 1
            else:
                last_ending = None
                current_line_matches.append(kw)

        # matches after the last line ending
        line_matches.append(tuple(current_line_matches))

        self.assertEqual(line, 5)
        self.assertEqual(
            line_matches,
            [('a', 'a'), ('b',), ('b', 'c'), (), ('c', 'd'), ('d',)])

    def test_finditer_single_keyword_unicode(self):
        s = self._swrap
        finditer = self._build("\\uF8D2").finditer
        self.assertEqual(
            list(finditer(s("\\uF8D1\\uF8D2\\uF8D3"))),
            self._result([("\\uF8D2", 1)]))

    def test_finditer_single_keyword_non_bmp(self):
        s = self._swrap
        finditer = self._build("\\U0001F8D2").finditer
        self.assertEqual(
            list(finditer(s("\\U0001F8D1\\U0001F8D2\\uF8D3"))),
            self._result([("\\U0001F8D2", 1)]))

    def test_finditer_ignore_case_single_char(self):
        s = self._swrap
        finditer = self._build_ignore_case('a', 'b', 'c', 'd').finditer
        self.assertEqual(
            sorted(finditer(s('AaBbCcDd'))),
            self._result([('a', 0), ('a', 1), ('b', 2), ('b', 3),
                          ('c', 4), ('c', 5), ('d', 6), ('d', 7)]))

    def test_finditer_ignore_case_words(self):
        s = self._swrap
        finditer = self._build_ignore_case('aAbb', 'bc', 'cc', 'Cd', 'ccD', 'bbb', 'cB').finditer
        self.assertEqual(
            sorted(finditer(s('AaBbCcDd'))),
            self._result([('Cd', 5), ('aAbb', 0), ('bc', 3), ('cc', 4), ('ccD', 4)]))

    def test_finditer_ignore_case_redundant(self):
        # Keywords that differ only in case are all reported for each
        # case-insensitive match.
        s = self._swrap
        finditer = self._build_ignore_case('a', 'b', 'A', 'B').finditer
        self.assertEqual(
            sorted(finditer(s('AaBb'))),
            self._result([('A', 0), ('A', 1), ('B', 2), ('B', 3),
                          ('a', 0), ('a', 1), ('b', 2), ('b', 3)]))


class BytesAcoraTest(unittest.TestCase, AcoraTest):
    """Shared tests plus bytes/file tests, run against ``BytesAcora``."""
    # only byte data tests
    from acora import BytesAcora as acora

    # Data with exactly three expected matches: 'abc' at the very start,
    # and 'abc' plus 'abcde' at the start of the trailing 'abcde'.
    simple_data = 'abc' + ('a'*100+'b'*100)*1000 + 'abcde'
    simple_kwds = ['abc'.encode('ASCII'),
                   'abcde'.encode('ASCII')]
    last_match_pos = len(simple_data) - 5
    expected_result = [(simple_kwds[0], 0),
                       (simple_kwds[0], last_match_pos),
                       (simple_kwds[1], last_match_pos)]

    def _swrap(self, s):
        """Return *s* as a byte string; unicode is encoded as ISO-8859-1."""
        if isinstance(s, unicode):
            s = s.encode('ISO-8859-1')
        return s

    def _search_in_file(self, ac, data):
        """Write *data* (ASCII text) to a temporary file and search it.

        Returns the list of results of ``ac.filefind()`` on that file.
        """
        import tempfile
        with tempfile.TemporaryFile() as tmp:
            tmp.write(data.encode('ASCII'))
            tmp.seek(0)
            # exhaust the search before the file gets closed
            return list(ac.filefind(tmp))

    def test_filefind_empty(self):
        filefind = self._build().filefind
        data = BytesIO(self.search_string)
        self.assertEqual(list(filefind(data)), [])

    def test_large_filelike_searching(self):
        filefind = self._build('SADHFCAL'.encode('ASCII'),
                               'bdeg'.encode('ASCII')).filefind
        data = BytesIO(self.search_string)
        result = list(filefind(data))
        self.assertEqual(len(result), 6000)

    def test_large_filelike_searching_check(self):
        ac = self._build(*self.simple_kwds)
        data = BytesIO(self.simple_data)
        result = list(ac.filefind(data))
        self.assertEqual(result, self.expected_result)

    def test_file_searching(self):
        # NOTE(review): the keywords are passed as one list argument rather
        # than being unpacked - presumably to exercise list input of the
        # builder; confirm against AcoraBuilder.
        ac = self._build([ kw.encode('ASCII')
                           for kw in ('a', 'b', 'ab', 'abc') ])
        result = self._search_in_file(ac, 'abbabc')
        self.assertEqual(len(result), 8)

    def test_large_file_searching(self):
        ac = self._build('SADHFCAL'.encode('ASCII'),
                         'bdeg'.encode('ASCII'))
        result = self._search_in_file(ac, self.search_string)
        self.assertEqual(len(result), 6000)

    def test_large_file_searching_check(self):
        ac = self._build(*self.simple_kwds)
        result = self._search_in_file(ac, self.simple_data)
        self.assertEqual(result, self.expected_result)

    def test_binary_data_search(self):
        # non-ASCII pattern in the middle of the data
        pattern = self._swrap('\xa5\x66\x80')
        ac = self._build(pattern)
        mainString = self._swrap(10 * '\xf0') + pattern + self._swrap(10 * '\xf0')
        result = ac.findall(mainString)
        self.assertEqual(result, [(pattern, 10)])

    def test_binary_data_search_start(self):
        # non-ASCII pattern at the start of the data
        pattern = self._swrap('\xa5\x66\x80')
        ac = self._build(pattern)
        mainString = pattern + self._swrap(10 * '\xf0')
        result = ac.findall(mainString)
        self.assertEqual(result, [(pattern, 0)])

    def test_binary_data_search_end(self):
        # non-ASCII pattern at the end of the data
        pattern = self._swrap('\xa5\x66\x80')
        ac = self._build(pattern)
        mainString = self._swrap(10 * '\xf0') + pattern
        result = ac.findall(mainString)
        self.assertEqual(result, [(pattern, 10)])


class PyUnicodeAcoraTest(UnicodeAcoraTest):
    """The unicode tests, run against ``PyAcora``."""
    from acora import PyAcora as acora


class PyBytesAcoraTest(BytesAcoraTest):
    """The bytes tests, run against ``PyAcora``."""
    from acora import PyAcora as acora


def suite():
    """Return a suite of all test classes, module doctests and README.rst."""
    import doctest
    # unittest.makeSuite() is deprecated and was removed in Python 3.13;
    # the default loader collects the same 'test*' methods.
    load_tests_from = unittest.defaultTestLoader.loadTestsFromTestCase
    tests = unittest.TestSuite([
        load_tests_from(UnicodeAcoraTest),
        load_tests_from(PyUnicodeAcoraTest),
        load_tests_from(BytesAcoraTest),
        load_tests_from(PyBytesAcoraTest),
        doctest.DocTestSuite(),
        doctest.DocFileSuite('README.rst'),
    ])
    return tests


if __name__ == "__main__":
    args = sys.argv[1:]
    # each '-v' raises the verbosity by one, '-vv' by two, capped at 2
    verbosity = min(2, args.count('-v') + args.count('-vv')*2)
    unittest.TextTestRunner(verbosity=verbosity).run(suite())