1# cython: profile=False 2""" 3Cython wrapper for libdatrie. 4""" 5 6from cpython.version cimport PY_MAJOR_VERSION 7from cython.operator import dereference as deref 8from libc.stdlib cimport malloc, free 9from libc cimport stdio 10from libc cimport string 11cimport stdio_ext 12cimport cdatrie 13 14import itertools 15import warnings 16import sys 17import tempfile 18 19try: 20 from collections.abc import MutableMapping 21except ImportError: 22 from collections import MutableMapping 23 24try: 25 import cPickle as pickle 26except ImportError: 27 import pickle 28 29class DatrieError(Exception): 30 pass 31 32RAISE_KEY_ERROR = object() 33RERAISE_KEY_ERROR = object() 34DELETED_OBJECT = object() 35 36 37cdef class BaseTrie: 38 """ 39 Wrapper for libdatrie's trie. 40 41 Keys are unicode strings, values are integers -2147483648 <= x <= 2147483647. 42 """ 43 44 cdef AlphaMap alpha_map 45 cdef cdatrie.Trie *_c_trie 46 47 def __init__(self, alphabet=None, ranges=None, AlphaMap alpha_map=None, _create=True): 48 """ 49 For efficiency trie needs to know what unicode symbols 50 it should be able to store so this constructor requires 51 either ``alphabet`` (a string/iterable with all allowed characters), 52 ``ranges`` (a list of (begin, end) pairs, e.g. [('a', 'z')]) 53 or ``alpha_map`` (:class:`datrie.AlphaMap` instance). 54 """ 55 if self._c_trie is not NULL: 56 return 57 58 if not _create: 59 return 60 61 if alphabet is None and ranges is None and alpha_map is None: 62 raise ValueError( 63 "Please provide alphabet, ranges or alpha_map argument.") 64 65 if alpha_map is None: 66 alpha_map = AlphaMap(alphabet, ranges) 67 68 self.alpha_map = alpha_map 69 self._c_trie = cdatrie.trie_new(alpha_map._c_alpha_map) 70 if self._c_trie is NULL: 71 raise MemoryError() 72 73 def __dealloc__(self): 74 if self._c_trie is not NULL: 75 cdatrie.trie_free(self._c_trie) 76 77 def update(self, other=(), **kwargs): 78 if PY_MAJOR_VERSION == 2: 79 if kwargs: 80 raise TypeError("keyword arguments are not supported.") 81 82 if hasattr(other, "keys"): 83 for key in other: 84 self[key] = other[key] 85 else: 86 for key, value in other: 87 self[key] = value 88 89 for key in kwargs: 90 self[key] = kwargs[key] 91 92 def clear(self): 93 cdef AlphaMap alpha_map = self.alpha_map.copy() 94 _c_trie = cdatrie.trie_new(alpha_map._c_alpha_map) 95 if _c_trie is NULL: 96 raise MemoryError() 97 98 cdatrie.trie_free(self._c_trie) 99 self._c_trie = _c_trie 100 101 cpdef bint is_dirty(self): 102 """ 103 Returns True if the trie is dirty with some pending changes 104 and needs saving to synchronize with the file. 105 """ 106 return cdatrie.trie_is_dirty(self._c_trie) 107 108 def save(self, path): 109 """ 110 Saves this trie. 111 """ 112 with open(path, "wb", 0) as f: 113 self.write(f) 114 115 def write(self, f): 116 """ 117 Writes a trie to a file. File-like objects without real 118 file descriptors are not supported. 119 """ 120 f.flush() 121 122 cdef stdio.FILE* f_ptr = stdio_ext.fdopen(f.fileno(), "w") 123 if f_ptr == NULL: 124 raise IOError("Can't open file descriptor") 125 126 cdef int res = cdatrie.trie_fwrite(self._c_trie, f_ptr) 127 if res == -1: 128 raise IOError("Can't write to file") 129 130 stdio.fflush(f_ptr) 131 132 @classmethod 133 def load(cls, path): 134 """ 135 Loads a trie from file. 136 """ 137 with open(path, "rb", 0) as f: 138 return cls.read(f) 139 140 @classmethod 141 def read(cls, f): 142 """ 143 Creates a new Trie by reading it from file. 144 File-like objects without real file descriptors are not supported. 145 146 # XXX: does it work properly in subclasses? 147 """ 148 cdef BaseTrie trie = cls(_create=False) 149 trie._c_trie = _load_from_file(f) 150 return trie 151 152 def __reduce__(self): 153 with tempfile.NamedTemporaryFile() as f: 154 self.write(f) 155 f.seek(0) 156 state = f.read() 157 return BaseTrie, (None, None, None, False), state 158 159 def __setstate__(self, bytes state): 160 assert self._c_trie is NULL 161 with tempfile.NamedTemporaryFile() as f: 162 f.write(state) 163 f.flush() 164 f.seek(0) 165 self._c_trie = _load_from_file(f) 166 167 def __setitem__(self, unicode key, cdatrie.TrieData value): 168 self._setitem(key, value) 169 170 cdef void _setitem(self, unicode key, cdatrie.TrieData value): 171 cdef cdatrie.AlphaChar* c_key = new_alpha_char_from_unicode(key) 172 try: 173 cdatrie.trie_store(self._c_trie, c_key, value) 174 finally: 175 free(c_key) 176 177 def __getitem__(self, unicode key): 178 return self._getitem(key) 179 180 def get(self, unicode key, default=None): 181 try: 182 return self._getitem(key) 183 except KeyError: 184 return default 185 186 cdef cdatrie.TrieData _getitem(self, unicode key) except -1: 187 cdef cdatrie.TrieData data 188 cdef cdatrie.AlphaChar* c_key = new_alpha_char_from_unicode(key) 189 190 try: 191 found = cdatrie.trie_retrieve(self._c_trie, c_key, &data) 192 finally: 193 free(c_key) 194 195 if not found: 196 raise KeyError(key) 197 return data 198 199 def __contains__(self, unicode key): 200 cdef cdatrie.AlphaChar* c_key = new_alpha_char_from_unicode(key) 201 try: 202 return cdatrie.trie_retrieve(self._c_trie, c_key, NULL) 203 finally: 204 free(c_key) 205 206 def __delitem__(self, unicode key): 207 self._delitem(key) 208 209 def pop(self, unicode key, default=None): 210 try: 211 value = self[key] 212 self._delitem(key) 213 return value 214 except KeyError: 215 return default 216 217 cpdef bint _delitem(self, unicode key) except -1: 218 """ 219 Deletes an entry for the given key from the trie. Returns 220 boolean value indicating whether the key exists and is removed. 221 """ 222 cdef cdatrie.AlphaChar* c_key = new_alpha_char_from_unicode(key) 223 try: 224 found = cdatrie.trie_delete(self._c_trie, c_key) 225 finally: 226 free(c_key) 227 228 if not found: 229 raise KeyError(key) 230 231 @staticmethod 232 cdef int len_enumerator(cdatrie.AlphaChar *key, cdatrie.TrieData key_data, 233 void *counter_ptr): 234 (<int *>counter_ptr)[0] += 1 235 return True 236 237 def __len__(self): 238 cdef int counter = 0 239 cdatrie.trie_enumerate(self._c_trie, 240 <cdatrie.TrieEnumFunc>(self.len_enumerator), 241 &counter) 242 return counter 243 244 def __richcmp__(self, other, int op): 245 if op == 2: # == 246 if other is self: 247 return True 248 elif not isinstance(other, BaseTrie): 249 return False 250 251 for key in self: 252 if self[key] != other[key]: 253 return False 254 255 # XXX this can be written more efficiently via explicit iterators. 256 return len(self) == len(other) 257 elif op == 3: # != 258 return not (self == other) 259 260 raise TypeError("unorderable types: {0} and {1}".format( 261 self.__class__, other.__class__)) 262 263 def setdefault(self, unicode key, cdatrie.TrieData value): 264 return self._setdefault(key, value) 265 266 cdef cdatrie.TrieData _setdefault(self, unicode key, cdatrie.TrieData value): 267 cdef cdatrie.AlphaChar* c_key = new_alpha_char_from_unicode(key) 268 cdef cdatrie.TrieData data 269 270 try: 271 found = cdatrie.trie_retrieve(self._c_trie, c_key, &data) 272 if found: 273 return data 274 else: 275 cdatrie.trie_store(self._c_trie, c_key, value) 276 return value 277 finally: 278 free(c_key) 279 280 def iter_prefixes(self, unicode key): 281 ''' 282 Returns an iterator over the keys of this trie that are prefixes 283 of ``key``. 284 ''' 285 cdef cdatrie.TrieState* state = cdatrie.trie_root(self._c_trie) 286 if state == NULL: 287 raise MemoryError() 288 289 cdef int index = 1 290 try: 291 for char in key: 292 if not cdatrie.trie_state_walk(state, <cdatrie.AlphaChar> char): 293 return 294 if cdatrie.trie_state_is_terminal(state): 295 yield key[:index] 296 index += 1 297 finally: 298 cdatrie.trie_state_free(state) 299 300 def iter_prefix_items(self, unicode key): 301 ''' 302 Returns an iterator over the items (``(key,value)`` tuples) 303 of this trie that are associated with keys that are prefixes of ``key``. 304 ''' 305 cdef cdatrie.TrieState* state = cdatrie.trie_root(self._c_trie) 306 307 if state == NULL: 308 raise MemoryError() 309 310 cdef int index = 1 311 try: 312 for char in key: 313 if not cdatrie.trie_state_walk(state, <cdatrie.AlphaChar> char): 314 return 315 if cdatrie.trie_state_is_terminal(state): # word is found 316 yield key[:index], cdatrie.trie_state_get_data(state) 317 index += 1 318 finally: 319 cdatrie.trie_state_free(state) 320 321 def iter_prefix_values(self, unicode key): 322 ''' 323 Returns an iterator over the values of this trie that are associated 324 with keys that are prefixes of ``key``. 325 ''' 326 cdef cdatrie.TrieState* state = cdatrie.trie_root(self._c_trie) 327 328 if state == NULL: 329 raise MemoryError() 330 331 try: 332 for char in key: 333 if not cdatrie.trie_state_walk(state, <cdatrie.AlphaChar> char): 334 return 335 if cdatrie.trie_state_is_terminal(state): 336 yield cdatrie.trie_state_get_data(state) 337 finally: 338 cdatrie.trie_state_free(state) 339 340 def prefixes(self, unicode key): 341 ''' 342 Returns a list with keys of this trie that are prefixes of ``key``. 343 ''' 344 cdef cdatrie.TrieState* state = cdatrie.trie_root(self._c_trie) 345 if state == NULL: 346 raise MemoryError() 347 348 cdef list result = [] 349 cdef int index = 1 350 try: 351 for char in key: 352 if not cdatrie.trie_state_walk(state, <cdatrie.AlphaChar> char): 353 break 354 if cdatrie.trie_state_is_terminal(state): 355 result.append(key[:index]) 356 index += 1 357 return result 358 finally: 359 cdatrie.trie_state_free(state) 360 361 cpdef suffixes(self, unicode prefix=u''): 362 """ 363 Returns a list of this trie's suffixes. 364 If ``prefix`` is not empty, returns only the suffixes of words prefixed by ``prefix``. 365 """ 366 cdef bint success 367 cdef list res = [] 368 cdef BaseState state = BaseState(self) 369 370 if prefix is not None: 371 success = state.walk(prefix) 372 if not success: 373 return res 374 375 cdef BaseIterator iter = BaseIterator(state) 376 while iter.next(): 377 res.append(iter.key()) 378 379 return res 380 381 def prefix_items(self, unicode key): 382 ''' 383 Returns a list of the items (``(key,value)`` tuples) 384 of this trie that are associated with keys that are 385 prefixes of ``key``. 386 ''' 387 return self._prefix_items(key) 388 389 cdef list _prefix_items(self, unicode key): 390 cdef cdatrie.TrieState* state = cdatrie.trie_root(self._c_trie) 391 392 if state == NULL: 393 raise MemoryError() 394 395 cdef list result = [] 396 cdef int index = 1 397 try: 398 for char in key: 399 if not cdatrie.trie_state_walk(state, <cdatrie.AlphaChar> char): 400 break 401 if cdatrie.trie_state_is_terminal(state): # word is found 402 result.append( 403 (key[:index], 404 cdatrie.trie_state_get_data(state)) 405 ) 406 index += 1 407 return result 408 finally: 409 cdatrie.trie_state_free(state) 410 411 def prefix_values(self, unicode key): 412 ''' 413 Returns a list of the values of this trie that are associated 414 with keys that are prefixes of ``key``. 415 ''' 416 return self._prefix_values(key) 417 418 cdef list _prefix_values(self, unicode key): 419 cdef cdatrie.TrieState* state = cdatrie.trie_root(self._c_trie) 420 421 if state == NULL: 422 raise MemoryError() 423 424 cdef list result = [] 425 try: 426 for char in key: 427 if not cdatrie.trie_state_walk(state, <cdatrie.AlphaChar> char): 428 break 429 if cdatrie.trie_state_is_terminal(state): # word is found 430 result.append(cdatrie.trie_state_get_data(state)) 431 return result 432 finally: 433 cdatrie.trie_state_free(state) 434 435 def longest_prefix(self, unicode key, default=RAISE_KEY_ERROR): 436 """ 437 Returns the longest key in this trie that is a prefix of ``key``. 438 439 If the trie doesn't contain any prefix of ``key``: 440 - if ``default`` is given, returns it, 441 - otherwise raises ``KeyError``. 442 """ 443 cdef cdatrie.TrieState* state = cdatrie.trie_root(self._c_trie) 444 445 if state == NULL: 446 raise MemoryError() 447 448 cdef int index = 0, last_terminal_index = 0 449 450 try: 451 for ch in key: 452 if not cdatrie.trie_state_walk(state, <cdatrie.AlphaChar> ch): 453 break 454 455 index += 1 456 if cdatrie.trie_state_is_terminal(state): 457 last_terminal_index = index 458 459 if not last_terminal_index: 460 if default is RAISE_KEY_ERROR: 461 raise KeyError(key) 462 return default 463 464 return key[:last_terminal_index] 465 finally: 466 cdatrie.trie_state_free(state) 467 468 def longest_prefix_item(self, unicode key, default=RAISE_KEY_ERROR): 469 """ 470 Returns the item (``(key,value)`` tuple) associated with the longest 471 key in this trie that is a prefix of ``key``. 472 473 If the trie doesn't contain any prefix of ``key``: 474 - if ``default`` is given, returns it, 475 - otherwise raises ``KeyError``. 476 """ 477 return self._longest_prefix_item(key, default) 478 479 cdef _longest_prefix_item(self, unicode key, default=RAISE_KEY_ERROR): 480 cdef cdatrie.TrieState* state = cdatrie.trie_root(self._c_trie) 481 482 if state == NULL: 483 raise MemoryError() 484 485 cdef int index = 0, last_terminal_index = 0, data 486 487 try: 488 for ch in key: 489 if not cdatrie.trie_state_walk(state, <cdatrie.AlphaChar> ch): 490 break 491 492 index += 1 493 if cdatrie.trie_state_is_terminal(state): 494 last_terminal_index = index 495 data = cdatrie.trie_state_get_data(state) 496 497 if not last_terminal_index: 498 if default is RAISE_KEY_ERROR: 499 raise KeyError(key) 500 return default 501 502 return key[:last_terminal_index], data 503 504 finally: 505 cdatrie.trie_state_free(state) 506 507 def longest_prefix_value(self, unicode key, default=RAISE_KEY_ERROR): 508 """ 509 Returns the value associated with the longest key in this trie that is 510 a prefix of ``key``. 511 512 If the trie doesn't contain any prefix of ``key``: 513 - if ``default`` is given, return it 514 - otherwise raise ``KeyError`` 515 """ 516 return self._longest_prefix_value(key, default) 517 518 cdef _longest_prefix_value(self, unicode key, default=RAISE_KEY_ERROR): 519 cdef cdatrie.TrieState* state = cdatrie.trie_root(self._c_trie) 520 521 if state == NULL: 522 raise MemoryError() 523 524 cdef int data = 0 525 cdef char found = 0 526 527 try: 528 for ch in key: 529 if not cdatrie.trie_state_walk(state, <cdatrie.AlphaChar> ch): 530 break 531 532 if cdatrie.trie_state_is_terminal(state): 533 found = 1 534 data = cdatrie.trie_state_get_data(state) 535 536 if not found: 537 if default is RAISE_KEY_ERROR: 538 raise KeyError(key) 539 return default 540 541 return data 542 543 finally: 544 cdatrie.trie_state_free(state) 545 546 def has_keys_with_prefix(self, unicode prefix): 547 """ 548 Returns True if any key in the trie begins with ``prefix``. 549 """ 550 cdef cdatrie.TrieState* state = cdatrie.trie_root(self._c_trie) 551 if state == NULL: 552 raise MemoryError() 553 try: 554 for char in prefix: 555 if not cdatrie.trie_state_walk(state, <cdatrie.AlphaChar> char): 556 return False 557 return True 558 finally: 559 cdatrie.trie_state_free(state) 560 561 cpdef items(self, unicode prefix=None): 562 """ 563 Returns a list of this trie's items (``(key,value)`` tuples). 564 565 If ``prefix`` is not None, returns only the items 566 associated with keys prefixed by ``prefix``. 567 """ 568 cdef bint success 569 cdef list res = [] 570 cdef BaseState state = BaseState(self) 571 572 if prefix is not None: 573 success = state.walk(prefix) 574 if not success: 575 return res 576 577 cdef BaseIterator iter = BaseIterator(state) 578 579 if prefix is None: 580 while iter.next(): 581 res.append((iter.key(), iter.data())) 582 else: 583 while iter.next(): 584 res.append((prefix+iter.key(), iter.data())) 585 586 return res 587 588 def __iter__(self): 589 cdef BaseIterator iter = BaseIterator(BaseState(self)) 590 while iter.next(): 591 yield iter.key() 592 593 cpdef keys(self, unicode prefix=None): 594 """ 595 Returns a list of this trie's keys. 596 597 If ``prefix`` is not None, returns only the keys prefixed by ``prefix``. 598 """ 599 cdef bint success 600 cdef list res = [] 601 cdef BaseState state = BaseState(self) 602 603 if prefix is not None: 604 success = state.walk(prefix) 605 if not success: 606 return res 607 608 cdef BaseIterator iter = BaseIterator(state) 609 610 if prefix is None: 611 while iter.next(): 612 res.append(iter.key()) 613 else: 614 while iter.next(): 615 res.append(prefix+iter.key()) 616 617 return res 618 619 cpdef values(self, unicode prefix=None): 620 """ 621 Returns a list of this trie's values. 622 623 If ``prefix`` is not None, returns only the values 624 associated with keys prefixed by ``prefix``. 625 """ 626 cdef bint success 627 cdef list res = [] 628 cdef BaseState state = BaseState(self) 629 630 if prefix is not None: 631 success = state.walk(prefix) 632 if not success: 633 return res 634 635 cdef BaseIterator iter = BaseIterator(state) 636 while iter.next(): 637 res.append(iter.data()) 638 return res 639 640 cdef _index_to_value(self, cdatrie.TrieData index): 641 return index 642 643 644cdef class Trie(BaseTrie): 645 """ 646 Wrapper for libdatrie's trie. 647 Keys are unicode strings, values are Python objects. 648 """ 649 650 cdef list _values 651 652 def __init__(self, alphabet=None, ranges=None, AlphaMap alpha_map=None, _create=True): 653 """ 654 For efficiency trie needs to know what unicode symbols 655 it should be able to store so this constructor requires 656 either ``alphabet`` (a string/iterable with all allowed characters), 657 ``ranges`` (a list of (begin, end) pairs, e.g. [('a', 'z')]) 658 or ``alpha_map`` (:class:`datrie.AlphaMap` instance). 659 """ 660 self._values = [] 661 super(Trie, self).__init__(alphabet, ranges, alpha_map, _create) 662 663 def __reduce__(self): 664 with tempfile.NamedTemporaryFile() as f: 665 self.write(f) 666 pickle.dump(self._values, f) 667 f.seek(0) 668 state = f.read() 669 return Trie, (None, None, None, False), state 670 671 def __setstate__(self, bytes state): 672 assert self._c_trie is NULL 673 with tempfile.NamedTemporaryFile() as f: 674 f.write(state) 675 f.flush() 676 f.seek(0) 677 self._c_trie = _load_from_file(f) 678 self._values = pickle.load(f) 679 680 def __getitem__(self, unicode key): 681 cdef cdatrie.TrieData index = self._getitem(key) 682 return self._values[index] 683 684 def get(self, unicode key, default=None): 685 cdef cdatrie.TrieData index 686 try: 687 index = self._getitem(key) 688 return self._values[index] 689 except KeyError: 690 return default 691 692 def __setitem__(self, unicode key, object value): 693 cdef cdatrie.TrieData next_index = len(self._values) 694 cdef cdatrie.TrieData index = self._setdefault(key, next_index) 695 if index == next_index: 696 self._values.append(value) # insert 697 else: 698 self._values[index] = value # update 699 700 def setdefault(self, unicode key, object value): 701 cdef cdatrie.TrieData next_index = len(self._values) 702 cdef cdatrie.TrieData index = self._setdefault(key, next_index) 703 if index == next_index: 704 self._values.append(value) # insert 705 return value 706 else: 707 return self._values[index] # lookup 708 709 def __delitem__(self, unicode key): 710 # XXX: this could be faster (key is encoded twice here) 711 cdef cdatrie.TrieData index = self._getitem(key) 712 self._values[index] = DELETED_OBJECT 713 self._delitem(key) 714 715 def write(self, f): 716 """ 717 Writes a trie to a file. File-like objects without real 718 file descriptors are not supported. 719 """ 720 super(Trie, self).write(f) 721 pickle.dump(self._values, f) 722 723 @classmethod 724 def read(cls, f): 725 """ 726 Creates a new Trie by reading it from file. 727 File-like objects without real file descriptors are not supported. 728 """ 729 cdef Trie trie = super(Trie, cls).read(f) 730 trie._values = pickle.load(f) 731 return trie 732 733 cpdef items(self, unicode prefix=None): 734 """ 735 Returns a list of this trie's items (``(key,value)`` tuples). 736 737 If ``prefix`` is not None, returns only the items 738 associated with keys prefixed by ``prefix``. 739 """ 740 741 # the following code is 742 # 743 # [(k, self._values[v]) for (k,v) in BaseTrie.items(self, prefix)] 744 # 745 # but inlined for speed. 746 747 cdef bint success 748 cdef list res = [] 749 cdef BaseState state = BaseState(self) 750 751 if prefix is not None: 752 success = state.walk(prefix) 753 if not success: 754 return res 755 756 cdef BaseIterator iter = BaseIterator(state) 757 758 if prefix is None: 759 while iter.next(): 760 res.append((iter.key(), self._values[iter.data()])) 761 else: 762 while iter.next(): 763 res.append((prefix+iter.key(), self._values[iter.data()])) 764 765 return res 766 767 cpdef values(self, unicode prefix=None): 768 """ 769 Returns a list of this trie's values. 770 771 If ``prefix`` is not None, returns only the values 772 associated with keys prefixed by ``prefix``. 773 """ 774 775 # the following code is 776 # 777 # [self._values[v] for v in BaseTrie.values(self, prefix)] 778 # 779 # but inlined for speed. 780 781 cdef list res = [] 782 cdef BaseState state = BaseState(self) 783 cdef bint success 784 785 if prefix is not None: 786 success = state.walk(prefix) 787 if not success: 788 return res 789 790 cdef BaseIterator iter = BaseIterator(state) 791 792 while iter.next(): 793 res.append(self._values[iter.data()]) 794 795 return res 796 797 def longest_prefix_item(self, unicode key, default=RAISE_KEY_ERROR): 798 """ 799 Returns the item (``(key,value)`` tuple) associated with the longest 800 key in this trie that is a prefix of ``key``. 801 802 If the trie doesn't contain any prefix of ``key``: 803 - if ``default`` is given, returns it, 804 - otherwise raises ``KeyError``. 805 """ 806 cdef res = self._longest_prefix_item(key, RERAISE_KEY_ERROR) 807 if res is RERAISE_KEY_ERROR: # error 808 if default is RAISE_KEY_ERROR: 809 raise KeyError(key) 810 return default 811 812 return res[0], self._values[res[1]] 813 814 def longest_prefix_value(self, unicode key, default=RAISE_KEY_ERROR): 815 """ 816 Returns the value associated with the longest key in this trie that is 817 a prefix of ``key``. 818 819 If the trie doesn't contain any prefix of ``key``: 820 - if ``default`` is given, return it 821 - otherwise raise ``KeyError`` 822 """ 823 cdef res = self._longest_prefix_value(key, RERAISE_KEY_ERROR) 824 if res is RERAISE_KEY_ERROR: # error 825 if default is RAISE_KEY_ERROR: 826 raise KeyError(key) 827 return default 828 829 return self._values[res] 830 831 def prefix_items(self, unicode key): 832 ''' 833 Returns a list of the items (``(key,value)`` tuples) 834 of this trie that are associated with keys that are 835 prefixes of ``key``. 836 ''' 837 return [(k, self._values[v]) for (k, v) in self._prefix_items(key)] 838 839 def iter_prefix_items(self, unicode key): 840 for k, v in super(Trie, self).iter_prefix_items(key): 841 yield k, self._values[v] 842 843 def prefix_values(self, unicode key): 844 ''' 845 Returns a list of the values of this trie that are associated 846 with keys that are prefixes of ``key``. 847 ''' 848 return [self._values[v] for v in self._prefix_values(key)] 849 850 def iter_prefix_values(self, unicode key): 851 for v in super(Trie, self).iter_prefix_values(key): 852 yield self._values[v] 853 854 cdef _index_to_value(self, cdatrie.TrieData index): 855 return self._values[index] 856 857 858cdef class _TrieState: 859 cdef cdatrie.TrieState* _state 860 cdef BaseTrie _trie 861 862 def __cinit__(self, BaseTrie trie): 863 self._state = cdatrie.trie_root(trie._c_trie) 864 if self._state is NULL: 865 raise MemoryError() 866 self._trie = trie 867 868 def __dealloc__(self): 869 if self._state is not NULL: 870 cdatrie.trie_state_free(self._state) 871 872 cpdef walk(self, unicode to): 873 cdef bint res 874 for ch in to: 875 if not self.walk_char(<cdatrie.AlphaChar> ch): 876 return False 877 return True 878 879 cdef bint walk_char(self, cdatrie.AlphaChar char): 880 """ 881 Walks the trie stepwise, using a given character ``char``. 882 On return, the state is updated to the new state if successfully walked. 883 Returns boolean value indicating the success of the walk. 884 """ 885 return cdatrie.trie_state_walk(self._state, char) 886 887 cpdef copy_to(self, _TrieState state): 888 """ Copies trie state to another """ 889 cdatrie.trie_state_copy(state._state, self._state) 890 891 cpdef rewind(self): 892 """ Puts the state at root """ 893 cdatrie.trie_state_rewind(self._state) 894 895 cpdef bint is_terminal(self): 896 return cdatrie.trie_state_is_terminal(self._state) 897 898 cpdef bint is_single(self): 899 return cdatrie.trie_state_is_single(self._state) 900 901 cpdef bint is_leaf(self): 902 return cdatrie.trie_state_is_leaf(self._state) 903 904 def __unicode__(self): 905 return u"data:%d, term:%s, leaf:%s, single: %s" % ( 906 self.data(), 907 self.is_terminal(), 908 self.is_leaf(), 909 self.is_single(), 910 ) 911 912 def __repr__(self): 913 return self.__unicode__() # XXX: this is incorrect under Python 2.x 914 915 916cdef class BaseState(_TrieState): 917 """ 918 cdatrie.TrieState wrapper. It can be used for custom trie traversal. 919 """ 920 cpdef int data(self): 921 return cdatrie.trie_state_get_data(self._state) 922 923 924cdef class State(_TrieState): 925 926 def __cinit__(self, Trie trie): # this is overriden for extra type check 927 self._state = cdatrie.trie_root(trie._c_trie) 928 if self._state is NULL: 929 raise MemoryError() 930 self._trie = trie 931 932 cpdef data(self): 933 cdef cdatrie.TrieData data = cdatrie.trie_state_get_data(self._state) 934 return self._trie._index_to_value(data) 935 936 937cdef class _TrieIterator: 938 cdef cdatrie.TrieIterator* _iter 939 cdef _TrieState _root 940 941 def __cinit__(self, _TrieState state): 942 self._root = state # prevent garbage collection of state 943 self._iter = cdatrie.trie_iterator_new(state._state) 944 if self._iter is NULL: 945 raise MemoryError() 946 947 def __dealloc__(self): 948 if self._iter is not NULL: 949 cdatrie.trie_iterator_free(self._iter) 950 951 cpdef bint next(self): 952 return cdatrie.trie_iterator_next(self._iter) 953 954 cpdef unicode key(self): 955 cdef cdatrie.AlphaChar* key = cdatrie.trie_iterator_get_key(self._iter) 956 try: 957 return unicode_from_alpha_char(key) 958 finally: 959 free(key) 960 961 962cdef class BaseIterator(_TrieIterator): 963 """ 964 cdatrie.TrieIterator wrapper. It can be used for custom datrie.BaseTrie 965 traversal. 966 """ 967 cpdef cdatrie.TrieData data(self): 968 return cdatrie.trie_iterator_get_data(self._iter) 969 970 971cdef class Iterator(_TrieIterator): 972 """ 973 cdatrie.TrieIterator wrapper. It can be used for custom datrie.Trie 974 traversal. 975 """ 976 def __cinit__(self, State state): # this is overriden for extra type check 977 self._root = state # prevent garbage collection of state 978 self._iter = cdatrie.trie_iterator_new(state._state) 979 if self._iter is NULL: 980 raise MemoryError() 981 982 cpdef data(self): 983 cdef cdatrie.TrieData data = cdatrie.trie_iterator_get_data(self._iter) 984 return self._root._trie._index_to_value(data) 985 986 987cdef (cdatrie.Trie* ) _load_from_file(f) except NULL: 988 cdef int fd = f.fileno() 989 cdef stdio.FILE* f_ptr = stdio_ext.fdopen(fd, "r") 990 if f_ptr == NULL: 991 raise IOError() 992 cdef cdatrie.Trie* trie = cdatrie.trie_fread(f_ptr) 993 if trie == NULL: 994 raise DatrieError("Can't load trie from stream") 995 996 cdef int f_pos = stdio.ftell(f_ptr) 997 f.seek(f_pos) 998 999 return trie 1000 1001#cdef (cdatrie.Trie*) _load_from_file(path) except NULL: 1002# str_path = path.encode(sys.getfilesystemencoding()) 1003# cdef char* c_path = str_path 1004# cdef cdatrie.Trie* trie = cdatrie.trie_new_from_file(c_path) 1005# if trie is NULL: 1006# raise DatrieError("Can't load trie from file") 1007# 1008# return trie 1009 1010 1011# ============================ AlphaMap & utils ================================ 1012 1013cdef class AlphaMap: 1014 """ 1015 Alphabet map. 1016 1017 For sparse data compactness, the trie alphabet set should 1018 be continuous, but that is usually not the case in general 1019 character sets. Therefore, a map between the input character 1020 and the low-level alphabet set for the trie is created in the 1021 middle. You will have to define your input character set by 1022 listing their continuous ranges of character codes creating a 1023 trie. Then, each character will be automatically assigned 1024 internal codes of continuous values. 1025 """ 1026 1027 cdef cdatrie.AlphaMap *_c_alpha_map 1028 1029 def __cinit__(self): 1030 self._c_alpha_map = cdatrie.alpha_map_new() 1031 1032 def __dealloc__(self): 1033 if self._c_alpha_map is not NULL: 1034 cdatrie.alpha_map_free(self._c_alpha_map) 1035 1036 def __init__(self, alphabet=None, ranges=None, _create=True): 1037 if not _create: 1038 return 1039 1040 if ranges is not None: 1041 for range in ranges: 1042 self.add_range(*range) 1043 1044 if alphabet is not None: 1045 self.add_alphabet(alphabet) 1046 1047 cdef AlphaMap copy(self): 1048 cdef AlphaMap clone = AlphaMap(_create=False) 1049 clone._c_alpha_map = cdatrie.alpha_map_clone(self._c_alpha_map) 1050 if clone._c_alpha_map is NULL: 1051 raise MemoryError() 1052 1053 return clone 1054 1055 def add_alphabet(self, alphabet): 1056 """ 1057 Adds all chars from iterable to the alphabet set. 1058 """ 1059 for begin, end in alphabet_to_ranges(alphabet): 1060 self._add_range(begin, end) 1061 1062 def add_range(self, begin, end): 1063 """ 1064 Add a range of character codes from ``begin`` to ``end`` 1065 to the alphabet set. 1066 1067 ``begin`` - the first character of the range; 1068 ``end`` - the last character of the range. 1069 """ 1070 self._add_range(ord(begin), ord(end)) 1071 1072 cpdef _add_range(self, cdatrie.AlphaChar begin, cdatrie.AlphaChar end): 1073 if begin > end: 1074 raise DatrieError('range begin > end') 1075 code = cdatrie.alpha_map_add_range(self._c_alpha_map, begin, end) 1076 if code != 0: 1077 raise MemoryError() 1078 1079 1080cdef cdatrie.AlphaChar* new_alpha_char_from_unicode(unicode txt): 1081 """ 1082 Converts Python unicode string to libdatrie's AlphaChar* format. 1083 libdatrie wants null-terminated array of 4-byte LE symbols. 1084 1085 The caller should free the result of this function. 1086 """ 1087 cdef int txt_len = len(txt) 1088 cdef int size = (txt_len + 1) * sizeof(cdatrie.AlphaChar) 1089 1090 # allocate buffer 1091 cdef cdatrie.AlphaChar* data = <cdatrie.AlphaChar*> malloc(size) 1092 if data is NULL: 1093 raise MemoryError() 1094 1095 # Copy text contents to buffer. 1096 # XXX: is it safe? The safe alternative is to decode txt 1097 # to utf32_le and then use memcpy to copy the content: 1098 # 1099 # py_str = txt.encode('utf_32_le') 1100 # cdef char* c_str = py_str 1101 # string.memcpy(data, c_str, size-1) 1102 # 1103 # but the following is much (say 10x) faster and this 1104 # function is really in a hot spot. 1105 cdef int i = 0 1106 for char in txt: 1107 data[i] = <cdatrie.AlphaChar> char 1108 i+=1 1109 1110 # Buffer must be null-terminated (last 4 bytes must be zero). 1111 data[txt_len] = 0 1112 return data 1113 1114 1115cdef unicode unicode_from_alpha_char(cdatrie.AlphaChar* key, int len=0): 1116 """ 1117 Converts libdatrie's AlphaChar* to Python unicode. 1118 """ 1119 cdef int length = len 1120 if length == 0: 1121 length = cdatrie.alpha_char_strlen(key)*sizeof(cdatrie.AlphaChar) 1122 cdef char* c_str = <char*> key 1123 return c_str[:length].decode('utf_32_le') 1124 1125 1126def to_ranges(lst): 1127 """ 1128 Converts a list of numbers to a list of ranges:: 1129 1130 >>> numbers = [1,2,3,5,6] 1131 >>> list(to_ranges(numbers)) 1132 [(1, 3), (5, 6)] 1133 """ 1134 for a, b in itertools.groupby(enumerate(lst), lambda t: t[1] - t[0]): 1135 b = list(b) 1136 yield b[0][1], b[-1][1] 1137 1138 1139def alphabet_to_ranges(alphabet): 1140 for begin, end in to_ranges(sorted(map(ord, iter(alphabet)))): 1141 yield begin, end 1142 1143 1144def new(alphabet=None, ranges=None, AlphaMap alpha_map=None): 1145 warnings.warn('datrie.new is deprecated; please use datrie.Trie.', 1146 DeprecationWarning) 1147 return Trie(alphabet, ranges, alpha_map) 1148 1149 1150MutableMapping.register(Trie) 1151MutableMapping.register(BaseTrie) 1152