1# cython: profile=False
2"""
3Cython wrapper for libdatrie.
4"""
5
6from cpython.version cimport PY_MAJOR_VERSION
7from cython.operator import dereference as deref
8from libc.stdlib cimport malloc, free
9from libc cimport stdio
10from libc cimport string
11cimport stdio_ext
12cimport cdatrie
13
14import itertools
15import warnings
16import sys
17import tempfile
18
19try:
20    from collections.abc import MutableMapping
21except ImportError:
22    from collections import MutableMapping
23
24try:
25    import cPickle as pickle
26except ImportError:
27    import pickle
28
29class DatrieError(Exception):
30    pass
31
32RAISE_KEY_ERROR = object()
33RERAISE_KEY_ERROR = object()
34DELETED_OBJECT = object()
35
36
37cdef class BaseTrie:
38    """
39    Wrapper for libdatrie's trie.
40
41    Keys are unicode strings, values are integers -2147483648 <= x <= 2147483647.
42    """
43
44    cdef AlphaMap alpha_map
45    cdef cdatrie.Trie *_c_trie
46
47    def __init__(self, alphabet=None, ranges=None, AlphaMap alpha_map=None, _create=True):
48        """
49        For efficiency trie needs to know what unicode symbols
50        it should be able to store so this constructor requires
51        either ``alphabet`` (a string/iterable with all allowed characters),
52        ``ranges`` (a list of (begin, end) pairs, e.g. [('a', 'z')])
53        or ``alpha_map`` (:class:`datrie.AlphaMap` instance).
54        """
55        if self._c_trie is not NULL:
56            return
57
58        if not _create:
59            return
60
61        if alphabet is None and ranges is None and alpha_map is None:
62            raise ValueError(
63                "Please provide alphabet, ranges or alpha_map argument.")
64
65        if alpha_map is None:
66            alpha_map = AlphaMap(alphabet, ranges)
67
68        self.alpha_map = alpha_map
69        self._c_trie = cdatrie.trie_new(alpha_map._c_alpha_map)
70        if self._c_trie is NULL:
71            raise MemoryError()
72
73    def __dealloc__(self):
74        if self._c_trie is not NULL:
75            cdatrie.trie_free(self._c_trie)
76
77    def update(self, other=(), **kwargs):
78        if PY_MAJOR_VERSION == 2:
79            if kwargs:
80                raise TypeError("keyword arguments are not supported.")
81
82        if hasattr(other, "keys"):
83            for key in other:
84                self[key] = other[key]
85        else:
86            for key, value in other:
87                self[key] = value
88
89        for key in kwargs:
90            self[key] = kwargs[key]
91
92    def clear(self):
93        cdef AlphaMap alpha_map = self.alpha_map.copy()
94        _c_trie = cdatrie.trie_new(alpha_map._c_alpha_map)
95        if _c_trie is NULL:
96            raise MemoryError()
97
98        cdatrie.trie_free(self._c_trie)
99        self._c_trie = _c_trie
100
101    cpdef bint is_dirty(self):
102        """
103        Returns True if the trie is dirty with some pending changes
104        and needs saving to synchronize with the file.
105        """
106        return cdatrie.trie_is_dirty(self._c_trie)
107
108    def save(self, path):
109        """
110        Saves this trie.
111        """
112        with open(path, "wb", 0) as f:
113            self.write(f)
114
115    def write(self, f):
116        """
117        Writes a trie to a file. File-like objects without real
118        file descriptors are not supported.
119        """
120        f.flush()
121
122        cdef stdio.FILE* f_ptr = stdio_ext.fdopen(f.fileno(), "w")
123        if f_ptr == NULL:
124            raise IOError("Can't open file descriptor")
125
126        cdef int res = cdatrie.trie_fwrite(self._c_trie, f_ptr)
127        if res == -1:
128            raise IOError("Can't write to file")
129
130        stdio.fflush(f_ptr)
131
132    @classmethod
133    def load(cls, path):
134        """
135        Loads a trie from file.
136        """
137        with open(path, "rb", 0) as f:
138            return cls.read(f)
139
140    @classmethod
141    def read(cls, f):
142        """
143        Creates a new Trie by reading it from file.
144        File-like objects without real file descriptors are not supported.
145
146        # XXX: does it work properly in subclasses?
147        """
148        cdef BaseTrie trie = cls(_create=False)
149        trie._c_trie = _load_from_file(f)
150        return trie
151
152    def __reduce__(self):
153        with tempfile.NamedTemporaryFile() as f:
154            self.write(f)
155            f.seek(0)
156            state = f.read()
157            return BaseTrie, (None, None, None, False), state
158
159    def __setstate__(self, bytes state):
160        assert self._c_trie is NULL
161        with tempfile.NamedTemporaryFile() as f:
162            f.write(state)
163            f.flush()
164            f.seek(0)
165            self._c_trie = _load_from_file(f)
166
167    def __setitem__(self, unicode key, cdatrie.TrieData value):
168        self._setitem(key, value)
169
170    cdef void _setitem(self, unicode key, cdatrie.TrieData value):
171        cdef cdatrie.AlphaChar* c_key = new_alpha_char_from_unicode(key)
172        try:
173            cdatrie.trie_store(self._c_trie, c_key, value)
174        finally:
175            free(c_key)
176
177    def __getitem__(self, unicode key):
178        return self._getitem(key)
179
180    def get(self, unicode key, default=None):
181        try:
182            return self._getitem(key)
183        except KeyError:
184            return default
185
186    cdef cdatrie.TrieData _getitem(self, unicode key) except -1:
187        cdef cdatrie.TrieData data
188        cdef cdatrie.AlphaChar* c_key = new_alpha_char_from_unicode(key)
189
190        try:
191            found = cdatrie.trie_retrieve(self._c_trie, c_key, &data)
192        finally:
193            free(c_key)
194
195        if not found:
196            raise KeyError(key)
197        return data
198
199    def __contains__(self, unicode key):
200        cdef cdatrie.AlphaChar* c_key = new_alpha_char_from_unicode(key)
201        try:
202            return cdatrie.trie_retrieve(self._c_trie, c_key, NULL)
203        finally:
204            free(c_key)
205
206    def __delitem__(self, unicode key):
207        self._delitem(key)
208
209    def pop(self, unicode key, default=None):
210        try:
211            value = self[key]
212            self._delitem(key)
213            return value
214        except KeyError:
215            return default
216
217    cpdef bint _delitem(self, unicode key) except -1:
218        """
219        Deletes an entry for the given key from the trie. Returns
220        boolean value indicating whether the key exists and is removed.
221        """
222        cdef cdatrie.AlphaChar* c_key = new_alpha_char_from_unicode(key)
223        try:
224            found = cdatrie.trie_delete(self._c_trie, c_key)
225        finally:
226            free(c_key)
227
228        if not found:
229            raise KeyError(key)
230
231    @staticmethod
232    cdef int len_enumerator(cdatrie.AlphaChar *key, cdatrie.TrieData key_data,
233                            void *counter_ptr):
234        (<int *>counter_ptr)[0] += 1
235        return True
236
237    def __len__(self):
238        cdef int counter = 0
239        cdatrie.trie_enumerate(self._c_trie,
240                               <cdatrie.TrieEnumFunc>(self.len_enumerator),
241                               &counter)
242        return counter
243
244    def __richcmp__(self, other, int op):
245        if op == 2:    # ==
246            if other is self:
247                return True
248            elif not isinstance(other, BaseTrie):
249                return False
250
251            for key in self:
252                if self[key] != other[key]:
253                    return False
254
255            # XXX this can be written more efficiently via explicit iterators.
256            return len(self) == len(other)
257        elif op == 3:  # !=
258            return not (self == other)
259
260        raise TypeError("unorderable types: {0} and {1}".format(
261            self.__class__, other.__class__))
262
263    def setdefault(self, unicode key, cdatrie.TrieData value):
264        return self._setdefault(key, value)
265
266    cdef cdatrie.TrieData _setdefault(self, unicode key, cdatrie.TrieData value):
267        cdef cdatrie.AlphaChar* c_key = new_alpha_char_from_unicode(key)
268        cdef cdatrie.TrieData data
269
270        try:
271            found = cdatrie.trie_retrieve(self._c_trie, c_key, &data)
272            if found:
273                return data
274            else:
275                cdatrie.trie_store(self._c_trie, c_key, value)
276                return value
277        finally:
278            free(c_key)
279
280    def iter_prefixes(self, unicode key):
281        '''
282        Returns an iterator over the keys of this trie that are prefixes
283        of ``key``.
284        '''
285        cdef cdatrie.TrieState* state = cdatrie.trie_root(self._c_trie)
286        if state == NULL:
287            raise MemoryError()
288
289        cdef int index = 1
290        try:
291            for char in key:
292                if not cdatrie.trie_state_walk(state, <cdatrie.AlphaChar> char):
293                    return
294                if cdatrie.trie_state_is_terminal(state):
295                    yield key[:index]
296                index += 1
297        finally:
298            cdatrie.trie_state_free(state)
299
300    def iter_prefix_items(self, unicode key):
301        '''
302        Returns an iterator over the items (``(key,value)`` tuples)
303        of this trie that are associated with keys that are prefixes of ``key``.
304        '''
305        cdef cdatrie.TrieState* state = cdatrie.trie_root(self._c_trie)
306
307        if state == NULL:
308            raise MemoryError()
309
310        cdef int index = 1
311        try:
312            for char in key:
313                if not cdatrie.trie_state_walk(state, <cdatrie.AlphaChar> char):
314                    return
315                if cdatrie.trie_state_is_terminal(state): # word is found
316                    yield key[:index], cdatrie.trie_state_get_data(state)
317                index += 1
318        finally:
319            cdatrie.trie_state_free(state)
320
321    def iter_prefix_values(self, unicode key):
322        '''
323        Returns an iterator over the values of this trie that are associated
324        with keys that are prefixes of ``key``.
325        '''
326        cdef cdatrie.TrieState* state = cdatrie.trie_root(self._c_trie)
327
328        if state == NULL:
329            raise MemoryError()
330
331        try:
332            for char in key:
333                if not cdatrie.trie_state_walk(state, <cdatrie.AlphaChar> char):
334                    return
335                if cdatrie.trie_state_is_terminal(state):
336                    yield cdatrie.trie_state_get_data(state)
337        finally:
338            cdatrie.trie_state_free(state)
339
340    def prefixes(self, unicode key):
341        '''
342        Returns a list with keys of this trie that are prefixes of ``key``.
343        '''
344        cdef cdatrie.TrieState* state = cdatrie.trie_root(self._c_trie)
345        if state == NULL:
346            raise MemoryError()
347
348        cdef list result = []
349        cdef int index = 1
350        try:
351            for char in key:
352                if not cdatrie.trie_state_walk(state, <cdatrie.AlphaChar> char):
353                    break
354                if cdatrie.trie_state_is_terminal(state):
355                    result.append(key[:index])
356                index += 1
357            return result
358        finally:
359            cdatrie.trie_state_free(state)
360
361    cpdef suffixes(self, unicode prefix=u''):
362        """
363        Returns a list of this trie's suffixes.
364        If ``prefix`` is not empty, returns only the suffixes of words prefixed by ``prefix``.
365        """
366        cdef bint success
367        cdef list res = []
368        cdef BaseState state = BaseState(self)
369
370        if prefix is not None:
371            success = state.walk(prefix)
372            if not success:
373                return res
374
375        cdef BaseIterator iter = BaseIterator(state)
376        while iter.next():
377            res.append(iter.key())
378
379        return res
380
381    def prefix_items(self, unicode key):
382        '''
383        Returns a list of the items (``(key,value)`` tuples)
384        of this trie that are associated with keys that are
385        prefixes of ``key``.
386        '''
387        return self._prefix_items(key)
388
389    cdef list _prefix_items(self, unicode key):
390        cdef cdatrie.TrieState* state = cdatrie.trie_root(self._c_trie)
391
392        if state == NULL:
393            raise MemoryError()
394
395        cdef list result = []
396        cdef int index = 1
397        try:
398            for char in key:
399                if not cdatrie.trie_state_walk(state, <cdatrie.AlphaChar> char):
400                    break
401                if cdatrie.trie_state_is_terminal(state): # word is found
402                    result.append(
403                        (key[:index],
404                         cdatrie.trie_state_get_data(state))
405                    )
406                index += 1
407            return result
408        finally:
409            cdatrie.trie_state_free(state)
410
411    def prefix_values(self, unicode key):
412        '''
413        Returns a list of the values of this trie that are associated
414        with keys that are prefixes of ``key``.
415        '''
416        return self._prefix_values(key)
417
418    cdef list _prefix_values(self, unicode key):
419        cdef cdatrie.TrieState* state = cdatrie.trie_root(self._c_trie)
420
421        if state == NULL:
422            raise MemoryError()
423
424        cdef list result = []
425        try:
426            for char in key:
427                if not cdatrie.trie_state_walk(state, <cdatrie.AlphaChar> char):
428                    break
429                if cdatrie.trie_state_is_terminal(state): # word is found
430                    result.append(cdatrie.trie_state_get_data(state))
431            return result
432        finally:
433            cdatrie.trie_state_free(state)
434
435    def longest_prefix(self, unicode key, default=RAISE_KEY_ERROR):
436        """
437        Returns the longest key in this trie that is a prefix of ``key``.
438
439        If the trie doesn't contain any prefix of ``key``:
440          - if ``default`` is given, returns it,
441          - otherwise raises ``KeyError``.
442        """
443        cdef cdatrie.TrieState* state = cdatrie.trie_root(self._c_trie)
444
445        if state == NULL:
446            raise MemoryError()
447
448        cdef int index = 0, last_terminal_index = 0
449
450        try:
451            for ch in key:
452                if not cdatrie.trie_state_walk(state, <cdatrie.AlphaChar> ch):
453                    break
454
455                index += 1
456                if cdatrie.trie_state_is_terminal(state):
457                    last_terminal_index = index
458
459            if not last_terminal_index:
460                if default is RAISE_KEY_ERROR:
461                    raise KeyError(key)
462                return default
463
464            return key[:last_terminal_index]
465        finally:
466            cdatrie.trie_state_free(state)
467
468    def longest_prefix_item(self, unicode key, default=RAISE_KEY_ERROR):
469        """
470        Returns the item (``(key,value)`` tuple) associated with the longest
471        key in this trie that is a prefix of ``key``.
472
473        If the trie doesn't contain any prefix of ``key``:
474          - if ``default`` is given, returns it,
475          - otherwise raises ``KeyError``.
476        """
477        return self._longest_prefix_item(key, default)
478
479    cdef _longest_prefix_item(self, unicode key, default=RAISE_KEY_ERROR):
480        cdef cdatrie.TrieState* state = cdatrie.trie_root(self._c_trie)
481
482        if state == NULL:
483            raise MemoryError()
484
485        cdef int index = 0, last_terminal_index = 0, data
486
487        try:
488            for ch in key:
489                if not cdatrie.trie_state_walk(state, <cdatrie.AlphaChar> ch):
490                    break
491
492                index += 1
493                if cdatrie.trie_state_is_terminal(state):
494                    last_terminal_index = index
495                    data = cdatrie.trie_state_get_data(state)
496
497            if not last_terminal_index:
498                if default is RAISE_KEY_ERROR:
499                    raise KeyError(key)
500                return default
501
502            return key[:last_terminal_index], data
503
504        finally:
505            cdatrie.trie_state_free(state)
506
507    def longest_prefix_value(self, unicode key, default=RAISE_KEY_ERROR):
508        """
509        Returns the value associated with the longest key in this trie that is
510        a prefix of ``key``.
511
512        If the trie doesn't contain any prefix of ``key``:
513          - if ``default`` is given, return it
514          - otherwise raise ``KeyError``
515        """
516        return self._longest_prefix_value(key, default)
517
518    cdef _longest_prefix_value(self, unicode key, default=RAISE_KEY_ERROR):
519        cdef cdatrie.TrieState* state = cdatrie.trie_root(self._c_trie)
520
521        if state == NULL:
522            raise MemoryError()
523
524        cdef int data = 0
525        cdef char found = 0
526
527        try:
528            for ch in key:
529                if not cdatrie.trie_state_walk(state, <cdatrie.AlphaChar> ch):
530                    break
531
532                if cdatrie.trie_state_is_terminal(state):
533                    found = 1
534                    data = cdatrie.trie_state_get_data(state)
535
536            if not found:
537                if default is RAISE_KEY_ERROR:
538                    raise KeyError(key)
539                return default
540
541            return data
542
543        finally:
544            cdatrie.trie_state_free(state)
545
546    def has_keys_with_prefix(self, unicode prefix):
547        """
548        Returns True if any key in the trie begins with ``prefix``.
549        """
550        cdef cdatrie.TrieState* state = cdatrie.trie_root(self._c_trie)
551        if state == NULL:
552            raise MemoryError()
553        try:
554            for char in prefix:
555                if not cdatrie.trie_state_walk(state, <cdatrie.AlphaChar> char):
556                    return False
557            return True
558        finally:
559            cdatrie.trie_state_free(state)
560
561    cpdef items(self, unicode prefix=None):
562        """
563        Returns a list of this trie's items (``(key,value)`` tuples).
564
565        If ``prefix`` is not None, returns only the items
566        associated with keys prefixed by ``prefix``.
567        """
568        cdef bint success
569        cdef list res = []
570        cdef BaseState state = BaseState(self)
571
572        if prefix is not None:
573            success = state.walk(prefix)
574            if not success:
575                return res
576
577        cdef BaseIterator iter = BaseIterator(state)
578
579        if prefix is None:
580            while iter.next():
581                res.append((iter.key(), iter.data()))
582        else:
583            while iter.next():
584                res.append((prefix+iter.key(), iter.data()))
585
586        return res
587
588    def __iter__(self):
589        cdef BaseIterator iter = BaseIterator(BaseState(self))
590        while iter.next():
591            yield iter.key()
592
593    cpdef keys(self, unicode prefix=None):
594        """
595        Returns a list of this trie's keys.
596
597        If ``prefix`` is not None, returns only the keys prefixed by ``prefix``.
598        """
599        cdef bint success
600        cdef list res = []
601        cdef BaseState state = BaseState(self)
602
603        if prefix is not None:
604            success = state.walk(prefix)
605            if not success:
606                return res
607
608        cdef BaseIterator iter = BaseIterator(state)
609
610        if prefix is None:
611            while iter.next():
612                res.append(iter.key())
613        else:
614            while iter.next():
615                res.append(prefix+iter.key())
616
617        return res
618
619    cpdef values(self, unicode prefix=None):
620        """
621        Returns a list of this trie's values.
622
623        If ``prefix`` is not None, returns only the values
624        associated with keys prefixed by ``prefix``.
625        """
626        cdef bint success
627        cdef list res = []
628        cdef BaseState state = BaseState(self)
629
630        if prefix is not None:
631            success = state.walk(prefix)
632            if not success:
633                return res
634
635        cdef BaseIterator iter = BaseIterator(state)
636        while iter.next():
637            res.append(iter.data())
638        return res
639
640    cdef _index_to_value(self, cdatrie.TrieData index):
641        return index
642
643
644cdef class Trie(BaseTrie):
645    """
646    Wrapper for libdatrie's trie.
647    Keys are unicode strings, values are Python objects.
648    """
649
650    cdef list _values
651
652    def __init__(self, alphabet=None, ranges=None, AlphaMap alpha_map=None, _create=True):
653        """
654        For efficiency trie needs to know what unicode symbols
655        it should be able to store so this constructor requires
656        either ``alphabet`` (a string/iterable with all allowed characters),
657        ``ranges`` (a list of (begin, end) pairs, e.g. [('a', 'z')])
658        or ``alpha_map`` (:class:`datrie.AlphaMap` instance).
659        """
660        self._values = []
661        super(Trie, self).__init__(alphabet, ranges, alpha_map, _create)
662
663    def __reduce__(self):
664        with tempfile.NamedTemporaryFile() as f:
665            self.write(f)
666            pickle.dump(self._values, f)
667            f.seek(0)
668            state = f.read()
669            return Trie, (None, None, None, False), state
670
671    def __setstate__(self, bytes state):
672        assert self._c_trie is NULL
673        with tempfile.NamedTemporaryFile() as f:
674            f.write(state)
675            f.flush()
676            f.seek(0)
677            self._c_trie = _load_from_file(f)
678            self._values = pickle.load(f)
679
680    def __getitem__(self, unicode key):
681        cdef cdatrie.TrieData index = self._getitem(key)
682        return self._values[index]
683
684    def get(self, unicode key, default=None):
685        cdef cdatrie.TrieData index
686        try:
687            index = self._getitem(key)
688            return self._values[index]
689        except KeyError:
690            return default
691
692    def __setitem__(self, unicode key, object value):
693        cdef cdatrie.TrieData next_index = len(self._values)
694        cdef cdatrie.TrieData index = self._setdefault(key, next_index)
695        if index == next_index:
696            self._values.append(value)   # insert
697        else:
698            self._values[index] = value  # update
699
700    def setdefault(self, unicode key, object value):
701        cdef cdatrie.TrieData next_index = len(self._values)
702        cdef cdatrie.TrieData index = self._setdefault(key, next_index)
703        if index == next_index:
704            self._values.append(value)   # insert
705            return value
706        else:
707            return self._values[index]   # lookup
708
709    def __delitem__(self, unicode key):
710        # XXX: this could be faster (key is encoded twice here)
711        cdef cdatrie.TrieData index = self._getitem(key)
712        self._values[index] = DELETED_OBJECT
713        self._delitem(key)
714
715    def write(self, f):
716        """
717        Writes a trie to a file. File-like objects without real
718        file descriptors are not supported.
719        """
720        super(Trie, self).write(f)
721        pickle.dump(self._values, f)
722
723    @classmethod
724    def read(cls, f):
725        """
726        Creates a new Trie by reading it from file.
727        File-like objects without real file descriptors are not supported.
728        """
729        cdef Trie trie = super(Trie, cls).read(f)
730        trie._values = pickle.load(f)
731        return trie
732
733    cpdef items(self, unicode prefix=None):
734        """
735        Returns a list of this trie's items (``(key,value)`` tuples).
736
737        If ``prefix`` is not None, returns only the items
738        associated with keys prefixed by ``prefix``.
739        """
740
741        # the following code is
742        #
743        #    [(k, self._values[v]) for (k,v) in BaseTrie.items(self, prefix)]
744        #
745        # but inlined for speed.
746
747        cdef bint success
748        cdef list res = []
749        cdef BaseState state = BaseState(self)
750
751        if prefix is not None:
752            success = state.walk(prefix)
753            if not success:
754                return res
755
756        cdef BaseIterator iter = BaseIterator(state)
757
758        if prefix is None:
759            while iter.next():
760                res.append((iter.key(), self._values[iter.data()]))
761        else:
762            while iter.next():
763                res.append((prefix+iter.key(), self._values[iter.data()]))
764
765        return res
766
767    cpdef values(self, unicode prefix=None):
768        """
769        Returns a list of this trie's values.
770
771        If ``prefix`` is not None, returns only the values
772        associated with keys prefixed by ``prefix``.
773        """
774
775        # the following code is
776        #
777        #     [self._values[v] for v in BaseTrie.values(self, prefix)]
778        #
779        # but inlined for speed.
780
781        cdef list res = []
782        cdef BaseState state = BaseState(self)
783        cdef bint success
784
785        if prefix is not None:
786            success = state.walk(prefix)
787            if not success:
788                return res
789
790        cdef BaseIterator iter = BaseIterator(state)
791
792        while iter.next():
793            res.append(self._values[iter.data()])
794
795        return res
796
797    def longest_prefix_item(self, unicode key, default=RAISE_KEY_ERROR):
798        """
799        Returns the item (``(key,value)`` tuple) associated with the longest
800        key in this trie that is a prefix of ``key``.
801
802        If the trie doesn't contain any prefix of ``key``:
803          - if ``default`` is given, returns it,
804          - otherwise raises ``KeyError``.
805        """
806        cdef res = self._longest_prefix_item(key, RERAISE_KEY_ERROR)
807        if res is RERAISE_KEY_ERROR: # error
808            if default is RAISE_KEY_ERROR:
809                raise KeyError(key)
810            return default
811
812        return res[0], self._values[res[1]]
813
814    def longest_prefix_value(self, unicode key, default=RAISE_KEY_ERROR):
815        """
816        Returns the value associated with the longest key in this trie that is
817        a prefix of ``key``.
818
819        If the trie doesn't contain any prefix of ``key``:
820          - if ``default`` is given, return it
821          - otherwise raise ``KeyError``
822        """
823        cdef res = self._longest_prefix_value(key, RERAISE_KEY_ERROR)
824        if res is RERAISE_KEY_ERROR: # error
825            if default is RAISE_KEY_ERROR:
826                raise KeyError(key)
827            return default
828
829        return self._values[res]
830
831    def prefix_items(self, unicode key):
832        '''
833        Returns a list of the items (``(key,value)`` tuples)
834        of this trie that are associated with keys that are
835        prefixes of ``key``.
836        '''
837        return [(k, self._values[v]) for (k, v) in self._prefix_items(key)]
838
839    def iter_prefix_items(self, unicode key):
840        for k, v in super(Trie, self).iter_prefix_items(key):
841            yield k, self._values[v]
842
843    def prefix_values(self, unicode key):
844        '''
845        Returns a list of the values of this trie that are associated
846        with keys that are prefixes of ``key``.
847        '''
848        return [self._values[v] for v in self._prefix_values(key)]
849
850    def iter_prefix_values(self, unicode key):
851        for v in super(Trie, self).iter_prefix_values(key):
852            yield self._values[v]
853
854    cdef _index_to_value(self, cdatrie.TrieData index):
855        return self._values[index]
856
857
858cdef class _TrieState:
859    cdef cdatrie.TrieState* _state
860    cdef BaseTrie _trie
861
862    def __cinit__(self, BaseTrie trie):
863        self._state = cdatrie.trie_root(trie._c_trie)
864        if self._state is NULL:
865            raise MemoryError()
866        self._trie = trie
867
868    def __dealloc__(self):
869        if self._state is not NULL:
870            cdatrie.trie_state_free(self._state)
871
872    cpdef walk(self, unicode to):
873        cdef bint res
874        for ch in to:
875            if not self.walk_char(<cdatrie.AlphaChar> ch):
876                return False
877        return True
878
879    cdef bint walk_char(self, cdatrie.AlphaChar char):
880        """
881        Walks the trie stepwise, using a given character ``char``.
882        On return, the state is updated to the new state if successfully walked.
883        Returns boolean value indicating the success of the walk.
884        """
885        return cdatrie.trie_state_walk(self._state, char)
886
887    cpdef copy_to(self, _TrieState state):
888        """ Copies trie state to another """
889        cdatrie.trie_state_copy(state._state, self._state)
890
891    cpdef rewind(self):
892        """ Puts the state at root """
893        cdatrie.trie_state_rewind(self._state)
894
895    cpdef bint is_terminal(self):
896        return cdatrie.trie_state_is_terminal(self._state)
897
898    cpdef bint is_single(self):
899        return cdatrie.trie_state_is_single(self._state)
900
901    cpdef bint is_leaf(self):
902        return cdatrie.trie_state_is_leaf(self._state)
903
904    def __unicode__(self):
905        return u"data:%d, term:%s, leaf:%s, single: %s" % (
906            self.data(),
907            self.is_terminal(),
908            self.is_leaf(),
909            self.is_single(),
910        )
911
912    def __repr__(self):
913        return self.__unicode__()  # XXX: this is incorrect under Python 2.x
914
915
916cdef class BaseState(_TrieState):
917    """
918    cdatrie.TrieState wrapper. It can be used for custom trie traversal.
919    """
920    cpdef int data(self):
921        return cdatrie.trie_state_get_data(self._state)
922
923
924cdef class State(_TrieState):
925
926    def __cinit__(self, Trie trie): # this is overriden for extra type check
927        self._state = cdatrie.trie_root(trie._c_trie)
928        if self._state is NULL:
929            raise MemoryError()
930        self._trie = trie
931
932    cpdef data(self):
933        cdef cdatrie.TrieData data = cdatrie.trie_state_get_data(self._state)
934        return self._trie._index_to_value(data)
935
936
937cdef class _TrieIterator:
938    cdef cdatrie.TrieIterator* _iter
939    cdef _TrieState _root
940
941    def __cinit__(self, _TrieState state):
942        self._root = state # prevent garbage collection of state
943        self._iter = cdatrie.trie_iterator_new(state._state)
944        if self._iter is NULL:
945            raise MemoryError()
946
947    def __dealloc__(self):
948        if self._iter is not NULL:
949            cdatrie.trie_iterator_free(self._iter)
950
951    cpdef bint next(self):
952        return cdatrie.trie_iterator_next(self._iter)
953
954    cpdef unicode key(self):
955        cdef cdatrie.AlphaChar* key = cdatrie.trie_iterator_get_key(self._iter)
956        try:
957            return unicode_from_alpha_char(key)
958        finally:
959            free(key)
960
961
962cdef class BaseIterator(_TrieIterator):
963    """
964    cdatrie.TrieIterator wrapper. It can be used for custom datrie.BaseTrie
965    traversal.
966    """
967    cpdef cdatrie.TrieData data(self):
968        return cdatrie.trie_iterator_get_data(self._iter)
969
970
971cdef class Iterator(_TrieIterator):
972    """
973    cdatrie.TrieIterator wrapper. It can be used for custom datrie.Trie
974    traversal.
975    """
976    def __cinit__(self, State state): # this is overriden for extra type check
977        self._root = state # prevent garbage collection of state
978        self._iter = cdatrie.trie_iterator_new(state._state)
979        if self._iter is NULL:
980            raise MemoryError()
981
982    cpdef data(self):
983        cdef cdatrie.TrieData data = cdatrie.trie_iterator_get_data(self._iter)
984        return self._root._trie._index_to_value(data)
985
986
987cdef (cdatrie.Trie* ) _load_from_file(f) except NULL:
988    cdef int fd = f.fileno()
989    cdef stdio.FILE* f_ptr = stdio_ext.fdopen(fd, "r")
990    if f_ptr == NULL:
991        raise IOError()
992    cdef cdatrie.Trie* trie = cdatrie.trie_fread(f_ptr)
993    if trie == NULL:
994        raise DatrieError("Can't load trie from stream")
995
996    cdef int f_pos = stdio.ftell(f_ptr)
997    f.seek(f_pos)
998
999    return trie
1000
1001#cdef (cdatrie.Trie*) _load_from_file(path) except NULL:
1002#    str_path = path.encode(sys.getfilesystemencoding())
1003#    cdef char* c_path = str_path
1004#    cdef cdatrie.Trie* trie = cdatrie.trie_new_from_file(c_path)
1005#    if trie is NULL:
1006#        raise DatrieError("Can't load trie from file")
1007#
1008#    return trie
1009
1010
1011# ============================ AlphaMap & utils ================================
1012
1013cdef class AlphaMap:
1014    """
1015    Alphabet map.
1016
1017    For sparse data compactness, the trie alphabet set should
1018    be continuous, but that is usually not the case in general
1019    character sets. Therefore, a map between the input character
1020    and the low-level alphabet set for the trie is created in the
1021    middle. You will have to define your input character set by
1022    listing their continuous ranges of character codes creating a
1023    trie. Then, each character will be automatically assigned
1024    internal codes of continuous values.
1025    """
1026
1027    cdef cdatrie.AlphaMap *_c_alpha_map
1028
1029    def __cinit__(self):
1030        self._c_alpha_map = cdatrie.alpha_map_new()
1031
1032    def __dealloc__(self):
1033        if self._c_alpha_map is not NULL:
1034            cdatrie.alpha_map_free(self._c_alpha_map)
1035
1036    def __init__(self, alphabet=None, ranges=None, _create=True):
1037        if not _create:
1038            return
1039
1040        if ranges is not None:
1041            for range in ranges:
1042                self.add_range(*range)
1043
1044        if alphabet is not None:
1045            self.add_alphabet(alphabet)
1046
1047    cdef AlphaMap copy(self):
1048        cdef AlphaMap clone = AlphaMap(_create=False)
1049        clone._c_alpha_map = cdatrie.alpha_map_clone(self._c_alpha_map)
1050        if clone._c_alpha_map is NULL:
1051            raise MemoryError()
1052
1053        return clone
1054
1055    def add_alphabet(self, alphabet):
1056        """
1057        Adds all chars from iterable to the alphabet set.
1058        """
1059        for begin, end in alphabet_to_ranges(alphabet):
1060            self._add_range(begin, end)
1061
1062    def add_range(self, begin, end):
1063        """
1064        Add a range of character codes from ``begin`` to ``end``
1065        to the alphabet set.
1066
1067        ``begin`` - the first character of the range;
1068        ``end`` - the last character of the range.
1069        """
1070        self._add_range(ord(begin), ord(end))
1071
1072    cpdef _add_range(self, cdatrie.AlphaChar begin, cdatrie.AlphaChar end):
1073        if begin > end:
1074            raise DatrieError('range begin > end')
1075        code = cdatrie.alpha_map_add_range(self._c_alpha_map, begin, end)
1076        if code != 0:
1077            raise MemoryError()
1078
1079
1080cdef cdatrie.AlphaChar* new_alpha_char_from_unicode(unicode txt):
1081    """
1082    Converts Python unicode string to libdatrie's AlphaChar* format.
1083    libdatrie wants null-terminated array of 4-byte LE symbols.
1084
1085    The caller should free the result of this function.
1086    """
1087    cdef int txt_len = len(txt)
1088    cdef int size = (txt_len + 1) * sizeof(cdatrie.AlphaChar)
1089
1090    # allocate buffer
1091    cdef cdatrie.AlphaChar* data = <cdatrie.AlphaChar*> malloc(size)
1092    if data is NULL:
1093        raise MemoryError()
1094
1095    # Copy text contents to buffer.
1096    # XXX: is it safe? The safe alternative is to decode txt
1097    # to utf32_le and then use memcpy to copy the content:
1098    #
1099    #    py_str = txt.encode('utf_32_le')
1100    #    cdef char* c_str = py_str
1101    #    string.memcpy(data, c_str, size-1)
1102    #
1103    # but the following is much (say 10x) faster and this
1104    # function is really in a hot spot.
1105    cdef int i = 0
1106    for char in txt:
1107        data[i] = <cdatrie.AlphaChar> char
1108        i+=1
1109
1110    # Buffer must be null-terminated (last 4 bytes must be zero).
1111    data[txt_len] = 0
1112    return data
1113
1114
1115cdef unicode unicode_from_alpha_char(cdatrie.AlphaChar* key, int len=0):
1116    """
1117    Converts libdatrie's AlphaChar* to Python unicode.
1118    """
1119    cdef int length = len
1120    if length == 0:
1121        length = cdatrie.alpha_char_strlen(key)*sizeof(cdatrie.AlphaChar)
1122    cdef char* c_str = <char*> key
1123    return c_str[:length].decode('utf_32_le')
1124
1125
1126def to_ranges(lst):
1127    """
1128    Converts a list of numbers to a list of ranges::
1129
1130    >>> numbers = [1,2,3,5,6]
1131    >>> list(to_ranges(numbers))
1132    [(1, 3), (5, 6)]
1133    """
1134    for a, b in itertools.groupby(enumerate(lst), lambda t: t[1] - t[0]):
1135        b = list(b)
1136        yield b[0][1], b[-1][1]
1137
1138
1139def alphabet_to_ranges(alphabet):
1140    for begin, end in to_ranges(sorted(map(ord, iter(alphabet)))):
1141        yield begin, end
1142
1143
1144def new(alphabet=None, ranges=None, AlphaMap alpha_map=None):
1145    warnings.warn('datrie.new is deprecated; please use datrie.Trie.',
1146                  DeprecationWarning)
1147    return Trie(alphabet, ranges, alpha_map)
1148
1149
1150MutableMapping.register(Trie)
1151MutableMapping.register(BaseTrie)
1152